Markov chain algorithm (markov algorithm) awk C++ C language implementation code

  • 2020-04-02 02:35:48
  • OfStack

1. Problem description

Markov chain algorithm is used to generate a random section of English, the idea is very simple. First, the data is read in, and then the data is divided into prefix and suffix, and the suffix is obtained randomly through the prefix, so as to produce a piece of random English that can be read.

For the sake of illustration, suppose we have the following:
 

   Show your flowcharts and conceal your tables and I will be mystified. Show your tables and your flowcharts will be obvious.
 

Suppose the length of the prefix is 2, then we process the input and get the following data. We first obtain a prefix, then randomly select a word in the suffix list of the prefix, then change the prefix and repeat the process. In this way, the sentence we produce will be readable.

Here is the processed data:


The prefix   The suffix
show your  flowcharts tables
your flowcharts  and will
flowcharts and  conceal
flowcharts will  be
your tables  and and
will be  mystified. obvious.
be mystified.  show
be obvious.  (end)

Deal with the text of the markov chain algorithm will be the first to take lead show your, and then remove random flowcharts or table two words, assuming that choice is flowcharts, the new prefix is your flowcharts, similarly, choosing a table, a new prefix is your table, a new prefix your flowcharts, then select it again the suffix, this time in the and and will randomly selected, repeat the above process, It produces a readable text. The specific description is as follows:

 Set up the  w1  and  w2  Are the first two words of the text 
The output w1 and w2 Cycle:
    Pick at random w3, It's in the text w1 w2 One of the suffixes of
    print w3
    the w1 and w2 Respectively for w2 and w3
    Repeated cycle

2. Awk programs

The markov chain algorithm is not difficult, as we will see later, and it is quite cumbersome to solve this problem with c, whereas it only takes 5 minutes with awk. This is simply a question of demonstrating the benefits of awk.

There is an associative array in awk, which can be used to indicate the relationship between prefix and suffix. The procedure is as follows:


# markov.awk: markov chain algorithm for 2-word prefixes
BEGIN { MAXGEN = 10000; NONWORD = "n"; w1 = w2 = NONWORD }

{  for (i = 1; i <= NF; i++) {   # read all words
    statetab[w1,w2,++nsuffix[w1,w2]] = $i
    w1 = w2
    w2 = $i
  }
}

END {
  statetab[w1,w2,++nsuffix[w1,w2]] = NONWORD # add tail
  w1 = w2 = NONWORD
  for (i = 0; i < MAXGEN; i++) { # generate
    r = int(rand()*nsuffix[w1,w2]) + 1 # nsuffix >= 1
    p = statetab[w1,w2,r]
    if (p == NONWORD)
      exit
    print p
    w1 = w2     # advance chain
    w2 = p
  }
}  

3. C + + program

The main difficulty of this problem lies in the random acquisition of suffixes by prefixes. In C++, we can achieve the corresponding relationship between prefixes and suffixes by means of map, so as to achieve a high development efficiency.






#include <time.h>
#include <iostream>
#include <string>
#include <deque>
#include <map>
#include <vector>

using namespace std;

const int NPREF = 2;
const char NONWORD[] = "n";  // cannot appear as real line: we remove newlines
const int MAXGEN = 10000; // maximum words generated

typedef deque<string> Prefix;

map<Prefix, vector<string> > statetab; // prefix -> suffixes

void    build(Prefix&, istream&);
void    generate(int nwords);
void    add(Prefix&, const string&);

// markov main: markov-chain random text generation
int main(void)
{
  int nwords = MAXGEN;
  Prefix prefix; // current input prefix

  srand(time(NULL));
  for (int i = 0; i < NPREF; i++)
    add(prefix, NONWORD);
  build(prefix, cin);
  add(prefix, NONWORD);
  generate(nwords);
  return 0;
}

// build: read input words, build state table
void build(Prefix& prefix, istream& in)
{
  string buf;

  while (in >> buf)
    add(prefix, buf);
}

// add: add word to suffix deque, update prefix
void add(Prefix& prefix, const string& s)
{
  if (prefix.size() == NPREF) {
    statetab[prefix].push_back(s);
    prefix.pop_front();
  }
  prefix.push_back(s);
}

// generate: produce output, one word per line
void generate(int nwords)
{
  Prefix prefix;
  int i;

  for (i = 0; i < NPREF; i++)
    add(prefix, NONWORD);
  for (i = 0; i < nwords; i++) {
    vector<string>& suf = statetab[prefix];
    const string& w = suf[rand() % suf.size()];
    if (w == NONWORD)
      break;
    cout << w << "n";
    prefix.pop_front(); // advance
    prefix.push_back(w);
  }
}

4. The c program

If you want the program to run fast enough, you can only implement it in a lower-level language. When we implement it in c language, we have to consider all kinds of problems. First, the first problem is how to express the relationship between prefix and suffix.

We use prefix key and suffix value to store the relationship between prefix and suffix, and we know that hash table is the fastest to find, so it makes sense to use hash table here, just to see if you can think of using prefix as key, based on the above ideas, a little bit more careful, there is no big problem.








#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "eprintf.h"

enum {
  NPREF  = 2,  
  NHASH  = 4093, 
  MAXGEN = 10000 
};

typedef struct State State;
typedef struct Suffix Suffix;

struct State { 
  char  *pref[NPREF];  
  Suffix *suf;      
  State  *next;     
};

struct Suffix { 
  char  *word;     
  Suffix *next;     
};

State  *lookup(char *prefix[], int create);
void  build(char *prefix[], FILE*);
void  generate(int nwords);
void  add(char *prefix[], char *word);

State  *statetab[NHASH];  

char NONWORD[] = "n"; 


int main(void)
{
  int i, nwords = MAXGEN;
  char *prefix[NPREF];    

  int c;
  long seed;

  setprogname("markov");
  seed = time(NULL);

  srand(seed);
  for (i = 0; i < NPREF; i++) 
    prefix[i] = NONWORD;
  build(prefix, stdin);
  add(prefix, NONWORD);
  generate(nwords);
  return 0;
}  

const int MULTIPLIER = 31; 


unsigned int hash(char *s[NPREF])
{
  unsigned int h;
  unsigned char *p;
  int i;

  h = 0;
  for (i = 0; i < NPREF; i++)
    for (p = (unsigned char *) s[i]; *p != '0'; p++)
      h = MULTIPLIER * h + *p;
  return h % NHASH;
}




State* lookup(char *prefix[NPREF], int create)
{
  int i, h;
  State *sp;

  h = hash(prefix);
  for (sp = statetab[h]; sp != NULL; sp = sp->next) {
    for (i = 0; i < NPREF; i++)
      if (strcmp(prefix[i], sp->pref[i]) != 0)
        break;
    if (i == NPREF)   
      return sp;
  }
  if (create) {
    sp = (State *) emalloc(sizeof(State));
    for (i = 0; i < NPREF; i++)
      sp->pref[i] = prefix[i];
    sp->suf = NULL;
    sp->next = statetab[h];
    statetab[h] = sp;
  }
  return sp;
}


void addsuffix(State *sp, char *suffix)
{
  Suffix *suf;

  suf = (Suffix *) emalloc(sizeof(Suffix));
  suf->word = suffix;
  suf->next = sp->suf;
  sp->suf = suf;
}


void add(char *prefix[NPREF], char *suffix)
{
  State *sp;

  sp = lookup(prefix, 1); 
  addsuffix(sp, suffix);
  
  memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0]));
  prefix[NPREF-1] = suffix;
}


void build(char *prefix[NPREF], FILE *f)
{
  char buf[100], fmt[10];

  
  sprintf(fmt, "%%%ds", sizeof(buf)-1);
  while (fscanf(f, fmt, buf) != EOF)
    add(prefix, estrdup(buf));
}


void generate(int nwords)
{
  State *sp;
  Suffix *suf;
  char *prefix[NPREF], *w;
  int i, nmatch;

  for (i = 0; i < NPREF; i++) 
    prefix[i] = NONWORD;

  for (i = 0; i < nwords; i++) {
    sp = lookup(prefix, 0);
    if (sp == NULL)
      eprintf("internal error: lookup failed");
    nmatch = 0;
    for (suf = sp->suf; suf != NULL; suf = suf->next)
      if (rand() % ++nmatch == 0) /* prob = 1/nmatch */
        w = suf->word;
    if (nmatch == 0)
      eprintf("internal error: no suffix %d %s", i, prefix[0]);
    if (strcmp(w, NONWORD) == 0)
      break;
    printf("%sn", w);
    memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0]));
    prefix[NPREF-1] = w;
  }
}

Related articles: