A simple Node.js implementation of Chinese-to-English translation

  • 2020-06-03 05:50:47
  • OfStack

I was helping a former colleague with a requirement: translating a Chinese-language project into an English-language project.

As for the implementation: a truly intelligent solution would have to analyze Chinese grammar, which I think is too difficult.

Therefore, the final solution is to traverse the files, match the Chinese phrases, translate them manually, and then replace each Chinese phrase with its translation. Of course, a manual check is still needed afterwards; after all, changing Chinese text inside code may affect how the program behaves.

Although Node.js runs your code on a single main thread, asynchronous file reading and writing and the event mechanism must still use threads under the hood. In practice, you do not need to think about threading issues when writing the program.

The code is not complicated and is shown below; after writing it, I encapsulated it a little.


var fs = require('fs');
var http = require('http');
// Root directory that is walked for source files.
var filePath = 'D:\\WORK_new\\';
// Location of the phrase log; assigned to dictionary.logPath before the walk starts.
var logPath = 'D:\\chinese.log';

// NOTE(review): `map` and `num` are not referenced anywhere in the visible code
// (the dictionary keeps its own private `map`) — confirm before removing.
var map = {};
var num = 0;

/**
 * Phrase dictionary: maps a Chinese phrase to its English translation.
 *
 * The entries live in a private object, can be saved to / loaded from the
 * JSON file at `logPath`, and untranslated entries (empty string values) can
 * be filled in through Google Translate.
 */
var dictionary = (function () {
  // phrase -> translation; '' marks a phrase that still needs translating.
  var map = {};
  var TRANSLATE_URL = "http://translate.google.cn/translate_a/t?client=t&hl=zh-CN&sl=zh-CN&tl=en&ie=UTF-8&oe=UTF-8&oc=2&otf=1&ssel=3&tsel=6&sc=2&q=";

  return {
    /** Path of the JSON file used by save2File / loadFile. */
    logPath: 'D:\\chinese.log',

    /**
     * Store a phrase.
     * @param {string} key - the Chinese phrase.
     * @param {string} [val] - its translation; omitted/falsy stores ''.
     */
    set: function (key, val) {
      map[key] = val || '';
    },

    /**
     * Look up a phrase.
     * @param {string} key - the Chinese phrase.
     * @returns {string} the stored translation, or '' when there is none.
     */
    get: function (key) {
      // Own-property check so names inherited from Object.prototype
      // (e.g. "toString") are not returned as if they were translations.
      if (!Object.prototype.hasOwnProperty.call(map, key)) {
        return '';
      }
      return map[key] || '';
    },

    /**
     * Write all entries to `logPath` as JSON, one entry per line (CRLF).
     * Throws (from the write callback) when the file cannot be written.
     */
    save2File: function () {
      fs.writeFile(this.logPath, JSON.stringify(map).replace(/","/g,'",\r\n"'),{encoding:'utf8',flag:'w'}, function (err) {
        if (err) throw err;
      });
    },

    /**
     * Replace the entries with the content of `logPath`.
     *
     * Fails fast: a read or parse error is thrown from the read callback and
     * `callback` is NOT invoked, so a caller that goes on to rewrite files
     * never runs against an empty dictionary.
     *
     * @param {function()} callback - invoked once the entries are loaded.
     */
    loadFile: function (callback) {
      fs.readFile(this.logPath, {encoding:'utf8'},function (err, data) {
        // Surface the real I/O error instead of a confusing
        // "undefined is not valid JSON" from JSON.parse(undefined).
        if (err) throw err;
        var loaded = JSON.parse(data);
        if (loaded === null || typeof loaded !== 'object' || Array.isArray(loaded)) {
          throw new Error('dictionary file does not contain a JSON object');
        }
        map = loaded;
        callback();
      });
    },

    /**
     * Request a translation for every entry whose value is ''.
     *
     * Failed requests and unusable responses are logged and leave the entry
     * untouched. `callback` is invoked exactly once, after every request has
     * settled — immediately when there is nothing to translate.
     *
     * @param {function()} callback - invoked when all requests have settled.
     */
    translateByGoogle: function (callback) {
      var untranslated = Object.keys(map).filter(function (key) {
        return map[key] === '';
      });
      var remaining = untranslated.length;

      // Nothing to do: still report completion instead of never calling back.
      if (remaining === 0) {
        callback();
        return;
      }

      untranslated.forEach(function (key) {
        var settled = false;
        // Count each request once, even if both 'end' and 'error' fire.
        var finish = function () {
          if (settled) {
            return;
          }
          settled = true;
          remaining--;
          if (remaining === 0) {
            callback();
          }
        };

        // The phrase is non-ASCII and must be percent-encoded to form a valid URL.
        http.get(TRANSLATE_URL + encodeURIComponent(key), function (res) {
          res.setEncoding('utf8');
          var body = "";
          res.on('data', function (chunk) {
            body += chunk;
          }).on('end', function () {
            try {
              // SECURITY(review): eval() executes whatever the remote server
              // sends over plain HTTP. It is kept because this endpoint's reply
              // is not strict JSON; replace with a real parser if possible.
              var obj = eval('('+body+')');
              var translated = obj[0][0][0];
              if (typeof translated === 'string') {
                map[key] = translated;
              } else {
                // Storing undefined would make JSON.stringify drop the phrase.
                console.log('unexpected translation response for ' + key);
              }
            } catch (parseErr) {
              console.log('cannot parse translation response for ' + key + ': ' + parseErr.message);
            }
            finish();
          });
        }).on('error', function(e) {
          console.log('http error');
          console.log("Got error: " + e.message);
          finish();
        });
      });
    }
  }
})();

/**
 * Recursive directory walker that reads every .js / .html / .htm / .jsp file
 * below a root directory as UTF-8 text.
 *
 * Usage: new File().walkDir(root, fileBack, doneBack)
 *   fileBack(data, pathStr) - called once per matching file; `data` is ""
 *                             when the file could not be read.
 *   doneBack()              - called exactly once, after every directory has
 *                             been listed and every matching file reported
 *                             (also when nothing matched).
 *
 * One instance supports one walk at a time: walkDir resets the shared counter.
 */
function File () {
  // Dots are escaped so only real extensions match (not e.g. "notjs").
  var SOURCE_FILE = /\.(js|html|htm|jsp)$/;
  // Outstanding asynchronous operations: directory listings + file reads.
  var pending = 0;

  // Mark one operation as finished; the last one to finish reports completion.
  var _settle = function (doneBack) {
    pending--;
    if (pending === 0) {
      doneBack();
    }
  };

  // Read one file and hand its content (or "" on error) to fileBack.
  var _readFile = function (pathStr, fileBack, doneBack) {
    pending++;
    fs.readFile(pathStr,{encoding:'utf8'}, function (err, data) {
      if (err) {
        data = "";
        console.log(err,pathStr)
      }
      fileBack(data,pathStr);
      _settle(doneBack);
    });
  };

  // List one directory, recursing into sub-directories and reading matching files.
  var _walkDir = function (pathStr, fileBack, doneBack) {
    pending++;
    fs.readdir(pathStr, function (err, files) {
      if (err) {
        // An unreadable directory is logged and skipped, like an unreadable file.
        console.log(err,pathStr)
        files = [];
      }
      files.forEach(function (file) {
        var fullPath = pathStr + '/' + file;
        var isDirectory;
        try {
          isDirectory = fs.statSync(fullPath).isDirectory();
        } catch (statErr) {
          // e.g. a broken link or an entry removed since it was listed.
          console.log(statErr,fullPath)
          return;
        }
        if (isDirectory) {
          _walkDir(fullPath, fileBack, doneBack);
        } else if (SOURCE_FILE.test(file)) {
          _readFile(fullPath, fileBack, doneBack);
        }
      });
      // Children were registered above, so the counter cannot reach zero
      // while any of them is still outstanding.
      _settle(doneBack);
    });
  }

  /**
   * Start walking at pathStr.
   * @param {string} pathStr - root directory.
   * @param {function(string, string)} fileBack - receives (data, pathStr) per file.
   * @param {function()} doneBack - called once when the walk is complete.
   */
  this.walkDir = function (pathStr, fileBack, doneBack) {
    pending = 0;
    _walkDir(pathStr, fileBack, doneBack);
  }
}

// Step 1: collect every run of Chinese characters found in the project files
// and save the (still untranslated) phrases to the log file.
dictionary.logPath = logPath;

new File().walkDir(filePath, function (content) {
  // Nothing to collect from empty content.
  if (!content) {
    return;
  }
  var phrases = content.match(/[\u4e00-\u9faf]+/g) || [];
  for (var i = 0; i < phrases.length; i++) {
    // Registered without a translation, i.e. stored as ''.
    dictionary.set(phrases[i]);
  }
}, function () {
  console.log(' Access to Chinese  OK');
  dictionary.save2File();
});


// Step 2: translate the collected phrases with Google Translate
/*
dictionary.loadFile(function () {
  dictionary.translateByGoogle(function () {
    dictionary.save2File();
  })
});
*/
// Step 3: replace the Chinese phrases in the files with their translations
/*
dictionary.loadFile(function () {
  new File().walkDir(filePath, function (data,pathStr) {
    fs.writeFile(pathStr, data.replace(/[\u4e00-\u9faf]+/g, function (ch) {
      return dictionary.get(ch);
    }),{encoding:'ascii',flag:'w'}, function (err) {
      if (err) throw err;
    }); 
  }, function () {
    console.log(' Chinese replace  OK');
  })
});
*/

Known problems:

1. Node.js encoding problem: on Windows, GBK encoding is not well supported; the script mainly handles UTF-8 files.

2. Efficiency could probably be improved with multithreading; I did not look into this in depth.

3. Matching: a phrase may be split by a single punctuation mark, etc., so the results need to be checked manually.

In reality, some of the files were GBK and some were UTF-8, so later I thought about implementing it in the scripting language quickhand instead:

1. File encoding problem, judgment through search

Determine if the first 3 bytes of the file are ef bb bf, but this is only for utf8 with BOM

For utf8 format without BOM, byte feature code judgment is required (difficult and limited energy, the above scheme is used, and manual screening is performed for the case without BOM).

2. Because multithreaded programming in quickhand is very simple, I had always assumed that multithreading must be more efficient than a single thread. In practice it was not: the single-threaded version was much faster than the multithreaded one. The main bottleneck seems to be file read/write I/O.

That is all for this article; I hope you enjoy it.


Related articles: