Python implements multiple processes to import CSV data to MySQL
- 2020-05-26 09:34:22
- OfStack
Some time ago, I helped a colleague import CSV data into MySQL. There were two large CSV files: one of 3 GB with 21 million records, and one of 7 GB with 35 million records. For data of this magnitude, a simple single-process/single-threaded import would take far too long, so it was ultimately implemented with multiple processes. Without going through the whole process in detail, the key points are recorded below:
Insert in batches rather than row by row. To speed up insertion, do not create indexes until the load finishes. Use a producer/consumer model: the main process reads the file and multiple worker processes perform the inserts. Be careful to limit the number of workers so as not to put too much pressure on MySQL. Handle exceptions caused by dirty data. The raw data is GBK encoded, so be careful to convert it to UTF-8. Wrap everything as a command-line tool with click. The concrete implementation follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import csv
import logging
import multiprocessing
import os
import warnings
import click
import MySQLdb
import sqlalchemy
warnings.filterwarnings('ignore', category=MySQLdb.Warning)
# The number of records that were bulk-inserted
BATCH = 5000
DB_URI = 'mysql://root@localhost:3306/example?charset=utf8'
engine = sqlalchemy.create_engine(DB_URI)
def get_table_cols(table):
    """Return the column names of *table*.

    Selects zero rows (``LIMIT 0``) so only the result-set metadata is
    fetched from MySQL.
    """
    query = 'SELECT * FROM `{}` LIMIT 0'.format(table)
    result = engine.execute(query)
    return result.keys()
def insert_many(table, cols, rows, cursor):
    """Execute one bulk INSERT of *rows* into *table*.

    *cursor* only needs an ``execute(sql, *rows)`` method; in this script it
    is actually a SQLAlchemy engine created by the worker process.
    """
    column_list = ', '.join(cols)
    placeholders = ', '.join('%s' for _ in cols)
    sql = 'INSERT INTO `{table}` ({cols}) VALUES ({marks})'.format(
        table=table, cols=column_list, marks=placeholders)
    cursor.execute(sql, *rows)
    logging.info('process %s inserted %s rows into table %s',
                 os.getpid(), len(rows), table)
def insert_worker(table, cols, queue):
    """Consume rows from *queue* and bulk-insert them into *table*.

    Runs inside a child process. Rows are buffered and flushed every
    ``BATCH`` rows; a ``None`` sentinel flushes any remaining rows and
    terminates the worker.
    """
    # Each child process creates its own engine object -- engines must not
    # be shared across fork boundaries.
    conn = sqlalchemy.create_engine(DB_URI)
    pending = []
    while True:
        item = queue.get()
        if item is None:
            break
        pending.append(item)
        if len(pending) == BATCH:
            insert_many(table, cols, pending, conn)
            pending = []
    # Flush the final partial batch, if any
    if pending:
        insert_many(table, cols, pending, conn)
def insert_parallel(table, reader, w=10):
    """Read rows from *reader* (a ``csv.reader``) and insert them into
    *table* using *w* worker processes.

    Rows whose field count does not match the table's column count are
    considered dirty and written to ``./<table>_dirty_rows.csv`` instead
    of being inserted.
    """
    cols = get_table_cols(table)
    # Data queue: the main process reads the file and feeds it, the worker
    # processes consume it. Bound the queue size so that slow consumers
    # cannot let it (and memory use) grow without limit.
    queue = multiprocessing.Queue(maxsize=w * BATCH * 2)
    workers = []
    for i in range(w):
        p = multiprocessing.Process(target=insert_worker,
                                    args=(table, cols, queue))
        p.start()
        workers.append(p)
        logging.info('starting # %s worker process, pid: %s...', i + 1, p.pid)
    dirty_data_file = './{}_dirty_rows.csv'.format(table)
    # Use a context manager so the dirty-rows file is closed even if the
    # reader raises mid-file (the original open/close pair leaked the
    # handle on error).
    with open(dirty_data_file, 'w') as xf:
        writer = csv.writer(xf, delimiter=reader.dialect.delimiter)
        for line in reader:
            # Record and skip dirty data: field count does not match schema
            if len(line) != len(cols):
                writer.writerow(line)
                continue
            # The literal string 'NULL' stands for SQL NULL
            clean_line = [None if x == 'NULL' else x for x in line]
            # Hand the row to the worker processes
            queue.put(tuple(clean_line))
            if reader.line_num % 500000 == 0:
                logging.info('put %s tasks into queue.', reader.line_num)
    # Send one shutdown sentinel per worker, then wait for them to finish
    logging.info('send close signal to worker processes')
    for i in range(w):
        queue.put(None)
    for p in workers:
        p.join()
def convert_file_to_utf8(f, rv_file=None):
    """Convert the GBK/GB18030-encoded text file *f* to UTF-8.

    The result is written to *rv_file* (default: ``<name>_utf8<ext>`` next
    to the input). A UTF-8 BOM at the start of the first line is removed.
    Output lines are right-stripped and terminated with a single ``\\n``;
    writes are flushed in chunks of 200k lines to bound memory use.

    Bug fixes versus the original version:
    - the header (first) line is now transcoded to UTF-8 as well; before,
      a non-ASCII header stayed GBK-encoded in the output file;
    - the BOM is removed as a prefix via ``startswith`` -- the old
      ``str.strip(codecs.BOM_UTF8)`` removed any of the BOM *bytes* from
      both ends of the line, which is not the same thing;
    - files are opened in binary mode, so decode/encode is explicit.
    """
    if not rv_file:
        name, ext = os.path.splitext(f)
        if isinstance(name, unicode):  # noqa: F821 -- Python 2 unicode path
            name = name.encode('utf8')
        rv_file = '{}_utf8{}'.format(name, ext)
    logging.info('start to process file %s', f)
    with open(f, 'rb') as infd:
        with open(rv_file, 'wb') as outfd:
            lines = []
            loop = 0
            chunk = 200000
            first_line = infd.readline()
            # Remove the UTF-8 BOM prefix if present
            if first_line.startswith(codecs.BOM_UTF8):
                first_line = first_line[len(codecs.BOM_UTF8):]
            # Transcode the header line too (it is GBK like the rest)
            lines.append(
                first_line.decode('gb18030').encode('utf8').strip() + b'\n')
            for line in infd:
                clean_line = line.decode('gb18030').encode('utf8')
                lines.append(clean_line.rstrip() + b'\n')
                if len(lines) == chunk:
                    outfd.writelines(lines)
                    lines = []
                    loop += 1
                    logging.info('processed %s lines.', loop * chunk)
            outfd.writelines(lines)
            logging.info('processed %s lines.', loop * chunk + len(lines))
@click.group()
def cli():
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')
@cli.command('gbk_to_utf8')
@click.argument('f')
def convert_gbk_to_utf8(f):
convert_file_to_utf8(f)
@cli.command('load')
@click.option('-t', '--table', required=True, help=' The name of the table ')
@click.option('-i', '--filename', required=True, help=' The input file ')
@click.option('-w', '--workers', default=10, help='worker Quantity, default 10')
def load_fac_day_pro_nos_sal_table(table, filename, workers):
with open(filename) as fd:
fd.readline() # skip header
reader = csv.reader(fd)
insert_parallel(table, reader, w=workers)
if __name__ == '__main__':
cli()
That is the whole of the article I wanted to share with everyone; I hope you find it useful.