Splitting and Merging Large Files with Python

We often face situations where a large file cannot be loaded into memory all at once, or where a large file has to be transferred. In such cases it helps to split the large file into smaller files and process those instead.

The following is a Python implementation of the splitting and merging routines.


import os
FILE_DIR = os.path.dirname(os.path.abspath(__file__))

#========================================================
#  File operation 
#========================================================
def get_filelist1(dir, postfix):
  '''
  Return the paths of all files in a directory that end with the given suffix.
  INPUT -> directory path, file suffix
  OUTPUT -> list of file paths
  '''
  return [os.path.join(dir, f) for f in os.listdir(dir) if f.endswith(postfix)]

def get_filelist2(dir, preffix):
  '''
  Return the paths of all files in a directory that start with the given prefix.
  INPUT -> directory path, file name prefix
  OUTPUT -> list of file paths
  '''
  return [os.path.join(dir, f) for f in os.listdir(dir) if f.startswith(preffix)]

def get_file_postfix(filename):
  '''
  Get the file name suffix (extension).
  INPUT -> filename
  OUTPUT -> file extension, including the leading dot (e.g. '.txt')
  '''
  preffix, postfix = os.path.splitext(filename)
  return postfix

def get_file_preffix(filename):
  '''
  Get the file name prefix (the name without its extension).
  INPUT -> filename
  OUTPUT -> file name without the extension
  '''
  preffix, postfix = os.path.splitext(filename)
  return preffix

def file_chunkspilt(path, filename, chunksize):
  '''
  Split a file into several sub-files of at most chunksize bytes each.
  INPUT -> file directory, filename, size of each data block in bytes
  '''
  if chunksize > 0:
    filepath = os.path.join(path, filename)
    partnum = 0
    with open(filepath, 'rb') as inputfile:
      while True:
        chunk = inputfile.read(chunksize)
        if not chunk:
          break
        partnum += 1
        # Sub-files are named <filename>_0001, <filename>_0002, ...
        newfilename = os.path.join(path, filename + '_%04d' % partnum)
        with open(newfilename, 'wb') as sub_file:
          sub_file.write(chunk)
  else:
    print('chunksize must be bigger than 0!')

def file_linespilt(path, filename, limit):
  '''
  Split a file into multiple sub-files of at most limit lines each.
  INPUT -> file directory, filename, number of lines per sub-file
  '''
  if limit > 0:
    preffix = get_file_preffix(filename)
    postfix = get_file_postfix(filename)  # includes the leading dot, e.g. '.txt'
    file_count = 0
    l_list = []
    with open(os.path.join(path, filename), 'rb') as f:
      for line in f:
        l_list.append(line)
        if len(l_list) < limit:
          continue
        # Sub-files are named <preffix>_0<postfix>, <preffix>_1<postfix>, ...
        subfile = preffix + '_' + str(file_count) + postfix
        with open(os.path.join(path, subfile), 'wb') as file:
          file.writelines(l_list[:-1])
          file.write(l_list[-1].strip())
        l_list = []
        file_count += 1
      # Write any remaining lines that did not fill a complete block
      if l_list:
        subfile = preffix + '_' + str(file_count) + postfix
        with open(os.path.join(path, subfile), 'wb') as file:
          file.writelines(l_list[:-1])
          file.write(l_list[-1].strip())
  else:
    print('limit must be bigger than 0!')

def file_combine(path, filename):
  '''
  Merge the sub-files produced by file_chunkspilt back into a single file.
  INPUT -> file directory, target filename
  '''
  filepath = os.path.join(path, filename)
  # Collect the parts by prefix and sort them so that they are
  # concatenated in the order they were written (_0001, _0002, ...).
  subfile_list = sorted(get_filelist2(path, filename + '_'))
  with open(filepath, 'wb') as outputfile:
    for subfile in subfile_list:
      with open(subfile, 'rb') as temp:
        outputfile.write(temp.read())
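
Finally, a minimal usage sketch of how the pieces fit together. The file name bigdata.csv, the 10 MB chunk size and the 10000-line limit below are hypothetical values chosen for illustration, not part of the code above.


# Usage sketch; bigdata.csv, the 10 MB chunk size and the 10000-line
# limit are hypothetical values for illustration.
if __name__ == '__main__':
  # Split bigdata.csv into 10 MB parts: bigdata.csv_0001, bigdata.csv_0002, ...
  file_chunkspilt(FILE_DIR, 'bigdata.csv', 10 * 1024 * 1024)

  # ... transfer or process the parts, then merge them back into bigdata.csv
  file_combine(FILE_DIR, 'bigdata.csv')

  # Alternatively, split the file into sub-files of at most 10000 lines each,
  # named bigdata_0.csv, bigdata_1.csv, ...
  file_linespilt(FILE_DIR, 'bigdata.csv', 10000)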
