Splitting and Merging Large Files with Python
- 2021-07-24 11:27:46
- OfStack
Often we face files too large to load into memory, or too large to transfer in one piece. In such cases we need to split the large file into smaller files for processing.
The following is a Python implementation of file splitting and merging utilities.
import os
FILE_DIR = os.path.dirname(os.path.abspath(__file__))
#========================================================
# File operation
#========================================================
def get_filelist1(dir, postfix):
    '''
    Collect the full paths of files in a directory whose names end with a suffix.
    INPUT  -> Directory address, File suffix
    OUTPUT -> File name list (directory-joined paths)
    '''
    matches = []
    for entry in os.listdir(dir):
        if entry.endswith(postfix):
            matches.append(os.path.join(dir, entry))
    return matches
def get_filelist2(dir, preffix):
    '''
    Collect the full paths of files in a directory whose names start with a prefix.
    INPUT  -> Directory address, File prefix
    OUTPUT -> File name list (directory-joined paths)
    '''
    matches = []
    for entry in os.listdir(dir):
        if entry.startswith(preffix):
            matches.append(os.path.join(dir, entry))
    return matches
def get_file_postfix(filename):
    '''
    Return the extension part of a file name.
    INPUT  -> Filename
    OUTPUT -> File suffix, including the leading dot (e.g. '.txt'),
              or '' when the name has no extension
    '''
    # os.path.splitext -> (root, ext); the extension is the second element.
    return os.path.splitext(filename)[1]
def get_file_preffix(filename):
    '''
    Return the file name with its extension removed.
    INPUT  -> Filename
    OUTPUT -> File prefix (root part before the last extension)
    '''
    # os.path.splitext -> (root, ext); the prefix is the first element.
    return os.path.splitext(filename)[0]
def file_chunkspilt(path, filename, chunksize):
    '''
    Split a file into fixed-size chunk files.

    INPUT -> File directory, Filename, Size per data block (bytes)
    Chunk files are written into `path`, named <filename>_0001,
    <filename>_0002, ... (zero-padded so lexicographic order equals
    chunk order). A final chunk may be smaller than `chunksize`.
    Prints an error message (matching the original behaviour) when
    chunksize is not positive.
    '''
    if chunksize <= 0:
        print('chunksize must bigger than 0!')
        return
    filepath = os.path.join(path, filename)
    partnum = 0
    # `with` guarantees the handles are closed even if a write fails
    # (the original leaked open files on exceptions).
    with open(filepath, 'rb') as inputfile:
        while True:
            chunk = inputfile.read(chunksize)
            if not chunk:
                break
            partnum += 1
            newfilename = os.path.join(path, filename + '_%04d' % partnum)
            with open(newfilename, 'wb') as sub_file:
                sub_file.write(chunk)
def file_linespilt(path, filename, limit):
    '''
    Split a file into sub-files of at most `limit` lines each.

    INPUT -> File directory, Filename, Number of rows per sub-file
    Sub-files are named <prefix>_<n><suffix> (n starting at 0) and are
    written into `path`, consistent with file_chunkspilt. As in the
    original code, the last line of each sub-file is stripped of its
    trailing newline/whitespace.

    Fixes vs original: the trailing partial chunk (fewer than `limit`
    lines) is no longer silently dropped; the sub-file name no longer
    contains a doubled dot (os.path.splitext's suffix already includes
    the '.'); sub-files go to `path` rather than the module directory.
    Prints an error message when limit is not positive.
    '''
    if limit <= 0:
        print('limit must bigger than 0!')
        return
    preffix, postfix = os.path.splitext(filename)
    file_count = 0

    def _flush(lines):
        # Write one sub-file; the final line loses trailing whitespace
        # (preserving the original's .strip() behaviour).
        subfile = os.path.join(path, preffix + "_" + str(file_count) + postfix)
        with open(subfile, 'wb') as out:
            for l in lines[:-1]:
                out.write(l)
            out.write(lines[-1].strip())

    l_list = []
    with open(os.path.join(path, filename), 'rb') as f:
        for line in f:
            l_list.append(line)
            if len(l_list) == limit:
                _flush(l_list)
                l_list = []
                file_count += 1
        # Bug fix: flush the leftover lines (the original dropped them).
        if l_list:
            _flush(l_list)
def file_combine(path, filename):
    '''
    Merge chunk files back into a single file.

    INPUT -> File directory, Filename (the name the file was split under)
    Concatenates every <filename>_* file found in `path` into
    <path>/<filename>. Chunks are processed in sorted name order, which
    matches file_chunkspilt's zero-padded numbering.

    Fixes vs original: chunks are merged in deterministic sorted order
    (os.listdir returns arbitrary order, so the merged file could be
    scrambled), they are looked up in `path` -- where file_chunkspilt
    wrote them -- instead of the module directory, and all handles are
    closed via `with`.
    '''
    filepath = os.path.join(path, filename)
    prefix = filename + '_'
    subfile_names = sorted(f for f in os.listdir(path) if f.startswith(prefix))
    with open(filepath, 'wb') as outputfile:
        for subfile in subfile_names:
            with open(os.path.join(path, subfile), 'rb') as part:
                outputfile.write(part.read())