How to Parse excel File in python Automated Test

2021-07-03 00:18:25
OfStack

Preface

In automated testing, we only use files or databases to store data, so files can be csv, xlsx, xml or even txt files. Usually excel files are our first choice, and excel is very convenient for writing test cases or storing test data. So today, we will summarize the methods of processing excel files in different modules, and package them directly, which is convenient for us to use directly in the future and increase work efficiency.

openpyxl

openpyxl is a third party library. First, we install it directly using the command pip install openpyxl

Note: When openpyxl operates excel, both row and column numbers are calculated from 1

Encapsulation code


"""
------------------------------------
@Time : 2019/5/13 18:00
@Auth : linux超
@File : ParseExcel.py
@IDE : PyCharm
@Motto: Real warriors,dare to face the bleak warning,dare to face the incisive error!
------------------------------------
"""
from openpyxl import load_workbook
from openpyxl.styles import Font
from openpyxl.styles.colors import BLACK
from collections import namedtuple


class ParseExcel(object):
"""解析excel文件"""

def __init__(self, filename, sheet_name=None):
try:
self.filename = filename
self.sheet_name = sheet_name
self.wb = load_workbook(self.filename)
if self.sheet_name is None:
self.work_sheet = self.wb.active
else:
self.work_sheet = self.wb[self.sheet_name]
except FileNotFoundError as e:
raise e

def get_max_row_num(self):
"""获取最大行号"""
max_row_num = self.work_sheet.max_row
return max_row_num

def get_max_column_num(self):
"""获取最大列号"""
max_column = self.work_sheet.max_column
return max_column

def get_cell_value(self, coordinate=None, row=None, column=None):
"""获取指定单元格的数据"""
if coordinate is not None:
try:
return self.work_sheet[coordinate].value
except Exception as e:
raise e
elif coordinate is None and row is not None and column is not None:
if isinstance(row, int) and isinstance(column, int):
return self.work_sheet.cell(row=row, column=column).value
else:
raise TypeError('row and column must be type int')
else:
raise Exception("Insufficient Coordinate of cell!")

def get_row_value(self, row):
"""获取某1行的数据"""
column_num = self.get_max_column_num()
row_value = []
if isinstance(row, int):
for column in range(1, column_num + 1):
values_row = self.work_sheet.cell(row, column).value
row_value.append(values_row)
return row_value
else:
raise TypeError('row must be type int')

def get_column_value(self, column):
"""获取某1列数据"""
row_num = self.get_max_column_num()
column_value = []
if isinstance(column, int):
for row in range(1, row_num + 1):
values_column = self.work_sheet.cell(row, column).value
column_value.append(values_column)
return column_value
else:
raise TypeError('column must be type int')

def get_all_value_1(self):
"""获取指定表单的所有数据(除去表头)"""
max_row_num = self.get_max_row_num()
max_column = self.get_max_column_num()
values = []
for row in range(2, max_row_num + 1):
value_list = []
for column in range(1, max_column + 1):
value = self.work_sheet.cell(row, column).value
value_list.append(value)
values.append(value_list)
return values

def get_all_value_2(self):
"""获取指定表单的所有数据(除去表头)"""
rows_obj = self.work_sheet.iter_rows(min_row=2, max_row=self.work_sheet.max_row,
values_only=True) # 指定values_only 会直接提取数据不需要再使用cell().value
values = []
for row_tuple in rows_obj:
value_list = []
for value in row_tuple:
value_list.append(value)
values.append(value_list)
return values

def get_excel_title(self):
"""获取sheet表头"""
title_key = tuple(self.work_sheet.iter_rows(max_row=1, values_only=True))[0]
return title_key

def get_listdict_all_value(self):
"""获取所有数据，返回嵌套字典的列表"""
sheet_title = self.get_excel_title()
all_values = self.get_all_value_2()
value_list = []
for value in all_values:
value_list.append(dict(zip(sheet_title, value)))
return value_list

def get_list_nametuple_all_value(self):
"""获取所有数据，返回嵌套命名元组的列表"""
sheet_title = self.get_excel_title()
values = self.get_all_value_2()

excel = namedtuple('excel', sheet_title)
value_list = []
for value in values:
e = excel(*value)
value_list.append(e)
return value_list
def write_cell(self, row, column, value=None, bold=True, color=BLACK):
"""
指定单元格写入数据
:param work_sheet:
:param row: 行号
:param column: 列号
:param value: 待写入数据
:param bold: 加粗， 默认加粗
:param color: 字体颜色，默认黑色
:return:
"""
try:
if isinstance(row, int) and isinstance(column, int):
cell_obj = self.work_sheet.cell(row, column)
cell_obj.font = Font(color=color, bold=bold)
cell_obj.value = value
self.wb.save(self.filename)
else:
raise TypeError('row and column must be type int')
except Exception as e:
raise e
if __name__ == '__main__':
pe = ParseExcel('testdata.xlsx')
# sheet = pe.get_sheet_object('testcase')
column_row = pe.get_max_column_num()
print('最大列号:', column_row)
max_row = pe.get_max_row_num()
print('最大行号:', max_row)
#
cell_value_1 = pe.get_cell_value(row=2, column=3)
print('第%d行, 第%d列的数据为: %s' % (2, 3, cell_value_1))
cell_value_2 = pe.get_cell_value(coordinate='A5')
print('A5单元格的数据为: {}'.format(cell_value_2))
value_row = pe.get_row_value(3)
print('第{}行的数据为:{}'.format(3, value_row))
value_column = pe.get_column_value(2)
print('第{}列的数据为:{}'.format(2, value_column))
#
values_1 = pe.get_all_value_1()
print('第1种方式获取所有数据\n', values_1)
values_2 = pe.get_all_value_2()
print('第2种方式获取所有数据\n', values_2)
title = pe.get_excel_title()
print('表头为\n{}'.format(title))
dict_value = pe.get_listdict_all_value()
print('所有数据组成的嵌套字典的列表:\n', dict_value)
#
namedtuple_value = pe.get_list_nametuple_all_value()
print('所有数据组成的嵌套命名元组的列表:\n', namedtuple_value)
pe.write_cell(1, 2, 'Tc_title')

# add by linux Super at 2019/05/22 15:58

The above encapsulation, such as if two sheet are used to write data in the same excel file at the same time, will be a little small bug (after writing, you will find that one of the two forms has no data)

In fact, the reason is very simple: different objects have their own independent attributes. When you write operations, each object only saves its own form, so after the last one object writes data, it only saves its own form, while the forms of other objects are actually not saved. To solve this problem, the code encapsulated above has been slightly changed


"""
------------------------------------
@Time : 2019/5/22 9:11
@Auth : linux Super 
@File : ParseExcel.py
@IDE : PyCharm
@Motto: Real warriors,dare to face the bleak warning,dare to face the incisive error!
------------------------------------
"""
from openpyxl import load_workbook
from openpyxl.styles import Font
from openpyxl.styles.colors import BLACK
from collections import namedtuple
class ParseExcel(object):
""" Analyse excel Documents """
def __init__(self, filename):
try:
self.filename = filename
self.__wb = load_workbook(self.filename)
except FileNotFoundError as e:
raise e
def get_max_row_num(self, sheet_name):
""" Get the maximum line number """
max_row_num = self.__wb[sheet_name].max_row
return max_row_num
def get_max_column_num(self, sheet_name):
""" Get the maximum column number """
max_column = self.__wb[sheet_name].max_column
return max_column
def get_cell_value(self, sheet_name, coordinate=None, row=None, column=None):
""" Gets the data of the specified cell """
if coordinate is not None:
try:
return self.__wb[sheet_name][coordinate].value
except Exception as e:
raise e
elif coordinate is None and row is not None and column is not None:
if isinstance(row, int) and isinstance(column, int):
return self.__wb[sheet_name].cell(row=row, column=column).value
else:
raise TypeError('row and column must be type int')
else:
raise Exception("Insufficient Coordinate of cell!")
def get_row_value(self, sheet_name, row):
""" Get a 1 Row data """
column_num = self.get_max_column_num(sheet_name)
row_value = []
if isinstance(row, int):
for column in range(1, column_num + 1):
values_row = self.__wb[sheet_name].cell(row, column).value
row_value.append(values_row)
return row_value
else:
raise TypeError('row must be type int')
def get_column_value(self, sheet_name, column):
""" Get a 1 Column data """
row_num = self.get_max_column_num(sheet_name)
column_value = []
if isinstance(column, int):
for row in range(1, row_num + 1):
values_column = self.__wb[sheet_name].cell(row, column).value
column_value.append(values_column)
return column_value
else:
raise TypeError('column must be type int')
def get_all_value_1(self, sheet_name):
""" Gets all data for the specified form ( Remove header )"""
max_row_num = self.get_max_row_num(sheet_name)
max_column = self.get_max_column_num(sheet_name)
values = []
for row in range(2, max_row_num + 1):
value_list = []
for column in range(1, max_column + 1):
value = self.__wb[sheet_name].cell(row, column).value
value_list.append(value)
values.append(value_list)
return values
def get_all_value_2(self, sheet_name):
""" Gets all data for the specified form ( Remove header )"""
rows_obj = self.__wb[sheet_name].iter_rows(min_row=2, max_row=self.__wb[sheet_name].max_row, values_only=True)
values = []
for row_tuple in rows_obj:
value_list = []
for value in row_tuple:
value_list.append(value)
values.append(value_list)
return values
def get_excel_title(self, sheet_name):
""" Get sheet Header """
title_key = tuple(self.__wb[sheet_name].iter_rows(max_row=1, values_only=True))[0]
return title_key
def get_listdict_all_value(self, sheet_name):
""" Gets all data and returns a list of nested dictionaries """
sheet_title = self.get_excel_title(sheet_name)
all_values = self.get_all_value_2(sheet_name)
value_list = []
for value in all_values:
value_list.append(dict(zip(sheet_title, value)))
return value_list
def get_list_nametuple_all_value(self, sheet_name):
""" Gets all data and returns a list of nested named tuples """
sheet_title = self.get_excel_title(sheet_name)
values = self.get_all_value_2(sheet_name)
excel = namedtuple('excel', sheet_title)
value_list = []
for value in values:
e = excel(*value)
value_list.append(e)
return value_list
def write_cell(self, sheet_name, row, column, value=None, bold=True, color=BLACK):
if isinstance(row, int) and isinstance(column, int):
try:
cell_obj = self.__wb[sheet_name].cell(row, column)
cell_obj.font = Font(color=color, bold=bold)
cell_obj.value = value
self.__wb.save(self.filename)
except Exception as e:
raise e
else:
raise TypeError('row and column must be type int')
if __name__ == '__main__':
pe = ParseExcel('testdata.xlsx')
print(pe.get_all_value_2('division'))
print(pe.get_list_nametuple_all_value('division'))
column_row = pe.get_max_column_num('division')
print(' Maximum column number :', column_row)
max_row = pe.get_max_row_num('division')
print(' Maximum line number :', max_row)
cell_value_1 = pe.get_cell_value('division', row=2, column=3)
print(' No. 1 %d Row ,  No. 1 %d The data of the column is : %s' % (2, 3, cell_value_1))
cell_value_2 = pe.get_cell_value('division', coordinate='A5')
print('A5 The data of the cell is : {}'.format(cell_value_2))
value_row = pe.get_row_value('division', 3)
print(' No. 1 {} The data of the row is :{}'.format(3, value_row))
value_column = pe.get_column_value('division', 2)
print(' No. 1 {} The data of the column is :{}'.format(2, value_column))
values_1 = pe.get_all_value_1('division')
print(' No. 1 1 Get all the data in one way or another \n', values_1)
values_2 = pe.get_all_value_2('division')
print(' No. 1 2 Get all the data in one way or another \n', values_2)
title = pe.get_excel_title('division')
print(' Header is \n{}'.format(title))
dict_value = pe.get_listdict_all_value('division')
print(' List of nested dictionaries consisting of all data :\n', dict_value)
namedtuple_value = pe.get_list_nametuple_all_value('division')
print(' List of nested named tuples of all data :\n', namedtuple_value)
pe.write_cell('division', 1, 2, 'Tc_title')

xlrd

Install xlrd, this module only supports read operation. If you want to write, you need to use xlwt or xlutils with xlrd, but you can only write new excel files with xlwt, but you can't write original files, so you choose xlutils here

But another problem is that if xlutils is used, our excel file needs to be suffixed with. xls. An error will be reported because the xlsx suffix cannot be written (pro-test, because the formatting_info parameter is not compatible with the format of the new version of xlsx)

Note: When xlrd operates excel, both row and column numbers are calculated from 0

Encapsulation code


"""
------------------------------------
@Time : 2019/5/13 21:22
@Auth : linux Super 
@File : ParseExcel_xlrd.py
@IDE : PyCharm
@Motto: Real warriors,dare to face the bleak warning,dare to face the incisive error!
------------------------------------
"""
import xlrd
from xlutils import copy
from collections import namedtuple

class ParseExcel(object):
# xlrd  Analyse excel ,   Both row numbers and column numbers are derived from 0 Beginning 
def __init__(self, filename, sheet):
try:
self.filename = filename
self.sheet = sheet
self.wb = xlrd.open_workbook(self.filename, formatting_info=True)
if isinstance(sheet, str):
self.sheet = self.wb.sheet_by_name(sheet)
elif isinstance(sheet, int):
self.sheet = self.wb.sheet_by_index(sheet)
else:
raise TypeError('sheet must be int or str')
except Exception as e:
raise e

def get_max_row(self):
""" Gets the maximum line number of the form """
max_row_num = self.sheet.nrows
return max_row_num

def get_max_column(self):
""" Gets the maximum column number of the form """
min_row_num = self.sheet.ncols
return min_row_num

def get_cell_value(self, row, column):
""" Get the data of a cell """
if isinstance(row, int) and isinstance(column, int):
values = self.sheet.cell(row-1, column-1).value
return values
else:
raise TypeError('row and column must be type int')

def get_row_values(self, row):
""" Get a 1 Row data """
if isinstance(row, int):
values = self.sheet.row_values(row-1)
return values
else:
raise TypeError('row must be type int')

def get_column_values(self, column):
""" Get a 1 Column data """

if isinstance(column, int):
values = self.sheet.col_values(column-1)
return values
else:
raise TypeError('column must be type int')

def get_table_title(self):
""" Get header """
table_title = self.get_row_values(1)
return table_title

def get_all_values_dict(self):
""" Gets all the data, excluding the header, and returns 1 List of nested dictionaries """
max_row = self.get_max_row()
table_title = self.get_table_title()
value_list = []
for row in range(2, max_row):
values = self.get_row_values(row)
value_list.append(dict(zip(table_title, values)))
return value_list

def get_all_values_nametuple(self):
""" Gets all the data, excluding the header, and returns 1 List of nested named tuples """
table_title = self.get_table_title()
max_row = self.get_max_row()
excel = namedtuple('excel', table_title)
value_list = []
for row in range(2, max_row):
values = self.get_row_values(row)
e = excel(*values)
value_list.append(e)
return value_list

def write_value(self, sheet_index, row, column, value):
""" Write data to a cell """
if isinstance(row, int) and isinstance(column, int):
if isinstance(sheet_index, int):
wb = copy.copy(self.wb)
worksheet = wb.get_sheet(sheet_index)
worksheet.write(row-1, column-1, value)
wb.save(self.filename)
else:
raise TypeError('{} must be int'.format(sheet_index))
else:
raise TypeError('{} and {} must be int'.format(row, column))

if __name__ == '__main__':
pe = ParseExcel('testdata.xls', 'testcase')
print(' Maximum line number :', pe.get_max_row())
print(' Maximum column number :', pe.get_max_column())
print(' No. 1 2 Line number 3 Column data :', pe.get_cell_value(2, 3))
print(' No. 1 2 Row data ', pe.get_row_values(2))
print(' No. 1 3 Column data ', pe.get_column_values(3))
print(' Header :', pe.get_table_title())
print(' All data is returned to the list of nested dictionaries :', pe.get_all_values_dict())
print(' All data returns a list of nested named tuples :', pe.get_all_values_nametuple())
 　　  pe.write_value(0, 1, 3, 'test')

pandas

pandas is a data analysis library, always feel in the automated test using pandas analysis excel file read data a bit overqualified, no matter how, or pandas analysis excel file write 1

I only encapsulate reading here. If I write, I have a little problem. Let's change it later and add the code.

Please install pandas install pandas first

Encapsulation code


"""
------------------------------------
@Time : 2019/5/13 14:00
@Auth : linux Super 
@File : ParseExcel_pandas.py
@IDE : PyCharm
@Motto: Real warriors,dare to face the bleak warning,dare to face the incisive error!
------------------------------------
"""
import pandas as pd


class ParseExcel(object):
def __init__(self, filename, sheet_name=None):
try:
self.filename = filename
self.sheet_name = sheet_name
self.df = pd.read_excel(self.filename, self.sheet_name)
except Exception as e:
raise e

def get_row_num(self):
""" Gets a list of line numbers ,  From 0 Beginning """
row_num_list = self.df.index.values
return row_num_list

def get_cell_value(self, row, column):
""" Get a 1 Cells of data """
try:
if isinstance(row, int) and isinstance(column, int):
cell_value = self.df.ix[row-2, column-1] # ix The row parameter of is based on the valid data row and is derived from the 0 Begin 
return cell_value
else:
raise TypeError('row and column must be type int')
except Exception as e:
raise e

def get_table_title(self):
""" Gets the table header,   Return list """
table_title = self.df.columns.values
return table_title

def get_row_value(self, row):
""" Get a 1 Row data,   Line number from 1 Begin """
try:
if isinstance(row, int):
row_data = self.df.ix[row-2].values
return row_data
else:
raise TypeError('row must be type int')
except Exception as e:
raise e

def get_column_value(self, col_name):
""" Get a 1 Column data """
try:
if isinstance(col_name, str):
col_data = self.df[col_name].values
return col_data
else:
raise TypeError('col_name must be type str')
except Exception as e:
raise e
def get_all_value(self):
""" Get all the data, excluding the header ,  Returns a list of nested dictionaries """
rows_num = self.get_row_num()
table_title = self.get_table_title()
values_list = []
for i in rows_num:
row_data = self.df.ix[i, table_title].to_dict()
values_list.append(row_data)
return values_list
if __name__ == '__main__':
pe = ParseExcel('testdata.xlsx', 'testcase')
print(pe.get_row_num())
print(pe.get_table_title())
print(pe.get_all_value())
print(pe.get_row_value(2))
print(pe.get_cell_value(2, 3))
print(pe.get_column_value('Tc_title'))

Summarize

Three methods and four libraries xlrd, openpyxl, xlwt and pandas are used to operate excel files. Personal feeling is that openpyxl is more suitable for automation. Of course, different people have different choices, and it is not very different to use it.

The above three methods can be used directly, and there is no need to package them!