Python data analysis: counting real-IP requests with Pandas, in detail
- 2020-05-17 05:42:57
- OfStack
preface
pandas is a data analysis package with more advanced data structures and tools based on Numpy. The core of Numpy is ndarray, and pandas is developed around the core data structures Series and DataFrame. Series and DataFrame correspond to a 1-dimensional sequence and a 2-dimensional table structure, respectively. The default import method of pandas is as follows:
from pandas import Series,DataFrame
import pandas as pd
1.1. Pandas analysis steps
1. Load the log data
2. Load area_ip data
3. Count the number of requests per real_ip and look up the address of each IP. The equivalent SQL is:
-- Top 100 real_ip values by request count, with the address of the
-- area_ip range that contains each IP.
SELECT inet_aton(l.real_ip),
       count(*),
       a.addr
FROM log AS l
INNER JOIN area_ip AS a
    ON a.start_ip_num <= inet_aton(l.real_ip)
   AND a.end_ip_num >= inet_aton(l.real_ip)
GROUP BY real_ip
-- DESC so that LIMIT keeps the most-requested IPs, not the least-requested.
ORDER BY count(*) DESC
LIMIT 0, 100;
1.2. Code
cat pd_ng_log_stat.py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from ng_line_parser import NgLineParser
import pandas as pd
import socket
import struct
class PDNgLogStat(object):
    """Count access-log requests per real client IP using pandas.

    Typical usage: call ``load_data`` with the access-log paths, call
    ``load_ip_addr`` with the IP-range table, then call
    ``uv_real_ip_addr`` to get the most frequent IPs with their addresses.
    """

    def __init__(self):
        self.ng_line_parser = NgLineParser()
        # Populated by load_data() and load_ip_addr() respectively.
        self.df = None
        self.ip_addr_df = None

    def _log_line_iter(self, pathes):
        """Yield one parsed record for every line of every file in ``pathes``.

        Each line is passed to ``self.ng_line_parser.parse`` and the value
        of ``self.ng_line_parser.to_dict()`` is yielded for it.
        """
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def _ip2num(self, ip):
        """Convert a dotted IPv4 string to its unsigned integer value.

        Returns -1 when ``ip`` cannot be parsed as an IPv4 address
        (for example the ``-`` placeholder found in some log lines).
        """
        try:
            packed = socket.inet_aton(str(ip))
        except (socket.error, TypeError, ValueError):
            # Only parse failures map to -1; anything else (for example
            # KeyboardInterrupt) is allowed to propagate.
            return -1
        # "!I" reads the four bytes as a network-order (big-endian)
        # unsigned int, so no separate byte swap is needed.
        return struct.unpack("!I", packed)[0]

    def _get_addr_by_ip(self, ip):
        """Return the ``addr`` of the first IP range containing ``ip``.

        Returns None when ``ip`` is not a valid IPv4 address, when the IP
        table has not been loaded or cannot be queried, or when no range
        matches.
        """
        ip_num = self._ip2num(ip)
        if ip_num < 0:
            return None

        ranges = getattr(self, 'ip_addr_df', None)
        if ranges is None:
            return None

        try:
            in_range = ((ranges.ip_start_num <= ip_num) &
                        (ip_num <= ranges.ip_end_num))
            matches = ranges[in_range]['addr']
        except (AttributeError, KeyError, TypeError):
            # Table lacks the expected columns or they are not comparable
            # with an integer; keep the lookup best-effort.
            return None

        if matches.empty:
            return None
        return matches.iloc[0]

    def load_data(self, path):
        """Build ``self.df`` from the parsed lines of the given log files.

        ``path`` is an iterable of file paths; a single path string is
        also accepted and treated as a one-element list.
        """
        if isinstance(path, str):
            path = [path]
        self.df = pd.DataFrame(self._log_line_iter(path))

    def uv_real_ip(self, top=100):
        """Return request counts for the ``top`` most frequent ``real_ip`` values.

        The result is a DataFrame indexed by ``real_ip`` with a single
        ``count`` column, ordered from the largest count down.
        """
        counts = self.df.groupby('real_ip').size().to_frame('count')
        return counts.nlargest(top, 'count')

    def uv_real_ip_addr(self, top=100):
        """Return the ``uv_real_ip`` result with an extra ``addr`` column.

        ``addr`` holds the value from ``_get_addr_by_ip`` for each IP,
        which is None when no address could be determined.
        """
        cnt_df = self.uv_real_ip(top)
        # Append the address column after the existing columns.
        cnt_df.insert(len(cnt_df.columns),
                      'addr',
                      [self._get_addr_by_ip(ip) for ip in cnt_df.index])
        return cnt_df

    def load_ip_addr(self, path):
        """Load the tab-separated IP-range table into ``self.ip_addr_df``.

        The file is read without a header row; columns are named ``id``,
        ``ip_start_num``, ``ip_end_num``, ``ip_start``, ``ip_end``,
        ``addr`` and ``operator``, with ``id`` used as the index.
        Returns the loaded DataFrame.
        """
        cols = ['id', 'ip_start_num', 'ip_end_num',
                'ip_start', 'ip_end', 'addr', 'operator']
        self.ip_addr_df = pd.read_csv(path, sep='\t', names=cols,
                                      index_col='id')
        return self.ip_addr_df
def main():
    """Load the access log and IP table, then print the top real-IP counts."""
    file_pathes = ['www.ttmark.com.access.log']
    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)

    # Load the IP-range table used to look up an address for each IP.
    area_ip_path = 'area_ip.csv'
    pd_ng_log_stat.load_ip_addr(area_ip_path)

    # Print request counts and addresses for the most frequent real IPs.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(pd_ng_log_stat.uv_real_ip_addr())


if __name__ == '__main__':
    main()
Run statistics and output results
python pd_ng_log_stat.py
count addr
real_ip
60.191.123.80 101013 Hangzhou city, zhejiang province
- 32691 None
218.30.118.79 22523 The Beijing municipal
......
136.243.152.18 889 Germany
157.55.39.219 889 The United States
66.249.65.170 888 The United States
[100 rows x 2 columns]
Conclusion
That is the whole content of this article. I hope it is of some help to your study or work; if you have any questions, you can leave a message to discuss them.