Python data analysis: counting real-IP requests with Pandas, in detail

  • 2020-05-17 05:42:57
  • OfStack

preface

pandas is a data analysis package built on top of NumPy that provides higher-level data structures and tools. The core of NumPy is the ndarray, while pandas is built around two core data structures, Series and DataFrame. A Series is a one-dimensional sequence and a DataFrame is a two-dimensional table. The conventional way to import pandas is as follows:


from pandas import Series,DataFrame
import pandas as pd

1.1. Pandas analysis steps

1. Load the log data

2. Load area_ip data

3. Count the number of requests per real_ip. The equivalent SQL is:


-- Request count per client IP, joined to the area_ip range that contains the
-- IP, limited to the 100 most frequent IPs.
SELECT inet_aton(l.real_ip),
  count(*),
  a.addr
FROM log AS l
INNER JOIN area_ip AS a
  ON a.start_ip_num <= inet_aton(l.real_ip)
  AND a.end_ip_num >= inet_aton(l.real_ip)
GROUP BY real_ip
-- DESC so that LIMIT keeps the most requested IPs rather than the rarest ones.
ORDER BY count(*) DESC
LIMIT 0, 100;

1.2. Code


cat pd_ng_log_stat.py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
 
from ng_line_parser import NgLineParser
 
import pandas as pd
import socket
import struct
 
class PDNgLogStat(object):
  """Per-IP request statistics for access logs, computed with pandas.

  Typical use: call ``load_data`` with the log file path(s), call
  ``load_ip_addr`` with the IP-range table, then call ``uv_real_ip_addr``
  to get the most frequent ``real_ip`` values together with their address.
  """

  def __init__(self):
    # A single parser instance is reused for every log line.
    self.ng_line_parser = NgLineParser()

  def _log_line_iter(self, pathes):
    """Yield one parsed dict per line of every file in ``pathes``.

    ``pathes`` is an iterable of file paths. A single path given as a plain
    string is also accepted; without this check the string would be
    iterated character by character and each character opened as a file.
    """
    if isinstance(pathes, str):
      pathes = [pathes]
    for path in pathes:
      with open(path, 'r') as f:
        for line in f:
          self.ng_line_parser.parse(line)
          yield self.ng_line_parser.to_dict()

  def _ip2num(self, ip):
    """Convert a dotted IPv4 string to its unsigned integer value.

    Returns -1 when ``ip`` cannot be parsed as an IPv4 address (for example
    the ``-`` placeholder that appears in the log for a missing IP).
    """
    try:
      packed = socket.inet_aton(str(ip))
      # '!I' reads the four bytes as a network-order (big-endian) unsigned
      # int, which is the numeric value of the address.
      return struct.unpack('!I', packed)[0]
    except (socket.error, struct.error, TypeError, ValueError):
      # Only parsing failures map to the sentinel; KeyboardInterrupt and
      # SystemExit are no longer swallowed.
      return -1

  def _get_addr_by_ip(self, ip):
    """Return the ``addr`` of the first loaded IP range that contains ``ip``.

    Returns None when ``ip`` is not a valid IPv4 address, when no range
    table has been loaded with ``load_ip_addr``, when the table cannot be
    compared against the IP number, or when no range matches.
    """
    ip_num = self._ip2num(ip)
    if ip_num < 0:
      return None

    ranges = getattr(self, 'ip_addr_df', None)
    if ranges is None:
      return None

    try:
      hits = ranges[(ranges.ip_start_num <= ip_num) &
                    (ip_num <= ranges.ip_end_num)]
      if hits.empty:
        return None
      return hits['addr'].iloc[0]
    except (AttributeError, KeyError, TypeError):
      # Missing columns or non-numeric range columns: keep the lookup
      # best-effort, as callers expect None rather than an exception.
      return None

  def load_data(self, path):
    """Parse the log file(s) given by ``path`` into ``self.df``.

    ``path`` is forwarded to ``_log_line_iter``: an iterable of file paths,
    or a single path string.
    """
    self.df = pd.DataFrame(self._log_line_iter(path))

  def uv_real_ip(self, top = 100):
    """Return the ``top`` most frequent ``real_ip`` values.

    The result is a DataFrame indexed by ``real_ip`` with a single
    ``count`` column, sorted by ``count`` in descending order.
    """
    per_ip = self.df.groupby('real_ip')['real_ip'].count()
    return per_ip.to_frame('count').nlargest(top, 'count')

  def uv_real_ip_addr(self, top = 100):
    """Return ``uv_real_ip(top)`` with an extra ``addr`` column.

    ``addr`` is the result of ``_get_addr_by_ip`` for each ``real_ip`` and
    is missing (None) for IPs that cannot be resolved.
    """
    cnt_df = self.uv_real_ip(top)

    # Append the address column after the existing columns.
    cnt_df['addr'] = [self._get_addr_by_ip(ip) for ip in cnt_df.index]
    return cnt_df

  def load_ip_addr(self, path):
    """Load the IP-range table from a tab-separated file without a header.

    The columns are, in order: id, ip_start_num, ip_end_num, ip_start,
    ip_end, addr, operator. The table is indexed by ``id``, stored on
    ``self.ip_addr_df`` and returned.
    """
    cols = ['id', 'ip_start_num', 'ip_end_num',
        'ip_start', 'ip_end', 'addr', 'operator']
    self.ip_addr_df = pd.read_csv(path, sep='\t', names=cols, index_col='id')
    return self.ip_addr_df
 
def main():
  """Load the access log and the IP-range table, then print the top real IPs.

  The file names are fixed: the log is read from
  'www.ttmark.com.access.log' and the IP ranges from 'area_ip.csv', both
  relative to the current working directory.
  """
  file_pathes = ['www.ttmark.com.access.log']

  pd_ng_log_stat = PDNgLogStat()
  pd_ng_log_stat.load_data(file_pathes)

  # Load the IP-range -> address table.
  area_ip_path = 'area_ip.csv'
  pd_ng_log_stat.load_ip_addr(area_ip_path)

  # Request count and address of the most frequent real IPs.
  # print(...) with a single argument behaves the same on Python 2 and 3,
  # whereas the bare print statement is a SyntaxError on Python 3.
  print(pd_ng_log_stat.uv_real_ip_addr())

if __name__ == '__main__':
  main()

Run statistics and output results


python pd_ng_log_stat.py
 
         count  addr
real_ip            
60.191.123.80  101013  Hangzhou city, zhejiang province 
-        32691  None
218.30.118.79  22523    The Beijing municipal 
......
136.243.152.18   889    Germany 
157.55.39.219   889    The United States 
66.249.65.170   888    The United States 
 
[100 rows x 2 columns]

conclusion

That is all for this article. I hope it is of some help in your study or work; if you have any questions, feel free to leave a comment.


Related articles: