Two ways to fake HTTP headers in Python using urllib2
- 2020-04-02 13:47:53
- OfStack
When scraping web pages, it is often necessary to forge request headers so that the scraping script runs without being blocked.
Below, we use the header-related features of urllib2 to fake the request headers while collecting information.
Method 1.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:urllib2-header.py
import urllib2
import sys
# Crawling web content - Send the header -1
url= "//www.jb51.net"
send_headers = {
'Host':'www.jb51.net',
'User-Agent':'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Connection':'keep-alive'
}
req = urllib2.Request(url,headers=send_headers)
r = urllib2.urlopen(req)
html = r.read() # Back to web content
receive_header = r.info() # The header information returned
# sys.getfilesystemencoding()
html = html.decode('utf-8','replace').encode(sys.getfilesystemencoding()) # transcoding : Avoid garbled output
print receive_header
# print '####################################'
print html
Method 2.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:urllib2-header.py
import urllib2
import sys
url = '//www.jb51.net'
req = urllib2.Request(url)
req.add_header('Referer','//www.jb51.net/')
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
r = urllib2.urlopen(req)
html = r.read()
receive_header = r.info()
html = html.decode('utf-8').encode(sys.getfilesystemencoding())
print receive_header
print '#####################################'
print html