Python implementation: log in to Zhihu, fetch your personal collection, and save it as a Word file
- 2020-04-02 14:42:01
- OfStack
This program was actually finished quite a while ago but never published; I recently found some free time, so I'm sharing it now.
It is implemented with the BeautifulSoup and urllib2 modules, and the result is saved as a Word document with the python-docx module; installation instructions for all of these are easy to find online, so I won't repeat them here.
Its main function is to log in to Zhihu and save your personal collection of questions and answers as a Word document, so you can consult it when you have no network access.
The regular-expression handling in here is crude — honestly not something to imitate; I'm a bit ashamed of it myself.
Also, because saving works per question, every answer to each collected question is saved. When I have time I may change it to keep only the top answer, or only the answers actually collected on the favorites page — otherwise, if you have many favorites, the size of the resulting Word file may surprise you.
Logging in may require a verification code; if prompted, look for the captcha image saved in the program's folder and type in the characters it shows.
# -*- coding: utf-8 -*-
# Log in zhihu to grab personal collection And save it as word
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import urllib
import urllib2
import cookielib
import string
import re
from bs4 import BeautifulSoup
from docx import Document
from docx import *
from docx.shared import Inches
from sys import exit
import os
# Optional SOCKS proxy setup (needed on work networks that require one).
#import socks
#import socket
#socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5,"127.0.0.1",8088)
#socket.socket =socks.socksocket

loginurl = 'http://www.zhihu.com/login'

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36',}

# Login form fields; email/password (and captcha, if needed) are filled
# in at runtime in __main__.
postdata = {
    '_xsrf': 'acab9d276ea217226d9cc94a84a231f7',
    'email': '',
    'password': '',
    'rememberme':'y'
}

# Prepare a clean workspace: an image folder, and no stale outputs from
# a previous run.
if not os.path.exists('myimg'):
    os.mkdir('myimg')
if os.path.exists('123.docx'):
    os.remove('123.docx')
if os.path.exists('checkcode.gif'):
    os.remove('checkcode.gif')

mydoc = Document()
questiontitle = ''
#----------------------------------------------------------------------
def dealimg(imgcontent):
    """Download every <img> in the HTML fragment into the word document,
    then strip markup and return the remaining plain-ish text.

    imgcontent -- HTML fragment (str) of one answer body.
    Returns the fragment with images, anchors and common tags removed.
    """
    soup = BeautifulSoup(imgcontent)
    try:
        for imglink in soup.findAll('img'):
            if imglink is not None:
                myimg = imglink.get('src')
                # Only fetch absolute http(s) URLs.
                if myimg.find('http') >= 0:
                    imgsrc = urllib2.urlopen(myimg).read()
                    # File name = the URL's last path component.
                    imgnamere = re.compile(r'http\S*/')
                    imgname = imgnamere.sub('', myimg)
                    with open(u'myimg' + '/' + imgname, 'wb') as code:
                        code.write(imgsrc)
                    mydoc.add_picture(u'myimg/' + imgname, width=Inches(1.25))
    except Exception:
        # Best effort: a failed image download must not abort the export.
        pass
    # Drop the <noscript> duplicates of images.
    strinfo = re.compile(r'<noscript>[\s\S]*</noscript>')
    imgcontent = strinfo.sub('', imgcontent)
    strinfo = re.compile(r'<img class[\s\S]*</>')
    imgcontent = strinfo.sub('', imgcontent)
    # Drop the "show all" expander link.
    strinfo = re.compile(r'<a class="toggle-expand[\s\S]*</a>')
    imgcontent = strinfo.sub('', imgcontent)
    # Drop external-link anchors while keeping their inner text.
    strinfo = re.compile(r'<a class=" wrap external"[\s\S]*rel="nofollow noreferrer" target="_blank">')
    imgcontent = strinfo.sub('', imgcontent)
    imgcontent = imgcontent.replace('<i class="icon-external"></i></a>', '')
    imgcontent = imgcontent.replace('</b>', '').replace('</p>', '').replace('<p>', '').replace('<br>', '')
    return imgcontent
def enterquestionpage(pageurl):
    """Fetch one question page and append its title plus every answer
    (text and images) to the global word document.

    pageurl -- absolute URL of a zhihu question page.
    """
    html = urllib2.urlopen(pageurl).read()
    soup = BeautifulSoup(html)
    questiontitle = soup.title.string
    mydoc.add_heading(questiontitle, level=3)
    # Each answer body lives in a "fixed-summary" div.
    for div in soup.findAll('div', {'class': 'fixed-summary zm-editable-content clearfix'}):
        conent = str(div).replace('<div class="fixed-summary zm-editable-content clearfix">', '').replace('</div>', '')
        conent = conent.decode('utf-8')
        conent = conent.replace('<br/>', '\n')
        conent = dealimg(conent)
        # TODO: this tag stripping is too crude -- replace with a proper
        # HTML-to-text module when time allows.
        conent = conent.replace('<div class="fixed-summary-mask">', '').replace('<blockquote>', '').replace('<b>', '').replace('<strong>', '').replace('</strong>', '').replace('<em>', '').replace('</em>', '').replace('</blockquote>', '')
        mydoc.add_paragraph(conent, style='BodyText3')
def entercollectpage(pageurl):
html=urllib2.urlopen(pageurl).read()
soup=BeautifulSoup(html)
for div in soup.findAll('div',{'class':'zm-item'}):
h2content=div.find('h2',{'class':'zm-item-title'})
#print h2content
if h2content is not None:
link=h2content.find('a')
mylink=link.get('href')
quectionlink='http://www.zhihu.com'+mylink
enterquestionpage(quectionlink)
print quectionlink
def loginzhihu():
postdatastr=urllib.urlencode(postdata)
'''
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support,urllib2.HTTPHandler)
urllib2.install_opener(opener)
'''
h = urllib2.urlopen(loginurl)
request = urllib2.Request(loginurl,postdatastr,headers)
request.get_origin_req_host
response = urllib2.urlopen(request)
#print response.geturl()
text = response.read()
collecturl='http://www.zhihu.com/collections'
req=urllib2.urlopen(collecturl)
if str(req.geturl())=='http://www.zhihu.com/?next=%2Fcollections':
print 'login fail!'
return
txt=req.read()
soup=BeautifulSoup(txt)
count=0
divs =soup.findAll('div',{'class':'zm-item'})
if divs is None:
print 'login fail!'
return
print 'login ok!n'
for div in divs:
link=div.find('a')
mylink=link.get('href')
collectlink='http://www.zhihu.com'+mylink
entercollectpage(collectlink)
print collectlink
# This is where we did the test , Value gets a collection
#count+=1
#if count==1:
# return
def getcheckcode(thehtml):
    """Detect a captcha on the login page.

    If the page contains a captcha image, download it to 'checkcode.gif'
    next to the script and return True; otherwise return False.

    thehtml -- raw HTML (str) of the login response.
    """
    soup = BeautifulSoup(thehtml)
    div = soup.find('div', {'class': 'js-captcha captcha-wrap'})
    if div is not None:
        imgsrc = div.find('img')
        imglink = imgsrc.get('src')
        if imglink is not None:
            # The src attribute is site-relative; make it absolute.
            imglink = 'http://www.zhihu.com' + imglink
            imgcontent = urllib2.urlopen(imglink).read()
            with open('checkcode.gif', 'wb') as code:
                code.write(imgcontent)
            return True
        else:
            return False
    return False
if __name__=='__main__':
import getpass
username=raw_input('input username:')
password=getpass.getpass('Enter password: ')
postdata['email']=username
postdata['password']=password
postdatastr=urllib.urlencode(postdata)
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support,urllib2.HTTPHandler)
urllib2.install_opener(opener)
h = urllib2.urlopen(loginurl)
request = urllib2.Request(loginurl,postdatastr,headers)
response = urllib2.urlopen(request)
txt = response.read()
if getcheckcode(txt):
checkcode=raw_input('input checkcode:')
postdata['captcha']=checkcode
loginzhihu()
mydoc.save('123.docx')
else:
loginzhihu()
mydoc.save('123.docx')
print 'the end'
raw_input()
OK, that's about it. If you have any good suggestions or comments, I will reply as soon as possible.