"""Download the latest issue of People's Daily (人民日报) as a single merged PDF.

Scrapes http://paper.people.com.cn/rmrb/ for today's issue, downloads each
page's PDF, merges them with PyPDF2, then removes the per-page files.
Intended to be run as a Windows console script (uses ``title`` / ``pause``).
"""

import os

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger

# Entry page of the e-paper; it meta-refreshes to the current issue.
INDEX_URL = "http://paper.people.com.cn/rmrb/paperindex.htm"
# Site root used to absolutize the relative links scraped from the issue page.
SITE_ROOT = "http://paper.people.com.cn/rmrb"

# Mobile UA — the site serves the layout these CSS selectors expect for it.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Mobile Safari/537.36 Edg/92.0.902.55'
}


def _fetch_soup(url):
    """GET *url* with the shared headers and return a lxml-parsed soup.

    Raises ``requests.HTTPError`` on a non-2xx response instead of silently
    parsing an error page.
    """
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    return BeautifulSoup(resp.content, "lxml")


def _page_pdf_url(first_page_url, page_num):
    """Build the PDF URL for *page_num* from the page-01 PDF URL.

    *first_page_url* ends with the fixed-width tail ``NN/rmrbYYYYMMDDNN.pdf``
    (21 characters), where both ``NN`` fields are the zero-padded page number:
    ``[:-21]`` is the prefix, ``[-19:-6]`` is ``/rmrbYYYYMMDD``, ``[-4:]`` is
    ``.pdf``.
    """
    no = str(page_num).rjust(2, "0")
    return (first_page_url[:-21] + no
            + first_page_url[-19:-6] + no
            + first_page_url[-4:])


def main():
    """Download, merge and save today's issue; clean up per-page PDFs."""
    os.system("title 人民日报下载器")

    # The index page points at today's issue through a meta-refresh tag whose
    # content is "0; URL=<relative path>"; [7:] strips the "0; URL=" prefix.
    index = _fetch_soup(INDEX_URL)
    refresh = str(index.select('meta[http-equiv="REFRESH"]')[0]["content"])
    issue = _fetch_soup(SITE_ROOT + "/" + refresh[7:])

    # Anchor to the page-01 PDF. NOTE(review): the [17:-11] slice cuts the
    # href out of the serialized <a ...> tag and assumes a fixed attribute
    # layout — fragile, but kept to match the site's current markup.
    pdf_anchor = str(issue.select(
        "body > div.main.w1000 > div.left.paper-box > div.paper-bot > p.right.btn > a")[0])
    first_pdf = SITE_ROOT + pdf_anchor[17:-11]

    # One link element per page of the issue; its text is the page name.
    pages = issue.select("#pageLink")
    page_count = len(pages)

    # The date box text ends with "... YYYY年MM月DD日 星期X"; [-16:-5] keeps
    # the 11-character date portion.
    date = str(issue.select(
        "body > div.main.w1000 > div.right.right-main > div.date-box > p.date.left"
    )[0].get_text())[-16:-5]

    print("最新一期报纸是:" + date + "\n本期共有" + str(page_count) + "版\n开始下载...")

    merger = PdfFileMerger(strict=False)
    for page_no, link in enumerate(pages, start=1):
        name = str(link.get_text())
        print(name)
        # Use the same headers as every other request, and bound the wait.
        resp = requests.get(_page_pdf_url(first_pdf, page_no),
                            headers=HEADERS, timeout=60)
        resp.raise_for_status()
        part = name + ".pdf"
        with open(part, "wb") as fh:
            fh.write(resp.content)
        merger.append(part)

    print("下载完毕 正在合并")
    out_name = "人民日报" + date + ".pdf"
    try:
        with open(out_name, 'wb') as out:
            merger.write(out)
    except Exception as exc:
        # Best-effort like the original script, but report the failure
        # instead of swallowing it with a bare except.
        print("合并失败: " + str(exc))
    merger.close()

    # Delete the per-page files only after the merger has released them.
    for link in pages:
        os.remove(str(link.get_text()) + ".pdf")

    print("最新一期人民日报已经保存到:" + out_name)
    os.system("pause")


if __name__ == "__main__":
    main()