0%

人民日报下载器

自动下载最新一期人民日报并合并到单个PDF文件

source code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import requests
import os
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger  # NOTE: legacy PyPDF2 API; may need generic.py line 587 error suppression per original author

# People's Daily (Renmin Ribao) downloader.
# Fetches every page of the latest issue as an individual PDF, merges them
# into one file named "人民日报<date>.pdf", then deletes the per-page files.
# Windows-only console calls: "title" sets the window title, "pause" waits
# for a keypress before the console closes.
os.system("title 人民日报下载器")

# Mobile browser User-Agent — the paper site serves a layout these CSS
# selectors were written against.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Mobile Safari/537.36 Edg/92.0.902.55'}

# The index page redirects to the current issue through a
# <meta http-equiv="REFRESH" content="0; URL=..."> tag; skipping the first
# 7 characters ("0; URL=") leaves the relative path of today's front page.
indexurl = "http://paper.people.com.cn/rmrb/paperindex.htm"
indexr = requests.get(indexurl, headers=headers, timeout=30)
indexb = BeautifulSoup(indexr.content, "lxml")
url = "http://paper.people.com.cn/rmrb/" + str(indexb.select('meta[http-equiv="REFRESH"]')[0]["content"])[7:]

# Front page of the current issue: carries the per-page links, the PDF
# download button, and the issue date.
r = requests.get(url, headers=headers, timeout=30)
b = BeautifulSoup(r.content, "lxml")

# Template PDF URL taken from page 01's download button markup; the
# zero-padded page number occurs twice in it (directory and file name) and
# is patched per page below with fixed-width slices. The [17:-11] slice
# strips the surrounding <a href="..."> tag text — fragile, tied to the
# site's exact markup.
pdfpath = "http://paper.people.com.cn/rmrb" + str(b.select("body > div.main.w1000 > div.left.paper-box > div.paper-bot > p.right.btn > a")[0])[17:-11]

namelist = b.select("#pageLink")  # one anchor per page of the issue (same elements as a[id="pageLink"])
pagelist = len(namelist)          # total page count
# Issue date string (e.g. "2021年08月01日") sliced out of the date box text.
date = str(b.select("body > div.main.w1000 > div.right.right-main > div.date-box > p.date.left")[0].get_text())[-16:-5]

merge = PdfFileMerger(strict=False)  # strict=False: tolerate slightly malformed PDFs
print("最新一期报纸是:" + date + "\n本期共有" + str(pagelist) + "版\n开始下载...")

for i, link in enumerate(namelist):
    # Splice the zero-padded page number into both slots of the template URL.
    page_no = str(i + 1).rjust(2, "0")
    path = pdfpath[:-21] + page_no + pdfpath[-19:-6] + page_no + pdfpath[-4:]
    pagename = str(link.get_text())
    print(pagename)
    # headers/timeout added for consistency with the other requests above
    # (the original omitted them here).
    d = requests.get(path, headers=headers, timeout=60)
    with open(pagename + ".pdf", "wb") as code:
        code.write(d.content)
    merge.append(pagename + ".pdf")

print("下载完毕 正在合并")
try:
    with open("人民日报" + date + ".pdf", 'wb') as out:
        merge.write(out)
except Exception as exc:
    # Was a silent bare except that still reported success; now the failure
    # is surfaced and the success message is printed only on success.
    print("合并失败: " + str(exc))
else:
    print("最新一期人民日报已经保存到:人民日报" + date + ".pdf")
finally:
    merge.close()
    # Remove the per-page temporary PDFs regardless of merge outcome.
    for link in namelist:
        os.remove(str(link.get_text()) + ".pdf")

os.system("pause")