A Look at Multiprocess Development Through a Web Crawler
Introduction
I need news material as reference for English applied-writing practice, but the only well-organized source of such information I could think of was the newspaper itself, so I decided to download the PDF edition of the People's Daily (人民日报).
References
https://www.liaoxuefeng.com/wiki/1016959663602400/1017628290184064
https://blog.csdn.net/qq_38161040/article/details/88366427
https://blog.csdn.net/baidu_28479651/article/details/76158051?utm_source=blogxgwz7
Code: version 1
About 70% manual and 30% automatic: you still have to create the folders yourself and keep editing the download count by hand.
# coding = UTF-8
# Crawl the PDF documents linked from a hand-written HTML page: file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
import urllib.request
import re
import os

# open the url and read it
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    page.close()
    return html

# compile the regular expression and find all the stuff we need
def getUrl(html):
    reg = r'([A-Z]\d+)'  # matches IDs such as G176200001
    url_re = re.compile(reg)
    url_lst = url_re.findall(html.decode('UTF-8'))  # return the list of matches
    return url_lst

def getFile(url):
    file_name = url.split('/')[-1]
    u = urllib.request.urlopen(url)
    f = open(file_name, 'wb')
    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        f.write(buffer)
    f.close()
    print("Successfully downloaded " + file_name)

if __name__ == '__main__':
    # Page URLs follow this pattern:
    # http://paper.people.com.cn/rmrb/page/2020-03/26/02/rmrb2020032602.pdf
    # http://paper.people.com.cn/rmrb/page/2020-03/26/03/rmrb2020032603.pdf
    for i in range(20):
        if i + 1 < 10:
            getFile("http://paper.people.com.cn/rmrb/page/2020-03/07/0" + str(i + 1) + "/rmrb202003070" + str(i + 1) + ".pdf")
        else:
            getFile("http://paper.people.com.cn/rmrb/page/2020-03/07/" + str(i + 1) + "/rmrb20200307" + str(i + 1) + ".pdf")
Code: version 2, with automatic folder creation
Downloads still run one after another, so it is slow and you have to wait.
# coding = UTF-8
# Crawl the PDF documents linked from a hand-written HTML page: file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
import urllib.request
import re
import os
import shutil

# open the url and read it
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    page.close()
    return html

# compile the regular expression and find all the stuff we need
def getUrl(html):
    reg = r'([A-Z]\d+)'  # matches IDs such as G176200001
    url_re = re.compile(reg)
    url_lst = url_re.findall(html.decode('UTF-8'))  # return the list of matches
    return url_lst

def getFile(url):
    file_name = url.split('/')[-1]
    u = urllib.request.urlopen(url)
    f = open(file_name, 'wb')
    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        f.write(buffer)
    f.close()
    print("Successfully downloaded " + file_name)
    return file_name

if __name__ == '__main__':
    for i in range(29):  # the 29 days of February 2020
        data = str(i + 1)
        if i + 1 < 10:
            data = "0" + data
        folderName = "02" + data  # one folder per day, e.g. "0201" ... "0229"
        os.mkdir(folderName)
        for j in range(20):  # try up to 20 pages per issue
            try:
                if j + 1 < 10:
                    fileName = "http://paper.people.com.cn/rmrb/page/2020-02/" + data + "/0" + str(j + 1) + "/rmrb202002" + data + "0" + str(j + 1) + ".pdf"
                else:
                    fileName = "http://paper.people.com.cn/rmrb/page/2020-02/" + data + "/" + str(j + 1) + "/rmrb202002" + data + str(j + 1) + ".pdf"
                tmp = getFile(fileName)
                shutil.move(tmp, folderName)
            except OSError:
                continue  # missing pages raise an OSError; skip them
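Why the bare except OSError works: most issues have fewer than 20 pages, and for the missing page numbers urlopen raises urllib.error.HTTPError, which is a subclass of OSError, so the handler simply skips them and moves on to the next page. A small sketch of that mechanism, using a hypothetical helper that is not in the original code:

import urllib.error
import urllib.request

def page_exists(url):
    # urlopen raises HTTPError (an OSError subclass) for a missing page,
    # which is exactly what the bare "except OSError" above relies on.
    try:
        urllib.request.urlopen(url).close()
        return True
    except urllib.error.HTTPError:
        return False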
Code: multiprocess download
Much more satisfying: a multiprocessing.Pool gives each day of the month its own worker process, so the issues download in parallel.
# coding = UTF-8
# Crawl the PDF documents linked from a hand-written HTML page: file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
import urllib.request
import re
import os
import shutil
from multiprocessing import Pool
import time

# open the url and read it
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    page.close()
    return html

# compile the regular expression and find all the stuff we need
def getUrl(html):
    reg = r'([A-Z]\d+)'  # matches IDs such as G176200001
    url_re = re.compile(reg)
    url_lst = url_re.findall(html.decode('UTF-8'))  # return the list of matches
    return url_lst

def getFile(url):
    file_name = url.split('/')[-1]
    u = urllib.request.urlopen(url)
    f = open(file_name, 'wb')
    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        f.write(buffer)
    f.close()
    print("Successfully downloaded " + file_name)
    return file_name

# download every page of one day's issue; i = 0..30 maps to January 1..31
def download(i):
    data = str(i + 1)
    if i + 1 < 10:
        data = "0" + data
    folderName = "01" + data  # one folder per day, e.g. "0101" ... "0131"
    os.mkdir(folderName)
    for j in range(20):  # try up to 20 pages per issue
        try:
            if j + 1 < 10:
                fileName = "http://paper.people.com.cn/rmrb/page/2020-01/" + data + "/0" + str(j + 1) + "/rmrb202001" + data + "0" + str(j + 1) + ".pdf"
            else:
                fileName = "http://paper.people.com.cn/rmrb/page/2020-01/" + data + "/" + str(j + 1) + "/rmrb202001" + data + str(j + 1) + ".pdf"
            tmp = getFile(fileName)
            shutil.move(tmp, folderName)
        except OSError:
            continue  # missing pages raise an OSError; skip them

if __name__ == '__main__':
    p = Pool(31)  # one worker process per day of January
    for i in range(31):
        p.apply_async(download, args=(i,))
    p.close()  # no more tasks will be submitted
    p.join()   # wait for all workers to finish
    print('All subprocesses done.')
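Since each download(i) is independent and the work is network I/O rather than CPU, the same parallelism can also come from threads, which are cheaper to start than 31 separate processes. Below is a sketch only: it reuses the download function defined above, and the worker count of 8 is an arbitrary assumption.

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    # Threads are enough here: the bottleneck is the network, not the CPU.
    with ThreadPoolExecutor(max_workers=8) as pool:
        pool.map(download, range(31))  # download(i) as defined above
    print('All downloads done.')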