Crawling Amazon with a proxy pool and multiple threads (Python, requests, bs4)
Using the ProxyPool crawler proxy IP pool
Setup and configuration
Preparing Redis
Download the Redis archive and unzip it.
Double-click redis-server.exe to run it, or open the unzipped folder in cmd and run:
redis-server.exe redis.windows.conf
See the Runoob (菜鸟教程) tutorial for more detailed steps.
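To double-check that Redis is reachable before starting the pool, a quick ping from Python also works (a minimal sketch; assumes the redis-py package, installed via pip install redis):

import redis

# connect to the local Redis server started above (default port 6379)
r = redis.StrictRedis(host='127.0.0.1', port=6379)
print(r.ping())  # prints True if the server is running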
Starting the project:
Download the proxy_pool package
# Copy the proxy_pool folder into your project folder (or git pull it directly) and open the project in your IDE
# If the IDE offers to add the missing imports, accept; otherwise manually import any packages flagged in red, or continue if nothing is flagged
# From the IDE terminal, enter the proxy pool folder
cd proxy_pool
# Start the scheduler
python proxyPool.py schedule
# You may see errors about missing packages such as click; pip install whatever the terminal reports missing, then repeat the previous step
# Open another terminal in the IDE and continue with the steps below
# Start the web API service
python proxyPool.py server
# Again, if a package is missing, repeat the step above
# Installing lxml at the pinned 4.3.1 version tends to fail; the latest version works fine
# You may also hit this error: TypeError: required field "type_ignores" missing from Module
# This is a compatibility issue between Python 3.8 and werkzeug; the fix described at https://www.jianshu.com/p/95588bf4e63d?utm_campaign=maleskine&utm_content=note&utm_medium=seo_notes&utm_source=recommendation resolves it
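# In practice, upgrading werkzeug usually resolves that TypeError (an assumption based on the Python 3.8 issue above; verify against the linked post)
pip install --upgrade werkzeug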
Testing and usage
- API endpoints
After the web service starts, the default configuration serves the API at http://127.0.0.1:5010:
| api | method | description | arg |
| --- | --- | --- | --- |
| / | GET | API introduction | None |
| /get | GET | get a random proxy | None |
| /get_all | GET | get all proxies | None |
| /get_status | GET | check the number of proxies | None |
| /delete | GET | delete a proxy | proxy=host:ip |
- Verifying that it works
After the services have been running for a while, open cmd in the Redis folder:
redis-cli.exe -h 127.0.0.1 -p 6379
keys *
If the output is not empty, the fetched IPs were stored successfully.
Then test the API in a browser: http://127.0.0.1:5010/get_all/
If both checks pass, you can start crawling with confidence; the sketch below shows the basic pattern for consuming the pool.
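A minimal helper sketch (assumes the default port 5010 and that, as in the crawler code further down, each proxy returned by the API is a JSON object with a proxy field holding host:port):

import requests

def get_proxy():
    # fetch one random proxy from the local pool
    return requests.get("http://127.0.0.1:5010/get/").json()

def delete_proxy(proxy):
    # drop a proxy that turned out to be dead or banned
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

# example: route a request through the pool
proxy = get_proxy()
resp = requests.get("http://httpbin.org/ip",
                    proxies={"http": "http://{}".format(proxy["proxy"])})
print(resp.text)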
Code
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import xlwt
import time
import random
import threading
import queue
from fake_useragent import UserAgent
beginNum = 0  # starting row index
endnum = 100  # ending row index
srcdata = 'srcdata.xlsx'  # source file to read (holds one column of product ASINs)
savefile1 = 'prime.xls'  # where to save data scraped from Prime pages
savefile2 = 'normal.xls'  # where to save data scraped from normal pages
ua = UserAgent()  # User-Agent pool
q = queue.Queue()  # queue of row indices to re-crawl after being blocked
# cookie information; ideally this would also be a list to rotate through
cookie = {'session-id': '459-4568418-5692641', 'ubid-acbcn': '459-5049899-3055220',
'x-wl-uid': '1AK7YMFc9IzusayDn2fT6Topjz3iAOpR3EeA2UQSqco8fo5PbK2aCpyBA/fdPMfKFqZRHc4IeyuU=',
'session-token': 'OH1wPvfOj6Tylq2nnJcdn5wyxycR/lqyGsGU3+lUtU4mbC0ZD9s8/4Oihd1BlskUQG8zRbLVs9vfWXuiJmnRlDT4x35ircp2uLxOLNYQ4j5pzdFJIqqoZUnhHSJUq2yK80P3LqH8An7faXRCPW9BIqX1wu0WmHlSS9vYAPKA/2SGdV9b//EljYjIVCBjOuR/dKRiYEeGK3li0RJOVz7+vMWg7Rnzbx89QxlbCp0WyquZyVxG6f2mNw=="',
'session-id-time': '2082787201l'}
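# A possible rotation sketch (hypothetical): keep several cookie dicts in a
# list and pick one per request, the same way a proxy is chosen below, e.g.
#   cookie_list = [cookie, another_cookie]
#   cookie = random.choice(cookie_list)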
# URL prefix for product pages
url = 'https://www.amazon.com/dp/'
df = pd.read_excel(srcdata)
workbook1 = xlwt.Workbook(encoding='utf-8')
workbook2 = xlwt.Workbook(encoding='utf-8')
nm = beginNum  # next row index to crawl, shared by all threads
CRAWL_EXIT = False  # set to True once all rows have been dispatched
# custom crawler thread
class crawlThread(threading.Thread):
def __init__(self, name, lock):
threading.Thread.__init__(self)
self.name = name
self.lock = lock
def run(self):
global nm
global CRAWL_EXIT
global q
        selfnm = 0  # pages fetched by this thread since the last long sleep
        count404 = 0  # number of 404 pages encountered
        primenum = 0  # number of Prime pages (also the next row index in sheet 1)
        normalnum = 0  # number of normal pages (also the next row index in sheet 2)
out = self.name
worksheet1 = workbook1.add_sheet(out)
worksheet2 = workbook2.add_sheet(out)
        temp = ('asin', 'title', 'runtime', 'release year', 'director', 'actors', 'genre', 'language', 'format', 'version', 'producer', 'rating')
for i in range(12):
worksheet1.write(0, i, temp[i])
worksheet2.write(0, i, temp[i])
        self.lock.acquire()  # acquire the lock
        workbook1.save(savefile1)
        workbook2.save(savefile2)
        self.lock.release()  # release the lock
        while not CRAWL_EXIT:
            # title, runtime, release year, director, actors, genre, language, format, producer, rating
mtitle=""
runtime=""
release=""
director=""
actor=""
genre=""
language=""
mformat=""
version=""
producer=""
review=""
            # request headers, mainly browser-related settings
head = {
'user-agent': ua.random,
'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br'
}
            try:
                pr = requests.get('http://127.0.0.1:5010/get_all/').json()
                if not pr:
                    proxy = {'proxy': '127.0.0.1:1087'}  # local fallback proxy
                else:
                    proxy = random.choice(pr)  # each entry is a dict with a 'proxy' key
                # proxy settings; route both http and https traffic through it
                proxy_id = {"http": "http://" + proxy['proxy'],
                            "https": "http://" + proxy['proxy']}
            except:
                print("can't connect to the server")
                continue
            self.lock.acquire()  # acquire the lock
            # check the retry queue first
            if q.empty():
                # product id
pro_id = str(df.loc[nm]['asin'])
newurl = url + pro_id + "/"
mynm=nm
nm = nm + 1
else:
mynm = q.get()
pro_id = str(df.loc[mynm]['asin'])
newurl = url + pro_id + "/"
            self.lock.release()  # release the lock
if nm >= endnum and q.empty():
CRAWL_EXIT = True
try:
                # ============ fetch the page ============ #
req = requests.get(newurl, headers=head, proxies=proxy_id, cookies=cookie)
selfnm=selfnm+1
except:
print("获取网页失败")
q.put(mynm) # 把该序号加入待重爬队列
continue
            # ======== parse the page ======== #
            soup = BeautifulSoup(req.text, 'lxml')
            # used to detect anti-bot pages
            button = soup.find('button')
            # used to detect 404 pages
            title = soup.find('title')
            if title and (title.text == 'Amazon.com Page Not Found' or title.text == 'Page Not Found'):
                count404 = count404 + 1
                print(self.name + ' got a 404 for id: ' + pro_id)
continue
            if button:
                if button.text == 'Continue shopping':
                    q.put(mynm)  # put the index back on the retry queue
                    try:
                        # no trailing slash: the proxy param must match the stored value exactly
                        delproxy = 'http://127.0.0.1:5010/delete/?proxy=' + proxy['proxy']
                        requests.get(delproxy)  # remove the banned IP from the pool
                    except:
                        print("can't delete ip")
continue
            # distinguish page types
tip1 = soup.find('a', attrs={"class": "av-retail-m-nav-text-logo"})
tip2 = soup.find('div', attrs={"id": "wayfinding-breadcrumbs_container"})
            if tip1 is not None:
                print(self.name, "crawling a Prime page")
primenum=primenum+1
                # ============ extract Prime page fields ============ #
try:
mtitle = soup.find('h1', attrs={"class": "_1GTSsh _2Q73m9", "data-automation-id": "title"}).text
except:
print(pro_id+"定位电影名失败")
try:
runtime = soup.find(name='span', attrs={"data-automation-id": "runtime-badge"}).text
except:
print(pro_id+"定位时长失败")
try:
release = soup.find(name='span', attrs={"data-automation-id": "release-year-badge"}).text
except:
print(pro_id+"定位发行日期失败")
try:
review = soup.find(name='span', attrs={"class": "a-size-base a-nowrap"}).text
except:
print(pro_id+"定位用户打分失败")
try:
info = soup.find_all('div', attrs={'class': '_1ONDJH'})
try:
for dl in info[0].find_all('dl'):
dl1 = dl.dt
dl2 = dl.dd
if re.search(r'Director', dl1.text, re.M | re.I):
director = dl2.text
elif re.search(r'Starring', dl1.text, re.M | re.I):
actor = dl2.text
elif re.search(r'Genre', dl1.text, re.M | re.I):
genre = dl2.text
elif re.search(r'languages', dl1.text, re.M | re.I):
language = dl2.text
except:
print(pro_id+"定位info1失败")
pass
try:
for dl in info[1].find_all('dl'):
dl1 = dl.dt
dl2 = dl.dd
if re.search(r'Producer', dl1.text, re.M | re.I):
producer = dl2.text
elif re.search(r'Format', dl1.text, re.M | re.I):
mformat = dl2.text
except:
print(pro_id+"定位info2失败")
except:
print(pro_id+"定位详细资料失败")
try:
temp=(pro_id,mtitle,runtime,release,director,actor,genre,language,mformat,version,producer,review)
for i in range(12):
worksheet1.write(primenum,i,temp[i])
                    # self.lock.acquire()  # acquire the lock
                    workbook1.save(savefile1)
                    # self.lock.release()  # release the lock
                except:
                    print(pro_id + " failed to save")
            if tip2 is not None:
                print(self.name, "crawling a normal page")
                normalnum = normalnum + 1
                # ============ extract normal page fields ============ #
try:
mtitle = soup.find(name='span', attrs={"id": "productTitle"}).text
except:
print(pro_id,"normal网页定位电影名失败")
try:
                    ge = soup.find(name='div', attrs={"id": 'wayfinding-breadcrumbs_feature_div'}).text
                    genre = str(ge).split('?')[-1]  # keep the last segment of the breadcrumb text
except:
pass
try:
review= soup.find(name='span',attrs={"class": "a-icon-alt"}).text
except:
print(pro_id,"第一次定位用户评分失败")
try:
review= soup.find(name='span',attrs={"class": "a-size-medium a-color-base"}).text
except:
print(pro_id,"第二次定位用户评分失败")
try:
bylineinfo= soup.find_all(name='span',attrs={"class":"author notFaded"})
for span in bylineinfo:
lt=str(span.text).split('(')
if re.search(r'Actor', lt[1], re.M | re.I):
if actor != "":
actor = actor +","+lt[0]
else:
actor = lt[0]
elif re.search(r'Director', lt[1], re.M | re.I):
director = lt[0]
except:
print(pro_id,"第一次定位bylineinfo失败")
try:
bylineinfo = soup.find(name='div',attrs={"id":"bylineInfo"}).text
lt=str(bylineinfo).split(':')
mformat=lt[-1]
except:
print(pro_id,"第二次定位bylineinfo失败")
try:
detail = soup.find(name='ul', attrs={'class': 'a-unordered-list a-nostyle a-vertical a-spacing-none detail-bullet-list'})
for li in detail.find_all('li'):
span1 = li.span.find_all('span')[0]
span2 = li.span.find_all('span')[1]
if re.search(r'Run time', span1.text, re.M | re.I):
runtime = span2.text
elif re.search(r'Release date', span1.text, re.M | re.I):
release = span2.text
elif re.search(r'Director', span1.text, re.M | re.I):
director = span2.text
elif re.search(r'Actor', span1.text, re.M | re.I):
actor = span2.text
elif re.search(r'Format', span1.text, re.M | re.I):
version = span2.text
elif re.search(r'Producer', span1.text, re.M | re.I):
producer = span2.text
elif re.search(r'Language', span1.text, re.M | re.I):
language = span2.text
except:
print(pro_id,"定位detail失败")
try:
temp=(pro_id,mtitle,runtime,release,director,actor,genre,language,mformat,version,producer,review)
for i in range(12):
worksheet2.write(normalnum,i,temp[i])
                    # self.lock.acquire()  # acquire the lock
                    workbook2.save(savefile2)
                    # self.lock.release()  # release the lock
                except:
                    print(pro_id + " failed to save")
if selfnm<5:
time.sleep( random.uniform(1.5,2.9))
elif selfnm<10:
time.sleep(random.uniform(2.4,3.9))
elif nm<15:
time.sleep(random.uniform(3.5,5.4))
elif selfnm>= 20:
print(self.name+'休眠120秒')
selfnm=0
time.sleep(120)
print(self.name+'结束休眠')
        print(self.name, 'number of 404 pages:', count404)
if __name__ == "__main__":
    # create the lock
lock = threading.Lock()
    # names of the crawler threads
    crawlList = ["crawler1", "crawler2", "crawler3", "crawler4", "crawler5"]
    # list holding the crawler threads
threadcrawl = []
for threadName in crawlList:
thread = crawlThread(threadName, lock)
thread.start()
threadcrawl.append(thread)
for thread in threadcrawl:
thread.join()
print("finish")