python学习笔记1豆瓣图书信息下载保存至csv
内容导读
互联网集市收集整理的这篇技术教程文章主要介绍了python学习笔记1豆瓣图书信息下载保存至csv,小编现在分享给大家,供广大互联网技能从业者学习和参考。文章包含3547字,纯文字阅读大概需要6分钟。
内容图文
![python学习笔记1豆瓣图书信息下载保存至csv](/upload/InfoBanner/zyjiaocheng/718/f9632d2f3410420e97341e55a3d0ee76.jpg)
"""Scrape Douban book listings by tag and save them to a CSV file.

For every tag group on https://book.douban.com/tag/ only the tags in the
first table row are collected; for each such tag the first two result
pages (20 books each, ~480 books total) are fetched, and each book's
name, author, publish date, price, rating and summary are appended to
flieName.csv.

TODO: add an IP proxy pool (noted as unimplemented by the author).
"""
import os  # reserved for creating per-category folders (not implemented)
import re
import requests
from bs4 import BeautifulSoup
import csv
import random
import time

books = []  # accumulated records, one dict per book
ourl = 'https://book.douban.com/tag/'
# Douban rejects requests without a browser-like User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
response = requests.get(ourl, headers=headers)
response.encoding = 'utf-8'

# Locate the tag categories (first row of each tag table only — test run).
soup = BeautifulSoup(response.text, 'html.parser')
dw_ = soup.select('#content > div > div.article > div:nth-child(2) > div > table > tbody > tr:nth-child(1) > td > a')
leimu = []  # category names
link = []   # category URLs
for dw in dw_:
    leimu.append(dw.string)
    link.append('https://book.douban.com' + dw['href'])

# Compile the price pattern once instead of once per book.
price_re = re.compile(r'(\d+\.\d{0,3}).*')

for href_ in link:  # each category
    for page in range(0, 40, 20):  # first two pages: start=0 and start=20
        data = {
            'start': page,
            'type': 'T'
        }
        headers2 = {
            'Referer': 'https://www.baidu.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }
        html2 = requests.get(href_, params=data, headers=headers2)
        html2.encoding = 'utf-8'
        soup2 = BeautifulSoup(html2.text, 'html.parser')

        # BUG FIX: the original code zipped four parallel select() result
        # lists; when a book has no rating (or no summary) those lists get
        # out of step and fields of *different* books are combined into one
        # record.  Parse each <li> entry as a unit instead.
        for item in soup2.select('#subject_list > ul > li'):
            try:
                name_tag = item.select_one('div.info > h2 > a')
                detail_tag = item.select_one('div.info > div.pub')
                score_tag = item.select_one('div.info span.rating_nums')
                brief_tag = item.select_one('div.info > p')
                if not (name_tag and detail_tag and score_tag and brief_tag):
                    continue  # incomplete entry: skip rather than misalign

                name = ''.join(name_tag.get_text().split())  # strip all whitespace
                detail = detail_tag.get_text().split('/')
                author = detail[0].strip()
                pubtime = detail[-2].strip()
                price_match = price_re.findall(detail[-1].strip())
                price = price_match[0]  # keep the leading number only
                dict_book = {
                    '书名': name,
                    '作者': author,
                    '上市时间': pubtime,
                    '价格': price,
                    '书籍评分': score_tag.get_text(),
                    '内容简介': brief_tag.get_text(),
                }
                books.append(dict_book)
                time.sleep(random.random() * 3)  # random nap to avoid being banned
                print(name)
            except IndexError as e:
                # raised when the pub line has fewer '/'-fields than expected,
                # or when the price regex finds no number
                print('IndexError:', e)
            finally:
                print('finally')

# BUG FIX: csv requires newline='' (otherwise blank rows appear on Windows),
# and the original locale-dependent open with errors='ignore' could silently
# drop Chinese text.  utf-8-sig writes a BOM so Excel detects the encoding.
with open(r'flieName.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    fieldnames = ['书名', '作者', '上市时间', '价格', '书籍评分', '内容简介']
    writer = csv.DictWriter(csvfile, fieldnames)
    writer.writeheader()
    writer.writerows(books)  # record keys already match the field names
TODO:还需添加 IP 代理池以降低被封禁的风险,目前尚未实现。
内容总结
以上是互联网集市为您收集整理的python学习笔记1豆瓣图书信息下载保存至csv全部内容,希望文章能够帮你解决python学习笔记1豆瓣图书信息下载保存至csv所遇到的程序开发问题。 如果觉得互联网集市技术教程内容还不错,欢迎将互联网集市网站推荐给程序员好友。
内容备注
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 gblab@vip.qq.com 举报,一经查实,本站将立刻删除。
内容手机端
扫描二维码推送至手机访问。