首页 / PYTHON / python爬取酒店信息练习
python爬取酒店信息练习
内容导读
互联网集市收集整理的这篇技术教程文章主要介绍了python爬取酒店信息练习,小编现在分享给大家,供广大互联网技能从业者学习和参考。文章包含8931字,纯文字阅读大概需要13分钟。
内容图文
![python爬取酒店信息练习](/upload/InfoBanner/zyjiaocheng/857/bdda16a7d4a440829de4469b3d68c0d0.jpg)
爬取酒店信息,首先要知道需要用到哪些库。本次使用requests库去获取网页,使用bs4来解析网页,使用selenium来进行模拟浏览。
本次要爬取的是美团网的蚌埠酒店信息及其评价。爬取的网址为“http://hotel.meituan.com/bengbu/”。首先获取导航页的相关信息,具体代码如下:
url = 'http://hotel.meituan.com/bengbu/'

# Fetch the first listing page and read the highest page number from its
# pagination bar.
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
page_info = soup.find_all('li', class_='page-link')  # pagination entries of the first listing page
if not page_info:
    # Same exception type as indexing an empty list, but with a usable message.
    raise IndexError("no 'page-link' elements found on " + url)
get_page_num = page_info[-1].find('a').get_text()  # text of the last entry, used as the total page count
print(get_page_num)
获取了上面的信息,就可以选择一个具体网页,利用Google浏览器的F12查看具体的元素,利用xpath定位相关元素,把获取的信息保存在文件夹下,具体方法代码如下
# Scrape details and reviews for every hotel in the listing.
def get_hotel_info(url):
    """Scrape all hotels on the paginated listing at ``url`` plus their reviews.

    For every hotel the name, rating, sales count and detail link are printed
    and appended to ``酒店信息.txt``; the text of up to 10 pages of reviews
    (10 per request) is appended to ``jieguo-1.txt``.

    Reads the module-level ``get_page_num`` (total number of listing pages),
    which must be defined before this function is called.

    Args:
        url: Address of the first listing page to open in the browser.
    """
    from urllib.parse import urlparse  # function-scope import keeps this snippet self-contained

    comment_api = "http://ihotel.meituan.com/group/v1/poi/comment/"
    # Per the original author's note, a single fixed User-Agent returned
    # incomplete data, so one is chosen at random for each request.
    user_agents = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    ]

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap['phantomjs.page.settings.userAgent'] = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
    # First argument is the path to the PhantomJS executable.
    browser = webdriver.PhantomJS("D:/PhantomJS/phantomjs-2.1.1-windows/bin/phantomjs", desired_capabilities=dcap)
    try:
        browser.get(url)
        total_pages = int(get_page_num)

        for page_num in range(1, total_pages + 1):
            # Every hotel card on the current listing page.
            for item in browser.find_elements_by_class_name('info-wrapper'):
                title = item.find_element_by_class_name('poi-title')
                hotel_info = {
                    'name': title.text,
                    'star': item.find_element_by_class_name('poi-grade').text,
                    'consumers': item.find_element_by_class_name('poi-buy-num').text,
                    'link': title.get_attribute('href'),
                }
                print("酒店名称:{}".format(hotel_info['name']))
                print("酒店评分:{}".format(hotel_info['star']))
                print("酒店销量:{}".format(hotel_info['consumers']))
                print("酒店链接:{}".format(hotel_info['link']))
                # `with` closes the handle; the file used to be reopened per hotel and never closed.
                with open("酒店信息.txt", 'a', encoding="utf8") as info_file:
                    info_file.write(hotel_info['name']+"\n"+hotel_info['star']+"\n"+hotel_info['consumers']+"\n"+hotel_info['link']+"\n")

                # The comment URL is built from the link's path without its
                # surrounding slashes. For "http://hotel.meituan.com/<id>/"
                # this equals the former link[25:-1], but it also survives
                # https links and a missing trailing slash.
                poi_id = urlparse(hotel_info['link']).path.strip('/')

                # First 10 pages of reviews, requested directly from the JSON
                # endpoint instead of through the browser.
                for i in range(10):
                    print("正在加载第" + str(i + 1) + "页评论")
                    comment_url = comment_api + poi_id + "?sortType=default&noempty=1&withpic=0&filter=all&limit=10&offset=" + str(i * 10) + "&X-FOR-WITH="
                    headers = {
                        "User-Agent": random.choice(user_agents),
                        "Host": "ihotel.meituan.com",
                    }
                    r = requests.get(comment_url, headers=headers)
                    print(r.text)
                    data = json.loads(r.text, strict=False)
                    comments = data['data']['feedback']
                    with open("jieguo-1.txt", 'a', encoding="utf8") as comment_file:
                        comment_file.writelines(n['comment'] + "\n" for n in comments)

            # Move to the next listing page; nothing to click after the last one.
            if page_num < total_pages:
                browser.find_element_by_class_name('paginator').find_element_by_class_name('next').find_element_by_tag_name('a').click()
                time.sleep(1)  # give the next page a moment to load
    finally:
        # Always shut the PhantomJS process down, even if scraping fails midway.
        browser.quit()
实现了上述的方法,就可以把完整的酒店信息抓取下来,所有代码如下:
# -*- coding: utf-8 -*-
# Author: 爱学习的兔兔
import json
import random
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

url = 'http://hotel.meituan.com/bengbu/'

# Fetch the first listing page and read the highest page number from its
# pagination bar.
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
page_info = soup.find_all('li', class_='page-link')  # pagination entries of the first listing page
if not page_info:
    # Same exception type as indexing an empty list, but with a usable message.
    raise IndexError("no 'page-link' elements found on " + url)
get_page_num = page_info[-1].find('a').get_text()  # text of the last entry, used as the total page count
print(get_page_num)


# Scrape details and reviews for every hotel in the listing.
def get_hotel_info(url):
    """Scrape all hotels on the paginated listing at ``url`` plus their reviews.

    For every hotel the name, rating, sales count and detail link are printed
    and appended to ``酒店信息.txt``; the text of up to 10 pages of reviews
    (10 per request) is appended to ``jieguo-1.txt``.

    Reads the module-level ``get_page_num`` (total number of listing pages).

    Args:
        url: Address of the first listing page to open in the browser.
    """
    comment_api = "http://ihotel.meituan.com/group/v1/poi/comment/"
    # Per the original author's note, a single fixed User-Agent returned
    # incomplete data, so one is chosen at random for each request.
    user_agents = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    ]

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap['phantomjs.page.settings.userAgent'] = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
    # First argument is the path to the PhantomJS executable.
    browser = webdriver.PhantomJS("D:/PhantomJS/phantomjs-2.1.1-windows/bin/phantomjs", desired_capabilities=dcap)
    try:
        browser.get(url)
        total_pages = int(get_page_num)

        for page_num in range(1, total_pages + 1):
            # Every hotel card on the current listing page.
            for item in browser.find_elements_by_class_name('info-wrapper'):
                title = item.find_element_by_class_name('poi-title')
                hotel_info = {
                    'name': title.text,
                    'star': item.find_element_by_class_name('poi-grade').text,
                    'consumers': item.find_element_by_class_name('poi-buy-num').text,
                    'link': title.get_attribute('href'),
                }
                print("酒店名称:{}".format(hotel_info['name']))
                print("酒店评分:{}".format(hotel_info['star']))
                print("酒店销量:{}".format(hotel_info['consumers']))
                print("酒店链接:{}".format(hotel_info['link']))
                # `with` closes the handle; the file used to be reopened per hotel and never closed.
                with open("酒店信息.txt", 'a', encoding="utf8") as info_file:
                    info_file.write(hotel_info['name']+"\n"+hotel_info['star']+"\n"+hotel_info['consumers']+"\n"+hotel_info['link']+"\n")

                # The comment URL is built from the link's path without its
                # surrounding slashes. For "http://hotel.meituan.com/<id>/"
                # this equals the former link[25:-1], but it also survives
                # https links and a missing trailing slash.
                poi_id = urlparse(hotel_info['link']).path.strip('/')

                # First 10 pages of reviews, requested directly from the JSON
                # endpoint instead of through the browser.
                for i in range(10):
                    print("正在加载第" + str(i + 1) + "页评论")
                    comment_url = comment_api + poi_id + "?sortType=default&noempty=1&withpic=0&filter=all&limit=10&offset=" + str(i * 10) + "&X-FOR-WITH="
                    headers = {
                        "User-Agent": random.choice(user_agents),
                        "Host": "ihotel.meituan.com",
                    }
                    r = requests.get(comment_url, headers=headers)
                    print(r.text)
                    data = json.loads(r.text, strict=False)
                    comments = data['data']['feedback']
                    with open("jieguo-1.txt", 'a', encoding="utf8") as comment_file:
                        comment_file.writelines(n['comment'] + "\n" for n in comments)

            # Move to the next listing page; nothing to click after the last one.
            if page_num < total_pages:
                browser.find_element_by_class_name('paginator').find_element_by_class_name('next').find_element_by_tag_name('a').click()
                time.sleep(1)  # give the next page a moment to load
    finally:
        # Always shut the PhantomJS process down, even if scraping fails midway.
        browser.quit()


def main():
    """Run the scraper against the module-level listing ``url``."""
    get_hotel_info(url)


if __name__ == '__main__':
    main()
这样就顺利地拿到了酒店信息和评价。为了简单分析一下拿到的数据,先使用SnowNLP分词查看数据,发现效果一般,于是又使用jieba分词和词云来分析,得出的图片如下图:
这里没有对符号进行过滤,只能给出一个大体的评价关系图。具体代码如下:
# -*- coding: utf-8 -*-
# Segment the scraped reviews with SnowNLP, then with jieba, and draw a word cloud.
# Author: 爱学习的兔兔
from collections import Counter

import jieba.posseg as posseg
from snownlp import SnowNLP
from wordcloud import WordCloud

# Read the reviews as plain text. Passing str(list_of_lines) would feed the
# list's repr (brackets, quotes, literal "\n") to the tokenisers.
with open("jieguo-1.txt", "r", encoding="utf8") as review_file:
    text = review_file.read()

# SnowNLP: print the text sentence by sentence.
s = SnowNLP(text)
for sentence in s.sentences:
    print(sentence)

# jieba: tokenise, count word frequencies and render them as a word cloud.
# Whitespace-only tokens (e.g. the newlines between reviews) are dropped;
# punctuation is deliberately left unfiltered.
words = [w for w, _flag in posseg.cut(text) if w.strip()]
print(words)
c = Counter(words)
print(c.most_common(20))
wc = WordCloud(font_path='c:\\Windows\\Fonts\\simkai.ttf', height=1080, width=1920).generate_from_frequencies(c)
image = wc.to_image()
image.show()
wc.to_file("ex2.png")
整体走下来,感觉写个简单的爬虫能学到不少有用的信息。
内容总结
以上是互联网集市为您收集整理的python爬取酒店信息练习全部内容,希望文章能够帮你解决python爬取酒店信息练习所遇到的程序开发问题。 如果觉得互联网集市技术教程内容还不错,欢迎将互联网集市网站推荐给程序员好友。
内容备注
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 gblab@vip.qq.com 举报,一经查实,本站将立刻删除。
内容手机端
扫描二维码推送至手机访问。