首页 / PYTHON / python——爬取学而思官网
python——爬取学而思官网
内容导读
互联网集市收集整理的这篇技术教程文章主要介绍了python——爬取学而思官网,小编现在分享给大家,供广大互联网技能从业者学习和参考。文章包含5360字,纯文字阅读大概需要8分钟。
内容图文
```python
import re
import time
import pandas as pds
import numpy
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
# Launch Chrome via Selenium; this driver instance is shared as a module
# global by every helper function below.
browser = webdriver.Chrome()
# Navigate to `url` and wait (up to 2 s) for the node at XPath `element`
# to be present. On timeout, print a diagnostic instead of raising so the
# crawl can continue.
def enter(url, element):
    waiter = WebDriverWait(browser, 2)
    try:
        browser.get(url)
        waiter.until(EC.presence_of_element_located((By.XPATH, element)))
    except TimeoutException:
        print("在" + url + '\n' + '未定位到' + element)
def get_detail(element):
    """Return the text of the node located by XPath `element`.

    Returns the placeholder string "无" ("none") when the node is absent,
    so callers can treat missing fields uniformly.
    """
    try:
        return browser.find_element_by_xpath(element).text
    # Only swallow "element not found" — the original bare `except:` also
    # hid KeyboardInterrupt and genuine WebDriver errors.
    except NoSuchElementException:
        return "无"
# Return the value of attribute `attribute` on the node at XPath `element`.
def get_element_attribute(element, attribute):
    node = browser.find_element_by_xpath(element)
    value = node.get_attribute(attribute)
    return value
# Click the node located by XPath `element`.
def click_element(element):
    browser.find_element_by_xpath(element).click()
# Type `text` into the input at XPath `element`, then press Enter.
def send_word(element, text):
    box = browser.find_element_by_xpath(element)
    box.send_keys(text)
    box.send_keys(Keys.ENTER)
# Clear the input field located by XPath `element`.
def clear_word(element):
    browser.find_element_by_xpath(element).clear()
# Count how many nodes match XPath `element`.
def get_ele_cnt(element):
    return len(browser.find_elements_by_xpath(element))
# For the currently selected grade, return a pair of strings:
# (total-record text from `element1`, last pager item's text from `element2`,
# i.e. the total page count).
def get_each_class(element1, element2):
    total_text = get_detail(element1)
    pager_items = browser.find_elements_by_xpath(element2)
    last_page = pager_items[-1].text
    return total_text, last_page
# Scrape one class card rooted at XPath `element`. Returns a 9-tuple:
# (class name, teaching mode, start date, schedule, address, main teacher,
#  assistant teacher, status tag, price). Missing fields become "无"
# via get_detail.
def get_class_detail(element):
    def field(suffix):
        return get_detail(element + suffix)

    classname = field('//div[@class="item_header"]/div[1]')
    teaching_mode = field('//div[@class="item_header"]/div[2]')
    dtbegindate = field('//div[@class="item_info"]/span[1]')
    dtdate = field('//div[@class="item_info"]/span[2]')
    address = field('//div[@class="item_info"]/span[3]')
    teacher_main = field('//div[@class="teacher"]/div[@class="teacher_main"]')
    teacher_vice = field('//div[@class="teacher"]/div[@class="teacher_vice"]')
    # Online ("在线") classes expose their status tag under a different class.
    if teaching_mode == "在线":
        teacher_tag = field('//div[@class="teacher"]/div[@class="remain_tag"]')
    else:
        teacher_tag = field('//div[@class="teacher"]/div[@class="teacher_tag"]')
    class_price = field('//div[@class="item_footer"]/div[@class="left"]')
    return (classname, teaching_mode, dtbegindate, dtdate, address,
            teacher_main, teacher_vice, teacher_tag, class_price)
# Lookup table mapping the department tab index to (department name,
# ordered list of grade names), mirroring the site's grade drop-down menu.
# Any department index outside 1-3 falls through to the high-school entry,
# exactly like the original if/elif chain's final `else`.
_GRADE_TABLE = {
    1: ("幼儿部", ["托班", "小班", "中班", "大班"]),
    2: ("小学部", ["一年级", "二年级", "三年级", "四年级",
                  "五年级", "六年级", "小学组"]),
    3: ("初中部", ["初一", "初二", "中考", "初中组"]),
    4: ("高中部", ["高一", "高二", "高考", "高中组"]),
}


def sdept_grade(i, j):
    """Return (department name, grade name) for menu indices `i` and `j`.

    Args:
        i: department tab index (1=kindergarten, 2=primary, 3=junior high;
           anything else maps to senior high, as in the original code).
        j: 1-based grade index within that department.

    Raises:
        ValueError: if `j` is out of range for the department — the original
            if/elif ladder left `classtype` unbound and raised
            UnboundLocalError in that case.
    """
    sdept, grades = _GRADE_TABLE.get(i, _GRADE_TABLE[4])
    if not 1 <= j <= len(grades):
        raise ValueError(f"grade index {j} out of range for {sdept}")
    return (sdept, grades[j - 1])
#写入csv
def write_csv(i, school, path='C:/Users/Administrator/Desktop/一批文分数线.csv'):
    """Append one row [i, school] to a CSV file (no header, comma-separated).

    Args:
        i: first column value (e.g. a rank or index).
        school: second column value.
        path: output file; defaults to the original hard-coded desktop
            location so existing callers are unaffected.
    """
    row = pds.DataFrame([[i, school]])
    # mode='a' appends, so repeated calls accumulate rows.
    row.to_csv(path, sep=',', mode='a', index=False, header=False)
# Main entry point: crawl every department/grade class listing on
# speiyou.com (Xi'an campus) and append one CSV row per class card.
def main():
    url = 'https://www.speiyou.com/shanxi_xian/list'
    # Open the site and wait for the first nav element to be present.
    enter(url, '//*[@id="test"]/div/ul/li[1]/a')
    # Dismiss the confirmation modal.
    click_element('//div[@class="modal_btn"]')
    # i = department tab (1..4). Each department has 4 grade entries except
    # primary school (i == 2), which has 7.
    for i in range(1,5):
        if i == 2:
            jj = 8
        else:
            jj = 5
        for j in range(1,jj):
            (sdept,grade)=sdept_grade(i,j)
            # Open the grade drop-down, then select grade j of department i.
            click_element('//*[@id="__layout"]/div/header/div[3]/div/span/div[2]/span')
            click_element('//div[@class="grade_container"]//li['+str(i)+']/div/span['+str(j)+']')
            time.sleep(3)  # crude fixed wait for the filtered list to render
            # m = total-record text, n = last pager item text (page count).
            (m,n)=get_each_class('//span[@class="el-pagination__total"]','//ul[@class="el-pager"]//li')
            print(m, n)
            # Visit pages 1..n of the listing.
            for page in range(1,int(n)+1):
                print(page)
                # NOTE(review): the "next page" button is clicked BEFORE
                # scraping, which looks like it skips page 1 and clicks past
                # the last page on the final iteration — confirm on the site.
                click_element('//*[@id="__layout"]/div/div/section/div[3]/div/button[2]/i')
                # Number of class cards on the current page.
                classcnt = get_ele_cnt('//*[@id="__layout"]/div/div/section/div[2]/div[@class="card_list"]/div')
                # Scrape each card: name, mode, dates, address, teachers,
                # status tag, and price; then append a timestamped CSV row.
                for k in range(1,classcnt+1):
                    (classname,teaching_mode,dtbegindate,dtdate,address,teacher_main,teacher_vice,teacher_tag,class_price)=get_class_detail('//*[@id="__layout"]/div/div/section/div[2]/div[@class="card_list"]/div['+str(k)+']')
                    s_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))
                    writeschool=pds.DataFrame([[s_time,sdept,grade,m,classname,teaching_mode,dtbegindate,dtdate,address,teacher_main,teacher_vice,teacher_tag,class_price]])
                    writeschool.to_csv('C:/Users/Administrator/Desktop/学而思finnal.csv', sep=',', mode='a',index = False,header = False,encoding='utf_8_sig')
    # Close the browser once all departments have been scraped.
    # (Indentation reconstructed — the source paste was flattened; placing
    # this at function-body level matches the surrounding comments.)
    browser.close()
# Run the crawler only when executed as a script.
if __name__ == "__main__":
    main()
乔眉
发布了55 篇原创文章 · 获赞 17 · 访问量 1万+
私信
关注
内容总结
以上是互联网集市为您收集整理的python——爬取学而思官网全部内容,希望文章能够帮你解决python——爬取学而思官网所遇到的程序开发问题。 如果觉得互联网集市技术教程内容还不错,欢迎将互联网集市网站推荐给程序员好友。
内容备注
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 gblab@vip.qq.com 举报,一经查实,本站将立刻删除。
内容手机端
扫描二维码推送至手机访问。