首页 / 爬虫 / 使用selenium做简单爬虫的实例
使用selenium做简单爬虫的实例
内容导读
互联网集市收集整理的这篇技术教程文章主要介绍了使用selenium做简单爬虫的实例,小编现在分享给大家,供广大互联网技能从业者学习和参考。文章包含5944字,纯文字阅读大概需要9分钟。
内容图文
![使用selenium做简单爬虫的实例](/upload/InfoBanner/zyjiaocheng/1187/61bd73850631422b80bb3eb4ceb2143f.jpg)
selenium 是一个 Web 自动化测试的软件包,可以用于自动测试 Web 应用,也可以当作简单的爬虫制作工具。
这是一个简单的 demo,用于爬取 Google APP Store 中某一个类别下的应用:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
|
# -*- coding: utf-8 -*-
from
selenium
import
webdriver
from
selenium.webdriver.common.keys
import
Keys
from
selenium.webdriver.support.ui
import
WebDriverWait
from
time
import
sleep
import
sqlite3
import
sys
# connect to the sqlite3 database
def Conn_DB(db_name='app_info.db'):
    """Open a connection to the SQLite database file ``db_name``.

    Args:
        db_name: Path handed to ``sqlite3.connect``. Defaults to
            ``'app_info.db'``.

    Returns:
        The ``sqlite3.Connection`` returned by ``sqlite3.connect``.

    Raises:
        sqlite3.Error: If the database cannot be opened. The error is
            printed first and then re-raised.
    """
    try:
        conn = sqlite3.connect(db_name)
    except sqlite3.Error as e:
        # Report and propagate. Falling through to ``return conn`` here would
        # fail with UnboundLocalError and hide the real cause.
        print("Conn Error  %s" % e)
        raise
    return conn
# get the category of the apps
def Get_Category(root_address):
    """Return the category name taken from the end of a category URL.

    The category is the last ``/``-separated path segment, with any query
    string (everything from the first ``?``) and anything after the first
    space removed.

    Args:
        root_address: The category URL as a string.

    Returns:
        The category segment, or ``''`` when the address has no path text.
    """
    # Remove the query string first so a '/' inside it cannot be mistaken
    # for a path separator.
    path = root_address.split('?', 1)[0]
    # Ignore trailing slashes so ".../GAME/" still yields "GAME".
    last_segment = path.rstrip('/').split('/')[-1]
    return last_segment.split(' ')[0]
# we have to log in in order to get the info for every app
def Login_Google(browser, category_root_address, email='', password=''):
    """Open the category page and submit the Google sign-in form.

    Args:
        browser: Selenium WebDriver instance used to drive the page.
        category_root_address: URL loaded before the sign-in link is clicked.
        email: Text typed into the field named ``Email``. Defaults to ``''``,
            the value the original script sent.
        password: Text typed into the field named ``Passwd``. Defaults to
            ``''``, the value the original script sent.

    Returns:
        None.
    """
    browser.get(category_root_address)

    # Move to the element with id 'gb_70' and click it to open the login form.
    login_link = browser.find_element_by_id('gb_70')
    webdriver.ActionChains(browser).move_to_element(login_link).click(login_link).perform()

    email_field = browser.find_element_by_name('Email')
    email_field.send_keys(email)

    password_field = browser.find_element_by_name('Passwd')
    password_field.send_keys(password)
    # Pressing RETURN in the password field submits the form.
    password_field.send_keys(Keys.RETURN)

    # NOTE(review): printed unconditionally; nothing here checks that the
    # sign-in actually succeeded.
    print('Login Success')
# load the whole page and then return the number of the apps under the category
def Load_All_Apps(browser, scroll_rounds=13, pause=2.5):
    """Scroll the page repeatedly to load more apps, then count the buttons.

    Args:
        browser: Object exposing ``execute_script(js)``, e.g. a Selenium
            WebDriver.
        scroll_rounds: Number of scroll passes. The default of 13 is the
            value the original author found by testing.
        pause: Seconds to sleep after each scroll. Defaults to 2.5.

    Returns:
        The number of elements matching ``button.price``, as reported by the
        page.
    """
    # NOTE(review): the extracted source lost its indentation. The show-more
    # handling is assumed to run inside the loop, once per pass - confirm
    # against the original article.
    for times in range(scroll_rounds):
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        sleep(pause)
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight * 0.5);")
        sleep(pause)
        print(times)

        # Click the show-more button unless its inline style hides it.
        show_more_button = browser.execute_script(
            "return document.querySelector('#show-more-button')['style']['cssText'];")
        if show_more_button != 'display: none;':
            browser.execute_script(
                "document.querySelector('#show-more-button').click();")
            print('click button')
        print(show_more_button)

    # Back to the top of the page.
    browser.execute_script("window.scrollTo(0, 0);")
    number = browser.execute_script(
        "return document.querySelectorAll('button.price').length;")
    print(number)
    return number
def Click_Install_Button(browser, category_root_address):
    """Click install buttons and record apps that request location access.

    For every second ``button.price`` element (indices 1, 3, 5, ...) the
    button is clicked, the permission descriptions shown on the page are
    scanned for the two location permissions, and a row is inserted into the
    ``app_info`` table when at least one of them is present.

    Args:
        browser: Selenium WebDriver instance positioned on the category page.
        category_root_address: Category URL; its category name is stored
            with each inserted row.

    Returns:
        None. Progress and errors are printed.
    """
    # Builds a string with one 'p' per precise-location permission and one
    # 'a' per approximate-location permission found on the page.
    get_permissions_code = """var permissions = document.querySelectorAll('.perm-description');
    var precise_locaton = 'precise location (GPS and network-based)';
    var approximate_location = 'approximate location (network-based)';
    var ways = '';
    for (var perm in permissions) {
        if (permissions[perm].innerHTML == precise_locaton) {
            ways += 'p';
        } else if (permissions[perm].innerHTML == approximate_location) {
            ways += 'a';
        }
    }
    return ways;"""

    # get all install button objects
    get_button_list_code = """return document.querySelectorAll('button.price');"""
    button_list = browser.execute_script(get_button_list_code)
    numbers_of_button = len(button_list)
    count = 0
    sleep(3)

    category = Get_Category(category_root_address)
    get_app_address_code = """var app_address_list = document.querySelectorAll("h2 a");var list = [];
    for (var i = 0; i < app_address_list.length; i++) {list.push(app_address_list[i]['href']);} return list;"""
    address_list = browser.execute_script(get_app_address_code)

    number_of_i_want = 0
    # Placeholders let sqlite3 do the quoting, so an apostrophe in scraped
    # page text can neither break nor alter the statement. The column name
    # 'categroy' is kept exactly as the original statement spelled it; the
    # table schema is not visible here.
    insert_sql = u"""insert into app_info (categroy, name, link, get_geo_ways) values (?, ?, ?, ?)"""

    conndb = Conn_DB()
    try:
        db_cursor = conndb.cursor()
        # NOTE(review): only odd indices are clicked - presumably each app
        # renders two matching buttons; confirm against the page markup.
        for index in range(1, numbers_of_button, 2):
            # Reset per iteration so the error handler below can always
            # print it, even when fetching the permissions failed.
            perms = None
            try:
                webdriver.ActionChains(browser).move_to_element(button_list[index]).click(button_list[index]).perform()
                sleep(3.5)
                count += 1
            except IndexError:
                print("Out of index")
                break

            try:
                print("Count  %s" % count)
                perms = browser.execute_script(get_permissions_code)
                sleep(2)
                appname = browser.execute_script(
                    "return document.querySelector('.purchase-header .title').innerHTML;")
                address = address_list[count - 1]
                print(u"App id is:  %s Perm is:  %s Address is:  %s"
                      % (appname, perms, address))
                if perms:
                    db_cursor.execute(
                        insert_sql, (category, appname, address, perms))
                    conndb.commit()
                    number_of_i_want += 1
            except Exception as e:
                # Best effort: report the failure and go on to the next app.
                # As in the original, the cancel button is not clicked here.
                print("Error for  %s Number is  %s Pers is %s"
                      % (e, count, perms))
                continue

            # click the cancel button to dismiss the purchase dialog
            browser.execute_script(
                "document.querySelector('#purchase-cancel-button').click();")
            sleep(1)

        print("compary  %s %s I want : %s"
              % (count, numbers_of_button, number_of_i_want))
        db_cursor.close()
    finally:
        # Close the connection even when an unexpected error escapes.
        conndb.close()
# print browser.execute_script()
if __name__ == '__main__':
    # The original script used ``root_address`` without ever defining it.
    # Take the category URL from the command line instead.
    if len(sys.argv) != 2:
        sys.exit("usage: %s <category-url>" % sys.argv[0])
    root_address = sys.argv[1]

    driver = webdriver.Chrome()
    try:
        Login_Google(driver, root_address)
        Load_All_Apps(driver)
        Click_Install_Button(driver, root_address)
    finally:
        # Always shut the browser down, even if a step above failed.
        driver.quit()

    # Completion marker, written only when every step above succeeded.
    with open("./res.txt", "wb") as fd:
        fd.write(b"over")
|
原文:http://www.cnblogs.com/jaw-crusher/p/3669387.html
内容总结
以上是互联网集市为您收集整理的使用selenium做简单爬虫的实例全部内容,希望文章能够帮你解决使用selenium做简单爬虫的实例所遇到的程序开发问题。 如果觉得互联网集市技术教程内容还不错,欢迎将互联网集市网站推荐给程序员好友。
内容备注
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 gblab@vip.qq.com 举报,一经查实,本站将立刻删除。
内容手机端
扫描二维码推送至手机访问。