使用PhantomJS(无界面浏览器)爬取斗鱼房间列表

下载合适版本的PhantomJS,下载地址:http://phantomjs.org/download.html
解压后进入bin目录,将phantomjs.exe放到与代码文件相同的目录下;或者在代码 self.driver = webdriver.PhantomJS() 的括号中填入phantomjs.exe的绝对路径,例如 webdriver.PhantomJS(executable_path=r'C:\phantomjs\bin\phantomjs.exe')。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#coding:utf-8

import unittest
from selenium import webdriver
from bs4 import BeautifulSoup

import sys

# Make modules in the parent directory (relative to the working directory)
# importable.
sys.path.append("..")

# Python 2 only: force the implicit str/unicode conversion codec to UTF-8 so
# that the scraped (non-ASCII) titles can be written to a plain file object.
# reload(sys) is needed because the interpreter removes
# sys.setdefaultencoding during start-up. Neither name exists on Python 3,
# where the default encoding is already UTF-8, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")

class seleniumTest(unittest.TestCase):
    """Scrape the room titles of the Douyu directory with PhantomJS.

    The single test walks the paginated listing at
    ``http://www.douyu.com/directory/all`` and appends the text of every
    ``<h3 class="ellipsis">`` element to ``text.txt``, one title per line.
    """

    def setUp(self):
        """Start the PhantomJS WebDriver session used by the test."""
        # Initialise the webdriver. Called without arguments, so the
        # phantomjs executable must be discoverable by selenium.
        self.driver = webdriver.PhantomJS()

    def testEle(self):
        """Append every room title, page by page, to ``text.txt``."""
        driver = self.driver
        # driver.get loads the page in the browser; page_source then holds
        # the markup after the browser has finished loading it.
        driver.get('http://www.douyu.com/directory/all')
        # NOTE(review): 'xml' selects BeautifulSoup's XML parser although the
        # page is HTML. Left unchanged because switching parsers can alter
        # how the class attribute is matched -- confirm before changing.
        soup = BeautifulSoup(driver.page_source, 'xml')
        while True:
            titles = soup.find_all('h3', {'class': 'ellipsis'})
            # Open the output file once per page rather than once per title.
            with open('text.txt', 'a') as f:
                for title in titles:
                    f.write(title.get_text().strip().replace("\n", "") + '\n')
            # Leave the loop when the "next page" control is disabled,
            # i.e. the last page has been processed.
            if 'shark-pager-disable-next' in driver.page_source:
                break
            # Click through to the next page and parse it.
            driver.find_element_by_class_name('shark-pager-next').click()
            # NOTE(review): page_source is read immediately after the click;
            # if the pager updates asynchronously this may re-read the old
            # page -- confirm whether an explicit wait is needed.
            soup = BeautifulSoup(driver.page_source, 'xml')

    def tearDown(self):
        """Shut down the browser and print the completion banner."""
        # Without quit() the PhantomJS process started in setUp is left
        # running after the test finishes.
        self.driver.quit()
        # Single-argument parenthesised form prints identically on
        # Python 2 and Python 3.
        print('=======================完成啦=====================')

# Run the test case through unittest's runner when the file is executed
# directly as a script (not when it is imported).
if __name__ == "__main__":
    unittest.main()