Date:2016-12-6
By:Black Crow
前言:
本次作业为课程第五部分的作业:模拟登录豆瓣。主要使用 selenium 的 webdriver 模拟登录,并使用 lxml 通过 XPath 定位并提取页面内容。
作业效果:
我的代码:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import lxml.html
def _item_or_none(items, index=0):
    """Return ``items[index]``, or ``None`` when the list is too short."""
    return items[index] if len(items) > index else None


def get_douban_info():
    """Log in to Douban through a Chrome WebDriver and print scraped page data.

    Steps:
      1. Open the login page and fill in the account and password fields.
      2. If a captcha image is present, read the captcha text from stdin
         (``input``) and fill it in.
      3. Submit the form by sending RETURN to the account field.
      4. Collect every ``href`` found under ``div.usr-pic`` on the resulting
         page, visit each one, and print a dict with the keys ``url``,
         ``location``, ``date``, ``img`` and ``intro``.

    Fields whose element is missing on a page are reported as ``None``
    instead of aborting the whole run. The browser session is always shut
    down, even when a step raises.

    Returns:
        None. Results are written to stdout with ``print``.
    """
    login_url = 'https://accounts.douban.com/login'
    driver = webdriver.Chrome()
    try:
        driver.get(login_url)

        # Clear any pre-filled text before typing the credentials.
        account = driver.find_element_by_id('email')
        account.clear()
        account.send_keys('********')  # placeholder: replace with your real account

        password = driver.find_element_by_id('password')
        password.clear()
        password.send_keys('***********')  # placeholder: replace with your real password

        # find_elements_* returns an empty list when nothing matches, whereas
        # find_element_* raises NoSuchElementException; the plural form is
        # what makes the "no captcha" path reachable.
        if driver.find_elements_by_id('captcha_image'):
            captcha_field = driver.find_element_by_id('captcha_field')
            captcha_field.clear()
            captcha_field.send_keys(input('captcha is:'))  # typed in manually

        # Submit the form whether or not a captcha was required.
        account.send_keys(Keys.RETURN)

        # Parse the page shown after submitting so it can be queried with XPath.
        landing = lxml.html.fromstring(driver.page_source)
        links = landing.xpath('//div[@class="usr-pic"]/a/@href')

        for url in links:
            driver.get(url)
            page = lxml.html.fromstring(driver.page_source)

            # Every xpath() call below returns a list of strings.
            locations = page.xpath('//div[@class="user-info"]/a/text()')
            dates = page.xpath('//div[@class="pl"]/text()')
            imgs = page.xpath('//div[@class="basic-info"]/img/@src')
            intros = page.xpath('//span[@id="intro_display"]/text()')

            # Strip surrounding whitespace and drop embedded newlines and
            # non-breaking spaces from each intro fragment.
            new_intro = [
                text.strip().replace('\n', '').replace('\xa0', '')
                for text in intros
            ]

            data = {
                'url': url,
                'location': _item_or_none(locations),
                # The second text node of div.pl is the one reported as the date.
                'date': _item_or_none(dates, 1),
                'img': _item_or_none(imgs),
                'intro': new_intro,
            }
            print(data)
    finally:
        # quit() closes every window and ends the WebDriver session, so the
        # browser is released even when scraping fails part-way.
        driver.quit()
# Run the scraper immediately when this file is executed.
get_douban_info()
#### 总结:
> 1. 手动输入验证码的方式效率太低,急需解决验证码自动识别与输入的问题。