selenium 综合应用(1)
需求
模拟浏览器,从京东首页开始输入关键词,爬取两页商品的名称、价格和一页评价(只包括文字,不包括视频)。评价包括用户的名称和星数。
网页分析
首先定位搜索框的位置
前面说过商品页的商品是动态加载。不然我们只能得到 30 个商品的数据
定位商品的链接
4. 商品详情页的需要爬取的数据的定位
5. 翻页
代码
from selenium import webdriver
from time import sleep
from lxml import etree
import re
key = input("请输入商品名称:")
page_index = int(input("请输入要爬取的页数:"))
bor = webdriver.Chrome("chromedriver.exe")
bor.get("https://www.jd.com/")
# 定位搜索框
input_key = bor.find_element_by_id("key")
input_key.send_keys(key)
sleep(2)
# 定位按钮
btn = bor.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
bor.execute_script("arguments[0].click()", btn)
sleep(2)
index = 0
while index < page_index:
# 滚轮到底,破解动态加载
bor.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
page = bor.page_source
tree = etree.HTML(page)
# 商品 li 标签的列表
good_lists = tree.xpath('//*[@id="J_goodsList"]/ul/li')
# print(len(good_lists))
for i in range(1, len(good_lists)+1):
# 商品的链接
chain = bor.find_element_by_xpath('//*[@id="J_goodsList"]/ul/li[' + str(i) + ']//a[@target="_blank"]')
# 点击,来到商品详情页
bor.execute_script("arguments[0].click()", chain)
sleep(2)
# 跳转到新打开的窗口
handles = bor.window_handles
bor.switch_to.window(handles[-1])
# 点击商品评价
good_btn = bor.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]')
bor.execute_script("arguments[0].click()", good_btn)
# 滚轮到底,破解动态加载
bor.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
good_tree = etree.HTML(bor.page_source)
good_name = good_tree.xpath('//div[@class="sku-name"]//text()')[0]
good_name = re.sub(r"\s", "", good_name)
good_price = good_tree.xpath('//div[@class="dd"]/span/span[2]//text()')[0] + " 元"
# print(good_price, "\n", good_name)
# print("正在爬取", good_name + "……")
with open("商品.txt", "a", encoding = "utf-8") as fp:
fp.write("商品名称:" + good_name + "\n")
fp.write("商品价格:" + good_price + "\n")
user_name_list = good_tree.xpath('//*[@id="comment-0"]/div/div[1]/div[1]/img/@alt')
star_list = good_tree.xpath('//*[@id="comment-0"]/div/div[2]/div[1]/@class')
conten_listt = good_tree.xpath('//*[@id="comment-0"]/div/div[2]/p//text()')
for i in range(len(user_name_list)):
user_name = user_name_list[i]
star = star_list[i]
content =conten_listt[i]
with open("商品.txt", "a", encoding = "utf-8") as fp:
fp.write("用户评价:" + "\n")
fp.write("\t" + "用户名:" + user_name + "\n")
fp.write("\t" + "星数:" + star + "\n")
fp.write("\t" + content + "\n" + "\n")
# print(good_name, "爬取成功!!!")
sleep(2)
# 关闭当前窗口,并跳转到一开始的窗口
bor.close()
bor.switch_to.window(handles[0])
break
index += 1
# 输入页数并翻页
input_text = bor.find_element_by_xpath('//*[@id="J_bottomPage"]/span[2]/input')
key = index + 1
input_text.clear()
input_text.send_keys(str(key))
sleep(5)
btn = bor.find_element_by_xpath('//*[@id="J_bottomPage"]/span[2]/a')
bor.execute_script("arguments[0].click()", btn)
sleep(2)
bor.quit()