嘘~ 正在从服务器偷取页面 . . .

简易界面爬虫小项目


项目目录

注意事项

  1. 本来是想再做一个界面,在运行时显示当前正在爬取的内容(目前尚未实现)
  2. log 文件本来是打算用来记录日志的,不过因为是单机版本,好像没什么必要(实际上是嫌太麻烦了)

代码演示

lib 文件夹下 main.py 文件

import os,sys

# Resolve the project root (the parent of the folder holding this file) from an
# absolute path, so the result does not depend on the current working
# directory or on how the script was invoked.
path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Put the project root on the module search path so the top-level packages
# (for example ``bin``) can be imported below.
sys.path.append(path)

if __name__ == '__main__':
    from bin.JingDongProject import jing_dong_project

    jing_dong_project()

bin 文件夹下 JingDongProject.py 文件

from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *    # 导入PyQt5部件
from PyQt5.QtWidgets import QMainWindow, QApplication,QLabel,QTableWidgetItem,QPushButton,QLineEdit,QGridLayout,QWidget,QTableWidget

from core.UI.Start import Ui_Dialog

class Window(QDialog, Ui_Dialog):
    """Start-up dialog: a QDialog populated with the Ui_Dialog layout."""

    def __init__(self):
        super(Window, self).__init__()
        # Build the widgets declared in Ui_Dialog onto this dialog.
        self.setupUi(self)

    def setup_ui(self):
        """Placeholder; does nothing."""
        return None


class jing_dong_project:
    """Application entry point: constructing it shows the start dialog and runs the GUI."""

    def __init__(self):
        # The application object is created before any widget.
        application = QApplication([])

        start_dialog = Window()
        start_dialog.show()

        # Enter the Qt event loop.
        application.exec_()

core 文件夹下 Spider 文件夹下 spider.py 文件

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.EdgeOptions import EdgeOptions
from conf import Settings
from time import sleep
from lxml import etree
import os

class Spider:
    def __init__(self, goods_name, paixu, goods_page, comment_page):
        self.goods_name = goods_name
        self.paixu = paixu
        self.goods_page = goods_page
        self.comment_page = comment_page

        self.run()

    def run(self):
        self.start_web()

        self.search_goods()

        self.Paixu()

        self.goods_message()

        self.driver.close()

    def start_web(self):
        """Start a headless Edge browser and load the configured start URL."""
        # Headless mode: the browser runs without a visible window.
        headless_flags = [r"--headless", r"--disable-gpu"]
        edge_options = EdgeOptions()
        edge_options.add_arguments(headless_flags)

        capabilities = edge_options.to_capabilities()
        self.driver = webdriver.Edge(capabilities=capabilities)

        # Request the start page, then pause for the configured time.
        self.driver.get(Settings.URL)
        sleep(Settings.SLEEP_TIME)

    def page_read(self):
        """Parse the browser's current page source into ``self.tree`` for XPath queries."""
        self.tree = etree.HTML(self.driver.page_source)

    def search_goods(self):
        """Type the product name into the search box and submit the search."""
        search_box = self.driver.find_element_by_id("key")
        search_box.send_keys(self.goods_name)
        sleep(Settings.SLEEP_TIME)

        # Trigger the search button's click through JavaScript.
        search_button = self.driver.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
        self.driver.execute_script('arguments[0].click()', search_button)
        sleep(Settings.SLEEP_TIME)

    def goods_message(self):
        """Walk the search-result pages and scrape every product on each page.

        For each of the first ``self.goods_page`` result pages: scroll to the
        bottom so the whole page is loaded, scrape the page through
        ``goods_price_name`` (which also visits each product's comments), then
        move to the next result page. Navigation stops after the last requested
        page, or earlier when the page has no pager.
        """
        total_pages = int(self.goods_page)
        for page in range(1, total_pages + 1):
            print(page)
            # Scroll to the bottom so all items of this page are loaded.
            self.driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(Settings.SLEEP_TIME)

            self.goods_price_name()

            if page == total_pages:
                # Last requested page: do not navigate any further.
                break

            self.page_read()
            # Links in the pager; the last one is the "next page" link.
            pager_links = self.tree.xpath('//*[@id="J_bottomPage"]/span[1]/a')
            if not pager_links:
                # No pager found, so there is no further page to visit.
                break

            next_page = self.driver.find_element_by_xpath(
                '//*[@id="J_bottomPage"]/span[1]/a[{}]'.format(len(pager_links)))
            self.driver.execute_script('arguments[0].click()', next_page)
            # Give the next result page time to load before working on it.
            sleep(Settings.SLEEP_TIME)

    def goods_price_name(self):
        """Scrape the price and name of every product on the current result page.

        For each product, the price and the whitespace-stripped name are stored
        in ``self.goods_price`` and ``self.goods_name`` (``content`` writes them
        out with every comment). The product's detail page is then opened and
        its comments are scraped through ``comment``. A product whose price or
        name cannot be found gets an empty string rather than the previous
        product's value.
        """
        item_template = '//*[@id="J_goodsList"]/ul/li[{}]'
        # Characters removed from the product name: space, newline, tab.
        strip_table = str.maketrans("", "", " \n\t")

        self.page_read()
        # Number of products on this page.
        length = len(self.tree.xpath('//*[@id="J_goodsList"]/ul/li'))

        for count in range(1, length + 1):
            print(count)

            self.page_read()
            item = item_template.format(count)

            # The price is looked up under div[3] first, then under div[2].
            price_nodes = (self.tree.xpath(item + '/div/div[3]/strong/i/text()')
                           or self.tree.xpath(item + '/div/div[2]/strong/i/text()'))
            self.goods_price = price_nodes[0] if price_nodes else ""
            print(self.goods_price)

            # The name is looked up under div[4] first, then under div[3].
            name_nodes = (self.tree.xpath(item + '/div/div[4]/a/em/text()')
                          or self.tree.xpath(item + '/div/div[3]/a/em/text()'))
            self.goods_name = "".join(name_nodes).translate(strip_table)
            print(self.goods_name)

            # Open the product's detail page.
            good_detail = self.driver.find_element_by_xpath(item + '/div/div[1]/a')
            self.driver.execute_script('arguments[0].click()', good_detail)
            sleep(Settings.SLEEP_TIME)

            self.comment()

    def comment(self):
        """Scrape up to ``self.comment_page`` pages of comments for the opened product.

        Switches to the most recently opened window, opens the comment tab and
        writes every comment through ``content``. The product window is closed
        exactly once when the method ends, including when an error occurs, and
        the driver is switched back to the first window.
        """
        self.new_window()
        try:
            # Open the comment tab and scroll down, then wait for the comments.
            comment_box = self.driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]')
            self.driver.execute_script('arguments[0].click()', comment_box)
            self.driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(Settings.SLEEP_TIME)

            for _ in range(int(self.comment_page)):
                self.page_read()
                comment_counter = len(
                    self.tree.xpath('//*[@id="comment-0"]/div[@class="comment-item"]'))
                if comment_counter == 0:
                    # Too few comments or none at all: nothing more to read.
                    break

                for count in range(1, comment_counter + 1):
                    # content() re-reads the page itself before extracting.
                    self.content(count)

                # Links in the comment pager; the last one is "next page".
                page_list_length = len(self.tree.xpath('//*[@id="comment-0"]/div[12]/div/div/a'))
                if page_list_length == 0:
                    break

                next_page = self.driver.find_element_by_xpath(
                    '//*[@id="comment-0"]/div[12]/div/div/a[{}]'.format(page_list_length))
                self.driver.execute_script('arguments[0].click()', next_page)
                # Give the next comment page time to load before reading it.
                sleep(Settings.SLEEP_TIME)
        finally:
            # Single exit point: closing twice would close the result window too.
            self.close_window()

    def new_window(self):
        # 获取窗口,返回为一个列表
        handles = self.driver.window_handles
        # 最后一个是新打开的窗口,跳转到这个窗口
        self.driver.switch_to.window(handles[-1])

    def close_window(self):
        # 关闭新打开的窗口
        self.driver.close()
        handles = self.driver.window_handles
        self.driver.switch_to.window(handles[0])

    def content(self, count):
        """Append one comment as a ``|``-separated line to the download file.

        The line holds: product name, price, user name, membership label,
        star class suffix and comment text. Pieces missing from the page are
        written as empty strings ("None" for the membership label) instead of
        raising IndexError, and line breaks inside the comment text are
        replaced by spaces so that each record stays on one line.

        Args:
            count: 1-based index of the comment ``div`` inside ``#comment-0``.
        """
        # Re-read the page source as it is right now.
        self.page_read()

        base = '//*[@id="comment-0"]/div[{}]'.format(count)

        def pick(nodes, index=0, default=""):
            """Return nodes[index], or default when the list is too short."""
            return nodes[index] if len(nodes) > index else default

        # User name: the second text node of the user block.
        username = pick(self.tree.xpath(base + '/div[1]/div[1]/text()'), 1).strip(" ")

        # Membership label; "None" when the user has none.
        huiyuan = pick(self.tree.xpath(base + '/div[1]/div[2]/a/text()'), default="None")

        # Comment text, flattened to a single line.
        content = pick(self.tree.xpath(base + '/div[2]/p/text()'))
        content = content.replace("\r", " ").replace("\n", " ")

        # Star rating: the last five characters of the element's class value.
        star = pick(self.tree.xpath(base + '/div[2]/div[1]/@class'))[-5:]

        record = "|".join(
            (self.goods_name, self.goods_price, username, huiyuan, star, content))
        with open(Settings.DOWNLOAD_PATH, "a", encoding="utf-8") as f:
            f.write(record + "\n")

    def Paixu(self):
        """Click the sort tab that matches ``self.paixu`` on the result page.

        An unknown sort name falls back to the first tab ("综合") instead of
        producing an XPath with ``None`` as its index.
        """
        # Sort name -> 1-based position of its link in the filter bar.
        tab_positions = {
            "综合": 1,
            "销量": 2,
            "评论数": 3,
            "新品": 4,
        }
        num = tab_positions.get(self.paixu, 1)
        xpath = '//*[@id="J_filter"]/div[1]/div[1]/a[{}]'.format(num)
        sort_tab = self.driver.find_element_by_xpath(xpath)
        self.driver.execute_script('arguments[0].click()', sort_tab)
        sleep(Settings.SLEEP_TIME)

UI 文件夹下 Start.py 文件

from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *    # 导入PyQt5部件
from PyQt5.QtWidgets import QMainWindow, QApplication,QLabel,QTableWidgetItem,QPushButton,QLineEdit,QGridLayout,QWidget,QTableWidget

class Ui_Dialog(object):
    """Layout of the start dialog: a title text box and a button that opens the user-selection dialog."""

    def setupUi(self, Dialog):
        """Create the widgets on ``Dialog`` and wire the button to ``user_select``."""
        if not Dialog.objectName():
            Dialog.setObjectName(u"Dialog")
        Dialog.resize(421, 320)
        # Text box that shows the title (its HTML is set in retranslateUi).
        self.textEdit = QTextEdit(Dialog)
        self.textEdit.setObjectName(u"textEdit")
        self.textEdit.setGeometry(QRect(70, 70, 301, 61))

        self.pushButton = QPushButton(Dialog)
        self.pushButton.setObjectName(u"pushButton")
        self.pushButton.setGeometry(QRect(180, 190, 91, 41))
        self.pushButton.clicked.connect(self.user_select) # open the user-selection dialog

        self.retranslateUi(Dialog)

        QMetaObject.connectSlotsByName(Dialog)
    # setupUi

    def retranslateUi(self, Dialog):
        """Set the window title, the title HTML and the button caption."""
        Dialog.setWindowTitle(QCoreApplication.translate("Dialog", u"Dialog", None))
        # The escaped title text reads "商品评论爬取" (product comment crawler).
        self.textEdit.setHtml(QCoreApplication.translate("Dialog", u"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n"
"<html><head><meta name=\"qrichtext\" content=\"1\" /><style type=\"text/css\">\n"
"p, li { white-space: pre-wrap; }\n"
"</style></head><body style=\" font-family:'SimSun'; font-size:9pt; font-weight:400; font-style:normal;\">\n"
"<p align=\"center\" style=\" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;\"><span style=\" font-size:22pt; font-weight:600;\">\u5546\u54c1\u8bc4\u8bba\u722c\u53d6</span></p></body></html>", None))
        # The button caption reads "开始使用" (start using).
        self.pushButton.setText(QCoreApplication.translate("Dialog", u"\u5f00\u59cb\u4f7f\u7528", None))
    # retranslateUi

    def user_select(self):
        """Create and show the user-selection dialog."""
        # Imported here rather than at module level.
        from core.UI.User_Select import user_selelct_window
        # Stored on self — presumably so the dialog object stays referenced
        # while it is shown; confirm before changing.
        self.window = user_selelct_window()
        self.window.show()

UI 文件夹下 User_Select.py

from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *    # 导入PyQt5部件
from PyQt5.QtWidgets import QMainWindow, QApplication,QLabel,QTableWidgetItem,QPushButton,QLineEdit,QGridLayout,QWidget,QTableWidget


class Ui_Dialog(object):
    def setupUi(self, Dialog):
        """Create the crawl-settings form on ``Dialog`` and wire the button to ``run``."""
        if not Dialog.objectName():
            Dialog.setObjectName(u"Dialog")
        Dialog.resize(619, 464)
        # Button that starts the crawl.
        self.pushButton = QPushButton(Dialog)
        self.pushButton.setObjectName(u"pushButton")
        self.pushButton.setGeometry(QRect(490, 50, 111, 61))
        self.pushButton.clicked.connect(self.run)

        # Text box that shows the title (its HTML is set in retranslateUi).
        self.textEdit = QTextEdit(Dialog)
        self.textEdit.setObjectName(u"textEdit")
        self.textEdit.setGeometry(QRect(60, 50, 391, 61))

        # Label and input for the product name.
        self.label = QLabel(Dialog)
        self.label.setObjectName(u"label")
        self.label.setGeometry(QRect(70, 190, 131, 51))

        self.textEdit_2 = QTextEdit(Dialog)
        self.textEdit_2.setObjectName(u"textEdit_2")
        self.textEdit_2.setGeometry(QRect(210, 190, 271, 41))

        # Label and combo box for the sort order; the four item texts are
        # filled in by retranslateUi.
        self.label_2 = QLabel(Dialog)
        self.label_2.setObjectName(u"label_2")
        self.label_2.setGeometry(QRect(70, 260, 131, 51))

        self.comboBox = QComboBox(Dialog)
        self.comboBox.addItem("")
        self.comboBox.addItem("")
        self.comboBox.addItem("")
        self.comboBox.addItem("")
        self.comboBox.setObjectName(u"comboBox")
        self.comboBox.setGeometry(QRect(220, 270, 111, 31))

        # Label and input for the number of product pages.
        self.label_3 = QLabel(Dialog)
        self.label_3.setObjectName(u"label_3")
        self.label_3.setGeometry(QRect(70, 320, 131, 51))

        self.textEdit_3 = QTextEdit(Dialog)
        self.textEdit_3.setObjectName(u"textEdit_3")
        self.textEdit_3.setGeometry(QRect(210, 330, 341, 31))

        # Label and input for the number of comment pages.
        self.label_4 = QLabel(Dialog)
        self.label_4.setObjectName(u"label_4")
        self.label_4.setGeometry(QRect(30, 380, 171, 51))

        self.textEdit_4 = QTextEdit(Dialog)
        self.textEdit_4.setObjectName(u"textEdit_4")
        self.textEdit_4.setGeometry(QRect(210, 390, 341, 31))

        self.retranslateUi(Dialog)

        QMetaObject.connectSlotsByName(Dialog)
    # setupUi

    def retranslateUi(self, Dialog):
        """Set every user-visible text of the crawl-settings form."""
        Dialog.setWindowTitle(QCoreApplication.translate("Dialog", u"Dialog", None))
        # Button caption: "开始爬取" (start crawling).
        self.pushButton.setText(QCoreApplication.translate("Dialog", u"\u5f00\u59cb\u722c\u53d6", None))
        # Title text: "商品评论爬取" (product comment crawler).
        self.textEdit.setHtml(QCoreApplication.translate("Dialog", u"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n"
"<html><head><meta name=\"qrichtext\" content=\"1\" /><style type=\"text/css\">\n"
"p, li { white-space: pre-wrap; }\n"
"</style></head><body style=\" font-family:'SimSun'; font-size:9pt; font-weight:400; font-style:normal;\">\n"
"<p align=\"center\" style=\" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;\"><span style=\" font-size:24pt; font-weight:600;\">\u5546\u54c1\u8bc4\u8bba\u722c\u53d6</span></p></body></html>", None))
        # Label: "要爬取的商品名称" (product name to crawl).
        self.label.setText(QCoreApplication.translate("Dialog", u"\u8981\u722c\u53d6\u7684\u5546\u54c1\u540d\u79f0", None))
        # Label: "选择页面排序方式" (choose the page sort order).
        self.label_2.setText(QCoreApplication.translate("Dialog", u"\u9009\u62e9\u9875\u9762\u6392\u5e8f\u65b9\u5f0f", None))
        # Sort options, in order: "综合", "销量", "评论数", "新品".
        self.comboBox.setItemText(0, QCoreApplication.translate("Dialog", u"\u7efc\u5408", None))
        self.comboBox.setItemText(1, QCoreApplication.translate("Dialog", u"\u9500\u91cf", None))
        self.comboBox.setItemText(2, QCoreApplication.translate("Dialog", u"\u8bc4\u8bba\u6570", None))
        self.comboBox.setItemText(3, QCoreApplication.translate("Dialog", u"\u65b0\u54c1", None))

        # Label: "要爬取的商品页数" (number of product pages to crawl).
        self.label_3.setText(QCoreApplication.translate("Dialog", u"\u8981\u722c\u53d6\u7684\u5546\u54c1\u9875\u6570", None))
        # Initial content of the page-count box is a hint sentence, not a number.
        self.textEdit_3.setHtml(QCoreApplication.translate("Dialog", u"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n"
"<html><head><meta name=\"qrichtext\" content=\"1\" /><style type=\"text/css\">\n"
"p, li { white-space: pre-wrap; }\n"
"</style></head><body style=\" font-family:'SimSun'; font-size:9pt; font-weight:400; font-style:normal;\">\n"
"<p style=\" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;\">\u6700\u597d\u4fdd\u8bc1\u8be5\u5546\u54c1\u9875\u6570 \u5927\u4e8e\u7b49\u4e8e \u586b\u5199\u7684\u9875\u6570</p></body></html>", None))
        # Label: "要爬取的商品评论数页数" (number of comment pages to crawl).
        self.label_4.setText(QCoreApplication.translate("Dialog", u"\u8981\u722c\u53d6\u7684\u5546\u54c1\u8bc4\u8bba\u6570\u9875\u6570", None))
        # Initial content of the comment-page box is the same hint sentence.
        self.textEdit_4.setHtml(QCoreApplication.translate("Dialog", u"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n"
"<html><head><meta name=\"qrichtext\" content=\"1\" /><style type=\"text/css\">\n"
"p, li { white-space: pre-wrap; }\n"
"</style></head><body style=\" font-family:'SimSun'; font-size:9pt; font-weight:400; font-style:normal;\">\n"
"<p style=\" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;\">\u6700\u597d\u4fdd\u8bc1\u8be5\u5546\u54c1\u9875\u6570 \u5927\u4e8e\u7b49\u4e8e \u586b\u5199\u7684\u9875\u6570</p></body></html>", None))
    # retranslateUi

    def run(self):
        """Read the form, show the wait dialog and run the crawl.

        The two page counts are validated before the crawl starts, because
        Spider converts them with int() and the input boxes initially hold a
        hint sentence rather than a number. The crawl itself runs
        synchronously in this method; the wait dialog is closed once it ends,
        whether it succeeded or raised.
        """
        name = self.textEdit_2.toPlainText().strip()  # product name to crawl
        paixu = self.comboBox.currentText()  # sort order of the result page
        good_page = self.textEdit_3.toPlainText().strip()  # number of product pages
        comment_page = self.textEdit_4.toPlainText().strip()  # number of comment pages

        try:
            pages_ok = int(good_page) >= 1 and int(comment_page) >= 1
        except ValueError:
            pages_ok = False
        if not name or not pages_ok:
            QMessageBox.warning(None, "输入有误", "请填写商品名称，且页数必须为正整数")
            return

        from core.UI.Wait import user_wait_window
        self.window = user_wait_window()
        self.window.show()
        # The crawl below blocks this thread, so let Qt handle the pending
        # show/paint events first; otherwise the wait dialog stays blank.
        QApplication.processEvents()

        from core.Spider.spider import Spider
        try:
            Spider(name, paixu, good_page, comment_page)
        finally:
            self.window.close()

class user_selelct_window(QDialog, Ui_Dialog):
    """User-selection dialog: a QDialog populated with the Ui_Dialog form."""

    def __init__(self):
        super(user_selelct_window, self).__init__()
        # Build the form widgets onto this dialog.
        self.setupUi(self)

UI 文件夹下的 Wait.py

from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *    # 导入PyQt5部件
from PyQt5.QtWidgets import QMainWindow, QApplication,QLabel,QTableWidgetItem,QPushButton,QLineEdit,QGridLayout,QWidget,QTableWidget

class Ui_Dialog(object):
    """Layout of the wait dialog: a single text box with a fixed message."""

    def setupUi(self, Dialog):
        """Create the message text box on ``Dialog``."""
        if not Dialog.objectName():
            Dialog.setObjectName(u"Dialog")
        Dialog.resize(614, 187)
        # Text box that shows the message (its HTML is set in retranslateUi).
        self.textEdit_2 = QTextEdit(Dialog)
        self.textEdit_2.setObjectName(u"textEdit_2")
        self.textEdit_2.setGeometry(QRect(70, 60, 481, 61))

        self.retranslateUi(Dialog)

        QMetaObject.connectSlotsByName(Dialog)
    # setupUi

    def retranslateUi(self, Dialog):
        """Set the window title and the message HTML."""
        Dialog.setWindowTitle(QCoreApplication.translate("Dialog", u"Dialog", None))
        # The escaped message reads "正在爬取，请耐心等待" (crawling, please wait patiently).
        self.textEdit_2.setHtml(QCoreApplication.translate("Dialog", u"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n"
"<html><head><meta name=\"qrichtext\" content=\"1\" /><style type=\"text/css\">\n"
"p, li { white-space: pre-wrap; }\n"
"</style></head><body style=\" font-family:'SimSun'; font-size:9pt; font-weight:400; font-style:normal;\">\n"
"<p align=\"center\" style=\" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;\"><span style=\" font-size:22pt; font-weight:600;\">\u6b63\u5728\u722c\u53d6\uff0c\u8bf7\u8010\u5fc3\u7b49\u5f85</span></p></body></html>", None))
    # retranslateUi

class user_wait_window(QDialog, Ui_Dialog):
    """Wait dialog: a QDialog populated with the Ui_Dialog message box."""

    def __init__(self):
        super(user_wait_window, self).__init__()
        # Build the message widget onto this dialog.
        self.setupUi(self)

conf 文件夹下的 Settings.py

# Page the spider opens first.
URL = "https://www.jd.com/"
# URL = "https://search.jd.com/Search?keyword=%E8%A1%80%E6%BA%90&wq=%E8%A1%80%E6%BA%90&pvid=068dc4b95f284265bec11485d5d227ab&page=9&s=241&click=0"

# Seconds the spider sleeps after a browser action.
SLEEP_TIME = 5

# Text file the spider appends the scraped comment records to.
DOWNLOAD_PATH = r'C:\Users\ASUS\Desktop\code\pyqt5_JingDong\download\download.txt'

# Workbook written by ToExcel.py. Must be a raw string: in a normal literal
# "\U" starts a Unicode escape, which makes this line a syntax error.
# NOTE(review): ToExcel.py saves with openpyxl, which writes the .xlsx format;
# confirm whether the .xls extension is intended.
EXCEL_PATH = r"C:\Users\ASUS\Desktop\code\pyqt5_JingDong\download\data.xls"

download 文件夹下的 ToExcel.py 文件

import os,sys

# Project root: the parent of the folder that holds this file.
path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Put the project root on the module search path so ``conf`` can be imported.
sys.path.append(path)


def split_record(line):
    """Split one ``|``-separated download line into at most six columns.

    The trailing line break is removed so it does not end up in the last
    cell. Only the first five separators are split on, so ``|`` characters
    inside the comment text (the sixth column) stay in that column.

    Args:
        line: One line of the download file, in the order
            goods_name|price|username|huiyuan|star|content.

    Returns:
        The list of column values.
    """
    return line.rstrip("\r\n").split("|", 5)


if __name__ == '__main__':
    from conf.Settings import DOWNLOAD_PATH,EXCEL_PATH
    from openpyxl import Workbook

    book = Workbook()
    sheet = book.active

    # Header row, in the same order as the fields of each record.
    title = ("goods_name", "price", "username", "huiyuan", "star", "comment content")

    sheet.append(title)

    if os.path.isfile(DOWNLOAD_PATH):
        with open(DOWNLOAD_PATH, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # Skip blank lines instead of writing empty rows.
                    continue
                sheet.append(split_record(line))

        book.save(EXCEL_PATH)

        print("over!!!")
    else:
        print("can't find the DOANLOAD file!!!")

运行演示


其他

代码已经上传至 github 项目
JingDongSpider


文章作者: New Ass
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 New Ass !
  目录