Python网络爬虫实战之九:Selenium进阶操作与爬取京东商品评论

0.307字数 555阅读 1745

目录:Python网络爬虫实战系列

正文:

一、Selenium进阶操作

1、回顾 Selenium 打开有界面的浏览器

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Open a visible Chrome window, search Baidu for "Python", wait for the
# results container, then print the URL, cookies and page source.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    # Named search_box (not `input`) so the builtin input() is not shadowed.
    search_box = browser.find_element_by_id('kw')
    search_box.send_keys('Python')
    search_box.send_keys(Keys.ENTER)
    # Poll for up to 10 seconds until the results container is in the DOM.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    # quit() ends the whole WebDriver session (every window and the
    # chromedriver process); close() would only close the current window.
    browser.quit()

2、回顾 Selenium 打开无界面的浏览器

from selenium import webdriver

# Fetch the Baidu home page with headless Chrome and print its HTML.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # headless mode: no visible window
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(chrome_options=chrome_options)
try:
    driver.get('https://www.baidu.com')
    print(driver.page_source)
finally:
    # Always end the session: a headless browser has no window the user
    # could close by hand, so a failure above would otherwise leak it.
    driver.quit()

3、页面交互:模拟人工在淘宝上搜索

from selenium import webdriver
import time

# Simulate a person using Taobao's search box: type a query, pause,
# replace it with another query, then click the search form's button area.
browser = webdriver.Chrome()
browser.get("http://www.taobao.com")

search_box = browser.find_element_by_id('q')
search_box.send_keys("ipad")
time.sleep(1)  # short pause so the first query is visible before clearing
search_box.clear()
search_box.send_keys("macBook pro")

browser.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]').click()

4、动作链: 模拟人工拖拽图像元素

from selenium import webdriver
from selenium.webdriver import ActionChains

# Drag the #draggable element onto #droppable inside the demo page's
# result iframe using an action chain.
browser = webdriver.Chrome()
browser.get("http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable")
# The demo widgets are looked up after switching into the 'iframeResult' frame.
browser.switch_to.frame('iframeResult')
draggable = browser.find_element_by_css_selector('#draggable')
droppable = browser.find_element_by_css_selector('#droppable')
# Queue the drag-and-drop and execute it in one fluent expression.
ActionChains(browser).drag_and_drop(draggable, droppable).perform()

5、执行JS:模拟人工在知乎上下拉滚动条到页面底部

from selenium import webdriver

# Run JavaScript in the page: scroll to the bottom, then pop up an alert.
browser = webdriver.Chrome()
browser.get("http://www.zhihu.com/explore")
for script in (
    'window.scrollTo(0, document.body.scrollHeight)',
    'alert("To Bottom")',
):
    browser.execute_script(script)

6、获取节点信息:知乎首页中“提问”按钮

from selenium import webdriver

# Locate the element with class 'zu-top-add-question' on Zhihu's explore
# page and print its WebDriver id, location, tag name and size.
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
# Named ask_button (not `input`) so the builtin input() is not shadowed.
ask_button = browser.find_element_by_class_name('zu-top-add-question')
print(ask_button.id)
print(ask_button.location)
print(ask_button.tag_name)
print(ask_button.size)

7、隐式等待

from selenium import webdriver

# Implicit wait: element lookups on this driver poll for up to 10 seconds
# before giving up.
browser = webdriver.Chrome()
browser.implicitly_wait(10)
browser.get('https://www.zhihu.com/explore')
# Named ask_button (not `input`) so the builtin input() is not shadowed.
ask_button = browser.find_element_by_class_name('zu-top-add-question')
print(ask_button)

8、显式等待

# `webdriver` must be imported here: this snippet calls webdriver.Chrome()
# and would otherwise fail with NameError when run on its own.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Explicit wait: block (up to 10 seconds each) until specific conditions hold.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com/')
wait = WebDriverWait(browser, 10)
# Named search_box (not `input`) so the builtin input() is not shadowed.
search_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
print(search_box, button)

9、切换Frame

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# Frame switching demo: look for an element with class 'logo' first inside
# the 'iframeResult' frame, then again after returning to the parent frame.
demo_url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser = webdriver.Chrome()
browser.get(demo_url)

# Lookup while the driver's context is the iframe.
browser.switch_to.frame('iframeResult')
try:
    frame_logo = browser.find_element_by_class_name('logo')
except NoSuchElementException:
    print('NO LOGO in iframeResult')

# Same lookup after stepping back out to the parent frame.
browser.switch_to.parent_frame()
try:
    parent_logo = browser.find_element_by_class_name('logo')
    print(parent_logo)
    print(parent_logo.text)
except NoSuchElementException:
    print('NO LOGO in parent_frame')

10、前进与后退

from selenium import webdriver
import time

# Build up some browsing history, then step back and forward through it.
browser = webdriver.Chrome()
for page in (
    'https://www.baidu.com/',
    'https://www.taobao.com/',
    'https://www.zhihu.com/',
):
    browser.get(page)
browser.back()
time.sleep(1)  # pause before navigating forward again
browser.forward()
browser.close()

11、浏览器的选项卡操作

from selenium import webdriver
import time

# Tab handling: open a second tab via JavaScript, load a page in it, then
# switch back to the first tab and load a different page there.
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')
print(browser.window_handles)

second_tab = browser.window_handles[1]
browser.switch_to.window(second_tab)
browser.get('https://www.taobao.com')
time.sleep(1)

first_tab = browser.window_handles[0]
browser.switch_to.window(first_tab)
browser.get('https://www.zhihu.com/')

二、爬取京东商品评论

1、爬取京东的商品评论——通过打开界面浏览器

from selenium import webdriver
from urllib.parse import quote

# Scrape the first page of review text for the first JD search result,
# using a visible Chrome window.
driver = webdriver.Chrome()  # open the browser
try:
    key = '红酒'  # search keyword
    url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'  # build the search URL
    driver.get(url)  # open the search results page
    # Not a sleep: sets how long later find_element* calls keep polling.
    driver.implicitly_wait(3)
    links = driver.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li/div/div[3]/a')  # product links on this page
    urls = [l.get_attribute('href') for l in links]
    if not urls:
        # Fail with a clear message instead of an IndexError below.
        raise RuntimeError('no product links found for keyword: ' + key)
    url = urls[0]  # first product link (index 0; index 1 would be the second)
    driver.get(url)  # open the product page
    driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()  # click the reviews tab
    # Collect the review paragraphs currently shown.
    comment_list = driver.find_elements_by_xpath('//*[@id="comment-0"]//div/div[2]/p')
    comment_text_list = [c.text for c in comment_list]
    driver.find_element_by_link_text('下一页').click()  # go to the next page of reviews
finally:
    # quit() ends the whole session even if a step above failed;
    # close() would only close the current window.
    driver.quit()

2、爬取京东的商品评论——通过无界面浏览器

from selenium import webdriver
from urllib.parse import quote

# Scrape the first page of review text for the first JD search result,
# using headless Chrome.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(chrome_options=chrome_options)
try:
    key = '红酒'  # search keyword
    url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'  # build the search URL
    driver.get(url)  # open the search results page
    # Not a sleep: sets how long later find_element* calls keep polling.
    driver.implicitly_wait(3)
    links = driver.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li/div/div[3]/a')  # product links on this page
    urls = [l.get_attribute('href') for l in links]
    if not urls:
        # Fail with a clear message instead of an IndexError below.
        raise RuntimeError('no product links found for keyword: ' + key)
    url = urls[0]  # first product link (index 0; index 1 would be the second)
    driver.get(url)  # open the product page
    driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()  # click the reviews tab
    # Collect the review paragraphs currently shown.
    comment_list = driver.find_elements_by_xpath('//*[@id="comment-0"]//div/div[2]/p')
    comment_text_list = [c.text for c in comment_list]
    # driver.find_element_by_link_text('下一页').click()  # TODO: fails with "no such element: Unable to locate element: {"method":"link text","selector":"下一页"}"
finally:
    # A headless browser has no window to close by hand, so always end the
    # session explicitly or the Chrome/chromedriver processes are leaked.
    driver.quit()

3、爬取京东的商品评论——通过封装函数的形式

from selenium import webdriver
from urllib.parse import quote
import pandas as pd
from selenium.common.exceptions import StaleElementReferenceException


def get_page_comment(driver):
    """Return the text of every review paragraph currently matched on the page.

    If reading the elements raises StaleElementReferenceException, the
    failure is printed and the lookup is repeated exactly once; an exception
    raised by that second attempt propagates to the caller.

    Args:
        driver: WebDriver-like object providing ``find_elements_by_xpath``.

    Returns:
        A list of strings, one per matched element.
    """
    comment_xpath = '//*[@id="comment-0"]//div/div[2]/p'

    def _read_texts():
        # Locate the elements and read their text in a single pass.
        return [node.text for node in driver.find_elements_by_xpath(comment_xpath)]

    try:
        return _read_texts()
    except StaleElementReferenceException as msg:
        print(u"get_page_comment异常%s" % msg)
        print(u"重新get_page_comment")
        return _read_texts()


def get_page_all_comment(driver, i):
    """Collect review text from the current page and every following page.

    Repeatedly clicks the '下一页' (next page) link and appends that page's
    reviews until clicking or reading fails, which is treated as the end of
    the review pages.

    Args:
        driver: WebDriver positioned on a product's review list.
        i: Index of the product; used only in the progress message.

    Returns:
        A list of review strings gathered across all visited pages.
    """
    all_content = get_page_comment(driver)
    while True:
        try:
            driver.find_element_by_link_text('下一页').click()
            # Extend in place instead of rebuilding the list on every page.
            all_content.extend(get_page_comment(driver))
        except Exception:
            # Catch Exception rather than using a bare except, so Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the crawl.
            print("没有下一页了 - " + str(i))  # TODO: clicking "next page" sometimes fails to fetch; needs improvement
            break
    return all_content


def get_all_comment(urls, driver, outpath='D:/DataguruPyhton/PythonSpider/images/'):
    """Scrape the reviews of each product URL and save them as CSV files.

    Products are numbered from 1 in iteration order; product ``n`` is
    written to ``outpath + str(n) + '.csv'``.

    Args:
        urls: Iterable of product page URLs.
        driver: WebDriver used to load the pages.
        outpath: String prefix (a directory path ending in a separator by
            default) prepended to each output file name.

    Returns:
        None.
    """
    for i, url in enumerate(urls, start=1):
        driver.get(url)
        # Click the fifth tab of the detail section; presumably the reviews
        # tab, since the same XPath is labelled that way elsewhere in this file.
        detail_tab = driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]')
        detail_tab.click()
        name = driver.find_element_by_xpath('/html/body/div[8]/div/div[2]/div[1]').text
        print("文件%d - %s" % (i, name))
        frame = pd.DataFrame(get_page_all_comment(driver, i))
        frame.to_csv(outpath + str(i) + '.csv')


def get_links(key, driver):
    """Return the product links found on the JD search page for *key*.

    Args:
        key: Search keyword; it is URL-quoted before being put in the query.
        driver: WebDriver used to load and inspect the search page.

    Returns:
        A list with the ``href`` attribute of every matched product anchor.
    """
    search_url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'
    driver.get(search_url)
    # Scroll to the bottom of the results page before looking up the links.
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    # Let the element lookup below poll for up to 3 seconds.
    driver.implicitly_wait(3)
    anchors = driver.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li/div/div[1]/a')
    return [anchor.get_attribute('href') for anchor in anchors]


def main(key):
    """Scrape JD review text for every product found when searching *key*.

    Starts headless Chrome, collects the product links for the keyword and
    writes one CSV of reviews per product via ``get_all_comment``.

    Args:
        key: Search keyword passed to ``get_links``.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # headless mode: no visible window
    driver = webdriver.Chrome(chrome_options=chrome_options)
    try:
        urls = get_links(key, driver)
        get_all_comment(urls, driver, outpath='D:/DataguruPyhton/PythonSpider/images/')
    finally:
        # End the session on success and on failure; otherwise the headless
        # Chrome and chromedriver processes are left running.
        driver.quit()

# Run the crawl only when executed as a script, so importing this module
# does not launch a browser.
if __name__ == "__main__":
    main('红酒')

三、本篇文章中代码的运行环境

  • python 3.6.4
  • selenium 3.8.0
  • google chrome 68.0.3440.106(正式版本) (64 位)
  • chromedriver.exe

推荐阅读更多精彩内容