Python Selenium crawler: crawling JD search page information#
1. What is the Selenium framework?#
- Selenium is a tool for testing web applications. It runs directly in the browser, just like a real user operating it.
- Supported browsers include IE, Firefox, Safari, Google Chrome, Opera, etc., covering multiple platforms.
- It can be used for automated testing and for crawling dynamic pages rendered with JavaScript.
2. Environment configuration#
Here I use the Firefox browser driver, geckodriver. If you are using Google Chrome, you need chromedriver instead.
Download link: geckodriver
Note that the driver version must match your browser version. It is recommended to download the latest driver and update the browser to the latest version as well.
Here I downloaded geckodriver-v0.29.1-win64.zip (1.44 MB).
After decompression you get the geckodriver.exe file; add this executable to the Scripts folder of your Python installation directory. If you are using Anaconda, add it to the Scripts folder under Anaconda3 instead.
Enter geckodriver in cmd, and you will get output like:
(crawler) C:\Users\HP>geckodriver
1630680105974 geckodriver INFO Listening on 127.0.0.1:4444
If this appears, the installation was successful.
Then install the Selenium library itself with the most commonly used Python command: pip install selenium.
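To confirm that the driver and the library work together, you can run a quick smoke test (a minimal sketch; it only assumes Firefox and geckodriver are installed as described above):
# Smoke test: launch Firefox through geckodriver and load the JD home page
from selenium import webdriver

driver = webdriver.Firefox()        # fails here if geckodriver is not found on the PATH
driver.get('https://www.jd.com/')   # open the JD home page
print(driver.title)                 # print the page title to confirm the page loaded
driver.quit()                       # close the browser and end the driver session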
3. Code writing#
3.1 Import relevant libraries#
# Import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
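A note on versions: the code in this article uses the find_element_by_* helpers from Selenium 3. If pip installed Selenium 4 or newer for you, those helpers are deprecated (and removed in recent 4.x releases), and the same lookups go through By. A sketch of the equivalents, not part of the original tutorial:
# Selenium 4+ equivalents of the locator calls used below
from selenium.webdriver.common.by import By

# driver.find_element_by_id('key')                 ->  driver.find_element(By.ID, 'key')
# driver.find_elements_by_class_name('gl-item')    ->  driver.find_elements(By.CLASS_NAME, 'gl-item')
# good.find_element_by_css_selector('.p-name em')  ->  good.find_element(By.CSS_SELECTOR, '.p-name em')
# good.find_element_by_tag_name('a')               ->  good.find_element(By.TAG_NAME, 'a')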
3.2 Simulate browser operation#
# Simulate browser access to JD page to solve anti-crawling
def spider(url, keyword):
    # Initialize the browser
    driver = webdriver.Firefox()
    # Visit the website
    driver.get(url)
    # time.sleep(2)
    try:
        driver.implicitly_wait(10)  # Implicit wait to ensure the nodes are fully loaded; does not block once they are
        # Search
        # Locate the search box
        input_tag = driver.find_element_by_id('key')
        # Simulate keyboard input of the keyword
        input_tag.send_keys(keyword)
        # Simulate pressing the Enter key to search
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        pass
        # time.sleep(10)
        # # Close the browser, always executed
        # driver.close()
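If the search box loads slowly, an explicit wait is an alternative to implicitly_wait: it polls for one specific element rather than applying a global timeout. A minimal sketch (the helper name wait_for_search_box is my own, not part of the original code):
# Alternative: explicitly wait for the search box instead of relying on implicitly_wait
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_search_box(driver, timeout=10):
    # Poll until the element with id="key" is present in the DOM, or raise TimeoutException
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, 'key'))
    )
The returned element can then be used exactly like input_tag above.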
3.3 Fetch the required data#
# Locate the product data to be crawled
def get_goods(driver):
    try:
        # Locate each product
        goods = driver.find_elements_by_class_name('gl-item')
        # Product name, href, price, comments
        for good in goods:
            p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
            detail_url = good.find_element_by_tag_name('a').get_attribute('href')
            price = good.find_element_by_css_selector('.p-price i').text
            p_commit = good.find_element_by_css_selector('.p-commit a').text
            msg = """
            Product: %s
            Link: %s
            Price: %s
            Comments: %s
            """ % (p_name, detail_url, price, p_commit)
            print(msg)
            # with open("")  # a large amount of data could be written to a file here
        # Locate the next-page link (on JD the link text is the Chinese '下一页')
        button = driver.find_element_by_partial_link_text('下一页')
        # Simulate the click operation
        button.click()
        time.sleep(2)
        get_goods(driver)
    except Exception as e:
        print(e)
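The commented-out with open("") line marks where the large amount of data could be written to a file. One possibility is to collect each product as a tuple and append it to a CSV file; the sketch below (the file name results.csv and the helper save_goods are my own assumptions) shows the idea:
# Sketch: append crawled products to a CSV file (file name is an assumption)
import csv

def save_goods(rows, path='results.csv'):
    # rows: a list of (name, url, price, comments) tuples collected in get_goods()
    with open(path, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerows(rows)
get_goods() would then build rows inside its for loop and call save_goods(rows) once per page instead of only printing.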
3.4 Program entry#
if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='jk')
The first argument is the URL of the site to visit; keyword is the search term.
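For example, to search for laptops instead, keep the URL and change only the keyword:
# Example call with a different search keyword
spider('https://www.jd.com/', keyword='laptop')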
4. Complete code#
# Environment configuration
"""
Browser driver geckodriver
1630635571195 geckodriver INFO Listening on 127.0.0.1:4444
"""
# Import libraries
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
# Simulate browser access to JD page to solve anti-crawling
def spider(url, keyword):
    # Initialize the browser
    driver = webdriver.Firefox()
    # Visit the website
    driver.get(url)
    # time.sleep(2)
    try:
        driver.implicitly_wait(10)  # Implicit wait to ensure the nodes are fully loaded; does not block once they are
        # Search
        # Locate the search box
        input_tag = driver.find_element_by_id('key')
        # Simulate keyboard input of the keyword
        input_tag.send_keys(keyword)
        # Simulate pressing the Enter key to search
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        pass
        # time.sleep(10)
        # # Close the browser, always executed
        # driver.close()

# Locate the product data to be crawled
def get_goods(driver):
    try:
        # Locate each product
        goods = driver.find_elements_by_class_name('gl-item')
        # Product name, href, price, comments
        for good in goods:
            p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
            detail_url = good.find_element_by_tag_name('a').get_attribute('href')
            price = good.find_element_by_css_selector('.p-price i').text
            p_commit = good.find_element_by_css_selector('.p-commit a').text
            msg = """
            Product: %s
            Link: %s
            Price: %s
            Comments: %s
            """ % (p_name, detail_url, price, p_commit)
            print(msg)
            # with open("")  # a large amount of data could be written to a file here
        # Locate the next-page link (on JD the link text is the Chinese '下一页')
        button = driver.find_element_by_partial_link_text('下一页')
        # Simulate the click operation
        button.click()
        time.sleep(2)
        get_goods(driver)
    except Exception as e:
        print(e)

if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='jk')