Anti-crawler mechanisms are the techniques and policies a website uses to stop automated tools from scraping its content. Understanding them helps you design more effective crawlers. Below is a detailed look at the common mechanisms and the usual counter-strategies:
- IP blocking: the site bans IP addresses that send too many requests; the usual counter is a pool of rotating proxies.
- User-Agent detection: requests whose User-Agent header looks like a script get rejected; countered by sending realistic, randomized User-Agent strings.
- CAPTCHA: challenges that are easy for humans but hard for programs; handled with OCR or a dedicated solving service.
- JavaScript rendering: the page content is assembled in the browser, so a plain HTTP client sees only an empty shell; handled by driving a real browser, e.g. with Selenium.
- Dynamic URL generation: URLs or request parameters are computed by scripts, so they cannot simply be enumerated and must be reproduced or captured.
- Cookies and session IDs: the server tracks state and rejects clients without valid cookies; handled by reusing one session across requests (see the sketch after this list).
- HTTP request header checks: missing or inconsistent headers such as Referer or Accept-Language mark a request as automated.
- Request rate limiting: too many requests per unit time trigger throttling or bans; handled by spacing requests out (see the sketch after this list).
- Hidden data: the real data is obfuscated in the page or loaded separately by JavaScript rather than shipped in the HTML.
- Legal restrictions: terms of service and local law may prohibit scraping; check robots.txt and the site's terms before you crawl.
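Before the full examples, here is a minimal sketch of the cookie/session and rate-limiting points above. It is an illustration only: the URLs, the User-Agent string, and the delay range are placeholder assumptions, not values from the original examples.

```python
import requests
from random import uniform
from time import sleep

def polite_crawl(urls, min_delay=1.0, max_delay=3.0):
    # A Session keeps cookies across requests, so a session ID the
    # server hands out on the first response is sent back automatically.
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0'})  # placeholder UA
    results = []
    for url in urls:
        try:
            results.append(session.get(url, timeout=5))
        except Exception as e:
            print(f"Failed to fetch {url}: {e}")
        # Random pause between requests to stay under frequency limits
        sleep(uniform(min_delay, max_delay))
    return results

# Placeholder URLs for illustration
pages = polite_crawl(["https://example.com/page1", "https://example.com/page2"])
```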
Next come some concrete code examples demonstrating how to handle common anti-crawler mechanisms, focusing on three aspects: rotating proxies to get around IP blocking, randomizing the User-Agent to pass User-Agent checks, and driving a real browser with Selenium for JavaScript-rendered pages.

Rotating proxies: each request goes out through a randomly chosen proxy from a small pool, so no single IP accumulates enough traffic to get banned.

```python
import requests
from random import choice

# Pool of proxy servers to rotate through
proxies = [
    "http://proxy1.example.com:8080",
    "http://proxy2.example.com:8080",
    "http://proxy3.example.com:8080"
]

def get_with_proxy(url):
    # Pick a random proxy for this request
    proxy = {"http": choice(proxies), "https": choice(proxies)}
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        return response
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return None

response = get_with_proxy("https://example.com")
if response:
    print(response.text)
```

Random User-Agent: the fake_useragent library supplies a realistic browser identification string for each request.

```python
import requests
from fake_useragent import UserAgent

ua = UserAgent()

def get_with_random_ua(url):
    # Send a randomly chosen, realistic User-Agent with each request
    headers = {'User-Agent': ua.random}
    try:
        response = requests.get(url, headers=headers, timeout=5)
        return response
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return None

response = get_with_random_ua("https://example.com")
if response:
    print(response.text)
```

JavaScript rendering: Selenium drives a real Chrome instance, waits for a target element to be rendered, and only then reads the content.

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_with_selenium(url):
    driver = webdriver.Chrome()
    driver.get(url)
    try:
        # Wait up to 10 seconds for a JavaScript-rendered element to appear
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "some_id"))
        )
        print(element.text)
    finally:
        driver.quit()

get_with_selenium("https://example.com")
```
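The dynamic-URL and hidden-data cases from the list often need no browser at all: the page's JavaScript fetches JSON from a backend endpoint, and once you spot that endpoint in the browser's network panel you can call it directly. The sketch below is hypothetical throughout: the endpoint path, parameters, and headers are assumptions for illustration.

```python
import requests

# Hypothetical JSON endpoint found in the browser's network panel
API_URL = "https://example.com/api/items"

def fetch_hidden_data(page=1):
    # Ask for the data the page would normally load via JavaScript
    params = {"page": page, "size": 20}
    headers = {
        "User-Agent": "Mozilla/5.0",           # placeholder UA
        "X-Requested-With": "XMLHttpRequest",  # some backends check for an XHR-like request
        "Referer": "https://example.com/items" # hypothetical referring page
    }
    response = requests.get(API_URL, params=params, headers=headers, timeout=5)
    response.raise_for_status()
    return response.json()

data = fetch_hidden_data()
print(data)
```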
Next, let's further optimize the examples above to improve their stability and efficiency, adding more thorough error handling, retry logic, logging, and a few other practical touches.
Proxy pool with retries and logging:

```python
import requests
from random import choice
import logging
from time import sleep

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Proxy pool
proxies = [
    "http://proxy1.example.com:8080",
    "http://proxy2.example.com:8080",
    "http://proxy3.example.com:8080"
]

def get_with_proxy(url, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            # Pick a random proxy
            proxy = {"http": choice(proxies), "https": choice(proxies)}
            # Send the request through the proxy
            response = requests.get(url, proxies=proxy, timeout=5)
            return response
        except Exception as e:
            retries += 1
            logging.warning(f"Failed to fetch {url} using proxy. Retry {retries}/{max_retries}. Error: {e}")
            sleep(1)  # Wait one second before retrying
    logging.error(f"Max retries reached for {url} using proxy.")
    return None

# Test
url = "https://httpbin.org/ip"
response = get_with_proxy(url)
if response:
    print(response.text)
```

Random User-Agent with retries and logging:

```python
import requests
from fake_useragent import UserAgent
import logging
from time import sleep

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create the UserAgent object
ua = UserAgent()

def get_with_random_ua(url, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            # Set a random User-Agent
            headers = {'User-Agent': ua.random}
            # Send the request with the random User-Agent
            response = requests.get(url, headers=headers, timeout=5)
            return response
        except Exception as e:
            retries += 1
            logging.warning(f"Failed to fetch {url} using random UA. Retry {retries}/{max_retries}. Error: {e}")
            sleep(1)  # Wait one second before retrying
    logging.error(f"Max retries reached for {url} using random UA.")
    return None

# Test
url = "https://httpbin.org/headers"
response = get_with_random_ua(url)
if response:
    print(response.text)
```

Headless Selenium with retries and logging:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import logging
from time import sleep

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def get_with_selenium(url, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            # Configure Chrome options
            chrome_options = Options()
            chrome_options.add_argument("--headless")  # headless mode
            # Initialize the WebDriver
            driver = webdriver.Chrome(options=chrome_options)
            try:
                # Open the target URL
                driver.get(url)
                # Wait for a specific element to finish loading
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "some_element_id"))
                )
                # Grab the rendered page source
                page_source = driver.page_source
                logging.info("Page source retrieved successfully.")
                return page_source
            finally:
                # Always close the WebDriver
                driver.quit()
        except Exception as e:
            retries += 1
            logging.warning(f"Failed to fetch {url} using Selenium. Retry {retries}/{max_retries}. Error: {e}")
            sleep(1)  # Wait one second before retrying
    logging.error(f"Max retries reached for {url} using Selenium.")
    return None

# Test
url = "https://www.example.com"
page_source = get_with_selenium(url)
if page_source:
    print(page_source)
```

CAPTCHA handling is usually more involved. The next example uses a simple OCR approach; note that real deployments may need heavier image preprocessing or a dedicated CAPTCHA-solving service.
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
import pytesseract
from io import BytesIO
import logging
from time import sleep

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def solve_captcha(driver, captcha_element, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            # Crop the CAPTCHA image out of a full-page screenshot
            location = captcha_element.location
            size = captcha_element.size
            screenshot = driver.get_screenshot_as_png()
            im = Image.open(BytesIO(screenshot))
            left = location['x']
            top = location['y']
            right = location['x'] + size['width']
            bottom = location['y'] + size['height']
            im = im.crop((left, top, right, bottom))
            # Run OCR on the cropped image
            captcha_text = pytesseract.image_to_string(im)
            return captcha_text.strip()
        except Exception as e:
            retries += 1
            logging.warning(f"Failed to solve captcha. Retry {retries}/{max_retries}. Error: {e}")
            sleep(1)  # Wait one second before retrying
    logging.error("Max retries reached for solving captcha.")
    return None

def get_with_captcha(url, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            # Initialize the WebDriver
            driver = webdriver.Chrome()
            try:
                # Open the target URL
                driver.get(url)
                # Wait for the CAPTCHA image to appear
                captcha_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "captcha_image"))
                )
                # OCR the CAPTCHA
                captcha_text = solve_captcha(driver, captcha_element)
                if not captcha_text:
                    raise ValueError("CAPTCHA could not be solved")
                logging.info(f"Solved captcha: {captcha_text}")
                # Assume there is an input field for the CAPTCHA text
                input_field = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "captcha_input"))
                )
                input_field.send_keys(captcha_text)
                # Submit the form by clicking the button
                submit_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, "submit_button"))
                )
                submit_button.click()
                # Wait for the post-CAPTCHA content to load
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "content_after_captcha"))
                )
                # Grab the page source
                page_source = driver.page_source
                logging.info("Page source retrieved successfully.")
                return page_source
            finally:
                # Always close the WebDriver
                driver.quit()
        except Exception as e:
            retries += 1
            logging.warning(f"Failed to fetch {url} using captcha. Retry {retries}/{max_retries}. Error: {e}")
            sleep(1)  # Wait one second before retrying
    logging.error(f"Max retries reached for {url} using captcha.")
    return None

# Test
url = "https://www.example.com/captcha"
page_source = get_with_captcha(url)
if page_source:
    print(page_source)
```

In all of these examples, the logging library records the crawler's state and errors, including retry counts and final failures.
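If you also want those log messages persisted rather than only printed to the console, the standard library's RotatingFileHandler can take the place of the basicConfig call. A minimal sketch follows; the file name and size limits are arbitrary choices for illustration.

```python
import logging
from logging.handlers import RotatingFileHandler

# Log to a file, rotating at ~1 MB and keeping 3 backups (sizes are illustrative)
handler = RotatingFileHandler("crawler.log", maxBytes=1_000_000, backupCount=3)
handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

logger = logging.getLogger()  # root logger, as used by logging.warning(...) above
logger.setLevel(logging.INFO)
logger.addHandler(handler)

logging.info("Crawler started.")  # now recorded in crawler.log as well
```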