# Python网络爬虫实战：从Requests到Scrapy的完整指南

## 引言

网络爬虫是数据采集和分析的重要工具。作为从Python转向Rust的后端开发者，我发现Python的爬虫生态非常成熟，从简单的Requests到强大的Scrapy框架，能够满足各种爬虫需求。本文将从实战角度出发，深入探讨Python网络爬虫的最佳实践，帮助你构建高效、稳定的爬虫系统。

## 一、网络爬虫概述

### 1.1 爬虫类型

| 类型 | 特点 | 适用场景 |
| --- | --- | --- |
| 静态爬虫 | 爬取静态HTML页面 | 简单网站、数据采集 |
| 动态爬虫 | 处理JavaScript渲染 | 现代SPA应用 |
| 增量爬虫 | 定期更新数据 | 新闻、博客监控 |
| 分布式爬虫 | 多节点协作 | 大规模数据采集 |

### 1.2 爬虫架构

```text
┌─────────────────────────────────────────────────────┐
│ 调度层                                              │
│ URL队列 → 调度器 → 请求分发                         │
├─────────────────────────────────────────────────────┤
│ 抓取层                                              │
│ 请求模块 → 页面解析 → 数据提取                      │
├─────────────────────────────────────────────────────┤
│ 存储层                                              │
│ 数据清洗 → 数据存储 → 数据备份                      │
└─────────────────────────────────────────────────────┘
```

## 二、Requests基础爬虫

### 2.1 基本请求

```python
import requests

url = "https://example.com"
response = requests.get(url)

print(f"状态码: {response.status_code}")
print(f"响应头: {response.headers}")
print(f"响应内容: {response.text[:500]}")
```

### 2.2 请求参数

```python
params = {"key1": "value1", "key2": "value2"}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Referer": "https://example.com"
}

response = requests.get(
    "https://api.example.com/data",
    params=params,
    headers=headers,
    timeout=10
)
```

### 2.3 会话管理

```python
session = requests.Session()
session.headers.update({"User-Agent": "MyBot/1.0"})

# 保持登录状态
session.post("https://example.com/login", data={"username": "user", "password": "pass"})

# 后续请求自动携带cookie
response = session.get("https://example.com/dashboard")
```

## 三、BeautifulSoup解析

### 3.1 HTML解析

```python
from bs4 import BeautifulSoup

html = response.text
soup = BeautifulSoup(html, "html.parser")

# 查找标签
title = soup.title.string
print(f"页面标题: {title}")

# 查找元素
links = soup.find_all("a", href=True)
for link in links[:5]:
    print(f"链接: {link['href']} - {link.get_text()}")

# 使用CSS选择器
articles = soup.select("article.post")
for article in articles:
    title = article.select_one("h2.title").get_text()
    summary = article.select_one("p.summary").get_text()
    print(f"{title}: {summary}")
```

### 3.2 数据提取实战

```python
def extract_news_items(html):
    soup = BeautifulSoup(html, "html.parser")
    news_items = []

    for item in soup.select("div.news-item"):
        title = item.select_one("h3").get_text(strip=True)
        url = item.select_one("a")["href"]
        date = item.select_one("span.date").get_text(strip=True)
        category = item.select_one("span.category").get_text(strip=True)

        news_items.append({
            "title": title,
            "url": url,
            "date": date,
            "category": category
        })

    return news_items
```

## 四、Scrapy框架

### 4.1 创建项目

```bash
scrapy startproject my_spider
cd my_spider
scrapy genspider example example.com
```

### 4.2 编写爬虫

```python
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com/news"]

    def parse(self, response):
        for article in response.css("article.post"):
            yield {
                "title": article.css("h2.title::text").get(),
                "url": article.css("a::attr(href)").get(),
                "summary": article.css("p.summary::text").get(),
                "date": article.css("time::attr(datetime)").get()
            }

        # 分页处理
        next_page = response.css("a.next-page::attr(href)").get()
        if next_page:
            yield response.follow(next_page, self.parse)
```

### 4.3 配置文件

```python
# settings.py
USER_AGENT = "MySpider/1.0 (+http://www.example.com)"
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 2
CONCURRENT_REQUESTS = 8

ITEM_PIPELINES = {
    "my_spider.pipelines.MySpiderPipeline": 300,
}
```

### 4.4 数据管道

```python
class MySpiderPipeline:
    def process_item(self, item, spider):
        # 数据清洗
        item["title"] = item["title"].strip()
        item["summary"] = item["summary"].strip()

        # 数据存储
        self.store_item(item)
        return item

    def store_item(self, item):
        # 存储到数据库或文件
        pass
```

## 五、动态页面爬取

### 5.1 使用Selenium

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://example.com/dynamic-page")

# 等待元素加载
wait = WebDriverWait(driver, 10)
element = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.content"))
)

# 提取数据
content = element.text
print(content)

driver.quit()
```

### 5.2 使用Playwright

```python
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com/dynamic-page")

    # 等待网络空闲
    page.wait_for_load_state("networkidle")

    # 提取数据
    items = page.query_selector_all("div.item")
    for item in items:
        title = item.query_selector("h3").inner_text()
        print(title)

    browser.close()
```

## 六、反爬策略

### 6.1 请求频率控制

```python
import time
from random import randint

class RateLimiter:
    def __init__(self, min_delay=1, max_delay=3):
        self.min_delay = min_delay
        self.max_delay = max_delay

    def wait(self):
        delay = randint(self.min_delay * 1000, self.max_delay * 1000) / 1000
        time.sleep(delay)

rate_limiter = RateLimiter()

# 在请求之间等待
rate_limiter.wait()
response = requests.get(url)
```

### 6.2 User-Agent轮换

```python
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]

headers = {"User-Agent": random.choice(USER_AGENTS)}
response = requests.get(url, headers=headers)
```

### 6.3 使用代理

```python
proxies = {
    "http": "http://proxy-server:port",
    "https": "https://proxy-server:port"
}

response = requests.get(url, proxies=proxies)
```

## 七、实战：完整爬虫系统

### 7.1 项目结构

```text
my_crawler/
├── crawler/
│   ├── __init__.py
│   ├── spiders/
│   │   ├── news_spider.py
│   │   └── product_spider.py
│   ├── pipelines/
│   │   └── database_pipeline.py
│   └── settings.py
├── data/
├── logs/
└── main.py
```

### 7.2 主程序

```python
from scrapy.crawler import CrawlerProcess
from crawler.settings import Settings
from crawler.spiders.news_spider import NewsSpider

def main():
    process = CrawlerProcess(settings=Settings())
    process.crawl(NewsSpider)
    process.start()

if __name__ == "__main__":
    main()
```

### 7.3 数据库存储

```python
import sqlite3

class DatabasePipeline:
    def __init__(self):
        self.conn = sqlite3.connect("data/crawler.db")
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS news (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                url TEXT UNIQUE,
                summary TEXT,
                date TEXT,
                category TEXT
            )
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""
                INSERT OR IGNORE INTO news (title, url, summary, date, category)
                VALUES (?, ?, ?, ?, ?)
            """, (item["title"], item["url"], item["summary"], item["date"], item["category"]))
            self.conn.commit()
        except Exception as e:
            spider.logger.error(f"存储失败: {e}")
        return item
```

## 八、爬虫最佳实践

### 8.1 遵守robots.txt

```python
# 检查robots.txt
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

if rp.can_fetch("MyBot", "https://example.com/news"):
    # 可以爬取
    response = requests.get("https://example.com/news")
else:
    print("该页面禁止爬取")
```

### 8.2 设置合理的请求头

```python
headers = {
    "User-Agent": "MyCrawler/1.0 (+https://example.com/crawler)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
}
```

### 8.3 错误处理

```python
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"请求失败: {e}")
    # 可以选择重试或跳过
```

## 九、总结

Python的爬虫生态非常强大，从简单的Requests到专业的Scrapy框架，能够满足各种数据采集需求。作为后端开发者，掌握爬虫技能不仅能够帮助我们获取数据，还能为数据分析和机器学习提供数据支持。

关键要点：

- 选择合适的工具：根据需求选择Requests、BeautifulSoup、Scrapy或Playwright
- 遵守规则：尊重网站的robots.txt和使用条款
- 反爬应对：实现请求频率控制、User-Agent轮换、代理使用
- 数据存储：合理设计数据存储方案
- 错误处理：完善的异常处理机制

从Python转向Rust后，我发现Rust的reqwest库在性能方面有很大优势，适合构建高性能的爬虫系统。

## 延伸阅读

- Scrapy官方文档
- Requests官方文档
- BeautifulSoup教程
- Playwright官方指南
Python网络爬虫实战:从Requests到Scrapy的完整指南
发布时间:2026/5/22 19:40:46
# Python网络爬虫实战：从Requests到Scrapy的完整指南

## 引言

网络爬虫是数据采集和分析的重要工具。作为从Python转向Rust的后端开发者，我发现Python的爬虫生态非常成熟，从简单的Requests到强大的Scrapy框架，能够满足各种爬虫需求。本文将从实战角度出发，深入探讨Python网络爬虫的最佳实践，帮助你构建高效、稳定的爬虫系统。

## 一、网络爬虫概述

### 1.1 爬虫类型

| 类型 | 特点 | 适用场景 |
| --- | --- | --- |
| 静态爬虫 | 爬取静态HTML页面 | 简单网站、数据采集 |
| 动态爬虫 | 处理JavaScript渲染 | 现代SPA应用 |
| 增量爬虫 | 定期更新数据 | 新闻、博客监控 |
| 分布式爬虫 | 多节点协作 | 大规模数据采集 |

### 1.2 爬虫架构

```text
┌─────────────────────────────────────────────────────┐
│ 调度层                                              │
│ URL队列 → 调度器 → 请求分发                         │
├─────────────────────────────────────────────────────┤
│ 抓取层                                              │
│ 请求模块 → 页面解析 → 数据提取                      │
├─────────────────────────────────────────────────────┤
│ 存储层                                              │
│ 数据清洗 → 数据存储 → 数据备份                      │
└─────────────────────────────────────────────────────┘
```

## 二、Requests基础爬虫

### 2.1 基本请求

```python
import requests

url = "https://example.com"
response = requests.get(url)

print(f"状态码: {response.status_code}")
print(f"响应头: {response.headers}")
print(f"响应内容: {response.text[:500]}")
```

### 2.2 请求参数

```python
params = {"key1": "value1", "key2": "value2"}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Referer": "https://example.com"
}

response = requests.get(
    "https://api.example.com/data",
    params=params,
    headers=headers,
    timeout=10
)
```

### 2.3 会话管理

```python
session = requests.Session()
session.headers.update({"User-Agent": "MyBot/1.0"})

# 保持登录状态
session.post("https://example.com/login", data={"username": "user", "password": "pass"})

# 后续请求自动携带cookie
response = session.get("https://example.com/dashboard")
```

## 三、BeautifulSoup解析

### 3.1 HTML解析

```python
from bs4 import BeautifulSoup

html = response.text
soup = BeautifulSoup(html, "html.parser")

# 查找标签
title = soup.title.string
print(f"页面标题: {title}")

# 查找元素
links = soup.find_all("a", href=True)
for link in links[:5]:
    print(f"链接: {link['href']} - {link.get_text()}")

# 使用CSS选择器
articles = soup.select("article.post")
for article in articles:
    title = article.select_one("h2.title").get_text()
    summary = article.select_one("p.summary").get_text()
    print(f"{title}: {summary}")
```

### 3.2 数据提取实战

```python
def extract_news_items(html):
    soup = BeautifulSoup(html, "html.parser")
    news_items = []

    for item in soup.select("div.news-item"):
        title = item.select_one("h3").get_text(strip=True)
        url = item.select_one("a")["href"]
        date = item.select_one("span.date").get_text(strip=True)
        category = item.select_one("span.category").get_text(strip=True)

        news_items.append({
            "title": title,
            "url": url,
            "date": date,
            "category": category
        })

    return news_items
```

## 四、Scrapy框架

### 4.1 创建项目

```bash
scrapy startproject my_spider
cd my_spider
scrapy genspider example example.com
```

### 4.2 编写爬虫

```python
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com/news"]

    def parse(self, response):
        for article in response.css("article.post"):
            yield {
                "title": article.css("h2.title::text").get(),
                "url": article.css("a::attr(href)").get(),
                "summary": article.css("p.summary::text").get(),
                "date": article.css("time::attr(datetime)").get()
            }

        # 分页处理
        next_page = response.css("a.next-page::attr(href)").get()
        if next_page:
            yield response.follow(next_page, self.parse)
```

### 4.3 配置文件

```python
# settings.py
USER_AGENT = "MySpider/1.0 (+http://www.example.com)"
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 2
CONCURRENT_REQUESTS = 8

ITEM_PIPELINES = {
    "my_spider.pipelines.MySpiderPipeline": 300,
}
```

### 4.4 数据管道

```python
class MySpiderPipeline:
    def process_item(self, item, spider):
        # 数据清洗
        item["title"] = item["title"].strip()
        item["summary"] = item["summary"].strip()

        # 数据存储
        self.store_item(item)
        return item

    def store_item(self, item):
        # 存储到数据库或文件
        pass
```

## 五、动态页面爬取

### 5.1 使用Selenium

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://example.com/dynamic-page")

# 等待元素加载
wait = WebDriverWait(driver, 10)
element = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.content"))
)

# 提取数据
content = element.text
print(content)

driver.quit()
```

### 5.2 使用Playwright

```python
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com/dynamic-page")

    # 等待网络空闲
    page.wait_for_load_state("networkidle")

    # 提取数据
    items = page.query_selector_all("div.item")
    for item in items:
        title = item.query_selector("h3").inner_text()
        print(title)

    browser.close()
```

## 六、反爬策略

### 6.1 请求频率控制

```python
import time
from random import randint

class RateLimiter:
    def __init__(self, min_delay=1, max_delay=3):
        self.min_delay = min_delay
        self.max_delay = max_delay

    def wait(self):
        delay = randint(self.min_delay * 1000, self.max_delay * 1000) / 1000
        time.sleep(delay)

rate_limiter = RateLimiter()

# 在请求之间等待
rate_limiter.wait()
response = requests.get(url)
```

### 6.2 User-Agent轮换

```python
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]

headers = {"User-Agent": random.choice(USER_AGENTS)}
response = requests.get(url, headers=headers)
```

### 6.3 使用代理

```python
proxies = {
    "http": "http://proxy-server:port",
    "https": "https://proxy-server:port"
}

response = requests.get(url, proxies=proxies)
```

## 七、实战：完整爬虫系统

### 7.1 项目结构

```text
my_crawler/
├── crawler/
│   ├── __init__.py
│   ├── spiders/
│   │   ├── news_spider.py
│   │   └── product_spider.py
│   ├── pipelines/
│   │   └── database_pipeline.py
│   └── settings.py
├── data/
├── logs/
└── main.py
```

### 7.2 主程序

```python
from scrapy.crawler import CrawlerProcess
from crawler.settings import Settings
from crawler.spiders.news_spider import NewsSpider

def main():
    process = CrawlerProcess(settings=Settings())
    process.crawl(NewsSpider)
    process.start()

if __name__ == "__main__":
    main()
```

### 7.3 数据库存储

```python
import sqlite3

class DatabasePipeline:
    def __init__(self):
        self.conn = sqlite3.connect("data/crawler.db")
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS news (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                url TEXT UNIQUE,
                summary TEXT,
                date TEXT,
                category TEXT
            )
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""
                INSERT OR IGNORE INTO news (title, url, summary, date, category)
                VALUES (?, ?, ?, ?, ?)
            """, (item["title"], item["url"], item["summary"], item["date"], item["category"]))
            self.conn.commit()
        except Exception as e:
            spider.logger.error(f"存储失败: {e}")
        return item
```

## 八、爬虫最佳实践

### 8.1 遵守robots.txt

```python
# 检查robots.txt
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

if rp.can_fetch("MyBot", "https://example.com/news"):
    # 可以爬取
    response = requests.get("https://example.com/news")
else:
    print("该页面禁止爬取")
```

### 8.2 设置合理的请求头

```python
headers = {
    "User-Agent": "MyCrawler/1.0 (+https://example.com/crawler)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
}
```

### 8.3 错误处理

```python
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"请求失败: {e}")
    # 可以选择重试或跳过
```

## 九、总结

Python的爬虫生态非常强大，从简单的Requests到专业的Scrapy框架，能够满足各种数据采集需求。作为后端开发者，掌握爬虫技能不仅能够帮助我们获取数据，还能为数据分析和机器学习提供数据支持。

关键要点：

- 选择合适的工具：根据需求选择Requests、BeautifulSoup、Scrapy或Playwright
- 遵守规则：尊重网站的robots.txt和使用条款
- 反爬应对：实现请求频率控制、User-Agent轮换、代理使用
- 数据存储：合理设计数据存储方案
- 错误处理：完善的异常处理机制

从Python转向Rust后，我发现Rust的reqwest库在性能方面有很大优势，适合构建高性能的爬虫系统。

## 延伸阅读

- Scrapy官方文档
- Requests官方文档
- BeautifulSoup教程
- Playwright官方指南