本文目录导读:
我来详细介绍几种Python爬取网页的方法,从简单到复杂。
基础方法:urllib(Python内置)
import urllib.request
# Simplest possible GET request, using only the standard library.
url = 'https://example.com'
# The context manager closes the connection even if read/decode fails, and
# the timeout keeps an unresponsive server from hanging the script forever.
with urllib.request.urlopen(url, timeout=10) as response:
    # Prefer the charset declared in the Content-Type header; fall back to
    # UTF-8 when the server does not declare one.
    charset = response.headers.get_content_charset() or 'utf-8'
    html = response.read().decode(charset)
print(html[:500])  # show the first 500 characters
最常用方法:requests库(推荐)
安装
pip install requests
基本使用
import requests
# GET request
url = 'https://api.github.com'
# requests has no default timeout; without one a stalled server blocks the
# call indefinitely.
response = requests.get(url, timeout=10)
print(response.status_code)  # HTTP status code
print(response.text)  # response body as text
print(response.json())  # response body parsed as JSON
# POST request
data = {'key': 'value'}
response = requests.post('https://httpbin.org/post', data=data, timeout=10)
添加请求头(反爬虫)
# Browser-like request headers, sent to get past basic anti-scraping checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'your_cookie_here',
}
response = requests.get(
    url,
    headers=headers,
    timeout=10,
)
解析HTML内容
使用BeautifulSoup
pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests
url = 'https://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find a single element. find() returns None when nothing matches, so guard
# before reading .text instead of failing with AttributeError.
heading = soup.find('h1')
title = heading.text if heading is not None else None

# Find every matching element and print each link target.
links = soup.find_all('a')
for link in links:
    print(link.get('href'))

# CSS selectors
content = soup.select('.content p')
images = soup.select('img[src]')
使用正则表达式
import re
import requests
response = requests.get('https://example.com')
# Collect every absolute http(s) URL found in a double-quoted href attribute.
href_pattern = re.compile(r'href="(https?://[^"]+)"')
links = href_pattern.findall(response.text)
处理动态加载的页面(Selenium)
pip install selenium # 还需要下载对应的浏览器驱动
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Configure a headless browser.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
try:
    # Load the page.
    driver.get('https://example.com')
    # Block until an element with class "content" is present (10 s limit).
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "content"))
    )
    # Grab the current page source.
    html = driver.page_source
finally:
    # Always shut the browser down, even when loading or waiting raises;
    # otherwise the browser process is left running.
    driver.quit()
完整示例:爬取新闻标题
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
def scrape_news(url):
    """Fetch a news listing page and extract each story's title and link.

    The selectors target Hacker News-style markup: ``<tr class="athing">``
    rows that contain a ``<span class="titleline">`` wrapping the story link.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts, one per story that
        has a link. An empty list is returned when the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    # Only the network call can raise RequestException, so keep the try
    # body limited to it.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    news_list = []
    for article in soup.find_all('tr', class_='athing'):
        title_element = article.find('span', class_='titleline')
        if title_element is None:
            continue
        anchor = title_element.find('a')
        # Skip rows without a usable link instead of crashing on a missing
        # anchor or a missing href attribute.
        if anchor is None or not anchor.get('href'):
            continue
        news_list.append({
            'title': title_element.text,
            'url': anchor['href'],
        })
    return news_list
# Example usage
url = 'https://news.ycombinator.com/'
news = scrape_news(url)
# Save to CSV. Naming the columns explicitly keeps the header row in the
# file even when nothing was scraped and the list is empty.
df = pd.DataFrame(news, columns=['title', 'url'])
df.to_csv('news.csv', index=False)
print(f"成功爬取 {len(news)} 条新闻")
进阶技巧
使用代理IP
# Each key is the scheme of the *target* URL; each value is the address of
# the proxy to use for it. The proxy address keeps the http:// scheme for
# both entries: an https:// proxy URL would make the client open a TLS
# connection to the proxy itself.
# NOTE(review): assumes the proxy on 127.0.0.1:8080 is a plain HTTP proxy;
# change the scheme only if the proxy really terminates TLS.
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}
response = requests.get(url, proxies=proxies)
处理登录(Session)
login_data = {
    'username': 'your_username',
    'password': 'your_password'
}
# A Session keeps cookies between requests, so the login carries over to the
# dashboard request. The with-block releases pooled connections when done.
with requests.Session() as session:
    login_response = session.post('https://example.com/login', data=login_data)
    # Stop here on a 4xx/5xx login response instead of silently fetching the
    # dashboard while unauthenticated.
    login_response.raise_for_status()
    response = session.get('https://example.com/dashboard')
异步爬取(提升效率)
import aiohttp
import asyncio
async def fetch(session, url):
    """Request ``url`` through ``session`` and return the response body text.

    Args:
        session: Client session whose ``get(url)`` returns an async context
            manager yielding the response (an ``aiohttp.ClientSession`` in
            this example).
        url: Address to request.

    Returns:
        The awaited result of ``response.text()``.
    """
    # The async context manager releases the response when the block exits.
    async with session.get(url) as response:
        return await response.text()
async def main(urls=None):
    """Fetch several pages concurrently over one shared aiohttp session.

    Args:
        urls: Optional list of addresses to download. When omitted, the two
            example URLs are used.

    Returns:
        The list produced by ``asyncio.gather`` for the ``fetch`` calls, one
        entry per URL.
    """
    if urls is None:
        urls = ['https://example1.com', 'https://example2.com']
    async with aiohttp.ClientSession() as session:
        # Create one coroutine per URL and await them together.
        tasks = [fetch(session, url) for url in urls]
        results = await asyncio.gather(*tasks)
    return results
# Run the async crawler: asyncio.run() executes main() and returns its result.
results = asyncio.run(main())
注意事项
- 遵守robots.txt:检查网站的爬虫协议
- 设置合理的请求间隔:例如在每次请求之间调用 time.sleep(1)
- 处理异常:添加try-except
- 使用代理:避免IP被封
- 数据合法性:未经许可,不要将爬取的数据用于商业用途
开始爬取前,建议先用浏览器开发者工具(F12)分析网页结构,确定要爬取的目标元素。
标签: Python爬虫