如何用Python爬取网页?

访客 python案例 2

本文目录导读:

  1. 基础方法:urllib(Python内置)
  2. 最常用方法:requests库(推荐)
  3. 解析HTML内容
  4. 处理动态加载的页面(Selenium)
  5. 完整示例:爬取新闻标题
  6. 进阶技巧
  7. 注意事项

我来详细介绍几种Python爬取网页的方法,从简单到复杂。

基础方法:urllib(Python内置)

import urllib.request
# Simplest possible GET request using only the standard library.
url = 'https://example.com'
# The context manager closes the connection even if reading or decoding
# fails; the timeout keeps an unresponsive server from hanging the script.
with urllib.request.urlopen(url, timeout=10) as response:
    # Use the charset declared in the Content-Type header, falling back to
    # UTF-8 when the server does not declare one.
    charset = response.headers.get_content_charset() or 'utf-8'
    html = response.read().decode(charset)
print(html[:500])  # print the first 500 characters

最常用方法:requests库(推荐)

安装

pip install requests

基本使用

import requests
# GET request
url = 'https://api.github.com'
# Pass a timeout explicitly: requests applies none by default, so a stalled
# server would otherwise block this call indefinitely.
response = requests.get(url, timeout=10)
print(response.status_code)  # HTTP status code
print(response.text)         # body decoded as text
print(response.json())       # body parsed as JSON
# POST request (the dict is sent as a form-encoded body)
data = {'key': 'value'}
response = requests.post('https://httpbin.org/post', data=data, timeout=10)

添加请求头(反爬虫)

# Custom request headers sent with the GET below: a browser-style User-Agent,
# a preferred response language, and a cookie string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'your_cookie_here'  # placeholder value; replace before use
}
# NOTE(review): `url` is not assigned in this snippet; presumably it is the
# value set by an earlier example. Confirm before running this on its own.
response = requests.get(url, headers=headers, timeout=10)

解析HTML内容

使用BeautifulSoup

pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests
url = 'https://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Find a single element. find() returns None when nothing matches, so guard
# before reading .text to avoid an AttributeError on pages without an <h1>.
heading = soup.find('h1')
title = heading.text if heading is not None else None
# Find every anchor and print its href (None for anchors without one).
links = soup.find_all('a')
for link in links:
    print(link.get('href'))
# CSS selectors
content = soup.select('.content p')   # <p> elements inside class="content"
images = soup.select('img[src]')      # <img> elements that have a src attribute

使用正则表达式

import re
import requests
# NOTE(review): no timeout argument is passed here; consider adding one.
response = requests.get('https://example.com')
# Extract every absolute http:// or https:// URL that appears as a
# double-quoted href attribute value; relative and single-quoted links
# do not match this pattern.
links = re.findall(r'href="(https?://[^"]+)"', response.text)

处理动态加载的页面(Selenium)

pip install selenium
# Selenium 4.6 及以上版本自带 Selenium Manager,会自动下载匹配的浏览器驱动;更早的版本还需要手动下载对应的浏览器驱动
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Configure a headless (no visible window) Chrome browser.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
try:
    # Load the page.
    driver.get('https://example.com')
    # Wait up to 10 seconds for an element with class "content" to be present
    # in the DOM; until() raises TimeoutException if it never appears.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "content"))
    )
    # Grab the page source after the wait has succeeded.
    html = driver.page_source
finally:
    # Always shut the browser down, even when loading or waiting fails;
    # otherwise the browser and driver processes are left running.
    driver.quit()

完整示例:爬取新闻标题

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
def scrape_news(url):
    """Fetch a Hacker News-style listing page and extract its headlines.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts, one per story row
        that contains a linked headline. Rows without a headline link are
        skipped. An empty list is returned when the HTTP request fails
        (the error is printed, not raised).
    """
    # Imported locally so the function does not rely on a file-level import.
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    news_list = []
    # Assumes Hacker News-like markup: each story is a <tr class="athing">
    # whose headline link sits inside <span class="titleline">.
    for article in soup.find_all('tr', class_='athing'):
        title_element = article.find('span', class_='titleline')
        if not title_element:
            continue
        # Require an anchor that actually has an href; indexing the result of
        # find('a') directly would raise TypeError when no anchor exists.
        anchor = title_element.find('a', href=True)
        if anchor is None:
            continue
        news_list.append({
            # Use the anchor's own text so the trailing "(site)" label that
            # shares the span is not included in the title.
            'title': anchor.get_text(strip=True),
            # Resolve site-relative links (e.g. "item?id=...") against the
            # page URL; absolute links are returned unchanged.
            'url': urljoin(url, anchor['href']),
        })
    return news_list
# Example usage: scrape the Hacker News front page.
url = 'https://news.ycombinator.com/'
news = scrape_news(url)
# Save the scraped records to news.csv (index=False omits the row index column).
df = pd.DataFrame(news)
df.to_csv('news.csv', index=False)
# Report how many records were scraped.
print(f"成功爬取 {len(news)} 条新闻")

进阶技巧

使用代理IP

# The dict key selects which *target* URLs are routed through the proxy; the
# scheme in the value is how requests talks to the proxy itself. A typical
# local proxy speaks plain HTTP, so both entries use http://. An https://
# proxy URL would make requests attempt a TLS handshake with the proxy.
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080'
}
response = requests.get(url, proxies=proxies)

处理登录(Session)

login_data = {
    'username': 'your_username',
    'password': 'your_password'
}
# Using the session as a context manager closes its pooled connections when
# the block ends, even if one of the requests raises.
with requests.Session() as session:
    # Cookies set by the login response are stored on the session and sent
    # automatically with the follow-up request.
    session.post('https://example.com/login', data=login_data)
    response = session.get('https://example.com/dashboard')

异步爬取(提升效率)

import aiohttp
import asyncio
async def fetch(session, url):
    """Request ``url`` through ``session`` and return the response body as text."""
    async with session.get(url) as resp:
        body = await resp.text()
    return body
async def main(urls=None):
    """Download several pages concurrently and return their bodies.

    Args:
        urls: Iterable of page addresses to fetch. When omitted, two
            placeholder addresses are used.

    Returns:
        A list with one response body per URL, in the same order as ``urls``.
    """
    if urls is None:
        # example.com and example.org are reserved for documentation use, so
        # the sample does not send traffic to unrelated third-party sites.
        urls = ['https://example.com', 'https://example.org']
    # One shared session lets all requests reuse the same connection pool.
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        # gather() runs the coroutines concurrently and preserves input order.
        return await asyncio.gather(*tasks)
# Run the async crawler
results = asyncio.run(main())

注意事项

  1. 遵守robots.txt:检查网站的爬虫协议
  2. 设置合理的请求间隔:例如在两次请求之间调用 time.sleep(1)
  3. 处理异常:添加try-except
  4. 使用代理:避免IP被封
  5. 数据合法性:未经许可,不要将爬取的数据用于商业用途

开始爬取前,建议先用浏览器开发者工具(F12)分析网页结构,确定要爬取的目标元素。

标签: Python爬虫

抱歉,评论功能暂时关闭!