Python JSON数据爬取案例

wen python案例 2

本文目录导读:

  1. 案例1:爬取免费API接口的JSON数据
  2. 案例2:爬取动态加载的JSON数据
  3. 案例3:爬取需要登录验证的JSON数据
  4. 案例4:爬取分页JSON数据
  5. 案例5:处理复杂的嵌套JSON数据

我来分享几个Python爬取JSON数据的实战案例。

案例1:爬取免费API接口的JSON数据

import requests
import json
# Example: fetch weather data from a free API.
def fetch_weather_data():
    """Fetch and print the current weather from the Open-Meteo forecast API.

    The request is made for the hard-coded coordinates (39.9042, 116.4074)
    and asks for the current weather plus hourly temperature/precipitation.
    The current temperature, wind speed and weather code are printed.

    Returns:
        The full decoded JSON response, or ``None`` when the request fails
        or the response does not have the expected structure.
    """
    url = "https://api.open-meteo.com/v1/forecast"
    params = {
        "latitude": 39.9042,
        "longitude": 116.4074,
        "current_weather": True,
        "hourly": "temperature_2m,precipitation"
    }
    try:
        # requests never times out by default; bound the wait so a stalled
        # server cannot hang the script forever.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        # Decode the JSON body directly.
        weather_data = response.json()
        # Pull out the fields of interest.
        current_weather = weather_data['current_weather']
        print(f"温度: {current_weather['temperature']}°C")
        print(f"风速: {current_weather['windspeed']} km/h")
        print(f"天气代码: {current_weather['weathercode']}")
        return weather_data
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return None
    except (KeyError, TypeError, ValueError) as e:
        # The body was not valid JSON (ValueError on older requests
        # versions) or lacked the expected keys.
        print(f"响应数据格式异常: {e}")
        return None
# Run the example
weather_info = fetch_weather_data()

案例2:爬取动态加载的JSON数据

import requests
import json
from bs4 import BeautifulSoup
def fetch_dynamic_json():
    """Fetch JSON the way a page's XHR call would and save it to disk.

    Sends a GET request with browser-like/XHR headers to the
    jsonplaceholder ``/posts`` endpoint, prints the first five posts and
    writes the whole list to ``posts.json``.

    Returns:
        The decoded list of posts, or ``None`` when the server answers with
        a non-200 status or any error occurs.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json, text/plain, */*',
        'X-Requested-With': 'XMLHttpRequest'
    }
    # Stand-in for the API endpoint found via the browser's network panel.
    url = "https://jsonplaceholder.typicode.com/posts"
    try:
        # requests never times out by default; bound the wait explicitly.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            return None
        posts = response.json()
        # Show only the first five entries.
        for post in posts[:5]:
            print(f"ID: {post['id']}")
            print(f"标题: {post['title']}")
            print(f"内容: {post['body'][:50]}...")
            print("-" * 50)
        # Persist the complete result.
        with open('posts.json', 'w', encoding='utf-8') as f:
            json.dump(posts, f, ensure_ascii=False, indent=2)
        return posts
    except Exception as e:
        # Best-effort demo boundary: report any failure and signal it with
        # an explicit None instead of falling off the end of the function.
        print(f"发生错误: {e}")
        return None
# Run
data = fetch_dynamic_json()

案例3:爬取需要登录验证的JSON数据

import requests
import json
class DataFetcher:
    """Session-based client for a JSON API protected by a bearer token.

    ``login`` obtains a token and stores it in ``self.headers``;
    ``fetch_protected_data`` then sends those headers with its request.
    """

    # Seconds to wait for a response; requests never times out by default.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Content-Type': 'application/json'
        }

    def login(self, username, password):
        """Log in and remember the returned token.

        Args:
            username: Account name sent in the JSON login payload.
            password: Password sent in the JSON login payload.

        Returns:
            ``True`` when the server answered 200 and supplied a token
            (stored as ``Authorization: Bearer <token>`` in
            ``self.headers``); ``False`` in every other case.
        """
        login_url = "https://api.example.com/login"
        login_data = {
            "username": username,
            "password": password
        }
        try:
            response = self.session.post(
                login_url,
                json=login_data,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                print(f"登录失败: 状态码 {response.status_code}")
                return False
            token = response.json().get('token')
        except Exception as e:
            print(f"登录失败: {e}")
            return False
        if not token:
            # A 200 without a token must not be reported as success,
            # otherwise later requests would send "Bearer None".
            print("登录失败: 响应中没有token")
            return False
        self.headers['Authorization'] = f'Bearer {token}'
        return True

    def fetch_protected_data(self):
        """Fetch the protected resource using the stored headers.

        Returns:
            The decoded JSON body on a 200 response, otherwise ``None``.
        """
        data_url = "https://api.example.com/protected/data"
        try:
            response = self.session.get(
                data_url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code == 200:
                return response.json()
            print(f"获取数据失败: {response.status_code}")
        except Exception as e:
            print(f"请求失败: {e}")
        return None
# Usage example
# fetcher = DataFetcher()
# if fetcher.login('your_username', 'your_password'):
#     data = fetcher.fetch_protected_data()
#     print(data)

案例4:爬取分页JSON数据

import requests
import json
import time
def fetch_paginated_data(max_pages=3):
    """Collect repository search results from GitHub page by page.

    Queries the repository search endpoint for "python" sorted by stars
    (10 results per page), keeps a few fields of every repository and
    writes the combined list to ``repositories.json``.

    Args:
        max_pages: Number of result pages to request (default 3).

    Returns:
        A list of dicts with the keys ``name``, ``stars``, ``url`` and
        ``description``, one per repository collected.
    """
    base_url = "https://api.github.com/search/repositories"
    query_params = {
        "q": "python",
        "sort": "stars",
        "order": "desc",
        "per_page": 10,
        "page": 1
    }
    all_repos = []
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'application/json'
    }
    for page in range(1, max_pages + 1):
        query_params['page'] = page
        try:
            # requests never times out by default; bound the wait explicitly.
            response = requests.get(
                base_url,
                params=query_params,
                headers=headers,
                timeout=10
            )
            if response.status_code != 200:
                # Report the failure instead of skipping it silently, and
                # stop paging: later pages use the same query and headers.
                print(f"第{page}页请求失败,状态码: {response.status_code}")
                break
            data = response.json()
            items = data.get('items', [])
            for repo in items:
                repo_info = {
                    'name': repo['full_name'],
                    'stars': repo['stargazers_count'],
                    'url': repo['html_url'],
                    # The key is present with a null value when a repository
                    # has no description, so a .get() default never applies.
                    'description': repo.get('description') or 'No description'
                }
                all_repos.append(repo_info)
            print(f"第{page}页数据已获取")
            if page < max_pages:
                time.sleep(1)  # throttle between pages; not needed after the last
        except Exception as e:
            print(f"第{page}页请求失败: {e}")
    # Persist everything that was collected.
    with open('repositories.json', 'w', encoding='utf-8') as f:
        json.dump(all_repos, f, ensure_ascii=False, indent=2)
    return all_repos
# Run
repos = fetch_paginated_data()
print(f"共获取到 {len(repos)} 个仓库信息")

案例5:处理复杂的嵌套JSON数据

import requests
import json
from typing import Dict, List
class JSONDataProcessor:
    """Helpers for querying and flattening a decoded, nested JSON document."""

    def __init__(self, data: Dict):
        """Store the decoded JSON document to operate on."""
        self.data = data

    @staticmethod
    def fetch_from_url(url: str, timeout: float = 10) -> Dict:
        """Download *url* and return its decoded JSON body.

        Args:
            url: Address to request with GET.
            timeout: Seconds to wait for the server; requests itself never
                times out by default.

        Raises:
            requests.exceptions.RequestException: On network failure or
                when the response status indicates an error.
        """
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()

    def extract_nested_values(self, keys: List[str]) -> List:
        """Collect every value reachable through the key path *keys*.

        The document is searched depth-first. Where a dict contains the
        first key of the remaining path, that key is followed and the
        dict's other values are not searched; where it does not, all of its
        values are searched with the same path. Lists are searched element
        by element.

        Args:
            keys: Key path, outermost key first.

        Returns:
            The matching values in discovery order; an empty list when
            nothing matches or *keys* is empty.
        """
        values = []
        if not keys:
            # An empty path cannot match anything (and would otherwise
            # raise IndexError on the first dict encountered).
            return values

        def extract_recursive(node, key_path):
            if isinstance(node, dict):
                head = key_path[0]
                if head in node:
                    if len(key_path) == 1:
                        values.append(node[head])
                    else:
                        extract_recursive(node[head], key_path[1:])
                else:
                    for child in node.values():
                        extract_recursive(child, key_path)
            elif isinstance(node, list):
                for item in node:
                    extract_recursive(item, key_path)

        extract_recursive(self.data, keys)
        return values

    def flatten_json(self, prefix='') -> Dict:
        """Flatten the document into a single-level dict.

        Nested dict keys are joined with ``.`` and list elements get an
        ``_<index>`` suffix, e.g. ``{"a": {"b": [1]}}`` becomes
        ``{"a.b_0": 1}``. Empty dicts and lists contribute no entries.

        Args:
            prefix: Optional string prepended (followed by ``.``) to every
                resulting key.

        Returns:
            A dict mapping each full path to its leaf value.
        """
        flat = {}

        def process(path, value):
            # *path* is already the complete key for *value*; carrying it
            # down keeps same-named keys in different branches apart and
            # applies the prefix exactly once.
            if isinstance(value, dict):
                for k, v in value.items():
                    process(f"{path}.{k}", v)
            elif isinstance(value, list):
                for i, item in enumerate(value):
                    process(f"{path}_{i}", item)
            else:
                flat[path] = value

        for k, v in self.data.items():
            process(f"{prefix}.{k}" if prefix else k, v)
        return flat
# Usage example
url = "https://api.github.com/repos/python/cpython"
try:
    data = JSONDataProcessor.fetch_from_url(url)
    processor = JSONDataProcessor(data)
    # Extract a specific nested field
    selected = processor.extract_nested_values(['owner', 'login'])
    print(f"仓库所有者: {selected}")
    # Flatten the JSON document
    flat_data = processor.flatten_json()
    print(f"扁平化后的键值对数量: {len(flat_data)}")
except Exception as e:
    print(f"错误: {e}")

爬取JSON数据时的注意事项:

  1. 错误处理:始终使用try-except块处理网络和JSON解析错误
  2. 请求头设置:添加合适的User-Agent和Content-Type
  3. 限速:使用time.sleep()避免请求过快
  4. 状态检查:使用response.raise_for_status()检查HTTP响应状态,状态码为4xx/5xx时会抛出异常
  5. 保存数据:及时将数据保存到文件,避免数据丢失
  6. 动态数据:对于页面动态加载的数据,使用浏览器开发者工具分析XHR请求

这些案例涵盖了JSON数据爬取的主要场景,你可以根据实际需求选择使用。

标签: 城市 数据分析

抱歉,评论功能暂时关闭!