Python文字识别(OCR)案例实操

wen python案例 2

本文目录导读:

  1. 方案一:使用Tesseract OCR
  2. 方案二:使用PaddleOCR(推荐)
  3. 方案三:从视频中提取文字
  4. 完整实战案例:发票识别系统
  5. 注意事项

我来为你提供一个完整的Python文字识别(OCR)实操案例,使用常见的Tesseract和PaddleOCR两种方案。

方案一:使用Tesseract OCR

环境准备

首先安装Tesseract引擎和Python库:

# 安装Tesseract引擎
# Windows: 下载安装 https://github.com/UB-Mannheim/tesseract/wiki
# macOS: brew install tesseract
# Linux: sudo apt-get install tesseract-ocr
# 安装Python库
pip install pytesseract pillow opencv-python

基础文字识别示例

import pytesseract
from PIL import Image
import cv2
import numpy as np
# 配置Tesseract路径(Windows需要)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def basic_ocr(image_path):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to recognize.

    Returns:
        The string produced by ``pytesseract.image_to_string`` with the
        ``chi_sim+eng`` (Simplified Chinese + English) language data.
    """
    # Open through a context manager so the underlying file handle is
    # closed as soon as recognition finishes instead of lingering until
    # garbage collection.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image, lang='chi_sim+eng')
    return text
# Usage example: recognize the text in 'example.jpg' with basic_ocr and
# print it. These statements run at module level, i.e. as soon as the
# snippet is executed or imported.
result = basic_ocr('example.jpg')
print("识别结果:")
print(result)

图像预处理优化

def preprocess_and_ocr(image_path):
    """Preprocess an image (grayscale, threshold, denoise) and run OCR.

    Args:
        image_path: Path of the image file to recognize.

    Returns:
        The text recognized by Tesseract on the preprocessed image.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising;
    # fail here with a clear message instead of a cryptic cvtColor error.
    if img is None:
        raise FileNotFoundError(f"Unable to read image: {image_path}")

    # Convert to a single-channel grayscale image.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Binarize with a fixed threshold of 150 (pixels above become 255).
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    # A 3x3 median filter removes isolated speckle noise.
    denoised = cv2.medianBlur(binary, 3)

    # --oem 3: default engine mode (use whatever engine is available).
    # --psm 6: assume a single uniform block of text.
    custom_config = r'--oem 3 --psm 6'
    text = pytesseract.image_to_string(denoised,
                                       lang='chi_sim+eng',
                                       config=custom_config)
    return text
def extract_text_with_boxes(image_path):
    """Show per-character boxes for an image and return detailed OCR data.

    A window with the annotated image is displayed and the call blocks
    until a key is pressed.

    Args:
        image_path: Path of the image file to recognize.

    Returns:
        The dictionary returned by ``pytesseract.image_to_data`` with
        ``output_type=Output.DICT``.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Unable to read image: {image_path}")

    # One line per recognized character: "<char> x1 y1 x2 y2 <page>".
    boxes = pytesseract.image_to_boxes(img, lang='chi_sim+eng')

    # Draw on a copy so the overlays do not pollute the image that is
    # passed to image_to_data below.
    annotated = img.copy()
    h, w, _ = img.shape
    for box in boxes.splitlines():
        box = box.split(' ')
        char = box[0]
        x, y, x2, y2 = int(box[1]), int(box[2]), int(box[3]), int(box[4])
        # The y values are subtracted from the image height to convert the
        # box coordinates into OpenCV's top-left-origin coordinates.
        cv2.rectangle(annotated, (x, h-y), (x2, h-y2), (0, 255, 0), 2)
        # NOTE(review): OpenCV's Hershey fonts presumably cannot render
        # Chinese glyphs, so non-ASCII labels may show up as '?'.
        cv2.putText(annotated, char, (x, h-y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
    cv2.imshow('Text Boxes', annotated)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    # Detailed word-level data, computed from the untouched image.
    data = pytesseract.image_to_data(img, lang='chi_sim+eng', output_type=pytesseract.Output.DICT)
    return data

方案二:使用PaddleOCR(推荐)

PaddleOCR在中文识别上效果更好,准确率更高。

安装

pip install paddlepaddle paddleocr

基础使用

from paddleocr import PaddleOCR
import cv2
def paddle_ocr_basic(image_path):
    """Recognize text in an image with PaddleOCR and print each line.

    Args:
        image_path: Path of the image file to recognize.

    Returns:
        The raw result returned by ``PaddleOCR.ocr``. Each entry is
        indexed here as ``[box, (text, confidence)]``.
    """
    # lang='ch' selects the Chinese model; angle classification is on.
    ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False)
    result = ocr.ocr(image_path, cls=True)

    # A page on which no text was detected can be reported as None;
    # skip it instead of failing with "NoneType is not iterable".
    for line in result or []:
        if not line:
            continue
        for item in line:
            text = item[1][0]
            confidence = item[1][1]
            print(f"文字: {text}, 置信度: {confidence:.2f}")
    return result
def draw_ocr_result(image_path, result):
    """Draw PaddleOCR detections on an image and show it in a window.

    The call blocks until a key is pressed in the window.

    Args:
        image_path: Path of the image that was recognized.
        result: Result structure as returned by ``PaddleOCR.ocr``; each
            entry is indexed here as ``[box, (text, confidence)]``.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None on failure instead of raising.
    if img is None:
        raise FileNotFoundError(f"Unable to read image: {image_path}")

    for line in result or []:
        # Pages with no detected text may be None; nothing to draw.
        if not line:
            continue
        for item in line:
            box = item[0]
            text = item[1][0]
            # Outline the text region as a closed polygon.
            points = np.array(box, dtype=np.int32).reshape((-1, 1, 2))
            cv2.polylines(img, [points], True, (0, 255, 0), 2)
            # Label just above the first corner of the box.
            # NOTE(review): OpenCV's Hershey fonts presumably cannot render
            # Chinese glyphs, so non-ASCII text may show up as '?'.
            x, y = int(box[0][0]), int(box[0][1])
            cv2.putText(img, text, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX,
                        0.7, (0, 0, 255), 2)
    cv2.imshow('OCR Result', img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

批量处理示例

import os
from paddleocr import PaddleOCR
import json
class BatchOCRProcessor:
    """Run PaddleOCR over single images or a whole directory of images."""

    def __init__(self):
        # One shared engine instance; creating it once avoids re-creating
        # the engine for every image.
        self.ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False)

    def process_single(self, image_path):
        """Recognize one image.

        Args:
            image_path: Path of the image file to recognize.

        Returns:
            A list of dicts with keys ``'text'``, ``'confidence'`` and
            ``'box'``; empty when no text was detected.
        """
        result = self.ocr.ocr(image_path, cls=True)
        texts = []
        # A page without any detected text can be reported as None;
        # treat it as "no lines" instead of crashing the whole batch.
        for line in result or []:
            if not line:
                continue
            for item in line:
                texts.append({
                    'text': item[1][0],
                    # float() makes the confidence JSON-serializable.
                    'confidence': float(item[1][1]),
                    'box': item[0],
                })
        return texts

    def process_batch(self, input_dir, output_file):
        """Recognize every image in a directory and save the results.

        Args:
            input_dir: Directory scanned (non-recursively) for files
                ending in .png, .jpg, .jpeg or .bmp (case-insensitive).
            output_file: Path of the UTF-8 JSON file to write.

        Returns:
            A dict mapping each image file name to the list returned by
            ``process_single``.
        """
        results = {}
        # os.listdir order is arbitrary; sort for reproducible output.
        for filename in sorted(os.listdir(input_dir)):
            if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp')):
                filepath = os.path.join(input_dir, filename)
                print(f"处理中: {filename}")
                texts = self.process_single(filepath)
                results[filename] = texts
        # ensure_ascii=False keeps Chinese text readable in the file.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        return results
# Usage example: OCR every image in 'images_folder' and write the combined
# results to 'ocr_results.json'. These statements run at module level.
processor = BatchOCRProcessor()
results = processor.process_batch('images_folder', 'ocr_results.json')

方案三:从视频中提取文字

import cv2
from paddleocr import PaddleOCR
def video_ocr(video_path, output_frames=False, frame_interval=30):
    """Extract text from a video by running OCR on sampled frames.

    Every sampled frame is shown in a preview window with the detected
    text regions outlined; pressing 'q' in the window stops early.

    Args:
        video_path: Path of the video file to open.
        output_frames: Accepted for interface compatibility; it is not
            used by this implementation.
        frame_interval: Process one frame out of every ``frame_interval``
            frames (default 30) to avoid recognizing near-duplicates.

    Returns:
        A list of dicts ``{'frame': <frame number>, 'texts': [<str>, ...]}``,
        one per processed frame. The list is empty when no frame could
        be read from the video.

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    if frame_interval < 1:
        raise ValueError("frame_interval must be a positive integer")

    ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False)
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    text_results = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            if frame_count % frame_interval != 0:
                continue

            result = ocr.ocr(frame, cls=True)
            frame_texts = []
            # Frames without any text can be reported as None, which is
            # common in video; record them with an empty text list.
            for line in result or []:
                if not line:
                    continue
                for item in line:
                    frame_texts.append(item[1][0])
                    # Outline the detected region on the preview frame.
                    points = np.array(item[0], dtype=np.int32).reshape((-1, 1, 2))
                    cv2.polylines(frame, [points], True, (0, 255, 0), 2)
            text_results.append({
                'frame': frame_count,
                'texts': frame_texts
            })

            cv2.imshow('Video OCR', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the capture and the preview window, even when OCR
        # or drawing raises part-way through the video.
        cap.release()
        cv2.destroyAllWindows()
    return text_results

完整实战案例:发票识别系统

import re
from paddleocr import PaddleOCR
import cv2
import json
class InvoiceOCR:
    """Extract key fields from an invoice image using PaddleOCR."""

    # Compiled once; the patterns do not change between text lines.
    _NUMBER_RE = re.compile(r'^\d{8,10}$')
    _DATE_RE = re.compile(r'\d{4}[-年]\d{1,2}[-月]\d{1,2}[日]?')
    _AMOUNT_RE = re.compile(r'[¥¥]?\d+\.\d{2}')

    def __init__(self):
        self.ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False)

    def extract_invoice_info(self, image_path):
        """Recognize an invoice image and pick out its key fields.

        Each recognized text line is matched against simple patterns.
        When several lines match the same field, the last one wins.

        Args:
            image_path: Path of the invoice image.

        Returns:
            A dict with keys ``'invoice_number'``, ``'date'``,
            ``'total_amount'``, ``'seller'``, ``'buyer'`` (``None`` when
            not found; seller and buyer are never filled in here) and
            ``'items'`` (the text lines containing '商品' or '明细').
        """
        result = self.ocr.ocr(image_path, cls=True)
        invoice_info = {
            'invoice_number': None,  # invoice number
            'date': None,            # issue date
            'total_amount': None,    # total amount
            'seller': None,          # seller
            'buyer': None,           # buyer
            'items': []              # line items
        }
        texts = []
        # A page without detected text can be reported as None; skip it.
        for line in result or []:
            if not line:
                continue
            for item in line:
                text = item[1][0]
                texts.append(text)
                # Invoice number: a line that is exactly 8-10 digits.
                if self._NUMBER_RE.match(text):
                    invoice_info['invoice_number'] = text
                # Date such as 2023-01-05 or 2023年1月5日.
                if self._DATE_RE.search(text):
                    invoice_info['date'] = text
                # Amount with two decimals, optionally prefixed by ¥.
                if self._AMOUNT_RE.search(text):
                    invoice_info['total_amount'] = text
        # Check each recognized line on its own. Joining everything with
        # spaces and then splitting on newlines only ever yields a single
        # blob, so per-line item detection cannot work that way.
        invoice_info['items'] = [
            text for text in texts if '商品' in text or '明细' in text
        ]
        return invoice_info

    def save_to_json(self, info, output_path):
        """Write ``info`` to ``output_path`` as indented UTF-8 JSON.

        Args:
            info: JSON-serializable object, e.g. the dict returned by
                ``extract_invoice_info``.
            output_path: Path of the file to create or overwrite.
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(info, f, ensure_ascii=False, indent=2)
        print(f"结果已保存到: {output_path}")
# Usage example: runs only when the file is executed as a script. It relies
# on basic_ocr, InvoiceOCR and BatchOCRProcessor being defined earlier in
# the same module.
if __name__ == "__main__":
    # Basic Tesseract OCR on a single image.
    print("=== 基础OCR测试 ===")
    text = basic_ocr('test_image.jpg')
    print(text)
    # PaddleOCR-based invoice field extraction (the recommended engine).
    print("\n=== PaddleOCR测试 ===")
    processor = InvoiceOCR()
    info = processor.extract_invoice_info('invoice.jpg')
    print("提取的发票信息:")
    print(json.dumps(info, ensure_ascii=False, indent=2))
    # Batch-process a folder of images and save the results as JSON.
    print("\n=== 批量处理 ===")
    batch_processor = BatchOCRProcessor()
    results = batch_processor.process_batch('invoices_folder', 'all_invoices.json')

注意事项

  1. 图像质量:清晰的图片能大幅提升识别准确率
  2. 预处理:适当进行二值化、去噪等预处理
  3. 语言设置:根据图片内容选择正确的语言包
  4. GPU加速:如果有NVIDIA GPU,建议安装GPU版PaddlePaddle,并将示例中的use_gpu=False改为use_gpu=True以启用GPU加速
  5. 引擎选择:在中文识别场景下,PaddleOCR的准确率通常高于Tesseract

这个案例包含了从基础到进阶的OCR使用场景,你可以根据实际需求选择合适的方法。

标签: 图像文字提取

抱歉,评论功能暂时关闭!