OCR 质量改进指南

问题背景

日本茶道专家知识库存在严重的 OCR 质量问题：

换行符比例：28.0%（正常应 < 5%）
平均段长：1.03 字符（正常应 > 20）
文本高度碎片化，几乎每个字都被换行符分隔

原文示例：

缅
宫
障口＂丁烹
心
摹毒掏f 玄．
y 
,,, : 
传
承
千
年
的
本
茶
道

应该是：

传承千年的日本茶道

🚀 快速开始（3 步搞定）

如果您已经有原始 PDF 文件，只需 3 个命令：

# 1. 安装依赖（首次使用）
cd d:\Mises\mises-behavior-engine
.\scripts\install_paddleocr.ps1

# 2. 处理 PDF
python scripts/reprocess_pdf_with_paddleocr.py "原始PDF路径.pdf" --kb-id f7cdefb77b24

# 3. 重启服务
docker-compose restart mbe-api

完整使用指南请查看：docs/guides/PADDLEOCR_USAGE_GUIDE.md

方案 1: 使用专业 OCR 工具重新识别（推荐）⭐

完整重新处理脚本

我为您创建了一个自动化脚本来重新处理 PDF：

# scripts/reprocess_pdf_with_ocr.py

"""
使用高质量 OCR 重新处理 PDF 文档
支持 PaddleOCR 和 Tesseract
"""
import json
import sys
from pathlib import Path
from typing import List, Tuple
import re

# Default OCR engine: "paddleocr" or "tesseract".
OCR_ENGINE = "paddleocr"

def init_ocr(engine: str = None):
    """Create the OCR backend object for the selected engine.

    Args:
        engine: "paddleocr" or "tesseract". When omitted (None), the
            module-level ``OCR_ENGINE`` setting is used, which matches the
            previous zero-argument behavior.

    Returns:
        A ``PaddleOCR`` instance for "paddleocr", or the ``pytesseract``
        module itself for "tesseract".

    Raises:
        ValueError: If the engine name is not one of the supported values.
    """
    if engine is None:
        engine = OCR_ENGINE

    if engine == "paddleocr":
        # Imported inside the branch so only the selected engine's package
        # has to be installed.
        from paddleocr import PaddleOCR
        return PaddleOCR(
            use_angle_cls=True,
            lang='ch',
            use_gpu=True,
            show_log=False,
            det_model_dir='./models/det',
            rec_model_dir='./models/rec'
        )

    if engine == "tesseract":
        import pytesseract
        # Only point pytesseract at the Windows install location when that
        # binary is really there; otherwise keep pytesseract's own default
        # so the script also works where tesseract is found via PATH.
        windows_binary = Path(r'C:\Program Files\Tesseract-OCR\tesseract.exe')
        if windows_binary.exists():
            pytesseract.pytesseract.tesseract_cmd = str(windows_binary)
        return pytesseract

    raise ValueError(f"不支持的 OCR 引擎: {engine}")

def pdf_to_images(pdf_path: str, dpi: int = 300) -> List:
    """Render a PDF into images using pdf2image.

    Args:
        pdf_path: Path of the PDF file to convert.
        dpi: Rendering resolution forwarded to ``convert_from_path``.

    Returns:
        The list returned by ``convert_from_path`` (PNG format, rendered
        with 4 worker threads).
    """
    from pdf2image import convert_from_path

    print(f"📄 转换 PDF 为图片 (DPI: {dpi})...")

    render_options = {"dpi": dpi, "fmt": 'png', "thread_count": 4}
    pages = convert_from_path(pdf_path, **render_options)

    print(f"✅ 转换完成，共 {len(pages)} 页")
    return pages

def ocr_image_paddleocr(ocr, image, min_confidence: float = 0.7) -> str:
    """Run PaddleOCR on one page image and return the recognized text.

    Args:
        ocr: Object exposing ``ocr(img, cls=True)``, such as the PaddleOCR
            instance created by ``init_ocr``.
        image: Page image; it is converted with ``numpy.array`` before
            being handed to the engine.
        min_confidence: Lines are kept only when their confidence is
            strictly greater than this value. Defaults to 0.7, the
            threshold that used to be hard-coded.

    Returns:
        The kept lines joined with newlines, in the order the engine
        returned them, or "" when nothing was recognized.
    """
    import numpy as np

    result = ocr.ocr(np.array(image), cls=True)

    # The engine may return nothing at all, or a first page entry that is
    # empty/None when no text was detected.
    if not result or not result[0]:
        return ""

    # Each entry is indexed as entry[1] == (text, confidence). No sorting is
    # applied here: the output order is whatever the engine produced.
    kept = [
        entry[1][0]
        for entry in result[0]
        if entry[1][1] > min_confidence
    ]
    return '\n'.join(kept)

def ocr_image_tesseract(ocr, image) -> str:
    """Recognize one page image with Tesseract and return the trimmed text.

    Args:
        ocr: Object exposing ``image_to_string``, such as the pytesseract
            module returned by ``init_ocr``.
        image: Page image passed straight through to ``image_to_string``.

    Returns:
        The recognized text with leading and trailing whitespace removed.
    """
    tesseract_options = {
        'lang': 'chi_sim+eng',
        'config': '--psm 6 --oem 3',
    }
    raw_text = ocr.image_to_string(image, **tesseract_options)
    return raw_text.strip()

def clean_ocr_text(text: str) -> str:
    """Normalize raw OCR output.

    Processing order matters: lines are stripped and filtered first, and
    blank-line runs are collapsed last. Collapsing first (as before) missed
    runs that only appear once whitespace-only lines are stripped or
    stray single-character lines are removed.

    Args:
        text: Raw text produced by the OCR engine.

    Returns:
        The text with every line stripped, isolated single-character lines
        removed (unless the character is punctuation listed below), and any
        run of three or more newlines reduced to exactly two.
    """
    # Punctuation that may legitimately sit alone on a line. Written as one
    # literal: the previous adjacent-literal form silently concatenated and
    # dropped the single-quote characters it was meant to contain.
    keep_alone = frozenset("。，！？；：\"'“”‘’（）【】")

    kept = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        # A lone character is usually a recognition artifact; empty lines
        # are kept because they carry the paragraph breaks.
        if len(line) == 1 and line not in keep_alone:
            continue
        kept.append(line)

    return re.sub(r'\n{3,}', '\n\n', '\n'.join(kept))

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[dict]:
    """Split text into paragraph-aligned chunks with trailing overlap.

    Paragraphs (separated by blank lines) are accumulated until adding the
    next one would exceed ``chunk_size``; the accumulated text is then
    emitted as a chunk. A paragraph is never split or truncated, so a single
    paragraph longer than ``chunk_size`` yields an oversized chunk.

    Args:
        text: Full text to split.
        chunk_size: Target maximum number of characters per chunk.
        overlap: Number of characters from the end of the chunk just
            emitted that are repeated at the start of the next chunk.
            Values <= 0 disable overlap. Expected to be smaller than
            ``chunk_size``.

    Returns:
        A list of dicts with keys ``id`` ("chunk_<n>"), ``text`` and
        ``chunk_index``, numbered from 0 in emission order.
    """
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

    chunks = []

    def emit(body: str) -> None:
        index = len(chunks)
        chunks.append({
            "id": f"chunk_{index}",
            "text": body,
            "chunk_index": index
        })

    current_chunk = ""
    for para in paragraphs:
        if len(current_chunk) + len(para) <= chunk_size:
            current_chunk += para + "\n\n"
            continue

        carry = ""
        if current_chunk:
            finished = current_chunk.strip()
            emit(finished)
            # The overlap comes from the chunk that was just closed and is
            # measured in characters, so it also works for text without
            # spaces. The old code sliced whitespace-separated words out of
            # the *incoming* paragraph, which dropped the rest of it.
            if overlap > 0:
                carry = finished[-overlap:] + "\n\n"

        # The incoming paragraph is always kept in full.
        current_chunk = carry + para + "\n\n"

    if current_chunk:
        emit(current_chunk.strip())

    return chunks

def reprocess_pdf(pdf_path: str, output_kb_id: str = None):
    """Re-run OCR on a PDF and write the result as knowledge-base chunks.

    Steps: render the PDF to page images, OCR each page with the engine
    selected by ``OCR_ENGINE``, clean each page's text, join the pages,
    split the result into chunks and dump them to
    ``knowledge_bases/<kb_id>_chunks.json``. Afterwards a quality report is
    printed when the project's PDF processor can be imported.

    Args:
        pdf_path: Path of the PDF file to process.
        output_kb_id: Knowledge-base id used in the output file name.
            Falls back to the PDF file name without extension.

    Returns:
        None. If the PDF does not exist a message is printed and nothing
        is written.
    """
    pdf_path = Path(pdf_path)

    if not pdf_path.exists():
        print(f"❌ 文件不存在: {pdf_path}")
        return

    kb_id = output_kb_id or pdf_path.stem

    print(f"\n🚀 开始重新处理: {pdf_path.name}")
    print(f"📊 OCR 引擎: {OCR_ENGINE}")
    print(f"🆔 知识库 ID: {kb_id}\n")

    ocr = init_ocr()
    images = pdf_to_images(str(pdf_path), dpi=300)

    # Pick the per-page recognizer once instead of branching on every page.
    if OCR_ENGINE == "paddleocr":
        recognize = ocr_image_paddleocr
    else:
        recognize = ocr_image_tesseract

    all_text = []
    for i, image in enumerate(images, 1):
        # flush=True so the progress text shows up before the (slow) OCR
        # call rather than only once the line is completed.
        print(f"🔍 识别第 {i}/{len(images)} 页...", end=' ', flush=True)

        cleaned_text = clean_ocr_text(recognize(ocr, image))
        all_text.append(cleaned_text)

        print(f"✅ 完成 ({len(cleaned_text)} 字符)")

    # Pages are joined with a blank line so page boundaries act as
    # paragraph boundaries for the chunker.
    full_text = "\n\n".join(all_text)
    print(f"\n📝 总字符数: {len(full_text)}")

    chunks = chunk_text(full_text, chunk_size=1200, overlap=100)
    print(f"📦 生成分块: {len(chunks)} 个")

    output_file = Path("knowledge_bases") / f"{kb_id}_chunks.json"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)

    print(f"\n✅ 重新处理完成！")
    print(f"📁 输出文件: {output_file}")

    # The quality report is an optional extra. When the script is started as
    # `python scripts/<name>.py`, the script directory (not the project root)
    # is the first entry of sys.path, so the project package may not be
    # importable; the chunks are already saved, so report and stop instead
    # of crashing.
    try:
        from src.knowledge.pdf_processor import get_pdf_processor
    except ImportError as exc:
        print(f"\n⚠️ 跳过质量评估（无法导入 src.knowledge.pdf_processor）: {exc}")
        return

    processor = get_pdf_processor()
    quality = processor.evaluate_kb_quality(chunks, has_vectors=False)

    print(f"\n📊 质量评估:")
    print(f"   总分: {quality.overall_score:.1f} ({quality.quality_level})")
    if quality.issues:
        # Only the first three issues are shown.
        print(f"   问题: {', '.join(quality.issues[:3])}")

if __name__ == "__main__":
    # Command line: <PDF path> [knowledge base id]
    cli_args = sys.argv[1:]

    if not cli_args:
        print("用法: python reprocess_pdf_with_ocr.py <PDF文件路径> [知识库ID]")
        print("\n示例:")
        print("  python reprocess_pdf_with_ocr.py uploads/知日·日本茶道完全入门.pdf f7cdefb77b24")
        sys.exit(1)

    # The knowledge base id is optional; None lets reprocess_pdf choose.
    requested_kb_id = cli_args[1] if len(cli_args) > 1 else None
    reprocess_pdf(cli_args[0], requested_kb_id)

使用步骤

安装依赖：

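pip install paddleocr pdf2image pillow
# 注意：PaddleOCR 还需要单独安装 PaddlePaddle 框架（CPU 版：pip install paddlepaddle；GPU 版：pip install paddlepaddle-gpu）；pdf2image 依赖系统中已安装的 Poppler。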

运行脚本：

python scripts/reprocess_pdf_with_ocr.py "原始PDF路径.pdf" f7cdefb77b24

验证结果：

python scripts/check_kb_quality.py

重新部署：

# 重新生成向量
python scripts/regenerate_vectors.py f7cdefb77b24

# 重启服务
docker-compose restart mbe-api

方案 2: 文本后处理修复（快速临时方案）

如果暂时找不到原始 PDF，可以尝试修复现有文本：

# scripts/fix_fragmented_text.py

"""
修复高度碎片化的文本（合并单字符行）
"""
import json
import re
from pathlib import Path

def fix_fragmented_chunk(text: str) -> str:
    """Merge runs of single-character lines inside one chunk of text.

    Every line is stripped. Consecutive lines that hold exactly one
    character are concatenated into a single line; empty lines and
    multi-character lines end the current run and are kept as they are.
    Finally, runs of three or more newlines are reduced to two.

    Args:
        text: Chunk text to repair.

    Returns:
        The repaired text.
    """
    merged = []
    pending = []  # single characters waiting to be joined into one line

    def flush_pending():
        if pending:
            merged.append(''.join(pending))
            pending.clear()

    for raw_line in text.split('\n'):
        piece = raw_line.strip()
        if len(piece) == 1:
            pending.append(piece)
            continue
        # Empty or multi-character line: close the current run first, then
        # keep the line itself (an empty string preserves the blank line).
        flush_pending()
        merged.append(piece)

    flush_pending()

    return re.sub(r'\n{3,}', '\n\n', '\n'.join(merged))

def fix_kb_text(kb_id: str):
    """Repair fragmented text in a knowledge base's chunk file, in place.

    Reads ``knowledge_bases/<kb_id>_chunks.json``, runs
    ``fix_fragmented_chunk`` on every chunk's ``text`` and writes the
    result back to the same path. A ``.json.backup`` copy of the file is
    made first.

    Args:
        kb_id: Knowledge-base id used to locate the chunk file.

    Returns:
        None. If the chunk file does not exist a message is printed and
        nothing is changed.
    """
    import shutil

    chunk_file = Path("knowledge_bases") / f"{kb_id}_chunks.json"

    if not chunk_file.exists():
        print(f"❌ 文件不存在: {chunk_file}")
        return

    with open(chunk_file, 'r', encoding='utf-8') as f:
        chunks = json.load(f)

    print(f"📦 处理 {len(chunks)} 个分块...")

    for chunk in chunks:
        original = chunk['text']
        fixed = fix_fragmented_chunk(original)
        chunk['text'] = fixed

        reduction = len(original) - len(fixed)
        print(f"  - {chunk['id']}: {len(original)} → {len(fixed)} (-{reduction})")

    # Keep the FIRST backup: on a second run the chunk file already holds
    # repaired text, and renaming it over the backup (the old behavior)
    # would destroy the only copy of the untouched original.
    backup_file = chunk_file.with_suffix('.json.backup')
    if backup_file.exists():
        print(f"📁 备份已存在，保留原备份: {backup_file}")
    else:
        shutil.copy2(chunk_file, backup_file)
        print(f"📁 原文件已备份: {backup_file}")

    # Write to a sibling temp file and swap it in, so a failure while
    # serializing never leaves a missing or half-written chunk file.
    temp_file = chunk_file.with_suffix('.json.tmp')
    with open(temp_file, 'w', encoding='utf-8') as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)
    temp_file.replace(chunk_file)

    print(f"✅ 修复完成！")

if __name__ == "__main__":
    import sys

    # An optional first argument selects the knowledge base; without it the
    # previously hard-coded id is used, so the documented invocation
    # (no arguments) behaves as before.
    fix_kb_text(sys.argv[1] if len(sys.argv) > 1 else "f7cdefb77b24")

运行：

python scripts/fix_fragmented_text.py

方案 3: 寻找替代数据源

如果原始 PDF 质量太差，建议：

寻找电子版：
- 京东读书、微信读书、多看阅读
- Z-Library、Library Genesis
- 出版社官网
使用网络资源：
- 使用 web_processor.py 从网页抓取相关内容
- 维基百科、百度百科、专业网站
重新创建知识库：
- 手工整理核心知识点
- 结合多个来源补充内容

质量对比

| 方案 | 处理时间 | 质量提升 | 成本 |
| --- | --- | --- | --- |
| PaddleOCR | 10-30分钟 | ⭐⭐⭐⭐⭐ | 免费 |
| Tesseract | 15-40分钟 | ⭐⭐⭐⭐ | 免费 |
| 商业 OCR | 5-10分钟 | ⭐⭐⭐⭐⭐ | ~50-200元 |
| 文本修复 | 1-5分钟 | ⭐⭐ | 免费 |
| 替代数据源 | 数小时-数天 | ⭐⭐⭐⭐⭐ | 免费-付费 |

建议操作顺序：

1. 立即运行文本修复脚本（临时改善）
2. 获取原始 PDF
3. 使用 PaddleOCR 重新处理（最终方案）
4. 更新部署

需要我帮您创建这些脚本并测试吗？

MBE 文档中心

OCR 质量改进指南

问题背景

🚀 快速开始（3 步搞定）

方案 1: 使用专业 OCR 工具重新识别（推荐）⭐

推荐工具

1. PaddleOCR（免费，中文效果最好）

2. Tesseract 5.x（免费，通用性好）

3. 腾讯云/阿里云 OCR（付费，准确率最高）

完整重新处理脚本

使用步骤

方案 2: 文本后处理修复（快速临时方案）

方案 3: 寻找替代数据源

推荐流程

质量对比