# GLM-OCR Python SDK封装:简化client.predict调用,支持超时重试
## 1. 项目背景与需求
在实际的AI项目开发中,我们经常需要与各种模型服务进行交互。GLM-OCR作为一个强大的多模态OCR模型,提供了丰富的文档理解能力,但在实际使用过程中,开发者可能会遇到一些挑战:
- 原生Gradio Client的predict调用相对复杂,需要手动处理参数
- 网络不稳定时缺乏重试机制,容易导致请求失败
- 错误处理不够完善,需要开发者自行封装
- 缺少统一的配置管理和超时控制
针对这些问题,我们开发了一个专门的Python SDK封装,让GLM-OCR的调用变得更加简单、稳定和高效。
## 2. SDK封装设计思路
### 2.1 核心目标
我们的SDK封装主要围绕以下几个核心目标进行设计:
- **简化调用**:将复杂的predict调用封装成简单的方法
- **增强稳定性**:添加超时重试机制,提高请求成功率
- **统一错误处理**:提供标准化的异常处理方式
- **配置管理**:支持灵活的配置选项
### 2.2 技术架构
```python
class GLMOCRClient:
    """Architecture sketch of the GLM-OCR client wrapper.

    Only the constructor does real work here; the three ``recognize_*``
    methods are placeholders that return ``None``.
    """

    def __init__(self, base_url, max_retries=3, timeout=30):
        """Create the underlying client and remember retry/timeout settings.

        Args:
            base_url: Address of the service, passed straight to ``Client``.
            max_retries: Retry setting stored on the instance.
            timeout: Timeout setting stored on the instance.
        """
        self.client = Client(base_url)
        self.max_retries, self.timeout = max_retries, timeout

    def recognize_text(self, image_path, **kwargs):
        """Text recognition entry point (placeholder)."""
        return None

    def recognize_table(self, image_path, **kwargs):
        """Table recognition entry point (placeholder)."""
        return None

    def recognize_formula(self, image_path, **kwargs):
        """Formula recognition entry point (placeholder)."""
        return None
```
## 3. 安装与快速开始
### 3.1 安装依赖
首先确保你已经安装了必要的依赖包:
```bash
pip install gradio_client
```
### 3.2 基本使用示例
```python
from glm_ocr_sdk import GLMOCRClient
# Build a client that talks to the locally running service.
client = GLMOCRClient(
    base_url="http://localhost:7860",
    max_retries=3,  # retry budget
    timeout=30,  # timeout in seconds
)

# Text recognition
text_result = client.recognize_text("/path/to/image.png")
print(text_result)

# Table recognition
table_result = client.recognize_table("/path/to/table.png")
print(table_result)

# Formula recognition
formula_result = client.recognize_formula("/path/to/formula.png")
print(formula_result)
```
## 4. 核心功能实现
### 4.1 超时重试机制
我们使用装饰器实现了带指数退避的重试机制:每次失败后的等待时间按 `delay × 2^n` 递增,全部尝试都失败后抛出最后一次的异常:
```python
import time
from functools import wraps
def retry_on_failure(max_retries=3, delay=1):
    """Build a decorator that re-runs a failing call with exponential backoff.

    Args:
        max_retries: Total number of attempts, the first call included.
            Values below 1 are treated as 1 so the wrapped function always
            runs at least once (previously this ended in ``raise None``).
        delay: Base wait in seconds. The pause after failed attempt ``k``
            (0-based) is ``delay * 2 ** k``.

    Returns:
        A decorator; the wrapper it produces keeps the wrapped function's
        metadata via ``functools.wraps``.

    Raises:
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    if attempt == attempts - 1:
                        # Out of attempts: a bare raise keeps the original
                        # exception and its traceback intact.
                        raise
                    time.sleep(delay * (2 ** attempt))  # exponential backoff

        return wrapper

    return decorator
```
### 4.2 统一的predict封装
```python
class GLMOCRClient:
    # ... other code ...

    def _predict(self, image_path, prompt_type, **kwargs):
        """Send one predict request, retrying on failure.

        The retry wrapper is built per call so that the instance's own
        ``max_retries`` is honoured instead of a hard-coded value.

        Args:
            image_path: Path of the image, forwarded as ``image_path``.
            prompt_type: Prompt string, forwarded as ``prompt``.
            **kwargs: Extra keyword arguments forwarded to ``client.predict``.

        Returns:
            Whatever ``self.client.predict`` returns.

        Raises:
            GLMOCRException: When the call still fails after all attempts;
                the underlying error is attached as ``__cause__``.
        """
        @retry_on_failure(max_retries=self.max_retries)
        def call_service():
            try:
                return self.client.predict(
                    image_path=image_path,
                    prompt=prompt_type,
                    api_name="/predict",
                    **kwargs,
                )
            except Exception as e:
                # Chain the cause so the original traceback is not lost.
                raise GLMOCRException(f"预测失败: {str(e)}") from e

        return call_service()

    def recognize_text(self, image_path, **kwargs):
        """Run text recognition on ``image_path``."""
        return self._predict(image_path, "Text Recognition:", **kwargs)

    def recognize_table(self, image_path, **kwargs):
        """Run table recognition on ``image_path``."""
        return self._predict(image_path, "Table Recognition:", **kwargs)

    def recognize_formula(self, image_path, **kwargs):
        """Run formula recognition on ``image_path``."""
        return self._predict(image_path, "Formula Recognition:", **kwargs)
```
### 4.3 错误处理与异常定义
```python
class GLMOCRException(Exception):
    """Root of the GLM-OCR exception hierarchy."""


class GLMOCRTimeoutException(GLMOCRException):
    """Timeout error."""


class GLMOCRServiceException(GLMOCRException):
    """Service-side error."""
```
## 5. 高级功能与配置
### 5.1 批量处理支持
```python
def batch_recognize(self, image_paths, prompt_type, parallel=3):
    """Recognize several images concurrently, keeping input order.

    Args:
        image_paths: Iterable of image paths; one task is submitted per path.
        prompt_type: Prompt string handed to ``self._predict`` for every image.
        parallel: Worker-thread count for the pool.

    Returns:
        A list with one entry per input path, in submission order: the
        ``_predict`` result, or ``{"error": <message>}`` if that task raised.
    """
    from concurrent.futures import ThreadPoolExecutor

    def outcome_of(task):
        # A failed task is reported in-band instead of aborting the batch.
        try:
            return task.result()
        except Exception as exc:
            return {"error": str(exc)}

    with ThreadPoolExecutor(max_workers=parallel) as pool:
        pending = [
            pool.submit(self._predict, path, prompt_type)
            for path in image_paths
        ]
        return [outcome_of(task) for task in pending]
```
### 5.2 配置管理
```python
class GLMOCRConfig:
    """Holds SDK settings: class-level defaults overlaid with overrides."""

    DEFAULT_CONFIG = {
        'max_retries': 3,
        'timeout': 30,
        'base_url': 'http://localhost:7860',
        'delay_factor': 1,
        'max_delay': 10,
    }

    def __init__(self, **kwargs):
        """Copy the defaults, then apply any keyword overrides on top."""
        merged = dict(self.DEFAULT_CONFIG)
        merged.update(kwargs)
        self.config = merged

    def get(self, key, default=None):
        """Return the setting stored under ``key``, or ``default`` if absent."""
        try:
            return self.config[key]
        except KeyError:
            return default
```
## 6. 完整SDK代码示例
```python
import time
from functools import wraps
from gradio_client import Client
from typing import List, Optional
class GLMOCRException(Exception):
    """Base class for every error raised by the GLM-OCR SDK."""


class GLMOCRTimeoutException(GLMOCRException):
    """Timeout error."""
def retry_on_failure(max_retries=3, delay=1, max_delay=10):
    """Build a retry decorator with capped exponential backoff.

    Args:
        max_retries: Total number of attempts, the first call included.
            Values below 1 are treated as 1 so the wrapped function is always
            called at least once.
        delay: Base wait in seconds; the pause after failed attempt ``k``
            (0-based) is ``min(delay * 2 ** k, max_delay)``.
        max_delay: Upper bound in seconds for a single pause.

    Returns:
        A decorator; the wrapper it produces keeps the wrapped function's
        metadata via ``functools.wraps``.

    Raises:
        GLMOCRException: When every attempt failed. The last underlying
            exception is attached as ``__cause__``.
    """
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_exception = None
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    last_exception = e
                    # No point sleeping after the final attempt.
                    if attempt < attempts - 1:
                        time.sleep(min(delay * (2 ** attempt), max_delay))
            raise GLMOCRException(
                f"操作失败,重试{max_retries}次后仍无法完成: {last_exception}"
            ) from last_exception

        return wrapper

    return decorator
class GLMOCRClient:
    """Python SDK client for a Gradio-hosted GLM-OCR service.

    Wraps ``gradio_client.Client.predict`` behind task-specific helpers and
    adds retries plus a simple thread-pool batch mode.
    """

    def __init__(self, base_url: str = "http://localhost:7860",
                 max_retries: int = 3,
                 timeout: int = 30):
        """Initialise the client.

        Args:
            base_url: Service address; defaults to http://localhost:7860.
            max_retries: Total number of attempts per request; defaults to 3.
            timeout: Request timeout in seconds; defaults to 30.
        """
        self.base_url = base_url
        self.max_retries = max_retries
        self.timeout = timeout
        # gradio_client.Client has no ``timeout`` keyword; HTTP options such
        # as timeouts are forwarded to httpx through ``httpx_kwargs``.
        self.client = Client(base_url, httpx_kwargs={"timeout": timeout})

    def _predict(self, image_path: str, prompt: str, **kwargs):
        """Send one predict request, retrying on failure.

        The retry wrapper is created per call so the instance's own
        ``max_retries`` is honoured rather than a hard-coded value.

        Args:
            image_path: Path of the image, forwarded as ``image_path``.
            prompt: Prompt string, forwarded as ``prompt``.
            **kwargs: Extra keyword arguments forwarded to ``client.predict``.

        Returns:
            Whatever ``self.client.predict`` returns.

        Raises:
            GLMOCRException: When the call still fails after all attempts.
        """
        @retry_on_failure(max_retries=self.max_retries)
        def call_service():
            try:
                return self.client.predict(
                    image_path=image_path,
                    prompt=prompt,
                    api_name="/predict",
                    **kwargs,
                )
            except Exception as e:
                # Chain the cause so the original traceback is preserved.
                raise GLMOCRException(f"预测调用失败: {str(e)}") from e

        return call_service()

    def recognize_text(self, image_path: str, **kwargs) -> str:
        """Run text recognition.

        Args:
            image_path: Path of the image.
            **kwargs: Extra predict parameters.

        Returns:
            The recognition result from the service.
        """
        return self._predict(image_path, "Text Recognition:", **kwargs)

    def recognize_table(self, image_path: str, **kwargs) -> str:
        """Run table recognition.

        Args:
            image_path: Path of the image.
            **kwargs: Extra predict parameters.

        Returns:
            The table recognition result from the service.
        """
        return self._predict(image_path, "Table Recognition:", **kwargs)

    def recognize_formula(self, image_path: str, **kwargs) -> str:
        """Run formula recognition.

        Args:
            image_path: Path of the image.
            **kwargs: Extra predict parameters.

        Returns:
            The formula recognition result from the service.
        """
        return self._predict(image_path, "Formula Recognition:", **kwargs)

    def batch_recognize(self, image_paths: List[str],
                        prompt_type: str = "Text Recognition:",
                        parallel: int = 3) -> List[str]:
        """Recognize several images concurrently.

        Args:
            image_paths: Image paths to process.
            prompt_type: Prompt string used for every image.
            parallel: Number of worker threads.

        Returns:
            One entry per input path, in the same order as ``image_paths``:
            the recognition result, or an error message string if that image
            failed.
        """
        from concurrent.futures import ThreadPoolExecutor

        results = []
        with ThreadPoolExecutor(max_workers=parallel) as executor:
            submitted = [
                (image_path, executor.submit(self._predict, image_path, prompt_type))
                for image_path in image_paths
            ]
            # Collect in submission order (not completion order) so callers
            # can pair results[i] with image_paths[i].
            for image_path, future in submitted:
                try:
                    results.append(future.result())
                except Exception as e:
                    results.append(f"处理 {image_path} 时出错: {str(e)}")
        return results
# Usage example
if __name__ == "__main__":
    # Create the client
    client = GLMOCRClient(
        base_url="http://localhost:7860",
        max_retries=3,
        timeout=30,
    )

    # Recognize a single image
    try:
        single_result = client.recognize_text("/path/to/image.png")
    except GLMOCRException as e:
        print("识别失败:", e)
    else:
        print("识别结果:", single_result)

    # Recognize a batch of images
    image_paths = ["image1.png", "image2.png", "image3.png"]
    results = client.batch_recognize(image_paths, "Text Recognition:")
    for i, result in enumerate(results):
        print(f"图片 {i+1} 结果: {result}")
```
## 7. 实际应用案例
### 7.1 文档自动化处理
```python
import os
from pathlib import Path
def process_documents_folder(folder_path, output_file):
    """Run text recognition on every image in a folder and save the results.

    Args:
        folder_path: Directory scanned (non-recursively) for image files
            ending in .png, .jpg, .jpeg or .webp, matched case-insensitively.
        output_file: Path of the UTF-8 text file that receives one
            ``=== name ===`` section per image.
    """
    client = GLMOCRClient("http://localhost:7860")

    # Collect image files. Matching on the lower-cased name also picks up
    # upper-case extensions such as ".PNG"; sorting makes the processing and
    # output order deterministic.
    image_extensions = ('.png', '.jpg', '.jpeg', '.webp')
    image_files = sorted(
        path for path in Path(folder_path).glob("*")
        if path.is_file() and path.name.lower().endswith(image_extensions)
    )

    # Batch processing
    results = client.batch_recognize(
        [str(path) for path in image_files],
        "Text Recognition:",
        parallel=2,  # limit concurrency to avoid exhausting resources
    )

    # Save results.
    # NOTE(review): the zip below assumes batch_recognize returns results in
    # the same order as its input list — confirm against the client.
    with open(output_file, 'w', encoding='utf-8') as f:
        for image_path, result in zip(image_files, results):
            f.write(f"=== {image_path.name} ===\n")
            f.write(f"{result}\n\n")

    print(f"处理完成,结果已保存到 {output_file}")
```
### 7.2 实时监控与重试
```python
class MonitoringGLMOCRClient(GLMOCRClient):
    """Client variant that records success/failure counts and timing.

    One logical request (including the retries performed by the base class)
    counts as a single success or a single failure.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base client and zero the counters."""
        import threading

        super().__init__(*args, **kwargs)
        self.success_count = 0
        self.failure_count = 0
        self.total_time = 0
        # batch_recognize calls _predict from worker threads, so counter
        # updates must be serialised.
        self._stats_lock = threading.Lock()

    def _predict(self, image_path, prompt, **kwargs):
        """Delegate to the base ``_predict`` and record the outcome.

        No retry decorator here: the base implementation already retries,
        and decorating again would multiply the attempts and backoff delays.

        Raises:
            Exception: Whatever the base ``_predict`` raised, unchanged.
        """
        start_time = time.perf_counter()
        try:
            result = super()._predict(image_path, prompt, **kwargs)
        except Exception:
            with self._stats_lock:
                self.failure_count += 1
            raise  # bare raise keeps the original traceback
        elapsed = time.perf_counter() - start_time
        with self._stats_lock:
            self.success_count += 1
            self.total_time += elapsed
        return result

    def get_stats(self):
        """Return a snapshot of the collected statistics.

        Returns:
            dict with ``total_requests``, ``success_count``,
            ``failure_count``, ``success_rate`` (percentage string) and
            ``average_time`` (seconds string, averaged over successes).
        """
        with self._stats_lock:
            success_count = self.success_count
            failure_count = self.failure_count
            total_time = self.total_time
        total_requests = success_count + failure_count
        success_rate = (success_count / total_requests * 100) if total_requests > 0 else 0
        avg_time = (total_time / success_count) if success_count > 0 else 0
        return {
            'total_requests': total_requests,
            'success_count': success_count,
            'failure_count': failure_count,
            'success_rate': f"{success_rate:.2f}%",
            'average_time': f"{avg_time:.2f}s"
        }
```
## 8. 总结
通过这个GLM-OCR Python SDK封装,我们成功解决了原生Gradio Client调用复杂、缺乏重试机制等问题。主要优势包括:
- **简化调用**:将复杂的predict调用封装成简单的recognize_text、recognize_table等方法
- **增强稳定性**:内置智能重试机制,支持指数退避策略
- **批量处理**:支持并发处理多张图片,提高处理效率
- **错误处理**:提供统一的异常处理机制,便于调试和监控
- **配置灵活**:支持自定义重试次数、超时时间等参数
这个SDK封装不仅适用于GLM-OCR,其设计思路和实现方式也可以借鉴到其他Gradio-based的模型服务中,为AI应用的开发提供更加稳定和高效的基础设施支持。
---
> **获取更多AI镜像**
>
> 想探索更多AI镜像和应用场景?访问 [CSDN星图镜像广场](https://ai.csdn.net/?utm_source=mirror_blog_end),提供丰富的预置镜像,覆盖大模型推理、图像生成、视频生成、模型微调等多个领域,支持一键部署。