# LibreOffice隐藏技巧:用Python脚本实现PPT自动转PDF并添加水印(2023最新版)
在快节奏的商业环境中,市场人员和设计师经常需要处理大量演示文档。无论是客户提案、产品展示还是内部汇报,将PPT转换为PDF并添加统一的水印是常见的需求。传统的手动操作不仅耗时耗力,还容易出错。本文将介绍如何利用Python脚本结合LibreOffice的强大功能,实现PPT自动转PDF并添加水印的一站式解决方案。
## 1. 环境准备与基础配置
在开始之前,我们需要确保系统已经安装了必要的软件和库。LibreOffice作为开源办公套件,提供了丰富的命令行接口和API,可以满足各种自动化需求。
### 1.1 安装LibreOffice和Python
对于不同操作系统,安装方式略有差异:
- **Ubuntu/Debian**:
```bash
sudo apt update
sudo apt install libreoffice python3 python3-pip
```
- **Windows**:
从LibreOffice官网下载安装包,Python推荐使用官方安装程序或Miniconda。
安装完成后,验证LibreOffice是否可用:
```bash
soffice --version
```
### 1.2 安装Python依赖库
我们需要安装几个关键的Python库来支持我们的自动化脚本:
```bash
pip install pyuno unotools pillow
```
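> 注意:`pyuno`(即`uno`模块)是LibreOffice的Python接口,通常随LibreOffice一起发布,而不是通过pip安装:在Ubuntu/Debian上请执行`sudo apt install python3-uno`,在Windows上请使用LibreOffice自带的Python解释器;如果`pip install pyuno`失败,可以将它从上面的命令中去掉。`unotools`提供了更方便的API封装,`Pillow`用于图像处理。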
### 1.3 配置LibreOffice服务模式
为了高效处理多个文档,我们可以让LibreOffice以服务模式运行:
```python
import subprocess
# Start LibreOffice as a background service. Popen returns immediately, so
# the process keeps running until it is terminated explicitly.
lo_process = subprocess.Popen([
    "soffice",
    "--headless",
    "--invisible",
    "--nocrashreport",
    "--nodefault",
    "--nologo",
    "--nofirststartwizard",
    # Accept UNO (urp) connections on localhost:2002.
    "--accept=socket,host=localhost,port=2002;urp;"
])
```
## 2. PPT转PDF基础实现
掌握了基础配置后,我们先实现最基本的PPT转PDF功能。
### 2.1 简单转换命令
使用LibreOffice命令行工具可以直接完成格式转换:
```bash
soffice --headless --convert-to pdf:impress_pdf_Export presentation.pptx --outdir output/
```
对应的Python实现:
```python
import subprocess
def convert_ppt_to_pdf(input_file, output_dir):
    """Convert a presentation to PDF with headless LibreOffice.

    Args:
        input_file: Path of the .ppt/.pptx file to convert.
        output_dir: Directory that receives the generated PDF.

    Returns:
        The path LibreOffice writes to: ``output_dir`` joined with the input
        file's base name and a ``.pdf`` extension.

    Raises:
        subprocess.CalledProcessError: If ``soffice`` exits with a non-zero
            status.
        FileNotFoundError: If the ``soffice`` executable cannot be found.
    """
    import os

    subprocess.run(
        [
            "soffice",
            "--headless",
            "--convert-to",
            "pdf:impress_pdf_Export",
            input_file,
            "--outdir",
            output_dir,
        ],
        check=True,  # surface a failing soffice instead of ignoring it
    )
    stem = os.path.splitext(os.path.basename(input_file))[0]
    return os.path.join(output_dir, stem + ".pdf")
```
### 2.2 批量处理多个文件
对于需要处理多个PPT文件的情况,我们可以扩展上述函数:
```python
import os
def batch_convert_ppt_to_pdf(input_dir, output_dir):
    """Convert every .ppt/.pptx file found directly in ``input_dir``.

    The output directory is created if needed. Extensions are matched
    case-insensitively so files such as ``DECK.PPTX`` are not skipped, and
    entries are handled in sorted order so runs are reproducible.

    Args:
        input_dir: Directory scanned (non-recursively) for presentations.
        output_dir: Directory that receives the generated PDFs.
    """
    os.makedirs(output_dir, exist_ok=True)
    for filename in sorted(os.listdir(input_dir)):
        if filename.lower().endswith((".ppt", ".pptx")):
            convert_ppt_to_pdf(os.path.join(input_dir, filename), output_dir)
```
### 2.3 性能优化技巧
处理大量文件时,可以考虑以下优化措施:
- 使用`--norestore`参数避免恢复检测
- 设置合理的超时时间
- 并行处理多个文件(但要注意LibreOffice的资源占用)
优化后的代码示例:
```python
from concurrent.futures import ThreadPoolExecutor
def optimized_batch_convert(files, output_dir, max_workers=4):
    """Convert several presentations concurrently on a thread pool.

    Every file is submitted first; the results are then awaited in
    submission order, so the first failing conversion re-raises here.

    Args:
        files: Iterable of presentation paths.
        output_dir: Directory passed to each conversion.
        max_workers: Size of the thread pool.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [
            pool.submit(convert_ppt_to_pdf, path, output_dir)
            for path in files
        ]
        for job in pending:
            job.result()
```
## 3. 高级水印添加技术
基础转换完成后,我们来实现更专业的PDF水印功能。
### 3.1 使用LibreOffice API添加水印
LibreOffice的API允许我们直接操作文档内容:
```python
import uno
from com.sun.star.beans import PropertyValue
def add_watermark(input_pdf, output_pdf, watermark_text,
                  font_size=48, color=0x999999, angle=30, opacity=None):
    """Stamp a rotated text watermark on every page of a PDF.

    Connects to the LibreOffice service listening on localhost:2002, loads
    the PDF, adds one text shape per page and exports the result as PDF.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the watermarked PDF to write.
        watermark_text: Text of the watermark.
        font_size: Character height in points.
        color: Text colour as a 0xRRGGBB integer.
        angle: Rotation in degrees (UNO stores it in 1/100 degree).
        opacity: Optional value in 0..1; ``None`` leaves the text opaque.
            It is mapped to the ``CharTransparence`` character property,
            which requires LibreOffice 7.0 or newer.

    Raises:
        RuntimeError: If LibreOffice could not load ``input_pdf``.
    """
    local_context = uno.getComponentContext()
    resolver = local_context.ServiceManager.createInstanceWithContext(
        "com.sun.star.bridge.UnoUrlResolver", local_context
    )
    context = resolver.resolve(
        "uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext"
    )
    desktop = context.ServiceManager.createInstanceWithContext(
        "com.sun.star.frame.Desktop", context
    )

    source_url = uno.systemPathToFileUrl(os.path.abspath(input_pdf))
    target_url = uno.systemPathToFileUrl(os.path.abspath(output_pdf))
    doc = desktop.loadComponentFromURL(source_url, "_blank", 0, tuple())
    if doc is None:
        raise RuntimeError(f"LibreOffice could not load {input_pdf}")

    try:
        pages = doc.getDrawPages()
        for index in range(pages.getCount()):
            page = pages.getByIndex(index)
            watermark = doc.createInstance("com.sun.star.drawing.TextShape")
            # A shape must be on a page before its text and properties
            # can be set reliably.
            page.add(watermark)
            watermark.setString(watermark_text)
            watermark.TextAutoGrowHeight = True
            watermark.TextAutoGrowWidth = True
            watermark.CharColor = color
            watermark.CharHeight = font_size
            watermark.CharWeight = 150  # bold
            watermark.RotateAngle = int(angle * 100)
            if opacity is not None:
                transparency = round((1 - opacity) * 100)
                watermark.CharTransparence = max(0, min(100, transparency))
            watermark.setPosition(
                uno.createUnoStruct("com.sun.star.awt.Point", 5000, 5000)
            )
            watermark.setSize(
                uno.createUnoStruct("com.sun.star.awt.Size", 10000, 1000)
            )

        # Without an explicit export filter storeToURL writes the document's
        # native format, not PDF.
        export_filter = PropertyValue()
        export_filter.Name = "FilterName"
        export_filter.Value = "draw_pdf_Export"
        doc.storeToURL(target_url, (export_filter,))
    finally:
        doc.close(True)
```
### 3.2 使用Pillow添加图像水印
如果需要使用图片作为水印,可以结合Pillow库。下面的示例依赖ImageMagick的`convert`命令(读取PDF通常还需要Ghostscript),并且会把每一页栅格化为图片,因此输出PDF中的文字将无法再被选中或搜索:
```python
from PIL import Image, ImageDraw, ImageFont
import tempfile
def add_image_watermark(input_pdf, output_pdf, watermark_image_path, opacity=0.3):
    """Overlay a centred image watermark on every page of a PDF.

    Each page is rasterised to PNG with ImageMagick's ``convert``, stamped
    with the watermark (scaled to one third of the page width and height)
    and the stamped pages are merged back into ``output_pdf``.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to write.
        watermark_image_path: Path of the watermark image.
        opacity: Factor in 0..1 applied to the watermark's alpha channel.

    Raises:
        subprocess.CalledProcessError: If ``convert`` fails.
        RuntimeError: If rasterising produced no page images.
    """
    # Load the watermark once instead of re-reading it for every page.
    with Image.open(watermark_image_path) as source:
        base_mark = source.convert("RGBA")

    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.run(
            [
                "convert",
                "-density", "150",
                input_pdf,
                "-quality", "90",
                # Four digits keep lexicographic order equal to page order
                # for documents with more than 99 pages.
                os.path.join(tmpdir, "page_%04d.png"),
            ],
            check=True,
        )

        watermarked_images = []
        for page in sorted(os.listdir(tmpdir)):
            if not (page.startswith("page_") and page.endswith(".png")):
                continue
            with Image.open(os.path.join(tmpdir, page)) as raw:
                img = raw.convert("RGBA")

            watermark = base_mark.resize(
                (img.width // 3, img.height // 3),
                Image.Resampling.LANCZOS,
            )
            # Scale the existing alpha channel. A constant alpha would make
            # the watermark's transparent background visible.
            alpha = watermark.getchannel("A").point(
                lambda value: int(value * opacity)
            )
            watermark.putalpha(alpha)

            position = (
                (img.width - watermark.width) // 2,
                (img.height - watermark.height) // 2,
            )
            img.paste(watermark, position, watermark)

            output_path = os.path.join(tmpdir, f"watermarked_{page}")
            img.save(output_path, "PNG")
            watermarked_images.append(output_path)

        if not watermarked_images:
            raise RuntimeError(f"no pages were rendered from {input_pdf}")

        subprocess.run(["convert", *watermarked_images, output_pdf], check=True)
```
### 3.3 水印样式自定义
为了让水印更加专业,我们可以提供多种自定义选项:
```python
from enum import Enum
class WatermarkPosition(Enum):
    """Placement options accepted by ``add_custom_watermark``."""

    CENTER = "center"
    TOP_LEFT = "top_left"
    TOP_RIGHT = "top_right"
    BOTTOM_LEFT = "bottom_left"
    BOTTOM_RIGHT = "bottom_right"
    TILED = "tiled"
def add_custom_watermark(input_pdf, output_pdf, watermark_text,
                         position=WatermarkPosition.CENTER,
                         font_size=48, color=0x999999,
                         angle=30, opacity=0.5):
    """Placeholder for a configurable watermark routine (not implemented).

    The body is a stub: calling it does nothing and returns ``None``.
    """
    # Implementation details are similar to the add_watermark function above.
    # Adjust the watermark position according to the position parameter.
    # Support advanced layouts such as tiling.
    pass
```
## 4. 完整解决方案与实战案例
将前面介绍的各个模块组合起来,我们可以构建一个完整的PPT转PDF加水印的解决方案。
### 4.1 完整工作流程
1. **输入处理**:接收PPT文件或目录
2. **格式转换**:将PPT转换为PDF
3. **水印添加**:根据配置添加文字或图片水印
4. **输出管理**:保存到指定位置,保持原始文件名结构
```python
import os
from datetime import datetime
class PPTProcessor:
    """Run a LibreOffice service and turn presentations into watermarked PDFs.

    Use it as a context manager: entering starts ``soffice`` listening on
    localhost:2002, leaving stops it again.

    ``config`` is a mapping. The keys read here are ``watermark_text``,
    ``watermark_image``, ``font_size``, ``color`` and ``opacity``.
    """

    def __init__(self, config):
        self.config = config
        self.lo_process = None  # Popen handle of the running service, if any

    def __enter__(self):
        self.start_libreoffice()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop_libreoffice()

    def start_libreoffice(self):
        """Launch headless LibreOffice accepting UNO connections on port 2002."""
        self.lo_process = subprocess.Popen([
            "soffice",
            "--headless",
            "--invisible",
            "--nocrashreport",
            "--nodefault",
            "--nologo",
            "--nofirststartwizard",
            "--accept=socket,host=localhost,port=2002;urp;"
        ])

    def stop_libreoffice(self):
        """Terminate the service; kill it if it does not exit within 5 seconds."""
        if not self.lo_process:
            return
        self.lo_process.terminate()
        try:
            self.lo_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            self.lo_process.kill()
            self.lo_process.wait()  # reap the killed process
        finally:
            self.lo_process = None

    def process_file(self, input_path, output_dir):
        """Convert one presentation to PDF and apply the configured watermark.

        Args:
            input_path: Path of the .ppt/.pptx file.
            output_dir: Directory for the result (created if missing).

        Returns:
            Path of the final PDF: ``output_dir/<input stem>.pdf``.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Derive the name with splitext; chained str.replace() breaks on
        # upper-case extensions and on names that contain ".ppt".
        pdf_name = os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
        temp_pdf = os.path.join(output_dir, f"temp_{pdf_name}")
        final_pdf = os.path.join(output_dir, pdf_name)

        # Step 1: presentation -> PDF. LibreOffice writes <stem>.pdf, which
        # is the final name, so park it under the temporary name.
        convert_ppt_to_pdf(input_path, output_dir)
        os.replace(final_pdf, temp_pdf)

        # Step 2: watermark from the temporary file into the final file.
        try:
            if self.config.get("watermark_text"):
                add_watermark(
                    temp_pdf, final_pdf,
                    self.config["watermark_text"],
                    font_size=self.config.get("font_size", 48),
                    color=self.config.get("color", 0x999999),
                    opacity=self.config.get("opacity", 0.3)
                )
            elif self.config.get("watermark_image"):
                add_image_watermark(
                    temp_pdf, final_pdf,
                    self.config["watermark_image"],
                    opacity=self.config.get("opacity", 0.3)
                )
            else:
                os.replace(temp_pdf, final_pdf)
        finally:
            # Remove the temporary file even when watermarking failed.
            if os.path.exists(temp_pdf):
                os.remove(temp_pdf)
        return final_pdf

    def process_directory(self, input_dir, output_dir):
        """Process every .ppt/.pptx file directly inside ``input_dir``.

        Returns:
            A list of ``(filename, output_path_or_error, status)`` tuples,
            where status is ``"success"`` or ``"failed"``.
        """
        results = []
        for filename in os.listdir(input_dir):
            if not filename.lower().endswith((".ppt", ".pptx")):
                continue
            input_path = os.path.join(input_dir, filename)
            try:
                output_path = self.process_file(input_path, output_dir)
            except Exception as e:
                # One bad file must not abort the batch; record and go on.
                results.append((filename, str(e), "failed"))
            else:
                results.append((filename, output_path, "success"))
        return results
```
### 4.2 配置示例
我们可以使用YAML文件来定义处理配置:
```yaml
# config.yaml
watermark:
text: "CONFIDENTIAL"
font_size: 56
color: "#CCCCCC" # 浅灰色
opacity: 0.4
angle: 45
input:
directory: "/path/to/ppt/files"
file_pattern: "*.pptx"
output:
directory: "/path/to/output"
overwrite: false
logging:
level: "INFO"
file: "/var/log/ppt_processor.log"
```
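对应的配置加载代码(注意:上面的YAML使用嵌套结构,如`watermark.text`,而4.1节的`PPTProcessor`读取的是扁平键,如`watermark_text`,因此加载后需要先做一次键名映射;`color`也需要从`"#CCCCCC"`字符串转换为整数):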
```python
import yaml
def load_config(config_path):
    """Load the YAML configuration file at ``config_path``.

    The file is read as UTF-8 so non-ASCII comments and values decode the
    same way on every platform.

    Returns:
        The parsed document, or an empty dict when the file is empty.
    """
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # safe_load returns None for an empty document.
    return {} if config is None else config
```
### 4.3 实战案例:市场资料批量处理
假设市场部门每周需要处理50份产品演示PPT,转换为PDF并添加"内部使用"水印:
```python
# Weekly marketing batch: convert every presentation in the shared folder
# and stamp the PDFs with an internal-use watermark.
config = {
    "watermark_text": "内部使用 - 严禁外传",
    "font_size": 42,
    "color": 0x888888,
    "opacity": 0.3,
    "angle": 30,
}
input_dir = "/mnt/share/marketing/presentations"
output_dir = "/mnt/share/marketing/pdf_versions"

with PPTProcessor(config) as processor:
    results = processor.process_directory(input_dir, output_dir)

# Each result is (filename, output_or_error, status).
success_count = len([entry for entry in results if entry[2] == "success"])
print(f"处理完成: {success_count}个成功, {len(results)-success_count}个失败")
```
对于失败案例,我们可以记录详细日志以便后续排查:
```python
import logging
# Write one log line per processed file to ppt_processor.log.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="ppt_processor.log",
)
for filename, result, status in results:
    if status != "failed":
        logging.info(f"处理成功: {filename} -> {result}")
        continue
    logging.error(f"处理失败: {filename} - {result}")
```
## 5. 高级技巧与性能优化
掌握了基础功能后,我们来探讨一些高级技巧和优化策略。
### 5.1 使用LibreOffice宏增强功能
LibreOffice支持录制和使用宏,我们可以利用这个特性扩展功能:
```python
def apply_macro_to_pdf(input_pdf, output_pdf, macro_name):
    """Invoke a macro from ``Standard.Module1`` through headless soffice.

    The command receives the macro URL, ``input_pdf`` and ``--outdir`` set
    to the directory part of ``output_pdf``. The exit status is not checked.

    NOTE(review): ``--outdir`` is normally paired with ``--convert-to``;
    confirm that it has an effect when a macro URL is passed instead.
    """
    macro_url = "macro:///Standard.Module1.{}".format(macro_name)
    target_dir = os.path.dirname(output_pdf)
    command = [
        "soffice",
        "--headless",
        macro_url,
        input_pdf,
        "--outdir", target_dir,
    ]
    subprocess.run(command)
```
### 5.2 多线程处理优化
对于大量文件,我们可以实现更精细的多线程控制:
```python
import queue
import threading
class PPTWorker(threading.Thread):
    """Daemon thread that drains a task queue through its own PPTProcessor.

    Each task is a dict with ``"input"`` (file path) and ``"output"``
    (directory). One result dict per task is put on ``result_queue``.
    """

    def __init__(self, task_queue, result_queue, config):
        super().__init__()
        self.daemon = True
        self.config = config
        self.result_queue = result_queue
        self.task_queue = task_queue

    def run(self):
        """Process tasks until the queue is empty, then shut down."""
        with PPTProcessor(self.config) as processor:
            while True:
                try:
                    task = self.task_queue.get_nowait()
                except queue.Empty:
                    return
                try:
                    try:
                        pdf_path = processor.process_file(
                            task["input"], task["output"]
                        )
                    except Exception as e:
                        report = {
                            "input": task["input"],
                            "error": str(e),
                            "status": "failed"
                        }
                    else:
                        report = {
                            "input": task["input"],
                            "output": pdf_path,
                            "status": "success"
                        }
                    self.result_queue.put(report)
                finally:
                    # Always acknowledge the task, whatever the outcome.
                    self.task_queue.task_done()
def parallel_process(files, output_dir, config, num_workers=4):
    """Process ``files`` with ``num_workers`` PPTWorker threads.

    Args:
        files: Iterable of presentation paths.
        output_dir: Output directory handed to every task.
        config: Configuration passed to each worker.
        num_workers: Number of worker threads to start.

    Returns:
        A list of result dicts. Tasks that no worker picked up (for example
        because every worker failed to start) are reported with status
        ``"failed"`` instead of being dropped.
    """
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for file in files:
        task_queue.put({"input": file, "output": output_dir})

    workers = []
    for _ in range(num_workers):
        worker = PPTWorker(task_queue, result_queue, config)
        worker.start()
        workers.append(worker)

    # Wait for the threads, not for task_queue.join(): join() on the queue
    # blocks forever when a worker dies before acknowledging its tasks.
    for worker in workers:
        worker.join()

    results = []
    while True:
        try:
            results.append(result_queue.get_nowait())
        except queue.Empty:
            break

    # Anything still queued was never processed; report it explicitly.
    while True:
        try:
            task = task_queue.get_nowait()
        except queue.Empty:
            break
        results.append({
            "input": task["input"],
            "error": "no worker was available to process this file",
            "status": "failed"
        })
    return results
```
### 5.3 内存管理与错误恢复
长时间运行的批处理需要注意内存管理和错误恢复:
```python
class ResilientPPTProcessor:
    """Retry wrapper that runs each attempt in a fresh PPTProcessor.

    ``restart_count`` accumulates across calls. Once it reaches
    ``max_restarts`` a failure is re-raised immediately.
    """

    def __init__(self, config, max_retries=3):
        self.config = config
        self.max_retries = max_retries
        self.restart_count = 0
        self.max_restarts = 5

    def process_with_retry(self, input_path, output_dir):
        """Process one file, retrying up to ``max_retries`` times.

        Returns:
            The path returned by ``PPTProcessor.process_file``.

        Raises:
            ValueError: If ``max_retries`` is smaller than 1, so no attempt
                could be made.
            Exception: The last error once retries or restarts run out.
        """
        if self.max_retries < 1:
            raise ValueError("max_retries must be at least 1")

        last_error = None
        for _ in range(self.max_retries):
            try:
                with PPTProcessor(self.config) as processor:
                    return processor.process_file(input_path, output_dir)
            except Exception as e:
                last_error = e
                if self.restart_count >= self.max_restarts:
                    break  # restart budget exhausted
                self.restart_count += 1
        raise last_error
```
### 5.4 与云存储集成
现代工作流程常常需要与云存储服务集成,我们可以扩展处理器支持云存储:
```python
from google.cloud import storage
class CloudPPTProcessor(PPTProcessor):
    """PPTProcessor that reads from and writes to a Google Cloud Storage bucket.

    ``cloud_config`` must provide ``service_account_key`` (path of the JSON
    key file) and ``bucket_name``.
    """

    def __init__(self, config, cloud_config):
        super().__init__(config)
        self.cloud_config = cloud_config
        self.storage_client = storage.Client.from_service_account_json(
            cloud_config["service_account_key"]
        )
        self.bucket = self.storage_client.bucket(cloud_config["bucket_name"])

    def download_from_cloud(self, cloud_path, local_path):
        """Download the blob ``cloud_path`` to ``local_path``."""
        blob = self.bucket.blob(cloud_path)
        blob.download_to_filename(local_path)

    def upload_to_cloud(self, local_path, cloud_path):
        """Upload ``local_path`` to the blob ``cloud_path``."""
        blob = self.bucket.blob(cloud_path)
        blob.upload_from_filename(local_path)

    def process_cloud_file(self, input_cloud_path, output_cloud_path):
        """Download a presentation, process it and upload the resulting PDF.

        Returns:
            ``output_cloud_path``.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            local_input = os.path.join(tmpdir, os.path.basename(input_cloud_path))
            self.download_from_cloud(input_cloud_path, local_input)
            # process_file returns the PDF it wrote; upload that file
            # rather than guessing its name.
            local_output = self.process_file(local_input, tmpdir)
            self.upload_to_cloud(local_output, output_cloud_path)
        return output_cloud_path
```
## 6. 异常处理与日志记录
健壮的生产环境应用需要完善的异常处理和日志记录机制。
### 6.1 常见错误处理
LibreOffice处理过程中可能遇到的典型错误:
```python
class LibreOfficeError(Exception):
    """Base class for the errors defined in this section."""
    pass


class ConversionError(LibreOfficeError):
    """Raised when converting a document fails."""
    pass


class WatermarkError(LibreOfficeError):
    """Error type reserved for watermarking failures."""
    pass
def handle_conversion(input_file, output_dir):
    """Convert ``input_file`` to PDF and verify that a non-empty file exists.

    Args:
        input_file: Path of the document to convert.
        output_dir: Directory that receives the PDF.

    Returns:
        Path of the generated PDF.

    Raises:
        ConversionError: If soffice is missing, times out after 300
            seconds, exits non-zero, or the expected output is missing or
            empty. Low-level errors are chained as ``__cause__``.
    """
    # Only the subprocess call is guarded, so a FileNotFoundError from the
    # verification below is not misreported as "LibreOffice not installed".
    try:
        result = subprocess.run(
            ["soffice", "--headless", "--convert-to", "pdf", input_file, "--outdir", output_dir],
            capture_output=True,
            text=True,
            timeout=300
        )
    except subprocess.TimeoutExpired as err:
        raise ConversionError("转换超时") from err
    except FileNotFoundError as err:
        raise ConversionError("LibreOffice未安装或路径错误") from err

    if result.returncode != 0:
        stderr = result.stderr or ""
        if "filter" in stderr and "not found" in stderr:
            raise ConversionError(f"缺少必要的过滤器: {result.stderr}")
        if "could not load" in stderr:
            raise ConversionError(f"无法加载文件: {result.stderr}")
        raise ConversionError(f"未知错误: {result.stderr}")

    # soffice can exit 0 without producing output, so check the file itself.
    output_file = os.path.join(
        output_dir,
        os.path.splitext(os.path.basename(input_file))[0] + ".pdf"
    )
    if not os.path.exists(output_file):
        raise ConversionError("转换成功但输出文件不存在")
    if os.path.getsize(output_file) == 0:
        raise ConversionError("输出文件为空")
    return output_file
```
### 6.2 结构化日志记录
使用结构化日志便于后续分析:
```python
import json
from datetime import datetime
class StructuredLogger:
    """Append events to a file as JSON Lines (one JSON object per line)."""

    def __init__(self, log_file):
        self.log_file = log_file

    def log(self, event_type, **kwargs):
        """Append one event.

        The entry holds ``timestamp`` (timezone-aware UTC, ISO 8601),
        ``event`` and every keyword argument. The file is written as UTF-8
        and non-ASCII text is stored unescaped so it stays readable.

        Args:
            event_type: Value stored under ``"event"``.
            **kwargs: Extra JSON-serialisable fields.
        """
        from datetime import timezone

        log_entry = {
            # datetime.utcnow() is deprecated and returns a naive value.
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "event": event_type,
            **kwargs
        }
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
# Usage example: log the start and the outcome of a single conversion.
logger = StructuredLogger("processing.log")
logger.log("conversion_start", input_file="presentation.pptx")
try:
    output = convert_ppt_to_pdf("presentation.pptx", "output")
    logger.log("conversion_success", input_file="presentation.pptx", output_file=output)
except Exception as e:
    # Any failure, including one raised while logging success, ends up here.
    logger.log("conversion_failed", input_file="presentation.pptx", error=str(e))
```
### 6.3 监控与告警
对于关键业务应用,可以集成监控告警:
```python
import requests
class Monitor:
    """Send alert messages to a webhook URL."""

    def __init__(self, webhook_url, timeout=10):
        self.webhook_url = webhook_url
        self.timeout = timeout  # seconds before an alert request is abandoned

    def send_alert(self, message, level="error"):
        """POST an alert to the webhook; never raises.

        Sending is best effort: any failure (network error, timeout, HTTP
        error status) is printed instead of propagated, so alerting cannot
        take down the caller.

        Args:
            message: Alert text.
            level: Severity label, upper-cased and prefixed to the text.
        """
        payload = {
            "text": f"[{level.upper()}] {message}",
            "attachments": []
        }
        try:
            # Without a timeout, requests can block indefinitely.
            response = requests.post(
                self.webhook_url, json=payload, timeout=self.timeout
            )
            response.raise_for_status()
        except Exception as e:
            print(f"无法发送告警: {e}")
# Usage example: alert on any failure, then let the exception propagate.
monitor = Monitor("https://hooks.slack.com/services/...")
try:
    process_files()
except Exception as e:
    monitor.send_alert(f"PPT处理失败: {str(e)}")
    raise
```
## 7. 容器化部署方案
为了便于在不同环境中部署,我们可以将整个解决方案容器化。
### 7.1 Dockerfile配置
```dockerfile
FROM ubuntu:22.04
# Keep apt/debconf from prompting (e.g. for tzdata), which can stall an
# unattended image build. ARG limits the setting to build time.
ARG DEBIAN_FRONTEND=noninteractive
# Install base dependencies
RUN apt-get update && apt-get install -y \
    libreoffice \
    python3 \
    python3-pip \
    imagemagick \
    fonts-noto-cjk \
    fonts-liberation \
    && rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the application code
COPY app /app
WORKDIR /app
# Environment: unbuffered Python output and a UTF-8 locale
ENV PYTHONUNBUFFERED=1
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
# Entrypoint script
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
```
### 7.2 启动脚本
```bash
#!/bin/bash
# entrypoint.sh
# Start LibreOffice in the background, listening for UNO connections.
# NOTE(review): host=0.0.0.0 accepts connections on every interface and the
# socket has no authentication; confirm this exposure is intended.
soffice --headless --invisible --nocrashreport \
--nodefault --nologo --nofirststartwizard \
--accept="socket,host=0.0.0.0,port=2002;urp;" &
# Give LibreOffice time to start.
# NOTE(review): a fixed delay does not guarantee the port is ready; polling
# the port would be more reliable.
sleep 5
# Run the Python application, replacing this shell and forwarding arguments.
exec python3 main.py "$@"
```
### 7.3 Kubernetes部署
对于大规模部署,可以使用Kubernetes:
```yaml
# k8s-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: ppt-processor
spec:
replicas: 3
selector:
matchLabels:
app: ppt-processor
template:
metadata:
labels:
app: ppt-processor
spec:
containers:
- name: processor
image: your-registry/ppt-processor:latest
ports:
- containerPort: 2002
resources:
limits:
memory: "1Gi"
cpu: "1"
requests:
memory: "512Mi"
cpu: "500m"
volumeMounts:
- name: data-volume
mountPath: /data
volumes:
- name: data-volume
persistentVolumeClaim:
claimName: ppt-pvc
```
### 7.4 性能调优建议
在容器环境中,可以调整以下参数优化性能:
- 降低LibreOffice启动开销:`--nofirststartwizard --norestore`(跳过首次启动向导和崩溃恢复;这两个参数并不能限制内存,内存上限需要通过容器的资源限制来控制)
- 调整JVM参数(如果使用Java组件):`-env:URE_BOOTSTRAP=... -env:JFW_PLUGIN_DO_NOT_CHECK_ACCESSIBILITY=1`
- 设置合理的超时时间
- 根据负载动态调整副本数量
## 8. 替代方案比较与选择
虽然LibreOffice是一个强大的工具,但在某些场景下可能需要考虑替代方案。
### 8.1 方案对比表
| 方案 | 优点 | 缺点 | 适用场景 |
|------|------|------|----------|
| LibreOffice命令行 | 免费开源,支持多种格式,功能全面 | 资源占用高,批量处理速度一般 | 需要处理多种文档格式,预算有限 |
| Microsoft Office COM接口 | 兼容性好,性能优秀 | 需要Windows和Office授权,成本高 | 企业环境,已部署Office |
| 云API(如Google Docs) | 无需本地安装,易于扩展 | 需要网络,可能有隐私问题 | 云原生应用,需要高可用性 |
| 专业PDF库(PyPDF2等) | 轻量级,专注PDF处理 | 无法直接处理PPT,功能有限 | 只需要简单PDF操作 |
### 8.2 混合方案
在实际项目中,可以结合多种技术:
```python
def convert_ppt_to_pdf_hybrid(input_file, output_file):
    """Convert with LibreOffice, then a cloud API, then a local fallback.

    Each stage runs only if the previous one raised its expected error
    (``ConversionError`` or ``CloudError``). Those failures are logged as
    warnings rather than silently discarded. Errors from the last stage
    propagate to the caller.
    """
    import logging

    log = logging.getLogger(__name__)

    # First choice: LibreOffice.
    try:
        convert_with_libreoffice(input_file, output_file)
        return
    except ConversionError as err:
        log.warning("LibreOffice conversion failed for %s: %s", input_file, err)

    # Second choice: cloud API.
    try:
        convert_with_cloud_api(input_file, output_file)
        return
    except CloudError as err:
        log.warning("Cloud conversion failed for %s: %s", input_file, err)

    # Last resort: another local tool.
    convert_with_fallback_tool(input_file, output_file)
```
### 8.3 选择建议
根据项目需求选择合适方案:
- **预算有限的开源项目**:LibreOffice + Python
- **企业Windows环境**:Microsoft Office COM接口
- **高并发云应用**:专业云API服务
- **简单PDF处理**:PyPDF2/ReportLab等库
## 9. 扩展功能与定制开发
基础功能实现后,可以考虑扩展更多实用功能。
### 9.1 PDF元数据处理
```python
from PyPDF2 import PdfReader, PdfWriter
def update_pdf_metadata(input_pdf, output_pdf, metadata):
    """Copy every page of ``input_pdf`` to ``output_pdf`` and set metadata.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to write.
        metadata: Mapping of metadata entries, added one key at a time.
            NOTE(review): PDF info keys conventionally start with "/"
            (e.g. "/Title"); confirm callers pass them in that form.
    """
    source = PdfReader(input_pdf)
    target = PdfWriter()
    for sheet in source.pages:
        target.add_page(sheet)

    for field, text in metadata.items():
        target.add_metadata({field: text})

    with open(output_pdf, "wb") as sink:
        target.write(sink)
```
### 9.2 PDF加密与权限控制
```python
def encrypt_pdf(input_pdf, output_pdf, password, permissions=None):
    """Write a password-protected copy of ``input_pdf`` to ``output_pdf``.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the encrypted PDF to write.
        password: User password required to open the document.
        permissions: ``None`` or an empty sequence keeps the library's
            default permissions. An int is passed through as the
            permission flag, and any other iterable is OR-ed together.
            NOTE(review): this assumes the entries are PDF permission bit
            flags; confirm against callers.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)

    encrypt_options = {
        "user_password": password,
        "owner_password": None,
        "use_128bit": True,
    }
    # PdfWriter.encrypt() has no ``permissions`` keyword; the permission
    # bits are passed as ``permissions_flag``.
    if isinstance(permissions, int):
        encrypt_options["permissions_flag"] = permissions
    elif permissions is not None:
        bits = [int(bit) for bit in permissions]
        if bits:
            flag = 0
            for bit in bits:
                flag |= bit
            encrypt_options["permissions_flag"] = flag

    writer.encrypt(**encrypt_options)
    with open(output_pdf, "wb") as f:
        writer.write(f)
```
### 9.3 与文档管理系统集成
```python
class DMSIntegrator:
    """Convert presentations and hand the PDFs to a document management system.

    ``dms_config`` is a mapping. The only key read here is ``watermark``
    (watermark text; a falsy value disables watermarking).
    """

    def __init__(self, dms_config):
        self.dms_config = dms_config

    def upload_to_dms(self, file_path, metadata):
        """Upload hook for a concrete DMS; this stub returns ``None``."""
        # Implement the integration with the specific DMS here.
        pass

    def process_and_upload(self, input_file):
        """Convert ``input_file``, optionally watermark it, and upload it.

        Returns:
            Whatever ``upload_to_dms`` returns for the uploaded PDF.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            # Use splitext so .ppt and upper-case extensions map to the
            # right PDF name too; replace(".pptx", ".pdf") only handled
            # lower-case .pptx.
            stem = os.path.splitext(os.path.basename(input_file))[0]
            output_pdf = os.path.join(tmpdir, stem + ".pdf")
            convert_ppt_to_pdf(input_file, tmpdir)

            if self.dms_config.get("watermark"):
                watermarked_pdf = os.path.join(
                    tmpdir, "watermarked_" + os.path.basename(output_pdf)
                )
                add_watermark(output_pdf, watermarked_pdf, self.dms_config["watermark"])
                output_pdf = watermarked_pdf

            # Upload while the temporary directory still exists.
            doc_id = self.upload_to_dms(output_pdf, {
                "source": input_file,
                "processed_at": datetime.now().isoformat()
            })
        return doc_id
```
### 9.4 自动化工作流示例
结合以上功能,我们可以构建完整的自动化工作流:
```python
def automated_workflow(config):
    """Collect files, process them in parallel and return a summary report.

    ``config`` must provide ``log_file``, ``monitor_webhook``, ``dms``,
    ``input`` and ``processor`` (with ``max_workers``).

    Per-file failures are logged with the file they belong to and raise an
    alert, but do not stop the run. Any other error is logged, alerted and
    re-raised.

    Returns:
        The value returned by ``generate_report`` for the successful results.
    """
    # Imported here because the surrounding snippets never import
    # as_completed.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Initialise components.
    logger = StructuredLogger(config["log_file"])
    monitor = Monitor(config["monitor_webhook"])
    dms_integrator = DMSIntegrator(config["dms"])
    try:
        logger.log("workflow_start", config=config)

        # 1. Collect the files to process.
        file_collector = FileCollector(config["input"])
        files = file_collector.collect_files()
        logger.log("files_collected", count=len(files))

        # 2. Process in parallel, remembering which file each future is for
        #    so a failure can be attributed to it.
        processor_config = config["processor"]
        results = []
        with ThreadPoolExecutor(max_workers=processor_config["max_workers"]) as executor:
            future_to_file = {
                executor.submit(
                    process_single_file,
                    file,
                    processor_config,
                    dms_integrator
                ): file
                for file in files
            }
            for future in as_completed(future_to_file):
                try:
                    result = future.result()
                    results.append(result)
                    logger.log("file_processed", file=result["input"], status="success")
                except Exception as e:
                    logger.log(
                        "file_failed",
                        file=future_to_file[future],
                        error=str(e),
                        status="failed"
                    )
                    monitor.send_alert(f"文件处理失败: {str(e)}")

        # 3. Build the report.
        report = generate_report(results)
        logger.log("workflow_complete", summary=report)
        return report
    except Exception as e:
        logger.log("workflow_failed", error=str(e))
        monitor.send_alert(f"工作流失败: {str(e)}")
        raise
```
## 10. 维护与持续改进
任何生产环境应用都需要考虑长期维护和持续改进。
### 10.1 版本兼容性处理
LibreOffice不同版本可能有差异,需要做兼容处理:
```python
def get_libreoffice_version():
    """Return the second whitespace-separated token of ``soffice --version``.

    For output such as ``LibreOffice 7.2.4.2 ...`` this is ``"7.2.4.2"``.
    Any failure (missing executable, unexpected output) yields ``None``.
    """
    try:
        completed = subprocess.run(
            ["soffice", "--version"],
            capture_output=True,
            text=True,
        )
        tokens = completed.stdout.strip().split()
        return tokens[1]  # e.g. "7.2.4.2"
    except Exception:
        return None
def check_compatibility():
    """Warn when the installed LibreOffice is older than 7.2.

    Raises:
        RuntimeError: If the version cannot be determined.
    """
    version = get_libreoffice_version()
    if not version:
        raise RuntimeError("无法获取LibreOffice版本")

    # Only the major and minor components take part in the comparison.
    major, minor = (int(piece) for piece in version.split(".")[:2])
    too_old = (major, minor) < (7, 2)
    if too_old:
        print(f"警告: LibreOffice {version} 不是官方支持的版本")
```
### 10.2 自动化测试策略
实现自动化测试确保功能稳定:
```python
import unittest
from unittest.mock import patch
class TestPPTProcessor(unittest.TestCase):
    """Tests for convert_ppt_to_pdf using a shared temporary directory."""

    @classmethod
    def setUpClass(cls):
        cls.test_dir = tempfile.mkdtemp()
        cls.sample_ppt = os.path.join(cls.test_dir, "test.pptx")
        # Create the sample presentation file here...

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.test_dir)

    def test_conversion(self):
        """A real conversion yields an existing, non-empty file."""
        target_dir = os.path.join(self.test_dir, "output")
        os.makedirs(target_dir, exist_ok=True)
        pdf_path = convert_ppt_to_pdf(self.sample_ppt, target_dir)
        self.assertTrue(os.path.exists(pdf_path))
        self.assertGreater(os.path.getsize(pdf_path), 0)

    @patch("subprocess.run")
    def test_conversion_failure(self, mock_run):
        """A failing soffice run is expected to raise ConversionError."""
        fake_result = mock_run.return_value
        fake_result.returncode = 1
        fake_result.stderr = "模拟错误"
        with self.assertRaises(ConversionError):
            convert_ppt_to_pdf("dummy.pptx", "dummy_out")
```
### 10.3 性能监控与优化
持续监控性能指标:
```python
import time
import psutil
class PerformanceMonitor:
    """Snapshot time, CPU and memory readings and report changes since then."""

    def __init__(self):
        self.start_time = time.time()
        self.start_cpu = psutil.cpu_percent()
        self.start_mem = psutil.virtual_memory().percent

    def get_stats(self):
        """Return elapsed seconds and the change in CPU and memory percent.

        The CPU and memory figures are differences between the current
        system-wide percentages and the ones captured at construction, so
        they can be negative.
        """
        elapsed = time.time() - self.start_time
        cpu_delta = psutil.cpu_percent() - self.start_cpu
        mem_delta = psutil.virtual_memory().percent - self.start_mem
        stats = {
            "duration_sec": round(elapsed, 2),
            "cpu_usage": round(cpu_delta, 1),
            "mem_usage": round(mem_delta, 1)
        }
        return stats
# 使用示例
def process_with_monitoring(input_file, output_dir):
    """Convert one file and log performance statistics for the attempt.

    Statistics are logged on success (``conversion_stats``) and on failure
    (``conversion_failed_stats``); a failure is re-raised after logging.

    Returns:
        The value returned by ``convert_ppt_to_pdf``.
    """
    probe = PerformanceMonitor()
    try:
        pdf_path = convert_ppt_to_pdf(input_file, output_dir)
        logger.log("conversion_stats", **probe.get_stats(), file=input_file)
        return pdf_path
    except Exception as e:
        logger.log("conversion_failed_stats", **probe.get_stats(), error=str(e))
        raise
```
### 10.4 用户反馈与迭代
建立用户反馈机制:
```python
def collect_feedback(output_pdf):
    """Load the feedback stored next to ``output_pdf``, if any.

    The feedback file is ``<output_pdf without extension>_feedback.json``.

    Returns:
        The parsed JSON content, or ``None`` when no feedback file exists.
    """
    feedback_file = os.path.splitext(output_pdf)[0] + "_feedback.json"
    # Open directly instead of checking existence first: this avoids a race
    # with the file being removed between the check and the open.
    try:
        with open(feedback_file, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
def save_feedback(output_pdf, quality_rating, comments=""):
    """Store feedback for ``output_pdf`` in a sibling JSON file.

    The file is ``<output_pdf without extension>_feedback.json`` and holds
    the keys ``timestamp``, ``file``, ``quality`` and ``comments``. An
    existing file is overwritten.
    """
    record = {
        "timestamp": datetime.now().isoformat(),
        "file": output_pdf,
        "quality": quality_rating,
        "comments": comments
    }
    stem, _extension = os.path.splitext(output_pdf)
    with open(stem + "_feedback.json", "w") as sink:
        sink.write(json.dumps(record))
```
在实际项目中,我们通过逐步迭代改进了水印的渲染质量,发现LibreOffice 7.4版本对中文水印的支持明显优于早期版本。对于需要处理大量文件的场景,建议使用SSD存储并适当增加处理节点的内存配置。