Files
Fairscan_cyy/pc-server/main.py
MobKBK 2cc6fd34eb
Some checks failed
Android CI / build (push) Has been cancelled
更名为 FAIRSCAN_SURPORT_BY_CYY (FSBC)
- app_name → FSBC
- PC 服务器标题/页面名称 → FSBC
- 日志标签:FairScan → FSBC
- PDF 创建者/邮件标题/诊断报告 → FSBC
- README 重写:新增图传/上传/MinerU/OCRmyPDF/任务面板完整说明

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-04 17:13:34 +08:00

801 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
FAIRSCAN_SURPORT_BY_CYY (FSBC) PC Server
Endpoints:
Streaming:
GET /health → Health check (used by Android for connection test)
WS /stream → WebSocket endpoint for receiving JPEG frames
GET / → Web page showing the live stream
Upload & Tasks:
POST /upload/pdf → Upload a PDF file, returns fileId
POST /tasks/process → Create a MinerU processing task (ocrpdf / markdown)
GET /tasks/{taskId} → Query task status (queued/processing/completed/failed)
GET /tasks/{taskId}/artifacts → List result files for a completed task
GET /artifacts/{artifactId}/download → Download a result file
GET /files/{fileId}/download → Download an uploaded file
"""
import asyncio
import json
import os
import time
import uuid
import zipfile
from datetime import datetime
from pathlib import Path
# huggingface.co is unreachable from mainland-China networks, so force the
# Hugging Face hub client to work offline from locally cached models.
os.environ["HF_HUB_OFFLINE"] = "1"
# Tesseract OCR language-data directory (needed by OCRmyPDF), derived from the
# active conda environment. When CONDA_PREFIX is unset the path is the relative
# "Library/share/tessdata", so the variable is only exported if it exists.
_tessdata = Path(os.environ.get("CONDA_PREFIX", "")) / "Library" / "share" / "tessdata"
if _tessdata.exists():
    os.environ["TESSDATA_PREFIX"] = str(_tessdata)
from fastapi import FastAPI, File, Form, HTTPException, UploadFile, WebSocket, WebSocketDisconnect
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
# ── MinerU & OCRmyPDF integration ────────────────────────────────────────────
from mineru.cli.common import aio_do_parse, read_fn
import ocrmypdf
from loguru import logger
# ASGI application object; every route in this module is registered on it.
app = FastAPI(title="FSBC PC Server")
# ── Configuration ─────────────────────────────────────────────────────────────
# Storage roots, resolved relative to the process's working directory.
UPLOAD_DIR = Path("./uploads")  # raw uploaded PDFs
TASKS_DIR = Path("./tasks")  # one sub-directory of results per task
# Created at import time; exist_ok=True makes this a no-op on later starts.
UPLOAD_DIR.mkdir(exist_ok=True)
TASKS_DIR.mkdir(exist_ok=True)
# ── In-memory state (streaming) ──────────────────────────────────────────────
# Bytes of the most recent frame received on /stream (None until the first one).
latest_frame: bytes | None = None
# time.time() value recorded when latest_frame was stored.
frame_timestamp: float = 0.0
# Every WebSocket currently accepted on /stream (frame senders and viewers alike).
connected_clients: set[WebSocket] = set()
# Running counters; "started_at" is set when the first /stream socket connects.
stream_stats: dict = {"frames_received": 0, "bytes_received": 0, "started_at": None}
# ── HTML page with live stream viewer ────────────────────────────────────────
# Live-stream viewer page served at "/". Its script opens a WebSocket to
# /stream, renders each binary message as a JPEG, and reconnects after 2 s when
# the socket closes. The object URL of the previous frame is revoked whenever a
# new frame arrives, so the browser does not retain one blob per frame.
STREAM_PAGE = """\
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>FSBC Stream</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
background: #1a1a2e;
color: #eee;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
display: flex;
flex-direction: column;
align-items: center;
min-height: 100vh;
padding: 20px;
}
.nav-bar {
display: flex; align-items: center; gap: 16px; margin-bottom: 20px;
width: 100%; max-width: 900px;
}
.nav-bar h1 { font-size: 1.4rem; margin: 0; opacity: 0.9; }
.nav-bar .nav-links { margin-left: auto; display: flex; gap: 12px; }
.nav-bar .nav-links a {
color: #7c8dff; text-decoration: none; padding: 6px 14px;
border: 1px solid #7c8dff44; border-radius: 6px; font-size: 0.85rem;
}
.nav-bar .nav-links a:hover { background: #7c8dff22; }
.nav-bar .nav-links a.active { background: #7c8dff22; border-color: #7c8dff; }
h1 { margin-bottom: 16px; font-size: 1.4rem; opacity: 0.9; }
#stream {
max-width: 100%;
max-height: 80vh;
border-radius: 12px;
box-shadow: 0 4px 30px rgba(0,0,0,0.5);
background: #000;
display: block;
}
#status {
margin-top: 12px;
font-size: 0.85rem;
opacity: 0.6;
}
#no-frame {
width: 640px; height: 480px;
background: #16213e;
border-radius: 12px;
display: flex;
align-items: center;
justify-content: center;
font-size: 1.1rem;
opacity: 0.5;
}
</style>
</head>
<body>
<div class="nav-bar">
<h1>📷 FSBC Live Stream</h1>
<div class="nav-links">
<a href="/" class="active">📷 图传预览</a>
<a href="/dashboard">📊 管理面板</a>
</div>
</div>
<img id="stream" src="" alt="Waiting for stream..." style="display:none">
<div id="no-frame">Waiting for stream...</div>
<div id="status">Not connected</div>
<script>
const img = document.getElementById('stream');
const noFrame = document.getElementById('no-frame');
const status = document.getElementById('status');
let ws = null;
// Object URL of the frame currently displayed; revoked once it is replaced.
let currentFrameUrl = null;
function connect() {
const protocol = location.protocol === 'https:' ? 'wss:' : 'ws:';
const url = protocol + '//' + location.host + '/stream';
ws = new WebSocket(url);
ws.onopen = () => {
status.textContent = 'Connected';
status.style.color = '#4CAF50';
};
ws.onmessage = (event) => {
const blob = new Blob([event.data], { type: 'image/jpeg' });
const url = URL.createObjectURL(blob);
const previousUrl = currentFrameUrl;
currentFrameUrl = url;
img.src = url;
// Release the superseded frame; without this every frame leaks its blob.
if (previousUrl) URL.revokeObjectURL(previousUrl);
img.style.display = 'block';
noFrame.style.display = 'none';
status.textContent = 'Receiving frames';
status.style.color = '#4CAF50';
};
ws.onclose = () => {
status.textContent = 'Disconnected - reconnecting in 2s...';
status.style.color = '#E53935';
setTimeout(connect, 2000);
};
ws.onerror = () => { ws.close(); };
}
connect();
</script>
</body>
</html>
"""
# ── Routes: Streaming ────────────────────────────────────────────────────────
@app.get("/health")
async def health():
    """Report liveness plus basic stream counters.

    The Android client calls this endpoint to test its connection. "uptime" is
    the number of seconds since the first /stream connection, or 0 if none has
    been made yet.
    """
    started_at = stream_stats["started_at"]
    uptime = time.time() - started_at if started_at else 0
    stream_summary = {
        "framesReceived": stream_stats["frames_received"],
        "uptime": uptime,
    }
    payload = {
        "status": "ok",
        "name": "FSBC-PC",
        "features": ["stream", "upload", "tasks"],
        "streamStats": stream_summary,
        "timestamp": datetime.utcnow().isoformat(),
    }
    return JSONResponse(payload)
@app.get("/")
async def index():
    """Return the HTML page that displays the live stream."""
    page = HTMLResponse(STREAM_PAGE)
    return page
@app.websocket("/stream")
async def stream_endpoint(ws: WebSocket):
    """Accept a /stream WebSocket and relay every binary frame it sends.

    Both the frame sender (the Android app) and the browser viewers connect
    here. Each received frame is stored as the module-level ``latest_frame``,
    counted in ``stream_stats``, and forwarded to every other connected socket.
    A socket whose send fails is dropped from ``connected_clients``; this
    socket is always removed on exit.
    """
    global latest_frame, frame_timestamp

    await ws.accept()
    connected_clients.add(ws)
    if stream_stats["started_at"] is None:
        stream_stats["started_at"] = time.time()
    try:
        frame_count = 0
        while True:
            data = await ws.receive_bytes()
            latest_frame = data
            frame_timestamp = time.time()
            stream_stats["frames_received"] += 1
            stream_stats["bytes_received"] += len(data)
            frame_count += 1
            # Log the 1st, 31st, 61st, ... frame of this connection only.
            if frame_count % 30 == 1:
                print(f"[Stream] Received frame #{stream_stats['frames_received']} ({len(data)} bytes)")
            # Broadcast to all browser clients. Iterate over a snapshot: the
            # set is mutated below on send failure, and by other connections
            # joining or leaving while this coroutine is suspended in `await`,
            # which would otherwise raise "Set changed size during iteration".
            for client in tuple(connected_clients):
                if client is ws:
                    continue
                try:
                    await client.send_bytes(data)
                except Exception:
                    # Best effort: a viewer that cannot be written to is dropped.
                    connected_clients.discard(client)
    except WebSocketDisconnect:
        pass
    finally:
        connected_clients.discard(ws)
# ── Routes: Upload & Tasks ──────────────────────────────────────────────────
# In-memory registry of uploaded files, filled by upload_pdf and lost on restart.
# fileId -> {fileId, fileName, mimeType, sizeBytes, uploadPath, createdAt}
files_db: dict[str, dict] = {}
# Characters that are path syntax on POSIX or Windows (directory separators and
# the drive / alternate-data-stream colon); each is replaced by "_".
_UPLOAD_NAME_TRANSLATION = str.maketrans({"/": "_", "\\": "_", ":": "_"})


def _sanitize_upload_name(filename: str) -> str:
    """Return *filename* with ".." removed and path-syntax characters neutralised."""
    return filename.replace("..", "").translate(_UPLOAD_NAME_TRANSLATION)


@app.post("/upload/pdf", status_code=201)
async def upload_pdf(file: UploadFile = File(...)):
    """Upload a PDF file to the PC (no processing).

    Stores the file in ./uploads/ as ``<fileId>_<sanitised name>`` and records
    it in ``files_db``. Processing is a separate step via POST /tasks/process.

    Returns:
        JSON with ``fileId``, ``fileName``, ``mimeType`` and ``sizeBytes``.

    Raises:
        HTTPException: 400 when the upload has no name or does not end in ".pdf".
    """
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are accepted")
    file_id = str(uuid.uuid4())
    timestamp = datetime.utcnow().isoformat()
    # The client-supplied name is untrusted: besides "/" it may carry Windows
    # separators ("\") or a colon, which would escape or break the target path.
    safe_name = _sanitize_upload_name(file.filename)
    # Save the uploaded PDF
    upload_path = UPLOAD_DIR / f"{file_id}_{safe_name}"
    content = await file.read()
    upload_path.write_bytes(content)
    # Store file record (pure upload, no task/processing)
    file_record = {
        "fileId": file_id,
        "fileName": safe_name,
        "mimeType": "application/pdf",
        "sizeBytes": len(content),
        "uploadPath": str(upload_path),
        "createdAt": timestamp,
    }
    files_db[file_id] = file_record
    print(f"[Upload] Received {safe_name} ({len(content)} bytes) -> file {file_id}")
    return JSONResponse({
        "fileId": file_id,
        "fileName": safe_name,
        "mimeType": "application/pdf",
        "sizeBytes": len(content),
    })
# Strong references to in-flight processing tasks. The event loop keeps only
# weak references, so an unreferenced task may be garbage-collected before it
# finishes; each task removes itself from this set when done.
_background_tasks: set[asyncio.Task] = set()


@app.post("/tasks/process", status_code=202)
async def create_task(body: dict):
    """Create a processing task for an uploaded PDF.

    Request body: {"fileId": "...", "processType": "ocrpdf"|"markdown"}
    ``processType`` is case-insensitive and defaults to "ocrpdf".

    Returns:
        JSON with ``taskId``, ``status`` ("queued"), ``processType`` and
        ``fileId``; the work itself runs in the background.

    Raises:
        HTTPException: 400 for a missing or non-string ``fileId`` or an invalid
            ``processType``; 404 when ``fileId`` is not a known upload.
    """
    file_id = body.get("fileId", "")
    process_type = body.get("processType", "ocrpdf")
    if not file_id:
        raise HTTPException(status_code=400, detail="fileId is required")
    # JSON may carry any type here; reject non-strings with a 400 instead of
    # letting .lower() or the dict lookup fail with a 500.
    if not isinstance(file_id, str):
        raise HTTPException(status_code=400, detail="fileId must be a string")
    if not isinstance(process_type, str) or process_type.lower() not in ("ocrpdf", "markdown"):
        raise HTTPException(status_code=400, detail="processType must be 'ocrpdf' or 'markdown'")
    process_type = process_type.lower()
    # Look up the uploaded file
    file_record = files_db.get(file_id)
    if file_record is None:
        raise HTTPException(status_code=404, detail="File not found")
    task_id = str(uuid.uuid4())
    timestamp = datetime.utcnow().isoformat()
    task = {
        "taskId": task_id,
        "fileId": file_id,
        "status": "queued",
        "progress": 0,
        "processType": process_type,
        "fileName": file_record["fileName"],
        "createdAt": timestamp,
        "updatedAt": timestamp,
        "uploadPath": file_record["uploadPath"],
        "message": f"Task created (processType={process_type})",
    }
    tasks_db[task_id] = task
    # Start processing in the background and keep the task referenced until done.
    background = asyncio.create_task(process_with_mineru(task_id))
    _background_tasks.add(background)
    background.add_done_callback(_background_tasks.discard)
    print(f"[Tasks] Created task {task_id} for file {file_id} (processType={process_type})")
    return JSONResponse({
        "taskId": task_id,
        "status": "queued",
        "processType": process_type,
        "fileId": file_id,
    })
@app.get("/tasks/{task_id}")
async def get_task_status(task_id: str):
    """Return the public status fields of one processing task.

    Raises:
        HTTPException: 404 when no task with ``task_id`` exists.
    """
    record = tasks_db.get(task_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Task not found")

    # Built key by key so the response keeps a fixed field order.
    payload = {"taskId": record["taskId"]}
    payload["fileId"] = record.get("fileId", "")
    payload["status"] = record["status"]
    payload["progress"] = record["progress"]
    payload["processType"] = record.get("processType", "")
    payload["fileName"] = record["fileName"]
    payload["createdAt"] = record["createdAt"]
    payload["message"] = record.get("message", "")
    return JSONResponse(payload)
@app.get("/tasks/{task_id}/artifacts")
async def list_artifacts(task_id: str):
    """Return the artifact descriptors recorded for a task.

    The list is empty when the task has produced no artifacts.

    Raises:
        HTTPException: 404 when no task with ``task_id`` exists.
    """
    if tasks_db.get(task_id) is None:
        raise HTTPException(status_code=404, detail="Task not found")

    descriptors = [
        {
            # "id" and "artifactId" carry the same value.
            "id": entry["artifactId"],
            "artifactId": entry["artifactId"],
            "fileName": entry["fileName"],
            "fileSize": entry["fileSize"],
            "fileType": entry["fileType"],
        }
        for entry in artifacts_db.get(task_id, [])
    ]
    return JSONResponse(descriptors)
@app.get("/artifacts/{artifact_id}/download")
async def download_artifact(artifact_id: str):
    """Send a processed artifact file to the client.

    Raises:
        HTTPException: 404 when the artifact is unknown or its file is missing
            from disk.
    """
    record = artifacts_map.get(artifact_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Artifact not found")

    artifact_path = Path(record["filePath"])
    if not artifact_path.exists():
        raise HTTPException(status_code=404, detail="Artifact file not found on disk")

    # Anything other than pdf/zip is served as Markdown.
    known_media_types = {"pdf": "application/pdf", "zip": "application/zip"}
    media_type = known_media_types.get(record["fileType"], "text/markdown")
    return FileResponse(
        path=artifact_path,
        filename=record["fileName"],
        media_type=media_type,
    )
@app.get("/files/{file_id}/download")
async def download_uploaded_file(file_id: str):
    """Send an uploaded, unprocessed PDF back to the client.

    Raises:
        HTTPException: 404 when the file id is unknown or the stored file is
            missing from disk.
    """
    record = files_db.get(file_id)
    if record is None:
        raise HTTPException(status_code=404, detail="File not found")

    stored_path = Path(record["uploadPath"])
    if stored_path.exists():
        return FileResponse(
            path=stored_path,
            filename=record["fileName"],
            media_type="application/pdf",
        )
    raise HTTPException(status_code=404, detail="File not found on disk")
# ── Dashboard page ───────────────────────────────────────────────────────────
# Management dashboard served at "/dashboard". Its script polls /api/dashboard
# every 2 s and rebuilds the stats cards, the uploaded-file table and the task
# table. File and artifact names originate from client uploads, so every value
# interpolated as text into innerHTML goes through esc() first.
DASHBOARD_PAGE = """\
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>FSBC Dashboard</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
background: #1a1a2e;
color: #eee;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
padding: 24px;
}
.header {
display: flex; align-items: center; gap: 16px; margin-bottom: 24px;
}
.header h1 { font-size: 1.5rem; }
.header .nav-links { margin-left: auto; display: flex; gap: 12px; }
.header .nav-links a {
color: #7c8dff; text-decoration: none; padding: 6px 14px;
border: 1px solid #7c8dff44; border-radius: 6px; font-size: 0.85rem;
}
.header .nav-links a:hover { background: #7c8dff22; }
.stats-row {
display: flex; gap: 16px; margin-bottom: 24px; flex-wrap: wrap;
}
.stat-card {
background: #16213e; border-radius: 10px; padding: 16px 24px; flex: 1;
min-width: 120px; text-align: center;
}
.stat-card .num { font-size: 1.8rem; font-weight: 700; }
.stat-card .label { font-size: 0.75rem; opacity: 0.6; margin-top: 4px; }
.stat-card.queued .num { color: #ffa726; }
.stat-card.processing .num { color: #42a5f5; }
.stat-card.completed .num { color: #66bb6a; }
.stat-card.failed .num { color: #ef5350; }
.section-title { font-size: 1.1rem; margin-bottom: 12px; opacity: 0.8; }
.task-table {
width: 100%; border-collapse: collapse; margin-bottom: 24px;
background: #16213e; border-radius: 10px; overflow: hidden;
}
.task-table th {
text-align: left; padding: 12px 16px; font-size: 0.75rem;
text-transform: uppercase; letter-spacing: 0.5px; opacity: 0.5;
border-bottom: 1px solid #ffffff11;
}
.task-table td {
padding: 12px 16px; font-size: 0.85rem; border-bottom: 1px solid #ffffff08;
}
.task-table tr:hover td { background: #ffffff06; }
.badge {
display: inline-block; padding: 2px 10px; border-radius: 10px;
font-size: 0.75rem; font-weight: 600;
}
.badge.queued { background: #ffa72633; color: #ffa726; }
.badge.processing { background: #42a5f533; color: #42a5f5; }
.badge.completed { background: #66bb6a33; color: #66bb6a; }
.badge.failed { background: #ef535033; color: #ef5350; }
.progress-bar {
width: 120px; height: 6px; background: #ffffff15; border-radius: 3px; overflow: hidden;
}
.progress-bar .fill {
height: 100%; border-radius: 3px; transition: width 0.5s ease;
background: linear-gradient(90deg, #42a5f5, #66bb6a);
}
.download-btn {
color: #7c8dff; text-decoration: none; font-size: 0.8rem;
padding: 4px 10px; border: 1px solid #7c8dff44; border-radius: 4px;
}
.download-btn:hover { background: #7c8dff22; }
.empty-state { opacity: 0.4; text-align: center; padding: 40px; font-size: 0.9rem; }
.file-name { max-width: 250px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
</style>
</head>
<body>
<div class="header">
<h1>📊 FSBC Dashboard</h1>
<div class="nav-links">
<a href="/">📷 图传预览</a>
<a href="/dashboard" style="background:#7c8dff22;border-color:#7c8dff;">📊 管理面板</a>
</div>
</div>
<div class="stats-row" id="stats"></div>
<div class="section-title">📄 已上传的文件</div>
<table class="task-table">
<thead><tr>
<th>文件名</th><th>文件 ID</th><th>大小</th><th>时间</th><th>操作</th>
</tr></thead>
<tbody id="file-list"></tbody>
</table>
<div class="section-title">⚙️ 处理任务</div>
<table class="task-table">
<thead><tr>
<th>文件名</th><th>任务 ID</th><th>状态</th><th>进度</th><th>处理类型</th><th>时间</th><th>操作</th>
</tr></thead>
<tbody id="task-list"></tbody>
</table>
<script>
function fmtSize(bytes) {
if (!bytes) return '-';
if (bytes < 1024) return bytes + 'B';
if (bytes < 1048576) return (bytes / 1024).toFixed(1) + 'KB';
return (bytes / 1048576).toFixed(1) + 'MB';
}
function fmtTime(iso) {
return iso ? iso.replace('T', ' ').substring(0, 19) : '-';
}
// Escape a value for use as HTML text or inside a quoted attribute.
function esc(value) {
return String(value == null ? '' : value)
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
}
async function refresh() {
try {
const resp = await fetch('/api/dashboard');
const data = await resp.json();
// Stats
const s = data.stats;
document.getElementById('stats').innerHTML =
`<div class="stat-card"><div class="num">${data.files.length}</div><div class="label">已上传</div></div>
<div class="stat-card"><div class="num">${s.total}</div><div class="label">处理任务</div></div>
<div class="stat-card queued"><div class="num">${s.queued}</div><div class="label">排队中</div></div>
<div class="stat-card processing"><div class="num">${s.processing}</div><div class="label">处理中</div></div>
<div class="stat-card completed"><div class="num">${s.completed}</div><div class="label">已完成</div></div>
<div class="stat-card failed"><div class="num">${s.failed}</div><div class="label">失败</div></div>`;
// Files
const fbody = document.getElementById('file-list');
if (data.files.length === 0) {
fbody.innerHTML = '<tr><td colspan="5"><div class="empty-state">暂无上传文件</div></td></tr>';
} else {
fbody.innerHTML = data.files.map(f =>
`<tr>
<td><div class="file-name" title="${esc(f.fileName)}">${esc(f.fileName)}</div></td>
<td style="font-family:monospace;font-size:0.75rem;opacity:0.6">${f.fileId.substring(0, 8)}...</td>
<td>${fmtSize(f.sizeBytes)}</td>
<td style="font-size:0.75rem;opacity:0.5">${fmtTime(f.createdAt)}</td>
<td><a class="download-btn" href="/files/${f.fileId}/download">⬇ 下载</a></td>
</tr>`
).join('');
}
// Tasks
const tbody = document.getElementById('task-list');
if (data.tasks.length === 0) {
tbody.innerHTML = '<tr><td colspan="7"><div class="empty-state">暂无处理任务</div></td></tr>';
} else {
tbody.innerHTML = data.tasks.map(t => {
const badge = `<span class="badge ${esc(t.status)}">${esc(t.status)}</span>`;
const bar = t.status === 'processing'
? `<div class="progress-bar"><div class="fill" style="width:${t.progress}%"></div></div>`
: t.status === 'completed' ? '✅ 完成' : `${t.progress}%`;
const actions = t.artifacts && t.artifacts.length > 0
? t.artifacts.map(a => `<a class="download-btn" href="/artifacts/${a.id}/download">⬇ ${esc(a.fileName)}</a>`).join(' ')
: t.status === 'completed' ? '' : '-';
return `<tr>
<td><div class="file-name" title="${esc(t.fileName)}">${esc(t.fileName)}</div></td>
<td style="font-family:monospace;font-size:0.75rem;opacity:0.6">${t.taskId.substring(0, 8)}...</td>
<td>${badge}</td>
<td>${bar}</td>
<td>${esc(t.processType || '-')}</td>
<td style="font-size:0.75rem;opacity:0.5">${fmtTime(t.createdAt)}</td>
<td>${actions}</td>
</tr>`;
}).join('');
}
} catch (e) {
document.getElementById('task-list').innerHTML =
'<tr><td colspan="7"><div class="empty-state">⚠️ 连接服务器失败: ' + esc(e.message) + '</div></td></tr>';
}
}
refresh();
setInterval(refresh, 2000);
</script>
</body>
</html>
"""
@app.get("/dashboard")
async def dashboard():
    """Return the HTML page of the task management dashboard."""
    page = HTMLResponse(DASHBOARD_PAGE)
    return page
@app.get("/api/dashboard")
async def dashboard_api():
    """Return the data behind the dashboard page as JSON.

    The response has three keys: ``stats`` (task counts by status), ``files``
    (uploaded files) and ``tasks`` (tasks with their artifacts). Both lists
    are ordered newest first by ``createdAt``.
    """
    # Uploaded files, newest first.
    files_list = sorted(
        (
            {
                "fileId": fid,
                "fileName": f_rec.get("fileName", ""),
                "sizeBytes": f_rec.get("sizeBytes", 0),
                "createdAt": f_rec.get("createdAt", ""),
            }
            for fid, f_rec in files_db.items()
        ),
        key=lambda entry: entry.get("createdAt", ""),
        reverse=True,
    )

    # Tasks with the id and name of each artifact, newest first.
    tasks_list = sorted(
        (
            {
                "taskId": tid,
                "fileId": task.get("fileId", ""),
                "fileName": task.get("fileName", ""),
                "status": task["status"],
                "progress": task["progress"],
                "processType": task.get("processType", ""),
                "createdAt": task.get("createdAt", ""),
                "message": task.get("message", ""),
                "artifacts": [
                    {"id": art["artifactId"], "fileName": art["fileName"]}
                    for art in artifacts_db.get(tid, [])
                ],
            }
            for tid, task in tasks_db.items()
        ),
        key=lambda entry: entry.get("createdAt", ""),
        reverse=True,
    )

    # Count tasks per status in a single pass; other statuses count only in "total".
    counts = {"queued": 0, "processing": 0, "completed": 0, "failed": 0}
    for entry in tasks_list:
        if entry["status"] in counts:
            counts[entry["status"]] += 1

    return JSONResponse({
        "stats": {"total": len(tasks_list), **counts},
        "files": files_list,
        "tasks": tasks_list,
    })
# ── In-memory databases ──────────────────────────────────────────────────────
# All three registries live in process memory only and are lost on restart.
# taskId -> task record (status, progress, processType, uploadPath, message, ...).
tasks_db: dict[str, dict] = {}
# taskId -> list of artifact records produced for that task.
artifacts_db: dict[str, list[dict]] = {}
# artifactId -> the same artifact record, for direct lookup by the download route.
artifacts_map: dict[str, dict] = {}
def _register_artifact(file_path: Path, file_name: str, file_type: str) -> dict:
    """Build an artifact record for *file_path* and index it in ``artifacts_map``."""
    artifact = {
        "artifactId": str(uuid.uuid4()),
        "fileName": file_name,
        "fileSize": file_path.stat().st_size,
        "fileType": file_type,
        "filePath": str(file_path),
    }
    artifacts_map[artifact["artifactId"]] = artifact
    return artifact


def _zip_markdown_bundle(md_path: Path, images_dir: Path, zip_path: Path) -> None:
    """Write *zip_path* containing the Markdown file plus ``images/`` contents."""
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        zf.write(md_path, md_path.name)
        for img_file in images_dir.rglob("*"):
            if img_file.is_file():
                relative = img_file.relative_to(images_dir).as_posix()
                zf.write(img_file, f"images/{relative}")


async def process_with_mineru(task_id: str):
    """Run the processing task *task_id* and record its outcome in ``tasks_db``.

    "markdown" tasks go through the MinerU pipeline and yield a ``.md``
    artifact, plus a ZIP of the Markdown and its images when images exist.
    Any other process type is treated as "ocrpdf" and goes through OCRmyPDF,
    yielding a searchable PDF. The task record's ``status``, ``progress``,
    ``message`` and ``updatedAt`` are updated in place. Processing errors never
    propagate: they mark the task "failed". An unknown *task_id* is a no-op.
    """
    task = tasks_db.get(task_id)
    if task is None:
        return
    process_type = task.get("processType", "ocrpdf")
    upload_path_src = task.get("uploadPath")
    file_name = task.get("fileName", "document.pdf")
    base_name = Path(file_name).stem
    lang = task.get("options", {}).get("lang", "ch")
    # Name the engine that actually runs, so failures are attributed correctly.
    engine = "MinerU" if process_type == "markdown" else "OCRmyPDF"

    if not upload_path_src or not Path(upload_path_src).exists():
        task["status"] = "failed"
        task["message"] = "Uploaded file not found on disk"
        task["updatedAt"] = datetime.utcnow().isoformat()
        logger.error(f"[MinerU] Task {task_id}: file not found at {upload_path_src}")
        return

    task["status"] = "processing"
    task["progress"] = 15
    task["updatedAt"] = datetime.utcnow().isoformat()
    task["message"] = f"MinerU pipeline started (backend=pipeline, processType={process_type})"
    logger.info(f"[MinerU] Task {task_id}: starting {process_type} on {file_name}")

    output_dir = TASKS_DIR / task_id
    try:
        # Inside the try so a filesystem error fails the task instead of
        # leaving it stuck in "processing".
        output_dir.mkdir(parents=True, exist_ok=True)

        if process_type == "markdown":
            # Only the MinerU branch needs the PDF bytes in memory.
            pdf_bytes = read_fn(upload_path_src)
            await aio_do_parse(
                output_dir=str(output_dir),
                pdf_file_names=[base_name],
                pdf_bytes_list=[pdf_bytes],
                p_lang_list=[lang],
                backend="pipeline",
                parse_method="auto",
                f_dump_md=True,
                f_dump_middle_json=False,
                f_dump_model_output=False,
                f_dump_orig_pdf=False,
                f_dump_content_list=False,
                f_draw_layout_bbox=False,
                f_draw_span_bbox=False,
            )
            # MinerU output: {output_dir}/{base_name}/auto/{base_name}.md
            md_dir = output_dir / base_name / "auto"
            md_path = md_dir / f"{base_name}.md"
            images_dir = md_dir / "images"
            if md_path.exists():
                artifacts_list = [_register_artifact(md_path, f"{base_name}.md", "md")]
                # Bundle Markdown + images into a ZIP when there are images.
                if images_dir.exists() and any(images_dir.iterdir()):
                    zip_path = md_dir / f"{base_name}_result.zip"
                    # Compression is blocking work; keep it off the event loop.
                    await asyncio.to_thread(_zip_markdown_bundle, md_path, images_dir, zip_path)
                    artifacts_list.append(
                        _register_artifact(zip_path, f"{base_name}_result.zip", "zip")
                    )
                    logger.info(f"[MinerU] Task {task_id}: ZIP created -> {zip_path} ({zip_path.stat().st_size} bytes)")
                artifacts_db[task_id] = artifacts_list
                task["status"] = "completed"
                task["progress"] = 100
                task["message"] = f"MinerU Markdown completed ({md_path.stat().st_size} bytes)"
                logger.info(f"[MinerU] Task {task_id}: markdown completed -> {md_path}")
            else:
                task["status"] = "failed"
                task["message"] = "MinerU did not produce .md output"
                logger.error(f"[MinerU] Task {task_id}: no .md output at {md_path}")
        else:  # ocrpdf — use OCRmyPDF for searchable dual-layer PDF
            # Map the task language code to a Tesseract language code.
            ocr_lang = {"ch": "chi_sim", "en": "eng", "japan": "jpn", "korean": "kor"}.get(lang, "chi_sim")
            ocr_output = output_dir / f"{base_name}_ocr.pdf"
            await asyncio.to_thread(
                ocrmypdf.ocr,
                upload_path_src,
                str(ocr_output),
                language=ocr_lang,
                output_type="pdf",
                skip_text=True,
                deskew=True,
                optimize=0,  # skip JBIG2 optimization (pikepdf compat)
            )
            if ocr_output.exists():
                artifacts_db[task_id] = [
                    _register_artifact(ocr_output, f"{base_name}_ocr.pdf", "pdf")
                ]
                task["status"] = "completed"
                task["progress"] = 100
                task["message"] = f"OCRmyPDF completed ({ocr_output.stat().st_size} bytes)"
                logger.info(f"[OCRmyPDF] Task {task_id}: ocrpdf completed -> {ocr_output}")
            else:
                task["status"] = "failed"
                task["message"] = "OCRmyPDF did not produce output"
    except Exception as e:
        # Top-level boundary of a background task: record the failure on the
        # task and log it with the traceback instead of losing it.
        task["status"] = "failed"
        task["message"] = f"{engine} error: {e}"
        task["progress"] = 0
        logger.exception(f"[{engine}] Task {task_id}: exception - {e}")
    task["updatedAt"] = datetime.utcnow().isoformat()
# ── Entry point ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry point: print the useful URLs, then serve on all interfaces.
    import uvicorn

    port = 2026
    for banner_line in (
        f"🚀 FSBC Server starting on http://0.0.0.0:{port}",
        f" Stream: http://localhost:{port}",
        f" Dashboard: http://localhost:{port}/dashboard",
        f" Health: http://localhost:{port}/health",
        f" Upload: POST http://localhost:{port}/upload/pdf",
        f" Tasks: POST http://localhost:{port}/tasks/process",
    ):
        print(banner_line)
    uvicorn.run(app, host="0.0.0.0", port=port, log_level="info")