Some checks failed
Android CI / build (push) Has been cancelled
- app_name → FSBC - PC 服务器标题/页面名称 → FSBC - 日志标签:FairScan → FSBC - PDF 创建者/邮件标题/诊断报告 → FSBC - README 重写:新增图传/上传/MinerU/OCRmyPDF/任务面板完整说明 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
801 lines
31 KiB
Python
801 lines
31 KiB
Python
"""
|
||
FAIRSCAN_SURPORT_BY_CYY (FSBC) PC Server

Endpoints:
  Streaming:
    GET  /health                          → Health check (used by Android for connection test)
    WS   /stream                          → WebSocket endpoint for receiving JPEG frames
    GET  /                                → Web page showing the live stream

  Upload & Tasks:
    POST /upload/pdf                      → Upload a PDF file, returns fileId
    POST /tasks/process                   → Create a processing task (ocrpdf via OCRmyPDF / markdown via MinerU)
    GET  /tasks/{taskId}                  → Query task status (queued/processing/completed/failed)
    GET  /tasks/{taskId}/artifacts        → List result files for a completed task
    GET  /artifacts/{artifactId}/download → Download a result file
    GET  /files/{fileId}/download         → Download an uploaded file

  Dashboard:
    GET  /dashboard                       → Web page listing uploaded files and processing tasks
    GET  /api/dashboard                   → JSON data (files, tasks, stats) polled by the dashboard page
|
||
"""
|
||
|
||
import asyncio
|
||
import json
|
||
import os
|
||
import time
|
||
import uuid
|
||
import zipfile
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
|
||
# huggingface.co is not reachable from mainland-China networks, so force the
# Hugging Face hub client to rely on locally cached models only.
os.environ["HF_HUB_OFFLINE"] = "1"

# Tesseract language data (required by OCRmyPDF) is discovered from the active
# conda environment.  Nothing is probed when CONDA_PREFIX is unset: an empty
# prefix would turn the lookup into a path relative to the current working
# directory, which could match an unrelated folder.
_conda_prefix = os.environ.get("CONDA_PREFIX", "")
_tessdata = Path(_conda_prefix) / "Library" / "share" / "tessdata"
if _conda_prefix:
    # Windows conda environments keep shared data under "Library/share";
    # POSIX environments use "share" directly.
    for _candidate in (_tessdata, Path(_conda_prefix) / "share" / "tessdata"):
        if _candidate.is_dir():
            _tessdata = _candidate
            os.environ["TESSDATA_PREFIX"] = str(_candidate)
            break
|
||
|
||
from fastapi import FastAPI, File, Form, HTTPException, UploadFile, WebSocket, WebSocketDisconnect
|
||
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
|
||
|
||
# ── MinerU & OCRmyPDF integration ────────────────────────────────────────────
|
||
|
||
from mineru.cli.common import aio_do_parse, read_fn
|
||
import ocrmypdf
|
||
from loguru import logger
|
||
|
||
# The ASGI application; every route below is registered on this object.
app = FastAPI(title="FSBC PC Server")

# ── Configuration ─────────────────────────────────────────────────────────────

# Storage roots, relative to the process working directory.
UPLOAD_DIR = Path("./uploads")
TASKS_DIR = Path("./tasks")
for _storage_dir in (UPLOAD_DIR, TASKS_DIR):
    _storage_dir.mkdir(exist_ok=True)
del _storage_dir


# ── In-memory state (streaming) ──────────────────────────────────────────────

# Most recently received frame and the time.time() value at which it arrived.
latest_frame: bytes | None = None
frame_timestamp: float = 0.0

# Every WebSocket currently connected to /stream.
connected_clients: set[WebSocket] = set()

# Running counters; "started_at" is filled in on the first /stream connection.
stream_stats: dict = dict(frames_received=0, bytes_received=0, started_at=None)
|
||
|
||
|
||
# ── HTML page with live stream viewer ────────────────────────────────────────
|
||
|
||
STREAM_PAGE = """\
|
||
<!DOCTYPE html>
|
||
<html>
|
||
<head>
|
||
<meta charset="utf-8">
|
||
<title>FSBC Stream</title>
|
||
<style>
|
||
* { margin: 0; padding: 0; box-sizing: border-box; }
|
||
body {
|
||
background: #1a1a2e;
|
||
color: #eee;
|
||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
||
display: flex;
|
||
flex-direction: column;
|
||
align-items: center;
|
||
min-height: 100vh;
|
||
padding: 20px;
|
||
}
|
||
.nav-bar {
|
||
display: flex; align-items: center; gap: 16px; margin-bottom: 20px;
|
||
width: 100%; max-width: 900px;
|
||
}
|
||
.nav-bar h1 { font-size: 1.4rem; margin: 0; opacity: 0.9; }
|
||
.nav-bar .nav-links { margin-left: auto; display: flex; gap: 12px; }
|
||
.nav-bar .nav-links a {
|
||
color: #7c8dff; text-decoration: none; padding: 6px 14px;
|
||
border: 1px solid #7c8dff44; border-radius: 6px; font-size: 0.85rem;
|
||
}
|
||
.nav-bar .nav-links a:hover { background: #7c8dff22; }
|
||
.nav-bar .nav-links a.active { background: #7c8dff22; border-color: #7c8dff; }
|
||
h1 { margin-bottom: 16px; font-size: 1.4rem; opacity: 0.9; }
|
||
#stream {
|
||
max-width: 100%;
|
||
max-height: 80vh;
|
||
border-radius: 12px;
|
||
box-shadow: 0 4px 30px rgba(0,0,0,0.5);
|
||
background: #000;
|
||
display: block;
|
||
}
|
||
#status {
|
||
margin-top: 12px;
|
||
font-size: 0.85rem;
|
||
opacity: 0.6;
|
||
}
|
||
#no-frame {
|
||
width: 640px; height: 480px;
|
||
background: #16213e;
|
||
border-radius: 12px;
|
||
display: flex;
|
||
align-items: center;
|
||
justify-content: center;
|
||
font-size: 1.1rem;
|
||
opacity: 0.5;
|
||
}
|
||
</style>
|
||
</head>
|
||
<body>
|
||
<div class="nav-bar">
|
||
<h1>📷 FSBC Live Stream</h1>
|
||
<div class="nav-links">
|
||
<a href="/" class="active">📷 图传预览</a>
|
||
<a href="/dashboard">📊 管理面板</a>
|
||
</div>
|
||
</div>
|
||
<img id="stream" src="" alt="Waiting for stream..." style="display:none">
|
||
<div id="no-frame">Waiting for stream...</div>
|
||
<div id="status">Not connected</div>
|
||
<script>
|
||
const img = document.getElementById('stream');
|
||
const noFrame = document.getElementById('no-frame');
|
||
const status = document.getElementById('status');
|
||
let ws = null;
|
||
|
||
// Open the /stream WebSocket, display every binary message as a JPEG frame,
// and reconnect automatically two seconds after the socket closes.
function connect() {
  const protocol = location.protocol === 'https:' ? 'wss:' : 'ws:';
  const url = protocol + '//' + location.host + '/stream';
  // Handlers use this local reference so that a later reconnect, which
  // reassigns the shared `ws` variable, cannot redirect them.
  const socket = new WebSocket(url);
  ws = socket;

  socket.onopen = () => {
    status.textContent = 'Connected';
    status.style.color = '#4CAF50';
  };

  socket.onmessage = (event) => {
    const blob = new Blob([event.data], { type: 'image/jpeg' });
    const frameUrl = URL.createObjectURL(blob);
    const previousUrl = img.src;
    img.src = frameUrl;
    // Release the previous frame's object URL; otherwise every frame
    // received stays referenced for the lifetime of the page.
    if (previousUrl && previousUrl.startsWith('blob:')) {
      URL.revokeObjectURL(previousUrl);
    }
    img.style.display = 'block';
    noFrame.style.display = 'none';
    status.textContent = 'Receiving frames';
    status.style.color = '#4CAF50';
  };

  socket.onclose = () => {
    status.textContent = 'Disconnected - reconnecting in 2s...';
    status.style.color = '#E53935';
    setTimeout(connect, 2000);
  };

  // Closing on error funnels error recovery through the onclose handler.
  socket.onerror = () => { socket.close(); };
}
|
||
|
||
connect();
|
||
</script>
|
||
</body>
|
||
</html>
|
||
"""
|
||
|
||
|
||
# ── Routes: Streaming ────────────────────────────────────────────────────────
|
||
|
||
@app.get("/health")
async def health():
    """Report server liveness; the Android app calls this to test its connection.

    Returns a JSON object with a fixed status/name/features section, the
    number of frames received so far, the seconds elapsed since the first
    stream connection (0 when there has been none) and a UTC timestamp.
    """
    started_at = stream_stats["started_at"]
    if started_at:
        uptime = time.time() - started_at
    else:
        uptime = 0

    payload = {
        "status": "ok",
        "name": "FSBC-PC",
        "features": ["stream", "upload", "tasks"],
        "streamStats": {
            "framesReceived": stream_stats["frames_received"],
            "uptime": uptime,
        },
        "timestamp": datetime.utcnow().isoformat(),
    }
    return JSONResponse(payload)
|
||
|
||
|
||
@app.get("/")
async def index():
    """Return the HTML page that displays the live stream."""
    page = HTMLResponse(content=STREAM_PAGE)
    return page
|
||
|
||
|
||
@app.websocket("/stream")
async def stream_endpoint(ws: WebSocket):
    """Receive binary frames on a WebSocket and relay them to all other sockets.

    Every connection is added to ``connected_clients``.  Each binary message
    received on this socket is stored as the latest frame, counted in
    ``stream_stats`` and forwarded to every other connected socket.  The
    socket is removed from ``connected_clients`` when the handler exits.
    """
    global latest_frame, frame_timestamp

    await ws.accept()
    connected_clients.add(ws)
    if stream_stats["started_at"] is None:
        stream_stats["started_at"] = time.time()

    try:
        frame_count = 0
        while True:
            data = await ws.receive_bytes()
            latest_frame = data
            frame_timestamp = time.time()
            stream_stats["frames_received"] += 1
            stream_stats["bytes_received"] += len(data)
            frame_count += 1
            # Log the first frame and then every 30th one to keep output readable.
            if frame_count % 30 == 1:
                print(f"[Stream] Received frame #{stream_stats['frames_received']} ({len(data)} bytes)")

            # Broadcast over a snapshot: the set is modified below when a
            # send fails, and other handlers may add or remove sockets while
            # this coroutine is suspended in ``send_bytes``.  Iterating the
            # live set would raise "Set changed size during iteration".
            for client in tuple(connected_clients):
                if client is ws:
                    continue
                try:
                    await client.send_bytes(data)
                except Exception:
                    # Best effort: a socket that cannot be written to is
                    # dropped so later frames are not sent to it.
                    connected_clients.discard(client)
    except WebSocketDisconnect:
        pass
    finally:
        connected_clients.discard(ws)
|
||
|
||
|
||
# ── Routes: Upload & Tasks ──────────────────────────────────────────────────
|
||
|
||
# Registry of uploaded files, keyed by fileId.  Each record written by
# upload_pdf holds: fileId, fileName, mimeType, sizeBytes, uploadPath, createdAt.
files_db: dict[str, dict] = {}
|
||
|
||
|
||
@app.post("/upload/pdf", status_code=201)
async def upload_pdf(file: UploadFile = File(...)):
    """Upload a PDF file to the PC (no processing).

    Stores the file in ./uploads/ and returns a fileId for later use.
    Processing is a separate step via POST /tasks/process.

    Raises:
        HTTPException: 400 when the upload has no file name or the name does
            not end in ".pdf" (case-insensitive).
    """
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are accepted")

    file_id = str(uuid.uuid4())
    timestamp = datetime.utcnow().isoformat()
    # The client-supplied name becomes part of a path on disk.  Besides ".."
    # and "/", also neutralise "\" and ":" so that Windows path separators and
    # drive/stream syntax cannot steer the write outside UPLOAD_DIR.
    safe_name = (
        file.filename.replace("..", "")
        .replace("/", "_")
        .replace("\\", "_")
        .replace(":", "_")
    )

    # Save the uploaded PDF; the disk write runs in a worker thread so a large
    # upload does not stall the event loop (and the live stream with it).
    upload_path = UPLOAD_DIR / f"{file_id}_{safe_name}"
    content = await file.read()
    await asyncio.to_thread(upload_path.write_bytes, content)

    # Store file record (pure upload, no task/processing)
    file_record = {
        "fileId": file_id,
        "fileName": safe_name,
        "mimeType": "application/pdf",
        "sizeBytes": len(content),
        "uploadPath": str(upload_path),
        "createdAt": timestamp,
    }
    files_db[file_id] = file_record

    print(f"[Upload] Received {safe_name} ({len(content)} bytes) -> file {file_id}")
    return JSONResponse({
        "fileId": file_id,
        "fileName": safe_name,
        "mimeType": "application/pdf",
        "sizeBytes": len(content),
    })
|
||
|
||
|
||
# Strong references to running background jobs.  The event loop only keeps a
# weak reference to a task, so a job whose Task object is not stored anywhere
# may be garbage-collected before it finishes.
_background_jobs: set[asyncio.Task] = set()


@app.post("/tasks/process", status_code=202)
async def create_task(body: dict):
    """Create a processing task for an uploaded PDF.

    Request body: {"fileId": "...", "processType": "ocrpdf"|"markdown"}

    ``processType`` is case-insensitive and defaults to "ocrpdf".  The task is
    recorded in ``tasks_db`` with status "queued" and processing is started in
    the background; the response is returned without waiting for it.

    Raises:
        HTTPException: 400 when ``fileId`` is missing or not a string, or when
            ``processType`` is not one of the accepted values; 404 when
            ``fileId`` does not refer to an uploaded file.
    """
    file_id = body.get("fileId", "")
    raw_process_type = body.get("processType", "ocrpdf")

    # Validate types before calling string methods or using the value as a
    # dict key: malformed JSON values must yield a 400, not an internal error.
    if not isinstance(file_id, str) or not file_id:
        raise HTTPException(status_code=400, detail="fileId is required")
    process_type = raw_process_type.lower() if isinstance(raw_process_type, str) else ""
    if process_type not in ("ocrpdf", "markdown"):
        raise HTTPException(status_code=400, detail="processType must be 'ocrpdf' or 'markdown'")

    # Look up the uploaded file
    file_record = files_db.get(file_id)
    if file_record is None:
        raise HTTPException(status_code=404, detail="File not found")

    task_id = str(uuid.uuid4())
    timestamp = datetime.utcnow().isoformat()

    task = {
        "taskId": task_id,
        "fileId": file_id,
        "status": "queued",
        "progress": 0,
        "processType": process_type,
        "fileName": file_record["fileName"],
        "createdAt": timestamp,
        "updatedAt": timestamp,
        "uploadPath": file_record["uploadPath"],
        "message": f"Task created (processType={process_type})",
    }
    tasks_db[task_id] = task

    # Start processing in the background and hold on to the Task object until
    # it completes.
    job = asyncio.create_task(process_with_mineru(task_id))
    _background_jobs.add(job)
    job.add_done_callback(_background_jobs.discard)

    print(f"[Tasks] Created task {task_id} for file {file_id} (processType={process_type})")
    return JSONResponse({
        "taskId": task_id,
        "status": "queued",
        "processType": process_type,
        "fileId": file_id,
    })
|
||
|
||
|
||
@app.get("/tasks/{task_id}")
async def get_task_status(task_id: str):
    """Return the stored status fields of one processing task.

    Raises:
        HTTPException: 404 when ``task_id`` is not present in ``tasks_db``.
    """
    record = tasks_db.get(task_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Task not found")

    payload = {
        "taskId": record["taskId"],
        "fileId": record.get("fileId", ""),
        "status": record["status"],
        "progress": record["progress"],
        "processType": record.get("processType", ""),
        "fileName": record["fileName"],
        "createdAt": record["createdAt"],
        "message": record.get("message", ""),
    }
    return JSONResponse(payload)
|
||
|
||
|
||
@app.get("/tasks/{task_id}/artifacts")
async def list_artifacts(task_id: str):
    """Return the artifacts registered for a task as a JSON array.

    The array is empty when the task exists but has no artifacts recorded.

    Raises:
        HTTPException: 404 when ``task_id`` is not present in ``tasks_db``.
    """
    if tasks_db.get(task_id) is None:
        raise HTTPException(status_code=404, detail="Task not found")

    # "id" and "artifactId" intentionally carry the same value.
    listing = [
        {
            "id": entry["artifactId"],
            "artifactId": entry["artifactId"],
            "fileName": entry["fileName"],
            "fileSize": entry["fileSize"],
            "fileType": entry["fileType"],
        }
        for entry in artifacts_db.get(task_id, [])
    ]
    return JSONResponse(listing)
|
||
|
||
|
||
@app.get("/artifacts/{artifact_id}/download")
async def download_artifact(artifact_id: str):
    """Send a processed result file to the client.

    The media type is chosen from the artifact's ``fileType``: "pdf" and "zip"
    map to their usual types, anything else is served as Markdown.

    Raises:
        HTTPException: 404 when the artifact is unknown or its file is no
            longer present on disk.
    """
    record = artifacts_map.get(artifact_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Artifact not found")

    location = Path(record["filePath"])
    if not location.exists():
        raise HTTPException(status_code=404, detail="Artifact file not found on disk")

    known_media_types = {"pdf": "application/pdf", "zip": "application/zip"}
    media_type = known_media_types.get(record["fileType"], "text/markdown")
    return FileResponse(path=location, filename=record["fileName"], media_type=media_type)
|
||
|
||
|
||
@app.get("/files/{file_id}/download")
async def download_uploaded_file(file_id: str):
    """Send an uploaded PDF back to the client exactly as it was stored.

    Raises:
        HTTPException: 404 when the file id is unknown or the stored file is
            no longer present on disk.
    """
    record = files_db.get(file_id)
    if record is None:
        raise HTTPException(status_code=404, detail="File not found")

    stored_at = Path(record["uploadPath"])
    if not stored_at.exists():
        raise HTTPException(status_code=404, detail="File not found on disk")

    return FileResponse(path=stored_at, filename=record["fileName"], media_type="application/pdf")
|
||
|
||
|
||
# ── Dashboard page ───────────────────────────────────────────────────────────
|
||
|
||
DASHBOARD_PAGE = """\
|
||
<!DOCTYPE html>
|
||
<html>
|
||
<head>
|
||
<meta charset="utf-8">
|
||
<title>FSBC Dashboard</title>
|
||
<style>
|
||
* { margin: 0; padding: 0; box-sizing: border-box; }
|
||
body {
|
||
background: #1a1a2e;
|
||
color: #eee;
|
||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
||
padding: 24px;
|
||
}
|
||
.header {
|
||
display: flex; align-items: center; gap: 16px; margin-bottom: 24px;
|
||
}
|
||
.header h1 { font-size: 1.5rem; }
|
||
.header .nav-links { margin-left: auto; display: flex; gap: 12px; }
|
||
.header .nav-links a {
|
||
color: #7c8dff; text-decoration: none; padding: 6px 14px;
|
||
border: 1px solid #7c8dff44; border-radius: 6px; font-size: 0.85rem;
|
||
}
|
||
.header .nav-links a:hover { background: #7c8dff22; }
|
||
.stats-row {
|
||
display: flex; gap: 16px; margin-bottom: 24px; flex-wrap: wrap;
|
||
}
|
||
.stat-card {
|
||
background: #16213e; border-radius: 10px; padding: 16px 24px; flex: 1;
|
||
min-width: 120px; text-align: center;
|
||
}
|
||
.stat-card .num { font-size: 1.8rem; font-weight: 700; }
|
||
.stat-card .label { font-size: 0.75rem; opacity: 0.6; margin-top: 4px; }
|
||
.stat-card.queued .num { color: #ffa726; }
|
||
.stat-card.processing .num { color: #42a5f5; }
|
||
.stat-card.completed .num { color: #66bb6a; }
|
||
.stat-card.failed .num { color: #ef5350; }
|
||
.section-title { font-size: 1.1rem; margin-bottom: 12px; opacity: 0.8; }
|
||
.task-table {
|
||
width: 100%; border-collapse: collapse; margin-bottom: 24px;
|
||
background: #16213e; border-radius: 10px; overflow: hidden;
|
||
}
|
||
.task-table th {
|
||
text-align: left; padding: 12px 16px; font-size: 0.75rem;
|
||
text-transform: uppercase; letter-spacing: 0.5px; opacity: 0.5;
|
||
border-bottom: 1px solid #ffffff11;
|
||
}
|
||
.task-table td {
|
||
padding: 12px 16px; font-size: 0.85rem; border-bottom: 1px solid #ffffff08;
|
||
}
|
||
.task-table tr:hover td { background: #ffffff06; }
|
||
.badge {
|
||
display: inline-block; padding: 2px 10px; border-radius: 10px;
|
||
font-size: 0.75rem; font-weight: 600;
|
||
}
|
||
.badge.queued { background: #ffa72633; color: #ffa726; }
|
||
.badge.processing { background: #42a5f533; color: #42a5f5; }
|
||
.badge.completed { background: #66bb6a33; color: #66bb6a; }
|
||
.badge.failed { background: #ef535033; color: #ef5350; }
|
||
.progress-bar {
|
||
width: 120px; height: 6px; background: #ffffff15; border-radius: 3px; overflow: hidden;
|
||
}
|
||
.progress-bar .fill {
|
||
height: 100%; border-radius: 3px; transition: width 0.5s ease;
|
||
background: linear-gradient(90deg, #42a5f5, #66bb6a);
|
||
}
|
||
.download-btn {
|
||
color: #7c8dff; text-decoration: none; font-size: 0.8rem;
|
||
padding: 4px 10px; border: 1px solid #7c8dff44; border-radius: 4px;
|
||
}
|
||
.download-btn:hover { background: #7c8dff22; }
|
||
.empty-state { opacity: 0.4; text-align: center; padding: 40px; font-size: 0.9rem; }
|
||
.file-name { max-width: 250px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
||
</style>
|
||
</head>
|
||
<body>
|
||
<div class="header">
|
||
<h1>📊 FSBC Dashboard</h1>
|
||
<div class="nav-links">
|
||
<a href="/">📷 图传预览</a>
|
||
<a href="/dashboard" style="background:#7c8dff22;border-color:#7c8dff;">📊 管理面板</a>
|
||
</div>
|
||
</div>
|
||
|
||
<div class="stats-row" id="stats"></div>
|
||
|
||
<div class="section-title">📄 已上传的文件</div>
|
||
<table class="task-table">
|
||
<thead><tr>
|
||
<th>文件名</th><th>文件 ID</th><th>大小</th><th>时间</th><th>操作</th>
|
||
</tr></thead>
|
||
<tbody id="file-list"></tbody>
|
||
</table>
|
||
|
||
<div class="section-title">⚙️ 处理任务</div>
|
||
<table class="task-table">
|
||
<thead><tr>
|
||
<th>文件名</th><th>任务 ID</th><th>状态</th><th>进度</th><th>处理类型</th><th>时间</th><th>操作</th>
|
||
</tr></thead>
|
||
<tbody id="task-list"></tbody>
|
||
</table>
|
||
|
||
<script>
|
||
// Format a byte count as "<n>B", "<n.n>KB" or "<n.n>MB"; falsy input gives "-".
function fmtSize(bytes) {
  const KIB = 1024;
  const MIB = 1048576;
  if (!bytes) {
    return '-';
  }
  if (bytes < KIB) {
    return bytes + 'B';
  }
  const scaled = bytes < MIB ? [bytes / KIB, 'KB'] : [bytes / MIB, 'MB'];
  return scaled[0].toFixed(1) + scaled[1];
}
|
||
// Turn an ISO timestamp into "YYYY-MM-DD HH:MM:SS"; falsy input gives "-".
function fmtTime(iso) {
  if (!iso) {
    return '-';
  }
  const spaced = iso.replace('T', ' ');
  return spaced.substring(0, 19);
}
|
||
|
||
// Escape a value for safe insertion into HTML text or a quoted attribute.
// File names originate from uploads, so they must never reach innerHTML raw.
function escapeHtml(value) {
  const entities = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
  return String(value == null ? '' : value).replace(/[&<>"']/g, ch => entities[ch]);
}

// Fetch /api/dashboard and re-render the stats cards, the uploaded-file table
// and the task table.  On any failure an error row is shown in the task table.
async function refresh() {
  try {
    const resp = await fetch('/api/dashboard');
    if (!resp.ok) {
      throw new Error('HTTP ' + resp.status);
    }
    const data = await resp.json();

    // Stats (the failed counter is reported by the API and styled in CSS)
    const s = data.stats;
    document.getElementById('stats').innerHTML =
      `<div class="stat-card"><div class="num">${escapeHtml(data.files.length)}</div><div class="label">已上传</div></div>
      <div class="stat-card"><div class="num">${escapeHtml(s.total)}</div><div class="label">处理任务</div></div>
      <div class="stat-card queued"><div class="num">${escapeHtml(s.queued)}</div><div class="label">排队中</div></div>
      <div class="stat-card processing"><div class="num">${escapeHtml(s.processing)}</div><div class="label">处理中</div></div>
      <div class="stat-card completed"><div class="num">${escapeHtml(s.completed)}</div><div class="label">已完成</div></div>
      <div class="stat-card failed"><div class="num">${escapeHtml(s.failed)}</div><div class="label">失败</div></div>`;

    // Files
    const fbody = document.getElementById('file-list');
    if (data.files.length === 0) {
      fbody.innerHTML = '<tr><td colspan="5"><div class="empty-state">暂无上传文件</div></td></tr>';
    } else {
      fbody.innerHTML = data.files.map(f => {
        const name = escapeHtml(f.fileName);
        const fileId = String(f.fileId);
        return `<tr>
          <td><div class="file-name" title="${name}">${name}</div></td>
          <td style="font-family:monospace;font-size:0.75rem;opacity:0.6">${escapeHtml(fileId.substring(0, 8))}...</td>
          <td>${escapeHtml(fmtSize(f.sizeBytes))}</td>
          <td style="font-size:0.75rem;opacity:0.5">${escapeHtml(fmtTime(f.createdAt))}</td>
          <td><a class="download-btn" href="/files/${encodeURIComponent(fileId)}/download">⬇ 下载</a></td>
        </tr>`;
      }).join('');
    }

    // Tasks
    const tbody = document.getElementById('task-list');
    if (data.tasks.length === 0) {
      tbody.innerHTML = '<tr><td colspan="7"><div class="empty-state">暂无处理任务</div></td></tr>';
    } else {
      tbody.innerHTML = data.tasks.map(t => {
        const name = escapeHtml(t.fileName);
        const state = escapeHtml(t.status);
        const percent = Number(t.progress) || 0;
        const badge = `<span class="badge ${state}">${state}</span>`;
        const bar = t.status === 'processing'
          ? `<div class="progress-bar"><div class="fill" style="width:${percent}%"></div></div>`
          : t.status === 'completed' ? '✅ 完成' : `${percent}%`;
        const actions = t.artifacts && t.artifacts.length > 0
          ? t.artifacts.map(a => `<a class="download-btn" href="/artifacts/${encodeURIComponent(a.id)}/download">⬇ ${escapeHtml(a.fileName)}</a>`).join(' ')
          : t.status === 'completed' ? '✅' : '-';
        return `<tr>
          <td><div class="file-name" title="${name}">${name}</div></td>
          <td style="font-family:monospace;font-size:0.75rem;opacity:0.6">${escapeHtml(String(t.taskId).substring(0, 8))}...</td>
          <td>${badge}</td>
          <td>${bar}</td>
          <td>${escapeHtml(t.processType || '-')}</td>
          <td style="font-size:0.75rem;opacity:0.5">${escapeHtml(fmtTime(t.createdAt))}</td>
          <td>${actions}</td>
        </tr>`;
      }).join('');
    }
  } catch (e) {
    document.getElementById('task-list').innerHTML =
      '<tr><td colspan="7"><div class="empty-state">⚠️ 连接服务器失败: ' + escapeHtml(e.message) + '</div></td></tr>';
  }
}
|
||
refresh();
|
||
setInterval(refresh, 2000);
|
||
</script>
|
||
</body>
|
||
</html>
|
||
"""
|
||
|
||
|
||
@app.get("/dashboard")
async def dashboard():
    """Return the HTML page of the file/task management dashboard."""
    page = HTMLResponse(content=DASHBOARD_PAGE)
    return page
|
||
|
||
|
||
@app.get("/api/dashboard")
async def dashboard_api():
    """Return everything the dashboard page renders, as one JSON document.

    The response has three keys: ``files`` (uploaded files), ``tasks``
    (processing tasks, each with its downloadable artifacts) and ``stats``
    (task counts in total and per status).  Files and tasks are ordered by
    ``createdAt``, newest first.
    """
    # Uploaded files
    files_list = sorted(
        (
            {
                "fileId": file_id,
                "fileName": record.get("fileName", ""),
                "sizeBytes": record.get("sizeBytes", 0),
                "createdAt": record.get("createdAt", ""),
            }
            for file_id, record in files_db.items()
        ),
        key=lambda entry: entry.get("createdAt", ""),
        reverse=True,
    )

    # Processing tasks together with their artifacts
    tasks_list = sorted(
        (
            {
                "taskId": task_id,
                "fileId": record.get("fileId", ""),
                "fileName": record.get("fileName", ""),
                "status": record["status"],
                "progress": record["progress"],
                "processType": record.get("processType", ""),
                "createdAt": record.get("createdAt", ""),
                "message": record.get("message", ""),
                "artifacts": [
                    {"id": item["artifactId"], "fileName": item["fileName"]}
                    for item in artifacts_db.get(task_id, [])
                ],
            }
            for task_id, record in tasks_db.items()
        ),
        key=lambda entry: entry.get("createdAt", ""),
        reverse=True,
    )

    # Per-status counters; a status outside the four known ones only counts
    # towards the total.
    per_status = dict.fromkeys(("queued", "processing", "completed", "failed"), 0)
    for entry in tasks_list:
        if entry["status"] in per_status:
            per_status[entry["status"]] += 1

    return JSONResponse({
        "stats": {"total": len(tasks_list), **per_status},
        "files": files_list,
        "tasks": tasks_list,
    })
|
||
|
||
|
||
# ── In-memory databases ──────────────────────────────────────────────────────
|
||
|
||
# Processing tasks, keyed by taskId (records are created by create_task).
tasks_db: dict[str, dict] = {}

# Artifact records produced for each task, keyed by taskId.
artifacts_db: dict[str, list[dict]] = {}

# The same artifact records, keyed by artifactId, for the download endpoint.
artifacts_map: dict[str, dict] = {}
|
||
|
||
|
||
def _register_artifact(path: Path, file_name: str, file_type: str) -> dict:
    """Build the artifact record for *path* and index it in ``artifacts_map``."""
    artifact = {
        "artifactId": str(uuid.uuid4()),
        "fileName": file_name,
        "fileSize": path.stat().st_size,
        "fileType": file_type,
        "filePath": str(path),
    }
    artifacts_map[artifact["artifactId"]] = artifact
    return artifact


def _zip_markdown_bundle(md_path: Path, images_dir: Path, zip_path: Path) -> None:
    """Write the Markdown file plus everything under *images_dir* into a ZIP."""
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        zf.write(md_path, md_path.name)
        for img_file in images_dir.rglob("*"):
            if img_file.is_file():
                # Archive member names always use forward slashes.
                arcname = f"images/{img_file.relative_to(images_dir).as_posix()}"
                zf.write(img_file, arcname)


async def _run_markdown(task_id: str, task: dict, upload_path_src: str,
                        base_name: str, lang: str, output_dir: Path) -> None:
    """Convert the PDF to Markdown with MinerU and register the results."""
    pdf_bytes = read_fn(upload_path_src)
    await aio_do_parse(
        output_dir=str(output_dir),
        pdf_file_names=[base_name],
        pdf_bytes_list=[pdf_bytes],
        p_lang_list=[lang],
        backend="pipeline",
        parse_method="auto",
        f_dump_md=True,
        f_dump_middle_json=False,
        f_dump_model_output=False,
        f_dump_orig_pdf=False,
        f_dump_content_list=False,
        f_draw_layout_bbox=False,
        f_draw_span_bbox=False,
    )
    # MinerU output: {output_dir}/{base_name}/auto/{base_name}.md
    md_dir = output_dir / base_name / "auto"
    md_path = md_dir / f"{base_name}.md"
    images_dir = md_dir / "images"

    if not md_path.exists():
        task["status"] = "failed"
        task["message"] = "MinerU did not produce .md output"
        logger.error(f"[MinerU] Task {task_id}: no .md output at {md_path}")
        return

    artifacts_list = [_register_artifact(md_path, f"{base_name}.md", "md")]

    # If the images directory exists and has files, also offer a ZIP that
    # bundles the Markdown with its images.
    if images_dir.exists() and any(images_dir.iterdir()):
        zip_path = md_dir / f"{base_name}_result.zip"
        _zip_markdown_bundle(md_path, images_dir, zip_path)
        artifacts_list.append(_register_artifact(zip_path, f"{base_name}_result.zip", "zip"))
        logger.info(f"[MinerU] Task {task_id}: ZIP created -> {zip_path} ({zip_path.stat().st_size} bytes)")

    artifacts_db[task_id] = artifacts_list
    task["status"] = "completed"
    task["progress"] = 100
    task["message"] = f"MinerU Markdown completed ({md_path.stat().st_size} bytes)"
    logger.info(f"[MinerU] Task {task_id}: markdown completed -> {md_path}")


async def _run_ocrpdf(task_id: str, task: dict, upload_path_src: str,
                      base_name: str, lang: str, output_dir: Path) -> None:
    """Produce a searchable PDF with OCRmyPDF and register the result."""
    # Map the task language code to a Tesseract language name; unknown codes
    # fall back to Simplified Chinese.
    ocr_lang = {"ch": "chi_sim", "en": "eng", "japan": "jpn", "korean": "kor"}.get(lang, "chi_sim")
    ocr_output = output_dir / f"{base_name}_ocr.pdf"

    # OCR runs in a worker thread so the event loop stays responsive.
    await asyncio.to_thread(
        ocrmypdf.ocr,
        upload_path_src,
        str(ocr_output),
        language=ocr_lang,
        output_type="pdf",
        skip_text=True,
        deskew=True,
        optimize=0,  # skip JBIG2 optimization (pikepdf compat)
    )

    if not ocr_output.exists():
        task["status"] = "failed"
        task["message"] = "OCRmyPDF did not produce output"
        return

    artifacts_db[task_id] = [_register_artifact(ocr_output, f"{base_name}_ocr.pdf", "pdf")]
    task["status"] = "completed"
    task["progress"] = 100
    task["message"] = f"OCRmyPDF completed ({ocr_output.stat().st_size} bytes)"
    logger.info(f"[OCRmyPDF] Task {task_id}: ocrpdf completed -> {ocr_output}")


async def process_with_mineru(task_id: str):
    """Run the processing job of one task and record its outcome in ``tasks_db``.

    ``processType`` "markdown" runs the MinerU pipeline; any other value runs
    OCRmyPDF.  Output files are written below ``TASKS_DIR/<task_id>`` and
    registered as artifacts.  The function never raises for processing errors:
    they are logged and stored on the task (status "failed").  An unknown
    ``task_id`` is ignored.
    """
    task = tasks_db.get(task_id)
    if task is None:
        return

    process_type = task.get("processType", "ocrpdf")
    upload_path_src = task.get("uploadPath")
    file_name = task.get("fileName", "document.pdf")
    base_name = Path(file_name).stem
    # Tolerate a task whose "options" entry is absent or explicitly None.
    lang = (task.get("options") or {}).get("lang", "ch")

    if not upload_path_src or not Path(upload_path_src).exists():
        task["status"] = "failed"
        task["message"] = "Uploaded file not found on disk"
        task["updatedAt"] = datetime.utcnow().isoformat()
        logger.error(f"[MinerU] Task {task_id}: file not found at {upload_path_src}")
        return

    task["status"] = "processing"
    task["progress"] = 15
    task["updatedAt"] = datetime.utcnow().isoformat()
    task["message"] = f"MinerU pipeline started (backend=pipeline, processType={process_type})"
    logger.info(f"[MinerU] Task {task_id}: starting {process_type} on {file_name}")

    # Prepare output directory
    output_dir = TASKS_DIR / task_id
    output_dir.mkdir(parents=True, exist_ok=True)

    # Name the engine that actually runs so failures are attributed correctly.
    engine = "MinerU" if process_type == "markdown" else "OCRmyPDF"
    runner = _run_markdown if process_type == "markdown" else _run_ocrpdf
    try:
        await runner(task_id, task, upload_path_src, base_name, lang, output_dir)
    except Exception as e:
        # Boundary of the background job: record the failure on the task and
        # log it with its traceback instead of letting it vanish.
        task["status"] = "failed"
        task["message"] = f"{engine} error: {e}"
        task["progress"] = 0
        logger.exception(f"[{engine}] Task {task_id}: exception - {e}")

    task["updatedAt"] = datetime.utcnow().isoformat()
|
||
|
||
|
||
# ── Entry point ──────────────────────────────────────────────────────────────
|
||
|
||
if __name__ == "__main__":
    # Imported here so that importing this module does not require uvicorn.
    import uvicorn

    port = 2026
    host = "0.0.0.0"
    local_url = f"http://localhost:{port}"

    # Startup banner listing the main entry points.
    banner = (
        f"🚀 FSBC Server starting on http://{host}:{port}",
        f" Stream: {local_url}",
        f" Dashboard: {local_url}/dashboard",
        f" Health: {local_url}/health",
        f" Upload: POST {local_url}/upload/pdf",
        f" Tasks: POST {local_url}/tasks/process",
    )
    for banner_line in banner:
        print(banner_line)

    uvicorn.run(app, host=host, port=port, log_level="info")
|