há 4 meses atrás · 652cc5b0ab
--- a/app.py
+++ b/app.py
@@ -1,14 +1,21 @@
 
															 import os
														
 
															 import sys
														
 
															 import uuid
														
 
															+import tempfile
														
 
															-from flask import Flask, request, jsonify
														
 
															+import pdfplumber
														
 
															+from flask import Flask, request, jsonify, send_file
														
 
															 from add_signature import find_signature_positions, add_signature_to_pdf
														
 
															 from add_watermark import add_watermark_to_pdf
														
 
															+from extract_table import extract_temp_time, extract_pdf_table_to_excel, extract_temp_by_datetime_pattern, allowed_file, \
														
 
															+    safe_filename
														
 
															 from lib import Qiniu
														
 
															+from werkzeug.utils import secure_filename
														
 
															 app = Flask(__name__)
														
 
															+UPLOAD_FOLDER = tempfile.gettempdir()
														
 
															+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
														
 
															 @app.route('/add_signature', methods=['POST'])
														
@@ -93,11 +100,70 @@ def add_watermark():
 
															         }), 500
														
 
															+@app.route('/extract_table', methods=['POST'])
														
 
															+def extract_table():
														
 
															+    if 'file' not in request.files:
														
 
															+        return jsonify({'error': 'No file part'}), 400
														
 
															+
														
 
															+    file = request.files['file']
														
 
															+
														
 
															+    if file.filename == '':
														
 
															+        return jsonify({'error': 'No selected file'}), 400
														
 
															+
														
 
															+    if file and allowed_file(file.filename):
														
 
															+        original_filename = file.filename
														
 
															+        filename = safe_filename(original_filename)
														
 
															+        # filename = secure_filename(safe_name)
														
 
															+        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
														
 
															+        file.save(filepath)
														
 
															+        df = None
														
 
															+        with pdfplumber.open(filepath) as pdf:
														
 
															+            # 获取第一页
														
 
															+            first_page = pdf.pages[0]
														
 
															+            text = first_page.extract_text()
														
 
															+            if text:
														
 
															+                if "温湿度数据报告" in text:
														
 
															+                    df = extract_pdf_table_to_excel(filepath)
														
 
															+                if "历史数据表" in text:
														
 
															+                    df =  extract_temp_time(filepath)
														
 
															+                if "设备汇总报告" in text:
														
 
															+                    df = extract_temp_by_datetime_pattern(filepath)
														
 
															+
														
 
															+        if df is None:
														
 
															+            os.remove(filepath)
														
 
															+            return jsonify({'error': '所有处理方法均失败'}), 400
														
 
															+
														
 
															+        # 保存Excel文件
														
 
															+        output_filename = filename.replace('.pdf', '.xlsx')
														
 
															+        output_path = os.path.join(app.config['UPLOAD_FOLDER'], output_filename)
														
 
															+        df.to_excel(output_path, index=False, engine='openpyxl')
														
 
															+
														
 
															+        # 删除上传的PDF文件
														
 
															+        os.remove(filepath)
														
 
															+
														
 
															+        # 返回Excel文件
														
 
															+        try:
														
 
															+            return send_file(
														
 
															+                output_path,
														
 
															+                as_attachment=True,
														
 
															+                download_name=output_filename,
														
 
															+                mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
														
 
															+            )
														
 
															+        finally:
														
 
															+            # 确保临时文件最终被删除
														
 
															+            try:
														
 
															+                os.remove(output_path)
														
 
															+            except:
														
 
															+                pass
														
 
															+    else:
														
 
															+        return jsonify({'error': 'Invalid file type'}), 400
														
 
															+
														
 
															+
														
 
															 if __name__ == '__main__':
														
 
															     print("项目地址：", os.path.dirname(__file__))
														
 
															     if len(sys.argv) != 2:
														
 
															         print("请填写端口号")
														
 
															         sys.exit()
														
 
															-    app.debug = True  # 设置调试模式，生产模式的时候要关掉debug
														
 
															+    # app.debug = True  # 设置调试模式，生产模式的时候要关掉debug
														
 
															     # app.config['JSON_AS_ASCII'] = False
														
 
															     app.run(host='0.0.0.0', port=6500, debug=True)
														
--- a/extract_table.py
+++ b/extract_table.py
@@ -0,0 +1,133 @@
 
															+import pandas as pd
														
 
															+import pdfplumber
														
 
															+import re
														
 
															+from datetime import datetime
														
 
															+
														
 
															+import tabula
														
 
															+
														
 
															+ALLOWED_EXTENSIONS = {'pdf'}
														
 
															+
														
 
															+
														
 
															+def allowed_file(filename):
														
 
															+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
														
 
															+
														
 
															+
														
 
															+def safe_filename(filename):
														
 
															+    """生成安全的文件名，同时保留中文"""
														
 
															+    # 保留中文、字母、数字、下划线和点
														
 
															+    keep_chars = (' ', '.', '_', '-')
														
 
															+    filename = "".join(c for c in filename if c.isalnum() or c in keep_chars).rstrip()
														
 
															+    return filename
														
 
															+
														
 
															+
														
 
															+def get_pdf_page_count(pdf_path):
														
 
															+    with pdfplumber.open(pdf_path) as pdf:
														
 
															+        page_count = len(pdf.pages)
														
 
															+    return page_count
														
 
															+
														
 
															+
														
 
															+def extract_temp_time(pdf_path):
														
 
															+    """第一种处理方法：基于文本分割的提取"""
														
 
															+    cleaned_data = []
														
 
															+    with pdfplumber.open(pdf_path) as pdf:
														
 
															+        for page in pdf.pages:
														
 
															+            text = page.extract_text()
														
 
															+            if text:
														
 
															+                text_list = text.split("\n")
														
 
															+                for txt in text_list:
														
 
															+                    if ("历史数据表" not in txt) and ("时间" not in txt):
														
 
															+                        foo = [p for p in re.split(r'\s{1,}', txt.strip()) if p]
														
 
															+                        if len(foo) < 5:
														
 
															+                            print(foo)
														
 
															+                            continue
														
 
															+                        date_time, name, ids, temp, humi = foo[0] + " " + foo[1], foo[2], foo[3], foo[4], foo[5]
														
 
															+                        if foo[5] == "--":
														
 
															+                            humi = ""
														
 
															+                        cleaned_data.append([date_time, name, ids, temp, humi])
														
 
															+
														
 
															+    df = pd.DataFrame(
														
 
															+        cleaned_data,
														
 
															+        columns=['时间', '名称', '编号', '温度', '湿度']
														
 
															+    )
														
 
															+    df = df.sort_values('时间').reset_index(drop=True)
														
 
															+    return df
														
 
															+
														
 
															+
														
 
															+def extract_pdf_table_to_excel(pdf_path):
														
 
															+    """第二种处理方法：基于表格提取"""
														
 
															+    cleaned_data = []
														
 
															+    with pdfplumber.open(pdf_path) as pdf:
														
 
															+        for page in pdf.pages[2:]:
														
 
															+            tables = page.extract_table()
														
 
															+            if tables:
														
 
															+                if len(tables) >= 2:
														
 
															+                    for table in tables[1:]:
														
 
															+                        for row in table:
														
 
															+                            for cell in row.split('\n'):
														
 
															+                                foo = str(cell).strip().split("   ")
														
 
															+                                if len(foo) == 4:
														
 
															+                                    date_time, temp, humi = foo[0].replace("/", "-") + " " + foo[1], foo[2], foo[3]
														
 
															+                                    # 拆分日期和时间
														
 
															+                                    cleaned_data.append([date_time, temp, humi])
														
 
															+
														
 
															+    result_df = pd.DataFrame(
														
 
															+        cleaned_data,
														
 
															+        columns=['时间', '温度', '湿度']
														
 
															+    )
														
 
															+    result_df = result_df.sort_values('时间').reset_index(drop=True)
														
 
															+    return result_df
														
 
															+
														
 
															+
														
 
															+def extract_temp_by_datetime_pattern(pdf_path):
														
 
															+    """第三种处理方法：基于日期时间模式和温度符号的提取"""
														
 
															+    all_data = []
														
 
															+    datetime_pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
														
 
															+
														
 
															+    with pdfplumber.open(pdf_path) as pdf:
														
 
															+        # 从第二页开始处理（索引1）
														
 
															+        for page in pdf.pages[1:]:
														
 
															+            text = page.extract_text()
														
 
															+            if not text:
														
 
															+                continue
														
 
															+
														
 
															+            lines = text.split('\n')
														
 
															+
														
 
															+            for line in lines:
														
 
															+                # 检查行是否包含日期时间格式和温度符号
														
 
															+                if datetime_pattern.search(line) and '℃' in line:
														
 
															+                    parts = line.split()
														
 
															+                    if len(parts) >= 3:
														
 
															+                        # 提取时间部分
														
 
															+                        time_str = ' '.join(parts[:2])
														
 
															+                        try:
														
 
															+                            # 转换为datetime对象
														
 
															+                            time = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
														
 
															+                            # 提取温度值（去掉℃符号）
														
 
															+                            temp_str = parts[2].replace('℃', '')
														
 
															+                            try:
														
 
															+                                # 添加到数据列表
														
 
															+                                all_data.append({'时间': time, '温度': temp_str})
														
 
															+                            except ValueError:
														
 
															+                                continue
														
 
															+                        except ValueError:
														
 
															+                            continue
														
 
															+
														
 
															+                    if len(parts) >= 6:
														
 
															+                        # 提取时间部分
														
 
															+                        time_str = ' '.join(parts[3:5])
														
 
															+                        try:
														
 
															+                            # 转换为datetime对象
														
 
															+                            time = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
														
 
															+                            # 提取温度值（去掉℃符号）
														
 
															+                            temp_str = parts[5].replace('℃', '')
														
 
															+                            try:
														
 
															+                                # 添加到数据列表
														
 
															+                                all_data.append({'时间': time, '温度': temp_str})
														
 
															+                            except ValueError:
														
 
															+                                continue
														
 
															+                        except ValueError:
														
 
															+                            continue
														
 
															+
														
 
															+    df = pd.DataFrame(all_data, columns=['时间', '温度'])
														
 
															+    df = df.sort_values('时间').reset_index(drop=True)
														
 
															+    return df