hace 4 meses · 652cc5b0ab
--- a/app.py
+++ b/app.py
@@ -1,14 +1,21 @@
 
				 import os
			
 
				 import sys
			
 
				 import uuid
			
 
				+import tempfile
			
 
				 
			
 
				-from flask import Flask, request, jsonify
			
 
				+import pdfplumber
			
 
				+from flask import Flask, request, jsonify, send_file
			
 
				 
			
 
				 from add_signature import find_signature_positions, add_signature_to_pdf
			
 
				 from add_watermark import add_watermark_to_pdf
			
 
				+from extract_table import extract_temp_time, extract_pdf_table_to_excel, extract_temp_by_datetime_pattern, allowed_file, \
			
 
				+    safe_filename
			
 
				 from lib import Qiniu
			
 
				+from werkzeug.utils import secure_filename
			
 
				 
			
 
				 app = Flask(__name__)  # Flask application object; the route decorators in this file attach to it
			
 
				+UPLOAD_FOLDER = tempfile.gettempdir()  # uploads are written to the system temporary directory
			
 
				+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER  # read back through app.config by the /extract_table handler
			
 
				 
			
 
				 
			
 
				 @app.route('/add_signature', methods=['POST'])
			
@@ -93,11 +100,70 @@ def add_watermark():
 
				         }), 500
			
 
				 
			
 
				 
			
 
				@app.route('/extract_table', methods=['POST'])
				def extract_table():
				    """Convert an uploaded PDF report into an Excel workbook and return it.

				    Expects a multipart upload under the ``file`` form field. The title text
				    found on the first page of the PDF selects which extractor is run.

				    Returns:
				        The generated ``.xlsx`` workbook as an attachment on success, or a
				        JSON ``{'error': ...}`` body with status 400 when the upload is
				        missing, unnamed, not a PDF, or matches no known report layout.
				    """
				    import io  # local import: only this handler needs an in-memory buffer

				    if 'file' not in request.files:
				        return jsonify({'error': 'No file part'}), 400

				    file = request.files['file']

				    # Covers both an empty filename and a missing (None) one.
				    if not file.filename:
				        return jsonify({'error': 'No selected file'}), 400

				    if not allowed_file(file.filename):
				        return jsonify({'error': 'Invalid file type'}), 400

				    filename = safe_filename(file.filename)

				    # Store the upload under a unique name so that concurrent requests
				    # uploading identically named files cannot overwrite each other.
				    filepath = os.path.join(app.config['UPLOAD_FOLDER'], uuid.uuid4().hex + '.pdf')
				    try:
				        file.save(filepath)
				        df = _extract_report_dataframe(filepath)
				    finally:
				        # Remove the uploaded PDF whether extraction succeeded or raised.
				        try:
				            os.remove(filepath)
				        except OSError:
				            pass

				    if df is None:
				        return jsonify({'error': '所有处理方法均失败'}), 400

				    # Swap only the final extension. allowed_file() accepts the extension
				    # case-insensitively, so a literal replace of '.pdf' would miss '.PDF'.
				    stem = filename.rsplit('.', 1)[0]
				    output_filename = (stem or 'result') + '.xlsx'

				    # Build the workbook in memory: nothing is left on disk, and no file has
				    # to be deleted while the response is still being sent.
				    buffer = io.BytesIO()
				    df.to_excel(buffer, index=False, engine='openpyxl')
				    buffer.seek(0)

				    return send_file(
				        buffer,
				        as_attachment=True,
				        download_name=output_filename,
				        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
				    )


				def _extract_report_dataframe(filepath):
				    """Run the extractor that matches the title on the PDF's first page.

				    Returns the extractor's DataFrame, or None when the PDF has no pages,
				    the first page has no text, or no known report title is present.
				    """
				    with pdfplumber.open(filepath) as pdf:
				        if not pdf.pages:
				            return None
				        text = pdf.pages[0].extract_text()

				    if not text:
				        return None

				    # Titles are checked in this order; if several appear on the page the
				    # last matching one decides which extractor runs.
				    extractors = (
				        ("温湿度数据报告", extract_pdf_table_to_excel),
				        ("历史数据表", extract_temp_time),
				        ("设备汇总报告", extract_temp_by_datetime_pattern),
				    )
				    chosen = None
				    for title, extractor in extractors:
				        if title in text:
				            chosen = extractor

				    return chosen(filepath) if chosen is not None else None
			
 
				+
			
 
				+
			
 
				 if __name__ == '__main__':
				     # Script entry point. Usage: python app.py <port>
				     print("项目地址：", os.path.dirname(__file__))
				     if len(sys.argv) != 2:
				         print("请填写端口号")
				         sys.exit(1)  # non-zero status: the required argument is missing
				     try:
				         # Listen on the port given on the command line instead of
				         # ignoring it in favour of a hard-coded value.
				         port = int(sys.argv[1])
				     except ValueError:
				         print("请填写端口号")
				         sys.exit(1)
				     # The server listens on every interface, and the Werkzeug debugger
				     # allows code execution from the browser, so debug mode is opt-in:
				     # set FLASK_DEBUG=1 in the environment for local development.
				     debug_enabled = os.environ.get('FLASK_DEBUG') == '1'
				     app.run(host='0.0.0.0', port=port, debug=debug_enabled)
			
--- a/extract_table.py
+++ b/extract_table.py
@@ -0,0 +1,133 @@
 
				+import pandas as pd
			
 
				+import pdfplumber
			
 
				+import re
			
 
				+from datetime import datetime
			
 
				+
			
 
				+import tabula
			
 
				+
			
 
				# Lower-case file extensions (without the dot) accepted for upload.
				ALLOWED_EXTENSIONS = {'pdf'}


				def allowed_file(filename):
				    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

				    The extension is the text after the last dot and is compared
				    case-insensitively. A missing (None) or empty filename, or one with
				    no dot, is rejected instead of raising.
				    """
				    if not filename or '.' not in filename:
				        return False
				    extension = filename.rsplit('.', 1)[1].lower()
				    return extension in ALLOWED_EXTENSIONS
			
 
				+
			
 
				+
			
 
				def safe_filename(filename):
				    """Return *filename* reduced to characters that are safe in a file name.

				    Alphanumeric characters (as reported by ``str.isalnum``, which includes
				    Chinese characters) are kept, together with space, dot, underscore and
				    hyphen. Every other character is dropped, and trailing whitespace is
				    stripped from the result.
				    """
				    allowed_punctuation = frozenset(" ._-")
				    kept = [ch for ch in filename if ch in allowed_punctuation or ch.isalnum()]
				    return "".join(kept).rstrip()
			
 
				+
			
 
				+
			
 
				def get_pdf_page_count(pdf_path):
				    """Open the PDF at *pdf_path* with pdfplumber and return how many pages it has."""
				    with pdfplumber.open(pdf_path) as document:
				        return len(document.pages)
			
 
				+
			
 
				+
			
 
				def extract_temp_time(pdf_path):
				    """First extraction method: split each text line into whitespace-separated fields.

				    Every page of the PDF is read as text. Lines containing "历史数据表" or
				    "时间" are skipped, as are lines with fewer than six fields. The first
				    six fields of each remaining line are read as date, time, name, id,
				    temperature and humidity.

				    Args:
				        pdf_path: Path of the PDF file to read.

				    Returns:
				        A DataFrame with columns 时间, 名称, 编号, 温度, 湿度, sorted by 时间.
				        A humidity of "--" is stored as an empty string.
				    """
				    cleaned_data = []
				    with pdfplumber.open(pdf_path) as pdf:
				        for page in pdf.pages:
				            text = page.extract_text()
				            if not text:
				                continue
				            for line in text.split("\n"):
				                if "历史数据表" in line or "时间" in line:
				                    continue
				                row = _parse_history_line(line)
				                if row is not None:
				                    cleaned_data.append(row)

				    df = pd.DataFrame(
				        cleaned_data,
				        columns=['时间', '名称', '编号', '温度', '湿度']
				    )
				    df = df.sort_values('时间').reset_index(drop=True)
				    return df


				def _parse_history_line(line):
				    """Turn one text line into [datetime, name, id, temp, humidity], or None if too short."""
				    fields = line.split()
				    # Six fields are read below, so anything shorter cannot be a data row.
				    # (Checking for fewer than five let five-field lines through to an
				    # IndexError on the humidity field.)
				    if len(fields) < 6:
				        return None
				    date_part, time_part, name, ids, temp, humi = fields[:6]
				    if humi == "--":
				        humi = ""
				    return [date_part + " " + time_part, name, ids, temp, humi]
			
 
				+
			
 
				+
			
 
				def extract_pdf_table_to_excel(pdf_path):
				    """Second extraction method: read readings out of the table on each page.

				    Pages from the third one onward are scanned. On each page the table
				    returned by ``page.extract_table()`` is taken, its first row is
				    skipped, and every cell is split into lines. A line that splits on
				    three spaces into exactly four fields is read as date, time,
				    temperature and humidity; slashes in the date become hyphens.

				    Despite the name, nothing is written to disk here: the readings are
				    returned as a DataFrame.

				    Args:
				        pdf_path: Path of the PDF file to read.

				    Returns:
				        A DataFrame with columns 时间, 温度, 湿度, sorted by 时间.
				    """
				    cleaned_data = []
				    with pdfplumber.open(pdf_path) as pdf:
				        for page in pdf.pages[2:]:
				            # extract_table() yields a single table: a list of rows, each
				            # row being a list of cells.
				            rows = page.extract_table()
				            if not rows:
				                continue
				            for row in rows[1:]:
				                for cell in row:
				                    # pdfplumber may report an empty cell as None; calling
				                    # .split() on it would raise AttributeError.
				                    if not cell:
				                        continue
				                    for entry in cell.split('\n'):
				                        fields = entry.strip().split("   ")
				                        if len(fields) != 4:
				                            continue
				                        date_part, time_part, temp, humi = fields
				                        date_time = date_part.replace("/", "-") + " " + time_part
				                        cleaned_data.append([date_time, temp, humi])

				    result_df = pd.DataFrame(
				        cleaned_data,
				        columns=['时间', '温度', '湿度']
				    )
				    result_df = result_df.sort_values('时间').reset_index(drop=True)
				    return result_df
			
 
				+
			
 
				+
			
 
				def extract_temp_by_datetime_pattern(pdf_path):
				    """Third extraction method: find lines holding a timestamp and a ℃ sign.

				    Pages from the second one onward are read as text. A line is used when
				    it contains a ``YYYY-MM-DD HH:MM`` timestamp and the ``℃`` character.
				    Its whitespace-separated tokens are then read as up to two
				    (date, time, temperature) groups, starting at token 0 and token 3.
				    A group whose date and time do not parse ends the handling of that
				    line.

				    Args:
				        pdf_path: Path of the PDF file to read.

				    Returns:
				        A DataFrame with columns 时间 (datetime) and 温度 (the temperature
				        token with ``℃`` removed, kept as a string), sorted by 时间.
				    """
				    all_data = []
				    datetime_pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')

				    with pdfplumber.open(pdf_path) as pdf:
				        # Start from the second page (index 1).
				        for page in pdf.pages[1:]:
				            text = page.extract_text()
				            if not text:
				                continue

				            for line in text.split('\n'):
				                if not (datetime_pattern.search(line) and '℃' in line):
				                    continue

				                parts = line.split()
				                for start in (0, 3):
				                    if len(parts) < start + 3:
				                        break
				                    time_str = ' '.join(parts[start:start + 2])
				                    try:
				                        timestamp = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
				                    except ValueError:
				                        # A malformed first group also skips the second one.
				                        break
				                    temp_str = parts[start + 2].replace('℃', '')
				                    all_data.append({'时间': timestamp, '温度': temp_str})

				    df = pd.DataFrame(all_data, columns=['时间', '温度'])
				    df = df.sort_values('时间').reset_index(drop=True)
				    return df