Explorar el Código

add:导出pdf温湿度数据

zoie hace 5 días
padre
commit
652cc5b0ab
Se han modificado 2 ficheros con 201 adiciones y 2 borrados
  1. 68 2
      app.py
  2. 133 0
      extract_table.py

+ 68 - 2
app.py

@@ -1,14 +1,21 @@
 import os
 import sys
 import uuid
+import tempfile
 
-from flask import Flask, request, jsonify
+import pdfplumber
+from flask import Flask, request, jsonify, send_file
 
 from add_signature import find_signature_positions, add_signature_to_pdf
 from add_watermark import add_watermark_to_pdf
+from extract_table import extract_temp_time, extract_pdf_table_to_excel, extract_temp_by_datetime_pattern, allowed_file, \
+    safe_filename
 from lib import Qiniu
+from werkzeug.utils import secure_filename
 
 app = Flask(__name__)
+UPLOAD_FOLDER = tempfile.gettempdir()
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 
 
 @app.route('/add_signature', methods=['POST'])
@@ -93,11 +100,70 @@ def add_watermark():
         }), 500
 
 
+@app.route('/extract_table', methods=['POST'])
+def extract_table():
+    """Convert an uploaded PDF report into an Excel attachment.
+
+    Expects a multipart upload under the ``file`` form field. The text of the
+    first PDF page selects which extractor is run on the file.
+
+    Returns:
+        The generated ``.xlsx`` file as a download on success, or a JSON
+        ``{'error': ...}`` body with status 400 when the upload is missing,
+        has no name, is not a PDF, or matches no known report type.
+    """
+    from io import BytesIO  # local import: only this view needs it
+
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file part'}), 400
+
+    file = request.files['file']
+    if file.filename == '':
+        return jsonify({'error': 'No selected file'}), 400
+    if not allowed_file(file.filename):
+        return jsonify({'error': 'Invalid file type'}), 400
+
+    # The download keeps the sanitized original stem. splitext is used instead
+    # of str.replace('.pdf', ...) because allowed_file() also accepts ".PDF",
+    # for which a case-sensitive replace would leave the name unchanged.
+    stem = os.path.splitext(safe_filename(file.filename))[0] or 'output'
+    output_filename = stem + '.xlsx'
+
+    # Save the upload under a unique name so concurrent requests that carry
+    # the same filename cannot overwrite each other's temporary file.
+    filepath = os.path.join(app.config['UPLOAD_FOLDER'], uuid.uuid4().hex + '.pdf')
+    file.save(filepath)
+
+    # First-page keyword -> extractor. When several keywords are present the
+    # last matching entry wins.
+    extractors = (
+        ("温湿度数据报告", extract_pdf_table_to_excel),
+        ("历史数据表", extract_temp_time),
+        ("设备汇总报告", extract_temp_by_datetime_pattern),
+    )
+
+    df = None
+    try:
+        with pdfplumber.open(filepath) as pdf:
+            # A PDF without pages has no text to classify.
+            text = pdf.pages[0].extract_text() if pdf.pages else None
+
+        extractor = None
+        if text:
+            for keyword, candidate in extractors:
+                if keyword in text:
+                    extractor = candidate
+        if extractor is not None:
+            df = extractor(filepath)
+    finally:
+        # Remove the upload whether extraction succeeded, failed or raised.
+        try:
+            os.remove(filepath)
+        except OSError:
+            pass
+
+    if df is None:
+        return jsonify({'error': '所有处理方法均失败'}), 400
+
+    # Build the workbook in memory so no temporary output file has to be
+    # deleted while the response may still be reading from it.
+    buffer = BytesIO()
+    df.to_excel(buffer, index=False, engine='openpyxl')
+    buffer.seek(0)
+
+    return send_file(
+        buffer,
+        as_attachment=True,
+        download_name=output_filename,
+        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+    )
+
+
 if __name__ == '__main__':
     print("项目地址:", os.path.dirname(__file__))
+    # NOTE(review): exactly one CLI argument is required (the message below
+    # asks for a port number), but its value is not read in this block;
+    # app.run uses the fixed port 6500. Confirm whether it should set the port.
     if len(sys.argv) != 2:
         print("请填写端口号")
         sys.exit()
-    app.debug = True  # Enable debug mode; debug must be turned off in production
+    # app.debug = True  # Enable debug mode; debug must be turned off in production
     # app.config['JSON_AS_ASCII'] = False
+    # NOTE(review): debug=True is still passed here while binding to 0.0.0.0,
+    # although the comment above says debug must be off in production. Confirm.
     app.run(host='0.0.0.0', port=6500, debug=True)

+ 133 - 0
extract_table.py

@@ -0,0 +1,133 @@
+import pandas as pd
+import pdfplumber
+import re
+from datetime import datetime
+
+import tabula
+
+# File extensions (lower-case, without the dot) accepted for upload.
+ALLOWED_EXTENSIONS = {'pdf'}
+
+
+def allowed_file(filename):
+    """Return True when ``filename`` has an extension in ALLOWED_EXTENSIONS.
+
+    The extension is the text after the last dot, compared case-insensitively.
+    A name without any dot is rejected.
+    """
+    _, dot, extension = filename.rpartition('.')
+    if not dot:
+        return False
+    return extension.lower() in ALLOWED_EXTENSIONS
+
+
+def safe_filename(filename):
+    """Return ``filename`` reduced to a conservative character set.
+
+    Characters for which ``str.isalnum`` is true (this includes Chinese
+    characters) are kept, together with space, dot, underscore and hyphen.
+    Everything else is dropped, then trailing whitespace is stripped.
+    """
+    extra_allowed = (' ', '.', '_', '-')
+    kept = [ch for ch in filename if ch.isalnum() or ch in extra_allowed]
+    return "".join(kept).rstrip()
+
+
+def get_pdf_page_count(pdf_path):
+    """Return the number of pages in the PDF at ``pdf_path``."""
+    with pdfplumber.open(pdf_path) as document:
+        return len(document.pages)
+
+
+def extract_temp_time(pdf_path):
+    """Method 1: extract readings by splitting each text line on whitespace.
+
+    Lines containing "历史数据表" or "时间" are skipped (presumably the title
+    and column-header rows). Every other line needs at least six
+    whitespace-separated fields: date, time, name, id, temperature and
+    humidity. Shorter lines are ignored. A humidity of "--" is stored as an
+    empty string.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        A DataFrame with the columns 时间, 名称, 编号, 温度, 湿度, sorted by 时间.
+    """
+    rows = []
+    with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages:
+            text = page.extract_text()
+            if not text:
+                continue
+            for line in text.split("\n"):
+                if "历史数据表" in line or "时间" in line:
+                    continue
+                fields = line.split()
+                # Indexes 0-5 are read below, so six fields are required; a
+                # five-field line would otherwise raise IndexError.
+                if len(fields) < 6:
+                    continue
+                date_time = fields[0] + " " + fields[1]
+                name, ids, temp, humi = fields[2:6]
+                if humi == "--":
+                    humi = ""
+                rows.append([date_time, name, ids, temp, humi])
+
+    df = pd.DataFrame(
+        rows,
+        columns=['时间', '名称', '编号', '温度', '湿度']
+    )
+    df = df.sort_values('时间').reset_index(drop=True)
+    return df
+
+
+def extract_pdf_table_to_excel(pdf_path):
+    """Method 2: extract readings from the table on each page.
+
+    Pages are processed from the third page onward. On each page the first
+    table row is skipped; every remaining cell is split into lines, and each
+    line is split on runs of exactly three spaces. Only lines that yield four
+    fields (date, time, temperature, humidity) are kept. Slashes in the date
+    are replaced with hyphens.
+
+    Despite its name, this function writes no file; it returns a DataFrame.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        A DataFrame with the columns 时间, 温度, 湿度, sorted by 时间.
+    """
+    rows = []
+    with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages[2:]:
+            table = page.extract_table()
+            if not table:
+                continue
+            for table_row in table[1:]:
+                for cell in table_row:
+                    # Empty cells come back as None and have no text to split.
+                    if cell is None:
+                        continue
+                    for entry in cell.split('\n'):
+                        fields = entry.strip().split("   ")
+                        if len(fields) != 4:
+                            continue
+                        date_time = fields[0].replace("/", "-") + " " + fields[1]
+                        rows.append([date_time, fields[2], fields[3]])
+
+    result_df = pd.DataFrame(
+        rows,
+        columns=['时间', '温度', '湿度']
+    )
+    result_df = result_df.sort_values('时间').reset_index(drop=True)
+    return result_df
+
+
+def extract_temp_by_datetime_pattern(pdf_path):
+    """Method 3: extract readings from lines holding a timestamp and "℃".
+
+    Pages are read from the second page onward. A line is considered only if
+    it contains a ``YYYY-MM-DD HH:MM`` timestamp and the "℃" sign. Its
+    whitespace-separated tokens are then read as up to two
+    (date, time, temperature) groups, at token offsets 0 and 3. If a group's
+    timestamp does not parse, the rest of that line is skipped.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        A DataFrame with the columns 时间 (datetime) and 温度 (string with the
+        "℃" sign removed), sorted by 时间.
+    """
+    readings = []
+    stamp_re = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
+
+    with pdfplumber.open(pdf_path) as pdf:
+        # Start from the second page (index 1).
+        for page in pdf.pages[1:]:
+            page_text = page.extract_text()
+            if not page_text:
+                continue
+
+            for raw_line in page_text.split('\n'):
+                if not (stamp_re.search(raw_line) and '℃' in raw_line):
+                    continue
+
+                tokens = raw_line.split()
+                # Each group is three tokens: date, time, temperature.
+                for start in (0, 3):
+                    if len(tokens) < start + 3:
+                        break
+                    stamp_text = ' '.join(tokens[start:start + 2])
+                    try:
+                        stamp = datetime.strptime(stamp_text, '%Y-%m-%d %H:%M')
+                    except ValueError:
+                        # An unparsable timestamp abandons the whole line.
+                        break
+                    value = tokens[start + 2].replace('℃', '')
+                    readings.append({'时间': stamp, '温度': value})
+
+    df = pd.DataFrame(readings, columns=['时间', '温度'])
+    df = df.sort_values('时间').reset_index(drop=True)
+    return df