Parcourir la source

add:添加第五种表格提取方式

zoie il y a 2 mois
Parent
commit
e0dc1fe6c4
2 fichiers modifiés avec 82 ajouts et 2 suppressions
  1. 4 1
      app.py
  2. 78 1
      extract_table.py

+ 4 - 1
app.py

@@ -9,7 +9,7 @@ from flask import Flask, request, jsonify, send_file
 from add_signature import find_signature_positions, add_signature_to_pdf
 from add_watermark import add_watermark_to_pdf
 from extract_table import extract_temp_time, extract_pdf_table_to_excel, extract_temp_by_datetime_pattern, allowed_file, \
-    safe_filename, extract_temperature_data_from_pdf
+    safe_filename, extract_temperature_data_from_pdf, extract_data_from_pdf_5
 from lib import Qiniu
 from werkzeug.utils import secure_filename
 from flask_cors import CORS
@@ -132,6 +132,9 @@ def extract_table():
                     df = extract_temp_by_datetime_pattern(filepath)
                 if "详细数据" in text:
                     df = extract_temperature_data_from_pdf(filepath)
+                else:
+                    df = extract_data_from_pdf_5(filepath)
+
 
         if df is None:
             os.remove(filepath)

+ 78 - 1
extract_table.py

@@ -132,6 +132,7 @@ def extract_temp_by_datetime_pattern(pdf_path):
     df = df.sort_values('时间').reset_index(drop=True)
     return df
 
+
 def extract_temperature_data_from_pdf(pdf_path):
     """
     从PDF文件中提取时间和温度数据
@@ -157,4 +158,80 @@ def extract_temperature_data_from_pdf(pdf_path):
     df = pd.DataFrame(all_data, columns=['时间', '温度'])
     # 按时间排序
     df = df.sort_values('时间').reset_index(drop=True)
-    return df
+    return df
+
+
+# Fifth extraction strategy
+def extract_data_from_pdf_5(pdf_path):
+    """
+    Extract time, temperature and (when present) humidity rows from a PDF.
+
+    The text of every page is read with pdfplumber and scanned line by line.
+    A line is kept only if it contains a ``YYYY-MM-DD HH:MM`` timestamp and a
+    ``<number>℃`` reading. Lines containing both "第" and "页" (page footers
+    such as "第1页/共3页") and lines containing "时间" (table headers) are
+    skipped.
+
+    Kept lines are split on whitespace and interpreted by token count:
+
+    * 4 tokens  -> one record:  date, time, temperature, <ignored>
+    * 8 tokens  -> two such records side by side
+    * 6 tokens  -> one record:  date, time, temperature, <ignored>,
+      humidity, <ignored>
+    * 12 tokens -> two such records side by side
+
+    Lines with any other token count are dropped.
+    NOTE(review): the ignored tokens are presumably status/flag columns --
+    confirm against a sample PDF.
+
+    :param pdf_path: path of the PDF file to read
+    :return: DataFrame with columns '时间', '温度', '湿度', sorted by '时间'.
+             Values are strings with the "℃" / "%" suffix removed; '湿度' is
+             missing (NaN) for records that carry no humidity reading.
+    """
+    # Compiled once instead of once per line. ``\s+`` (rather than a single
+    # literal space) accepts timestamps whose date and time are separated by
+    # more than one blank, matching the whitespace-tolerant split below.
+    time_pattern = re.compile(r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}")
+    temp_pattern = re.compile(r"\d+\.?\d*℃")
+
+    # Token count of a line -> (tokens per record, record carries humidity).
+    row_layouts = {
+        4: (4, False),
+        8: (4, False),
+        6: (6, True),
+        12: (6, True),
+    }
+
+    cleaned_data = []
+    with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages:
+            page_content = page.extract_text()
+            if not page_content:
+                continue  # skip pages without extractable text
+            for raw_line in page_content.split("\n"):
+                line = raw_line.strip()
+                # Page-number footer, e.g. "第1页/共3页".
+                if "第" in line and "页" in line:
+                    continue
+                # Table header row.
+                if "时间" in line:
+                    continue
+                if not (time_pattern.search(line) and temp_pattern.search(line)):
+                    continue
+
+                # split() without an argument collapses runs of whitespace;
+                # split(" ") produced empty tokens on double spaces, which
+                # shifted the columns or made the row fail the length check.
+                parts = line.split()
+                layout = row_layouts.get(len(parts))
+                if layout is None:
+                    continue  # unknown row shape
+                width, has_humidity = layout
+
+                # A line holds one or two records laid out side by side.
+                for start in range(0, len(parts), width):
+                    record = {
+                        "时间": parts[start] + " " + parts[start + 1],
+                        "温度": parts[start + 2].replace("℃", ""),
+                    }
+                    if has_humidity:
+                        record["湿度"] = parts[start + 4].replace("%", "")
+                    cleaned_data.append(record)
+
+    # Build the DataFrame; a fixed column list keeps the shape stable even
+    # when no row matched.
+    df = pd.DataFrame(cleaned_data, columns=['时间', '温度', '湿度'])
+    # Sort chronologically (timestamps are ISO-like strings, so a plain
+    # string sort orders them correctly).
+    df = df.sort_values('时间').reset_index(drop=True)
+    return df