extract_table.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. import pandas as pd
  2. import pdfplumber
  3. import re
  4. from datetime import datetime
  5. import tabula
  6. ALLOWED_EXTENSIONS = {'pdf'}
  7. def allowed_file(filename):
  8. return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
  9. def safe_filename(filename):
  10. """生成安全的文件名,同时保留中文"""
  11. # 保留中文、字母、数字、下划线和点
  12. keep_chars = (' ', '.', '_', '-')
  13. filename = "".join(c for c in filename if c.isalnum() or c in keep_chars).rstrip()
  14. return filename
  15. def get_pdf_page_count(pdf_path):
  16. with pdfplumber.open(pdf_path) as pdf:
  17. page_count = len(pdf.pages)
  18. return page_count
  19. def extract_temp_time(pdf_path):
  20. """第一种处理方法:基于文本分割的提取"""
  21. cleaned_data = []
  22. with pdfplumber.open(pdf_path) as pdf:
  23. for page in pdf.pages:
  24. text = page.extract_text()
  25. if text:
  26. text_list = text.split("\n")
  27. for txt in text_list:
  28. if ("历史数据表" not in txt) and ("时间" not in txt):
  29. foo = [p for p in re.split(r'\s{1,}', txt.strip()) if p]
  30. if len(foo) < 5:
  31. print(foo)
  32. continue
  33. date_time, name, ids, temp, humi = foo[0] + " " + foo[1], foo[2], foo[3], foo[4], foo[5]
  34. if foo[5] == "--":
  35. humi = ""
  36. cleaned_data.append([date_time, name, ids, temp, humi])
  37. df = pd.DataFrame(
  38. cleaned_data,
  39. columns=['时间', '名称', '编号', '温度', '湿度']
  40. )
  41. df = df.sort_values('时间').reset_index(drop=True)
  42. return df
  43. def extract_pdf_table_to_excel(pdf_path):
  44. """第二种处理方法:基于表格提取"""
  45. cleaned_data = []
  46. with pdfplumber.open(pdf_path) as pdf:
  47. for page in pdf.pages[2:]:
  48. tables = page.extract_table()
  49. if tables:
  50. if len(tables) >= 2:
  51. for table in tables[1:]:
  52. for row in table:
  53. for cell in row.split('\n'):
  54. foo = str(cell).strip().split(" ")
  55. if len(foo) == 4:
  56. date_time, temp, humi = foo[0].replace("/", "-") + " " + foo[1], foo[2], foo[3]
  57. # 拆分日期和时间
  58. cleaned_data.append([date_time, temp, humi])
  59. result_df = pd.DataFrame(
  60. cleaned_data,
  61. columns=['时间', '温度', '湿度']
  62. )
  63. result_df = result_df.sort_values('时间').reset_index(drop=True)
  64. return result_df
  65. def extract_temp_by_datetime_pattern(pdf_path):
  66. """第三种处理方法:基于日期时间模式和温度符号的提取"""
  67. all_data = []
  68. datetime_pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
  69. with pdfplumber.open(pdf_path) as pdf:
  70. # 从第二页开始处理(索引1)
  71. for page in pdf.pages[1:]:
  72. text = page.extract_text()
  73. if not text:
  74. continue
  75. lines = text.split('\n')
  76. for line in lines:
  77. # 检查行是否包含日期时间格式和温度符号
  78. if datetime_pattern.search(line) and '℃' in line:
  79. parts = line.split()
  80. if len(parts) >= 3:
  81. # 提取时间部分
  82. time_str = ' '.join(parts[:2])
  83. try:
  84. # 转换为datetime对象
  85. time = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
  86. # 提取温度值(去掉℃符号)
  87. temp_str = parts[2].replace('℃', '')
  88. try:
  89. # 添加到数据列表
  90. all_data.append({'时间': time, '温度': temp_str})
  91. except ValueError:
  92. continue
  93. except ValueError:
  94. continue
  95. if len(parts) >= 6:
  96. # 提取时间部分
  97. time_str = ' '.join(parts[3:5])
  98. try:
  99. # 转换为datetime对象
  100. time = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
  101. # 提取温度值(去掉℃符号)
  102. temp_str = parts[5].replace('℃', '')
  103. try:
  104. # 添加到数据列表
  105. all_data.append({'时间': time, '温度': temp_str})
  106. except ValueError:
  107. continue
  108. except ValueError:
  109. continue
  110. df = pd.DataFrame(all_data, columns=['时间', '温度'])
  111. df = df.sort_values('时间').reset_index(drop=True)
  112. return df