nmcspider.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. import io
  2. import os.path
  3. import time
  4. from datetime import datetime
  5. from logging import log
  6. from urllib.parse import urlparse
  7. import scrapy
  8. from PIL import Image
  9. from selenium import webdriver
  10. from selenium.common import TimeoutException
  11. from selenium.webdriver import ActionChains
  12. from selenium.webdriver.chrome.options import Options
  13. from selenium.webdriver.common.by import By
  14. from selenium.webdriver.support import expected_conditions as EC
  15. from selenium.webdriver.support.wait import WebDriverWait
  16. from nmc.compress_image import capture_element_screenshot
  17. def js_is_complete(driver):
  18. return driver.execute_script("return document.readyState") == "complete"
  19. def retry_get_url(driver, url, retries=3, delay=5):
  20. for attempt in range(retries):
  21. try:
  22. driver.get(url)
  23. # 可以在这里添加额外的检查,确认页面是否成功加载
  24. return True # 成功访问URL,退出循环
  25. except TimeoutException:
  26. if attempt < retries - 1: # 如果还没达到最大重试次数
  27. time.sleep(delay) # 等待一段时间后重试
  28. else:
  29. return False # 所有重试都失败了
  30. class NmcspiderSpider(scrapy.Spider):
  31. name = "nmcspider"
  32. allowed_domains = ["www.nmc.cn"]
  33. start_urls = ["http://www.nmc.cn/rest/province"]
  34. def start_requests(self):
  35. yield scrapy.Request(url=self.start_urls[0], callback=self.parse_provinces)
  36. def __init__(self):
  37. chrome_path = r'E:\chromedriver_win32\chromedriver.exe'
  38. chrome_options = Options()
  39. # 添加路径到chrome_optionsmmm
  40. chrome_options = webdriver.ChromeOptions()
  41. chrome_options.add_experimental_option("detach", True)
  42. chrome_options.add_argument('--webdriver-executable-path=' + chrome_path)
  43. # 如果需要添加其他选项,例如禁用浏览器窗口,可以这样:
  44. chrome_options.add_argument('--headless')
  45. chrome_options.add_argument('--disable-gpu')
  46. chrome_options.add_argument('--no-sandbox')
  47. self.driver = webdriver.Chrome(options=chrome_options)
  48. def close(self, reason):
  49. self.driver.quit()
  50. def parse_provinces(self, response):
  51. provinces_data = response.json()
  52. # 遍历省份数据,为每个省份生成新的请求
  53. for province in provinces_data:
  54. # 假设每个省份有url字段,基于此创建新的请求
  55. province_url = province.get('code') # 使用实际的键名,根据实际返回的JSON结构
  56. if province_url:
  57. yield scrapy.Request(url="http://www.nmc.cn/rest/province/" + province_url,
  58. callback=self.parse_province_details)
  59. def parse_province_details(self, response):
  60. provinces_data = response.json()
  61. for provinces in provinces_data:
  62. province_url = provinces.get('url')
  63. province = provinces.get('province')
  64. city = provinces.get('city')
  65. if province_url:
  66. yield scrapy.Request(url="http://www.nmc.cn" + province_url, callback=self.parse,
  67. meta={"province": province, "city": city})
  68. def parse(self, response):
  69. # 省份名称
  70. province = response.meta['province']
  71. # 城市名称
  72. city = response.meta['city']
  73. success = retry_get_url(self.driver, response.url)
  74. if success == False:
  75. self.logger.error(f"{response.url}超过最大重试次数")
  76. else:
  77. # self.driver.get(response.url)
  78. self.driver.set_window_size(1920, 1080)
  79. # 创建文件目录
  80. today_str = datetime.now().strftime("%Y-%m-%d")
  81. path_to_save = os.path.join("resource", province, city, today_str)
  82. full_path = os.path.join(".", path_to_save)
  83. if not os.path.exists(full_path):
  84. os.makedirs(full_path)
  85. # 获取第一张图片-天气预报
  86. element = WebDriverWait(self.driver, 20).until(
  87. EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div')))
  88. self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
  89. location = element.location
  90. size = element.size
  91. # 获取天气预报图
  92. # 使用PIL和numpy处理图像
  93. screenshot = self.driver.get_screenshot_as_png()
  94. image = Image.open(io.BytesIO(screenshot))
  95. crop_area = (location['x'], location['y'], location['x'] + size['width'], location['y'] + size['height'])
  96. cropped_image = image.crop(crop_area)
  97. # 将图像转换为RGB模式,因为JPEG格式需要此模式
  98. cropped_image = cropped_image.convert("RGB")
  99. # 保存裁剪后的图片
  100. url = urlparse(response.url)
  101. parts = url.path.split('/')
  102. target_part = '_'.join([parts[3], parts[4].split('.')[0]])
  103. name = target_part + "_1.png"
  104. jpeg_name = name.rsplit('.', 1)[0] + ".jpg"
  105. jpeg_path = os.path.join(full_path, jpeg_name)
  106. quality = 50
  107. # image_path = os.path.join(full_path, name)
  108. cropped_image.save(jpeg_path, "JPEG", quality=quality)
  109. # 获取第二张图片
  110. highcharts = WebDriverWait(self.driver, 20).until(
  111. EC.presence_of_element_located((By.ID, 'realChart')))
  112. self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'instant'});",
  113. highcharts)
  114. # time.sleep(2)
  115. # 获取屏幕截图
  116. screenshot = self.driver.get_screenshot_as_png()
  117. script = """
  118. var element = document.getElementById('realChart');
  119. var rect = element.getBoundingClientRect();
  120. return {left: rect.left, top: rect.top, width: rect.width, height: rect.height};
  121. """
  122. position_info = self.driver.execute_script(script)
  123. # 获取天气预报图
  124. # 使用PIL和numpy处理图像
  125. image = Image.open(io.BytesIO(screenshot))
  126. crop_area = (
  127. position_info['left'], position_info['top'],
  128. position_info['left'] + position_info['width'],
  129. position_info['top'] + position_info['height'])
  130. cropped_image = image.crop(crop_area)
  131. # 保存裁剪后的图片
  132. name = target_part + "_2.png"
  133. image_path = os.path.join(full_path, name)
  134. cropped_image.save(image_path)
  135. try:
  136. # 第三张图片
  137. # 等待js全部加载完成
  138. js_is_complete(self.driver)
  139. WebDriverWait(self.driver, 20).until(js_is_complete)
  140. highcharts = WebDriverWait(self.driver, 20).until(
  141. EC.presence_of_element_located((By.CSS_SELECTOR, '#container')))
  142. self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'instant'});",
  143. highcharts)
  144. # 找到点击的按钮
  145. WebDriverWait(self.driver, 20).until(
  146. EC.presence_of_element_located((By.CSS_SELECTOR,
  147. '#container> .highcharts-container >svg > g > g > g >g.highcharts-legend-item.highcharts-column-series.highcharts-color-undefined.highcharts-series-0')))
  148. element = self.driver.find_element(By.CSS_SELECTOR,
  149. '#container> .highcharts-container >svg > g > g > g >g.highcharts-legend-item.highcharts-column-series.highcharts-color-undefined.highcharts-series-0')
  150. self.driver.execute_script("arguments[0].dispatchEvent(new Event('click'));",
  151. element)
  152. # 等待点击之后页面加载完成
  153. WebDriverWait(self.driver, 20).until(EC.visibility_of_element_located((By.ID, 'container')))
  154. # time.sleep(2)
  155. # 获取屏幕截图
  156. screenshot = self.driver.get_screenshot_as_png()
  157. scripts = """
  158. var element = document.getElementById('container');
  159. var rect = element.getBoundingClientRect();
  160. return {left: rect.left, top: rect.top, width: rect.width, height: rect.height};
  161. """
  162. position_info = self.driver.execute_script(scripts)
  163. # 获取天气预报图
  164. # 使用PIL和numpy处理图像
  165. image = Image.open(io.BytesIO(screenshot))
  166. crop_area = (
  167. position_info['left'], position_info['top'] + 50,
  168. position_info['left'] + position_info['width'],
  169. position_info['top'] + position_info['height'])
  170. cropped_image = image.crop(crop_area)
  171. # 保存裁剪后的图片
  172. name = target_part + "_3.png"
  173. image_path = os.path.join(full_path, name)
  174. cropped_image.save(image_path)
  175. except TimeoutException:
  176. default_image_path = r"E:\pySpider\nmc\default_image.png" # 这里填写默认图片的路径
  177. backup_image = Image.open(default_image_path)
  178. name = target_part + "_3.png"
  179. image_path = os.path.join(full_path, name)
  180. backup_image.save(image_path)
  181. current_url = self.driver.current_url # 获取当前页面的URL
  182. self.logger.error(
  183. f"在页面 {current_url} 上,在指定时间内未能找到指定元素使用默认图片")