nmcspider.py

import io
import os.path

import scrapy
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class NmcspiderSpider(scrapy.Spider):
    name = "nmcspider"
    allowed_domains = ["www.nmc.cn"]
    start_urls = ["http://www.nmc.cn/rest/province"]

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse_provinces)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        chrome_path = r'E:\chromedriver_win32\chromedriver.exe'
        chrome_options = Options()
        # To run without a visible browser window, enable:
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        # '--webdriver-executable-path' is not a real Chrome flag; Selenium 4
        # takes the driver path via a Service object instead.
        self.driver = webdriver.Chrome(service=Service(executable_path=chrome_path),
                                       options=chrome_options)
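    # Note: with Selenium >= 4.6, Selenium Manager can usually locate a
    # matching chromedriver on its own, so the explicit Service path above
    # is optional; it is kept here to mirror the original hard-coded setup.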

    def closed(self, reason):
        # Scrapy calls this when the spider closes; quit the shared driver.
        self.driver.quit()

    def parse_provinces(self, response):
        provinces_data = response.json()
        # Iterate over the provinces and issue one request per province.
        for province in provinces_data:
            # Each entry is assumed to carry a 'code' field; adjust the key
            # to whatever the API actually returns.
            province_code = province.get('code')
            if province_code:
                yield scrapy.Request(url="http://www.nmc.cn/rest/province/" + province_code,
                                     callback=self.parse_province_details)

    def parse_province_details(self, response):
        stations_data = response.json()
        for station in stations_data:
            station_url = station.get('url')
            province = station.get('province')
            city = station.get('city')
            if station_url:
                yield scrapy.Request(url="http://www.nmc.cn" + station_url,
                                     callback=self.parse,
                                     meta={"province": province, "city": city})
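    # Assumed response shapes for the two endpoints above (illustrative
    # only; verify against the live API):
    #   /rest/province         -> [{"code": "...", ...}, ...]
    #   /rest/province/<code>  -> [{"url": "/publish/forecast/...",
    #                               "province": "...", "city": "..."}, ...]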

    def parse(self, response):
        # Province and city names passed along from the previous request.
        province = response.meta['province']
        city = response.meta['city']
        self.driver.get(response.url)
        self.driver.set_window_size(1920, 1080)
        # Chart title (currently unused).
        title = response.xpath('//*[@id="realChart"]/div[1]/span[1]/text()').get()
        # Create the output directory ./<province>/<city> if it is missing.
        full_path = os.path.join(".", province, city)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        try:
            # Wait for the forecast chart before measuring it, then scroll
            # it into view so the element is fully rendered on screen.
            element = WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.ID, 'forecastChart')))
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
            location = element.location
            size = element.size
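            # Note: location/size are in CSS pixels while the screenshot is
            # in device pixels, so on HiDPI displays the crop box below will
            # be offset. A possible fix (untested sketch) is to scale by
            # window.devicePixelRatio:
            #   ratio = self.driver.execute_script("return window.devicePixelRatio")
            #   crop_area = tuple(int(v * ratio) for v in crop_area)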
            # Grab the forecast chart: screenshot the page with Selenium,
            # then crop the element's bounding box out with PIL.
            screenshot = self.driver.get_screenshot_as_png()
            image = Image.open(io.BytesIO(screenshot))
            crop_area = (location['x'], location['y'],
                         location['x'] + size['width'], location['y'] + size['height'])
            self.logger.debug("crop area: %s", crop_area)
            cropped_image = image.crop(crop_area)
            # Save the cropped image; the filename suffix '天气预报图'
            # means "weather forecast chart".
            image_path = os.path.join(full_path, city + '天气预报图.png')
            cropped_image.save(image_path)
        finally:
            # The shared driver is quit in closed(), not here.
            print("closing the browser")
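
A minimal way to try the spider outside a full Scrapy project (a sketch: it assumes this file is importable as nmcspider.py and that the chromedriver at the hard-coded path matches the installed Chrome):

# run_nmcspider.py - hypothetical standalone runner, not part of the file above
from scrapy.crawler import CrawlerProcess

from nmcspider import NmcspiderSpider

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(NmcspiderSpider)
process.start()  # blocks until the crawl finishes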