import io
import os.path

import scrapy
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
class NmcspiderSpider(scrapy.Spider):
    """Scrape weather-forecast chart images from www.nmc.cn.

    Flow: province-list JSON -> per-province station JSON -> per-station
    page, where a Selenium-driven Chrome renders the page and the
    'forecastChart' element is cropped out of a full-page screenshot and
    saved under ./<province>/<city>/.
    """

    name = "nmcspider"
    allowed_domains = ["www.nmc.cn"]
    start_urls = ["http://www.nmc.cn/rest/province"]

    def __init__(self, *args, **kwargs):
        # Forward to scrapy.Spider so name/kwargs handling stays intact.
        super().__init__(*args, **kwargs)
        # Path to the chromedriver binary.  The original code passed this as
        # a (non-existent) Chrome CLI switch, which Chrome silently ignores;
        # the correct Selenium 4 mechanism is a Service object.
        chrome_path = r'E:\chromedriver_win32\chromedriver.exe'
        chrome_options = Options()
        # Uncomment to run without a visible browser window:
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(
            service=Service(executable_path=chrome_path),
            options=chrome_options,
        )

    def closed(self, reason):
        """Scrapy calls this when the spider finishes; release the browser."""
        self.driver.quit()

    def start_requests(self):
        """Kick off the crawl from the province-list endpoint."""
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse_provinces)

    def parse_provinces(self, response):
        """Fan out one request per province from the province-list JSON."""
        for province in response.json():
            # 'code' keys the per-province detail endpoint.
            code = province.get('code')
            if code:
                yield scrapy.Request(
                    url="http://www.nmc.cn/rest/province/" + code,
                    callback=self.parse_province_details,
                )

    def parse_province_details(self, response):
        """Fan out one request per station/city within a province."""
        for station in response.json():
            station_url = station.get('url')
            if station_url:
                yield scrapy.Request(
                    url="http://www.nmc.cn" + station_url,
                    callback=self.parse,
                    meta={"province": station.get('province'),
                          "city": station.get('city')},
                )

    def parse(self, response):
        """Render the station page in Chrome and save the forecast chart.

        Saves ./<province>/<city>/<city>天气预报图.png.
        """
        province = response.meta['province']
        city = response.meta['city']
        self.driver.set_window_size(1920, 1080)
        self.driver.get(response.url)
        # exist_ok avoids the exists()/makedirs() TOCTOU race.
        full_path = os.path.join(".", province, city)
        os.makedirs(full_path, exist_ok=True)
        # Wait for the chart to be present, then use THAT element (the
        # original looked the element up twice and ignored the waited one).
        element = WebDriverWait(self.driver, 20).until(
            EC.presence_of_element_located((By.ID, 'forecastChart')))
        self.driver.execute_script(
            "arguments[0].scrollIntoView({block: 'center'});", element)
        location = element.location
        size = element.size
        # Crop the chart out of a full-page screenshot.
        # NOTE(review): assumes devicePixelRatio == 1; on HiDPI displays the
        # crop box would need scaling — confirm on the target machine.
        screenshot = self.driver.get_screenshot_as_png()
        image = Image.open(io.BytesIO(screenshot))
        crop_area = (location['x'], location['y'],
                     location['x'] + size['width'],
                     location['y'] + size['height'])
        cropped_image = image.crop(crop_area)
        cropped_image.save(os.path.join(full_path, city + '天气预报图.png'))
|