nmcspider.py

import io
import os.path

import scrapy
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class NmcspiderSpider(scrapy.Spider):
    name = "nmcspider"
    allowed_domains = ["www.nmc.cn"]
    start_urls = ["http://www.nmc.cn/rest/province"]

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse_provinces)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        chrome_path = r'E:\chromedriver_win32\chromedriver.exe'
        chrome_options = Options()
        # To run without a visible browser window, enable:
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        # '--webdriver-executable-path' is not a real Chrome flag; Selenium 4
        # takes the driver path via a Service object instead.
        self.driver = webdriver.Chrome(service=Service(executable_path=chrome_path),
                                       options=chrome_options)
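    # Note: with Selenium >= 4.6, Selenium Manager can usually locate a
    # matching chromedriver on its own, so the explicit Service path above
    # is optional; it is kept here to mirror the original hard-coded setup.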

    def closed(self, reason):
        # Scrapy calls this when the spider closes; quit the shared driver.
        self.driver.quit()

    def parse_provinces(self, response):
        provinces_data = response.json()
        # Iterate over the provinces and issue one request per province.
        for province in provinces_data:
            # Each entry is assumed to carry a 'code' field; adjust the key
            # to whatever the API actually returns.
            province_code = province.get('code')
            if province_code:
                yield scrapy.Request(url="http://www.nmc.cn/rest/province/" + province_code,
                                     callback=self.parse_province_details)

    def parse_province_details(self, response):
        stations_data = response.json()
        for station in stations_data:
            station_url = station.get('url')
            province = station.get('province')
            city = station.get('city')
            if station_url:
                yield scrapy.Request(url="http://www.nmc.cn" + station_url,
                                     callback=self.parse,
                                     meta={"province": province, "city": city})
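    # Assumed response shapes for the two endpoints above (illustrative
    # only; verify against the live API):
    #   /rest/province         -> [{"code": "...", ...}, ...]
    #   /rest/province/<code>  -> [{"url": "/publish/forecast/...",
    #                               "province": "...", "city": "..."}, ...]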

    def parse(self, response):
        # Province and city names passed along from the previous request.
        province = response.meta['province']
        city = response.meta['city']
        self.driver.get(response.url)
        self.driver.set_window_size(1920, 1080)
        # Chart title (currently unused).
        title = response.xpath('//*[@id="realChart"]/div[1]/span[1]/text()').get()
        # Create the output directory ./<province>/<city> if it is missing.
        full_path = os.path.join(".", province, city)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        try:
            # Wait for the forecast chart before measuring it, then scroll
            # it into view so the element is fully rendered on screen.
            element = WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.ID, 'forecastChart')))
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
            location = element.location
            size = element.size
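            # Note: location/size are in CSS pixels while the screenshot is
            # in device pixels, so on HiDPI displays the crop box below will
            # be offset. A possible fix (untested sketch) is to scale by
            # window.devicePixelRatio:
            #   ratio = self.driver.execute_script("return window.devicePixelRatio")
            #   crop_area = tuple(int(v * ratio) for v in crop_area)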
            # Grab the forecast chart: screenshot the page with Selenium,
            # then crop the element's bounding box out with PIL.
            screenshot = self.driver.get_screenshot_as_png()
            image = Image.open(io.BytesIO(screenshot))
            crop_area = (location['x'], location['y'],
                         location['x'] + size['width'], location['y'] + size['height'])
            self.logger.debug("crop area: %s", crop_area)
            cropped_image = image.crop(crop_area)
            # Save the cropped image; the filename suffix '天气预报图'
            # means "weather forecast chart".
            image_path = os.path.join(full_path, city + '天气预报图.png')
            cropped_image.save(image_path)
        finally:
            # The shared driver is quit in closed(), not here.
            print("closing the browser")
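
A minimal way to try the spider outside a full Scrapy project (a sketch: it assumes this file is importable as nmcspider.py and that the chromedriver at the hard-coded path matches the installed Chrome):

# run_nmcspider.py - hypothetical standalone runner, not part of the file above
from scrapy.crawler import CrawlerProcess

from nmcspider import NmcspiderSpider

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(NmcspiderSpider)
process.start()  # blocks until the crawl finishes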