# settings.py
  1. # Scrapy settings for nmc project
  2. #
  3. # For simplicity, this file contains only settings considered important or
  4. # commonly used. You can find more settings consulting the documentation:
  5. #
  6. # https://docs.scrapy.org/en/latest/topics/settings.html
  7. # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  8. # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  9. BOT_NAME = "nmc"
  10. SPIDER_MODULES = ["nmc.spiders"]
  11. NEWSPIDER_MODULE = "nmc.spiders"
  12. # Crawl responsibly by identifying yourself (and your website) on the user-agent
  13. USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
  14. REACTOR = 'twisted.internet.selectreactor.SelectReactor'
  15. LOG_LEVEL = 'ERROR'
  16. # Obey robots.txt rules
  17. ROBOTSTXT_OBEY = False
  18. # Configure maximum concurrent requests performed by Scrapy (default: 16)
  19. CONCURRENT_REQUESTS = 32
  20. RETRY_TIMES = 3 # 请求失败时重试次数
  21. DOWNLOAD_TIMEOUT = 15 # 默认下载超时时间,单位秒
  22. # Configure a delay for requests for the same website (default: 0)
  23. # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
  24. # See also autothrottle settings and docs
  25. DOWNLOAD_DELAY = 1
  26. RANDOMIZE_DOWNLOAD_DELAY = True
  27. # The download delay setting will honor only one of:
  28. # CONCURRENT_REQUESTS_PER_DOMAIN = 16
  29. # CONCURRENT_REQUESTS_PER_IP = 16
  30. # 配置置日志级别,DEBUG, INFO, WARNING, ERROR等,默认为WARNING
  31. # 配置置日志格式
  32. LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
  33. # 自定义一个文件处理器,将日志写入文件
  34. LOG_FILE = 'logs/scrapy.log' # 指定日志文件路径,相对或绝对路径
  35. # 配置日志处理器,这里只配置了一个文件处理器
  36. LOG_HANDLERS = {
  37. 'file': {
  38. 'class': 'logging.FileHandler',
  39. 'formatter': 'default',
  40. 'filename': LOG_FILE,
  41. },
  42. }
  43. # 定义日志处理器格式化器
  44. LOG_FORMATTERS = {
  45. 'default': {
  46. 'format': LOG_FORMAT,
  47. 'datefmt': '%Y-%m-%d %H:%M:%S',
  48. },
  49. }
  50. # 配置日志处理器字典,定义哪些logger使用哪些处理器
  51. LOGGGER_DELEGATION = {
  52. 'scrapy': {
  53. 'handlers': ['file'],
  54. 'level': LOG_LEVEL,
  55. 'propagate': False, # 不传播给父logger
  56. },
  57. 'twisted.core.engine': { # 可以类似方式配置其他核心组件
  58. 'handlers': ['file'],
  59. 'level': LOG_LEVEL,
  60. 'propagate': False,
  61. },
  62. }
  63. # Disable cookies (enabled by default)
  64. COOKIES_ENABLED = False
  65. # Disable Telnet Console (enabled by default)
  66. # TELNETCONSOLE_ENABLED = False
  67. # Override the default request headers:
  68. DEFAULT_REQUEST_HEADERS = {
  69. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
  70. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  71. 'Accept-Language': 'en',
  72. }
  73. # Enable or disable spider middlewares
  74. # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  75. # SPIDER_MIDDLEWARES = {
  76. # "nmc.middlewares.NmcSpiderMiddleware": 543,
  77. # }
  78. # Enable or disable downloader middlewares
  79. # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  80. # DOWNLOADER_MIDDLEWARES = {
  81. # # "nmc.middlewares.NmcDownloaderMiddleware": 543,
  82. # "nmc.middlewares.SeleniumDownloaderMiddleware": 543,
  83. # 'scrapy.downloadermiddlewares.useragent.UseragentMiddleware': None
  84. # }
  85. # Enable or disable extensions
  86. # See https://docs.scrapy.org/en/latest/topics/extensions.html
  87. # EXTENSIONS = {
  88. # "scrapy.extensions.telnet.TelnetConsole": None,
  89. # }
  90. # Configure item pipelines
  91. # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
  92. ITEM_PIPELINES = {
  93. "nmc.pipelines.NmcPipeline": 300,
  94. }
  95. # Enable and configure the AutoThrottle extension (disabled by default)
  96. # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
  97. # AUTOTHROTTLE_ENABLED = True
  98. # The initial download delay
  99. # AUTOTHROTTLE_START_DELAY = 5
  100. # The maximum download delay to be set in case of high latencies
  101. # AUTOTHROTTLE_MAX_DELAY = 60
  102. # The average number of requests Scrapy should be sending in parallel to
  103. # each remote server
  104. # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
  105. # Enable showing throttling stats for every response received:
  106. # AUTOTHROTTLE_DEBUG = False
  107. # Enable and configure HTTP caching (disabled by default)
  108. # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
  109. # HTTPCACHE_ENABLED = True
  110. # HTTPCACHE_EXPIRATION_SECS = 0
  111. # HTTPCACHE_DIR = "httpcache"
  112. # HTTPCACHE_IGNORE_HTTP_CODES = []
  113. # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
  114. # Set settings whose default value is deprecated to a future-proof value
  115. REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
  116. TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
  117. FEED_EXPORT_ENCODING = "utf-8"