Scrapy-Redis distributed strategy

Scrapy-Redis builds on Scrapy and adds more, and more powerful, functionality. Concretely: request deduplication, crawl persistence, and easy distribution across machines.

Suppose we have four machines: Windows 10, Mac OS X, Ubuntu 16.04 and CentOS 7.2. Any one of them can act as the Master or as a Slaver, for example:

Master (core server): the Windows 10 machine. It hosts a Redis database and does no crawling itself; it is only responsible for deduplicating URL fingerprints, handing out Requests, and storing the scraped data.

Slaver (crawler workers): the Mac OS X, Ubuntu 16.04 and CentOS 7.2 machines. They run the spider and, while running, submit the new Requests they generate back to the Master.

A Slaver first takes a task (a Request / URL) from the Master and scrapes the data; any new Requests produced while scraping are handed back to the Master for processing. The Master holds the single Redis database: it deduplicates the unprocessed Requests, assigns tasks, puts the processed Requests into the pending-crawl queue, and stores the scraped data.
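To make the Master's role concrete, here is a minimal sketch (not part of the original article) of peeking at the shared Redis database while the "dangdang" spider defined below is running. The key names assume the scrapy_redis defaults of the form "<spider>:requests", "<spider>:dupefilter" and "<spider>:items":

import redis

# Inspect the shared state on the Master's Redis (default scrapy_redis key names assumed).
r = redis.StrictRedis.from_url("redis://127.0.0.1:6379")
print(r.zcard("dangdang:requests"))    # Requests waiting to be crawled (priority queue stored as a sorted set)
print(r.scard("dangdang:dupefilter"))  # fingerprints of Requests already seen
print(r.llen("dangdang:items"))        # scraped items, only present if scrapy_redis's RedisPipeline is enabled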
This division of labour is exactly the strategy Scrapy-Redis uses by default, and it is very easy to adopt, because Scrapy-Redis already takes care of task scheduling and the rest of the plumbing: all we have to do is inherit from RedisSpider and specify a redis_key.
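In other words, the minimal distributed spider looks roughly like the sketch below (the class name and key are illustrative only; the full Dangdang spider follows later):

from scrapy_redis.spiders import RedisSpider

class MySpider(RedisSpider):
    # No start_urls here: the first URL is pushed into the redis_key list by hand,
    # and every Slaver then pops Requests from the shared scheduler queue in Redis.
    name = "myspider"        # illustrative name
    redis_key = "myspider"   # Redis list the start URL(s) are LPUSHed into

    def parse(self, response):
        pass  # extract data / yield new Requests exactly as in a normal Scrapy spider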
The downside is that the tasks Scrapy-Redis schedules are full Request objects, which carry quite a lot of information (not just the URL, but also the callback, headers and so on). The likely consequences are a slower crawl and heavy use of Redis storage, so keeping the crawl efficient requires reasonably capable hardware.

Dangdang book information crawling example

1. Create the Scrapy project

Use the global startproject command: create a new folder, cd into it, and create a Scrapy project named dangdang.

scrapy startproject dangdang

2. Create the Spider with the project command genspider

scrapy genspider dangdang dangdang.com

3. Send requests, receive responses, and extract the data (the spider)

# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider
from copy import deepcopy


class DangdangSpider(RedisSpider):
    name = "dangdang"
    allowed_domains = ["dangdang.com"]
    # start_urls = ["http://book.dangdang.com/"]
    redis_key = "dangdang"

    def parse(self, response):
        div_list = response.xpath("//div[@class='con flq_body']/div")
        for div in div_list:
            # top-level category
            item = {}
            item["b_cate"] = div.xpath("./dl/dt//text()").extract()
            # middle category
            dl_list = div.xpath("./div//dl[@class='inner_dl']")
            for dl in dl_list:
                item["m_cate"] = dl.xpath("./dt/a/text()").extract_first()
                # small category
                a_list = dl.xpath("./dd/a")
                for a in a_list:
                    item["s_cate"] = a.xpath("./@title").extract_first()
                    item["s_href"] = a.xpath("./@href").extract_first()
                    if item["s_href"] is not None:
                        yield scrapy.Request(  # request the book list page for this small category
                            item["s_href"],
                            callback=self.parse_book_list,
                            meta={"item": deepcopy(item)}
                        )

    def parse_book_list(self, response):
        item = response.meta["item"]
        li_list = response.xpath("//ul[@class='bigimg']/li")
        for li in li_list:
            item["book_title"] = li.xpath("./a/@title").extract_first()
            item["book_href"] = li.xpath("./a/@href").extract_first()
            item["book_detail"] = li.xpath("./p[@class='detail']/text()").extract_first()
            item["book_price"] = li.xpath(".//span[@class='search_now_price']/text()").extract_first()
            item["book_author"] = li.xpath("./p[@class='search_book_author']/span[1]/a/@title").extract_first()
            item["book_publish_date"] = li.xpath("./p[@class='search_book_author']/span[2]/text()").extract_first()
            item["book_press"] = li.xpath("./p[@class='search_book_author']/span[3]/a/@title").extract_first()
            print(item)

4. Set up the item pipeline (pipelines.py)

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BookPipeline(object):
    def process_item(self, item, spider):
        # Strip surrounding whitespace from the scraped fields (the spider above stores
        # the title under "book_title", so that is the key cleaned here).
        item["book_title"] = item["book_title"].strip() if item["book_title"] is not None else None
        item["book_publish_date"] = item["book_publish_date"].strip() if item["book_publish_date"] is not None else None
        print(item)
        # return item

5. Configure settings.py so the scheduling state is kept in Redis

Note that every entry in settings.py can be customized; that means we can override the dedup filter and the scheduler, and decide whether the scraped data should also be stored in Redis (via a pipeline).

# -*- coding: utf-8 -*-

# Scrapy settings for book project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = "book"

SPIDER_MODULES = ["book.spiders"]
NEWSPIDER_MODULE = "book.spiders"

# Enable the scrapy_redis functionality: shared dedup, shared scheduling and persistence
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'book.middlewares.BookSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'book.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'book.pipelines.BookPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
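# Not part of the original project, but relevant to the note above about storing data in
# Redis via a pipeline: scrapy_redis ships a ready-made RedisPipeline that pushes every
# item into the "<spider>:items" list in Redis, so all Slavers write their results back
# to the Master. A hedged sketch of enabling it alongside the project's own pipeline:
#ITEM_PIPELINES = {
#    'book.pipelines.BookPipeline': 300,
#    'scrapy_redis.pipelines.RedisPipeline': 400,
#}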
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

6. Run the crawl: start the Spider with the project command crawl

scrapy crawl dangdang