python - I can't add ImagesPipeline to my Scrapy script
I'm developing a script to download images from the web and I'm having trouble with it. I know the code is badly written, but I need it for a course on Scrapy. Here it goes:
When I run the script, the terminal shows the list of URLs parsed into "image_urls":
{'image_urls': [u'https://thehackerway.files.wordpress.com/2017/04/1.png?w=595&h=382', u'https://thehackerway.files.wordpress.com/2017/04/2.png?w=595&h=327',...}
But the images are never downloaded. I checked the console output and discovered that, apparently, the item pipeline is not being loaded:
[scrapy] INFO: Enabled item pipelines: []
I'm posting the code right here. I have tried several different setups, but none of them worked. I need help with this.
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy import Request
from scrapy.item import Item, Field
from scrapy.spiders import CrawlSpider, Rule
from scrapy.xlib.pydispatch import dispatcher
from twisted.internet import reactor
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.pipelines.images import ImagesPipeline


class HackerWayItem(Item):
    image_urls = Field()
    images = Field()


class BloggerSpider(CrawlSpider):
    name = "thehackerway"
    start_urls = ['https://thehackerway.com']
    allowed_domains = ['thehackerway.com']
    #custom_settings = {
    #    'ITEM_PIPELINES': 'scrapy.pipelines.images.ImagesPipeline',
    #    'IMAGES_STORE': '/home/bodhidharma/escritorio/practicas/images/',
    #    'MEDIA_ALLOW_REDIRECTS': 'True'
    #}
    #settings = Settings()
    #settings = get_project_settings()
    #settings.set('ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1})

    rules = [Rule(LinkExtractor(allow=['.'], allow_domains=['thehackerway.com']),
                  callback='parse_blog')]

    def parse_blog(self, response):
        print "link parseado %s" % response.url
        hxs = Selector(response)
        item = HackerWayItem()
        #urls = hxs.xpath("//img/@src").extract()
        #for u in urls:
        #    item['image_urls'] = u
        item['image_urls'] = hxs.xpath("//img/@src").extract()
        #item['image_urls'] = hxs.xpath("//*[@id='post-3762']/div[2]/p[24]/a/img/@src").extract()
        #item['file_urls'] = hxs.xpath("@href").extract()
        yield item
        #print (self.settings.attributes.values())
        #print (item)
        #return item  # returning the item


def catch_item(sender, item, **kwargs):
    #print ("extracted item: ")  #, item
    pass


if __name__ == '__main__':
    dispatcher.connect(catch_item, signal=signals.item_passed)
    dispatcher.connect(reactor.stop, signal=signals.spider_closed)

    spider = BloggerSpider()
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(spider)
    print ("\n[+] Starting scrapy engine...")
    #crawler.start()
    #reactor.run()
    process.start()
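For reference, the empty "Enabled item pipelines: []" log line suggests the ITEM_PIPELINES setting never reaches the crawler, since the custom_settings block and the Settings() calls above are all commented out. Below is a minimal sketch, not the original code, of one way to register ImagesPipeline by passing the settings directly to CrawlerProcess; the IMAGES_STORE path is a placeholder and BloggerSpider refers to the spider class above.

from scrapy.crawler import CrawlerProcess

# Sketch only: the settings dict passed here is what enables the pipeline.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    # ITEM_PIPELINES maps the pipeline's import path to a priority.
    'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
    # Directory where downloaded images are written (placeholder path).
    'IMAGES_STORE': '/tmp/images',
    # Mirrors the MEDIA_ALLOW_REDIRECTS option from the commented-out attempt.
    'MEDIA_ALLOW_REDIRECTS': True,
})
process.crawl(BloggerSpider)
process.start()

Keep in mind that ImagesPipeline depends on Pillow; if it is not installed, Scrapy disables the pipeline and only logs a warning.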