python - I can't add ImagePipeline to my scrapy script -


i'm developing script download images web, i'm in troubles it. know it's developed shit but...i need course using scrapy, here goes:

when throw script, shows in terminal list of urls parsed in "image_urls."

{'image_urls': [u'https://thehackerway.files.wordpress.com/2017/04/1.png?w=595&h=382',             u'https://thehackerway.files.wordpress.com/2017/04/2.png?w=595&h=327',...} 

but image doesn't download. checked prompt/output, , discovered that, in theory, itempipeline wasn't charge

[scrapy] info: enabled item pipelines:[] 

i'm going let code right here, tried hard different setups, wasn't successful. need in this.

from scrapy.linkextractors import linkextractor scrapy.selector import selector scrapy import request scrapy.item import item, field scrapy.spiders import crawlspider, rule scrapy.xlib.pydispatch import dispatcher twisted.internet import reactor scrapy import signals scrapy.utils.project import get_project_settings scrapy.crawler import crawlerprocess scrapy.settings import settings scrapy.pipelines.images import imagespipeline   class hackerwayitem(item):     image_urls = field()     images = field()  class bloggerspider(crawlspider):     name="thehackerway"     start_urls=['https://thehackerway.com']     allowed_domains=['thehackerway.com']     #custom_setting ={     #   'item_pipelines' : 'scrapy.pipelines.images.imagespipeline',     #   'images_store': '/home/bodhidharma/escritorio/practicas/images/',     #   'media_allow_redirects': 'true'     #}     #settings = settings()     #settings = get_project_settings()     #settings.set('item_pipelines':     {'scrapy.pipelines.images.imagespipeline':1})     rules = [rule(linkextractor(allow=['.'], allow_domains=['thehackerway.com']), callback='parse_blog')]  def parse_blog(self, response):     print "link parseado %s" % response.url     hxs = selector(response)     item = hackerwayitem()     #urls = hxs.xpath("//img/@src").extract()     #for u in urls:     #item ['image_urls']=u     item ['image_urls'] = hxs.xpath("//img/@src").extract()     #item ['image_urls']=hxs.xpath("//*[@id='post-3762']/div[2]/p[24]/a/img/@src").extract()     #item ['file_urls']=hxs.xpath("@href").extract()     yield item     #print (self.settings.attributes.values())     #print (item)     #return item # retornando el item.   def catch_item(sender, item, **kwargs):     #print ("item extraido: ")#, item     pass  if __name__ == '__main__':     dispatcher.connect(catch_item, signal=signals.item_passed)     dispatcher.connect(reactor.stop, signal=signals.spider_closed)       spider = bloggerspider()      process = crawlerprocess({ 'user_agent': 'mozilla/4.0 (compatible; msie 7.0; windows nt 5.1)'     })      process.crawl(spider)     print ("\n[+] starting scrapy engine...")     #crawler.start()     #reactor.run()      process.start() 


Comments

Popular posts from this blog

resizing Telegram inline keyboard -

command line - How can a Python program background itself? -

php - "cURL error 28: Resolving timed out" on Wordpress on Azure App Service on Linux -