Retrieve specific links from a web page using Python and BeautifulSoup
I have been trying to retrieve an href link from a page and then use it as a variable to request the next href link. I am stuck at one point: the page has multiple href links with different file extensions (such as zip, md5, etc.), and I need only the file with the zip extension. Here is the code I am trying to implement.
import httplib2
from BeautifulSoup import BeautifulSoup, SoupStrainer

# Index of the path component to extract from each matching href.
# NOTE(review): 11 comes from the original snippet and depends on the
# target site's URL layout -- confirm against the real links.
SEGMENT_INDEX = 11

http = httplib2.Http()
status, response = http.request('http://example.com')

# First pass: parse only the <a> tags of the start page.
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
    if not link.has_key('href') or '/abc' not in link['href']:
        continue
    parts = link['href'].split("/")
    if len(parts) <= SEGMENT_INDEX:
        # Too few path components: skip instead of raising IndexError.
        continue
    basename = parts[SEGMENT_INDEX]
    print(basename)

    # Second pass: follow the extracted name to the next page.
    # The URL is built with %s formatting; the original passed `basename`
    # as the second positional argument of request(), which httplib2
    # interprets as the HTTP method, and never substituted it into the URL.
    status, response = http.request('http://example.com/%s' % basename)
    for sub_link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        if not sub_link.has_key('href') or '/abc' not in sub_link['href']:
            continue
        sub_parts = sub_link['href'].split("/")
        if len(sub_parts) <= SEGMENT_INDEX:
            continue
        print(sub_parts[SEGMENT_INDEX])
Try this:
import os

# your code here

# Walk the <a> tags of the fetched page and keep only links to .zip files.
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
    if not link.has_key('href') or '/abc' not in link['href']:
        continue
    parts = link['href'].split("/")
    if len(parts) <= 11:
        # Too few path components: skip instead of raising IndexError.
        continue
    basename = parts[11]

    # Check the file extension. os.path.splitext returns the extension
    # WITH its leading dot (e.g. '.zip'), so compare against '.zip'.
    filename, file_extension = os.path.splitext(basename)
    print("%s %s" % (basename, file_extension))
    if file_extension.lower() != '.zip':
        # Not a zip archive (e.g. .md5): ignore this link.
        continue

    # last code: from here on, `basename` is guaranteed to be a .zip file
Comments
Post a Comment