python - Add data from multiple scrapers to a csv file with same headers
I have 2 scrapers that produce output in the same format: link, description. I want them both to write to the same CSV file, but I can't get the data from both into one file; the file only ends up with the data from one of the scrapers.
I have tried opening the file in 'a' (append) mode when exporting the data, but the file still only reflects the data from the first scraper.
My question is:
How can I add the data from multiple scrapers to the same CSV file in a way that appends each record as a new row?
An example of the structure I want in the CSV:
header: [link, description]
row 1: [link scraper 1, description scraper 1]
row 2: [link scraper 1, description scraper 1]
row 3: [link scraper 2, description scraper 2]
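To illustrate, here is a minimal sketch of the appending behaviour I am after (the helper name, file name, and rows are made up for the example): each call opens the same file in append mode and writes the header only when the file is still empty.

import csv
import os

def append_rows(filename, rows):
    # write the header only if the file does not exist yet or is empty
    write_header = not os.path.exists(filename) or os.stat(filename).st_size == 0
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["link", "description"])
        writer.writerows(rows)

# scraper 1 and scraper 2 should both be able to append to the same file
append_rows('combined.csv', [("link1", "desc1"), ("link2", "desc2")])
append_rows('combined.csv', [("link3", "desc3")])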
Code for scraper 1:
import csv
import requests
from bs4 import BeautifulSoup

outfile = open('deloitteimplementtest.csv', 'a', newline='', encoding='utf-8')
writer = csv.writer(outfile)
writer.writerow(["job_link", "job_desc"])

res = requests.get("http://deloittedk.easycruit.com/?_sp=136ecff9b65625bf.1504382903200&icid=top_").text
soup = BeautifulSoup(res, "lxml")
links = soup.find_all("a", href=True)  # only anchors that actually have an href

for link in links:
    item_link = link.get("href").strip()
    item_text = link.text.replace("view position", "").strip()
    writer.writerow([item_link, item_text])
    print(item_link, item_text)
Code for scraper 2:
import csv
import requests
from bs4 import BeautifulSoup

outfile = open('deloitteimplementtest.csv', 'a', newline='', encoding='utf-8')
writer = csv.writer(outfile)
writer.writerow(["job_link", "job_desc"])

res = requests.get("http://implementconsultinggroup.com/career/#/6257").text
soup = BeautifulSoup(res, "lxml")

for li in soup.find('ul', class_='list-articles list').find_all('li'):
    level = li.find_all('dd', {'class': 'author'})[1].get_text()
    if "graduate" in level:
        links = li.find_all(href=True)
        for link in links:
            if "career" in link.get("href") and 'copenhagen' in link.text:
                item_link = link.get("href").strip()
                item_text = link.text.replace("view position", "").strip()
                writer.writerow([item_link, item_text])
                print(item_link, item_text)
EDIT: my edited code:
#!/usr/bin/env python
import csv
import os
import requests
from bs4 import BeautifulSoup

class CreateCSV:
    def __init__(self, filename):
        try:
            self.csvfile = open(filename, 'a', newline='', encoding='utf-8')
            headers = ['link', 'description']
            self.writer = csv.DictWriter(self.csvfile, delimiter='\t', fieldnames=headers)
            if os.stat(filename).st_size == 0:
                # write the header only once
                self.writer.writeheader()
        except Exception as error:
            print(error)

    def write_row(self, link, desc):
        self.writer.writerow({'link': link, 'description': desc})

    def __del__(self):
        self.csvfile.close()

res = requests.get("http://deloittedk.easycruit.com/?_sp=136ecff9b65625bf.1504382903200&icid=top_").text
soup = BeautifulSoup(res, "lxml")
links = soup.find_all("a", href=True)

# here we create "test10.csv" and use it to append values
outfile = CreateCSV('test10.csv')

for link in links:
    item_link = link.get("href").strip()
    item_text = link.text.replace("view position", "").strip()
    # append the values to "test10.csv"
    outfile.write_row(item_link, item_text)

# remember: the second scraper writes to the same .csv file as the
# first one, so it needs to use the same 'CreateCSV' object,
# in this case "outfile".
res = requests.get("http://implementconsultinggroup.com/career/#/6257").text
soup = BeautifulSoup(res, "lxml")

for li in soup.find('ul', class_='list-articles list').find_all('li'):
    level = li.find_all('dd', {'class': 'author'})[1].get_text()
    if "graduate" in level:
        links = li.find_all(href=True)
        for link in links:
            if "career" in link.get("href") and 'copenhagen' in link.text:
                item_link = link.get("href").strip()
                item_text = link.text.replace("view position", "").strip()
                # use the same 'CreateCSV' object
                outfile.write_row(item_link, item_text)
An easy solution is to create a class that handles the CSV-appending process. The class goes as follows:
import csv
import os

class CreateCSV:
    def __init__(self, filename):
        try:
            self.csvfile = open(filename, 'a', newline='', encoding='utf-8')
            headers = ['link', 'description']
            self.writer = csv.DictWriter(self.csvfile, delimiter='\t', fieldnames=headers)
            if os.stat(filename).st_size == 0:
                # write the header only once
                self.writer.writeheader()
        except Exception as error:
            print(error)

    def write_row(self, link, desc):
        self.writer.writerow({'link': link, 'description': desc})

    def __del__(self):
        self.csvfile.close()
As you can see, it makes use of the os.stat(filename).st_size value so that the CSV header is not written every time you append a new item to the CSV file: the header is only written when the file is still empty.
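For example (a quick sanity check with a throwaway file name, not part of the scrapers above), writing through two separate CreateCSV instances produces one header line followed by both rows:

outfile1 = CreateCSV('demo.csv')
outfile1.write_row('http://example.com/a', 'row from scraper 1')
del outfile1  # __del__ closes and flushes the file

outfile2 = CreateCSV('demo.csv')  # the file is no longer empty, so no second header
outfile2.write_row('http://example.com/b', 'row from scraper 2')
del outfile2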
UPDATE
Now you can combine the two scrapers like so:
#!/usr/bin/env python
import csv
import os
import requests
from bs4 import BeautifulSoup

class CreateCSV:
    def __init__(self, filename):
        try:
            self.csvfile = open(filename, 'a', newline='', encoding='utf-8')
            headers = ['link', 'description']
            self.writer = csv.DictWriter(self.csvfile, delimiter='\t', fieldnames=headers)
            if os.stat(filename).st_size == 0:
                # write the header only once
                self.writer.writeheader()
        except Exception as error:
            print(error)

    def write_row(self, link, desc):
        self.writer.writerow({'link': link, 'description': desc})

    def __del__(self):
        self.csvfile.close()

res = requests.get("http://your_1st_url_here").text
soup = BeautifulSoup(res, "lxml")
links = soup.find_all("a", href=True)

# here we create "test.csv" and use it to append values
outfile = CreateCSV('test.csv')

for link in links:
    item_link = link.get("href").strip()
    item_text = link.text.replace("view position", "").strip()
    # append the values to "test.csv"
    outfile.write_row(item_link, item_text)

# remember: the second scraper writes to the same .csv file as the
# first one, so it needs to use the same 'CreateCSV' object,
# in this case "outfile".
res = requests.get("http://your_2nd_url_here").text
soup = BeautifulSoup(res, "lxml")

for li in soup.find('ul', class_='list-articles list').find_all('li'):
    level = li.find_all('dd', {'class': 'author'})[1].get_text()
    if "graduate" in level:
        links = li.find_all(href=True)
        for link in links:
            if "career" in link.get("href") and 'copenhagen' in link.text:
                item_link = link.get("href").strip()
                item_text = link.text.replace("view position", "").strip()
                # use the same 'CreateCSV' object
                outfile.write_row(item_link, item_text)
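One design note, as a suggestion of my own rather than part of the answer above: relying on __del__ to close the file works, but the moment of garbage collection is not guaranteed, so a context-manager variant of the same class closes the file deterministically:

import csv
import os

class CreateCSV:
    def __init__(self, filename):
        self.csvfile = open(filename, 'a', newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self.csvfile, delimiter='\t',
                                     fieldnames=['link', 'description'])
        if os.stat(filename).st_size == 0:
            self.writer.writeheader()  # header only once, as before

    def write_row(self, link, desc):
        self.writer.writerow({'link': link, 'description': desc})

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # closing here is deterministic, unlike __del__
        self.csvfile.close()

# both scrapers write inside the same 'with' block
with CreateCSV('test.csv') as outfile:
    outfile.write_row('http://example.com/job', 'example description')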