python - Add data from multiple scrapers to a CSV file with same headers


I have 2 scrapers that provide output in the same format: link, description. I want them both to write to one CSV file, but I can't get the data into the same CSV file; it only contains the data from one of the scrapers.

I have tried using 'a' to append to the file when exporting the data, but the file still only reflects the data from the first scraper.
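For reference, the append pattern both scripts share looks roughly like this (a stripped-down sketch; the full scripts are below, and the example row is illustrative):

import csv

# both scripts open the same file in append mode...
outfile = open('deloitteimplementtest.csv', 'a')
writer = csv.writer(outfile)
# ...and each one writes the header row again before its own data
writer.writerow(["job_link", "job_desc"])
writer.writerow(["http://example.com/some-job", "some description"])  # illustrative row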

My question is:

How can I add the data from multiple scrapers to the same CSV file so that each new record is appended as a new row?

Example of the structure I want in the CSV:

header: [link, description]
row 1:  [link scraper 1, description scraper 1]
row 2:  [link scraper 1, description scraper 1]
row 3:  [link scraper 2, description scraper 2]

Code for scraper 1:

import csv
import requests
from bs4 import BeautifulSoup

outfile = open('deloitteimplementtest.csv', 'a')
writer = csv.writer(outfile)
writer.writerow(["job_link", "job_desc"])  # header row is written on every run

res = requests.get("http://deloittedk.easycruit.com/?_sp=136ecff9b65625bf.1504382903200&icid=top_").text
soup = BeautifulSoup(res, "lxml")
links = soup.find_all("a")

for link in links:
    item_link = link.get("href").strip()
    item_text = link.text.replace("view position", "").encode('utf-8').strip()
    writer.writerow([item_link, item_text])
    print(item_link, item_text)

Code for scraper 2:

import csv
import requests
from bs4 import BeautifulSoup

outfile = open('deloitteimplementtest.csv', 'a')
writer = csv.writer(outfile)
writer.writerow(["job_link", "job_desc"])  # header row is written on every run

res = requests.get("http://implementconsultinggroup.com/career/#/6257").text
soup = BeautifulSoup(res, "lxml")
links = soup.find_all("a")

for li in soup.find('ul', class_='list-articles list').find_all('li'):
    level = li.find_all('dd', {'class': 'author'})[1].get_text()
    if "graduate" in level:
        links = li.find_all(href=True)
        for link in links:
            if "career" in link.get("href") and 'copenhagen' in link.text:
                item_link = link.get("href").strip()
                item_text = link.text.replace("view position", "").encode('utf-8').strip()
                writer.writerow([item_link, item_text])
                print(item_link, item_text)

Edited code:

#!/usr/bin/env python

import requests
from bs4 import BeautifulSoup
import csv
import os

class CreateCSV:
    def __init__(self, filename):
        try:
            self.csvfile = open(filename, 'ab')
            headers = ['link', 'description']
            self.writer = csv.DictWriter(self.csvfile, delimiter='\t', fieldnames=headers)
            if os.stat(filename).st_size == 0:  # write the header only once
                self.writer.writeheader()
        except Exception as error:
            print(error)

    def write_row(self, link, desc):
        self.writer.writerow({'link': link, 'description': desc})

    def __del__(self):
        self.csvfile.close()

res = requests.get("http://deloittedk.easycruit.com/?_sp=136ecff9b65625bf.1504382903200&icid=top_").text
soup = BeautifulSoup(res, "lxml")
links = soup.find_all("a")

# here we create "test10.csv", which we use to append the values
outfile = CreateCSV('test10.csv')

for link in links:
    item_link = link.get("href").strip()
    item_text = link.text.replace("view position", "").encode('utf-8').strip()
    # append the values to "test10.csv"
    outfile.write_row(item_link, item_text)

# remember: for the second scraper to write to the same .csv file as the
# first one, we need to use the same 'CreateCSV' object - in this case "outfile"

res = requests.get("http://implementconsultinggroup.com/career/#/6257").text
soup = BeautifulSoup(res, "lxml")
links = soup.find_all("a")

for li in soup.find('ul', class_='list-articles list').find_all('li'):
    level = li.find_all('dd', {'class': 'author'})[1].get_text()
    if "graduate" in level:
        links = li.find_all(href=True)
        for link in links:
            if "career" in link.get("href") and 'copenhagen' in link.text:
                item_link = link.get("href").strip()
                item_text = link.text.replace("view position", "").encode('utf-8').strip()
                # use the same 'CreateCSV' object
                outfile.write_row(item_link, item_text)

An easy solution is to create a class that handles the CSV-appending process. The class is as follows:

import csv
import os

class CreateCSV:
    def __init__(self, filename):
        try:
            self.csvfile = open(filename, 'ab')
            headers = ['link', 'description']
            self.writer = csv.DictWriter(self.csvfile, delimiter='\t', fieldnames=headers)
            if os.stat(filename).st_size == 0:  # write the header only once
                self.writer.writeheader()
        except Exception as error:
            print(error)

    def write_row(self, link, desc):
        self.writer.writerow({'link': link, 'description': desc})

    def __del__(self):
        self.csvfile.close()

As you can see, I make use of the st_size attribute returned by os.stat so that the CSV header is not written every time we append a new item to the CSV file; it is only written when the file is still empty.
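For example, if a script using this class runs twice against the same file, the header ends up in the file only once (the file name and row values here are illustrative, not from the scrapers above):

# first run: 'jobs.csv' does not exist yet or is empty, so the header is written
outfile = CreateCSV('jobs.csv')
outfile.write_row('http://example.com/a', 'first job')

# any later run (or a second scraper reusing the same object) appends rows only,
# because os.stat('jobs.csv').st_size is no longer 0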


UPDATE

Now you can combine your scrapers like so:

#!/usr/bin/env python

import requests
from bs4 import BeautifulSoup
import csv
import os

class CreateCSV:
    def __init__(self, filename):
        try:
            self.csvfile = open(filename, 'ab')
            headers = ['link', 'description']
            self.writer = csv.DictWriter(self.csvfile, delimiter='\t', fieldnames=headers)
            if os.stat(filename).st_size == 0:  # write the header only once
                self.writer.writeheader()
        except Exception as error:
            print(error)

    def write_row(self, link, desc):
        self.writer.writerow({'link': link, 'description': desc})

    def __del__(self):
        self.csvfile.close()

res = requests.get("http://your_1st_url_here").text
soup = BeautifulSoup(res, "lxml")
links = soup.find_all("a")

# here we create "test.csv", which we use to append the values
outfile = CreateCSV('test.csv')

for link in links:
    item_link = link.get("href").strip()
    item_text = link.text.replace("view position", "").encode('utf-8').strip()
    # append the values to "test.csv"
    outfile.write_row(item_link, item_text)

# remember: for the second scraper to write to the same .csv file as the
# first one, we need to use the same 'CreateCSV' object - in this case "outfile"

res = requests.get("http://your_2nd_url_here").text
soup = BeautifulSoup(res, "lxml")
links = soup.find_all("a")

for li in soup.find('ul', class_='list-articles list').find_all('li'):
    level = li.find_all('dd', {'class': 'author'})[1].get_text()
    if "graduate" in level:
        links = li.find_all(href=True)
        for link in links:
            if "career" in link.get("href") and 'copenhagen' in link.text:
                item_link = link.get("href").strip()
                item_text = link.text.replace("view position", "").encode('utf-8').strip()
                # use the same 'CreateCSV' object
                outfile.write_row(item_link, item_text)
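A side note: the code above targets Python 2 (the 'ab' file mode and the .encode('utf-8') calls). A minimal Python 3 version of the same class, as a sketch under that assumption rather than part of the original answer, would be:

import csv
import os

class CreateCSV:
    def __init__(self, filename):
        # newline='' keeps the csv module from inserting blank lines on Windows
        self.csvfile = open(filename, 'a', newline='')
        headers = ['link', 'description']
        self.writer = csv.DictWriter(self.csvfile, delimiter='\t', fieldnames=headers)
        if os.stat(filename).st_size == 0:  # write the header only once
            self.writer.writeheader()

    def write_row(self, link, desc):
        self.writer.writerow({'link': link, 'description': desc})

    def close(self):
        self.csvfile.close()

With this version you would also drop the .encode('utf-8') calls in the scrapers, since the csv module in Python 3 works with str directly.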
