How to append dictionary to csv without appending keys - python

I have a dictionary features = {'feature1': 'hi', 'feature2': 'second feature', 'feature3': 'third feature'} that I need to save to a CSV file. The dictionary is rebuilt on each iteration, and each new dictionary is appended to the existing CSV file. I am using it in Scrapy.
class Myspider(SitemapSpider):
    name = 'spidername'
    sitemap_urls = ['https://www.arabam.com/sitemap/otomobil_1.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
        # ('/category/', 'parse_category'),
    ]

    def parse(self, response):
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # print("hi here")
        features = {}
        features["ad_url"] = response.request.url
        # ... filling the feature dictionary ...
        df = pd.DataFrame.from_dict(features, orient='index')
        df = df.transpose()
        df.to_csv("result.csv", mode='a', index=False)
The problem is that this writes the dictionary keys to the CSV along with the values on every append. I am attaching a picture of the Excel sheet here: [screenshot omitted]
Intuitively, the header should be written only once at the top, not again on every other row. How do I do that?

Let Scrapy write the CSV for you: yield plain dicts as items and configure a CSV feed in custom_settings, so the header is written only once.
class Myspider(SitemapSpider):
    name = 'spidername'
    sitemap_urls = ['https://www.arabam.com/sitemap/otomobil_1.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
        # ('/category/', 'parse_category'),
    ]
    custom_settings = {'FEED_FORMAT': 'csv', 'FEED_URI': 'FILEname.csv'}

    def parse(self, response):
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # print("hi here")
        item = {}
        item["ad_url"] = response.request.url
        yield item
Run it with: scrapy crawl spidername
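If you would rather keep writing the file yourself with pandas, a minimal sketch (the helper name and the path are just illustrative) is to write the header only when the file does not exist yet:
import os
import pandas as pd

def append_row(features, path="result.csv"):
    # build a one-row frame from the dict; write the header only on the first write
    df = pd.DataFrame([features])
    df.to_csv(path, mode='a', index=False, header=not os.path.exists(path))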

Related

Extracting data from web with selenium and inserting it in a pandas dataframe

I have a problem: I cannot take the data I have extracted with Selenium and store it anywhere so I can manipulate or persist it.
I am grabbing the data like so:
try:
    books = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, "titleitem"))
    )
finally:
    driver.quit()
Inside the try block I extract the data like this:
for i, book in enumerate(books):
    splited = books[i].text.split("\n")
    writer = str(splited[0])
    title = str(splited[1])
    publiser = str(splited[2])
    country = str(splited[3])
    ISBN = str(splited[4])
So in the end I have this code to extract exactly the data I want:
try:
    books = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, "titleitem"))
    )
    for i, book in enumerate(books):
        splited = books[i].text.split("\n")
        writer = str(splited[0])
        title = str(splited[1])
        publiser = str(splited[2])
        country = str(splited[3])
        ISBN = str(splited[4])
finally:
    driver.quit()
Those variables are the things I want to grab.
When I print them they look right (as they appear on the website).
But then I try to insert them into a pandas DataFrame like this
(fake_books is declared as a pd.DataFrame()):
tmp = pd.Series({'title': title, 'author': writer, 'publiser': ekdoths})
fake_books = fake_books.append(tmp)
I have also tried a list of dictionaries:
books = [{}]
...
for i, book in enumerate(books):
    splited = books[i].text.split("\n")
    books[i]['writer'] = str(splited[0])
    books[i]['title'] = str(splited[1])
    books[i]['ekdoths'] = str(splited[2])
    books[i]['polh'] = str(splited[3])
    books[i]['ISBN'] = str(splited[4])
Neither of those approaches works; the program just hangs and prints an empty DataFrame or list.
I always use this method: build a list of dictionaries, then pass it to pd.DataFrame.
# create an empty list at the beginning of the code
df_list = []

for i, book in enumerate(books):
    splited = books[i].text.split("\n")
    writer = str(splited[0])
    title = str(splited[1])
    publiser = str(splited[2])
    country = str(splited[3])
    ISBN = str(splited[4])
    # add the scraped data into a dictionary, then append it to df_list
    df_list.append({"writer": writer, "title": title, "publiser": publiser, "country": country, "ISBN": ISBN})

# at the end of your code, after scraping everything you want
df = pd.DataFrame(df_list)
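For completeness, a minimal sketch wiring the Selenium wait from the question to this list-of-dicts pattern (it assumes driver is already created and that the page exposes elements with the titleitem id; the output file name is just an example):
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

df_list = []
try:
    books = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, "titleitem"))
    )
    for book in books:
        splited = book.text.split("\n")
        # one dictionary per book element
        df_list.append({"writer": splited[0], "title": splited[1],
                        "publiser": splited[2], "country": splited[3],
                        "ISBN": splited[4]})
finally:
    driver.quit()

df = pd.DataFrame(df_list)
df.to_csv("books.csv", index=False)  # example output path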

divide list of elements in scrapy output into separate rows

I am trying to separate the output from Scrapy into separate rows in an Excel file, but I get something like this: [screenshot of the current output omitted]
In other words, each variant id, price and name should be placed on a separate row in Excel.
I am using the scrapy-xlsx 0.1.1 library to export the output to an xlsx file (it cannot be csv).
Please tell me where the issue is.
import scrapy
from ..items import ZooplusItem
import re

class ZooplusDeSpider(scrapy.Spider):
    name = 'zooplus_de'
    allowed_domains = ['zooplus.de']
    start_urls = ['https://www.zooplus.de/shop/hunde/hundefutter_trockenfutter/diaetfutter']

    def parse(self, response):
        for link in response.css('.MuiGrid-root.MuiGrid-container.MuiGrid-spacing-xs-2.MuiGrid-justify-xs-flex-end'):
            items = ZooplusItem()
            redirect_urls = response.request.meta.get('redirect_urls')
            items['url'] = redirect_urls[0] if redirect_urls else response.request.url
            items['product_url'] = link.css('.MuiGrid-root.product-image a::attr(href)').getall()
            items['title'] = link.css('h3 a::text').getall()
            items['id'] = link.css('h3 a::attr(id)').getall()
            items['review'] = link.css('span.sc-fzoaKM.kVcaXm::text').getall()
            items['review'] = re.sub(r'\D', " ", str(items['review']))
            items['review'] = items['review'].replace(" ", "")
            # items['review'] = int(items['review'])
            items['rate'] = len(link.css('a.v3-link i[role=full-star]'))
            items['variant_id'] = [i.strip().split('/n') for i in link.css('.jss114.jss115::text').extract()]
            items['variant_name'] = [i.strip().split('/n') for i in link.css('.sc-fzqARJ.cHdpSy:not(.jss114.jss115)::text').extract()]
            items['variant_price'] = [i.strip().split('/n') for i in link.css('div.product__prices_col meta::attr(content)').extract()]
            yield items
If you want to store all the variants with common information duplicated, then you need to loop through each variant and yield that separately. You can copy the common information you've already collected and add to that.
In summary, replace
items['variant_id'] = [i.strip().split('/n') for i in link.css('.jss114.jss115::text').extract()]
items['variant_name'] = [i.strip().split('/n') for i in link.css('.sc-fzqARJ.cHdpSy:not(.jss114.jss115)::text').extract()]
items['variant_price'] = [i.strip().split('/n') for i in link.css('div.product__prices_col meta::attr(content)').extract()]
yield items
with something like
for i in link.css("[data-zta='product-variant']"):
    variant = items.copy()
    variant["variant_id"] = i.attrib["data-variant-id"]
    variant["variant_name"] = "".join(i.css(".title > div::text").getall()).strip()
    variant['variant_price'] = i.css("[itemprop='price']::attr(content)").get()
    yield variant

crawler update data to an array, yield inside a loop

I want to crawl continuously and update an array value in a loop, because I need to click a button to get the next value for the array.
However, the yield inside the loop seems to behave like parallel threads, and the item is yielded many times.
What I want is to go through the loop, update the data, and yield the item only once.
example:
current output:
{'field1': 'data1',
 'filed2': 'data2',
 'field3': ['data31']}
{'field1': 'data1',
 'filed2': 'data2',
 'field3': ['data32']}
{'field1': 'data1',
 'filed2': 'data2',
 'field3': ['data33']}
expected:
{'field1': 'data1',
 'filed2': 'data2',
 'field3': ['data31', 'data32', 'data33']}
Here is my code:
def parse_individual_listings(self, response):
    ...
    data = {}
    data['field1'] = 'data1'
    data['field2'] = 'data2'
    ...
    for i in range(3):
        yield scrapy.Request(
            urlparse.urljoin(response.url, link['href']),  # a different link each time
            callback=self.parse_individual_tabs,
            meta={'data': data, 'i': i},
        )

def parse_individual_tabs(self, response):
    data = response.meta['data']
    i = response.meta['i']
    ...
    # keep populating `data`
    data['field3'][i] = "data3[i]"  # this value changes when I click a button to update
    yield data
Try the inline_requests library (https://pypi.org/project/scrapy-inline-requests/). It lets you make requests inside the same callback, which is useful for collecting data into one object instead of yielding several items. Check this example with some pseudocode:
from inline_requests import inline_requests
from scrapy import Selector

@inline_requests
def parse_individual_listings(self, response):
    ...
    data = {}
    data['field1'] = 'data1'
    data['field2'] = 'data2'
    data['field3'] = []  # initialise the list the loop below appends to
    ...
    for i in range(3):
        extra_req = yield scrapy.Request(
            response.urljoin(link['href']),  # a different link each time
        )
        # apply your logic here, e.g. extract some data
        sel = Selector(text=extra_req.text)
        data['field3'].append(sel.css('some css selector').get())
    yield data
Following the multithreaded / unsynchronized idea, I ended up using a mutex lock in parse_individual_tabs, and it seems to work fine.
from threading import Lock
...
mutex = Lock()
count = 0
...
def parse_individual_tabs(self, response):
    self.mutex.acquire(1)
    try:
        self.count += 1
        data = response.meta['data']
        i = response.meta['i']
        ...
        # keep populating `data`
        data['field3'][i] = "data3[i]"  # this value changes when I click a button to update
    finally:
        self.mutex.release()

    if self.count == 3:  # check if this is the callback of the last yielded Request
        yield data
    else:
        return
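Note that a lock is not strictly necessary in Scrapy, since callbacks all run in a single reactor thread. An alternative sketch, assuming the tab links can be collected up front (the selectors and field names here are illustrative, not taken from the original code), is to chain the requests and yield the item only from the last callback:
def parse_individual_listings(self, response):
    data = {'field1': 'data1', 'field2': 'data2', 'field3': []}
    # hypothetical selector for the three tab links
    links = [response.urljoin(h) for h in response.css('a.tab::attr(href)').getall()[:3]]
    yield scrapy.Request(links[0], callback=self.parse_individual_tabs,
                         meta={'data': data, 'links': links, 'i': 0})

def parse_individual_tabs(self, response):
    data = response.meta['data']
    links = response.meta['links']
    i = response.meta['i']
    data['field3'].append(response.css('some css selector::text').get())
    if i + 1 < len(links):
        # carry the partially filled item along to the next tab
        yield scrapy.Request(links[i + 1], callback=self.parse_individual_tabs,
                             meta={'data': data, 'links': links, 'i': i + 1})
    else:
        # last tab reached: yield the complete item exactly once
        yield data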

When scrapy finishes I want to create a dataframe from all data crawled

I'm trying to scrape some websites. I want to store all the scraped data in a final DataFrame named Tabel_Final. I store each attribute in a separate list and then try to concatenate the lists into the final DataFrame, which I output as CSV to verify the results. I have a different method in the code where I append all scraped data directly to a CSV, but I really need that DataFrame. Any help, please?
This is my code:
import scrapy
import json
import csv
import re
import pandas as pd

name_list = []
category_list = []
type_list = []
model_list = []
model_name_list = []
model_code_list = []
Tabel_Final = pd.DataFrame(columns=['Country', 'Category', 'Type', 'Model', 'Name', 'SKU'])

class QuotesSpider(scrapy.Spider):
    name = "quotes1"

    def start_requests(self):
        with open('input.csv', 'r') as csvf:
            urlreader = csv.reader(csvf, delimiter=',', quotechar='"')
            for url in urlreader:
                if url[0] == "y":
                    yield scrapy.Request(url[1])

    def parse(self, response):
        regex = re.compile(r'"product"\s*:\s*(.+?\})', re.DOTALL)
        regex1 = re.compile(r'"pathIndicator"\s*:\s*(.+?\})', re.DOTALL)
        source_json1 = response.xpath("//script[contains(., 'var digitalData')]/text()").re_first(regex)
        source_json2 = response.xpath("//script[contains(., 'var digitalData')]/text()").re_first(regex1)
        model_code = response.xpath('//script').re_first('modelCode.*?"(.*)"')
        name = response.xpath("//meta[@property='og:country-name']/@content").extract_first()
        source_arr = response.xpath("//script[contains(., 'COUNTRY_SHOP_STATUS')]/text()").extract()
        color = response.xpath("//div[@class='product-details__toggler-info-title']//span[@class='product-details__toggler-selected']/@title").extract()
        if source_json1 and source_json2:
            source_json1 = re.sub(r'//[^\n]+', "", source_json1)
            source_json2 = re.sub(r'//[^\n]+', "", source_json2)
            product = json.loads(source_json1)
            path = json.loads(source_json2)
            product_category = product["pvi_type_name"]
            product_type = product["pvi_subtype_name"]
            product_model = path["depth_5"]
            product_name = product["model_name"]
        if source_json1 and source_json2:
            source1 = source_json1[0]
            source2 = source_json2[0]
            name_list.append(name)
            category_list.append(product_category)
            type_list.append(product_type)
            model_list.append(product_model)
            model_name_list.append(product_name)
            model_code_list.append(model_code)
            with open('output.csv', 'a', newline='') as csvfile:
                fieldnames = ['Country', 'Category', 'Type', 'Model', 'Name', 'SK', 'Color']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                if product_category:
                    writer.writerow({'Country': name, 'Category': product_category, 'Type': product_type, 'Model': product_model, 'Name': product_name, 'SK': model_code, 'Color': color})
        if source_arr:
            categorie = re.findall('product.pvi_type_name.*"(.*)"', source_arr[0])
            tip = re.findall('product.pvi_subtype_name.*"(.*)"', source_arr[0])
            model = re.findall('product.displayName.*"(.*)"', source_arr[0])
            model_nume = re.findall('product.model_name.*"(.*)"', source_arr[0])
            name_list.append(name)
            category_list.append(categorie)
            type_list.append(tip)
            model_list.append(model)
            model_name_list.append(model_nume)
            model_code_list.append(model_code)
            with open('output.csv', 'a', newline='') as csvfile:
                fieldnames = ['Country', 'Category', 'Type', 'Model', 'Name', 'SK', 'Color']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writerow({'Country': name, 'Category': categorie, 'Type': tip, 'Model': model, 'Name': model_nume, 'SK': model_code, 'Color': color})
        Tabel_Final.append(list(zip(name_list, category_list, type_list, model_list, model_name_list, model_code_list)))
        return Tabel_Final
I recommend that you split your codebase:
A scrapy project to fetch the required data and export it in CSV or JSON Lines format
A separate script where you load that output into a Pandas DataFrame and you do whatever you want with it
Otherwise, you should learn how to run Scrapy from a script and refactor your code accordingly. I have a pet project that follows this approach:
I defined a Scrapy pipeline that stores all scraped data into a module variable.
Then I execute the spider from my script as documented and after the scraping finishes I import and read the module variable where I stored the data.
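A minimal sketch of that pipeline idea (module, class and import paths are illustrative, not the actual pet project):
# collector.py -- module-level storage the pipeline writes into
items = []

# pipelines.py
import collector

class CollectItemsPipeline:
    def process_item(self, item, spider):
        collector.items.append(dict(item))  # keep a plain-dict copy of every item
        return item

# run.py -- run the spider from a script, then build the DataFrame
import pandas as pd
from scrapy.crawler import CrawlerProcess

import collector
from myproject.spiders.quotes import QuotesSpider  # illustrative import path

process = CrawlerProcess(settings={
    "ITEM_PIPELINES": {"myproject.pipelines.CollectItemsPipeline": 300},
})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes

Tabel_Final = pd.DataFrame(collector.items)
Tabel_Final.to_csv("Tabel_Final.csv", index=False)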

Export scrapy items to different files

I'm scraping reviews from MOOCs like this one.
From there I'm getting all the course details (5 items) and another 6 items from each review itself.
This is the code I have for the course details:
def parse_reviews(self, response):
    l = ItemLoader(item=MoocsItem(), response=response)
    l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
    l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
    l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
    l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
    l.add_value('course_link', response.url)
    return l.load_item()
Now I want to include the review details, another 5 items for each review.
Since the course data is common for all the reviews I want to store it in a different file and use course name/id to relate the data afterward.
This is the code I have for the review's items:
for review in response.xpath('//*[@class="review-body"]'):
    review_body = review.xpath('.//div[@class="review-body__content"]//text()').extract()
    course_stage = review.xpath('.//*[@class="review-body-info__course-stage--completed"]//text()').extract()
    user_name = review.xpath('.//*[@class="review-body__username"]//text()').extract()
    review_date = review.xpath('.//*[@itemprop="datePublished"]/@datetime').extract()
    score = review.xpath('.//*[@class="sr-only"]//text()').extract()
I tried to work with a temporary solution, returning all the items for each case, but it is not working either:
def parse_reviews(self, response):
    # print response.body
    l = ItemLoader(item=MoocsItem(), response=response)
    # l = MyItemLoader(selector=response)
    l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
    l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
    l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
    l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
    l.add_value('course_link', response.url)
    for review in response.xpath('//*[@class="review-body"]'):
        l.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
        l.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
        l.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
        l.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
        l.add_xpath('score', './/*[@class="sr-only"]//text()')
        yield l.load_item()
The output file from that script is corrupted: cells are displaced and the field sizes are not correct.
EDIT:
I want two files as output:
The first one containing:
course_title,course_description,course_instructors,course_key_concepts,course_link
And the second one with:
course_title,review_body,course_stage,user_name,review_date,score
The issue is you are mixing everything up into a single item, which is not the right way to do it. You should create two items: MoocsItem and MoocsReviewItem.
Then update the code as below:
def parse_reviews(self, response):
    # print response.body
    l = ItemLoader(item=MoocsItem(), response=response)
    l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
    l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
    l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
    l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
    l.add_value('course_link', response.url)
    item = l.load_item()

    for review in response.xpath('//*[@class="review-body"]'):
        r = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
        r.add_value('course_title', item['course_title'])
        r.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
        r.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
        r.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
        r.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
        r.add_xpath('score', './/*[@class="sr-only"]//text()')
        yield r.load_item()

    yield item
Now what you want is for each item type to go into a different CSV file, which is what the SO thread below answers:
How can scrapy export items to separate csv files per item
I have not tested the below, but the code will look something like this:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher

def item_type(item):
    return type(item).__name__.replace('Item', '').lower()  # TeamItem => team

class MultiCSVItemPipeline(object):
    SaveTypes = ['moocs', 'moocsreview']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(CSVDir + name + '.csv', 'w+b')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item
You need to make sure the ITEM_PIPELINES is updated to use this MultiCSVItemPipeline class
ITEM_PIPELINES = {
    'mybot.pipelines.MultiCSVItemPipeline': 300,
}
