divide list of elements in scrapy output into separate rows - python

I am trying to split the Scrapy output into separate rows in an Excel file, but I get something like this instead.
In other words, each variant id, price and name should be placed in its own row in Excel.
I am using the scrapy-xlsx 0.1.1 library to export the output to an xlsx file (it cannot be a csv).
Please tell me where the issue is.
import scrapy
from ..items import ZooplusItem
import re

class ZooplusDeSpider(scrapy.Spider):
    name = 'zooplus_de'
    allowed_domains = ['zooplus.de']
    start_urls = ['https://www.zooplus.de/shop/hunde/hundefutter_trockenfutter/diaetfutter']

    def parse(self, response):
        for link in response.css('.MuiGrid-root.MuiGrid-container.MuiGrid-spacing-xs-2.MuiGrid-justify-xs-flex-end'):
            items = ZooplusItem()
            redirect_urls = response.request.meta.get('redirect_urls')
            items['url'] = redirect_urls[0] if redirect_urls else response.request.url
            items['product_url'] = link.css('.MuiGrid-root.product-image a::attr(href)').getall()
            items['title'] = link.css('h3 a::text').getall()
            items['id'] = link.css('h3 a::attr(id)').getall()
            items['review'] = link.css('span.sc-fzoaKM.kVcaXm::text').getall()
            items['review'] = re.sub(r'\D', " ", str(items['review']))
            items['review'] = items['review'].replace(" ", "")
            #items['review'] = int(items['review'])
            items['rate'] = len(link.css('a.v3-link i[role=full-star]'))
            items['variant_id'] = [i.strip().split('/n') for i in link.css('.jss114.jss115::text').extract()]
            items['variant_name'] = [i.strip().split('/n') for i in link.css('.sc-fzqARJ.cHdpSy:not(.jss114.jss115)::text').extract()]
            items['variant_price'] = [i.strip().split('/n') for i in link.css('div.product__prices_col meta::attr(content)').extract()]
            yield items

If you want to store all the variants with common information duplicated, then you need to loop through each variant and yield that separately. You can copy the common information you've already collected and add to that.
In summary, replace
items['variant_id'] = [i.strip().split('/n') for i in link.css('.jss114.jss115::text').extract()]
items['variant_name'] = [i.strip().split('/n') for i in link.css('.sc-fzqARJ.cHdpSy:not(.jss114.jss115)::text').extract()]
items['variant_price'] = [i.strip().split('/n') for i in link.css('div.product__prices_col meta::attr(content)').extract()]
yield items
with something like
for i in link.css("[data-zta='product-variant']"):
    variant = items.copy()
    variant["variant_id"] = i.attrib["data-variant-id"]
    variant["variant_name"] = "".join(i.css(".title > div::text").getall()).strip()
    variant['variant_price'] = i.css("[itemprop='price']::attr(content)").get()
    yield variant
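With each variant yielded as its own item, the xlsx exporter then writes one row per variant. As a rough sketch of the feed settings (check the scrapy-xlsx README for the exact exporter path; the class name below is an assumption based on its documentation):

# settings.py -- sketch; verify the exporter import path against scrapy-xlsx's docs
FEED_EXPORTERS = {
    'xlsx': 'scrapy_xlsx.XlsxItemExporter',
}
FEED_FORMAT = 'xlsx'
FEED_URI = 'zooplus.xlsx'

Running scrapy crawl zooplus_de with these settings should then produce one Excel row per yielded variant item.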

Related

Extracting data from web with selenium and inserting it in a pandas dataframe

I have a problem: I cannot take the data I have extracted with Selenium and store it anywhere so that I can manipulate it or save it.
I am grabbing the data like so:
try:
    books = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, "titleitem"))
    )
finally:
    driver.quit()
Inside the try block I have extracted the data like this:
for i, book in enumerate(books):
    splited = books[i].text.split("\n")
    writer = str(splited[0])
    title = str(splited[1])
    publiser = str(splited[2])
    country = str(splited[3])
    ISBN = str(splited[4])
So in the end I have this code to extract exactly the data I want:
try:
    books = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, "titleitem"))
    )
    for i, book in enumerate(books):
        splited = books[i].text.split("\n")
        writer = str(splited[0])
        title = str(splited[1])
        publiser = str(splited[2])
        country = str(splited[3])
        ISBN = str(splited[4])
finally:
    driver.quit()
Those variables are the things I want to grab.
When I print them, they look fine (exactly as they appear on the website).
But then I try to insert them into a pandas DataFrame, like this
(fake_books is declared as a pd.DataFrame()):
tmp = pd.Series({'title' : title, 'author': writer, 'publiser': ekdoths})
fake_books = fake_books.append(tmp)
I have also tried a list of dictionaries:
books = [{}]
...
for i, book in enumerate(books):
    splited = books[i].text.split("\n")
    books[i]['writer'] = str(splited[0])
    books[i]['title'] = str(splited[1])
    books[i]['ekdoths'] = str(splited[2])
    books[i]['polh'] = str(splited[3])
    books[i]['ISBN'] = str(splited[4])
Neither of those things works; the program just "lags" and prints an empty DataFrame or list.

I always use this method: I create a list of dictionaries and then pass it into pd.DataFrame.
# create an empty list at the beginning of the code
df_list = []

for i, book in enumerate(books):
    splited = books[i].text.split("\n")
    writer = str(splited[0])
    title = str(splited[1])
    publiser = str(splited[2])
    country = str(splited[3])
    ISBN = str(splited[4])
    # add the scraped data to a dictionary, then append it to df_list
    df_list.append({"writer": writer, "title": title, "publiser": publiser, "country": country, "ISBN": ISBN})

# at the end of your code, after scraping everything you want
df = pd.DataFrame(df_list)
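Building the whole list first and creating the DataFrame once is also faster than growing a DataFrame row by row, since DataFrame.append copies the frame on every call (and has been removed in recent pandas versions). As a small sketch of the final step, assuming you want the result saved to disk (the file name here is just an example):

# build the frame once, then export it
df = pd.DataFrame(df_list)
df.to_csv('books.csv', index=False)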

TypeError: list indices must be integers or slices, not Tag. One of my loops isn't working

The ultimate goal of this is to output selected data columns to a .csv. I had it working once, but then it only got the first table on the page and I needed both. I'm quite new to Python and I don't know how I got to this point in the first place. I need both the call table and the put table, but on the web page the calls come first, and when I used .find I only got the calls. I am working on this with a friend and he wrote the last two functions; he could get the columns I wanted, but we still only get the calls. I tried to fix it, and now it raises the error in the title.
import bs4
import requests
import pandas as pd
import csv
from bs4 import BeautifulSoup

# sets the desired tickers; in the future you could make this list longer
def ticker():
    ticker = ['GME', 'NYMT']
    return ticker

# creates the list of urls for the scraper to grab
def ticker_site():
    ticker_site = ['https://finance.yahoo.com/quote/' + x + '/options?p=' + x for x in ticker()]
    return ticker_site

optionRows = []
for i in range(len(ticker_site())):
    optionRows.append([])

def ticker_gets():
    option_page = ticker_site()
    requested_page = requests.get(option_page[i])
    ticker_soup = BeautifulSoup(requested_page.text, 'html.parser')
    return ticker_soup

def soup_search():
    table = ticker_gets()
    both_tables = table.find_all('table')
    call_table = both_tables[0]
    put_table = both_tables[1]
    call_rows = call_table.find('tr')
    put_rows = put_table.find('tr')
    # makes the call table
    for call in call_rows:
        whole_call_table = call.find_all('td')
        call_row = [y.text for y in whole_call_table]
        optionRows[call].append(call_row)
    # makes the put table
    for put in put_rows:
        whole_put_table = put.find_all('td')
        put_row = [z.text for z in whole_put_table]
        optionRows[put].append(put_row)
    for i in range(len(optionRows)):
        optionRows[i] = optionRows[i][1:len(optionRows[i])]
    return optionRows

def getColumns(columnIndexes=[2, 4, 5]):
    newList = []
    for tickerIndex in range(len(soup_search())):
        newList.append([])
        indexCount = 0
        for j in soup_search()[tickerIndex]:
            newList[tickerIndex].append([])
            for i in columnIndexes:
                newList[tickerIndex][indexCount].append(j[i])
            indexCount += 1
    return newList

def csvOutputer():
    rows = getColumns()
    fields = ["Ticker", "Strike", "Bid", "Ask"]
    with open('newcsv', 'w') as f:
        write = csv.writer(f)
        write.writerow(fields)
        for i in range(len(ticker())):
            for j in rows[i]:
                j.insert(0, ticker()[i])
                write.writerow(j)

csvOutputer()
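For reference, the TypeError in the title comes from using the Tag objects call and put as list indices in optionRows[call] and optionRows[put]; list indices have to be integers. A minimal sketch of the usual pattern (hypothetical stand-in data, not a full fix for this scraper) keeps an integer index alongside each row with enumerate:

rows = ['row one', 'row two']       # stand-in for the parsed <tr> tags
collected = [[] for _ in rows]

for index, row in enumerate(rows):
    # index is an integer, so it can be used to address the outer list
    collected[index].append(row.split())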

How to append dictionary to csv without appending keys

I have a dictionary features = {'feature1': 'hi', 'feature2': 'second feature', 'feature3': 'third feature'}. I need to save it to a csv file, but this dictionary gets renewed in each iteration and the new dictionary is appended to the existing csv file. I am using it in Scrapy.
class Myspider(SitemapSpider):
    name = 'spidername'
    sitemap_urls = ['https://www.arabam.com/sitemap/otomobil_1.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
        # ('/category/', 'parse_category'),
    ]

    def parse(self, response):
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # print("hi here")
        features = {}
        features["ad_url"] = response.request.url
        #filling feature dictionary
        df = pd.DataFrame.from_dict(features, orient='index')
        df = df.transpose()
        df.to_csv("result.csv", mode='a', index=False)
The problem is that this saves the dictionary to the csv along with its keys every time, so the header row is repeated on every append.
Intuitively, the header should be written only once at the top, not again on every other row. How do I do that?
class Myspider(SitemapSpider):
    name = 'spidername'
    sitemap_urls = ['https://www.arabam.com/sitemap/otomobil_1.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
        # ('/category/', 'parse_category'),
    ]
    custom_settings = {'FEED_FORMAT': 'csv', 'FEED_URI': 'FILEname.csv'}

    def parse(self, response):
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # print("hi here")
        item = {}
        item["ad_url"] = response.request.url
        yield item
To run it: scrapy crawl spidername
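If you would rather keep writing the csv yourself with pandas instead of using Scrapy's feed export, a common workaround is to emit the header only when the file does not exist yet. This is only a sketch; the helper name and file path are made up for illustration:

import os
import pandas as pd

def append_row(features, path='result.csv'):
    # the header is written only on the first call, when the file is not there yet
    df = pd.DataFrame([features])
    df.to_csv(path, mode='a', index=False, header=not os.path.exists(path))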

Parsing a JSON using specific key words using Python

I'm trying to parse a JSON feed of a site's stock.
The JSON: https://www.ssense.com/en-us/men/sneakers.json
So I want to take some keywords from the user. Then I want to parse the JSON using these keywords to find the name of the item and (in this specific case) return the ID, SKU and the URL.
So for example:
If I input "Black Fennec", I want to parse the JSON and find the ID, SKU, and URL of the Black Fennec Sneakers (which have an ID of 3297299, a SKU of 191422M237006, and a URL of /men/product/ps-paul-smith/black-fennec-sneakers/3297299).
I have never attempted anything like this before. Based on some guides that show how to parse a JSON, I started out with this:
r = requests.Session()
stock = r.get("https://www.ssense.com/en-us/men/sneakers.json",headers = headers)
obj_json_data = json.loads(stock.text)
However, I am now confused. How do I find the product based on the keywords, and how do I get its ID, URL and SKU?

There are a number of ways to handle the output; I'm not sure what you want to do with it, but this should get you going.
EDIT 1:
import requests

r = requests.Session()
obj_json_data = r.get("https://www.ssense.com/en-us/men/sneakers.json").json()
products = obj_json_data['products']

keyword = input('Enter a keyword: ')

for product in products:
    if keyword.upper() in product['name'].upper():
        name = product['name']
        id_var = product['id']
        sku = product['sku']
        url = product['url']
        print('Product: %s\nID: %s\nSKU: %s\nURL: %s' % (name, id_var, sku, url))
        # if you only want to return the first match, uncomment the next line
        #break
I also have it set up to store the results in a dataframe and/or a list, just to give some options of where to go with it.
import requests
import pandas as pd

r = requests.Session()
obj_json_data = r.get("https://www.ssense.com/en-us/men/sneakers.json").json()
products = obj_json_data['products']

keyword = input('Enter a keyword: ')
products_found = []
results = pd.DataFrame()

for product in products:
    if keyword.upper() in product['name'].upper():
        name = product['name']
        id_var = product['id']
        sku = product['sku']
        url = product['url']
        temp_df = pd.DataFrame([[name, id_var, sku, url]], columns=['name', 'id', 'sku', 'url'])
        results = results.append(temp_df)
        products_found.append(name)  # list.append mutates in place; reassigning its result would set the list to None
        print('Product: %s\nID: %s\nSKU: %s\nURL: %s' % (name, id_var, sku, url))

if products_found == []:
    print('Nothing found')
EDIT 2: Here is another way to do it, by converting the JSON to a dataframe and then filtering to the rows that contain the keyword in the name (this is actually the better solution, in my opinion):
import requests
import pandas as pd
from pandas.io.json import json_normalize

r = requests.Session()
obj_json_data = r.get("https://www.ssense.com/en-us/men/sneakers.json").json()
products = obj_json_data['products']
products_df = json_normalize(products)

keyword = input('Enter a keyword: ')
products_found = []
results = pd.DataFrame()

results = products_df[products_df['name'].str.contains(keyword, case=False)]
#print (results[['name', 'id', 'sku', 'url']])
products_found = list(results['name'])

if products_found == []:
    print('Nothing found')
else:
    print('Found: ' + str(products_found))
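Note that in current pandas versions json_normalize is exposed directly on the top-level namespace (the pandas.io.json import has since been deprecated), so the same step can be written as:

import pandas as pd

# same normalization step, using the top-level function
products_df = pd.json_normalize(products)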

Xpath field repetition

I want to put the scraped values into data and then insert them into MySQL, but the resulting content field is repeated on every row. I think the logic of my for loops is wrong, so how should I change it?
def get_user_data(self, start_url):
    html = requests.get(url=start_url, headers=self.headers, cookies=self.cookies).content
    selector = etree.fromstring(html, etree.HTMLParser(encoding='utf-8'))
    contents = selector.xpath('//span[@class="ctt"]/text()')
    times = selector.xpath('//span[@class="ct"]/text()')
    data = {}
    for each_text in contents:
        data['content'] = each_text
        for each_time in times:
            month_day, time, device = each_time.split(maxsplit=2)
            data['mobile_phone'] = device
            data['create_time'] = month_day + '\n' + time
            data['crawl_time'] = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            self.mysql.insert(data)
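One common way to stop the content field from repeating, assuming the ctt and ct nodes line up one-to-one, is to pair the two lists with zip and build one row per pair. This is only a sketch of that idea, not the poster's confirmed fix:

def get_user_data(self, start_url):
    html = requests.get(url=start_url, headers=self.headers, cookies=self.cookies).content
    selector = etree.fromstring(html, etree.HTMLParser(encoding='utf-8'))
    contents = selector.xpath('//span[@class="ctt"]/text()')
    times = selector.xpath('//span[@class="ct"]/text()')
    # pair each content with its matching time node and insert one row per post
    for each_text, each_time in zip(contents, times):
        month_day, time, device = each_time.split(maxsplit=2)
        data = {
            'content': each_text,
            'mobile_phone': device,
            'create_time': month_day + '\n' + time,
            'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }
        self.mysql.insert(data)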
