I have a CSV containing ~600k part numbers to be uploaded to my website's inventory. However, this CSV only contains limited information; we're missing pricing and other related information. To get this information I'm required to make requests to the provider's API and add the results to the CSV. At the moment, I've been splitting this part file into 6 pieces and running the script on each of these files simultaneously. If I run one script it takes hours, whereas if I split it up, it goes considerably faster.
The Process:
Read Partnumber from CSV
Make request
If errors, continue, and notate error
If inventory, write to inventory.csv with ID and warehouse info
Place part info into results.csv
Onto the next one
I was thinking that I could assign each item a unique ID, have the script request that information, go back into the original csv and finally place the information back into the original document.
How can I utilize the full potential of the system I'm running this script on?
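For context on what "full potential" could mean here: each lookup is network-bound, so one way to use more of the machine without splitting the file by hand is to run the requests through a thread pool inside a single process. A minimal sketch, assuming the lookups are independent; fetch_part is a hypothetical helper wrapping the same SOAP call as in the script below:

import csv
from concurrent.futures import ThreadPoolExecutor

def fetch_part(row):
    # Hypothetical wrapper: one SOAP lookup per csv row (client as set up below).
    return client.service.ExactPartLookup('#####', '#####', row[0], row[1])

with open('partfile.csv') as f:
    rows = list(csv.reader(f, delimiter='|'))

# The work spends most of its time waiting on the API, so threads can
# overlap many requests; tune max_workers against the provider's rate limits.
with ThreadPoolExecutor(max_workers=8) as pool:
    results = list(pool.map(fetch_part, rows))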
Here's what I've got so far:
import csv
import zeep

wsdl = '#####'
client = zeep.Client(wsdl=wsdl)

def get_data():
    with open('partfile.csv') as f:
        parts = csv.reader(f, delimiter='|')
        # newline='' keeps the csv module from emitting blank lines on Windows
        with open('results.csv', 'w+', newline='') as outfile:
            with open('inventory.csv', 'w+', newline='') as inventoryfile:
                output = csv.writer(outfile, delimiter=',')
                inventoryoutput = csv.writer(inventoryfile, delimiter=',')
                inventoryoutput.writerow([
                    'ID',
                    'WarehouseNumber',
                    'WarehouseName',
                    'QuantityAvailable'
                ])
                # Header Row
                output.writerow([
                    'ID',
                    'Make',
                    'Part Number',
                    'Price',
                    'Dealer Price',
                    'Retail Price',
                    'List Price',
                    'Core Cost',
                    'Part Description',
                    'Is Discontinued',
                    'Is Dropship Only',
                    'Is Refrigerant',
                    'Is Oversize',
                    'Is Hazmat',
                    'Sub Parts',
                    'Cross Reference Parts',
                    'Log',
                    'Total Inventory'
                ])
                itemId = 0
                for row in parts:
                    try:
                        item = client.service.ExactPartLookup('#####', '#####', row[0], row[1])
                        if item is None:
                            raise Exception('Item is None')
                    except Exception:
                        write_error(row[1])
                        continue
                    item = item.PartInformation_v2[0]
                    totalInventory = 0
                    data = [
                        itemId,
                        item.Make,
                        item.PartNumber,
                        item.Price,
                        item.Dealer,
                        item.Retail,
                        item.List,
                        item.CoreCost,
                        item.PartDescription,
                        item.IsDiscontinued,
                        item.IsDropShipOnly,
                        item.IsRefrigerant,
                        item.IsOversize,
                        item.IsHazmat,
                        item.SubParts,
                        item.CrossReferenceParts,
                        item.Log
                    ]
                    print(item.PartNumber)
                    if item.Inventory is not None:
                        # Iterate the inventory entries directly instead of
                        # keeping a separate manual index
                        for entry in item.Inventory.InventoryInformation_v2:
                            inventoryoutput.writerow([
                                itemId,
                                entry.WarehouseNumber,
                                entry.WarehouseName,
                                entry.QuantityAvailable
                            ])
                            totalInventory += entry.QuantityAvailable
                    data.append(totalInventory)
                    itemId += 1
                    output.writerow(data)

def write_error(partNumber):
    with open("errors.log", "a+") as errorfile:
        errorfile.write("Error! Part Number: " + partNumber + "\n")

get_data()
Please let me know if there is any more information I could provide.
Thank you!
I'm building a web scraper and I'm able to print all the data I need, but I'm struggling to add the data to my csv file. I feel like I need to add another for loop or even a function. Currently I'm able to get it to print one row of scraped data values, but it skips the 64 other rows of data values.
So far I've tried to put in another for loop and break up each variable into its own function, but it just breaks my code. Here's what I have so far; I feel like I'm just missing something.
    #Gets listing box
    listingBox = searchGrid.find_elements(By.CLASS_NAME, 'v2-listing-card')
    #Loops through each listing box
    for listingBoxes in listingBox:
        listingUrl = []
        listingImg = []
        listingTitle = []
        listingPrice = []
        #Gets listing url
        listingUrl = listingBoxes.find_element(By.CSS_SELECTOR, 'a.listing-link')
        print("LISTING URL:", listingUrl.get_attribute('href'))
        #Gets listing image
        listingImg = listingBoxes.find_element(By.CSS_SELECTOR, 'img.wt-position-absolute')
        print("IMAGE:", listingImg.get_attribute('src'))
        #Gets listing title
        listingTitle = listingBoxes.find_element(By.CLASS_NAME, 'wt-text-caption')
        print("TITLE:", listingTitle.text)
        #Gets price
        listingPrice = listingBoxes.find_element(By.CLASS_NAME, 'currency-value')
        print("ITEM PRICE: $", listingPrice.get_attribute("innerHTML"))
        #Gets seller name
        # listingSellerName = listingBoxes.find_element(By.XPATH, '/html/body/main/div/div[1]/div/div[3]/div[8]/div[2]/div[10]/div[1]/div/div/ol/li/div/div/a[1]/div[2]/div[2]/span[3]')
        # print("SELLER NAME:", listingSellerName.get_attribute("innerHTML"))
        print("---------------")
finally:
    driver.quit()

data = {'Listing URL': listingUrl, 'Listing Thumbnail': listingImg, 'Listing Title': listingTitle, 'Listing Price': listingPrice}
df = pd.DataFrame.from_dict(data, orient='index')
df = df.transpose()
df.to_csv('raw_data.csv')
print('Data has been scrapped and added.')
In your code, each loop iteration resets the lists listingUrl, listingImg, etc. That's why df contains only one row of scraped data, corresponding to the last iteration executed. If you want to add elements to a list you have to define the list BEFORE the loop and then use the .append() method inside the loop.
Then, instead of doing listingUrl.get_attribute('href') you will do listingUrl[-1].get_attribute('href'), where [-1] means that you are taking the last element of the list.
listingUrl = []
listingImg = []
listingTitle = []
listingPrice = []

for listingBoxes in listingBox:
    #Gets listing url
    listingUrl.append(listingBoxes.find_element(By.CSS_SELECTOR, 'a.listing-link'))
    print("LISTING URL:", listingUrl[-1].get_attribute('href'))
    #Gets listing image
    listingImg.append(listingBoxes.find_element(By.CSS_SELECTOR, 'img.wt-position-absolute'))
    print("IMAGE:", listingImg[-1].get_attribute('src'))
    #Gets listing title
    listingTitle.append(listingBoxes.find_element(By.CLASS_NAME, 'wt-text-caption'))
    print("TITLE:", listingTitle[-1].text)
    #Gets price
    listingPrice.append(listingBoxes.find_element(By.CLASS_NAME, 'currency-value'))
    print("ITEM PRICE: $", listingPrice[-1].get_attribute("innerHTML"))
I have recently created a Python program that imports my finances from a .csv file and transfers them onto Google Sheets. However, I am struggling to figure out how to fix the names that my bank gives me.
Example:
ME DC SI XXXXXXXXXXXXXXXX NETFLIX should just be NETFLIX,
POS XXXXXXXXXXXXXXXX STEAM PURCHASE should just be STEAM, and so on.
Forgive me if this is a stupid question, as I am a newbie when it comes to coding and I am just looking to use it to automate certain situations in my life.
import csv
from unicodedata import category
import gspread
import time

MONTH = 'June'
# Set month name
file = f'HDFC_{MONTH}_2022.csv'
# the file we need to extract data from
transactions = []
# Create empty list to add data to

def hdfcFin(file):
    '''Create a function that allows us to export data to google sheets'''
    with open(file, mode='r') as csv_file:
        csv_reader = csv.reader(csv_file)
        for row in csv_reader:
            date = row[0]
            name = row[1]
            expense = float(row[2])
            income = float(row[3])
            category = 'other'
            transaction = (date, name, expense, income, category)
            transactions.append(transaction)
    return transactions

sa = gspread.service_account()
# connect json to api
sh = sa.open('Personal Finances')
wks = sh.worksheet(f'{MONTH}')

rows = hdfcFin(file)
for row in rows:
    wks.insert_row([row[0], row[1], row[4], row[2], row[3]], 8)
    time.sleep(2)
    # time delay because of api restrictions
If you don't have a specific format to identify the name, then you can use the logic below, which keeps key/value pairs: if the name matches a key, you replace it with the corresponding value.
d = {'ME DC SI XXXXXXXXXXXXXXXX NETFLIX': 'NETFLIX', 'POS XXXXXXXXXXXXXXXX STEAM PURCHASE': 'STEAM'}
test = 'POS XXXXXXXXXXXXXXXX STEAM PURCHASE'
if test in d:
    test = d[test]
print(test)
Output:
STEAM
If the requirement is to fetch only the last word out of the name, then you can use the logic below.
test='ME DC SI XXXXXXXXXXXXXXXX NETFLIX'
test=test.split(" ")[-1]
print(test)
Output:
NETFLIX
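Since the masked digits in the middle usually vary from row to row, an exact-match dictionary can be brittle. A substring-based lookup is a small, more forgiving variant (a sketch; the keyword list is illustrative):

# Map a keyword that appears anywhere in the raw description to a clean name.
keywords = {'NETFLIX': 'NETFLIX', 'STEAM PURCHASE': 'STEAM'}

def clean_name(raw):
    for keyword, clean in keywords.items():
        if keyword in raw:
            return clean
    return raw  # fall back to the original description

print(clean_name('ME DC SI XXXXXXXXXXXXXXXX NETFLIX'))  # NETFLIX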
I was playing around with the code provided here: https://www.geeksforgeeks.org/update-column-value-of-csv-in-python/ and couldn't seem to figure out how to change the value in a specific column of a row without it bringing up an error.
Say I wanted to change the status of the row belonging to the name Molly Singh, how would I go about it? I've tried the following below, only to get an error and the CSV file turning out empty. I'd also prefer the solution be without the use of pandas, thanks so much.
For example the row in the csv file will originally be
Sno Registration Number Name RollNo Status
1 11913907 Molly Singh RK19TSA01 P
What I want the outcome to be
Sno Registration Number Name RollNo Status
1 11913907 Molly Singh RK19TSA01 N
One more question: if I were to alter the value in the column Sno by doing addition/subtraction etc., how would I go about that as well? Thanks!
Here's the error I get; as you can see, the Name column is changed to True, then False, etc.
import csv

op = open("AllDetails.csv", "r")
dt = csv.DictReader(op)
print(dt)
up_dt = []
for r in dt:
    print(r)
    row = {'Sno': r['Sno'],
           'Registration Number': r['Registration Number'],
           'Name' == "Molly Singh": r['Name'],
           'RollNo': r['RollNo'],
           'Status': 'P'}
    up_dt.append(row)
    print(up_dt)
op.close()

op = open("AllDetails.csv", "w", newline='')
headers = ['Sno', 'Registration Number', 'Name', 'RollNo', 'Status']
data = csv.DictWriter(op, delimiter=',', fieldnames=headers)
data.writerow(dict((heads, heads) for heads in headers))
data.writerows(up_dt)
op.close()
Issues
Your error is because the field name in the input file is misspelled as Regristation rather than Registration.
The correction is to just read the header names from the input file and propagate them to the output file, as below.
Alternately, you can change your code to:
headers = ['Sno', 'Regristation Number', 'Name', 'RollNo', 'Status']
"One more question if I were to alter the value in column Sno by doing addition/subtraction etc how would I go about that as well"
In the code below you would just have something like:
r['Sno'] = int(r['Sno']) + 1  # csv values are strings, so convert first
Code
import csv

with open("AllDetails.csv", "r") as op:
    dt = csv.DictReader(op)
    headers = None
    up_dt = []
    for r in dt:
        # get header of input file
        if headers is None:
            headers = r
        # Change status of 'Molly Singh' record
        if r['Name'] == 'Molly Singh':
            r['Status'] = 'N'
        up_dt.append(r)

with open("AllDetails.csv", "w", newline='') as op:
    # Use headers from input file above
    data = csv.DictWriter(op, delimiter=',', fieldnames=headers)
    data.writerow(dict((heads, heads) for heads in headers))
    data.writerows(up_dt)
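A small refinement, for what it's worth: csv.DictReader already exposes the header row as .fieldnames, and DictWriter has a writeheader() method, so the headers-from-first-row dance can be dropped. A sketch of the same read loop:

with open("AllDetails.csv", "r") as op:
    dt = csv.DictReader(op)
    headers = dt.fieldnames  # header row exactly as read from the file
    up_dt = []
    for r in dt:
        if r['Name'] == 'Molly Singh':
            r['Status'] = 'N'
        up_dt.append(r)

On the write side, data.writeheader() then replaces the manual dict((heads, heads) for heads in headers) row.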
New to Python/Boto3, so this is a little confusing. I am trying to get AWS Security Hub findings written to a csv using csv.writer, but only certain items in the response. I can get the correct columns and rows written to csv; however, when I try to loop through the writer it just repeats the same row, not the other data from the response. I feel like I'm overlooking something simple; any help is appreciated.
def getSecurityHubFindings():
    hub = boto3.client('securityhub')
    findingsList = []
    for key in paginate(hub.get_findings, Filters=filters, PaginationConfig={'MaxItems': MAX_ITEMS}):
        scantype = key['Types']
        str1 = ''.join(scantype)
        port = key['ProductFields']['attributes:2/value']
        vgw = key['ProductFields']['attributes:3/value']
        findingAccountId = key['AwsAccountId']
        findingLastObservedAt = key['LastObservedAt']
        findingFirstObservedAt = key['FirstObservedAt']
        findingCreatedAt = key['CreatedAt']
        findingrecommendation = key['Remediation']['Recommendation']
        findingTypes = key['Types']
        InstanceId = key['Resources'][0]['Id']
        findingInstanceId = str(InstanceId)
        findingAppCode = key['Resources'][0]['Tags']['AppCode']
        findingGeneratorId = key['GeneratorId']
        findingProductArn = key['ProductArn']
        findingTitle = key['Title']
        findingsList.append(key)
        if (str1 == 'Software and Configuration Checks/AWS Security Best Practices/Network Reachability - Recognized port reachable from a Peered VPC'):
            vgw = ''
            port = key['ProductFields']['attributes:4/value']
            peeredvpc = key['ProductFields']['attributes:2/value']
        if (str1 == 'Software and Configuration Checks/AWS Security Best Practices/Network Reachability - Recognized port reachable from a Virtual Private Gateway'):
            peeredvpc = ''
        sev = key['Severity']['Product']
        if (sev == 3):
            findingSeverity = 'LOW'
        elif (sev == 6):
            findingSeverity = 'MEDIUM'
        elif (sev == 9):
            findingSeverity = 'HIGH'
        rows = [findingAccountId, findingGeneratorId, findingTitle, findingProductArn, findingSeverity, findingAppCode, findingFirstObservedAt, findingLastObservedAt, findingCreatedAt, findingrecommendation, findingTypes, port, vgw, peeredvpc, findingInstanceId]
        columns = ('Account ID', 'Generator ID', 'Title', 'Product ARN', 'Severity', 'AppCode', 'First Observed At', 'Last Observed At', 'Created At', 'Recommendation', 'Types', 'Port', 'VGW', 'Peered VPC', 'Instance #ID')
        with open(FILE_NAME, mode='w', newline='') as writefile:
            writefile_writer = csv.writer(writefile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            writefile_writer.writerow(columns)
            i = 0
            while i < MAX_ITEMS:
                writefile_writer.writerow(rows)
                i += 1
    return findingsList
The general flow should be:
def getSecurityHubFindings():
    ...
    # Open output file and write header
    columns = ('Account ID', 'Generator ID', 'Title', 'Product ARN', 'Severity', 'AppCode', 'First Observed At', 'Last Observed At', 'Created At', 'Recommendation', 'Types', 'Port', 'VGW', 'Peered VPC', 'Instance #ID')
    with open(FILE_NAME, mode='w', newline='') as writefile:
        writefile_writer = csv.writer(writefile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writefile_writer.writerow(columns)
        ## Loop through response
        for key in paginate(...):
            ...
            # (get data here)
            ...
            # Write output
            row = [findingAccountId, findingGeneratorId, findingTitle, findingProductArn, findingSeverity, findingAppCode, findingFirstObservedAt, findingLastObservedAt, findingCreatedAt, findingrecommendation, findingTypes, port, vgw, peeredvpc, findingInstanceId]
            writefile_writer.writerow(row)
You are opening your file within the for loop every time with the 'w' option, which truncates the file [1] and writes from the beginning, so you're overwriting your csv each time.
The block
while i < MAX_ITEMS:
    writefile_writer.writerow(rows)
    i += 1
also seems wrong: this just writes the same row (even though it's called rows) MAX_ITEMS times. You probably want to open your csv file and write the header names outside of the for loop, and then write a single row for each iteration of the for loop.
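As a side note, the severity if/elif chain in the question just maps scores to labels, so a dictionary lookup does the same thing in one line; a small sketch (the 'UNKNOWN' fallback is an assumption, since the original leaves other scores unhandled):

# Map Security Hub product severity scores to labels.
severity_labels = {3: 'LOW', 6: 'MEDIUM', 9: 'HIGH'}
findingSeverity = severity_labels.get(sev, 'UNKNOWN')  # fallback value is an assumption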
I made an improvement to my code according to this suggestion from #paultrmbrth. What I need is to scrape data from pages that are similar to this and this one, and I want the csv output to be like the picture below.
But my code's csv output is a little messy, like this:
I have two questions. Is there any way that the csv output can be like the first picture? And my second question is, I want the movie title to be scraped too. Please give me a hint or provide me a code that I can use to scrape the movie title and the contents.
UPDATE
The problem has been solved by Tarun Lalwani perfectly. But now, the csv file's header only contains the first scraped url's categories. For example, when I try to scrape this webpage, which has the References, Referenced in, Features, Featured in and Spoofed in categories, and this webpage, which has the Follows, Followed by, Edited from, Edited into, Spin-off, References, Referenced in, Features, Featured in, Spoofs and Spoofed in categories, the csv output file header will only contain the first webpage's categories, i.e. References, Referenced in, Features, Featured in and Spoofed in. So some categories from the 2nd webpage, like Follows, Followed by, Edited from, Edited into and Spoofs, will not be on the output csv file header, and neither will their contents.
Here is the code I used:
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["imdb.com"]
    start_urls = (
        'http://www.imdb.com/title/tt0093777/trivia?tab=mc&ref_=tt_trv_cnn',
        'http://www.imdb.com/title/tt0096874/trivia?tab=mc&ref_=tt_trv_cnn',
    )

    def parse(self, response):
        item = {}
        for cnt, h4 in enumerate(response.css('div.list > h4.li_group'), start=1):
            item['Title'] = response.css("h3[itemprop='name'] a::text").extract_first()
            key = h4.xpath('normalize-space()').get().strip()
            if key in ['Follows', 'Followed by', 'Edited into', 'Spun-off from', 'Spin-off', 'Referenced in',
                       'Featured in', 'Spoofed in', 'References', 'Spoofs', 'Version of', 'Remade as', 'Edited from',
                       'Features']:
                values = h4.xpath('following-sibling::div[count(preceding-sibling::h4)=$cnt]', cnt=cnt).xpath(
                    'string(.//a)').getall(),
                item[key] = values
        yield item
and here is the exporters.py file:
try:
    from itertools import zip_longest
except ImportError:
    from itertools import izip_longest as zip_longest

from scrapy.exporters import CsvItemExporter
from scrapy.conf import settings

class NewLineRowCsvItemExporter(CsvItemExporter):

    def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
        super(NewLineRowCsvItemExporter, self).__init__(file, include_headers_line, join_multivalued, **kwargs)

    def export_item(self, item):
        if self._headers_not_written:
            self._headers_not_written = False
            self._write_headers_and_set_fields_to_export(item)

        fields = self._get_serialized_fields(item, default_value='',
                                             include_empty=True)
        values = list(self._build_row(x for _, x in fields))
        values = [
            (val[0] if len(val) == 1 and type(val[0]) in (list, tuple) else val)
            if type(val) in (list, tuple)
            else (val,)
            for val in values]
        multi_row = zip_longest(*values, fillvalue='')
        for row in multi_row:
            self.csv_writer.writerow([unicode(s).encode("utf-8") for s in row])
What I'm trying to achieve is that I want all these categories to be on the csv output header:
'Follows', 'Followed by', 'Edited into', 'Spun-off from', 'Spin-off', 'Referenced in',
'Featured in', 'Spoofed in', 'References', 'Spoofs', 'Version of', 'Remade as', 'Edited from', 'Features'
Any help would be appreciated.
You can extract the title using the code below:
item = {}
item['Title'] = response.css("h3[itemprop='name'] a::text").extract_first()
For the CSV part you would need to create a feed exporter which can split each row into multiple rows:
from itertools import zip_longest
from scrapy.contrib.exporter import CsvItemExporter

class NewLineRowCsvItemExporter(CsvItemExporter):

    def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
        super(NewLineRowCsvItemExporter, self).__init__(file, include_headers_line, join_multivalued, **kwargs)

    def export_item(self, item):
        if self._headers_not_written:
            self._headers_not_written = False
            self._write_headers_and_set_fields_to_export(item)

        fields = self._get_serialized_fields(item, default_value='',
                                             include_empty=True)
        values = list(self._build_row(x for _, x in fields))
        values = [
            (val[0] if len(val) == 1 and type(val[0]) in (list, tuple) else val)
            if type(val) in (list, tuple)
            else (val,)
            for val in values]
        multi_row = zip_longest(*values, fillvalue='')
        for row in multi_row:
            self.csv_writer.writerow(row)
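To make the row-splitting concrete: if an exported item ends up as {'Title': 'Spaceballs', 'References': ['a', 'b']} (made-up values), the wrapping above turns the columns into ('Spaceballs',) and ['a', 'b'], and zip_longest transposes them into two csv rows, ('Spaceballs', 'a') and ('', 'b'), so each referenced title lands on its own line.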
Then you need to assign the feed exporter in your settings
FEED_EXPORTERS = {
    'csv': '<yourproject>.exporters.NewLineRowCsvItemExporter',
}
Assuming you put the code in the exporters.py file, the output will be as desired.
Edit-1
To set the fields and their order you will need to define FEED_EXPORT_FIELDS in your settings.py
FEED_EXPORT_FIELDS = ['Title', 'Follows', 'Followed by', 'Edited into', 'Spun-off from', 'Spin-off', 'Referenced in',
                      'Featured in', 'Spoofed in', 'References', 'Spoofs', 'Version of', 'Remade as', 'Edited from',
                      'Features']
https://doc.scrapy.org/en/latest/topics/feed-exports.html#std:setting-FEED_EXPORT_FIELDS
One of the easiest ways to set up a csv data format is to clean the data using Excel Power Query. Follow these steps:
1. Open the csv file in Excel.
2. Select all values using Ctrl+A.
3. Click Table on the Insert tab to create a table.
4. After creating the table, click Data on the top menu and select From Table.
5. A new Excel window, the Power Query editor, now opens.
6. Select any column and click Split Column.
7. From Split Column, select By Delimiter.
8. Now select a delimiter such as a comma or a space.
9. As the final step, select the advanced option, which has two choices: split into rows or into columns.
10. You can do all types of data cleaning using these Power Queries; this is the easiest way to set up the data format according to your needs.
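For anyone who would rather do the same split in code, a rough pandas equivalent of steps 6-9 (the file and column names here are made up):

import pandas as pd

df = pd.read_csv('data.csv')  # hypothetical input file
# Split one delimited column into separate columns ("split in columns")
cols = df['combined'].str.split(',', expand=True)
# Or explode the delimited values into separate rows ("split in rows")
rows = df.assign(combined=df['combined'].str.split(',')).explode('combined')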