Sample data:
id, Name, mail, data1, data2, data3
1, Name1, mail#com, abc, 14, de
1, Name1, mail#com, fgh, 25, kl
1, Name1, mail#com, mno, 38, pq
2, Name2, mail#com, abc, 14, d
I wrote a script that uses the first field as a unique key to remove duplicate rows. However, since the data in fields data1-3 is not repeated, the result needs to become:
1, Name1, mail#com, "abc, 14, de, fgh, 25, kl, mno, 38, pq"
How can I merge the rows in the array? My code does not work:
import sys
import csv

in_fln = sys.argv[1]
# You can replace here and choose any delimiter:
csv.register_dialect('dlm', delimiter=',')
csv.register_dialect('dmt', delimiter=';')
# if this is a .csv file, do:
if (in_fln[-3:]) == "csv":
    out_fln = 'out' + in_fln
    inputf = open(in_fln, 'r')
    seen = []
    outfile = []
    nout = {}
    #rowun = []
    try:
        reader = csv.reader(inputf, dialect='dlm')
        # select by ContactID
        for row in reader:
            if row[0] not in seen:
                # it works, but temporarily commented out:
                #rowun = '"' + (row[-4]) + ', ' + (row[-3]) + ', ' + (row[-2]) + '"'
                #outfile.append(row[:-5] + [rowun])
                outfile.append(row[:-4])
                rowun = (row[0])
                nout[rowun] = (row[-4:-1])
                seen.append(row[0])
                print(type(row))
            else:
                #rowun = '"' + (row[-4]) + ', ' + (row[-3]) + ', ' + (row[-2]) + '"'
                #nout.insert(-1, (row[-4:-1]))
                print(type(row))
                rowun = (row[0])
                rowun2 = {rowun: (row[-4:-1])}
                nout.update(rowun2)
    finally:
        #print(nout)
        #print(outfile[:-1])
        #csv.writer(open(('nout' + in_fln), 'w', newline='')).writerows(nout)
        csv.writer(open(out_fln, 'w', newline=''), dialect='dlm').writerows(outfile)
        inputf.close()
        print("All done")
This should do the trick.
from collections import defaultdict
import pandas as pd

# recreate your example
df = pd.DataFrame([[1, 'Name1', 'mail#com', 'abc', 14, 'de'],
                   [1, 'Name1', 'mail#com', 'fgh', 25, 'kl'],
                   [1, 'Name1', 'mail#com', 'mno', 38, 'pq'],
                   [2, 'Name2', 'mail#com', 'abc', 14, 'd']],
                  columns=['id', 'Name', 'mail', 'data1', 'data2', 'data3'])

res = defaultdict(list)
for ind, row in df.iterrows():
    key = (row['id'], row['Name'], row['mail'])
    value = (row['data1'], row['data2'], row['data3'])
    res[key].append(value)

for key, value in res.items():
    print(key, value)

# gives
# (2, 'Name2', 'mail#com') [('abc', 14, 'd')]
# (1, 'Name1', 'mail#com') [('abc', 14, 'de'), ('fgh', 25, 'kl'), ('mno', 38, 'pq')]
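If you want the single combined row from the question, a small follow-up sketch (assuming the res dict built above; the output file name is just a placeholder) flattens each group and lets csv.writer handle the quoting of the merged field:

import csv

with open('merged.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for (id_, name, mail), values in res.items():
        # flatten [('abc', 14, 'de'), ('fgh', 25, 'kl'), ...] into one string
        merged = ', '.join(str(v) for triple in values for v in triple)
        writer.writerow([id_, name, mail, merged])

# csv.writer quotes the merged column automatically because it contains commas,
# e.g.: 1,Name1,mail#com,"abc, 14, de, fgh, 25, kl, mno, 38, pq"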
My own version ended up very close to the answer above. Now everything works!
#!/usr/bin/env python3
import csv, re
import os, sys

in_fln = sys.argv[1]
# You can replace here and choose any delimiter:
#csv.register_dialect('dlm', delimiter=',')
dm = ','
seen = []
# if this is a .csv file, do:
if (in_fln[-3:]) == "csv":
    out_fln = 'out' + in_fln
    # create the full structure: output_rows
    infile = csv.reader(open(in_fln, 'r'), delimiter=dm, quotechar='"')
    output_rows = []
    for row in infile:
        a = 0
        if row[0] not in seen:
            seen.append(row[0])
            output_rows.append(row[:-4])
            #rowun = '"' + row[-4] + ', ' + row[-3] + ', ' + row[-2] + '"'
            rowun = row[-4] + ', ' + row[-3] + ', ' + row[-2]
            output_rows.append([rowun])
        else:
            #output_rows.append([row[-4], row[-3], row[-2]])
            #rowun = '"' + row[-4] + ', ' + row[-3] + ', ' + row[-2] + '"'
            rowun = row[-4] + ', ' + row[-3] + ', ' + row[-2]
            #output_rows.insert(-1, [rowun])
            #rowun = str(rowun)
            #print(rowun)
            output_rows[-1].append(rowun)
    # Finally save it to a file
    csv.writer(open(out_fln, 'w', newline=''), delimiter=dm, quotechar='"').writerows(output_rows)
    chng = [
        ['","', ','],   # change "," to ,
        ['\n"', ',"'],  # remove the newline before the quoted part
    ]
    input_file = open(out_fln).read()
    output_file = open(out_fln, 'w')
    for string in chng:
        input_file = re.sub(str(string[0]), str(string[1]), input_file)
    output_file.write(input_file)
    output_file.close()
    print("All done")
I am trying to run the following Python code.
Technology: Python, Selenium scraper
Device: Windows
I am getting this error:
Traceback (most recent call last):
  File "scraper.py", line 35, in <module>
    for row in cp_url:
ValueError: I/O operation on closed file.
#!/usr/bin/python3
# Description: The Python code below will search selenium in Google.
import time
import csv
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# EVERY TIME, CHANGE THE DRIVER PATH TO THE CHROMEDRIVER FOR THE LATEST CHROME VERSION
driver = webdriver.Chrome(
    executable_path="D:\chromedriver.exe")
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])

contents = []
filePath = 'output1.csv'
# As the file at filePath will be deleted, check whether it
# exists before deleting it
if os.path.exists(filePath):
    os.remove(filePath)
else:
    print("Can not delete the file as it doesn't exists")

f = open("output1.csv", "a")
f.write("website," + "htmltag," + "type," + "id," + "classname," + "for," + "href," + "alt," + "type," + "src,"
        + "name," + "width," + "height," + "data-src," + 'inner-text,' + 'action,' + 'value,' + "\n")

with open('inputLinks1.csv', 'rt') as cp_csv:
    cp_url = csv.reader(cp_csv)

for row in cp_url:
    links = row[0]
    contents.append(links)
    driver.get(links)
    with open('xpathtags.csv', 'rt') as cp2_csv:
        cp_url2 = csv.reader(cp2_csv)
        for row1 in cp_url2:
            print(row[0])
            (xtype, xpathtext) = row1[0].split(';')
            print(xtype, xpathtext)
            contents.append(xtype)
            contents.append(xpathtext)
            elems = driver.find_elements_by_xpath(xpathtext)
            for elem in elems:
                f = open('output1.csv', 'a', encoding='utf-8')
                f.write(links + ", " + xtype + ","
                        + str(elem.get_attribute('type')) + ', '
                        + str(elem.get_attribute('id')) + ', '
                        + str(elem.get_attribute('class')) + ', '
                        + str(elem.get_attribute('for')) + ', '
                        + str(elem.get_attribute('href')) + ', '
                        + str(elem.get_attribute('alt')) + ', '
                        + str(elem.get_attribute('type')) + ', '
                        + str(elem.get_attribute('src')) + ', '
                        + str(elem.get_attribute('name')) + ', '
                        + str(elem.get_attribute('width')) + ', '
                        + str(elem.get_attribute('height')) + ', '
                        + str(elem.get_attribute('data-src')) + ', '
                        + str(elem.get_attribute('innerText').strip()) + ', '
                        + str(elem.get_attribute('action')) + ', '
                        + str(elem.get_attribute('value')) + ', '
                        + '\n')
                f.close()
driver.close()
I am using the following CSV files
A) inputlinks1.csv
www.flipkart.com
www.ebay.com
B) xpathtags.csv
Link;//a[@href]
Button;//button
Image;//img
Heading1;//h1
Heading2;//h2
Heading3;//h3
Heading4;//h4
C) Output.csv is a blank file
I am getting the following error
Traceback (most recent call last):
  File "scraper.py", line 35, in <module>
    for row in cp_url:
ValueError: I/O operation on closed file.
I can't test it, but I think your problem is wrong indentation:
with open('inputLinks1.csv', 'rt') as cp_csv:
    cp_url = csv.reader(cp_csv)

for row in cp_url:
    # ...rest...
so you run the for loop outside the with...as... block, and with...as... automatically closes the file when the block ends. You should run the for loop inside with...as...:
with open('inputLinks1.csv', 'rt') as cp_csv:
    cp_url = csv.reader(cp_csv)
    for row in cp_url:
        # ...rest...
Or you could use the standard open() and close():
cp_csv = open('inputLinks1.csv', 'rt')
cp_url = csv.reader(cp_csv)
for row in cp_url:
    # ...rest...
cp_csv.close()
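Another option, just as a sketch of the same idea: read all the rows into a list while the file is still open, then loop over the list afterwards. The file can close, but the data stays in memory.

with open('inputLinks1.csv', 'rt') as cp_csv:
    rows = list(csv.reader(cp_csv))  # materialize all rows before the file closes

for row in rows:
    # ...rest...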
Had to make a few changes to your code to get it working.
After fixing the indentation, it threw another error related to the inputLinks1.csv file. I changed it to:
https://www.flipkart.com
https://www.ebay.com
Also, always try to use with open(...) when handling files.
Code snippet:
contents = []
filePath = 'output1.csv'
# As the file at filePath will be deleted, check whether it
# exists before deleting it
if os.path.exists(filePath):
    os.remove(filePath)
else:
    print("Can not delete the file as it doesn't exists")

with open("output1.csv", "a") as f:
    f.write("website," + "htmltag," + "type," + "id," + "classname," + "for," + "href," + "alt," + "type," + "src,"
            + "name," + "width," + "height," + "data-src," + 'inner-text,' + 'action,' + 'value,' + "\n")

with open('inputLinks1.csv', 'r') as cp_csv:
    cp_url = csv.reader(cp_csv)
    for row in cp_url:
        links = row[0]
        print(links)
        contents.append(links)
        driver.get(links)
        with open('xpathtags.csv', 'r') as cp2_csv:
            cp_url2 = csv.reader(cp2_csv)
            for row1 in cp_url2:
                print(row[0])
                (xtype, xpathtext) = row1[0].split(';')
                print(xtype, xpathtext)
                contents.append(xtype)
                contents.append(xpathtext)
                elems = driver.find_elements_by_xpath(xpathtext)
                for elem in elems:
                    with open('output1.csv', 'a', encoding='utf-8') as f:
                        f.write(links + ", " + xtype + ","
                                + str(elem.get_attribute('type')) + ', '
                                + str(elem.get_attribute('id')) + ', '
                                + str(elem.get_attribute('class')) + ', '
                                + str(elem.get_attribute('for')) + ', '
                                + str(elem.get_attribute('href')) + ', '
                                + str(elem.get_attribute('alt')) + ', '
                                + str(elem.get_attribute('type')) + ', '
                                + str(elem.get_attribute('src')) + ', '
                                + str(elem.get_attribute('name')) + ', '
                                + str(elem.get_attribute('width')) + ', '
                                + str(elem.get_attribute('height')) + ', '
                                + str(elem.get_attribute('data-src')) + ', '
                                + str(elem.get_attribute('innerText').strip()) + ', '
                                + str(elem.get_attribute('action')) + ', '
                                + str(elem.get_attribute('value')) + ', '
                                + '\n')
driver.close()
I am trying to create a large flat file with fixed-width columns that contains multiple record layers, but processing seems to be very slow, most likely because I am iterating over each row.
For context, this is for transmitting insurance policy information.
The hierarchy goes like this:
-Policy row
--Property on policy
---Coverage on property
--Property on policy
---Coverage on property
--Owner on policy
--Owner on policy
--Owner on policy
Currently I'm loading the four record types into separate DataFrames, then looping over each policy, pulling child records based on the parent record's ID, and writing them to the file. I'm hoping for some sort of hierarchical DataFrame merge that doesn't force me to scan a whole child frame each time I want a record.
import re
import pandas as pd
import math

def MakeNumeric(instring):
    output = re.sub('[^0-9]', '', str(instring))
    return str(output)

def Pad(instring, padchar, length, align):
    if instring is None:  # Takes care of NULL values
        instring = ''
    instring = str(instring).upper()
    instring = instring.replace(',', '').replace('\n', '').replace('\r', '')
    instring = instring[:length]
    if align == 'L':
        output = instring + (padchar * (length - len(instring)))
    elif align == 'R':
        output = (padchar * (length - len(instring))) + instring
    else:
        output = instring
    return output

def FileCreation():
    POLR = pd.read_parquet(r'POLR.parquet')
    PRP1 = pd.read_parquet(r'PRP1.parquet')
    PROP = pd.read_parquet(r'PROP.parquet')
    SUBJ = pd.read_parquet(r'SUBJ.parquet')
    rownum = 1
    totalrownum = 1
    POLRCt = 0
    size = 900000
    POLR = [POLR.loc[i:i + size - 1, :] for i in range(0, len(POLR), size)]
    FileCt = 0
    print('Predicted File Count: ' + str(math.ceil(len(POLR[0]) / size)))
    for df in POLR:
        FileCt += 1
        filename = r'OutputFile.' + Pad(FileCt, '0', 2, 'R')
        with open(filename, 'a+') as outfile:
            for i, row in df.iterrows():
                row[0] = Pad(rownum, '0', 9, 'R')
                row[1] = Pad(row[1], ' ', 4, 'L')
                row[2] = Pad(row[2], '0', 5, 'R')
                # I do this for all 50 columns
                outfile.write((','.join(row[:51])).replace(',', '') + '\n')
                rownum += 1
                totalrownum += 1
                for i2, row2 in PROP[PROP.ID == row[51]].iterrows():
                    row2[0] = Pad(rownum, '0', 9, 'R')
                    row2[1] = Pad(row2[1], ' ', 4, 'L')
                    row2[2] = Pad(row2[2], '0', 5, 'R')
                    # I do this for all 105 columns
                    outfile.write((','.join(row2[:106])).replace(',', '') + '\n')
                    rownum += 1
                    totalrownum += 1
                    for i3, row3 in PRP1[(PRP1['id'] == row2['ID']) & (PRP1['VNum'] == row2['vnum'])].iterrows():
                        row3[0] = Pad(rownum, '0', 9, 'R')
                        row3[1] = Pad(row3[1], ' ', 4, 'L')
                        row3[2] = Pad(row3[2], '0', 5, 'R')
                        # I do this for all 72 columns
                        outfile.write((','.join(row3[:73])).replace(',', '') + '\n')
                        rownum += 1
                        totalrownum += 1
                for i2, row2 in SUBJ[SUBJ['id'] == row['id']].iterrows():
                    row2[0] = Pad(rownum, '0', 9, 'R')
                    row2[1] = Pad(row2[1], ' ', 4, 'L')
                    row2[2] = Pad(row2[2], '0', 5, 'R')
                    # I do this for all 24 columns
                    outfile.write((','.join(row2[:25])).replace(',', '') + '\n')
                    rownum += 1
                    totalrownum += 1
                POLRCt += 1
                print('File {} of {} '.format(str(FileCt), str(len(POLR))) + str((POLRCt - 1) / len(df.index) * 100) + '% Finished\r')
            rownum += 1
        rownum = 1
        POLRCt = 1
I'm essentially looking for a script that doesn't take multiple days to create a 27M record file.
I ended up populating temp tables for each record level and creating keys, then inserting them into a permanent staging table and assigning a clustered index to the keys.
I then queried the results using OFFSET and FETCH NEXT %d ROWS ONLY to keep the memory footprint down, and used the multiprocessing library to split the workload across CPU cores.
Ultimately, the combination of these changes reduced the runtime to about 20% of what it was when this question was originally posted.
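For reference, a minimal pandas-only sketch of the other half of the problem, the repeated boolean filters like PROP[PROP.ID == row[51]] inside the loop. This is not the staging-table solution described above; it just pre-groups each child frame by its parent key once, so each lookup inside the write loop becomes a dictionary access instead of a full scan. Column names are taken from the question and may need adjusting:

import pandas as pd

# PROP, PRP1, SUBJ and the policy chunk `df` are the frames from FileCreation() above.
prop_by_id = {key: grp for key, grp in PROP.groupby('ID')}
subj_by_id = {key: grp for key, grp in SUBJ.groupby('id')}
prp1_by_key = {key: grp for key, grp in PRP1.groupby(['id', 'VNum'])}
empty = pd.DataFrame()

for i, row in df.iterrows():
    # ... write the policy row as before ...
    for i2, row2 in prop_by_id.get(row[51], empty).iterrows():
        # ... write the property row ...
        for i3, row3 in prp1_by_key.get((row2['ID'], row2['vnum']), empty).iterrows():
            pass  # ... write the coverage row ...
    for i2, row2 in subj_by_id.get(row['id'], empty).iterrows():
        pass  # ... write the owner row ...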
I'm making an API request for each row of the input CSV file, and I want to add the API results to the existing CSV file.
Input
Desired output
As you can see, I added three headers with corresponding results (latitude, longitude, coordinates)
However, I'm having difficulty writing the right code for this. Below is the best I could do.
df = pd.read_csv(r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy2.csv", delimiter=',', na_values="nan")

# Output
with open(r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy2.csv", 'r') as csvin, open(r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy3.csv", 'w', newline='') as out:
    csvreader = csv.DictReader(csvin)
    fieldnames = csvreader.fieldnames + ["latitude", "longitude", "coordinates"]
    csvwriter = csv.DictWriter(out, fieldnames)
    csvwriter.writeheader()
    # Iterating requests for each row
    for row in df.itertuples():
        output = client.geocode(str(row.addressline1) + ', ' + str(row.city) + ', ' + str(row.state) + ', ' + str(row.postalcode)).coords
        cord = '(' + str(output[0]) + ', ' + str(output[1]) + ')'
        for node, row in enumerate(csvreader, 3):
            csvwriter.writerow(dict(3, {'latitude': output[0], 'longitude': output[1], 'coordinates': cord}))
Update:
Here is my new attempt:
df = pd.read_csv(r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy2.csv", delimiter=',', na_values="nan")

# Output
with open(r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy2.csv", 'r') as csvin, open(r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy3.csv", 'w', newline='') as out:
    csvreader = csv.DictReader(csvin)
    fieldnames = csvreader.fieldnames + ["latitude", "longitude", "coordinates"]
    csvwriter = csv.DictWriter(out, fieldnames)
    csvwriter.writeheader()
    # Iterating requests for each row
    for row in df.itertuples():
        output = client.geocode(str(row.addressline1) + ', ' + str(row.city) + ', ' + str(row.state) + ', ' + str(row.postalcode)).coords
        cord = '(' + str(output[0]) + ', ' + str(output[1]) + ')'
        for node, row1 in enumerate(csvreader, 38):
            csvwriter.writerow(dict(row1, latitude=output[0] % node))
        for node, row2 in enumerate(csvreader, 39):
            csvwriter.writerow(dict(row2, longitude=output[1] % node))
        for node, row3 in enumerate(csvreader, 40):
            csvwriter.writerow(dict(row3, coordinates=cord % node))
However, I get the following result:
You can accomplish this more easily by using more of pandas' features.
Import the data from csv as you have been doing.
import pandas as pd
df = pd.read_csv("input_file.csv")
You can use dataframe.apply(func, axis=1) to apply a function to each row of a dataframe. https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html
def get_coords(row):
    return client.geocode(str(row.addressline1) + ', ' + str(row.city) + ', '
                          + str(row.state) + ', ' + str(row.postalcode)).coords

coords = df.apply(get_coords, axis=1)
df['latitude'] = coords.apply(lambda c: c[0])
df['longitude'] = coords.apply(lambda c: c[1])
df['coords'] = coords
You can then easily save the dataframe to csv using:
df.to_csv('output_filename.csv')
Hope this helps.
P.S. The code is untested but should be good :)
I have a CSV file and I need to combine 2 of its columns:
Sitio, ID_espacio, Espacio, Tamano, Country, Impresiones_exchange, Importe_a_cobrar, eCPM, Subastas, Fill_rate
NUEVO_Infotechnology, 264244, NUEVO_Infotechnology - Home_IT - IT_Header, Variable (1240x90), Bangladesh, 0, 0.00, 0.00, 1, 0.00
NUEVO Apertura, 274837, NUEVO Apertura - Nota_Ap - Right3_300x250, 300x250, Paises Bajos, 0, 0.00, 0.00, 4, 0.00
The problem is that I need to combine ID_espacio with Espacio, in this way:
Example:
NUEVO_Infotechnology, 264244, NUEVO_Infotechnology - Home_IT - IT_Header, Variable (1240x90), Bangladesh, 0, 0.00, 0.00, 1, 0.00
What I need:
NUEVO_Infotechnology, 264244 - Home_IT - IT_Header, Variable (1240x90), Bangladesh, 0, 0.00, 0.00, 1, 0.00
As you can see, I remove the first part of Espacio (everything up to the first ' - ') and put the ID_espacio in its place.
I tried to do this and it works, but now I need to keep the whole CSV, not only my modification:
import csv

lista_ok = []
base = []
with open("test.csv", 'rb') as f:
    reader = csv.reader(f)
    your_list = list(reader)

for item in your_list[1:]:
    a = item[2].split(" - ")
    base.append(a)

for item in base:
    for itemf in your_list[1:]:
        b = []
        a = itemf[1] + ' - ' + ' - '.join(item[1:])
        b.append(a)
        lista_ok.append(b)
Output:
[[' 264244 - Home_IT - IT_Header'], [' 274837 - Home_IT - IT_Header'], [' 264244 - Nota_Ap - Right3_300x250'], [' 274837 - Nota_Ap - Right3_300x250']]
Output I need:
[['Sitio', ' ID_espacio', ' Espacio', ' Tamano', ' Country', ' Impresiones_exchange', ' Importe_a_cobrar', ' eCPM', ' Subastas', ' Fill_rate'], ['NUEVO_Infotechnology', ' 264244 - Home_IT - IT_Header', ' Variable (1240x90)', ' Bangladesh', ' 0', ' 0.00', ' 0.00', ' 1', ' 0.00'], ['NUEVO Apertura', ' 274837 - Nota_Ap - Right3_300x250', ' 300x250', ' Paises Bajos', ' 0', ' 0.00', ' 0.00', ' 4', ' 0.00']]
Here is another version:
import csv

lista_ok = []
with open("test.csv", 'rb') as f:
    reader = csv.reader(f)
    your_list = list(reader)

for item in your_list:
    sitio = item[0]
    id_espacio = item[1]
    item.remove(id_espacio)
    espacio_parts = item[1].split(' - ')
    if your_list.index(item) > 0:
        espacio_parts[0] = espacio_parts[0].lstrip().replace(sitio, id_espacio)
    espacio = ' - '.join(espacio_parts)
    item[1] = espacio
    lista_ok.append(item)
You could write a function that transforms a single row the way you want. Then call that function for each row as you read it from the file and put it in your final list:
import csv

def row_transform(row, is_header=False):
    if not is_header:
        # trim Sitio from Espacio
        espacio = row[2].split(" - ", 1)[1]
        # add ID to espacio
        row[2] = " - ".join((row[1], espacio))
        # remove ID col
        del row[1]
    return row

with open("test.csv") as fp:
    reader = csv.reader(fp)
    lista_ok = [row_transform(next(reader), True)]
    lista_ok.extend(row_transform(row) for row in reader)
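If you then want the whole list back on disk as a CSV, a short follow-up (the output file name here is just a placeholder):

with open("test_out.csv", "w", newline="") as out:
    csv.writer(out).writerows(lista_ok)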
I have 30911 HTML files. I need to scrape them and then save the info into a text file named index.txt.
It should look like
filename1, title, t1, date, p1
filename2, title, t1, date, p1
filename3, title, t1, date, p2
and so on...
I only want the filename, but the output gave me path + filename.
Your problem is that filename is really a file path. To get just the filename, you can use the os module:
os.path.basename('filepath')
So, in order to write to the file:
indexFile.write(os.path.basename(filename)+ ', ' + title.get_text(strip=True) + ', '+ ticker.get_text(strip=True) + ', ' + d_date.get_text(strip=True) + ', ' + parti_names + '\n')
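For example, on one of the HTML file paths (the exact directory doesn't matter):

>>> import os
>>> os.path.basename('C:/Users/.../output/filename1.html')
'filename1.html'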
You can use:
import glob
import os
import re
import bs4

path = 'C:/Users/.../.../output/'
# read html files
for filename in glob.glob(os.path.join(path, '*.html')):
    soup = bs4.BeautifulSoup(open(filename).read(), "lxml")
    title = soup.find('h1')
    ticker = soup.find('p')
    d_date = soup.find_all('div', {"id": "a-body"})[0].find_all("p")[2]
    try:
        def find_participant(tag):
            return tag.name == 'p' and tag.find("strong", text=re.compile(r"Executives|Corporate Participants"))

        participants = soup.find(find_participant)
        parti_names = ""
        for parti in participants.find_next_siblings("p"):
            if parti.find("strong", text=re.compile(r"(Operator)")):
                break
            parti_names += parti.get_text(strip=True) + ","
    except:
        indexFile = open('C:/Users/.../output1/' + 'index.txt', 'a+')
        indexFile.write(os.path.basename(filename) + ', ' + title.get_text(strip=True) + ', ' + ticker.get_text(strip=True) + ', ' + d_date.get_text(strip=True) + ', ' + 'No participants' + '\n')
    else:
        participants = soup.find(find_participant)
        parti_names = ""
        for parti in participants.find_next_siblings("p"):
            if parti.find("strong", text=re.compile(r"(Operator)")):
                break
            parti_names += parti.get_text(strip=True) + ","
        indexFile = open('C:/Users/.../output1/' + 'index.txt', 'a+')
        indexFile.write(os.path.basename(filename) + ', ' + title.get_text(strip=True) + ', ' + ticker.get_text(strip=True) + ', ' + d_date.get_text(strip=True) + ', ' + parti_names + '\n')
    indexFile.close()
ntpath is another module that can be used to get the base name from a path:
>>> import ntpath
>>> ntpath.basename('C:/Users/.../output1/' + 'index.txt')
'index.txt'