python create a unexpected blank row below header - python

My code below create csv file and then adding random data to it.
Somehow, the first part of the code create a blank row below header.
Can anyone please help to fix the code to remove this blank row?
Thank you,
Hary
header_list = ["Firm", "hour", "unit sold", "product code", "dollar value"]
for i in range(3):
# create file with header -----------
with open(f'D:\\2000 transactions\\location_id_' + str(i) + '.csv', 'w', newline='') as file:
dw = csv.DictWriter(file, delimiter=',',fieldnames=header_list)
dw.writeheader() # this line create a blank row below header
# adding data to file ---------------
for j in range(5):
n = random.randint(1, 99)
text = str(str(sp100_list['Name'].loc[n]) + ',' + str(random.randint(5, 20)) + ',' + str(random.randint(200, 1000)) + ',' + str(
random.randint(100, 150)) + ',' + str(int(random.random() * 1000000)))
myfile = open(f'D:\\2000 transactions\\location_id_' + str(i) + '.csv', 'a')
myfile.write('\n' + text)

Actually you add unexpected blank row in the line myfile.write('\n' + text),not dw.writeheader(). You add a \n first, which add a blank row to your file. Move the \n to backwards should solve it.
code:
import csv
header_list = ["Firm", "hour", "unit sold", "product code", "dollar value"]
for i in range(1):
with open(f'test{str(i)}.csv', 'w', newline='') as file:
dw = csv.DictWriter(file, delimiter=',',fieldnames=header_list)
dw.writeheader() # this line create a blank row below header
for j in range(5):
with open(f'test{str(i)}.csv', 'a', newline='') as myfile:
text = "1,1"
myfile.write(text+'\n')
result:
Firm,hour,unit sold,product code,dollar value
1,1
1,1
1,1
1,1
1,1

you can only use csv writer instead of DictWriter.here is example
use csv write to write header and rows
header_list = ["Firm", "hour", "unit sold", "product code", "dollar value"]
for i in range(3):
#create writer here
with open(f'tmp_' + str(i) + '.csv', 'w') as file:
writer = csv.writer(file, delimiter=',')
#write header
writer.writerow(header_list)
# adding data to file ---------------
for j in range(5):
n = random.randint(1, 99)
writer.writerow([sp100_list['Name'].loc[n],random.randint(5, 20),
random.randint(200, 1000),random.randint(100, 150),
int(random.random() * 1000000)])

Related

Is there a way to read and alter the contents of a huge csv file in PyCharm?

I'm attempting to create a program currently that can read a csv, determine if a substring is included in one of the columns of each row, and if it isn't present, rewrites certain columns to a new csv. I have the code down for this much- but the csv I need to use the program for has well over 3 million rows. I use PyCharm and currently I'm not able to process this much data. It can only view the csv in a read-only format which doesn't allow me to use it. I know pandas has a chunk size feature but I don't know how to implement this with the rest of my code.
def reading(csv_input):
originalLength = 0
rowCount = 0
with open(f'Web Report {csv_input}', 'w') as file:
writer = csv.writer(file)
writer.writerow(['Index', 'URL Category', 'User IP', 'URL'])
dropCount = 0
data = pd.read_csv(csv_input, chunksize=100000)
df = pd.DataFrame(data,
columns=['Line', 'Date', 'Hour', 'User Name', 'User IP', 'Site Name',
'URL Category', 'Action', 'Action Description'])
originalLength = len(df.index)
for line in range(originalLength):
dataLine = df.loc[line]
x = dataLine.get(key='Action')
if x == 0:
siteName = dataLine.get(key='Site Name')
if 'dbk' in siteName:
dropCount = dropCount + 1
elif 'ptc' in siteName:
dropCount = dropCount + 1
elif 'wcf' in siteName:
dropCount = dropCount + 1
elif 'google' in siteName:
dropCount = dropCount + 1
else:
writer.writerow([line, # Original Index
df.loc[line].get(key='URL Category'), # Original URL Category
df.loc[line].get(key='User IP'), # Original User IP
df.loc[line].get(key='Site Name')]) # Original Site Name
rowCount = rowCount + 1
else:
dropCount = dropCount + 1
file.close()
print("Input: " + str(csv_input))
print("Output: " + str(file.name))
print("Original Length: " + str(originalLength))
print("Current Length: " + str(rowCount))
print("Drop Count: " + str(dropCount) + "\n")
return df
If you use csv to write file then you could use it also to read row by row.
import csv
with open('input.csv') as infile, open('output.csv', 'w') as outfile:
csv_reader = csv.reader(infile)
csv_writer = csv.writer(outfile)
# copy headers
headers = next(csv_reader)
csv_writer.writerow(headers)
# process rows
for row in csv_reader: # read row by row
# keep only rows with even index
if int(row[0]) % 2 == 0:
print('--- row ---')
print(row)
csv_writer.writerow(row)
If you want to use pandas with chunk then you should use for-loop for this.
And when you write with pandas then you need append mode without headers.
import pandas as pd
first = True
for df in pd.read_csv('input.csv', chunksize=1): # read row by row
# keep only rows with even index
if df.index % 2 == 0:
print('--- row ---')
print(df)
if first:
# create new file with headers
df.to_csv('output.csv', mode='w')
first = False
else:
# append to existing file without headers
df.to_csv('output.csv', mode='a', header=False)
Minimal working code
import pandas as pd
import csv
# --- create some data ---
data = {
'A': range(0,10),
'B': range(10,20),
'C': range(20,30),
} # columns
df = pd.DataFrame(data)
df.to_csv('input.csv', index=False)
# --- read and write with `pandas` ---
first = True
for df in pd.read_csv('input.csv', chunksize=1): # read row by row
# keep only rows with even index
if df.index % 2 == 0:
print('--- row ---')
print(df)
if first:
# create empty with headers
df.to_csv('output_pandas.csv', mode='w')
first = False
else:
# append to existing file without headers
df.to_csv('output_pandas.csv', mode='a', header=False)
# --- read and write with `csv` ---
with open('input.csv') as infile, open('output.csv', 'w') as outfile:
csv_reader = csv.reader(infile)
csv_writer = csv.writer(outfile)
# copy headers
headers = next(csv_reader)
csv_writer.writerow(headers)
# process rows
for row in csv_reader:
# keep only rows with even index
if int(row[0]) % 2 == 0:
print('--- row ---')
print(row)
csv_writer.writerow(row)
Doc: read_csv(), to_csv()

how to parse a txt file to csv and modify formatting

Is there a way I can use python to take my animals.txt file results and convert it to csv and format it differently?
Currently the animals.txt file looks like this:
ID:- 512
NAME:- GOOSE
PROJECT NAME:- Random
REPORT ID:- 30321
REPORT NAME:- ANIMAL
KEYWORDS:- ['"help,goose,Grease,GB"']
ID:- 566
NAME:- MOOSE
PROJECT NAME:- Random
REPORT ID:- 30213
REPORT NAME:- ANIMAL
KEYWORDS:- ['"Moose, boar, hansel"']
I would like the CSV file to present it as:
ID, NAME, PROJECT NAME, REPORT ID, REPORT NAME, KEYWORDS
Followed by the results underneath each header
Here is a script I have wrote:
import re
import csv
with open("animals.txt") as f: text = f.read()
data = {}
keys = ['ID', 'NAME', 'PROJECT NAME', 'REPORT ID', 'REPORT NAME', 'KEYWORDS']
for k in keys:
data[k] = re.findall(r'%s:- (.*)' % k, text)
csv_file = 'out.csv'
with open(csv_file, 'w') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=keys)
writer.writeheader()
for x in data:
writer.writerow(x)
An easy way to do is parsing using regex and store them in a dict, just before you write the final csv:
import re
# `text` is your input text
data = {}
keys = ['ID', 'NAME', 'PROJECT NAME', 'REPORT ID', 'REPORT NAME', 'KEYWORDS']
for k in keys:
data[k] = re.findall(r'%s:- (.*)' % k, text)
And to CSV:
import csv
csv_file = 'out.csv'
with open(csv_file, 'w') as csvfile:
writer = csv.writer(csvfile, quoting=csv.QUOTE_NONE, escapechar='\\')
writer.writerow(data.keys())
for i in range(len(data[keys[0]])):
writer.writerow([data[k][i] for k in keys])
Output in csv:
ID,NAME,PROJECT NAME,REPORT ID,REPORT NAME,KEYWORDS
512,GOOSE,Random,30321,ANIMAL,['\"help\,goose\,Grease\,GB\"']
566,MOOSE,Random,30213,ANIMAL,['\"Moose\, boar\, hansel\"']
Note that I used re.M multiline mode since there's a trick in your text, preventing matching ID twice! Also the default write rows needed to be twisted.
Also uses \ to escape the quote.
This should work:
fname = 'animals.txt'
with open(fname) as f:
content = f.readlines()
content = [x.strip() for x in content]
output = 'ID, NAME, PROJECT NAME, REPORT ID, REPORT NAME, KEYWORDS\n'
line_output = ''
for i in range(0, len(content)):
if content[i]:
line_output += content[i].split(':-')[-1].strip() + ','
elif not content[i] and not content[i - 1]:
output += line_output.rstrip(',') + '\n'
line_output = ''
output += line_output.rstrip(',') + '\n'
print(output)
That's the code in Autoit (www.autoitscript.com)
Global $values_A = StringRegExp(FileRead("json.txt"), '[ID|NAME|KEYWORDS]:-\s(.*)?', 3)
For $i = 0 To UBound($values_A) - 1 Step +6
FileWrite('out.csv', $values_A[$i] & ',' & $values_A[$i + 1] & ',' & $values_A[$i + 2] & ',' & $values_A[$i + 3] & ',' & $values_A[$i + 4] & ',' & $values_A[$i + 5] & #CRLF)
Next

Create Excel Sheets from different files in Linux with Python

There are 2 txt file in a linux server.
first data file:
a;1234
b;12334
c;234234
second data file :
a ; ass ; asfda
b ; sdfq; qwrwffsaa
c ; asda ; qdasasd
What I try to make is to create a excel file with python which has 2 sheets.
First sheet keeps first data file second one should keep second data file.
What I develop so far is:
#!/bin/python
import xlsxwriter
import smtplib
import datetime
now = datetime.datetime.now()
workbookname = 'Excel_'+now.strftime("%Y-%m-%d_%H:%M")+'.xlsx'
workbook = xlsxwriter.Workbook(workbookname)
worksheet = workbook.add_worksheet('Sheet1')
worksheet.write('A1', 'Hostname')
worksheet.write('B1', 'User Name')
worksheet2 = workbook.add_worksheet('User Privilege')
worksheet2.write('A1', 'Hostname')
worksheet2.write('B1', 'User Detail')
worksheet2.write('C1', 'Description')
with open('/tmp/file1.txt') as f:
content = f.read().splitlines()
i = 0
while i < len(content):
content2 = content[i].split(';')
worksheet.write('A'+str(i+2), content2[0])
worksheet.write('B'+str(i+2), content2[1])
workbook.close()
i = 0
while i < len(content):
with open('/tmp/file2.txt') as f:
content = f.read().splitlines()
worksheet2.write('A' + str(i + 2), content2[0])
worksheet2.write('B' + str(i + 2), content2[1])
worksheet2.write('C' + str(i + 2), content2[2])
i=i+1
workbook.close()
This script only works for the first sheet it does not write to second sheet.
With pandas this can be done in a couple of lines
import pandas
df1 = pandas.read_csv('file1.csv', sep = ';', header = None)
df2 = pandas.read_csv('file2.csv', sep = ';', header = None)
writer = pandas.ExcelWriter('output.xlsx')
df1.to_excel(writer, 'sheet 1')
df2.to_excel(writer, 'sheet 2')
writer.save()

Adding DictHeader with corresponding values to the existing table

I'm iterating API requests for each row of the input CSV file. And I want to add API output results to the existing CSV file.
Input
Desired output
As you can see, I added three headers with corresponding results (latitude, longitude, coordinates)
However, I'm finding difficulty with writing the right query for this. Below is the best I could do.
df=pd.read_csv(r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy2.csv",delimiter=',', na_values="nan")
# Output
with open(r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy2.csv", 'r') as csvin, open (r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy3.csv", 'w', newline='') as out:
csvreader = csv.DictReader(csvin)
fieldnames = csvreader.fieldnames + ["latitude","longitude","coordinates"]
csvwriter = csv.DictWriter(out, fieldnames)
csvwriter.writeheader()
# Iterating requests for each row
for row in df.itertuples():
output = client.geocode(str(row.addressline1) + ', ' + str(row.city) + ', ' + str(row.state) + ', ' + str(row.postalcode)).coords
cord = '(' + str(output[0]) + ', '+ str(output[1]) + ')'
for node, row in enumerate(csvreader, 3):
csvwriter.writerow(dict(3, {'latitude': output[0], 'longitude': output[1], 'coordinates': cord}))
Update:
Here is my new Python query:
df=pd.read_csv(r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy2.csv",delimiter=',', na_values="nan")
# Output
with open(r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy2.csv", 'r') as csvin, open (r"C:\users\testu\documents\travis_50000_melissa_joined_dropna - Copy3.csv", 'w', newline='') as out:
csvreader = csv.DictReader(csvin)
fieldnames = csvreader.fieldnames + ["latitude","longitude","coordinates"]
csvwriter = csv.DictWriter(out, fieldnames)
csvwriter.writeheader()
# Iterating requests for each row
for row in df.itertuples():
output = client.geocode(str(row.addressline1) + ', ' + str(row.city) + ', ' + str(row.state) + ', ' + str(row.postalcode)).coords
cord = '(' + str(output[0]) + ', '+ str(output[1]) + ')'
for node, row1 in enumerate(csvreader, 38):
csvwriter.writerow(dict(row1,latitude= output[0] % node))
for node, row2 in enumerate(csvreader, 39):
csvwriter.writerow(dict(row2,longitude = output[1] % node))
for node, row3 in enumerate(csvreader, 40):
csvwriter.writerow(dict(row3,coordinates= cord % node))
However, I get the following result:
You can more easily accomplish this by using more of pandas features.
Import the data from csv as you have been doing.
import pandas as pd
df = pd.read_csv("input_file.csv")
You can use dataframe.apply(func, axis=1) to apply a function to each row of a dataframe. https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html
def get_coords(row):
return client.geocode(str(row.addressline1) + ', ' + str(row.city) + ', ' \
+ str(row.state) + ', ' + str(row.postalcode)).coords
coords = df.apply(get_coords, axis=1)
df['latitide'] = coords.values[:,0]
df['longitude'] = coords.values[:,1]
df['coords'] = coords
You can then easily save the dataframe to csv using:
df.to_csv('output_filename.csv')
Hope this help.
p.s. code is untested but should be good :)

Row lables to columns in multi column file

I am new to Python and am using version 2.7.1 as part of Hyperion FDMEE.
I have a file which I need to reorder the columns plus, split one column into 3 as part of the same file.
Source file;
ACCOUNT;UD1;UD2;UD3;PERIOD;PERIOD;AMOUNT
QTY;032074;99953;53;2017.07.31;2017.07.31;40.91
COGS;032074;99953;53;2017.07.31;2017.07.31;-7488.36
TURNOVER;032074;99953;53;2017.07.31;2017.07.31;505.73
QTY;032075;99960;60;2017.07.31;2017.07.31;40.91
COGS;032075;99960;60;2017.07.31;2017.07.31;-7488.36
TURNOVER;032075;99960;60;2017.07.31;2017.07.31;505.73
I have managed to reorder the columns per this script;
infilename = fdmContext["OUTBOXDIR"]+"/Targit_1707.dat"
outfilename = fdmContext["OUTBOXDIR"]+"/TargitExport.csv"
import csv
infile = open(infilename, 'r')
outfile = open(outfilename, 'w+')
for line in infile:
column = line.split(';')
outfile.write(column[1] + ";" + column[2] + ";" + column[3] + ";" + column[4] + ";" + column[0] + ";" + str(column[6].strip('\n')) + ";201701" + "\n")
outfile.close()
infile.close()
Producing the result;
UD1;UD2;UD3;PERIOD;ACCOUNT;AMOUNT;201701
032074;99953;53;2017.07.31;QTY;40.91;201701
032074;99953;53;2017.07.31;COGS;-7488.36;201701
032074;99953;53;2017.07.31;TURNOVER;505.73;201701
032075;99960;60;2017.07.31;QTY;40.91;201701
032075;99960;60;2017.07.31;COGS;-7488.36;201701
032075;99960;60;2017.07.31;TURNOVER;505.73;201701
but I am struggling to transpose the Account column (QTY, COGS, TURNOVER) into seperate columns as in the example below;
UD1;UD2;UD3;PERIOD;QTY;COGS;TURNOVER;201701
032074;99953;53;2017.07.31;40.91;-7488.36;505.73;201701
032075;99960;60;2017.07.31;40.91;-7488.36;505.73;201701
Any suggestions would be very much appreciated.
Use a dict, for instance:
import csv
fieldnames = infile.readline()[:-1]
fieldnames = fieldnames.split(';')[1:5] + ['QTY', 'COGS', 'TURNOVER']
writer = csv.DictWriter(outfile, fieldnames=fieldnames)
writer.writeheader()
record_dict = {}
for i, line in enumerate(infile):
if not line: break
line = line[:-1].split(';')
# Assign column data every 1,2,3 lines
mod_row = (i % 3)+1
if mod_row == 1:
record_dict['QTY'] = line[6]
record_dict['UD1'] = line[1]
# ... and so on
if mod_row == 2:
record_dict['COGS'] = line[6]
if mod_row == 3:
record_dict['TURNOVER'] = line[6]
writer.writerow(record_dict)
record_dict = {}
Output:
UD1,UD2,UD3,PERIOD,QTY,COGS,TURNOVER
032074,,,,40.91,-7488.36,505.73
032075,,,,40.91,-7488.36,505.73
Tested with Python: 3.4.2
Read about:
Python ยป 3.6.1 Documentation csv.DictWriter

Categories