CSV merging issue, Python

Using the following code to merge CSV files, it sometimes puts the data in the wrong columns: rather than being in columns A-D, the data ends up in columns F-J. From what I can tell, it is the first line of a new CSV that gets put in the wrong column, though not for every CSV file.
import glob
import codecs
import csv

my_files = glob.glob("*.csv")
header_saved = False
with codecs.open('Final-US-Allies-Expects.csv', 'w', 'UTF-8', 'ignore') as file_out:  # save merged data to this file
    for filename in my_files:
        with codecs.open(filename, 'r', 'UTF-8', 'ignore') as file_in:
            header = next(file_in)
            if not header_saved:
                file_out.write(header)  # write header
                header_saved = True
            for line in file_in:
                file_out.write(line)  # write next line
Original code is from Merging multiple CSV files without headers being repeated (using Python) (my reputation is not high enough to comment on the original question).
Visual of issue
I've attached a visual of the issue. I need every line to be written into the column it is meant to go in.
Thanks for your help in advance.

It looks like you are not checking whether each line ends in a newline character before writing it to the output file. A missing trailing newline means the next file's first line gets appended onto the previous line, which messes up the alignment. Could you try this?
import glob
import codecs
import csv

my_files = glob.glob("*.csv")
header_saved = False
with codecs.open('output.csv', 'w', 'UTF-8', 'ignore') as file_out:
    for filename in my_files:
        with codecs.open(filename, 'r', 'UTF-8', 'ignore') as file_in:
            header = next(file_in)
            if not header_saved:
                # append a newline if the header does not already end with one
                file_out.write(header if header[-1] == "\n" else header + "\n")
                header_saved = True
            for line in file_in:
                file_out.write(line if line[-1] == "\n" else line + "\n")
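As an aside (this alternative is my own sketch, not part of the answer above): the csv module can do the same merge while also coping with quoted fields that contain commas or embedded newlines, which plain line-by-line copying cannot. The output name merged.csv is just a placeholder.

import csv
import glob

header_saved = False
with open('merged.csv', 'w', newline='', encoding='utf-8') as file_out:
    writer = csv.writer(file_out)
    for filename in glob.glob("*.csv"):
        with open(filename, 'r', newline='', encoding='utf-8', errors='ignore') as file_in:
            reader = csv.reader(file_in)
            header = next(reader)          # first row of each file
            if not header_saved:
                writer.writerow(header)    # keep the header from the first file only
                header_saved = True
            writer.writerows(reader)       # remaining rows, re-quoted consistently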

Related

Removing duplicates while reading multiple text files using Python

I am trying to:

- read multiple text files
- keep only the header of the first file
- account for formatting issues (e.g. special characters)
- merge them into one file

This is the code I came up with:
import glob
import re

read_files = glob.glob(data_path + "*.txt")
header_saved = False
with open(data_path + "result.txt", "w") as outfile:
    for f in read_files:
        with open(f) as infile:
            header = next(infile)
            if not header_saved:
                outfile.write(header)
                header_saved = True
            text = infile.read()
            # strip special characters before writing
            replaced_text = re.sub(r"[-()\"##;:<>{}`+=~|.!?,]", "", text)
            outfile.write(replaced_text + "\n")
The problem is that, for some reason, this produces duplicated rows.
Does someone see which parts of the code are at fault?
I appreciate any help.
Thanks!

Removing New Line from CSV Files using Python

I obtain multiple CSV files from an API, and I need to remove the newlines present inside the records and join the broken parts back into one record; consider the data provided below.
My code to remove the newlines:
## Loading necessary libraries
import glob
import os
import shutil
import csv

## Assigning necessary paths
source_path = "/home/Desktop/Space/"
dest_path = "/home/Desktop/Output/"
# Assigning file_read path to modify the copied CSV files
file_read_path = "/home/Desktop/Output/*.csv"

## Code to copy .csv files from one folder to another
for csv_file in glob.iglob(os.path.join(source_path, "*.csv"), recursive=True):
    shutil.copy(csv_file, dest_path)

## Code to remove the newlines in all .csv files
for filename in glob.glob(file_read_path):
    with open(filename, "r", encoding='ISO-8859-1') as file:
        reader = list(csv.reader(file, delimiter=","))
        for i in range(0, len(reader)):
            reader[i] = [row_space.replace("\n", "") for row_space in reader[i]]
    with open(filename, "w") as output:
        writer = csv.writer(output, delimiter=",", dialect='unix')
        for row in reader:
            writer.writerow(row)
I actually copy the CSV files into a new folder and then use the above code to remove any newlines present in the files.
You are fixing the CSV files because they contain stray \n characters. The problem here is knowing whether a line is a continuation of the previous line or not. If all records start with a specific prefix, like SV_a5d15EwfI8Zk1Zr in your example, or just SV_, you can do something like this:
import glob

# this is the FIX PART
# I have the file ./data.csv (contains your example); the fixed version is in data.csv.FIXED
file_read_path = "./*.csv"
for filename in glob.glob(file_read_path):
    with open(filename, "r", encoding='ISO-8859-1') as file, \
         open(filename + '.FIXED', "w", encoding='ISO-8859-1') as target:
        previous_line = ''
        for line in file:
            # check if it's a new record or a part of the previous line
            if line.startswith('SV_'):
                if previous_line:
                    target.write(previous_line + '\n')
                previous_line = line[:-1]  # remove \n
            else:
                # concatenate the broken part with previous_line
                previous_line += line[:-1]  # remove \n
        # add the last line
        target.write(previous_line + '\n')
Output:
SV_a5d15EwfI8Zk1Zr;QID4;"<span style=""font-size:16px;""><strong>HOUR</strong> Interview completed at:</span>";HOUR;TE;SL;;;true;ValidNumber;0;23.0;0.0;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID6;"<span style=""font-size:16px;""><strong>MINUTE</strong> Interview completed:</span>";MIN;TE;SL;;;true;ValidNumber;0;59.0;0.0;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID8;Number of Refusals - no language<br />For <strong>Zero Refusals - no language</strong> use 0;REFUSAL1;TE;SL;;;true;ValidNumber;0;99.0;0.0;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID10;<strong>DAY OF WEEK:</strong>;WEEKDAY;MC;SACOL;TX;;true;;0;;;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID45;"<span style=""font-size:16px;"">Using points from 0 to 10, how likely would you be recommend Gatwick Airport to a friend or colleague?</span><div> </div>";NPSCORE;MC;NPS;;;true;;0;;;882;-873;
EDIT:
This can be simpler using split too; this version fixes the file itself:
import glob

# this is the FIX PART
# I have the file ./data.csv; the fixed version is written back to the same file
file_read_path = "./*.csv"
# assuming that all lines start with SV_
STARTING_KEYWORD = 'SV_'
for filename in glob.glob(file_read_path):
    with open(filename, "r", encoding='ISO-8859-1') as file:
        lines = file.read().split(STARTING_KEYWORD)
    with open(filename, 'w', encoding='ISO-8859-1') as file:
        file.write('\n'.join(STARTING_KEYWORD + l.replace('\n', '') for l in lines if l))
Well, I'm not sure what restrictions you have, but if you can use the pandas library, this is simple.
import pandas as pd

data_set = pd.read_csv(data_file, skip_blank_lines=True)
data_set.to_csv(target_file, index=False)
This will create a CSV file with all new lines removed. You can save a lot of time with the available libraries.
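One caveat (my assumption, based on the semicolon-delimited sample shown earlier): read_csv defaults to a comma separator, so the delimiter may need to be passed explicitly; newlines inside properly quoted fields are then kept as part of a single record. A minimal sketch with placeholder file names:

import pandas as pd

# sep=';' is an assumption matching the sample output above; file names are hypothetical
data_set = pd.read_csv("data.csv", sep=";", skip_blank_lines=True)
data_set.to_csv("fixed.csv", sep=";", index=False)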

Inserting a comma in between columns in a text file

The problem is that I have a text/CSV file which is missing commas, and I would like to insert them in order to run the file through LaTeX and make a table. I have an MWE of code from another problem which I ran, and it did not work. Could someone guide me on how to change it?
I have tried Python code which produces a blank file, another which produces a blank document, and another which removes the spaces.
import fileinput

input_file = 'C:/Users/Light_Wisdom/Documents/Python Notes/test.txt'
output = open('out.txt', 'w+')
with open('out.txt', 'w+') as output:
    for each_line in fileinput.input(input_file):
        output.write("\n".join(x.strip() for x in each_line.split(',')))
The text file contains more numbers, but it looks like this:
0 2.58612
0.00616025 2.20018
0.0123205 1.56186
0.0184807 0.371172
0.024641 0.327379
0.0308012 0.368863
0.0369615 0.322228
0.0431217 0.171899
Outcome
0.049282, -0.0635003
0.0554422, -0.110747
0.0616025, 0.0701394
0.0677627, 0.202381
0.073923, 0.241264
0.0800832, 0.193697
Renewed Attempt:
with open("CSV.txt","r") as file:
new = list(map(lambda x: ''.join(x.split()[0:1]+[","]+x.split()[0:2]),file.readlines()))
with open("New_CSV.txt","w+") as output:
for i in new:
output.writelines(i)
output.writelines("\n")
This can be done using .split and .join, by splitting each line into a list and then joining the list with commas. This also handles several consecutive spaces in the file:
f1 = open(input_file, "r")
with open("out.txt", 'w') as f2:
    for line in f1:
        f2.write(",".join(line.split()) + "\n")
f1.close()
You can also use csv to handle the writing automatically:
import csv

f1 = open(input_file, "r")
with open("out.txt", 'w', newline='') as f2:  # newline='' avoids extra blank rows from csv.writer on Windows
    writer = csv.writer(f2)
    for line in f1:
        writer.writerow(line.split())
f1.close()
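For example, on one of the sample rows from the question, runs of spaces collapse into a single comma either way:

line = "0.0184807   0.371172"      # a sample row with several consecutive spaces
print(",".join(line.split()))      # prints: 0.0184807,0.371172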

Errors when reading column name from csv files and saving as list

I have a folder that has over 15,000 CSV files. They all have different numbers of columns.
Most files have their first row as column names (attributes of the data), like this:
Name Date Contact Email
a b c d
a2 b2 c2 d2
What I want to do is read the first row of every file, store them all as a list, and write that list out as a new CSV file.
Here is what I have done so far:
import csv
import glob

list = []
files = glob.glob('C:/example/*.csv')
for file in files:
    f = open(file)
    a = [file, f.readline()]
    list.append(a)

with open('test.csv', 'w') as testfile:
    csv_writer = csv.writer(testfile)
    for i in list:
        csv_writer.writerow(i)
When I try this code, the result comes out like this:
[('C:/example\\example.csv', 'Name,Date,Contact,Email\n'), ('C:/example\\example2.csv', 'Address,Date,Name\n')]
Therefore, in the resulting CSV, all attributes of each file go into the second column, making it look like this (for some reason, there's an empty row in between):
New CSV file made
Moreover, when going through the files, I encountered another error:
UnicodeDecodeError: 'cp949' codec can't decode byte 0xed in position 6: illegal multibyte sequence
So I included this code at the start, but it didn't work; it said the files are invalid.
import codecs
files=glob.glob('C:/example/*.csv')
fileObj = codecs.open(files, "r", "utf-8")
I read answers on Stack Overflow, but I couldn't find one related to my problem. I appreciate your answers.
Ok, so
import csv
import glob

list = []
files = glob.glob('C:/example/*.csv')
for file in files:
    f = open(file)
    a = [file, f.readline()]
    list.append(a)
Here you're opening the file and then creating a list containing the file name and the column headers as a single string (note that means they'll look like "Column1,Column2"). So: [("Filename", "Column1,Column2")]
So you're going to need to split that on the ',', like:
for file in files:
    f = open(file)
    a = [file, f.readline().split(',')]
Now we have:
["filename", ("Column1", "Column2")]
So it's still going to print to the file wrong. We need to concatenate the lists.
a = [file] + f.readline().split(',')
So we get:
["filename", "Column1", "Column2"]
And you should be closing each file after you open it with f.close(), or use a context manager inside your loop, like:
for file in files:
    with open(file) as f:
        a = [file] + f.readline().split(',')
        list.append(a)
Better solution and how I would write it:
import csv
import glob

files = glob.glob('mydir/*.csv')
lst = list()
for file in files:
    with open(file) as f:
        reader = csv.reader(f)
        lst.append(next(reader))
try:
    with open(file, 'r', encoding='utf8') as f:
        ...  # do things
except UnicodeError:
    # fall back to another encoding; cp949 here matches the codec named in the error above
    with open(file, 'r', encoding='cp949') as f:
        ...  # do things
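Putting the pieces together, here is a sketch of how the fallback could sit inside the header-collecting loop (my combination, not part of the original answer; the cp949 fallback is only a guess based on the error message quoted in the question):

import csv
import glob

files = glob.glob('C:/example/*.csv')
rows = []
for file in files:
    try:
        with open(file, encoding='utf8') as f:
            header = next(csv.reader(f))
    except UnicodeDecodeError:
        # assumption: files that are not UTF-8 use the system codepage (cp949, per the error above)
        with open(file, encoding='cp949') as f:
            header = next(csv.reader(f))
    rows.append([file] + header)

with open('test.csv', 'w', newline='', encoding='utf8') as testfile:
    csv.writer(testfile).writerows(rows)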
a little bit of tidying, proper context managing, and using csv.reader:
import csv
import glob

files = glob.glob('C:/example/*.csv')
with open('test.csv', 'w', newline='') as testfile:  # newline='' avoids the blank rows csv.writer otherwise adds on Windows
    csv_writer = csv.writer(testfile)
    for file in files:
        with open(file, 'r') as infile:
            reader = csv.reader(infile)
            headers = next(reader)
            lst = [file] + headers
            csv_writer.writerow(lst)
This will write a new CSV with one row per input file, each row being filename, column1, column2, ...

Overwrite the first and last column in csv file using python

I am new to data processing with the csv module. I have an input file, and I am using this code:
import csv

path1 = "C:\\Users\\apple\\Downloads\\Challenge\\raw\\charity.a.data"
csv_file_path = "C:\\Users\\apple\\Downloads\\Challenge\\raw\\output.csv.bak"
with open(path1, 'r') as in_file:
    in_file.__next__()
    stripped = (line.strip() for line in in_file)
    lines = (line.split(":$%:") for line in stripped if line)
    with open(csv_file_path, 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(('id', 'donor_id', 'last_name', 'first_name', 'year', 'city', 'state', 'postal_code', 'gift_amount'))
        writer.writerows(lines)
Is it possible to remove the (:) in the first and last columns of the CSV file? I want the output to look like the sample shown.
Please help me.
If you just want to eliminate the ':' in the first and last columns, this should work. Keep in mind that your dataset should be tab-separated (or separated by something other than a comma) before you read it because, as I commented on your question, there are commas ',' in your dataset.
path1 = '/path/input.csv'
path2 = '/path/output.csv'
with open(path1, 'r') as input, open(path2, 'w') as output:
    file = iter(input.readlines())
    output.write(next(file))
    for row in file:
        output.write(row[1:][:-2] + '\n')
Update
After you provided your code, I added a small change to do the whole process starting from the initial file. The idea is the same: you should just exclude the first and the last characters of each line. So instead of line.strip() you should have line.strip()[1:][:-2].
import csv

path1 = "C:\\Users\\apple\\Downloads\\Challenge\\raw\\charity.a.data"
csv_file_path = "C:\\Users\\apple\\Downloads\\Challenge\\raw\\output.csv.bak"
with open(path1, 'r') as in_file:
    in_file.__next__()
    stripped = (line.strip()[1:][:-2] for line in in_file)
    lines = (line.split(":$%:") for line in stripped if line)
    with open(csv_file_path, 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(('id', 'donor_id', 'last_name', 'first_name', 'year', 'city', 'state', 'postal_code', 'gift_amount'))
        writer.writerows(lines)
