Remove line break after CSV file aggregation - python

I am aggregating data in a CSV file; here is the code:
import pandas
df = pandas.read_csv("./input.csv", delimiter=";", low_memory=False)
df.head()
count_severity = df.groupby("PHONE")["IMEI"].unique()
has_multiple_elements = count_severity.apply(lambda x: len(x)>1)
result = count_severity[has_multiple_elements]
result.to_csv("./output.csv", sep=";")
In some lines of the resulting file, I get the following:
It turns out that the second column, the one after the ; sign, gets split across two rows.
Could you please tell me how to get rid of this line break? I tried adding the parameter line_terminator=None to result.to_csv, but it didn't help.
Any method is acceptable, even if it means overwriting this file and saving a new one. I also tried this:
output_file = open("./output.csv", "r")
contents = ''.join(output_file).replace("\n", "")
output_file_new = open("./output_new.csv", "w")
output_file_new.writelines(contents)
output_file_new.close()
But then the whole file ends up as one solid line, which is not good at all.
To summarize, I should get the format of this file:
Thank You!

If your broken lines always start with a comma, you could just replace the sequence "\n," with ",":
with open("./output.csv", "r") as file:
content = file.read()
new_content = content.replace("\n,", ",")
with open("./new_output.csv", "w") as file:
file.write(new_content)
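For the original aggregation, the wrapped lines are most likely the string form of the NumPy arrays that unique() returns, which NumPy breaks across lines once they get long. A sketch of a fix at the source (assuming a comma-joined IMEI list per phone is acceptable):
import pandas as pd

df = pd.read_csv("./input.csv", delimiter=";", low_memory=False)
count_severity = df.groupby("PHONE")["IMEI"].unique()
result = count_severity[count_severity.apply(lambda x: len(x) > 1)]
# Join each array of IMEIs into one comma-separated string,
# so to_csv writes a single clean line per phone number.
result = result.apply(lambda imeis: ",".join(map(str, imeis)))
result.to_csv("./output.csv", sep=";")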

Related

Remove everything between a specific character in a txt file

If I have a .txt file with this content:
1020,"Balance",+10000
1030,"Something",-5000
How do I remove what's in the middle, so that the only thing I'm left with is
1020,+10000
1030,-5000
If it's always at the same index:
with open('yourfile.txt', 'r') as f:
    lines = f.readlines()
output = []
for line in lines:
    temp = line.split(",")
    output.append(temp[0])
    output.append(temp[2])
print(output)
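If the goal is the two-field lines shown in the question rather than one flat list, a small variation of the same idea (the output filename is a placeholder) writes them straight back out:
with open('yourfile.txt', 'r') as f, open('result.txt', 'w') as out:
    for line in f:
        temp = line.rstrip("\n").split(",")
        # keep only the first and third comma-separated fields
        out.write(temp[0] + "," + temp[2] + "\n")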
I would approach it with a regex:
import re
string = "1030,\"Something\",-5000"
stripped = re.sub("[\"].*[\"]", "", string)
print(stripped)
This prints 1030,,-5000; from there you can remove one of the commas.
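If you'd rather not post-process the double comma, a variant of the same regex (assuming quotes never appear inside the fields themselves) can consume the trailing comma too:
import re

string = "1030,\"Something\",-5000"
# match one quoted field plus the comma that follows it
stripped = re.sub(r'"[^"]*",', '', string)
print(stripped)  # 1030,-5000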
You could import the data into a dataframe using Pandas and then delete the second column like this.
import pandas as pd
df = pd.read_csv('example.txt', header=None)
del df[1]
print(df)
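Since the question asks for a file as the end result, a possible follow-up (note that read_csv would otherwise parse +10000 as the integer 10000, dropping the plus sign, hence dtype=str; the output filename is a placeholder):
import pandas as pd

# dtype=str keeps "+10000" as text instead of parsing it to 10000
df = pd.read_csv('example.txt', header=None, dtype=str)
del df[1]
df.to_csv('output.txt', header=False, index=False)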
You can use the csv module for this task:
import csv
def removeColumn(fn1, fn2, idx=1):
    with open(fn1, "r", newline="") as csvfile1:
        reader = csv.reader(csvfile1)
        with open(fn2, "w", newline="") as csvfile2:
            writer = csv.writer(csvfile2)
            for row in reader:
                # keep every field except the one at position idx
                writer.writerow(row[:idx] + row[idx+1:])
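Called like this (filenames are placeholders), it drops the second column (idx=1):
removeColumn("example.txt", "example_trimmed.txt", idx=1)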

How do I write date, time, serial, part, etc... to csv file using Python?

The code works properly up to entering the for loop, and the date value is fetched. After that, it returns an empty list for the rest of the variables, like time, ref1, seriel and so on.
import pandas as pd
import re
# Create a DataFrame from CSV
my_dataframe = pd.read_csv('C:/Users/WI/Desktop/file.csv')
# Drop rows with any empty cells
my_dataframe.dropna(axis=0, how='any', thresh=None, subset=['date'], inplace=False)
with open("C:/Users/WDSI/Desktop/OutputFile.txt", "w") as F:
    F.write("%s" % my_dataframe)
fin = open("C:/Users/WDSI/Desktop/OutputFile.txt", "r")
# print("Input file is taken")
fout = open("C:/Users/WDSI/Desktop/OutputFile1.txt", "w")
# print("Output file is taken")
for line in fin:
    date = re.findall(r'(\d{4}-\d{2}-\d{2})', fin.read())
    time = re.findall(r'(\s\d{2}:\d{2}:\d{2})', fin.read())
    seriel = re.findall(r'(\s[A-Z][A-Z][A-Z][0-9])', fin.read())
    part = re.findall(r'(\s[0-9][0-9][0-9][A-Z][0-9][0-9][0-9][0-9][0-9])', fin.read())
    ref1 = re.findall(r'(\s\d{16})', fin.read())
    ref3 = re.findall(r'(\d{9})+$', fin.read())
    #print(date)
    #print(time)
    #print(seriel)
    #print(part)
    #print(ref1)
    #print(ref3)
    fout.write("%10s,%8s" % ((date, time)))
fout.close()
When we run this code, only the date variable gets a value; the other variables, like time and ref1, all come out empty. Please also help me write date, time, seriel, part, ref1 and ref3 from each row of the csv file; the output file should be written in this format.
You are reading line by line with for line in fin, but then all your findall calls read the remaining file content with fin.read(). The first such call consumes everything that is left, so the later calls get an empty string.
You can either process line by line (replace those fin.read() calls with line):
for line in fin:
    date = re.findall(r'(\d{4}-\d{2}-\d{2})', line)
    ...
Or read the whole file and remove the for:
content = fin.read()
date = re.findall(r'(\d{4}-\d{2}-\d{2})', content)
...
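Putting the line-by-line fix together with the write step (a sketch that keeps the question's regexes; each findall returns a list, so the first match per line is taken and left blank when absent):
import re

with open("C:/Users/WDSI/Desktop/OutputFile.txt") as fin, \
        open("C:/Users/WDSI/Desktop/OutputFile1.txt", "w") as fout:
    for line in fin:
        matches = [
            re.findall(r'(\d{4}-\d{2}-\d{2})', line),       # date
            re.findall(r'(\s\d{2}:\d{2}:\d{2})', line),     # time
            re.findall(r'(\s[A-Z][A-Z][A-Z][0-9])', line),  # seriel
            re.findall(r'(\s[0-9][0-9][0-9][A-Z][0-9][0-9][0-9][0-9][0-9])', line),  # part
            re.findall(r'(\s\d{16})', line),                # ref1
            re.findall(r'(\d{9})+$', line),                 # ref3
        ]
        # findall returns lists; take the first hit, blank if none
        fields = [m[0].strip() if m else "" for m in matches]
        fout.write(",".join(fields) + "\n")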
It is not an exact replica of your solution, but this is how you can open a file, take whatever you need from each line, and then write the new data to a new file.
I have created a csv file with the following lines:
This is a date 2019-08-05, 2019-09-03
This is a email asdfasdf@abc.com
Solution 1:
import re

with open("./Datalake/output.txt", "w+") as wf:
    with open("./Datalake/test.csv") as f:
        for line in f:
            dates = re.findall(r"\d{4}-\d{1,2}-\d{1,2}", line)
            dates = "|".join(dates)
            emails = re.findall(r'[\w\.-]+@[\w\.-]+', line)
            emails = "|".join(emails)
            extracted_line = "{}, {}\n".format(dates, emails)
            wf.write(extracted_line)
            print(extracted_line)
Solution 2:
You can extract directly from the data frame. Apply the same search using a lambda function, which will execute for each row. But be careful: you might need some error handling, because the lambda will throw an error if there is a None value in the column. Convert the column to str before applying the lambda.
df = pd.read_csv("./Datalake/test.csv", sep='\n', header=None, names=["string_col"])
df['dates'] = df["string_col"].apply(lambda x: re.findall(r"\d{4}-\d{1,2}-\d{1,2}", x))
df['emails'] = df["string_col"].apply(lambda x: re.findall(r"[\w\.-]+@[\w\.-]+", x))
In that case the calculated columns will be Python lists, so you might consider using ''.join() in the lambda to turn them into text, as in the sketch below.
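A sketch of that joined-text variant (astype(str) guards against NaN, as mentioned above):
df["string_col"] = df["string_col"].astype(str)
df["dates"] = df["string_col"].apply(
    lambda x: "|".join(re.findall(r"\d{4}-\d{1,2}-\d{1,2}", x)))
df["emails"] = df["string_col"].apply(
    lambda x: "|".join(re.findall(r"[\w\.-]+@[\w\.-]+", x)))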

Python: Replace string in a txt file but not on every occurrence

I am really new to Python and I need to change new artikel IDs to the old ones. The IDs are mapped inside a dict. The file I need to edit is a normal txt where every column is separated by tabs. The problem is not replacing the values, but rather replacing only the occurrences in the desired column, which is set by pos.
I really would appreciate some help.
def replaceArtCol(filename, pos):
    with open(filename) as input_file, open('test.txt', 'w') as output_file:
        for each_line in input_file:
            val = each_line.split("\t")[pos]
            for row in artikel_ID:
                if each_line[pos] == pos:
                    line = each_line.replace(val, artikel_ID[val])
                    output_file.write(line)
This code just replaces every occurrence of the string in the text file.
Supposing your ID mapping dict looks like ID_mapping = {'old_id': 'new_id'}, I think your code is not far from working correctly. A modified version could look like this:
with open(filename) as input_file, open('test.txt', 'w') as output_file:
    for each_line in input_file:
        line = each_line.split("\t")
        if line[pos] in ID_mapping.keys():
            line[pos] = ID_mapping[line[pos]]
        line = '\t'.join(line)
        output_file.write(line)
If you're not working in pandas anyway, this can save a lot of overhead.
If your data is tab-separated, then you should load it into a dataframe; this way you get a columns-and-rows structure. What you are doing right now will not let you do what you want without some complex and buggy logic. You may try these steps:
import pandas as pd
df = pd.read_csv("dummy.txt", sep="\t", encoding="latin-1")
df['desired_column_name'] = df['desired_column_name'].replace({"value_to_be_changed": "newvalue"})
print(df.head())
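To write the edited table back to a tab-separated file (the output name is a placeholder), presumably something like:
df.to_csv("dummy_fixed.txt", sep="\t", index=False, encoding="latin-1")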

Writing a pandas DataFrame into a csv file with some empty rows

I create a one-column pandas DataFrame that contains only strings. One row is empty. When I write the file to disk, the empty row gets an empty pair of quotes "", while I want no quotes at all. Here's how to replicate the issue:
import pandas as pd
df = "Name=Test\n\n[Actual Values]\nLength=12\n"
df = pd.DataFrame(df.split("\n"))
df.to_csv("C:/Users/Max/Desktop/Test.txt", header=False, index=False)
The output file should be like this:
Name=Test
[Actual Values]
Length=12
But instead is like this:
Name=Test
[Actual Values]
""
Length=12
Is there a way to instruct pandas not to write the quotes and to leave an empty row in the output text file? Thank you a lot.
There is a parameter for DataFrame.to_csv called na_rep. If you have None values, it will replace them with whatever you pass into this field.
import pandas as pd
df = "Name=Test\n"
df += "\n[Actual Values]\n"
df += "Length=12\n"
df = pd.DataFrame(df.split("\n"))
df[df[0]==""] = None
df.to_csv("pandas_test.txt", header=False, index=False, na_rep=" ")
Unfortunately, it looks like passing in na_rep="" will print quotes into the csv. However, if you pass in a single space (na_rep=" ") it looks better aesthetically...
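Another option that may help here is to disable quoting entirely with the csv module's QUOTE_NONE constant, which to_csv accepts; this should leave the empty row truly empty (assuming no value contains the separator, since QUOTE_NONE has no way to escape it without an escapechar):
import csv
import pandas as pd

text = "Name=Test\n\n[Actual Values]\nLength=12\n"
df = pd.DataFrame(text.split("\n"))
# QUOTE_NONE: never wrap fields in quotes, so the empty row stays empty
df.to_csv("pandas_test.txt", header=False, index=False, quoting=csv.QUOTE_NONE)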
Of course you could always write your own function to output a csv, or simply replace the "" in the output file using:
f = open(filename, 'r')
text = f.read()
f.close()
text = text.replace("\"\"","")
f = open(filename, 'w')
f.write(text)
f.close()
And here's how you could write your own to_csv() method:
def to_csv(df, filename, separator):
    f = open(filename, 'w')
    for row in df.values:
        for value in row:
            f.write(value + separator)
    f.close()

Delete blank rows from CSV?

I have a large csv file in which some rows are entirely blank. How do I use Python to delete all blank rows from the csv?
After all your suggestions, this is what I have so far
import csv
# open input csv for reading
inputCSV = open(r'C:\input.csv', 'rb')
# create output csv for writing
outputCSV = open(r'C:\OUTPUT.csv', 'wb')
# prepare output csv for appending
appendCSV = open(r'C:\OUTPUT.csv', 'ab')
# create reader object
cr = csv.reader(inputCSV, dialect = 'excel')
# create writer object
cw = csv.writer(outputCSV, dialect = 'excel')
# create writer object for append
ca = csv.writer(appendCSV, dialect = 'excel')
# add pre-defined fields
cw.writerow(['FIELD1_','FIELD2_','FIELD3_','FIELD4_'])
# delete existing field names in input CSV
# ???????????????????????????
# loop through input csv, check for blanks, and write all changes to append csv
for row in cr:
    if row or any(row) or any(field.strip() for field in row):
        ca.writerow(row)
# close files
inputCSV.close()
outputCSV.close()
appendCSV.close()
Is this ok or is there a better way to do this?
Use the csv module:
import csv
...
with open(in_fnam, newline='') as in_file:
    with open(out_fnam, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        for row in csv.reader(in_file):
            if row:
                writer.writerow(row)
If you also need to remove rows where all of the fields are empty, change the if row: line to:
if any(row):
And if you also want to treat fields that consist of only whitespace as empty, you can replace it with:
if any(field.strip() for field in row):
Note that in Python 2.x and earlier, the csv module expected binary files, so you'd need to open your files with the 'b' flag. In 3.x, doing this will result in an error.
Surprised that nobody here mentioned pandas. Here is a possible solution.
import pandas as pd
df = pd.read_csv('input.csv')
df.to_csv('output.csv', index=False)
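This works because read_csv skips fully blank lines by default; spelling the parameter out makes the intent explicit:
import pandas as pd

# skip_blank_lines=True is already the default; shown here for clarity
df = pd.read_csv('input.csv', skip_blank_lines=True)
df.to_csv('output.csv', index=False)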
Delete empty rows from a .csv file using Python:
import csv
...
with open('demo004.csv') as input, open('demo005.csv', 'w', newline='') as output:
    writer = csv.writer(output)
    for row in csv.reader(input):
        if any(field.strip() for field in row):
            writer.writerow(row)
Thank you
You have to open a second file, write all non-blank lines to it, delete the original file and rename the second file to the original name.
EDIT: a truly blank line will look like '\n':
for line in f1.readlines():
    if line.strip() == '':
        continue
    f2.write(line)
A line with all blank fields would look like ',,,,,\n'. If you consider this a blank line too:
for line in f1.readlines():
    if ''.join(line.split(',')).strip() == '':
        continue
    f2.write(line)
Opening, closing, deleting and renaming the files is left as an exercise for you (hint: import os, help(open), help(os.rename), help(os.unlink)); one possible version follows below.
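For completeness, one possible version of that exercise (filenames are placeholders; os.replace handles the delete-and-rename in one step):
import os

with open('data.csv') as f1, open('data.csv.tmp', 'w') as f2:
    for line in f1:
        if line.strip() == '':
            continue
        f2.write(line)
# swap the cleaned copy in place of the original
os.replace('data.csv.tmp', 'data.csv')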
EDIT2: Laurence Gonsalves brought to my attention that a valid csv file could have blank lines embedded in quoted csv fields, like 1, 'this\n\nis tricky',123.45. In this case the csv module will take care of that for you. I'm sorry Laurence, your answer deserved to be accepted. The csv module will also address the concerns about a line like "","",""\n.
Doing it with pandas is very simple. Open your csv file with pandas:
import pandas as pd
df = pd.read_csv("example.csv")
# Check the number of empty cells in each column of the csv file
print(df.isnull().sum())
# Drop the rows that contain empty cells
modifiedDF = df.dropna()
# Save the result to a csv file
modifiedDF.to_csv('modifiedExample.csv', index=False)
Python code to remove blank lines from a csv file without creating another file:
import csv

def ReadWriteconfig_file(file):
    try:
        file_object = open(file, 'r')
        lines = csv.reader(file_object, delimiter=',', quotechar='"')
        flag = 0
        data = []
        for line in lines:
            if line == []:
                flag = 1
                continue
            else:
                data.append(line)
        file_object.close()
        if flag == 1:  # a blank line is present in the file
            file_object = open(file, 'w')
            for line in data:
                str1 = ','.join(line)
                file_object.write(str1 + "\n")
            file_object.close()
    except Exception as e:
        print(e)
Here is a solution using pandas that removes blank rows.
import pandas as pd
df = pd.read_csv('input.csv')
df.dropna(axis=0, how='all',inplace=True)
df.to_csv('output.csv', index=False)
I need to do this but not have a blank row written at the end of the CSV file like this code unfortunately does (which is also what Excel does if you Save-> .csv). My (even simpler) code using the CSV module does this too:
import csv
input = open("M51_csv_proc.csv", 'rb')
output = open("dumpFile.csv", 'wb')
writer = csv.writer(output)
for row in csv.reader(input):
    writer.writerow(row)
input.close()
output.close()
M51_csv_proc.csv has exactly 125 rows; the program always outputs 126 rows, the last one being blank.
I've been through all these threads and nothing seems to change this behaviour.
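A trailing newline after the last data row is normal and most tools expect it; if extra blank rows appear on Windows, the usual culprit is newline translation in text mode, which Python 3's newline='' (or binary mode in Python 2) avoids. A sketch:
import csv

# newline='' stops Windows from turning csv's \r\n into \r\r\n,
# which viewers display as an extra blank row
with open("M51_csv_proc.csv", newline='') as inp, \
        open("dumpFile.csv", "w", newline='') as out:
    writer = csv.writer(out)
    for row in csv.reader(inp):
        writer.writerow(row)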
In this script, all the CR/CRLF characters are removed from a CSV file that has lines like this:
"My name";mail@mail.com;"This is a comment.
Thanks!"
Execute the script https://github.com/eoconsulting/lr2excelcsv/blob/master/lr2excelcsv.py
Result (in Excel CSV format):
"My name",mail@mail.com,"This is a comment. Thanks!"
Replace PATH_TO_YOUR_CSV with your own path:
import pandas as pd
df = pd.read_csv('PATH_TO_YOUR_CSV')
new_df = df.dropna()
new_df.to_csv('output.csv', index=False)
or in-line:
import pandas as pd
pd.read_csv('data.csv').dropna().to_csv('output.csv', index=False)
I had the same problem. I converted the .csv file to a dataframe and then converted the dataframe back to a .csv file. The initial .csv file with the blank lines was 'csv_file_logger2.csv'. So I did the following:
import pandas as pd

df = pd.read_csv('csv_file_logger2.csv')
df.to_csv('out2.csv', index=False)
