Remove &#13 from end of strings in list - python

I am using the following Python code to extract all of the file names from a specific csv file. It works, but the output contains "&#13;" (the HTML rendering of the carriage-return character, `\r`) at the end of each string in the list. I am looking for a way to remove it, but am having trouble because I have to remove it from each item in the list, rather than just from one string:
import re
import csv
import os

# Capture the text after the last backslash of a "File Name:" line.  The
# optional \r before end-of-line swallows the carriage return that shows up
# as "&#13;" in the output (the file has Windows \r\n line endings).
FILE_NAME_RE = re.compile(r'\bFile Name:\s+.*\\(.*?)\r?$', re.MULTILINE)

def extract_file_names(text):
    """Return the base file names found on 'File Name:' lines of *text*.

    Works on any platform: the base name is taken by the regex itself, so
    we do not rely on os.path.basename understanding backslashes.
    """
    return FILE_NAME_RE.findall(text)

if __name__ == "__main__":
    with open('file.txt') as textfile:
        file_list = extract_file_names(textfile.read())
    # Python 3 csv: open the file in text mode with newline='' (the old
    # 'wb' mode and delimiter='\n' were Python 2 workarounds).
    with open("File-Original.csv", "w", newline='') as out:
        out_file = csv.writer(out, quoting=csv.QUOTE_MINIMAL)
        # One file name per row, matching the original one-per-line layout.
        out_file.writerows([name] for name in file_list)

Related

Python: iterating elements from a list to a file

What I'm trying to do is basically read a csv file (one column only) into a list. Then I need to write the list out 11 elements at a time (one element = a 9-digit number), comma-separated, as one row ending with a newline — all into another text file. 11 elements in a row fit an A4 sheet. Then iterate over the remaining elements in the list. I can't figure out how. Below is the code I'm working on:
ROW_SIZE = 11  # 11 nine-digit numbers per printed row fit on an A4 sheet

def rows_of(values, size=ROW_SIZE):
    """Split *values* into comma-joined lines of at most *size* items.

    rows_of(['1'..'13']) -> ['1,...,11', '12,13']
    """
    return [','.join(values[i:i + size]) for i in range(0, len(values), size)]

if __name__ == "__main__":
    textfile = 'texstf.txt'
    filepath = 'testfile2.csv'
    with open(filepath, "r") as f:
        lines = [line.rstrip() for line in f]
    # Open the output exactly once.  The original re-opened the file in 'w'
    # mode inside a loop, truncating it on every iteration, and its
    # count/while logic never actually chunked the list into rows of 11.
    with open(textfile, "w") as myfile:
        myfile.write('\n'.join(rows_of(lines)) + '\n')
csv sample:
6381473
6381783
6381814
...
expected output to file sample:
6381473,6381783,6381814
I ran your code and it looks like it is working. If you could provide any more context of what specifically is not working with your code such as error messages, that would be helpful. Make sure you have the correct filepath for each file you are trying to read and write.
Here is an alternative way to do this based off of this similar question:
import csv
import os

def format_rows(rows):
    """Join each row's fields with spaces and terminate each with a comma.

    Matches the original output format exactly, including the trailing comma.
    """
    return ''.join(' '.join(row) + ',' for row in rows)

if __name__ == "__main__":
    textfile = os.path.join(os.getcwd(), 'texstf.txt')
    filepath = os.path.join(os.getcwd(), 'testfile2.csv')
    # Plain statements do the writing; the original abused a list
    # comprehension for its side effect and called close() redundantly
    # inside the 'with' block that already closes the file.
    with open(filepath, "r", newline='') as my_input_file, \
         open(textfile, "w") as my_output_file:
        my_output_file.write(format_rows(csv.reader(my_input_file)))

Removing New Line from CSV Files using Python

I obtain multiple CSV files from API, in which I need to remove New Lines present in the CSV and join the record, consider the data provided below;
My Code to remove the New Line:
## Loading necessary libraries
import glob
import os
import shutil
import csv

def strip_newlines(rows):
    """Return *rows* with embedded newline characters removed from every field."""
    return [[field.replace("\n", "") for field in row] for row in rows]

if __name__ == "__main__":
    ## Assigning necessary paths
    source_path = "/home/Desktop/Space/"
    dest_path = "/home/Desktop/Output/"
    # file_read path used to modify the copied CSV files
    file_read_path = "/home/Desktop/Output/*.csv"
    ## Copy .csv files from the source folder into the output folder
    for csv_file in glob.iglob(os.path.join(source_path, "*.csv"), recursive=True):
        shutil.copy(csv_file, dest_path)
    ## Rewrite each copied file with embedded newlines removed
    for filename in glob.glob(file_read_path):
        with open(filename, "r", encoding='ISO-8859-1') as file:
            rows = strip_newlines(csv.reader(file, delimiter=","))
        # Write back with the SAME encoding we read with; the original wrote
        # with the platform default, silently transcoding the data.  newline=''
        # is required for csv output files in Python 3.
        with open(filename, "w", encoding='ISO-8859-1', newline='') as output:
            writer = csv.writer(output, delimiter=",", dialect='unix')
            writer.writerows(rows)
I actually copy the CSV files into a new folder and then use the above code to remove any new line present in the file.
You are fixing the CSV files because they contain stray \n characters. The problem is knowing whether a given line is a continuation of the previous line or the start of a new record. If every record starts with a specific prefix — like SV_a5d15EwfI8Zk1Zr in your example, or just SV_ — you can do something like this:
import glob

def merge_broken_lines(lines, prefix='SV_'):
    """Rejoin records that were split across physical lines.

    A physical line starts a new record only when it begins with *prefix*;
    otherwise it is glued onto the previous record.  Returns the repaired
    records without trailing newlines.
    """
    records = []
    current = ''
    for line in lines:
        # rstrip('\n') instead of line[:-1]: the original chopped the last
        # character of a final line that had no trailing newline.
        chunk = line.rstrip('\n')
        if line.startswith(prefix):
            if current:
                records.append(current)
            current = chunk
        else:
            # continuation of a record broken by a stray newline
            current += chunk
    if current:  # flush the last record; guard avoids a stray blank line
        records.append(current)
    return records

if __name__ == "__main__":
    # FIX PART: each ./*.csv (e.g. data.csv) is repaired into <name>.FIXED
    file_read_path = "./*.csv"
    for filename in glob.glob(file_read_path):
        with open(filename, "r", encoding='ISO-8859-1') as file, \
                open(filename + '.FIXED', "w", encoding='ISO-8859-1') as target:
            for record in merge_broken_lines(file):
                target.write(record + '\n')
Output:
SV_a5d15EwfI8Zk1Zr;QID4;"<span style=""font-size:16px;""><strong>HOUR</strong> Interview completed at:</span>";HOUR;TE;SL;;;true;ValidNumber;0;23.0;0.0;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID6;"<span style=""font-size:16px;""><strong>MINUTE</strong> Interview completed:</span>";MIN;TE;SL;;;true;ValidNumber;0;59.0;0.0;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID8;Number of Refusals - no language<br />For <strong>Zero Refusals - no language</strong> use 0;REFUSAL1;TE;SL;;;true;ValidNumber;0;99.0;0.0;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID10;<strong>DAY OF WEEK:</strong>;WEEKDAY;MC;SACOL;TX;;true;;0;;;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID45;"<span style=""font-size:16px;"">Using points from 0 to 10, how likely would you be recommend Gatwick Airport to a friend or colleague?</span><div> </div>";NPSCORE;MC;NPS;;;true;;0;;;882;-873;
EDITS:
This can be done more simply using split, too; this version fixes the file itself:
import glob

# FIX PART: every ./*.csv file is repaired in place.
# Assumption: every real record begins with the SV_ keyword, so splitting
# the whole file on it isolates one (possibly newline-broken) record per piece.
file_read_path = "./*.csv"
STARTING_KEYWORD = 'SV_'
for filename in glob.glob(file_read_path):
    with open(filename, "r", encoding='ISO-8859-1') as handle:
        pieces = handle.read().split(STARTING_KEYWORD)
    # Drop the empty leading piece, strip stray newlines inside each record,
    # and put the keyword back on the front of every record.
    repaired = [STARTING_KEYWORD + piece.replace('\n', '') for piece in pieces if piece]
    with open(filename, 'w', encoding='ISO-8859-1') as handle:
        handle.write('\n'.join(repaired))
Well I'm not sure on the restrictions you have. But if you can use the pandas library , this is simple.
import pandas as pd

def remove_blank_lines(data_file, target_file):
    """Copy CSV *data_file* to *target_file* with blank lines dropped.

    skip_blank_lines=True makes pandas ignore fully empty rows on read.
    NOTE(review): this does NOT rejoin newlines embedded inside quoted
    fields — those survive the round trip.
    """
    data_set = pd.read_csv(data_file, skip_blank_lines=True)
    data_set.to_csv(target_file, index=False)
This will create a CSV file with all blank lines removed. You can save a lot of time by using available libraries.

How to save an edited txt file into a new txt file?

I am trying to save my output from x .txt files in only one .txt file.
The .txt file should look like the output as you can see in the picture below.
What this program actually does is read a couple of .txt files with tons of data which I filter out using regex.
My source code:
import os,glob
import re

# 72 tab-separated fields: an integer counter, then 71 decimal-comma numbers.
VALUES_RE = re.compile(r'\t\d+\t-?\d+,?\d*(\t-?\d+,?\d+){71}')

def extract_values(line):
    """Return the measurement fields of *line*, or None when it has none."""
    match = VALUES_RE.search(line)
    if not match:
        return None
    values = match.group(0).split('\t')
    # The match begins with a tab, so the first split element is empty.
    assert values[0] == ''
    return values[1:]

if __name__ == "__main__":
    folder_path = r"C:\Users\yokay\Desktop\DMS\Messdaten_DMT"
    for filename in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(filename) as lines:
            for line in lines:
                values = extract_values(line)
                if values is not None:
                    print(values)
Thank you for your time! :)
Then you just need to open a file and write the values to it. Try this. You might need to adjust the formatting (I cannot test it since I don't have your text files). I am assuming the output you have in values is correct. Keep in mind that the file is opened in append mode, so if you run the script more than once you will get duplicates.
import os,glob
import re

# 72 tab-separated fields: an integer counter, then 71 decimal-comma numbers.
VALUES_RE = re.compile(r'\t\d+\t-?\d+,?\d*(\t-?\d+,?\d+){71}')

def matched_values(line):
    """Return the tab-separated fields matched in *line*, or None."""
    match = VALUES_RE.search(line)
    if not match:
        return None
    values = match.group(0).split('\t')
    # The match begins with a tab, so the first split element is empty.
    assert values[0] == ''
    return values[1:]

if __name__ == "__main__":
    folder_path = r"C:\Users\yokay\Desktop\DMS\Messdaten_DMT"
    # Open the output once via a context manager (the original handle was
    # never closed).  Mode "a" appends, so re-running duplicates rows.
    with open("myOutFile.txt", "a") as outF:
        for filename in glob.glob(os.path.join(folder_path, '*.txt')):
            with open(filename) as lines:
                for line in lines:
                    values = matched_values(line)
                    if values is not None:
                        # write() needs a string — the original passed the
                        # list itself, raising TypeError.  Re-join with tabs.
                        outF.write('\t'.join(values) + '\n')
                        print(values)

Removing punctuation and change to lowercase in python CSV file

The code below allow me to open the CSV file and change all the texts to lowercase. However, i have difficulties trying to also remove the punctuation in the CSV file. How can i do that? Do i use string.punctuation?
def lowered_sorted(lines):
    """Return *lines* lowercased and sorted."""
    return sorted(line.lower() for line in lines)

if __name__ == "__main__":
    # Read first with a context manager so the handle is closed before the
    # same file is reopened for writing (the original leaked it).
    with open('names.csv', 'r') as src:
        lines = lowered_sorted(src)
    with open('names.csv', 'w') as out:  # original was missing the ':' after 'as out'
        out.writelines(lines)
    print(lines)
sample of my few lines from the file:
Justine_123
ANDY*#3
ADRIAN
hEnNy!
You can achieve this by importing the string module and making use of the example code below.
The other way you can achieve this is by using regex.
import string

# str.translate(None, chars) is the Python 2 bytes API; on Python 3 it
# raises TypeError.  Python 3 uses a mapping built with str.maketrans,
# where mapping characters to None deletes them.
PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed."""
    return text.translate(PUNCT_TABLE)
Also you may want to learn more about how import string works and its features
The working example you requested for.
import string

# Lowercase every line of the sample file.  Indentation restored: the body
# of the 'with' block must be indented (it was flattened in the question).
with open("sample.csv") as csvfile:
    lines = [line.lower() for line in csvfile]
print(lines)
will give you ['justine_123\n', 'andy*#3\n', 'adrian\n', 'henny!']
import string

# str.maketrans maps each punctuation character to None, which deletes it.
punc_table = str.maketrans({key: None for key in string.punctuation})

def strip_punctuation(lines):
    """Return str(lines) with all ASCII punctuation characters deleted.

    The list is stringified first, so list punctuation ([, ], quotes,
    commas) and the backslash of each '\\n' escape are removed too —
    which is why 'justine_123\\n' becomes 'justine123n'.
    """
    new_res = str(lines).translate(punc_table)
    return new_res
The result in new_res will be justine123n andy3n adriann henny
Example with regular expressions.
import csv
import re

filename = 'names.csv'

def reg_test(name):
    """Read CSV file *name* and return its rows stripped to alphanumerics,
    each joined with (and terminated by) a comma.

    Example: a file containing 'Justine_123' and 'ANDY*#3' yields
    'Justine123,ANDY3,'.
    """
    reg_result = ''
    with open(name, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            # str(row) stringifies the whole field list; the substitution
            # then strips its brackets/quotes along with the punctuation.
            cleaned = re.sub('[^A-Za-z0-9]+', '', str(row))
            reg_result += cleaned + ','
    return reg_result

if __name__ == "__main__":
    # Guarded so importing this module no longer runs the file I/O.
    print(reg_test(filename).lower())
justine123,andy3,adrian,henny,

Errors when reading column name from csv files and saving as list

I have a folder that has over 15,000 csv files. They all have different number of column names.
Most files have its first row as a column name (attribute of data) like this :
Name Date Contact Email
a b c d
a2 b2 c2 d2
What I want to do is read first row of all files, store them as a list, and write that list as new csv file.
Here is what I have done so far :
import csv
import glob

def collect_headers(paths):
    """Return [path, first-line] pairs for every file in *paths*."""
    collected = []  # renamed: the original shadowed the builtin 'list'
    for path in paths:
        # Context manager closes each file; the original leaked every handle.
        with open(path) as f:
            collected.append([path, f.readline()])
    return collected

if __name__ == "__main__":
    header_rows = collect_headers(glob.glob('C:/example/*.csv'))
    # newline='' prevents the blank row between records on Windows —
    # the "empty row between" the question describes.
    with open('test.csv', 'w', newline='') as testfile:
        csv_writer = csv.writer(testfile)
        for row in header_rows:
            csv_writer.writerow(row)
When I try this code, result comes out like this :
[('C:/example\\example.csv', 'Name,Date,Contact,Email\n'), ('C:/example\\example2.csv', 'Address,Date,Name\n')]
Therefore in a made csv, all attributes of each file go into second column making it look like this (for some reason, there's a empty row between) :
New CSV file made
Moreover, while going through the files, I encountered another error:
UnicodeDecodeError: 'cp949' codec can't decode byte 0xed in position 6: illegal multibyte sequence
So I included this code in first line but it didn't work saying files are invalid.
import codecs
files=glob.glob('C:/example/*.csv')
fileObj = codecs.open( files, "r", "utf-8" )
I read answers on Stack Overflow but I couldn't find one related to my problem. I appreciate your answers.
Ok, so
import csv
import glob
list=[]
files=glob.glob('C:/example/*.csv')
for file in files :
f = open(file)
a=[file,f.readline()]
list.append(a)
here you're opening the file and then creating a list with the column headers as a string(note that means they'll look like "Column1,Column2") and the file name. So [("Filename", "Column1, Column2")]
so you're going to need to split that on the ',' like:
# NOTE(review): illustrative fragment — the indentation was lost in
# extraction; the two lines below belong inside the for-loop body.
for file in files :
f = open(file)  # handle is never closed here; the 'with' variant fixes that
a=[file] + f.readline().split(',')  # yields [filename, col1, col2, ...]
Without the split-and-concatenate, we would have a nested structure like ["filename", ["Column1", "Column2\n"]] — which would still print to the file wrong. We need to concatenate the lists, which the + in the line above already does:
a=[file] + f.readline().split(',')
So we get:
["filename", "Column1", "Column2"]
And you should be closing each file after you open it with f.close() or use a context manager inside your loop like:
# Iterate the input files, reading each header under a context manager.
for file in files :
    with open(file) as f:
        # readline() returns a string, and list + str raises TypeError —
        # the original fragment dropped the .split(',') call.
        a = [file] + f.readline().split(',')
        list.append(a)
Better solution and how I would write it:
import csv
import glob

def read_header(path):
    """Return the first CSV row of *path* as a list of fields ([] if empty)."""
    with open(path) as f:
        # The [] default guards against StopIteration on an empty file,
        # which a bare next(reader) would raise.
        return next(csv.reader(f), [])

files = glob.glob('mydir/*.csv')
lst = [read_header(file) for file in files]
try:
    # Keyword syntax fixed: the original "'r'.encoding='utf8'" is invalid.
    # The read must happen inside the try for the decode error to be caught.
    with open(files, 'r', encoding='utf8') as f:
        ...  # do things
except UnicodeError:
    # Fall back to a different codec; the original retried with utf8 again,
    # which would fail identically.  cp949 matches the error in the question.
    with open(files, 'r', encoding='cp949') as f:
        ...  # do things
a little bit of tidying, proper context managing, and using csv.reader:
import csv
import glob

def write_header_rows(in_paths, out_path):
    """Write one CSV row per input file to *out_path*: [filename, col1, col2, ...]."""
    # newline='' is required for csv output files on Python 3 (avoids \r\r\n).
    with open(out_path, 'w', newline='') as testfile:
        csv_writer = csv.writer(testfile)
        for path in in_paths:
            with open(path, 'r') as infile:
                # Header row only; [] default guards against empty files.
                headers = next(csv.reader(infile), [])
            # The original called writer.writerow, but the object was named
            # csv_writer — a NameError at runtime.
            csv_writer.writerow([path] + headers)

if __name__ == "__main__":
    write_header_rows(glob.glob('C:/example/*.csv'), 'test.csv')
this will write a new csv with one row per infile, each row being filename, column1, column2, ...

Categories