Read, then Write CSV with "Non-ISO extended-ASCII" text Encoding - python

My csv has strings like:
TîezÑnmidnan
I'm trying to use the following below to set up a reader/writer
import csv
# File that will be written to
csv_output_file = open(file, 'w', encoding='utf-8')
# File that will be read in
csv_file = open(filename, encoding='utf-8', errors='ignore')
# Define reader
csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
# Define writer
csv_writer = csv.writer(csv_output_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
Then iterating over the information read in
# Iterate over the rows in the csv
for idx, row in enumerate(csv_reader):
csv_writer.writerow(row[0:30])
Problem is in my output file I can't get it to show up with that same string. According to my mac, the csv file type has the encoding "Non-ISO extended-ASCII"
I tried various encodings and some would just remove the special characters while others just wouldn't work.
It's weird because I can hard code that string above into a variable and use it without problems, so I assume it's something to do with how I'm reading in the file. If I breakpoint before it writes it shows up as the following in the debugger.
T�ez�nmidnan
I can't convert the file before running it, so the python code has to handle any conversions itself.
The expected output I want would be for it to remain in the output file looking like
TîezÑnmidnan
Adding a link to a sample csv that shows the issue along with a complete version of my code (with some details removed)
Example file to run with this
import tkinter as tk
from tkinter.filedialog import askopenfilename
import csv
import os
root = tk.Tk()
root.withdraw()
# Ask for file
filename = os.path.abspath(askopenfilename(initialdir="/", title="Select csv file", filetypes=(("CSV Files", "*.csv"),)))
# Set output file name
output_name = filename.rsplit('.')
del output_name[len(output_name) - 1]
output_name = "".join(output_name)
output_name += "_processed.csv"
# Using the file that will be written to
csv_output_file = open(os.path.abspath(output_name), 'w', encoding='utf-8')
# Using the file is be read in
csv_file = open(filename, encoding='utf-8', errors='ignore')
# Define reader with , delimiter
csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
# Define writer to put quotes around input values with a comma in them
csv_writer = csv.writer(csv_output_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
header_row = []
# Iterate over the rows in the csv
for idx, row in enumerate(csv_reader):
if idx != 0:
csv_writer.writerow(row)
else:
header_row = row
csv_writer.writerow(header_row)
csv_file.flush()
csv_output_file.flush()
csv_file.close()
csv_output_file.close()
Expected results
Header1,Header2
Value1,TîezÑnmidnan
Actual results
Header1,Header2
Value1,Teznmidnan
Edit:
chardetect gave me "utf-8 with confidence 0.99"

Related

Why is my code not working while converting bulk csv to json?

There are two CSV files. I need to convert to JSON. Code is below
import csv
import json
import os
import glob
os.chdir(r'C:\Users\user\Desktop\test' )
result = glob.glob( '*.csv' )
print (result)
def make_json(csvFile, jsonFile):
csvFile, jsonFile = '',''
for i in result:
data = {}
with open(csvFile, encoding='utf-8') as csvf:
csvReader = csv.DictReader(csvf)
for rows in csvReader:
key = rows['id']
data[key] = rows
with open(jsonFile, 'w', encoding='utf-8') as jsonf:
jsonf.write(json.dumps(data, indent=4))
csvFilePath =f"{i}"
jsonFilePath =f"{i.split('.')[-2]}.json"
make_json(csvFile, jsonFile)
I got error > csvFile is not mentioned. But the third line from the end mentions the CSV file.
Disclaimer. Please find the error in the code. I already know of the working code which is in pandas
Below is the correct code, but I would recommend you learn to use the python debugger so you can resolve any logic flaws in your code next time. Documentation on the python debugger can be found here:
https://docs.python.org/3/library/pdb.html
Your code was structured in a way that meant for each csv file, you were not setting the file name until after you attempted to open it. The immediate error you saw was caused because you tried to call make_json() before you defined the values for csvFile and jsonFile.
I would recommend changing the code to:
import csv
import json
import glob
def make_json(csvList):
for csvFile in csvList:
data = {}
with open(csvFile, encoding='utf-8') as csvf:
csvReader = csv.DictReader(csvf)
for rows in csvReader:
key = rows['id']
data[key] = rows
jsonFile =f"{csvFile.split('.')[-2]}.json"
with open(jsonFile, 'w', encoding='utf-8') as jsonf:
jsonf.write(json.dumps(data, indent=4))
make_json(glob.glob('*.csv'))
You should try this
import csv, json, os, glob
os.chdir(r'C:\Users\user\Desktop\test' )
result = glob.glob( '*.csv' )
print(result)
def make_json():
for i in result:
with open(i, encoding='utf-8') as csvf:
data = [row for row in csv.DictReader(csvf)]
with open(f"{i.split('.')[-2]}.json", 'w', encoding='utf-8') as jsonf:
json.dump(data, jsonf)
make_json()
You did not initialize both the arguments of make_json() - (csvFilePath & jsonFilePath)

Copying Specific Rows between CSV Files Python

I have an image folder with images saved with their ids. I would like to search the ids in the rows of a CSV file, then copy the rows into a new CSV file. Code as follows:
import os
import csv
folder = os.listdir("[image folder]")
file_in = "[csv containing info rows]"
file_out = "[new csv to create]"
with open(file_in, 'r', newline='') as f_input, open(file_out, 'w', newline='') as f_output:
writer = csv.writer(f_output)
reader = csv.reader(f_input)
writer.writerow(["ImageID", "LabelName", "XMin", "XMax", "YMin", "YMax", "IsGroupOf"]) #writes header of new csv
for filename in folder:
idx = os.path.splitext(filename)[0]
for row in reader:
if idx == row[0]:
print(row)
writer.writerow(row)
It outputs only the first matching id into the new CSV, instead of the actual number which is a few thousand. Sry for the simple task but I've been stumped for quite a while.
e.g.
Sample CSV:
ImageID,LabelName,XMin,XMax,YMin,YMax,IsGroupOf
0001eeaf4aed83f9,/m/0cmf2,0.022673031,0.9642005,0.07103825,0.80054647,0
00075905539074f2,/m/04yx4,0.020477816,0.32935154,0.0956023,0.665392,0
00075905539074f2,/m/04yx4,0.3208191,0.63993174,0,0.6596558,0
00075905539074f2,/m/04yx4,0.6757679,0.9914676,0.17208412,0.94837475,0
0007cebe1b2ba653,/m/07mhn,0.7359882,0.9262537,0.022123894,0.40265486,0
0007cebe1b2ba653,/m/0bt9lr,0.42035398,0.7935103,0.18141593,0.7212389,0
0007cebe1b2ba653,/m/01g317,0.7345133,0.9321534,0,0.36946902,0
0007d6cf88afaa4a,/m/0bt9lr,0.17342657,0.9020979,0.21678321,0.94172496,0
0008e425fb49a2bf,/m/0bt9lr,0.22610295,0.7150735,0.11170213,0.93439716,0
0009bad4d8539bb4,/m/0cmf2,0.2945508,0.70544916,0.34070796,0.5154867,0
3 sample images in folder: 0001eeaf4aed83f9.jpg, 0007cebe1b2ba653.jpg, 0009bad4d8539bb4.jpg
Expected output CSV:
ImageID,LabelName,XMin,XMax,YMin,YMax,IsGroupOf
0001eeaf4aed83f9,/m/0cmf2,0.022673031,0.9642005,0.07103825,0.80054647,0
0007cebe1b2ba653,/m/07mhn,0.7359882,0.9262537,0.022123894,0.40265486,0
0007cebe1b2ba653,/m/0bt9lr,0.42035398,0.7935103,0.18141593,0.7212389,0
0007cebe1b2ba653,/m/01g317,0.7345133,0.9321534,0,0.36946902,0
0009bad4d8539bb4,/m/0cmf2,0.2945508,0.70544916,0.34070796,0.5154867,0
You could use the following approach. Firstly, use glob.glob() to only get the jpg files. A set can be used to hold all of the filenames that are found (without their extensions).
Now you can just read the sample CSV file in a row at a time and use a simple in check to test if the ImageID is one of the file names in the set.
For example:
import glob
import csv
import os
files = {os.path.splitext(filename)[0] for filename in glob.glob("*.jpg")}
file_in = "sample.csv"
file_out = "output.csv"
with open(file_in, 'r', newline='') as f_input, open(file_out, 'w', newline='') as f_output:
csv_input = csv.reader(f_input)
header = next(csv_input)
csv_output = csv.writer(f_output)
csv_output.writerow(header)
for row in csv_input:
if row[0] in files: # Is ImageID one of the filenames found?
print(row)
csv_output.writerow(row)
This would give you an output.csv file as follows:
ImageID,LabelName,XMin,XMax,YMin,YMax,IsGroupOf
0001eeaf4aed83f9,/m/0cmf2,0.022673031,0.9642005,0.07103825,0.80054647,0
0007cebe1b2ba653,/m/07mhn,0.7359882,0.9262537,0.022123894,0.40265486,0
0007cebe1b2ba653,/m/0bt9lr,0.42035398,0.7935103,0.18141593,0.7212389,0
0007cebe1b2ba653,/m/01g317,0.7345133,0.9321534,0,0.36946902,0
0009bad4d8539bb4,/m/0cmf2,0.2945508,0.70544916,0.34070796,0.5154867,0

Save CSV file using python

I am able to change the data to lowercase and remove all the punctuation but I have trouble saving the corrected data in CSV file.
import csv
import re
import os
input_file=raw_input("Name of the CSV file:")
output_file=raw_input("Output Name:")
reg_test=input_file
result = ''
with open(input_file,'r') as csvfile:
with open(output_file,'w') as csv_out_file:
filereader = csv.reader(csvfile)
filewriter =csv.writer(csv_out_file)
for row in filereader:
row = re.sub('[^A-Za-z0-9]+', '', str(row))
result += row + ','
lower = (result).lower()
csvfile.close()
csv_out_file.close()
You do not have to close the files, this is done automatically after the context of the with statement is over and you have to actually write something after you create the csv.writer, e.g. with writerow:
import csv
import re
input_file = 'in.csv'
output_file = 'out.csv'
with open(input_file, 'r') as csvfile, open(output_file, 'w') as csv_out_file:
filereader = csv.reader(csvfile)
filewriter = csv.writer(csv_out_file)
for row in filereader:
new_row = re.sub('[^A-Za-z0-9]+', '', str(row)) # manipulate the row
filewriter.writerow([new_row.lower()]) # write the new row to the out file
# the files are closed automatically after the context of the with statement is over
This saves the manipulated content of the first csv file to the second.

Python csv register_dialect delimiter is not working

I have the written the code below to read in a large csv file with many variables and then just print 1 variable for every row in the outfile. It is working except that the delimiter is not being picked up.
import csv
fieldnames = ['tag']
outfile = open('ActiveTags.txt', 'w')
csv.register_dialect('me', delimiter=',', quotechar="'", quoting=csv.QUOTE_ALL, lineterminator='')
writer = csv.DictWriter(outfile, fieldnames=fieldnames, dialect='me')
with open('ActiveList_16.csv', 'r', newline='') as f:
reader = csv.DictReader(f)
for row in reader:
Tag = row['Tag']
writer.writerow({'tag': Tag})
outfile.close()
What am I missing here? I do not understand why the delimiter is not working on the outfile.

how to delete entire row in csv file and save changes on same file?

i'm new with python and try to modify csv file so i will able to delete specific rows with specific fields according to given list.
in my current code i get the rows which i want to delete but i can't delete it and save the changes on same file (replace).
import os, sys, glob
import time ,csv
# Open a file
path = 'C:\\Users\\tzahi.k\\Desktop\\netzer\\'
dirs = os.listdir( path )
fileslst = []
alertsCode = ("42001", "42003", "42006","51001" , "51002" ,"61001" ,"61002","71001",
"71002","71003","71004","71005","71006","72001","72002","72003","72004",
"82001","82002","82003","82004","82005","82006","82007","83001","84001")
# This would print the unnesscery codes
for file in dirs:
if "ALERTS" in file.upper() :
fileslst.append(file)
fileslst.sort()
with open(fileslst[-1], 'rb') as csvfile:
csvReader = csv.reader(csvfile)
for row in csvReader:
for alert in alertsCode:
if any(alert in row[2] for s in alertsCode) :
print row
any help?
Read all the rows into a list using a list comprehension and excluding the unwanted rows. Then rewrite the rows to the file in mode w (write mode) which overwrites or replaces the content of the file:
with open(fileslst[-1], 'rb') as csvfile:
csvReader = csv.reader(csvfile)
clean_rows = [row for row in csvReader if not any(alert in row[2] for alert in alertsCode)]
# csvfile.truncate()
with open(fileslst[-1], 'wb') as csvfile:
csv_writer = csv.writer(csvfile)
csv_writer.writerows(clean_rows)

Categories