Python: continue for loop after exception

I'm trying to create a new version of a file that excludes NULL bytes. I'm using the code below to attempt this, but it still breaks on the NULL byte. How should I structure the for statement and try/except block to keep going after the exception?
import csv

input_file = "/data/train.txt"
outFileName = "/data/train_no_null.txt"
############################
i_f = open(input_file, 'r')
reader = csv.reader(i_f, delimiter='|')

outFile = open(outFileName, 'wb')
mywriter = csv.writer(outFile, delimiter='|')

i_f.seek(0)
i = 1
for line in reader:
    try:
        i += 1
        mywriter.writerow(line)
    except csv.Error:
        print('csv choked on line %s' % (i + 1))
        pass
EDIT:
Here's the error message:
Traceback (most recent call last):
  File "20150310_rewrite_csv_wo_NULL.py", line 26, in <module>
    for line in reader:
_csv.Error: line contains NULL byte
UPDATE:
I'm using this code:
i_f = open(input_file, 'r')
reader = csv.reader(i_f, delimiter='|')
# reader.next()

outFile = open(outFileName, 'wb')
mywriter = csv.writer(outFile, delimiter='|')

i_f.seek(0)
i = 1
for idx, line in enumerate(reader):
    try:
        mywriter.writerow(line)
    except:
        print('csv choked on line %s' % idx)
and now get this error:
Traceback (most recent call last):
  File "20150310_rewrite_csv_wo_NULL.py", line 26, in <module>
    for idx, line in enumerate(reader):
_csv.Error: line contains NULL byte

You can catch all errors with the following code...
for idx, line in enumerate(reader):
    try:
        mywriter.writerow(line)
    except:
        print('csv choked on line %s' % idx)

The exception is being thrown by the reader, which sits outside of the try/except, so it is never caught.
But even if it were, the reader won't continue after its encounter with the NUL byte. If the reader never sees the NUL bytes, though, along the lines of...
for idx, line in enumerate(csv.reader((line.replace('\0','') for line in open('myfile.csv')), delimiter='|')):
you might be OK.
Really though, you should find out where the NUL bytes are coming from as they might be symptomatic of a wider problem with your data.
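Putting that together, a minimal sketch of the rewrite (assuming Python 3, hence text-mode files and newline='' for the writer; the generator strips NUL bytes before the reader ever sees them):

import csv

input_file = "/data/train.txt"
outFileName = "/data/train_no_null.txt"

with open(input_file, 'r') as i_f, open(outFileName, 'w', newline='') as outFile:
    # the generator removes NUL bytes before csv.reader parses each line
    filtered = (line.replace('\0', '') for line in i_f)
    reader = csv.reader(filtered, delimiter='|')
    mywriter = csv.writer(outFile, delimiter='|')
    for row in reader:
        mywriter.writerow(row)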

Related

MemoryError when looping through a huge file with Python 3.8

I need to loop through a huge CSV file (~120 GB) and add two new columns.
FILE_NAME = "C:/Temp/large_file.csv"
ENCODING = "utf-8-sig"
IUD_COLUMN_NAME = "stg_stage_load_type"
BATCH_ID_COLUMN_NAME = 'batch_id'
# BUFFER = 20_000_000
BUFFER = 1

print("Opening file")
local_file = open(FILE_NAME, 'rt', encoding=ENCODING, buffering=BUFFER)
hsep = ";"
local_output_file = "C:/Temp/local_test.csv"
iud_val = "I"
batch_id_val = "1607327136305"

print("Open Output file")
with open(local_output_file, 'w', newline='', encoding=ENCODING, buffering=BUFFER) as write_obj:
    print("Starting loop")
    for row in local_file:
        # Append the batch_id text in the row / list
        write_obj.write(row.rstrip('\n').rstrip('\r') + hsep + iud_val + hsep + batch_id_val + '\n')
local_file.close()
My code fails with a MemoryError.
I've tried different values for BUFFER, but it doesn't seem to help.
This was originally encountered on AWS Fargate, running an Ubuntu-based image with 30 GB of memory. The same error reproduced on a Windows laptop with 16 GB of memory, running Python 3.8.7.
Full stack trace:
Traceback (most recent call last):
  File "loop_test.py", line 27, in <module>
    write_obj.write(row.rstrip('\n').rstrip('\r') + hsep + iud_val + hsep + batch_id_val + '\n')
  File "C:\Users\k64092633\AppData\Local\Programs\Python\Python38\lib\encodings\utf_8_sig.py", line 37, in encode
    return codecs.utf_8_encode(input, self.errors)[0]
MemoryError
The file has 221,697,553 lines, which are around 571 bytes long on average.
The error does not appear immediately, but only after processing for a while, so I think some very long lines might be causing it. This instrumented version of the loop
min_len = float('inf')   # counter initialisations assumed from earlier in the script
max_len = 0
num_rows = 0
with open(local_output_file, 'w', newline='', encoding=ENCODING, buffering=BUFFER) as write_obj:
    print("Starting loop")
    for row in local_file:
        row_len = len(row)
        if row_len < min_len:
            min_len = row_len
        if row_len > max_len:
            max_len = row_len
        # print("############################")
        # print(row)
        num_rows = num_rows + 1
        # Append the batch_id text in the row / list
        if row_len > 1073745760:
            print(num_rows)
        write_obj.write(row)
seemed to indicate this.
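If a single line really is gigabytes long, iterating line by line still has to materialise the whole line (and its decoded copy) in memory. A minimal sketch, not from the original thread, that finds the longest line without ever holding a full line in memory, by reading fixed-size binary chunks:

CHUNK_SIZE = 64 * 1024 * 1024       # 64 MB per read; an assumed value, tune as needed

longest = current = 0
with open(FILE_NAME, 'rb') as src:  # binary mode avoids the decoder entirely
    while True:
        chunk = src.read(CHUNK_SIZE)
        if not chunk:
            break
        parts = chunk.split(b'\n')
        current += len(parts[0])    # continues the line left open by the previous chunk
        for part in parts[1:]:      # every further part starts after a newline
            longest = max(longest, current)
            current = len(part)
longest = max(longest, current)     # account for a final line without a trailing newline
print("longest line: %d bytes" % longest)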

Getting Error codec can't encode characters in position 8-13: character maps to <undefined>

I get this error
Traceback (most recent call last):
  File "C:\Users\Anthony\PycharmProjects\ReadFile\main.py", line 14, in <module>
    masterFile.write("Line {}: {}\n".format(index, line.strip()))
  File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 8-13: character maps to <undefined>
The program is supposed to search all the txt files in a directory for a specific word. When it finds the word, it should print the matching lines to a file, and also write another copy of each file with line numbers. There will be around 100 txt files, and it works on the first 3 before I get this error message. All the files are UTF-8 encoded. I tried changing to
with open(file, encoding="utf-8") as f: but it didn't work.
import glob

searchWord = "Hello"
dataFile = open("C:/Users/Anthony/Documents/TextDataFolder/TextData.txt", 'w')
masterFile = open("C:/Users/Anthony/Documents/TextDataFolder/masterFile.txt", 'w')
files = glob.iglob("C:/Users/Anthony/Documents/Texts/*.txt", recursive=True)

for file in files:
    with open(file) as f:
        print(file)
        for index, line in enumerate(f):
            # print("Line {}: {}".format(index, line.strip()))
            masterFile.write("Line {}: {}\n".format(index, line.strip()))
            if searchWord in line:
                print("Line {}: {}".format(index, line.strip()))
                dataFile.write("Line {}: {}\n".format(index, line.strip()))
I eventually figured it out... I feel like an idiot. The problem wasn't my reading of the files; it was that my writes weren't encoded. I had only applied the encoding to my reads. So the final version looks like this:
import glob

searchWord = "Hello"
dataFile = open("C:/Users/Anthony/Documents/TextDataFolder/TextData.txt", 'w', encoding="utf-8")
masterFile = open("C:/Users/Anthony/Documents/TextDataFolder/masterFile.txt", 'w', encoding="utf-8")
files = glob.iglob("C:/Users/Anthony/Documents/Texts/*.txt", recursive=True)

for file in files:
    with open(file, "r", encoding="utf-8") as f:
        print(file)
        for index, line in enumerate(f):
            # print("Line {}: {}".format(index, line.strip()))
            masterFile.write("Line {}: {}\n".format(index, line.strip()))
            if searchWord in line:
                print("Line {}: {}".format(index, line.strip()))
                dataFile.write("Line {}: {}\n".format(index, line.strip()))
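One further polish, not part of the original answer: opening the output files with a context manager guarantees they are flushed and closed even if a read fails partway through. A minimal sketch reusing the same paths:

import glob

searchWord = "Hello"
# context managers close both output files even if an input file raises
with open("C:/Users/Anthony/Documents/TextDataFolder/TextData.txt", 'w', encoding="utf-8") as dataFile, \
     open("C:/Users/Anthony/Documents/TextDataFolder/masterFile.txt", 'w', encoding="utf-8") as masterFile:
    for file in glob.iglob("C:/Users/Anthony/Documents/Texts/*.txt", recursive=True):
        with open(file, "r", encoding="utf-8") as f:
            for index, line in enumerate(f):
                masterFile.write("Line {}: {}\n".format(index, line.strip()))
                if searchWord in line:
                    dataFile.write("Line {}: {}\n".format(index, line.strip()))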

Skip line in .txt import to postgresql

I'm trying to import 5,000 .txt files into a PostgreSQL database. My script runs fine until it reaches a line that doesn't fit the expected format; for example, every file has a blank line at the end, which also causes the script to crash.
I've tried to handle the exceptions, but with no success...
My script:
import csv
import os
import sys

import psycopg2

conn = psycopg2.connect(
    host="localhost",
    database="demo",
    user="demo",
    password="123",
    port="5432"
)
cur = conn.cursor()

maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        maxInt = int(maxInt / 10)

def searchFiles(directory='', extension=''):
    print('SEARCHING IN: ', directory)
    filelist = []
    extension = extension.lower()
    for dirpath, dirnames, files in os.walk(directory):
        for name in files:
            if extension and name.lower().endswith(extension):
                filelist.append(os.path.join(dirpath, name))
            elif not extension:
                print('FAILED TO READ: ', (os.path.join(dirpath, name)))
    print('FINISHED FILE SEARCH AND FOUND ', str(len(filelist)), ' FILES')
    return filelist

def importData(fileToImport):
    with open(fileToImport, 'r') as f:
        reader = csv.reader(f, delimiter=':')
        for line in reader:
            try:
                cur.execute("""INSERT INTO demo VALUES (%s, %s)""", (line[0], line[1]))
                conn.commit()
            except:
                pass
                print('FAILED AT LINE: ', line)

print(conn.get_dsn_parameters())
cur.execute("SELECT version();")
record = cur.fetchone()
print("You are connected to - ", record)

fileList = searchFiles('output', '.txt')
counter = 0
length = len(fileList)
for file in fileList:
    # if counter % 10 == 0:
    print('Processing File: ', str(file), ', COMPLETED: ', str(counter), '/', str(length))
    importData(str(file))
    counter += 1
print('FINISHED IMPORT OF ', str(length), ' FILES')
A few lines of the data I'm trying to import:
example1#example.com:123456
example2#example.com:password!1
The error I'm getting:
File "import.py", line 66, in <module>
importData(str(file))
File "import.py", line 45, in importData
for line in reader:
_csv.Error: line contains NULL byte
How should I handle lines which cannot be imported?
Thanks for any help.
Your traceback shows that the exception originates in for line in reader:
  File "import.py", line 45, in importData
    for line in reader:
_csv.Error: line contains NULL byte
and you do not handle exceptions at that point. As the message says, it is raised by your csv reader instance. While you certainly could wrap your for loop in a try/except block, the loop will still end once the exception is raised.
This exception may be caused by the file having a different encoding than your locale's, which is assumed by open() if no encoding is explicitly provided:
In text mode, if encoding is not specified the encoding used is platform dependent: locale.getpreferredencoding(False) is called to get the current locale encoding.
The accepted answer in this Q&A outlines a solution to deal with that, provided that you can identify the correct encoding to open the file with. The Q&A also shows some approaches on how to get rid of NULL bytes in the file, prior to handing it over to a reader.
You might also want to simply skip empty lines instead of sending them to your DB and handling the exception, e.g.
for line in reader:
    if not line:
        continue
    try:
        [...]
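Combining the two ideas, a minimal sketch of importData (assuming the same demo table and ':' delimiter as in the question; the NUL-stripping generator mirrors the replace('\0', '') approach referenced above):

def importData(fileToImport):
    with open(fileToImport, 'r') as f:
        # remove NUL bytes before the csv reader parses each line
        filtered = (raw.replace('\0', '') for raw in f)
        reader = csv.reader(filtered, delimiter=':')
        for line in reader:
            if not line:                 # skip blank lines instead of inserting them
                continue
            try:
                cur.execute("INSERT INTO demo VALUES (%s, %s)", (line[0], line[1]))
                conn.commit()
            except (IndexError, psycopg2.Error) as exc:
                conn.rollback()          # keep the connection usable after a failed INSERT
                print('FAILED AT LINE: ', line, exc)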

StopIteration error before reading the text file using next()

I created this code to scan my samples_vsdt.txt, extract certain values, and write them to a CSV. I'm getting a StopIteration error, and it doesn't even read the text file. I've been trying to solve this for hours; any idea what's causing the problem?
Here is how my code works. For example, this line:
Scanning samples_extracted\82e5b144cb5f1c10629e72fc1291f535db7b0b40->(Word 2003 XML Document 1003-1)
Will be written to csv as this:
82e5b144cb5f1c10629e72fc1291f535db7b0b40,Word 2003 XML Document 1003-1
Here is my code. It works for all my txt files, but this one, samples_vsdt.txt, doesn't work properly:
import csv, re

out_vsdt = "samples_vsdt.txt"
out_sha1_vsdt = "sha1_vsdt.csv"

def read_text_file(out_vsdt):
    with open(out_vsdt) as f:
        data = []
        for line in f:
            if "Scanning " + new in line and "(" in line:
                try:
                    sha = re.search('\\\(.*)->', line).group(1)
                    desc = re.search('->\((.*)\)', line).group(1)
                except AttributeError:
                    desc = None
                    sha = None
                mix = sha, desc
                data.append(mix)
                continue
            if "Scanning " + new in line:
                try:
                    sha = re.search('\\\(.*)$', line).group(1)
                    while True:
                        i = next(f)
                        if "(" in i:
                            try:
                                desc = re.search('->\((.*)\)', i).group(1)
                                break
                            except AttributeError:
                                desc = None
                                sha = None
                    mix = sha, desc
                    data.append(mix)
                except AttributeError:
                    sha = None
    return data

def write_csv_file(data, out_sha1_vsdt):
    with open(out_sha1_vsdt, 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"')
        csvwriter.writerow(['SHA-1', 'VSDT', 'DESC'])
        for row in data:
            csvwriter.writerow(row)

def main():
    data = read_text_file(out_vsdt)
    write_csv_file(data, out_sha1_vsdt)

if __name__ == '__main__':
    main()
    print "Parsing Successful"
This gives me the error:
Traceback (most recent call last):
  File "C:\Users\trendMICRO\Desktop\ojt\scanner\parser.py", line 65, in <module>
    main()
  File "C:\Users\trendMICRO\Desktop\ojt\scanner\parser.py", line 61, in main
    data = read_text_file(out_vsdt)
  File "C:\Users\trendMICRO\Desktop\ojt\scanner\parser.py", line 37, in read_text_file
    i = next(f)
StopIteration
The StopIteration is raised because next(f) is called after the file has been exhausted, i.e. the inner loop runs past the last line without finding a "(". An alternative approach could be to just use a regular expression to extract whole blocks:
import csv
import re
import string   # needed for string.printable below

out_vsdt = "samples_vsdt.txt"
out_sha1_vsdt = "sha1_vsdt.csv"

with open(out_vsdt) as f_input:
    vscan32 = f_input.read()

with open(out_sha1_vsdt, 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['SHA-1', 'VSDT', 'DESC'])
    for sha, desc, vsdt in re.findall(r'Scanning.*?\\([0-9a-f]+)(.*?)->\((.*?)\)$', vscan32, re.S + re.M):
        desc = '|'.join(line.strip() for line in desc.splitlines() if len(line.strip()))
        desc = ''.join(filter(lambda x: x in string.printable, desc))  # remove non-printable characters
        csv_output.writerow([sha, vsdt, desc])
This uses a multi-line expression that looks for blocks starting with Scanning. Where there are multiple lines, the lines are stripped and joined together using a |. Finally any non-printable characters are removed from the description.
This would give you an output starting something like:
SHA-1,VSDT,DESC
004d44eeecae27314f8bd3825eb82d2f40182b51,WIN32 EXE 7-2,
07eab9ea58d4669febf001d52c5182ecf579c407,WIN32 EXE 7-2,
0d558bb5e0a5b544621af0ffde1940615ac39deb,WIN32 EXE 7-2,
5172c70c1977bbddc2a163f6ede46595109c7835,WIN32 EXE 7-2,- $R0\NsCpuCNMiner32.exe->Found Virus [WORM_CO.331300D2]|- $R0\NsCpuCNMiner64.exe->Found Virus [WORM_CO.331300D2]|- $R0\NsGpuCNMiner.exe->Found Virus [TROJ64_.743CC567]
This assumes you are using Python 3.x
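If you'd rather keep the original line-by-line approach, a hedged alternative (my suggestion, not from this answer) is to give next() a default so the inner loop of read_text_file can never raise StopIteration:

while True:
    i = next(f, '')   # returns '' at end of file instead of raising StopIteration
    if not i:         # EOF reached without finding '('
        break
    if "(" in i:
        try:
            desc = re.search(r'->\((.*)\)', i).group(1)
        except AttributeError:
            desc = None
        break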

"_csv.Error: line contains NULL byte" in CSV reader from STDIN

There are many StackOverflow questions about this error when reading from a CSV file. My problem occurs while reading from STDIN.
Most SO solutions talk about tweaking the open() call, which works when opening CSV files, not when reading them through STDIN. My problem is with reading through STDIN, so please don't mark this as a duplicate.
My python code is:
import sys, csv

def main(argv):
    reader = csv.reader(sys.stdin, delimiter=',')
    for line in reader:
        print line
and the returned error is:
Traceback (most recent call last):
  File "mapper.py", line 19, in <module>
    main(sys.argv)
  File "mapper.py", line 4, in main
    for line in reader:
_csv.Error: line contains NULL byte
It would suffice to simply ignore the line where the NULL byte occurs (if that is possible) in the for loop.
I solved it by handling the CSV exception:
import sys, csv

def main(argv):
    reader = csv.reader(sys.stdin, delimiter=',')
    lineCount = 0
    errorCount = 0
    while True:
        # keep iterating until an exception is raised for the end of the reader (an iterator)
        try:
            lineCount += 1
            line = next(reader)
            print "%d - %s" % (lineCount, line)
        except csv.Error:
            # raised when a malformed CSV line is encountered... ignore it and continue
            errorCount += 1
            continue
        except StopIteration:
            # raised when next() reaches the end of the iterator
            lineCount -= 1
            break
    print "total line: %d" % lineCount
    print "total error: %d" % errorCount
