I created this code to scan my samples_vsdt.txt getting a certain values then writing it in a csv, I'm having an error StopIteration and doesn't even read the text file. I'm trying to solve this for hours, any idea what's causing the problem?
Here is how my code works, Example this line:
Scanning samples_extracted\82e5b144cb5f1c10629e72fc1291f535db7b0b40->(Word 2003 XML Document 1003-1)
Will be written to csv as this:
82e5b144cb5f1c10629e72fc1291f535db7b0b40,Word 2003 XML Document 1003-1
Here is my code, and this is working for all my txt_files but this one sample_vsdt.txt doesn't work properly
import csv,re
out_vsdt = "samples_vsdt.txt"
out_sha1_vsdt = "sha1_vsdt.csv"
def read_text_file(out_vsdt):
with open(out_vsdt) as f:
data = []
for line in f:
if "Scanning " + new in line and "(" in line:
try:
sha = re.search('\\\(.*)->', line).group(1)
desc= re.search('->\((.*)\)', line).group(1)
except AttributeError:
desc = None
sha = None
mix = sha,desc
data.append(mix)
continue
if "Scanning " + new in line:
try:
sha= re.search('\\\(.*)$', line).group(1)
while True:
i = next(f)
if "(" in i:
try:
desc = re.search('->\((.*)\)', i).group(1)
break
except AttributeError:
desc = None
sha = None
mix = sha,desc
data.append(mix)
except AttributeError:
sha = None
return data
def write_csv_file(data,out_sha1_vsdt):
with open(out_sha1_vsdt, 'wb') as csvfile:
csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"')
csvwriter.writerow(['SHA-1','VSDT','DESC'])
for row in data:
csvwriter.writerow(row)
def main():
data = read_text_file(out_vsdt)
write_csv_file(data, out_sha1_vsdt)
if __name__ == '__main__':
main()
print "Parsing Successful"
Gives me error:
Traceback (most recent call last):
File "C:\Users\trendMICRO\Desktop\ojt\scanner\parser.py", line 65, in <module>
main()
File "C:\Users\trendMICRO\Desktop\ojt\scanner\parser.py", line 61, in main
data = read_text_file(out_vsdt)
File "C:\Users\trendMICRO\Desktop\ojt\scanner\parser.py", line 37, in read_text_file
i = next(f)
StopIteration
An alternative approach could be to just use a regular expression to extract whole blocks:
import csv
import re
out_vsdt = "samples_vsdt.txt"
out_sha1_vsdt = "sha1_vsdt.csv"
with open(out_vsdt) as f_input:
vscan32 = f_input.read()
with open(out_sha1_vsdt, 'w', newline='') as f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['SHA-1', 'VSDT', 'DESC'])
for sha, desc, vsdt in re.findall(r'Scanning.*?\\([0-9a-f]+)(.*?)->\((.*?)\)$', vscan32, re.S + re.M):
desc = '|'.join(line.strip() for line in desc.splitlines() if len(line.strip()))
desc = ''.join(filter(lambda x: x in string.printable, desc)) # remove non-printable characters
csv_output.writerow([sha, vsdt, desc])
This uses a multi-line expression that looks for blocks starting with Scanning. Where there are multiple lines, the lines are stripped and joined together using a |. Finally any non-printable characters are removed from the description.
This would give you an output starting something like:
SHA-1,VSDT,DESC
004d44eeecae27314f8bd3825eb82d2f40182b51,WIN32 EXE 7-2,
07eab9ea58d4669febf001d52c5182ecf579c407,WIN32 EXE 7-2,
0d558bb5e0a5b544621af0ffde1940615ac39deb,WIN32 EXE 7-2,
5172c70c1977bbddc2a163f6ede46595109c7835,WIN32 EXE 7-2,- $R0\NsCpuCNMiner32.exe->Found Virus [WORM_CO.331300D2]|- $R0\NsCpuCNMiner64.exe->Found Virus [WORM_CO.331300D2]|- $R0\NsGpuCNMiner.exe->Found Virus [TROJ64_.743CC567]
This assumes you are using Python 3.x
Related
I'm trying to import 5'000 .txt files into a postgresql database. My script is running fine as long as it doesn't reach a line which doesn't fit the format. For example every file has a new line at the end which also causes the script to crash.
I've tried to handle exceptions but to no success...
My script:
import csv
import os
import sys
import psycopg2
conn = psycopg2.connect(
host="localhost",
database="demo",
user="demo",
password="123",
port="5432"
)
cur = conn.cursor()
maxInt = sys.maxsize
while True:
try:
csv.field_size_limit(maxInt)
break
except OverflowError:
maxInt = int(maxInt / 10)
def searchFiles(directory='', extension=''):
print('SEARCHING IN: ', directory)
filelist = []
extension = extension.lower()
for dirpath, dirnames, files in os.walk(directory):
for name in files:
if extension and name.lower().endswith(extension):
filelist.append(os.path.join(dirpath, name))
elif not extension:
print('FAILED TO READ: ', (os.path.join(dirpath, name)))
print('FINISHED FILE SEARCH AND FOUND ', str(len(filelist)), ' FILES')
return filelist
def importData(fileToImport):
with open(fileToImport, 'r') as f:
reader = csv.reader(f, delimiter=':')
for line in reader:
try:
cur.execute("""INSERT INTO demo VALUES (%s, %s)""", (line[0], line[1]))
conn.commit()
except:
pass
print('FAILED AT LINE: ', line)
print(conn.get_dsn_parameters())
cur.execute("SELECT version();")
record = cur.fetchone()
print("You are connected to - ", record)
fileList = searchFiles('output', '.txt')
counter = 0
length = len(fileList)
for file in fileList:
# if counter % 10 == 0:
print('Processing File: ', str(file), ', COMPLETED: ', str(counter), '/', str(length))
importData(str(file))
counter += 1
print('FINISHED IMPORT OF ', str(length), ' FILES')
A few lines of the data I'm trying to import:
example1#example.com:123456
example2#example.com:password!1
The error I'm getting:
File "import.py", line 66, in <module>
importData(str(file))
File "import.py", line 45, in importData
for line in reader:
_csv.Error: line contains NULL byte
How should I handle lines which can not get imported?
Thanks for any help
Your traceback shows the source of the exception in for line in reader:
File "import.py", line 45, in importData
for line in reader:
_csv.Error: line contains NULL byte
and you do not handle exceptions at that point. As the exception suggests, it is raised by your csv reader instance. While you certainly could wrap your for loop in a try-except block, your loop will still end once the exception raises.
This exception may be caused by the file having a different encoding than your locale's, which is assumed by open() if no encoding is explicitly provided:
In text mode, if encoding is not specified the encoding used is
platform dependent: locale.getpreferredencoding(False) is called to
get the current locale encoding.
The accepted answer in this Q&A outlines a solution to deal with that, provided that you can identify the correct encoding to open the file with. The Q&A also shows some approaches on how to get rid of NULL bytes in the file, prior to handing it over to a reader.
You might also want to simply skip empty lines instead of firing them to your DB and handle the exception, e.g.
for line in reader:
if not line:
continue
try:
[...]
I have this sample_vsdt.txt file containing SHA-1 and description like this inside of my txt file:
Scanning samples_extracted\02b809d4edee752d9286677ea30e8a76114aa324->(Microsoft RTF 6008-0)
->Found Virus [Possible_SCRDL]
Scanning samples_extracted\0349e0101d8458b6d05860fbee2b4a6d7fa2038d->(Adobe Portable Document Format(PDF) 6015-0)
->Found Virus [TROJ_FRS.VSN11I18]
Example:
SHA-1: 02b809d4edee752d9286677ea30e8a76114aa324
Description:(Microsoft RTF 6008-0)
Problem:
My task is to list those SHA-1 and Description in my txt file then list it in a csv file, I was able to do that using regex,prefix and delimeter. However this example is what makes it hard for me:
Scanning samples_extracted\0191a23ee122bdb0c69008971e365ec530bf03f5
- Invoice_No_94497.doc->Found Virus [Trojan.4FEC5F36]->(MIME 6010-0)
- Found 1/3 Viruses in samples_extracted\0191a23ee122bdb0c69008971e365ec530bf03f5
It has different line pattern and I only want to get the SHA-1 in the first line not the 4th line and get the description in the second line.
Output:
The output went wrong because the description (MIME 6010-0) was put in the SHA-1 column.
0191a23ee122bdb0c69008971e365ec530bf03f5
(MIME 6010-0)
02b809d4edee752d9286677ea30e8a76114aa324 (Microsoft RTF 6008-0)
0349e0101d8458b6d05860fbee2b4a6d7fa2038d (Adobe Portable Document Format(PDF) 6015-0)
035a7afca8b72cf1c05f6062814836ee31091559 (Adobe Portable Document Format(PDF) 6015-0)
Code
import csv
import re
INPUTFILE = 'samples_vsdt.txt'
OUTPUTFILE = 'output.csv'
PREFIX = '\\'
DELIMITER = '->'
DELIMITER2 = ']->'
PREFIX2 = ' - '
def read_text_file(inputfile):
data = []
with open(inputfile, 'r') as f:
lines = f.readlines()
for line in lines:
line = line.rstrip('\n')
if re.search(r'[a-zA-Z0-9]{40}', line) and not "Found" in line: # <----
line = line.split(PREFIX, 1)[-1]
parts = line.split(DELIMITER)
data.append(parts)
else:
if "->(" in line and "Found" in line :
matched_words=(re.search(r'\(.*?\)',line))
sha =(re.search(r'[a-zA-Z0-9]{40}',line))
if matched_words!=None:
matched_words=matched_words.group()
matched_words=matched_words.split("]->")
data.append(matched_words)
#data.append(parts)
return data
def write_csv_file(data, outputfile):
with open(outputfile, 'wb') as csvfile:
csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"')
for row in data:
csvwriter.writerow(row)
def main():
data = read_text_file(INPUTFILE)
write_csv_file(data, OUTPUTFILE)
if __name__ == '__main__':
main()
Here is the full content of my text file:
sample_vsdt.txt
I changed some logic, maybe I can give you some different ideas.
Basically it checks if the string Scanning samples_extracted is present with (, which means that the description is on the same line of the sha.
Otherwise with only Scanning samples_extracted means that the desc is on the following line ( in your example there are some blank line, I had to add a while cycle )
Prints the result, cherry-pick logic and put in your program.
import re
with open("vCjjGQxe.txt") as f:
for line in f:
if "Scanning samples_extracted" in line and "(" in line:
sha = re.search('\\\(.*)->', line).group(1)
desc = re.search('->\((.*)\)', line).group(1)
print("SHA-1:", sha)
print("Description:", desc)
continue
if "Scanning samples_extracted" in line:
sha = re.search('\\\(.*)$', line).group(1)
while True:
i = next(f)
if "(" in i:
desc = re.search('->\((.*)\)', i).group(1)
break
print("SHA-1:", sha)
print("Description:", desc)
I'm trying to write the result of a function in a csv. Unfortunately, no pandas.
csv file input:
Hello all well?
today is cold!
I have not had lunch yet
He does not have many brothers or sisters.
We are sick
Script:
import re
import csv
import string
with open('teste_csv.csv', 'r') as f:
file = csv.reader(f)
for line in file:
message = ''.join(line)
def toto(message):
message = message.lower()
p = re.compile('|'.join(map(re.escape, string.punctuation)))
no_punct = p.sub(' ', message)
writer = csv.writer(open('result.csv', 'w'))
for row in no_punct:
writer.writerow(row)
return writer
print(toto(message))
At my terminal, I have <_csv.writer object at 0x7fee60e57c50> and in my result.csv I have only one line written 'w'. I would like each line to be in my result.csv
You keep erasing the file since everytime you call toto it opens result.csv for writing, hence you are left only with a single write. You need to open the file once ,and create the wirter once. You also only need to define the function once for that matter:
import re
import csv
import string
def toto(message,writer):
message = message.lower()
p = re.compile('|'.join(map(re.escape, string.punctuation)))
no_punct = p.sub(' ', message)
for row in no_punct:
writer.writerow(row)
with open('teste_csv.csv', 'r') as f:
writer = csv.writer(open('result.csv','w'))
file = csv.reader(f)
for line in file:
message = ''.join(line)
toto(message,writer)
You need to put the writer outside of your first loop. each time you are looping throw it's opening and rewriting the file
another issue you are defining and calling the toto inside the loop so it's getting called with last message value.
import re
import csv
import string
with open('test.csv', 'r') as f:
file = csv.reader(f)
writer = csv.writer(open('result.csv', 'w'))
def toto(message):
message = message.lower()
p = re.compile('|'.join(map(re.escape, string.punctuation)))
no_punct = p.sub(' ', message)
for row in no_punct:
writer.writerow(row)
return writer
for line in file:
print line
message=''.join(line)
print(toto(message))
I have been trying for last few hours to narrow down an issue and I cannot see it. I'm new to Python 3 and trying parse a text file for a project.
The parsing simply cleans up some whitespace and replaces delimitters.
I don't understand why it won't work.
More specifically
I am getting this particular error:
"NameError: name 'out' is not defined"
Code:
save_path = 'C:/UsersDesktop/CSVproject'
with open('C:/Users/CSVproject/sourceData.dat', 'r') as f:
for line in f:
if ':DUBLIN' in line:
line = line.replace(' ', '')
line = line.replace(':', ';')
print(line)
found = True
fullNameOfFile = os.path.join(save_path, 'newFormattedData'+".csv")
out = open(fullNameOfFile, 'w')
for line in f:
out.write(line)
You are attempting to open the file each time the word :DUBLIN occurs. you only need to open it once and you should open it at a place where the scope ensures that the handle is visible to the write method.
fullNameOfFile = os.path.join(save_path, 'newFormattedData'+".csv")
out = open(fullNameOfFile, 'w')
for line in f:
if ':DUBLIN' in line:
line = line.replace(' ', '')
line = line.replace(':', ';')
print(line)
found = True
out.write(line)
And you definitely don't want to have a nested loop for iterating through the input file.
I'm trying to create a new version of a file that excludes NULL bytes. I'm using the code below to attempt this however it's still breaking on the NULL byte. How should I structure the for statement and try-catch block to keep going after the exception?
import csv
input_file = "/data/train.txt"
outFileName = "/data/train_no_null.txt"
############################
i_f = open( input_file, 'r' )
reader = csv.reader( i_f , delimiter = '|' )
outFile = open(outFileName, 'wb')
mywriter = csv.writer(outFile, delimiter = '|')
i_f.seek( 0 )
i = 1
for line in reader:
try:
i += 1
mywriter.writerow(line)
except csv.Error:
print('csv choked on line %s' % (i + 1))
pass
EDIT:
Here's the error message:
Traceback (most recent call last):
File "20150310_rewrite_csv_wo_NULL.py", line 26, in <module>
for line in reader:
_csv.Error: line contains NULL byte
UPDATE:
I'm using this code:
i_f = open( input_file, 'r' )
reader = csv.reader( i_f , delimiter = '|' )
# reader.next()
outFile = open(outFileName, 'wb')
mywriter = csv.writer(outFile, delimiter = '|')
i_f.seek( 0 )
i = 1
for idx, line in enumerate(reader):
try:
mywriter.writerow(line)
except:
print('csv choked on line %s' % idx)
and now get this error:
Traceback (most recent call last):
File "20150310_rewrite_csv_wo_NULL.py", line 26, in <module>
for idx, line in enumerate(reader):
_csv.Error: line contains NULL byte
You can catch all errors with the following code...
for idx, line in enumerate(reader):
try:
mywriter.writerow(line)
except:
print('csv choked on line %s' % idx)
The exception is being thrown from the reader, which is not being caught as it is outside of the try/catch.
But even if it was, the reader won't want to continue after its encounter with the NUL byte. But if the reader never saw it, along the lines of...
for idx, line in enumerate(csv.reader((line.replace('\0','') for line in open('myfile.csv')), delimiter='|')):
you might be OK.
Really though, you should find out where the NUL bytes are coming from as they might be symptomatic of a wider problem with your data.