Having problems searching with strings from csv module in Python - python

I have a csv file partList.csv with strings that I want to use to search through a larger group of txt files. For some reason when I use the direct string 'L 99' I get a result. When I load the string L 99 from the csv I get no result.
partList.csv only contains cells in the first column with part numbers, one of which is L-99. txt_files_sample\5.txt is a text document that at some point contains the string L 99
My code:
def GetPartList():
partList = []
f = open('partList.csv', 'rb')
try:
reader = csv.reader(f)
for row in reader:
part = row[0].replace('-',' ').strip()
partList.append(part)
finally:
f.close()
return partList
def FindFileNames(partList):
i = 0
files = []
for root, dirs, filenames in os.walk('txt_files_sample'):
for f in filenames:
document = open(os.path.join(root, f), 'rb')
for line in document:
if partList[i] in line:
#if 'L 99' in line:
files.append(f)
break
i = i + 1
return files
print FindFileNames(GetPartList())
The code, as it stands above produces:
>>> []
If I uncomment if 'L 99' in line: and comment out if partList[i] in line: I get the result:
>>> ['5.txt']

So using Martijn's input, I discovered the issue was how I looped over partList. Rewritting FindFileNames() worked:
def FindFileList(partList):
i = 0
files = []
for root, dirs, filenames in os.walk('txt_files'):
for f in filenames:
a = 0
document = open(os.path.join(root, f), 'rb')
for line in document:
if a is 1:
break
for partNo in partList:
if partNo in line:
files.append(f)
a = 1
document.close()
return files
With the updated code I got a result that was an accurate list of filenames.

Related

Write to file line by line Python

Here, I want to write the word_count in each loop line by line to the file. However, they are written all back to back.
import os
import string
def remove_punctuation(value):
result = ""
for c in value:
# If char is not punctuation, add it to the result.
if c not in string.punctuation and c != '،' and c != '؟' and c ! = '؛' and c != '«' and c != '»':
result += c
return result
def all_words(file_path):
with open(file_path, 'r', encoding = "utf-8") as f:
p = f.read()
p = remove_punctuation(p)
words = p.split()
word_count = len(words)
return str(word_count)
myfile = open('D:/t.txt', 'w')
for root, dirs, files in os.walk("C:/ZebRa", topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
f = all_words(file_path)
myfile.write(f)
break
myfile.close()
I have also tried to add newline, but instead, it writes nothing.
myfile.write(f'\n')
Change this line:
return str(word_count)
to
return str(word_count) + '\n'
If you're using python 3.6+, you could also try:
return f'{word_count}\n'
You can write a newline character at the end of each iteration:
for root, dirs, files in os.walk("C:/ZebRa", topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
f = all_words(file_path)
myfile.write(f)
break
myfile.write('\n')
When you us file.write() try using this instead:
myfile.write(f+"\n")
This will add a new line after every iteration
For your code to work, however, you need to iterate in a for loop, like this:
for string in f:
file.write(string+"\n")
I hope this helps

Writting several files according to an element of an original file

I need to read a file in bed format that contains coordinates of all chr in a genome, into different files according with the chr. I tried this approach but it doesn't work, it doesn't create any files. Any idees why this happens or alternative approaches to solve this problem?
import sys
def make_out_file(dir_path, chr_name, extension):
file_name = dir_path + "/" + chr_name + extension
out_file = open(file_name, "w")
out_file.close()
return file_name
def append_output_file(line, out_file):
with open(out_file, "a") as f:
f.write(line)
f.close()
in_name = sys.argv[1]
dir_path = sys.argv[2]
with open(in_name, "r") as in_file:
file_content = in_file.readlines()
chr_dict = {}
out_file_dict = {}
line_count = 0
for line in file_content[:0]:
line_count += 1
elems = line.split("\t")
chr_name = elems[0]
chr_dict[chr_name] += 1
if chr_dict.get(chr_name) = 1:
out_file = make_out_file(dir_path, chr_name, ".bed")
out_file_dict[chr_name] = out_file
append_output_file(line, out_file)
elif chr_dict.get(chr_name) > 1:
out_file = out_file_dict.get(chr_name)
append_output_file(line, out_file)
else:
print "There's been an Error"
in_file.close()
This line:
for line in file_content[:0]:
says to iterate over an empty list. The empty list comes from the slice [:0] which says to slice from the beginning of the list to just before the first element. Here's a demonstration:
>>> l = ['line 1\n', 'line 2\n', 'line 3\n']
>>> l[:0]
[]
>>> l[:1]
['line 1\n']
Because the list is empty no iteration takes place, so the code in the body of your for loop in not executed.
To iterate over each line of the file you do not need the slice:
for line in file_content:
However, it is better again to iterate over the file object as this does not require that the whole file be first read into memory:
with open(in_name, "r") as in_file:
chr_dict = {}
out_file_dict = {}
line_count = 0
for line in in_file:
...
Following that there are numerous problems, including syntax errors, with the code in the for loop which you can begin debugging.

How to split text file by id in python

I have a bunch of text files containing tab separated tables. The second column contains an id number, and each file is already sorted by that id number. I want to separate each file into multiple files by the id number in column 2. Here's what I have.
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
with open(readpath+filename, 'r') as fh:
lines = fh.readlines()
lastid = 0
f = open(writepath+'checkme.txt', 'w')
f.write(filename)
for line in lines:
thisid = line.split("\t")[1]
if int(thisid) <> lastid:
f.close()
f = open(writepath+thisid+'-'+filename,'w')
lastid = int(thisid)
f.write(line)
f.close()
What I get is simply a copy of all the read files with the first id number from each file in front of the new filenames. It is as if
thisid = line.split("\t")[1]
is only done once in the loop. Any clue to what is going on?
EDIT
The problem was my files used \r rather than \r\n to terminate lines. Corrected code (simply adding 'rU' when opening the read file and swapping != for <>):
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
with open(readpath+filename, 'rU') as fh:
lines = fh.readlines()
lastid = 0
f = open(writepath+'checkme.txt', 'w')
f.write(filename)
for line in lines:
thisid = line.split("\t")[1]
if int(thisid) != lastid:
f.close()
f = open(writepath+thisid+'-'+filename,'w')
lastid = int(thisid)
f.write(line)
f.close()
If you're dealing with tab delimited files, then you can use the csv module, and take advantage of the fact that itertools.groupby will do the previous/current tracking of the id for you. Also utilise os.path.join to make sure your filenames end up joining correctly.
Untested:
import os
import csv
from itertools import groupby
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
with open(os.path.join(readpath, filename)) as fin:
tabin = csv.reader(fin, delimiter='\t')
for file_id, rows in groupby(tabin, lambda L: L[1]):
with open(os.path.join(writepath, file_id + '-' + filename), 'w') as fout:
tabout = csv.writer(fout, delimiter='\t')
tabout.writerows(rows)

Lists to dictionary but with multiple values for each key

Im gathering the following data filenames, usernames and passwords. This data is being gathered by traversing each dir looking for documents(scripts mostly) with credentials in clear text. The idea is to gather evidence of bad practices being followed by system admins.
The my script does this well enough, however I am trying to understand the best way to handle the data. Id like to place the filename and credentials found in that particular file into a dictionary. So the key being the filename and the values being the credentials found in that file.
Ive worked out how to add data to dictionaries but im not entirely sure how to and cant find a way to get 2 lists into a dictionary and the dictionary hosting multiple values for 1 key. Any pointers would be appreciated. The line with #if not m: add non matched data to un_matched list is currently not used, as suggested by the comment. Id like to add non matched data to another list (for debugging)
Code
dirt = "~/Desktop/tmp"
def get_files():
regs = ["(.*)((U|u)ser(.*))(\s=\s\W\w+\W)", "(.*)((U|u)ser(.*))(\s=\s\w+)", "(.*)((P|p)ass(.*))\s=\s(\W(.*)\W)", "(.*)((P|p)ass(.*))(\s=\s\W\w+\W)"]
combined = "(" + ")|(".join(regs) + ")"
cred_results = []
creds = []
un_matched = []
filesfound = []
for root, dirs, files in os.walk(dirt):
for filename in files:
if filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
readfile = open(os.path.join(root, filename), "r")
for line in readfile:
m = re.match(combined, line)
if m:
creds.append(m.group(0))
#if not m: add non matched data to un_matched list
filesfound.append(os.path.join(root, filename))
cred_results = [line.rstrip() for line in creds]
print cred_results
print filesfound
Current Ouput from script
['strUser = "guytom"', 'strPassword = "P#ssw0rd1"', 'strUsername = "guytom2"', 'strPass = "SECRETPASSWORD"']
['~/Desktop/tmp/Domain/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript1.vbs', '~/Desktop/tmp/Domain/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript2.bat']
You can use a dict with dict.setdefault:
d = {} # create dict
for root, dirs, files in os.walk(dirt):
for filename in files:
if filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
readfile = open(os.path.join(root, filename), "r")
d.setdefault(filename,[]) # set default value to a list
for line in readfile:
m = re.match(combined, line)
if m:
creds.append(m.group(0))
d[filename].append(m.group(0).rstrip()) # append data to the key's list stripping newlines etc..
If you want to keep track of the unmatched data just add a second dict and using with which will close your files automatically:
for root, dirs, files in os.walk(dirt):
for filename in files:
if filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
with open(os.path.join(root, filename), "r") as readfile:
matched_d.setdefault(filename,[])
unmatched_d.setdefault(filename,[])
for line in readfile:
m = re.match(combined, line)
if m:
creds.append(m.group(0))
d[filename].append(m.group(0).rstrip())
else:
unmatched_d[filename].append(add_data_here)

Python: Issue when trying to read and write multiple files

This script reads and writes all the individual html files in a directory. The script reiterates, highlight and write the output.The issue is, after highlighting the last instance of the search item, the script removes all the remaining contents after the last search instance in the output of each file. Any help here is appreciated.
import os
import sys
import re
source = raw_input("Enter the source files path:")
listfiles = os.listdir(source)
for f in listfiles:
filepath = os.path.join(source+'\\'+f)
infile = open(filepath, 'r+')
source_content = infile.read()
color = ('red')
regex = re.compile(r"(\b in \b)|(\b be \b)|(\b by \b)|(\b user \b)|(\bmay\b)|(\bmight\b)|(\bwill\b)|(\b's\b)|(\bdon't\b)|(\bdoesn't\b)|(\bwon't\b)|(\bsupport\b)|(\bcan't\b)|(\bkill\b)|(\betc\b)|(\b NA \b)|(\bfollow\b)|(\bhang\b)|(\bbelow\b)", re.I)
i = 0; output = ""
for m in regex.finditer(source_content):
output += "".join([source_content[i:m.start()],
"<strong><span style='color:%s'>" % color[0:],
source_content[m.start():m.end()],
"</span></strong>"])
i = m.end()
outfile = open(filepath, 'w')
outfile.seek(0, 2)
outfile.write(output)
print "\nProcess Completed!\n"
infile.close()
outfile.close()
raw_input()
After your for loop is over, you need to include whatever is left after the last match:
...
i = m.end()
output += source_content[i:]) # Here's the end of your file
outfile = open(filepath, 'w')
...

Categories