def isexact(pat):
    for c in pat.upper():
        if c not in 'ATGC':
            return 0
    return 1

def print_matches(ofh, enz, matches):
    if matches:
        print >>ofh, "Enzyme %s matches at:" % enz,
        for m in matches:
            print >>ofh, m,
        print >>ofh
    else:
        print >>ofh, "No match found for enzyme %s." % enz

def get_site_only(pat):
    newpat = ""
    for c in pat:
        if c.isalpha():
            newpat += c
    return newpat

def findpos(seq, pat):
    matches = []
    current_match = seq.find(pat)
    while current_match != -1:
        matches.append(current_match)
        current_match = seq.find(pat, current_match + 1)
    return matches

seq = ""
ifh = open("C:\\Python27\\link_cutzymes.txt", 'r')
ofh = open("C:\\Python27\\re-en-output.txt", "w")
line = ifh.readline()
while line:
    fields = line.split()
    name = fields[0]
    pat = get_site_only(fields[2])
    if isexact(pat):
        print_matches(ofh, name, findpos(seq, pat))
        line = ifh.readline()
    else:
        line = ifh.readline()
ofh.close()
ifh.close()
It is showing a list index error. Can anyone help me?
Traceback (most recent call last):
  File "C:/Users/ram/Desktop/rest_enz7.py", line 55, in <module>
    name = fields[0]
IndexError: list index out of range
name = fields[0] - you are probably reading an empty line, splitting it, and then accessing index 0, which is out of range for the resulting empty list.
You can make sure your file contains only lines in your expected format, check for empty lines in the code, or use try and except, to name a few options.
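For example, a minimal sketch of the "check for empty lines" option, using the same reading loop as in the question:

line = ifh.readline()
while line:
    fields = line.split()
    if len(fields) < 3:  # blank or malformed line: nothing to index, skip it
        line = ifh.readline()
        continue
    name = fields[0]
    pat = get_site_only(fields[2])
    if isexact(pat):
        print_matches(ofh, name, findpos(seq, pat))
    line = ifh.readline()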
While reading data from the file, if a line has nothing to split, split() returns an empty list, so name = fields[0] raises the error. In that case, use try and except in your code.
You can rewrite the code as:
try:
    fields = line.split()
    name = fields[0]
except IndexError:
    pass  # skip lines that do not have the expected fields
What fields[x] does is get the element at index x of the list. This means that if there is no element at that position, you get an error.
So if name = fields[0] returns an error, then fields must be an empty list (it would look like this: []), because there is no first element (Python counts from zero, so index 0 is the first element, index 1 is the second, and so on). You can fix this with a try: and except: like so:
try:
    name = fields[0]
except:
    name = ''  # or whatever code you want to run if it fails
Use this in place of name = fields[0].
I am new to Python and stuck with a log file in text format that has the following repetitive structure. I need to extract the data from the rows and turn it into columns, depending on the data, e.g.
The first 50 lines are garbage, like the first few lines below:
-------------------------------------------------------------
Logging to file xyz.
Char
1,
3
r
=
----------------------------------------------
Pid 0
Name SAB=1, XYZ=3
----------------------------------------------
a 1
b 2
c 3
----------------------------------------------
Pid 0
Name SAB=1, XYZ=3, P_NO=546467
----------------------------------------------
Test_data_1 00001
Test_data_2 FOXABC
Test_data_3 SHEEP123
Country US
----------------------------------------------
Pid 0
Name SAB=1
----------------------------------------------
Sno 893489423
Log FileFormat
------------Continues for another million lines.
Now the required output is like below:
Required output format
PID, Name, a,b,c
0, "SAB=1, XYZ=3", 1,2,3
PID, Name , Test_data_1, Test_data_2, Test_data_3, Country
0, "SAB=1, XYZ=3, P_NO=546467", 00001, FOXABC, SHEEP123, US
Pid, Name, Sno
0, SAB=1, 893489423
I tried to write some code but failed to get the desired results. My attempt was as below:
fn = open(file_name, 'r')
for i, line in enumerate(fn):
    if i >= 50 and "Name " in line:  # skip the first 50 lines / starting point
        last_tag = line.split(",")[-1]
        last_element = last_tag.split("=")[0]
        print(last_element)
Any help would be really appreciated.
Newly Discovered Structure
RBY Structure
The solution I came up with is a bit messy but it works, check it out below:
import sys
import re
import StringIO
ifile = open(sys.argv[1],'r') #Input log file as command-line argument
ofile = open(sys.argv[1][:-4]+"_formatted.csv",'w') #output formatted log txt
stringOut = ""
i = 0
flagReturn = True
j = 0
reVal = re.compile("Pid[\s]+(.*)\nName[\s]+(.*)\n[-]+\<br\>(.*)\<br\>") #Regex pattern for separating the Pid & Name from the variables
reVar = re.compile("(.*)[ ]+(.*)") #Regex pattern for getting vars and their values
reVarStr = re.compile(">>> [0-9]+.(.*)=(.*)") #Regex Pattern for Struct
reVarStrMatch = re.compile("Struct(.*)+has(.*)+members:") #Regex pattern for Struct check
for lines in ifile.readlines():
    if i > 8:  # Omitting the first 9 lines of garbage values
        if lines.strip() == "----------------------------------------------":  # Checking for separation between Pid & Name group and the var group
            j += 1  # variable keeping track of whether we are inside the vars section or not (between two rows of hyphens)
            flagReturn = not flagReturn  # To print the variables on a single line to easily separate them with regex pattern reVal
        if not flagReturn:
            stringTmp = lines.strip() + "<br>"  # adding a break to the end of each vars line for easier separation
        else:
            stringTmp = lines  # if not vars then save each line as is
        stringOut += stringTmp  # concatenating the lines to form the searchable string
    i += 1  # incrementing for omitting lines (unused after i = 8)
    if j == 2:  # Once a complete set of Pids, Names and vars has been collected
        j = 0  # Reset j
        matchObj = reVal.match(stringOut)  # Match for Pid, Name & vars
        line1 = "Pid,Name,"
        line2 = matchObj.group(1).strip() + ",\"" + matchObj.group(2) + "\","
        buf = StringIO.StringIO(matchObj.group(3).replace("<br>", "\n"))
        structFlag = False
        for line in buf.readlines():  # Separate each var and add it to the respective strings for writing to file
            if reVarStrMatch.match(line) is not None:
                structFlag = True
            elif structFlag and reVarStr.match(line) is not None:
                matchObjVars = reVarStr.match(line)
                line1 += matchObjVars.group(1).strip() + ","
                line2 += matchObjVars.group(2).strip() + ","
            else:
                structFlag = False
                matchObjVars = reVar.match(line)
                try:
                    line1 += matchObjVars.group(1).strip() + ","
                    line2 += matchObjVars.group(2).strip() + ","
                except:
                    line1 += line.strip() + ","
                    line2 += " ,"
        ofile.writelines(line1[:-1] + "\n")
        ofile.writelines(line2[:-1] + "\n")
        ofile.writelines("\n")
        stringOut = ""  # Resetting the string
ofile.close()
ifile.close()
EDIT
This is what I came up with to include the new pattern as well.
I suggest you do the following:
1. Run the parser script on a copy of the log file and see where it fails next.
2. Identify and write down the new pattern that broke the parser.
3. Delete all data in the newly identified pattern.
4. Repeat from Step 1 until all patterns have been identified.
5. Create an individual regular expression for each type of pattern and call them in separate functions to write to the string.
EDIT 2
structFlag = False
RBYFlag = False
for line in buf.readlines():  # Separate each var and add it to the respective strings for writing to file
    if reVarStrMatch.match(line) is not None:
        structFlag = True
    elif structFlag and reVarStr.match(line) is not None:
        matchObjVars = reVarStr.match(line)
        if matchObjVars.group(1).strip() == "RBY" and not RBYFlag:
            line1 += matchObjVars.group(1).strip() + ","
            line2 += matchObjVars.group(2).strip() + "**"
            RBYFlag = True
        elif matchObjVars.group(1).strip() == "RBY":
            line2 += matchObjVars.group(2).strip() + "**"
        else:
            if RBYFlag:
                line2 = line2[:-2]
                RBYFlag = False
            line1 += matchObjVars.group(1).strip() + ","
            line2 += matchObjVars.group(2).strip() + ","
    else:
        structFlag = False
        if RBYFlag:
            line2 = line2[:-2]
            RBYFlag = False
        matchObjVars = reVar.match(line)
        try:
            line1 += matchObjVars.group(1).strip() + ","
            line2 += matchObjVars.group(2).strip() + ","
        except:
            line1 += line.strip() + ","
            line2 += " ,"
NOTE
This loop has become very bloated, and it is better to create a separate function that identifies the type of data and returns a value accordingly.
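A rough sketch of that idea, reusing the compiled patterns from the code above (the handler names here are hypothetical, not part of the original answer):

def classify_line(line):
    """Return a (kind, match) pair describing one vars line."""
    if reVarStrMatch.match(line):
        return 'struct-header', None
    m = reVarStr.match(line)
    if m:
        return 'struct-member', m
    m = reVar.match(line)
    if m:
        return 'plain-var', m
    return 'unknown', None

def append_var(match, line1, line2):
    """Append one name/value pair to the two CSV rows being built."""
    line1 += match.group(1).strip() + ","
    line2 += match.group(2).strip() + ","
    return line1, line2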
I want to extract website names from the URL. For example, https://plus.google.com/in/test.html
should give the output "plus google".
Some more test cases are:
WWW.OH.MADISON.STORES.ADVANCEAUTOPARTS.COM/AUTO_PARTS_MADISON_OH_7402.HTML
Output:- OH MADISON STORES ADVANCEAUTOPARTS
WWW.LQ.COM/LQ/PROPERTIES/PROPERTYPROFILE.DO?PROPID=6054
Output:- LQ
WWW.LOCATIONS.DENNYS.COM
Output:- LOCATIONS DENNYS
WV.WESTON.STORES.ADVANCEAUTOPARTS.COM
Output:- WV WESTON STORES ADVANCEAUTOPARTS
WOODYANDERSONFORDFAYETTEVILLE.NET/
Output:- WOODYANDERSONFORDFAYETTEVILLE
WILMINGTONMAYFAIRETOWNCENTER.HGI.COM
Output:- WILMINGTONMAYFAIRETOWNCENTER HGI
WHITEHOUSEBLACKMARKET.COM/
Output:- WHITEHOUSEBLACKMARKET
WINGATEHOTELS.COM
Output:- WINGATEHOTELS
string = str(input("Enter the url "))
new_list = list(string)
count = 0
flag = 0
if 'w' in new_list:
    index1 = new_list.index('w')
    new_list.pop(index1)
    count += 1
if 'w' in new_list:
    index2 = new_list.index('w')
    if index2 != -1 and index2 == index1:
        new_list.pop(index2)
        count += 1
if 'w' in new_list:
    index3 = new_list.index('w')
    if index3 != -1 and index3 == index2 and new_list[index3+1] == '.':
        new_list.pop(index3)
        count += 1
        flag = 1
if flag == 0:
    start = string.find('/')
    start += 2
    end = string.rfind('.')
    new_string = string[start:end]
    print(new_string)
elif flag == 1:
    start = string.find('.')
    start = start + 1
    end = string.rfind('.')
    new_string = string[start:end]
    print(new_string)
The above works for some testcases but not all. Please help me with it.
Thanks
This is something you could build upon, using urllib.parse.urlparse:
from urllib.parse import urlparse

tests = ('https://plus.google.com/in/test.html',
         ('WWW.OH.MADISON.STORES.ADVANCEAUTOPARTS.COM/'
          'AUTO_PARTS_MADISON_OH_7402.HTML'),
         'WWW.LQ.COM/LQ/PROPERTIES/PROPERTYPROFILE.DO?PROPID=6054')

def extract(url):
    # urlparse will not work without a 'scheme'
    if not url.startswith('http'):
        url = 'http://' + url
    parsed = urlparse(url).netloc
    split = parsed.split('.')[:-1]  # get rid of the TLD
    if split[0].lower() == 'www':
        split = split[1:]
    ret = ' '.join(split)
    return ret

for url in tests:
    print(extract(url))
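For the three test URLs above, this should print "plus google", "OH MADISON STORES ADVANCEAUTOPARTS" and "LQ", matching the expected outputs listed in the question.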
The function strips the URL from the double slash to the next single slash; the rest is clean-up:
def stripURL(url, TwoSlashes, OneSlash):
    try:
        start = url.index(TwoSlashes) + len(TwoSlashes)
        end = url.index(OneSlash, start)
        return url[start:end]
    except ValueError:
        return ""

url = raw_input("URL : ")
if "www." in url:
    url = url.replace("www.", "")
Strip = stripURL(url, "//", "/")
# Strips anything after the last period found
Stripped = Strip[:Strip.rfind(".")]
# get rid of any periods used in the name
Stripped = Stripped.replace(".", " ")
print Stripped
I am having an issue getting the train function to work correctly in Python. I cannot modify the def function. I am at the point where I need the second file to be read one line at a time for PosList, and I need to match the value of movieWordCount[z] in OpenPos. If the line is there, I want to increment column 2 of that line (segmented by a space) by one. If it is not, I need the else branch to append it to the end of the file. It does not work: it does not append the values when they are missing, and I am not sure if it will find the value when it is there. I have been stuck getting this to work for two days.
Here is my code segment I am working with:
with open("PosList") as OpenPos:
lines = OpenPos.readlines()
print lines
if movieWordCount[z] in lines:
print "found"
#Now use tokenize to split it apart by space and set to new array for me to call column2
else:
print "not found"
lines.append(movieWordCount[z] + " 1" + "\n")
Here is my full code:
#!/usr/bin/python
# Import Counter
import collections
from collections import Counter
# Was already here but pickle is used for data input and export
import math, os, pickle, re

class Bayes_Classifier:
    def __init__(self, trainDirectory="movie_reviews/"):
        # If file listing exists skip to train
        if os.path.isfile('iFileList'):
            print "file found"
            self.train()
            #self.classify()
        # If file listing does not exist skip to train
        if not os.path.isfile('iFileList'):
            print "no file"
            newfile = 'iFileList'
            tempList = set()
            subDir = './movie_reviews'
            for filenames in os.listdir(subDir):
                my_sub_path = os.path.join(os.sep, subDir, filenames)
                tempList.add(filenames)
            self.save("filenames", "try3")
            f = []
            for fFileObj in os.walk("movie_reviews/"):
                f.extend(fFileObj)
                break
            pickle.dump(f, open("save.p", "wb"))
            self.save(f, "try4")
            with open(newfile, 'wb') as fi:
                pickle.dump(tempList, fi)
            #print tempList
            self.train()
            #self.classify()

    def train(self):
        '''Trains the Naive Bayes Sentiment Classifier.'''
        print "File ready for training"
        # Open iFileList to use as input for opening movie files
        x = 0
        OpenIFileList = open('iFileList', 'r')
        print "iFileList now Open"
        # Loop through the file
        for line in OpenIFileList:
            #print "Ready to read lines"
            #print "reading line " + line
            if x > 4:
                if x % 2 == 0:
                    #print line
                    s = line
                    if '-' in s:
                        comp = s.split("'")
                        #print comp[2]
                        print comp[1]  # This is what you need for the movie file
                        compValue1 = comp[1]
                        # Determine Positive/Negative.
                        # compType is the variable I am storing it to.
                        compType = compValue1.split("-", 2)[1]
                        #print compType  # Prints that middle value like 5 or 1
                        # This will do the work based on the value.
                        if compType == '5':
                            # print "you have a five"  # Confirms the loop I am in.
                            # If the file does not exist, create it
                            if not os.path.exists('PosList'):
                                print "no file"
                                file('PosList', 'w').close()
                            # Open file that needs to be reviewed for word count
                            compValue2 = "movie_reviews/" + compValue1
                            print compValue2  # Prints the directory and file path
                            OpenMovieList = open(compValue2, 'r')
                            for commentLine in OpenMovieList:
                                commentPositive = commentLine.split(" ")
                                commentPositiveCounter = Counter(commentPositive)
                                #print commentPositiveCounter  # "Comment Pos goes here"
                                #if commentLine != '' or commentLine != ' ':
                                # Get first word, second word, ....
                                if commentLine and (not commentLine.isspace()):
                                    movieWordCount = self.tokenize(commentLine)
                                    y = len(movieWordCount)  # determines length of string
                                    print y
                                    z = 0
                                    #print movieWordCount[0]  # Shows the zero position in the file.
                                    while z < y:
                                        print "position " + str(z) + " word is " + movieWordCount[z]  # Shows the word we are at and its position id
                                        with open("PosList") as OpenPos:
                                            lines = OpenPos.readlines()
                                            print lines
                                            if movieWordCount[z] in lines:
                                                print "found"
                                            else:
                                                print "not found"
                                                lines.append(movieWordCount)
                                        z = z + 1
                            # Close the files
                            OpenMovieList.close()
                            OpenPos.close()
            x += 1
            #for line2 in OpenIFileList.readlines():
            #for line in open('myfile','r').readlines():
            #    do_something(line)
        # Save results
        # Close the File List
        OpenIFileList.close()

    def loadFile(self, sFilename):
        '''Given a file name, return the contents of the file as a string.'''
        f = open(sFilename, "r")
        sTxt = f.read()
        f.close()
        return sTxt

    def save(self, dObj, sFilename):
        '''Given an object and a file name, write the object to the file using pickle.'''
        f = open(sFilename, "w")
        p = pickle.Pickler(f)
        p.dump(dObj)
        f.close()

    def load(self, sFilename):
        '''Given a file name, load and return the object stored in the file.'''
        f = open(sFilename, "r")
        u = pickle.Unpickler(f)
        dObj = u.load()
        f.close()
        return dObj

    def tokenize(self, sText):
        '''Given a string of text sText, returns a list of the individual tokens that
        occur in that string (in order).'''
        lTokens = []
        sToken = ""
        for c in sText:
            if re.match("[a-zA-Z0-9]", str(c)) != None or c == "\'" or c == "_" or c == '-':
                sToken += c
            else:
                if sToken != "":
                    lTokens.append(sToken)
                    sToken = ""
                if c.strip() != "":
                    lTokens.append(str(c.strip()))
        if sToken != "":
            lTokens.append(sToken)
        return lTokens
To open a file for writing, you can use
with open('PosList', 'w') as Open_Pos:
As you are using the with form, you do not need to close the file; Python will do that for you at the end of the with-block.
So assuming that the way you add data to the lines variable is correct, you could remove the superfluous code OpenMovieList.close() and OpenPos.close(), and append 2 lines to your code:
with open("PosList") as OpenPos:
lines = OpenPos.readlines()
print lines
if movieWordCount[z] in lines:
print "found"
else:
print "not found"
lines.append(movieWordCount)
with open("PosList", "w") as OpenPos:
OpenPos.write(lines)
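The snippet above only appends missing words; the increment of the per-word count (the second, space-separated column) that the question describes is not covered by it. A rough sketch of one way to do that, assuming each PosList line has the hypothetical form "word count" (this helper is not part of the original answer):

def update_pos_list(word, path="PosList"):
    # Read any existing "word count" lines into a dict.
    counts = {}
    try:
        with open(path) as f:
            for entry in f:
                parts = entry.split()
                if len(parts) == 2:
                    counts[parts[0]] = int(parts[1])
    except IOError:
        pass  # PosList does not exist yet; start empty
    # Increment the count for this word, or start it at 1.
    counts[word] = counts.get(word, 0) + 1
    # Write everything back out.
    with open(path, "w") as f:
        for w, c in counts.items():
            f.write("%s %d\n" % (w, c))

A call such as update_pos_list(movieWordCount[z]) could then replace the read/append block inside the while loop.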
I am trying to create a simple word search program.
I have successfully opened an external file that contains the grid of the word search. I also have successfully opened a file that contains the words that are to be searched for. I have stored every line of the grid in a list and every word from the file in a list called words[].
I am attempting to search for the words in each line of the grid. My code currently does not search for the word in each line of the grid.
gridlines_horizontal = []
gridlines_vertical = []
words = []
not_found = []
found_words = {}

def puzzle(fname):
    print ""
    for line in f:
        gridlines_horizontal.append(line)
    for line in gridlines_horizontal:
        print line,
    for item in zip(*(gridlines_horizontal[::-1])):
        gridlines_vertical.append(item)
Here I am trying to get each word in words[] one at a time and see if the word is in any of the lines of the word search grid. If the word is present in any of the lines I am then trying to print the word. The code currently does not do this.
def horizontal_search(word, gridlines_horizontal):
    x = 0
    for line in gridlines_horizontal:
        if words[0] in line or words[0] in line[::-1]:
            found_words.update({words[0]: " "})
            print words[0]
        else:
            not_found.append(words)
        x = x + 1

def vertical_search(word, gridlines_vertical):
    x = 0
    for line in gridlines_vertical:
        if words[x] in line or words[x] in line[::-1]:
            print words[0]
            found_words.update({words[x]: " "})
        else:
            not_found.append(words[x])
        x = x + 1

while True:
    try:
        fname = input("Enter a filename between double quotation marks: ")
        with open(fname) as f:
            puzzle(fname)
        break
    except IOError as e:
        print ""
        print("Problem opening file...")
        print ""

while True:
    try:
        fname2 = input("Enter a filename for your words between double quotation marks: ")
        with open(fname2) as f:
            for line in f:
                words.append(line)
                """ line in words:
                line = lin """
        break
    except IOError as e:
        print("")
        print("Problem opening file...")
There are a couple of mistakes in your code:
- You aren't being consistent in using words[x]; you would want to replace every words[0] with words[x], BUT
- this isn't necessary, because you can use nested for loops instead.
So for horizontal search:
def horizontal_search(words, gridlines_horizontal):
    for word in words:
        for line in gridlines_horizontal:
            if word in line or word in line[::-1]:
                found_words.update({word: " "})
                print(word)
                break
        else:
            not_found.append(word)
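The same nested-loop pattern should work for the vertical search as well; a sketch, assuming gridlines_vertical holds the character tuples produced by zip() in puzzle():

def vertical_search(words, gridlines_vertical):
    for word in words:
        for column in gridlines_vertical:
            column = ''.join(column)  # zip() yields tuples of characters
            if word in column or word in column[::-1]:
                found_words.update({word: " "})
                print(word)
                break
        else:
            not_found.append(word)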
Did you look at find?
a = 'this is a string'
b = 'string'
if a.find(b) > -1:
    print 'found substring in string'
else:
    print 'substring not found in string'
EDIT:
I am not sure if it's a typo, but you are passing word as the parameter while the body uses words:
def horizontal_search(word, gridlines_horizontal):
                      ^^^^
    x = 0
    for line in gridlines_horizontal:
        if words[0] in line or words[0] in line[::-1]:
           ^^^^^^^^ -- does not match the parameter name
There is a similar issue with def vertical_search(word, gridlines_vertical).
I am doing this:
def GetDistinctValues(theFile, theColumn):
    lines = theFile.split('\n')
    allValues = []
    for line in lines:
        allValues.append(line[theColumn-1])
    return list(set(allValues))
I am getting "string index out of range" on this line:
allValues.append(line[theColumn-1])
Does anyone know what I am doing wrong?
Here's the complete code if needed:
import hashlib

def doStuff():
    createFiles('together.csv')

def readFile(fileName):
    a = open(fileName)
    fileContents = a.read()
    a.close()
    return fileContents

def GetDistinctValues(theFile, theColumn):
    lines = theFile.split('\n')
    allValues = []
    for line in lines:
        allValues.append(line[theColumn-1])
    return list(set(allValues))

def createFiles(inputFile):
    inputFileText = readFile(inputFile)
    b = inputFileText.split('\n')
    r = readFile('header.txt')
    DISTINCTCOLUMN = 12
    dValues = GetDistinctValues(inputFileText, DISTINCTCOLUMN)
    for uniqueValue in dValues:
        theHash = hashlib.sha224(uniqueValue).hexdigest()
        for x in b:
            if x[DISTINCTCOLUMN] == uniqueValue:
                x = x.replace(', ', ',').decode('latin-1', 'ignore')
                y = x.split(',')
                if len(y) < 3:
                    break
                elif len(y) > 3:
                    desc = ' '.join(y[3:])
                else:
                    desc = 'No description'
                # Replacing non-XML-allowed characters here (add more if needed)
                y[2] = y[2].replace('&', '&amp;')
                desc = desc.replace('&', '&amp;')
                r += '\n<Placemark><name>' + y[2].encode('utf-8', 'xmlcharrefreplace') + '</name>' \
                     '\n<description>' + desc.encode('utf-8', 'xmlcharrefreplace') + '</description>\n' \
                     '<Point><coordinates>' + y[0] + ',' + y[1] + '</coordinates></Point>\n</Placemark>'
        r += readFile('footer.txt')
        f = open(theHash, 'w')
        f.write(r)
        f.close()
The error isn't caused by append(); it's because the line isn't long enough. Maybe your file has an empty line at the end. You could try:
def GetDistinctValues(theFile, theColumn):
    lines = theFile.split('\n')
    allValues = []
    for line in lines:
        if line:
            allValues.append(line[theColumn-1])
    return list(set(allValues))
Otherwise, an exception handler can help find what's going wrong:
def GetDistinctValues(theFile, theColumn):
    lines = theFile.split('\n')
    allValues = []
    for line in lines:
        try:
            allValues.append(line[theColumn-1])
        except IndexError:
            print "line: %r" % line
    return list(set(allValues))
That is happening because line doesn't have as many elements as the code is assuming. Try the following:
for line in lines:
    if len(line) < theColumn:
        print "This line doesn't have enough elements:\n" + line
    else:
        allValues.append(line[theColumn-1])
return list(set(allValues))
That will give you a hint; it is the type of error you get when trying to access an element outside the range of a sequence, i.e. a non-existent element.
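For instance, a two-line illustration of the same error:

line = "short"
line[11]  # IndexError: string index out of range (only 5 characters exist)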
line[theColumn-1]
This will of course raise the mentioned error if the string (line) is shorter than theColumn.
What else would you expect?