IndexError: list index out of range when using tuples - python

I'm very confused. I get an error on line 43 saying that the list index is out of range. Any help is appreciated.
def tokenize(lines):
words = []
for line in lines:
start = 0
end = start + 1
while start < len(line):
character = line[start]
if character.isspace():
end += 1
elif character.isalpha():
end = start + 1
while end < len(line) and line[end].isalpha():
end += 1
words.append(line[start:end].lower())
elif character.isdigit():
end = start + 1
while end < len(line) and line[end].isdigit():
end += 1
words.append(line[start:end])
else:
end += 1
words.append(line[start:end])
start = end
return words
def countWords(words, stopWords):
wordDict = {}
for word in words:
if word in stopWords:
continue
elif not word in wordDict:
wordDict[word] = 1
else:
frequency = wordDict.get(word)
wordDict[word] = frequency + 1
return wordDict
def printTopMost(frequencies, n):
listOfTuples = sorted(frequencies.items(), key=lambda x:x[1], reverse=True)
for x in range(n):
pair = listOfTuples[x]
word = pair[0]
frequency = str(pair[1])
print(word.ljust(20), frequency.rjust(5))
pair = listOfTuples[x] gives me an error. Please help me why do i have to add this much text it says mostly code please.
This is how the function is called: (test.py) there are instructions for the other functions I have created like tokenize and countWords also, but the error I'm getting is not a part of that which is why i've left those out.
def printTopMost(freq,n):
saved = sys.stdout
sys.stdout = io.StringIO()
wordfreq.printTopMost(freq,n)
out = sys.stdout.getvalue()
sys.stdout = saved
return out
test(printTopMost,({"horror": 5, "happiness": 15},0),"")
test(printTopMost,({"C": 3, "python": 5, "haskell": 2, "java":
1},3),"python 5\nC 3\nhaskell
2\n")
Full error message
Traceback (most recent call last):
File "C:/Users/Daniel/Documents/Scripts/Chalmers/lab1/test.py", line 81, in <module>
run()
File "C:/Users/Daniel/Documents/Scripts/Chalmers/lab1/test.py", line 70, in run
test(printTopMost,({},10),"")
File "C:/Users/Daniel/Documents/Scripts/Chalmers/lab1/test.py", line 8, in test
z = fun(*x)
File "C:/Users/Daniel/Documents/Scripts/Chalmers/lab1/test.py", line 41, in printTopMost
wordfreq.printTopMost(freq,n)
File "C:\Users\Daniel\Documents\Scripts\Chalmers\lab1\wordfreq.py", line 4, in printTopMost
pair = listOfTuples[x]
IndexError: list index out of range
Condition failed:
printTopMost({'horror': 5, 'happiness': 15}, 0) == ''
printTopMost returned/printed:
happiness 15
horror 5
Condition failed:
printTopMost({'C': 3, 'python': 5, 'haskell': 2, 'java': 1}, 3) == 'python 5\nC 3\nhaskell 2\n'
printTopMost returned/printed:
python 5
C 3
haskell 2
java 1
https://i.imgur.com/9SciXtx.png

def printTopMost(frequencies, n):
listOfTuples = sorted(frequencies.items(), key=lambda x:x[1], reverse=True)
n=min(n,len(listOfTuples))
for x in range(n):
pair = listOfTuples[x]
word = pair[0]
frequency = str(pair[1])
print(word.ljust(20), frequency.rjust(5))
Simple hack

Related

Map Reduce program to calculate the average and count

I am trying to calculate the taxi and its trip using map reduce python program.
In Map program I have written the following code where it will assign each row a key.
import sys
for line in sys.stdin:
line = line.strip()
words = line.split(",")
trip = words[0]
km = words[1]
print('%s\t%s\t%s' % (trip, km, "1"))
Next while in reducer below is the program.
#!/usr/bin/env python3
import sys
current_trip = None
current_km = 0
current_count = 0
trip = None
gender = None
for line in sys.stdin:
line = line.strip()
trip,gender,count = line.split(",")
try:
count = int(count)
except ValueError:
continue
if current_trip == trip:
current_km = (km + current_km)
current_count += count
print('%s\t%s' % (current_trip,current_count, {current_km/current_count}))
current_trip = trip
current_count = count
current_km = 0
else:
if current_trip == trip:
current_count += count
print('%s\t%s' % (current_trip, current_count,km))
Here I am getting the error saying
Traceback (most recent call last):
File "reducer.py", line 23, in <module>
print('%s\t%s\t%s' % (current_trip, current_count, {current_km / current_count}))
ZeroDivisionError: division by zero
and I am not able to debug properly because if I include the print statement it is not printing in output.
Can someone please help
If the first line contains a count 0, or you have negative counts and at some point the current_count is 0, you will get this error. Try to add a condition before your print method to debug the problem:
if current_count != 0:
print('%s\t%s' % (current_trip,current_count, {current_km/current_count}))
else:
print(f"error: the current_count is 0 and the count is {count}")

Python code for optimal alignment score and sequence giving wrong result

This is my first time coding, so please do understand my code is very messy. I have done two different ways to get the optimal score and the optimal sequence, unfortunately both of my answers are wrong. In my code I have included a way to open a fasta file, but since this seemed to not work I also just included the sequences in the code myself.
My optimal score is computed but not printed for some reason- it is also wrong I have 208 when I should get 275. I also dont get a correct alignment score back.
The two sequences are
The scoring alignment needs to follow , 11 for internal gaps, 8 for terminal gaps on the 5' end, 7 for gaps on the 3' end, 4 for mismatches, 0 for matches
My file is at [removed link]
my_file = open("one.fasta","w")
my_file.write (""">Testseq1
TCTGGTGTCCTAGGCGTAGAGGAACCACACCAATCCATCCCGAACTCTGGTGGTTAAACTCTACTGCGGTGACGATACT""")
sequenceone= open("one.fasta","r")
line = sequenceone.readline()
header = ""
seqA = ""
while line:
line = line.rstrip("\n")
if ">" in line:
header = line
else :
seqA = seqA + line
line = sequenceone.readline()
my_file.close()
my_files = open("two.fasta","w")
my_files.write (""">Testseq2
TGGTGCGGTCATACCAGCGCTAATGCACCGGATCCCATCAGAACTCCGCAGTTAAGCGCGCTTGGGCCAGAACAGTACTGGGATGGGTGTCC""")
sequencetwo= open("two.fasta","r")
line = sequencetwo.readline()
header = ""
seqB = ""
while line:
line = line.rstrip("\n")
if ">" in line:
header = line
else :
seqB = seqB + line
line = sequencetwo.readline()
my_files.close()
alphabet = ["A","C","G","T"]
score = [[8,8,8,8,8],\
[0,4,4,4,11],\
[4,0,4,4,11],\
[4,4,0,4,11],\
[4,4,4,0,11],\
[7,7,7,7,7]]
def Global(a,b):
D = []
for i in range(len(a)+1):
D.append([0]* (len(b)+1))
for i in range(len(a)+1):
D[i][0] = D[i-1][0] + score[alphabet.index(a[i-1])][-1]
for i in range(len(b)+1):
D[0][i] = D[0][i-1] + score[-1][alphabet.index(b[i-1])]
for i in range (1, len(a)+1):
for j in range (1, len(b)+1):
distHor = D[i][j-1] + score[-1][alphabet.index(b[j-1])]
distVer = D[i-1][j] + score[alphabet.index(a[i-1])][-1]
if a[i-1] == b[j-1]:
distDiag = D[i-1][j-1]
else:
distDiag = D[i-1][j-1] + score[alphabet.index(a[i-1])][alphabet.index(b[j-1])]
D[i][j] = min(distHor, distVer, distDiag)
return D[-1][-1]
seqA = "TCTGGTGTCCTAGGCGTAGAGGAACCACACCAATCCATCCCGAACTCTGGTGGTTAAACTCTACTGCGGTGACGATACT"
seqB = "TGGTGCGGTCATACCAGCGCTAATGCACCGGATCCCATCAGAACTCCGCAGTTAAGCGCGCTTGGGCCAGAACAGTACTGGGATGGGTGTCC"
row = len(seqA)+1
column = len(seqB)+1
match = 0
mismatch = 4
gap = 11
align1=""
align2=""
matrix=[[[[None] for i in range (2)] for i in range(column)] for i in range(row)]
for i in range(column):
matrix[0][i][0]=gap*i
if(i>0):
matrix[0][i][1]="hor"
for i in range(row):
matrix[i][0][0]=gap*i
if(i>0):
matrix[i][0][1]="ver"
for i in range(1,row):
for j in range(1,column):
hor=matrix[i][j-1][0]+gap
ver=matrix[i-1][j][0]+gap
if (seqA[i-1]==seqB[j-1]):
diag=matrix[i-1][j-1][0]+match
else:
diag=matrix[i-1][j-1][0]+mismatch
var = {hor:"hor",ver:"ver",diag:"diag"}
hvd=[hor,ver,diag]
matrix[i][j][0]=max(hvd)
matrix[i][j][1]=var.get(max(var))
k=row
l=column
while(True):
if(l==1 and k==1):
break
else:
if(matrix[k-1][l-1][1]=="ver"):
align1+=seqA[k-2]
align2+="-"
k-=1
elif(matrix[k-1][l-1][1]=="hor"):
align1+="-"
align2+=seqB[l-2]
l-=1
elif(matrix[k-1][l-1][1]=="diag"):
align1+=seqA[k-2]
align2+=seqB[l-2]
k-=1
l-=1
align1=align1[::-1]
align2=align2[::-1]
print (align1)
print (align2)
Global(seqA,seqB)
Please can anyone guide me on what I am doing wrong?

Python: Count kmers from fasta files

I want to count the kmers from a fasta file. I have the following script:
import operator
seq = open('file', 'r')
kmers = {}
k = 5
for i in range(len(seq) - k + 1):
kmer = seq[i:i+k]
if kmer in kmers:
kmers[kmer] += 1
else:
kmers[kmer] = 1
for kmer, count in kmers.items():
print (kmer + "\t" + str(count))
sortedKmer = sorted(kmers.items(), key=itemgetter(1), reverse=True)
for item in sortedKmer:
print (item[0] + "\t" + str(item[1]))
This works fine for a file with only one sequence, but now I have a fasta file with several contigs.
My fasta file looks like this:
>1
GTCTTCCGGCGAGCGGGCTTTTCACCCGCTTTATCGTTACTTATGTCAGCATTCGCACTT
CTGATACCTCCAGCAACCCTCACAGGCCACCTTCGCAGGCTTACAGAACGCTCCCCTACC
CAACAACGCATAAACGTCGCTGCCGCAGCTTCGGTGCATGGTTTAGCCCCGTTACATCTT
CCGCGCAGGCCGACTCGACCAGTGAGCTATTACGCTTTCTTTAAATGATGGCTGCTTCTA
AGCCAACATCCTGGCTGTCTGG
>2
AAAGAAAGCGTAATAGCTCACTGGTCGAGTCGGCCTGCGCGGAAGATGTAACGGGGCTAA
ACCATGCACCGAAGCTGCGGCAGCGACACTCAGGTGTTGTTGGGTAGGGGAGCGTTCTGT
AAGCCTGTGAAGGTGGCCTGTGAGGGTTGCTGGAGGTATCAGAAGTGCGAATGCTGACAT
AAGTAACGATAAAGCGGGTGAAAAGCCCGCTCGCCGGAAGACCAAGGGTTCCTGTCCAAC
GTTAATCGGGGCAGG
How can I change the script that it take first the sequence after ">1", print that output, go to ">2", print that output etc?
I have never heard about kmer or fasta, but I think I understand what you are trying to do.
You can try to split on a regex involving '>', but I would recommend processing the file line by line and accumulate kmers before printing them appropriately when reaching the '>1'-lines. See below code with comments
import operator
def printSeq(name, seq):
# Extract your code into a function and print header for current kmer
print("%s\n################################" %name)
kmers = {}
k = 5
for i in range(len(seq) - k + 1):
kmer = seq[i:i+k]
if kmer in kmers:
kmers[kmer] += 1
else:
kmers[kmer] = 1
for kmer, count in kmers.items():
print (kmer + "\t" + str(count))
sortedKmer = sorted(kmers.items(), reverse=True)
for item in sortedKmer:
print (item[0] + "\t" + str(item[1]))
with open('file', 'r') as f:
seq = ""
key = ""
for line in f.readlines():
# Loop over lines in file
if line.startswith(">"):
# if we get '>' it is time for a new sequence
if key and seq:
# if it wasn't the first we should print it before overwriting the variables
printSeq(key, seq)
# store name after '>' and reset sequence
key = line[1:].strip()
seq = ""
else:
# accumulate kmer until we hit another '>'
seq += line.strip()
# when we are done with all the lines, print the last sequence
printSeq(key, seq)
I tried the following with your example FASTA file and it should work:
def count_kmers(seq, k, kmers):
for i in range(len(seq) - k + 1):
kmr = seq[i:i + k]
if kmr in kmers:
kmers[kmr] += 1
else:
kmers[kmr] = 1
filename = raw_input('File name/path: ')
k = input('Value for k: ')
kmers = {}
# Put each line of the file into a list (avoid empty lines)
with open(filename) as f:
lines = [l.strip() for l in f.readlines() if l.strip() != '']
# Find the line indices where a new sequence starts
idx = [i for (i, l) in enumerate(lines) if l[0] == '>']
idx += [len(lines)]
for i in xrange(len(idx) - 1):
start = idx[i] + 1
stop = idx[i + 1]
sequence = ''.join(lines[start:stop])
count_kmers(sequence, k, kmers)
print kmers
Hope it helps :)

python scripts showing different result( with one error ) in two similar input files

The script, originally taken and modified from (http://globplot.embl.de/):
#!/usr/bin/env python
# Copyright (C) 2003 Rune Linding - EMBL
# GlobPlot TM
# GlobPlot is licensed under the Academic Free license
from string import *
from sys import argv
from Bio import File
from Bio import SeqIO
import fpformat
import sys
import tempfile
import os
from os import system,popen3
import math
# Russell/Linding
RL = {'N':0.229885057471264,'P':0.552316012226663,'Q':-0.187676577424997,'A':-0.261538461538462,'R':-0.176592654077609, \
'S':0.142883029808825,'C':-0.0151515151515152,'T':0.00887797506611258,'D':0.227629796839729,'E':-0.204684629516228, \
'V':-0.386174834235195,'F':-0.225572305974316,'W':-0.243375458622095,'G':0.433225711769886,'H':-0.00121743364986608, \
'Y':-0.20750516775322,'I':-0.422234699606962,'K':-0.100092289621613,'L':-0.337933495925287,'M':-0.225903614457831}
def Sum(seq,par_dict):
sum = 0
results = []
raws = []
sums = []
p = 1
for residue in seq:
try:
parameter = par_dict[residue]
except:
parameter = 0
if p == 1:
sum = parameter
else:
sum = sum + parameter#*math.log10(p)
ssum = float(fpformat.fix(sum,10))
sums.append(ssum)
p +=1
return sums
def getSlices(dydx_data, DOM_join_frame, DOM_peak_frame, DIS_join_frame, DIS_peak_frame):
DOMslices = []
DISslices = []
in_DOMslice = 0
in_DISslice = 0
beginDOMslice = 0
endDOMslice = 0
beginDISslice = 0
endDISslice = 0
for i in range( len(dydx_data) ):
#close dom slice
if in_DOMslice and dydx_data[i] > 0:
DOMslices.append([beginDOMslice, endDOMslice])
in_DOMslice = 0
#close dis slice
elif in_DISslice and dydx_data[i] < 0:
DISslices.append([beginDISslice, endDISslice])
in_DISslice = 0
# elseif inSlice expandslice
elif in_DOMslice:
endDOMslice += 1
elif in_DISslice:
endDISslice += 1
# if not in slice and dydx !== 0 start slice
if dydx_data[i] > 0 and not in_DISslice:
beginDISslice = i
endDISslice = i
in_DISslice = 1
elif dydx_data[i] < 0 and not in_DOMslice:
beginDOMslice = i
endDOMslice = i
in_DOMslice = 1
#last slice
if in_DOMslice:
DOMslices.append([beginDOMslice, endDOMslice])
if in_DISslice:
DISslices.append([beginDISslice,endDISslice])
k = 0
l = 0
while k < len(DOMslices):
if k+1 < len(DOMslices) and DOMslices[k+1][0]-DOMslices[k][1] < DOM_join_frame:
DOMslices[k] = [ DOMslices[k][0], DOMslices[k+1][1] ]
del DOMslices[k+1]
elif DOMslices[k][1]-DOMslices[k][0]+1 < DOM_peak_frame:
del DOMslices[k]
else:
k += 1
while l < len(DISslices):
if l+1 < len(DISslices) and DISslices[l+1][0]-DISslices[l][1] < DIS_join_frame:
DISslices[l] = [ DISslices[l][0], DISslices[l+1][1] ]
del DISslices[l+1]
elif DISslices[l][1]-DISslices[l][0]+1 < DIS_peak_frame:
del DISslices[l]
else:
l += 1
return DOMslices, DISslices
def SavitzkyGolay(window,derivative,datalist):
SG_bin = 'sav_gol'
stdin, stdout, stderr = popen3(SG_bin + '-D' + str(derivative) + ' -n' + str(window)+','+str(window))
for data in datalist:
stdin.write(`data`+'\n')
try:
stdin.close()
except:
print stderr.readlines()
results = stdout.readlines()
stdout.close()
SG_results = []
for result in results:
SG_results.append(float(fpformat.fix(result,6)))
return SG_results
def reportSlicesTXT(slices, sequence, maskFlag):
if maskFlag == 'DOM':
coordstr = '|GlobDoms:'
elif maskFlag == 'DIS':
coordstr = '|Disorder:'
else:
raise SystemExit
if slices == []:
#by default the sequence is in uppercase which is our search space
s = sequence
else:
# insert seq before first slide
if slices[0][0] > 0:
s = sequence[0:slices[0][0]]
else:
s = ''
for i in range(len(slices)):
#skip first slice
if i > 0:
coordstr = coordstr + ', '
coordstr = coordstr + str(slices[i][0]+1) + '-' + str(slices[i][1]+1)
#insert the actual slice
if maskFlag == 'DOM':
s = s + lower(sequence[slices[i][0]:(slices[i][1]+1)])
if i < len(slices)-1:
s = s + upper(sequence[(slices[i][1]+1):(slices[i+1][0])])
#last slice
elif slices[i][1] < len(sequence)-1:
s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
elif maskFlag == 'DIS':
s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
#insert untouched seq between disorder segments, 2-run labelling
if i < len(slices)-1:
s = s + sequence[(slices[i][1]+1):(slices[i+1][0])]
#last slice
elif slices[i][1] < len(sequence)-1:
s = s + sequence[(slices[i][1]+1):(len(sequence))]
return s,coordstr
def runGlobPlot():
try:
smoothFrame = int(sys.argv[1])
DOM_joinFrame = int(sys.argv[2])
DOM_peakFrame = int(sys.argv[3])
DIS_joinFrame = int(sys.argv[4])
DIS_peakFrame = int(sys.argv[5])
file = str(sys.argv[6])
db = open(file,'r')
except:
print 'Usage:'
print ' ./GlobPipe.py SmoothFrame DOMjoinFrame DOMpeakFrame DISjoinFrame DISpeakFrame FASTAfile'
print ' Optimised for ELM: ./GlobPlot.py 10 8 75 8 8 sequence_file'
print ' Webserver settings: ./GlobPlot.py 10 15 74 4 5 sequence_file'
raise SystemExit
for cur_record in SeqIO.parse(db, "fasta"):
#uppercase is searchspace
seq = upper(str(cur_record.seq))
# sum function
sum_vector = Sum(seq,RL)
# Run Savitzky-Golay
smooth = SavitzkyGolay('smoothFrame',0, sum_vector)
dydx_vector = SavitzkyGolay('smoothFrame',1, sum_vector)
#test
sumHEAD = sum_vector[:smoothFrame]
sumTAIL = sum_vector[len(sum_vector)-smoothFrame:]
newHEAD = []
newTAIL = []
for i in range(len(sumHEAD)):
try:
dHEAD = (sumHEAD[i+1]-sumHEAD[i])/2
except:
dHEAD = (sumHEAD[i]-sumHEAD[i-1])/2
try:
dTAIL = (sumTAIL[i+1]-sumTAIL[i])/2
except:
dTAIL = (sumTAIL[i]-sumTAIL[i-1])/2
newHEAD.append(dHEAD)
newTAIL.append(dTAIL)
dydx_vector[:smoothFrame] = newHEAD
dydx_vector[len(dydx_vector)-smoothFrame:] = newTAIL
globdoms, globdis = getSlices(dydx_vector, DOM_joinFrame, DOM_peakFrame, DIS_joinFrame, DIS_peakFrame)
s_domMask, coordstrDOM = reportSlicesTXT(globdoms, seq, 'DOM')
s_final, coordstrDIS = reportSlicesTXT(globdis, s_domMask, 'DIS')
sys.stdout.write('>'+cur_record.id+coordstrDOM+coordstrDIS+'\n')
print s_final
print '\n'
return
runGlobPlot()
My input and output files are here: link
This script takes a input (input1.fa) and gives following output output1.txt
But when I try to run this script with similar type but larger input file (input2.fa) .. It shows following error:
Traceback (most recent call last):
File "final_script_globpipe.py", line 207, in <module>
runGlobPlot()
File "final_script_globpipe.py", line 179, in runGlobPlot
smooth = SavitzkyGolay('smoothFrame',0, sum_vector)
File "final_script_globpipe.py", line 105, in SavitzkyGolay
stdin.write(`data`+'\n')
IOError: [Errno 22] Invalid argument
I have no idea where the problem is. Any type of suggestion is appriciated.
I am using python 2.7 in windows 7 machine. I have also attached the Savitzky Golay module which is needed to run the script.
Thanks
UPDATE:
After trying to reproduce the error on linux it's showing a similar behavior, working fine with the first file but with the second is returning Errno32.
Traceback:
Traceback (most recent call last):
File "Glob.py", line 207, in <module>
runGlobPlot()
File "Glob.py", line 179, in runGlobPlot
smooth = SavitzkyGolay('smoothFrame',0, sum_vector)
File "Glob.py", line 105, in SavitzkyGolay
stdin.write(`data`+'\n')
IOError: [Errno 32] Broken pipe
Update:
Some calls of the SG_bin return that the -n parameter is the wrong type.
Wrong type of parameter for flag -n. Has to be unsigned,unsigned
This parameter comes from the window variable that is passed to the SavitzkyGolay function.
Surrounding the stdin.write with a trycatch block reveals that it breaks a hadnfull of times.
try:
for data in datalist:
stdin.write(repr(data)+'\n')
except:
print "It broke"

Positional Inverted Index in Python

I recently developed a Python program that makes an inverted index out of terms in a certain document. I now want to create position postings, such as
to, 993427:
⟨ 1, 6: ⟨7, 18, 33, 72, 86, 231⟩;
2, 5: ⟨1, 17, 74, 222, 255⟩; 4, 5: ⟨8, 16, 190, 429, 433⟩; 5, 2: ⟨363, 367⟩;
7, 3: ⟨13, 23, 191⟩; …⟩
I know the code is not complete as described above, I'm just trying to implement functionality.
from pprint import pprint as pp
from collections import Counter
import pprint
import re
import sys
import string
import fileinput
try:
reduce
except:
from functools import reduce
try:
raw_input
except:
raw_input = input
def readIn(fileglob): #Reads in multiple files and strips punctation/uppercase.
texts, words = {}, set()
for txtfile in (fileglob):
with open(txtfile, 'r') as splitWords:
txt = splitWords.read().lower().split()
txt = str(txt)
txt = re.findall(r'\w+', txt)
words |= set(txt)
texts[txtfile.split('\\')[-1]] = txt
return texts, words
def search(indexes): # Inverted index, based off the book and the web.
return reduce(set.intersection,
(index[word] for word in indexes),
set(texts.keys()))
def getWordBins(posOfWords):
cnt = Counter()
for word in posOfWords:
cnt[posOfWords] += 1
return cnt
def main(fileList, topWords):
tempArray = []
for x in range(1,len(fileList)):
tempArray.append(fileList[x])
texts, words = readIn(tempArray)
index = {word:set(txt
for txt, wrds in texts.items() if word in wrds)
for word in words}
test =({k + " " + str(len(v)) + " " + str(sorted(v)) for k,v in index.items()})
txt = readIn(fileList)
posWord = getWordBins(txt)
for key, value in posWord.most_common(topWords):
print key, value
#Writes out the information requested to a ".idx" file.
doc = open("document.idx", "w")
doc.write("# INPUT DOCUMENT REFERENCE LEGEND\n")
for fileNumber in range(1, len(fileList)):
doc.write(str(fileNumber) + "\t" + fileList[fileNumber] + "\n")
doc.write("# INVERTED INDEX RESULTS\n")
tempTest = []
for x in test:
tempTest.append(x.split(" "))
for x in tempTest:
tempStr = ""
for y in x:
tempStr += y + "\t"
doc.write(tempStr + "\n")
doc.close
main(sys.argv, sys.argv)
This is what I have so far, the only new functionality is the getWordBins function, and the loop:
txt = readIn(fileList)
posWord = getWordBins(txt)
for key, value in posWord.most_common(topWords):
print key, value
Now, what happens when I try to run the code is this:
Traceback (most recent call last):
File "Intro3.py", line 82, in <module>
main(sys.argv, sys.argv)
File "Intro3.py", line 60, in main
posWord = getWordBins(txt)
File "Intro3.py", line 41, in getWordBins
cnt[posOfWords] += 1
TypeError: unhashable type: 'dict'
Any guidance with this troubling error is gladly received. It is not a dictionary, so why the error?
Thanks for your time!
Where you're doing:
cnt[posOfWords] += 1
I think you might mean:
cnt[word] += 1
Your readin function also returns a dict and a set, so your txt variable is a tuple of (dict, set)
So your problem boils down to trying to use a tuple holding a dict as a key (which I doubt is your intent). And it wouldn't work for cnt[word] += 1, because that would still be trying to use a dict as a key too. You need to do this, probably:
txt, _ = readIn(fileList)
and then this might work:
cnt[word] += 1

Categories