Positional Inverted Index in Python - python

I recently developed a Python program that makes an inverted index out of terms in a certain document. I now want to create position postings, such as
to, 993427:
⟨ 1, 6: ⟨7, 18, 33, 72, 86, 231⟩;
2, 5: ⟨1, 17, 74, 222, 255⟩; 4, 5: ⟨8, 16, 190, 429, 433⟩; 5, 2: ⟨363, 367⟩;
7, 3: ⟨13, 23, 191⟩; …⟩
I know the code is not complete as described above, I'm just trying to implement functionality.
from pprint import pprint as pp
from collections import Counter
import pprint
import re
import sys
import string
import fileinput
try:
reduce
except:
from functools import reduce
try:
raw_input
except:
raw_input = input
def readIn(fileglob): #Reads in multiple files and strips punctation/uppercase.
texts, words = {}, set()
for txtfile in (fileglob):
with open(txtfile, 'r') as splitWords:
txt = splitWords.read().lower().split()
txt = str(txt)
txt = re.findall(r'\w+', txt)
words |= set(txt)
texts[txtfile.split('\\')[-1]] = txt
return texts, words
def search(indexes): # Inverted index, based off the book and the web.
return reduce(set.intersection,
(index[word] for word in indexes),
set(texts.keys()))
def getWordBins(posOfWords):
cnt = Counter()
for word in posOfWords:
cnt[posOfWords] += 1
return cnt
def main(fileList, topWords):
tempArray = []
for x in range(1,len(fileList)):
tempArray.append(fileList[x])
texts, words = readIn(tempArray)
index = {word:set(txt
for txt, wrds in texts.items() if word in wrds)
for word in words}
test =({k + " " + str(len(v)) + " " + str(sorted(v)) for k,v in index.items()})
txt = readIn(fileList)
posWord = getWordBins(txt)
for key, value in posWord.most_common(topWords):
print key, value
#Writes out the information requested to a ".idx" file.
doc = open("document.idx", "w")
doc.write("# INPUT DOCUMENT REFERENCE LEGEND\n")
for fileNumber in range(1, len(fileList)):
doc.write(str(fileNumber) + "\t" + fileList[fileNumber] + "\n")
doc.write("# INVERTED INDEX RESULTS\n")
tempTest = []
for x in test:
tempTest.append(x.split(" "))
for x in tempTest:
tempStr = ""
for y in x:
tempStr += y + "\t"
doc.write(tempStr + "\n")
doc.close
main(sys.argv, sys.argv)
This is what I have so far, the only new functionality is the getWordBins function, and the loop:
txt = readIn(fileList)
posWord = getWordBins(txt)
for key, value in posWord.most_common(topWords):
print key, value
Now, what happens when I try to run the code is this:
Traceback (most recent call last):
File "Intro3.py", line 82, in <module>
main(sys.argv, sys.argv)
File "Intro3.py", line 60, in main
posWord = getWordBins(txt)
File "Intro3.py", line 41, in getWordBins
cnt[posOfWords] += 1
TypeError: unhashable type: 'dict'
Any guidance with this troubling error is gladly received. It is not a dictionary, so why the error?
Thanks for your time!

Where you're doing:
cnt[posOfWords] += 1
I think you might mean:
cnt[word] += 1
Your readin function also returns a dict and a set, so your txt variable is a tuple of (dict, set)
So your problem boils down to trying to use a tuple holding a dict as a key (which I doubt is your intent). And it wouldn't work for cnt[word] += 1, because that would still be trying to use a dict as a key too. You need to do this, probably:
txt, _ = readIn(fileList)
and then this might work:
cnt[word] += 1

Related

How to fix TypeError: 'int' object is not callable from a divided number

Im trying to create a program to generate text with usernames from a txt file but I keep getting a TypeError: 'int' object is not iterable i know what this means but I have no idea how to fix my issue. I tried just doing y = 12 / 2 and the same error came up when i passed the for loop y i am really confused so if someone could help me that would be great
This is my code
def generateNum():
#imports random
from random import randint
for _ in range(10):
value = randint(0, 900000)
return(str(value))
def getNumOfLines( file):
#opens txt file
with open(file) as f:
Lines = f.readlines()
count = 0
# Strips the newline character
for line in Lines:
count += 1
return(count)
class debug:
def __init__(self, credsTxt, tagsTxt):
self.credsTxt = credsTxt
self.tagsTxt = tagsTxt
self.numOfCreds = getNumOfLines(credsTxt)
self.numOfTags = getNumOfLines(tagsTxt)
self.ammountPerAccount = round(self.numOfTags / self.numOfCreds)
def getComments(self):
#initializes comment
comment = ""
#opens txt file
file1 = open(self.tagsTxt, 'r')
count = 0
while True:
count += 1
# Get next line from file
line = file1.readline()
for i in self.ammountPerAccount:
# if line is empty
# end of file is reached
if not line:
break
comment += ' ' + line.strip() + ' ' + generateNum() + '.'
return(comment)
print(debug('D:/FiverrWork/user/instagram-bot/textGen/assets/login_Info.txt', 'D:/FiverrWork/user/instagram-bot/textGen/assets/tags.txt').getComments())
this is my stack trace error
Traceback (most recent call last):
File "d:\FiverrWork\user\textgenerator\textgenerator\txt.py", line 57, in <module>
print(debug('D:/FiverrWork/user/textgenerator/textgenerator/assets/login_Info.txt', 'D:/FiverrWork/user/textgenerator/textgenerator/assets/tags.txt').getComments())
File "d:\FiverrWork\user\textgenerator\textgenerator\txt.py", line 47, in getComments
for i in self.ammountPerAccount():
TypeError: 'int' object is not callable
Your for loop as posted cannot iterate over an int. You meant to iterate over a range():
for _ in range(self.ammountPerAccount):
# if line is empty
# end of file is reached
if not line:
break
comment += ' ' + line.strip() + ' ' + generateNum() + '.'
I used _ as a placeholder variable since the actual value of i each time was not used.

Write Python code that reads the line contents of a text file and return index of words into numbers

I am trying to write a code in python that goes through a text file and creates a word index of every word in the file. However, It's not executing properly. So far this is what I have:
import sys
import re
line = sys.stdin.readline()
pattern = re.compile("[a-zA-Z0-9]+")
while line:
def build_word_index(txt):
out = {}
for i, line in enumerate(txt.split("\n")):
for word in line.strip().split(" "):
if word not in out:
out[word] = [i + 1]
else:
out[word].append(i + 1)
return out
You never call the function!
import sys
import re
pattern = re.compile("[a-zA-Z0-9]+")
def build_word_index(txt):
out = {}
for i, line in enumerate(txt.split("\n")):
for word in line.strip().split(" "):
if word not in out:
out[word] = [i + 1]
else:
out[word].append(i + 1)
return out
while True:
line = sys.stdin.readline()
if not line:
break
out = build_word_index(line)
print(out)

IndexError: list index out of range when using tuples

I'm very confused. I get an error on line 43 saying that the list index is out of range. Any help is appreciated.
def tokenize(lines):
words = []
for line in lines:
start = 0
end = start + 1
while start < len(line):
character = line[start]
if character.isspace():
end += 1
elif character.isalpha():
end = start + 1
while end < len(line) and line[end].isalpha():
end += 1
words.append(line[start:end].lower())
elif character.isdigit():
end = start + 1
while end < len(line) and line[end].isdigit():
end += 1
words.append(line[start:end])
else:
end += 1
words.append(line[start:end])
start = end
return words
def countWords(words, stopWords):
wordDict = {}
for word in words:
if word in stopWords:
continue
elif not word in wordDict:
wordDict[word] = 1
else:
frequency = wordDict.get(word)
wordDict[word] = frequency + 1
return wordDict
def printTopMost(frequencies, n):
listOfTuples = sorted(frequencies.items(), key=lambda x:x[1], reverse=True)
for x in range(n):
pair = listOfTuples[x]
word = pair[0]
frequency = str(pair[1])
print(word.ljust(20), frequency.rjust(5))
pair = listOfTuples[x] gives me an error. Please help me why do i have to add this much text it says mostly code please.
This is how the function is called: (test.py) there are instructions for the other functions I have created like tokenize and countWords also, but the error I'm getting is not a part of that which is why i've left those out.
def printTopMost(freq,n):
saved = sys.stdout
sys.stdout = io.StringIO()
wordfreq.printTopMost(freq,n)
out = sys.stdout.getvalue()
sys.stdout = saved
return out
test(printTopMost,({"horror": 5, "happiness": 15},0),"")
test(printTopMost,({"C": 3, "python": 5, "haskell": 2, "java":
1},3),"python 5\nC 3\nhaskell
2\n")
Full error message
Traceback (most recent call last):
File "C:/Users/Daniel/Documents/Scripts/Chalmers/lab1/test.py", line 81, in <module>
run()
File "C:/Users/Daniel/Documents/Scripts/Chalmers/lab1/test.py", line 70, in run
test(printTopMost,({},10),"")
File "C:/Users/Daniel/Documents/Scripts/Chalmers/lab1/test.py", line 8, in test
z = fun(*x)
File "C:/Users/Daniel/Documents/Scripts/Chalmers/lab1/test.py", line 41, in printTopMost
wordfreq.printTopMost(freq,n)
File "C:\Users\Daniel\Documents\Scripts\Chalmers\lab1\wordfreq.py", line 4, in printTopMost
pair = listOfTuples[x]
IndexError: list index out of range
Condition failed:
printTopMost({'horror': 5, 'happiness': 15}, 0) == ''
printTopMost returned/printed:
happiness 15
horror 5
Condition failed:
printTopMost({'C': 3, 'python': 5, 'haskell': 2, 'java': 1}, 3) == 'python 5\nC 3\nhaskell 2\n'
printTopMost returned/printed:
python 5
C 3
haskell 2
java 1
https://i.imgur.com/9SciXtx.png
def printTopMost(frequencies, n):
listOfTuples = sorted(frequencies.items(), key=lambda x:x[1], reverse=True)
n=min(n,len(listOfTuples))
for x in range(n):
pair = listOfTuples[x]
word = pair[0]
frequency = str(pair[1])
print(word.ljust(20), frequency.rjust(5))
Simple hack

How to get around "source code string cannot contain null bytes" when eval is run without removing the null bytes

I have a program where I encrypt class.__dict__ and save it to a file in an array so the file looks something like this -
{'some name':[1, 2, 3],
'data':'''𚶩9È𚵶𚶛𚵝X§Ë¡¡Ö©𚶕 î𚶣o𚶥[𚵶𚶎 y𚵽𚵮^𚵜𚶩5Ð¥"¢§«!𚵩𚶅𚶚𚵰fa¥𚶯m𚵬c𚶮^𚵵W𚵴𚶭''' #somthing like this but a lot longer
}
I then read it and need to decrypt data but before that, I need to get the data from the array that is currently a string which I did with eval(fileContent) and it gives me the error -
Traceback (most recent call last):
File "C:/Users/stemb/Documents/programing/python/programs/img editor/__init__.py", line 127, in
<module>
main()
File "C:/Users/stemb/Documents/programing/python/programs/img editor/__init__.py", line 102, in main
save_code.auto_save_load()
File "C:\Users\stemb\Documents\programing\python\programs\img editor\save_code.py", line 153, in
auto_save_load
data = eval(fileContent)
ValueError: source code string cannot contain null bytes
My reading function is this
data = open("./save." + GLOBAL.fileType, "r", encoding="utf-8").read()
json.loads gives json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
My code is -
# imports
import json
import random
import time
print("Finished save_progress imports")
def encrypt(obj, log=0): #encrypt
# set data
data = str(obj.__dict__)
key = "codeAsciEightAndMabyA_Seven" + str(time.time()) # crate key
key = list(key)
random.shuffle(key) # randomize key
cryptionKeys = list()
encrypted = list()
iMinus = 0
for i in range(len(data)): # create a random + or - for extra security
cryptionKeys.append(random.choice(("+", "-")))
# encode part
for i, c in enumerate(data):
# set individual data
charAsci = ord(c)
cryptionKey = cryptionKeys[i]
done = 0
while done == 0:
try:
charKey = ord(key[i - iMinus])
done = 1
except IndexError:
iMinus += len(key)
if cryptionKey == "+":
encryptedOrd = charAsci + charKey
else:
encryptedOrd = charAsci - charKey
if encryptedOrd < 0:
encryptedOrd += 110000
cryptionKeys[i] = "="
cryptionKey = cryptionKeys[i]
encryptedChar = chr(encryptedOrd)
encrypted.append(encryptedChar)
if log == 1:
print("charNumb \/")
print(i)
print("charAsci \/")
print(charAsci)
print("cryptionKey \/")
print(cryptionKey)
print("charKey \/")
print(charKey)
print("encryptedOrd \/")
print(encryptedOrd)
print("encryptedChar \/")
print(encryptedChar)
print()
encrypted2 = encrypted
encrypted = ""
for c in encrypted2:
encrypted += c
return str(encrypted), str(key), str(cryptionKeys)
def auto_save(GLOBAL): # the save func
file = open("./save." + GLOBAL.fileType, "w", encoding="utf-8")
encryptedGLOBAL, key, cryptionKeys = encrypt(GLOBAL)
out = ("{'key':" + str(key) + ", 'cryptionKeys':" + str(cryptionKeys) + ", 'data':'''" + str(
encryptedGLOBAL) + "'''}")
print(out)
file.write(out)
file.close()
def auto_save_load(aclass): # the loading dunc
data = open("./save." + GLOBAL.fileType, "r", encoding="utf-8").read()
data = eval(data)
key = data["key"]
cryptionKeys = data["cryptionKeys"]
encryptedGLOBAL = data["data"]
print(key)
print()
print(cryptionKeys)
print()
print(encryptedGLOBAL)
Other answers have said to remove the null bytes but the encryption method needs them.
Please help.

TypeError: list indices must be integers or slices, not str dictionary python

Here's the code:
with open("input.txt", "r") as f:
text = f.read()
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
res = {}
kol = 0
for buk in alphabet:
if buk in text:
kol += 1
if kol > 0:
for bukwa in text:
if bukwa in alphabet:
if bukwa not in res:
res[bukwa.upper()] = text.count(bukwa)
elif bukwa not in alphabet:
if bukwa not in res:
res[bukwa.upper()] = 0
res = sorted(res)
with open("output.txt", "w") as f:
for key in res:
f.write(key + " " + str(res[key]))
if kol == 0:
with open("output.txt", "w") as f:
f.write(-1)
And here's the error:
Traceback (most recent call last):
File "/home/tukanoid/Desktop/ejudge/analiz/analiz.py", line 23, in <module>
f.write(key + " " + str(res[key]))
TypeError: list indices must be integers or slices, not str
The line:
res = sorted(res)
isn't returning what you think it is. Using sort on a dictionary will sort its keys and return them as a list.
When you do res[key] inside the context manager, you're indexing the list with a string, resulting in an error.
If you want ordering in your dictionary you can do it in one of two ways:
Rename the list you create:
sorted_keys = sorted(res)
and then iterate through those while indexing the still referencing to the dict name res.
or, use OrderedDict and then iterate through its members as you would with a normal dict:
from collections import OrderedDict
# -- skipping rest of code --
# in the context manager
for key, val in OrderedDict(res):
# write to file

Categories