I have the following function:
def filetxt():
    word_freq = {}
    lvl1 = []
    lvl2 = []
    total_t = 0
    users = 0
    text = []
    for l in range(0,500):
        # Open File
        if os.path.exists("C:/Twitter/json/user_" + str(l) + ".json") == True:
            with open("C:/Twitter/json/user_" + str(l) + ".json", "r") as f:
                text_f = json.load(f)
                users = users + 1
                for i in range(len(text_f)):
                    text.append(text_f[str(i)]['text'])
                    total_t = total_t + 1
        else:
            pass
    # Filter
    occ = 0
    import string
    for i in range(len(text)):
        s = text[i]  # Sample string
        a = re.findall(r'(RT)', s)
        b = re.findall(r'(#)', s)
        occ = len(a) + len(b) + occ
        s = s.encode('utf-8')
        out = s.translate(string.maketrans("",""), string.punctuation)
        # Create Wordlist/Dictionary
        word_list = text[i].lower().split(None)
        for word in word_list:
            word_freq[word] = word_freq.get(word, 0) + 1
        keys = word_freq.keys()
        numbo = range(1, len(keys)+1)
        WList = ', '.join(keys)
        NList = str(numbo).strip('[]')
        WList = WList.split(", ")
        NList = NList.split(", ")
        W2N = dict(zip(WList, NList))
        for k in range(0, len(word_list)):
            word_list[k] = W2N[word_list[k]]
        for i in range(0, len(word_list)-1):
            lvl1.append(word_list[i])
            lvl2.append(word_list[i+1])
I have used the profiler and found that the greatest CPU time is spent on the zip() function and on the join and split parts of the code. I'm looking for anything I may have overlooked that would let me clean up the code and make it faster, since the biggest slowdown seems to be in how I am working with the dictionaries and with zip(). Any help would be appreciated, thanks!
P.S. The basic purpose of this function is that I load in files which each contain 20 or so tweets, so I am most likely going to end up with about 20k - 50k files being sent through this function. The output is a list of all the distinct words in the tweets, followed by which word linked to which, e.g.:
1 "love"
2 "pasa"
3 "mirar"
4 "ants"
5 "kers"
6 "morir"
7 "dreaming"
8 "tan"
9 "rapido"
10 "one"
11 "much"
12 "la"
...
10 1
13 12
1 7
12 2
7 3
2 4
3 11
4 8
11 6
8 9
6 5
9 20
5 8
20 25
8 18
25 9
18 17
9 2
...
I think you want something like:
import json
import string
from collections import defaultdict

try:
    rng = xrange          # Python 2
except NameError:
    rng = range           # Python 3

def filetxt():
    users = 0
    total_t = 0
    occ = 0
    wordcount = defaultdict(int)
    wordpairs = defaultdict(lambda: defaultdict(int))
    for filenum in rng(500):
        try:
            with open("C:/Twitter/json/user_" + str(filenum) + ".json", 'r') as inf:
                users += 1
                tweets = json.load(inf)
                total_t += len(tweets)
                # the JSON is a dict of tweets keyed by index, so iterate over its values
                for txt in (r['text'] for r in tweets.values()):
                    occ += txt.count('RT') + txt.count('#')
                    prev = None
                    for word in txt.encode('utf-8').translate(None, string.punctuation).lower().split():
                        wordcount[word] += 1
                        wordpairs[prev][word] += 1
                        prev = word
        except IOError:
            pass
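For reference, here is a rough sketch (my own addition, with illustrative variable names) of how the lvl1/lvl2 pair lists from the question could be rebuilt from those nested defaultdicts afterwards:

# Hypothetical post-processing, assuming wordcount and wordpairs were filled as above.
word_list = list(wordcount)                                # the distinct words (dict order is arbitrary in older Pythons)
word_map = dict((w, i) for i, w in enumerate(word_list))   # word -> index

lvl1, lvl2 = [], []
for prev, followers in wordpairs.items():
    if prev is None:              # skip the sentinel used for the first word of each tweet
        continue
    for word, n in followers.items():
        for _ in range(n):        # emit the pair once per occurrence
            lvl1.append(word_map[prev])
            lvl2.append(word_map[word])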
I hope you don't mind that I took the liberty of modifying your code into something that I would be more likely to write.
import json
import re
from itertools import izip

def filetxt():
    # keeps track of word count for each word.
    word_freq = {}
    # list of words which we've found
    word_list = []
    # mapping from word -> index in word_list
    word_map = {}
    lvl1 = []
    lvl2 = []
    total_t = 0
    users = 0
    text = []

    ####### You should replace this with a glob (see: glob module)
    for l in range(0, 500):
        # Open File
        try:
            with open("C:/Twitter/json/user_" + str(l) + ".json", "r") as f:
                text_f = json.load(f)
                users = users + 1
                # in this file there are multiple tweets so add the text
                # for each one.
                for t in text_f.itervalues():
                    text.append(t)  ## CHECK THIS
        except IOError:
            pass

    total_t = len(text)

    # Filter
    occ = 0
    import string
    for s in text:
        a = re.findall(r'(RT)', s)
        b = re.findall(r'(#)', s)
        occ += len(a) + len(b)

        s = s.encode('utf-8')
        out = s.translate(string.maketrans("", ""), string.punctuation)

        # make a list of words that are in the text s
        words = s.lower().split(None)

        for word in words:
            # try/except is quicker when we expect not to miss
            # and it will be rare for us not to have
            # a word in our list already.
            try:
                word_freq[word] += 1
            except KeyError:
                # we've never seen this word before so add it to our list
                word_freq[word] = 1
                word_map[word] = len(word_list)
                word_list.append(word)

        # little trick to get each word and the word that follows
        for curword, nextword in izip(words, words[1:]):
            lvl1.append(word_map[curword])
            lvl2.append(word_map[nextword])
What this is going to do is give you the following: lvl1 will give you a list of numbers corresponding to words in word_list, so word_list[lvl1[0]] will be the first word in the first tweet you processed. lvl2[0] will be the index of the word that follows lvl1[0], so you can say that word_list[lvl2[0]] is the word that follows word_list[lvl1[0]]. The code maintains word_map, word_list and word_freq as it builds this.
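As a tiny made-up illustration of that relationship (not output from the code above):

# Suppose the first processed tweet was "i love pasa"
word_list = ['i', 'love', 'pasa']
word_map = {'i': 0, 'love': 1, 'pasa': 2}
lvl1 = [0, 1]    # indices of 'i' and 'love'
lvl2 = [1, 2]    # indices of the words that follow them: 'love' and 'pasa'
print word_list[lvl1[0]], '->', word_list[lvl2[0]]    # i -> love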
Please note that the way you were doing this before, specifically the way you were creating W2N, will not work properly. Dictionaries do not maintain order. Ordered dictionaries are coming in 3.1, but just forget about that for now. Basically, word_freq.keys() was changing every time you added a new word, so there was no consistency. See this example:
>>> x = dict()
>>> x[5] = 2
>>> x
{5: 2}
>>> x[1] = 24
>>> x
{1: 24, 5: 2}
>>> x[10] = 14
>>> x
{1: 24, 10: 14, 5: 2}
>>>
So 5 used to be the 2nd one, but now it's the 3rd.
I also updated it to use a 0 index instead of 1 index. I don't know why you were using range(1, len(...)+1) rather than just range(len(...)).
Regardless, you should get away from thinking about for loops in the traditional C/C++/Java sense, where you loop over numbers. Unless you actually need an index number, you don't need a numeric loop at all.
Rule of thumb: if you need an index, you probably also need the element at that index, and then you should be using enumerate anyway.
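For example, instead of indexing by hand:

words = ['love', 'pasa', 'mirar']

# index-based loop (avoid)
for i in range(len(words)):
    print i, words[i]

# the same thing with enumerate
for i, word in enumerate(words):
    print i, word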
Hope this helps...
A few things. These lines look odd to me when put together:
WList = ', '.join(keys)
<snip>
WList = WList.split(", ")
That should just be WList = list(keys).
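A quick illustration of the equivalence (with a made-up two-word dict):

word_freq = {'love': 3, 'pasa': 1}
keys = word_freq.keys()
assert ', '.join(keys).split(', ') == list(keys)   # same list, without building and re-parsing a string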
Are you sure you want to optimize this? I mean, is it really so slow that it's worth your time? And finally, a description of what the script should do would be great, instead of letting us decipher it from the code :)
Related
Consider that I have a log register of users' entry and exit times on some server. I need to find the time at which the maximum number of sessions are open. If there is more than one possible answer, the smallest time should be chosen. The input contains the number of sessions in the first line.
Example
Input:
5
4 5
0 3
1 9
7 8
2 6
Output:
2
I tried this script:
from collections import Counter, OrderedDict

load = Counter()
with open("input.txt", "r") as f:
    n = int(f.readline())
    for i in range(n):
        session = f.readline()
        session = session.split()
        load.update(range(int(session[0]), int(session[1])+1))

load = load.most_common()
i = 0
max = load[0][1]
candidates = []
while load[i][1] == max:
    candidates.append(load[i][0])
    i += 1
print(min(candidates))
First, I use Counter() to count the occurrences of all points. Second, I use load = load.most_common() to order the result by number of occurrences. Finally, I find the minimum of all keys that share the maximum number of occurrences.
In fact, if Counter() returned a dict ordered by key, it would be much simpler.
Anyway, it is my homework task, and it runs in more than 1 second (the given time limit) on one of the test inputs. What could be done to speed it up? I've read about interval trees, but I'm not sure whether they are relevant here.
Let's say ins and outs are the log in and log out times:
ins = [4,0,1,7,2]
outs = [5,3,9,8,6]
Combine them in one sorted list with the sign of the number indicating whether it is an "arrival" (positive) or "departure" (negative):
times = sorted(ins + [-x for x in outs], key=abs)
Now, walk through the list and count the "arrivals" and "departures" as they happen:
lmax = -1
logged = 0
for t in times:
    if t >= 0:
        logged += 1
        if logged > lmax:
            tmax = t
            lmax = logged
    else:
        logged -= 1
print(tmax, lmax)
# 2 3
The fast solution for this is to just store +1 / -1 at the enter/exit times, then sort the dict keys and sum over them incrementally, keeping track of the maximal value:
data = """5
4 5
0 3
1 9
7 8
2 6"""
with open("input.txt", "w") as f:
f.write(data)
d = {}
with open("input.txt", "r") as f:
next(f)
for line in f:
if line.strip():
start, stop = map(int,line.strip().split())
d.setdefault(start,0)
d[start] += 1
d.setdefault(stop,0)
d[stop] -= 1
maxx = 0
s = 0
max_idx = 0
# iteratively summ over sorted times from dict
for idx,key in enumerate(sorted(d)):
s += d[key]
if maxx < s: # remembert new max_idx and max
maxx = s
max_idx = idx
print(max_idx)
You can use a defaultdict(int) if that is still too slow to solve your challenge.
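A minimal sketch of that variant (same counting logic, just without the setdefault calls):

from collections import defaultdict

d = defaultdict(int)
with open("input.txt", "r") as f:
    next(f)                      # skip the line with the session count
    for line in f:
        if line.strip():
            start, stop = map(int, line.split())
            d[start] += 1        # one more session open from `start`
            d[stop] -= 1         # one session fewer after `stop`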
I am trying to create a list of words from a file as it is read, and then delete all words that contain duplicate letters. I was able to do it successfully with a list of words that I entered manually, but when I try to use the function on the list created from a file, the result still includes words with duplicates.
This works:
words = ['word','worrd','worrrrd','wordd']
alpha = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
x = 0
while x in range(0, len(alpha)):
    i = 0
    while i in range(0, len(words)):
        if words[i].count(alpha[x]) > 1:
            del(words[i])
            i = i - 1
        else:
            i = i + 1
    x = x + 1
print(words)
This is how I'm trying to do it when reading the file:
words = []
length = 5
file = open('dictionary.txt')
for word in file:
    if len(word) == length+1:
        words.append(word.splitlines())
alpha = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
x = 0
while x in range(0, len(alpha)):
    i = 0
    while i in range(0, len(words)):
        if words[i].count(alpha[x]) > 1:
            del(words[i])
            i = i - 1
        else:
            i = i + 1
    x = x + 1
print(words)
Try something like this. First, the string module is not quite deprecated, but it is unpopular. Lucky for you, it defines some useful constants that save you a bunch of typing, so you don't have to type all those quotes and commas.
Next, use the with open('filespec') as ... context for reading files: it's what it was put there for!
Finally, be aware of how iteration works for text files: for line in file: reads lines, including any trailing newlines. Strip those off. If you don't have one-word-per-line, you'll have to split the lines after you read them.
# Read words (possibly >1 per line) from dictionary.txt into Lexicon[].
# Convert the words to lower case.
import string

Lexicon = []
with open('dictionary.txt') as file:
    for line in file:
        words = line.strip().lower().split()
        Lexicon.extend(words)

# Remove words that contain a repeated letter. A while loop is used so the
# index only advances when nothing was deleted at the current position.
for ch in string.ascii_lowercase:
    i = 0
    while i < len(Lexicon):
        if Lexicon[i].count(ch) > 1:
            del Lexicon[i]
        else:
            i += 1

print('\n'.join(Lexicon))
How about this:
#This more comprehensive sample allows me to reproduce the file-reading
# problem in the script itself (before I changed the code "tee" would
# print, for instance)
words = ['green','word','glass','worrd','door','tee','wordd']
outlist = []
for word in words:
    chars = [c for c in word]
    # a `set` only contains unique characters, so if it is shorter than the
    # `word` itself, we found a word with duplicate characters, so we keep
    # looping
    if len(set(chars)) < len(chars):
        continue
    else:
        outlist.append(word)
print(outlist)
Result:
['word']
import string
words = ['word','worrd','worrrrd','wordd','5word']
new_words = [x for x in words if len(x) == len(set(x)) if all(i in string.ascii_letters for i in x)]
print(new_words)
# Imports the sys library
import sys

# Returns the file's contents
def readFile():
    return open("gettysburg.txt", "r").read()

# Writes the variable output to the file
def writeFile(output):
    open("out.out", "w").write(output)

# Returns all of the words of content by splitting the string on spaces
def getWords(content):
    return content.replace("--", " ").replace("\n", " ").replace(".", "").replace("!", "").replace(",", "").replace("?", "").split(" ")

def main():
    # Initiates a HashMap
    wordCounts = dict()
    for text in getWords(readFile()):
        # Checks if the string is empty
        if (text != ""):
            # If text is not yet in wordCounts, add it with a count of 1; otherwise increment its count
            if (not text in wordCounts):
                wordCounts[text] = 1
            else:
                wordCounts[text] = wordCounts[text] + 1
    print(wordCounts)
    for i in range(0, 9):
        print(sorted(wordCounts, key=wordCounts.__getitem__, reverse=True)[i])

main()
How would I sort wordCounts by value then sort it by key?
I cannot use any sort of libraries so please do not suggest any use of libraries to improve efficiency.
I'm kind of stumped on this one.
Just some insight on what I'm doing:
Basically I'm finding the word frequencies in a block of text and printing them out sorted by value, then alphabetically.
d = dict()
d["a"] = 10
d["ab"] = 8
d["abc"] = 10
d["bc"] = 9
for value, key in sorted(zip(d.values(), d.keys())):
    print(value, key)
output :
8 ab
9 bc
10 a
10 abc
Is this the output you are looking for?
d = dict()
d['that']= 13
d['the']=9
d['to']=8
d['we']=8
d['here']=8
d['a']=7
d['and']=6
d['nation']=5
d['not']=5
d['for']=5
d['can']=5
d['of']=5
d['have']=5
ordered = dict()
for e in d:
    if d[e] not in ordered:
        ordered[d[e]] = []
    ordered[d[e]].append(e)

for e in reversed(sorted(ordered.keys())):
    for v in sorted(ordered[e]):
        print(e, v)
Outputs:
13 that
9 the
8 here
8 to
8 we
7 a
6 and
5 can
5 for
5 have
5 nation
5 not
5 of
I'd appreciate some help debugging this code:
testing = """There is something unique about this line
in that it can span across several lines, which is unique and
useful in python."""
listofthings = []
i = 0
while i < len(testing):
    if testing[i] == " ":
        listofthings.append(i + 1)
    i = i + 1

listofthings.insert(0, 0)
listofthings.append(len(testing))
print listofthings

word_list = []
i = 0
while i < len(listofthings):
    l = i + 1
    x = listofthings[i]
    y = listofthings[l]
    word = testing[x:y]
    word_list.append(word)
    i = l
print word_list
I am not sure why I am getting the index out of range error. I understand what the error means obviously, but am not sure what I am doing wrong. Weirdly enough, this only happens when I run the above code. It doesn't give me any errors when I run this:
word = testing[x:y]
print word
I am fairly new to Python (going on three days), so I am sure it is a simple overlooked error...
l = i + 1
x = listofthings[i]
y = listofthings[l]
word = testing[x:y]
word_list.append(word)
When i == len(listofthings) - 1, l becomes len(listofthings), so listofthings[l] is out of range. Python list indexing starts from 0, so the highest valid index is len(listofthings) - 1.
The list listofthings has length 21, with valid indexes from 0 to 20. On the final loop iteration, i is 20 and l is 21, so there is an out-of-range error. I think the following code is what you want:
testing = """There is something unique about this line
in that it can span across several lines, which is unique and
useful in python."""
listofthings = []
i = 0
while i < len(testing):
    if testing[i] == " ":
        listofthings.append(i)
    i = i + 1

listofthings.insert(0, 0)
listofthings.append(len(testing))

word_list = []
i = 0
while i < len(listofthings) - 1:
    l = i + 1
    x = listofthings[i]
    y = listofthings[l]
    word = testing[x:y]
    word_list.append(word)
    i = l
print word_list
while i < len(listofthings):
    l = i + 1
    x = listofthings[i]
    y = listofthings[l]
When i corresponds to the last element,
    y = listofthings[l]
tries to access the element after the last one. That's why it is throwing the error.
On the last iteration of the second while loop, l is set to len(listofthings). This is past the end of listofthings; the last valid index is len(listofthings) - 1.
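A minimal fix along those lines (essentially what the previous answer does) is to stop one element early, so that l = i + 1 is always a valid index:

i = 0
while i < len(listofthings) - 1:
    l = i + 1
    word = testing[listofthings[i]:listofthings[l]]
    word_list.append(word)
    i = l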
I am new to Hadoop and also to Python, so my question is how to run a Python script on Hadoop. I was also writing a word-count program in Python; can we execute this script without using MapReduce?
I actually wrote the code and I can see the output below:
Darkness 1
Heaven 2
It 3
Light 4
age 5
age 6
all 7
all 8
authorities 9
before 10
before 11
being 12
belief 13
best 14
comparison 15
degree 16
despair 17
direct 18
direct 19
It is counting the number of words in a list, but what I have to achieve is grouping the words, deleting the duplicates, and also counting the number of times each one occurs.
Below is my code. Can somebody please tell me where I have made a mistake?
********************************************************
Wordcount.py
********************************************************
import urllib2
import random
from operator import itemgetter

current_word = {}
current_count = 0
story = 'http://sixty-north.com/c/t.txt'
request = urllib2.Request(story)
response = urllib2.urlopen(request)
each_word = []
words = None
count = 1
same_words = {}
word = []

""" looping the entire file """
for line in response:
    line_words = line.split()
    for word in line_words:  # looping each line and extracting words
        each_word.append(word)

random.shuffle(each_word)
Sort_word = sorted(each_word)
for words in Sort_word:
    same_words = words.lower(), int(count)
    #print same_words
    #print words
    if not words in current_word:
        current_count = current_count + 1
        print '%s\t%s' % (words, current_count)
    else:
        current_count = 1
        #if Sort_word == words.lower():
        #current_count += count
        current_count = count
        current_word = word
        #print '2. %s\t%s' % (words, current_count)
For running Python-based MapReduce tasks, have a look at:
http://hadoop.apache.org/docs/r1.1.2/streaming.html
http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/
You need to design your code in terms of a mapper and a reducer to enable Hadoop to execute your Python script. Read up on the MapReduce programming paradigm before you jump into writing the code; it is important to understand the paradigm and the role of {key, value} pairs in solving the problem.
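To give a feel for that shape, here is a rough word-count sketch in the Hadoop Streaming style, where the mapper and reducer are plain Python scripts that read stdin and write stdout (the file names and structure are illustrative, not taken from the links above):

# mapper.py - emit "word <tab> 1" for every word read from stdin
import sys

for line in sys.stdin:
    for word in line.strip().split():
        print '%s\t%s' % (word.lower(), 1)

# reducer.py - stdin arrives sorted by key, so all counts for a word are adjacent
import sys

current_word = None
current_count = 0
for line in sys.stdin:
    word, count = line.strip().split('\t')
    if word == current_word:
        current_count += int(count)
    else:
        if current_word is not None:
            print '%s\t%s' % (current_word, current_count)
        current_word = word
        current_count = int(count)
if current_word is not None:
    print '%s\t%s' % (current_word, current_count)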
#Modified your above code to generate the required output
import urllib2
import random
from operator import itemgetter

current_word = {}
current_count = 0
story = 'http://sixty-north.com/c/t.txt'
request = urllib2.Request(story)
response = urllib2.urlopen(request)
each_word = []
words = None
count = 1
same_words = {}
word = []

""" looping the entire file """
#Collect all the words into a list
for line in response:
    #print "Line = " , line
    line_words = line.split()
    for word in line_words:  # looping each line and extracting words
        each_word.append(word)

#For every word collected, in dict same_words:
#if a key exists such that key == word then increment its mapped value by 1,
#else add the word as a new key with mapped value 1
for words in each_word:
    if words.lower() not in same_words.keys():
        same_words[words.lower()] = 1
    else:
        same_words[words.lower()] = same_words[words.lower()] + 1

for each in same_words.keys():
    print "word = ", each, ", count = ", same_words[each]