I came across the following code, newsfeatures.py, in the book Programming Collective Intelligence.
Here's the code:
import feedparser
import re
feedlist=['http://today.reuters.com/rss/topNews',
'http://today.reuters.com/rss/domesticNews',
'http://today.reuters.com/rss/worldNews',
'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
'http://news.google.com/?output=rss',
'http://feeds.salon.com/salon/news',
'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
'http://rss.cnn.com/rss/edition.rss',
'http://rss.cnn.com/rss/edition_world.rss',
'http://rss.cnn.com/rss/edition_us.rss']
def stripHTML(h):
    p=''
    s=0
    for c in h:
        if c=='<': s=1
        elif c=='>':
            s=0
            p+=' '
        elif s==0: p+=c
    return p

def separatewords(text):
    splitter=re.compile('\\W*')
    return [s.lower( ) for s in splitter.split(text) if len(s)>3]

def getarticlewords( ):
    allwords={}
    articlewords=[]
    articletitles=[]
    ec=0
    # Loop over every feed
    for feed in feedlist:
        f=feedparser.parse(feed)

        # Loop over every article
        for e in f.entries:
            # Ignore identical articles
            if e.title in articletitles: continue

            # Extract the words
            txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8'))
            words=separatewords(txt)
            articlewords.append({})
            articletitles.append(e.title)

            # Increase the counts for this word in allwords and in articlewords
            for word in words:
                allwords.setdefault(word,0)
                allwords[word]+=1
                articlewords[ec].setdefault(word,0)
                articlewords[ec][word]+=1
            ec+=1
    return allwords,articlewords,articletitles

def makematrix(allw,articlew):
    wordvec=[]

    # Only take words that are common but not too common
    for w,c in allw.items( ):
        if c>3 and c<len(articlew)*0.6:
            wordvec.append(w)

    # Create the word matrix
    l1=[[(word in f and f[word] or 0) for word in wordvec] for f in articlew]
    return l1,wordvec

from numpy import *

def showfeatures(w,h,titles,wordvec,out='features.txt'):
    outfile=file(out,'w')
    pc,wc=shape(h)
    toppatterns=[[] for i in range(len(titles))]
    patternnames=[]

    # Loop over all the features
    for i in range(pc):
        slist=[]
        # Create a list of words and their weights
        for j in range(wc):
            slist.append((h[i,j],wordvec[j]))

        # Reverse sort the word list
        slist.sort( )
        slist.reverse( )

        # Print the first six elements
        n=[s[1] for s in slist[0:6]]
        outfile.write(str(n)+'\n')
        patternnames.append(n)

        # Create a list of articles for this feature
        flist=[]
        for j in range(len(titles)):
            # Add the article with its weight
            flist.append((w[j,i],titles[j]))
            toppatterns[j].append((w[j,i],i,titles[j]))

        # Reverse sort the list
        flist.sort( )
        flist.reverse( )

        # Show the top 3 articles
        for f in flist[0:3]:
            outfile.write(str(f)+'\n')
        outfile.write('\n')
    outfile.close( )

    # Return the pattern names for later use
    return toppatterns,patternnames
The usage is as follows:
>>> import newsfeatures
>>> allw,artw,artt= newsfeatures.getarticlewords( )
>>> artt[1]
u'Fatah, Hamas men abducted freed: sources'
As you can see, this line produces the news headline.
What I want to know is: is there some way for the program to display not only the headline, but also the source of that headline from the feedlist?
Could anyone help?
Thanks!
Replace
articletitles.append(e.title)
in getarticlewords() with something like
articletitles.append(' '.join([e.title, ', from', feed]))
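If you would rather keep the source as separate data instead of folding it into the title string, another option (a sketch of my own, not from the book) is to collect a parallel list of feed URLs inside getarticlewords() and return it alongside the other values:

def getarticlewords( ):
    allwords={}
    articlewords=[]
    articletitles=[]
    articlesources=[]   # parallel list: articlesources[i] is the feed URL for articletitles[i]
    ec=0
    for feed in feedlist:
        f=feedparser.parse(feed)
        for e in f.entries:
            if e.title in articletitles: continue
            txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8'))
            words=separatewords(txt)
            articlewords.append({})
            articletitles.append(e.title)
            articlesources.append(feed)   # remember which feed this article came from
            for word in words:
                allwords.setdefault(word,0)
                allwords[word]+=1
                articlewords[ec].setdefault(word,0)
                articlewords[ec][word]+=1
            ec+=1
    return allwords,articlewords,articletitles,articlesources

>>> allw,artw,artt,arts = newsfeatures.getarticlewords( )
>>> arts[1]
would then give the feed URL that artt[1] came from. Note this changes the function's return value, so any existing calls to getarticlewords( ) need to unpack four values instead of three.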
Related
I'm wondering what would be the most efficient way to find out whether text scraped with Scrapy contains a word from a predefined list. It's important to note that the list could hold around 200 words and the text could come from hundreds of websites, so efficiency matters.
My current solution with only a couple of words in list would be:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class BookSpider(CrawlSpider):
    name = 'book'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com']

    rules = (
        Rule(LinkExtractor(), callback='parse', follow=True),
    )

    def parse(self, response):
        restricted = ['word', 'word1', 'word2']
        text = response.xpath("//body//text()").getall()
        for words in restricted:
            if words in text:
                print('Found a restricted word!')
            else:
                print('All good!')
What do you think of such a solution? Maybe there is a more efficient way of achieving the goal?
For a "pure" in/not in check, use set.intersection. Creating a set of the bigger text (if you can hold it in memory) will speed up this task tremendously.
A set reduces the amount of words to be checkt to unique checks and the check itself is O(1) - that is about as fast as you can get:
from urllib.request import urlopen

# use from disc, else get once from url and save to disc to use it
try:
    with open("faust.txt") as f:
        data = f.read()
except:
    # partial credit: https://stackoverflow.com/a/46124819/7505395
    # get some freebie text - Goethe's Faust should suffice
    url = "https://archive.org/stream/fausttragedy00goetuoft/fausttragedy00goetuoft_djvu.txt"
    data = urlopen(url).read()
    with open("faust.txt", "wb") as f:
        f.write(data)
    data = data.decode("utf-8", errors="ignore")  # work with str here too, like the file-read branch
Process the data for measurements:
words = data.split() # words: 202915
unique = set(words) # distinct words: 34809
none_true = {"NoWayThatsInIt_1", "NoWayThatsInIt_2", "NoWayThatsInIt_3", "NoWayThatsInIt_4"}
one_true = none_true | {"foul"}
# should use timeit for this, haven't got it here
def sloppy_time_measure(f, text):
    import time
    print(text, end="")
    t = time.time()

    # execute function 1000 times
    for _ in range(1000):
        f()

    print( (time.time() - t) * 1000, "milliseconds" )
# .intersection calculates _full_ intersection, not only an "in" check:
lw = len(words)
ls = len(unique)
sloppy_time_measure(lambda: none_true.intersection(words), f"Find none in list of {lw} words: ")
sloppy_time_measure(lambda: one_true.intersection(words), f"Find one in list of {lw} words: ")
sloppy_time_measure(lambda: any(w in words for w in none_true),
f"Find none using 'in' in list of {lw} words: ")
sloppy_time_measure(lambda: none_true.intersection(unique), f"Find none in set of {ls} uniques: ")
sloppy_time_measure(lambda: one_true.intersection(unique), f"Find one in set of {ls} uniques: ")
sloppy_time_measure(lambda: any(w in unique for w in one_true),
f"Find one using 'in' in set of {ls} uniques: ")
Outputs for 1000 applications of the search (added spacing for clarity):
# in list
Find none in list of 202921 words: 5038.942813873291 milliseconds
Find one in list of 202921 words: 4234.968662261963 milliseconds
Find none using 'in' in list of 202921 words: 9726.848363876343 milliseconds
# in set
Find none in set of 34809 uniques: 15.897989273071289 milliseconds
Find one in set of 34809 uniques: 11.409759521484375 milliseconds
Find one using 'in' in set of 34809 uniques: 39.183855056762695 milliseconds
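Applied back to the spider from the question, a sketch might look like the following. The word list and spider details are placeholders, and the callback is renamed to parse_page because the CrawlSpider documentation warns against overriding parse:

import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

RESTRICTED = {'word', 'word1', 'word2'}  # build the ~200-word list once, as a set

class BookSpider(CrawlSpider):
    name = 'book'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com']

    rules = (
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # join the text nodes, lowercase, and reduce the page to its unique words
        text = ' '.join(response.xpath("//body//text()").getall()).lower()
        page_words = set(re.findall(r"\w+", text))
        # intersection cost is roughly proportional to the smaller set (~200 words)
        hits = RESTRICTED & page_words
        if hits:
            self.logger.info("Found restricted words on %s: %s", response.url, hits)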
I have a large txt file and I'm trying to pull out every instance of a specific word, as well as the 15 words on either side. I'm running into a problem when there are two instances of that word within 15 words of each other, which I'm trying to get as one large snippet of text.
I'm trying to get chunks of text to analyze about a specific topic. So far, I have working code for all instances except the scenario mentioned above.
def occurs(word1, word2, filename):
    import os
    infile = open(filename,'r')           # opens file, reads, splits into lines
    lines = infile.read().splitlines()
    infile.close()
    wordlist = [word1, word2]              # this list allows for multiple words
    wordsString = ' '.join(lines)          # rejoins the lines into one string
    words = wordsString.split()            # splits the string into individual words
    f = open(filename, 'w')
    f.write("start")
    f.write(os.linesep)
    for word in wordlist:
        matches = [i for i, w in enumerate(words) if w.lower().find(word) != -1]
        for m in matches:
            l = " ".join(words[m-15:m+16])
            f.write(f"...{l}...")          # writes the data to the external file
            f.write(os.linesep)
    f.close()
So far, when two instances of the word are too close together, the program simply skips one of them. Instead, I want to get a single longer chunk of text that extends 15 words before the earliest instance and 15 words after the latest one.
This snippet will get the chosen number of words around the keyword. If several keywords fall close together, it will join them into one snippet:
s = '''xxx I have a large txt file and I'm xxx trying to pull out every instance of a specific word, as well as the 15 words on either side. I'm running into a problem when there are two instances of that word within 15 words of each other, which I'm trying to get as one large snippet of text.
I'm trying to xxx get chunks of text to analyze about a specific topic. So far, I have working code for all instances except the scenario mentioned above. xxx'''
words = s.split()
from itertools import groupby, chain
word = 'xxx'
def get_snippets(words, word, l):
    snippets, current_snippet, cnt = [], [], 0
    for v, g in groupby(words, lambda w: w != word):
        w = [*g]
        if v:
            if len(w) < l:
                current_snippet += [w]
            else:
                current_snippet += [w[:l] if cnt % 2 else w[-l:]]
                snippets.append([*chain.from_iterable(current_snippet)])
                current_snippet = [w[-l:] if cnt % 2 else w[:l]]
                cnt = 0
            cnt += 1
        else:
            if current_snippet:
                current_snippet[-1].extend(w)
            else:
                current_snippet += [w]

    if current_snippet[-1][-1] == word or len(current_snippet) > 1:
        snippets.append([*chain.from_iterable(current_snippet)])

    return snippets

for snippet in get_snippets(words, word, 15):
    print(' '.join(snippet))
Prints:
xxx I have a large txt file and I'm xxx trying to pull out every instance of a specific word, as well as the 15
other, which I'm trying to get as one large snippet of text. I'm trying to xxx get chunks of text to analyze about a specific topic. So far, I have working
topic. So far, I have working code for all instances except the scenario mentioned above. xxx
With the same data and a different length:
for snippet in get_snippets(words, word, 2):
    print(' '.join(snippet))
Prints:
xxx and I'm
I have xxx trying to
trying to xxx get chunks
mentioned above. xxx
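To plug this back into the original occurs() function, a sketch might look like the following. It keeps the two-keyword interface, writes to a separate, hypothetical snippets.txt instead of overwriting the input file, and matches exact, case-sensitive tokens, unlike the substring match in the original:

import os

def occurs(word1, word2, filename):
    with open(filename) as infile:
        words = infile.read().split()
    # write to a separate file (the name is just an example) so the input is preserved
    with open("snippets.txt", "w") as f:
        f.write("start" + os.linesep)
        for word in (word1, word2):
            for snippet in get_snippets(words, word, 15):
                f.write("..." + " ".join(snippet) + "..." + os.linesep)

It also assumes each keyword actually occurs in the text; get_snippets is not written to handle a keyword that never appears.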
As always, a variety of solutions are available here. A fun one would be a recursive wordFind, where it searches the next 15 words and, if it finds the target word, calls itself.
A simpler, though perhaps not efficient, solution would be to add words one at a time:
for m in matches:
    l = " ".join(words[m-15:m])
    i = 1
    while i < 16:
        if words[m+i].lower() == word:
            # found the target word again: include it and restart the 15-word window from here
            l += " " + words[m+i]
            m += i
            i = 1
        else:
            l += " " + words[m+i]
            i += 1
    f.write(f"...{l}...")  # writes the data to the external file
    f.write(os.linesep)
Or, if you want subsequent occurrences to be folded into the same snippet rather than written separately...
bExtend = False
for m in matches:
    if not bExtend:
        l = " ".join(words[m-15:m])
        f.write("...")
    bExtend = False
    i = 1
    while i < 16:
        if words[m+i].lower() == word:
            # another target word ahead: keep it and let the next match extend the snippet
            l += " " + words[m+i]
            bExtend = True
            break
        else:
            l += " " + words[m+i]
            i += 1
    f.write(l)
    if not bExtend:
        f.write("...")
        f.write(os.linesep)
Note: I have not tested this, so it may require a bit of debugging. But the gist is clear: add words piecemeal and extend the addition process when a target word is encountered. This also allows you to extend on target words other than the current one, with a small addition to the second conditional.
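Another way to get the merged snippets, shown here as a sketch rather than a drop-in replacement for the code above, is to compute the [start, end] window for every match first and then merge overlapping windows before writing anything:

def merged_windows(words, word, radius=15):
    """Return (start, end) index ranges covering each match plus `radius` words
    on either side, with overlapping or touching ranges merged into one."""
    matches = [i for i, w in enumerate(words) if word in w.lower()]
    ranges = []
    for m in matches:
        start, end = max(0, m - radius), min(len(words), m + radius + 1)
        if ranges and start <= ranges[-1][1]:   # overlaps the previous window: merge
            ranges[-1][1] = max(ranges[-1][1], end)
        else:
            ranges.append([start, end])
    return ranges

# usage: one "..."-wrapped snippet per merged window
words = "a b target c d e target f g h i j k l m n o p q target r".split()
for start, end in merged_windows(words, "target", radius=3):
    print("..." + " ".join(words[start:end]) + "...")

The substring test (word in w.lower()) mirrors the matching used in the question's code.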
No code needed: I am trying to compute the probability that, given a series of words, the next word is some specific word. I am currently working with nltk/Python and was wondering whether there is a simple function for this, or whether I need to hard-code this kind of thing myself by iterating through and counting all occurrences.
Thanks
You have to iterate over the whole text first and count the n-grams so that you can compute their probability given a preceding sequence.
Here is a very simple example:
import re
from collections import defaultdict, Counter

# Tokenize the text in a very naive way.
text = "The Maroon Bells are a pair of peaks in the Elk Mountains of Colorado, United States, close to the town of Aspen. The two peaks are separated by around 500 meters (one-third of a mile). Maroon Peak is the higher of the two, with an altitude of 14,163 feet (4317.0 m), and North Maroon Peak rises to 14,019 feet (4273.0 m), making them both fourteeners. The Maroon Bells are a popular tourist destination for day and overnight visitors, with around 300,000 visitors every season."
tokens = re.findall(r"\w+", text.lower(), re.U)

def get_ngram_mapping(tokens, n):
    # Add markers for the beginning and end of the text.
    tokens = ["[BOS]"] + tokens + ["[EOS]"]
    # Map a preceding sequence of n-1 tokens to a list
    # of following tokens. 'defaultdict' is used to
    # give us an empty list when we access a key that
    # does not exist yet.
    ngram_mapping = defaultdict(list)
    # Iterate through the text using a moving window
    # of length n.
    for i in range(len(tokens) - n + 1):
        window = tokens[i:i+n]
        preceding_sequence = tuple(window[:-1])
        following_token = window[-1]
        # Example for n=3: 'it is good' =>
        # ngram_mapping[("it", "is")] = ["good"]
        ngram_mapping[preceding_sequence].append(following_token)
    return ngram_mapping

def compute_ngram_probability(ngram_mapping):
    ngram_probability = {}
    for preceding, following in ngram_mapping.items():
        # Let's count which tokens appear right
        # behind the tokens in the preceding sequence.
        # Example: Counter(['a', 'a', 'b'])
        # => {'a': 2, 'b': 1}
        token_counts = Counter(following)
        # Next we compute the probability that
        # a token 'w' follows our sequence 's'
        # by dividing by the frequency of 's'.
        frequency_s = len(following)
        token_probability = defaultdict(float)
        for token, token_frequency in token_counts.items():
            token_probability[token] = token_frequency / frequency_s
        ngram_probability[preceding] = token_probability
    return ngram_probability

ngrams = get_ngram_mapping(tokens, n=2)
ngram_probability = compute_ngram_probability(ngrams)
print(ngram_probability[("the",)]["elk"])      # = 0.14285714285714285
print(ngram_probability[("the",)]["unknown"])  # = 0.0
I needed to solve the same issue as well. I used the nltk.ngrams() function to get the n-grams and then reshaped them into (context, word) pairs, because nltk.ConditionalFreqDist() expects bigram-like pairs. Then I fed the results into nltk.ConditionalProbDist(). You can find example code below:
import nltk
from collections import defaultdict

# `tokens`, `n` and `topk` are assumed to be defined already.
ngram_prob = defaultdict(float)
ngrams_as_bigrams = []
ngrams_as_bigrams.extend([((t[:-1]), t[-1]) for t in nltk.ngrams(tokens, n)])
cfd = nltk.ConditionalFreqDist(ngrams_as_bigrams)
cpdist = nltk.ConditionalProbDist(cfd, nltk.LidstoneProbDist, gamma=0.2, bins=len(tokens))
for (pre, follow) in ngrams_as_bigrams:
    all_st = pre + (follow,)
    ngram_prob[all_st] = cpdist[pre].prob(follow)
sorted_ngrams = [' '.join(k) for k, v in sorted(ngram_prob.items(), key=lambda x: x[1])[::-1]][:topk]
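For completeness, here is a small, hypothetical setup for the variables that snippet assumes (tokens, n and topk are not defined in the answer itself):

text = "the maroon bells are a pair of peaks and the maroon bells are a popular destination"
tokens = text.split()   # or nltk.word_tokenize(text), after downloading the 'punkt' data
n = 3                   # trigrams: a two-word context followed by one word
topk = 5                # keep the five most probable n-grams

With those in place, sorted_ngrams holds the topk most probable n-grams as strings, and cpdist[("the", "maroon")].prob("bells") gives the smoothed probability of "bells" following the series "the maroon".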
I'm using the wikipedia package (pip install wikipedia) to make a simple "Philosophy step counter" and can't get the result I'm looking for. No matter what page I enter, the code never finds a link that matches any of the words in the first few sentences of the article. Code is below:
import os
import string
import wikipedia

wikipedia.set_lang("en")
print("Please type the name of a wikipedia article: ")
page = input()
print("Searching wikipedia for: " + page)
wikiPage = wikipedia.page(page)
print("Using top result: " + wikiPage.title)
# currentPage = wikipedia.page(wikiPage.links[0])

# List of links (sorted alphabetically, makes our job much harder)
links = wikiPage.links

# Split the beginning of the article into words
words = wikipedia.summary(wikiPage, sentences=3).split()
words = [''.join(c for c in s if c not in string.punctuation)
         for s in words]  # Sanitize list of words to remove punctuation
# comparisons = [a == b for (a, b) in itertools.product(words, links)]

x = 0
while words[x] not in links:
    print(words[x])
    x = x + 1

newPage = wikipedia.page(words[x])
Is this a fault of the library I'm using or of my code? The link list appears to be ordered alphabetically, if that makes any difference (hence why I'm doing all this in the first place).
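For what it's worth, one likely mismatch (an assumption on my part, not verified against the library) is that wikiPage.links holds capitalized, often multi-word article titles while words holds single, punctuation-stripped tokens, so a direct in check rarely succeeds. A case-insensitive comparison might look like:

# Hypothetical tweak: compare lower-cased words against lower-cased link titles.
links_lower = {l.lower() for l in links}

x = 0
while words[x].lower() not in links_lower:
    print(words[x])
    x = x + 1

newPage = wikipedia.page(words[x])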
I am trying to count the number of contractions used by politicians in certain speeches. I have lots of speeches, but here are some of the URLs as a sample:
every_link_test = ['http://www.millercenter.org/president/obama/speeches/speech-4427',
'http://www.millercenter.org/president/obama/speeches/speech-4424',
'http://www.millercenter.org/president/obama/speeches/speech-4453',
'http://www.millercenter.org/president/obama/speeches/speech-4612',
'http://www.millercenter.org/president/obama/speeches/speech-5502']
I have a pretty rough counter right now - it only counts the total number of contractions used across all of those links. For example, the code below prints 79, 101, 101, 182, 224 for the five links above. However, I want to link each count up with filename, a variable I create below, so I would have something like (speech_1, 79), (speech_2, 22), (speech_3, 0), (speech_4, 81), (speech_5, 42). That way, I can track the number of contractions used in each individual speech. I'm getting the following error with my code: AttributeError: 'tuple' object has no attribute 'split'
Here's my code:
import urllib2,sys,os
from bs4 import BeautifulSoup,NavigableString
from string import punctuation as p
from multiprocessing import Pool
import re, nltk
import requests

reload(sys)

url = 'http://www.millercenter.org/president/speeches'
url2 = 'http://www.millercenter.org'

conn = urllib2.urlopen(url)
html = conn.read()

miller_center_soup = BeautifulSoup(html)
links = miller_center_soup.find_all('a')

linklist = [tag.get('href') for tag in links if tag.get('href') is not None]

# remove all items in list that don't contain 'speeches'
linkslist = [_ for _ in linklist if re.search('speeches',_)]
del linkslist[0:2]

# concatenate 'http://www.millercenter.org' with each speech's URL ending
every_link_dups = [url2 + end_link for end_link in linkslist]

# remove duplicates
seen = set()
every_link = [] # no duplicates array
for l in every_link_dups:
    if l not in seen:
        every_link.append(l)
        seen.add(l)

def processURL_short_2(l):
    open_url = urllib2.urlopen(l).read()
    item_soup = BeautifulSoup(open_url)
    item_div = item_soup.find('div',{'id':'transcript'},{'class':'displaytext'})
    item_str = item_div.text.lower()
    splitlink = l.split("/")
    president = splitlink[4]
    speech_num = splitlink[-1]
    filename = "{0}_{1}".format(president, speech_num)
    return item_str, filename

every_link_test = every_link[0:5]
print every_link_test

count = 0
for l in every_link_test:
    content_1 = processURL_short_2(l)
    for word in content_1.split():
        word = word.strip(p)
        if word in contractions:
            count = count + 1
    print count, filename
As the error message explains, you cannot use split the way you are using it: split is for strings, and processURL_short_2 returns a tuple.
So you will need to change this:
for word in content_1.split():
to this:
for word in content_1[0].split():
I chose [0] by running your code; I think that gives you the chunk of text you are looking to search through.
#TigerhawkT3 has a good suggestion you should follow in their answer too:
https://stackoverflow.com/a/32981533/1832539
Instead of print count, filename, you should save these data to a data structure, like a dictionary. Since processURL_short_2 has been modified to return a tuple, you'll need to unpack it.
data = {}  # initialize a dictionary
for l in every_link_test:
    count = 0  # reset the count for each speech
    content_1, filename = processURL_short_2(l)  # unpack the content and filename
    for word in content_1.split():
        word = word.strip(p)
        if word in contractions:
            count = count + 1
    data[filename] = count  # add this to the dictionary as filename:count
This would give you a dictionary like {'obama_4424':79, 'obama_4453':101,...}, allowing you to easily store and access your parsed data.
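From there, printing the per-speech counts is just a loop over the dictionary, for example:

# print each speech's contraction count, sorted by filename
for filename, count in sorted(data.items()):
    print filename, count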