How to get random top posts from reddit.com/r/showerthoughts - python

I have the following code that is supposed to take a random top post from reddit.com/r/showerthoughts and print the title and author of the post.
import random, json
randnum = random.randint(0,99)
response = json.load('https://www.reddit.com/r/showerthoughts/top.json?sort=top&t=week&limit=100')["data"]["children"][randnum]["data"]
print("\n\"" + response["title"] + "\"")
print(" -" + response["author"] + "\n")
I get the following error:
Traceback (most recent call last):
File "C:/Users/jacks/.PyCharmCE2019.1/config/scratches/scratch_4.py", line 4, in <module>
response = json.load('https://www.reddit.com/r/showerthoughts/top.json?sort=top&t=week&limit=100')["data"]["children"][randnum]["data"]
File "C:\Users\jacks\AppData\Local\Programs\Python\Python37\lib\json\__init__.py", line 293, in load
return loads(fp.read(),
AttributeError: 'str' object has no attribute 'read'
Am I on the right track here?
UPDATE:
Got it to work with this code:
import random, requests
randnum = random.randint(0,99)
response = requests.get('https://www.reddit.com/r/showerthoughts/top.json?sort=top&t=week&limit=100', headers = {'User-Agent': 'showerbot'})
result = response.json()
result1 = result["data"]["children"][randnum]["data"]
print("\n\"" + result1["title"] + "\"")
print(" -" + result1["author"] + "\n")

You cannot load JSON directly from a URL: json.load expects a file-like object, not a URL string. Fetch the page first (for example with the requests module) and then parse the response.
Using json module
import random, json, requests
randnum = random.randint(0,99)
response = requests.get('https://www.reddit.com/r/showerthoughts/top.json?sort=top&t=week&limit=100')
response = json.loads(response.text)
response = response["data"]["children"][randnum]["data"]
print("\n\"" + response["title"] + "\"")
print(" -" + response["author"] + "\n")
Without using json module
import random, requests
randnum = random.randint(0,99)
response = requests.get('https://www.reddit.com/r/showerthoughts/top.json?sort=top&t=week&limit=100')
response = response.json()
response = response["data"]["children"][randnum]["data"]
print("\n\"" + response["title"] + "\"")
print(" -" + response["author"] + "\n")

Related

Not getting all JSON data from URL

I am using the code below to get some data from the JSON at this URL:
https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/json/
But I am not getting all results (all countries) and I don't know why:
import json
import urllib.request
# Print Daily Updated Report from JSON file in this (URL)
url = "https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/json/"
data = urllib.request.urlopen(url).read()
obj = json.loads(data)
for p in obj['records']:
    print('Country: ' + p['countriesAndTerritories'])
    print('Date: ' + p['dateRep'])
    print('Cases: ' + str(p['cases']))
    print('Deaths: ' + str(p['deaths']))
    print('')
There was an error, which I fixed by adding the following before my code, but the original problem is still not solved:
import json
import ssl  # needed for the workaround below
import urllib.request

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Print Daily Updated Report from JSON file in this (URL)
url = "https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/json/"
data = urllib.request.urlopen(url).read().decode()
obj = json.loads(data)
for p in obj['records']:
    print('Country: ' + p['countriesAndTerritories'])
    print('Date: ' + p['dateRep'])
    print('Cases: ' + str(p['cases']))
    print('Deaths: ' + str(p['deaths']))
    print('')
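One way to narrow this down is to check how much data actually arrives and how many records parse, before suspecting the printing loop. A small diagnostic sketch (not a fix; it only measures what the endpoint returns):
import json
import urllib.request

# if the certificate error reappears, the ssl workaround above still applies
url = "https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/json/"
with urllib.request.urlopen(url) as resp:
    body = resp.read()

obj = json.loads(body)
print("bytes received:", len(body))
print("records parsed:", len(obj["records"]))
# distinct countries actually present in the payload
print(sorted({p["countriesAndTerritories"] for p in obj["records"]}))
If the record count already falls short here, the problem is in the download, not in the loop that prints it.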

Decode .ilearner from Azure

I am looking for a solution to a problem with decoding an .ilearner file downloaded via the API from Microsoft Azure ML. The error I am receiving looks like this:
File "<ipython-input-1-157aac1e9b7a>", line 162, in <module>
invokeBatchExecutionService()
File "<ipython-input-1-157aac1e9b7a>", line 157, in invokeBatchExecutionService
processResults(result)
File "<ipython-input-1-157aac1e9b7a>", line 51, in processResults
saveBlobToFile(url3, "The results for " + outputName)
File "<ipython-input-1-157aac1e9b7a>", line 28, in saveBlobToFile
f.write(response.read().decode("utf8", 'ignore'))
File "D:\XXXXXX\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u053e' in position 1463: character maps to <undefined>
My full code looks like this:
import urllib.request
import json
import time
from azure.storage.blob import *

def printHttpError(httpError):
    print("The request failed with status code: " + str(httpError.code))
    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
    print(httpError.info())
    print(json.loads(httpError.read().decode("utf8", 'ignore')))
    return

def saveBlobToFile(blobUrl, resultsLabel):
    output_file = "C:/XXX/API/output.csv"  # Replace this with the location you would like to use for your output file, and valid file extension (usually .csv for scoring results, or .ilearner for trained models)
    print("Reading the result from " + blobUrl)
    try:
        response = urllib.request.urlopen(blobUrl)
    except urllib.error.HTTPError as error:
        printHttpError(error)
        return
    # print("Response: " + response)
    with open(output_file, "w+") as f:
        # print(response.read())
        f.write(response.read().decode("utf8", 'ignore'))
    print(resultsLabel + " have been written to the file " + output_file)
    return

def processResults(result):
    first = True
    results = result["Results"]
    # print(" Results : " + results)
    for outputName in results:
        result_blob_location = results[outputName]
        sas_token = result_blob_location["SasBlobToken"]
        base_url = result_blob_location["BaseLocation"]
        relative_url = result_blob_location["RelativeLocation"]
        print("The results for " + outputName + " are available at the following Azure Storage location:")
        print("BaseLocation: " + base_url)
        print("RelativeLocation: " + relative_url)
        print("SasBlobToken: " + sas_token)
        if (first):
            first = False
            url3 = base_url + relative_url + sas_token
            saveBlobToFile(url3, "The results for " + outputName)
            # first = True
    return

def uploadFileToBlob(input_file, input_blob_name, storage_container_name, storage_account_name, storage_account_key):
    blob_service = BlockBlobService(account_name=storage_account_name, account_key=storage_account_key)
    print("Uploading the input to blob storage...")
    blob_service.create_blob_from_path(storage_container_name, input_blob_name, input_file)

def invokeBatchExecutionService():
    storage_account_name = "XXX"  # Replace this with your Azure Storage Account name
    storage_account_key = "XXX"  # Replace this with your Azure Storage Key
    storage_container_name = "XXX"  # Replace this with your Azure Storage Container name
    connection_string = "DefaultEndpointsProtocol=https;AccountName=" + storage_account_name + ";AccountKey=" + storage_account_key
    api_key = "XXX"  # Replace this with the API key for the web service
    url = "https://uswestcentral.services.azureml.net/subscriptions/75b6fa4e098c4ad88df85a3533530bd4/services/4ed857d52f3140e1b422e6e986437b6e/jobs"
    uploadFileToBlob("C:/XXX/API/input.csv",  # Replace this with the location of your input file, and valid file extension (usually .csv)
                     "input2.csv",  # Replace this with the name you would like to use for your Azure blob; this needs to have the same extension as the input file
                     storage_container_name, storage_account_name, storage_account_key)
    payload = {
        "Inputs": {
            "input1": {
                "ConnectionString": connection_string,
                "RelativeLocation": "/" + storage_container_name + "/input.csv"
            },
        },
        "Outputs": {
            "output1": {
                "ConnectionString": connection_string,
                "RelativeLocation": "/" + storage_container_name + "/output.csv"  # Replace this with the location you would like to use for your output file, and valid file extension (usually .csv for scoring results, or .ilearner for trained models)
            },
            "output2": {
                "ConnectionString": connection_string,
                "RelativeLocation": "/" + storage_container_name + "/output.ilearner"  # Replace this with the location you would like to use for your output file, and valid file extension (usually .csv for scoring results, or .ilearner for trained models)
            },
        },
        "GlobalParameters": {
        }
    }
    body = str.encode(json.dumps(payload))
    headers = {"Content-Type": "application/json", "Authorization": ("Bearer " + api_key)}
    print("Submitting the job...")
    # submit the job
    req = urllib.request.Request(url + "?api-version=2.0", body, headers)
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as error:
        printHttpError(error)
        return
    result = response.read()
    job_id = result.decode("utf8", 'ignore')[1:-1]
    print("Job ID: " + job_id)
    # start the job
    print("Starting the job...")
    body = str.encode(json.dumps({}))
    req = urllib.request.Request(url + "/" + job_id + "/start?api-version=2.0", body, headers)
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as error:
        printHttpError(error)
        return
    url2 = url + "/" + job_id + "?api-version=2.0"
    while True:
        print("Checking the job status...")
        req = urllib.request.Request(url2, headers={"Authorization": ("Bearer " + api_key)})
        try:
            response = urllib.request.urlopen(req)
        except urllib.error.HTTPError as error:
            printHttpError(error)
            return
        result = json.loads(response.read().decode("utf8", 'ignore'))
        status = result["StatusCode"]
        if (status == 0 or status == "NotStarted"):
            print("Job " + job_id + " not yet started...")
        elif (status == 1 or status == "Running"):
            print("Job " + job_id + " running...")
        elif (status == 2 or status == "Failed"):
            print("Job " + job_id + " failed!")
            print("Error details: " + result["Details"])
            break
        elif (status == 3 or status == "Cancelled"):
            print("Job " + job_id + " cancelled!")
            break
        elif (status == 4 or status == "Finished"):
            print("Job " + job_id + " finished!")
            # print("Results: " + results)
            processResults(result)
            break
        time.sleep(1)  # wait one second
    return

invokeBatchExecutionService()
The code follows the guidelines described at https://learn.microsoft.com/en-us/azure/machine-learning/studio/retrain-models-programmatically. I run the script with Python 3.5.
You are downloading a file encoded as UTF-8, then writing the data to your local filesystem using the filesystem's default encoding (probably cp1252 or similar). Writing to the local filesystem is failing because the data contains characters that cannot be encoded as cp1252.
>>> s = '\u053e'
>>> s
'Ծ'
>>> s.encode('cp1252')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/XXXXXX/data36/lib/python3.6/encodings/cp1252.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
UnicodeEncodeError: 'charmap' codec can't encode character '\u053e' in position 0: character maps to <undefined>
One solution is to encode your output file as UTF-8 (you will need to remember to open the file as UTF-8 if you open it elsewhere in your code)
with open(output_file, "wb+", encoding='utf-8') as f:
f.write(response.read())
Another approach might be to set the errors argument to open to handle encoding errors. For example:
with open(output_file, "w+", errors="replace") as f:
f.write(response.read().decode("utf8", 'ignore'))
will replace un-encodeable characters with a ? character.
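The same replacement behaviour can be seen directly with the codec in the REPL:
>>> '\u053e'.encode('cp1252', errors='replace')
b'?'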

Getting ValueError: unknown url type: ' '

I have the code below that iterates through some tracks. For each track I want to use the musixmatch API to get and print the lyrics of the track, based on the artist name and track name.
Code that iterates through the tracks and prints the lyrics:
for i, v in tracks.items():
    artist = tracks[i]['artist'].replace(" ", "+")
    title = tracks[i]['title'].replace(" ", "+")
    print(tracks)
    print(song_lyric(title, artist))
The print(tracks) returns in this format:
{12: {'trackID': 12, 'title': 'Achtung Baby', 'number': '1', 'artist': 'U2', 'album': 'Achtung Baby', 'albumID': 2, 'duration': '291'}}
When the code is executed, the lyrics for the first tracks are printed, but then an error appears:
Traceback (most recent call last):
File "C:/Users/Ozzy/PycharmProjects/getData/getData.py", line 239, in <module>
print(song_lyric(title, artist))
File "C:/Users/Ozzy/PycharmProjects/getData/getData.py", line 72, in song_lyric
lyrics_tracking(tracking_url)
File "C:/Users/Ozzy/PycharmProjects/getData/getData.py", line 79, in lyrics_tracking
request = urllib.request.Request(querystring)
File "C:\Users\Ozzy\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 329, in __init__
self.full_url = url
File "C:\Users\Ozzy\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 355, in full_url
self._parse()
File "C:\Users\Ozzy\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 384, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
Do you know why this error is appearing?
The methods to get the lyrics from musixmatch are publicly available:
def song_lyric(song_name, artist_name):
    while True:
        querystring = apiurl_musixmatch + "matcher.lyrics.get?q_track=" + urllib.parse.quote(
            song_name) + "&q_artist=" + urllib.parse.quote(
            artist_name) + "&apikey=" + apikey_musixmatch + "&format=json&f_has_lyrics=1"
        # matcher.lyrics.get?q_track=sexy%20and%20i%20know%20it&q_artist=lmfao
        request = urllib.request.Request(querystring)
        # request.add_header("Authorization", "Bearer " + client_access_token)
        request.add_header("User-Agent",
                           "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")  # Must include user agent of some sort, otherwise 403 returned
        while True:
            try:
                response = urllib.request.urlopen(request,
                                                  timeout=4)  # timeout set to 4 seconds; automatically retries if times out
                raw = response.read()
            except socket.timeout:
                print("Timeout raised and caught")
                continue
            break
        json_obj = json.loads(raw.decode('utf-8'))
        body = json_obj["message"]["body"]["lyrics"]["lyrics_body"]
        copyright = json_obj["message"]["body"]["lyrics"]["lyrics_copyright"]
        tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
        # print(tracking_url)
        lyrics_tracking(tracking_url)
        return (body + "\n\n" + copyright)

def lyrics_tracking(tracking_url):
    while True:
        querystring = tracking_url
        request = urllib.request.Request(querystring)
        # request.add_header("Authorization", "Bearer " + client_access_token)
        request.add_header("User-Agent",
                           "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")  # Must include user agent of some sort, otherwise 403 returned
        try:
            response = urllib.request.urlopen(request,
                                              timeout=4)  # timeout set to 4 seconds; automatically retries if times out
            raw = response.read()
        except socket.timeout:
            print("Timeout raised and caught")
            continue
        break
    print(raw)
Full working example that reproduces the error:
import requests
import json
import urllib.request, urllib.error, urllib.parse
import socket

apikey_musixmatch = '0b4a363bbd71974c2634837d5b5d1d9a'  # generated for the example
apiurl_musixmatch = 'http://api.musixmatch.com/ws/1.1/'
api_key = "b088cbedecd40b35dd89e90f55227ac2"  # generated for the example

def song_lyric(song_name, artist_name):
    while True:
        querystring = apiurl_musixmatch + "matcher.lyrics.get?q_track=" + urllib.parse.quote(
            song_name) + "&q_artist=" + urllib.parse.quote(
            artist_name) + "&apikey=" + apikey_musixmatch + "&format=json&f_has_lyrics=1"
        # matcher.lyrics.get?q_track=sexy%20and%20i%20know%20it&q_artist=lmfao
        request = urllib.request.Request(querystring)
        # request.add_header("Authorization", "Bearer " + client_access_token)
        request.add_header("User-Agent",
                           "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")  # Must include user agent of some sort, otherwise 403 returned
        while True:
            try:
                response = urllib.request.urlopen(request,
                                                  timeout=4)  # timeout set to 4 seconds; automatically retries if times out
                raw = response.read()
            except socket.timeout:
                print("Timeout raised and caught")
                continue
            break
        json_obj = json.loads(raw.decode('utf-8'))
        body = json_obj["message"]["body"]["lyrics"]["lyrics_body"]
        copyright = json_obj["message"]["body"]["lyrics"]["lyrics_copyright"]
        tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
        print("Tracking_url====================" + tracking_url + "==================================")
        lyrics_tracking(tracking_url)
        return (body + "\n\n" + copyright)

def lyrics_tracking(tracking_url):
    while True:
        querystring = tracking_url
        request = urllib.request.Request(querystring)
        # request.add_header("Authorization", "Bearer " + client_access_token)
        request.add_header("User-Agent",
                           "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")  # Must include user agent of some sort, otherwise 403 returned
        try:
            response = urllib.request.urlopen(request,
                                              timeout=4)  # timeout set to 4 seconds; automatically retries if times out
            raw = response.read()
        except socket.timeout:
            print("Timeout raised and caught")
            continue
        break
    print(raw)

ID = 0
# get top artists from country
artists = {}
for i in range(2, 3):
    artists_response = requests.get(
        'http://ws.audioscrobbler.com/2.0/?method=geo.gettopartists&country=spain&format=json&page=' + str(i) + '&api_key=' + api_key)
    artists_data = artists_response.json()
    for artist in artists_data["topartists"]["artist"]:
        name = artist["name"]
        url = artist["url"]
        if ID > 1: continue
        artists[ID] = {}
        artists[ID]['ID'] = ID
        artists[ID]['name'] = name
        ID += 1

for i, v in artists.items():
    chosen = artists[i]['name'].replace(" ", "+")
    artist_response = requests.get(
        'http://ws.audioscrobbler.com/2.0/?method=artist.getinfo&format=json&artist=' + chosen + '&api_key=' + api_key)
    artist_data = artist_response.json()

# get top albums of the artists
albums = {}
for i, v in artists.items():
    chosen = artists[i]['name'].replace(" ", "+")
    topalbums_response = requests.get(
        'http://ws.audioscrobbler.com/2.0/?method=artist.gettopalbums&format=json&artist=' + chosen + '&api_key=' + api_key + '&limit=5')
    albums_data = topalbums_response.json()
    for album in albums_data['topalbums']['album']:
        name = album["name"]
        url = album["url"]
        albums[ID] = {}
        albums[ID]['ID'] = ID
        albums[ID]['artist'] = artists[i]['name']
        albums[ID]['artistID'] = artists[i]['ID']
        albums[ID]['name'] = name
        ID += 1

# Get tracks of the album
tracks = {}
for i, v in albums.items():
    artist = albums[i]['artist'].replace(" ", "+")
    name = albums[i]['name'].replace(" ", "+")
    album_response_data = requests.get(
        'http://ws.audioscrobbler.com/2.0/?method=album.getinfo&format=json&api_key=' + api_key + '&artist=' + artist + '&album=' + name)
    album_response = album_response_data.json()
    for album in album_response['album']['tracks']['track']:
        title = album['name']
        tracks[ID] = {}
        tracks[ID]['trackID'] = ID
        tracks[ID]['title'] = title
        tracks[ID]['artist'] = albums[i]['artist']
        tracks[ID]['album'] = albums[i]['name']
        tracks[ID]['albumID'] = albums[i]['ID']
        ID += 1

for i, v in tracks.items():
    artist = tracks[i]['artist'].replace(" ", "+")
    title = tracks[i]['title'].replace(" ", "+")
    # print the lyric of each track
    print(song_lyric(title, artist))
It seems the URL is not correct. It happens here:
tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
If you can run that API call yourself and inspect what comes back in tracking_url, you can find out what is wrong with it.
UPDATE:
I reproduced it: urllib.request cannot process an empty-string URL (""). You need to check that tracking_url != "", and only make the request when it is neither an empty string nor None.
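A minimal guard along those lines, placed in song_lyric right before the tracking call:
tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
if tracking_url:  # skip the request when the API returns an empty tracking URL
    lyrics_tracking(tracking_url)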

Python TypeError on executing weather service code

I am using a weather API to design a Slack bot service in Python.
My source code is:
import requests
import re
import json
from bs4 import BeautifulSoup
def weather(cityname):
    cityid = extractid(cityname)
    url = "http://api.openweathermap.org/data/2.5/forecast?id=" + str(cityid) + "&APPID=c72f730d08a4ea1d121c8e25da7e4411"
    while True:
        r = requests.get(url, timeout=5)
        while r.status_code is not requests.codes.ok:
            r = requests.get(url, timeout=5)
        soup = BeautifulSoup(r.text)
        data = ("City: " + soup.city["name"] + ", Country: " + soup.country.text + "\nTemperature: " + soup.temperature["value"] +
                " Celsius\nWind: " + soup.speed["name"] + ", Direction: " + soup.direction["name"] + "\n\n" + soup.weather["value"])
        # print data
        return data

def extractid(cname):
    with open('/home/sourav/Git-Github/fabulous/fabulous/services/city.list.json') as data_file:
        data = json.load(data_file)
    for item in data:
        if item["name"] == cname:
            return item["id"]

def on_message(msg, server):
    text = msg.get("text", "")
    match = re.findall(r"~weather (.*)", text)
    if not match:
        return
    searchterm = match[0]
    return weather(searchterm.encode("utf8"))

on_bot_message = on_message
But executing the code gives the following error:
File "/usr/local/lib/python2.7/dist-packages/fabulous-0.0.1-py2.7.egg/fabulous/services/weather.py", line 19, in weather
" Celsius\nWind: " + soup.speed["name"] + ", Direction: " + soup.direction["name"] + "\n\n" + soup.weather["value"])
TypeError: 'NoneType' object has no attribute '__getitem__'
I can't figure out the error. Please help!
__getitem__ is called when you ask for a dictionary key: a['abc'] translates to a.__getitem__('abc'). So in this case one attribute of soup is None (speed, direction or weather).
Ensure that r.text contains the data you want; simply print it:
print(r.text)
List the structure of the parsed data:
for child in soup.findChildren():
    print child
Always assume your input data might be wrong: instead of doing soup.city, do soup.find('city'), which returns None when the tag is missing, so:
city = soup.find('city')
if city is not None:
    city_name = city['name']
else:
    city_name = 'Error'  # or empty, or sth
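Applying the same defensive pattern to every field the code reads might look like this (a sketch: safe_attr is a hypothetical helper, and the tag names are taken from the question's code):
def safe_attr(soup, tag, attr, default='N/A'):
    # find() returns None when the tag is missing, so guard before indexing
    node = soup.find(tag)
    if node is not None and node.get(attr) is not None:
        return node[attr]
    return default

temperature = safe_attr(soup, 'temperature', 'value')
wind_name = safe_attr(soup, 'speed', 'name')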

Why do I keep getting this title match error with my Python program?

When I run the following code, I keep getting this error:
Traceback (most recent call last):
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 94, in <module>
main()
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 71, in main
for final_url in pool.imap(handle_listing, listings):
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
val = self.waiters.get().wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
return self._exit_event.wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
current.throw(*self._exc)
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
result = function(*args, **kwargs)
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 35, in handle_listing
title, = TITLE_MATCH.match(listing_title).groups()
AttributeError: 'NoneType' object has no attribute 'groups'
What is wrong?
It has something to do with the Title match but I don't know how to fix it!
If you could help me I would really appreciate it!
Thanks!
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib

def download(url):
    print "Downloading:", url
    s = urllib2.urlopen(url).read()
    if s[:2] == '\x1f\x8b':
        ifh = GzipFile(mode='rb', fileobj=StringIO(s))
        s = ifh.read()
    print "Downloaded: ", url
    return s

def replace_chars(text, replacements):
    return ''.join(replacements.get(x,x) for x in text)

def handle_listing(listing_url):
    listing_document = BeautifulSoup(download(listing_url))
    # ignore pages that link to yellowpages
    if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        listing_title = listing_document.title.text
        reps = {' ':'-', ',':'', '\'':'', '[':'', ']':''}
        title, = TITLE_MATCH.match(listing_title).groups()
        address, = ADDRESS_MATCH.match(listing_title).groups()
        yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
            replace_chars(address, reps),
            replace_chars(title, reps),
        )
        yellow_page = BeautifulSoup(download(yellow_page_url))
        page_url = yellow_page.find("h3", {"class" : "business-name fn org"})
        if page_url:
            page_url = page_url.a["href"]
            business_name = title[:title.index(",")]
            page = BeautifulSoup(download(page_url))
            yellow_page_address = page.find("span", {"class" : "street-address"})
            if yellow_page_address:
                if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
                    pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
                    page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})
                    final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
                        pid, page_escaped)
                    return final_url

def main():
    pool = eventlet.GreenPool()
    listings_document = BeautifulSoup(download(START_URL))
    listings = listings_document.findAll("a", href = LOCATION_LISTING)
    listings = [listing['href'] for listing in listings]
    for final_url in pool.imap(handle_listing, listings):
        print final_url
        if str(final_url) is not None:
            url = str(final_url)
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            page = response.read()
            time.sleep(2)

for a in range(2,3):
    START_URL = 'http://www.locationary.com/place/en/US/New_Jersey/Randolph-page' + str(a) + '/?ACTION_TOKEN=NumericAction'
    TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
    ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
    LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')
    if __name__ == '__main__':
        main()
Quoting from your error:
title, = TITLE_MATCH.match(listing_title).groups()
AttributeError: 'NoneType' object has no attribute 'groups'
TITLE_MATCH.match(listing_title) returns None, so you can't call .groups() on it.
When a regex .match() does not find anything to match, it returns None. Since you cannot call .groups() on None, you have to check for a match first. To do that:
Change this:
title, = TITLE_MATCH.match(listing_title).groups()
address, = ADDRESS_MATCH.match(listing_title).groups()
To this:
titleMatch = TITLE_MATCH.match(listing_title)
if titleMatch:
    title, = titleMatch.groups()
else:
    pass  # handle it

addressMatch = ADDRESS_MATCH.match(listing_title)
if addressMatch:
    address, = addressMatch.groups()
else:
    pass  # handle it
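If skipping a non-matching listing is acceptable, both checks can also be folded into one early return near the top of handle_listing (a sketch in the question's Python 2 style):
title_match = TITLE_MATCH.match(listing_title)
address_match = ADDRESS_MATCH.match(listing_title)
if not (title_match and address_match):
    return None  # listing title doesn't fit the expected pattern; skip it
title, = title_match.groups()
address, = address_match.groups()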
