I'm running a script to collect information from various pages on a website.
#python2
from __future__ import division
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from bs4 import SoupStrainer
import pandas as pd
import urllib,re,csv,os,urllib2,requests,itertools,pdfkit,time
import smtplib
import math
from selenium import webdriver
import requests.packages.urllib3
import requests
requests.packages.urllib3.disable_warnings()
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from datetime import datetime
os.environ["LANG"] = "en_US.UTF-8"
start_time = time.time()
os.chdir('DIRECTORY')
#import .csv with variables for fulls list
fulls = zip(orgs, terms, sites, smo_ids, year_i, year_start, year_end)
orgs2 = []
terms2 = []
sites2 = []
results2 = []
smo_ids2 = []
article_number = []
years2 = []
numbers = range(2000001)
numbers = numbers[0::200]
start_time = time.time()
display = Display(visible=0, size=(1600, 1200))
display.start()
otime = datetime.now()
startpoint = 1
for full in fulls:
    site = full[2]
    org = full[0]
    smo_id = full[3]
    term = full[1]
    year = full[4]
    driver = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
    try:
        driver.get(site)  # get original site info
    except (WebDriverException, TimeoutException) as e:
        print e
        print "REFRESHING PAGE"
        driver.refresh()
    source = driver.page_source
    soup = BeautifulSoup(source, "html.parser")
    soup2 = soup.encode("utf-8")
    try:
        resultno = re.findall('<h1 id="pqResultsCount">\n(.*?) result', soup2)
        resultno = ''.join(resultno)
        resultno = resultno.translate(None, "(){}<>,")
        resultno = int(resultno)
    except ValueError:
        resultno = 0
    no_pages = int(math.ceil(resultno / 20))
    an = re.findall('{"(.*?)markedlistcheckbox:markallitems', soup2)
    an = ''.join(an)
    an = re.findall('markAll":false,"formats":{(.*?)},"markURL"', an)
    an = ''.join(an)
    an = re.sub(r'":.+?,"', '', an)
    an = an.translate(None, '"')
    an = an.split(':', 1)[0]
    an = an.split('MSTAR_')
    an.pop(0)
    for i in an:
        article_number.append(i)
        years2.append(year)
        sites2.append(site)
        orgs2.append(org)
        smo_ids2.append(smo_id)
        terms2.append(term)
    # begin encryption search
    encrypt = re.findall('id="searchForm"(.*?)/></div>', soup2)
    encrypt = ''.join(encrypt)
    t_ac = re.findall('name="t:ac" type="hidden" value="(.*?)/', encrypt)
    t_ac = ''.join(t_ac)
    t_formdata = re.findall('name="t:formdata" type="hidden" value="(.*?)"', encrypt)
    t_formdata = ''.join(t_formdata)
    # start page 2 stuff
    for page in range(2, no_pages + 1):
        site_ = "https://WEBSITE.com/results:gotopage/" + str(page) + "?t:ac=" + t_ac + "/?t:formdata=" + t_formdata
        driver.get(site_)  # get subsequent page info
        source = driver.page_source  # populated page source for this results page
        soup_ = BeautifulSoup(source, "html.parser")
        soup2_ = soup_.encode("utf-8")
        an_ = re.findall('{"(.*?)markedlistcheckbox:markallitems', soup2_)
        an_ = ''.join(an_)
        an_ = re.findall('markAll":false,"formats":{(.*?)},"markURL"', an_)
        an_ = ''.join(an_)
        an_ = re.sub(r'":.+?,"', '', an_)
        an_ = an_.translate(None, '"')
        an_ = an_.split(':', 1)[0]
        an_ = an_.split('MSTAR_')
        an_.pop(0)
        for i_ in an_:
            article_number.append(i_)
            years2.append(year)
            sites2.append(site)
            orgs2.append(org)
            smo_ids2.append(smo_id)
            terms2.append(term)
    driver.quit()
    elapsed_time = time.time() - start_time
    try:
        ctime_1 = ctime
    except NameError:
        ctime_1 = otime
    m, s = divmod(elapsed_time, 60)
    h, m = divmod(m, 60)
    ctime = datetime.now()
    diftime = ctime - ctime_1
    diftime = str(diftime)
    diftime = diftime[2:7]
    ctime2 = str(ctime)
    ctime2 = ctime2[11:19]
    print "%d:%02d:%02d | %s | %s" % (h, m, s, ctime2, diftime)
    print "%d: Page %d is complete" % (startpoint, startpoint)
    if startpoint in numbers:
        print "Sleeping for 10 seconds"
        time.sleep(10)
    startpoint += 1
article_info = zip(article_number, years2, sites2, orgs2, smo_ids2, terms2)
The code runs, but at various points (sometimes 20 mins into the run, sometimes 14 hours into it), I get the following error:
Traceback (most recent call last):
File "<stdin>", line 131, in <module>
File "/usr/local/lib/python2.7/dist-packages/selenium/webdriver/chrome/webdriver.py", line 69, in __init__
desired_capabilities=desired_capabilities)
File "/usr/local/lib/python2.7/dist-packages/selenium/webdriver/remote/webdriver.py", line 151, in __init__
self.start_session(desired_capabilities, browser_profile)
File "/usr/local/lib/python2.7/dist-packages/selenium/webdriver/remote/webdriver.py", line 240, in start_session
response = self.execute(Command.NEW_SESSION, parameters)
File "/usr/local/lib/python2.7/dist-packages/selenium/webdriver/remote/webdriver.py", line 308, in execute
self.error_handler.check_response(response)
File "/usr/local/lib/python2.7/dist-packages/selenium/webdriver/remote/errorhandler.py", line 194, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: session not created exception
from timeout: Timed out receiving message from renderer: 600.000
(Session info: chrome=64.0.3282.186)
(Driver info: chromedriver=2.35.528139 (47ead77cb35ad2a9a83248b292151462a66cd881),platform=Linux 4.13.0-36-generic x86_64)
I'm using the current Chrome and ChromeDriver, and I have tried Selenium versions 3.9, 3.8, and 3.7. No matter what, I eventually get the above error.
Any ideas how to fix this error?
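The only workaround I can think of so far (a minimal sketch, assuming the failure really is transient) is to wrap the driver start-up in a small retry helper, the same way I already retry page loads. The retry count and back-off below are arbitrary placeholders, not values I have tested:

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import time

def start_chrome(retries=3, wait=30):
    """Try to start Chrome a few times before giving up.

    `retries` and `wait` are arbitrary placeholders, not values from the
    original script.
    """
    last_err = None
    for attempt in range(retries):
        try:
            return webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
        except WebDriverException as err:
            last_err = err
            print "Chrome failed to start (attempt %d): %s" % (attempt + 1, err)
            time.sleep(wait)  # give chromedriver/the renderer time to recover
    raise last_err

# inside the main loop, instead of calling webdriver.Chrome() directly:
# driver = start_chrome()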
So I wrote this code to get the list of followers on Instagram using the instaloader library in Python:
login_name = 'beyondhelloworld'
target_profile = 'femindharamshi'
# OR
#import sys
#target_profile = sys.argv[1] # pass in target profile as argument
from instaloader import Instaloader, Profile
loader = Instaloader()
# login
try:
    loader.load_session_from_file(login_name)
except FileNotFoundError:
    loader.context.log("Session file does not exist yet - Logging in.")
if not loader.context.is_logged_in:
    loader.interactive_login(login_name)
    loader.save_session_to_file()
profile = Profile.from_username(loader.context, target_profile)
followers = profile.get_followers()
loader.context.log()
loader.context.log('Profile {} has {} followers:'.format(profile.username, profile.followers))
loader.context.log()
for follower in followers:
    loader.context.log(follower.username, flush=True)
But I keep getting this error:
Loaded session from /Users/femindharamshi/.config/instaloader/session-beyondhelloworld.
Traceback (most recent call last):
File "/Users/femindharamshi/Documents/instaload/env/lib/python3.7/site-packages/instaloader/structures.py", line 597, in _obtain_metadata
self._node = metadata['entry_data']['ProfilePage'][0]['graphql']['user']
KeyError: 'graphql'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "il.py", line 20, in <module>
profile = Profile.from_username(loader.context, target_profile)
File "/Users/femindharamshi/Documents/instaload/env/lib/python3.7/site-packages/instaloader/structures.py", line 552, in from_username
profile._obtain_metadata() # to raise ProfileNotExistException now in case username is invalid
File "/Users/femindharamshi/Documents/instaload/env/lib/python3.7/site-packages/instaloader/structures.py", line 606, in _obtain_metadata
', '.join(similar_profiles[0:5]))) from err
instaloader.exceptions.ProfileNotExistsException: Profile femindharamshi does not exist.
The most similar profile is: femindharamshi.
How do I solve this issue? The output says that the profile "femindharamshi" does not exist, but that is exactly what my profile is. It also says:
The most similar profile is: femindharamshi.
import instaloader
import random
import os

dir_path_driver = os.getcwd()

def username_password():
    # username.txt holds one "username:password" pair per line
    listusername = []
    with open("./username.txt", "r") as usernames:
        for username in usernames:
            listusername.append((username.rstrip("\n")).split(":"))
    if len(listusername) == 1:
        select = 0
    else:
        select = random.randint(0, len(listusername) - 1)  # randint is inclusive on both ends
    return listusername[select][0], listusername[select][1]

def get_followers():
    L = instaloader.Instaloader()
    # Login, or load a previously saved session from ./cookie/<username>
    username, password = username_password()
    listfile = os.listdir(dir_path_driver + "/cookie")
    if username not in listfile:
        L.login(username, password)
        L.save_session_to_file(filename=dir_path_driver + "/cookie/" + username)
    else:
        L.load_session_from_file(filename=dir_path_driver + "/cookie/" + username, username=username)
    file = open("prada_followers.txt", "a+")
    profile = instaloader.Profile.from_username(L.context, "idinstagram")
    for followee in profile.get_followers():
        username = followee.username
        file.write(username + "\n")
    file.close()
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

class InstaBot:
    """InstaBot can log in and can return the users that don't
    follow you back.
    The object requires two args:
    'username' & 'password'."""

    def __init__(self, username, pw):
        self.username = username
        self.pw = pw
        self.driver = webdriver.Chrome(executable_path='chromedriver.exe')
        self.base_url = "https://instagram.com"
        self.driver.get("{}".format(self.base_url))
        sleep(2)
        self.driver.maximize_window()
        self.login()

    def login(self):
        self.driver.find_element_by_xpath("//input[@name=\"username\"]")\
            .send_keys(self.username)
        self.driver.find_element_by_xpath("//input[@name=\"password\"]")\
            .send_keys(self.pw)
        self.driver.find_element_by_xpath("//button[@type=\"submit\"]")\
            .click()
        sleep(10)
        self.driver.find_element_by_xpath("//button[contains(text(), 'Not Now')]")\
            .click()
        sleep(2)

    def get_unfollowers(self):
        self.driver.find_element_by_xpath("//a[contains(@href, '/{}')]".format(self.username))\
            .click()
        sleep(3)
        self.driver.find_element_by_xpath("//a[contains(@href, '/following')]")\
            .click()
        sleep(2)
        following = self._get_names()
        self.driver.find_element_by_xpath("//a[contains(@href, '/followers')]")\
            .click()
        sleep(2)
        followers = self._get_names()
        not_following_back = [user for user in following if user not in followers]
        return not_following_back
##        suggetions = self.driver.find_element_by_xpath('//h4[contains(text(), Suggetions)]')
##        self.driver.execute_script('arguments[0].scrollIntoView()', suggetions)

    def _get_names(self):
        scroll_box = self.driver.find_element_by_xpath("/html/body/div[4]/div/div[2]")
        last_ht, ht = 0, 1
        while last_ht != ht:
            last_ht = ht
            sleep(1)
            ht = self.driver.execute_script("""
                arguments[0].scrollTo(0, arguments[0].scrollHeight);
                return arguments[0].scrollHeight;
                """, scroll_box)
        links = scroll_box.find_elements_by_tag_name('a')
        names = [name.text for name in links if name.text != '']
        sleep(2)
        self.driver.find_element_by_xpath("/html/body/div[4]/div/div[1]/div/div[2]/button")\
            .click()
        return names

    def navigate_to_user(self, user):
        self.driver.get("{}/{}".format(self.base_url, user))

    def scroll_down(self):
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            sleep(2)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

my_bot = InstaBot("your_username", "your_password")
##unfollowers = my_bot.get_unfollowers()  # will return a list
my_bot.navigate_to_user("any_username_you_follow")  # will open your friend's profile and followers list
import sys
import os, time
import cognitive_face as CF
import global_variables as global_var
import urllib.request
import sqlite3
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

Key = global_var.key
CF.Key.set(Key)
BASE_URL = global_var.BASE_URL  # Replace with your regional Base URL
CF.BaseUrl.set(BASE_URL)

def get_person_id():
    person_id = ''
    extractId = str(sys.argv[1])[-2:]
    connect = sqlite3.connect("Face-DataBase")
    c = connect.cursor()
    cmd = "SELECT * FROM Students WHERE ID = " + extractId
    c.execute(cmd)
    row = c.fetchone()
    person_id = row[3]
    connect.close()
    return person_id

if len(sys.argv) != 1:
    currentDir = os.path.dirname(os.path.abspath(__file__))
    imageFolder = os.path.join(currentDir, "dataset/" + str(sys.argv[1]))
    person_id = get_person_id()
    for filename in os.listdir(imageFolder):
        if filename.endswith(".jpg"):
            print(filename)
            imgurl = urllib.request.pathname2url(os.path.join(imageFolder, filename))
            imgurl = imgurl[3:]
            print("imageurl = {}".format(imgurl))
            res = CF.face.detect(imgurl)
            if len(res) != 1:
                print("No face detected in image")
            else:
                res = CF.person.add_face(imgurl, global_var.personGroupId, person_id)
                print(res)
                time.sleep(6)
else:
    print("supply attributes please from dataset folder")
A:\microsoft api FaceRecognition-Attendance-Marking-master>python add_person_faces.py user97
User.97.1.jpg
imageurl = A:/microsoft%20api%20FaceRecognition-Attendance-Marking-master/dataset/user97/User.97.1.jpg
Traceback (most recent call last):
File "add_person_faces.py", line 42, in <module>
res = CF.face.detect(imgurl)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\cognitive_face\face.py", line 41, in detect
'POST', url, headers=headers, params=params, json=json, data=data)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\cognitive_face\util.py", line 105, in request
error_msg.get('message'))
cognitive_face.util.CognitiveFaceException: Error when calling Cognitive Face API:
status_code: 400
code: InvalidURL
message: Invalid image URL.
I am running Selenium and PhantomJS to input search terms into a website and retrieve the number of hits for each search term. I have to do this 130,000+ times, and the code had been running nicely for a day until the program suddenly broke with the following error:
Traceback (most recent call last):
File "CBBPlyNwsScrape.py", line 82, in <module>
browser = webdriver.PhantomJS()
File "/Library/Python/2.7/site-packages/selenium/webdriver/phantomjs/webdriver.py", line 50, in __init__
self.service.start()
File "/Library/Python/2.7/site-packages/selenium/webdriver/phantomjs/service.py", line 69, in start
raise WebDriverException("Can not connect to GhostDriver")
selenium.common.exceptions.WebDriverException: Message: 'Can not connect to GhostDriver'
I'm running this on Mac OS X and Python 2.7.3. I have the latest versions of Selenium and PhantomJS installed. Can anyone tell me what is going on, and why GhostDriver was working fine for so long and then suddenly stopped?
In the ghostdriver.log file, this is all it contains:
PhantomJS is launching GhostDriver...
[ERROR - 2013-12-01T05:14:34.491Z] GhostDriver - Main - Could not start Ghost Driver => {
"message": "Could not start Ghost Driver",
"line": 82,
"sourceId": 4445044288,
"sourceURL": ":/ghostdriver/main.js",
"stack": "Error: Could not start Ghost Driver\n at :/ghostdriver/main.js:82",
"stackArray": [
{
"sourceURL": ":/ghostdriver/main.js",
"line": 82
}
]
}
Thanks
Installing the latest PhantomJS fixed this error; it was happening with the default Ubuntu 12.04 PhantomJS distro.
I was having the same problem. I don't know why the program has trouble starting the PhantomJS webdriver, but the fix is to catch the WebDriverException and retry. The following code did the trick for me:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException, WebDriverException
import unittest, time, re, urllib2

f = open("mother.txt", "r")  # file with one base URL per line
l = "1"
m = "2"
n = "3"
aTuple = (l, m, n)  # tuple of suffixes appended to each base URL
e = int(0)
for line in f:
    e += 1
    try:
        h = str(e)
        j = line
        g = open("yes4/" + h + ".txt", "w")  # output file for this line
        for item in aTuple:
            driver = webdriver.PhantomJS('phantomjs')
            base_url = j + item
            verificationErrors = []
            accept_next_alert = True
            driver.get(base_url)
            elem = driver.find_element_by_id("yelp_main_body")
            source_code = elem.get_attribute("outerHTML").encode('utf-8').strip()
            g.write(source_code)
            driver.quit()
    except WebDriverException:
        # GhostDriver failed to start; retry this line once
        print "e"
        h = str(e)
        j = line
        g = open("yes4/" + h + ".txt", "w")
        for item in aTuple:
            driver = webdriver.PhantomJS('phantomjs')
            base_url = j + item
            verificationErrors = []
            accept_next_alert = True
            driver.get(base_url)
            elem = driver.find_element_by_id("yelp_main_body")
            source_code = elem.get_attribute("outerHTML").encode('utf-8').strip()
            g.write(source_code)
            driver.quit()
    else:
        print h
The code I will post below gives me this error, and I can't figure out why or how to fix it. If anyone could help, I would greatly appreciate it. Thanks!
Traceback (most recent call last):
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 99, in <module>
main()
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 76, in main
for final_url in pool.imap(handle_listing, listings):
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
val = self.waiters.get().wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
return self._exit_event.wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
current.throw(*self._exc)
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
result = function(*args, **kwargs)
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 48, in handle_listing
yellow_page = BeautifulSoup(download(yellow_page_url))
File "build\bdist.win32\egg\BeautifulSoup.py", line 1519, in __init__
BeautifulStoneSoup.__init__(self, *args, **kwargs)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1144, in __init__
self._feed(isHTML=isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1168, in _feed
smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1770, in __init__
self._detectEncoding(markup, isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1915, in _detectEncoding
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
TypeError: expected string or buffer
I don't know what it wants or what it means...
This is my code:
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib

def download(url):
    print "Downloading:", url
    s = urllib2.urlopen(url).read()
    if s[:2] == '\x1f\x8b':
        ifh = GzipFile(mode='rb', fileobj=StringIO(s))
        s = ifh.read()
    print "Downloaded: ", url
    return s

def replace_chars(text, replacements):
    return ''.join(replacements.get(x, x) for x in text)

def handle_listing(listing_url):
    listing_document = BeautifulSoup(download(listing_url))
    # ignore pages that link to yellowpages
    if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        listing_title = listing_document.title.text
        # define an alphabet
        alfa = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        reps = {' ': '-', ',': '', '\'': '', '[': '', ']': '', '-Suite-' + alfa[1-26]: ''}
        if TITLE_MATCH.match(listing_title) is not None:
            title, = TITLE_MATCH.match(listing_title).groups()
            if ADDRESS_MATCH.match(listing_title) is not None:
                address, = ADDRESS_MATCH.match(listing_title).groups()
                yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
                    replace_chars(address, reps),
                    replace_chars(title, reps),
                )
                yellow_page = BeautifulSoup(download(yellow_page_url))
                page_url = yellow_page.find("h3", {"class": "business-name fn org"})
                if page_url:
                    page_url = page_url.a["href"]
                business_name = title[:title.index(",")]
                page = BeautifulSoup(download(page_url))
                yellow_page_address = page.find("span", {"class": "street-address"})
                if yellow_page_address:
                    if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
                        pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
                        page_escaped = replace_chars(page_url, {':': '%3A', '/': '%2F', '?': '%3F', '=': '%3D'})
                        final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
                            pid, page_escaped)
                        return final_url

def main():
    pool = eventlet.GreenPool()
    listings_document = BeautifulSoup(download(START_URL))
    listings = listings_document.findAll("a", href=LOCATION_LISTING)
    listings = [listing['href'] for listing in listings]
    for final_url in pool.imap(handle_listing, listings):
        print final_url
        """
        if str(final_url) is not None:
            url = str(final_url)
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            page = response.read()
            time.sleep(2)
        """

for a in range(0, 1):
    START_URL = 'http://www.locationary.com/place/en/US/Arkansas/Fayetteville-page2/?ACTION_TOKEN=NumericAction'
    TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
    ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
    LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')

if __name__ == '__main__':
    main()
A very common mistake made by novices in any language that supports exceptions is catching exceptions they do not actually handle. This leads to hard-to-debug errors, since it disrupts the normal flow of the program.
Specifically, catching urllib2.HTTPError in download() is preventing actual problems from being propagated to the rest of the program. Either remove the exception handler altogether, or re-raise (a bare raise) at the end of the handler so the failure still propagates.
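For illustration, here is a minimal sketch of the second option, assuming your real download() wraps urlopen() in an except urllib2.HTTPError handler (that handler is not in the code you posted): log the error, then re-raise it so the caller still sees the failure.

import urllib2

def download(url):
    print "Downloading:", url
    try:
        s = urllib2.urlopen(url).read()
    except urllib2.HTTPError as err:
        # Log it for debugging, but re-raise so the caller (and the
        # green thread pool) still sees the failure instead of None.
        print "Failed to download %s: %s" % (url, err)
        raise
    print "Downloaded: ", url
    return s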
When I run the following code, I keep getting this error:
Traceback (most recent call last):
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 94, in <module>
main()
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 71, in main
for final_url in pool.imap(handle_listing, listings):
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
val = self.waiters.get().wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
return self._exit_event.wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
current.throw(*self._exc)
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
result = function(*args, **kwargs)
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 35, in handle_listing
title, = TITLE_MATCH.match(listing_title).groups()
AttributeError: 'NoneType' object has no attribute 'groups'
What is wrong?
It has something to do with the Title match but I don't know how to fix it!
If you could help me I would really appreciate it!
Thanks!
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib

def download(url):
    print "Downloading:", url
    s = urllib2.urlopen(url).read()
    if s[:2] == '\x1f\x8b':
        ifh = GzipFile(mode='rb', fileobj=StringIO(s))
        s = ifh.read()
    print "Downloaded: ", url
    return s

def replace_chars(text, replacements):
    return ''.join(replacements.get(x, x) for x in text)

def handle_listing(listing_url):
    listing_document = BeautifulSoup(download(listing_url))
    # ignore pages that link to yellowpages
    if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        listing_title = listing_document.title.text
        reps = {' ': '-', ',': '', '\'': '', '[': '', ']': ''}
        title, = TITLE_MATCH.match(listing_title).groups()
        address, = ADDRESS_MATCH.match(listing_title).groups()
        yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
            replace_chars(address, reps),
            replace_chars(title, reps),
        )
        yellow_page = BeautifulSoup(download(yellow_page_url))
        page_url = yellow_page.find("h3", {"class": "business-name fn org"})
        if page_url:
            page_url = page_url.a["href"]
        business_name = title[:title.index(",")]
        page = BeautifulSoup(download(page_url))
        yellow_page_address = page.find("span", {"class": "street-address"})
        if yellow_page_address:
            if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
                pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
                page_escaped = replace_chars(page_url, {':': '%3A', '/': '%2F', '?': '%3F', '=': '%3D'})
                final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
                    pid, page_escaped)
                return final_url

def main():
    pool = eventlet.GreenPool()
    listings_document = BeautifulSoup(download(START_URL))
    listings = listings_document.findAll("a", href=LOCATION_LISTING)
    listings = [listing['href'] for listing in listings]
    for final_url in pool.imap(handle_listing, listings):
        print final_url
        if str(final_url) is not None:
            url = str(final_url)
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            page = response.read()
            time.sleep(2)

for a in range(2, 3):
    START_URL = 'http://www.locationary.com/place/en/US/New_Jersey/Randolph-page' + str(a) + '/?ACTION_TOKEN=NumericAction'
    TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
    ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
    LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')

if __name__ == '__main__':
    main()
Quoting from your error:
title, = TITLE_MATCH.match(listing_title).groups()
AttributeError: 'NoneType' object has no attribute 'groups'
TITLE_MATCH.match(listing_title) returns None, so you can't call .groups().
When re.match() does not find anything to match, it returns None. Since you cannot call .groups() on None, you have to check for a match first. To do that:
Change this:
title, = TITLE_MATCH.match(listing_title).groups()
address, = ADDRESS_MATCH.match(listing_title).groups()
To this:
titleMatch = TITLE_MATCH.match(listing_title)
if titleMatch:
    title, = titleMatch.groups()
else:
    pass  # handle the non-match (e.g. skip this listing)

addressMatch = ADDRESS_MATCH.match(listing_title)
if addressMatch:
    address, = addressMatch.groups()
else:
    pass  # handle the non-match