Python Knowledge: beginner
I managed to create a script to scrape contact information. Since I am a beginner, the flow I followed was to extract all the first-level links, copy them into a text file, and feed them to link = browser.find_element_by_link_text(str(link_text)). Scraping the contact details itself is confirmed to work (based on a separate run). The problem is that after clicking one of the first-level links, the script won't go on to click the links inside that page, so it never scrapes the contact info.
What is wrong with my script? Please bear in mind I am a beginner so my script is a little bit manual and lengthy.
Thanks very much!!!
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import requests
from bs4 import BeautifulSoup
import urllib
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import csv, time, lxml
######################### open file list ####################################
testfile = open("category.txt") # this is where I saved the category
readfile = testfile.read()
readfilesplit = readfile.split("\n")
############################### end ###################################
################### open browser ###############################
browser = webdriver.Firefox()
browser.get('http://aucklandtradesmen.co.nz/')
####################### end ###################################
link_texts = readfilesplit
for link_text in link_texts:
    link = browser.find_element_by_link_text(str(link_text))
    WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".add-listing")))
    link.click() #click link
    time.sleep(5)
    print "-------------------------------------------------------------------------------------------------"
    print("Getting listings for '%s'" % link_text)
    ################# get list name #######################
    urlNoList = 'http://aucklandtradesmen.co.nz/home-mainmenu-1.html'
    r = requests.get(browser.current_url)
    if (urlNoList != browser.current_url):
        soup = BeautifulSoup(r.content, 'html.parser')
        g_data = soup.find_all("div", {"class":"listing-summary"})
        pageRange = soup.find_all("span", {"class":"xlistings"})
        pageR = [pageRange[0].text]
        pageMax = str(pageR)[-4:-2] # get max item for lists
        X = str(pageMax).replace('nd', '0')
        # print "Number of listings: ", X
        Y = int(X) #convert string to int
        print "Number of listings: ", Y
        for item in g_data:
            try:
                listingNames = item.contents[1].text
                lstList = []
                lstList[len(lstList):] = [listingNames]
                replStr = re.sub(r"u'", "'",str(lstList)) #strip u' char
                replStr1 = re.sub(r"\s+'", "'",str(replStr)) #strip space and '
                replStr2 = re.sub(r"\sFeatured", "",str(replStr1)) #strip Featured string
                print "Cleaned string: ", replStr2
                ################ SCRAPE INFO ################
                ################### This is where the code is not executing #######################
                count = 0
                while (count < Y):
                    for info in replStr2:
                        link2 = browser.find_element_by_link_text(str(info))
                        time.sleep(10)
                        link2.click()
                        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#rating-msg")))
                        print "count", count
                        count+= 1
                        print("Contact info for: '%s'" % link_text)
                        r2 = requests.get(browser.current_url)
                        soup2 = BeautifulSoup(r2.content, 'html.parser')
                        g_data2 = soup.find_all("div", {"class":"fields"})
                        for item2 in g_data2:
                            # print item.contents[0]
                            print item2.contents[0].text
                            print item2.contents[1].text
                            print item2.contents[2].text
                            print item2.contents[3].text
                            print item2.contents[4].text
                            print item2.contents[5].text
                            print item2.contents[6].text
                            print item2.contents[7].text
                            print item2.contents[8].text
                        browser.back()
                        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".add-listing")))
                ################### END ---- This is where the code is not executing END ---#######################
                ############ END SCRAPE INFO ####################
            except NoSuchElementException:
                browser.back()
                WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "pagenav")))
    else:
        browser.back()
        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "pagenav")))
        print "Number of listings: 0"
    browser.back()
    WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "pagenav")))
By the way, this is some of the output:
-------------------------------------------------------------------------------------------------
Getting listings for 'Plumbers'
Number of listings: 5
Cleaned string: ['Hydroflame Plumbing & Gas Ltd']
Cleaned string: ['Osborne Plumbing Ltd']
Cleaned string: ['Plumbers Auckland Central']
Cleaned string: ['Griffiths Plumbing']
Cleaned string: ['Plumber Auckland']
-------------------------------------------------------------------------------------------------
Getting listings for 'Professional Services'
Number of listings: 2
Cleaned string: ['North Shore Chiropractor']
Cleaned string: ['Psychotherapy Werks - Rob Hunter']
-------------------------------------------------------------------------------------------------
Getting listings for 'Property Maintenance'
Number of listings: 7
Cleaned string: ['Auckland Tree Services']
Cleaned string: ['Bob the Tree Man']
Cleaned string: ['Flawless House Washing & Drain Unblocking']
Cleaned string: ['Yardiez']
Cleaned string: ['Build Corp Apartments Albany']
Cleaned string: ['Auckland Trellis']
Cleaned string: ['Landscape Design']
What I would do is change the logic a bit. Here's the logic flow I would suggest you use. This will eliminate writing the links out to a file and speed up the script.
1. Navigate to http://aucklandtradesmen.co.nz/
2. Grab all elements using the CSS selector "#index a" and store the "href" attribute of each in an array of strings (links to each category page)
3. Loop through the href array
   3.1. Navigate to the href
        3.1.1. Grab all elements using the CSS selector "div.listing-summary a" and store the .text of each (company names)
        3.1.2. If an element .by_link_text("Next") exists, click it and return to 3.1.1.
If you want business contact info off of the company pages, you would want to store the href in 3.1.1 and then loop through that list and grab what you want off each page (see the sketch below).
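A minimal sketch of that flow, assuming the selectors named in the steps above ("#index a", "div.listing-summary a") and a "Next" pagination link actually match the site's markup, and using the same old-style Selenium API as the question:

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time

browser = webdriver.Firefox()
browser.get('http://aucklandtradesmen.co.nz/')

# Step 2: grab the category links once, instead of keeping them in a text file
category_hrefs = [a.get_attribute('href')
                  for a in browser.find_elements_by_css_selector('#index a')]

company_hrefs = []
for href in category_hrefs:                      # Step 3
    browser.get(href)                            # Step 3.1
    while True:
        # Step 3.1.1: collect the company page links on the current listing page
        for a in browser.find_elements_by_css_selector('div.listing-summary a'):
            company_hrefs.append(a.get_attribute('href'))
        # Step 3.1.2: follow "Next" if it exists, otherwise move to the next category
        try:
            browser.find_element_by_link_text('Next').click()
            time.sleep(2)
        except NoSuchElementException:
            break

# Then visit each entry in company_hrefs and scrape the contact fields from that page.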
Okay, I found a solution after thinking about @jeffC's suggestion:
extract the href values and append them to the base URL, which is http://aucklandtradesmen.co.nz. For example, if the extracted href is /home-mainmenu-1/alarms-a-security/armed-alarms-ltd-.html, tell the browser to navigate to that URL, and then I can do whatever I want on the current page.
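For the record, the URL-joining step of that solution might look like this (a sketch; urljoin simply glues the relative href onto the base URL):

from urlparse import urljoin  # Python 2, as in the question; on Python 3: from urllib.parse import urljoin

base_url = 'http://aucklandtradesmen.co.nz'
href = '/home-mainmenu-1/alarms-a-security/armed-alarms-ltd-.html'  # an extracted href

browser.get(urljoin(base_url, href))  # navigate straight to the listing page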
I've been building a web scraper that:
1.) Asks what item you'd like to look for on Amazon
2.) Opens a Chrome browser with Selenium and searches for the item
3.) Runs through a pre-set number of pages (I have it at 1 for time efficiency while debugging)
4.) Scrapes each item's information on each page and creates a list of "Product" objects.
The issue I'm having is that even with the try/except blocks I still don't get all the information for each item. When debugging I've double- and triple-checked my XPaths with "Xpath Helper" and don't see where I went wrong.
Below is my code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from reference_functions import Product
import time
from lxml import html
from selenium.webdriver.chrome.options import Options
import pandas as pd
import datetime as datetime
## SETTING UP QUESTIONS NEEDED FOR SCRAPE
question_product = "What would you like to search for?\n:"
search_term = "invicta mens watch" #str(input(question_product))
search_terms = search_term.split(" ")
question_export = "Do you want to export all item data to excel?\n:"
export_data = "no"#str(input(question_export))
## SETTING UP WEBDRIVER
s = Service('/Users/nicholaskenney/PycharmProjects/Amazon_Scrape/chromedriver')
chrome_options = Options()
#chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=s, options=chrome_options)
## OPENING URL
url = "https://www.amazon.com/"
driver.get(url)
time.sleep(3)
## SENDING SEARCH TERMS TO SEARCH BOX FOLLOWED BY THE ENTER KEY
search_box = driver.find_element(By.ID, "twotabsearchtextbox")
search_box.send_keys(search_term)
search_box.send_keys(Keys.RETURN)
time.sleep(3)
products_list = []
page = 1
while True:
    if page != 0:
        try:
            driver.get(driver.current_url + "&page=" + str(page))
            time.sleep(3)
        except:
            break
    else:
        break
    tree = html.fromstring(driver.page_source)
    time.sleep(3)
    for product_tree in tree.xpath('//div[contains(@data-cel-widget, "search_result_")]'):
        should_add = True
        title = ""
        price = ""
        url = ""
        number_of_reviews = ""
        review_score = ""
        previous_price = ""
        try:
            ## Finding Title of item
            try:
                title = product_tree.xpath('.//span[@class="a-size-medium a-color-base a-text-normal"]/text()')
            except Exception as e:
                print("This is from first title try: " + e)
                title = product_tree.xpath('.//span[@class="a-size-base-plus a-color-base a-text-normal"]/text()')
            ## FINDING CURRENT PRICE OF ITEM
            price = product_tree.xpath('.//span[@class="a-price-whole"]/text()')
            ## FINDING NUMBER OF REVIEWS OF EACH ITEM
            try:
                number_of_reviews = product_tree.xpath('.//span[@class="a-size-base"]/text()')
            except:
                number_of_reviews = product_tree.xpath('.//span[@class="a-size-base a-color-base s-underline-text"]/text()')
            ## REVIEW SCORE FOR EACH ITEM
            try:
                review_score = product_tree.xpath('.//span[@class="a-icon-alt"]/text()')
            except:
                review_score = product_tree.xpath('.//span[@class="a-size-base a-color-base s-underline-text"]/text()')
            ## FINDING LINK FOR EACH ITEM
            try:
                links = product_tree.xpath('.//a[@class="a-link-normal s-link-style a-text-normal"]')
                for link in links:
                    if 'href' in link.attrib:
                        url = (str(link.attrib['href']))
            except:
                links = product_tree.xpath('.//a[@class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"]')
                for link in links:
                    if 'href' in link.attrib:
                        url = (str(link.attrib['href']))
            ## PREVIOUS PRICE SCRAPE
            try:
                previous_price = product_tree.xpath('.//span[@class="a-price a-text-price"]//span[@class="a-offscreen"]/text()')
            except:
                previous_price = price
        except:
            print("exception")
            should_add = False
        ## IF ALL INFORMATION IS SCRAPED (SHOULD_ADD IS TRUE) CREATE PRODUCT OBJECTS FOR EACH ITEM AND APPEND TO PRODUCT LIST
        product = Product(price, title, url, number_of_reviews, review_score, previous_price)
        if should_add == True:
            products_list.append(product)
    page = page - 1
print("Number of items scraped: " + str(len(products_list)))
## End of Webscrape
driver.quit()
## PRINTING RESULT FOR DEBUGGING
count = 0
for x in products_list:
print(x)
print(x.url)
print("Price is: " + str(x.price))
print("Previous Price is: " + str(x.previous_price))
print("Item title: " + str(x.title))
print("Number of review: "+ str(x.number_of_reviews))
print("Review Scores: " + str(x.review_score))
print("__________")
And this is the result I get:
Number of items scraped: 83
<reference_functions.Product object at 0x7ffd78e8bf10>
https://www.amazon.com/
Price is: []
Previous Price is: []
Item title: []
Number of review: []
Review Scores: ['4.6 out of 5 stars.', '4.6 out of 5 stars.', '4.6 out of 5 stars.']
__________
<reference_functions.Product object at 0x7ffd78eb10d0>
/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A05122932N0ETGH50WEB2&url=%2FWatches-Chronograph-Stainless-Waterproof-Business%2Fdp%2FB07Z62B354%2Fref%3Dsr_1_1_sspa%3Fcrid%3DIFKI3E407I9T%26keywords%3Dinvicta%2Bmens%2Bwatch%26qid%3D1640751697%26sprefix%3Di%252Caps%252C70%26sr%3D8-1-spons%26psc%3D1&qualifier=1640751697&id=2139685257788988&widgetName=sp_atf
Price is: ['42']
Previous Price is: ['$49.99']
Item title: []
Number of review: ['6,012']
Review Scores: ['4.4 out of 5 stars']
__________
<reference_functions.Product object at 0x7ffd78eb12e0>
/Invicta-Diver-Blue-Watch-26972/dp/B07GMSXZBM/ref=sr_1_2?crid=IFKI3E407I9T&keywords=invicta+mens+watch&qid=1640751697&sprefix=i%2Caps%2C70&sr=8-2
Price is: ['49']
Previous Price is: []
Item title: []
Number of review: ['6,122']
Review Scores: ['4.6 out of 5 stars']
__________
<reference_functions.Product object at 0x7ffd78eb1130>
/Invicta-Diver-Quartz-Green-30623/dp/B08447S81T/ref=sr_1_omk_3?crid=IFKI3E407I9T&keywords=invicta+mens+watch&qid=1640751697&sprefix=i%2Caps%2C70&sr=8-3
Price is: ['59']
Previous Price is: ['$69.90']
Item title: []
Number of review: ['6']
Review Scores: ['4.8 out of 5 stars']
__________
<reference_functions.Product object at 0x7ffd78eb1070>
/Invicta-12847-Specialty-Stainless-Steel/dp/B00962GV2E/ref=sr_1_4?crid=IFKI3E407I9T&keywords=invicta+mens+watch&qid=1640751697&sprefix=i%2Caps%2C70&sr=8-4
Price is: ['37']
Previous Price is: []
Item title: []
Number of review: ['5,376']
Review Scores: ['4.7 out of 5 stars']
Etc. Etc. Etc.
On this trial run it exported the URL and the total reviews, but I find that every other run doesn't export these variables. Is that because Amazon's HTML changes each time I run it, or is something wrong with the code?
Any help on this would be greatly appreciated!
Personally, I would use a CSS selector to find the link, since I don't find XPath reliable. The code I would use to find the link is:
product_tree.find_element(By.CSS_SELECTOR, 'a.a-link-normal.s-no-outline').get_attribute('href')
Running this selector returns the correct link for me every time, without any problems.
As for the reviews, I would also use css. In this case it would be:
product_tree.find_element(By.CSS_SELECTOR, 'span.a-icon-alt').text
If it still doesn't work, I would suggest writing the whole page source to a text file using driver.page_source and then using a tool to view what the driver is actually seeing.
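As a hedged sketch of how that might be wired into the question's loop using Selenium elements instead of the lxml tree (the result-card selector reuses the question's data-cel-widget attribute and is an assumption about Amazon's markup, not something verified here):

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Find each result card with Selenium, then apply the CSS selectors above per card.
cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-cel-widget*="search_result_"]')
for card in cards:
    try:
        url = card.find_element(By.CSS_SELECTOR, 'a.a-link-normal.s-no-outline').get_attribute('href')
        review_score = card.find_element(By.CSS_SELECTOR, 'span.a-icon-alt').text
    except NoSuchElementException:
        continue  # ad banners and widgets in the results list may lack these elements
    print(url, review_score)

# And if the results still look wrong, dump what the driver actually sees to a file:
with open('page_source.html', 'w', encoding='utf-8') as f:
    f.write(driver.page_source)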
Hi guys,
I have a problem scraping this dynamic site (https://kvartiry-bolgarii.ru/).
I need to get all the links to the home sale ads.
I used Selenium to load the page and get the links to the ads; after that I scroll the page down to load new ads. Once the new ads are loaded, I parse all the links on the page again and write them to the list.
But the data in the list is not updated, and the script continues to work with the links that were on the page before scrolling down.
By the way, I set a check so that the script keeps running until the last ad on the site (whose link I found out in advance) appears in the list.
How can this problem be corrected?
def get_link_info():
    try:
        url = "https://kvartiry-bolgarii.ru/"
        driver = webdriver.Chrome(
            executable_path=r'C:\Users\kk\Desktop\scrape_house\drivers\chromedriver.exe',
            options=options
        )
        driver.get(url)
        req = requests.get(url)
        req.encoding = 'utf8'
        soup = BeautifulSoup(req.text, "lxml")
        articles = soup.find_all("div", class_="content")
        links_urls = []
        for article in articles:
            house_url = article.find("a").get("href")
            links_urls.append(house_url)
        #print(links_urls)
        first_link_number = links_urls[-2].split("-")[-1]
        first_link_number = first_link_number[1:]
        #print(first_link_number)
        last_link_number = links_urls[-1].split("-")[-1]
        last_link_number = last_link_number[1:]
        #print(last_link_number)
        html = driver.find_element_by_tag_name('html')
        html.send_keys(Keys.END)
        check = "https://kvartiry-bolgarii.ru/kvartira-v-elitnom-komplekse-s-unikalynym-sadom-o21751"
        for a in links_urls:
            if a != check:
                for article in articles:
                    house_url = article.find("a").get("href")
                    links_urls.append(house_url)
                html = driver.find_element_by_tag_name('html')
                html.send_keys(Keys.END)
                print(links_urls[-1])
            else:
                print(links_urls[0], links_urls[-1])
                print("all links are ready")
Some pointers: you don't need to mix Selenium, requests, and BeautifulSoup; Selenium alone is enough. When you are scrolling infinitely, you need to remove duplicate elements before adding them to your list.
You can try this. This should work.
from selenium import webdriver
import time

def get_link_info():
    all_links = []
    try:
        driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
        driver.get('https://kvartiry-bolgarii.ru/')
        time.sleep(3)
        old_links = set()  # Empty Set
        while True:
            # Scroll to get more ads
            driver.execute_script("window.scrollBy(0,3825)", "")
            # Wait for new ads to load
            time.sleep(8)
            links_divs = driver.find_elements_by_xpath('//div[@class="content"]//a')  # Find Elements
            ans = set(links_divs) - set(old_links)  # Remove old elements
            for link in ans:
                # Scroll to the link.
                driver.execute_script("arguments[0].scrollIntoView();", link)
                fir = link.get_attribute('href')
                all_links.append(fir)
            # Remove Duplicates
            old_links = links_divs
    except Exception as e:
        raise e

get_link_info()
My code goes into a website and clicks on records, which causes drop-downs to open.
My current code only prints the first drop-down record, not the others.
For example, when the first record on the webpage is clicked, it drops down one record (shown in the attached screenshot). This is also the first and only drop-down record that gets printed in my output.
The code prints this:
How do I get it to pull all the drop-down titles?
from selenium import webdriver
import time
driver = webdriver.Chrome()
for x in range(1, 2):
    driver.get(f'https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page={x}')
    time.sleep(4)
    productlist_length = len(driver.find_elements_by_xpath("//div[@class='accordin_title']"))
    for i in range(1, productlist_length + 1):
        product = driver.find_element_by_xpath("(//div[@class='accordin_title'])[" + str(i) + "]")
        title = product.find_element_by_xpath('.//h4').text.strip()
        print(title)
        buttonToClick = product.find_element_by_xpath('.//div[@class="sign"]')
        buttonToClick.click()
        time.sleep(5)
        subProduct = driver.find_element_by_xpath(".//li[@class='sub_accordin_presentation']")
        otherTitle = subProduct.find_element_by_xpath('.//h4').text.strip()
        print(otherTitle)
You don't need Selenium at all. I'm not sure exactly what info you are after, but the following shows that the content, including what's inside those expand blocks, is available in the response from a simple requests.get():
import requests
from bs4 import BeautifulSoup as bs
import re
r = requests.get('https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=1')
soup = bs(r.text, 'lxml')
sessions = soup.select('#accordin > ul > li')
for session in sessions:
    print(session.select_one('h4').text)
    sub_session = session.select('.sub_accordin_presentation')
    if sub_session:
        print([re.sub(r'[\n\s]+', ' ', i.text) for i in sub_session])
    print()
    print()
Try:
productlist = driver.find_elements_by_xpath('//*[@class="jscroll-inner"]/ul/li')
for product in productlist:
    title = product.find_element_by_xpath('(.//*[@class="accordin_title"]/div)[3]/h4').text
I have several URLs which link to hotel pages, and I would like to scrape some data from them.
I'm currently using the following script, but I would like to update it:
data = []
for i in range(0, 10):
    url = final_list[i]
    driver2 = webdriver.Chrome()
    driver2.get(url)
    sleep(randint(10, 20))
    soup = BeautifulSoup(driver2.page_source, 'html.parser')
    my_table2 = soup.find_all(class_=['title-2', 'rating-score body-3'])
    review = soup.find_all(class_='reviews')[-1]
    try:
        price = soup.find_all('span', attrs={'class':'price'})[-1]
    except:
        price = soup.find_all('span', attrs={'class':'price'})
    for tag in my_table2:
        data.append(tag.text.strip())
    for p in price:
        data.append(p)
    for r in review:
        data.append(r)
But here's the problem: tag.text.strip() scrapes the rating numbers like this:
It splits each rating number into its own value, but not all hotels have the same amount of ratings. The default number is 8, but some hotels have seven ratings, others six, and so on. So in the end my dataframe is quite messed up: if a hotel doesn't have 8 ratings, the values are shifted.
My question is: how do I tell the script "if there is a value for this rating, put the value, and if there isn't, put None", and do that for all eight values?
I tried several things like:
for tag in my_table2:
    for i in tag.text.strip()[i]:
        if i:
            data.append(i)
        else:
            data.append(None)
But unfortunately, that goes nowhere, so if you could help me figure out the answer, it would be awesome :)
In case it helps, here is a link to one of the hotel pages I'm scraping:
https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1
The number ratings are at the end.
Thank you.
A few suggestions:
Put your data in a dictionary. Then you don't have to assume that all tags are present, and the order of the tags doesn't matter. You can get the labels and the corresponding ratings with
rating_labels = soup.find_all(class_=['rating-label body-3'])
rating_scores = soup.find_all(class_=['rating-score body-3'])
and then iterate over both lists with zip.
Move your driver outside of the loop; opening it once is enough.
Don't use sleep; use Selenium's wait functions instead. You can wait for a particular element to be present or populated with WebDriverWait(driver, 10).until(EC.presence_of_element_located(your_element)) (a short sketch of this appears after the example below):
https://selenium-python.readthedocs.io/waits.html
Cache the scraped HTML to a file. It's faster for you and politer to the website you are scraping:
import selenium
import selenium.webdriver
import time
import random
import os
from bs4 import BeautifulSoup

data = []
final_list = [
    'https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1',
    'https://www.hostelworld.com/pwa/hosteldetails.php/Be-Ramblas-Hostel/Barcelona/435?from=2020-11-27&to=2020-11-28&guests=1'
]

# load your driver only once to save time
driver = selenium.webdriver.Chrome()

for url in final_list:
    data.append({})
    # cache the HTML code to the filesystem
    # generate a filename from the URL where all non-alphanumeric characters (e.g. :/) are replaced with underscores _
    filename = ''.join([s if s.isalnum() else '_' for s in url])
    if not os.path.isfile(filename):
        driver.get(url)
        # better use selenium's wait functions here
        time.sleep(random.randint(10, 20))
        source = driver.page_source
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(source)
    else:
        with open(filename, 'r', encoding='utf-8') as f:
            source = f.read()
    soup = BeautifulSoup(source, 'html.parser')
    review = soup.find_all(class_='reviews')[-1]
    try:
        price = soup.find_all('span', attrs={'class':'price'})[-1]
    except:
        price = soup.find_all('span', attrs={'class':'price'})
    data[-1]['name'] = soup.find_all(class_=['title-2'])[0].text.strip()
    rating_labels = soup.find_all(class_=['rating-label body-3'])
    rating_scores = soup.find_all(class_=['rating-score body-3'])
    assert len(rating_labels) == len(rating_scores)
    for label, score in zip(rating_labels, rating_scores):
        data[-1][label.text.strip()] = score.text.strip()
    data[-1]['price'] = price.text.strip()
    data[-1]['review'] = review.text.strip()
The data can then be easily put in a nicely formatted table using Pandas
import pandas as pd
df = pd.DataFrame(data)
df
If some data is missing or incomplete, Pandas will fill it in with NaN:
data.append(data[0].copy())
del(data[-1]['Staff'])
data[-1]['name'] = 'Incomplete Hostel'
pd.DataFrame(data)
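And the wait suggestion from the list above, as a small sketch (hedged: it assumes that waiting for the element with class 'reviews', the same class the scrape reads, is a reasonable proxy for the page being fully rendered):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get(url)
# Wait up to 20 s for the reviews block instead of sleeping a random 10-20 s.
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'reviews'))
)
source = driver.page_source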
I've parsed a list of href links and their titles from a webpage. I want to click all the links whose title doesn't contain "[$]". Here is my code.
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import webbrowser
from selenium import webdriver
import urllib.request
import time
from bs4 import BeautifulSoup
import re
browser = webdriver.Chrome(r"C:\Users\vasanth\Downloads\Compressed\chromedriver.exe")
browser.get("http://englishworldwide.ning.com/events/event/listUpcoming")
tip = browser.page_source
soup = BeautifulSoup(tip, 'html.parser')
link = soup.find_all('div', {'class': "wrap xg_lightborder"})
for dept in link:
    lilly = dept.find_all('ul', {'class': 'clist'})
    for h3 in lilly:
        sill = h3.find_all('li')
        for sec in sill:
            tap = sec.find_all('div', {'class': 'tb'})
            for lip in tap:
                tappy = lip.find_all('h3')
                for lips in tappy:
                    tom = lips.find_all('a')
                    for pos, lee in enumerate(tom):
                        sappy = lee.get('href')
                        result = re.sub(r'<.*?>', "", str(lee))
                        print(result)
                        print(sappy)
Here is my output. I want to click all those links which don't have "[$]" in their title.
C:\Users\vasanth\AppData\Local\Programs\Python\Python35-32\python.exe C:/Users/vasanth/PycharmProjects/Youtube/jill.py
LEWWWP's round the clock Google+ Hangout Club!
http://englishworldwide.ning.com/events/lewwwp-s-24-7-google-hangout-club
Weekly Wednesday LEWWWP Site Text Chat
http://englishworldwide.ning.com/events/weekly-wednesday-lewwwp-site-text-chat-952
Improve your speaking fluency [$] faster-paced
http://englishworldwide.ning.com/events/improve-your-speaking-fluency-faster-paced-45
Exam Prep speaking practice [$] Answer, Discuss, Repeat
http://englishworldwide.ning.com/events/exam-prep-speaking-practice-answer-discuss-repeat-29
Transcription / Pronunciation class [SLOWER-paced / Novice level]
http://englishworldwide.ning.com/events/transcription-pronunciation-class-395
Process finished with exit code 0
EDIT 1:
I have taken another step forward and found a way to select the links that don't have "[$]" in their title, but I can't open those links by their position; the following method doesn't open the specific links.
Here is the rest of my code...
tricky = BeautifulSoup(str(tom), 'html.parser')
href_links = lambda tag: (getattr(tag, 'name', None) == 'a' and not '$' in tag.get_text())
for pos, final in enumerate(tricky.find_all(href_links)):
    simmpy = final.get('href')
    print(simmpy)
    if pos == 2:
        webbrowser.open(simmpy)
    else:
        break
Just check whether the link's title contains a dollar sign:
s = "This be a string"
if s.find("$") == -1:
print "$ not found"
else:
print "Found $ in string"