How to get a list of members using BeautifulSoup - Python

I tried to do this:
import requests
from bs4 import BeautifulSoup

# browser: an existing Selenium WebDriver session (assumed)
URL = str(browser.current_url)
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
imena = soup.findAll('a', class_='text-headline')
imena

Assuming the URL is for the Members tab of the Strava club Россия 2021, i.e. https://www.strava.com/clubs/236545/members, the following should work to get all of the members across the 193 pages (you really should be using the Strava API for this...):
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
BASE_URL = "https://www.strava.com/clubs/236545/members?page="
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(f"{BASE_URL}{1}")
driver.find_element_by_id("email").send_keys("<your-email>")
driver.find_element_by_id("password").send_keys("<your-password>")
driver.find_element_by_id("login-button").click()
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
num_pages = int(soup.find("ul", "pagination").find_all("li")[-2].text)
# Ignore the admins shown on each page
athletes = soup.find_all("ul", {"class": "list-athletes"})[1]
members = [
    avatar.attrs['title']
    for avatar in athletes.find_all("div", {"class": "avatar"})
    if 'title' in avatar.attrs
]

for page in range(2, num_pages + 1):
    time.sleep(1)
    driver.get(f"{BASE_URL}{page}")
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    athletes = soup.find_all("ul", {"class": "list-athletes"})[1]
    for avatar in athletes.find_all("div", {"class": "avatar"}):
        if 'title' in avatar.attrs:
            members.append(avatar.attrs['title'])

# Print first 10 members
print('\n'.join(m.strip() for m in members[:10]))

driver.close()
Output (first 10 members):
- Victor Koldaev - ♥LCHF Runners♥
Antonio Raposo ®️
Vadim Issin
"DuSenna🇧🇷 Vá com Garra e a Felicidade te Agarra 😉
#MIX MIX
#RunВасяRun ...
$ерЖ 🇷🇺 КЛИМoff
'Luis Fernando Osorio' MTB
( CE )Faisal ALShammary "حائل $الشرقية "
(# Monique #) bermudez
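For completeness, the same list is exposed by the official Strava API, which avoids the Selenium login and HTML parsing entirely. Here is a rough sketch using the v3 club-members endpoint; the access token is a placeholder you would obtain through Strava's OAuth flow:
import requests

ACCESS_TOKEN = "<your-access-token>"  # placeholder: OAuth token from your Strava API application
CLUB_ID = 236545

members = []
page = 1
while True:
    # GET /api/v3/clubs/{id}/members returns a paginated list of summary athletes
    resp = requests.get(
        f"https://www.strava.com/api/v3/clubs/{CLUB_ID}/members",
        headers={"Authorization": f"Bearer {ACCESS_TOKEN}"},
        params={"page": page, "per_page": 200},
    )
    resp.raise_for_status()
    batch = resp.json()
    if not batch:
        break
    members.extend(f"{m['firstname']} {m['lastname']}".strip() for m in batch)
    page += 1

print('\n'.join(members[:10]))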

You need to first find the "text-headline" div elements and then loop through each of them to get the anchor links.
URL = str(browser.current_url)
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
members = soup.findAll('div', {'class': 'text-headline'})
for member in members:
    name = member.find("a")
    print(name.get_text())

Related

How do I make this web crawler print only the titles of the songs?

import requests
from bs4 import BeautifulSoup
url = 'https://www.officialcharts.com/charts/singles-chart'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
for link in soup.find_all('a'):
    print(link.get('href'))

def chart_spider(max_pages):
    page = 1
    while page >= max_pages:
        url = "https://www.officialcharts.com/charts/singles-chart"
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {"class": "title"}):
            href = "BAD HABITS" + link.title(href)
            print(href)
        page += 1

chart_spider(1)
I'm wondering how to make this print just the titles of the songs instead of the entire page. I want it to go through the Top 100 chart and print all the titles for now. Thanks.
Here's a possible solution, which modifies your code as little as possible:
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
URL = 'https://www.officialcharts.com/charts/singles-chart'
def chart_spider():
    source_code = requests.get(URL)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for title in soup.find_all('div', {"class": "title"}):
        print(title.contents[1].string)

chart_spider()
The result is a list of all the titles found in the page, one per line.
If all you want is the title of each song in the Top 100, this code:
import requests
from bs4 import BeautifulSoup
url='https://www.officialcharts.com/charts/singles-chart/'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
titles = [i.text.replace('\n', '') for i in soup.find_all('div', class_="title")]
does what you are looking for.
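For example, to check the result you can print the first few entries:
print('\n'.join(titles[:10]))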
You can do it like this.
The song title is present inside a <div> tag with the class name title.
Select all those <div> tags with .find_all(). This gives you a list of all matching <div> tags.
Iterate over the list and print the text of each div.
from bs4 import BeautifulSoup
import requests
url = 'https://www.officialcharts.com/charts/singles-chart/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
d = soup.find_all('div', class_='title')
for i in d:
    print(i.text.strip())
Sample Output:
BAD HABITS
STAY
REMEMBER
BLACK MAGIC
VISITING HOURS
HAPPIER THAN EVER
INDUSTRY BABY
WASTED
.
.
.

Web Scraping with Python - blank return

I'm trying to scrape reviews from TrustPilot, but the code always returns blank sheets containing only the headers/categories I specified. Could someone help me with this?
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome()
names = []    # List to store the reviewer names
headers = []  # List to store the review titles
bodies = []   # List to store the review text
ratings = []  # List to store the star ratings
dates = []    # List to store the review dates
#driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = driver.page_source
soup = BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('a'))
for a in soup.findAll('a', href=True, attrs={'class':'reviews-container'}):
    name = a.find('div', attrs={'class':'consumer-information_name'})
    header = a.find('div', attrs={'class':'review-content_title'})
    body = a.find('div', attrs={'class':'review-content_text'})
    rating = a.find('div', attrs={'class':'star-rating star-rating--medium'})
    date = a.find('div', attrs={'class':'review-date--tooltip-target'})
    names.append(name.text)
    headers.append(header.text)
    bodies.append(body.text)
    ratings.append(rating.text)
    dates.append(date.text)
print ('webpage, no errors')
df = pd.DataFrame({'User Name':names,'Header':headers,'Body':bodies,'Rating':ratings,'Date':dates})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')
print ('csv made')
The issue is that soup.findAll('a', href=True, attrs={'class':'reviews-container'}) finds no results, so there are 0 iterations in the loop. Make sure you are using the correct tags and class names. Also, you don't need an explicit loop, because BeautifulSoup's find_all method already returns every match. I used the requests module to open the web page, though that shouldn't make a difference.
from bs4 import BeautifulSoup
import requests
req = requests.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = req.content
soup = BeautifulSoup(content, "lxml")
names = soup.find_all('div', attrs={'class': 'consumer-information__name'})
headers = soup.find_all('h2', attrs={'class':'review-content__title'})
bodies = soup.find_all('p', attrs={'class':'review-content__text'})
ratings = soup.find_all('div', attrs={'class':'star-rating star-rating--medium'})
dates = soup.find_all('div', attrs={'class':'review-content-header__dates'})
And now each list has 20 entries.
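From there you can rebuild the DataFrame and CSV from your original code. A rough sketch, assuming the five lists stay aligned one-to-one per review (note that the star rating may be stored in an image attribute rather than in element text, so that column may need extra handling):
import pandas as pd

df = pd.DataFrame({
    'User Name': [n.text.strip() for n in names],
    'Header': [h.text.strip() for h in headers],
    'Body': [b.text.strip() for b in bodies],
    'Rating': [r.text.strip() for r in ratings],
    'Date': [d.text.strip() for d in dates],
})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')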

Following links with a second request - Web crawler

Please bear with me. I am quite new to Python, but I'm having a lot of fun. I am trying to code a web crawler that crawls through results from a travel website. I have managed to extract all the relevant links from the main page, and now I want Python to follow each of the links and gather the pieces of information from each of those pages. But I am stuck. I hope you can give me a hint.
Here is my code:
import requests
from bs4 import BeautifulSoup
import urllib, collections
Spider = 1

def trade_spider(max_pages):
    RegionIDArray = {737: "London"}
    for reg in RegionIDArray:
        page = -1
        r = requests.get("https://www.viatorcom.de/London/d" + str(reg) + "&page=" + str(page), verify=False)
        soup = BeautifulSoup(r.content, "lxml")
        g_data = soup.find_all("h2", {"class": "mtm mbn card-title"})
        for item in g_data:
            Deeplink = item.find_all("a")
            for t in set(t.get("href") for t in Deeplink):
                Deeplink_final = t
                print(Deeplink_final)  # The output shows all the links that I would like to follow and gather information from.

trade_spider(1)
Output:
/de/7132/London-attractions/Stonehenge/d737-a113
/de/7132/London-attractions/Tower-of-London/d737-a93
/de/7132/London-attractions/London-Eye/d737-a1400
/de/7132/London-attractions/Thames-River/d737-a1410
The output shows all the links that I would like to follow and gather information from.
Next step in my code:
import requests
from bs4 import BeautifulSoup
import urllib, collections
Spider = 1

def trade_spider(max_pages):
    RegionIDArray = {737: "London"}
    for reg in RegionIDArray:
        page = -1
        r = requests.get("https://www.viatorcom.de/London/d" + str(reg) + "&page=" + str(page), verify=False)
        soup = BeautifulSoup(r.content, "lxml")
        g_data = soup.find_all("h2", {"class": "mtm mbn card-title"})
        for item in g_data:
            Deeplink = item.find_all("a")
            for t in set(t.get("href") for t in Deeplink):
                Deeplink_final = t

trade_spider(1)

def trade_spider2(max_pages):
    r = requests.get("https://www.viatorcom.de" + Deeplink_final, verify=False)
    soup = BeautifulSoup(r.content, "lxml")
    print(soup)

trade_spider2(9)
I would like to pass the initially crawled output to my second request, but this doesn't work. I hope you can give me a hint.
This should help.
import requests
from bs4 import BeautifulSoup
import urllib, collections
Spider = 1

def trade_spider2(Deeplink_final):
    r = requests.get("https://www.viatorcom.de" + Deeplink_final, verify=False)
    soup = BeautifulSoup(r.content, "lxml")
    print(soup)

def trade_spider(max_pages):
    RegionIDArray = {737: "London"}
    for reg in RegionIDArray:
        page = -1
        r = requests.get("https://www.viatorcom.de/London/d" + str(reg) + "&page=" + str(page), verify=False)
        soup = BeautifulSoup(r.content, "lxml")
        g_data = soup.find_all("h2", {"class": "mtm mbn card-title"})
        for item in g_data:
            Deeplink = item.find_all("a")
            for Deeplink_final in set(t.get("href") for t in Deeplink):
                trade_spider2(Deeplink_final)

trade_spider(1)
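If you want trade_spider2 to pull specific fields rather than print the whole document, parse the detail page the same way you parsed the listing page. A small illustrative sketch that only grabs the page <title> (swap in whatever tags and classes the attraction pages actually use):
def trade_spider2(Deeplink_final):
    r = requests.get("https://www.viatorcom.de" + Deeplink_final, verify=False)
    soup = BeautifulSoup(r.content, "lxml")
    # Illustrative only: print the document title instead of the full soup
    title = soup.title.string.strip() if soup.title and soup.title.string else Deeplink_final
    print(Deeplink_final, "->", title)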

Using beautiful soup to scrape data from indeed

I am trying to use BeautifulSoup to scrape resumes on Indeed, but I ran into some problems.
Here is the sample site: https://www.indeed.com/resumes?q=java&l=&cb=jt
Here is my code:
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')
def scrape_job_title(soup):
job = []
for div in soup.find_all(name='li', attrs={'class':'sre'}):
for a in div.find_all(name='a', attrs={'class':'app-link'}):
job.append(a['title'])
return(job)
scrape_job_title(soup)
It prints out nothing: []
As you can see in the picture, I want to grab the job title "Java developer".
The class is app_link, not app-link. Additionally, a['title'] doesn't do what you want. Use a.contents[0] instead.
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')
def scrape_job_title(soup):
job = []
for div in soup.find_all(name='li', attrs={'class':'sre'}):
for a in div.find_all(name='a', attrs={'class':'app_link'}):
job.append(a.contents[0])
return(job)
scrape_job_title(soup)
Try this to get all the job titles:
import requests
from bs4 import BeautifulSoup
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html5lib')
for items in soup.select('.sre'):
    data = [item.text for item in items.select('.app_link')]
    print(data)

No output in console python

from bs4 import BeautifulSoup
import requests
def imdb_spider():
    url = 'http://www.imdb.com/chart/top'
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for link in soup.findAll('a', {'class': 'secondaryInfo'}):
        href = link.get('href')
        print(href)

imdb_spider()
I'm trying to get the links of all top-rated movies from IMDb. I'm using PyCharm. The code runs for more than 30 minutes, but I'm not getting any output in my console.
You're correct that there's an element with class secondaryInfo for every movie title, but that's not the a element. If you want to find that, you have to use a different selector. For example, the following selector will do the trick instead of using soup.findAll().
soup.select('td.titleColumn a')
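Put together, a complete version of the spider using that selector could look like this (a sketch; the chart markup may have changed since this was written):
import requests
from bs4 import BeautifulSoup

def imdb_spider():
    url = 'http://www.imdb.com/chart/top'
    plain_text = requests.get(url).text
    soup = BeautifulSoup(plain_text, 'html.parser')
    # Each row of the chart has a td.titleColumn cell whose <a> links to the movie page
    for link in soup.select('td.titleColumn a'):
        print(link.get('href'))

imdb_spider()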
The problem is that the class secondaryInfo belongs to a <span> element, not to the <a> element.
So try this:
from bs4 import BeautifulSoup
import requests
def imdb_spider():
    url = 'http://www.imdb.com/chart/top'
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "lxml")
    for td in soup.findAll('td', {'class': 'titleColumn'}):
        href = td.find('a').get('href')
        print(href)

imdb_spider()
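The href values on the chart are relative paths, so if you need absolute links you can join them against the site root, e.g. with urllib.parse.urljoin:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

soup = BeautifulSoup(requests.get('http://www.imdb.com/chart/top').text, "lxml")
for td in soup.findAll('td', {'class': 'titleColumn'}):
    # Join the relative href against the site root to get a full URL
    print(urljoin('http://www.imdb.com', td.find('a').get('href')))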
