So I have this code where I extract the number of goals scored from a certain website. I want to loop through it so that I can get the information for every player, for example: https://www.futbin.com/20/player/1, then https://www.futbin.com/20/player/2, then https://www.futbin.com/20/player/3, up to 5000. How would I go about that? Here is my code for how I get the goals information.
import requests
from bs4 import BeautifulSoup

url = 'https://www.futbin.com/20/player/143/cristiano-ronaldo'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
googclose = soup.find_all(class_='ps4-pgp-data')
hi=soup.find_all('div', attrs={'class':'ps4-pgp-data'})[4]
all_goals = []
for i in range(1, 51):
    url = 'https://www.futbin.com/20/player/{}'.format(i)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    goal = soup.find_all('div', attrs={'class': 'ps4-pgp-data'})[4].text.strip()
    all_goals.append(goal)
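If the goal is to cover all 5000 player pages, a minimal sketch along the same lines (assuming the fifth ps4-pgp-data div holds the goals on every player page, which may not hold for players with missing stats) could skip pages where the element is absent and pause between requests:

import time

import requests
from bs4 import BeautifulSoup

all_goals = []
for i in range(1, 5001):
    url = 'https://www.futbin.com/20/player/{}'.format(i)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    data = soup.find_all('div', attrs={'class': 'ps4-pgp-data'})
    # Some player ids may not exist or may not expose a fifth stats block; skip those.
    if len(data) > 4:
        all_goals.append(data[4].text.strip())
    time.sleep(1)  # small pause between requests to be polite to the site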
https://mor.nlm.nih.gov/RxNav/search?searchBy=NDC&searchTerm=51079045120
How can I get the RXCUI number from this website in Python? I am unable to get it with the code below:
content = driver.page_source
soup = BeautifulSoup(content, 'html.parser')
cont = soup.findAll("div", {"id": "titleHolder"})
rx = cont.find("span", id = "rxcuiDecoration")
print(rx.text)
The site renders using JavaScript, so you have to use the API instead:
import requests
r = requests.get('https://rxnav.nlm.nih.gov/REST/ndcstatus.json?caller=RxNav&ndc=51079045120')
print(r.json()['ndcStatus']['rxcui'])
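If you need to look up several NDCs, a small wrapper around the same endpoint (just a sketch; the function name and the error handling are my own additions) keeps the call reusable:

import requests

def ndc_to_rxcui(ndc):
    # Query RxNav's NDC status endpoint and pull the RXCUI out of the JSON response.
    r = requests.get('https://rxnav.nlm.nih.gov/REST/ndcstatus.json',
                     params={'caller': 'RxNav', 'ndc': ndc})
    r.raise_for_status()
    return r.json()['ndcStatus']['rxcui']

print(ndc_to_rxcui('51079045120'))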
import requests
from bs4 import BeautifulSoup
url = 'https://www.officialcharts.com/charts/singles-chart'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
for link in soup.find_all('a'):
    print(link.get('href'))
def chart_spider(max_pages):
    page = 1
    while page >= max_pages:
        url = "https://www.officialcharts.com/charts/singles-chart"
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {"class": "title"}):
            href = "BAD HABITS" + link.title(href)
            print(href)
        page += 1

chart_spider(1)
I'm wondering how to make this print just the titles of the songs instead of the entire page. For now, I want it to go through the Top 100 chart and print all the titles. Thanks.
Here is a possible solution, which modifies your code as little as possible:
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
URL = 'https://www.officialcharts.com/charts/singles-chart'
def chart_spider():
    source_code = requests.get(URL)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for title in soup.find_all('div', {"class": "title"}):
        print(title.contents[1].string)

chart_spider()
The result is a list of all the titles found in the page, one per line.
If all you want is the titles for each song on the top 100,
this code:
import requests
from bs4 import BeautifulSoup
url='https://www.officialcharts.com/charts/singles-chart/'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
titles = [i.text.replace('\n', '') for i in soup.find_all('div', class_="title")]
does what you are looking for.
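To display them, one option (just a usage sketch) is to print the list together with the chart positions:

for position, title in enumerate(titles, start=1):
    print(position, title)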
You can do it like this.
The song title is present inside a <div> tag with the class name title.
Select all those <div> tags with .find_all(). This gives you a list of all matching <div> tags.
Iterate over the list and print the text of each div.
from bs4 import BeautifulSoup
import requests
url = 'https://www.officialcharts.com/charts/singles-chart/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
d = soup.find_all('div', class_='title')
for i in d:
    print(i.text.strip())
Sample Output:
BAD HABITS
STAY
REMEMBER
BLACK MAGIC
VISITING HOURS
HAPPIER THAN EVER
INDUSTRY BABY
WASTED
.
.
.
I am trying to use BeautifulSoup to scrape resumes on Indeed, but I have run into some problems.
Here is the sample site: https://www.indeed.com/resumes?q=java&l=&cb=jt
Here is my code:
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')
def scrape_job_title(soup):
job = []
for div in soup.find_all(name='li', attrs={'class':'sre'}):
for a in div.find_all(name='a', attrs={'class':'app-link'}):
job.append(a['title'])
return(job)
scrape_job_title(soup)
It prints out nothing: []
I want to grab the job title "Java developer".
The class is app_link, not app-link. Additionally, a['title'] doesn't do what you want. Use a.contents[0] instead.
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')
def scrape_job_title(soup):
job = []
for div in soup.find_all(name='li', attrs={'class':'sre'}):
for a in div.find_all(name='a', attrs={'class':'app_link'}):
job.append(a.contents[0])
return(job)
scrape_job_title(soup)
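To actually see the result, a quick usage sketch is to print the returned list:

jobs = scrape_job_title(soup)
print(jobs)  # e.g. ['Java developer', ...] if the page contains resume cards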
Try this to get all the job titles:
import requests
from bs4 import BeautifulSoup
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html5lib')
for items in soup.select('.sre'):
    data = [item.text for item in items.select('.app_link')]
    print(data)
When I run this code, it gives me an empty bracket. I'm new to web scraping, so I don't know what I'm doing wrong.
import requests
from bs4 import BeautifulSoup
url = 'https://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=laptop'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
container = soup.findAll('li', {'class': 's-result-item celwidget '})
#btw the space is also there in the html code
print(container)
results:
[]
What I tried is to grab the HTML from the site and parse the li tags where all the information is stored, so I can print out all the information in a for loop.
Also, if someone wants to explain how to use BeautifulSoup, we can always talk.
Thank you guys.
So working code that grabs the product and price could look something like this (note the User-Agent header, which the original request was missing):
import requests
from bs4 import BeautifulSoup
url = 'https://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=laptop'
r = requests.get(url, headers={'User-Agent': 'Mozilla Firefox'})
soup = BeautifulSoup(r.text, 'html.parser')
container = soup.findAll('li', {'class': 's-result-item celwidget '})
for cont in container:
    h2 = cont.h2.text.strip()
    # Amazon lists prices in two ways. If one fails, use the other
    try:
        currency = cont.find('sup', {'class': 'sx-price-currency'}).text.strip()
        price = currency + cont.find('span', {'class': 'sx-price-whole'}).text.strip()
    except:
        price = cont.find('span', {'class': 'a-size-base a-color-base'})
    print('Product: {}, Price: {}'.format(h2, price))
Let me know if that helps you further...
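Amazon also blocks scripted requests fairly often even with a User-Agent set, so it can help to check the response before parsing. This is just a defensive sketch, not part of the original answer:

r = requests.get(url, headers={'User-Agent': 'Mozilla Firefox'})
if r.status_code != 200:
    # Amazon often serves a captcha or an error page instead of results when it blocks scraping.
    raise SystemExit('Request blocked or failed: HTTP {}'.format(r.status_code))
soup = BeautifulSoup(r.text, 'html.parser')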
I am trying to get all the unique URLs of a website by calling the all_pages function recursively, but this function is not giving me all the URLs of the website.
All I want to do is get all the unique URLs of the website using BeautifulSoup. My code looks like this:
base_url = "http://www.readings.com.pk/"
unique_urls=[]
def all_pages(base_url,unique_urls=[]):
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a"):
url = link["href"]
absolute_url = urljoin(base_url, url)
if absolute_url not in unique_urls:
if base_url in absolute_url:
unique_urls.append(absolute_url)
print (absolute_url)
all_pages(absolute_url,unique_urls,book_urls)
all_pages(base_url,unique_urls)
Use response.text instead of response.content.
Also, you need to return at some point. Additionally, instead of making unique_urls a list, make it a set so the entries are always unique.
Finally, your function is recursive and Python has a maximum recursion depth, so maybe you should do this iteratively instead:
base_url = "http://www.readings.com.pk/"
def all_pages(base_url):
response = requests.get(base_url)
unique_urls = {base_url}
visited_urls = set()
while len(unique_urls) > len(visited_urls)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a"):
try:
url = link["href"]
except:
continue
absolute_url = base_url + url
unique_urls.add(absolute_url)
unvisited_url = (unique_urls - visited_urls).pop()
visited_urls.add(unvisited_url)
response = requests.get(unvisited_url)
return unique_urls
all_pages(base_url)
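Since the function now returns the set of URLs, a quick usage sketch (the variable name is just for illustration) is to capture the result and inspect it:

urls = all_pages(base_url)
print('Found {} unique URLs'.format(len(urls)))
for u in sorted(urls):
    print(u)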