How to get next page link in python beautifulsoup?

How to get next page link in python beautifulsoup? - python

I have this link:
http://www.brothersoft.com/windows/categories.html
I am trying to to get the link for the item inside the div.
Example:
http://www.brothersoft.com/windows/mp3_audio/midi_tools/
I have tried this code:
import urllib
from bs4 import BeautifulSoup
url = 'http://www.brothersoft.com/windows/categories.html'
pageHtml = urllib.urlopen(url).read()
soup = BeautifulSoup(pageHtml)
sAll = [div.find('a') for div in soup.findAll('div', attrs={'class':'brLeft'})]
for i in sAll:
print "http://www.brothersoft.com"+i['href']
But I only get output:
http://www.brothersoft.com/windows/mp3_audio/
How can I get output that I needed?

Url http://www.brothersoft.com/windows/mp3_audio/midi_tools/ is not in tag <div class='brLeft'>, so if output is http://www.brothersoft.com/windows/mp3_audio/, that's correct.
If you want to get the url you want, change
sAll = [div.find('a') for div in soup.findAll('div', attrs={'class':'brLeft'})]
to
sAll = [div.find('a') for div in soup.findAll('div', attrs={'class':'brRight'})]
UPDATE:
an example to get info inside 'midi_tools'
import urllib
from bs4 import BeautifulSoup
url = 'http://www.brothersoft.com/windows/categories.html'
pageHtml = urllib.urlopen(url).read()
soup = BeautifulSoup(pageHtml)
sAll = [div.find('a') for div in soup.findAll('div', attrs={'class':'brRight'})]
for i in sAll:
suburl = "http://www.brothersoft.com"+i['href'] #which is a url like 'midi_tools'
content = urllib.urlopen(suburl).read()
anosoup = BeautifulSoup(content)
ablock = anosoup.find('table',{'id':'courseTab'})
for atr in ablock.findAll('tr',{'class':'border_bot '}):
print atr.find('dt').a.string #name
print "http://www.brothersoft.com" + atr.find('a',{'class':'tabDownload'})['href'] #link

Related

How do I make this web crawler print only the titles of the songs?

import requests
from bs4 import BeautifulSoup
url = 'https://www.officialcharts.com/charts/singles-chart'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
for link in soup.find_all('a'):
print(link.get('href'))
def chart_spider(max_pages):
page = 1
while page >= max_pages:
url = "https://www.officialcharts.com/charts/singles-chart"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('a', {"class": "title"}):
href = "BAD HABITS" + link.title(href)
print(href)
page += 1
chart_spider(1)
Wondering how to make this print just the titles of the songs instead of the entire page. I want it to go through the top 100 charts and print all the titles for now. Thanks

Here's is a possible solution, which modify your code as little as possible:
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
URL = 'https://www.officialcharts.com/charts/singles-chart'
def chart_spider():
source_code = requests.get(URL)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for title in soup.find_all('div', {"class": "title"}):
print(title.contents[1].string)
chart_spider()
The result is a list of all the titles found in the page, one per line.

If all you want is the titles for each song on the top 100,
this code:
import requests
from bs4 import BeautifulSoup
url='https://www.officialcharts.com/charts/singles-chart/'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
titles = [i.text.replace('\n', '') for i in soup.find_all('div', class_="title")]
does what you are looking for.

You can do like this.
The Song title is present inside a <div> tag with class name as title.
Select all those <div> with .find_all(). This gives you a list of all <div> tags.
Iterate over the list and print the text of each div.
from bs4 import BeautifulSoup
import requests
url = 'https://www.officialcharts.com/charts/singles-chart/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
d = soup.find_all('div', class_='title')
for i in d:
print(i.text.strip())
Sample Output:
BAD HABITS
STAY
REMEMBER
BLACK MAGIC
VISITING HOURS
HAPPIER THAN EVER
INDUSTRY BABY
WASTED
.
.
.

How to get multiple class to BeautifulSoup?

Trying to get torrent links from skidrowreloaded.
On the post detail page we have a div like this, I tried get this div by id but i think id is dynamic so I tried get this div by class but did not work,
<div id="tabs-105235-0-0" aria-labelledby="ui-id-1" class="ui-tabs-panel ui-widget-content ui-corner-bottom" role="tabpanel" aria-hidden="false">
the following code is returning none
source2 = source.find("div", {"class": "ui-tabs-panel ui-widget-content ui-corner-bottom"})
err:
AttributeError: 'NoneType' object has no attribute 'find_all'
full code:
import os
from bs4 import BeautifulSoup
import requests
import webbrowser
clear = lambda: os.system('cls')
clear()
r = requests.get('https://www.skidrowreloaded.com/')
source = BeautifulSoup(r.content,"lxml")
source2 = source.find_all("h2")
games = []
for i in source2:
games.append(i.a.get("href"))
lastgame = games[0]
r = requests.get(lastgame)
source = BeautifulSoup(r.content,"lxml")
source2 = source.find("div", {"class": "ui-tabs-panel ui-widget-content ui-corner-bottom"})
source3 = source2.find_all("a")
k = 0;
for i in source3:
if k == 0: #hide steam link.
k = k + 1
else:
if i.get("href") == "https://www.skidrowreloaded.com": #hide null links
pass
else: #throwing links to the browser
print(i.get("href"))
webbrowser.open(i.get("href"))
k = k + 1

To get all the links try this:
import requests
from bs4 import BeautifulSoup
url = "https://www.skidrowreloaded.com/projection-first-light-goldberg/"
soup = BeautifulSoup(requests.get(url).text, "html.parser").find_all("a", {"target": "_blank"})
skip = 'https://www.skidrowreloaded.com'
print([a['href'] for a in soup if a['href'].startswith('https') and a['href'] != skip])
Output:
['https://store.steampowered.com/app/726490/Projection_First_Light/', 'https://mega.nz/file/geogAATS#-0U0PklF-Q5i5l_SELzYx3klh5FZob9HaD4QKcFH_8M', 'https://uptobox.com/rqnlpcp7yb3v', 'https://1fichier.com/?0syphwpyndpo38af04ky', 'https://yadi.sk/d/KAmlsBmGaI1f2A', 'https://pixeldra.in/u/wmcsjuhv', 'https://dropapk.to/v6r7mjfgxjq6', 'https://gofile.io/?c=FRWL1o', 'https://racaty.net/dkvdyjqvg02e', 'https://bayfiles.com/L0k7Qea2pb', 'https://tusfiles.com/2q00y4huuv15', 'https://megaup.net/2f0pv/Projection.First.Light-GoldBerg.zip', 'https://letsupload.org/88t5', 'https://filesupload.org/0d7771dfef54d055', 'https://dl.bdupload.in/17ykjrifizrb', 'https://clicknupload.co/o0k9dnd3iwoy', 'https://dailyuploads.net/n1jihwjwdmjp', 'https://userscloud.com/nircdd4q1t5w', 'https://rapidgator.net/file/b6b8f5782c7c2bdb534214342b58ef18', 'https://turbobit.net/m308zh1hdpba.html', 'https://hitfile.net/5OhkcqZ', 'https://filerio.in/0wbvn4md4i91', 'https://mirrorace.org/m/1Fiic', 'https://go4up.com/dl/0ee9f4866312b5/Projection.First.Light-GoldBerg.zip', 'https://katfile.com/w74l823vuyw5/Projection.First.Light-GoldBerg.zip.html', 'https://multiup.org/download/3d355ba18d58234c792da7a872ab4998/Projection.First.Light-GoldBerg.zip', 'https://dl1.indishare.in/hs55pkx4ex82']

You can use find_all as noted in the BeautifulSoup documentation
import requests
from bs4 import BeautifulSoup
response = requests.get("your URL here")
soup = BeautifulSoup(response.text, 'html.parser')
raw_data = soup.find_all("div", class_="ui-tabs-panel ui-widget-content ui-corner-bottom")
# do something with the data
edit -
looking at the response.text, the div exists, but does not have the class you're looking for, hence it returns empty. You can search by using regex like so
import requests, re
from bs4 import BeautifulSoup
response = requests.get("your URL here")
soup = BeautifulSoup(response.text, 'html.parser')
raw_data = soup.find_all("div", id=re.compile("^tabs"))
for ele in raw_data:
a_tag = ele.find("a")
# do something with the a_tag

Scraping websites with BS4

I have this code
import requests
from bs4 import BeautifulSoup
result = requests.get("http://www.cvbankas.lt/")
src = result.content
soup = BeautifulSoup(src, 'lxml')
urls = []
for article_tag in soup.find_all("article"):
a_tag = article_tag.find('a')
urls.append(a_tag.attrs['href'])
div_tag = article_tag.find('span')
urls.append(div_tag.attrs['class'])
print(urls)
Can anyone explane me how to get the data marked in red?

You can get span with the class label "salary_amount"
salary_object = article_tag.find("span", class_= "salary_amount")
and then extract the text with the .text attribute of the created object.

Python: BeautifulSoup extract all the heading text from div class

import requests
from bs4 import BeautifulSoup
res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text,"lxml")
event_containers = soup.find_all('div', class_ = "col-xs-12 col-sm-6 col-md-8")
first_event = event_containers[0]
print(first_event.h3.text)
By using this code i'm able to extract the event name,I'm trying for a way to loop and extract all the event names and dates ? and also i'm trying to extract the location information which is visable after clicking on readmore link

event_containers is a bs4.element.ResultSet object, which is basically a list of Tag objects.
Just loop over the tags in event_containers and select h3 for the title, div.date for the date and a for the URL, example:
for tag in event_containers:
print(tag.h3.text)
print(tag.select_one('div.date').text)
print(tag.a['href'])
Now, for the location information you'll have to visit each URL and collect the text in div.date.
Full code:
import requests
from bs4 import BeautifulSoup
res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text,"lxml")
event_containers = soup.find_all('div', class_ = "col-xs-12 col-sm-6 col-md-8")
base_url = 'http://aicd.companydirectors.com.au'
for tag in event_containers:
link = base_url + tag.a['href']
soup = BeautifulSoup(requests.get(link).text,"lxml")
location = ', '.join(list(soup.select_one('div.event-add').stripped_strings)[1:-1])
print('Title:', tag.h3.text)
print('Date:', tag.select_one('div.date').text)
print('Link:', link)
print('Location:', location)

Try this to get all the events and dates you are after:
import requests
from bs4 import BeautifulSoup
res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text,"lxml")
for item in soup.find_all(class_='lead'):
date = item.find_previous_sibling().text.split(" |")[0]
print(item.text,date)

No output in console python

from bs4 import BeautifulSoup
import requests
def imdb_spider():
url = 'http://www.imdb.com/chart/top'
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for link in soup.findAll('a', {'class': 'secondaryInfo' }):
href = link.get('href')
print(href)
imdb_spider()
I'm trying to get links of all top rated movies from imdb . I'm using pycharm . The code runs for more than 30 mins but i'm not getting any print in my console.

You're correct that there's an element with class secondaryInfo for every movie title, but that's not the a element. If you want to find that, you have to use a different selector. For example, the following selector will do the trick instead of using soup.findAll().
soup.select('td.titleColumn a')

The problem is that {'class': 'secondaryInfo' } is a parameter of <span> object.
So try this:
from bs4 import BeautifulSoup
import requests
def imdb_spider():
url = 'http://www.imdb.com/chart/top'
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "lxml")
for td in soup.findAll('td', {'class': 'titleColumn'}):
href = td.find('a').get('href')
print(href)
imdb_spider()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to get next page link in python beautifulsoup? - python

Related

How do I make this web crawler print only the titles of the songs?

How to get multiple class to BeautifulSoup?

Scraping websites with BS4

Python: BeautifulSoup extract all the heading text from div class

No output in console python

Categories

Resources