How to get all application links on every page? - python

I have this code:
# Python 2 script: fetch one listing page and print every download link found
# in the #dl-tbl-list table.
# NOTE(review): this only ever requests `url` once, so it can only see the
# links on that single page; later pages are served via POST (see below).
import urllib
from bs4 import BeautifulSoup
url = "http://www.padtube.com/Audio-Files-Player/30-01-1-2.html"
pageurl = urllib.urlopen(url)  # Python 2 urllib; returns a file-like response
soup = BeautifulSoup(pageurl)  # no explicit parser: bs4 guesses one
# Select every <a href> inside a <th> cell of the downloads table.
for b in soup.select("table#dl-tbl-list th a[href]"):
    print b['href']
When I run this code, it only gives me the links on the first page.
I can't get the application links on the next pages.

The site uses a POST request to go to the next page, so what you need is to send the page number via POST.
I did this using http://www.python-requests.org/
# Fetch page 2 of the listing by POSTing the page number, then print every
# download link in the #dl-tbl-list table.
from bs4 import BeautifulSoup
import requests

url = "http://www.padtube.com/Audio-Files-Player/30-01-1-2.html"

# The site serves every listing page at the same URL and selects the page
# through a POSTed 'page' form field, so request page 2 explicitly.
# (Change the value, or loop over it, to walk all pages.)
response = requests.post(url, data={'page': 2})

# Explicit parser avoids bs4's "no parser specified" warning; removed the
# unused 'import urllib' and the dead commented-out urlopen line.
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.select("table#dl-tbl-list th a[href]"):
    print(link['href'])

Related

Web scraping several a href

I would like to scrape this page with Python: https://statusinvest.com.br/acoes/proventos/ibovespa.
With this code:
# Scrape the dividend links from the statusinvest listing page.
import requests
from bs4 import BeautifulSoup as bs

URL = "https://statusinvest.com.br/acoes/proventos/ibovespa"
page = 1
# BUG FIX: the original did requests.get(URL + str(page)), producing the
# invalid URL ".../ibovespa1"; the page number is not part of this path.
req = requests.get(URL)
soup = bs(req.text, 'html.parser')
# BUG FIX: attrs must be a dict ({'class': 'list'}); the original passed the
# set {'class', 'list'}, which does not select by class as intended.
container = soup.find('div', attrs={'class': 'list'})
if container is not None:
    # BUG FIX: find('a') returns a single tag (or None); use find_all('a')
    # to collect every anchor, then read each one's href directly.
    for dividend in container.find_all('a'):
        print(dividend.get('href'))
else:
    # NOTE(review): this page populates its table from a JSON blob inside
    # <input id="result">, so the anchors may not exist in the static HTML —
    # see the JSON-based approach below.
    print("container not found")
But it doesn't return anything.
Can someone help me please?
Edited: you can use the updated code below to access any of the data you mentioned in the comment; you can modify it according to your needs, as all the data on that page is inside the `data` variable.
Updated Code:
# Print every dividend record from statusinvest, one tab-separated line per
# row, in three sections (dateCom / datePayment / provisioned).
import json
import requests
from bs4 import BeautifulSoup as bs

# Columns printed for every row.  NOTE(review): 'companyId' and
# 'recentEvents' appear twice here because the original output printed them
# twice; kept as-is to preserve the exact output layout.
_FIELDS = ('code', 'companyName', 'companyNameClean', 'companyId',
           'companyId', 'resultAbsoluteValue', 'dateCom', 'paymentDividend',
           'earningType', 'dy', 'recentEvents', 'recentEvents', 'uRLClear')


def _print_section(title, rows):
    """Print a section header, then one tab-separated line per record."""
    print(title)
    for row in rows:
        print("\t".join(str(row[field]) for field in _FIELDS))


url = "https://statusinvest.com.br"
req = requests.get(f"{url}/acoes/proventos/ibovespa")
soup = bs(req.content, 'html.parser')
# All page data is embedded as JSON in the hidden <input id="result"> element.
data = json.loads(soup.find('input', attrs={'id': 'result'})["value"])

# Removed the unused 'links = []' variable; the three near-identical print
# loops are now one helper call each (output is unchanged).
_print_section("Date Com Data", data["dateCom"])
print()
_print_section("Date Payment Data", data["datePayment"])
print()
_print_section("Provisioned Data", data["provisioned"])
Seeing to the source code of that website one could fetch the json directly and get your desired links follow the below code.
Code:
# Collect the absolute link for every dividend record on the statusinvest
# page and print the full list.
import json
import requests
from bs4 import BeautifulSoup as bs

url = "https://statusinvest.com.br"
response = requests.get(f"{url}/acoes/proventos/ibovespa")
parsed = bs(response.content, 'html.parser')
# The page embeds all of its results as JSON inside <input id="result">.
payload = json.loads(parsed.find('input', attrs={'id': 'result'})["value"])

# Walk the three sections in order and build each absolute URL from the
# record's 'uRLClear' path.
links = [
    f"{url}{entry['uRLClear']}"
    for section in ("dateCom", "datePayment", "provisioned")
    for entry in payload[section]
]
print(links)
Output:
Let me know if you have any questions :)

Getting search results from youtube via bs4 python

I am trying to get the first result from the youtube search query by scraping the page and open the page in the browser! This code does not work! And I am having problems finding the right class, id or selector to get the link.
I have tried to use tags, but some of them return None.
# Search YouTube for a user-supplied query and open the first link found on
# the results page in the default browser.
import webbrowser
import urllib.request
import urllib.parse
import bs4 as bs

iarg = input('Query: ')  # input() already returns str; the str() wrapper was redundant
# BUG FIX: the original formatted with 'arg', an undefined name, instead of
# 'iarg'.  The query is also URL-encoded so spaces and symbols survive.
url = 'https://www.youtube.com/results?search_query={}'.format(
    urllib.parse.quote_plus(iarg))
req = urllib.request.Request(url, data=None, headers={'User-Agent': 'Mozilla'})
src = urllib.request.urlopen(req)
soup = bs.BeautifulSoup(src, 'html.parser')

results = []
# BUG FIX: 'for elem in a = soup.find_all("a")' was a syntax error, and
# elem.get(href) referenced an undefined name — the attribute name is the
# string 'href'.  Anchors without an href yield None and are skipped.
for elem in soup.find_all("a"):
    link = elem.get('href')
    if link is not None:
        results.append(link)

# NOTE(review): YouTube renders its results with JavaScript, so the static
# HTML may contain no result anchors at all — confirm before relying on this.
if results:
    webbrowser.open(results[0])
Thanks in advance!

How to scrape next page data as i do in the first page?

I have the following code:
# Scrape the coingecko listing page, then visit each coin's detail page and
# print the text siblings that follow the first div of its ".py-2" section.
from bs4 import BeautifulSoup
import requests
import csv

url = "https://coingecko.com/en"
base_url = "https://coingecko.com"

# Fetch the listing page once and collect every coin card; the original
# ran the same find_all twice, once per comprehension.
listing = requests.get(url)
soup = BeautifulSoup(listing.content, "html.parser")
coin_divs = soup.find_all("div", attrs={"class": "coin-content center"})
names = [card.a.span.text for card in coin_divs]
Link = [base_url + card.a["href"] for card in coin_divs]

for link in Link:
    detail = BeautifulSoup(requests.get(link).content, "html.parser")
    indent = detail.find("div", attrs={"class": "py-2"})
    # Keep only the siblings that carry an actual string.
    Allcontent = [sib for sib in indent.div.next_siblings
                  if sib.string is not None]
    print(Allcontent)
I have successfully entered the inner pages and grabbed all the coins' information from the first listing page. But there are further pages numbered 1, 2, 3, 4, 5, 6, 7, 8, 9, etc. How can I go to each of the next pages and do the same as before?
Further, the output of my code contains a lot of \n characters and extra spaces. How can I fix that?
You need to generate all the pages and requests one by one and parse using bs4
# Discover the last page number from the pagination widget, then request
# every listing page in turn.
from bs4 import BeautifulSoup
import requests

req = requests.get('https://www.coingecko.com/en')
soup = BeautifulSoup(req.content, 'html.parser')
# The 8th pagination <li> links to the last page, e.g. '...?page=97';
# the page number is the value after the final '='.
last_page = soup.select('ul.pagination li:nth-of-type(8) > a:nth-of-type(1)')[0]['href']
last = int(last_page.split('=')[-1])

# FIX: the original kept a manual 'count' counter alongside an unused loop
# variable; range(1, last + 1) expresses the 1-based page walk directly.
for page in range(1, last + 1):
    url = 'https://www.coingecko.com/en?page=' + str(page)
    print(url)
    requests.get(url)  # request each page one by one up to the last page
    # parse your fields here using bs4
The way you have written your script has got a messy look. Try with .select() to make it concise and less prone to break. Although I could not find the further usage of names in your script, I kept it as it is. Here is how you can get all the available links traversing multiple pages.
# Traverse every listing page by following the rel='next' pagination link;
# on each page, visit every coin's detail page and print its description
# paragraphs with surrounding whitespace stripped.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

url = "https://coingecko.com/en"
while True:
    listing = requests.get(url)
    soup = BeautifulSoup(listing.text, "lxml")
    # Collected for parity with the original script; not used further here.
    names = [span.text for span in soup.select("span.d-lg-block")]
    coin_links = [urljoin(url, anchor["href"])
                  for anchor in soup.select(".coin-content a")]
    for link in coin_links:
        detail = BeautifulSoup(requests.get(link).text, "lxml")
        desc = [p.get_text(strip=True)
                for p in detail.select(".py-2 p") if p.text]
        print(desc)
    # Stop when the pagination widget has no rel='next' anchor (last page).
    next_anchor = soup.select_one(".pagination a[rel='next']")
    if next_anchor is None:
        break
    url = urljoin(url, next_anchor['href'])
Btw, whitespaces have also been taken care of by using .get_text(strip=True)

How to get Html code after crawling with python

https://plus.google.com/s/casasgrandes27%40gmail.com/top
I need to crawl the following page with Python, but I need its rendered HTML, not the generic page source of the link.
For example
Open the link: plus.google.com/s/casasgrandes27%40gmail.com/top without login second last thumbnail will be "G Suite".
<div class="Wbuh5e" jsname="r4nke">G Suite</div>
I am unable to find the above line of HTML-code after executing this python-code.
# Fetch the Google+ search page and print the parsed document.
from bs4 import BeautifulSoup
import requests

# FIX: removed the unused 'L = list()' dead variable from the original.
r = requests.get("https://plus.google.com/s/casasgrandes27%40gmail.com/top")
data = r.text
soup = BeautifulSoup(data, "lxml")
# NOTE(review): elements rendered client-side by JavaScript will not appear
# in this static HTML — confirm whether the missing element is JS-rendered.
print(soup)
To get the soup object try the following
# Fetch the page ('url' is defined by the caller/surrounding context) and
# parse the raw response body with the stdlib 'html.parser' backend.
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
you can try this code to read a HTML page :
# Download a page with urllib and print its HTML as text.
import urllib.request

urls = "https://plus.google.com/s/casasgrandes27%40gmail.com/top"
# The response is closed automatically when the 'with' block exits.
with urllib.request.urlopen(urls) as html_file:
    html_text = html_file.read()
# BUG FIX: str(bytes) produces the repr string "b'...'" with escape
# sequences; decode the bytes to get the actual HTML text.
html_text = html_text.decode('utf-8', errors='replace')
print(html_text)

Not able to extract #document content from an HTML file through Python web scraping

When I inspect the elements on my browser, I can obviously see the exact web content. But when I try to run the below script, I cannot see the some of the web page details. In the web page I see there are "#document" elements and that is missing while I run the script. How can I see the details of #document elements or extract with the script.?
# Python 2 script: fetch one page and pretty-print its parsed HTML.
# NOTE(review): this prints only the top-level document; content inside
# frames/iframes ("#document" in dev tools) lives at separate URLs and
# requires extra requests (see the answer below).
from bs4 import BeautifulSoup
import requests
response = requests.get('http://123.123.123.123/')
soup = BeautifulSoup(response.content, 'html.parser')
print soup.prettify()
You need to make additional requests to get the frame page contents as well:
# Fetch the frameset page, then fetch and pretty-print each frame's own
# document (the "#document" content seen in browser dev tools).
from urlparse import urljoin
from bs4 import BeautifulSoup
import requests

BASE_URL = 'http://123.123.123.123/'

with requests.Session() as session:
    outer = BeautifulSoup(session.get(BASE_URL).content, 'html.parser')
    # Each <frame> points at a separate document; resolve its src against
    # the base URL and request it through the same session.
    for frame in outer.select("frameset frame"):
        frame_response = session.get(urljoin(BASE_URL, frame["src"]))
        frame_soup = BeautifulSoup(frame_response.content, 'html.parser')
        print(frame_soup.prettify())

Categories