search pdf links from all over the website - python

I want to search a website and look for all PDF links. I know there are several solutions using BeautifulSoup that look for PDF files via <a> tags, but I want to search the whole domain starting from the base URL, instead of just the single page linked.
My idea was to a) first search the whole website for all sub-links and then b) filter for the links that have a .pdf extension. For the first part, I tried this https://github.com/mujeebishaque/extract-urls:
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    user_input_url = "https://www.aurednik.de/"
    if not user_input_url or len(user_input_url) < 1:
        raise Exception("INFO: Invalid Input")

    _start = user_input_url.find('//')
    _end = user_input_url.find('.com')
    readable_website_name = user_input_url[_start+2:_end].strip()

    try:
        website_content = requests.get(user_input_url.strip()).text
    except:
        check_internet = requests.get('https://google.com').status_code
        if check_internet != requests.codes.ok:
            raise ConnectionError("ERROR: Check internet connection.")

    _soup = BeautifulSoup(website_content, features='lxml')

    internal_url_links = []
    external_url_links = []

    for link in _soup.find_all('a', href=True):
        if readable_website_name in link.get('href'):
            internal_url_links.append(link['href'])
        if readable_website_name not in link.get('href') and len(link.get('href')) > 3:
            external_url_links.append(link['href'])

    print(internal_url_links, '\n')
    print(external_url_links, '\n')
I was expecting that it would be able to crawl and return all links such as
https://www.aurednik.de/info-service/downloads/#unserekataloge
and https://www.aurednik.de/downloads/AUREDNIK_Haupt2021.pdf
but that is not the case. I don't see the second PDF link at all, and for the first link I only see
/info-service/downloads/#unserekataloge
when I print out the external links. I want the full link, and preferably also all PDF links on the website domain. How else could I achieve this? I am open to using any tools or libraries.

Maybe the website has dynamic content. Check whether the HTML loaded by BeautifulSoup is the same as what you see when you inspect the website in your browser. If not, use, for example, Selenium to scrape the website with dynamically loaded content.
from bs4 import BeautifulSoup
from selenium import webdriver

user_input_url = "https://www.aurednik.de/"
readable_website_name = "www.aurednik.de"  # as derived in the question's snippet

driver = webdriver.Firefox()
driver.get(user_input_url)     # load the page so the dynamic content can render
html = driver.page_source      # HTML after the JavaScript has run
driver.close()

soup = BeautifulSoup(html, 'lxml')

internal_url_links = []
external_url_links = []

for link in soup.find_all('a', href=True):
    if readable_website_name in link.get('href'):
        internal_url_links.append(link['href'])
    if readable_website_name not in link.get('href') and len(link.get('href')) > 3:
        external_url_links.append(link['href'])

print(internal_url_links, '\n')
print(external_url_links, '\n')
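For parts a) and b) of the original idea (collect all internal links, then keep the .pdf ones), a small breadth-first crawl with requests and BeautifulSoup could look like the sketch below. The find_pdf_links helper and the max_pages cap are illustrative, not part of the original post, and if the site builds its links with JavaScript you would swap the requests.get call for the Selenium page load shown above. urljoin resolves relative hrefs such as /info-service/downloads/#unserekataloge into full URLs.
from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def find_pdf_links(base_url, max_pages=200):
    """Breadth-first crawl of one domain, collecting links that end in .pdf."""
    domain = urlparse(base_url).netloc
    to_visit = deque([base_url])
    seen_pages = set()
    pdf_links = set()

    while to_visit and len(seen_pages) < max_pages:
        url = to_visit.popleft()
        if url in seen_pages:
            continue
        seen_pages.add(url)
        try:
            html = requests.get(url, timeout=10).text
        except requests.RequestException:
            continue
        for a in BeautifulSoup(html, 'lxml').find_all('a', href=True):
            # Resolve relative hrefs against the current page and drop #fragments.
            absolute = urldefrag(urljoin(url, a['href'])).url
            if absolute.lower().endswith('.pdf'):
                pdf_links.add(absolute)
            elif urlparse(absolute).netloc == domain:
                to_visit.append(absolute)  # stay on the same domain
    return sorted(pdf_links)

print(find_pdf_links("https://www.aurednik.de/"))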

Related

How to scrape hyperlinks from multiple pages on a webpage with an unchanging URL, using Python/Beautiful Soup?

There is a paginated list of hyperlinks on this webpage: https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/.
The code I have created till now scrapes the relevant links from the first page. I cannot figure out how to extract links from subsequent pages (8 links per page, about 25 pages).
There does not seem to be a way to navigate the pages using the URL.
from bs4 import BeautifulSoup
import urllib.request

# Scrape webpage
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib.request.urlopen("https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))

# Extract links
links = []
for link in soup.find_all('a', href=True):
    links.append(link['href'])

# Select relevant links, reformat, and drop duplicates
links = list(dict.fromkeys(["https://www.farmersforum.ie" + link for link in links if "/reports/Thurles" in link]))
Please advise on how I can do this using Python.
I've solved this with Selenium. Thank you.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time

# Launch Chrome driver
driver = webdriver.Chrome(ChromeDriverManager().install())

# Open webpage
driver.get("https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/")

# Loop through pages
allLnks = []
iStop = False
# Continue until we fail to find the button
while iStop == False:
    for ii in range(2, 12):
        try:
            # Click page
            driver.find_element_by_xpath('//*[@id="mainContent"]/div/div[1]/div[2]/ul/li[' + str(ii) + ']/a').click()
        except:
            iStop = True
            break
        # Wait to load
        time.sleep(0.1)
        # Identify elements with tagname <a>
        lnks = driver.find_elements_by_tag_name("a")
        # Traverse list of links
        iiLnks = []
        for lnk in lnks:
            # Use get_attribute() to get all hrefs and add links to list
            iiLnks.append(lnk.get_attribute("href"))
        # Select relevant links, reformat, and drop duplicates
        iiLnks = list(dict.fromkeys([iiLnk for iiLnk in iiLnks if "/reports/Thurles" in iiLnk]))
        allLnks = allLnks + iiLnks
    # Move on to the next block of page numbers
    driver.find_element_by_xpath('//*[@id="mainContent"]/div/div[1]/div[2]/ul/li[12]/a').click()
driver.quit()
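One note on the time.sleep(0.1) step: a fixed delay can be too short on a slow connection. An explicit wait is usually more robust; a minimal sketch (the 10-second timeout and the href-based condition are illustrative, and it assumes the driver from the loop above):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Wait up to 10 seconds for at least one report link to be present,
# instead of sleeping for a fixed 0.1 seconds.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, '//a[contains(@href, "/reports/Thurles")]')
    )
)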

Download all files from web using Python error

I am trying to download all files from this website https://superbancos.gob.pa/es/fin-y-est/reportes-estadisticos
I found this code on another page and I am trying to adapt it to my process.
If you could help me, I would appreciate it.
# Import the libraries here
import requests
from bs4 import BeautifulSoup

# specify the URL of the archive here
archive_url = "https://www.superbancos.gob.pa/es/fin-y-est/reportes-estadisticos"

def get_video_links():
    r = requests.get(archive_url)
    soup = BeautifulSoup(r.content, 'html5lib')
    links = soup.findAll('a')
    video_links = [archive_url + link['href'] for link in links if link['href'].endswith('xlsx')]
    return video_links

def download_video_series(video_links):
    for link in video_links:
        '''iterate through all links in video_links
        and download them one by one'''
        # obtain filename by splitting url and getting
        # last string
        file_name = link.split('/')[-1]
        print("Downloading file:{!s}".format(file_name))
        # create response object
        r = requests.get(link, stream=True)
        # download started
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
        print("{!s} downloaded!\n".format(file_name))
    print("All files downloaded!")
    return

if __name__ == "__main__":
    video_links = get_video_links()
    download_video_series(video_links)
but when I run the program it says "All files downloaded!" and doesn't actually download any of the files.
The information you are looking for is dynamically loaded with JS code. So you should use something that can run JS and render the page like you see it in the browser.
The most straightforward way is to use Selenium:
from bs4 import BeautifulSoup
from selenium import webdriver

def get_soup(link):
    driver = webdriver.Chrome()
    driver.get(link)
    soup = BeautifulSoup(driver.page_source, 'html5lib')
    driver.close()
    return soup
So your first function could be rewritten as
def get_video_links():
    soup = get_soup(archive_url)
    links = soup.findAll('a')
    video_links = [archive_url + link['href'] for link in links if link['href'].endswith('xlsx')]
    return video_links
Just make sure to set up your ChromeDriver properly! here is the documentation.
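If you would rather not download and manage the ChromeDriver binary by hand, the webdriver-manager package shown in one of the answers further up this page can fetch a matching driver for you; a small sketch:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# webdriver-manager downloads a ChromeDriver that matches the installed Chrome.
driver = webdriver.Chrome(ChromeDriverManager().install())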
The problem here is that the page requires JavaScript. Your best bet is to use the Selenium webdriver to handle this, instead of bs4 alone.
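A rough sketch of that approach (render the page with Selenium, then download each .xlsx file with requests; the assumption that the rendered anchors carry absolute hrefs ending in .xlsx is mine, not from the original answer):
import requests
from selenium import webdriver

archive_url = "https://www.superbancos.gob.pa/es/fin-y-est/reportes-estadisticos"

driver = webdriver.Chrome()
driver.get(archive_url)  # let the JavaScript render the report list
hrefs = [a.get_attribute("href") for a in driver.find_elements_by_tag_name("a")]
driver.quit()

xlsx_links = [h for h in hrefs if h and h.endswith(".xlsx")]

for link in xlsx_links:
    file_name = link.split("/")[-1]
    print("Downloading file: " + file_name)
    r = requests.get(link, stream=True)
    with open(file_name, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            if chunk:
                f.write(chunk)
print("All files downloaded!")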

What would be the best way to scrape this website? (Not Selenium)

Before I begin: the TL;DR is at the bottom.
So I'm trying to scrape https://rarbgmirror.com/ for torrent magnet links and for their torrent title names based on user inputted searches. I've already figured out how to do this using BeautifulSoup and Requests through this code:
from bs4 import BeautifulSoup
import requests
import re

query = input("Input a search: ")
link = 'https://rarbgmirror.com/torrents.php?search=' + query
magnets = []
titles = []

try:
    request = requests.get(link)
except:
    print("ERROR")
source = request.text
soup = BeautifulSoup(source, 'lxml')

for page_link in soup.findAll('a', attrs={'href': re.compile("^/torrent/")}):
    page_link = 'https://www.1377x.to/' + page_link.get('href')
    try:
        page_request = requests.get(page_link)
    except:
        print("ERROR")
    page_source = page_request.content
    page_soup = BeautifulSoup(page_source, 'lxml')
    link = page_soup.find('a', attrs={'href': re.compile("^magnet")})
    magnets.append(link.get('href'))
    title = page_soup.find('h1')
    titles.append(title)

print(titles)
print(magnets)
I am almost certain that this code has no error in it because the code was originally made for https://1377x.to for the same purpose, and if you look through the HTML structure of both websites, they use the same tags for magnet links and title names. But if the code is faulty please point that out to me!
After some research I found the issue to be that https://rarbgmirror.com/ uses JavaScript to dynamically load its pages. After some more research I found that Selenium is recommended for this purpose. However, after some time using Selenium I found some cons to using it, such as:
the slow speed of scraping
the system the app runs on must have a browser and matching driver installed (I'm planning on using PyInstaller to package the app, which would be an issue)
So I'm asking for an alternative to Selenium for scraping dynamically loaded web pages.
TLDR:
I want an alternative to selenium to scrape a website which is dynamically loaded using JavaScript.
PS: GitHub Repo:
https://github.com/eliasbenb/MagnetMagnet
If you are using only Chrome, you can check out Puppeteer by Google. It is fast and integrates quite well with Chrome DevTools.
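Puppeteer itself is a Node.js library; if you want to stay in Python, the unofficial pyppeteer port exposes a similar API. A rough sketch under that assumption (pyppeteer downloads its own Chromium, so the packaging concern from the question still applies):
import asyncio
from pyppeteer import launch

async def fetch_rendered_html(url):
    # Launch headless Chromium, render the page, and return the final HTML.
    browser = await launch()
    page = await browser.newPage()
    await page.goto(url)
    html = await page.content()
    await browser.close()
    return html

query = input("Input a search: ")
html = asyncio.get_event_loop().run_until_complete(
    fetch_rendered_html('https://rarbgmirror.com/torrents.php?search=' + query)
)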
WORKING SOLUTION
DISCLAIMER FOR PEOPLE LOOKING FOR AN ANSWER: this method WILL NOT work for any website other than RARBG
I posted this same question to Reddit's r/learnpython and someone there found a great answer which met all my requirements. You can find the original comment here.
What he found out was that RARBG gets its info from the API endpoint used in the code below.
You can change what is searched by changing "QUERY" in the link. On that page is all the information for each torrent, so using requests and bs4 I scraped all the information.
Here is the working code:
import requests
from bs4 import BeautifulSoup

query = input("Input a search: ")
rarbg_link = 'https://torrentapi.org/pubapi_v2.php?mode=search&search_string=' + query + '&token=lnjzy73ucv&format=json_extended&app_id=lol'

try:
    request = requests.get(rarbg_link, headers={'User-Agent': 'Mozilla/5.0'})
except:
    print("ERROR")

source = request.text
soup = str(BeautifulSoup(source, 'lxml'))
soup = soup.replace('<html><body><p>{"torrent_results":[', '')
soup = soup.split(',')

titles = str([i for i in soup if i.startswith('{"title":')])
titles = titles.replace('{"title":"', '')
titles = titles.replace('"', '')
titles = titles.split("', '")   # titles is now a list of title strings

magnets = []
links = str([i for i in soup if i.startswith('"download":')])
links = links.replace('"download":"', '')
links = links.replace('"', '')
links = links.split("', '")
for link in links:
    magnets.append(link)
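Since the endpoint returns JSON (that is what the string surgery above is undoing), a simpler variant is to let requests parse it directly; a sketch assuming the torrent_results, title, and download keys seen in the response above:
import requests

query = input("Input a search: ")
rarbg_link = ('https://torrentapi.org/pubapi_v2.php?mode=search&search_string='
              + query + '&token=lnjzy73ucv&format=json_extended&app_id=lol')

response = requests.get(rarbg_link, headers={'User-Agent': 'Mozilla/5.0'})
data = response.json()  # parse the JSON payload directly

titles = [r['title'] for r in data.get('torrent_results', [])]
magnets = [r['download'] for r in data.get('torrent_results', [])]
print(titles)
print(magnets)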

Python Selenium: How to go to a google search URL without the page showing up as "not found", "access forbidden", or "permission denied"

I just began working with Selenium to learn about web scraping, and I am now trying to run a Google search and then iterate my code over each of the top 5 URLs that the search returned.
My google search would load properly, but when I go to any of the search result URLs the page would show a "not found", "access forbidden", or "permission denied" page. This happens as well if I manually paste the URL in. How can I bypass this?
Or am I going to the next URL improperly? I am currently resetting the driver.get URL.
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import re

search = '5 most popular dog breeds'
driver = webdriver.Chrome()
driver.get('https://www.google.co.in/#q=' + search)

b = driver.current_url
page = requests.get(b)
soup = BeautifulSoup(page.content, features="lxml")
links = soup.findAll("a")
urlList = []

# Put first 5 URLs of search into array x.
for link in soup.find_all("a", href=re.compile("(?<=/url\?q=)(htt.*://.*)")):
    urlList.append(re.split(":(?=http)", link["href"].replace("/url?q=", "")))
    if len(urlList) == 5:
        break

driver.get(urlList[0][0])
url = driver.current_url
page = requests.get(url)
pgsource = driver.page_source
You are opening pages correctly. It looks like you are pulling extra query parameters when getting the href of the a elements.
I modified your code to only get links that match the regex pattern 'https?:\/\/[a-zA-Z0-9\.\-\/]+' and to take only one link per web element (in your case it was sometimes two).
# Put first 5 URLs of search into array x.
for link in soup.find_all("a", href=re.compile("(?<=/url\?q=)(htt.*://.*)")):
    r = re.findall(pattern=re.compile('https?:\/\/[a-zA-Z0-9\.\-\/]+'), string=link['href'])[0]
    urlList.append(r)
    if len(urlList) == 5:
        break

print(urlList[0])
driver.get(urlList[0])
pgsource = driver.page_source
print(pgsource)
You can also just use Selenium for the same purpose, without Beautiful Soup, it'll look like this:
from selenium import webdriver

search = '5 most popular dog breeds'
driver = webdriver.Chrome()
driver.get('https://www.google.co.in/#q=' + search)

# Using XPath to filter desired elements instead of regex:
links = driver.find_elements_by_xpath("//a[@href!='' and contains(@ping,'/url?sa')]")
urls = []
for link in links[1:6]:
    urls += [link.get_attribute('href')]

print(urls[0])
driver.get(urls[0])
pgsource = driver.page_source
print(pgsource)
This worked for me. I hope it helps, good luck.

Extract urls from the footer of a web page

I am trying to extract the social media links from websites for my research. Unfortunately, I am not able to extract them, as they are located in the footer of the website.
I tried the requests, urllib.request, and pattern.web APIs to download the HTML document of a webpage. All of these APIs download the same content and fail to download the content in the footer of the websites.
import requests
from bs4 import BeautifulSoup as soup

url = 'https://cloudsight.ai/'
headers = {'User-Agent': 'Mozilla/5.0'}

sm_sites = ['https://www.twitter.com', 'https://www.facebook.com',
            'https://www.youtube.com', 'https://www.linkedin.com',
            'https://www.linkedin.com/company', 'https://twitter.com',
            'https://facebook.com', 'https://youtube.com', 'https://linkedin.com',
            'http://www.twitter.com', 'http://www.facebook.com',
            'http://www.youtube.com', 'http://www.linkedin.com',
            'http://www.linkedin.com/company', 'http://twitter.com',
            'http://facebook.com', 'http://youtube.com', 'http://linkedin.com']

blocked = ['embed', 'search', 'sharer', 'intent', 'share', 'watch']

sm_sites_present = []

r = requests.get(url, headers=headers)
content = soup(r.content, 'html.parser')
text = r.text

links = content.find_all('a', href=True)
for link in links:
    a = link.attrs['href'].strip('/')
    try:
        if any(site in a for site in sm_sites) and not any(block in a for block in blocked):
            sm_sites_present.append(a)
    except:
        sm_sites_present.append(None)
output:
>>> sm_sites_present
>>> []
If you inspect the website in the browser, you can see that the social media information is provided in a footer div in the DOM.
Even if you just try text.find('footer'), the result is -1.
I tried for many hours to figure out how to extract this footer information, and I failed.
So, I kindly ask if anyone could help me solve it.
Note:
I even tried regex; the problem is that when we download the page, the footer information is not downloaded at all.
As suggested by @chitown88, you can use Selenium to get the content.
from bs4 import BeautifulSoup
from selenium import webdriver

url = 'https://cloudsight.ai/'

driver = webdriver.Firefox()
driver.get(url)
html = driver.page_source
driver.quit()

soup = BeautifulSoup(html, 'html.parser')
[i.a['href'] for i in soup.footer.find_all('li', {'class': 'social-list__item'})]
output
['https://www.linkedin.com/company/cloudsight-inc',
'https://www.facebook.com/CloudSight',
'https://twitter.com/CloudSightAPI']
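If you want the generic sm_sites filter from the question rather than this CloudSight-specific footer selector, the Selenium-rendered soup can simply be fed through the original loop; a sketch that reuses soup from the snippet above and the sm_sites and blocked lists defined in the question:
# soup comes from the Selenium snippet above; sm_sites and blocked are
# the lists defined in the question.
sm_sites_present = []
for link in soup.find_all('a', href=True):
    a = link.attrs['href'].strip('/')
    if any(site in a for site in sm_sites) and not any(block in a for block in blocked):
        sm_sites_present.append(a)

print(sm_sites_present)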
