Scraping data with Python from a reCAPTCHA-protected website

I've been trying to scrape some info for personal use from a website. It works fine with no errors, but I found out it somehow can't see email addresses on the second half of the site. The code I'm using:
import requests
from bs4 import BeautifulSoup
page = requests.get('https://rejestradwokatow.pl/adwokat/abramowicz-joanna-49486')
soup = BeautifulSoup(page.content, "html.parser")
kancelaria = [x.strip() for x in soup.find(
    'div', class_='mb_tab_content special_one').find_all('div')[::2][0].text.split('\n') if x != ''][1:]
with this result:
>>> kancelaria
['Kancelaria Adwokacka', 'Chlebnicka 48/51', '80-830 GdaƄsk', '', 'Stacjonarny/Fax: 583054010', 'Email: [email\xa0protected]']
Please note the last element: 'Email: [email\xa0protected]'. I believe it has something to do with the reCAPTCHA mechanism implemented on the website, but I have no idea how to get around it. Interestingly, emails from the first half of the site are visible to my program and can be scraped. Any thoughts?
EDIT:
I'm referring to the lower part of the page:

I'm going to add another answer to this:
That element is created by JavaScript, and you can get it using Selenium. The code is provided below.
from selenium import webdriver
import chromedriver_autoinstaller
# auto install chromedriver
chromedriver = chromedriver_autoinstaller.install()
# define and launch the driver
driver = webdriver.Chrome(chromedriver)
driver.maximize_window()
# Go to website and get email
url = 'https://rejestradwokatow.pl/adwokat/artymiak-grzegorz-46439'
driver.get(url)
email_text = driver.find_element_by_xpath('//div[@class="mb_tab_content special_one"]/div[@class="line_list_K"]').text.split('Email: ')
email = email_text[-1]
print(email)
gartymiak@protonmail.com

The email is generated with CSS. You have to extract the attribute values of the div's data-ea and data-eb and join them with @:
name = soup.find('div', class_="address_e").get('data-ea')
domain = soup.find('div', class_="address_e").get('data-eb')
email = f'{name}@{domain}'

To get just emails, enter the following:
email_1st_part = soup.find('div', class_="address_e").get('data-ea')
email_2nd_part = soup.find('div', class_="address_e").get('data-eb')
email = email_1st_part + '@' + email_2nd_part
Full Code:
import requests
from bs4 import BeautifulSoup
page = requests.get('https://rejestradwokatow.pl/adwokat/abramowicz-joanna-49486')
soup = BeautifulSoup(page.content, "html.parser")
email_1st_part = soup.find('div', class_="address_e").get('data-ea')
email_2nd_part = soup.find('div', class_="address_e").get('data-eb')
email = email_1st_part + '@' + email_2nd_part
Result:
print(email)
'abramowicz@pro.onet.pl'
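For reference, a minimal sketch applying the same data-ea/data-eb technique across several profile pages (both URLs appear earlier in this thread; the guard handles profiles where the div or attributes are absent):
import requests
from bs4 import BeautifulSoup

urls = [
    'https://rejestradwokatow.pl/adwokat/abramowicz-joanna-49486',
    'https://rejestradwokatow.pl/adwokat/artymiak-grzegorz-46439',
]
for url in urls:
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    tag = soup.find('div', class_='address_e')
    # Join the two halves with '@' only if both attributes are present
    if tag and tag.get('data-ea') and tag.get('data-eb'):
        print(f"{tag['data-ea']}@{tag['data-eb']}")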

Related

Extracting the titles of the websites mentioned in the link

https://www.g2.com/categories/marketing-automation
I am trying to web-scrape the above link, which has a list of 350+ websites whose titles I need to extract.
But I am failing to get any results. I have tried requests with Beautiful Soup, then Selenium, and all I get is an empty list "[]" or None.
import requests
from bs4 import BeautifulSoup
# Send a GET request to the URL and parse the HTML content
url = 'https://www.g2.com/categories/marketing-automation'
response = requests.get(url).text
soup = BeautifulSoup(response, 'html.parser')
name = soup.find(class_="product-card__product-name")
print(name)
The code above is just a test to check whether the data is being pulled at all, and the response is None. From this code I am expecting to see the results for the class mentioned when calling print.
I kind of got the code below to return something; I'm still working on it.
from selenium import webdriver
from selenium.webdriver.common.by import By
# Create a new instance of the Chrome driver
driver = webdriver.Chrome()
# Navigate to the webpage
driver.get('https://www.g2.com/categories/marketing-automation')
# Wait for the page to load
driver.implicitly_wait(10)
# Find all the product cards on the page
product_cards = driver.find_elements(By.CLASS_NAME, 'product-card__product-name')
# Iterate over the product cards and extract the title from each one
for product_card in product_cards:
title = product_card.text
print(title)
# Close the browser
driver.quit()
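As a possible next step, an explicit wait is often more reliable than an implicit one when content is rendered late. A minimal sketch (the class name is taken from the question and may have changed; an empty result can also indicate the site is blocking automated clients):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.g2.com/categories/marketing-automation')
# Block for up to 15 seconds until at least one product name is present
elements = WebDriverWait(driver, 15).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'product-card__product-name'))
)
print([el.text for el in elements])
driver.quit()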

Python - Item Price Web Scraping for Target

I'm trying to get any item's price from the Target website. I wrote some examples for this site using Selenium and the Redsky API, but now I've tried to write the bs4 code below:
import requests
from bs4 import BeautifulSoup
url = "https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847#lnk=sametab"
r= requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
price = soup.find("div", class_="web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price)
But it returns None.
I also tried soup.find("div", {'class': "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp"}).
What am I missing?
I can accept any Selenium or Redsky API code, but my priority is bs4.
The page is dynamic: the data is rendered after the initial request is made. You can use Selenium to load the page and, once it's rendered, pull out the relevant tag. An API, though, is always the preferred way to go if one is available.
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
# If you don't want to open a browser, comment out the line above and uncomment below
#options = webdriver.ChromeOptions()
#options.add_argument('headless')
#driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe', options=options)
url = "https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847#lnk=sametab"
driver.get(url)
r = driver.page_source
soup = BeautifulSoup(r, "lxml")
price = soup.find("div", class_="web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price.text)
Output:
$1.99
You are simply using the wrong locator.
Try this:
price_css_locator = 'div[data-test=product-price]'
or in XPath style
price_xpath_locator = '//div[@data-test="product-price"]'
With bs4 it should be something like this:
soup.select('div[data-test="product-price"]')
Note that select returns a list of matches; to get a single element's text, use select_one and add .text:
price = soup.select_one('div[data-test="product-price"]').text
print(price)
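Combining this locator with the Selenium approach from the first answer, a minimal sketch (the product URL and locator are taken from this thread; the guard handles the case where the layout changes):
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get("https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847")
# Parse the rendered page source, not the raw response
soup = BeautifulSoup(driver.page_source, "lxml")
price = soup.select_one('div[data-test="product-price"]')
print(price.text if price else "price element not found")
driver.quit()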
Use .text:
price = soup.find("div", class_="web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price.text)

BeautifulSoup find() returns "None" with any name/attributes

I'm trying to get some information about a product I'm interested in on Amazon.
I'm using the BeautifulSoup library for web scraping:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.amazon.it/gp/offer-listing/B08KHL2J5X/ref=dp_olp_unknown_mbc'
page = requests.get(URL, headers=headers)  # headers (a User-Agent dict) is defined elsewhere in the script
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find('span', class_='a-size-large a-color-price olpOfferPrice a-text-bold')
print(title)
In the pic, the highlighted row is the one I want to select, but when I run my script I get None every time. (Printing the entire output after the BeautifulSoup call gives me the whole HTML source, so I'm using the right URL.)
Any solutions?
You need to use .text to get the text of an element.
So change:
print(title)
to:
print(title.text)
Output:
EUR 1.153,00
I wouldn't use BS alone in this case. You can easily add Selenium to scrape the website:
from bs4 import BeautifulSoup
from selenium import webdriver
url = 'https://www.amazon.it/gp/offer-listing/B08KHL2J5X/ref=dp_olp_unknown_mbc'
driver = webdriver.Safari()
driver.get(url)
html_content = driver.page_source
soup = BeautifulSoup(html_content, "html.parser")
title = soup.find('span', class_='a-size-large a-color-price olpOfferPrice a-text-bold')
print(title)
If you can't use Safari, you have to download the webdriver for Chrome, Firefox, etc., but there is plenty of reading material on this topic.
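If you go the Firefox route, a minimal headless setup looks like this (a sketch; it assumes geckodriver is available on your PATH):
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions

options = FirefoxOptions()
options.add_argument('--headless')  # run without opening a browser window
driver = webdriver.Firefox(options=options)
driver.get('https://www.amazon.it/gp/offer-listing/B08KHL2J5X/ref=dp_olp_unknown_mbc')
html_content = driver.page_source
driver.quit()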

What would be the best way to scrape this website? (Not Selenium)

Before I begin TLDR is at the bottom
So I'm trying to scrape https://rarbgmirror.com/ for torrent magnet links and torrent title names based on user-inputted searches. I've already figured out how to do this using BeautifulSoup and Requests with this code:
from bs4 import BeautifulSoup
import requests
import re
query = input("Input a search: ")
link = 'https://rarbgmirror.com/torrents.php?search=' + query
magnets = []
titles = []
try:
    request = requests.get(link)
except:
    print("ERROR")
source = request.text
soup = BeautifulSoup(source, 'lxml')
for page_link in soup.findAll('a', attrs={'href': re.compile("^/torrent/")}):
    page_link = 'https://www.1377x.to/' + page_link.get('href')
    try:
        page_request = requests.get(page_link)
    except:
        print("ERROR")
    page_source = page_request.content
    page_soup = BeautifulSoup(page_source, 'lxml')
    link = page_soup.find('a', attrs={'href': re.compile("^magnet")})
    magnets.append(link.get('href'))
    title = page_soup.find('h1')
    titles.append(title)
print(titles)
print(magnets)
I am almost certain that this code has no errors in it, because it was originally written for https://1377x.to for the same purpose, and if you look through the HTML structure of both websites, they use the same tags for magnet links and title names. But if the code is faulty, please point that out to me!
After some research I found the issue: https://rarbgmirror.com/ uses JavaScript to dynamically load its pages. After some more research I found that Selenium is recommended for this purpose. But after some time using Selenium, I found some cons, such as:
The slow speed of scraping
The system the app is running on must have the Selenium browser installed (I'm planning on using PyInstaller to pack the app, which would make this an issue)
So I'm asking for an alternative to Selenium for scraping dynamically loaded web pages.
TLDR:
I want an alternative to selenium to scrape a website which is dynamically loaded using JavaScript.
PS: GitHub Repo:
https://github.com/eliasbenb/MagnetMagnet
If you are using only Chrome, you can check out Puppeteer by Google. It is fast and integrates quite well with Chrome DevTools.
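Puppeteer itself is a Node.js library, but pyppeteer is an unofficial Python port with much the same API. A minimal sketch of fetching post-JavaScript HTML with it (the search URL is the one from the question; treat this as illustrative, not tested against the site):
import asyncio
from pyppeteer import launch

async def fetch_html(url):
    # Launch headless Chromium, render the page, return the final HTML
    browser = await launch(headless=True)
    page = await browser.newPage()
    await page.goto(url)
    html = await page.content()
    await browser.close()
    return html

html = asyncio.get_event_loop().run_until_complete(
    fetch_html('https://rarbgmirror.com/torrents.php?search=test'))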
WORKING SOLUTION
DISCLAIMER FOR PEOPLE LOOKING FOR AN ANSWER: this method WILL NOT work for any website other than RARBG
I posted this same question to Reddit's r/learnpython, and someone there found a great answer which met all my requirements. You can find the original comment here.
What they found out was that rarbg gets its info from here.
You can change what is searched by changing "QUERY" in the link. That page contains all the information for each torrent, so using requests and bs4 I scraped it all.
Here is the working code:
import requests
from bs4 import BeautifulSoup

query = input("Input a search: ")
rarbg_link = 'https://torrentapi.org/pubapi_v2.php?mode=search&search_string=' + query + '&token=lnjzy73ucv&format=json_extended&app_id=lol'
try:
    request = requests.get(rarbg_link, headers={'User-Agent': 'Mozilla/5.0'})
except:
    print("ERROR")
source = request.text
soup = str(BeautifulSoup(source, 'lxml'))
soup = soup.replace('<html><body><p>{"torrent_results":[', '')
soup = soup.split(',')
titles = str([i for i in soup if i.startswith('{"title":')])
titles = titles.replace('{"title":"', '')
titles = titles.replace('"', '')
titles = titles.split("', '")
magnets = []
links = str([i for i in soup if i.startswith('"download":')])
links = links.replace('"download":"', '')
links = links.replace('"', '')
links = links.split("', '")
for link in links:
    magnets.append(link)
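Since the endpoint returns JSON (format=json_extended), a cleaner variant is to skip BeautifulSoup entirely and parse the response directly. A sketch assuming the torrent_results, title, and download keys visible in the raw output above (note the token expires quickly and has to be refreshed):
import requests

query = input("Input a search: ")
params = {
    'mode': 'search',
    'search_string': query,
    'token': 'lnjzy73ucv',  # expires quickly; a fresh token is needed
    'format': 'json_extended',
    'app_id': 'lol',
}
response = requests.get('https://torrentapi.org/pubapi_v2.php',
                        params=params, headers={'User-Agent': 'Mozilla/5.0'})
results = response.json().get('torrent_results', [])
titles = [r['title'] for r in results]
magnets = [r['download'] for r in results]
print(titles)
print(magnets)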

Extract all links from drop down list combination

I have a sample website and I want to extract all the href links from it. It has two drop-downs, and once a selection is made it displays results with a link to download a manual.
It does not navigate to a different page; instead it shows the results on the same page. I have extracted the combinations from the drop-down lists, but when I try to extract the manual links I am unable to find them.
The code is as follows:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
from bs4 import BeautifulSoup
import requests
url = "https://www.cars.com/"
driver = webdriver.Chrome('C:/Users/webdrivers/chromedriver.exe')
driver.get(url)
time.sleep(4)
selectYear = Select(driver.find_element_by_id("odl-selected-year"))
data = []
for yearOption in selectYear.options:
    yearText = yearOption.text
    selectYear.select_by_visible_text(yearText)
    time.sleep(1)
    selectModel = Select(driver.find_element_by_id("odl-selected-model"))
    for modelOption in selectModel.options:
        modelText = modelOption.text
        selectModel.select_by_visible_text(modelText)
        data.append([yearText, modelText])
        page = requests.get(url, headers=headers)  # headers is assumed to be defined earlier
        soup = BeautifulSoup(page.text, 'html.parser')
        content = soup.findAll('div', attrs={"class": "odl-results-container"})
        for i in content:
            x = i.findAll(['h3', 'span'])
            for y in x:
                print(y.get_text())
print does not show any data. How can I get the links to the manuals? Thanks in advance.
You need to click the button for each car model and year and then retrieve the rendered HTML page source from your Selenium webdriver rather than with requests.
Add this in your inner loop:
button = driver.find_element_by_link_text("Select this vehicle")
button.click()
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
content = soup.findAll('a', attrs={"class": "odl-download-link"})
for i in content:
    print(i["href"])
This prints out:
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=6875&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O91668&VIN=&userMarket=GBR
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=7126&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O134871&VIN=&userMarket=GBR
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=7708&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O177941&VIN=&userMarket=GBR
...
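For context, here is a sketch of the full script with the answer's snippet slotted into the question's inner loop (element IDs, link text, and classes are taken verbatim from the code above; timings are illustrative):
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome('C:/Users/webdrivers/chromedriver.exe')
driver.get("https://www.cars.com/")
time.sleep(4)

data = []
selectYear = Select(driver.find_element_by_id("odl-selected-year"))
for yearOption in selectYear.options:
    yearText = yearOption.text
    selectYear.select_by_visible_text(yearText)
    time.sleep(1)
    selectModel = Select(driver.find_element_by_id("odl-selected-model"))
    for modelOption in selectModel.options:
        modelText = modelOption.text
        selectModel.select_by_visible_text(modelText)
        # Click through so the results render, then parse the live page source
        driver.find_element_by_link_text("Select this vehicle").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for a in soup.findAll('a', attrs={"class": "odl-download-link"}):
            data.append([yearText, modelText, a["href"]])

print(data)
driver.quit()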
