Hidden phone number can't be scraped - python

I've been having trouble extracting the phone number that appears after clicking the "llamar" button. So far I've tried the XPath method with Selenium and also BeautifulSoup, but nothing has worked. With an XPath selector in Selenium I usually get an invalid selector error, and with BS4 I get: AttributeError: 'NoneType' object has no attribute 'text'
I hope you can help me out!
Here is the url to the link - https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm
Here's the code I tried:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import UnexpectedAlertPresentException
url = 'https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm'
path = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe'
path1 = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\firefox'
# driver = webdriver.Chrome(path)
options = Options()
driver = webdriver.Chrome(path)
driver.get(url)
a = []
mah_div = driver.page_source
soup = BeautifulSoup(mah_div, features='lxml')
cookie_button = '//*[@id="sui-TcfFirstLayerModal"]/div/div/footer/div/button[2]'
btn_press = driver.find_element_by_xpath(cookie_button)
btn_press.click()
llam_button = '//*[@id="ad-detail-contact"]/a[2]'
llam_press = driver.find_element_by_xpath(llam_button)
llam_press.click()
time.sleep(10)
for item in soup.find_all("div", {"class": "contenido"}):
    a.append(item.find("div", {"class": "plaincontenido"}).text)
print(a)

The phone number is stored inside JavaScript. You can use the re module to extract it:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm"
phone_url = "https://www.milanuncios.com/datos-contacto/?usePhoneProxy=0&from=detail&includeEmail=false&id={}"
ad_id = re.search(r"(\d+)\.htm", url).group(1)
html_text = requests.get(phone_url.format(ad_id)).text
soup = BeautifulSoup(html_text, "html.parser")
phone = re.search(r"getTrackingPhone\((.*?)\)", html_text).group(1)
print(soup.select_one(".texto").get_text(strip=True), phone)
Prints:
ana (Particular) 639....
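If you need this for more than one ad, the same idea could be wrapped in a small helper. This is only a sketch reusing the endpoint and regexes above; the contact endpoint is not an official API and may change:

def get_contact(ad_url):
    # Pull the numeric ad id out of the listing URL
    ad_id = re.search(r"(\d+)\.htm", ad_url).group(1)
    html_text = requests.get(phone_url.format(ad_id)).text
    contact_soup = BeautifulSoup(html_text, "html.parser")
    name = contact_soup.select_one(".texto").get_text(strip=True)
    # The phone number is passed to a JavaScript function in the response
    phone = re.search(r"getTrackingPhone\((.*?)\)", html_text).group(1)
    return name, phone

print(get_contact(url))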

With Selenium you will need to click the button and switch to the iframe.
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, ".def-btn.phone-btn")))
tel_button = driver.find_element_by_css_selector(".def-btn.phone-btn")
tel_button.click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, "ifrw")))
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".texto>.telefonos")))
tel_number = driver.find_element_by_css_selector(".texto>.telefonos").text
Please note that I used more stable locators.
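One follow-up worth noting: after reading the number you are still inside the iframe, so switch back before interacting with the rest of the page (a one-line sketch using the same driver session):

# Return from the phone iframe to the main document
driver.switch_to.default_content()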

Related

How do I click on the first DIV class "link" if they all have the same div class name?

I am trying to click on the links for each of the product tiles at https://www.hugoboss.com/uk/men-clothing/ using Selenium.
My current code:
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
driverfile = r'C:\Users\Main\Documents\Work\Projects\Scraping Websites\extra\chromedriver'
driver = webdriver.Chrome(executable_path=driverfile)
driver.implicitly_wait(10)
url = "https://www.hugoboss.com/uk/men-clothing/"
driver.get(url)
driver.implicitly_wait(10)
shadowRoot = driver.find_element(By.XPATH, "//div[@id='usercentrics-root']").shadow_root
shadowRoot.find_element(By.CSS_SELECTOR, "button[data-testid='uc-save-button']").click()
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-testid='uc-save-button']"))).click()
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//a[@class='product-tile-plp__title-link.font--sub2.js-product-tile-link.widget-initialized']"))).click()
Current Error:
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-testid='uc-save-button']"))).click()
File "C:\Users\Main\Anaconda3\lib\site-packages\selenium\webdriver\support\wait.py", line 87, in until
raise TimeoutException(message, screen, stacktrace)
TimeoutException
Actually, you can't click each link through its title because the titles don't contain any clickable button, but you can iterate over them to pull the URLs. I used bs4 to grab each link because they aren't loaded dynamically.
from bs4 import BeautifulSoup
import requests
URL = 'https://www.hugoboss.com/uk/men-clothing/'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'lxml')
for link in soup.select('.product-tile-plp__title a'):
    link = 'https://www.hugoboss.com' + link.get('href')
    print(link)
Output:
https://www.hugoboss.com/uk/three-pack-of-regular-fit-cotton-t-shirts/hbeu50325887_961.html
https://www.hugoboss.com/uk/slim-fit-shorts-in-stretch-cotton-twill/hbeu50467083_037.html
https://www.hugoboss.com/uk/slim-fit-three-piece-suit-in-stretch-wool/hbeu50478270_424.html
https://www.hugoboss.com/uk/organic-cotton-polo-shirt-with-curved-logo/hbeu50468983_453.html
https://www.hugoboss.com/uk/slim-fit-trousers-in-stretch-cotton-satin/hbeu50470813_404.html
https://www.hugoboss.com/uk/slim-fit-suit-in-micro-patterned-traceable-stretch-wool/hbeu50468911_413.html
https://www.hugoboss.com/uk/slim-fit-suit-in-micro-patterned-performance-stretch-fabric/hbeu50474242_273.html
https://www.hugoboss.com/uk/organic-cotton-polo-shirt-with-curved-logo/hbeu50468983_401.html
https://www.hugoboss.com/uk/slim-fit-shorts-in-structured-stretch-cotton/hbeu50472870_404.html
https://www.hugoboss.com/uk/tapered-fit-chinos-in-overdyed-stretch-cotton-satin/hbeu50470797_404.html
https://www.hugoboss.com/uk/three-pack-of-regular-fit-cotton-t-shirts/hbeu50325887_975.html
https://www.hugoboss.com/uk/three-pack-of-regular-fit-cotton-t-shirts/hbeu50325887_974.html
https://www.hugoboss.com/uk/slim-fit-shorts-in-stretch-cotton-twill/hbeu50467083_404.html
https://www.hugoboss.com/uk/slim-fit-suit-in-stretch-wool-with-logo-lining/hbeu50474754_068.html
https://www.hugoboss.com/uk/single-breasted-jacket-in-virgin-wool-serge/hbeu50469172_401.html
https://www.hugoboss.com/uk/formal-trousers-in-virgin-wool-serge/hbeu50469174_401.html
https://www.hugoboss.com/uk/slim-fit-shorts-in-structured-stretch-cotton/hbeu50472870_275.html
https://www.hugoboss.com/uk/slim-fit-shirt-in-easy-iron-cotton-poplin/hbeu50289499_199.html
https://www.hugoboss.com/uk/slim-fit-trousers-in-stretch-cotton-satin/hbeu50470813_239.html
https://www.hugoboss.com/uk/slim-fit-trousers-in-stretch-cotton-satin/hbeu50470813_027.html
https://www.hugoboss.com/uk/single-breasted-jacket-in-virgin-wool-serge/hbeu50469171_401.html
https://www.hugoboss.com/uk/tuxedo-jacket-in-virgin-wool-serge/hbeu50469191_401.html
https://www.hugoboss.com/uk/tuxedo-trousers-in-virgin-wool-serge/hbeu50469186_001.html
https://www.hugoboss.com/uk/regular-fit-jeans-in-dark-blue-comfort-stretch-denim/hbeu50470509_420.html
https://www.hugoboss.com/uk/stretch-cotton-t-shirt-with-contrast-logo/hbeu50469057_402.html
https://www.hugoboss.com/uk/regular-fit-jersey-shirt-with-button-down-collar/hbeu50469854_411.html
https://www.hugoboss.com/uk/cotton-jersey-regular-fit-t-shirt-with-collaborative-artwork/hbeu50472110_001.html
https://www.hugoboss.com/uk/regular-fit-jersey-shirt-with-button-down-collar/hbeu50469854_100.html
https://www.hugoboss.com/uk/regular-fit-jersey-shirt-with-button-down-collar/hbeu50469854_453.html
https://www.hugoboss.com/uk/slim-fit-trousers-in-stretch-cotton-satin/hbeu50470813_012.html
https://www.hugoboss.com/uk/cotton-blend-slim-fit-polo-shirt-with-contrast-trims/hbeu50466442_402.html
https://www.hugoboss.com/uk/slim-fit-shorts-in-structured-stretch-cotton/hbeu50472870_337.html
https://www.hugoboss.com/uk/tapered-fit-jeans-in-dark-blue-super-stretch-denim/hbeu50471005_417.html
https://www.hugoboss.com/uk/organic-cotton-polo-shirt-with-curved-logo/hbeu50468983_316.html
https://www.hugoboss.com/uk/regular-fit-suit-in-super-flex-wool-blend-cloth/hbeu50466013_001.html
https://www.hugoboss.com/uk/tuxedo-jacket-in-responsible-virgin-wool/hbeu50469185_001.html
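If the goal is still to open each product page in Selenium, the collected hrefs can simply be fed back to the driver. A sketch building on the bs4 snippet above, assuming the driver from the question is still open:

# Visit each collected product URL in the existing Selenium session
for link in soup.select('.product-tile-plp__title a'):
    product_url = 'https://www.hugoboss.com' + link.get('href')
    driver.get(product_url)
    # ... scrape the individual product page here ...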

Unable to click on next button with Selenium

I've been at this for hours and haven't made any progress. I'm trying to click on the next button on this page (URL in the code below).
Here's my code:
#!/usr/local/bin python3
import sys
import time
import re
import logging
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as options
from bs4 import BeautifulSoup as bs
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
_USE_VIRTUAL_DISPLAY = False
_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
# logging.basicConfig(filename=LOG_FILENAME,level=logging.DEBUG)
logging.basicConfig(format=_FORMAT, level=logging.INFO)
_LOGGER = logging.getLogger(sys.argv[0])
_DEFAULT_SLEEP = 0.5
try:
    options = options()
    # options.headless = True
    driver = webdriver.Firefox(options=options, executable_path=r"/usr/local/bin/geckodriver")
    print("Started Browser and Driver")
except:
    _LOGGER.info("Can not run headless mode.")
url = 'https://www.govinfo.gov/app/collection/uscourts/district/alsd/2021/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
driver.get(url)
time.sleep(5)
page = driver.page_source
soup = bs(page, "html.parser")
next_page = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="collapseOne1690"]/div/span[1]/div/ul/li[8]/a')))
if next_page:
    print('*****getting next page*****')
    # driver.execute_script('arguments[0].click()', next_page)
    next_page.click()
    time.sleep(3)
else:
    print('no next page')
driver.quit()
I get a timeout error. I've tried changing the XPath and using ActionChains to scroll the element into view, but none of it worked. Any help is appreciated.
1. Your XPath does not work because it uses the dynamic class name collapseOne1690, as was mentioned earlier. It's not very stable even if you used only a part of this class name. If you prefer XPaths, I'd suggest this one: //span[@class='custom-paginator']//li[@class='next fw-pagination-btn']/a, or just //li[@class='next fw-pagination-btn']/a. You can also use a CSS selector: .next.fw-pagination-btn
2. I got rid of the logging code because it also has some issues; re-check it.
3. A 5-second explicit wait is too short. Make it at least 10 seconds, better 15. It's just a suggestion.
The smallest reproducible code which clicks the button and uses Firefox is:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as options
from bs4 import BeautifulSoup as bs
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
options = options()
# options.headless = True
driver = webdriver.Firefox(options=options)
print("Started Browser and Driver")
url = 'https://www.govinfo.gov/app/collection/uscourts/district/alsd/2021/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
driver.get(url)
page = driver.page_source
soup = bs(page, "html.parser")
print(soup)
next_page = WebDriverWait(driver, 15).until(
    EC.element_to_be_clickable((By.XPATH, "//span[@class='custom-paginator']//li[@class='next fw-pagination-btn']/a")))
next_page.click()
# driver.quit()
It appears that when I load this page, the div ids are assigned dynamically. The first time I loaded the page, the id was collapseOne5168; the second time it was collapseOne1136.
You might consider using find_element_by_class_name("next fw-pagination-btn") instead?
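One caveat with that suggestion: find_element_by_class_name (By.CLASS_NAME) only accepts a single class name, so the compound value "next fw-pagination-btn" will not work, failing to match or raising an invalid selector error depending on the Selenium version. A CSS selector reaches the same element instead:

# Compound class names are not permitted with find_element_by_class_name,
# so combine both classes in a CSS selector
next_page = driver.find_element_by_css_selector("li.next.fw-pagination-btn > a")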

Scraping a dynamically loaded table with BeautifulSoup

My code can return the values of the first two tags in each row, but the ones after that come back empty.
My code:
import bs4 as bs
import requests
resp = requests.get('https://q.stock.sohu.com/cn/bk_4401.shtml')
resp.encoding = 'gb2312'
soup = bs.BeautifulSoup(resp.text, 'lxml')
tab_sgtsc_list = soup.find('table').find('tbody').find_all('tr')
for tab_sgtsc in tab_sgtsc_list:
    print('**************************************')
    print(tab_sgtsc.find_all('td')[0].text)
    print(tab_sgtsc.find_all('td')[1].text)
    print(tab_sgtsc.find_all('td')[2].text)
    print(tab_sgtsc.find_all('td')[3].text)
    print('**************************************')
The table is rendered dynamically by JavaScript so you won't get much from pure HTML.
However, selenium and pandas come to the rescue!
Required:
Chrome driver
selenium (pip install selenium)
pandas (pip install pandas)
Here's how:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get("https://q.stock.sohu.com/cn/bk_4401.shtml")
wait = WebDriverWait(driver, 10)
element = wait.until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, 'table.tableMSB'))
).text.replace("点击按代码排序查询", "").split()
table = [element[i:i + 12] for i in range(0, len(element), 12)]
pd.DataFrame(table[1:], columns=table[0]).to_csv("your_table_data.csv", index=False)
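Since the rendered element is a regular HTML table, another option is to hand the page source to pandas directly once the wait has finished. A sketch under that assumption (the whitespace-splitting version above avoids depending on the table markup):

# Alternative: let pandas parse the rendered <table> element itself;
# attrs narrows it down to the table with class "tableMSB"
tables = pd.read_html(driver.page_source, attrs={"class": "tableMSB"})
tables[0].to_csv("your_table_data.csv", index=False)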

Iterating through multiple URLs

With your help, I was able to get a scraper running, but now I am stuck when it comes to iterating. Ultimately, I want the scraper to run through different URLs, but I'm getting confused by the syntax. I am using Selenium to open the web page and then BeautifulSoup to extract the data. I think I need to define the URLs and then use something like:
for url in urls:
but I am not sure how to use this. Reading other answers and videos has left me scratching my head.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
urls = ["https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=1","https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=2"]
driver = webdriver.Chrome()
driver.get(urls)
for url in urls:
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
    htmlStr = driver.page_source
    soup_level1 = soup(htmlStr, 'html.parser')
    race_soup = soup_level1.find('tbody', {'class': 'f_fs13'}).find_parent('table')
    results_soup = soup_level1.find('tbody', {'class': 'f_fs12'}).find_parent('table')
    df1 = pd.read_html(str(race_soup))[0]
    print(df1)
    df2 = pd.read_html(str(results_soup))[0]
    print(df2)
    print('good')
driver.close()
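The pattern the question is reaching for is to create the driver once and move driver.get(url) inside the loop so each page is loaded in turn. A minimal sketch based on the code above:

driver = webdriver.Chrome()
for url in urls:
    driver.get(url)  # load each page in turn
    WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
    soup_level1 = soup(driver.page_source, 'html.parser')
    race_soup = soup_level1.find('tbody', {'class': 'f_fs13'}).find_parent('table')
    results_soup = soup_level1.find('tbody', {'class': 'f_fs12'}).find_parent('table')
    print(pd.read_html(str(race_soup))[0])
    print(pd.read_html(str(results_soup))[0])
driver.close()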

Extracting text from a website using selenium

I'm trying to find a way to extract the book's summary from the Goodreads page. I have tried BeautifulSoup and Selenium, unfortunately to no avail.
link: https://www.goodreads.com/book/show/67896.Tao_Te_Ching?from_search=true&from_srp=true&qid=D19iQu7KWI&rank=1
code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
link = 'https://www.goodreads.com/book/show/67896.Tao_Te_Ching?from_search=true&from_srp=true&qid=D19iQu7KWI&rank=1'
driver = webdriver.Chrome()
driver.get(link)
Description = driver.find_element_by_xpath("//div[contains(text(),'TextContainer')]")
# the first TextContainer contains the summary of the book
book_page = requests.get(link)
soup = BeautifulSoup(book_page.text, "html.parser")
print(soup)
Container = soup.find('class', class_='leftContainer')
print(Container)
Error:
Container is empty, plus:
NoSuchElementException: no such element: Unable to locate element:
{"method":"xpath","selector":"//div[contains(text(),'TextContainer')]"}
(Session info: chrome=83.0.4103.116)
You can get the description like so:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
...
driver.get("https://www.goodreads.com/book/show/67896.Tao_Te_Ching?from_search=true&from_srp=true&qid=D19iQu7KWI&rank=1")
description = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div#description span[style="display:none"]'))
)
print(description.get_attribute('textContent'))
I have utilised a CSS selector to get the specific hidden span that contains the full description, and an explicit wait to give the element time to load. Because the span is hidden, .text would come back empty, which is why get_attribute('textContent') is used to read it.
