I have a script that loads a page and saves a bunch of data ids from multiple containers. I then want to open up new urls appending those said data ids onto the end of the urls. For each url I want to locate all the hrefs and compare them to a list of specific links and if any of them match I want to save that link and a few other details to a table.
I have managed to get it to open the url with the appended data id but when I try to search for elements in the new page it either pulls them from the first url that was parsed if I try to findAll from soup again or I constantly get this error when I try to run another html.parser.
ResultSet object has no attribute 'findAll'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
Is it not possible to run another parser or am I just doing something wrong?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.action_chains import ActionChains
url = "http://csgo.exchange/id/76561197999004010#x"
driver = webdriver.Firefox()
driver.get(url)
import time
time.sleep(15)
html = driver.page_source
soup = soup(html, "html.parser")
containers = soup.findAll("div",{"class":"vItem"})
print(len(containers))
data_ids = [] # Make a list to hold the data-id's
for container in containers:
test = container.attrs["data-id"]
data_ids.append(test) # add data-id's to the list
print(str(test))
for id in data_ids:
url2 = "http://csgo.exchange/item/" + id
driver.get(url2)
import time
time.sleep(2)
soup2 = soup(html, "html.parser")
containers2 = soup2.findAll("div",{"class":"bar"})
print(str(containers2))
with open('scraped.txt', 'w', encoding="utf-8") as file:
for id in data_ids:
file.write(str(id)+'\n') # write every data-id to a new line
Not sure exactly what you want from each page. You should add waits. I add waits looking for hrefs in the flow history section of each page (if present). It should illustrate the idea.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'http://csgo.exchange/id/76561197999004010'
driver = webdriver.Chrome()
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
url = baseURL + id
driver.get(url)
try:
flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
results.append([id, flowHistory])
except:
print(url)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'http://csgo.exchange/id/76561197999004010'
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2) # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
url = baseURL + id
driver.get(url)
try:
pros = ['http://csgo.exchange/profiles/76561198149324950']
flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver,3).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
if flowHistory in pros:
results.append([url,flowHistory])
print(results)
except:
print()
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
urls = ['http://csgo.exchange/id/76561197999004010']
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2) # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
for url in urls:
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
pros = ['http://csgo.exchange/profiles/76561198149324950', 'http://csgo.exchange/profiles/76561198152970370']
baseURL = 'http://csgo.exchange/item/'
for id in ids:
url = baseURL + id
driver.get(url)
try:
flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver,2).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
match = []
for string in pros:
if string in flowHistory:
match = string
break
if match:
pass
results.append([url,match])
print(results)
except:
print()
Related
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
URL = 'https://mergr.com/firms/search/employees?page=1&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'
driver.get(URL)
email=driver.find_element(By.CSS_SELECTOR,"input#username")
email.send_keys("timgr8#outlook.com")
password=driver.find_element(By.CSS_SELECTOR,"input#password")
password.send_keys("Cosmos1990$$$$$$$")
login=driver.find_element(By.CSS_SELECTOR,"button.btn").click()
urls=[]
product=[]
soup = BeautifulSoup(driver.page_source,"lxml")
details=soup.select("tbody tr")
for detail in details:
try:
t1 =detail.select_one("h5.profile-title a").text
except:
pass
wev={
'Name':t1,
}
product.append(wev)
page_links =driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
href=link.get_attribute("href")
urls.append(href)
for url in urls:
driver.get(url)
soup = BeautifulSoup(driver.page_source,"lxml")
try:
website=soup.select_one("p.adress-info a[target='_blank']").text
except:
website=''
data={
'website':website
}
product.append(data)
df=pd.DataFrame(product)
df.to_csv('firm.csv')
The data of the website will be down in to CSV file as shown in pic is I am appending the data in wrong way why is data moving down where I am wrong ...Kindly recommend where I am wrong there .......
I want output in these format Kindly suggest solution for these...I want output in these format as you shown below...
You can't append wev and data separately - you need website and name in the same dictionary for pandas to know that they belong to same row.
You could add the websites in a separate list like
sites = []
# for url in urls:
# driver.get...
# soup = ....
# try:....except:....
data={
'website':website
}
sites.append(data)
and then zip and combine:
for pi, dictPair in enumerate(zip(product, sites)):
product[pi].update(dictPair[1])
df = pd.DataFrame(product)
df.to_csv('firm.csv')
However, I don't think it's the best way to make sure the right Names and Websites are matched up.
You should just add to the same dictionary for each row from the start instead of zipping and merging.
added_urls = []
product = []
soup = BeautifulSoup(driver.page_source,"lxml")
details = soup.select("tbody tr")
for detail in details:
try:
t1 = detail.select_one("h5.profile-title a").text
except:
# pass # then you'll just be using the previous row's t1
# [also, if this happens in the first loop, it will raise an error]
t1 = 'MISSING' # '' #
wev = {
'Name':t1,
}
href = detail.select_one("h5.profile-title + p a[href]")
if href and href.get("href", '').startswith('http'):
wev['page_link'] = href.get("href")
added_urls.append(href.get("href"))
product.append(wev)
### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
if href in added_urls: continue # skip links that are already added
href = link.get_attribute("href")
# urls.append(href)
added_urls.append(href)
product.append({"page_link": href})
##########################################################
for pi, prod in enumerate(product):
if "page_link" not in prod or not prod["page_link"]: continue ## missing link
url = prod["page_link"]
driver.get(url)
soup = BeautifulSoup(driver.page_source,"lxml")
try:
website=soup.select_one("p.adress-info a[target='_blank']").text
except:
website=''
del product[pi]["page_link"] ## REMOVE this line IF you want a page_link column in csv
# data={'website':website}
# product.append(data)
product[pi]['website'] = website
df=pd.DataFrame(product)
df.to_csv('firm.csv')
how to navigate page to the last page and get all href link from unchanged link page?
hhere my code is:
url = 'https://hoaxornot.detik.com/paging#'
options = webdriver.ChromeOptions()
pathToChromeDriver = "C:/Program Files/Google/Chrome/Application/chromedriver.exe"
browser = webdriver.Chrome(executable_path=pathToChromeDriver,
options=options)
try:
browser.get(url)
browser.implicitly_wait(10)
html = browser.page_source
page = 1
while page <= 2:
paging = browser.find_elements_by_xpath('//*[#id="number_filters"]/a[{}]'.format(page)).click()
for p in paging:
articles = p.find_elements_by_xpath('//*[#id="results-search-hoax-paging"]/div/div/article/a')
for article in articles:
print(article.get_attribute("href"))
page += 1
finally:
browser.quit()
wait=WebDriverWait(browser,60)
browser.get("https://hoaxornot.detik.com/paging#")
page=1
articles=[]
while True:
try:
time.sleep(1)
pagearticles=wait.until(EC.visibility_of_all_elements_located((By.XPATH,'//*[#id="results-search-hoax-paging"]/div/div/article/a')))
for article in pagearticles:
articles.append(article.get_attribute("href"))
page+=1
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[#id="number_filters"]/a[{}]'.format(page)))).click()
except:
break
print(articles)
Here's a simple way to loop through the pages and wait for the element's visibility to come up so you can obtain their values instead of an empty list.
Import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
Outputs:
['https://news.detik.com/berita/d-5900248/video-jembatan-ambruk-disebut-di-samarinda-faktanya-bukan-di-indonesia', 'https://news.detik.com/berita/d-5898607/kantor-walkot-jakbar-diviralkan-rusak-akibat-gempa-ini-faktanya', 'https://news.detik.com/berita/d-5896931/polisi-di-singkawang-diviralkan-berbahasa-china-di-publik-begini-faktanya', 'https://news.detik.com/berita-jawa-timur/d-5895069/video-viral-hutan-baluran-banjir-dipastikan-hoax-polisi-itu-video-lama', 'https://news.detik.com/internasional/d-5873027/beredar-video-ledakan-parah-di-dubai-ternyata-3-insiden-lama-beda-negara', 'https://news.detik.com/berita/d-5865905/awas-ikut-tertipu-sejumlah-warga-ke-kantor-pln-bali-gegara-hoax-rekrutmen', 'https://news.detik.com/berita/d-5863802/beredar-pesan-gambar-kpk-pantau-muktamar-nu-di-lampung-ini-faktanya', 'https://news.detik.com/berita/d-5842083/viral-video-ayah-pukuli-anak-pakai-balok-kayu-begini-faktanya', 'https://news.detik.com/berita/d-5798562/video-mobil-ngebut-190-kmjam-dikaitkan-vanessa-angel-dipastikan-hoax', 'https://news.detik.com/berita/d-5755035/muncul-isu-liar-jokowi-joget-tanpa-masker-di-papua-ini-faktanya', 'https://news.detik.com/berita/d-5729500/beredar-edaran-penerima-bantuan-pesantren-kemenag-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5715146/5-bersaudara-di-surabaya-butuh-diadopsi-karena-papa-mama-meninggal-covid-19-hoaks', 'https://news.detik.com/berita/d-5714873/minta-maaf-ustaz-royan-jelaskan-viral-5-polisi-angkat-poster-demo-jokowi', 'https://health.detik.com/berita-detikhealth/d-5714239/viral-bawang-putih-tarik-cairan-dari-paru-paru-pasien-corona-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5699731/awas-hoax-viral-info-vaksin-palsu-beredar-di-indonesia-ini-faktanya', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5688266/hoax-pesan-bantuan-subsidi-gaji-rp-35-juta-jangan-dibuka', 'https://news.detik.com/berita-jawa-timur/d-5658878/2-sekolah-ditolak-warga-bondowoso-jadi-tempat-isolasi-satgas-tak-patah-arang', 'https://news.detik.com/berita/d-5655368/viral-video-demo-rusuh-di-jl-gajah-mada-polisi-pastikan-hoax', 'https://news.detik.com/berita/d-5755035/muncul-isu-liar-jokowi-joget-tanpa-masker-di-papua-ini-faktanya', 'https://news.detik.com/berita/d-5729500/beredar-edaran-penerima-bantuan-pesantren-kemenag-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5715146/5-bersaudara-di-surabaya-butuh-diadopsi-karena-papa-mama-meninggal-covid-19-hoaks', 'https://news.detik.com/berita/d-5714873/minta-maaf-ustaz-royan-jelaskan-viral-5-polisi-angkat-poster-demo-jokowi', 'https://health.detik.com/berita-detikhealth/d-5714239/viral-bawang-putih-tarik-cairan-dari-paru-paru-pasien-corona-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5699731/awas-hoax-viral-info-vaksin-palsu-beredar-di-indonesia-ini-faktanya', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5688266/hoax-pesan-bantuan-subsidi-gaji-rp-35-juta-jangan-dibuka', 'https://news.detik.com/berita-jawa-timur/d-5658878/2-sekolah-ditolak-warga-bondowoso-jadi-tempat-isolasi-satgas-tak-patah-arang', 'https://news.detik.com/berita/d-5655368/viral-video-demo-rusuh-di-jl-gajah-mada-polisi-pastikan-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5645668/heboh-ajakan-tolak-ppkm-darurat-di-pekalongan-ini-kata-polisi', 'https://news.detik.com/berita/d-5643373/heboh-tim-covid-buru-warga-tanjungpinang-langgar-ppkm-darurat-ini-faktanya', 'https://news.detik.com/berita/d-5638774/viral-rusa-keliaran-di-jalanan-denpasar-saat-ppkm-darurat-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5635282/deretan-hoax-air-kelapa-netralkan-vaksin-hingga-obati-covid-19', 'https://news.detik.com/berita-jawa-tengah/d-5633158/beredar-pesan-ada-pasien-corona-kabur-di-kudus-ternyata', 'https://news.detik.com/berita-jawa-tengah/d-5622194/viral-tim-sar-klaten-kewalahan-jasad-covid-belum-dimakamkan-ini-faktanya', 'https://news.detik.com/berita/d-5607406/beredar-isu-sutiyoso-meninggal-keluarga-tidak-benar', 'https://news.detik.com/berita-jawa-tengah/d-5603576/waspada-ada-akun-wa-catut-bupati-klaten-minta-sumbangan', 'https://news.detik.com/berita-jawa-tengah/d-5603472/heboh-pesan-berantai-soal-varian-baru-corona-di-kudus-ini-faktanya', 'https://news.detik.com/berita/d-5591931/beredar-poster-konvensi-capres-nu-2024-pbnu-pastikan-hoax', 'https://health.detik.com/berita-detikhealth/d-5591504/viral-hoax-makan-bawang-3-kali-sehari-sembuhkan-corona-ini-faktanya', 'https://news.detik.com/berita/d-5590632/viral-tes-antigen-pakai-air-keran-hasilnya-positif-satgas-kepri-menepis', 'https://news.detik.com/internasional/d-5586179/fakta-di-balik-aksi-penyiar-malaysia-tutup-1-mata-untuk-palestina', 'https://inet.detik.com/cyberlife/d-5585732/waspada-6-hoax-vaksin-bermagnet-hingga-china-siapkan-senjata-biologis', 'https://health.detik.com/berita-detikhealth/d-5533468/viral-jadi-sulit-ereksi-karena-vaksin-sinovac-ini-penjelasan-dokter', 'https://health.detik.com/berita-detikhealth/d-5527149/viral-cacing-di-masker-impor-dari-china-ini-fakta-di-baliknya', 'https://finance.detik.com/energi/d-5526617/viral-gaji-petugas-kebersihan-pertamina-rp-13-juta-manajemen-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5519314/fakta-fakta-gibran-disebut-duduk-di-meja-menteri-pupr-duduk-di-kursi', 'https://finance.detik.com/energi/d-5511928/awas-hoax-bbm-langka-imbas-kilang-kebakaran-pertamina-stok-luber', 'https://news.detik.com/berita-jawa-tengah/d-5511550/viral-gibran-duduk-di-atas-meja-depan-menteri-basuki-begini-faktanya', 'https://news.detik.com/berita/d-5507088/geger-kaca-bus-transmetro-deli-medan-diduga-ditembak-begini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5487986/viral-lansia-non-dki-bisa-vaksin-corona-di-senayan-dipastikan-hoax', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5487983/awas-hoax-pesan-berantai-soal-vaksinasi-lansia-di-istora-senayan', 'https://health.detik.com/berita-detikhealth/d-5480124/hoax-tak-ada-larangan-minum-obat-jantung-sebelum-vaksin-covid-19', 'https://health.detik.com/berita-detikhealth/d-5473657/hoax-kemenkes-bantah-puluhan-wartawan-terkapar-setelah-vaksinasi-covid-19', 'https://health.detik.com/berita-detikhealth/d-5368305/minum-air-putih-bisa-atasi-kekentalan-darah-pasien-covid-19-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5360703/viral-info-penemu-vaksin-covid-19-sinovac-meninggal-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5357602/pasien-jalan-ngangkang-seperti-penguin-disebut-karena-anal-swab-ini-faktanya', 'https://finance.detik.com/moneter/d-5351004/kabar-bi-di-lockdown-bank-internasional-swiss-dipastikan-hoax', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5350942/hoax-jangan-percaya-pesan-berantai-dana-bagi-bagi-uang-tunai', 'https://health.detik.com/berita-detikhealth/d-5340874/sederet-hoax-vaksin-jokowi-disebut-salah-suntik-hingga-tak-sampai-habis', 'https://health.detik.com/berita-detikhealth/d-5338133/hoax-viral-kasdim-0817-gresik-wafat-usai-vaksin-covid-19-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5337075/viral-urutan-mandi-agar-tak-kena-stroke-ini-faktanya', 'https://news.detik.com/berita/d-5328895/foto-bayi-selamat-dari-sriwijaya-air-sj182-dipastikan-hoax', 'https://health.detik.com/berita-detikhealth/d-5324630/viral-vaksin-covid-19-memperbesar-penis-bpom-hoax-lah', 'https://news.detik.com/berita-jawa-timur/d-5321500/wawali-surabaya-terpilih-armuji-dikabarkan-meninggal-ketua-dprd-hoaks', 'https://news.detik.com/berita/d-5287986/beredar-chat-kapolda-metro-soal-sikat-laskar-hrs-dipastikan-hoax', 'https://news.detik.com/berita/d-5286913/video-ambulans-fpi-masuk-rs-saat-ricuh-diviralkan-ini-faktanya', 'https://news.detik.com/berita-jawa-tengah/d-5280091/viral-bendung-gerak-serayu-jebol-kepala-upt-itu-kapal-ponton-hanyut', 'https://news.detik.com/berita-jawa-tengah/d-5279872/viral-asrama-isolasi-mandiri-ugm-penuh-ternyata-begini-faktanya', 'https://news.detik.com/berita/d-5275107/kpu-makassar-bantah-keluarkan-flyer-hasil-survei-paslon-pilwalkot-berlogo-kpu', 'https://news.detik.com/berita-jawa-tengah/d-5264429/beredar-voice-note-binatang-buas-gunung-merapi-turun-ke-selo-kades-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5262931/viral-peta-bahaya-gunung-merapi-sejauh-10-km-bpptkg-itu-peta-2010', 'https://health.detik.com/berita-detikhealth/d-5254580/viral-tips-sembuhkan-covid-19-dalam-waktu-5-menit-dokter-paru-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5253524/video-jenazah-covid-19-diviralkan-bola-mata-hilang-keluarga-sebut-hoaks', 'https://news.detik.com/berita/d-5287986/beredar-chat-kapolda-metro-soal-sikat-laskar-hrs-dipastikan-hoax', 'https://news.detik.com/berita/d-5286913/video-ambulans-fpi-masuk-rs-saat-ricuh-diviralkan-ini-faktanya', 'https://news.detik.com/berita-jawa-tengah/d-5280091/viral-bendung-gerak-serayu-jebol-kepala-upt-itu-kapal-ponton-hanyut', 'https://news.detik.com/berita-jawa-tengah/d-5279872/viral-asrama-isolasi-mandiri-ugm-penuh-ternyata-begini-faktanya', 'https://news.detik.com/berita/d-5275107/kpu-makassar-bantah-keluarkan-flyer-hasil-survei-paslon-pilwalkot-berlogo-kpu', 'https://news.detik.com/berita-jawa-tengah/d-5264429/beredar-voice-note-binatang-buas-gunung-merapi-turun-ke-selo-kades-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5262931/viral-peta-bahaya-gunung-merapi-sejauh-10-km-bpptkg-itu-peta-2010', 'https://health.detik.com/berita-detikhealth/d-5254580/viral-tips-sembuhkan-covid-19-dalam-waktu-5-menit-dokter-paru-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5253524/video-jenazah-covid-19-diviralkan-bola-mata-hilang-keluarga-sebut-hoaks', 'https://news.detik.com/berita/d-3124615/benarkah-sesuap-lele-mengandung-3000-sel-kanker', 'https://news.detik.com/berita/d-3124915/loket-tiket-konser-bon-jovi-di-gbk-dibakar-hoax']
I want to get all the results from a race. The website shows 50 rows/page.
I navigate to the next page (same URL with suffix #page-x) using selenium, but I get a StaleElementReferenceException error whenever I try to find elements (cells of the table = td) on the next page.
I tried to close the driver between the steps to get just one list of elements at a time. I've also tried to load the pages separately with the URL+suffix, but it doesn't load correctly. I've tried building separate lists (at first I wanted one big list with all the results).
from selenium import webdriver
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
#The block under works well and I get a list of cells as intended.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)
elements = driver.find_elements_by_tag_name("td")
course = []
for i in range(len(elements)):
course.append(elements[i].text)
to_2 = driver.find_element_by_link_text("2")
to_2.click()
print(driver.current_url)
#I'm trying similar code for the next chunk, but it doesn't work.
elements2 = driver.find_elements_by_tag_name("td")
print(len(elements2))
print(elements2[5].text)
course2 = []
for i in range(len(elements2)):
course2.append(elements2[i].text)
driver.close()
I would expect a new list (course2), with the results of the second page, but I get a stale element error. When I print the current URL, the result is as expected. When I print the len(elements2), it's also OK. Looks like the problem is when I try to get the text of an element.
Solution-1:
Using BeautifulSoup and selenium, WebDriverWait is waiting for a certain condition to occur before proceeding further in the code. for more details about BeautifulSoup.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)
data = []
while True:
course = []
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
page_soup = BeautifulSoup(driver.page_source, 'lxml')
# get table data
tbody = page_soup.find("tbody",{"id":"searchResultBoxParticipants"})
rows = tbody.find_all("tr")
for row in rows:
rowData = []
for td in row.find_all("td"):
rowData.append(td.text)
course.append(rowData)
data.append(course)
try:
pagination = driver.find_element_by_class_name("simple-pagination")
next_page = pagination.find_element_by_link_text("Suivant")
# iterate next page
next_page.click()
except Exception as e:
break
print(data)
Solution-2:
Using pandas library.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)
data = []
while True:
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
tables = pd.read_html(driver.page_source)
#append Participants table data
data.append(tables[0])
try:
pagination = driver.find_element_by_class_name("simple-pagination")
next_page = pagination.find_element_by_link_text("Suivant")
# iterate next page
next_page.click()
except Exception as e:
break
#Concat dataframe object
result = pd.concat(data)
print(result)
I am not able to print the link of the final pdf which is opening after running the given code
from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
def page_is_loaded(driver):
return driver.find_element_by_tag_name("body")!= None
def check_exists_by_text(text):
try:
driver.find_element_by_link_text(text)
except NoSuchElementException:
return False
return True
driver = webdriver.Chrome("C:/Users/Roshan/Desktop/sbi/chromedriver")
driver.maximize_window()
driver.get("http://www.careratings.com/brief-rationale.aspx")
wait = ui.WebDriverWait(driver,10)
wait.until(page_is_loaded)
location_field = driver.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = driver.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
driver.find_element_by_xpath("//input[#name='btn_submit']").click()
if check_exists_by_text('Reliance Capital Limited'):
elm =driver.find_element_by_link_text('Reliance Capital Limited')
driver.implicitly_wait(5)
elm.click()
driver.implicitly_wait(50)
#time.sleep(5)
#driver.quit()
else :
print("Company is not rated in the given Date range")
I am expecting the actual output is the link of this pdf :
"http://www.careratings.com/upload/CompanyFiles/PR/Reliance%20Capital%20Ltd.-05-18-2019.pdf"
but I do not know how to print this link
You need to find all elements in table, then extract data from them.
from selenium import webdriver
import os
# setup path to chrome driver
chrome_driver = os.getcwd() + '/chromedriver'
# initialise chrome driver
browser = webdriver.Chrome(chrome_driver)
# load url
browser.get('http://www.careratings.com/brief-rationale.aspx')
# setup date range
location_field = browser.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = browser.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
browser.find_element_by_xpath("//input[#name='btn_submit']").click()
# get all data rows
content = browser.find_elements_by_xpath('//*[#id="divManagementSpeak"]/table/tbody/tr/td/a')
# get text and href link from each element
collected_data = []
for item in content:
url = item.get_attribute("href")
description = item.get_attribute("innerText")
collected_data.append((url, description ))
Output:
('http://www.careratings.com/upload/CompanyFiles/PR/Ashwini%20Frozen%20Foods-05-21-2019.pdf', 'Ashwini Frozen Foods')
('http://www.careratings.com/upload/CompanyFiles/PR/Vanita%20Cold%20Storage-05-21-2019.pdf', 'Vanita Cold Storage')
and so on
I would say you just need to put this line:
pdf_link = elm.get_attribute("href")
Just check out the below image. You have missed one important part to click on. When you enter some text in that inputbox, there is a dropdown projected downward displaying the search results available in their stock to choose from. Once you click on that, the rest are as it is.
Try the following script:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = "http://www.careratings.com/brief-rationale.aspx"
with webdriver.Chrome() as driver:
driver.get(url)
wait = WebDriverWait(driver,10)
location_field = wait.until(EC.presence_of_element_located((By.NAME, "txtfromdate")))
location_field.send_keys("2019-05-06")
last_date = wait.until(EC.presence_of_element_located((By.NAME, "txttodate")))
last_date.send_keys("2019-05-21")
input_search = wait.until(EC.presence_of_element_located((By.NAME, "txtSearchCompany_brief")))
input_search.send_keys('Reliance Capital Limited')
time.sleep(3) #could not get rid of this hardcoded delay to make the script work
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"[onclick*='Reliance Capital Limited']"))).click()
# time.sleep(2) #activate this line in case the script behaves otherwise
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"input[name='btn_submit']"))).click()
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"table tr td > a[href$='.pdf']"))):
print(item.get_attribute("href"))
My intention is to get the name, location, time of posting, title of the review and the whole review content from the web page (http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061).
My code :
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup = BeautifulSoup(driver.page_source,"lxml")
for link in soup.select(".profile"):
try:
profile = link.select("p:nth-of-type(1) a")[0]
profile1 = link.select("p:nth-of-type(2)")[0]
except:pass
print(profile.text,profile1.text)
driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup1 = BeautifulSoup(driver.page_source,"lxml")
for link in soup1.select(".col-10.review"):
try:
profile2 = link.select("small:nth-of-type(1)")[0]
profile3 = link.select("span:nth-of-type(3)")[0]
profile4 = link.select("a:nth-of-type(1)")[0]
except:pass
print(profile2.text,profile3.text,profile4.text)
driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup2 = BeautifulSoup(driver.page_source,"lxml")
for link in soup2.select(".more.review"):
try:
containers=page_soup.findAll("div",{"class":"more reviewdata"})
count=len(containers)
for index in range(count):
count1=len(containers[index].p)
for i in range(count1):
profile5 = link.select("p:nth-of-type(i)")[0]
except:pass
print(profile5.text)
driver.quit()
I am getting the output for name, location, time and title of the review but I am unable to get the full review of a user. I would be grateful, if anyone could help me in getting the output for the same, along with the optimization of my code (i.e) I want my code to extract the required data by loading the web page only once. Also, It would be very helpful for me if someone could help me in extracting all the customer reviews of Jio from all the webpages of the website.
You can achieve the same with few lines of code along with lesser pain. However, I've defined here three main categories, as in name, review_title, review_data and the rest of the fields you can twitch very easily.
This is how you can do alternatively:
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
wait = WebDriverWait(driver, 10)
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
link = item.find_element_by_css_selector(".reviewdata a")
link.click()
time.sleep(2)
name = item.find_element_by_css_selector("p a").text
review_title = item.find_element_by_css_selector("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]").text
review_data = ' '.join([' '.join(items.text.split()) for items in item.find_elements_by_css_selector(".reviewdata")])
print("Name: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, review_title, review_data))
driver.quit()
Or to do the same combinedly (selenium + bs4):
from bs4 import BeautifulSoup
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
wait = WebDriverWait(driver, 10)
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
link = items.find_element_by_css_selector(".reviewdata a")
link.click()
time.sleep(2)
soup = BeautifulSoup(driver.page_source,"lxml")
for item in soup.select(".review-article"):
name = item.select("p a")[0].text
review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
print("Name: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, review_title, review_data))
driver.quit()