Hello, I wrote this code that returns a list of li elements, but I want to access the a tag inside each one and open it. Any recommendation would be greatly appreciated.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time
options = Options()
# Creating our DataFrame
all_services = pd.DataFrame(columns=['Motif', 'Description'])
path = "C:/Users/Al4D1N/Documents/ChromeDriver_webscraping/chromedriver.exe"
driver = webdriver.Chrome(options=options, executable_path=path)
driver.get("https://www.mairie.net/national/acte-naissance.htm#plus")
list_of_services = driver.find_elements_by_css_selector(".list-images li")
I know that I need to iterate over each item in list_of_services, but I don't know how to open each a tag, since none of them have classes or ids that would let me tell them apart.
This is one way to extract all of the links from the href attributes.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time
options = Options()
# Creating our DataFrame
all_services = pd.DataFrame(columns=['Motif', 'Description'])
path = "C:/Users/Al4D1N/Documents/ChromeDriver_webscraping/chromedriver.exe"
driver = webdriver.Chrome(options=options, executable_path=path)
driver.get("https://www.mairie.net/national/acte-naissance.htm#plus")
#Get all elements in class 'list-images'
list_of_services = driver.find_elements_by_class_name("list-images")
for service in list_of_services:
    # In each element, select the a tags
    atags = service.find_elements_by_css_selector('a')
    for atag in atags:
        # In each a tag, read the href attribute
        href = atag.get_attribute('href')
        print(href)
Output:
https://www.mairie.net/national/acte-mariage.htm#acte-naissance
https://www.mairie.net/national/acte-deces.htm#acte-naissance
https://www.mairie.net/national/carte-identite.htm#acte-naissance
https://www.mairie.net/national/passeport.htm#acte-naissance
https://www.mairie.net/national/casier-judiciaire.htm#acte-naissance
https://www.mairie.net/national/demande-carte-electorale.htm#acte-naissance
https://www.mairie.net/national/cadastre-plu.htm#acte-naissance
https://www.mairie.net/national/carte-grise-en-ligne-par-internet.htm#acte-naissance
https://www.mairie.net/national/certificat-non-gage.htm#acte-naissance
https://www.mairie.net/national/permis-conduire-delivrance.htm#acte-naissance
https://www.mairie.net/national/changement-adresse.htm#acte-naissance
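If you then want to open each of those links, one approach (a sketch, not part of the original answer; the hrefs list name is mine) is to collect the URLs into a plain list first and only navigate afterwards, since navigating away from the page invalidates the WebElement references:
hrefs = []
for service in list_of_services:
    for atag in service.find_elements_by_css_selector('a'):
        hrefs.append(atag.get_attribute('href'))
# Navigate only after collecting everything, so no element goes stale
for href in hrefs:
    driver.get(href)
    # ... scrape the opened page here ...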
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import time
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.suwon.go.kr/sw-www/www01/www01-01.jsp?q_tabs=open')
driver.implicitly_wait(3)
content = driver.find_element(By.TAG_NAME, 'iframe')
driver.switch_to.frame(content)
dropdown = Select(driver.find_element(By.XPATH,'//*[@id="dateType"]'))
dropdown.select_by_index(4)
driver.find_element(By.XPATH,'//*[@id="searchBtn"]').click()
complaint_list = []
contents_list = []
def complaint_Scraping():
    for i in range(1,78):
        titles = driver.find_elements(By.CSS_SELECTOR,'tbody > tr > td.left')
        for complaint in titles:
            name = BeautifulSoup(complaint.text, "html.parser")
            complaint_list.append(name)
        a = driver.find_elements(By.CSS_SELECTOR,'tbody > tr > td.left > a')
        for content in a:
            content.click()
            time.sleep(2)
            ancient_html = driver.find_elements(By.XPATH,'//*[@id="txt"]/div[1]/div[1]/div/div[2]')
            content = BeautifulSoup(ancient_html.text, "html.parser")
            contents_list.append(content)
            driver.back()
complaint_Scraping()
I don't know what's wrong here.
I can get all the titles, but it doesn't work when I try to get the contents behind the titles. The first page may work, but the other pages don't. Please help me solve the problem.
ancient_html = driver.find_elements(By.XPATH,'//*[@id="txt"]/div[1]/div[1]/div/div[2]')
content = BeautifulSoup(ancient_html.text, "html.parser")
find_elements() returns a list of elements, so ancient_html here is a list, and a list has no .text attribute.
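A minimal sketch of the fix (assuming the XPath matches exactly one content div): switch to find_element(), which returns a single element whose .text works:
ancient_html = driver.find_element(By.XPATH, '//*[@id="txt"]/div[1]/div[1]/div/div[2]')
content = BeautifulSoup(ancient_html.text, "html.parser")
contents_list.append(content)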
Firstly, I'm sorry for my poor English. I'm kind of new to Python. I would like to know how to scrape the number of posts, followers, and followings for certain Instagram accounts (I try to loop over them) and store the data in CSV files.
I've been trying to figure out the XPATH, and I thought my XPATH was already correct, so what did I miss?
Here is my code:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import time
import wget
import os
import pandas as pd
import matplotlib.pyplot as plt
urls = [
'https://www.instagram.com/acc_1/',
'https://www.instagram.com/acc_2/',
'https://www.instagram.com/acc_3/',
'https://www.instagram.com/acc_4/',
'https://www.instagram.com/acc_5/',
'https://www.instagram.com/acc_6/',
'https://www.instagram.com/acc_7/',
'https://www.instagram.com/acc_8/',
'https://www.instagram.com/acc_9/',
'https://www.instagram.com/acc_10/',
'https://www.instagram.com/acc_11/',
'https://www.instagram.com/acc_12/',
'https://www.instagram.com/acc_13/',
'https://www.instagram.com/acc_14/'
]
username_channel = []
number_of_post_chan = []
followers_chan = []
followings_chan = []
description_chan = []
# open it directly
#collecting_data
for url in urls:
    PATH = r'C:\webdrivers\chromedriver.exe'
    driver = webdriver.Chrome(PATH)
    driver.get(url)
    #driver.maximize_window()
    driver.implicitly_wait(10)
    #log-in
    login = driver.find_element(By.XPATH, "//input[@name='username']")
    login.clear()
    login.send_keys('xxxxx')
    driver.implicitly_wait(5)
    login_pass = driver.find_element(By.XPATH, "//input[@name='password']")
    login_pass.clear()
    login_pass.send_keys('xxxxx')
    driver.implicitly_wait(5)
    button_login = driver.find_element(By.XPATH, "//form[@id='loginForm']/div/div[3]/button/div")
    button_login.click()
    time.sleep(3)
    #Save Your Login info?
    login_info = driver.find_element(By.XPATH, "//div[@class='cmbtv']/button")
    login_info.click()
    time.sleep(10)
    driver.implicitly_wait(5)
    usernameChan = driver.find_element(By.XPATH, "//h2[@class='_aacl _aacs _aact _aacx _aada']").text
    numb_of_post = driver.find_element(By.CSS_SELECTOR, "//ul[@class=' _aa_8']/li[1]/div/span").text
    followers = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[2]/a/div/span").get_attribute('title')
    followings = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[3]/a/div/span").text
    description = driver.find_element(By.XPATH, "//div[@class='_aa_c']/div").text
    #username_channel.append(usernameChan)
    #number_of_post_chan.append(numb_of_post)
    #followers_chan.append(followers)
    #followings_chan.append(followings)
    #description_chan.append(description)
    print(username_channel, number_of_post_chan, followers_chan, followings_chan, description_chan)
    account_items = {
        "username_ig" : username_channel,
        "jumlah_posting" : number_of_post_chan,
        "followers" : followers_chan,
        "followings" : followings_chan,
        "deskripsi" : description_chan
    }
    driver.quit()
df = pd.DataFrame(account_items, columns=["username_ig", "jumlah_posting", "followers", "followings", "deskripsi"])
print(df)
Is there a better way to locate these elements? Help!
Thank you in advance.
To get the username, number of posts, followers, followings and description, you can select the elements using CSS_SELECTOR.
In your code, after the third driver.implicitly_wait(5) statement, replace the next 5 lines with the following.
usernameChan = driver.find_element(By.CSS_SELECTOR,"h2._aacl._aacs._aact._aacx._aada").text
details = driver.find_elements(By.CSS_SELECTOR, "span._ac2a._ac2b")
numb_of_post = details[0].text
followers = details[1].text
followings = details[2].text
description = driver.find_element(By.CSS_SELECTOR, "div._aacl._aaco._aacu._aacx._aad6._aade").text
EDIT: As you said, you got an IndexError: list index out of range while fetching the details above. This probably happens because the elements had not loaded yet. Add the imports below, and replace the line where we fetch the details with the one in the code below.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
details = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span._ac2a._ac2b")))
The problem there is that the selector depends on whether the window is expanded or not.
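One way to reduce that variability (a sketch, on the assumption that the selectors above target the expanded desktop layout) is to force a consistent window size before locating anything:
driver.maximize_window()  # use the expanded layout so the selectors above apply
details = WebDriverWait(driver, 10).until(
    EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span._ac2a._ac2b")))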
I am trying to get the list of URLs of each restaurant from this website. So far, this is the code that I am trying to implement.
Reproducible example
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
driver_path = '/Users/driverpath'
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://btownmenus.com/some/bloomington/delivery/all')
cards = driver.find_elements(By.CSS_SELECTOR, "div.restaurant-info")
urls = [card.get_attribute('href') for card in cards]
Result
I am getting None as the result, and I wonder whether I am selecting the div with the wrong CSS.
You need to get the a element inside the div with the restaurant-info class before you can get the href attribute from it:
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.get("https://btownmenus.com/some/bloomington/delivery/all")
infos = driver.find_elements(By.CLASS_NAME, "restaurant-info")
for info in infos:
    print(info.find_element(By.CSS_SELECTOR, "a").get_attribute("href"))
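Equivalently (a minor variation, not part of the original answer), a single descendant CSS selector collects the anchors in one call:
for a in driver.find_elements(By.CSS_SELECTOR, "div.restaurant-info a"):
    print(a.get_attribute("href"))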
Output:
https://btownmenus.com/restaurants/stir-coffee/delivery
https://btownmenus.com/restaurants/avers-north-o449/delivery
https://btownmenus.com/restaurants/wings-x-extended-delivery/delivery
https://btownmenus.com/restaurants/the-chocolate-moose/delivery
https://btownmenus.com/restaurants/gudon/delivery
https://btownmenus.com/restaurants/do-asian-fusion-restaurant/delivery
https://btownmenus.com/restaurants/wings-xtreme-o263/delivery
https://btownmenus.com/restaurants/avers-pizza-south-o465/delivery
https://btownmenus.com/restaurants/papa-johns-south-o476/delivery
https://btownmenus.com/restaurants/white-castle/delivery
https://btownmenus.com/restaurants/taco-bell/delivery
https://btownmenus.com/restaurants/mcdonalds/delivery
https://btownmenus.com/restaurants/india-garden/delivery
https://btownmenus.com/restaurants/brusters-ice-cream/delivery
https://btownmenus.com/restaurants/my-thai-cafeplus/delivery
https://btownmenus.com/restaurants/da-vinci-pizza-pasta/delivery
https://btownmenus.com/restaurants/rush-bowls/delivery
https://btownmenus.com/restaurants/subway-r5626/delivery
https://btownmenus.com/restaurants/the-chef-s-table/delivery
https://btownmenus.com/restaurants/dk-sweets/delivery
https://btownmenus.com/restaurants/cafe-bali/delivery
https://btownmenus.com/restaurants/sweet-escape-btown/delivery
https://btownmenus.com/restaurants/rush-hour-station-r8330/delivery
https://btownmenus.com/restaurants/potbelly-sandwich-shop/delivery
https://btownmenus.com/restaurants/mr-pot/delivery
https://btownmenus.com/restaurants/in-bloom-eats-juice/delivery
https://btownmenus.com/restaurants/fazolis/delivery
https://btownmenus.com/restaurants/my-happy-thai/delivery
https://btownmenus.com/restaurants/chop-shop/delivery
https://btownmenus.com/restaurants/irish-lion-c182/delivery
https://btownmenus.com/restaurants/social-cantina/delivery
https://btownmenus.com/restaurants/apna-bazaar/delivery
https://btownmenus.com/restaurants/zero-degree-snow-ice/delivery
https://btownmenus.com/restaurants/dats/delivery
https://btownmenus.com/restaurants/bloomingfoods-near-west-side/delivery
https://btownmenus.com/restaurants/red/delivery
https://btownmenus.com/restaurants/buccetos-west/delivery
https://btownmenus.com/restaurants/amrit-india-restaurant/delivery
https://btownmenus.com/restaurants/subway-w-bloomfield-rd/delivery
https://btownmenus.com/restaurants/dagwood-s-deli/delivery
https://btownmenus.com/restaurants/chow-bar-c4355/delivery
https://btownmenus.com/restaurants/mura-sushi/delivery
https://btownmenus.com/restaurants/sunny-poke/delivery
https://btownmenus.com/restaurants/burma-garden/delivery
https://btownmenus.com/restaurants/feta-kitchen-cafe/delivery
https://btownmenus.com/restaurants/laughing-planet-cafe/delivery
https://btownmenus.com/restaurants/sunny-palace/delivery
https://btownmenus.com/restaurants/gourmet-garden/delivery
https://btownmenus.com/restaurants/best-taste/delivery
https://btownmenus.com/restaurants/krispy-krunchy-chicken-r2118/delivery
https://btownmenus.com/restaurants/honey-baked-ham/delivery
https://btownmenus.com/restaurants/kilroy-s-on-kirkwood/delivery
https://btownmenus.com/restaurants/indian-palace/delivery
https://btownmenus.com/restaurants/longfei-chinese-restaurant/delivery
https://btownmenus.com/restaurants/crazy-horse/delivery
https://btownmenus.com/restaurants/nourish-bar/delivery
https://btownmenus.com/restaurants/anatolia-turkish-cuisine-r65/delivery
https://btownmenus.com/restaurants/lotus-garden/delivery
https://btownmenus.com/restaurants/jersey-mike-s-subs-r296/delivery
https://btownmenus.com/restaurants/the-3-amigos/delivery
https://btownmenus.com/restaurants/swing-in-pizza/delivery
https://btownmenus.com/restaurants/subway/delivery
https://btownmenus.com/restaurants/golden-china/delivery
https://btownmenus.com/restaurants/hartzell-s-ice-cream/delivery
https://btownmenus.com/restaurants/village-pub/delivery
https://btownmenus.com/restaurants/On-theway/delivery
https://btownmenus.com/restaurants/scholars-inn-bakehouse/delivery
https://btownmenus.com/restaurants/noodle-town-r8354/delivery
https://btownmenus.com/restaurants/anyetsangs-little-tibet-o1325/delivery
https://btownmenus.com/restaurants/el-ranchero-mexican-restaurant/delivery
https://btownmenus.com/restaurants/my-thai-cafe-sushi/delivery
https://btownmenus.com/restaurants/sakura-15/delivery
https://btownmenus.com/restaurants/burgers-wings-things-o736/delivery
https://btownmenus.com/restaurants/korea-restaurant/delivery
https://btownmenus.com/restaurants/naughty-dog/delivery
https://btownmenus.com/restaurants/fat-dan-s-chicago-deli/delivery
https://btownmenus.com/restaurants/red-mango/delivery
https://btownmenus.com/restaurants/mr-taco/delivery
https://btownmenus.com/restaurants/sushi-bar-r1501/delivery
https://btownmenus.com/restaurants/szechuan-kitchen/delivery
https://btownmenus.com/restaurants/popkorn-twist/delivery
https://btownmenus.com/restaurants/smokeworks/delivery
https://btownmenus.com/restaurants/noble-roman-s-pizza/delivery
https://btownmenus.com/restaurants/btown-gyros/delivery
https://btownmenus.com/restaurants/viva-mas-mexican-restaurant/delivery
https://btownmenus.com/restaurants/szechuan-taste/delivery
https://btownmenus.com/restaurants/little-downtown-cafe/delivery
https://btownmenus.com/restaurants/mr-hibachi/delivery
https://btownmenus.com/restaurants/rockits-pizza-o616/delivery
https://btownmenus.com/restaurants/which-wich-superior-sandwiches-o1769/delivery
https://btownmenus.com/restaurants/subway-r1047/delivery
https://btownmenus.com/restaurants/the-cabin-restaurant/delivery
https://btownmenus.com/restaurants/sobon-korean-cafe/delivery
https://btownmenus.com/restaurants/buccetos-o103/delivery
https://btownmenus.com/restaurants/smokin-jacks/delivery
https://btownmenus.com/restaurants/deangelos-c5671/delivery
https://btownmenus.com/restaurants/brilliant-coffee-company/delivery
https://btownmenus.com/restaurants/apna-grocery-convenience/delivery
https://btownmenus.com/restaurants/mama-s-korean-bbq/delivery
https://btownmenus.com/restaurants/bangkok-thai-cuisine/delivery
https://btownmenus.com/restaurants/subway-r1979/delivery
https://btownmenus.com/restaurants/roly-poly/delivery
https://btownmenus.com/restaurants/denny-s/delivery
https://btownmenus.com/restaurants/bloomingfood-s-ivy-tech/delivery
https://btownmenus.com/restaurants/el-ranchero-mexican-restaurant-r2101/delivery
https://btownmenus.com/restaurants/butchs-grillacatessen-and-eatzeria-o725/delivery
https://btownmenus.com/restaurants/bedrak-cafe/delivery
https://btownmenus.com/restaurants/el-ranchero-mexican-restaurant-r3334/delivery
https://btownmenus.com/restaurants/trailhead-pizzeria/delivery
https://btownmenus.com/restaurants/the-tap/delivery
https://btownmenus.com/restaurants/taste-of-india-o797/delivery
https://btownmenus.com/restaurants/bloomington-sandwich-co-o787/delivery
https://btownmenus.com/restaurants/avers-pizza-east-o1737/delivery
https://btownmenus.com/restaurants/peach-garden-o45/delivery
https://btownmenus.com/restaurants/hive/delivery
https://btownmenus.com/restaurants/the-orbit-room/delivery
https://btownmenus.com/restaurants/eric-gordon-s-greek-s-pizzeria/delivery
https://btownmenus.com/restaurants/hoosiers-pizza/delivery
https://btownmenus.com/restaurants/the-bowl/delivery
https://btownmenus.com/restaurants/brown-diner/delivery
https://btownmenus.com/restaurants/buffalo-wild-wings-r2144/delivery
https://btownmenus.com/restaurants/chick-fil-a/delivery
https://btownmenus.com/restaurants/subway-r6058/delivery
https://btownmenus.com/restaurants/the-trojan-horse/delivery
https://btownmenus.com/restaurants/yogi-s-bar-grill/delivery
https://btownmenus.com/restaurants/bloomingfoods-east/delivery
https://btownmenus.com/restaurants/asuka-r1874/delivery
https://btownmenus.com/restaurants/restaurant-ami-c4353/delivery
https://btownmenus.com/restaurants/japonee-express/delivery
https://btownmenus.com/restaurants/sofra-cafe-r6465/delivery
https://btownmenus.com/restaurants/buffalouies-o54/delivery
https://btownmenus.com/restaurants/freddy-s-frozen-custard/delivery
https://btownmenus.com/restaurants/chipotle-mexican-grill/delivery
https://btownmenus.com/restaurants/five-guys/delivery
https://btownmenus.com/restaurants/kfc/delivery
https://btownmenus.com/restaurants/bloomington-bagel-company/delivery
https://btownmenus.com/restaurants/wendy-s/delivery
https://btownmenus.com/restaurants/qdoba/delivery
https://btownmenus.com/restaurants/china-wok/delivery
https://btownmenus.com/restaurants/square-donuts/delivery
https://btownmenus.com/restaurants/siam-house/delivery
https://btownmenus.com/restaurants/z-c-teriyaki/delivery
https://btownmenus.com/restaurants/burger-king/delivery
https://btownmenus.com/restaurants/joella-s-hot-chicken/delivery
https://btownmenus.com/restaurants/applebees-c179/delivery
https://btownmenus.com/restaurants/wheel-pizza-r6622/delivery
https://btownmenus.com/restaurants/japonee-on-walnut/delivery
https://btownmenus.com/restaurants/papa-johns-north-o467/delivery
All the data is populated from the first table. I cannot move to the next div and get the data of the td for each tr.
The site: https://asd.com/page/
Below is the code that I have written.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
my_url= "https://asd.com/page/asd"
driver.get(my_url)
boxes = driver.find_elements(By.CLASS_NAME, "col-md-4")
companies = []
company = {}
for box in boxes:
    header = box.find_element(By.CLASS_NAME,"text-primary.text-uppercase")
    company['name'] = header.text
    td = box
    company['Type'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[1]/td").text
    company['Capital'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[2]/td").text
    company['Address'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[3]/td").text
    company['Owner'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[4]/td").text
    company['Co-Owner'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[5]/td").text
    company['Duration'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[6]/td").text
    company['Place'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[7]/td").text
    company['Company ID'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[8]/td").text
    companies.append(company)
    print(company)
There are several issues here:
You need to add some delay between driver.get(my_url) and boxes = driver.find_elements(By.CLASS_NAME, "col-md-4") to let the elements load before collecting them.
text-primary.text-uppercase is actually 2 class names: text-primary and text-uppercase, so you should use XPATH or CSS_SELECTOR to locate an element by 2 class names, not CLASS_NAME.
In order to locate elements inside another element you should use an XPATH starting with a dot (.).
Your locators like //div/div/div/table/tbody/tr[1]/td are absolute, while they should be relative to the parent box element.
No need to define a td element; you can use the existing box element here.
Locators like //div/div/div/table/tbody/tr[1]/td can and should be improved.
You will probably need to scroll to the boxes while iterating over them.
I think company = {} should be defined inside the loop; otherwise every entry in companies references the same dictionary.
This should work better:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
my_url= "https://monentreprise.bj/page/annonces"
driver.get(my_url)
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "col-md-4")))
time.sleep(2)
boxes = driver.find_elements(By.CLASS_NAME, "col-md-4")
companies = []
for box in boxes:
    actions.move_to_element(box).perform()
    time.sleep(0.3)
    company = {}
    header = box.find_element(By.XPATH,".//h5[@class='text-primary text-uppercase']")
    company['name'] = header.text
    company['Objet'] = box.find_element(By.XPATH,".//tr[1]/td").text
    company['Capital'] = box.find_element(By.XPATH,".//tr[2]/td").text
    company['Siège Social'] = box.find_element(By.XPATH,".//tr[3]/td").text
    company['Gérant'] = box.find_element(By.XPATH,".//tr[4]/td").text
    company['Co-Gérant'] = box.find_element(By.XPATH,".//tr[5]/td").text
    company['Durée'] = box.find_element(By.XPATH,".//tr[6]/td").text
    company['Dépôt'] = box.find_element(By.XPATH,".//tr[7]/td").text
    company['Immatriculation RCCM'] = box.find_element(By.XPATH,".//tr[8]/td").text
    companies.append(company)
    print(company)
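As a small follow-up (my addition; the original answer stops at printing), the list of dicts converts directly into a table that can be saved:
import pandas as pd
df = pd.DataFrame(companies)  # one row per company dictionary
df.to_csv('companies.csv', index=False)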
I have a script that loads a page and saves a bunch of data ids from multiple containers. I then want to open up new urls appending those said data ids onto the end of the urls. For each url I want to locate all the hrefs and compare them to a list of specific links and if any of them match I want to save that link and a few other details to a table.
I have managed to get it to open the url with the appended data-id, but when I try to search for elements on the new page, it either pulls them from the first url that was parsed (if I try to findAll from soup again), or I constantly get this error when I try to run another html.parser:
ResultSet object has no attribute 'findAll'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
Is it not possible to run another parser or am I just doing something wrong?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.action_chains import ActionChains
url = "http://csgo.exchange/id/76561197999004010#x"
driver = webdriver.Firefox()
driver.get(url)
import time
time.sleep(15)
html = driver.page_source
soup = soup(html, "html.parser")
containers = soup.findAll("div",{"class":"vItem"})
print(len(containers))
data_ids = [] # Make a list to hold the data-id's
for container in containers:
    test = container.attrs["data-id"]
    data_ids.append(test) # add data-id's to the list
    print(str(test))
for id in data_ids:
    url2 = "http://csgo.exchange/item/" + id
    driver.get(url2)
    time.sleep(2)
    soup2 = soup(html, "html.parser")
    containers2 = soup2.findAll("div",{"class":"bar"})
    print(str(containers2))
with open('scraped.txt', 'w', encoding="utf-8") as file:
    for id in data_ids:
        file.write(str(id)+'\n') # write every data-id to a new line
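For what it's worth, two details in the question's code explain the error: soup = soup(html, "html.parser") rebinds the name soup to a BeautifulSoup object (calling that object is an alias for find_all, so the later soup2 = soup(html, "html.parser") returns a ResultSet), and html is never re-read after driver.get(url2), so even a correct parse would still see the first page. A minimal sketch of the loop body that avoids both (my naming; it keeps BeautifulSoup under its own name and refreshes page_source per page):
from bs4 import BeautifulSoup

for id in data_ids:
    driver.get("http://csgo.exchange/item/" + id)
    time.sleep(2)
    # Parse the source of the page we just navigated to, not the stale html
    page = BeautifulSoup(driver.page_source, "html.parser")
    containers2 = page.find_all("div", {"class": "bar"})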
Not sure exactly what you want from each page. You should add waits. I added waits looking for hrefs in the flow history section of each page (if present). It should illustrate the idea.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'http://csgo.exchange/id/76561197999004010'
driver = webdriver.Chrome()
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
    url = baseURL + id
    driver.get(url)
    try:
        flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
        results.append([id, flowHistory])
    except:
        print(url)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'http://csgo.exchange/id/76561197999004010'
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2) # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
    url = baseURL + id
    driver.get(url)
    try:
        pros = ['http://csgo.exchange/profiles/76561198149324950']
        flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver,3).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
        if flowHistory in pros:
            results.append([url,flowHistory])
            print(results)
    except:
        print()
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
urls = ['http://csgo.exchange/id/76561197999004010']
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2) # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
for url in urls:
    driver.get(url)
    ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
    results = []
    pros = ['http://csgo.exchange/profiles/76561198149324950', 'http://csgo.exchange/profiles/76561198152970370']
    baseURL = 'http://csgo.exchange/item/'
    for id in ids:
        url = baseURL + id
        driver.get(url)
        try:
            flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver,2).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
            match = []
            for string in pros:
                if string in flowHistory:
                    match = string
                    break
            if match:
                results.append([url, match])
                print(results)
        except:
            print()