Using link;
image on flickr, requests only returns html to the comment:
`<!-- rendered with love by pprd1-node580-lh1.manhattan.bf1.yahoo.com -->`
(see image below for html).
I would like to access the links within in the img elements 3 div elements below so would appreciate any input.
from bs4 import BeautifulSoup
import logging
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import shutil
import sys
import time
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - \
%(levelname)s - %(message)s")
def flickr_images():
try:
search_term, number_images = sys.argv[1:]
num_req_images = int(number_images)
except ValueError:
print("Something went wrong. Command line input must be of \
format: 'filename searchterm numberimages'")
return
# navigate to search results page
driver = webdriver.Firefox()
# poll DOM for max 10 secs if element not immediately available
driver.implicitly_wait(10)
driver.get("https://www.flickr.com/search/?text=" + search_term)
driver.maximize_window()
# 0sec wait = 25images, 1sec = 48, 3+sec = 98
time.sleep(3)
image_link_elems = driver.find_elements_by_class_name("overlay")
# Incase requested is > found
num_images_tosave = min(req_images, len(image_link_elems))
image_elems_tosave = image_link_elems[:num_images_tosave]
print("{} images found.".format(num_images_tosave))
logging.info("Length photos: {}".format(len(image_link_elems)))
# extract image src's from found elements
src_links = []
image_links = [link.get_attribute("href") for link in image_elems_tosave]
for image_link in image_links:
res = requests.get(image_link)
res.raise_for_status
soup = bs4.BeautifulSoup(res.text, "html.parser")
src_elem = soup.select(".zoom-small")
HTML image:
Related
Here below I mentioned my code and I want to get multiple urls from web page but not from the end the limit of my page is 10. I need only 10 pages urls.
My Code :
`
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
array = []
driver = webdriver.Chrome()
driver.get('https://google.com')
search = driver.find_element("name","q").send_keys("websites"+Keys.RETURN)
urls = "https://www.google.com/search?q=websites&sxsrf=ALiCzsZmfT1H8dxBOig9KvuRbtnQUtVTtQ%3A1668401426603&source=hp&ei=EslxY8GlIZyM4-EPjqOLoAk&iflsig=AJiK0e8AAAAAY3HXIiUkRUcpwQ84iKLerx9VqGixFmVk&ved=0ahUKEwjB9vzS76z7AhUcxjgGHY7RApQQ4dUDCAk&uact=5&oq=websites&gs_lcp=Cgdnd3Mtd2l6EAMyCAgAEIAEELEDMgsIABCABBCxAxCDATILCAAQgAQQsQMQgwEyBQgAEIAEMgsIABCABBCxAxCDATIFCAAQgAQyBQgAEIAEMgcIABCABBAKMgUIABCABDILCAAQgAQQsQMQgwE6BwgjEOoCECc6BwguEOoCECc6CwguEIMBELEDEIAEOggILhCDARCxAzoICAAQsQMQgwE6BAgjECc6DgguEIAEELEDEMcBENEDOhEILhCABBCxAxCDARDHARCvAVDUBFj1E2D0FWgBcAB4AIAB4QGIAeMJkgEFMC43LjGYAQCgAQGwAQo&sclient=gws-wiz"
driver.get(urls)
elems = driver.find_elements("xpath","/html/body/div/div/div/div/div[2]/div[2]/div/div/div/div/div/div[1]/div/a")
while True:
link = driver.find_elements("xpath","/html/body/div[7]/div/div[11]/div/div[4]/div/div[2]/table/tbody/tr/td[12]/a")
if link == 1 :
print("No more pages left")
break
else:
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next'))).click()
elems = driver.find_elements("xpath","/html/body/div/div/div/div/div[2]/div[2]/div/div/div/div/div/div[1]/div/a")
for elem in elems:
array = elem.get_attribute("href")
print(array)
`
I am trying to get prices of routes on a bus page
import requests
from bs4 import BeautifulSoup
import re
popup_linkz= list()
p=range(1, 2, 1)
for i in p:
def get_headers(session):
res = session.get("https://new.turbus.cl/turbuscl/inicio-compra")
if res.status_code == 200:
print("Got headers")
return res.text
else:
print("Failed to get headers")
def search(session):
data = {
'origenInputModal': 'Santiago',
'destinoInputModal':'Calama',
'fechaRegreso': '03-04-2021',
'fechaIda': '31-03-2021',
}
res = session.post(
"https://new.turbus.cl/turbuscl/seleccion-itinerario",
data=data) #not sure if this is the search link
if res.status_code == 200:
print("Search succeeded")
return res.text
else:
print("Search failed with error:", res.reason)
print(res.text)
def get_popup_link(html):
soup = BeautifulSoup(html, "html.parser")
for t in soup.find_all('div', {'class': 'ticket_price-value'}):
precio = t.find('[class$="ticket_price-value"]').text
#cantidad = t.select_one('[id$="lblCantidad"]').text
#descripction = t.select_one('[id$="lblDescripcion"]').text
print(f"{precio=} {precio=}")
#print()
return precio
def main():
with requests.Session() as s:
get_headers(s)
html = search(s)
popup_links = (get_popup_link(html))
print(popup_links)
# popup_linkz.extend(popup_links)
#print(popup_links)
#print(popup_linkz)
#download_html = get_download_html(s, popup_links)
# print(download_html)
#popup_linkz.extend(popup_links for i in range(0, 1, 1))
main()
#a = popup_linkz
#print(a)
enter code here
this is the link https://new.turbus.cl/turbuscl/inicio-compra
So right now I am able to find the input boxes of the search, but not sure were to run it.
I am getting this error ValueError: too many values to unpack (expected 2)
so i am not so sure of what i am failing.
would you try to enlight me in order to succeed?
I have been trying all die and get a new approach with selenium in order to get search....
is right what i am doing or was better my first approach?
-- coding: utf-8 --
"""
Created on Tue Mar 29 21:04:05 2022
#author: christian marcos
"""
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 29 16:20:40 2022
#author: christian marcos
"""
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from pandas.io.html import read_html
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
#select and fill firs field origin
driver=wd.Chrome('C:\\chromedriver.exe')
driver.maximize_window()
driver.get('https://new.turbus.cl/turbuscl/inicio-compra')
driver.implicitly_wait(20)
driver.find_element_by_xpath('//*[#id="origen"]').click();
wait = WebDriverWait(driver, 30)
#select and fill firs field
driver.implicitly_wait(10)
driver.find_element_by_xpath('//*[#id="modalOriginCity"]/div/div/div[2]/div[2]/ul/li[1]').click();
Best regards,
The post data needed is different. In this case, you need:
{
"fechaSalidaTramo": "31/03/2022",
"mnemotecnicoCiudadOrigenTramo": "stgo",
"mnemotecnicoCiudadDestinoTramo": "aric",
"horaSalidaTramo": 0,
"horaSalidaTramoMaxima": 0,
"codigoLinea": 90,
"numeroViaje": 0,
"numeroCuentaCorrienteCliente": 0,
"codigoIdaRegreso": 1,
"cantidadAsientos": 1,
"numeroRegistros": 0
}
And the link is, https://new.turbus.cl/turbuscl/recursos/vtwst76/web1.
In python, it'll look like this:
import requests
HOST = "https://nclt.gov.in/"
LINK = "https://new.turbus.cl/turbuscl/recursos/vtwst76/web1"
DATA = '{"fechaSalidaTramo":"31/03/2022","mnemotecnicoCiudadOrigenTramo":"stgo","mnemotecnicoCiudadDestinoTramo":"aric","horaSalidaTramo":0,"horaSalidaTramoMaxima":0,"codigoLinea":90,"numeroViaje":0,"numeroCuentaCorrienteCliente":0,"codigoIdaRegreso":1,"cantidadAsientos":1,"numeroRegistros":0}'
HEADERS = {
"Content-Type": "application/json",
}
def get_route(origin, destination):
res = requests.post(LINK, data=DATA, headers=HEADERS)
if res.status_code == 200:
print("getting routes")
return res.json()
else:
print(res)
def main():
info = get_route("here", "there")
print(info)
if __name__ == "__main__":
main()
How I got to the answer:
Go to the site.
Open the network tab, so I can see requests.
Do a search, and find the request that matches.
Copy the request as a curl request and import it into postman.
Remove headers, and see if you get an error when you do a request. Repeat until you have only the needed headers.
Copy the needed headers and data, and test it using requests.
I am not able to print the link of the final pdf which is opening after running the given code
from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
def page_is_loaded(driver):
return driver.find_element_by_tag_name("body")!= None
def check_exists_by_text(text):
try:
driver.find_element_by_link_text(text)
except NoSuchElementException:
return False
return True
driver = webdriver.Chrome("C:/Users/Roshan/Desktop/sbi/chromedriver")
driver.maximize_window()
driver.get("http://www.careratings.com/brief-rationale.aspx")
wait = ui.WebDriverWait(driver,10)
wait.until(page_is_loaded)
location_field = driver.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = driver.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
driver.find_element_by_xpath("//input[#name='btn_submit']").click()
if check_exists_by_text('Reliance Capital Limited'):
elm =driver.find_element_by_link_text('Reliance Capital Limited')
driver.implicitly_wait(5)
elm.click()
driver.implicitly_wait(50)
#time.sleep(5)
#driver.quit()
else :
print("Company is not rated in the given Date range")
I am expecting the actual output is the link of this pdf :
"http://www.careratings.com/upload/CompanyFiles/PR/Reliance%20Capital%20Ltd.-05-18-2019.pdf"
but I do not know how to print this link
You need to find all elements in table, then extract data from them.
from selenium import webdriver
import os
# setup path to chrome driver
chrome_driver = os.getcwd() + '/chromedriver'
# initialise chrome driver
browser = webdriver.Chrome(chrome_driver)
# load url
browser.get('http://www.careratings.com/brief-rationale.aspx')
# setup date range
location_field = browser.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = browser.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
browser.find_element_by_xpath("//input[#name='btn_submit']").click()
# get all data rows
content = browser.find_elements_by_xpath('//*[#id="divManagementSpeak"]/table/tbody/tr/td/a')
# get text and href link from each element
collected_data = []
for item in content:
url = item.get_attribute("href")
description = item.get_attribute("innerText")
collected_data.append((url, description ))
Output:
('http://www.careratings.com/upload/CompanyFiles/PR/Ashwini%20Frozen%20Foods-05-21-2019.pdf', 'Ashwini Frozen Foods')
('http://www.careratings.com/upload/CompanyFiles/PR/Vanita%20Cold%20Storage-05-21-2019.pdf', 'Vanita Cold Storage')
and so on
I would say you just need to put this line:
pdf_link = elm.get_attribute("href")
Just check out the below image. You have missed one important part to click on. When you enter some text in that inputbox, there is a dropdown projected downward displaying the search results available in their stock to choose from. Once you click on that, the rest are as it is.
Try the following script:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = "http://www.careratings.com/brief-rationale.aspx"
with webdriver.Chrome() as driver:
driver.get(url)
wait = WebDriverWait(driver,10)
location_field = wait.until(EC.presence_of_element_located((By.NAME, "txtfromdate")))
location_field.send_keys("2019-05-06")
last_date = wait.until(EC.presence_of_element_located((By.NAME, "txttodate")))
last_date.send_keys("2019-05-21")
input_search = wait.until(EC.presence_of_element_located((By.NAME, "txtSearchCompany_brief")))
input_search.send_keys('Reliance Capital Limited')
time.sleep(3) #could not get rid of this hardcoded delay to make the script work
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"[onclick*='Reliance Capital Limited']"))).click()
# time.sleep(2) #activate this line in case the script behaves otherwise
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"input[name='btn_submit']"))).click()
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"table tr td > a[href$='.pdf']"))):
print(item.get_attribute("href"))
I have a script that loads a page and saves a bunch of data ids from multiple containers. I then want to open up new urls appending those said data ids onto the end of the urls. For each url I want to locate all the hrefs and compare them to a list of specific links and if any of them match I want to save that link and a few other details to a table.
I have managed to get it to open the url with the appended data id but when I try to search for elements in the new page it either pulls them from the first url that was parsed if I try to findAll from soup again or I constantly get this error when I try to run another html.parser.
ResultSet object has no attribute 'findAll'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
Is it not possible to run another parser or am I just doing something wrong?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.action_chains import ActionChains
url = "http://csgo.exchange/id/76561197999004010#x"
driver = webdriver.Firefox()
driver.get(url)
import time
time.sleep(15)
html = driver.page_source
soup = soup(html, "html.parser")
containers = soup.findAll("div",{"class":"vItem"})
print(len(containers))
data_ids = [] # Make a list to hold the data-id's
for container in containers:
test = container.attrs["data-id"]
data_ids.append(test) # add data-id's to the list
print(str(test))
for id in data_ids:
url2 = "http://csgo.exchange/item/" + id
driver.get(url2)
import time
time.sleep(2)
soup2 = soup(html, "html.parser")
containers2 = soup2.findAll("div",{"class":"bar"})
print(str(containers2))
with open('scraped.txt', 'w', encoding="utf-8") as file:
for id in data_ids:
file.write(str(id)+'\n') # write every data-id to a new line
Not sure exactly what you want from each page. You should add waits. I add waits looking for hrefs in the flow history section of each page (if present). It should illustrate the idea.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'http://csgo.exchange/id/76561197999004010'
driver = webdriver.Chrome()
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
url = baseURL + id
driver.get(url)
try:
flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
results.append([id, flowHistory])
except:
print(url)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'http://csgo.exchange/id/76561197999004010'
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2) # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
url = baseURL + id
driver.get(url)
try:
pros = ['http://csgo.exchange/profiles/76561198149324950']
flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver,3).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
if flowHistory in pros:
results.append([url,flowHistory])
print(results)
except:
print()
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
urls = ['http://csgo.exchange/id/76561197999004010']
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2) # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
for url in urls:
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
pros = ['http://csgo.exchange/profiles/76561198149324950', 'http://csgo.exchange/profiles/76561198152970370']
baseURL = 'http://csgo.exchange/item/'
for id in ids:
url = baseURL + id
driver.get(url)
try:
flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver,2).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
match = []
for string in pros:
if string in flowHistory:
match = string
break
if match:
pass
results.append([url,match])
print(results)
except:
print()
I am trying to scrape Flipkart to extract reviews for a product using request and beautifulsoup package.how can take out data present in Read more click event present in those review.
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import urllib2
import re
from bs4 import BeautifulSoup
import unicodedata
def remove_non_ascii_1(text):
return ''.join([i if ord(i) < 128 else ' ' for i in text])
with closing(Firefox()) as browser:
site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
browser.get(site)
file = open("review.txt", "w")
for count in range(1, 10):
nav_btns = browser.find_elements_by_class_name('_33m_Yg')
button = ""
for btn in nav_btns:
number = int(btn.text)
if(number==count):
button = btn
break
button.send_keys(Keys.RETURN)
WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))
read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
for rm in read_more_btns:
browser.execute_script("return arguments[0].scrollIntoView();", rm)
browser.execute_script("window.scrollBy(0, -150);")
rm.click()
page_source = browser.page_source
soup = BeautifulSoup(page_source, "lxml")
ans = soup.find_all("div", class_="_3DCdKt")
for tag in ans:
title = unicode(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
title = remove_non_ascii_1(title)
title.encode('ascii','ignore')
content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
content = remove_non_ascii_1(content)
content.encode('ascii','ignore')
content = content[15:-7]
votes = tag.find_all("span", class_="_1_BQL8")
upvotes = int(votes[0].string)
downvotes = int(votes[1].string)
file.write("Review Title : %s\n\n" % title )
file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
file.write("Review Content :\n%s\n\n\n\n" % content )
file.close()
Usage:
Install the requirements by running pip install bs4 selenium.
Add geckodriver to the PATH. Follow these instructions.
Put the link of the product in site variable inside the script.
Run the script by running python scrape.py.
Reviews will be saved in the file review.txt.
Had some issues using #CSMaverick code while accessing the READ MORE link. Modified the code as per my requirement.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs
def get_source_code(browser):
rm_btns = browser.find_elements_by_class_name('_1BWGvX')
for rm_btn in rm_btns:
rm_btn.click()
return browser.page_source
def collect_reviews_attributes(html):
soup_obj = bs(html, "html.parser")
text_tag_divs = soup_obj.find_all('div', attrs={"class", "t-ZTKy"})
heading_tag_divs = soup_obj.find_all('p', attrs={"class", "_2-N8zT"})
rating_tag_divs = soup_obj.find_all('div', attrs={"class", "_3LWZlK _1BLPMq"})
text_tags = [tag.text for tag in text_tag_divs]
heading_tags = [tag.text for tag in heading_tag_divs]
rating_tags = [tag.text for tag in rating_tag_divs]
return list(zip(heading_tags, text_tags, rating_tags))
collector_list = []
browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
num_pages = 3 # get from the url dynamically or else give large number and try hitting until u get exception
browser.get(url) # open the url in the browser
for _ in range(num_pages):
page_source_code = get_source_code(browser)
collector_list.extend(collect_reviews_attributes(page_source_code))
next_page = browser.find_elements_by_class_name('_1LKTO3')[-1] # previous and next are under same class. Access last element
next_page.click()