I am trying to get the prices of routes on a bus booking page.
import requests
from bs4 import BeautifulSoup
import re

popup_linkz = list()
p = range(1, 2, 1)

for i in p:

    def get_headers(session):
        res = session.get("https://new.turbus.cl/turbuscl/inicio-compra")
        if res.status_code == 200:
            print("Got headers")
            return res.text
        else:
            print("Failed to get headers")

    def search(session):
        data = {
            'origenInputModal': 'Santiago',
            'destinoInputModal': 'Calama',
            'fechaRegreso': '03-04-2021',
            'fechaIda': '31-03-2021',
        }
        res = session.post(
            "https://new.turbus.cl/turbuscl/seleccion-itinerario",
            data=data)  # not sure if this is the search link
        if res.status_code == 200:
            print("Search succeeded")
            return res.text
        else:
            print("Search failed with error:", res.reason)
            print(res.text)

    def get_popup_link(html):
        soup = BeautifulSoup(html, "html.parser")
        for t in soup.find_all('div', {'class': 'ticket_price-value'}):
            precio = t.find('[class$="ticket_price-value"]').text
            # cantidad = t.select_one('[id$="lblCantidad"]').text
            # descripction = t.select_one('[id$="lblDescripcion"]').text
            print(f"{precio=} {precio=}")
            # print()
        return precio

    def main():
        with requests.Session() as s:
            get_headers(s)
            html = search(s)
            popup_links = (get_popup_link(html))
            print(popup_links)
            # popup_linkz.extend(popup_links)
            # print(popup_links)
            # print(popup_linkz)
            # download_html = get_download_html(s, popup_links)
            # print(download_html)
            # popup_linkz.extend(popup_links for i in range(0, 1, 1))

    main()

# a = popup_linkz
# print(a)
This is the link: https://new.turbus.cl/turbuscl/inicio-compra
So right now I am able to find the input boxes of the search form, but I am not sure where to submit the request.
I am getting this error: ValueError: too many values to unpack (expected 2),
so I am not sure what I am doing wrong.
Could you enlighten me so I can get this working?
I have been trying all day and took a new approach with Selenium in order to get the search working.
Is this the right way to do it, or was my first approach better?
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 29 21:04:05 2022
@author: christian marcos
"""
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
from pandas.io.html import read_html
import requests
# select and fill the first field (origin)
driver = wd.Chrome('C:\\chromedriver.exe')
driver.maximize_window()
driver.get('https://new.turbus.cl/turbuscl/inicio-compra')
driver.implicitly_wait(20)
driver.find_element_by_xpath('//*[@id="origen"]').click()
wait = WebDriverWait(driver, 30)

# select the first city in the origin modal
driver.implicitly_wait(10)
driver.find_element_by_xpath('//*[@id="modalOriginCity"]/div/div/div[2]/div[2]/ul/li[1]').click()
Best regards,
The post data needed is different. In this case, you need:
{
    "fechaSalidaTramo": "31/03/2022",
    "mnemotecnicoCiudadOrigenTramo": "stgo",
    "mnemotecnicoCiudadDestinoTramo": "aric",
    "horaSalidaTramo": 0,
    "horaSalidaTramoMaxima": 0,
    "codigoLinea": 90,
    "numeroViaje": 0,
    "numeroCuentaCorrienteCliente": 0,
    "codigoIdaRegreso": 1,
    "cantidadAsientos": 1,
    "numeroRegistros": 0
}
And the endpoint is https://new.turbus.cl/turbuscl/recursos/vtwst76/web1.
In Python, it'll look like this:
import requests

LINK = "https://new.turbus.cl/turbuscl/recursos/vtwst76/web1"
DATA = '{"fechaSalidaTramo":"31/03/2022","mnemotecnicoCiudadOrigenTramo":"stgo","mnemotecnicoCiudadDestinoTramo":"aric","horaSalidaTramo":0,"horaSalidaTramoMaxima":0,"codigoLinea":90,"numeroViaje":0,"numeroCuentaCorrienteCliente":0,"codigoIdaRegreso":1,"cantidadAsientos":1,"numeroRegistros":0}'
HEADERS = {
    "Content-Type": "application/json",
}

def get_route(origin, destination):
    # the payload above is hard-coded for now; origin/destination are placeholders
    res = requests.post(LINK, data=DATA, headers=HEADERS)
    if res.status_code == 200:
        print("getting routes")
        return res.json()
    else:
        print(res)

def main():
    info = get_route("here", "there")
    print(info)

if __name__ == "__main__":
    main()
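If you want get_route to actually use its parameters, here is a minimal sketch that builds the payload from the arguments and lets requests serialize it; the only city mnemonics confirmed by the example above are "stgo" and "aric", so any other codes would have to be captured from the site's own network requests:

import requests

LINK = "https://new.turbus.cl/turbuscl/recursos/vtwst76/web1"

def get_route(origin, destination, travel_date="31/03/2022"):
    # build the payload from the parameters instead of a hard-coded string
    payload = {
        "fechaSalidaTramo": travel_date,
        "mnemotecnicoCiudadOrigenTramo": origin,        # e.g. "stgo"
        "mnemotecnicoCiudadDestinoTramo": destination,  # e.g. "aric"
        "horaSalidaTramo": 0,
        "horaSalidaTramoMaxima": 0,
        "codigoLinea": 90,
        "numeroViaje": 0,
        "numeroCuentaCorrienteCliente": 0,
        "codigoIdaRegreso": 1,
        "cantidadAsientos": 1,
        "numeroRegistros": 0,
    }
    # json= serializes the dict and sets the Content-Type header for us
    res = requests.post(LINK, json=payload)
    res.raise_for_status()
    return res.json()

if __name__ == "__main__":
    print(get_route("stgo", "aric"))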
How I got to the answer:
Go to the site.
Open the network tab, so I can see requests.
Do a search, and find the request that matches.
Copy the request as a curl request and import it into Postman.
Remove headers, and see if you get an error when you do a request. Repeat until you have only the needed headers.
Copy the needed headers and data, and test it using requests.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
URL = 'https://mergr.com/firms/search/employees?page=1&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'
driver.get(URL)
email = driver.find_element(By.CSS_SELECTOR, "input#username")
email.send_keys("timgr8@outlook.com")
password = driver.find_element(By.CSS_SELECTOR, "input#password")
password.send_keys("Cosmos1990$$$$$$$")
login = driver.find_element(By.CSS_SELECTOR, "button.btn").click()
urls = []
product = []
soup = BeautifulSoup(driver.page_source, "lxml")
details = soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        pass
    wev = {
        'Name': t1,
    }
    product.append(wev)

page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    href = link.get_attribute("href")
    urls.append(href)

for url in urls:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    data = {
        'website': website
    }
    product.append(data)

df = pd.DataFrame(product)
df.to_csv('firm.csv')
The data from the website ends up shifted down in the CSV file, as shown in the picture. Am I appending the data in the wrong way? Why is the data moving down, and where am I going wrong? Kindly recommend a fix.
I want the output in the format shown below. Kindly suggest a solution.
You can't append wev and data separately - you need website and Name in the same dictionary for pandas to know that they belong to the same row.
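To see the difference, here is a minimal, self-contained illustration with made-up values: two half-filled dictionaries give pandas two staggered rows, while one merged dictionary gives a single complete row.

import pandas as pd

# two separate dicts -> two rows, each missing a value (the "moving down" effect)
print(pd.DataFrame([{'Name': 'Alice'}, {'website': 'example.com'}]))
#     Name      website
# 0  Alice          NaN
# 1    NaN  example.com

# one merged dict -> one complete row
print(pd.DataFrame([{'Name': 'Alice', 'website': 'example.com'}]))
#     Name      website
# 0  Alice  example.com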
You could add the websites in a separate list like
sites = []
# for url in urls:
#     driver.get(url)
#     soup = BeautifulSoup(driver.page_source, "lxml")
#     try: ... except: ...  (same as before)
    data = {
        'website': website
    }
    sites.append(data)
and then zip and combine:
for pi, dictPair in enumerate(zip(product, sites)):
    product[pi].update(dictPair[1])

df = pd.DataFrame(product)
df.to_csv('firm.csv')
However, I don't think it's the best way to make sure the right Names and Websites are matched up.
You should just add to the same dictionary for each row from the start instead of zipping and merging.
added_urls = []
product = []
soup = BeautifulSoup(driver.page_source, "lxml")
details = soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        # pass # then you'll just be using the previous row's t1
        # [also, if this happens in the first loop, it will raise an error]
        t1 = 'MISSING' # '' #
    wev = {
        'Name': t1,
    }
    href = detail.select_one("h5.profile-title + p a[href]")
    if href and href.get("href", '').startswith('http'):
        wev['page_link'] = href.get("href")
        added_urls.append(href.get("href"))
    product.append(wev)

### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    href = link.get_attribute("href")
    if href in added_urls: continue # skip links that are already added
    # urls.append(href)
    added_urls.append(href)
    product.append({"page_link": href})
##########################################################

for pi, prod in enumerate(product):
    if "page_link" not in prod or not prod["page_link"]: continue ## missing link
    url = prod["page_link"]
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    del product[pi]["page_link"] ## REMOVE this line IF you want a page_link column in csv
    # data = {'website': website}
    # product.append(data)
    product[pi]['website'] = website

df = pd.DataFrame(product)
df.to_csv('firm.csv')
It seems that Beautiful Soup is not able to retrieve the info from a table.
What I am trying to do is retrieve the table with its header and save it to a pandas DataFrame. Any help is much appreciated.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Create an URL object
url = 'xxxx'
# Create object page
page = requests.get(url)
soup = BeautifulSoup(page.content, "html5lib")

data = soup.find_all("table", id="cve_table", attrs={"class": "table"})
print(len(data))

headers = []
for body in data:
    print(body)
    for item in body:
        title = item.text
        print(title)
        headers.append(title)
print(headers)
All I got is this:
<table class="table cell-border table-striped table-condensed table-hover" id="cve_table">
<tbody></tbody>
</table>
['\n ', '', '\n\n ']
It seems that the table is rendered by JavaScript, so when requests loads the HTML page, the table is empty. Investigating the page source shows that the table is rendered by a function called in the last script element. This function takes as a parameter the data structure needed to render the table, which can be extracted as follows:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
from collections import defaultdict

# Create an URL object
url = 'https://cve.rayvyn.net/rayvyn'
# Create object page
page = requests.get(url)
soup = BeautifulSoup(page.content, "html5lib")

dct = defaultdict(list)
script = soup.find(lambda tag: tag.name == "script" and "get_all_cve_data" in tag.text)
if script:
    # the table data is passed to the render function as a JSON string,
    # so it has to be decoded twice: once for the argument, once for the rows
    result = re.search(r'\((.*)\)', script.text)
    text = result.group(1)
    data = json.loads(json.loads(text))
    for row in data:
        dct['CVE ID'].append(row[0])
        dct['Feed'].append(row[1])
        dct['Date Modified'].append(row[2])
        dct['Description'].append(row[3])
        dct['Vector'].append(row[4])
        dct['Vendor'].append(row[5])
        dct['Product'].append(row[6])
        dct['Advisory Link'].append(row[7])
else:
    print('Script tag with function get_all_cve_data() not found')

df = pd.DataFrame(dct)
df
An alternative approach would be to use the Selenium framework:
import re
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from collections import defaultdict

# Create an URL object
url = 'https://cve.rayvyn.net/rayvyn'
# delay for selenium web driver wait
DELAY = 30

# create selenium driver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome('<<path to chromedriver.exe>>', options=chrome_options)

# open web page
driver.get(url)
script = WebDriverWait(driver, DELAY).until(EC.presence_of_element_located((By.XPATH, "//script[contains(text(), 'get_all_cve_data')]")))

dct = defaultdict(list)
if script:
    result = re.search(r'\((.*)\)', script.get_attribute('innerHTML'))
    text = result.group(1)
else:
    print('Script tag with function get_all_cve_data() not found')
driver.quit()

# the argument is a JSON string containing JSON, so decode it twice
data = json.loads(json.loads(text))
for row in data:
    dct['CVE ID'].append(row[0])
    dct['Feed'].append(row[1])
    dct['Date Modified'].append(row[2])
    dct['Description'].append(row[3])
    dct['Vector'].append(row[4])
    dct['Vendor'].append(row[5])
    dct['Product'].append(row[6])
    dct['Advisory Link'].append(row[7])

df = pd.DataFrame(dct)
df
Please note that using Selenium also requires a Selenium WebDriver (a separate executable). It drives a real browser and, among other features, waits for the JavaScript code on the page to be executed and the HTML to be rendered.
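If you would rather not download chromedriver manually, here is a minimal sketch using the webdriver-manager package (an extra dependency, installed with pip install webdriver-manager) and the Selenium 4 style constructor, which fetches a matching driver automatically:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')

# webdriver-manager downloads a chromedriver that matches the installed Chrome
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)
driver.get('https://cve.rayvyn.net/rayvyn')
print(driver.title)
driver.quit()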
I've tried to create a web scraper for CNN. My goal is to scrape all news articles within the search query. Sometimes I get output for some of the scraped pages and sometimes it doesn't work at all.
I am using the selenium and BeautifulSoup packages in a Jupyter Notebook. I am iterating over the pages via the URL parameters &page={}&from={}. I tried By.XPATH before, simply clicking the next button at the end of the page, but it gave me the same results.
Here's the code I'm using:
#0 ------------import libraries
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import feedparser
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle
import pandas as pd

#3 ------------CNN SCRAPER
#3.1 ----------Define Function
def CNN_Scraper(max_pages):
    base = "https://edition.cnn.com/"
    browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
    load_content = browser.implicitly_wait(30)
    base_url = 'https://edition.cnn.com/search?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100'

    #-------------Define empty lists to be scraped
    CNN_title = []
    CNN_date = []
    CNN_article = []
    article_count = 0

    #-------------iterate over pages and extract
    for page in range(1, max_pages + 1):
        print("Page %d" % page)
        url = base_url + "&page=%d&from=%d" % (page, article_count)
        browser.get(url)
        load_content
        soup = BeautifulSoup(browser.page_source, 'lxml')
        search_results = soup.find('div', {'class': 'cnn-search__results-list'})
        contents = search_results.find_all('div', {'class': 'cnn-search__result-contents'})
        for content in contents:
            try:
                title = content.find('h3').text
                print(title)
                link = content.find('a')
                link_url = link['href']
                date = content.find('div', {'class': 'cnn-search__result-publish-date'}).text.strip()
                article = content.find('div', {'class': 'cnn-search__result-body'}).text
            except:
                print("loser")
                continue
            CNN_title.append(title)
            CNN_date.append(date)
            CNN_article.append(article)
        article_count += 100
        print("-----")

    #-------------Save in DF
    df = pd.DataFrame()
    df['title'] = CNN_title
    df['date'] = CNN_date
    df['article'] = CNN_article
    df['link'] = CNN_link
    return df
    #print("Complete")
    browser.quit()

#3.2 ----------Call Function - Scrape CNN and save pickled data
CNN_data = CNN_Scraper(2)
#CNN_data.to_pickle("CNN_data")
Call the back-end API directly. For more details check my previous answer
import requests
import json

def main(url):
    with requests.Session() as req:
        for item in range(1, 1000, 100):
            r = req.get(url.format(item)).json()
            for a in r['result']:
                print("Headline: {}, Url: {}".format(
                    a['headline'], a['url']))

main("https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}")
I am trying to scrape Flipkart to extract reviews for a product using the requests and BeautifulSoup packages. How can I get the data hidden behind the 'Read more' click event in those reviews?
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import urllib2
import re
from bs4 import BeautifulSoup
import unicodedata

def remove_non_ascii_1(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])

with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
    browser.get(site)

    file = open("review.txt", "w")

    for count in range(1, 10):
        nav_btns = browser.find_elements_by_class_name('_33m_Yg')

        button = ""
        for btn in nav_btns:
            number = int(btn.text)
            if(number == count):
                button = btn
                break

        button.send_keys(Keys.RETURN)
        WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))

        read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
        for rm in read_more_btns:
            browser.execute_script("return arguments[0].scrollIntoView();", rm)
            browser.execute_script("window.scrollBy(0, -150);")
            rm.click()

        page_source = browser.page_source
        soup = BeautifulSoup(page_source, "lxml")
        ans = soup.find_all("div", class_="_3DCdKt")

        for tag in ans:
            title = unicode(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
            title = remove_non_ascii_1(title)
            title.encode('ascii', 'ignore')
            content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
            content = remove_non_ascii_1(content)
            content.encode('ascii', 'ignore')
            content = content[15:-7]

            votes = tag.find_all("span", class_="_1_BQL8")
            upvotes = int(votes[0].string)
            downvotes = int(votes[1].string)

            file.write("Review Title : %s\n\n" % title)
            file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
            file.write("Review Content :\n%s\n\n\n\n" % content)

    file.close()
Usage:
Install the requirements by running pip install bs4 selenium.
Add geckodriver to the PATH. Follow these instructions.
Put the link of the product in the site variable inside the script.
Run the script with python scrape.py.
Reviews will be saved in the file review.txt.
Had some issues using @CSMaverick's code while accessing the READ MORE link. Modified the code as per my requirement.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs

def get_source_code(browser):
    # expand every "READ MORE" link on the page before grabbing the HTML
    rm_btns = browser.find_elements_by_class_name('_1BWGvX')
    for rm_btn in rm_btns:
        rm_btn.click()
    return browser.page_source

def collect_reviews_attributes(html):
    soup_obj = bs(html, "html.parser")
    text_tag_divs = soup_obj.find_all('div', attrs={"class": "t-ZTKy"})
    heading_tag_divs = soup_obj.find_all('p', attrs={"class": "_2-N8zT"})
    rating_tag_divs = soup_obj.find_all('div', attrs={"class": "_3LWZlK _1BLPMq"})
    text_tags = [tag.text for tag in text_tag_divs]
    heading_tags = [tag.text for tag in heading_tag_divs]
    rating_tags = [tag.text for tag in rating_tag_divs]
    return list(zip(heading_tags, text_tags, rating_tags))

collector_list = []
browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
num_pages = 3  # get from the url dynamically, or else give a large number and try hitting until you get an exception
browser.get(url)  # open the url in the browser

for _ in range(num_pages):
    page_source_code = get_source_code(browser)
    collector_list.extend(collect_reviews_attributes(page_source_code))
    next_page = browser.find_elements_by_class_name('_1LKTO3')[-1]  # previous and next are under the same class; access the last element
    next_page.click()
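As a small follow-up, collector_list is a list of (heading, text, rating) tuples, so a minimal sketch for saving it to CSV (the column names here are my own choice) could be:

import pandas as pd

# collector_list holds the (heading, text, rating) tuples gathered above
df = pd.DataFrame(collector_list, columns=["heading", "review_text", "rating"])
df.to_csv("flipkart_reviews.csv", index=False)
print(df.head())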
Using the link to an image on Flickr, requests only returns HTML up to the comment:
`<!-- rendered with love by pprd1-node580-lh1.manhattan.bf1.yahoo.com -->`
(see the image below for the HTML).
I would like to access the links within the img elements three div elements below, so I would appreciate any input.
from bs4 import BeautifulSoup
import logging
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import shutil
import sys
import time

logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s - %(levelname)s - %(message)s")

def flickr_images():
    try:
        search_term, number_images = sys.argv[1:]
        num_req_images = int(number_images)
    except ValueError:
        print("Something went wrong. Command line input must be of "
              "format: 'filename searchterm numberimages'")
        return

    # navigate to search results page
    driver = webdriver.Firefox()
    # poll DOM for max 10 secs if element not immediately available
    driver.implicitly_wait(10)
    driver.get("https://www.flickr.com/search/?text=" + search_term)
    driver.maximize_window()

    # 0sec wait = 25images, 1sec = 48, 3+sec = 98
    time.sleep(3)
    image_link_elems = driver.find_elements_by_class_name("overlay")
    # in case requested is > found
    num_images_tosave = min(num_req_images, len(image_link_elems))
    image_elems_tosave = image_link_elems[:num_images_tosave]
    print("{} images found.".format(num_images_tosave))
    logging.info("Length photos: {}".format(len(image_link_elems)))

    # extract image src's from found elements
    src_links = []
    image_links = [link.get_attribute("href") for link in image_elems_tosave]
    for image_link in image_links:
        res = requests.get(image_link)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        src_elem = soup.select(".zoom-small")
HTML image: