import requests
from bs4 import BeautifulSoup

html = requests.get("https://www.haremaltin.com/canli-piyasalar/")
soup = BeautifulSoup(html.content, "html.parser")
atalira = soup.find_all(?????)
for gold in atalira:
    price = gold.text
    print(price)
Hello everyone. If you go to https://www.haremaltin.com/canli-piyasalar/ , under "Altın Fiyatları" you will see "Eski Ata". I want to pull one of those values into the ????? part of my Python code, and that is proving a little challenging for me. Thank you for your time in advance. Below is the HTML containing the value I want to extract:
<span class="item end price"><span class="arrowWrapper"><!----> <!----></span>
3.327
</span>
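(For reference, if the value were present in the static HTML, a selector built from the classes in the snippet above would be enough to fill the ????? part. This is only a sketch: the prices on this page are rendered client-side, so with plain requests these spans will most likely be empty, which is what led to the Selenium edit below.)
import requests
from bs4 import BeautifulSoup

html = requests.get("https://www.haremaltin.com/canli-piyasalar/")
soup = BeautifulSoup(html.content, "html.parser")

# selector guessed from the HTML snippet above; the live prices are filled
# in by JavaScript, so this likely prints nothing for the raw HTML
for gold in soup.select("span.item.end.price"):
    print(gold.get_text(strip=True))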
Edit:
I have found a way
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
# pip install selenium
# apt-get update # to update ubuntu to correctly run apt install
# apt install chromium-chromedriver
# cp /usr/lib/chromium-browser/chromedriver /usr/bin
# use command above if you code on google colab
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
site = 'https://www.haremaltin.com/altin-fiyatlari'
wd = webdriver.Chrome('chromedriver', options=options)
wd.get(site)
time.sleep(5) # give chrome 5 seconds to load the page
html = wd.page_source
df = pd.read_html(html)
gold = df[1][2][15] # table 1, column 2, row 15, the value I want
gold = int((float(gold))*1000)
# the value came back as a float like 3.252 (displayed on the site as
# something like 3.252,000), so multiplying by 1000 and casting to int
# is the workaround I found
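If the cell is ever read as raw text (e.g. via BeautifulSoup) rather than the float pandas produces, a small helper can normalize it. This is just a sketch and assumes the Turkish number format: "." as thousands separator, "," as decimal separator.
def parse_price(text):
    # "3.327" -> 3327.0, "3.252,50" -> 3252.5
    cleaned = str(text).replace(".", "").replace(",", ".")
    return float(cleaned)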
I'm not sure you can do it this way reliably. The best approach is to use an API for the data you want and go from there; if the site doesn't offer one, find a different site that does. Here is a sample I made a while back.
import re
import http.client

def gold_price():
    conn = http.client.HTTPSConnection("www.goldapi.io")
    payload = ''
    headers = {
        'x-access-token': 'goldapi-aq2kfluknfhfjz4-io',
        'Content-Type': 'application/json'
    }
    conn.request("GET", "/api/XAU/USD", payload, headers)
    res = conn.getresponse()
    data = res.read()
    txt = data.decode("utf-8")
    pattern = re.search(r'"price":\d\d\d\d', txt)
    # pattern = re.findall(r'\d\d\d\d', txt)
    print(pattern)

gold_price()
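Since the endpoint declares JSON, parsing the body with the json module is a bit more robust than a fixed-width regex. A sketch along the same lines, assuming the response really is a JSON object with a numeric "price" field (which is what the regex above is matching against):
import json
import http.client

def gold_price_json():
    conn = http.client.HTTPSConnection("www.goldapi.io")
    headers = {
        'x-access-token': 'goldapi-aq2kfluknfhfjz4-io',
        'Content-Type': 'application/json'
    }
    conn.request("GET", "/api/XAU/USD", '', headers)
    body = conn.getresponse().read().decode("utf-8")
    # assumes the API returns {"price": ...} in its JSON payload
    return json.loads(body)["price"]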
Related
Hi everyone.
I am working on a Python project that uses Selenium to scrape data.
There is one problem: I have to scrape the data every 5 minutes.
I run ChromeDriver through Selenium, but the scraping speed is very slow.
A single run of this project takes at least 30 minutes, so I can't get the data every 5 minutes.
If you have experience in this field, please help me.
If you can suggest other approaches (for example Beautiful Soup), I will be very happy.
Note: the site I want to get data from is rendered using JavaScript.
This is my source code. I am testing it.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

driver = webdriver.Chrome()
driver.set_window_size(800, 600)

tickerNames = []
finvizUrl = "https://finviz.com/screener.ashx?v=111&f=exch_nasd,geo_usa,sh_float_u10,sh_price_u10,sh_relvol_o2"
nasdaqUrl = "https://www.nasdaq.com/market-activity/stocks/"
tickerPrice = []

def openPage(url):
    driver.get(url)

def exitDrive():
    driver.quit()

def getTickers():
    tickers = driver.find_elements_by_class_name('screener-link-primary')
    for i in range(len(tickers)):
        tickerNames.append(tickers[i].text)
    return tickerNames

def comparePrice(tickers):
    for i in range(len(tickers)):
        openPage(nasdaqUrl + tickers[i])
        # append rather than index-assign: tickerPrice starts out empty
        tickerPrice.append(driver.find_element_by_class_name('symbol-page-header__pricing-price').text)
    return tickerPrice

openPage(finvizUrl)
# getTickers()
print(comparePrice(getTickers()))
There seems to be an API on the nasdaq site that you can query (found using network tools), so there isn't really any need to use selenium for this. Here is an example that gets the data using requests
import requests
import lxml.html
import time

FINVIZ_URL = "https://finviz.com/screener.ashx?v=111&f=exch_nasd,geo_usa,sh_float_u10,sh_price_u10,sh_relvol_o2"
NASDAQ_URL = "https://api.nasdaq.com/api/quote/{}/summary?assetclass=stocks"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15"
}
session = requests.Session()
session.headers.update(headers)

r = session.get(FINVIZ_URL)
# Get data using lxml xpath but use whatever you want
x = lxml.html.fromstring(r.text)
stocks = x.xpath("//*[@class='screener-link-primary']/text()")

for stock in stocks:
    data = session.get(NASDAQ_URL.format(stock))
    print(f"INFO for {stock}")
    print(data.json())  # This might have the data you want
    # Sleep in case there is a rate limit (may not be needed)
    time.sleep(5)
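And since the question needs a refresh every 5 minutes, the whole pass can simply be wrapped in a loop. A rough sketch that reuses the session, URLs and imports defined above (a proper scheduler such as cron would work just as well):
# repeat the whole scrape roughly every 5 minutes
while True:
    r = session.get(FINVIZ_URL)
    stocks = lxml.html.fromstring(r.text).xpath("//*[@class='screener-link-primary']/text()")
    for stock in stocks:
        print(f"INFO for {stock}")
        print(session.get(NASDAQ_URL.format(stock)).json())
        time.sleep(5)
    time.sleep(300)  # 5 minutes between full passes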
I would like to download data from the http://ec.europa.eu/taxation_customs/vies/ site. The problem is that when I enter data on it through the program, the URL doesn't change, so the file saved to disk contains the same page that was open at the beginning, without the data. Maybe I don't know how to access the site after submitting the data? I'm new to Python and have tried to find a solution without result, so if this has come up before, please link me to it. Here's my code. I appreciate all responses :)
import requests
import selenium
import select as something
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import pdfkit
url = "http://ec.europa.eu/taxation_customs/vies/?locale=pl"
driver = webdriver.Chrome(executable_path ="C:\\Users\\Python\\Chromedriver.exe")
driver.get("http://ec.europa.eu/taxation_customs/vies/")
#wait = WebDriverWait(driver, 10)
obj = Select(driver.find_element_by_id("countryCombobox"))
obj = obj.select_by_index(1)
vies_r = requests.get(url)
vies_vat = driver.find_element_by_id("number")
vies_vat.send_keys('U54799909')
vies_verify = driver.find_element_by_id("submit")
vies_verify.click()
path_wkhtmltopdf = r'C:\Users\Python\wkhtmltox\wkhtmltox\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
print(driver.current_url)
pdfkit.from_url(driver.current_url, "out.pdf", configuration=config)
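One likely reason for the empty output: the URL never changes, so pdfkit.from_url just downloads the blank form again. A sketch of an alternative, feeding the HTML that Selenium has already rendered into wkhtmltopdf instead (the 5-second sleep is only an assumption about how long the result takes to appear):
import time

time.sleep(5)  # crude wait for the result; a WebDriverWait would be cleaner
# convert the rendered page source instead of re-downloading the empty form
pdfkit.from_string(driver.page_source, "out.pdf", configuration=config)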
I'm trying to scrape website traffic from semrush.com.
My current code using BeautifulSoup is:
from bs4 import BeautifulSoup, BeautifulStoneSoup
import urllib.request
import json
req = urllib.request.Request('https://www.semrush.com/info/burton.com', headers={'User-Agent':'Magic Browser'})
response = urllib.request.urlopen(req)
raw_data = response.read()
response.close()
soup = BeautifulSoup(raw_data)
I've been trying data = soup.findAll("a", {"href":"/info/burton.com+(by+organic)"}) or data = soup.findAll("span", {"class":"sem-report-counter"}) without much luck.
I can see the numbers on the webpage that I would like to get. Is there a way to pull this information off? I'm not seeing it in the html I pull.
I went the extra mile and set up a working example of how you can use selenium to scrape that page. Install selenium and try it out!
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
url = 'https://www.semrush.com/info/burton.com' #your url
options = Options() #set up options
options.add_argument('--headless') #add --headless mode to options
driver = webdriver.Chrome(executable_path='/opt/ChromeDriver/chromedriver',
                          chrome_options=options)
#note: executable_path will depend on where your chromedriver.exe is located
driver.get(url) #get response
driver.implicitly_wait(1) #wait to load content
elements = driver.find_elements_by_xpath(xpath='//a[@href="/info/burton.com+(by+organic)"]') #grab that stuff you wanted?
for e in elements: print(e.get_attribute('text').strip()) #print text fields
driver.quit() #close the driver when you're done
Output that I see in my terminal:
356K
6.5K
59.3K
$usd305K
Organic keywords
Organic
Top Organic Keywords
View full report
Organic Position Distribution
I have a page with a table (table id="ctl00_ContentPlaceHolder_ctl00_ctl00_GV", class="GridListings") that I need to scrape.
I usually use BeautifulSoup & urllib for this, but in this case the problem is that the table takes some time to load, so it isn't captured when I try to fetch it with BS.
I cannot use PyQt4, dryscrape or windmill because of some installation issues, so the only possible way is to use Selenium/PhantomJS.
I tried the following, still with no success:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS()
driver.get(url)
wait = WebDriverWait(driver, 10)
# note: the locator has to be passed as a single (By, selector) tuple
table = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'table#ctl00_ContentPlaceHolder_ctl00_ctl00_GV')))
The above code doesn't give me the desired contents of the table.
How do I go about achieving this?
You can get the data using requests and bs4. With almost all (if not all) ASP sites, there are a few POST params that always need to be provided, like __EVENTTARGET, __EVENTVALIDATION etc.:
from bs4 import BeautifulSoup
import requests
data = {"__EVENTTARGET": "ctl00$ContentPlaceHolder$ctl00$ctl00$RadAjaxPanel_GV",
"__EVENTARGUMENT": "LISTINGS;0",
"ctl00$ContentPlaceHolder$ctl00$ctl00$ctl00$hdnProductID": "139",
"ctl00$ContentPlaceHolder$ctl00$ctl00$hdnProductID": "139",
"ctl00$ContentPlaceHolder$ctl00$ctl00$drpSortField": "Listing Number",
"ctl00$ContentPlaceHolder$ctl00$ctl00$drpSortDirection": "A-Z, Low-High",
"__ASYNCPOST": "true"}
And for the actual POST, we need to add a few more values to our post data:
post = "https://seahawks.strmarketplace.com/Charter-Seat-Licenses/Charter-Seat-Licenses.aspx"
with requests.Session() as s:
s.headers.update({"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0"})
soup = BeautifulSoup(s.get(post).content)
data["__VIEWSTATEGENERATOR"] = soup.select_one("#__VIEWSTATEGENERATOR")["value"]
data["__EVENTVALIDATION"] = soup.select_one("#__EVENTVALIDATION")["value"]
data["__VIEWSTATE"] = soup.select_one("#__VIEWSTATE")["value"]
r = s.post(post, data=data)
soup2 = BeautifulSoup(r.content)
table = soup2.select_one("div.GridListings")
print(table)
You will see the table printed when you run the code.
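If a DataFrame is handier than raw HTML, the same markup can be handed to pandas. A sketch, assuming pandas is installed and the listings really sit in a normal <table> inside that div:
import pandas as pd

# parse every <table> found in the GridListings markup into DataFrames
frames = pd.read_html(str(table))
print(frames[0].head())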
If you want to scrape something, it is a good idea to first install a web debugger (Firebug for Mozilla Firefox, for example) to watch how the website you want to scrape actually works.
Next, you need to reproduce how the website talks to its back office.
As you said, the content you want to scrape is loaded asynchronously (only when the document is ready).
With the debugger running and the page refreshed, you will see the following request on the network tab:
POST https://seahawks.strmarketplace.com/Charter-Seat-Licenses/Charter-Seat-Licenses.aspx
The final process to reach your goal is:
1/ Use the requests Python module
2/ Open a requests session to the index page of the site (with cookie handling)
3/ Scrape all the inputs of the specific POST form
4/ Build a POST parameter dict containing all the input names & values scraped in the previous step, plus a few specific fixed params
5/ POST the request (with the required data)
6/ Finally, use the BS4 module (as usual) to soup the returned HTML and scrape your data
Please see below a working code:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import requests
base_url="https://seahawks.strmarketplace.com/Charter-Seat-Licenses/Charter-Seat-Licenses.aspx"
#create requests session
s = requests.session()
#get index page
r=s.get(base_url)
#soup page
bs=BeautifulSoup(r.text)
#extract FORM html
form_soup= bs.find('form',{'name':'aspnetForm'})
#extracting all inputs
input_div = form_soup.findAll("input")
#build the data parameters for POST request
#we add some required <fixed> data parameters for post
data={
'__EVENTARGUMENT':'LISTINGS;0',
'__EVENTTARGET':'ctl00$ContentPlaceHolder$ctl00$ctl00$RadAjaxPanel_GV',
'__EVENTVALIDATION':'/wEWGwKis6fzCQLDnJnSDwLq4+CbDwK9jryHBQLrmcucCgL56enHAwLRrPHhCgKDk6P+CwL1/aWtDQLm0q+gCALRvI2QDAKch7HjBAKWqJHWBAKil5XsDQK58IbPAwLO3dKwCwL6uJOtBgLYnd3qBgKyp7zmBAKQyTBQK9qYAXAoieq54JAuG/rDkC1djKyQMC1qnUtgoC0OjaygUCv4b7sAhfkEODRvsa3noPfz2kMsxhAwlX3Q=='
}
#we add some <dynamic> data parameters
for input_d in input_div:
    try:
        data[input_d['name']] = input_d['value']
    except:
        pass  # skip unused input fields
# post request
r2 = s.post(base_url, data=data)
# write the result
with open("post_result.html", "wb") as f:
    f.write(r2.text.encode('utf8'))
Now, please take a look at the "post_result.html" content and you will find the data!
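For example, to pull the listings table back out of the saved file (a sketch using the table id from the question; adjust it if the page uses a different one):
from bs4 import BeautifulSoup

with open("post_result.html", "rb") as f:
    result_soup = BeautifulSoup(f.read())
listings = result_soup.find("table", {"id": "ctl00_ContentPlaceHolder_ctl00_ctl00_GV"})
print(listings)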
Regards
I am using Python 2.7 on a 32 bit Windows machine.
I am trying to enter species data into http://explorer.natureserve.org and retrieve the results, but am having difficulty understanding how to do it. Needless to say I am relatively new to Python.
I have the following code:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Firefox()
driver.get("http://explorer.natureserve.org")
assert "NatureServe" in driver.title
SciName = driver.find_element_by_name('searchSciOrCommonName')
SciName.send_keys("Arabis georgiana")
SciName.send_keys(Keys.RETURN)
assert "No results found." not in driver.page_source
The above works, but now I need to select the element Arabis georgiana on the results page, which will take me to another page. How do I get the results page back into Python and redirect to the page that I actually want?
You need to set the searchSciOrCommonName field value this way (this answer uses mechanize rather than Selenium; br here is a mechanize.Browser() that has already opened http://explorer.natureserve.org):
br.form = list(br.forms())[0]
br.form['searchSciOrCommonName'] = 'butterfly'
response = br.submit()
Then, you can parse the HTML response via, for example, BeautifulSoup:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response)
for item in soup.select('table[border="1"] > tr i')[1:]:
    print(item.text.strip())
which would print:
Aglais io
Callophrys mossii hidakupa
Callophrys mossii marinensis
Cercyonis pegala incana
...
Psora nipponica
Flowering Plants
Asclepias tuberosa
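If you would rather stay with the Selenium session from the question instead of switching to mechanize, the click-through the question asks about can be done directly. A sketch using the same (older) find_element_by_* API as the question, assuming the species name appears as link text on the results page:
# continuing from the question's Selenium code, after the search has run
result_link = driver.find_element_by_partial_link_text("Arabis georgiana")
result_link.click()                # navigates to the species detail page
print(driver.current_url)          # URL of the page you actually want
detail_html = driver.page_source   # ready to be parsed, e.g. with BeautifulSoup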