I wrote a parser that scrapes exchange rates, but it needs one final touch.
Code:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.google.com/search?sxsrf=ALeKk02hYi-HCGXbHdPuek-VJRu_8qsUVg%3A1587054998453&ei=lomYXvaSG7zAmwWP_LHQBA&q=%D0%B4%D0%BE%D0%BB%D0%BB%D0%B0%D1%80+%D0%B3%D1%80%D0%B8%D0%B2%D0%BD%D0%B0&oq=&gs_lcp=CgZwc3ktYWIQARgBMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnOgQIABBHSgkIFxIFMTAtMjRKCAgYEgQxMC0yUPFtWPFtYKt8aAFwAngAgAEAiAEAkgEAmAEAoAEBqgEHZ3dzLXdperABCg&sclient=psy-ab'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36', 'accept': '*/*'}

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="VgAgW")
    currency = []
    for item in items:
        currency.append({
            'uah': item.find('span', class_='SwHCTb').get_text(strip=True),
        })
    print(f"'Now the course:' + {currency}")
    return currency

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print('Error')

parse()
I don't know how to get rid of the [{'uah': ...}] wrapping in the output.
Here is what comes out:
'Now the course:' + [{'uah': '27,22'}]
Process finished with exit code 0
currency is a list (currency = []), so when you print a list, it always prints like this: [].
currency is a list of dicts ({'uah': ...}), so when you print a dict, it always prints like this: {key: value}.
It looks like you need print(f"Now the course: {currency[0]['uah']}"), where [0] takes the first element of the list (which is a dict) and ['uah'] then looks up the value by its key.
You can add a variable course to make the value easier to access:
course = item.find('span', class_='SwHCTb').get_text(strip=True)
currency.append({'uah': course})
print(f"Now the course: {course}")
I'm trying to web-scrape for a personal project. Whenever I attempt to condense the code into a single function, or create two inner functions as shown below, I don't get anything back. However, when I keep the functions separate at the top level, they work just fine.
Works:
import requests
import os
import json
from bs4 import BeautifulSoup

browser_headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.137"}

def iata_sing(city):
    starting_letter = city[0].upper()
    iata_single = f"https://en.wikipedia.org/wiki/List_of_airports_by_IATA_airport_code:_{starting_letter}"
    sin_city = requests.get(url=iata_single, headers=browser_headers)
    sin_city.raise_for_status()
    sin_soup = BeautifulSoup(sin_city.content, 'html.parser')
    main = sin_soup.find('div', id='content').find_all('td')
    for line in main:
        if "title=" and city.title() in line.text:
            iata = line.findPreviousSibling().findPreviousSibling()
            iata_target = list(iata)[0]
            # print(iata_target)
            return iata_target

def iata_multi(city):
    iata_multiple = "https://en.wikipedia.org/wiki/IATA_airport_code#Cities_with_multiple_airports"
    airport_list = []
    city = city.title()
    cities = requests.get(url=iata_multiple, headers=browser_headers)
    cities.raise_for_status()
    soup = BeautifulSoup(cities.content, 'html.parser')
    # print(soup.prettify())
    s = soup.find('div', id='content')
    body = s.find('div', class_='mw-parser-output')
    ULs = body.find_all('li')
    for line in ULs:
        if city in line.text:
            for item in line.text:
                airport_list.append(item)
            new_list = "".join(airport_list)
            return new_list

# print(iata_multi("paris"))
print(iata_sing("leigh"))
This doesn't work, and I'm not sure why, but a single call is preferable:
def iata(city):
    browser_headers = {
        "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.137"}

    def iata_sing(city):
        starting_letter = city[0].upper()
        iata_single = f"https://en.wikipedia.org/wiki/List_of_airports_by_IATA_airport_code:_{starting_letter}"
        sin_city = requests.get(url=iata_single, headers=browser_headers)
        sin_city.raise_for_status()
        sin_soup = BeautifulSoup(sin_city.content, 'html.parser')
        main = sin_soup.find('div', id='content').find_all('td')
        for line in main:
            if "title=" and city.title() in line.text:
                iata = line.findPreviousSibling().findPreviousSibling()
                iata_target = list(iata)[0]
                # print(iata_target)
                return iata_target

    def iata_multi(city):
        iata_multiple = "https://en.wikipedia.org/wiki/IATA_airport_code#Cities_with_multiple_airports"
        airport_list = []
        city = city.title()
        cities = requests.get(url=iata_multiple, headers=browser_headers)
        cities.raise_for_status()
        soup = BeautifulSoup(cities.content, 'html.parser')
        # print(soup.prettify())
        s = soup.find('div', id='content')
        body = s.find('div', class_='mw-parser-output')
        ULs = body.find_all('li')
        for line in ULs:
            if city in line.text:
                for item in line.text:
                    airport_list.append(item)
                new_list = "".join(airport_list)
                return new_list

    try:
        iata_multi(city)
        iata_sing(city)
    except IndexError:
        print("This appears to be a multi-airport location")

iata("paris")
So far I've tried separating each function out (which works), but whenever I create a single function, whether with two inner functions or with the function definitions removed and the code inlined, I don't get any output, and I'm stuck as to the reason.
Put print() around the calls to the inner functions. – Barmar
As Barmar stated, this worked:
try:
    print(iata_multi(city))
    print(iata_sing(city))
except IndexError:
    print("This appears to be a multi-airport location")
Thank you!
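For completeness: the root cause is that the inner functions return values, but the outer iata() never printed or returned them, so the results were silently discarded. Instead of printing inside iata(), you could also return the result and let the caller print it. A minimal sketch (inner functions unchanged, and assuming, as the original message suggests, that the IndexError signals a multi-airport city):

def iata(city):
    # ... iata_sing and iata_multi defined here exactly as above ...
    try:
        return iata_sing(city)
    except IndexError:
        # fall back to the multi-airport listing
        return iata_multi(city)

print(iata("paris"))  # the caller decides what to do with the result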
How do I get this code to loop over other stocks? For example, I want it to repeat and show stocks like Tesla, Amazon, and Apple all in one execution. My code only shows one stock, and I want it to display multiple stocks.
Code:
import requests
from bs4 import BeautifulSoup

def create_url():
    url = f'https://finance.yahoo.com/quote/TSLA'
    return url

def get_html(url):
    header = {"User Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
    response = requests.get(url, headers=header)
    if response.status_code == 200:
        return response.text
    else:
        return None

def parse_data(html):
    soup = BeautifulSoup(html, 'html.parser')
    name = soup.find('h1', {'class': 'D(ib) Fz(18px)'}).text
    price = soup.select_one('#quote-header-info > div.My(6px).Pos(r).smartphone_Mt(6px).W(100%) > div.D(ib).Va(m).Maw(65%).Ov(h) > div.D(ib).Mend(20px) > fin-streamer.Fw(b).Fz(36px).Mb(-4px).D(ib)').text
    stock_data = {
        'name': name,
        'price': price,
    }
    return stock_data

def main():
    url = create_url()
    # get html
    html = get_html(url)
    data = parse_data(html)
    # return data
    print(data)

if __name__ == '__main__':
    main()
Try changing your create_url to take one parameter, which will be the stock you want to query, like so:
def create_url(ticker):
    url = 'https://finance.yahoo.com/quote/' + ticker
    return url
Then, you can create a list of tickers in your main function and call the function for each ticker.
def main():
    tickers = ["AAPL", "TSLA"]
    for ticker in tickers:
        url = create_url(ticker)
        # get html
        html = get_html(url)
        data = parse_data(html)
        print(data)
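One caveat worth adding (not part of the original answer): get_html returns None for any non-200 response, and parse_data(None) would then crash inside BeautifulSoup. A small guard in the loop keeps one failed ticker from killing the whole run:

def main():
    tickers = ["AAPL", "TSLA", "AMZN"]
    for ticker in tickers:
        html = get_html(create_url(ticker))
        if html is None:
            # non-200 response; skip this ticker instead of crashing
            print(f"Request failed for {ticker}")
            continue
        print(parse_data(html))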
I have a list of links, and each link has an id that is in the Id list.
How do I change the code so that, when iterating over the links, the corresponding id is substituted into the search?
All the code is below:
import pandas as pd
from bs4 import BeautifulSoup
import requests

HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/81.0.4044.138 Safari/537.36 OPR/68.0.3618.125', 'accept': '*/*'}

links = ['https://www..ie', 'https://www..ch', 'https://www..com']
Id = ['164240372761e5178f0488d', '164240372661e5178e1b377', '164240365661e517481a1e6']

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_data_no_products(html):
    data = []
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', id='')  # How to paste the id on each iteration???????
    for item in items:
        data.append({'pn': item.find('a').get('href')})
    return print(data)

def parse():
    for i in links:
        html = get_html(i)
        get_data_no_products(html.text)

parse()
Parametrise your code:
def get_data_no_products(html, id_):
    data = []
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', id=id_)
And then use zip():

for link, id_ in zip(links, Id):
    html = get_html(link)
    get_data_no_products(html.text, id_)
Note that there's a likely bug in your code: you return print(data), which will always be None. You likely just want return data.
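Folded back into the question's own parse(), the whole step might look like this (a sketch; the status check is an addition, mirroring the pattern used in your other scripts, and it assumes get_html returns the response object):

def parse():
    # pair each link with its corresponding id
    for link, id_ in zip(links, Id):
        response = get_html(link)
        if response.status_code == 200:
            get_data_no_products(response.text, id_)
        else:
            print(f'Error fetching {link}')

parse()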
PS

There is another solution to this, which you will frequently see from people who are new to Python:

for i in range(len(links)):
    link = links[i]
    id_ = Id[i]
    ...

This... works. It might even feel easier or more natural if you are coming from, e.g., C. (Then again, I'd likely use pointers...) Style is very much personal, but if you're going to write in a high-level language like Python, you might as well avoid thinking about things like 'the index of the current item' as much as possible. Just my £0.02.
I want to access the e-journal page and then retrieve every article's abstract.
I wrote code that builds a list of the URLs of the abstract pages, and it works successfully.
But when I try to request those URLs and retrieve the abstracts, it doesn't work (it prints many 'None's to the console).
This is my code.
import requests
from bs4 import BeautifulSoup

h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
URL = "https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7"
JAGS_result = requests.get(URL, headers=h)
JAGS_soup = BeautifulSoup(JAGS_result.text, "html.parser")

L = []
for link in JAGS_soup.find_all('a', {"title": "Abstract"}):
    L.append(link.get('href'))

Ab_Links = []
a = 0
for ab_link in L:
    if a == len(L):
        break
    else:
        full_link = "https://agsjournals.onlinelibrary.wiley.com" + L[a]
        Ab_Links.append(full_link)
        a = a + 1

print(Ab_Links)

b = 0
Ab = []
Ab_URL = Ab_Links[b]
for ab_url in Ab_Links:
    if b == len(L):
        break
    else:
        Ab_result = requests.get(Ab_Links[b], headers=h)
        Ab_soup = BeautifulSoup(Ab_result.text, "html.parser")
        abstract = Ab_soup.find({"class": "article-section article-section__abstract"})
        Ab.append(abstract)
        b = b + 1

print(Ab)
I am a novice at Python and HTML, so it is very hard for me to write this code by myself. Please help me...
import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}

def main(url):
    with requests.Session() as req:
        req.headers.update(headers)
        r = req.get(url)
        soup = BeautifulSoup(
            r.content, 'lxml', parse_only=SoupStrainer('a', title='Abstract'))
        links = [urljoin(url, x['href']) for x in soup.select('a')]
        for link in links:
            r = req.get(link)
            soup = BeautifulSoup(r.text, 'lxml')
            print(soup.select_one('.article-section.article-section__abstract'))

if __name__ == "__main__":
    main('https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7')
You could try this out. It prints the abstracts of all the articles on the page.
import requests
import bs4 as bs

url = 'https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7'
h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
resp = requests.get(url, headers=h)
soup = bs.BeautifulSoup(resp.text, 'lxml')

base_url = 'https://agsjournals.onlinelibrary.wiley.com'
abstract_urls = soup.findAll('a', attrs={'title': 'Abstract'})
for i in abstract_urls:
    a_url = base_url + i['href']
    r = requests.get(a_url, headers=h)
    soup = bs.BeautifulSoup(r.text, 'lxml')
    abs_text = soup.find('section', class_='article-section article-section__full').text.strip()
    print(abs_text)
Your code is mostly correct. The problem is with finding the abstract. To search for an element by class, use class_='...'. If you change your abstract = line to the following, it will return results:

abstract = Ab_soup.find(class_='article-section article-section__abstract')

Also, you can simplify your loops. for ab_link in L iterates over each item in L and then stops on its own. You do not need to test whether a == len(L); in fact, that condition will never be True, because the loop exits before a reaches len(L).
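Putting both fixes together, the counter-based loops collapse into something like this (a sketch reusing the question's variable names):

base = "https://agsjournals.onlinelibrary.wiley.com"
Ab_Links = [base + href for href in L]  # build the full abstract URLs

Ab = []
for ab_url in Ab_Links:  # iterate directly; no manual counter needed
    Ab_result = requests.get(ab_url, headers=h)
    Ab_soup = BeautifulSoup(Ab_result.text, "html.parser")
    # search by class with class_=..., as described above
    Ab.append(Ab_soup.find(class_="article-section article-section__abstract"))
print(Ab)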
Help me, please! I wrote a simple parser, but it does not work correctly, and I do not know what is causing it.
import requests
from bs4 import BeautifulSoup

URL = 'https://stopgame.ru//topgames'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0', 'accept': '*/*'}
HOST = 'https://stopgame.ru'

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('a', class_="lent-block game-block")
    print(items)

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        items = get_content(html.text)
    else:
        print('Error')

parse()
I've got this output:
[]
Process finished with exit code 0
items = soup.find_all('a', class_="lent-block game-block")

You are trying to find the class "lent-block game-block" on an anchor tag, but no such anchor exists in the HTML, which is why you are getting an empty list.
Try this div instead and you will get the list of matched items:

items = soup.find_all('div', class_="lent-block lent-main")
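Dropped into the original script, the corrected get_content might look like this (a sketch; it also returns the items so that the items = get_content(...) line in parse() actually receives something):

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    # the matching elements are divs, not anchor tags
    items = soup.find_all('div', class_="lent-block lent-main")
    print(items)
    return items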