I'm trying to make requests to multiple links from Yahoo Finance and then return an array of the Income Statement, Balance Sheet, and Cash Flow, respectively. I found myself stuck with a lot of for loops. I'm wondering if there is a better version of the code below:
import urllib.request
from bs4 import BeautifulSoup

def scrapper(symbol):
    htmls = []
    soup = []
    gen_table = []
    IS = "http://finance.yahoo.com/q/is?s={}+Income+Statement&annual".format(symbol)
    BS = "http://finance.yahoo.com/q/is?s={}+Balance+Sheet&annual".format(symbol)
    CF = "http://finance.yahoo.com/q/is?s={}+Cash+Flow&annual".format(symbol)
    urls = [IS, BS, CF]
    # read each link in urls
    for url in urls:
        with urllib.request.urlopen(url) as response:
            htmls.append(response.read())
    # parse each page with BeautifulSoup
    for html in htmls:
        soup.append(BeautifulSoup(html))
    # store income statement, balance sheet and cash flow tables
    for s in soup:
        gen_table.append(s.find_all("table", class_="yfnc_tabledata1"))
    return gen_table
I might do it this way:
from bs4 import BeautifulSoup
import urllib.request

def fetch_table(symbol, table):
    url = "http://finance.yahoo.com/q/is?s={}+{}&annual".format(symbol, table)
    with urllib.request.urlopen(url) as response:
        result = response.read()
    result = BeautifulSoup(result)
    result = result.find_all("table", class_="yfnc_tabledata1")
    return result

def scrapper(symbol):
    return [fetch_table(symbol, table)
            for table in (
                "Income+Statement",
                "Balance+Sheet",
                "Cash+Flow")]

print(scrapper("X"))
I've struggled with this for days and am not sure what the issue could be. Basically, I'm trying to extract the profile box data (picture below) for each link; going through the inspector, I thought I could pull the p tags to do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- a script that (somewhat) successfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I also have a script that pulls all of the href attributes from multiple links:
from bs4 import BeautifulSoup
import requests

def get_links(url):
    links = []
    website = requests.get(url)
    website_text = website.text
    soup = BeautifulSoup(website_text)
    for link in soup.find_all('a'):
        links.append(link.get('href'))
    for link in links:
        print(link)
    print(len(links))

get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two and get one script that will pull all of the p tags from multiple URLs. I've been trying to do it, and I'm really not sure why this isn't working here:
from bs4 import BeautifulSoup
import requests

def get_profile(url):
    profiles = []
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    container = soup.find('div', attrs={'class', 'main-container'})
    for profile in container.find_all('a'):
        profiles.append(profile.get('p'))
    for profile in profiles:
        print(profile)

get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python, but any advice would be greatly appreciated. Ultimately, my end goal is a tool that can scrape all of this data (player name, current team, born, birthplace, etc.) in a clean way all at once. Maybe I'm doing it entirely wrong, but any guidance is welcome!
You need to combine your two scripts and make a request for each player. Try the following approach, which searches for <td> tags that have the data-th="Player" attribute:
import requests
from bs4 import BeautifulSoup

def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")
    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")
        req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")
        row = {"Name": name, "URL": player_url}
        for p in div_profile_box.find_all("p"):
            try:
                key, value = p.get_text(strip=True).split(':', 1)
                row[key.strip()] = value.strip()
            except ValueError:  # not all entries have values
                pass
        data.append(row)
    return data

urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
]

for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)
    for entry in data:
        print(entry)
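If you want the rows in tabular form at the end (the stated goal of scraping the data in a clean way all at once), a small sketch, assuming pandas is available and "players.csv" is just an example filename, would be to collect every row and dump them to CSV:

import pandas as pd

all_rows = []
for url in urls:
    all_rows.extend(get_links(url))

# one dict per player becomes one row; missing profile fields become NaN
df = pd.DataFrame(all_rows)
df.to_csv("players.csv", index=False)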
I am new to Python and would really appreciate some help!
I have been trying to create a dictionary that assigns books to their authors, only for it to come out messy and repeat itself.
How can I fix this?
import requests
from bs4 import BeautifulSoup

url = "https://www.banyen.com/new-arrivals/index.html"
response = requests.get(url)
html = response.content
scraped = BeautifulSoup(html, 'html.parser')
results = []
article = scraped.find("div", class_="block block-system block-odd clearfix")
for i in article.find_all():
    name = i.find("h2", "a href", class_="teaser-title")
    author = i.find("span", class_="price-amount")
    if name is not None:
        if author is not None:
            results.append({name: author})
print(results)
import requests
from bs4 import BeautifulSoup
import re

url = "https://www.banyen.com/new-arrivals/index.html"
response = requests.get(url)
html = response.content
scraped = BeautifulSoup(html, 'html.parser')
results = []
articles = scraped.find_all("div", id=re.compile("node-"))
for i in articles:
    name = i.find("h2").find('a')
    author = i.find("span", class_="price-amount")
    if name is not None:
        if author is not None:
            results.append({name.text.strip(): author.text})
print(results)
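If you would rather end up with a single dictionary instead of a list of one-entry dictionaries (the question asked for "a dictionary for assigning books"), a small follow-up sketch could merge the entries:

# merge the list of one-entry dicts into a single dict
# (a repeated title keeps only the last value seen)
books = {}
for entry in results:
    books.update(entry)
print(books)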
I have a list of Twitter usernames and I need to get their number of followers. I used BeautifulSoup and requests; however, I only get the result for one account every time.
from bs4 import BeautifulSoup
import requests
import pandas as pd

purcsv = pd.read_csv('pureeng.csv', engine='python')
followers = purcsv['username']
followers.head(10)
handle = purcsv['username'][0:40]
temp = ("https://twitter.com/" + handle)
temp = temp.tolist()
for url in temp:
    page = requests.get(url)
bs = BeautifulSoup(page.text, 'lxml')
follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
followers = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
print("Number of followers: {} ".format(followers.get('data-count')))
That's because you are looping over the urls first and fetching the content for each into the same variable page here:
for url in temp:
    page = requests.get(url)
so page will always contain the last page fetched. To solve this, you need to process each page as soon as it is fetched:
followers_list = []
for url in temp:
    page = requests.get(url)
    bs = BeautifulSoup(page.text, "html.parser")
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    followers = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
    print("Number of followers: {} ".format(followers.get('data-count')))
    followers_list.append(followers.get('data-count'))
print(followers_list)
Here is a full example to verify:
from bs4 import BeautifulSoup
import requests
import pandas as pd

purcsv = pd.read_csv('pureeng.csv')
followers = purcsv['username']
handles = purcsv['username'][0:40].tolist()
followers_list = []
for handle in handles:
    url = "https://twitter.com/" + handle
    try:
        page = requests.get(url)
    except Exception as e:
        print(f"Failed to fetch page for url {url} due to: {e}")
        continue
    bs = BeautifulSoup(page.text, "html.parser")
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    followers = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
    print("Number of followers: {} ".format(followers.get('data-count')))
    followers_list.append(followers.get('data-count'))
print(followers_list)
output:
Number of followers: 13714085
Number of followers: 4706511
['13714085', '4706511']
You may consider using async functions for fetching and processing those urls if you have too many of them.
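For example, here is a minimal sketch of that async idea using asyncio and aiohttp (aiohttp is an extra dependency; the parsing logic is copied from the answer above and assumes Twitter still serves that markup):

import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def fetch_followers(session, url):
    # fetch one profile page and parse its follower count
    async with session.get(url) as response:
        text = await response.text()
    bs = BeautifulSoup(text, "html.parser")
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    if follow_box is None:
        return None
    span = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
    return span.get('data-count')

async def main(handles):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_followers(session, "https://twitter.com/" + h) for h in handles]
        return await asyncio.gather(*tasks)

# followers_list = asyncio.run(main(handles))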
Please help.
I want to get all the company names on each page, and there are 12 pages.
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/2
-- this website only changes the page number at the end.
So here is my code so far.
Can I get just the titles (company names) from all 12 pages?
Thank you in advance.
from bs4 import BeautifulSoup
import requests

maximum = 0
page = 1
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1'
response = requests.get(URL)
source = response.text
soup = BeautifulSoup(source, 'html.parser')
whole_source = ""
for page_number in range(1, maximum+1):
    URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/' + str(page_number)
    response = requests.get(URL)
    whole_source = whole_source + response.text
soup = BeautifulSoup(whole_source, 'html.parser')
find_company = soup.select("#content > div.wrap_analysis_data > div.public_con_box.public_list_wrap > ul > li:nth-child(13) > div > strong")
for company in find_company:
    print(company.text)
(screenshot in the original post: output of one page)
(screenshot in the original post: page source)
So, you want to remove all the headers and get only the string of the company name?
Basically, you can use soup.findAll to find the list of companies in this format:
<strong class="company"><span>중소기업진흥공단</span></strong>
Then you use the .find function to extract information from the <span> tag:
<span>중소기업진흥공단</span>
After that, you use the .contents attribute to get the string out of the <span> tag:
'중소기업진흥공단'
So you write a loop that does the same for each page, and keep a list called company_list to store the results from every page appended together.
Here's the code:
from bs4 import BeautifulSoup
import requests

maximum = 12
company_list = []  # list for storing the results

for page_number in range(1, maximum+1):
    URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(page_number)
    response = requests.get(URL)
    print(page_number)
    whole_source = response.text
    soup = BeautifulSoup(whole_source, 'html.parser')
    for entry in soup.findAll('strong', attrs={'class': 'company'}):  # finding all company names in the page
        company_list.append(entry.find('span').contents[0])  # extracting the name from the result
company_list will then contain all the company names you want.
I figured it out eventually. Thank you for your answer though!
(screenshot: code captured in a Jupyter notebook)
Here is my final code.
from urllib.request import urlopen
from bs4 import BeautifulSoup

company_list = []
for n in range(12):
    url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(n+1)
    webpage = urlopen(url)
    source = BeautifulSoup(webpage, 'html.parser', from_encoding='utf-8')
    companys = source.findAll('strong', {'class': 'company'})
    for company in companys:
        company_list.append(company.get_text().strip().replace('\n', '').replace('\t', '').replace('\r', ''))

file = open('company_name1.txt', 'w', encoding='utf-8')
for company in company_list:
    file.write(company + '\n')
file.close()
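One small aside on that final script: the file write at the end could use a with block so the file is closed automatically even if an error occurs (same output, just a different idiom):

# same write as above, using a context manager
with open('company_name1.txt', 'w', encoding='utf-8') as f:
    for company in company_list:
        f.write(company + '\n')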
I'm trying to parse https://www.drugbank.ca/drugs. The idea is to extract all the drug names and some additional information for each drug. As you can see, each page shows a table of drug names, and when we click a drug name we can access that drug's information.
Let's say I will keep the following code to handle the pagination:
import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'
    while url:
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        #data = soup.select('name-head a')
        #for link in data:
        #    href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        #    pages_data(href)
        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)
        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()
The issue is that on each page, and for each drug in the table on that page, I need to capture:
Name
Accession Number
Structured Indications
Generic Prescription Products
I used the classic requests/BeautifulSoup approach but can't go deeper.
Some help please.
Create a function with requests and BeautifulSoup to get the data from each subpage:
import requests
from bs4 import BeautifulSoup

def get_details(url):
    print('details:', url)
    # get subpage
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    # get data on subpage
    dts = soup.findAll('dt')
    dds = soup.findAll('dd')
    # display details
    for dt, dd in zip(dts, dds):
        print(dt.text)
        print(dd.text)
        print('---')
    print('---------------------------')

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'
    while url:
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        # get links to subpages
        links = soup.select('strong a')
        for link in links:
            # execute function to get subpage
            get_details('https://www.drugbank.ca' + link['href'])
        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)
        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()
To crawl effectively, you'll want to implement a few measures, such as maintaining a queue of urls to visit and keeping track of which urls you have already visited.
Keeping in mind that links can be absolute or relative and that redirects are very likely, you also probably want to construct the urls dynamically rather than by string concatenation.
Here is a generic (we usually only want to use example.com on SO) crawling workflow...
from urllib.parse import urljoin, urlparse  # python 3
# from urlparse import urljoin, urlparse  # legacy python 2

import requests
from bs4 import BeautifulSoup

def process_page(soup):
    '''data extraction process'''
    pass

def is_external(link, base='example.com'):
    '''determine if the link is external to base'''
    site = urlparse(link).netloc
    return base not in site

def resolve_link(current_location, href):
    '''resolves final location of a link including redirects'''
    req_loc = urljoin(current_location, href)
    response = requests.head(req_loc)
    resolved_location = response.url  # location after redirects
    # if you don't want to visit external links...
    if is_external(resolved_location):
        return None
    return resolved_location

url_queue = ['https://example.com']
visited = set()

while url_queue:
    url = url_queue.pop()  # remove a url from the queue and assign it to `url`
    response = requests.get(url)
    current_location = response.url  # final location after redirects
    visited.add(url)  # note that we've visited the given url
    visited.add(current_location)  # and the final location
    soup = BeautifulSoup(response.text, 'lxml')
    process_page(soup)  # scrape the page
    link_tags = soup.find_all('a')  # gather additional links
    for anchor in link_tags:
        href = anchor.get('href')
        link_location = resolve_link(current_location, href)
        if link_location and link_location not in visited:
            url_queue.append(link_location)
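To tie that generic workflow back to the drugbank question, a rough sketch (hypothetical: it reuses the dt/dd structure from the earlier answer and only swaps the seed url and the two placeholder functions) might look like:

from urllib.parse import urlparse

# seed the crawl at the drugs index instead of example.com
url_queue = ['https://www.drugbank.ca/drugs']
visited = set()

def is_external(link, base='drugbank.ca'):
    '''restrict the crawl to drugbank.ca'''
    return base not in urlparse(link).netloc

def process_page(soup):
    '''print dt/dd pairs, which the earlier answer used for drug detail pages'''
    for dt, dd in zip(soup.find_all('dt'), soup.find_all('dd')):
        print(dt.get_text(strip=True), ':', dd.get_text(strip=True))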