Python append adding same data - python

I'm trying to extract the stock price and the market cap data from a Korean website.
Here is my code:
import requests
from bs4 import BeautifulSoup
response = requests.get('http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', { 'class': 'type_2' })
data = []
for tr in table.find_all('tr'):
tds = list(tr.find_all('td'))
for td in tds:
if td.find('a'):
company_name = td.find('a').text
price_now = tds[2].text
market_cap = tds[5].text
data.append([company_name, price_now, market_cap])
print(*data, sep = "\n")
And this is the result I get. (Sorry for the Korean characters)
['삼성전자', '43,650', '100']
['', '43,650', '100']
['SK하이닉스', '69,800', '5,000']
['', '69,800', '5,000']
The second and the fourth line in the outcome should not be there. I just want the first and the third line. Where do line two and four come from and how do I get rid of them?

My dear friend, I think the problem is you should check if td.find('a').text have values!
So I change your code to this and it works!
import requests
from bs4 import BeautifulSoup
response = requests.get(
'http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {'class': 'type_2'})
data = []
for tr in table.find_all('tr'):
tds = list(tr.find_all('td'))
for td in tds:
# where magic happends!
if td.find('a') and td.find('a').text:
company_name = td.find('a').text
price_now = tds[2].text
market_cap = tds[5].text
data.append([company_name, price_now, market_cap])
print(*data, sep="\n")

While I can't test it, it could be because there are two a tags on the page you're trying to scrape, while your for loop and if statement is set up to append information whenever it finds an a tag. The first one has the name of the company, but the second one has no text, thus the blank output (because you do td.find('a').text, it tries to get the text of the target a tag).
For reference, this is the a tag you want:
삼성전자
This is what you're picking up the second time around:
<img src="https://ssl.pstatic.net/imgstock/images5/ico_debatebl2.gif" width="15" height="13" alt="토론실">
Perhaps you can change your if statement to make sure the class of the a tag is title or something to make sure that you only enter the if statement when you're looking at the a tag with the company name in it.
I'm at work so I can't really test anything, but let me know if you have any questions later!

check tds it should be equal to 13 and no need multiple for loop
import requests
from bs4 import BeautifulSoup
response = requests.get('http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', { 'class': 'type_2' })
data = []
for tr in table.find_all('tr'):
tds = tr.find_all('td')
if len(tds) == 13:
company_name = tds[1].text
price_now = tds[2].text
market_cap = tds[6].text
data.append([company_name, price_now, market_cap])
print(*data, sep = "\n")
result
['삼성전자', '43,650', '2,802,035']
['SK하이닉스', '69,800', '508,146']
['삼성전자우', '35,850', '323,951']
['셀트리온', '229,000', '287,295']
['LG화학', '345,500', '243,897']

Related

Webscraping the contents of a tables

Hi I am trying to use Python and Beautiful Soup to scrape a webpage. There are various tables in the webpage with results that I want out of them, but I am struggling to:
1) find the right table
2) find the right two cells
3) write the cells 1 and 2 into a dictionary key and value, respectively.
So far, after making a request, and parsing the HTML, I use:
URL='someurl.com'
def datascrape(url):
page=requests.get(url)
print ("requesting page")
soup = BeautifulSoup(page.content, "html.parser")
return(soup)
soup=datascrape(URL)
results = {}
for row in soup.findAll('tr'):
aux = row.findAll('td')
try:
if "Status" in (aux.stripped_strings):
key=(aux[0].strings)
value=(aux[1].string)
results[key] = value
except:
pass
print (results)
Unfortunately "results" is always empty. I am really not sure where I am going wrong. Could anyone enlighten me please?
I'm not sure why you're using findAll() instead of find_all() as I'm fairly new to web-scraping, but nevertheless I think this gives you the output you're looking for.
URL='http://sitem.herts.ac.uk/aeru/bpdb/Reports/2070.html'
def datascrape(url):
page=requests.get(url)
print ("requesting page")
soup = BeautifulSoup(page.content,
"html.parser")
return(soup)
soup=datascrape(URL)
results = {}
table_rows = soup.find_all('tr')
for tr in table_rows:
td = tr.find_all('td')
row = [i.text for i in td]
try:
for i in row:
if "Status" in i:
key=(row[0].strip())
value=(row[1].strip())
results[key] = value
else:
pass
print(results)
Hope this helps!
If just after the Status and Not Applicable you can use positional nth-of-type css selectors. This does depend on position being the same across pages.
import requests
from bs4 import BeautifulSoup
url ='https://sitem.herts.ac.uk/aeru/bpdb/Reports/2070.htm'
page=requests.get(url)
soup = BeautifulSoup(page.content, "lxml")
tdCells = [item.text.strip() for item in soup.select('table:nth-of-type(2) tr:nth-of-type(1) td')]
results = {tdCells[0] : tdCells[1]}
print(results)

Web scraping coinmarketcap.com with Python (requests & BeautifulSoup)

I want to build a list with coins from coinmarketcap.com.
Every element should be a tuple.
Something like:
coins = [('btc',8500,'+0.5%','+1.2%', '-1%'), ...]
I can't get percentage:
The information is in td like this:
<td class="no-wrap percent-change text-right positive_change" data-timespan="1h" data-percentusd="0.99" data-symbol="BTC" data-sort="0.991515">0.99%</td>
How can I access 0.99% value from above? I need in fact data-percentageusd from td but I don't know what that is.
My testing script is something like:
import requests
from bs4 import BeautifulSoup
url = 'https://coinmarketcap.com/all/views/all/'
page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')
name = soup.find_all('a', class_='currency-name-container')
price = soup.find_all('a', class_='price')
print(name)
print(price)
#how can percentage modification for 1h, 24h, 7d?
#delta_h = soup.find_all('td', ???)
You can loop over the rows of the table to get the data for each currency and store it in a tuple, and then add it to the list.
r = requests.get('https://coinmarketcap.com/all/views/all/')
soup = BeautifulSoup(r.text, 'lxml')
data = []
table = soup.find('table', id='currencies-all')
for row in table.find_all('tr'):
try:
symbol = row.find('td', class_='text-left col-symbol').text
price = row.find('a', class_='price').text
time_1h = row.find('td', {'data-timespan': '1h'}).text
time_24h = row.find('td', {'data-timespan': '24h'}).text
time_7d = row.find('td', {'data-timespan': '7d'}).text
except AttributeError:
continue
data.append((symbol, price, time_1h, time_24h, time_7d))
for item in data:
print(item)
Partial Output:
('BTC', '$8805.46', '0.88%', '-12.30%', '-19.95%')
('ETH', '$677.45', '0.98%', '-11.54%', '-21.66%')
('XRP', '$0.780113', '0.62%', '-10.63%', '-14.42%')
('BCH', '$970.70', '1.01%', '-11.33%', '-23.89%')
('LTC', '$166.70', '0.74%', '-10.06%', '-19.56%')
('NEO', '$83.55', '0.24%', '-16.29%', '-33.39%')
('XLM', '$0.286741', '1.13%', '-13.23%', '-11.84%')
('ADA', '$0.200449', '0.63%', '-16.92%', '-31.43%')
('XMR', '$256.92', '0.63%', '-19.98%', '-19.46%')
Since the data is missing for some currencies in the table, the code will raise an AttributeError for .text. To skip those currencies, I've used the try-except.

Iterating over BeautifulSoup object

I am iterating over table that I parsed from html page. I want to iterate over BeautifulSoup object and parse the texts between tag and store them into a list. However, the code below keeps giving me only the very last text from the iteration. How do I add on texts in this problem?
soup = BeautifulSoup(webpage, 'html.parser')
table = soup.find("table",attrs={"id":"mvp_NBA"}).find("tbody").findAll("tr")
for row in table:
key = []
season = row.find_all("th")
for year in season:
y = year.get_text().encode('utf-8')
key.append(y)
print key
Check this:
from bs4 import BeautifulSoup
import requests
url = "https://www.basketball-reference.com/awards/mvp.html"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
table = soup.find("table",attrs={"id":"mvp_NBA"}).find("tbody").findAll("tr")
key = []
for row in table:
season = row.findAll("th", {'class': 'left'})
for year in season:
y = year.get_text().encode('utf-8')
key.append(y)
print key
The only mistake you are doing that in your for loop on every ilteration you empyted your list key=[] i have modified your code little bit and it is giving your desired output.

Scrape multiple pages with Beautiful soup

I am trying to scrape multiple pages of a url.
But am able to scrape only the first page is there is a way to get all the pages.
Here is my code.
from bs4 import BeautifulSoup as Soup
import urllib, requests, re, pandas as pd
pd.set_option('max_colwidth',500) # to remove column limit (Otherwise, we'll lose some info)
df = pd.DataFrame()
Comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97', 'https://www.indeed.com/jobs?q=Harman&rbc=Harman&jcid=4faf342d2307e9ed','https://www.indeed.com/jobs?q=johnson+%26+johnson&rbc=Johnson+%26+Johnson+Family+of+Companies&jcid=08849387e791ebc6','https://www.indeed.com/jobs?q=nova&rbc=Nova+Biomedical&jcid=051380d3bdd5b915']
for url in Comp_urls:
target = Soup(urllib.request.urlopen(url), "lxml")
targetElements = target.findAll('div', class_ =' row result')
for elem in targetElements:
comp_name = elem.find('span', attrs={'class':'company'}).getText().strip()
job_title = elem.find('a', attrs={'class':'turnstileLink'}).attrs['title']
home_url = "http://www.indeed.com"
job_link = "%s%s" % (home_url,elem.find('a').get('href'))
job_addr = elem.find('span', attrs={'class':'location'}).getText()
date_posted = elem.find('span', attrs={'class': 'date'}).getText()
description = elem.find('span', attrs={'class': 'summary'}).getText().strip()
comp_link_overall = elem.find('span', attrs={'class':'company'}).find('a')
if comp_link_overall != None:
comp_link_overall = "%s%s" % (home_url, comp_link_overall.attrs['href'])
else: comp_link_overall = None
df = df.append({'comp_name': comp_name, 'job_title': job_title,
'job_link': job_link, 'date_posted': date_posted,
'overall_link': comp_link_overall, 'job_location': job_addr, 'description': description
}, ignore_index=True)
df
df.to_csv('path\\web_scrape_Indeed.csv', sep=',', encoding='utf-8')
Please suggest if there is anyway.
Case 1: The code presented here is exactly what you have
Comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97', 'https://www.indeed.com/jobs?q=Harman&rbc=Harman&jcid=4faf342d2307e9ed','https://www.indeed.com/jobs?q=johnson+%26+johnson&rbc=Johnson+%26+Johnson+Family+of+Companies&jcid=08849387e791ebc6','https://www.indeed.com/jobs?q=nova&rbc=Nova+Biomedical&jcid=051380d3bdd5b915']
for url in Comp_urls:
target = Soup(urllib.request.urlopen(url), "lxml")
targetElements = target.findAll('div', class_ =' row result')
for elem in targetElements:
The problem here is targetElements changes with every iteration in the first for loop.
To avoid this, indent the second for loop inside the first like so:
for url in Comp_urls:
target = Soup(urllib.request.urlopen(url), "lxml")
targetElements = target.findAll('div', class_ =' row result')
for elem in targetElements:
Case 2: Your the bug is not a result of improper indentation (i.e. not like what is in your original post)
If it is the case that your code is properly idented , then it may be the case that targetElements is an empty list. This means target.findAll('div', class_ =' row result') does not return anything. In that case, visit the sites, check out the dom, then modify your scraping program.

requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied

I am working on a web scraping project and have run into the following error.
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Below is my code. I retrieve all of the links from the html table and they print out as expected. But when I try to loop through them (links) with request.get I get the error above.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame
page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
links = (ref['href'])
print (links)
for link in links:
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
table = []
# Find all the divs we need in one go.
divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
for div in divs:
# find all the enclosing a tags.
anchors = div.find_all('a')
for anchor in anchors:
# Now we have groups of 3 list items (li) tags
lis = anchor.find_all('li')
# we clean up the text from the group of 3 li tags and add them as a list to our table list.
table.append([unicodedata.normalize("NFKD",lis[0].text).strip(), lis[1].text, lis[2].text.strip()])
# We have all the data so we add it to a DataFrame.
headers = ['Number', 'Tenant', 'Square Footage']
df = DataFrame(table, columns=headers)
print (df)
Your mistake is second for loop in code
for ref in table.find_all('a', href=True):
links = (ref['href'])
print (links)
for link in links:
ref['href'] gives you single url but you use it as list in next for loop.
So you have
for link in ref['href']:
and it gives you first char from url http://properties.kimcore... which is h
Full working code
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame
page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
link = ref['href']
print(link)
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
table = []
# Find all the divs we need in one go.
divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
for div in divs:
# find all the enclosing a tags.
anchors = div.find_all('a')
for anchor in anchors:
# Now we have groups of 3 list items (li) tags
lis = anchor.find_all('li')
# we clean up the text from the group of 3 li tags and add them as a list to our table list.
table.append([unicodedata.normalize("NFKD",lis[0].text).strip(), lis[1].text, lis[2].text.strip()])
# We have all the data so we add it to a DataFrame.
headers = ['Number', 'Tenant', 'Square Footage']
df = DataFrame(table, columns=headers)
print (df)
BTW: if you use comma in (ref['href'], ) then you get tuple and then second for works correclty.
EDIT: it create list table_data at start and add all data into this list. And it convert into DataFrame at the end.
But now I see it read the same page few times - because in every row the same url is in every column. You would have to get url only from one column.
EDIT: now it doesn't read the same url many times
EDIT: now it get text and hre from first link and add to every element in list when you use append().
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame
page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table_data = []
# all rows in table except first ([1:]) - headers
rows = soup.select('table tr')[1:]
for row in rows:
# link in first column (td[0]
#link = row.select('td')[0].find('a')
link = row.find('a')
link_href = link['href']
link_text = link.text
print('text:', link_text)
print('href:', link_href)
page = requests.get(link_href)
soup = BeautifulSoup(page.content, 'html.parser')
divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
for div in divs:
anchors = div.find_all('a')
for anchor in anchors:
lis = anchor.find_all('li')
item1 = unicodedata.normalize("NFKD", lis[0].text).strip()
item2 = lis[1].text
item3 = lis[2].text.strip()
table_data.append([item1, item2, item3, link_text, link_href])
print('table_data size:', len(table_data))
headers = ['Number', 'Tenant', 'Square Footage', 'Link Text', 'Link Href']
df = DataFrame(table_data, columns=headers)
print(df)

Categories