I am trying to check whether a row already exists in my CSV file. If it doesn't, the row has to be written. However, my CSV file always ends up empty.
# import libraries
import csv
import urllib2
from bs4 import BeautifulSoup

# integer for first article id
articleid = 4449
articles = 4459

while articleid < articles:
    # specify the url and article id
    url = 'http://www.bkfrem.dk/default.asp?vis=nyheder&id=' + str(articleid)
    articleid += 1
    # query the website and return the html to the variable
    page = urllib2.urlopen(url)
    # parse the html using beautiful soup and store in variable soup
    soup = BeautifulSoup(page, 'html.parser')
    # create CSV file
    csvfile = csv.writer(open('news.csv', 'a'))
    # take out the <div> of name and get its value and text
    title_box = soup.find('h1', attrs={'style': 'margin-bottom:0px'})
    title = title_box.text.encode('utf-8').strip()
    date_box = soup.find('div', attrs={'style': 'font-style:italic; padding-bottom:10px'})
    date = date_box.text.encode('utf-8').strip()
    articleText_box = soup.find('div', attrs={'class': 'news'})
    articleText = articleText_box.text.encode('utf-8').strip()
    # print the data (encoded) to the CSV file
    with open('news.csv', 'rb') as csvfileO:
        f_reader = csv.reader(csvfileO, delimiter=',')
        for row in f_reader:
            if articleText not in row:
                csvfile.writerow(["Title", "Date", "Text"])
                csvfile.writerow((title, date, articleText))
What am I doing wrong, since the file stays empty?
for row in f_reader:
    if articleText not in
        csvfile.writerow(["Title", "Date", "Text"])
        csvfile.writerow((title, date, articleText))
You have if articleText not in. Not in what? It should point at something to validate against:
if articleText not in "Something":
    csvfile.writerow(["Title", "Date", "Text"])
    csvfile.writerow((title, date, articleText))
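For example, here is a minimal sketch of the existence check, assuming the question's title, date and articleText variables are in scope: read the file once into a set, write the header only when the file was empty, and append only unseen articles (the set-based lookup is my own choice, not part of the original code):

import csv
import os

# collect the article texts already stored, so duplicates can be skipped
existing = set()
if os.path.exists('news.csv'):
    with open('news.csv', 'rb') as f:
        for row in csv.reader(f):
            if len(row) == 3:
                existing.add(row[2])  # the "Text" column

with open('news.csv', 'ab') as f:
    writer = csv.writer(f)
    if not existing:
        # the file was empty, so write the header exactly once
        writer.writerow(["Title", "Date", "Text"])
    if articleText not in existing:
        writer.writerow((title, date, articleText))
        existing.add(articleText)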
I expected a csv file to be created in my desktop directory.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://basketball.realgm.com/ncaa/conferences/Big-12-Conference/3/Kansas/54/nba-players"
# get permission
response = requests.get(url)
# access html files
soup = BeautifulSoup(response.text, 'html.parser')
# creating data frame
columns = ['Player', 'Position', 'Height', 'Weight', 'Draft Year',
           'NBA Teams', 'Years', 'Games Played', 'Points Per Game',
           'Rebounds Per Game', 'Assists Per Game']
df = pd.DataFrame(columns=columns)
table = soup.find(name='table',
                  attrs={'class': 'tablesaw', 'data-tablesaw-mode': 'swipe',
                         'id': 'table-6615'}).tbody
trs = table.find('tr')
# rewording html
for tr in trs:
    tds = tr.find_all('td')
    row = [td.text.replace('\n', '') for td in tds]
    df = df.append(pd.Series(row, index=columns), ignore_index=True)
df.to_csv('kansas_player', index=False)
It looks like your soup.find(...) cannot find the 'table', which might be why you get a None type returned. Here is my change; you can tailor it to cope with your CSV export needs:
from bs4 import BeautifulSoup
import urllib.request
url = "https://basketball.realgm.com/ncaa/conferences/Big-12-Conference/3/Kansas/54/nba-players"
# get permission
response = urllib.request.urlopen(url)
# access html files
html = response.read()
soup = BeautifulSoup(html, 'html.parser')
table = soup.find("table", {"class": "tablesaw"})
At this point, table holds the full table content. From there on, you can easily extract the table row information, for example:
for tr in table.findAll('tr'):
    tds = tr.find_all('td')
    row = [td.text.replace('\n', '') for td in tds]
    ...
Now each row is a list of the cell texts. Finally, you can write each row into the csv with or without pandas; your call.
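For instance, a short sketch of the csv-module route under the same assumptions (the file name kansas_players.csv is my own placeholder):

import csv

# collect the cell texts row by row, then write them out in one go
rows = []
for tr in table.findAll('tr'):
    tds = tr.find_all('td')
    if tds:  # header rows contain only <th> cells, so this skips them
        rows.append([td.text.replace('\n', '') for td in tds])

with open('kansas_players.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(rows)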
I'm trying to extract the stock price and the market cap data from a Korean website.
Here is my code:
import requests
from bs4 import BeautifulSoup

response = requests.get('http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {'class': 'type_2'})

data = []
for tr in table.find_all('tr'):
    tds = list(tr.find_all('td'))
    for td in tds:
        if td.find('a'):
            company_name = td.find('a').text
            price_now = tds[2].text
            market_cap = tds[5].text
            data.append([company_name, price_now, market_cap])

print(*data, sep="\n")
And this is the result I get. (Sorry for the Korean characters)
['삼성전자', '43,650', '100']
['', '43,650', '100']
['SK하이닉스', '69,800', '5,000']
['', '69,800', '5,000']
The second and fourth lines of the output should not be there; I just want the first and third. Where do lines two and four come from, and how do I get rid of them?
My dear friend, I think the problem is that you should check whether td.find('a').text has a value! So I changed your code to this, and it works!
import requests
from bs4 import BeautifulSoup

response = requests.get(
    'http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {'class': 'type_2'})

data = []
for tr in table.find_all('tr'):
    tds = list(tr.find_all('td'))
    for td in tds:
        # where the magic happens!
        if td.find('a') and td.find('a').text:
            company_name = td.find('a').text
            price_now = tds[2].text
            market_cap = tds[5].text
            data.append([company_name, price_now, market_cap])

print(*data, sep="\n")
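As a side note, this calls td.find('a') twice per cell; calling it once and reusing the result is slightly cleaner. A sketch of the same inner loop:

for td in tds:
    link = td.find('a')
    # skip cells without a link, and links without text
    if link is not None and link.text:
        company_name = link.text
        price_now = tds[2].text
        market_cap = tds[5].text
        data.append([company_name, price_now, market_cap])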
While I can't test it, it could be because there are two a tags in each row of the page you're trying to scrape, while your for loop and if statement are set up to append information whenever they find an a tag. The first a tag has the name of the company, but the second one has no text, hence the blank output (because you do td.find('a').text, it grabs the text of whichever a tag it found).
For reference, this is the a tag you want:
삼성전자
This is what you're picking up the second time around:
<img src="https://ssl.pstatic.net/imgstock/images5/ico_debatebl2.gif" width="15" height="13" alt="토론실">
Perhaps you can change your if statement to make sure the class of the a tag is title (or whatever the page actually uses), so that you only enter the if statement when you're looking at the a tag with the company name in it.
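For instance, a minimal sketch of that idea; the class name 'title' below is only a placeholder, so inspect the page source for the real one:

for tr in table.find_all('tr'):
    tds = list(tr.find_all('td'))
    for td in tds:
        link = td.find('a', class_='title')  # placeholder class name
        if link and link.text:
            data.append([link.text, tds[2].text, tds[5].text])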
I'm at work so I can't really test anything, but let me know if you have any questions later!
Check len(tds): it should be equal to 13 for a real data row, and then there is no need for multiple for loops.
import requests
from bs4 import BeautifulSoup

response = requests.get('http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {'class': 'type_2'})

data = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if len(tds) == 13:
        company_name = tds[1].text
        price_now = tds[2].text
        market_cap = tds[6].text
        data.append([company_name, price_now, market_cap])

print(*data, sep="\n")
Result:
['삼성전자', '43,650', '2,802,035']
['SK하이닉스', '69,800', '508,146']
['삼성전자우', '35,850', '323,951']
['셀트리온', '229,000', '287,295']
['LG화학', '345,500', '243,897']
I am iterating over a table that I parsed from an html page. I want to iterate over the BeautifulSoup object, parse the texts between the tags, and store them in a list. However, the code below keeps giving me only the very last text from the iteration. How do I accumulate the texts instead?
soup = BeautifulSoup(webpage, 'html.parser')
table = soup.find("table", attrs={"id": "mvp_NBA"}).find("tbody").findAll("tr")
for row in table:
    key = []
    season = row.find_all("th")
    for year in season:
        y = year.get_text().encode('utf-8')
        key.append(y)
print key
Check this:
from bs4 import BeautifulSoup
import requests

url = "https://www.basketball-reference.com/awards/mvp.html"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
table = soup.find("table", attrs={"id": "mvp_NBA"}).find("tbody").findAll("tr")
key = []
for row in table:
    season = row.findAll("th", {'class': 'left'})
    for year in season:
        y = year.get_text().encode('utf-8')
        key.append(y)
print key
The only mistake you were making is that on every iteration of your for loop you emptied your list with key = []. I have modified your code a little, and it gives your desired output.
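Equivalently, once the list builds up across rows, the whole collection can be written as one list comprehension (a sketch over the same table object):

key = [th.get_text().encode('utf-8')
       for row in table
       for th in row.findAll("th", {'class': 'left'})]
print key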
I am trying to learn how to scrape a website with Python and BeautifulSoup. I have been able to collect all the names and job-titles, and I'm trying to save them into a csv-file. I need some type of loop or append to get them all in; as it stands now, only the final name and job-title are saved in the csv-file.
#import libraries
import csv
import urllib2
from bs4 import BeautifulSoup

#specify the url
buzzly_page = 'http://buzzlymedia.com/ourwork/'

#query the website and return the html to the variable 'page'
page = urllib2.urlopen(buzzly_page)

#parse the html
soup = BeautifulSoup(page, 'html.parser')

#query to get value of name
for name_box in soup.find_all('strong', attrs={'itemprop': 'name'}):
    name = name_box.text.strip()  #remove starting and trailing
    print name

#query to get value of job-title
for job_box in soup.find_all('span', attrs={'itemprop': 'jobTitle'}):
    job = job_box.text.strip()  #remove starting and trailing
    print job

#write into csv-file
with open('buzzly_clients.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([name, job])
Find the divs that contain the elements you want and iterate over them like this.
# import libraries
import csv
import urllib2
from bs4 import BeautifulSoup

# specify the url
buzzly_page = 'http://buzzlymedia.com/ourwork/'

# query the website and return the html to the variable 'page'
page = urllib2.urlopen(buzzly_page)

# parse the html
soup = BeautifulSoup(page, 'html.parser')

# write into csv-file
with open('buzzly_clients.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    for div in soup.find_all('div', attrs={'class': 'avia-testimonial-meta-mini'}):
        # query to get value of name
        name_box = div.find('strong', attrs={'itemprop': 'name'})
        name = name_box.text.strip()  # remove starting and trailing
        print(name)
        # query to get value of job-title
        job_box = div.find('span', attrs={'itemprop': 'jobTitle'})
        job = job_box.text.strip()  # remove starting and trailing
        print(job)
        writer.writerow([name, job])
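If you also want a header row in the file, csv.DictWriter is one option; here is a sketch along the same lines (the 'Name' and 'Job' field names are my own choice, and 'wb' opens a fresh file instead of appending):

with open('buzzly_clients.csv', 'wb') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=['Name', 'Job'])
    writer.writeheader()
    for div in soup.find_all('div', attrs={'class': 'avia-testimonial-meta-mini'}):
        writer.writerow({
            'Name': div.find('strong', attrs={'itemprop': 'name'}).text.strip(),
            'Job': div.find('span', attrs={'itemprop': 'jobTitle'}).text.strip(),
        })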
I have a set of urls (stock data) for which I want certain data to be put into a csv. Per row I need to have:
name price recrat opinion
A csv appears but has no data, and I get the error:
ValueError: too many values to unpack
How should I go about this? Here is my code so far:
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import csv
from datetime import datetime

quote_page = ['http://uk.mobile.reuters.com/business/quotes/overview/AALB.AS',
              'http://uk.mobile.reuters.com/business/stocks/overview/ABNd.AS',
              'http://uk.mobile.reuters.com/business/stocks/overview/ACCG.AS',
              'http://uk.mobile.reuters.com/business/stocks/overview/AD.AS']

for link in quote_page:
    try:
        page = urllib2.urlopen(link)
        soup = BeautifulSoup(page, 'html.parser')
        name_box = soup.find('span', attrs={'class': 'company-name'})
        name = name_box.text.strip()
        print name
        price_box = soup.find('span', attrs={'class': 'price'})
        price = price_box.text.strip()
        print price
        recrating_box = soup.find('div', attrs={'class': 'recommendation-rating'})
        recrat = recrating_box.text.strip()
        print recrat
        opinion = soup.find('div', attrs={'class': 'recommendation-marker'})['style']
        print opinion
    except TypeError:
        continue
    quote_page.append((name, price, recrat, opinion))

# open a csv file with append, so old data will not be erased
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    for name, price in quote_page:
        writer.writerows([name, price, recrat, opinion, datetime.now()])
Tested and working:
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import csv
from datetime import datetime

quote_page = ['http://uk.mobile.reuters.com/business/quotes/overview/AALB.AS',
              'http://uk.mobile.reuters.com/business/stocks/overview/ABNd.AS',
              'http://uk.mobile.reuters.com/business/stocks/overview/ACCG.AS',
              'http://uk.mobile.reuters.com/business/stocks/overview/AD.AS']

results = []

for link in quote_page:
    try:
        page = urllib2.urlopen(link)
        soup = BeautifulSoup(page, 'html.parser')
        name_box = soup.find('span', attrs={'class': 'company-name'})
        name = name_box.text.strip()
        print name
        price_box = soup.find('span', attrs={'class': 'price'})
        price = price_box.text.strip()
        print price
        recrating_box = soup.find('div', attrs={'class': 'recommendation-rating'})
        recrat = recrating_box.text.strip()
        print recrat
        opinion = soup.find('div', attrs={'class': 'recommendation-marker'})['style']
        print opinion
    except TypeError:
        continue
    results.append((name, price, recrat, opinion))

# open a csv file for writing (each run rewrites it)
with open('index.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for item in results:
        writer.writerow([item[0], item[1], item[2], item[3], datetime.now()])
There were three issues. First, you were appending to the list you were actively iterating over, which is not a good idea; I collected the results into a separate list named results instead. Second, you were unpacking only 2 of the 4 items in each tuple while iterating, which is what raised the ValueError; I've accessed them by index instead. Finally, since you are writing one line per iteration, writerows needed to be changed to writerow.
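To make the second issue concrete, a minimal sketch of why the unpacking failed:

# each item appended above is a 4-tuple: (name, price, recrat, opinion)
for item in results:
    name, price, recrat, opinion = item  # unpacks cleanly: 4 values, 4 names

# whereas this raises "ValueError: too many values to unpack",
# because a 4-tuple cannot fit into just two names:
# for name, price in results:
#     ...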