Why is this code not downloading the data in real time? - python

I have a problem which I don't know how to solve (I'm a beginner in coding). This program is supposed to scrape stock price data from Yahoo Finance:
import bs4
from bs4 import BeautifulSoup
import requests
import pandas as pd
import datetime as dt

def real_time_price(stock_code):
    url = 'https://finance.yahoo.com/quote/' + stock_code + '/'
    r = requests.get(url)
    web_content = BeautifulSoup(r.text, 'lxml')
    web_content = web_content.find('div', {'class': 'My(6px) Pos(r) smartphone_Mt(6px)'})
    web_content = web_content.find('span').text
    if web_content == []:
        web_content = '999999'
    return web_content

LA = ['AAPL', 'FB', 'F', 'AMZN', 'GOOG']

for step in range(1, 101):
    price = []
    col = []  # list that collects the data added to the DataFrame
    time_stamp = dt.datetime.now()
    time_stamp = time_stamp.strftime('%Y-%m-%d %H:%M:%S')
    for stock_code in LA:
        price.append(real_time_price(stock_code))
    col = [time_stamp]
    col.extend(price)
    df = pd.DataFrame(col)
    df = df.T
    df.to_csv('realtimestockdata.csv', mode='a', header=False)
    print(col)
But it seems that the data does not update while it's running. Is there some syntactic error that I missed?
All responses are really appreciated, thank you.
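A hedged side note (my addition, not part of the original post): find() returns None when that class string no longer matches Yahoo's markup, so the check web_content == [] can never fire (a string is never equal to an empty list), and Yahoo often serves a different page to clients without a browser-like User-Agent. A minimal defensive sketch, assuming the same URL pattern and class name as above:

import time
import requests
from bs4 import BeautifulSoup

def real_time_price(stock_code):
    # Hypothetical header; Yahoo may serve a stripped page to obvious bots.
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = 'https://finance.yahoo.com/quote/' + stock_code + '/'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # Class name taken from the question; Yahoo can change it at any time.
    box = soup.find('div', {'class': 'My(6px) Pos(r) smartphone_Mt(6px)'})
    if box is None or box.find('span') is None:
        return '999999'  # fall back instead of raising AttributeError
    return box.find('span').text

# Poll a few times with a pause so consecutive rows have a chance to differ.
for _ in range(3):
    print(real_time_price('AAPL'))
    time.sleep(10)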

Related

How can I collect the data I want from 4 pages of a website with a for loop (for url in urls)?

I'm a newbie in Python, trying to pull off an assignment for a uni course, and I'm trying to do multi-page scraping with Python and pandas. I'm trying to extract all the data from the table on each page of this site: https://aaiasb.gr/publications/investigation-reports
After I managed to scrape all the URLs I tried this, but I only get the data from the first page:
#imports
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

response = requests.get('https://aaiasb.gr/publications/investigation-reports')
response
soup = BeautifulSoup(response.text, 'html.parser')
soup

base_url = 'https://aaiasb.gr/publications/investigation-reports'
ending = '?start='
numbers = [50, 100, 150]
urls = [base_url]
for n in numbers:
    url = base_url + ending + str(n)
    urls.append(url)

df = pd.DataFrame(urls)
df = df.rename(columns={df.columns[0]: 'url'})
df

for url in urls:
    response = requests.get(url)
    time.sleep(3)
    soup_doc = BeautifulSoup(response.text, 'html.parser')
    entries = []
    page = soup.select('div.cck_page_items')[0]
    rows = page.find('table').find_all('tr')[1:]
    for tr in rows:
        conclusion_date1 = tr.find_all('td')[0].find_all('div')[1].text.strip()
        conclusion_date2 = tr.find_all('td')[0].find_all('div')[2].text.strip()
        incident_info = tr.find_all('td')[1].find_all('div')[0].text.strip()
        incident_type = tr.find_all('td')[1].find_all('div')[1].text.strip()
        incident_description = str(tr.find_all('td')[1].find_all('span', attrs={'uk-icon': 'info'})[0])
        fatalities = tr.find_all('td')[1].find_all('div')[2].text.strip()
        fatalities_description = str(tr.find_all('td')[1].find_all('span', attrs={'uk-icon': 'info'})[1])
        area = tr.find_all('td')[2].find_all('div')[0].text.strip()
        registry = tr.find_all('td')[2].find_all('div')[1].text.strip()
        aircraft_type = tr.find_all('td')[2].find_all('div')[-2].text.strip()
        aircraft_info = tr.find_all('td')[2].find_all('div')[-1].text.strip()
        area_info = tr.find_all('td')[2].text.strip()
        dict = {'conclusion_date1': conclusion_date1,
                'conclusion_date2': conclusion_date2,
                'incident_info': incident_info,
                'incident_type': incident_type,
                'incident_description': incident_description,
                'fatalities': fatalities,
                'fatalities_description': fatalities_description,
                'area': area,
                'registry': registry,
                'aircraft_type': aircraft_type,
                'aircraft_info': aircraft_info,
                'area_info': area_info}
        entries.append(dict)
    df1 = pd.DataFrame(entries)
The main issue has to do with your indentation and location of assignments, e.g. entries = [] and df = pd.DataFrame(entries) need to be in the right places. Try the below.
# imports
from bs4 import BeautifulSoup
import requests
import unicodedata
import pandas as pd

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

base_url = "https://aaiasb.gr/publications/investigation-reports"
suffix = "?start="
start_indices = [0, 50, 100, 150]
urls = [base_url + suffix + str(index) for index in start_indices]

entries = []
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.select("div.cck_page_items").pop().find("table")
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        conclusion_date1 = cols[0].find_all("div")[1].text.strip()
        try:
            conclusion_date2 = cols[0].find_all("div")[2].text.strip()
        except IndexError:
            conclusion_date2 = "N/A"
        incident_info = cols[1].find_all("div")[0].text.strip()
        incident_type = cols[1].find_all("div")[1].text.strip()
        fatalities = cols[1].find_all("div")[2].text.strip()
        info_hovers = cols[1].find_all("span", attrs={"uk-icon": "info"})
        incident_description = ' '.join(unicodedata.normalize("NFC", info_hovers[0]['uk-tooltip']).split())
        fatalities_description = ' '.join(unicodedata.normalize("NFC", info_hovers[1]['uk-tooltip']).split()).replace("<br>", "\n")
        area = cols[2].find_all("div")[0].text.strip()
        area_info = '\n'.join(list(cols[2].strings)[-3:]).strip()
        registry = cols[2].find_all("div")[1].text.strip()
        aircraft_type = cols[2].find_all("div")[-2].text.strip()
        aircraft_info = cols[2].find_all("div")[-1].text.strip()
        entry = {
            "conclusion_date1": conclusion_date1,
            "conclusion_date2": conclusion_date2,
            "incident_info": incident_info,
            "incident_type": incident_type,
            "incident_description": incident_description,
            "fatalities": fatalities,
            "fatalities_description": fatalities_description,
            "area": area,
            "registry": registry,
            "aircraft_type": aircraft_type,
            "aircraft_info": aircraft_info,
            "area_info": area_info,
        }
        entries.append(entry)

df = pd.DataFrame(entries)
print(df.head())
print(df.tail())
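A small follow-up (my addition, not part of the original answer): if you also want to keep the combined table on disk, one extra line after building the DataFrame is enough; the filename here is just a placeholder.

df.to_csv('investigation_reports.csv', index=False)  # hypothetical output filename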

Iterate Over URLs Using BeautifulSoup

I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should but I am having trouble creating a for loop to loop through all the race course URLs.
Here's the code to scrape the course URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"
reqs = requests.get(todays_racecard_url)
content = reqs.text
soup = BeautifulSoup(content, 'html.parser')

course_urls = []
for h in soup.findAll('h3'):
    a = h.find('a')
    try:
        if 'href' in a.attrs:
            card_url = urljoin(base_url, a.get('href'))
            course_urls.append(card_url)
    except:
        pass

for card_url in course_urls:
    print(card_url)
And here's the code to scrape the pages:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

date = []
course = []
time = []
runner = []
tips = []
tipsters = []

runner_div = soup.find_all('div', class_='row-cell-right')
for container in runner_div:
    runner_name = container.h5.a.text
    runner.append(runner_name)
    tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
    tips.append(tips_no)
    tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
    tipsters.append(tipster_names)

newspaper_tips = pd.DataFrame({
    'Runners': runner,
    'Tips': tips,
    'Tipsters': tipsters,
})

newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"
req = requests.get(todays_racecard_url)
soup_racecard = BeautifulSoup(req.content, 'html.parser')
df = pd.DataFrame(columns=['Runners', 'Tips', 'Tipsters'])

for h in soup_racecard.find_all('h3'):
    a = h.find('a', href=True)  # only find tags with href present
    if a:
        url = urljoin(base_url, a['href'])
        print(url)
        results = requests.get(url)
        soup_url = BeautifulSoup(results.text, "html.parser")

        for container in soup_url.find_all('div', class_='row-cell-right'):
            runner_name = container.h5.a.text
            tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
            tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
            row = [runner_name, tips_no, tipster_names]
            df.loc[len(df)] = row  # append the new row

df['Tipsters'] = df['Tipsters'].str.replace(' - ', '')
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,
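A side note on the design (my addition, not from the original answer): df.loc[len(df)] = row re-copies the frame for every runner, which gets slow on busy race days. A common alternative is to collect plain dicts per page and build the DataFrame once at the end. A rough sketch, reusing the selectors from the answer above:

import pandas as pd
import requests
from bs4 import BeautifulSoup

def runners_from_card(html_text):
    # Parse one racecard page and return its runners as plain dicts.
    soup_url = BeautifulSoup(html_text, 'html.parser')
    rows = []
    for container in soup_url.find_all('div', class_='row-cell-right'):
        tip = container.find('span', class_='tip-text number-tip')
        tipster = container.find('span', class_='pointers-text currency-text')
        rows.append({
            'Runners': container.h5.a.text,
            'Tips': tip.text if tip else '',
            'Tipsters': tipster.text if tipster else '',
        })
    return rows

# course_urls would come from the h3/a scrape shown in the question above.
# all_rows = [r for url in course_urls for r in runners_from_card(requests.get(url).text)]
# df = pd.DataFrame(all_rows)  # built once, outside the per-row loop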

Loop through the lists of lists using Python and append to dataframe

I have tried to loop through the lists of lists, scrape all the links, and append them to a dataframe as one table, but in vain.
Help will be appreciated.
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')

company_name = []
company_link = []
company_link_edit = []

company_A_subpg1 = soup.find_all(class_='dataTable')

def convert(url):
    if not url.startswith('http://'):
        return 'http:' + url
    return url

data_df = pd.DataFrame()
for sub_tab in company_A_subpg1:
    for tab in sub_tab:
        sub_table_1 = tab.find_all('a', href=True)
        company_name = [name.text.strip() for name in sub_table_1]
        company_link = [name.get('href') for name in sub_table_1]
        company_link_edit = [convert(name) for name in company_link]
        df = pd.DataFrame(
            {'Name': company_name,
             'Link': company_link_edit
             })
        data_df = pd.concat([data_df, df], sort=False)
data_df.to_csv('results_3.csv')
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')

company_name = []
company_link = []
company_link_edit = []

company_A_subpg1 = soup.find_all(class_='dataTable')

def convert(url):
    if not url.startswith('http://'):
        return 'http:' + url
    return url

for sub_tab in company_A_subpg1:
    temp = sub_tab.find('tbody')
    all_rows = temp.find_all('tr')
    for val in all_rows:
        a_tag = val.find('a', href=True)
        company_name.append(a_tag.text.strip())
        company_link_edit.append(convert(a_tag.get('href')))

print(len(company_name), len(company_link_edit))

data_df = pd.DataFrame()
df = pd.DataFrame(
    {'Name': company_name,
     'Link': company_link_edit
     })
data_df = pd.concat([data_df, df], sort=False)
print(df.shape)
data_df.to_csv('results_3.csv')
You can check the values inside the CSV file; I fetched all 200 names and links mentioned on the page.
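As a hedged alternative (my addition, not from the original answer), the same name/link pairs can be collected in one pass with a CSS selector; this assumes the anchors sit inside the elements carrying the dataTable class, as the find_all above suggests:

import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')

# Every company link is an <a href=...> under an element with class 'dataTable'.
anchors = soup.select('.dataTable a[href]')
data_df = pd.DataFrame({
    'Name': [a.text.strip() for a in anchors],
    # Same scheme fix-up as the convert() helper in the answer above.
    'Link': [('http:' + a['href']) if not a['href'].startswith('http') else a['href'] for a in anchors],
})
data_df.to_csv('results_3.csv', index=False)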

crawling next page when url remains same

I need some help to crawl the data from the URL below:
https://onland.kbstar.com/quics?page=C060250&keyword=%EB%8F%99%EC%9E%91%EA%B5%AC
I would like to crawl the second page, but the URL remains the same when I click '2' and I don't know how to do it. Please help!
Here's my Python code to crawl the first page:
from selenium import webdriver
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup
import urllib.request as req
import urllib
import re
from datetime import datetime
import requests

dataframe = pd.DataFrame()

def KB_liveON(area_name):
    query = area_name
    area = urllib.parse.quote(query)
    url = 'https://onland.kbstar.com' \
          + '/quics?page=C060250' \
          + '&keyword=' + str(area)
    # + '#CP'

    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')

    table = soup.find('table')
    trs = table.tbody.find_all('tr')

    dataframe = pd.DataFrame()
    value_list = []
    for tr in trs[::1]:
        tds = tr.find_all('td')
        # cols = [' '.join(td.text.strip().split()) for td in tds]
        cols = [td.text.strip().split() for td in tds]
        progress = cols[0]
        location = cols[1]
        complex_name = cols[2]
        area = cols[3]
        sale_price = cols[4]
        sale_price2 = cols[5]
        time = cols[6]
        type_of_sale = cols[7]
        construction_company = cols[8]
        value_list.append([progress, location, complex_name, area, sale_price, sale_price2, time, type_of_sale, construction_company])

    cols = ['progress', 'location', 'complex_name', 'area', 'sale_price', 'sale_price2', 'time', 'type_of_sale', 'construction_company']
    df = pd.DataFrame(value_list, columns=cols)
    return df

kb = KB_liveON('동작구')
dataframe = dataframe.append(kb)
dataframe
First, I install a Selenium WebDriver on Google Colab. Then I write a crawler to crawl data from multiple pages.
Python code:
import time
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

def extract_data(value_list, html_tags):
    soup = BeautifulSoup(html_tags, 'lxml')
    table = soup.find('table')
    trs = table.tbody.find_all('tr')
    for tr in trs[::1]:
        tds = tr.find_all('td')
        cols = [td.text.strip().split() for td in tds]
        progress = cols[0]
        location = cols[1]
        complex_name = cols[2]
        area = cols[3]
        sale_price = cols[4]
        sale_price2 = cols[5]
        time = cols[6]
        type_of_sale = cols[7]
        construction_company = cols[8]
        value_list.append([progress, location, complex_name, area, sale_price, sale_price2, time, type_of_sale, construction_company])
    return value_list

def KB_liveON(area):
    url = 'https://onland.kbstar.com' \
          + '/quics?page=C060250' \
          + '&keyword=' + str(area)

    wd.get(url)
    data_list = []

    # Extract data from first page
    tbl = wd.find_elements_by_class_name("tbl_list")[0]
    html_tags = tbl.get_attribute('outerHTML')
    data_list = extract_data(data_list, html_tags)

    # Find and extract data from other pages except first page
    forms = wd.find_elements_by_xpath("//div[@class='paging']//form")
    for f in forms[1:]:
        f.submit()
        time.sleep(10)
        tbl = wd.find_elements_by_class_name("tbl_list")[0]
        html_tags = tbl.get_attribute('outerHTML')
        data_list = extract_data(data_list, html_tags)
        time.sleep(10)

    cols = ['progress', 'location', 'complex_name', 'area', 'sale_price', 'sale_price2', 'time', 'type_of_sale', 'construction_company']
    df = pd.DataFrame(data_list, columns=cols)
    return df

if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    wd = webdriver.Chrome('chromedriver', options=options)

    df = KB_liveON('동작구')
    print(df)
Output (The crawl results):
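One caveat worth adding (my note, not part of the original answer): newer Selenium releases (4.3 and later) removed the find_elements_by_* helpers, so with a current driver the same lookups would be spelled with By. A minimal sketch using the question's URL:

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
wd = webdriver.Chrome(options=options)  # Selenium Manager (4.6+) resolves the driver itself
wd.get('https://onland.kbstar.com/quics?page=C060250&keyword=%EB%8F%99%EC%9E%91%EA%B5%AC')

# Same lookups as above, written for Selenium 4.3+.
tables = wd.find_elements(By.CLASS_NAME, 'tbl_list')
forms = wd.find_elements(By.XPATH, "//div[@class='paging']//form")
print(len(tables), len(forms))
wd.quit()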

python beautifulsoup next page

Here is my current code to scrape specific player data from a site:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas import ExcelWriter
import lxml
import xlsxwriter

page = requests.get('https://www.futbin.com/players?page=1')
soup = BeautifulSoup(page.content, 'lxml')

pool = soup.find(id='repTb')
pnames = pool.find_all(class_='player_name_players_table')
pprice = pool.find_all(class_='ps4_color font-weight-bold')
prating = pool.select('span[class*="form rating ut20"]')

all_player_names = [name.getText() for name in pnames]
all_prices = [price.getText() for price in pprice]
all_pratings = [rating.getText() for rating in prating]

fut_data = pd.DataFrame(
    {
        'Player': all_player_names,
        'Rating': all_pratings,
        'Price': all_prices,
    })

writer = pd.ExcelWriter('file.xlsx', engine='xlsxwriter')
fut_data.to_excel(writer, 'Futbin')
writer.save()
print(fut_data)
This is working fine for the first page, but I need to go through 609 pages in total and get the data from all of them.
Could you please help me rewrite this code to make that work? I am still new and learning with this project.
You can iterate over all 609 pages, parse each page and at the end save collected data to file.xlsx:
import requests
from bs4 import BeautifulSoup
import pandas as pd

all_player_names = []
all_pratings = []
all_prices = []

for i in range(1, 610):
    page = requests.get('https://www.futbin.com/players?page={}'.format(i))
    soup = BeautifulSoup(page.content, 'lxml')

    pool = soup.find(id='repTb')
    pnames = pool.find_all(class_='player_name_players_table')
    pprice = pool.find_all(class_='ps4_color font-weight-bold')
    prating = pool.select('span[class*="form rating ut20"]')

    all_player_names.extend([name.getText() for name in pnames])
    all_prices.extend([price.getText() for price in pprice])
    all_pratings.extend([rating.getText() for rating in prating])

fut_data = pd.DataFrame({'Player': all_player_names,
                         'Rating': all_pratings,
                         'Price': all_prices})

writer = pd.ExcelWriter('file.xlsx', engine='xlsxwriter')
fut_data.to_excel(writer, 'Futbin')
writer.save()
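One practical addition (my assumption, not something the original answer requires): over 609 requests it can help to reuse a single requests.Session and pause briefly between pages, so the site is less likely to throttle or block you. A sketch of just the fetch loop:

import time
import requests
from bs4 import BeautifulSoup

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})  # hypothetical header string; adjust as needed

for i in range(1, 610):
    page = session.get('https://www.futbin.com/players?page={}'.format(i))
    soup = BeautifulSoup(page.content, 'lxml')
    # ... parse soup exactly as in the answer above ...
    time.sleep(1)  # small pause between pages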
