Parsing tables from a site - Python

There is a site https://ru.myip.ms/browse/market_bitcoin/%D0%91%D0%B8%D1%82%D0%BA%D0%BE%D0%B8%D0%BD_%D0%B8%D1%81%D1%82%D0%BE%D1%80%D0%B8%D1%8F_%D1%86%D0%B5%D0%BD.html#a with a table of BTC prices on it, and I need to parse that table. I tried the code below, but for some reason dots are displayed instead of the prices in the table.
from time import sleep

import pandas as pd
import requests

host = 'ru.myip.ms'
index_url = 'https://ru.myip.ms'
home_url = "https://ru.myip.ms/browse/market_bitcoin/%D0%91%D0%B8%D1%82%D0%BA%D0%BE%D0%B8%D0%BD_%D0%B8%D1%81%D1%82%D0%BE%D1%80%D0%B8%D1%8F_%D1%86%D0%B5%D0%BD.html#a"
base_ajax_url = "https://ru.myip.ms/ajax_table/market_bitcoin/{page}"

with requests.Session() as session:
    session.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Host': host
    }

    # visit home page and parse the initial dataframe
    response = session.get(home_url)
    df = pd.read_html(response.text, attrs={"id": "market_bitcoin_tbl"})[0]
    df = df.rename(columns=lambda x: x.strip())  # remove extra newlines from the column names
    sleep(2)

    # start paginating with page=2 (the home page already contains page 1)
    page = 2
    while True:
        url = base_ajax_url.format(page=page)
        print("Processing {url}...".format(url=url))

        response = session.post(url,
                                data={'getpage': 'yes', 'lang': 'ru'},
                                headers={
                                    'X-Requested-With': 'XMLHttpRequest',
                                    'Origin': index_url,
                                    'Referer': home_url
                                })

        # add data to the existing dataframe
        try:
            new_df = pd.read_html("<table>{0}</table>".format(response.text))[0]
        except ValueError:  # could not extract data from HTML - last page?
            break

        new_df.columns = df.columns
        df = pd.concat([df, new_df])

        page += 1
        sleep(1)

print(df)

You are doing it correctly, and you already have your results.
Just try this to see them:
print(df['Bitcoin Price'])
You see the dots only because the DataFrame is too big for pandas to print in full, but the data is there.
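If you want to see every row instead of the abbreviated view, you can raise pandas' display limits or dump the frame to a file; a minimal sketch (the CSV file name here is just an example):
import pandas as pd

# show all rows and columns instead of the truncated "..." view
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print(df)

# or save everything for inspection in a spreadsheet
df.to_csv('btc_prices.csv', index=False)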

Related

Data into pd.DataFrame

from datetime import timedelta, date

from nsepy import get_history
import pandas as pd


def importdata(stock):
    stock_fut = get_history(symbol=stock,
                            start=date.today() - timedelta(days=3), end=date.today(),
                            futures=True,
                            expiry_date=date(2022, 9, 29))
    # print(stock_fut.columns)
    print(stock_fut[["Symbol", "Number of Contracts", "Change in OI", "Open Interest"]])


a = ["ULTRACEMCO"]
# a = ["CONCOR", "JKCEMENT", "SHREECEM", "RAMCOCEM", "INDIGO", "ACC", "BAJAJ-AUTO", "ULTRACEMCO", "PERSISTENT", "MARUTI"]
for i in range(0, len(a)):
    # print(a[i])
    # importdata(a[i])
    df = pd.DataFrame(a[i])
    print(df)
I am unable to do the same: I get an error because the DataFrame is not constructed properly.
I also want to join the data for all symbols into a single table.
import requests
import json
import codecs
import pandas as pd
baseurl = "https://www.nseindia.com/"
url = f'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, ''like Gecko) ''Chrome/80.0.3987.149 Safari/537.36','accept-language': 'en,gu;q=0.9,hi;q=0.8', 'accept-encoding': 'gzip, deflate, br'}
session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
df = pd.DataFrame(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
mini_df = df[['symbol']]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#print(df)
#print(mini_df)
print(mini_df.to_string(index=False))
What if I want to feed this symbol output as the value of "a" in the code below?
I tried to fix your code with minimal modification. Hope it helps:
from datetime import timedelta, date

from nsepy import get_history
import pandas as pd
import requests
import json
import codecs

baseurl = "https://www.nseindia.com/"
url = 'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
           'accept-language': 'en,gu;q=0.9,hi;q=0.8',
           'accept-encoding': 'gzip, deflate, br'}

session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)

# here we use pandas.DataFrame.from_dict to build a dataframe from the collection of dictionaries under the 'data' key
df1 = pd.DataFrame.from_dict(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])

# and here we get a list of unique symbols
a = df1.symbol.unique().tolist()


def importdata(stock):
    stock_fut = get_history(symbol=stock,
                            start=date.today() - timedelta(days=3), end=date.today(),
                            futures=True,
                            expiry_date=date(2022, 9, 29))
    return stock_fut[["Symbol", "Number of Contracts", "Change in OI", "Open Interest"]]


# here we add all the dataframes to a list
df_list = []
for i in a:
    temp = importdata(i)
    temp_df = pd.DataFrame(temp)
    df_list.append(temp_df)

# and here we concatenate all of them together in a row-wise manner
df = pd.concat(df_list)
print(df)
Change the line to:
df = pd.DataFrame([a[i]])
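As a side note, since the goal is a single table of all symbols, you can also build one DataFrame from the whole list at once instead of one frame per element; a small sketch, assuming a is the list of symbol strings:
import pandas as pd

a = ["CONCOR", "JKCEMENT", "ULTRACEMCO"]  # example symbol list

# one single-column dataframe for the whole list
df = pd.DataFrame(a, columns=["symbol"])
print(df)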

How to separate data per column when writing data to Excel from web scraping results

I know how to separate it when the data looks like:
x, y, z
But I can't figure out how to do it when the data format is like:
Doe, John, BookName, Year, abstract with commas, links.
This is what the data looks like in Excel after the scrape.
This is what I wanted it to look like.
This is my code:
from unittest import result

import requests
from bs4 import BeautifulSoup
import csv
import urllib3.request

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    result = requests.get(url + str(page), verify=False)
    soup = BeautifulSoup(result.text, 'html.parser')

    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        title = ''.join(it.find('a', href=True).text.strip().split('\n'))
        year = it.find('span', 'date').text
        abstract = ''.join(it.find('div', 'artifact-abstract').text.strip().split('\n'))
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
thewriter = csv.writer(open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline=''))
thewriter.writerow(kepala)
for d in datas:
    thewriter.writerow(d)
This is my suggestion. I will need to know an offset to be able to test it.
A CSV separated by semi-colons will be far easier to separate in Excel.
from unittest import result

import requests
from bs4 import BeautifulSoup
import csv
import urllib3.request

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    result = requests.get(url + str(page), verify=False)
    soup = BeautifulSoup(result.text, 'html.parser')

    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        title = it.find('a', href=True).text.strip().replace('\n', '')
        year = it.find('span', 'date').text
        abstract = it.find('div', 'artifact-abstract').text.strip().replace('\n', '')
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
thewriter = csv.writer(open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline=''), delimiter=";")
thewriter.writerow(kepala)
for d in datas:
    thewriter.writerow(d)
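Note that Python's csv module already wraps any field containing the delimiter in quotes, so the commas inside the abstract should stay in one cell. If Excel still splits them, you can force quoting of every field while keeping the semicolon delimiter; a small sketch reusing the fakdep, offset, kepala and datas variables from above:
import csv

with open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline='', encoding='utf-8') as f:
    thewriter = csv.writer(f, delimiter=';', quoting=csv.QUOTE_ALL)  # quote every field
    thewriter.writerow(kepala)
    thewriter.writerows(datas)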

How to write separate functions in separate .py files and execute them from main.py without using classes

I am new to Python and have yet to learn about OOP and classes. I thought I understood functions, but I am facing an issue while calling functions from a different .py file.
The code below shows all my functions described in main.py.
I want to split main.py into two other .py files: data_extraction.py and data_processing.py.
I understand it can be done using classes, but can it be done without classes as well?
I divided the code into two other files, but I am getting an error (please see the attached screenshot).
Please explain what I can do here!
main.py
import pandas as pd
import requests
from bs4 import BeautifulSoup
from configparser import ConfigParser
import logging

import data_extraction

config = ConfigParser()
config.read('config.ini')

logging.basicConfig(filename='logfile.log', level=logging.DEBUG,
                    format='%(asctime)s:%(lineno)d:%(name)s:%(levelname)s:%(message)s')

baseurl = config['configData']['baseurl']
sub_url = config['configData']['sub_url']

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
    "Upgrade-Insecure-Requests": "1", "DNT": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate"
}

r = requests.get(baseurl, headers=headers)
status = r.status_code
soup = BeautifulSoup(r.content, 'html.parser')

model_links = []
all_keys = ['Model', 'Platform', 'Product Family', 'Product Line', '# of CPU Cores',
            '# of Threads', 'Max. Boost Clock', 'Base Clock', 'Total L2 Cache', 'Total L3 Cache',
            'Default TDP', 'Processor Technology for CPU Cores', 'Unlocked for Overclocking', 'CPU Socket',
            'Thermal Solution (PIB)', 'Max. Operating Temperature (Tjmax)', 'Launch Date', '*OS Support']


# function to get the model links in one list from the soup object (1st page extraction)
def get_links_in_list():
    for model_list in soup.find_all('td', headers='view-name-table-column'):
        # model_list = model_list.a.text - to get the model names
        model_list = model_list.a.get('href')
        # print(model_list)
        model_list = sub_url + model_list
        # print(model_list)
        one_link = model_list.split(" ")[0]
        model_links.append(one_link)
    return model_links


model_links = get_links_in_list()
logging.debug(model_links)

each_link_data = data_extraction()
print(each_link_data)

# all_link_data = data_processing()
# write_to_csv(all_keys)
data_extraction.py
import requests
from bs4 import BeautifulSoup

from main import baseurl
from main import all_keys

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
    "Upgrade-Insecure-Requests": "1", "DNT": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate"
}

r = requests.get(baseurl, headers=headers)
status = r.status_code
soup = BeautifulSoup(r.content, 'html.parser')

model_links = []


# function to get data for each link from the website (2nd page extraction)
def data_extraction(model_links):
    each_link_data = []
    try:
        for link in model_links:
            r = requests.get(link, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')

            specification = {}
            for key in all_keys:
                spec = soup.select_one(
                    f'.field__label:-soup-contains("{key}") + .field__item, .field__label:-soup-contains("{key}") + .field__items .field__item')
                # print(spec)
                if spec is None:
                    specification[key] = ''
                    if key == 'Model':
                        specification[key] = [i.text for i in soup.select_one('.page-title')]
                        specification[key] = specification[key][0:1:1]
                        # print(specification[key])
                else:
                    if key == '*OS Support':
                        specification[key] = [i.text for i in spec.parent.select('.field__item')]
                    else:
                        specification[key] = spec.text

            specification['link'] = link
            each_link_data.append(specification)
    except:
        print('Error occurred')

    return each_link_data
    # print(each_link_data)
data_processing.py
# function for data processing: converting each link object into a dataframe
def data_processing():
    all_link_data = []
    for each_linkdata_obj in each_link_data:
        # turn the nested dictionary into a normal dict
        norm_dict = dict()
        for key in each_linkdata_obj:
            if isinstance(each_linkdata_obj[key], list):
                norm_dict[key] = ','.join(each_linkdata_obj[key])
            else:
                norm_dict[key] = each_linkdata_obj[key]
        all_link_data.append(norm_dict)
    return all_link_data
    # print(all_link_data)


all_link_data = data_processing()


# function to write dataframe data into csv
def write_to_csv(all_keys):
    all_link_df = pd.DataFrame.from_dict(all_link_data)
    all_link_df2 = all_link_df.drop_duplicates()
    all_link_df3 = all_link_df2.reset_index()
    # print(all_link_df3)
    all_keys = all_keys + ['link']
    all_link_df4 = all_link_df3[all_keys]
    # print(all_link_df4)
    all_link_df4.to_csv('final_data.csv')


write_to_csv(all_keys)
Move the existing functions (e.g. write_to_csv) to a different file, for example utility_functions.py. Import it in main.py using from utility_functions import write_to_csv. Now you can use the function write_to_csv in main.py as:
write_to_csv(all_keys)
Edit
In the main.py file, use from data_extraction import data_extraction instead of import data_extraction.
In the data_extraction.py file, remove the lines
from main import baseurl
from main import all_keys
This will throw an undefined-variable error, which you can fix by passing those variables in as function arguments.
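A minimal sketch of how the split could look after those changes, with the shared values passed in as arguments instead of imported back from main.py (the selector is simplified to one rule per key for brevity):
data_extraction.py
import requests
from bs4 import BeautifulSoup


def data_extraction(model_links, all_keys, headers):
    # everything the function needs is received as a parameter
    each_link_data = []
    for link in model_links:
        r = requests.get(link, headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        specification = {'link': link}
        for key in all_keys:
            spec = soup.select_one(f'.field__label:-soup-contains("{key}") + .field__item')
            specification[key] = spec.text if spec else ''
        each_link_data.append(specification)
    return each_link_data
main.py
from data_extraction import data_extraction

# model_links, all_keys and headers are built in main.py exactly as before
each_link_data = data_extraction(model_links, all_keys, headers)
print(each_link_data)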

Add new column in a dataframe based on a condition on the content of another column in the same dataframe

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.11',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
          'Accept-Encoding': 'none',
          'Accept-Language': 'en-US,en;q=0.8',
          'Connection': 'keep-alive'}


def scrap_hrefs(url, baseUrl):
    resp = requests.get(url, headers=header)
    respData = BeautifulSoup(resp.content, 'html.parser')
    allHrefs = respData.select('[href]')
    return allHrefs, baseUrl


def get_hrefs(allHrefs, baseUrl):
    for i in range(0, len(allHrefs)):
        if allHrefs[i]['href'].startswith('/'):
            allHrefs[i] = baseUrl + allHrefs[i]['href']
        else:
            allHrefs[i] = allHrefs[i]['href']
    return allHrefs


def clean_hrefs(allHrefs):
    links = {'links': allHrefs}
    df = pd.DataFrame(links).drop_duplicates()
    df = df[df['links'].str.contains('financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings|news|media')]
    for i in range(0, len(df)):
        if df[i]['links'].str.find('financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings') != -1:
            df[i]['segments'] = df['Finance']
        else:
            continue
    return df


def store_hrefs(df):
    df.to_csv("testing.csv", index=False)


def run_scraper(url, baseUrl):
    store_hrefs(clean_hrefs(get_hrefs(*scrap_hrefs(url, baseUrl))))


run_scraper('https://www.example.com/', 'https://www.example.com')
In the clean_hrefs() function, I want to take each link from the data frame and check whether its content contains a word like 'finance', 'investors', 'ir', 'report', or 'filings'. If it does, I want to create another column called 'segments' and assign it the id 'FINANCE'.
But it's giving an error: KeyError: 0.
Any help would be much appreciated!
You can set one column based on another with a boolean mask, similar to the filtering you already do; rows with no match get missing values:
mask = df['links'].str.contains('financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings')
df.loc[mask, 'segments'] = 'Finance'
which works like (assuming import numpy as np):
df['segments'] = np.where(mask, 'Finance', np.nan)
EDIT:
If you want to set multiple values, you can specify the new values in a dictionary and then set the segments column like this:
d = {'INVESTOR': 'financial|investors|investor|Investors|Investor|INVESTORS|INVESTOR|relations|relation|Relations|Relation|report|filings',
     'NEWS': 'news|media'}

for k, v in d.items():
    df.loc[df['links'].str.contains(v, na=False), 'segmentID'] = k

Sending multiple POST data in Python

I have Python code that sends a POST request to a website, reads the response, and filters it. For the POST data I used ('number', '11111') and it works perfectly. However, I want to create a txt file that contains 100 different numbers, such as 1111,2222,3333,4444..., and then send a POST request for each of them. Can you help me do this in Python?
import urllib
from bs4 import BeautifulSoup

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Origin': 'http://mahmutesat.com/python.aspx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'http://mahmutesat.com/python.aspx',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'
}


class MyOpener(urllib.FancyURLopener):
    version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'


myopener = MyOpener()
url = 'http://mahmutesat.com/python.aspx'

# first HTTP request without form data
f = myopener.open(url)
soup = BeautifulSoup(f)

# parse and retrieve the vital form values
viewstate = soup.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
viewstategenerator = soup.select("#__VIEWSTATEGENERATOR")[0]['value']

formData = (
    ('__EVENTVALIDATION', eventvalidation),
    ('__VIEWSTATE', viewstate),
    ('__VIEWSTATEGENERATOR', viewstategenerator),
    ('number', '11111'),
    ('Button', 'Sorgula'),
)

encodedFields = urllib.urlencode(formData)

# second HTTP request with form data
f = myopener.open(url, encodedFields)
soup = BeautifulSoup(f.read())

name = soup.findAll('input', {'id': 'name_field'})
for eachname in name:
    print eachname['value']
If your file has data:
"sample.txt"
1111,2222,3333,4444,5555,6666,7777,8888,......(and so on)
To read the file contents, you can use the file open operation:
import itertools

# open the file for reading
with open("sample.txt", "r") as fp:
    values = fp.readlines()

# get the values split with ","
data = [map(int, line.split(",")) for line in values]
numbers = list(itertools.chain(*data))  # if there are many lines, concatenate them all
Now, use it as:
for number in numbers:
    formData = (
        ('__EVENTVALIDATION', eventvalidation),
        ('__VIEWSTATE', viewstate),
        ('__VIEWSTATEGENERATOR', viewstategenerator),
        ('number', str(number)),  # here you use the number obtained
        ('Button', 'Sorgula'),
    )
    encodedFields = urllib.urlencode(formData)

    # second HTTP request with form data
    f = myopener.open(url, encodedFields)
    soup = BeautifulSoup(f.read())

    name = soup.findAll('input', {'id': 'name_field'})
    for eachname in name:
        print eachname['value']
1 - Here is an example of how to create a file:
f = open('test.txt','w')
This will open the test.txt file for writing ('w'), or create it if it does not exist yet. If it already has data, it will be erased; if you want to append instead, write f = open('test.txt','a'). Note that this happens in your current working directory; if you want the file in a specific directory, include the full directory path with the file name, for example:
f = open('C:\\Python\\test.txt','w')
2 - Then write/append to this file the data you want, example:
for i in range(1, 101):
    f.write(str(i*1111) + '\n')
This writes 100 numbers as strings, from 1111 to 111100.
3 - You should always close the file at the end:
f.close()
4 - Now if you want to read from this file 'test.txt':
f = open('C:\\Python\\test.txt','r')
for i in f:
    print i,
f.close()
This is as simple as it can be.
You should read about file I/O in Python here:
https://docs.python.org/2.7/tutorial/inputoutput.html#reading-and-writing-files
Make sure you select the right Python version for you in these docs.
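As a side note, the more idiomatic pattern in current Python is to let a with block close the file for you; a small sketch of the same write/read cycle in that style (same test.txt file as above):
# write the 100 numbers, one per line
with open('test.txt', 'w') as f:
    for i in range(1, 101):
        f.write(str(i * 1111) + '\n')

# read them back into a list of integers
with open('test.txt') as f:
    numbers = [int(line) for line in f if line.strip()]
print(numbers[:5])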
Using a dictionary for the form data, you can deal with multiple requests very easily.
import requests

values = {
    '__EVENTVALIDATION': event_validation,
    '__LASTFOCUS': '',
    '__VIEWSTATE': view_state,
    '__VIEWSTATEGENERATOR': '6264FB8D',
    'ctl00$ContentPlaceHolder1$ButGet': 'Get Report',
    'ctl00$ContentPlaceHolder1$Ddl_Circles': 'All Circles',
    'ctl00$ContentPlaceHolder1$Ddl_Divisions': '-- Select --',
    'ctl00$ContentPlaceHolder1$TxtTin': tin_num,
    'ctl00$ContentPlaceHolder1$dropact': 'all'
}

headers_1 = {
    'Origin': 'https://www.apct.gov.in',
    'User-Agent': user_agent,
    'Cookie': cookie_1,
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': url_1,
    'Content-Type': 'application/x-www-form-urlencoded',
    'Upgrade-Insecure-Requests': '1'
}

try:
    req = requests.post(url_1, data=values, headers=headers_1)
except requests.RequestException as exc:  # the original snippet ends mid-try; a minimal handler is assumed here
    print('Request failed:', exc)
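Tying this back to the original question, the loop over many numbers then just updates the relevant form field and posts again; a sketch assuming the same placeholder variables (url_1, headers_1, values) and a sample.txt file of comma-separated numbers:
# read the numbers and send one POST per number
with open('sample.txt') as fp:
    numbers = [n.strip() for n in fp.read().split(',') if n.strip()]

for num in numbers:
    values['ctl00$ContentPlaceHolder1$TxtTin'] = num  # form field name taken from the dictionary above
    try:
        req = requests.post(url_1, data=values, headers=headers_1)
        print(num, req.status_code)
    except requests.RequestException as exc:
        print('Request failed for', num, ':', exc)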
