I have this code, which scrapes the Hacker News website with beautifulsoup4, and I am looking for a way to save the results into a DataFrame using pandas. I have already imported pandas in the code below, but I do not know how to save the results into a DataFrame. It only scrapes the most favored Hacker News post for now, but that can be changed.
import pandas as pd
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from math import ceil
import json, sys, argparse, validators
MAX_NUM_POSTS = 100
class HackerNewsScraper:
URL = 'https://news.ycombinator.com/news'
def __init__(self, posts):
self._total_posts = posts
self._total_pages = int(ceil(posts/30))
self._stories = []
def scrape_stories(self):
"""
Fetches all HTML data.
Each page is limited to 30 stories, this function will ensure enough pages are fetched.
"""
page = 1
        while page <= self._total_pages:  # make sure a sufficient number of pages is visited
url = '{}?p={}'.format(self.URL, page)
html = get_html(url)
self.parse_stories(html)
page += 1
def parse_stories(self, html):
"""
        Given a BeautifulSoup nested data structure, html, parse_stories(html) will parse the data and select the desired fields.
After getting title, uri, author, comments, points, and rank, it will save them in dictionary form in self._stories.
"""
for storytext, subtext in zip(html.find_all('tr', {'class': 'athing'}),
html.find_all('td', {'class': 'subtext'})):
storylink = storytext.find_all('a',{'class':'storylink'})
sublink = subtext.select('a')
# All requested data being saved in the dictionary story below
TITLE = storylink[0].text.strip()
LINK = storylink[0]['href']
AUTHOR = sublink[0].text
COMMENTS = sublink[-1].text
POINTS = subtext.select('span')[0].text
RANK = storytext.select('span.rank')[0].text.strip('.')
story = {
'title' : TITLE,
'uri' : LINK,
'author' : AUTHOR,
'points' : POINTS,
'comments' : COMMENTS,
'rank' : RANK
}
# Make sure data satisfies requirements
story = validate_story(story)
# self._stories is an array of dictionaries that saves the requested number of stories
self._stories.append(story)
# If required number of stories met, stop parsing
if len(self._stories) >= self._total_posts:
return
def print_stories(self):
"""
Outputs the stories from list of dictionary format to JSON in STDOUT.
"""
json.dump(self._stories, sys.stdout, indent=4)
def get_stories(self):
"""
Returns the scraped stories to the user in a list of dictionary format.
Used for testing purposes.
"""
return self._stories
def get_html(url):
"""
Runs the HTML data through BeautifulSoup to get a BeautifulSoup object, a nested data structure.
"""
response = get_response(url)
if response is not None:
html = BeautifulSoup(response, 'html.parser')
return html
def validate_story(story):
"""
Ensures that all the story data is valid according to the task.
Will return valid data for each field.
"""
story['title'] = story['title'][:256]
if not valid_title(story['title']):
story['title'] = 'Valid title not found'
story['author'] = story['author'][:256]
if not valid_author(story['author']):
story['author'] = 'Valid author not found'
if not valid_uri(story['uri']):
story['uri'] = 'Valid URI not found'
story['comments'] = validate_number(story['comments'])
story['points'] = validate_number(story['points'])
story['rank'] = validate_number(story['rank'])
return story
def valid_title(title):
"""
    Ensures that title is a non-empty string with <= 256 characters.
"""
return (len(title) <= 256 and title)
def valid_author(author):
"""
    Ensures that author is a non-empty string with <= 256 characters.
    Solves the issue of a missing author by checking the fetched data against HN username rules.
"""
    if(author.find(' ') > -1):  # Hacker News usernames don't support whitespace
return False
return (len(author) <= 256 and author)
def valid_uri(url):
"""
To be able to find the scraped stories, we need their URL.
If data is not a valid URL, return False.
"""
if(validators.url(url)):
return True
return False
def validate_number(numString):
"""
Will make sure that the returned number is an integer.
Will strip any non digits from the input and return the first number.
"""
if numString.find('ago') > -1: #If not found, 'time since posted' would replace points for example
return 0
digits = [int(s) for s in numString.split() if s.isdigit()]
if len(digits) > 0:
return digits[0]
return 0
def get_response(url):
"""
Attempts to get the content at 'url' by making an HTTP GET request.
If the content-type of response is some kind of HTML/XML, return the
text content, otherwise return None.
"""
try:
with closing(get(url, stream=True)) as resp:
if is_good_response(resp):
return resp.content
else:
return None
except RequestException as e:
log_error('Error during requests to {0} : {1}'.format(url, str(e)))
return None
def is_good_response(resp):
"""
Returns True if the response seems to be HTML, False otherwise.
"""
content_type = resp.headers['Content-Type'].lower()
return (resp.status_code == 200
and content_type is not None
and content_type.find('html') > -1)
def log_error(e):
"""
Log the errors. Currently just printing them out to user.
"""
print(e)
def validate_input(arg, arg_max):
"""
Validate the user input. Makes sure it is less than or equal to 100 posts.
"""
error_msg = 'Posts cannot exceed {}'.format(arg_max)
if arg > arg_max:
raise argparse.ArgumentTypeError(error_msg)
def parse_arguments():
    """
    Parses the argument input from the user. Default is 1 post.
    """
parser = argparse.ArgumentParser()
parser.add_argument('--posts', '-p', metavar='n', type=int, default=1, help='number of posts (max 100)')
args = parser.parse_args()
validate_input(args.posts, MAX_NUM_POSTS)
return args.posts
def main():
"""
    If user input is valid, this will create a scraper, fetch the requested number of posts, and print them to the user.
"""
try:
posts = parse_arguments()
hnews_scraper = HackerNewsScraper(posts)
hnews_scraper.scrape_stories()
hnews_scraper.print_stories()
except argparse.ArgumentTypeError as ex:
log_error(ex)
if __name__ == '__main__':
main()
Try this (don't forget to import pandas). Each scraped story is already appended to self._stories as a dictionary:
story = {
'title' : TITLE,
'uri' : LINK,
'author' : AUTHOR,
'points' : POINTS,
'comments' : COMMENTS,
'rank' : RANK
}
# the scraper's list of story dictionaries can be passed to pandas directly
df = pd.DataFrame(hnews_scraper.get_stories(),
                  columns=['title', 'uri', 'author', 'points', 'comments', 'rank'])
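For example, a minimal sketch of how main() could be extended to build the DataFrame after scraping (the DataFrame lines replace the original print_stories() call; the rest is the original main()):
def main():
    try:
        posts = parse_arguments()
        hnews_scraper = HackerNewsScraper(posts)
        hnews_scraper.scrape_stories()
        # build a DataFrame from the scraped list of story dictionaries
        df = pd.DataFrame(hnews_scraper.get_stories(),
                          columns=['title', 'uri', 'author', 'points', 'comments', 'rank'])
        print(df)
    except argparse.ArgumentTypeError as ex:
        log_error(ex)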
The code below is a sample from my complete program; I tried to make it understandable.
It sends requests to a REST API. It starts with a URL and the number of pages for this specific search, and tries to fetch the content of each page.
Each page has several results, and each result becomes a FinalObject.
Because there are as many API requests as there are pages, I decided to use multi-threading with the concurrent.futures module.
It works, but as I'm new to coding and Python, I still have these two questions:
How can I use ThreadPoolExecutor sequentially in this case?
Is there a better way to handle multi-threading in this case?
from concurrent.futures import ThreadPoolExecutor
from requests import get as re_get
def main_function(global_page_number, headers, url_request):
    # create a list of page numbers
pages_numbers_list = [i for i in range(global_page_number)]
# for each page, call the page_handler (MultiThreading)
with ThreadPoolExecutor(max_workers=10) as executor:
for item in pages_numbers_list:
executor.submit(
page_handler,
item,
url_request,
headers
)
def page_handler(page_number, url_request, headers):
# we change the page number in the url request
url_request = change_page(url_request, page_number)
# new request with the new url
result = re_get(url_request, headers=headers)
result = result.json()
    # in the result, we find the list of dicts used to create the final objects
final_object_creation(result['results_list'])
def change_page(url_request, new_page_number):
"to increment the value of the 'page=' attribute in the url"
current_nb_page = ''
start_nb = url_request.find("page=") + len('page=')
while 1:
if url_request[start_nb].isdigit():
current_nb_page = url_request[start_nb]
else:
break
new_url_request = url_request.replace("page=" + current_nb_page,
"page=" + str(new_page_number))
return new_url_request
def final_object_creation(results_list):
    'thanks to the object from requests.get(), it builds the final object'
global current_id_decision, dict_decisions
    # each item in the results list becomes an instance of the final object
for item in results_list:
        # set the identifier of the new Decision object
current_id_decision += 1
new_id = current_id_decision
        # create the Decision object and add it to the decisions dictionary
dict_decisions[new_id] = FinalObject(item)
class FinalObject:
def __init__(self, content):
self.content = content
current_id_decision = 0
dict_decisions = {}
main_function(1000, "headers", "https://api/v1.0/search?page=0&query=test")
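One way to keep the results in page order while still fetching concurrently is executor.map, which yields results in the order its inputs were submitted. A minimal sketch, assuming page_handler is changed to return result['results_list'] instead of calling final_object_creation itself:
from concurrent.futures import ThreadPoolExecutor
from functools import partial

def main_function(global_page_number, headers, url_request):
    handler = partial(page_handler, url_request=url_request, headers=headers)
    with ThreadPoolExecutor(max_workers=10) as executor:
        # map() yields one result per page, in submission (page) order
        for results_list in executor.map(handler, range(global_page_number)):
            final_object_creation(results_list)
executor.submit is still fine when ordering does not matter; it returns Future objects, and calling future.result() on them in the order they were submitted also preserves the page order.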
I am trying to get this awesome code I found on GitHub to run: https://github.com/apryor6/stockstreamer/blob/master/data_fetcher.py
I made a few modifications to the code, as the API URL for IEX has changed since this code was published:
class IEXStockFetcher(StockFetcher):
"""
Fetches stock information using iextrading.com API
"""
url_prefix = "https://cloud.iexapis.com/stable/stock/market/batch?token=MY TOKEN"
url_suffix_price = "&price"
url_suffix_img = "&logo"
url_suffix_highlow = ""e"***
When I step through the code and get to the end I receive the following error: "Can't instantiate abstract class IEXStockFetcher with abstract methods fetchImageURL, fetchPrice, fetchStockHighLow"
I am relatively new to object-oriented programming in Python. Does anyone have any thoughts?
class IEXStockFetcher(StockFetcher):
"""
Fetches stock information using iextrading.com API
"""
url_prefix = "https://cloud.iexapis.com/stable/stock/"
url_suffix_price = "/quote/latestPrice"
url_suffix_img = "/logo"
url_suffix_highlow = "/quote"
url_suffix_token = "?token=pk_44de71531a5d400bb1bd98a2c7dd011d"
....
....
def fetchPrice(self, stock):
# get the price of a single stock
try:
resp = urlopen("{}{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_price, IEXStockFetcher.url_suffix_token))
resp = json.loads(resp.readlines()[0].decode('utf8'))
price = float(resp)
return price
except:
return self.fetchPrice(stock)
def fetchImageURL(self, stock):
# get the image url of a single stock
try:
resp = urlopen("{}{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_img, IEXStockFetcher.url_suffix_token))
resp = json.loads(resp.readlines()[0].decode('utf8'))
return resp['url']
except:
return self.fetchImageURL(stock)
def fetchStockHighLow(self, stock):
# get the image url of a single stock
try:
resp = urlopen("{}{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_highlow, IEXStockFetcher.url_suffix_token))
resp = json.loads(resp.readlines()[0].decode('utf8'))
return (resp['week52High'], resp['week52Low'])
except:
return self.fetchStockHighLow(stock)
I was able to get your code working with the new API; I had to make a few small modifications:
class IEXStockFetcher(StockFetcher):
"""
Fetches stock information using iextrading.com API
"""
url_prefix = "https://cloud.iexapis.com/stable/stock/market/batch?token=<MY TOKEN>&symbols="
url_suffix_price = "&types=price"
url_suffix_img = "&types=logo"
url_suffix_highlow = "&types=quote"
....
....
....
....
def fetchPrice(self, stock):
# get the price of a single stock
try:
resp = urlopen("{}{}{}".format(IEXStockFetcher.url_prefix,stock,IEXStockFetcher.url_suffix_price))
resp = json.loads(resp.readlines()[0].decode('utf8'))
price = float(resp[stock]['price'])
return price
except:
return self.fetchPrice(stock)
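For reference on the error in the question: a subclass of an abc.ABC base can only be instantiated once every @abstractmethod is overridden inside the subclass body, at class indentation level. A minimal sketch, using a simplified stand-in for the StockFetcher base class from the repository:
from abc import ABC, abstractmethod

class StockFetcher(ABC):              # simplified stand-in, not the repo's full base class
    @abstractmethod
    def fetchPrice(self, stock):
        raise NotImplementedError

class BrokenFetcher(StockFetcher):    # does not override fetchPrice
    pass

class WorkingFetcher(StockFetcher):   # overrides every abstract method
    def fetchPrice(self, stock):
        return 0.0

WorkingFetcher()                      # works
# BrokenFetcher()                     # TypeError: Can't instantiate abstract class BrokenFetcher ...
So the original error usually means the three fetch methods were not defined (or not indented) inside the IEXStockFetcher class body.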
I need to fetch information about likes, comments, etc. from a single post object, and here's the request code I send.
Example of my requests:
class StatsSN:
    def __init__(self, fb_post_id, fb_token):
self.fb_post_id = fb_post_id
self.fb_token = fb_token
def req_stats(self, url_method):
req = requests.get(url_method)
if req.status_code != 200:
# return req.json().get('error')
# return 'error'
log.info('FB_Statistics: %s' % req.json())
return -1
return req.json().get('summary').get('total_count')
def fb_likes(self):
url_method = fb_api_url + '%s/likes?summary=true&access_token=%s' % (self.fb_post_id, self.fb_token)
return self.req_stats(url_method)
def fb_reactions(self):
url_method = fb_api_url + '%s/reactions?summary=total_count&access_token=%s' % (self.fb_post_id, self.fb_token)
return self.req_stats(url_method)
def fb_comments(self):
url_method = fb_api_url + '%s/comments?summary=true&access_token=%s' % (self.fb_post_id, self.fb_token)
return self.req_stats(url_method)
def fb_sharedposts(self):
url_method = fb_api_url + '%s/sharedposts?access_token=%s' % (self.fb_post_id, self.fb_token)
req = requests.get(url_method)
if req.status_code != 200:
log.info('FB_Statistics: %s' % req.json())
return -1
return len(req.json().get('data'))
def fb_stats(self):
fb_likes, fb_reactions, fb_comments, fb_sharedposts = self.fb_likes(), self.fb_reactions(), self.fb_comments(), \
self.fb_sharedposts()
return int(fb_likes), int(fb_reactions), int(fb_comments), int(fb_sharedposts)
Is there a method in the Graph API to get info about a few posts in one request?
You can achieve this by sending a batch request. If you only need public data, a normal page token is good enough; however, if you need private information, you will need the specific page token of the page whose post metrics you want.
As the metrics you are referring to are public, you should be able to send a GET request with the following syntax:
https://graph.facebook.com/v2.12/?fields=id,comments.limit(0).summary(true),shares,reactions.limit(0).summary(true)&ids=STATUS_ID1,STATUS_ID2,STATUS_ID3,...,STATUS_ID50&access_token=PAGE_TOKEN
You can request up to 50 status IDs in one call.
limit(0).summary(true)
You need to add this part to comments and reactions, as it is the best practice for retrieving the total count of comments/reactions.
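A minimal sketch of that batch call with the requests library (the status IDs and page token below are placeholders):
import requests

post_ids = "STATUS_ID1,STATUS_ID2,STATUS_ID3"   # up to 50 comma-separated ids
page_token = "PAGE_TOKEN"

resp = requests.get(
    "https://graph.facebook.com/v2.12/",
    params={
        "fields": "id,comments.limit(0).summary(true),shares,reactions.limit(0).summary(true)",
        "ids": post_ids,
        "access_token": page_token,
    },
)
data = resp.json()   # one entry per requested status id
for post_id, post in data.items():
    print(post_id,
          post.get("reactions", {}).get("summary", {}).get("total_count"),
          post.get("comments", {}).get("summary", {}).get("total_count"))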
I am trying to crawl this link by sending JSON requests. My first request is:
parameters1 = {'ticker':'XOM', 'countryCode':'US',
'dateTime':'', 'docId':'1222737422 ',
'docType':'806','sequence':'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',
'messageNumber':'','count':'10',
'channelName':'/news/latest/company/us/xom', 'topic':'',
'_':'' }
firstUrl = "http://www.marketwatch.com/news/headline/getheadlines"
html1 = requests.get(firstUrl, params = parameters1, headers = header)
html_json1=(json.loads(html1.text))
To send the next requests, I have to extract docId from the corresponding HTML and add it to the new parameters. I don't know how to do that. Do you have any idea how to get the new HTML file after sending JSON requests?
import requests
import json
from bs4 import BeautifulSoup
def main():
html_url = 'http://www.marketwatch.com/investing/stock/xom'
resp = requests.get(html_url)
if resp.status_code != 200:
raise Exception("http request failed: %s" % resp)
soup = BeautifulSoup(resp.text, 'lxml')
# get value of `data-uniqueid` from last news node of 'MarketWatch News on XOM'
li_node = soup.select("#mwheadlines > div.headlinewrapper > ol > li[data-uniqueid]")[-1]
unique_id = li_node['data-uniqueid']
print('got unique_id=%r, from %r' % (unique_id, li_node.text.replace('\n', ' ').strip()))
baseUrl = 'http://www.marketwatch.com/news/headline/getheadlines'
parameters = {
'ticker':'XOM',
'countryCode':'US',
'docType':'806',
'docId': '', # (Optional) initial value extract from HTML page
'sequence':'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2', # initial value extract from HTML page
'messageNumber':'8589', # initial value extract from HTML page
'count':'10',
'channelName': '/news/latest/company/us/xom',
}
parameters.update(extract_page_params(unique_id))
while True:
resp = requests.get(baseUrl, params = parameters)
data = json.loads(resp.text) # array of size 10
first = data[0] # get first item of array
last = data[-1] # get last item of array
print("\ngot %d data, url: %s" % (len(data), resp.url))
print("\tfirst: %-42s, %s" % (first['UniqueId'], first['SeoHeadlineFragment']))
print("\t last: %-42s, %s" % (last['UniqueId'], last['SeoHeadlineFragment']))
print("")
uid = last['UniqueId'] # get value of UniqueId from dict object `last`
parameters.update(extract_page_params(uid))
input("press <enter> to get next")
def extract_page_params(uid):
sequence = ''
messageNumber = ''
docId = ''
if ':' in uid: # if the symbol ':' in string `uid`
# uid looks like `e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2:8499`
# so split it by ':'
sequence, messageNumber = uid.split(':')
else:
docId = uid
return {
'sequence': sequence,
'messageNumber': messageNumber,
'docId': docId,
}
if __name__ == '__main__':
main()
This is my code to solve your problem.
Since you are new to programming, I have added some comments.
You can copy it directly and run it with Python 3 (Python 2 should work as well).
I use Beautiful Soup to extract data from the HTML; it is a Python library for extracting data from HTML documents.
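For illustration, a tiny self-contained example of the select-by-attribute pattern used above (the sample HTML is made up):
from bs4 import BeautifulSoup

sample_html = '<ol><li data-uniqueid="abc:123">First</li><li data-uniqueid="def:456">Second</li></ol>'
soup = BeautifulSoup(sample_html, 'html.parser')
last_li = soup.select('ol > li[data-uniqueid]')[-1]   # CSS selector, last matching node
print(last_li['data-uniqueid'])                       # -> def:456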
I'm attempting to implement dynamic routing for a web framework. At the moment, the goal is to pass arguments into a function by way of the URL. So, if the user offers a URL of "/page/23", then the route function will extract the "23", which will then be used as a parameter for the page function. I am getting a KeyError, however.
import re
routing_table = {}
url = "/page/23"
def route(url, func):
key = url
key = re.findall(r"(.+?)/<[a-zA-Z_][a-zA-Z0-9_]*>", url)
if key:
params = re.findall(r"<([a-zA-Z_][a-zA-Z0-9_]*)>", url)
routing_table[key[0]] = [params, func]
else:
routing_table[url] = func
def find_path(url):
if url in routing_table:
return routing_table[url]
else:
return None
def page(page_id):
return "this is page %d" % page_id
route("/page/<page_id>", page)
print(routing_table[url])
When you called route, you used a url equal to "/page/<page_id>", but in the last line, url is a global variable equal to "/page/23".
It looks like there are other problems: replace your last line with
print(routing_table)
to see what you're doing.
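For context, after route("/page/<page_id>", page) runs, the table looks roughly like {'/page': [['page_id'], <function page>]}, so routing_table["/page/23"] raises the KeyError. A minimal sketch of a lookup that matches the registered prefix and passes the trailing URL segment to the handler (this dispatch helper is an illustration, not part of the original code):
def dispatch(url):
    # exact match first, for routes registered without <params>
    if url in routing_table:
        return routing_table[url]()
    # otherwise try the prefix stored by route() for parameterized urls
    prefix, _, arg = url.rpartition('/')
    entry = routing_table.get(prefix)
    if entry is None:
        return None
    params, func = entry
    return func(int(arg)) if arg.isdigit() else func(arg)

print(dispatch("/page/23"))   # -> this is page 23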