I have created a small script that scrapes a webpage and collects every item's name, link, image and price from a product table.
I am currently facing a problem storing multiple dataclasses. I want to check whether a new URL has appeared on the webpage, and if so, print out the name, image and price of that newly found URL.
import time
from typing import Optional
import attr
import requests
from selectolax.parser import HTMLParser
@attr.dataclass
class Info:
    store: str = attr.ib(factory=str)
    link: str = attr.ib(factory=str)
    name: Optional[str] = attr.ib(factory=str)
    price: Optional[str] = attr.ib(factory=str)
    image: Optional[str] = attr.ib(factory=str)
# -------------------------------------------------------------------------
# Get all latest products found in the webpage
# -------------------------------------------------------------------------
def from_page():
    with requests.get("https://www.footish.se/sneakers", timeout=5) as rep:
        if rep.status_code in (200, 404):
            doc = HTMLParser(rep.text)
            for product in doc.css('article.product-wrapper'):
                name = product.css_first('div.product-image > a').attributes.get('title')
                link = product.css_first('div.product-image > a').attributes.get('href')
                image = product.css_first('div.product-image > a > img').attributes.get('data-original')
                price = product.css_first('span.price-amount')
                return Info(
                    store="Footish",
                    link=link,
                    name=name,
                    image=image,
                    price=price
                )
if __name__ == '__main__':
    all_found_products = set()

    while True:
        get_all_products: Info = from_page()
        diff = set(get_all_products.link) - all_found_products

        for new_urls in diff:
            print(f"Found new url! {new_urls}")
            print(f"Name: {get_all_products.name}")
            print(f"image: {get_all_products.image}")
            print(f"price: {get_all_products.price}")

        print("Sleeping 120 sec")
        time.sleep(120)
My problem is that I don't know how to return the dataclass instances created inside the loop for product in doc.css('article.product-wrapper'):, since there are multiple products on the webpage. I want to store all found products, compare them against what was seen before to detect a new URL, and if one is found, print out the name, price and image of that new URL.
You should use a list to store the multiple Info instances and then return them all:
def from_page():
    with requests.get("https://www.footish.se/sneakers", timeout=5) as rep:
        if rep.status_code in (200, 404):
            doc = HTMLParser(rep.text)
            infos = []
            for product in doc.css('article.product-wrapper'):
                name = product.css_first('div.product-image > a').attributes.get('title')
                link = product.css_first('div.product-image > a').attributes.get('href')
                image = product.css_first('div.product-image > a > img').attributes.get('data-original')
                # css_first returns a Node; .text() pulls out the price string
                price_node = product.css_first('span.price-amount')
                price = price_node.text(strip=True) if price_node else None
                infos.append(Info(store="Footish", link=link, name=name,
                                  image=image, price=price))
            return infos
And the main block would look more like this:
all_found_urls = set()

while True:
    get_all_products = from_page()
    for info in get_all_products:
        if info.link not in all_found_urls:
            print(f"Found new url! {info.link}")
            print(f"Name: {info.name}")
            print(f"image: {info.image}")
            print(f"price: {info.price}")
            all_found_urls.add(info.link)

    print("Sleeping 120 sec")
    time.sleep(120)
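If you prefer the set-difference style from your original main block, a dictionary keyed by link works as well. This is just a sketch built on the same Info fields; note that keeping the seen URLs only in memory means everything is reported again after a restart, so persist the set somewhere if that matters:

# Sketch: index products by link so set arithmetic finds the new URLs
all_found_urls = set()

while True:
    products = {info.link: info for info in from_page()}

    for new_url in products.keys() - all_found_urls:
        info = products[new_url]
        print(f"Found new url! {new_url}")
        print(f"Name: {info.name}")
        print(f"image: {info.image}")
        print(f"price: {info.price}")

    all_found_urls.update(products.keys())
    print("Sleeping 120 sec")
    time.sleep(120)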
I am working on a project where I have to scrape a subreddit using PRAW, and I have to set a limit so that it scrapes only that many posts. For example, if I want to scrape the subreddit gaming (https://www.reddit.com/r/gaming/), I have to give a limit of 100 so it scrapes the first 100 posts. Instead, I want to first get the total number of posts in the gaming subreddit and then use that value as the limit to extract all the posts. I have searched the internet about the Pushshift API, but I don't know how to do that. Any help is appreciated!
Here is the code:
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from psaw import PushshiftAPI

load_dotenv(find_dotenv())

#Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])

#Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id = os.environ.get("client_id"),
                               client_secret = os.environ.get("client_secret"),
                               user_agent = os.environ.get("user_agent"))


def main(name, value):
    i = 0
    subreddit = reddit_read_only.subreddit(name)
    print(subreddit.created)
    while i < value:
        #Limits the scrapping for value number of posts
        for submission in subreddit.hot(limit=value):
            submission.comments.replace_more(limit=(value*30))
            lst = []
            #If there are any comments, then it will be saved in dataframe
            if submission.num_comments != 0:
                for comment in submission.comments.list():
                    lst.append(comment.body)
                df.loc[i] = [submission.title, submission.num_comments, lst]
            #If there are no comments in a post, then No comments will be stored
            elif submission.num_comments == 0:
                df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
            i += 1
    # print(df)

    name = 'Reddit_web_scrap_'+str(name) #save the file with certain name
    # df.to_csv(name + str('.csv'), index=False)
    return name


if __name__ == "__main__":
    print('#####################################################################')
    print('############### Reddit Web Scrapping Started ########################')
    print('#####################################################################')
    print()

    name = main('gaming', 50)

    print()
    print('Created {}.csv file!'.format(name))
    print()
    print('#####################################################################')
    print('################# Reddit Web Scrapping Ended ########################')
    print('#####################################################################')
I have set the limit to 50, which will scrape the first 50 posts. But I want to scrape all the posts available in gaming. If I put limit = "None", then it throws an error:
TypeError: '<' not supported between instances of 'int' and 'str'
And this is logical as well. So, I guess I won't be able to use limit = "None".
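Note that PRAW's listing methods do accept the Python value None (as opposed to the string "None") as a limit; with limit=None a listing yields as many items as Reddit's API exposes, roughly 1000, though the while i < value comparison above would still need a numeric value. A minimal sketch of that form, separate from the script above:

# Sketch: pass Python None, not the string "None", so PRAW fetches as much as the listing allows
subreddit = reddit_read_only.subreddit('gaming')
for submission in subreddit.hot(limit=None):
    print(submission.title)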
I have created a function total_posts() with the help of the Pushshift API that gives me the total number of posts available for a particular subreddit.
#Importing Dependencies
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from pmaw import PushshiftAPI

load_dotenv(find_dotenv())

#Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])

#Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id = os.environ.get("client_id"),
                               client_secret = os.environ.get("client_secret"),
                               user_agent = os.environ.get("user_agent"))


def total_posts(name):
    print("Calculating total number of posts")
    print()
    api = PushshiftAPI()
    api_request_generator = api.search_submissions(subreddit='ChatGPT', score=">=0")
    aita_submissions = pd.DataFrame([submission for submission in api_request_generator])
    print("Total number of posts in subreddit {} are {}".format(name, aita_submissions.shape[0]))
    return aita_submissions.shape[0]


def main(name, value):
    print('Creating dataframe')
    print()
    i = 0
    subreddit = reddit_read_only.subreddit(name)
    while i < value:
        #Limits the scrapping for value number of posts
        for submission in subreddit.hot(limit=value):
            submission.comments.replace_more(limit=(value*30))
            lst = []
            #If there are any comments, then it will be saved in dataframe
            if submission.num_comments != 0:
                for comment in submission.comments.list():
                    lst.append(comment.body)
                df.loc[i] = [submission.title, submission.num_comments, lst]
            #If there are no comments in a post, then No comments will be stored
            elif submission.num_comments == 0:
                df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
            i += 1
    print(df)

    name = 'Reddit_web_scrap_'+str(name) #save the file with certain name
    df.to_csv(name + str('.csv'), index=False)
if __name__ == "__main__":
subreddit_name = 'gaming'
print('#####################################################################')
print('#### Reddit Web Scrapping Started for {}'.format(subreddit_name) + '####')
print('#####################################################################')
print()
posts_number = total_posts(subreddit_name)
print()
main(subreddit_name, posts_number)
print()
print('Created {}.csv file!'.format(subreddit_name))
print()
print('#####################################################################')
print('################# Reddit Web Scrapping Ended ########################')
print('#####################################################################')
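A small aside on total_posts(): building a full DataFrame just to read its row count keeps every submission in memory, and the subreddit argument is hard-coded to 'ChatGPT' instead of using name. A sketch that reuses the same search_submissions call from the code above but only counts the results (assuming pmaw accepts these keyword arguments exactly as shown there):

# Sketch: count Pushshift results without materialising them into a DataFrame
def total_posts(name):
    api = PushshiftAPI()
    submissions = api.search_submissions(subreddit=name, score=">=0")  # use `name`, not a hard-coded subreddit
    count = sum(1 for _ in submissions)
    print("Total number of posts in subreddit {} are {}".format(name, count))
    return count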
I am trying to get this awesome code I found on GitHub to run: https://github.com/apryor6/stockstreamer/blob/master/data_fetcher.py
I made a few modifications to the code, as the API URL for IEX has changed since this code was published:
class IEXStockFetcher(StockFetcher):
    """
    Fetches stock information using iextrading.com API
    """
    url_prefix = "https://cloud.iexapis.com/stable/stock/market/batch?token=MY TOKEN"
    url_suffix_price = "&price"
    url_suffix_img = "&logo"
    url_suffix_highlow = "&quote"
When I step through the code and get to the end, I receive the following error: "Can't instantiate abstract class IEXStockFetcher with abstract methods fetchImageURL, fetchPrice, fetchStockHighLow"
I am relatively new to object-oriented programming in Python. Does anyone have any thoughts?
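For context, this error comes from Python's abc machinery rather than from the API change: a subclass of an abstract base class can only be instantiated once every method marked @abstractmethod has a concrete override, and the traceback is saying that fetchImageURL, fetchPrice and fetchStockHighLow are still abstract on your version of the class. A minimal, self-contained illustration (the class names here are stand-ins, not the project's real code):

from abc import ABC, abstractmethod


class StockFetcher(ABC):              # stand-in for the project's base class
    @abstractmethod
    def fetchPrice(self, stock):
        ...


class BrokenFetcher(StockFetcher):
    pass                              # no fetchPrice override, so still abstract


class WorkingFetcher(StockFetcher):
    def fetchPrice(self, stock):
        return 0.0                    # concrete override, so instantiation works


WorkingFetcher()                      # fine
# BrokenFetcher()                     # TypeError: Can't instantiate abstract class BrokenFetcher ...

In the code below, defining all three fetch* methods inside IEXStockFetcher, with exactly the names the base class expects, is what makes the class instantiable again.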
class IEXStockFetcher(StockFetcher):
    """
    Fetches stock information using iextrading.com API
    """
    url_prefix = "https://cloud.iexapis.com/stable/stock/"
    url_suffix_price = "/quote/latestPrice"
    url_suffix_img = "/logo"
    url_suffix_highlow = "/quote"
    url_suffix_token = "?token=pk_44de71531a5d400bb1bd98a2c7dd011d"

    ....
    ....

    def fetchPrice(self, stock):
        # get the price of a single stock
        try:
            resp = urlopen("{}{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_price, IEXStockFetcher.url_suffix_token))
            resp = json.loads(resp.readlines()[0].decode('utf8'))
            price = float(resp)
            return price
        except:
            return self.fetchPrice(stock)

    def fetchImageURL(self, stock):
        # get the image url of a single stock
        try:
            resp = urlopen("{}{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_img, IEXStockFetcher.url_suffix_token))
            resp = json.loads(resp.readlines()[0].decode('utf8'))
            return resp['url']
        except:
            return self.fetchImageURL(stock)

    def fetchStockHighLow(self, stock):
        # get the 52-week high/low of a single stock
        try:
            resp = urlopen("{}{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_highlow, IEXStockFetcher.url_suffix_token))
            resp = json.loads(resp.readlines()[0].decode('utf8'))
            return (resp['week52High'], resp['week52Low'])
        except:
            return self.fetchStockHighLow(stock)
I was able to get your code working with the new API. I had to make a few small modifications:
class IEXStockFetcher(StockFetcher):
    """
    Fetches stock information using iextrading.com API
    """
    url_prefix = "https://cloud.iexapis.com/stable/stock/market/batch?token=<MY TOKEN>&symbols="
    url_suffix_price = "&types=price"
    url_suffix_img = "&types=logo"
    url_suffix_highlow = "&types=quote"

    ....
    ....
    ....
    ....

    def fetchPrice(self, stock):
        # get the price of a single stock
        try:
            resp = urlopen("{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_price))
            resp = json.loads(resp.readlines()[0].decode('utf8'))
            price = float(resp[stock]['price'])
            return price
        except:
            return self.fetchPrice(stock)
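One caveat with both versions: the bare except combined with a recursive retry can loop forever if the endpoint keeps failing (bad token, delisted symbol, network down). A bounded-retry variant is sketched below; the helper name and retry count are my own choices, not part of the original project, and it assumes the endpoint returns a bare numeric price as the per-stock latestPrice URL does:

import json
import time
from urllib.request import urlopen


def fetch_price_with_retries(url, retries=3, delay=1.0):
    """Fetch a bare numeric price from `url`, retrying a limited number of times."""
    last_error = None
    for _ in range(retries):
        try:
            with urlopen(url) as resp:
                return float(json.loads(resp.read().decode('utf8')))
        except Exception as exc:  # narrow this to URLError/ValueError if preferred
            last_error = exc
            time.sleep(delay)
    raise RuntimeError("giving up on {}".format(url)) from last_error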
So I have this Craigslist scraper project I am working on, and I am running into a potential problem. I have a file url.py with a UrlObj class and its getters and setters. In my main.py, I instantiate that object and get the completed URL back, which is then passed to the Job class in main.py to do the scraping.
I would like to deploy this in the cloud in the future and have it run on a time interval (i.e. every day, every 4 hours, etc.), but I have noticed a problem. Every time this program is run, the UrlObj class is called, prompting the user to enter the relevant data to construct the URL. Since this will be running in the background in the cloud, no one will be able to answer the prompts every time it is built and run.
What I want is for url.py and UrlObj to be called only once, at the beginning, so the user can input and populate the fields needed to construct the URL. Then, every time the program is built and run afterwards, the URL the user made at the beginning should be used, without calling url.py and UrlObj to prompt for input again, since it will be running in the cloud on a time interval.
Is it too naive to set conditions around url = UrlObj().url to make sure it runs only once, like an if statement or a while loop?
url.py:
class UrlObj:
    def __init__(self):
        self.location = self.get_location()  # Location (i.e. City) being searched
        self.postal_code = self.get_postal_code()  # Postal code of location being searched
        self.query = self.get_query()  # Search for the type of items that will be searched
        self.max_price = self.get_max_price()  # Max price of the items that will be searched
        self.radius = self.get_radius()  # Radius of the area searched derived from the postal code given previously

        self.url = f"https://{self.location}.craigslist.org/search/sss?&max_price={self.max_price}&postal={self.postal_code}&query={self.query}&20card&search_distance={self.radius}"

    def get_location(self):
        location = input("Please enter the location: ")
        return location

    def get_postal_code(self):
        postal_code = input("Please enter the postal code: ")
        return postal_code

    def get_query(self):
        query = input("Please enter the item: ")
        return query

    def get_max_price(self):
        max_price = input("Please enter the max price: ")
        return max_price

    def get_radius(self):
        radius = input("Please enter the radius: ")
        return radius
main.py:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
from url import *


class Job():

    def __init__(self):
        self.driver = webdriver.Chrome(r"C:\Program Files\chromedriver")  # Path of Chrome web driver
        self.delay = 5  # The delay the driver gives when loading the web page

    # Load up the web page
    # Gets all relevant data on the page
    # Goes to next page until we are at the last page
    def load_craigslist_url(self, url):
        data = []
        self.driver.get(url)
        while True:
            try:
                wait = WebDriverWait(self.driver, self.delay)
                wait.until(EC.presence_of_element_located((By.ID, "searchform")))
                data.append(self.extract_post_titles())
                WebDriverWait(self.driver, 2).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform"]/div[3]/div[3]/span[2]/a[3]'))).click()
            except:
                break
        return data

    # Extracts all relevant information from the web-page and returns them as individual lists
    def extract_post_titles(self):
        all_posts = self.driver.find_elements_by_class_name("result-row")

        dates_list = []
        titles_list = []
        prices_list = []
        distance_list = []

        for post in all_posts:
            title = post.text.split("$")

            if title[0] == '':
                title = title[1]
            else:
                title = title[0]

            title = title.split("\n")
            price = title[0]
            title = title[-1]
            title = title.split(" ")
            month = title[0]
            day = title[1]
            title = ' '.join(title[2:])
            date = month + " " + day

            if not price[:1].isdigit():
                price = "0"
            int(price)

            raw_distance = post.find_element_by_class_name(
                'maptag').text
            distance = raw_distance[:-2]

            titles_list.append(title)
            prices_list.append(price)
            dates_list.append(date)
            distance_list.append(distance)

        return titles_list, prices_list, dates_list, distance_list

    # Kills browser
    def kill(self):
        self.driver.close()

    @staticmethod
    def organizeResults(results):
        titles_list = results[0][0]
        prices_list = list(map(int, results[0][1]))
        dates_list = results[0][2]
        distance_list = list(map(float, results[0][3]))

        list_of_attributes = []

        for i in range(len(titles_list)):
            content = {'Listing': titles_list[i], 'Price': prices_list[i], 'Date posted': dates_list[i],
                       'Distance from zip': distance_list[i]}
            list_of_attributes.append(content)

        list_of_attributes.sort(key=lambda x: x['Distance from zip'])

        return list_of_attributes

    @staticmethod
    def to_csv(dictionary):
        df = pd.DataFrame(dictionary)
        df.to_csv('data.csv', index=False)


if __name__ == '__main__':
    # This should be called only once!!!
    # Then the 'url' should be used every time main.py is built and ran, and not be constructed again by calling 'UrlObj().url'
    url = UrlObj().url

    scraper = Job()
    results = scraper.load_craigslist_url(url)
    scraper.kill()
    dictionary_of_listings = scraper.organizeResults(results)
    scraper.to_csv(dictionary_of_listings)
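To the question itself: rather than guarding UrlObj().url with an if or a while, one common pattern is to prompt once, cache the finished URL in a small file (or an environment variable in the cloud job), and reuse it on every later run. A minimal sketch, assuming a JSON file named url_cache.json is an acceptable store; the filename and helper function are illustrative, not part of the original project:

import json
import os

from url import UrlObj

CACHE_FILE = 'url_cache.json'  # hypothetical cache location


def get_or_create_url():
    """Prompt for the URL parts only the first time; afterwards reuse the cached URL."""
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE) as fh:
            return json.load(fh)['url']
    url = UrlObj().url  # the interactive prompts run only on this first call
    with open(CACHE_FILE, 'w') as fh:
        json.dump({'url': url}, fh)
    return url

In main.py the call url = UrlObj().url would then become url = get_or_create_url(), and only the very first run (or a run after deleting the cache file) asks for input.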
I have this code, which scrapes the Hacker News website with beautifulsoup4, and I am looking for a way to save the results into a DataFrame using pandas. I have already imported pandas in the code below, but I do not know how to save the results into a DataFrame. Right now it only scrapes the top Hacker News posts, but that can be changed.
import pandas as pd
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from math import ceil
import json, sys, argparse, validators

MAX_NUM_POSTS = 100


class HackerNewsScraper:
    URL = 'https://news.ycombinator.com/news'

    def __init__(self, posts):
        self._total_posts = posts
        self._total_pages = int(ceil(posts/30))
        self._stories = []

    def scrape_stories(self):
        """
        Fetches all HTML data.
        Each page is limited to 30 stories, this function will ensure enough pages are fetched.
        """
        page = 1
        while(page <= self._total_pages):  # Makes sure to visit sufficient amount of pages
            url = '{}?p={}'.format(self.URL, page)
            html = get_html(url)
            self.parse_stories(html)
            page += 1

    def parse_stories(self, html):
        """
        Given a BeautifulSoup nested data structure, html. parse_stories(html) will parse the data and select the desired fields.
        After getting title, uri, author, comments, points, and rank, it will save them in dictionary form in self._stories.
        """
        for storytext, subtext in zip(html.find_all('tr', {'class': 'athing'}),
                                      html.find_all('td', {'class': 'subtext'})):

            storylink = storytext.find_all('a', {'class': 'storylink'})
            sublink = subtext.select('a')

            # All requested data being saved in the dictionary story below
            TITLE = storylink[0].text.strip()
            LINK = storylink[0]['href']
            AUTHOR = sublink[0].text
            COMMENTS = sublink[-1].text
            POINTS = subtext.select('span')[0].text
            RANK = storytext.select('span.rank')[0].text.strip('.')

            story = {
                'title': TITLE,
                'uri': LINK,
                'author': AUTHOR,
                'points': POINTS,
                'comments': COMMENTS,
                'rank': RANK
            }

            # Make sure data satisfies requirements
            story = validate_story(story)

            # self._stories is an array of dictionaries that saves the requested number of stories
            self._stories.append(story)

            # If required number of stories met, stop parsing
            if len(self._stories) >= self._total_posts:
                return

    def print_stories(self):
        """
        Outputs the stories from list of dictionary format to JSON in STDOUT.
        """
        json.dump(self._stories, sys.stdout, indent=4)

    def get_stories(self):
        """
        Returns the scraped stories to the user in a list of dictionary format.
        Used for testing purposes.
        """
        return self._stories
def get_html(url):
    """
    Runs the HTML data through BeautifulSoup to get a BeautifulSoup object, a nested data structure.
    """
    response = get_response(url)
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        return html


def validate_story(story):
    """
    Ensures that all the story data is valid according to the task.
    Will return valid data for each field.
    """
    story['title'] = story['title'][:256]
    if not valid_title(story['title']):
        story['title'] = 'Valid title not found'

    story['author'] = story['author'][:256]
    if not valid_author(story['author']):
        story['author'] = 'Valid author not found'

    if not valid_uri(story['uri']):
        story['uri'] = 'Valid URI not found'

    story['comments'] = validate_number(story['comments'])
    story['points'] = validate_number(story['points'])
    story['rank'] = validate_number(story['rank'])

    return story


def valid_title(title):
    """
    Ensures that title is non empty string with <= 256 characters
    """
    return (len(title) <= 256 and title)


def valid_author(author):
    """
    Ensures that author is non empty string and <= 256 characters.
    Solved the issue of not finding an author by checking the fetched data with HN username rules.
    """
    if(author.find(' ') > -1):  # Hacker news username doesnt support whitespace
        return False
    return (len(author) <= 256 and author)


def valid_uri(url):
    """
    To be able to find the scraped stories, we need their URL.
    If data is not a valid URL, return False.
    """
    if(validators.url(url)):
        return True
    return False


def validate_number(numString):
    """
    Will make sure that the returned number is an integer.
    Will strip any non digits from the input and return the first number.
    """
    if numString.find('ago') > -1:  # If not found, 'time since posted' would replace points for example
        return 0
    digits = [int(s) for s in numString.split() if s.isdigit()]
    if len(digits) > 0:
        return digits[0]
    return 0
def get_response(url):
    """
    Attempts to get the content at 'url' by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)


def log_error(e):
    """
    Log the errors. Currently just printing them out to user.
    """
    print(e)


def validate_input(arg, arg_max):
    """
    Validate the user input. Makes sure it is less than or equal to 100 posts.
    """
    error_msg = 'Posts cannot exceed {}'.format(arg_max)
    if arg > arg_max:
        raise argparse.ArgumentTypeError(error_msg)


# Parses the number of posts input from user. Default is 10.
def parse_arguments():
    """
    Parses the argument input from the user. Default is 10.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--posts', '-p', metavar='n', type=int, default=1, help='number of posts (max 100)')
    args = parser.parse_args()
    validate_input(args.posts, MAX_NUM_POSTS)
    return args.posts


def main():
    """
    If user input is valid, will create a scraper and fetch requests number of posts and print them to the user.
    """
    try:
        posts = parse_arguments()
        hnews_scraper = HackerNewsScraper(posts)
        hnews_scraper.scrape_stories()
        hnews_scraper.print_stories()
    except argparse.ArgumentTypeError as ex:
        log_error(ex)


if __name__ == '__main__':
    main()
Try this, and don't forget that pandas is already imported in your script. Each parsed story is already collected as a dictionary:

story = {
    'title': TITLE,
    'uri': LINK,
    'author': AUTHOR,
    'points': POINTS,
    'comments': COMMENTS,
    'rank': RANK
}

so once scraping has finished you can build the DataFrame directly from the list of those dictionaries (zipping the individual strings would only pair up their characters):

stories = hnews_scraper.get_stories()
dt = pd.DataFrame(stories, columns=['title', 'uri', 'author', 'points', 'comments', 'rank'])
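For completeness, a sketch of how this could be wired into the existing main(); the to_csv call and the filename are my additions, not part of the original script:

def main():
    try:
        posts = parse_arguments()
        hnews_scraper = HackerNewsScraper(posts)
        hnews_scraper.scrape_stories()
        hnews_scraper.print_stories()

        # Build a DataFrame from the scraped stories and save it (filename is illustrative)
        dt = pd.DataFrame(hnews_scraper.get_stories(),
                          columns=['title', 'uri', 'author', 'points', 'comments', 'rank'])
        dt.to_csv('hacker_news.csv', index=False)
    except argparse.ArgumentTypeError as ex:
        log_error(ex)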
Looking at the example provided by wordpresslib, it's very straightforward to upload images to the media library. However, attaching the images to a post looks like it was never finished. Has anyone successfully attached the images?
#!/usr/bin/env python
"""
Small example script that publish post with JPEG image
"""
# import library
import wordpresslib
print 'Example of posting.'
print
url = raw_input('Wordpress URL (xmlrpc.php will be added):')
user = raw_input('Username:')
password = raw_input('Password:')
# prepare client object
wp = wordpresslib.WordPressClient(url+"xmlrpc.php", user, password)
# select blog id
wp.selectBlog(0)
# upload image for post
# imageSrc = wp.newMediaObject('python.jpg')
# FIXME if imageSrc:
# create post object
post = wordpresslib.WordPressPost()
post.title = 'Test post'
post.description = '''
Python is the best programming language in the earth !
No image BROKEN FIXME <img src="" />
'''
#post.categories = (wp.getCategoryIdFromName('Python'),)
# Add tags
post.tags = ["python", "snake"]
# do not publish post
idNewPost = wp.newPost(post, False)
print
print 'Posting successfull! (Post has not been published though)'
WordPressPost class:
class WordPressPost:
    """Represents post item
    """

    def __init__(self):
        self.id = 0
        self.title = ''
        self.date = None
        self.permaLink = ''
        self.description = ''
        self.textMore = ''
        self.excerpt = ''
        self.link = ''
        self.categories = []
        self.user = ''
        self.allowPings = False
        self.allowComments = False
        self.tags = []
        self.customFields = []

    def addCustomField(self, key, value):
        kv = {'key': key, 'value': value}
        self.customFields.append(kv)
WordPress saves uploaded images under website.com/wp-content/uploads/YEAR/MONTH/FILENAME.
Adding a simple image tag in that format to post.description displays the image in the post,
where YEAR is the current year with a 4 digit format (ex. 2015)
and MONTH is the current month with a leading zero (ex. 01,02,... 12)
and FILENAME is the file name submitted via imageSrc = wp.newMediaObject('python.jpg')
Example file name: website.com/wp-content/uploads/2015/06/image.jpg
Here is how I posted my image:
import time
import wordpresslib
import Image
from datetime import datetime
time = datetime.now()
h = str(time.strftime('%H'))
m = str(time.strftime('%M'))
s = str(time.strftime('%S'))
mo = str(time.strftime('%m'))
yr = str(time.strftime('%Y'))
url = 'WORDPRESSURL.xmlrpc.php'
wp = wordpresslib.WordPressClient(url,'USERNAME','PASSWORD')
wp.selectBlog(0)
imageSrc = wp.newMediaObject('testimage'+h+m+s+'.jpg') # Used this format so that if I post images with the same name it's unlikely they will overwrite each other
img = 'http://WORDPRESSURL/wp-content/uploads/'+yr+'/'+mo+'/testimage'+h+m+s+'.jpg'
post=wordpresslib.WordPressPost()
post.title='title'
post.description='<img src="'+img+'"/>'
idPost = wp.newPost(post, True)
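A small aside on building the upload path: the separate strftime calls can be collapsed into one timestamp, which keeps the timestamped filename and the /YEAR/MONTH/ segment consistent with each other. This is plain Python and independent of the wordpresslib API; the URL placeholder is the same as above:

# Sketch: build the timestamped filename and upload URL from a single timestamp
from datetime import datetime

now = datetime.now()
filename = now.strftime('testimage%H%M%S.jpg')
img = 'http://WORDPRESSURL/wp-content/uploads/' + now.strftime('%Y/%m/') + filename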