So I have this Craigslist scraper project I am working on, and I am running into a potential problem. I have a file url.py with a UrlObj class and its getters and setters. In my main.py, I instantiate that object and get the completed URL back, which is then passed to the Job class in main.py to do its scraping.
I would like to deploy this in the cloud in the future and have it run on a time interval (i.e. every day, every 4 hours, etc.), but I have noticed a problem. Every time this program is run, the UrlObj class is instantiated, prompting the user to enter the relevant data to construct the URL. Since this will be running unattended in the cloud, no one will be able to answer the prompts each time it is built and run.
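(For the scheduling side I'm assuming something like cron; a crontab entry along these lines would run main.py every four hours. The path is a placeholder, not part of my current setup.)

    0 */4 * * * python3 /path/to/main.py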
What I want is for url.py and UrlObj to be called only once, at the beginning, to let the user input and populate the fields needed to construct the URL. Then, every time the program is built and run, the URL the user made at the start should be reused, without calling url.py and UrlObj to prompt for input again, since it will be running in the cloud on a time interval.
Is it too naive to think I can set conditions around url = UrlObj().url to make sure it runs only once, like an if statement or a while loop? Something like the rough sketch below is what I have in mind.
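(A rough sketch of the idea; the cache file url_cache.txt is my own invention, not part of my current code.)

import os
from url import UrlObj

URL_CACHE = 'url_cache.txt'  # hypothetical file that persists the URL between runs

def get_url():
    # Reuse the URL if we already built it once; otherwise prompt and save it
    if os.path.exists(URL_CACHE):
        with open(URL_CACHE) as f:
            return f.read().strip()
    url = UrlObj().url  # prompts the user, first run only
    with open(URL_CACHE, 'w') as f:
        f.write(url)
    return url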
url.py:
class UrlObj:
    def __init__(self):
        self.location = self.get_location()        # Location (i.e. city) being searched
        self.postal_code = self.get_postal_code()  # Postal code of the location being searched
        self.query = self.get_query()              # The type of items to search for
        self.max_price = self.get_max_price()      # Max price of the items searched
        self.radius = self.get_radius()            # Radius of the area searched, derived from the postal code given previously
        self.url = f"https://{self.location}.craigslist.org/search/sss?&max_price={self.max_price}&postal={self.postal_code}&query={self.query}&20card&search_distance={self.radius}"

    def get_location(self):
        location = input("Please enter the location: ")
        return location

    def get_postal_code(self):
        postal_code = input("Please enter the postal code: ")
        return postal_code

    def get_query(self):
        query = input("Please enter the item: ")
        return query

    def get_max_price(self):
        max_price = input("Please enter the max price: ")
        return max_price

    def get_radius(self):
        radius = input("Please enter the radius: ")
        return radius
main.py:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
from url import *
class Job:
    def __init__(self):
        self.driver = webdriver.Chrome(r"C:\Program Files\chromedriver")  # Path of Chrome web driver
        self.delay = 5  # The delay the driver gives when loading the web page

    # Loads the web page, gets all relevant data on the page,
    # then goes to the next page until we are at the last page
    def load_craigslist_url(self, url):
        data = []
        self.driver.get(url)
        while True:
            try:
                wait = WebDriverWait(self.driver, self.delay)
                wait.until(EC.presence_of_element_located((By.ID, "searchform")))
                data.append(self.extract_post_titles())
                WebDriverWait(self.driver, 2).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform"]/div[3]/div[3]/span[2]/a[3]'))).click()
            except:
                break
        return data

    # Extracts all relevant information from the web page and returns it as individual lists
    def extract_post_titles(self):
        all_posts = self.driver.find_elements_by_class_name("result-row")
        dates_list = []
        titles_list = []
        prices_list = []
        distance_list = []
        for post in all_posts:
            title = post.text.split("$")
            if title[0] == '':
                title = title[1]
            else:
                title = title[0]
            title = title.split("\n")
            price = title[0]
            title = title[-1]
            title = title.split(" ")
            month = title[0]
            day = title[1]
            title = ' '.join(title[2:])
            date = month + " " + day
            if not price[:1].isdigit():
                price = "0"
            price = int(price)
            raw_distance = post.find_element_by_class_name('maptag').text
            distance = raw_distance[:-2]
            titles_list.append(title)
            prices_list.append(price)
            dates_list.append(date)
            distance_list.append(distance)
        return titles_list, prices_list, dates_list, distance_list

    # Kills the browser
    def kill(self):
        self.driver.close()

    @staticmethod
    def organizeResults(results):
        titles_list = results[0][0]
        prices_list = list(map(int, results[0][1]))
        dates_list = results[0][2]
        distance_list = list(map(float, results[0][3]))
        list_of_attributes = []
        for i in range(len(titles_list)):
            content = {'Listing': titles_list[i], 'Price': prices_list[i], 'Date posted': dates_list[i],
                       'Distance from zip': distance_list[i]}
            list_of_attributes.append(content)
        list_of_attributes.sort(key=lambda x: x['Distance from zip'])
        return list_of_attributes

    @staticmethod
    def to_csv(dictionary):
        df = pd.DataFrame(dictionary)
        df.to_csv('data.csv', index=False)


if __name__ == '__main__':
    # This should be called only once!
    # Then 'url' should be reused every time main.py is built and run,
    # not constructed again by calling 'UrlObj().url'
    url = UrlObj().url
    scraper = Job()
    results = scraper.load_craigslist_url(url)
    scraper.kill()
    dictionary_of_listings = scraper.organizeResults(results)
    scraper.to_csv(dictionary_of_listings)
I am trying to write my own Python script to find an account's top followed followers, and it seems to work fine. However, after a while, or after running the script more than 1-2 times, Instagram gives me a "try again" error. I've searched and found that this is Instagram temporarily blocking my IP, as I have made too many requests at once.
Does anyone know a way to get around this?
MY CODE
"""
WHAT DOES THIS SCRIPT ACTUALLY DO?:
This script enables you to scrape all your followers and then find X top followed followers.
--------------------------------------------------------------------------------------------
NOTICE:
Unfortunately it is very hard now a days to scrape social media sites, due to
timeout issues, to many pings in a set time and other request restrictions.
So this script can only be ran 1-3 times a day.
I've tried also using exciting API's but all these are either too slow, or simply
show a '428' to many requests error.
"""
import instaloader
from selenium import webdriver
import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from rich.console import Console
from rich.table import Column, Table
# Global vars
L = instaloader.Instaloader()
URL = "https://www.instagram.com/{}/"
usernameGlobal = None
passwordGlobal = None
console = Console()
def get_followers():
    # Login
    while True:  # Keep running if password/username was wrong
        try:
            global usernameGlobal, passwordGlobal
            print("\n"+"*-=-*"*5)
            usernameGlobal = input("> Enter your username: ")
            passwordGlobal = input("> Enter your password: ")
            L.login(usernameGlobal, passwordGlobal)
            print("\n"+"-"*28+"\n> Successfully Logged In!")
            print("> Please leave this program running in the background")
            print("> Until you see the 'FINISHED' message"+"\n"+"-"*28)
            break
        except:
            print("\n"+"-"*28+"\n> Wrong Username / Password"+"\n"+"-"*28)

    # Obtain profile metadata
    profile = instaloader.Profile.from_username(L.context, usernameGlobal)
    follow_list = []

    # Loop through each follower and add to the list
    for followee in profile.get_followers():
        follow_list.append(followee.username)
    return follow_list
def scrape_data(username):
    driver.get(URL.format(username))
    FOLLOWERS = "0"
    try:
        try:
            FOLLOWERS = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/header/section/ul/li[2]/a/span').text
        except:  # For people who you don't follow but follow you and have private accounts
            FOLLOWERS = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/header/section/ul/li[2]/span/span').text
    except:
        print("\n"+"-"*28+"\n> Please try this script again later!"+"\n"+"-"*28)

    result = ''.join([i for i in FOLLOWERS if i.isdigit()])
    return int(float(result))
def driver_login():
    driver.get("https://www.instagram.com")
    time.sleep(3)
    element = driver.find_element_by_xpath("//input[@name='username']")
    element.send_keys(usernameGlobal)
    element = driver.find_element_by_xpath("//input[@name='password']")
    element.send_keys(passwordGlobal)
    element.send_keys(Keys.RETURN)
    time.sleep(3)

    # -- This is for if you have two factor authentication enabled --
    # element = driver.find_element_by_xpath("//input[@name='verificationCode']")
    # key = input("Enter Activation key: ")
    # element.send_keys(key)
    # element.send_keys(Keys.RETURN)
    # time.sleep(3)
def output_result(size, result):
    n_input = 0

    # Get the user to select how many of the top followed followers they want
    while True:
        try:
            print("\n"+"*-=-*"*10)
            n_input = int(input("> How many of your top followed followers do you want to see?\n> E.g. 5 for top 5.\n> "))
            if n_input > size:
                continue
            break
        except:
            print("\n"+"-"*28+"\n> Invalid input. (Must be a number & less than your follower count)"+"\n"+"-"*28)

    # Make the table for a clean, user-friendly output and print it out
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Your Followers", style="dim", width=12)
    table.add_column("Their Follower Count")
    for x in range(n_input):
        table.add_row(
            list(result.keys())[x],
            str(list(result.values())[x])
        )
    console.print(table)
    return
if __name__ == "__main__":
list_of_followers = get_followers()
# Initialize the selenium driver
driver = webdriver.Chrome(ChromeDriverManager().install())
driver_login()
result = {}
for follower in list_of_followers:
followers = scrape_data(follower)
result[follower] = followers
# Sort the dictionary by descending order
result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
print("\n> FINISHED")
driver.quit()
output_result(len(list_of_followers), result)
exit(0)
You can potentially make unlimited requests if you use proxies. You can buy thousands of proxies from various sites and rotate through them.
Simply pass your proxies to your GET request and enjoy:
proxyDict = {
    "http": http_proxy,
    "https": https_proxy,
    "ftp": ftp_proxy
}

r = requests.get(url, headers=headers, proxies=proxyDict)
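If you want to actually rotate through a pool rather than use one fixed proxy, here is a minimal sketch (the proxy addresses and urls_to_scrape are placeholders to replace with your purchased proxies and your own URL list):

import itertools
import requests

# Placeholder proxies - substitute the ones you purchased
proxies = ["http://1.2.3.4:8080", "http://5.6.7.8:3128"]
proxy_pool = itertools.cycle(proxies)  # endless round-robin over the pool

urls_to_scrape = ["https://www.instagram.com/instagram/"]  # your own list here
for url in urls_to_scrape:
    proxy = next(proxy_pool)  # pick the next proxy for every request
    r = requests.get(url, proxies={"http": proxy, "https": proxy})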
Also, for Selenium (adapted from another answer):
PROXY = "1.111.111.1:8080"  # your proxy

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=%s' % PROXY)

chrome = webdriver.Chrome(chrome_options=chrome_options)
chrome.get("https://www.instagram.com")
I have created a small script that scrapes all item names, links, images and prices from a product table on a webpage.
I am currently facing a problem where I am not able to store multiple dataclasses. I want to first check whether a new URL has appeared on the webpage, and when there is a change, print out the name, image and price of the newly found URL.
import time
from typing import Optional

import attr
import requests
from selectolax.parser import HTMLParser


@attr.dataclass
class Info:
    store: str = attr.ib(factory=str)
    link: str = attr.ib(factory=str)
    name: Optional[str] = attr.ib(factory=str)
    price: Optional[str] = attr.ib(factory=str)
    image: Optional[str] = attr.ib(factory=str)
# -------------------------------------------------------------------------
# Get all latest products found in the webpage
# -------------------------------------------------------------------------
def from_page():
    with requests.get("https://www.footish.se/sneakers", timeout=5) as rep:
        if rep.status_code in (200, 404):
            doc = HTMLParser(rep.text)
            for product in doc.css('article.product-wrapper'):
                name = product.css_first('div.product-image > a').attributes.get('title')
                link = product.css_first('div.product-image > a').attributes.get('href')
                image = product.css_first('div.product-image > a > img').attributes.get('data-original')
                price = product.css_first('span.price-amount')
                return Info(
                    store="Footish",
                    link=link,
                    name=name,
                    image=image,
                    price=price
                )
if __name__ == '__main__':
    all_found_products = set()
    while True:
        get_all_products: Info = from_page()
        diff = set(get_all_products.link) - all_found_products
        for new_urls in diff:
            print(f"Found new url! {new_urls}")
            print(f"Name: {get_all_products.name}")
            print(f"image: {get_all_products.image}")
            print(f"price: {get_all_products.price}")
        print("Sleeping 120 sec")
        time.sleep(120)
My problem is that I don't know how to return the dataclasses built inside the for loop (for product in doc.css('article.product-wrapper'):), since there are multiple products on the webpage. I want to store all found products, then compare against what I have already seen, and if there is a new URL, print out the name, price and image for it.
You should use a list to store the multiple Info instances, then return them all. In your version the return sits inside the for loop, so from_page exits after building the first product. (Two smaller fixes below: css_first('span.price-amount') returns a node rather than its text, and set(get_all_products.link) was building a set of the characters of a single link string, which is why your diff never worked as intended.)
def from_page():
    infos = []
    with requests.get("https://www.footish.se/sneakers", timeout=5) as rep:
        if rep.status_code in (200, 404):
            doc = HTMLParser(rep.text)
            for product in doc.css('article.product-wrapper'):
                name = product.css_first('div.product-image > a').attributes.get('title')
                link = product.css_first('div.product-image > a').attributes.get('href')
                image = product.css_first('div.product-image > a > img').attributes.get('data-original')
                price_node = product.css_first('span.price-amount')
                price = price_node.text() if price_node else None  # take the text, not the node object
                infos.append(Info(store="Footish", link=link, name=name,
                                  image=image, price=price))
    return infos
And the main loop would look more like this, tracking the URLs you have already seen:
all_found_urls = set()
while True:
    get_all_products = from_page()
    for info in get_all_products:
        if info.link not in all_found_urls:
            print(f"Found new url! {info.link}")
            print(f"Name: {info.name}")
            print(f"image: {info.image}")
            print(f"price: {info.price}")
            all_found_urls.add(info.link)
    print("Sleeping 120 sec")
    time.sleep(120)
I am trying to get this awesome code I found on GitHub to run: https://github.com/apryor6/stockstreamer/blob/master/data_fetcher.py
I made a few modifications to the code, as the API URL for IEX has changed since this code was published:
class IEXStockFetcher(StockFetcher):
    """
    Fetches stock information using iextrading.com API
    """
    url_prefix = "https://cloud.iexapis.com/stable/stock/market/batch?token=MY TOKEN"
    url_suffix_price = "&price"
    url_suffix_img = "&logo"
    url_suffix_highlow = "&quote"
When I step through the code and get to the end, I receive the following error: "Can't instantiate abstract class IEXStockFetcher with abstract methods fetchImageURL, fetchPrice, fetchStockHighLow"
I am relatively new to object-oriented programming in Python. Does anyone have any thoughts?
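For context, that error means IEXStockFetcher subclasses an abstract base class (StockFetcher, presumably defined with the abc module) without overriding all of its abstract methods. A minimal illustration of the mechanism (the class and method names here are made up for the example):

from abc import ABC, abstractmethod

class StockFetcher(ABC):
    @abstractmethod
    def fetchPrice(self, stock):
        ...

class IncompleteFetcher(StockFetcher):
    pass  # does not override fetchPrice

class CompleteFetcher(StockFetcher):
    def fetchPrice(self, stock):
        return 0.0  # any concrete implementation will do

# IncompleteFetcher()  # raises TypeError: Can't instantiate abstract class
fetcher = CompleteFetcher()  # works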
class IEXStockFetcher(StockFetcher):
    """
    Fetches stock information using iextrading.com API
    """
    url_prefix = "https://cloud.iexapis.com/stable/stock/"
    url_suffix_price = "/quote/latestPrice"
    url_suffix_img = "/logo"
    url_suffix_highlow = "/quote"
    url_suffix_token = "?token=pk_44de71531a5d400bb1bd98a2c7dd011d"

    ....
    ....

    def fetchPrice(self, stock):
        # get the price of a single stock
        try:
            resp = urlopen("{}{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_price, IEXStockFetcher.url_suffix_token))
            resp = json.loads(resp.readlines()[0].decode('utf8'))
            price = float(resp)
            return price
        except:
            return self.fetchPrice(stock)

    def fetchImageURL(self, stock):
        # get the image url of a single stock
        try:
            resp = urlopen("{}{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_img, IEXStockFetcher.url_suffix_token))
            resp = json.loads(resp.readlines()[0].decode('utf8'))
            return resp['url']
        except:
            return self.fetchImageURL(stock)

    def fetchStockHighLow(self, stock):
        # get the 52-week high/low of a single stock
        try:
            resp = urlopen("{}{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_highlow, IEXStockFetcher.url_suffix_token))
            resp = json.loads(resp.readlines()[0].decode('utf8'))
            return (resp['week52High'], resp['week52Low'])
        except:
            return self.fetchStockHighLow(stock)
I was able to get your code working with the new API; I had to make a few small modifications:
class IEXStockFetcher(StockFetcher):
    """
    Fetches stock information using iextrading.com API
    """
    url_prefix = "https://cloud.iexapis.com/stable/stock/market/batch?token=<MY TOKEN>&symbols="
    url_suffix_price = "&types=price"
    url_suffix_img = "&types=logo"
    url_suffix_highlow = "&types=quote"

    ....
    ....
    ....
    ....

    def fetchPrice(self, stock):
        # get the price of a single stock
        try:
            resp = urlopen("{}{}{}".format(IEXStockFetcher.url_prefix, stock, IEXStockFetcher.url_suffix_price))
            resp = json.loads(resp.readlines()[0].decode('utf8'))
            price = float(resp[stock]['price'])
            return price
        except:
            return self.fetchPrice(stock)
I keep getting this error in multiple scripts. I'm doing a lot of scraping and have a loop that scrapes through hundreds of pages, and at some point the script just stops due to this error.
Here's an example of a script.
Example 2:
def scrape(urls):
    for url in urls:
        session = HTMLSession()
        resp = session.get(url)
        resp.html.render()
        try:
            phone = resp.html.find('span.phone')[0].text
        except IndexError:
            phone = None
        biz_name = resp.html.find('h1')[0].text
        try:
            biz_desc = resp.html.find('p.biz-description-text')[0].text
        except IndexError:
            biz_desc = None
        biz_location = resp.html.find('span.title-address-text')[0].text
        city = biz_location.split(',')[-1]
        print(
            f'phone is: {phone}\nthe business name is: {biz_name}\nthe description is: {biz_desc}\nthe city is: {city}')
        import_data(biz_name, phone, biz_desc, city)
def import_data(name, phone, desc, city):
    global keyword
    wp_title_box = driver.find_element_by_xpath('//*[@id="title"]')
    wp_title_box.send_keys(name)
    time.sleep(1)
    wp_desc_box = driver.find_element_by_xpath('//*[@id="content_ifr"]')
    wp_desc_box.send_keys(desc)
    time.sleep(1)
    new_field_button = driver.find_element_by_xpath('//*[@id="newmeta-submit"]')
    select_box = Select(driver.find_element_by_xpath('//*[@id="metakeyselect"]'))
    select_box.select_by_value("ad_city")
    wp_city_fill = driver.find_element_by_xpath('//*[@id="metavalue"]')
    wp_city_fill.send_keys(city)
    new_field_button.click()
    time.sleep(2)
    select_box.select_by_value("ad_phone")
    wp_city_fill = driver.find_element_by_xpath('//*[@id="metavalue"]')
    wp_city_fill.send_keys(phone)
    new_field_button.click()
    time.sleep(2)
    select_box.select_by_value("ad_promote")
    wp_city_fill = driver.find_element_by_xpath('//*[@id="metavalue"]')
    wp_city_fill.send_keys('1')
    new_field_button.click()
    time.sleep(2)
    save_btn = driver.find_element_by_xpath('//*[@id="save-post"]')
    driver.execute_script("window.scrollTo(0,0);")
    time.sleep(1)
    save_btn.click()
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="menu-posts"]/ul/li[3]/a').click()
    time.sleep(2)
I've added example 2, as example 1 was solved by the loop provided below.
In the second example the script should end since I'm using a for loop: once it has finished going through all of the URLs and importing them, it should be done. Am I missing something?
Your program never terminates: number calls scrape, which calls number, which calls scrape, and so on. If you are going to use recursion, you need a terminating, or base, case.
One suggestion is to use a counter that tracks the depth of the recursion and is incremented at each step, stopping once it reaches a specified depth, as in the sketch below.
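(A rough sketch of the counter idea only; MAX_DEPTH and the body are illustrative, not from the original script.)

import random

MAX_DEPTH = 1000  # assumed recursion limit, chosen for illustration

def scrape(rand_num, depth=0):
    if depth >= MAX_DEPTH:  # base case: stop once we are deep enough
        return
    # ... do the request/parsing work for this coupon code here ...
    scrape(random.randint(0, 99999999), depth + 1)  # recurse with the counter incremented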
That said, I think that for what you are doing you do not need recursion at all, which is expensive due to the overhead of function calls. A simple loop would be fine:
import random
import urllib3
from requests_html import HTMLSession

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def scrape(rand_num):
    session = HTMLSession()
    resp = session.get("https://www.example.com/prize/?d=" + '92' + str(rand_num))
    resp.html.render()
    print(f'trying coupon code 92{rand_num}')
    prize = resp.html.find(containing="You've won a prize")
    print(prize)
    if prize:
        print("https://www.example.com/prize/?d=" + '92' + str(rand_num))


def number():
    for i in range(99999999):
        x = random.randint(0, 99999999)
        scrape(x)


number()
I'm trying to make a little testing script which can post something to my testing Facebook group. I think the best way is to use Selenium WebDriver, since I'm not using the Graph API.
Login works correctly. Then I get the group (self.driver.get(group_url)). Next, I locate a textarea element and send my text there - this works (according to self.driver.save_screenshot..).
Now I locate the submit/post button. I think it is located correctly, because I've tested it by copying its XPath via Inspect Element in Chrome.
So I call click(). Everything seems to work, but there is no new post.
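I'm wondering whether I should explicitly wait for the button to become clickable before clicking it, something like this sketch (using the same XPath as in my script below):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait up to 10 seconds until the post button is clickable, then click it
button = WebDriverWait(self.driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="u_0_1w"]/div/div[5]/div/ul/li[2]/button')))
button.click()

Anyway, here is my full script: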
# -*- coding: utf-8 -*-
from selenium import webdriver
import mLib
from selenium.webdriver.common.keys import Keys
import time

GROUPS = ['https://www.facebook.com/groups/1467884653516334/']
TEST = 'TESTING TEXT'


class base():
    def __init__(self):
        self.driver = webdriver.PhantomJS()
        self.settings = {}
        self.user = None
        self.password = None
        self.logged = False
        self.groups = []
        self.set_settings()
        self.groups = self.get_groups()

    def get_post_button(self):
        print self.driver.find_elements_by_xpath("//*[contains(text(), 'Post')]")

    def get_groups(self):
        # if 'groups' in self.settings.keys():
        #     groups = self.settings['groups'].split('*')
        #     return groups if groups is not None else []
        return GROUPS

    def set_settings(self):
        with open('settings.txt') as f:
            for line in f:
                splitted = line.split('::')
                self.settings[splitted[0]] = splitted[1]

    def login(self, username, password):
        self.driver.get('http://www.facebook.com')
        user_form = self.driver.find_element_by_id('email')
        user_form.send_keys(username)
        pass_form = self.driver.find_element_by_id('pass')
        pass_form.send_keys(password)
        pass_form.send_keys(Keys.RETURN)
        time.sleep(5)
        if 'You must log in to continue' in self.driver.page_source:
            self.login(username, password)
        self.logged = True

    def send_post(self, text, group):
        assert self.logged == True
        self.driver.get(group)
        mLib.printToFile('source.txt', self.driver.page_source.encode('utf-8'))
        inner_wraps = self.driver.find_elements_by_css_selector('div.innerWrap')
        for iw in inner_wraps:
            try:
                text_field = iw.find_element_by_css_selector('textarea')
                text_field.send_keys(text)
                self.driver.save_screenshot('screen.png')
            except:
                continue
        button = self.driver.find_element_by_xpath('//*[@id="u_0_1w"]/div/div[5]/div/ul/li[2]/button')
        print button.get_attribute('outerHTML').encode('utf-8')
        webdriver.ActionChains(self.driver).move_to_element(button).click(button).perform()


bs = base()
bs.login('email@gmail.com', 'password')
bs.send_post(TEST, GROUPS[0])
print 'OK'
bs.driver.quit()
Why is that so? Do you have any advice?