I am new to Stack Overflow and this is my first post, so I hope I can explain myself well. Thanks in advance for your help!
I am using Scrapy to scrape a popular real estate website from my native country. I am doing fine with all the characteristics I want, such as price, surface, bedrooms, among others, but I haven't been able to get the latitude/longitude of a property. On the website, for example https://www.portalinmobiliario.com/MLC-564988630-estilo-mariposa-_JM#position=2&type=item&tracking_id=ed337e69-9999-4ede-b393-ef378e1a5675, there is a Google Maps location as the image shows, and inside that HTML element it is possible to get the lat/long (highlighted in blue), but when I try to reach this element in my code the spider doesn't recognize it.
Using the CSS selector crs_location = response.css('div.map-container img:nth-child(1)').getall() I am able to get the first img inside the div, which returns https://http2.mlstatic.com/resources/frontend/web-vip/ui-dist/images/pin-real-estate-d1ebb73e65.svg, but when I change the nth-child to crs_location = response.css('div.map-container img:nth-child(2)').getall() to get the second child (the one I want), crs_location comes back empty.
I would appreciate any help figuring out how to get the lat/long of the property.
Thanks!
HTML elements
Complete Code:
import scrapy
from scrapy import Selector
import requests
import pandas as pd
import numpy as np
# Import the CrawlerProcess
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class Spider_Inmob(scrapy.Spider):
    name = 'spider_inmob'
    #download_delay = 3

    # start_requests method
    def start_requests(self):
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        i = 1
        page = 0
        for y in range(1):
            url = 'http://portalinmobiliario.com/venta/departamento/propiedades-usadas/providencia-metropolitana/_Desde_' + str(page)
            print("----------PRUEBA1--------------" + str(page))
            page = 51 + 50*i
            i += 1
            yield scrapy.Request(url=url, callback=self.parse, headers=headers)

    def parse(self, response):
        global aux3
        links_busqueda = response.css('ol.ui-search-layout > li.ui-search-layout__item a.ui-search-result__content.ui-search-link::attr(href)').getall()
        print(len(links_busqueda))
        for url in links_busqueda:
            aux3 = aux3 + 1
            print(aux3)
            yield response.follow(url=url, callback=self.parse_propiedad, meta={'dont_redirect': True, 'handle_httpstatus_list': [302]})

    def parse_propiedad(self, response):
        global aux2
        aux2 = aux2 + 1
        global crs_Bedroom, crs_Currency, crs_Link, crs_Parking, crs_Price, crs_Restroom, crs_Storage, crs_Total_Surface, crs_Useful_Surface, crs_location
        global Nombre_variables
        aux = 1
        crs_prueba = response.css('header.item-title > h1.item-title__primary::text').getall()

        # This loop goes over each property characteristic: total surface, bedrooms, bathrooms, etc.
        for i in range(20):
            variable = response.css('section.specs-container > ul.specs-list li.specs-item:nth-child(' + str(i) + ') > strong::text').getall()
            variable2 = response.css('section.specs-container > ul.specs-list li.specs-item:nth-child(' + str(i) + ') > span::text').getall()
            np_variable = np.array(variable)
            if not variable:
                a = 0
            else:
                for var in Nombre_variables:
                    if np_variable[0] == "Superficie total":
                        crs_Total_Surface = variable2
                    elif np_variable[0] == "Superficie útil":
                        crs_Useful_Surface = variable2
                    elif np_variable[0] == "Dormitorios":
                        crs_Bedroom = variable2
                    elif np_variable[0] == "Baños":
                        crs_Restroom = variable2
                    elif np_variable[0] == "Estacionamientos":
                        crs_Parking = variable2
                    elif np_variable[0] == "Bodegas":
                        crs_Storage = variable2

        crs_Link = response.url
        crs_location = response.css('div.map-container img:nth-child(2)').getall()
        print("\n\n\n")
        print(crs_location)
        print("\n\n\n")

        # As we have two kinds of currency, we convert everything to UF
        variable3 = response.css('fieldset.item-price span.price-tag > span.price-tag-symbol::text').getall()
        np_variable3 = np.array(variable3)
        if np_variable3[0] != "UF":
            crs_Currency = "$"
            variable4 = response.css('fieldset.item-price span.price-tag > span.price-tag-fraction::text').getall()
            variable4 = str(variable4).strip("['']")
            variable4 = str(variable4).replace(".", "")
            np_variable4 = np.array(variable4)
            variable4 = float(variable4)
            crs_Price = round(variable4/28500, 0)
        else:
            crs_Currency = response.css('fieldset.item-price span.price-tag > span.price-tag-symbol::text').getall()
            crs_Price = response.css('fieldset.item-price span.price-tag > span.price-tag-fraction::text').getall()

        df2 = {'Link': [crs_Link],
               'Currency': [crs_Currency],
               'Price': [crs_Price],
               'Total Surface': [crs_Total_Surface],
               'Useful Surface': [crs_Useful_Surface],
               'Location': [crs_location],
               'Bedroom': [crs_Bedroom],
               'Restroom': [crs_Restroom],
               'Parking': [crs_Parking],
               'Storage': [crs_Storage]}

        global df3
        df3 = df3.append(df2, ignore_index=True)

# Names of the characteristics to extract
Nombre_variables = ["Superficie total", "Superficie útil", "Dormitorios", "Baños", "Estacionamientos", "Bodegas"]
Dict_Nombre_variables = {}

# Initialize DataFrames
headers = ["Link", "Currency", "Price", "Total Surface", "Useful Surface", "Location", "Bedroom", "Restroom", "Parking", "Storage"]
df_data = pd.DataFrame(columns=headers)
df3 = pd.DataFrame(columns=headers)

# Initialize global variables used in the methods
aux2 = 0
crs_Link = 0
crs_Currency = 0
crs_Price = 0
crs_Total_Surface = 0
crs_Useful_Surface = 0
crs_location = 0
crs_Bedroom = 0
crs_Restroom = 0
crs_Parking = 0
crs_Storage = 0
aux3 = 0

# Run the Spider
process = CrawlerProcess({'USER_AGENT': 'hol'})
process.crawl(Spider_Inmob)
process.start()

path = "D:\\0. Documentos\\7. DataCamp\\1. WebScraping\\99. Ejemplos\\PortalInmob.csv"
df3.to_csv(path)
print(df3.head())
print(df3)
print(df3['Location'])
Pretty trivial with requests and regex, since we know it's the only lat/lon on the page and we know the URL format: the map image URL embeds the coordinates as center=<lat>%2C<lon> (%2C is the URL-encoded comma). We can capture that portion with a regex and split it apart.
import requests
import re
url = 'https://www.portalinmobiliario.com/MLC-564988630-estilo-mariposa-_JM#position=2&type=item&tracking_id=ed337e69-9999-4ede-b393-ef378e1a5675'
r = requests.get(url).text
lat, lon = re.findall(r'center=(-?\d+\.\d+\%2C-?\d+\.\d+)',r)[0].split('%2C')
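If you prefer to stay inside your Scrapy spider, the same regex can be applied to response.text in parse_propiedad, something along these lines (an untested sketch; add import re at the top of the file):
# The static map image URL embeds the coordinates as center=<lat>%2C<lon>,
# so they can be pulled straight out of the raw HTML.
match = re.search(r'center=(-?\d+\.\d+)%2C(-?\d+\.\d+)', response.text)
if match:
    lat, lon = match.groups()
    crs_location = [lat, lon]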
I am trying to scrape all the possible data from this webpage: Gstaad 2017.
Here is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.support.ui import Select

# Start the driver and go to the starting webpage
driver = webdriver.Chrome("C:/Users/aldi/Downloads/chromedriver.exe")
driver.get('http://www.bvbinfo.com/Tournament.asp?ID=3294&Process=Matches')

# Import the HTML into Python
page = requests.get('http://www.bvbinfo.com/Tournament.asp?ID=3294&Process=Matches')
soup = BeautifulSoup(driver.page_source, 'lxml')
stages = soup.find_all('div')
stages = driver.find_elements_by_class_name('clsTournBracketHeader')[-1].text

# TODO: the first row (country quota matches) has no p tag and therefore is not included in the data
rows = []
paragraphs = []
empty_paragraphs = []
for x in soup.find_all('p'):
    if len(x.get_text(strip=True)) != 0:
        paragraph = x.extract()
        paragraphs.append(paragraph)
    if len(x.get_text(strip=True)) == 0:
        empty_paragraph = x.extract()
        empty_paragraphs.append(empty_paragraph)

# players
home_team_player_1 = ''
home_team_player_2 = ''
away_team_player_1 = ''
away_team_player_2 = ''

for i in range(0, len(paragraphs)):
    # round and stage of the competition
    round_n = paragraphs[i].find('u').text
    paragraph_rows = paragraphs[i].text.split('\n')[1:-1]
    counter = 0
    for j in range(0, len(paragraph_rows)):
        # TODO: tournament info, these can vary from tournament to tournament
        tournament_info = soup.find('td', class_='clsTournHeader').text.strip().split()
        tournament_category = [' '.join(tournament_info[0:2])][0]
        tournament_prize_money = tournament_info[2]
        # TODO: tournament city can also have two elements, not just one
        tournament_city = tournament_info[3]
        tournament_year = tournament_info[-1]
        tournament_days = tournament_info[-2][:-1].split("-")
        tournament_starting_day = tournament_days[0]
        tournament_ending_day = tournament_days[-1]
        tournament_month = tournament_info[-3]
        tournament_stars = [' '.join(tournament_info[5:7])][0]

        players = paragraphs[i].find_all('a', {'href': re.compile('.*player.*')})
        home_team_player_1 = players[counter + 0].text
        home_team_player_2 = players[counter + 1].text
        away_team_player_1 = players[counter + 2].text
        away_team_player_2 = players[counter + 3].text

        # matches
        match = paragraph_rows[j].split(":")[0].split()[-1].strip()

        # nationalities
        nationalities = ["United", "States"]
        if paragraph_rows[j].split("def.")[0].split("/")[1].split("(")[0].split(" ")[3] in nationalities:
            home_team_country = "United States"
        else:
            home_team_country = paragraph_rows[j].split("def.")[0].split("/")[1].split("(")[0].split(" ")[-2]
        if paragraph_rows[j].split("def.")[1].split("/")[1].split(" ")[3] in nationalities:
            away_team_country = "United States"
        else:
            away_team_country = paragraph_rows[j].split("def.")[1].split("/")[1].split("(")[0].split(" ")[-2]

        parentheses = re.findall(r'\(.*?\)', paragraph_rows[j])
        if "," in parentheses[0]:
            home_team_ranking = parentheses[0].split(",")[0]
            home_team_ranking = home_team_ranking[1:-1]
            home_team_qualification_round = parentheses[0].split(",")[1]
            home_team_qualification_round = home_team_qualification_round[1:-1]
        else:
            home_team_ranking = parentheses[0].split(",")[0]
            home_team_ranking = home_team_ranking[1:-1]
            home_team_qualification_round = None
        if "," in parentheses[1]:
            away_team_ranking = parentheses[1].split(",")[0]
            away_team_ranking = away_team_ranking[1:-1]
            away_team_qualification_round = parentheses[1].split(",")[1]
            away_team_qualification_round = away_team_qualification_round[1:-1]
        else:
            away_team_ranking = parentheses[1].split(",")[0]
            away_team_ranking = away_team_ranking[1:-1]
            match_duration = parentheses[2]
            match_duration = match_duration[1:-1]
            away_team_qualification_round = None

        # sets
        sets = re.findall(r'\).*?\(', paragraph_rows[j])
        sets = sets[1][1:-1]
        if len(sets.split(",")) == 2:
            score_set1 = sets.split(",")[0]
            score_set2 = sets.split(",")[1]
            score_set3 = None
        if len(sets.split(",")) == 3:
            score_set1 = sets.split(",")[0]
            score_set2 = sets.split(",")[1]
            score_set3 = sets.split(",")[2]

        row = {"home_team_player_1": home_team_player_1,
               "home_team_player_2": home_team_player_2,
               "away_team_player_1": away_team_player_1,
               "away_team_player_2": away_team_player_2,
               "match": match,
               "home_team_country": home_team_country,
               "away_team_country": away_team_country,
               "home_team_ranking": home_team_ranking,
               "away_team_ranking": away_team_ranking,
               "match_duration": match_duration,
               "home_team_qualification_round": home_team_qualification_round,
               "away_team_qualification_round": away_team_qualification_round,
               "score_set1": score_set1,
               "score_set2": score_set2,
               "score_set3": score_set3,
               "tournament_category": tournament_category,
               "tournament_prize_money": tournament_prize_money,
               "tournament_city": tournament_city,
               "tournament_year": tournament_year,
               "tournament_starting_day": tournament_starting_day,
               "tournament_ending_day": tournament_ending_day,
               "tournament_month": tournament_month,
               "tournament_stars": tournament_stars,
               "round_n": round_n}
        counter += 4
        rows.append(row)

data = pd.DataFrame(rows)
data.to_csv("beachvb.csv", index=False)
I am not really experienced in web scraping. I am self-taught and have just started, and I find the HTML source code quite messy and poorly structured.
I want to improve my code in two ways:
Include all the missing matches (country quota matches, semifinals, bronze medal, and gold medal) and the respective category for each match (country quota matches, pool, winner's bracket, semifinals, bronze medal, and gold medal).
Iterate the code over more years and tournaments from the dropdown menu at the top of the webpage.
I have tried to iterate through different years, but my code does not work:
tournament_years = {"FIVB 2015", "FIVB 2016"}
dfs = []
for year in tournament_years:
    # select desired tournament
    box_year = Select(driver.find_element_by_xpath("/html/body/table[3]/tbody/tr/td/table[1]/tbody/tr[1]/td[2]/select"))
    box_year.select_by_visible_text(year)
    box_matches = Select(driver.find_element_by_xpath("/html/body/table[3]/tbody/tr/td/table[1]/tbody/tr[2]/td[2]/select"))
    box_matches.select_by_visible_text("Matches")
The main idea was to create a list of DataFrames, one for each year and tournament, by adding a new loop at the beginning of the code; roughly, something like the sketch below.
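An untested sketch of that outer loop (scrape_current_tournament() is a hypothetical function that would wrap the parsing code above, and driver is the Chrome driver already created there):
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import Select

tournament_years = ["FIVB 2015", "FIVB 2016"]
dfs = []

for year in tournament_years:
    # select the desired year and the "Matches" view (same XPaths as above)
    box_year = Select(driver.find_element_by_xpath("/html/body/table[3]/tbody/tr/td/table[1]/tbody/tr[1]/td[2]/select"))
    box_year.select_by_visible_text(year)
    box_matches = Select(driver.find_element_by_xpath("/html/body/table[3]/tbody/tr/td/table[1]/tbody/tr[2]/td[2]/select"))
    box_matches.select_by_visible_text("Matches")
    time.sleep(2)  # crude wait for the page to reload

    # re-parse the freshly loaded page and keep one DataFrame per year
    soup = BeautifulSoup(driver.page_source, 'lxml')
    dfs.append(scrape_current_tournament(soup))  # hypothetical: the parsing above, wrapped in a function

data = pd.concat(dfs, ignore_index=True)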
If someone has a better idea or technique for this, it would be really appreciated!
I'm just a few hours into learning Python, so please go easy on me! I just want to scrape scores and scorers off a website, and I've been able to do that, but I'm only getting one scorer per team (if there is one!): when there are multiple goal scorers I only get the first. I think the problem is in the part under '# Home Scorers'.
My code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.skysports.com/football-results"

match_results = {}
match_details = {}
match_no = 0

response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')

matches = soup.find_all('div', {'class': 'fixres__item'})
for match in matches:
    try:
        match_url_get = match.find('a', {'class': 'matches__item matches__link'}).get('href')
        match_url = match_url_get if match_url_get else "unknown"
        event_id = match_url[-6:]
        match_response = requests.get(match_url)
        match_data = match_response.text
        match_soup = BeautifulSoup(match_data, 'html.parser')

        # Match Details
        match_date = match_soup.find('time', {'class': 'sdc-site-match-header__detail-time'}).text
        match_location = match_soup.find('span', {'class': 'sdc-site-match-header__detail-venue'}).text
        match_info = match_soup.find('p', {'class': 'sdc-site-match-header__detail-fixture'}).text

        # Home Score & Team
        home_details = match_soup.find_all('span', {'class': 'sdc-site-match-header__team-name sdc-site-match-header__team-name--home'})
        for home_detail in home_details:
            home_team = home_detail.find('span', {'class': 'sdc-site-match-header__team-name-block-target'}).text
            home_score_get = match_soup.find('span', {'class': 'sdc-site-match-header__team-score-block', 'data-update': 'score-home'})
            home_score = home_score_get.text if home_score_get else "none"

        # Home Scorers
        home_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-home'})
        for home_scorer_detail in home_scorer_details:
            goal_scorer_get = home_scorer_detail.find('li', {'class': 'sdc-site-match-header__team-synopsis-line'})
            goal_scorer = goal_scorer_get.text if goal_scorer_get else "none"
            goal_score_minute_get = home_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
            goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"

        # Away Score & Team
        away_details = match_soup.find_all('span', {'class': 'sdc-site-match-header__team-name sdc-site-match-header__team-name--away'})
        for away_detail in away_details:
            away_team = away_detail.find('span', {'class': 'sdc-site-match-header__team-name-block-target'}).text
            away_score_get = match_soup.find('span', {'class': 'sdc-site-match-header__team-score-block', 'data-update': 'score-away'})
            away_score = away_score_get.text if away_score_get else "none"

        # Away Scorers
        away_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-away'})
        for away_scorer_detail in away_scorer_details:
            away_goal_scorer_get = away_scorer_detail.find('li', {'class': 'sdc-site-match-header__team-synopsis-line'})
            away_goal_scorer = away_goal_scorer_get.text if away_goal_scorer_get else "none"
            away_goal_score_minute_get = away_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
            away_goal_score_minute = away_goal_score_minute_get.text if away_goal_score_minute_get else "none"

        print("Match: ", event_id, "Match Date:", match_date, "Match Location:", match_location, "Match Info:", match_info, "\nResult: ", home_team, home_score, away_team, away_score)
        print("Home Scorer:", goal_scorer, "Minute:", goal_score_minute, "\nAway Scorer:", away_goal_scorer, "Minute:", away_goal_score_minute)
        print(match_date)
    except:
        pass

    match_no += 1
    match_results[match_no] = [event_id, home_team, home_score, away_team, away_score, match_url, match_date, match_location, match_info]
    match_details[match_no] = [event_id, goal_scorer, goal_score_minute, away_goal_scorer, away_goal_score_minute]

Period = "2021-22"
print("Total Matches: ", match_no)

match_results = pd.DataFrame.from_dict(match_results, orient='index', columns=['Event_ID:', 'Home Team:', 'Home Score:', 'Away Team:', 'Away Score:', 'Link:', 'Match Date:', 'Match Location:', 'Match Info:'])
match_results.to_csv("Python/FL/Premier League Results (SkySports.com) " + Period + ".csv")
match_details = pd.DataFrame.from_dict(match_details, orient='index', columns=['Event_ID:', 'Home Goal:', 'Home Goal Minute:', 'Away Goal:', 'Away Goal Minute:'])
match_details.to_csv("Python/FL/Premier League Details (SkySports.com) " + Period + ".csv")
So the bit that's not working correctly is:
# Home Scorers
home_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-home'})
for home_scorer_detail in home_scorer_details:
    goal_scorer_get = home_scorer_detail.find('li', {'class': 'sdc-site-match-header__team-synopsis-line'})
    goal_scorer = goal_scorer_get.text if goal_scorer_get else "none"
    goal_score_minute_get = home_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
    goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"
Any ideas how I can return multiple rows for that bit?!
Thanks in advance :)
home_scorer_details only has 1 item, the unordered list itself.
To get all the scorers you need to get the items in that list.
The following code, which is pretty rough, will create a list of dictionaries where each dictionary has the name of the scorer and the minute(s) they scored.
You could use similar code to get all the away scorers.
Like I said, this code is rough and needs refining, but it should give you a start.
# Home Scorers
home_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-home'})
home_scorers = []

for home_scorer_detail in home_scorer_details[0].find_all('li'):
    goal_scorer = home_scorer_detail.text
    goal_score_minute_get = home_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
    goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"
    home_scorers.append({'scorer': goal_scorer, 'minute': goal_score_minute})

print(home_scorers)
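For the away side, the same pattern should work with the synopsis-away list from your code (a sketch along the same lines, untested against the live page):
# Away Scorers (same approach, using the synopsis-away list)
away_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-away'})
away_scorers = []

for away_scorer_detail in away_scorer_details[0].find_all('li'):
    away_goal_scorer = away_scorer_detail.text
    away_goal_score_minute_get = away_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
    away_goal_score_minute = away_goal_score_minute_get.text if away_goal_score_minute_get else "none"
    away_scorers.append({'scorer': away_goal_scorer, 'minute': away_goal_score_minute})

print(away_scorers)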
I am trying to scrape a web page to monitor the stock of some shoes, and the scraping part works as I want. The problem is the notifications: when a size comes into stock, the script sends me a Discord webhook with that size, but it then keeps sending webhooks on every check for as long as the size stays in stock. What I want is for the webhook to fire only once, when the if condition is first met, and not again until that size goes out of stock and later comes back into stock. I hope I have explained myself well.
In short: the notification should fire once per restock, even if the condition keeps being true on later checks, and only fire again after the stock status has gone back to out of stock and then to in stock.
This is my code:
from bs4 import BeautifulSoup
from dhooks import Webhook, Embed
import requests
import pandas as pd
import logging
from json import loads
import time, datetime
import random
from requests.auth import HTTPProxyAuth
import multiprocessing
import re

headers = {
}

def monitor2(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    marca = soup.find("h3", {"class": "OEhtt9 ka2E9k uMhVZi uc9Eq5 pVrzNP _5Yd-hZ"}).text
    nombre = soup.find("h1", {"class": "OEhtt9 ka2E9k uMhVZi z-oVg8 pVrzNP w5w9i_ _1PY7tW _9YcI4f"}).text
    color = soup.find("span", {"class": "u-6V88 ka2E9k uMhVZi dgII7d z-oVg8 pVrzNP"}).text
    precio = soup.find("span", {"class": "uqkIZw ka2E9k uMhVZi FxZV-M z-oVg8 pVrzNP"}).text
    talla = soup.find("span", {"class": "u-6V88 ka2E9k uMhVZi FxZV-M z-oVg8 pVrzNP"}).text
    imagen = soup.find("img", {"class": "_6uf91T z-oVg8 u-6V88 ka2E9k uMhVZi FxZV-M _2Pvyxl JT3_zV EKabf7 mo6ZnF _1RurXL mo6ZnF PZ5eVw"})['src']
    api = 'https://api.silverpings.eu/zalando?skuid='

    # Sizes
    tallas = soup.find_all(re.compile("script"))[15]
    tallas2 = re.findall(r'size":.....', str(tallas))
    tallas3 = str(tallas2).replace('size":"', "").replace('"', "").replace(']', "").replace("'", "").replace(",", "").replace("[Te", "").replace("r", "").replace("tall", "").replace("¿Cuá", "").replace("}", "").split()
    tallas3 = sorted(list(set(tallas3)))

    # SKUs
    skus = soup.find_all(re.compile("script"))[15]
    skus2 = re.findall(r"sku.......................", str(skus))
    skus3 = str(skus2).replace('sku":"', "").replace("'", "").replace("'", "").replace("[", "").replace("]", "").replace("silh", "").replace("uri", "").replace(",", "").replace(" ", "")
    skus4 = str(skus3).strip()
    skus5 = re.findall(r".........-...\d......", str(skus4))

    # Availability
    disponibilidad = soup.find_all(re.compile("script"))[15]
    disponibilidad2 = re.findall(r'quantity":.............', str(disponibilidad))
    disponibilidad3 = str(disponibilidad2).replace('quantity":"', "").replace('"', "").replace(']', "").replace("'", "").replace(",", "").replace("[", "").replace("r", "").replace("tall", "").replace("¿Cuá", "").replace("}", "").split()

    print("[", datetime.datetime.now().hour, ":", datetime.datetime.now().minute, ":", datetime.datetime.now().second, ":", datetime.datetime.now().microsecond, "]", " Comprobando disponibilidad de: (" + nombre + ")")

    numero = 0
    while numero <= len(tallas3):
        if not "OUT_OF_STOCK" in disponibilidad3[numero]:
            hook = Webhook('')
            embed = Embed(
                color=15105570,
                timestamp='now'  # sets the timestamp to current time
            )
            embed.url = (url)
            embed.title = (nombre + " " + color)
            embed.add_field(name='Talla', value="[" + tallas3[numero] + "](" + api + skus5[numero] + ")")
            embed.add_field(name='Precio', value=precio)
            embed.add_field(name='Useful Links', value="[Checkout](https://www.zalando.es/checkout/confirm)", inline=False)
            embed.set_footer(text='Zalando by SilverPings', icon_url="https://assets.stickpng.com/thumbs/5a32a860cb9a85480a628f95.png")
            embed.set_thumbnail(imagen)
            hook.send(embed=embed)
            print("[", datetime.datetime.now().hour, ":", datetime.datetime.now().minute, ":", datetime.datetime.now().second, "]", " Stock encontrado: (" + nombre + " - " + tallas3[numero] + ")")
            numero = numero + 1
        else:
            numero = numero + 1
        if numero == len(tallas3):
            break

def gymred():
    url = 'https://www.zalando.es/nike-sportswear-air-force-1-07-zapatillas-light-bonewhite-ni112o0h3-a12.html'
    monitor2(url)

while True:
    gymred()
You can create a boolean flag that becomes True when you send the webhook, and then require that flag to be False before sending another one. The code would be something like this:
if not "OUT_OF_STOCK" in disponibilidad3[numero] and not enviado:
enviado = True
hook = Webhook('')
embed = Embed(
color=15105570,
timestamp='now' # sets the timestamp to current time
)
embed.url = (url)
embed.title = (nombre+" " + color)
embed.add_field(name='Talla', value="["+tallas3[numero]+"]("+api+skus5[numero]+")")
embed.add_field(name='Precio', value=precio)
embed.add_field(name='Useful Links', value="[Checkout](https://www.zalando.es/checkout/confirm)", inline=False)
embed.set_footer(text='Zalando by SilverPings', icon_url="https://assets.stickpng.com/thumbs/5a32a860cb9a85480a628f95.png")
embed.set_thumbnail(imagen)
hook.send(embed=embed)
print("[",datetime.datetime.now().hour,":",datetime.datetime.now().minute,":",datetime.datetime.now().second,"]", " Stock encontrado: ("+nombre + " - " + tallas3[numero]+")")
numero = numero+1
else:
numero = numero+1
if numero == len(tallas3):
break
Then you would reset this boolean back to False when the size goes out of stock again, so that a future restock can trigger another webhook, but I will leave that part to you.
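As a rough sketch of how that reset could look, assuming the tallas3 and disponibilidad3 lists from your code, a hypothetical send_webhook(talla) helper that wraps your embed-building block, and a notified set kept at module level so it survives between checks:
notified = set()  # sizes we have already alerted on; keep this outside monitor2

def check_sizes(tallas3, disponibilidad3, send_webhook):
    for talla, estado in zip(tallas3, disponibilidad3):
        if "OUT_OF_STOCK" not in estado:
            if talla not in notified:
                send_webhook(talla)       # fire only on the first check that finds stock
                notified.add(talla)
        else:
            notified.discard(talla)       # back to out of stock: allow a future webhook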
I have the following view function used to scrape data:
def results(request):
    if request.method == 'POST':
        form = RoomForm(request.POST)
        if form.is_valid():
            form_city = form.cleaned_data['city'].title()
            form_country = form.cleaned_data['country'].title()
            form_arrival_date = form.cleaned_data['arrival_date']
            form_departure_date = form.cleaned_data['departure_date']
            form_pages_to_scrape = form.cleaned_data['pages_to_scrape']

            # launch scraper
            scraper = AIRBNB_scraper(city=form_city, country=form_country, arrival_date=str(form_arrival_date), departure_date=str(form_departure_date))
            scraped_dataframe = scraper.scrape_multiple_pages(last_page_selector_number=form_pages_to_scrape)
            scraped_dataframe_sorted = scraped_dataframe.sort_values('prices')
            print(scraped_dataframe_sorted)

            # convert scraped dataframe into lists
            prices = scraped_dataframe_sorted['prices'].tolist()
            listings_links = scraped_dataframe_sorted['listings_links'].tolist()
            listings_names = scraped_dataframe_sorted['listings_names'].tolist()
            photo_links = scraped_dataframe_sorted['photo_links'].tolist()

            dictionary = zip(prices, listings_links, listings_names, photo_links)
            context = {'dictionary': dictionary}
            return render(request, 'javascript/results.html', context)
On form submit, a post request is sent to this function using AJAX:
var frm = $('#login-form');
frm.submit(function () {
    $.ajax({
        type: "POST",
        url: "/results",
        data: frm.serialize(),
        success: function (data) {
            $("#table").html(data);
            $('#go_back').remove();
        },
        error: function (data) {
            $("#table").html("Something went wrong!");
        }
    });
    return false;
});
After that, the scraped data is displayed as an HTML table on the same page the form is on.
The problem is that the number of scraped items doubles every time the form is submitted. For example, if the first button click scrapes sixteen items, the output will be 16, but on the second run it will be 32, then 64, and so on.
It is as if the app remembers previous form submits, but I don't see any reason why. At the end of this function I tried clearing the pandas DataFrame used to store the scraped data, and also the dictionary passed as context, but to no avail.
The form is:
class RoomForm(forms.Form):
    city = forms.CharField(max_length=100)
    country = forms.CharField(max_length=100)
    arrival_date = forms.DateField(widget=forms.DateInput(attrs={'class': 'datepicker'}), required=False)
    departure_date = forms.DateField(widget=forms.DateInput(attrs={'class': 'datepicker'}), required=False)
    pages_to_scrape = forms.IntegerField(label='Pages to scrape (max. 17)', min_value=0, max_value=17, widget=forms.NumberInput(attrs={'style': 'width: 188px'}))
AIRBNB_scraper is:
import requests, bs4
import re
import pandas as pd

price_pattern = re.compile(r'\d*\s*?,?\s*?\d*\szł')
photo_link_pattern = re.compile(r'https.*\)')

prices = []
listings_links = []
photo_links = []
listings_names = []

class AIRBNB_scraper():
    def __init__(self, city, country, accomodation_type='homes', arrival_date='2018-03-25', departure_date='2018-04-10'):
        self.city = city
        self.country = country
        self.arrival_date = arrival_date
        self.departure_date = departure_date
        self.accomodation_type = accomodation_type

    def make_soup(self, page_number):
        url = 'https://www.airbnb.pl/s/' + self.city + '--' + self.country + '/' + self.accomodation_type + '?query=' + self.city + '%2C%20' + self.country + '&refinement_paths%5B%5D=%2F' + self.accomodation_type + '&checkin=' + self.arrival_date + '&checkout=' + self.departure_date + '&section_offset=' + str(page_number)
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        return soup

    def get_listings(self, page_number):
        soup = self.make_soup(page_number)
        listings = soup.select('._f21qs6')
        number_of_listings = len(listings)
        print('\n' + "Number of listings found: " + str(number_of_listings))
        while number_of_listings != 18:
            print('\n' + str(number_of_listings) + ' is not the correct number of listings, it should be 18. Trying again now.')
            soup = self.make_soup(page_number)
            listings = soup.find_all('div', class_='_f21qs6')
            number_of_listings = len(listings)
        print('\n' + "All fine! The number of listings is: " + str(number_of_listings) + '. Starting scraping now')
        return listings

    def scrape_listings_per_page(self, page_number):
        listings_to_scrape = self.get_listings(page_number)
        for listing in listings_to_scrape:
            # get price
            price_container = listing.find_all('span', class_='_hylizj6')
            price_search = re.search(price_pattern, str(price_container))
            price = price_search.group()
            # get listing_link
            listing_link = 'https://www.airbnb.pl' + listing.find('a', class_='_15ns6vh')['href']
            # get photo_link
            photo_link_node = listing.find('div', class_="_1df8dftk")['style']
            photo_link_search = re.search(photo_link_pattern, str(photo_link_node))
            photo_link_before_strip = photo_link_search.group()
            photo_link = photo_link_before_strip[:-1]  # remove ") at the end of the link
            # get listing_name
            listing_name = listing.find('div', class_='_1rths372').text
            # append to the module-level lists
            prices.append(price)
            listings_links.append(listing_link)
            photo_links.append(photo_link)
            listings_names.append(listing_name)

    def scrape_multiple_pages(self, last_page_selector_number):
        last_page_selector_number += 1
        for x in range(0, last_page_selector_number):  # 18
            self.scrape_listings_per_page(x)
            print('\n' + "INDEX OF PAGE BEING SCRAPED: " + str(x))
        scraped_data = pd.DataFrame({'prices': prices,
                                     'listings_links': listings_links,
                                     'photo_links': photo_links,
                                     'listings_names': listings_names})
        return scraped_data
You have module-level variables: prices, listings_links, etc. You append to these inside your AIRBNB_scraper instance but they are not part of that instance, and will persist between calls. You should make them instance attributes - define them as self.prices etc in the __init__ method.
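A minimal sketch of that change, keeping only the parts of the class that need to move (the body of scrape_listings_per_page is elided here):
import pandas as pd

class AIRBNB_scraper():
    def __init__(self, city, country, accomodation_type='homes', arrival_date='2018-03-25', departure_date='2018-04-10'):
        self.city = city
        self.country = country
        self.arrival_date = arrival_date
        self.departure_date = departure_date
        self.accomodation_type = accomodation_type
        # Fresh lists for every scraper instance, so results no longer
        # accumulate across form submissions.
        self.prices = []
        self.listings_links = []
        self.photo_links = []
        self.listings_names = []

    def scrape_listings_per_page(self, page_number):
        # ...same parsing as before, but append to the instance attributes:
        # self.prices.append(price), self.listings_links.append(listing_link),
        # self.photo_links.append(photo_link), self.listings_names.append(listing_name)
        pass

    def scrape_multiple_pages(self, last_page_selector_number):
        for x in range(0, last_page_selector_number + 1):
            self.scrape_listings_per_page(x)
        return pd.DataFrame({'prices': self.prices,
                             'listings_links': self.listings_links,
                             'photo_links': self.photo_links,
                             'listings_names': self.listings_names})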
I am trying to parse data from a website by inserting the data into a list, but the list comes back empty.
url =("http://www.releasechimps.org/resources/publication/whos-there-md- anderson")
http = urllib3.PoolManager()
r = http.request('Get',url)
soup = BeautifulSoup(r.data,"html.parser")
#print(r.data)
loop = re.findall(r'<td>(.*?)</td>',str(r.data))
#print(str(loop))
newLoop = str(loop)
#print(newLoop)
for x in range(1229):
if "\\n\\t\\t\\t\\t" in loop[x]:
loop[x] = loop[x].replace("\\n\\t\\t\\t\\t","")
list0_v2.append(str(loop[x]))
print(loop[x])
print(str(list0_v2))
Edit: Didn't really have anything else going on, so I made your data format into a nice list of dictionaries. There's a weird <td height="26"> on monkey 111, so I had to change the regex slightly.
Hope this helps you, I did it cause I care about the monkeys man.
import html
import re
import urllib.request

list0_v2 = []
final_list = []

url = "http://www.releasechimps.org/resources/publication/whos-there-md-anderson"
data = urllib.request.urlopen(url).read()

loop = re.findall(r'<td.*?>(.*?)</td>', str(data))

for item in loop:
    if "\\n\\t\\t\\t\\t" in item or "em>" in item:
        item = item.replace("\\n\\t\\t\\t\\t", "").replace("<em>", "").replace("</em>", "")
    if " " == item:
        continue
    list0_v2.append(item)

n = 1
while len(list0_v2) != 0:
    form = {"n": 0, "name": "", "id": "", "gender": "", "birthdate": "", "notes": ""}
    try:
        if list0_v2[5][-1] == '.':
            numb, name, ids, gender, birthdate, notes = list0_v2[0:6]
            form["notes"] = notes
            del(list0_v2[0:6])
        else:
            raise Exception('foo')
    except:
        numb, name, ids, gender, birthdate = list0_v2[0:5]
        del(list0_v2[0:5])
    form["n"] = int(numb)
    form["name"] = html.unescape(name)
    form["id"] = ids
    form["gender"] = gender
    form["birthdate"] = birthdate
    final_list.append(form)
    n += 1

for li in final_list:
    print("{:3} {:10} {:10} {:3} {:10} {}".format(li["n"], li["name"], li["id"],
                                                  li["gender"], li["birthdate"], li["notes"]))