I get this error on the last line of my code. If anyone has encountered the same problem, I'd be glad if you could share how you solved it.
The source code is Telethon based and is complete. It runs successfully, but when it tries to send the response to the user it raises the UnboundLocalError.
The code:
@client.on(events.NewMessage(incoming=True, from_users=(723428565, 677543378)))
async def _(event):
    if event.fwd_from:
        return
    url = "http://www.google.com"
    if event.reply_to_msg_id and "allow" in event.raw_text:
        previous_message = await event.get_reply_message()
        previous_message_text = previous_message.message
        if previous_message.media:
            downloaded_file_name = await client.download_media(
                previous_message,
                path,
            )
            surl = "{}/searchbyimage/upload".format(url)
            multipart = {
                "encoded_image": (
                    downloaded_file_name,
                    open(downloaded_file_name, "rb"),
                ),
                "image_content": "",
            }
            google_rs_response = requests.post(
                surl, files=multipart, allow_redirects=False
            )
            the_location = google_rs_response.headers.get("Location")
            os.remove(downloaded_file_name)
        else:
            previous_message_text = previous_message.message
            surl = "{}/searchbyimage?image_url={}"
            request_url = surl.format(url, previous_message_text)
            google_rs_response = requests.get(request_url, allow_redirects=False)
            the_location = google_rs_response.headers.get("Location")
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0"
        }
        response = requests.get(the_location, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        bro = soup.find_all("div", {"class": "r5a77d"})[0]
        lol = bro.find("a")
        url + lol.get("href")
        final = lol.text
    await event.edit(
        event.chat_id, final.replace("me", "")
    )
Error:
Line 42: UnboundLocalError: local variable 'final' referenced before assignment
You are assigning the variable final = lol.text inside the if event.reply_to_msg_id and "allow" in event.raw_text: block.
So it looks like that condition wasn't met and final was never assigned, and when you then tried to use it in await event.edit(event.chat_id, final.replace("me", "")) you got the error.
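One simple way to avoid this is to leave the handler as soon as the trigger conditions fail, so the code that assigns final and the code that uses it always run together. A minimal sketch of that guard, assuming client is the already-connected Telethon client from your script (the placeholder value for final just stands in for the reverse-image-search result):

from telethon import events

@client.on(events.NewMessage(incoming=True, from_users=(723428565, 677543378)))
async def handler(event):
    if event.fwd_from:
        return
    # Bail out early when the message is not a reply containing "allow",
    # so nothing below can reference variables that were never assigned.
    if not (event.reply_to_msg_id and "allow" in event.raw_text):
        return
    final = "placeholder result"  # your scraping code would set this
    await event.edit(final.replace("me", ""))

Alternatively, you can assign final = None before the if block and only call event.edit() when final is not None.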
Related
I'm new to Python, using the latest version. I found a nice script that runs searches on Presearch.com, but I keep getting this error:
TypeError: 'NoneType' object is not subscriptable, pointing at:
line 13, in <module>
    token = soup.find("input", {
How do I fix this? Here's the code:
import time
import requests
from bs4 import BeautifulSoup
import random

email = "Enter your email"
password = "Enter your password"

r = requests.Session()
content = r.get("https://www.presearch.org").content
soup = BeautifulSoup(content, 'html.parser')
token = soup.find("input", {
    "name": "_token"
})["value"]
payload = "_token={}&login_form=1&email={}&password={}".format(token, email, password)
headers = {
    'Content-Type': 'application/x-www-form-urlencoded'
}
login = r.post("https://www.presearch.org/api/auth/login", data=payload, headers=headers)

for x in range(0, 10):
    words = random.choice(
        ["apple", "life", "hacker", "facebook", "abeyancies", "abeyancy", "abeyant", "abfarad", "abfarads", "abhenries",
         "abhenry", "abhenrys", "abhominable", "abhor", "abhorred", "abhorrence", "abhorrences", "abhorrencies",
         "abhorrency", "abhorrent", "abhorrently", "abhorrer", "abhorrers", "abhorring", "abhorrings", "abhors", "abid",
         "abidance", "abidances", "abidden", "abide", "abided", "abider", "abiders", "abides", "abiding", "abidingly",
         "abidings", "abies", "abietic", "abigail", "abigails", "abilities", "ability", "abiogeneses", "abiogenesis",
         "abiogenetic", "abiogenetically", "abiogenic", "abiogenically", "abiogenist", "abiogenists", "abiological",
         "abioses", "abiosis", "abiotic", "abiotically", "abiotrophic", "abiotrophies", "abiotrophy"])
    payload = "term={}&provider_id=98&_token={}".format(words, token)
    r.post("https://www.presearch.org/search", data=payload, headers=headers)
    print("Term:{} Search done!".format(words))
    time.sleep(10)

r = r.get("https://www.presearch.org/")
soup = BeautifulSoup(r.content, 'html.parser')
balance = soup.find("span", {
    "class": "number ajax balance"
})
print("Your Balance: {} PRE".format(balance.text))
As pointed out, find() could not locate the input you want and returned None, but the ["value"] lookup then tries to read a key named value from None, as if find() had returned something dict-like. "Subscriptable" means something that supports indexing with square brackets, and None does not, so hopefully that makes the error clearer.
One way to avoid this error when an element could not be found would be:
token = soup.find("input", {
    "name": "_token"
})
if not token:
    # Do something if find() could not find the element
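Putting that together, a small sketch of how the token lookup could fail loudly instead of crashing with the TypeError (the exit message is just an example):

import requests
from bs4 import BeautifulSoup

r = requests.Session()
content = r.get("https://www.presearch.org").content
soup = BeautifulSoup(content, "html.parser")

token_input = soup.find("input", {"name": "_token"})
if token_input is None:
    # The page layout changed, the request was blocked, or the login form
    # was not returned; either way there is no token to extract.
    raise SystemExit("Could not find the _token input on the page")

token = token_input["value"]
print("Got token:", token)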
I've been trying to make an auto-registration script for a certain website (just to improve my requests skills). The registration form is shown below:
(screenshot of the registration form)
It's in Russian, but it only asks you to enter a username and email, solve the captcha from the image, and solve an easy math equation.
Here is a cURL capture of the successful POST request made when submitting the form manually.
As you can see, the successful request gets a 302 status code, but with my script the status code is 200 and I can't find the reason for it. My code is below:
def register(username=None, email=None, email_pass=None):
    with HTMLSession() as session:
        params = {"action": "register"}
        r = session.get("https://gidonline.io/wp-login.php", params=params)
        # with open("gidonlineForm.html", "w", encoding="utf-8") as f:
        #     f.write(r.text)
        if int(r.status_code) != 200:
            print("Something went wrong")
            return None
        r.html.render()
        soup = BeautifulSoup(r.text, "lxml")
        imageLink = soup.find("img", {"alt": "captcha image"})["src"]
        print(imageLink)
        imageBytes = session.get(imageLink).content
        imageBase64 = base64.b64encode(imageBytes)
        captchaKey = solve_captcha(imageBase64)
        print(captchaKey)
        question = soup.find("input", {"name": "math", "id": "math"}).find_parent("label").text
        expString = question.split(":")[-1].strip()
        expressionResult = solve_math(expString)
        encodedStr = soup.find("p", {"id": "sabre_spectre"}).find("input")["name"]
        sabre_js_check1 = soup.find("input", {"id": "sabre_js_check1"})["value"]
        sabre_js_check2 = soup.find("input", {"id": "sabre_js_check2"})["value"]
        sabre_id = soup.find("input", {"id": "sabre_id"})["value"]
        sabre_js_payload = r.html.find('#sabre_js_payload', first=True).attrs['value']
        params = {
            'action': 'register',
        }
        data = {
            'user_login': username,
            'user_email': email,
            'captcha': str(captchaKey),
            'math': str(expressionResult),
            encodedStr: "",
            "sabre_js_check1": sabre_js_check1,
            "sabre_js_check2": sabre_js_check2,
            "sabre_js_payload": sabre_js_payload,
            "sabre_id": sabre_id,
            'wp-submit': 'Регистрация'
        }
        r = session.post('https://gidonline.io/wp-login.php', params=params, data=data, allow_redirects=True)
        if int(r.status_code) == 200:
            print("Ooops...200 status code")
            print(r.html.find("#login_error", first=True).text)
            return None
        else:
            print(f"Status code: {r.status_code}")
            with open("success.html", "w", encoding='utf-8') as f:
                f.write(r.text)
I can easily provide any additional info if needed, thank you all!
I am scraping the website craigslist.com, but after a certain number of requests it keeps blocking my device. I tried the solution from Proxies with Python 'Requests' module but didn't understand how to specify the headers every time. Here's the code:
from bs4 import BeautifulSoup
import requests, json

list_of_tuples_with_given_zipcodes = []
id_of_apartments = []

params = {
    'sort': 'dd',
    'filter': 'reviews-dd',
    'res_id': 18439027
}

http_proxy = "http://10.10.1.10:3128"
https_proxy = "https://10.10.1.11:1080"
ftp_proxy = "ftp://10.10.1.10:3128"

proxies = {
    "http": http_proxy,
    "https": https_proxy,
    "ftp": ftp_proxy
}

for i in range(1, 30):
    content = requests.get('https://losangeles.craigslist.org/search/apa?s = ' + str(i), params=params)  # https://losangeles.craigslist.org/search/apa?s=120
    # content = requests.get('https://www.zillow.com/homes/for_rent/')
    soup = BeautifulSoup(content.content, 'html.parser')
    my_anchors = list(soup.find_all("a", {"class": "result-image gallery"}))
    for index, each_anchor_tag in enumerate(my_anchors):
        URL_to_look_for_zipcode = soup.find_all("a", {"class": "result-title"})  # taking set so that a page is not visited twice.
        for each_href in URL_to_look_for_zipcode:
            content_href = requests.get(each_href['href'])  # script id="ld_posting_data" type="application/ld+json">
            # print(each_href['href'])
            soup_href = BeautifulSoup(content_href.content, 'html.parser')
            my_script_tags = soup_href.find("script", {"id": "ld_posting_data"})
            # for each_tag in my_script_tags:
            if my_script_tags:
                res = json.loads(str(list(my_script_tags)[0]))
                if res and 'address' in list(res.keys()):
                    if res['address']['postalCode'] == "90012":  # use the input zipcode entered by the user.
                        list_of_tuples_with_given_zipcodes.append(each_href['href'])
I am still not sure about the value of the http_proxy variable. I specified it exactly as it was given, but should it be the IP address of my device mapped to a localhost port number? Craigslist still keeps blocking my requests.
Please help.
Requests' get() method lets you specify the proxies to use on a per-call basis:
r = requests.get(url, headers=headers, proxies=proxies)
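For the loop in the question that means passing the proxies dict into every requests.get() call. A minimal sketch, assuming you have a working proxy to point at (the addresses below are the placeholder values from the question, not real servers):

import requests

# Placeholder proxy addresses from the question; replace them with a proxy
# you actually control or rent. Your own device's IP will not help here.
proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "https://10.10.1.11:1080",
}
headers = {"User-Agent": "Mozilla/5.0"}

for i in range(1, 30):
    response = requests.get(
        "https://losangeles.craigslist.org/search/apa",
        params={"s": i},      # page offset as a query parameter
        headers=headers,
        proxies=proxies,      # routed through the proxy on every call
        timeout=30,
    )
    print(i, response.status_code)

If the proxy itself is unreachable, requests raises a ProxyError instead of returning a blocked page, which makes the two failure modes easier to tell apart.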
I want to scrape the title and the URL of each posting in the forum at the URL below, so that when a new post is created with one of the titles below I receive a mail with the link to that post.
Please don't be too harsh with me, I'm a beginner with Python and scraping.
I have multiple problems:
1: In the while(True) loop, "soup" is underlined in red with the error: Undefined variable 'soup'.
2: When I comment out the while(True) loop, the program does not run at all, and I get no error.
3: When there is a new posting matching one of my criteria, how do I get the URL of that post?
Titles:
def Jeti_DC_16
def Jeti_DC_16_v2
def Jeti_DS_16
def Jeti_DS16_v2
My full code:
from requests import get
from bs4 import BeautifulSoup
import re
import smtplib
import time
import lxml
import pprint
import json

URL = 'https://www.rc-network.de/forums/biete-rc-elektronik-zubeh%C3%B6r.135/'


def scrape_page_metadata(URL):
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
    pp = pprint.PrettyPrinter(indent=4)
    response = get(URL, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    metadata = {
        'Jeti_DC_16': Jeti_DC_16(soup, URL),
        'jeti_dc_16_2': Jeti_DC_16_v2(soup, URL),
        'jeti_ds_16': Jeti_DS_16(soup, URL),
        'jeti_ds_16_2': Jeti_DS_16_v2(soup, URL)
    }
    pp.pprint(metadata)
    return metadata


def Jeti_DC_16(soup, URL):
    jeti_dc_16 = None
    if soup.name.string:
        jeti_dc_16 = soup.title.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_dc_16 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        jeti_dc_16 = URL.split('//')[1]
        return jeti_dc_16.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_dc_16


def Jeti_DC_16_v2(soup, URL):
    jeti_dc_16_v2 = None
    if soup.name.string:
        jeti_dc_16_v2 = soup.title.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_dc_16_v2 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        jeti_dc_16_v2 = URL.split('//')[1]
        return jeti_dc_16_v2.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_dc_16_v2


def Jeti_DS_16(soup, URL):
    jeti_ds_16 = None
    if soup.jeti_ds_16.string:
        jeti_ds_16 = soup.jeti_ds_16.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_ds_16 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        jeti_ds_16 = URL.split('//')[1]
        return jeti_ds_16.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_ds_16


def Jeti_DS_16_v2(soup, URL):
    jeti_ds_16_v2 = None
    if soup.name.string:
        jeti_ds_16_v2 = soup.title.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_ds_16_v2 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        jeti_dc_16_v2 = URL.split('//')[1]
        return jeti_dc_16_v2.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_ds_16_v2


# search_for_class = soup.find_all(
#     'div', class_='structItem-title')
# Jeti_DS_16 = soup.find_all(text="Jeti DS 16")
# Jeti_DS_16_v2 = soup.find_all(text="Jeti DS 16 2")
# Jeti_DC_16 = soup.find_all(text="Jeti DC 16")
# Jeti_DC_16_v2 = soup.find_all(text="Jeti DC 16 2")

if(Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2):
    send_mail()
    # # print('Die Nummer {0} {1} {2} {3} wurden gezogen'.format(
    # #     Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2))
    # for i in soup.find_all('div', attrs={'class': 'structItem-title'}):
    #     print(i.a['href'])
    # first_result = search_for_class[2]
    # print(first_result.text)
    # print(Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2)


def send_mail():
    with open('/Users/blackbox/Desktop/SynologyDrive/Programmieren/rc-network/credentials.json', 'r') as myFile:
        data = myFile.read()
        obj = json.loads(data)
        print("test: " + str(obj['passwd']))
    server_ssl = smtplib.SMTP_SSL('smtp.gmail.com', 465)
    server_ssl.ehlo()
    # server.starttls()
    # server.ehlo()
    server_ssl.login('secure#gmail.com', 'secure')
    subject = 'Es gibt ein neuer Post im RC-Network auf deine gespeicherte Anfragen. Sieh in dir an{Link to Post}'
    body = 'Sieh es dir an Link: https://www.rc-network.de/forums/biete-rc-elektronik-zubeh%C3%B6r.135/'
    msg = f"Subject: {subject}\n\n{body}"
    emails = ["secure#gmx.de"]
    server_ssl.sendmail(
        'secure#gmail.com',
        emails,
        msg
    )
    print('e-Mail wurde versendet!')
    # server_ssl.quit


while(True):
    Jeti_DC_16(soup, URL)
    Jeti_DC_16_v2(soup, URL)
    Jeti_DS_16(soup, URL)
    Jeti_DS_16_v2(soup, URL)
    time.sleep(10)
    # time.sleep(86400)
You create soup inside scrape_page_metadata, so it is a local variable which doesn't exist outside scrape_page_metadata. In the while loop you should call scrape_page_metadata() instead of the functions Jeti_DC_16(), Jeti_DC_16_v2(), Jeti_DS_16(), Jeti_DS_16_v2().
That function gives you the metadata dict, which you should check instead of if(Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2).
More or less (you have to put the correct values in place of ... because I don't know what you want to compare against):
while True:
    metadata = scrape_page_metadata(URL)
    if metadata["Jeti_DC_16"] == ... and metadata["Jeti_DC_16_v2"] == ... and metadata["Jeti_DS_16"] == ... and metadata["Jeti_DS_16_v2"] == ...:
        send_mail()
    time.sleep(10)
But there are other problems.
All your functions Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2 look the same and probably return the same element. You could keep one of them and delete the others, or change them so that each one searches for a different element.
You will probably have to use more print() calls to see the values in variables and which parts of the code are executed, because this code still needs a lot of changes.
For example, find_all() gives a list of results, so you can't call get() on it, which works on a single element. You need a for loop to get the titles from all elements.
More or less:
jeti_ds_16_v2 = soup.find_all("div", class_='structItem-title')
jeti_ds_16_v2 = [item.get_text() for item in jeti_ds_16_v2]
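Putting the pieces together, a rough sketch of how the whole loop could look once the scraping happens in one place. The search terms and the matching rule are assumptions, because the question doesn't say exactly which title text should trigger the mail:

import time
import requests
from bs4 import BeautifulSoup

URL = 'https://www.rc-network.de/forums/biete-rc-elektronik-zubeh%C3%B6r.135/'
HEADERS = {"User-Agent": "Mozilla/5.0"}
SEARCH_TERMS = ["Jeti DC-16", "Jeti DS-16"]  # assumed search strings


def scrape_titles(url):
    # Fetch the forum page once and return (title, href) pairs for every thread.
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.content, "lxml")
    results = []
    for div in soup.find_all("div", class_="structItem-title"):
        link = div.find("a")
        if link:
            results.append((link.get_text(strip=True), link.get("href")))
    return results


while True:
    for title, href in scrape_titles(URL):
        if any(term.lower() in title.lower() for term in SEARCH_TERMS):
            print("Match:", title, href)  # this is where you would call send_mail()
    time.sleep(10)

The href values may be relative paths, so you might need to prepend https://www.rc-network.de before putting them into the mail.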
The battle to finish my first scraping script continues. I think I'm almost done, but I've hit a new roadblock.
The problem is that when I reach the last pagination page I get this error:
Traceback (most recent call last):
File "C:/Users/Andre/Desktop/scripts python/scrape_learn/ttc_quase.py", line 50, in <module>
url_tag = soup.find('li', {"id": "next-page-link"}).find('a')
AttributeError: 'NoneType' object has no attribute 'find'
I think the error is related to the way I'm finding url_tag, but I don't see any other way to grab the "next page" link. I tried a try/except approach, but when I apply it I only get the listings from the first page.
So I'm not sure what my next step should be. If someone could help, I would appreciate it.
My full code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://timetochoose.co.ao/?ct_keyword&ct_ct_status&ct_property_type&ct_beds&search-listings=true&ct_country=portugal&ct_state&ct_city&ct_price_to&ct_mls&lat&lng"
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

anuncios_ttc = {}
anuncios_nr = 0

while True:
    response = requests.get(url, headers=headers)
    print(response)
    data = response.text
    print(data)
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    anuncios = soup.find_all("div", {"class": "grid-listing-info"})
    for anuncios in anuncios:
        titles = anuncios.find("a", {"class": "listing-link"}).text
        location = anuncios.find("p", {"class": "location muted marB0"}).text
        link = anuncios.find("a", {"class": "listing-link"}).get("href")
        anuncios_response = requests.get(link, headers=headers)
        anuncios_data = anuncios_response.text
        anuncios_soup = BeautifulSoup(anuncios_data, 'html.parser')
        conteudo = anuncios_soup.find("div", {"id": "listing-content"}).text
        preco = anuncios_soup.find("span", {"class": "listing-price"})
        preco_imo = preco.text if preco else "N/A"
        quartos = anuncios_soup.find("li", {"class": "row beds"})
        nr_quartos = quartos.text if quartos else "N/A"
        wcs = anuncios_soup.find("li", {"class": "row baths"})
        nr_wcs = wcs.text if wcs else "N/A"
        tipo = anuncios_soup.find("li", {"class": "row property-type"})
        tipo_imo = tipo.text if tipo else "N/A"
        bairro = anuncios_soup.find("li", {"class": "row community"})
        bairro1 = bairro.text if bairro else "N/A"
        ref = anuncios_soup.find("li", {"class": "row propid"}).text
        anuncios_nr += 1
        anuncios_ttc[anuncios_nr] = [titles, location, bairro1, preco_imo, tipo_imo, nr_quartos, nr_wcs, conteudo, ref, link]
        print("Título", titles, "\nLocalização", location, "\nPreço", preco_imo, "\nLink", link, "\nReferencia", ref, "\nTipo", tipo_imo, "\nQuartos", nr_quartos, "\nWC", nr_wcs, "\nBairro", bairro1, "\nConteudo", conteudo)
    url_tag = soup.find('li', {"id": "next-page-link"}).find('a')
    if url_tag.get('href'):
        url = url_tag.get('href')
        print(url)
    else:
        break

print("Nr Total de Anuncios: ", anuncios_nr)

anuncios_ttc_df = pd.DataFrame.from_dict(anuncios_ttc, orient='index', columns=['Titulo', 'Localização', 'Bairro', 'Preço', 'Tipo', 'Quartos', 'WCs', 'Descrição', 'Referência', 'Ligação'])
anuncios_ttc_df.head()
anuncios_ttc_df.to_csv('ttc_python.csv')
The answer to this question ended up being provided in another thread, where I was trying to better identify the url_tag element.
With the help of @Andrej Kesely I was able to solve the problem with:
url_tag = soup.find('li', {"id": "next-page-link"})
if not url_tag:
    break
url = url_tag.find('a')['href']
Now the script runs to the end and generates the CSV file as intended.
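For reference, this is roughly how the fix slots into the paging loop: check for the "next page" element before calling .find('a') on it, and stop cleanly on the last page. A minimal sketch, with the listing-processing code elided:

from bs4 import BeautifulSoup
import requests

url = "https://timetochoose.co.ao/?ct_keyword&ct_ct_status&ct_property_type&ct_beds&search-listings=true&ct_country=portugal&ct_state&ct_city&ct_price_to&ct_mls&lat&lng"
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

while True:
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    # ... process the listings on the current page here ...
    url_tag = soup.find('li', {"id": "next-page-link"})
    if not url_tag or not url_tag.find('a'):
        break  # no "next page" link, so this was the last page
    url = url_tag.find('a')['href']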