I'm starting to work with Python again after 8 years. I'm trying to write a program with BeautifulSoup that takes an array argument. I pass the array argument medios to the count_words function, but it doesn't work. Is there a way to fix it, or to search for a word on multiple websites using BeautifulSoup?
import requests
from bs4 import BeautifulSoup

def count_words(url, the_word):
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')
    words = soup.find(text=lambda text: text and the_word in text)
    # print(words)
    return len(words)

def main():
    url = 'https://www.nytimes.com/'
    medios = {
        'Los Angeles Times': ['http://www.latimes.com/'],
        'New York Times': ['http://www.nytimes.com/']
    }
    word = 'Trump'
    #count = count_words(url, word)
    cuenta = count_words(medios, word)
    # print('\n El Sitio: {}\n Contiene {} occurrencias de la palabra: {}'.format(url, count, word))
    print('\n La palabra: {} aparece {} occurrencias en el New York Times'.format(word, cuenta))

if __name__ == '__main__':
    main()
There are three problems here:
medios is a dict, so you will have to loop through its keys and values and call the method once per URL, because the method only accepts a URL string.
soup.find(text=...) returns only the first matching text node, and calling len() on it gives the length of that string, not the number of occurrences. If you want to count how often the word appears, use count() on the page text instead.
You have to send a User-Agent header with the request, otherwise you will get a 403 or 301 response.
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}

def count_words(url, the_word):
    r = requests.get(url, headers=headers)
    return r.text.lower().count(the_word)

def main():
    url = 'https://www.nytimes.com/'
    medios = {
        'Los Angeles Times': ['http://www.latimes.com/'],
        'New York Times': ['http://www.nytimes.com/']
    }
    word = 'trump'
    for web_name, urls in medios.items():
        for url in urls:
            cuenta = count_words(url, word)
            print('La palabra: {} aparece {} occurrencias en el {}'.format(word, cuenta, web_name))

if __name__ == '__main__':
    main()
Output:
La palabra: trump aparece 47 occurrencias en el Los Angeles Times
La palabra: trump aparece 194 occurrencias en el New York Times
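Since the question specifically asks about BeautifulSoup: if you only want to count matches in the visible page text (rather than in the raw HTML, which also contains scripts and attribute values), a small variant of the function above could parse first and count on get_text(). This is just a sketch along the same lines, not re-tested against these sites:

import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0'}  # same idea as above: send a User-Agent to avoid 403/301

def count_words_visible(url, the_word):
    # parse the page and count occurrences only in the rendered text
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup.get_text().lower().count(the_word.lower())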
You are sending a dictionary to count_words(). You need to send the URLs one at a time in a loop, or else loop through the dictionary inside count_words().
Perhaps you meant:
cuenta = count_words(url, word)
Update your code to the following:
cuenta = 0
for key in medios:
    for url in medios[key]:
        cuenta += count_words(url, word)
Basically, you should pass a URL, not a dict; I am assuming you want to count the word across all the URLs in medios.
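Note that this snippet still depends on count_words() returning an occurrence count; with the original find(...)/len(...) body it would return the length of the first matching string instead. A minimal count_words consistent with the loop above (the User-Agent header is an assumption, added to avoid 403 responses) is essentially the same fix as in the previous answer:

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def count_words(url, the_word):
    # count occurrences of the word in the page source
    r = requests.get(url, headers=HEADERS)
    return r.text.lower().count(the_word.lower())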
I have a function that makes requests to a website. When I'm not specifying any proxies it works fine, but when I make requests through proxies it doesn't work.
I've tried this code:
def parseCookie(p):
    req = urllib.request.Request("xxxx")
    req.set_proxy(p, 'https')
    d = urllib.request.urlopen(req)
and I get proxies with this code:
class Proxy:
    def __init__(self):
        self.contenu = ["http://135.181.14.45:5959", "http://94.158.53.145:3128", "http://46.225.237.146:3128", "http://157.245.27.9:3128", "http://45.236.17.93:8085", "http://80.252.5.34:7001", "http://118.27.113.167:8080", "http://70.44.24.245:8888", "http://68.183.185.62:80", "http://68.183.185.62:80"]
        self.response = requests.get("https://free-proxy-list.net/")
        self.proxy_list = pd.read_html(self.response.text)[0]

    def enfiler(self, element):
        "rajoute un element au début de la liste"
        self.contenu.insert(0, element)

    def defiler(self):
        "retire le premier element de la liste"
        if len(self.contenu) >= 1:
            self.contenu.pop()
        else:
            print("Il n'y a pas d'élément dans ta liste")

    def validProxies(self):
        return self.contenu

    def getProxies(self):
        self.proxy_list["url"] = "http://" + self.proxy_list["IP Address"] + ":" + self.proxy_list["Port"].astype(str)
        https_proxies = self.proxy_list[self.proxy_list["Https"] == "yes"]
        good_proxies = 0
        for proxy_url in https_proxies["url"]:
            proxies = {
                "http": proxy_url,
                "https": proxy_url,
            }
            try:
                response = requests.get("https://httpbin.org/ip", headers=headers, proxies=proxies)
                self.enfiler(proxy_url)
                self.defiler()
                good_proxies += 1
                print(f"Proxy {proxy_url} OK, added to good proxy list")
            except Exception:
                pass
            if good_proxies >= 10:
                break
but it doesn't work.
I know there is a way to do this with the requests module, but I want to use urllib because I need to read the Set-Cookie headers. Please tell me why my code doesn't work and how I could fix it.
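For reference, a minimal sketch of routing a urllib request through a proxy and reading the Set-Cookie header; the target URL and proxy address are placeholders, and whether any given free proxy actually responds is not guaranteed:

import urllib.request

proxy = "http://135.181.14.45:5959"  # placeholder address from the list above
handler = urllib.request.ProxyHandler({"http": proxy, "https": proxy})
opener = urllib.request.build_opener(handler)

req = urllib.request.Request("https://httpbin.org/ip",
                             headers={"User-Agent": "Mozilla/5.0"})
try:
    with opener.open(req, timeout=10) as d:
        print(d.status)
        print(d.getheader("Set-Cookie"))  # None if the server sent no cookie
except Exception as exc:
    print("proxy failed:", exc)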
So, I need help here. This is my code:
results = []
import re

for i in popup_linkz:  # here I take N links like https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?qs=uEap3sWEgifS2G+m9xvYiA== and iterate through them for scraping
    url = i  # so right now I scrape the iterated urls
    response = requests.get(url)
    print('url:', response.url)
    #print('status:', response.status_code)
    soup = BeautifulSoup(response.content, "html.parser")
    results = []
    #json_res = json.loads(res.text)
    #print(json_res[0]['price'])
    item_1 = 'grvProducto_ctl02_lblCategoria'
    for line in soup.findAll('span', attrs={'id': 'grvProducto_ctl02_lblCategoria'}):
        results.append(line.text)
    # this actually gets the first code, but I don't know how to iterate for the others; it also doesn't store every code - when I print, they don't stack up, they show one at a time
    print('id', results)
I am trying to get data from this sample URL: https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?qs=uEap3sWEgifS2G+m9xvYiA==
In practice the loop iterates over anywhere from 2 to 10,000 of them.
There is more information on the page that I want to get but can't.
I am not sure how to use this
for line in soup.findAll('span', attrs={'id': 'grvProducto_ctl02_lblCategoria'}):
    results.append(line.text)
inside the same loop to get the other information (the data underlying the page).
Could you enlighten me please?
Try:
import requests
from bs4 import BeautifulSoup
url = "https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?qs=uEap3sWEgifS2G+m9xvYiA=="
soup = BeautifulSoup(requests.get(url).content, "html.parser")
licitation_number = soup.select_one("#lblNumLicitacion").text
responsable = soup.select_one("#lblResponsable").text
ficha = soup.select_one("#lblFicha2Reclamo").text
print(f"{licitation_number=}")
print(f"{responsable=}")
print(f"{ficha=}")
print("-" * 80)
for t in soup.select("#grvProducto .borde_tabla00"):
    categoria = t.select_one('[id$="lblCategoria"]').text
    candidad = t.select_one('[id$="lblCantidad"]').text
    descripction = t.select_one('[id$="lblDescripcion"]').text

    print(f"{categoria=} {candidad=}")
    print(f"{descripction=}")
    print()
Prints:
licitation_number='1549-5-LR22'
responsable='SERVICIO DE SALUD METROPOLITANA NORTE HOSPITAL SAN JOSE, Hospital San José'
ficha='107'
--------------------------------------------------------------------------------
categoria='42221501' candidad='130'
descripction='(226-2001) STENT CORONARIO DE CROMO COBALTO, LIBERADOR DE FÁRMACO EVEROLIMUS'
categoria='42221501' candidad='360'
descripction='(226-2002) STENT CORONARIO DE CROMO COBALTO, LIBERADOR DE FÁRMACO ZOTAROLIMUS'
categoria='42221501' candidad='120'
descripction='(226-2004) STENT CORONARIO DE CROMO COBALTO, LIBERADOR DE FÁRMACO SIROLIMUS, CON STRUT DE 0.80'
categoria='42221501' candidad='240'
descripction='(226-2003) STENT CORONARIO DE CROMO COBALTO, LIBERADOR DE FÁRMACO SIROLIMUS, CON STRUT DE 0.60'
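Since the question loops over anywhere from 2 to 10,000 of these detail pages, the same extraction can be wrapped in a loop. This is only a sketch: popup_linkz is assumed to be the list of detail-page URLs from the question, and error handling for pages where a selector finds nothing is omitted:

import requests
from bs4 import BeautifulSoup

rows = []
for url in popup_linkz:  # assumed: the list of detail-page URLs from the question
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    licitation_number = soup.select_one("#lblNumLicitacion").text
    for t in soup.select("#grvProducto .borde_tabla00"):
        rows.append({
            "url": url,
            "licitacion": licitation_number,
            "categoria": t.select_one('[id$="lblCategoria"]').text,
            "cantidad": t.select_one('[id$="lblCantidad"]').text,
            "descripcion": t.select_one('[id$="lblDescripcion"]').text,
        })
print(len(rows), "rows collected")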
Hello everyone, I'm trying to get all the href links with Python by using this:
import requests
from bs4 import BeautifulSoup
url = 'https://rappel.conso.gouv.fr'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
#Collecting links on rappel.gouv
def get_url(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links = url + item.find('a', {'class' : 'product-link'})['href']
    return links
soup = get_url(url)
print(extract(soup))
I'm supposed to get 10 links, as follows:
https://rappel.conso.gouv.fr/fiche-rappel/4571/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4572/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4573/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4575/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4569/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4565/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4568/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4570/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4567/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4558/Interne
It actually works when I put a print inside the loop, as follows:
def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links = url + item.find('a', {'class' : 'product-link'})['href']
        print(links)
    return
But I'm supposed to take all the links I get from this request and loop over them, so I can get data from each of those 10 pages and store it in a database (which means there is more code to write after def extract(soup)).
I have tried to follow many tutorials, but I always end up with a single link or None.
You just need to build a list of links; in your code the links variable is simply overwritten on each pass through the loop. Try this:
def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    links = []
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links.append(url + item.find('a', {'class' : 'product-link'})['href'])
    return links
To print each link in main code after functions:
soup = get_url(url)
linklist = extract(soup)
for url in linklist:
    print(url)
Your links variable is being rewritten inside the for loop.
You can create an empty list before the loop, then append the URL on each iteration.
import requests
from bs4 import BeautifulSoup
url = 'https://rappel.conso.gouv.fr'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
#Collecting links on rappel.gouv
def get_url(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    links = []
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links.append(url + item.find('a', {'class' : 'product-link'})['href'])
    return links
soup = get_url(url)
print(extract(soup))
To use the links from the page to iterate over each product's detail page, collect the links in a list and return it from the function.
Also try to name your functions after what they return: get_url() is really get_soup(), ...
Example
import requests
from bs4 import BeautifulSoup
url = 'https://rappel.conso.gouv.fr'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
def get_soup(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def extract_product_urls(url):
    links = [url + x['href'] for x in get_soup(url).select('a.product-link')]
    return links

def extract_product_details(url):
    soup = get_soup(url)
    items = {}
    for x in soup.select('.product-desc li'):
        content = x.get_text('|', strip=True).split('|')
        items[content[0]] = content[1]
    return items

data = []
for link in extract_product_urls(url):
    data.append(extract_product_details(link))

data
Output
[{'Réf. Fiche\xa0:': '2021-11-0273',
'№ de Version\xa0:': '1',
'Origine de la fiche\xa0:': 'PLACE DU MARCHE PLACE DU MARCHE',
'Nature juridique du rappel\xa0:': 'Volontaire',
'Catégorie de produit': 'Alimentation',
'Sous-catégorie de produit': 'Lait et produits laitiers',
'Nom de la marque du produit': 'Toupargel',
'Noms des modèles ou références': 'BATONNETS GEANTS VANILLE AMANDES',
'Identification des produits': 'GTIN',
'Conditionnements': '292G',
'Date début/Fin de commercialisation': 'Du\r\n 11/07/2019\r\n au\r\n 18/09/2021',
'Température de conservation': 'Produit à conserver au congélateur',
'Marque de salubrité': 'EMB 35360C',
'Zone géographique de vente': 'France entière',
'Distributeurs': 'PLACE DU MARCHE',
'Motif du rappel': 'Nous tenons à vous informer, que suite à une alerte européenne concernant la présence potentielle d’oxyde d’éthylène à une teneur supérieure à la limite autorisée, et comme un grand nombre d’acteurs de la distribution, nous devons procéder au rappel',
'Risques encourus par le consommateur': 'Autres contaminants chimiques',
'Conduite à tenir par le consommateur': 'Ne plus consommer',
'Numéro de contact': '0805805910',
'Modalités de compensation': 'Remboursement',
'Date de fin de la procédure de rappel': 'samedi 26 février 2022'},
{'Réf. Fiche\xa0:': '2021-11-0274',
'№ de Version\xa0:': '1',
'Origine de la fiche\xa0:': 'PLACE DU MARCHE PLACE DU MARCHE',
'Nature juridique du rappel\xa0:': 'Volontaire',
'Catégorie de produit': 'Alimentation',
'Sous-catégorie de produit': 'Lait et produits laitiers',
'Nom de la marque du produit': 'Toupargel',
'Noms des modèles ou références': 'CREME GLACEE NOUGAT',
'Identification des produits': 'GTIN',
'Conditionnements': '469G',
'Date début/Fin de commercialisation': 'Du\r\n 28/06/2019\r\n au\r\n 10/10/2021',
'Température de conservation': 'Produit à conserver au congélateur',
'Marque de salubrité': 'EMB 35360C',
'Zone géographique de vente': 'France entière',
'Distributeurs': 'PLACE DU MARCHE',
'Motif du rappel': 'Nous tenons à vous informer, que suite à une alerte européenne concernant la présence potentielle d’oxyde d’éthylène à une teneur supérieure à la limite autorisée, et comme un grand nombre d’acteurs de la distribution, nous devons procéder au rappel',
'Risques encourus par le consommateur': 'Autres contaminants chimiques',
'Conduite à tenir par le consommateur': 'Ne plus consommer',
'Numéro de contact': '0805805910',
'Modalités de compensation': 'Remboursement',
'Date de fin de la procédure de rappel': 'samedi 26 février 2022'},...]
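The original question also mentions storing the scraped data in a database. Neither answer covers that step, but as a rough sketch (the table layout and database name are assumptions, not part of the site or the answers above), the list of dicts could be persisted with sqlite3 by serialising each record:

import json
import sqlite3

# assumption: `data` is the list of dicts built above
conn = sqlite3.connect("rappels.db")
conn.execute("CREATE TABLE IF NOT EXISTS rappels (fiche TEXT PRIMARY KEY, payload TEXT)")
for record in data:
    fiche = record.get("Réf. Fiche\xa0:", "")  # the reference shown in the output above
    conn.execute("INSERT OR REPLACE INTO rappels (fiche, payload) VALUES (?, ?)",
                 (fiche, json.dumps(record, ensure_ascii=False)))
conn.commit()
conn.close()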
I want to create a program that takes a list of German words as input and finds them on Reverso Context together with example sentences for these words. After finding them, the inputted words should be deleted and the examples shown without them. I managed to do this for one word:
import requests
from bs4 import BeautifulSoup
inp = input("Type a german word\n")
web = requests.get('https://context.reverso.net/translation/german-english/'+inp)
data = web.content
soup = BeautifulSoup(data, features = "html.parser")
tag = soup.find_all("span","text","de")
a = 1
for i in tag:
    print(a, ".", i.text)
    a = a + 1
Please help me adapt this to the requirements I described.
On every iteration you will see the prompt "Please enter the word to collect the data, or enter the character 'e' to end the process". Alternatively, you can make a list of words and iterate over it; you get the same result. You can try this:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:10.0) Gecko/20100101 Firefox/10.0 '}
mark = ""
while mark != 'e':
    inp = input("Please enter the word to collect the data, or enter the character 'e' to end the process: ")
    mark = inp
    if mark == 'e':
        break
    s = requests.Session()
    url = f'https://context.reverso.net/translation/german-english/{inp}'
    web = s.get(url, headers=headers)
    soup = BeautifulSoup(web.text, "lxml")
    tag = soup.select("span", class_="text", lang="de")
    a = 1
    for i in tag:
        if ('\n' or "") in i.text:
            print(a, ". ", i.text.strip())
            a = a + 1
    # print("Do You have any List of word?")
    print("." * 80)
Output will be:
1 . Join Reverso
2 .
3 . Facebook connect
4 . Google connect
5 . Zeigt die Anzahl der heute blockierten Ereignisse an.
6 . Displays the number of events that have been blocked today.
7 . In diesem Sinne werden wir heute die Entlastung verweigern.
8 . It is for this reason that we are today refusing to grant discharge.
9 . Die Agrarerzeugnisse sind heute ein wesentlicher Bestandteil der Verhandlungsrunden der Welthandelsorganisation.
10 . Agricultural products are now an integral part of the World Trade Organisation negotiating round.
11 . Das ist heute die wichtigste Frage.
12 . This is the pressing issue we now face.
13 . Sie wird in vergleichbaren Fällen heute anders vorgehen.
14 . It would take a different approach in comparable cases today.
15 . Kutschma regiert heute als allmächtiger Präsident.
16 . Today, Kuchma rules as an all-powerful president.
17 . Für mich verbleibt heute nur eine wesentliche Frage.
18 . In my view, there is only one important question left today.
19 . Die heute diskutierte Verordnung wird unsere Aktion fraglos verbessern helfen.
20 . The regulation we are debating today will undoubtedly contribute to improving our action.
and so on......
You can also try it:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:10.0) Gecko/20100101 Firefox/10.0 '}
mark = ""
while mark != 'e':
    inp = input("Please enter the word to collect the data, or enter the character 'e' to end the process: ")
    mark = inp
    if mark == 'e':
        break
    s = requests.Session()
    url = f'https://context.reverso.net/translation/german-english/{inp}'
    web = s.get(url, headers=headers)
    soup = BeautifulSoup(web.text, "lxml")
    # tag = soup.select("span", class_="text", lang="de")
    sentences = [x.text.strip() for x in soup.find_all('span', {'class': 'text'}, {"lang": "de"}) if '\n' in x.text]
    print(sentences)
    print("." * 80)
You get the same result, as a list.
I implemented a Python wrapper for the Reverso Context API: https://github.com/flagist0/reverso_context_api
In your case, you can use it like this:
from itertools import islice
from reverso_context_api import Client
def get_samples(client, word, num=5):
    # There can be thousands of translation samples, this function requests and returns only needed amount of them
    iterator = client.get_translation_samples(word)
    return list(islice(iterator, num))
client = Client(source_lang="de", target_lang="en")
# call get_samples for each word in your list
print(get_samples(client, "Fortschritt"))
# Outputs:
# [('Überprüfen Sie den Fortschritt des Datenbank-Loaders im Prozessmanager.',
# 'Check the progress of the Database Loader in your Process Manager.'),
# ('Status verfolgen auch den Fortschritt des Auftragsabschlussprozesses.',
# 'Statuses also track the progress of the job close process.'),
# ('Kommissar Vitorino hatte das Abkommen als großen Fortschritt bezeichnet.',
# "Commissioner Vitorino has described it as a 'major advance'."),
# ('Dies ist deshalb schon ein großer Fortschritt.',
# 'This is, therefore, already a major advance.'),
# ('Ich betrachte die Charta als akzeptablen Fortschritt.',
# 'I consider that the Charter of Fundamental Rights represents a valuable step forward.')]
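The original goal also included deleting the searched word from the examples before displaying them; that post-processing step isn't covered above. A small sketch on top of get_samples() (the gap marker '___' and the whole-word regex are my own choices, not part of the wrapper):

import re

def blank_out(samples, word):
    # replace the searched word (case-insensitive, whole words only) with a gap marker
    pattern = re.compile(r'\b{}\b'.format(re.escape(word)), flags=re.IGNORECASE)
    return [(pattern.sub('___', de), en) for de, en in samples]

for de, en in blank_out(get_samples(client, "Fortschritt"), "Fortschritt"):
    print(de, '->', en)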
Hello, I've created two functions that work well when called alone, but when I try to use them in a for loop I get a problem with my parameter.
The first function searches for a company and gets a link to pass to the second one.
USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
def searchsport(terme):
    url = 'https://www.verif.com/recherche/{}/1/ca/d/?ville=null'.format(terme)
    response = requests.get(url, headers=USER_AGENT)
    response.raise_for_status()
    return terme, response.text

def crawl(keyword):
    if __name__ == '__main__':
        try:
            keyword, html = searchsport(keyword)
            soup = bs(html, 'html.parser')
            table = soup.find_all('td', attrs={'class': 'verif_col1'})
            premier = []
            for result in table:
                link = result.find('a', href=True)
                premier.append(link)
            truelink = 'https://www.verif.com/' + str(premier[0]).split('"')[1]
            #print("le lien", truelink)
        except Exception as e:
            print(e)
        finally:
            time.sleep(10)
            return truelink
Second function to scrape a link.
def single_text(item_url):
    source_code = requests.get(item_url)
    print('nivo1 ok')
    plain_text = source_code.text  # the page HTML with all its tags
    soup = bs(plain_text, features="lxml")
    print('nivo2 ok')
    table = soup.find('table', {'class': "table infoGen hidden-smallDevice"})  # we only look for the table tag
    print('nivo1 ok', '\n', table)
    table_rows = table.find_all('tr')  # the table data is in the tr cells
    #print(table_rows)
    l = []
    for tr in table_rows:
        td = tr.find_all('td')
        row = row = [tr.text.strip() for tr in td]
        l.append(row)
    # remove some useless characters
    df = pd.DataFrame(l)
    return df
All these functions worked when I tested them on a single link.
Now I have a csv file with company names; searchsport() searches the website for each name, and the returned link is passed to single_text() to scrape.
for keyword in list(pd.read_csv('sport.csv').name):
    l = crawl(keyword)
    print(l)        # THIS PRINTS THE LINK
    single_item(l)  # HERE I GET THE PROBLEM
Error:
nivo1 ok
nivo2 ok
nivo1 ok
None
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-55-263d95d6748c> in <module>
3 l = crawl(keyword)
4
----> 5 single_item(item_url=l)
<ipython-input-53-6d3b5c1b1ee8> in single_item(item_url)
7 table = soup.find('table',{'class':"table infoGen hidden-smallDevice"}) # on cherche que la balise table
8 print('nivo1 ok', '\n', table)
----> 9 table_rows = table.find_all('tr') # les données de tables sont dans les celulles tr
10 #print(table_rows)
11
AttributeError: 'NoneType' object has no attribute 'find_all'
When I run this directly, I get a df:
single_item(item_url="https://www.verif.com/societe/COMPANYNAME-XXXXXXXXX/").head(1)
My expected result is two DataFrames, one for each keyword.
Why doesn't it work?
So I have noted, in the code below, some of the problems I saw with your code as posted.
Some things I noticed:
Not handling cases where something is not found, e.g. 'PARIS-SAINT-GERMAIN-FOOTBALL' as a search term will fail whereas 'PARIS SAINT GERMAIN FOOTBALL' will not
Missed opportunities for simplification, e.g. creating a DataFrame by looping tr then td when you could just use read_html on the table; using find_all when a single table or tag is needed
Overwriting variables in loops, as well as typos, e.g.
for tr in table_rows:
    td = tr.find_all('td')
    row = row = [tr.text.strip() for tr in td]  # presumably a typo with row = row
Not testing whether a DataFrame is empty
Risking incorrect URLs by prefixing 'https://www.verif.com/' when the part you concatenate onto it already starts with '/'
Inconsistent variable naming, e.g. what is single_item? The function I see is called single_text.
These are just some observations and there is certainly still room for improvement.
import requests, time
from bs4 import BeautifulSoup as bs
import pandas as pd

def searchsport(terme):
    url = f'https://www.verif.com/recherche/{terme}/1/ca/d/?ville=null'
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    response.raise_for_status()
    return terme, response.text

def crawl(keyword):
    try:
        keyword, html = searchsport(keyword)
        soup = bs(html, 'lxml')
        a_tag = soup.select_one('td.verif_col1 a[href]')
        # your code before, when looping tds, would just overwrite truelink if more than one was found. Instead:
        if a_tag is None:
            # handle case of no result e.g. when using crawl('PARIS-SAINT-GERMAIN-FOOTBALL') instead of
            # crawl('PARIS SAINT GERMAIN FOOTBALL')
            truelink = ''
        else:
            # print(a_tag['href'])
            # adding to the list premier served no purpose. Using split on href would result in list index out of range
            truelink = f'https://www.verif.com{a_tag["href"]}'  # relative link already, so no extra / after .com
    except Exception as e:
        print(e)
        truelink = ''  # handle case of 'other' fail. Make sure there is an assignment
    finally:
        time.sleep(5)
        return truelink  # unless try succeeded this would have failed with local variable referenced before assignment

def single_text(item_url):
    source_code = requests.get(item_url, headers={'User-Agent': 'Mozilla/5.0'})
    print('nivo1 ok')
    plain_text = source_code.text  # the page HTML with all its tags
    soup = bs(plain_text, features="lxml")
    print('nivo2 ok')
    table = soup.select_one('.table')  # we only look for the table tag
    #print('nivo1 ok', '\n', table)
    if table is None:
        df = pd.DataFrame()
    else:
        df = pd.read_html(str(table))[0]  # simplify to work directly with the table and pandas; avoid your loops
    return df

def main():
    terms = ['PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL']

    for term in terms:
        item_url = crawl(term)
        if item_url:
            print(item_url)
            df = single_text(item_url)  # what is single_item in your question? There is single_text
            if not df.empty:  # test if dataframe is empty
                print(df.head(1))

if __name__ == '__main__':
    main()
Returning df from main()
import requests, time
from bs4 import BeautifulSoup as bs
import pandas as pd

def searchsport(terme):
    url = f'https://www.verif.com/recherche/{terme}/1/ca/d/?ville=null'
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    response.raise_for_status()
    return terme, response.text

def crawl(keyword):
    try:
        keyword, html = searchsport(keyword)
        soup = bs(html, 'lxml')
        a_tag = soup.select_one('td.verif_col1 a[href]')
        # your code before, when looping tds, would just overwrite truelink if more than one was found. Instead:
        if a_tag is None:
            # handle case of no result e.g. when using crawl('PARIS-SAINT-GERMAIN-FOOTBALL') instead of
            # crawl('PARIS SAINT GERMAIN FOOTBALL')
            truelink = ''
        else:
            # print(a_tag['href'])
            # adding to the list premier served no purpose. Using split on href would result in list index out of range
            truelink = f'https://www.verif.com{a_tag["href"]}'  # relative link already, so no extra / after .com
    except Exception as e:
        print(e)
        truelink = ''  # handle case of 'other' fail. Make sure there is an assignment
    finally:
        time.sleep(5)
        return truelink  # unless try succeeded this would have failed with local variable referenced before assignment

def single_text(item_url):
    source_code = requests.get(item_url, headers={'User-Agent': 'Mozilla/5.0'})
    print('nivo1 ok')
    plain_text = source_code.text  # the page HTML with all its tags
    soup = bs(plain_text, features="lxml")
    print('nivo2 ok')
    table = soup.select_one('.table')  # we only look for the table tag
    #print('nivo1 ok', '\n', table)
    if table is None:
        df = pd.DataFrame()
    else:
        df = pd.read_html(str(table))[0]  # simplify to work directly with the table and pandas; avoid your loops
    return df

def main():
    terms = ['PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL']

    for term in terms:
        item_url = crawl(term)
        if item_url:
            #print(item_url)
            df = single_text(item_url)  # what is single_item in your question? There is single_text
            return df

if __name__ == '__main__':
    df = main()
    print(df)
Your error suggests that you are trying to run find_all() against a variable which hasn't been populated, i.e. a tag wasn't found that you could run find_all() against. I have dealt with this by including a statement testing for NoneType:
if VALUE is not None:
    ## code when the tag is found
else:
    ## code when tag is not found
I think this is the bit where you need an update like this; in your traceback it is table (the result of soup.find(...)) that comes back as None, so test it before calling find_all():
table = soup.find('table', {'class': "table infoGen hidden-smallDevice"})
if table is not None:
    table_rows = table.find_all('tr')
    l = []
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text.strip() for tr in td]
        l.append(row)
    # remove some useless characters
    df = pd.DataFrame(l)
else:
    df = pd.DataFrame()  # code to run when the table isn't populated
There's a more colourful example of this in action, where some XML is being parsed, here.