Scrape HTML links with Python

Hello everyone. I'm trying to get all href links with Python using this:
import requests
from bs4 import BeautifulSoup

url = 'https://rappel.conso.gouv.fr'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}

# Collecting links on rappel.gouv
def get_url(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links = url + item.find('a', {'class' : 'product-link'})['href']
    return links

soup = get_url(url)
print(extract(soup))
I'm supposed to get 10 HTML links, as follows:
https://rappel.conso.gouv.fr/fiche-rappel/4571/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4572/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4573/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4575/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4569/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4565/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4568/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4570/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4567/Interne
https://rappel.conso.gouv.fr/fiche-rappel/4558/Interne
It actually works when I put a print inside the code, as follows:
def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links = url + item.find('a', {'class' : 'product-link'})['href']
        print(links)
    return
But I'm supposed to take all the links I get from this request and put them into a loop, so that I can get data from each of those 10 pages and store it in a database (which means there is more code to write after def extract(soup)).
I have tried to follow many tutorials, but I only ever get a single HTML link or None.

You just need to build a list of links; in your code the variable links is overwritten on each pass through the loop. Try this:
def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    links = []
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links.append(url + item.find('a', {'class' : 'product-link'})['href'])
    return links
To print each link in the main code, after the functions:
soup = get_url(url)
linklist = extract(soup)
for url in linklist:
    print(url)

Your links variable is being rewritten inside the for loop.
You can create an empty list before the loop, then append the URL on each iteration.
import requests
from bs4 import BeautifulSoup

url = 'https://rappel.conso.gouv.fr'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}

# Collecting links on rappel.gouv
def get_url(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def extract(soup):
    results = soup.find_all('div', {'class' : 'product-content'})
    links = []
    for item in results:
        item.find('a', {'class' : 'product-link'}).text.replace('','').strip()
        links.append(url + item.find('a', {'class' : 'product-link'})['href'])
    return links

soup = get_url(url)
print(extract(soup))

To use the links from the page to iterate over each product's detail page, collect the links in a list and return it from the function.
Try to name your functions after what they return: get_url() is really a get_soup(), ...
Example
import requests
from bs4 import BeautifulSoup

url = 'https://rappel.conso.gouv.fr'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}

def get_soup(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def extract_product_urls(url):
    links = [url + x['href'] for x in get_soup(url).select('a.product-link')]
    return links

def extract_product_details(url):
    soup = get_soup(url)
    items = {}
    for x in soup.select('.product-desc li'):
        content = x.get_text('|', strip=True).split('|')
        items[content[0]] = content[1]
    return items

data = []
for link in extract_product_urls(url):
    data.append(extract_product_details(link))
data
Output
[{'Réf. Fiche\xa0:': '2021-11-0273',
'№ de Version\xa0:': '1',
'Origine de la fiche\xa0:': 'PLACE DU MARCHE PLACE DU MARCHE',
'Nature juridique du rappel\xa0:': 'Volontaire',
'Catégorie de produit': 'Alimentation',
'Sous-catégorie de produit': 'Lait et produits laitiers',
'Nom de la marque du produit': 'Toupargel',
'Noms des modèles ou références': 'BATONNETS GEANTS VANILLE AMANDES',
'Identification des produits': 'GTIN',
'Conditionnements': '292G',
'Date début/Fin de commercialisation': 'Du\r\n 11/07/2019\r\n au\r\n 18/09/2021',
'Température de conservation': 'Produit à conserver au congélateur',
'Marque de salubrité': 'EMB 35360C',
'Zone géographique de vente': 'France entière',
'Distributeurs': 'PLACE DU MARCHE',
'Motif du rappel': 'Nous tenons à vous informer, que suite à une alerte européenne concernant la présence potentielle d’oxyde d’éthylène à une teneur supérieure à la limite autorisée, et comme un grand nombre d’acteurs de la distribution, nous devons procéder au rappel',
'Risques encourus par le consommateur': 'Autres contaminants chimiques',
'Conduite à tenir par le consommateur': 'Ne plus consommer',
'Numéro de contact': '0805805910',
'Modalités de compensation': 'Remboursement',
'Date de fin de la procédure de rappel': 'samedi 26 février 2022'},
{'Réf. Fiche\xa0:': '2021-11-0274',
'№ de Version\xa0:': '1',
'Origine de la fiche\xa0:': 'PLACE DU MARCHE PLACE DU MARCHE',
'Nature juridique du rappel\xa0:': 'Volontaire',
'Catégorie de produit': 'Alimentation',
'Sous-catégorie de produit': 'Lait et produits laitiers',
'Nom de la marque du produit': 'Toupargel',
'Noms des modèles ou références': 'CREME GLACEE NOUGAT',
'Identification des produits': 'GTIN',
'Conditionnements': '469G',
'Date début/Fin de commercialisation': 'Du\r\n 28/06/2019\r\n au\r\n 10/10/2021',
'Température de conservation': 'Produit à conserver au congélateur',
'Marque de salubrité': 'EMB 35360C',
'Zone géographique de vente': 'France entière',
'Distributeurs': 'PLACE DU MARCHE',
'Motif du rappel': 'Nous tenons à vous informer, que suite à une alerte européenne concernant la présence potentielle d’oxyde d’éthylène à une teneur supérieure à la limite autorisée, et comme un grand nombre d’acteurs de la distribution, nous devons procéder au rappel',
'Risques encourus par le consommateur': 'Autres contaminants chimiques',
'Conduite à tenir par le consommateur': 'Ne plus consommer',
'Numéro de contact': '0805805910',
'Modalités de compensation': 'Remboursement',
'Date de fin de la procédure de rappel': 'samedi 26 février 2022'},...]
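The question also mentions storing the results in a database. A minimal sketch of that last step, assuming SQLite, the functions defined above, and a made-up file name rappels.db, could serialize each product's details as JSON:

import json
import sqlite3

conn = sqlite3.connect('rappels.db')  # hypothetical database file
conn.execute('CREATE TABLE IF NOT EXISTS rappels (url TEXT, details TEXT)')
for link in extract_product_urls(url):
    details = extract_product_details(link)
    conn.execute('INSERT INTO rappels VALUES (?, ?)',
                 (link, json.dumps(details, ensure_ascii=False)))
conn.commit()
conn.close()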

Related

Scrape data using beautifulsoup

I am extracting the data, but it gives the same name and surname in each entry, even though the name and surname should be different for each entry. This is the page link: https://www.aeafa.es/asociados.php
import requests
import pandas as pd
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
temp = []
wev = {}
for page in range(1, 5):
    r = requests.get(
        "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}".format(
            page=page
        ),
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")
    details = soup.find('table', class_="table")
    for detail in details.find_all('tbody'):
        link = [up.text for up in detail.find_all("td")]
        name = link[0]
        wev['Nombre'] = name
        surname = link[1]
        wev["Apellidos"] = surname
    tag = soup.find_all("div", class_="col-md-8 col-sm-8")
    for pro in tag:
        data = [tup.text for tup in pro.find_all("p")]
        Dirección = data[2]
        Dirección = Dirección[12:]
        wev[" Dirección"] = Dirección
        Población = data[3]
        Población = Población[14:]
        wev[" Población"] = Población
        Provincia = data[4]
        Provincia = Provincia[14:]
        wev["Provincia "] = Provincia
        Teléfono = data[5]
        Teléfono = "+" + Teléfono[11:].replace(".", "")
        Teléfono = Teléfono.replace("-", '')
        wev[" Teléfono"] = Teléfono
        Email = data[6]
        Email = Email[10:]
        wev["Email"] = Email
        temp.append(wev)
df = pd.DataFrame(temp)
print(df)
It prints the same name and surname in each entry. How do I correct it? This is the output:
Nombre Apellidos
0 JUAN ARIAS BARTOLOMÉ
1 JUAN ARIAS BARTOLOM
One approach would be to merge the separate name and surname details into the data from the about information. A test could also be added for when the last page is reached:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from unicodedata import normalize
import re

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

page = 1
data1 = []
data2 = []

while True:
    print(f"Page {page}")
    r = requests.get(f"https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}", headers=headers)
    page += 1
    soup = BeautifulSoup(r.content, "lxml")

    for pro in soup.find_all("div", class_="col-md-8 col-sm-8"):
        values = [re.sub(r'\s+', ' ', normalize('NFKD', p.get_text(strip=True))) for p in pro.find_all("p")]
        row = {'Sobre' : values[0][6:]}  # skip over the word Sobre
        for item in values[2:]:
            key, value = item.split(':', 1)
            row[key.strip()] = value.strip()
        row['Teléfono'] = row['Teléfono'].replace(".", "")
        data1.append(row)

    details = soup.find("table", class_="table").tbody

    for tr in details.find_all("tr"):
        data2.append([re.sub(r'\s+', ' ', normalize('NFKD', td.get_text(strip=True))) for td in tr.find_all("td")[:-1]])

    # Any more?
    ul = soup.find("ul", class_="pagination")
    last_li = ul.find_all("li")[-1]
    if last_li.text != "»":
        break

# Merge the name and surname from the second table
data = []
for d1, d2 in zip(data1, data2):
    data.append({'Nombre' : d2[0], 'Apellidos' : d2[1]} | d1)

df = pd.DataFrame(data)
print(df)
Giving you a dataframe starting:
Nombre Apellidos Sobre Dirección Población Provincia Teléfono E-mail Web
0 JUAN MARIANO MERCADO Juan Mariano Mercado Juan de Toledo, no 16, 1o B 30800 LORCA Murcia 968-471716 periagomer#hotmail.com
1 Ma. BELEN ABAD GARCIA Ma. Belen Abad Garcia Calle Constantino 33, 1o N 4700 EL EJIDO Almería 950487533 - 647936929 mariabelenabadgarcia#hotmail.com
2 JESÚS ABAD MUÑIZ Jesús Abad Muñiz Santiago, 15, 1o.- ctro. 47001 Valladolid 98.320.20.11 jabad#carlosgallegoabogados.es
3 Ma PALOMA ABAD TEJERINA Ma Paloma Abad Tejerina Poniente, 40 28036 Madrid 91.383.11.45 paloma#abadsaezabogados.com
4 GEMA ÁBALOS MUÑOZ Gema ábalos Muñoz Solarillo de Gracia, 4, 1o.- D 18002 Granada 639.317.297 3004#icagr.es
You could then use Pandas to make any further changes to the data structure. Note that the Python dictionary merge operator (|) requires Python 3.9 or later.
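For reference, a minimal illustration of the two merge styles, using the d1/d2 names from the loop above:

# Python 3.9+: the | operator merges two dicts into a new one
row = {'Nombre': d2[0], 'Apellidos': d2[1]} | d1

# Equivalent on older Python versions, using dict unpacking
row = {**{'Nombre': d2[0], 'Apellidos': d2[1]}, **d1}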

Python scraping loop

So, I need help here. This is my code:
import re
import requests               # assumed imported earlier in the script
from bs4 import BeautifulSoup  # assumed imported earlier in the script

results = []
# Here I take N links like this one:
# https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?qs=uEap3sWEgifS2G+m9xvYiA==
# and iterate through them to scrape each one
for i in popup_linkz:
    url = i  # so right now I scrape the iterating urls
    response = requests.get(url)
    print('url:', response.url)
    #print('status:', response.status_code)
    soup = BeautifulSoup(response.content, "html.parser")
    results = []
    #json_res = json.loads(res.text)
    #print(json_res[0]['price'])
    item_1 = 'grvProducto_ctl02_lblCategoria'
    for line in soup.findAll('span', attrs={'id': 'grvProducto_ctl02_lblCategoria'}):
        results.append(line.text)
    # this actually gets the first code, but I don't know how to iterate for the others;
    # it also doesn't store every code - when I print, they don't stack, they show up one at a time
    print('id', results)
I am trying to get data from this sample URL: https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?qs=uEap3sWEgifS2G+m9xvYiA==
It actually iterates over 2 to 10,000 of them.
[screenshot: the information I want to get, but can't]
I am not sure how to use this
for line in soup.findAll('span', attrs={'id': 'grvProducto_ctl02_lblCategoria'}):
    results.append(line.text)
in the same loop to get the other information.
[screenshot: the underlying page data]
Could you enlighten me, please?
Try:
import requests
from bs4 import BeautifulSoup

url = "https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?qs=uEap3sWEgifS2G+m9xvYiA=="

soup = BeautifulSoup(requests.get(url).content, "html.parser")

licitation_number = soup.select_one("#lblNumLicitacion").text
responsable = soup.select_one("#lblResponsable").text
ficha = soup.select_one("#lblFicha2Reclamo").text

print(f"{licitation_number=}")
print(f"{responsable=}")
print(f"{ficha=}")
print("-" * 80)

for t in soup.select("#grvProducto .borde_tabla00"):
    categoria = t.select_one('[id$="lblCategoria"]').text
    candidad = t.select_one('[id$="lblCantidad"]').text
    descripction = t.select_one('[id$="lblDescripcion"]').text
    print(f"{categoria=} {candidad=}")
    print(f"{descripction=}")
    print()
Prints:
licitation_number='1549-5-LR22'
responsable='SERVICIO DE SALUD METROPOLITANA NORTE HOSPITAL SAN JOSE, Hospital San José'
ficha='107'
--------------------------------------------------------------------------------
categoria='42221501' candidad='130'
descripction='(226-2001) STENT CORONARIO DE CROMO COBALTO, LIBERADOR DE FÁRMACO EVEROLIMUS'
categoria='42221501' candidad='360'
descripction='(226-2002) STENT CORONARIO DE CROMO COBALTO, LIBERADOR DE FÁRMACO ZOTAROLIMUS'
categoria='42221501' candidad='120'
descripction='(226-2004) STENT CORONARIO DE CROMO COBALTO, LIBERADOR DE FÁRMACO SIROLIMUS, CON STRUT DE 0.80'
categoria='42221501' candidad='240'
descripction='(226-2003) STENT CORONARIO DE CROMO COBALTO, LIBERADOR DE FÁRMACO SIROLIMUS, CON STRUT DE 0.60'
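To apply this to the whole list of detail pages from the question, one approach (a sketch, assuming popup_linkz is the list of URLs mentioned in the question) is to wrap the parsing above in a function and call it for each URL:

import requests
from bs4 import BeautifulSoup

def parse_detail(url):
    # parse one detail page using the same selectors as above
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    row = {
        "licitation_number": soup.select_one("#lblNumLicitacion").text,
        "responsable": soup.select_one("#lblResponsable").text,
        "ficha": soup.select_one("#lblFicha2Reclamo").text,
        "items": [],
    }
    for t in soup.select("#grvProducto .borde_tabla00"):
        row["items"].append({
            "categoria": t.select_one('[id$="lblCategoria"]').text,
            "cantidad": t.select_one('[id$="lblCantidad"]').text,
            "descripcion": t.select_one('[id$="lblDescripcion"]').text,
        })
    return row

# popup_linkz is assumed to be the list of detail-page URLs from the question
results = [parse_detail(u) for u in popup_linkz]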

Title and Link Not Getting Printed for Google Search

I am trying to scrape Title,Description and URL from Google Search Page using beautifulsoup and python.
from bs4 import BeautifulSoup

query = input("Enter your value: ")
print("Search Term:" + query)  # query = 'Python'

links = []  # Initiate empty list to capture final results
titles = []
descriptions = []

# Specify number of pages on google search, each page contains 10 links
n_pages = 6
for page in range(1, n_pages):
    url = "http://www.google.com/search?q=" + query + "&start=" + str((page - 1) * 10)
    print("Link : " + url)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        search = soup.find_all('div', class_ = "yuRUbf")
        for link in search:
            links.append(link.a['href'])
        description = soup.find_all('div', class_ = "VwiC3b yXK7lf MUxGbd yDYNvb lyLwlc lEBKkf")
        for d in description:
            description_text.append(d.span.text)
        title = soup.find_all('div', class_ = "yuRUbf")
        for t in title:
            titles.append(t.h3.text)
    # Next loop if one element is not present
    except:
        continue

print(links)
print(len(links))
print(description_text)
print(len(description_text))
print(titles)
print(len(titles))
The descriptions are getting stored in a list; however, the links and titles lists are empty. I inspected the elements and I am using the correct classes, but I am still unable to get the data.
Can someone help me figure out what I am doing wrong?
Personally, I find working with many lists irritating and cumbersome when content can be stored directly in a structured way. But anyway, you can get the information in a more generic way, without selecting the dynamic classes:
for r in soup.select('#search a h3'):
    data.append({
        'title': r.text,
        'url': r.parent['href'],
        'desc': r.parent.parent.nextSibling.span.text if r.parent.parent.nextSibling.span else 'no desc'
    })
Example
from bs4 import BeautifulSoup
import requests

query = input("Enter your value: ")
print("Search Term:" + query)  # query = 'Python'

data = []
n_pages = 6
for page in range(1, n_pages):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    url = f'http://www.google.com/search?q={query}&start={str((page - 1) * 10)}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')

    for r in soup.select('#search a h3'):
        data.append({
            'title': r.text,
            'url': r.parent['href'],
            'desc': r.parent.parent.nextSibling.span.text if r.parent.parent.nextSibling.span else 'no desc'
        })
data
Output
[{'title': 'Welcome to Python.org',
'url': 'https://www.python.org/',
'desc': 'The official home of the Python Programming Language.'},
{'title': 'Python (Programmiersprache) - Wikipedia',
'url': 'https://de.wikipedia.org/wiki/Python_(Programmiersprache)',
'desc': 'Python ([ˈpʰaɪθn̩], [ ˈpʰaɪθɑn], auf Deutsch auch [ ˈpʰyːtɔn]) ist eine universelle, üblicherweise interpretierte, höhere Programmiersprache.'},
{'title': 'Pythons - Wikipedia',
'url': 'https://de.wikipedia.org/wiki/Pythons',
'desc': 'Die Pythons (Pythonidae; altgr. Πύθων Pythōn; Einzahl der, allgemeinsprachlich auch die Python) sind eine Familie von Schlangen aus der Überfamilie der\xa0...'},
{'title': 'Das Python-Tutorial — Das Python3.3-Tutorial auf Deutsch',
'url': 'https://py-tutorial-de.readthedocs.io/',
'desc': 'Python ist eine einfach zu lernende, aber mächtige Programmiersprache mit effizienten abstrakten Datenstrukturen und einem einfachen, aber effektiven Ansatz\xa0...'},...]
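If a tabular structure is preferred afterwards, the collected data list of dicts can be loaded straight into a DataFrame (assuming pandas is installed):

import pandas as pd

# `data` is the list of dicts collected in the loop above
df = pd.DataFrame(data)  # columns: title, url, desc
print(df.head())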

Passing array arguments in Python using BeautifulSoup

I'm starting to work with Python again after 8 years. I'm trying to write a program with BeautifulSoup that takes an array argument. I pass the array argument medios to the function count_words, but it doesn't work. Is there a way to fix it, or to search for a word on multiple websites using BeautifulSoup?
import requests
from bs4 import BeautifulSoup

def count_words(url, the_word):
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')
    words = soup.find(text=lambda text: text and the_word in text)
    # print(words)
    return len(words)

def main():
    url = 'https://www.nytimes.com/'
    medios = {
        'Los Angeles Times': ['http://www.latimes.com/'],
        'New York Times': ['http://www.nytimes.com/']
    }
    word = 'Trump'
    #count = count_words(url, word)
    cuenta = count_words(medios, word)
    # print('\n El Sitio: {}\n Contiene {} occurrencias de la palabra: {}'.format(url, count, word))
    print('\n La palabra: {} aparece {} occurrencias en el New York Times'.format(word, cuenta))

if __name__ == '__main__':
    main()
There are 3 problems here:
1. medios is a dict, so you have to loop through its keys and values before calling the method, because the method only accepts a URL string.
2. BeautifulSoup's find method needs a tag name to search for, otherwise it returns None. If you want to count the number of occurrences of the word, use count on the string instead.
3. You have to send a User-Agent header with the request, otherwise you will get a 403 or 301.
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}

def count_words(url, the_word):
    r = requests.get(url, headers=headers)
    return r.text.lower().count(the_word)

def main():
    url = 'https://www.nytimes.com/'
    medios = {
        'Los Angeles Times': ['http://www.latimes.com/'],
        'New York Times': ['http://www.nytimes.com/']
    }
    word = 'trump'
    for web_name, urls in medios.items():
        for url in urls:
            cuenta = count_words(url, word)
            print('La palabra: {} aparece {} occurrencias en el {}'.format(word, cuenta, web_name))

if __name__ == '__main__':
    main()
Output:
La palabra: trump aparece 47 occurrencias en el Los Angeles Times
La palabra: trump aparece 194 occurrencias en el New York Times
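Note that counting on r.text also counts matches inside markup, scripts, and URLs. If only the visible text should be counted, a variant (a sketch, reusing the imports and headers from the code above) could parse the page first:

def count_words_visible(url, the_word):
    # count occurrences only in the rendered text, not in tags or scripts
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup.get_text().lower().count(the_word)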
You are sending a dictionary to count_words(). You need to send the URLs in a loop, or else loop through the dictionary inside count_words().
Perhaps you meant:
cuenta = count_words(url, word)
Update your code to the following:
cuenta = 0
for key in medios:
    for url in medios[key]:
        cuenta += count_words(url, word)
Basically, you should pass the URL, not a dict; I am assuming you want to count the word's occurrences across all the URLs in medios.

Python Web Scraper gives the same page as the response

I wrote some Python code to scrape data from a site. It doesn't seem to work the way it's supposed to. I want to get all the articles from the page, but I get one paragraph from the first article repeated multiple times. I can't see what's wrong with the code. Please help me fix it if you know what the issue is.
import requests
from bs4 import BeautifulSoup

URL = 'https://zdravi.doktorka.cz/clanky?page=0'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'}
HOST = 'https://zdravi.doktorka.cz'

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('article', class_='node-teaser-display')
    articles = []
    for item in items:
        articles.append({
            HOST + item.find('a').get('href'),
        })
    arts = []
    for each in articles:
        b = ''.join(each)
        arts.append(b)
    for art in arts:
        page = get_html(art)
        pagesoup = BeautifulSoup(html, 'html.parser')
        parags = pagesoup.find('p').get_text()
        print(art)
        print(parags)

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print('Error')

parse()
This is the response:
https://zdravi.doktorka.cz/infekcnost-bezpriznakovych-nosicu-covid-19-muze-byt-slaba-naznacuje-studie
Jsme tým lékařů, terapeutů, kosmetiček, odborníků pracujících ve zdravotnictví, v oboru fitness a ekologie. Náš web funguje od roku 1999 a patří mezi nejnavštěvovanější weby zabývající se zdravým životním stylem v ČR.
https://zdravi.doktorka.cz/pri-operativni-lecbe-sedeho-zakalu-existuji-tri-moznosti
Jsme tým lékařů, terapeutů, kosmetiček, odborníků pracujících ve zdravotnictví, v oboru fitness a ekologie. Náš web funguje od roku 1999 a patří mezi nejnavštěvovanější weby zabývající se zdravým životním stylem v ČR.
https://zdravi.doktorka.cz/epidemiolog-varuje-pred-dlouhodobym-nosenim-rousek
Jsme tým lékařů, terapeutů, kosmetiček, odborníků pracujících ve zdravotnictví, v oboru fitness a ekologie. Náš web funguje od roku 1999 a patří mezi nejnavštěvovanější weby zabývající se zdravým životním stylem v ČR.
https://zdravi.doktorka.cz/jidlo-muze-prozradit-na-co-mate-alergii
Jsme tým lékařů, terapeutů, kosmetiček, odborníků pracujících ve zdravotnictví, v oboru fitness a ekologie. Náš web funguje od roku 1999 a patří mezi nejnavštěvovanější weby zabývající se zdravým životním stylem v ČR.
https://zdravi.doktorka.cz/jak-muzeme-nyni-posilit-svou-imunitu
Jsme tým lékařů, terapeutů, kosmetiček, odborníků pracujících ve zdravotnictví, v oboru fitness a ekologie. Náš web funguje od roku 1999 a patří mezi nejnavštěvovanější weby zabývající se zdravým životním stylem v ČR.
In the for loop you have to use page.text instead of html:
for art in arts:
    page = get_html(art)
    pagesoup = BeautifulSoup(page.text, 'html.parser')
    parags = pagesoup.find('p').get_text()
    print(art)
    print(parags)
In html you have the HTML from the main page, so you always parsed the same HTML. Later you get a new response from the subpage and assign it to the variable page, and that variable holds the HTML from the subpage.
BTW: You would probably have seen it if you had checked print(html).
EDIT: Full working code with other changes, and with saving to a .csv file:
import requests
from bs4 import BeautifulSoup
import csv

URL = 'https://zdravi.doktorka.cz/clanky?page=0'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'}
HOST = 'https://zdravi.doktorka.cz'

def get_soup(url, headers=HEADERS, params=None):
    r = requests.get(url, headers=headers, params=params)
    if r.status_code != 200:
        print('Error:', r.status_code, url)
        return
    return BeautifulSoup(r.text, 'html.parser')

def get_content(soup):
    data = []
    articles = soup.find_all('article', class_='node-teaser-display')
    for item in articles:
        url = HOST + item.find('a').get('href')
        print(url)
        soup = get_soup(url)
        if soup:
            paragraph = soup.find('p').get_text().strip()
            print(paragraph)
            data.append({
                'url': url,
                'paragraph': paragraph,
            })
        print('---')
    with open('output.csv', 'w') as fh:
        csv_writer = csv.DictWriter(fh, ['url', 'paragraph'])
        csv_writer.writeheader()
        csv_writer.writerows(data)

def parse():
    soup = get_soup(URL)
    if soup:
        get_content(soup)

if __name__ == '__main__':
    parse()
