When I create the CSV file, the data get overwritten on every pass of the loop, so only the last page's rows survive. The page I am scraping is https://www.aeafa.es/asociados.php?provinput=&_pagi_pg=1. I have already searched for an answer here and spent a long time on Google, but found nothing. I've already tried opening the file with 'w' instead of 'r' or 'a', but I still can't get my code to keep all the data.
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}

for page in range(1, 3):
    r = requests.get('https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}'.format(page=page),
                     headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    tag = soup.find_all('div', class_='col-md-8 col-sm-8')
    temp = []
    for pro in tag:
        data = [tup.text for tup in pro.find_all('p')]
        Dirección = data[2]
        Dirección = Dirección[12:]
        Población = data[3]
        Población = Población[14:]
        Provincia = data[4]
        Provincia = Provincia[14:]
        Teléfono = data[5]
        Teléfono = "+" + Teléfono[11:].replace('.', "")
        Email = data[6]
        Email = Email[10:]
        temp.append([Dirección, Provincia, Población, Teléfono, Email])
    df = pd.DataFrame(temp, columns=["Dirección", "Provincia", "Población", "Teléfono", "Email"])
    df.to_csv('samp.csv')
Try to put the list temp outside of the for-loop. Then, create the dataframe after all the loops finish:
import requests
import pandas as pd
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

temp = []
for page in range(1, 3):
    r = requests.get(
        "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}".format(
            page=page
        ),
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")
    tag = soup.find_all("div", class_="col-md-8 col-sm-8")
    for pro in tag:
        data = [tup.text for tup in pro.find_all("p")]
        Dirección = data[2]
        Dirección = Dirección[12:]
        Población = data[3]
        Población = Población[14:]
        Provincia = data[4]
        Provincia = Provincia[14:]
        Teléfono = data[5]
        Teléfono = "+" + Teléfono[11:].replace(".", "")
        Email = data[6]
        Email = Email[10:]
        temp.append([Dirección, Provincia, Población, Teléfono, Email])

df = pd.DataFrame(
    temp, columns=["Dirección", "Provincia", "Población", "Teléfono", "Email"]
)
df.to_csv("samp.csv")
print(len(df))
Prints:
98
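Alternatively, if you'd rather keep writing inside the page loop, you can append to the CSV instead of overwriting it on each pass. A minimal sketch, assuming the per-page scraping stays exactly as in the question (elided here):

import pandas as pd

cols = ["Dirección", "Provincia", "Población", "Teléfono", "Email"]
for page in range(1, 3):
    temp = []
    # ... scrape this page into temp exactly as in the question ...
    df = pd.DataFrame(temp, columns=cols)
    df.to_csv("samp.csv",
              mode="a",              # append instead of overwrite
              header=(page == 1),    # write the header row only once
              index=False)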
I am trying to get the links of restaurants, but I can only get the first 30 and not all the others.
There are hundreds of restaurants in the Madrid area, but the pagination shows only 30 per page, and the following code only gets those 30:
import re
import requests
from bs4 import BeautifulSoup as b

city_name = 'Madrid'
geo_code = '187514'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

data = requests.get(
    "https://www.tripadvisor.com//Restaurants-g{}-{}.html".format(geo_code, city_name), headers=headers
).text

with open('links.txt', 'w') as f:  # output file; the original snippet used f without defining it
    for link in re.findall(r'"detailPageUrl":"(.*?)"', data):
        print("https://www.tripadvisor.com.sg/" + link)
        next_link = "https://www.tripadvisor.com.sg/" + link
        f.write('%s\n' % next_link)
Found the solution: I had to add oa followed by the number of the result in the URL, like:
"https://www.tripadvisor.com//Restaurants-g{}-{}-{}.html".format(geo_code, city_name, n_review), headers=headers
I want to scrape this URL https://aviation-safety.net/wikibase/type/C206.
I don't understand the meaning of this error below:
'NoneType' object has no attribute 'prettify'
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://aviation-safety.net/wikibase/type/C206'
req = Request(url, headers={
    'accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

data = []
while True:
    print(url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    data.append(pd.read_html(soup.select_one('tbody').prettify())[0])
    if soup.select_one('div.pagenumbers + div a[href]'):
        url = soup.select_one('div.pagenumbers + div a')['href']
    else:
        break

df = pd.concat(data)
df.to_csv('206.csv', encoding='utf-8-sig', index=False)
You're not passing headers to your requests.get call (the req built with urllib's Request is never used), so you're not getting the right HTML back: soup.select_one('tbody') finds no match, returns None, and None has no .prettify() method, which is exactly the error you see. Also, the table you're after is the second one on the page, not the first, and I'd highly recommend using requests over urllib.request.
So, having said that, here's how to get all the tables from all the pages:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://aviation-safety.net/wikibase/type/C206'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
}

data = []
with requests.Session() as s:
    total_pages = int(
        BeautifulSoup(s.get(url, headers=headers).text, "lxml")
        .select("div.pagenumbers > a")[-1]
        .getText()
    )
    for page in range(1, total_pages + 1):
        print(f"Getting page: {page}...")
        data.append(
            pd.read_html(
                s.get(f"{url}/{page}", headers=headers).text,
                flavor="lxml",
            )[1]
        )

df = pd.concat(data)
df.to_csv('206.csv', sep=";", index=False)
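One caveat: if a type page has only a single page of results, div.pagenumbers may contain no links and the [-1] lookup above will raise an IndexError. A minimal defensive sketch for the total_pages lines inside the session block, using the same selector as above:

    # Fall back to a single page when no pagination links are found.
    pagelinks = (
        BeautifulSoup(s.get(url, headers=headers).text, "lxml")
        .select("div.pagenumbers > a")
    )
    total_pages = int(pagelinks[-1].getText()) if pagelinks else 1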
My list xfrs returns a blank DataFrame when I convert it... does anyone see any issues with the code?
I'm able to append to the list and print it fine, but after the conversion the DataFrame transfers comes out blank.
import requests
import pandas as pd
from bs4 import BeautifulSoup

url2 = 'https://247sports.com/Season/2020-Football/TransferPortalPositionRanking/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

response = requests.get(url2, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

xfrs = []
schools = []

for li in soup.findAll('li', attrs={'class': 'transfer-player'}):
    xfrs.append(li.find('a').contents)
    schools.append(li.find('li', attrs={'class': 'destination'}))

transfers = pd.DataFrame(xfrs, columns=['Players'])
print(transfers)
As mentioned, .contents returns a list of BeautifulSoup objects, so you need to use, for example, .text to get the name. Also take care with your selection; it should be more specific.
To store the scraped data in a dataframe, try collecting it as a list of dicts:
data.append({
    'Player': li.h3.text,
    'Destination': destination['alt'] if (destination := li.select_one('img[class="logo"]')) else None
})
Example
import requests
import pandas as pd
from bs4 import BeautifulSoup

url2 = 'https://247sports.com/Season/2020-Football/TransferPortalPositionRanking/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

response = requests.get(url2, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

data = []
for li in soup.find_all('li', attrs={'class': 'transfer-player'}):
    data.append({
        'Player': li.h3.text,
        'Destination': destination['alt'] if (destination := li.select_one('img[class="logo"]')) else None
    })

pd.DataFrame(data)
Output

Player          Destination
JT Daniels      Georgia
KJ Costello     Mississippi State
Jamie Newman    Georgia
...             ...
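The conditional expression with the walrus operator (:=) is just a guard for rows whose logo image is missing; on Python versions before 3.8 the same guard can be written in two steps:

destination = li.select_one('img[class="logo"]')
data.append({
    'Player': li.h3.text,
    'Destination': destination['alt'] if destination else None
})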
url = "https://www.imdb.com/chart/top/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
r = requests.get(url,headers=headers)
soup = BeautifulSoup(r.content,"html.parser")
puan = soup.find_all("tr")
for i in puan:
puan2 = i.find_all("td",{"class":"ratingColumn"})
for x in puan2:
puan3 = x.find("strong")
print(puan3.text)
I'm scraping with BeautifulSoup. I get an error because some of the results in the list are NoneType. How can I remove the NoneType parts from the list?
Adding a simple if guard will do the trick:
if puan3 is not None:
    print(puan3.text)
What happens?
Your selection is not specific enough, so you get a ResultSet that also contains elements you don't want to select.
How to fix?
Select your elements more specifically:
i.find_all("td", {"class": "imdbRating"})
or
for row in soup.select('table.chart tbody tr'):
    rating = row.select_one('.imdbRating strong').text
    print(rating)
and additionally with a double check:

for row in soup.select('table.chart tbody tr'):
    rating = rating.text if (rating := row.select_one('.imdbRating strong')) else None
    print(rating)
Example (based on your code)
import requests
from bs4 import BeautifulSoup

url = "https://www.imdb.com/chart/top/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, "html.parser")

puan = soup.find_all("tr")
for i in puan:
    puan2 = i.find_all("td", {"class": "imdbRating"})
    for x in puan2:
        puan3 = x.find("strong")
        print(puan3.text)
Example (css selectors)
import requests
from bs4 import BeautifulSoup

url = "https://www.imdb.com/chart/top/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, "html.parser")

for row in soup.select('table.chart tbody tr'):
    rating = rating.text if (rating := row.select_one('.imdbRating strong')) else None
    print(rating)
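If you also want the film titles alongside the ratings, here is a hedged extension of the CSS-selector version. The td.titleColumn class is an assumption about the chart's markup, not something verified here:

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.imdb.com/chart/top/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

rows = []
for row in soup.select('table.chart tbody tr'):
    # td.titleColumn is an assumed class name for the title cell
    title = cell.a.text if (cell := row.select_one('td.titleColumn')) else None
    rating = strong.text if (strong := row.select_one('.imdbRating strong')) else None
    rows.append({'Title': title, 'Rating': rating})

print(pd.DataFrame(rows))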
I have been working on the code below and getting myself tied up in knots. What I am trying to do is build a simple dataframe from text scraped with BeautifulSoup.
I have scraped the applicable text from the <h5> and <p> tags, but using find_all means that when I build the dataframe and write it to CSV, the tags are included. To deal with this I added the print(p.text, end=" ") statements, but now nothing is being written to the CSV.
Can anyone see what I am doing wrong?
import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

course = []
runner = []

page = requests.get('https://www.attheraces.com/tips/atr-tipsters/hugh-taylor', headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
tips = soup.find('div', class_='sticky')

for h5 in tips.find_all("h5"):
    course_name = print(h5.text, end=" ")
    course.append(course_name)

for p in tips.find_all("p"):
    runner_name = print(p.text, end=" ")
    runner.append(runner_name)

todays_tips = pd.DataFrame(
    {'Course': course,
     'Selection': runner,
     })

print(todays_tips)

todays_tips.to_csv(r'C:\Users\*****\Today.csv')
Don't use the assignment from print: print() always returns None, which is why your columns end up empty. Consider using a list comprehension instead. Applying this should get you the dataframe you want.
For example:
import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

page = requests.get('https://www.attheraces.com/tips/atr-tipsters/hugh-taylor', headers=headers)
tips = BeautifulSoup(page.content, 'html.parser').find('div', class_='sticky')

course = [h5.getText() for h5 in tips.find_all("h5")]
runner = [p.getText() for p in tips.find_all("p")]

todays_tips = pd.DataFrame({'Course': course, 'Selection': runner})
print(todays_tips)
todays_tips.to_csv("your_data.csv", index=False)
Output:

          Course                                  Selection
0   1.00 HAYDOCK  1pt win RAINBOW JET (12-1 & 11-1 general)
1  2.50 GOODWOOD            1pt win MARSABIT (11-2 general)
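One caveat with this approach: pd.DataFrame raises a ValueError if course and runner end up with different lengths (e.g. a stray <p> with no matching <h5>). A minimal guard, assuming the page lists them in matching order, is to pair them positionally:

# Pair courses and runners positionally; any unmatched trailing
# entries are silently dropped by zip().
todays_tips = pd.DataFrame(zip(course, runner), columns=['Course', 'Selection'])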