I have a list that I want to append multiple items to using Python. When I try to add them, the program crashes and displays the error below:
I already tried the append() and extend() functions; both give the same crash and error.
TypeError Traceback (most recent call last)
<ipython-input-24-1e9480855366> in <module>
121
122
--> 123 joineddd.extend(link,jobdesc,alldd)
124
125
TypeError: extend() takes exactly one argument (3 given)
code:
import time
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    requests.get("https://www.bayt.com/en/international/jobs/executive-chef-jobs/").content,
    "lxml"
)

links = []
for a in soup.select("h2.m0.t-regular a"):
    if a['href'] not in links:
        links.append("https://www.bayt.com" + a['href'])

joineddd = []
for link in links:
    print(link)
    s = BeautifulSoup(requests.get(link).content, "lxml")
    jobdesc = s.select_one("div[class='card-content is-spaced'] p")
    print(jobdesc.text)
    alldt = [dt.text for dt in s.select("div[class='card-content is-spaced'] dt")]
    dt_Job_location = alldt[0]
    dt_Job_Company_Industry = alldt[1]
    dt_Job_Company_Type = alldt[2]
    if len(alldt[3]) > 0:
        dt_Job_Job_Role = alldt[3]
    elif len(dt_Job_Employment_Type) > 0:
        dt_Job_Employment_Type = alldt[4]
    alldd = [dd.text for dd in s.select("div[class='card-content is-spaced'] dd")]
    dd_job_location = alldd[0]
    dd_job_Company_Industry = alldd[1]
    dd_job_Company_Type = alldd[2]
    if len(alldd[3]) > 0:
        dd_job_Job_Role = alldd[3]
    elif len(dd_job_Employment_Type) > 0:
        dd_job_Employment_Type = alldd[4]
    print(f"{dt_Job_location}:{dd_job_location}\n{dt_Job_Company_Industry}:{dd_job_Company_Industry}\n\n")
    print("-" * 80)
    joineddd.extend(link, jobdesc, alldd)
expected result:
[link, description, location, Company_Industry, Company_Type, Job_Role, Employment_Type]
You can use the list.extend method, but put square brackets around the items, like so:
lst = [1, 2, 3]
lst.extend([4, 5])
print(lst)
# output: [1, 2, 3, 4, 5]
For extend() you need to provide a single iterable (a list, tuple, set, dict, etc.) containing the items you want to add, like so:
joineddd.extend([link, jobdesc, alldd])
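For contrast, here is a minimal sketch (with a throwaway variable name) of how append() and extend() behave differently when handed a list:

nums = [1, 2, 3]
nums.append([4, 5])   # adds the whole list as a single nested element
print(nums)           # [1, 2, 3, [4, 5]]

nums = [1, 2, 3]
nums.extend([4, 5])   # adds each element of the iterable individually
print(nums)           # [1, 2, 3, 4, 5]

The full corrected code then becomes: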
import time
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    requests.get("https://www.bayt.com/en/international/jobs/executive-chef-jobs/").content,
    "lxml"
)

links = []
for a in soup.select("h2.m0.t-regular a"):
    if a['href'] not in links:
        links.append("https://www.bayt.com" + a['href'])

joineddd = []
for link in links:
    print(link)
    s = BeautifulSoup(requests.get(link).content, "lxml")
    jobdesc = s.select_one("div[class='card-content is-spaced'] p")
    print(jobdesc.text)
    alldt = [dt.text for dt in s.select("div[class='card-content is-spaced'] dt")]
    dt_Job_location = alldt[0]
    dt_Job_Company_Industry = alldt[1]
    dt_Job_Company_Type = alldt[2]
    if len(alldt[3]) > 0:
        dt_Job_Job_Role = alldt[3]
    elif len(dt_Job_Employment_Type) > 0:
        dt_Job_Employment_Type = alldt[4]
    alldd = [dd.text for dd in s.select("div[class='card-content is-spaced'] dd")]
    dd_job_location = alldd[0]
    dd_job_Company_Industry = alldd[1]
    dd_job_Company_Type = alldd[2]
    if len(alldd[3]) > 0:
        dd_job_Job_Role = alldd[3]
    elif len(dd_job_Employment_Type) > 0:
        dd_job_Employment_Type = alldd[4]
    print(f"{dt_Job_location}:{dd_job_location}\n{dt_Job_Company_Industry}:{dd_job_Company_Industry}\n\n")
    print("-" * 80)
    joineddd.extend([link, jobdesc, alldd])
dataframe:
import pandas as pd
import numpy as np
array1 = ["value1", "value2"]
array2 = ["value1"]
df = dict(A=np.array(array1), B=np.array(array2))
_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df.items()]))
print(_df)
_df.to_csv("filename.csv", index=False, encoding="utf-8")
output:
A B
0 value1 value1
1 value2 NaN
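If the goal is one row per job in the shape listed under expected result, a possible sketch (an illustration only, with column names assumed from that list) is to append one list per job and build the DataFrame in one go; this assumes every job page yields exactly five dd values, otherwise rows would need padding like the pd.Series trick above:

import pandas as pd

rows = []
# inside the `for link in links:` loop, instead of extend:
#     rows.append([link, jobdesc.text] + alldd)

columns = ["link", "description", "location", "Company_Industry",
           "Company_Type", "Job_Role", "Employment_Type"]
jobs_df = pd.DataFrame(rows, columns=columns)
jobs_df.to_csv("jobs.csv", index=False, encoding="utf-8")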
I'm a newbie in Python, trying to pull off an assignment for a uni course, and I'm trying to do multi-page scraping with Python and pandas. I'm trying to extract all the data from the table on each page of this site https://aaiasb.gr/publications/investigation-reports
After I managed to scrape all the URLs I tried this, but I only get the data from the first page:
#imports
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

response = requests.get('https://aaiasb.gr/publications/investigation-reports')
soup = BeautifulSoup(response.text, 'html.parser')

base_url = 'https://aaiasb.gr/publications/investigation-reports'
ending = '?start='
numbers = [50, 100, 150]
urls = [base_url]
for n in numbers:
    url = base_url + ending + str(n)
    urls.append(url)

df = pd.DataFrame(urls)
df = df.rename(columns={df.columns[0]: 'url'})

for url in urls:
    response = requests.get(url)
    time.sleep(3)
    soup_doc = BeautifulSoup(response.text, 'html.parser')

entries = []
page = soup.select('div.cck_page_items')[0]
rows = page.find('table').find_all('tr')[1:]
for tr in rows:
    conclusion_date1 = tr.find_all('td')[0].find_all('div')[1].text.strip()
    conclusion_date2 = tr.find_all('td')[0].find_all('div')[2].text.strip()
    incident_info = tr.find_all('td')[1].find_all('div')[0].text.strip()
    incident_type = tr.find_all('td')[1].find_all('div')[1].text.strip()
    incident_description = str(tr.find_all('td')[1].find_all('span', attrs={'uk-icon': 'info'})[0])
    fatalities = tr.find_all('td')[1].find_all('div')[2].text.strip()
    fatalities_description = str(tr.find_all('td')[1].find_all('span', attrs={'uk-icon': 'info'})[1])
    area = tr.find_all('td')[2].find_all('div')[0].text.strip()
    registry = tr.find_all('td')[2].find_all('div')[1].text.strip()
    aircraft_type = tr.find_all('td')[2].find_all('div')[-2].text.strip()
    aircraft_info = tr.find_all('td')[2].find_all('div')[-1].text.strip()
    area_info = tr.find_all('td')[2].text.strip()
    dict = {'conclusion_date1': conclusion_date1,
            'conclusion_date2': conclusion_date2,
            'incident_info': incident_info,
            'incident_type': incident_type,
            'incident_description': incident_description,
            'fatalities': fatalities,
            'fatalities_description': fatalities_description,
            'area': area,
            'registry': registry,
            'aircraft_type': aircraft_type,
            'aircraft_info': aircraft_info,
            'area_info': area_info}
    entries.append(dict)

df1 = pd.DataFrame(entries)
The main issue has to do with your indentation and location of assignments, e.g. entries = [] and df = pd.DataFrame(entries) need to be in the right places. Try the below.
# imports
from bs4 import BeautifulSoup
import requests
import unicodedata
import pandas as pd

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

base_url = "https://aaiasb.gr/publications/investigation-reports"
suffix = "?start="
start_indices = [0, 50, 100, 150]
urls = [base_url + suffix + str(index) for index in start_indices]

entries = []
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.select("div.cck_page_items").pop().find("table")
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        conclusion_date1 = cols[0].find_all("div")[1].text.strip()
        try:
            conclusion_date2 = cols[0].find_all("div")[2].text.strip()
        except IndexError:
            conclusion_date2 = "N/A"
        incident_info = cols[1].find_all("div")[0].text.strip()
        incident_type = cols[1].find_all("div")[1].text.strip()
        fatalities = cols[1].find_all("div")[2].text.strip()
        info_hovers = cols[1].find_all("span", attrs={"uk-icon": "info"})
        incident_description = ' '.join(unicodedata.normalize("NFC", info_hovers[0]['uk-tooltip']).split())
        fatalities_description = ' '.join(unicodedata.normalize("NFC", info_hovers[1]['uk-tooltip']).split()).replace("<br>", "\n")
        area = cols[2].find_all("div")[0].text.strip()
        area_info = '\n'.join(list(cols[2].strings)[-3:]).strip()
        registry = cols[2].find_all("div")[1].text.strip()
        aircraft_type = cols[2].find_all("div")[-2].text.strip()
        aircraft_info = cols[2].find_all("div")[-1].text.strip()
        entry = {
            "conclusion_date1": conclusion_date1,
            "conclusion_date2": conclusion_date2,
            "incident_info": incident_info,
            "incident_type": incident_type,
            "incident_description": incident_description,
            "fatalities": fatalities,
            "fatalities_description": fatalities_description,
            "area": area,
            "registry": registry,
            "aircraft_type": aircraft_type,
            "aircraft_info": aircraft_info,
            "area_info": area_info,
        }
        entries.append(entry)

df = pd.DataFrame(entries)
print(df.head())
print(df.tail())
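One small design note: the original code paused with time.sleep(3) between requests, which the revised version drops. If the site ever starts rate-limiting, it may be worth putting a pause back in, for example:

import time
import requests

start_indices = [0, 50, 100, 150]
urls = ["https://aaiasb.gr/publications/investigation-reports?start=" + str(i) for i in start_indices]

for url in urls:
    response = requests.get(url)
    time.sleep(3)  # pause between page requests, as in the original code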
I'm pretty new to web scraping but enjoying it so far, so I thought I'd test myself!
I've written this script to scrape this website, but I'm wondering whether there's a way to make it more efficient. At the moment I've had to set the max page to 87, as this is the last page that guitars appear on. However, amps only have 15 pages of results, yet I'm still looping through 87. Any ideas appreciated!
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []
n = 88

# ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    for x in range(1, n):
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(x)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
        prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
        for index in range(0, len(products)):
            guitar_products.append({
                'product': products[index],
                'price': prices[index],
                'avail': avails[index]
            })

guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace('[^\d.]', '', regex=True))
Thanks
Try the following approach:
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []

# ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    page_number = 1
    while True:
        url = f"https://www.guitarguitar.co.uk/{category}page-{page_number}"
        print(url)
        page_number += 1
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        for div_product in soup.find_all('div', class_="product-inner"):
            product = div_product.find('h3', {'class': 'qa-product-list-item-title'}).get_text(strip=True)
            price = div_product.find('span', {'class': 'js-pounds'}).get_text(strip=True)
            avail = div_product.find('div', {'class': 'availability'}).get_text(strip=True)
            guitar_products.append({'product': product, 'price': price, 'avail': avail})
        # Is there a next button?
        if not soup.find('a', class_="next-page-button"):
            print("No more")
            break

guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
Improvements:
It checks for a Next page button on each page and moves on to the next category when there isn't one.
It locates the <div> holding each product and then uses a single find to get each product detail. This avoids the need to build multiple lists and then join them.
It builds the URL using a Python f-string.
You can check the H1:
soup = BeautifulSoup(page.content, 'html.parser')
if soup.find('h1').contents[0] == 'Page Not Found':
    break
or change the loop from for to while:
is_page = True
x = 0
while is_page:
    x = x + 1
    . . .
    if soup.find('h1').contents[0] == 'Page Not Found':
        is_page = False
        break
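Putting that together, a rough sketch for a single category might look like this (it assumes, as above, that an out-of-range page comes back with an h1 of 'Page Not Found'; the product scraping itself would go where the comment is):

import requests
from bs4 import BeautifulSoup

category = 'guitars/electric/'
is_page = True
x = 0
while is_page:
    x = x + 1
    url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(x)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    h1 = soup.find('h1')
    if h1 and h1.contents[0] == 'Page Not Found':
        is_page = False
        break
    # ... collect products / prices / avails from soup here, as in the question ...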
This is probably not the most elegant solution, but it is functional and straightforward. An infinite loop which ends if no product is found.
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []
n = 1

# ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    while True:
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(n)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
        prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
        for index in range(0, len(products)):
            guitar_products.append({
                'product': products[index],
                'price': prices[index],
                'avail': avails[index]
            })
        if len(products) == 0:
            n = 1
            break
        else:
            n += 1

guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should but I am having trouble creating a for loop to loop through all the race course URLs.
Here's the code to scrape the course URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"
reqs = requests.get(todays_racecard_url)
content = reqs.text
soup = BeautifulSoup(content, 'html.parser')

course_urls = []
for h in soup.findAll('h3'):
    a = h.find('a')
    try:
        if 'href' in a.attrs:
            card_url = urljoin(base_url, a.get('href'))
            course_urls.append(card_url)
    except:
        pass

for card_url in course_urls:
    print(card_url)
And here's the code to scrape the pages:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

date = []
course = []
time = []
runner = []
tips = []
tipsters = []

runner_div = soup.find_all('div', class_='row-cell-right')
for container in runner_div:
    runner_name = container.h5.a.text
    runner.append(runner_name)
    tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
    tips.append(tips_no)
    tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
    tipsters.append(tipster_names)

newspaper_tips = pd.DataFrame({
    'Runners': runner,
    'Tips': tips,
    'Tipsters': tipsters,
})

newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"
req = requests.get(todays_racecard_url)
soup_racecard = BeautifulSoup(req.content, 'html.parser')
df = pd.DataFrame(columns=['Runners', 'Tips', 'Tipsters'])

for h in soup_racecard.find_all('h3'):
    a = h.find('a', href=True)  # only find tags with href present
    if a:
        url = urljoin(base_url, a['href'])
        print(url)
        results = requests.get(url)
        soup_url = BeautifulSoup(results.text, "html.parser")
        for container in soup_url.find_all('div', class_='row-cell-right'):
            runner_name = container.h5.a.text
            tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
            tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
            row = [runner_name, tips_no, tipster_names]
            df.loc[len(df)] = row  # append the new row

df['Tipsters'] = df['Tipsters'].str.replace(' - ', '')
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,
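A small design note: df.loc[len(df)] = row grows the DataFrame one row at a time, which gets slow as it grows. If the racecards ever get large, an alternative sketch is to collect the rows in a plain list and build the DataFrame once at the end:

import pandas as pd

rows = []
# inside the container loop, instead of df.loc[len(df)] = row:
#     rows.append([runner_name, tips_no, tipster_names])
df = pd.DataFrame(rows, columns=['Runners', 'Tips', 'Tipsters'])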
I'm having trouble dealing with multiple tags/attributes in one loop and appending them to the DataFrame. More specifically, it concerns the Place loop:
for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
    place = car_item.find('h3', {'class': 'heading'}).text.strip()
    places.append(place)
Appending it to the DataFrame yields only 1 result out of the expected 30.
Thank you in advance.
import requests
import bs4
import pandas as pd

frames = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        sub_soup_txt = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = sub_soup_txt.find('div', {'id': 'car-attributes'})
        soup2 = sub_soup_txt.find('div', {'id': 'vip-seller'})
        tmp = []
        places = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            place = car_item.find('h3', {'class': 'heading'}).text.strip()
            places.append(place)
        frames.append(pd.DataFrame(tmp).set_index(0))

df_final = pd.concat((tmp_df for tmp_df in frames), axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final['Places'] = pd.Series(places)
df_final.to_csv('auto_database.csv')
As you are adding places to the final df, this line (currently sitting inside the for pagenumber in ... / for car in ... loops):
places = []
should be moved all the way up, out of the main for loop, next to frames:
frames = []
places = []
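In other words, a minimal sketch of the corrected placement (everything else stays exactly as in the question) would be:

frames = []
places = []  # initialised once, before the page loop, so places accumulate across all cars and pages
for pagenumber in range(0, 2):
    # ... request the listing page, loop over cars, fill tmp/frames and append to places as before ...
    pass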
I'm currently getting an output of A,A,B,B instead of A,B,A,B.
I really want to associate the values of each table header with each table data element (like a dictionary).
import requests
from bs4 import BeautifulSoup

courseCode = "IFB104"
page = requests.get("https://www.qut.edu.au/study/unit?unitCode=" + courseCode)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all(class_='table assessment-item')

numOfTables = 0
tableDataArray = []
for tbl in table:
    numOfTables = numOfTables + 1
    tableDataArray += [tbl.find_all('th'), tbl.find_all('td')]
If I understood correctly, you need to use a dict instead of a list:
import requests
from bs4 import BeautifulSoup

courseCode = "IFB104"
page = requests.get("https://www.qut.edu.au/study/unit?unitCode=" + courseCode)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all(class_='table assessment-item')

numOfTables = 0
tableFormatted1 = []
tableFormatted2 = {}
for tbl in table:
    numOfTables = numOfTables + 1
    keys = tbl.find_all('th')
    values = tbl.find_all('td')
    new_data = dict(zip(keys, values))
    # Method 1
    tableFormatted1.append(new_data)
    # Method 2
    for k, v in new_data.items():
        if k in tableFormatted2:
            tableFormatted2[k].append(v)
        else:
            tableFormatted2[k] = [v]

print('List of dictionaries')
print(tableFormatted1)
print('')
print('Dictionary with list')
print(tableFormatted2)
Edit:
In your original code, each iteration of tbl adds its headers and its data as two separate lists, so headers and values end up grouped (A,A,B,B) rather than paired. So it is necessary to change the structure; I've provided two methods above.
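One follow-up note: with dict(zip(keys, values)) the dictionary keys and values are BeautifulSoup Tag objects. If plain strings are preferred, a small variation of the same idea (just calling get_text on each tag) might look like:

import requests
from bs4 import BeautifulSoup

courseCode = "IFB104"
page = requests.get("https://www.qut.edu.au/study/unit?unitCode=" + courseCode)
soup = BeautifulSoup(page.content, 'html.parser')

tableFormatted1 = []
for tbl in soup.find_all(class_='table assessment-item'):
    keys = [th.get_text(strip=True) for th in tbl.find_all('th')]
    values = [td.get_text(strip=True) for td in tbl.find_all('td')]
    tableFormatted1.append(dict(zip(keys, values)))

print(tableFormatted1)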