I am having trouble handling multiple tags/attributes in one loop and appending them to a DataFrame. More specifically, it concerns the places loop:
for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
    place = car_item.find('h3', {'class': 'heading'}).text.strip()
    places.append(place)
Appending it to the DataFrame yields only 1 result instead of the expected 30.
Thank you in advance.
import requests
import bs4
import pandas as pd

frames = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        sub_soup_txt = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = sub_soup_txt.find('div', {'id': 'car-attributes'})
        soup2 = sub_soup_txt.find('div', {'id': 'vip-seller'})
        tmp = []
        places = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            place = car_item.find('h3', {'class': 'heading'}).text.strip()
            places.append(place)
        frames.append(pd.DataFrame(tmp).set_index(0))

df_final = pd.concat((tmp_df for tmp_df in frames), axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final['Places'] = pd.Series(places)
df_final.to_csv('auto_database.csv')
Since you are adding places to the final df, this line (currently nested inside both for pagenumber in ... and for car in ...):
places = []
should be moved all the way up, out of the main for loop, next to frames:
frames = []
places = []
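For reference, a sketch of the full script with only that change applied (untested against the live site, since the page markup may have changed):
import requests
import bs4
import pandas as pd

frames = []
places = []  # initialised once, outside the loops, so every car's place is kept

for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        sub_soup_txt = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = sub_soup_txt.find('div', {'id': 'car-attributes'})
        soup2 = sub_soup_txt.find('div', {'id': 'vip-seller'})
        tmp = []  # per-car attribute rows are still collected locally
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            places.append(car_item.find('h3', {'class': 'heading'}).text.strip())
        frames.append(pd.DataFrame(tmp).set_index(0))

df_final = pd.concat(frames, axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final['Places'] = pd.Series(places)
df_final.to_csv('auto_database.csv')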
I'm pretty new to web scraping but enjoying it so far, so I thought I'd test myself!
I've written this query to scrape this website, but I'm just wondering whether there is a way of making it more efficient. At the moment, I've had to set the max page to 87 as this is the last page that guitars appear on. However, amps only have 15 pages of results but I'm still looping through 87. Any ideas appreciated!
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []
n = 88

# ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    for x in range(1, n):
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(x)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
        prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
        for index in range(0, len(products)):
            guitar_products.append({
                'product': products[index],
                'price': prices[index],
                'avail': avails[index]
            })

guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
Thanks
Try the following approach:
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []

# ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    page_number = 1
    while True:
        url = f"https://www.guitarguitar.co.uk/{category}page-{page_number}"
        print(url)
        page_number += 1
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        for div_product in soup.find_all('div', class_="product-inner"):
            product = div_product.find('h3', {'class': 'qa-product-list-item-title'}).get_text(strip=True)
            price = div_product.find('span', {'class': 'js-pounds'}).get_text(strip=True)
            avail = div_product.find('div', {'class': 'availability'}).get_text(strip=True)
            guitar_products.append({'product': product, 'price': price, 'avail': avail})
        # Is there a next button?
        if not soup.find('a', class_="next-page-button"):
            print("No more")
            break

guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
Improvements:
It looks for a Next button on each page and moves on to the next category when there isn't one.
It locates the <div> holding each product and then uses a single find to get each product detail. This avoids the need to build multiple lists and then join them.
It builds the URL using a Python f-string.
You can check the <h1>:
soup = BeautifulSoup(page.content, 'html.parser')
if soup.find('h1').contents[0] == 'Page Not Found':
    break
or change the loop from for to while:
is_page = True
x = 0
while is_page:
    x = x + 1
    ...
    if soup.find('h1').contents[0] == 'Page Not Found':
        is_page = False
        break
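For completeness, a minimal sketch of how that check could slot into the original loop (this assumes, as above, that a missing page is served with an <h1> reading 'Page Not Found'):
import requests
from bs4 import BeautifulSoup

guitar_products = []

for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    x = 0
    while True:
        x += 1
        url = f"https://www.guitarguitar.co.uk/{category}page-{x}"
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')

        # Stop paging through this category once the site serves its 'Page Not Found' page
        h1 = soup.find('h1')
        if h1 and h1.contents and h1.contents[0] == 'Page Not Found':
            break

        for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'}):
            guitar_products.append(product.text.strip())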
This is probably not the most elegant solution, but it is functional and straightforward: an infinite loop that ends when no products are found.
import pandas as pd
import requests
from bs4 import BeautifulSoup

guitar_products = []
n = 1

# ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    while True:
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(n)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
        prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
        for index in range(0, len(products)):
            guitar_products.append({
                'product': products[index],
                'price': prices[index],
                'avail': avails[index]
            })
        if len(products) == 0:
            n = 1
            break
        else:
            n += 1

guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
I am trying to fetch data from 7000 URLs and save the scraped info to CSV. Rather than going through all 7000 URLs in one go, how can I break the output into, say, 1000 URLs per CSV?
Below is an example of my current code. For the example I have reduced the total from 7000 URLs to 10 and the per-CSV count from 1000 URLs to 2.
import os
import time
from random import randint

import pandas as pd
import requests
from bs4 import BeautifulSoup

# 'path' is assumed to be defined elsewhere in the script
urls = ['www.1.com', 'www.2.com', 'www.3.com', 'www.4.com', 'www.5.com',
        'www.6.com', 'www.7.com', 'www.8.com', 'www.9.com', 'www.10.com']

ranks = []
names = []
prices = []

count = 0
rows_count = 0
total_index = 10
i = 1

while i < total_index:
    for url in urls[rows_count+0:rows_count+2]:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        count += 1
        print('Loop', count, f'started for {url}')

        rank = []
        name = []
        price = []

        # loop for watchlist
        for item in soup.find('div', class_='sc-16r8icm-0 bILTHz'):
            item = item.text
            rank.append(item)
        ranks.append(rank)

        # loop for ticker name
        for ticker in soup.find('h2', class_='sc-1q9q90x-0 jCInrl h1'):
            ticker = ticker.text
            name.append(ticker)
        names.append(name)

        # loop for price
        for price_tag in soup.find('div', class_='sc-16r8icm-0 kjciSH priceTitle'):
            price_tag = price_tag.text
            price.append(price_tag)
        prices.append(price)

        sleep_interval = randint(1, 2)
        print('Sleep interval ', sleep_interval)
        time.sleep(sleep_interval)

    rows_count += 2

    df = pd.DataFrame(ranks)
    df2 = pd.DataFrame(names)
    df3 = pd.DataFrame(prices)
    final_table = pd.concat([df, df2, df3], axis=1)
    final_table.columns = ['rank', 'type', 'watchlist', 'name', 'symbol', 'price', 'changes']
    final_table.to_csv(os.path.join(path, fr'summary_{rows_count}.csv'))

    i += 2
I'm seeking more experienced help with my problem, or any other way to do it.
As I understand it, you are getting one row of data from scraping each URL. A generic solution for scraping in chunks and writing to CSVs would look something like this:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup


def scrape_in_chunks(urls, scrape, chunk_size, filename_template):
    """ Apply a scraping function to a list of URLs and save as a series of CSVs with data from
    one URL on each row and chunk_size urls in each CSV file.
    """
    for i in range(0, len(urls), chunk_size):
        df = pd.DataFrame([scrape(url) for url in urls[i:i+chunk_size]])
        df.to_csv(filename_template.format(start=i, end=i+chunk_size-1))


def my_scraper(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    print(f'Scraping {url}')
    keys = ['rank', 'type', 'watchlist', 'name', 'symbol', 'price', 'changes']
    data = ([item.text for item in soup.find('div', class_='sc-16r8icm-0 bILTHz')] +
            [item.text for item in soup.find('h2', class_='sc-1q9q90x-0 jCInrl h1')] +
            [item.text for item in soup.find('div', class_='sc-16r8icm-0 kjciSH priceTitle')])
    return dict(zip(keys, data))  # You could alternatively return a dataframe or series here but dict seems simpler


# urls and path come from your original script
scrape_in_chunks(urls, my_scraper, 1000, os.path.join(path, "summary {start}-{end}.csv"))
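As a quick smoke test of the chunking logic only (a hypothetical dummy scraper, no network calls), the 10 example URLs from the question with chunk_size=2 would produce summary 0-1.csv through summary 8-9.csv:
# Dummy scrape function: exercises only the chunking/CSV-writing logic of scrape_in_chunks above
dummy_urls = [f'www.{k}.com' for k in range(1, 11)]
scrape_in_chunks(dummy_urls, lambda u: {'url': u}, 2, 'summary {start}-{end}.csv')
# -> writes summary 0-1.csv, summary 2-3.csv, ..., summary 8-9.csv in the working directory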
I have several values scraped from a website and I want to convert them to a DataFrame like:
dates        titles  links
2021-05-13   AAA     https
I use:
import pandas as pd
import requests
from bs4 import BeautifulSoup

html_link = 'https://www.ksei.co.id/publications/new-securities-registration?setLocale=en-US'
html = requests.get(html_link).text
soup = BeautifulSoup(html, 'html.parser')

titles = []
links = []
dates = []

for ultag in soup.find_all('ul', {'class': 'list-nostyle'}):
    for litag in ultag.find_all('li'):
        for dates in litag.find_all('small', {'class': 'muted'}):
            title = litag.find('h2', {'class': 'h4 no-margin'})
            link = litag.find('a', href=True)
            titles.append(title.text)
            links.append(f"https://www.pds.com.ph{link['href']}")
            dates.append(dates.text)
            #print(titles,links,dates)

dataframe = pd.DataFrame(zip(titles, links, dates), columns=['Titles', 'Links', 'Dates'])
print(dataframe)
But it only returns the first two rows, and I have no idea why. I am confused by the zip function and always get stuck turning lists into a DataFrame...
Thanks for the help in advance!
You're making this more complicated than it needs to be, and by that I mean those nested for loops.
Just grab all the "boxes" and scoop out all the parts you need. Finally, dump the list of lists to a DataFrame and you're all done!
Here's how:
import pandas as pd
import requests
from bs4 import BeautifulSoup

html_link = 'https://www.ksei.co.id/publications/new-securities-registration?setLocale=en-US'
boxes = BeautifulSoup(requests.get(html_link).text, 'html.parser').select(".box--medium")

data = []
for box in boxes:
    title = box.find("h2").getText()
    date = box.find("b").getText().replace(", ", " ")
    name = box.find("p").getText()
    link = f'https://www.pds.com.ph{box.find("a")["href"]}'
    data.append([title, date, name, link])

df = pd.DataFrame(data, columns=['Titles', 'Dates', 'Names', 'Links'])
print(df.head())
df.to_csv("your_data.csv", index=False)
Output:
               Titles  ...                                              Links
0  KSEI-3512/DIR/0521  ...  https://www.pds.com.ph/Announcement/Files/1271...
1  KSEI-3482/DIR/0521  ...  https://www.pds.com.ph/Announcement/Files/1270...
2  KSEI-7362/JKU/0521  ...  https://www.pds.com.ph/Announcement/Files/1270...
3  KSEI-3440/DIR/0521  ...  https://www.pds.com.ph/Announcement/Files/1269...
4  KSEI-3394/DIR/0521  ...  https://www.pds.com.ph/Announcement/Files/1268...

[5 rows x 4 columns]
and a .csv file is written as well.
The problem is that you have assigned the same variable name (dates) to two different things: there is a list named dates, and the loop variable in the third for loop is also called dates. So the zip function is taking the dates variable from the for loop.
html_link = 'https://www.ksei.co.id/publications/new-securities-registration?setLocale=en-US'
html = requests.get(html_link).text
soup = BeautifulSoup(html, 'html.parser')

titles = []
links = []
dates_1 = []

for ultag in soup.find_all('ul', {'class': 'list-nostyle'}):
    for litag in ultag.find_all('li'):
        for dates in litag.find_all('small', {'class': 'muted'}):
            title = litag.find('h2', {'class': 'h4 no-margin'})
            link = litag.find('a', href=True)
            titles.append(title.text)
            links.append(f"https://www.pds.com.ph{link['href']}")
            print(dates.text)
            print(dates)
            dates_1.append(dates.text)

dataframe = pd.DataFrame(zip(titles, links, dates_1), columns=['Titles', 'Links', 'Dates'])
Variables are reused, and zip will stop at the shortest iterable:
import itertools

html_link = 'https://www.ksei.co.id/publications/new-securities-registration?setLocale=en-US'
html = requests.get(html_link).text
soup = BeautifulSoup(html, 'html.parser')

titles = []
links = []
dates = []

for ultag in soup.find_all('ul', {'class': 'list-nostyle'}):
    for litag in ultag.find_all('li'):
        for dat in litag.find_all('small', {'class': 'muted'}):
            title = litag.find('h2', {'class': 'h4 no-margin'})
            link = litag.find('a', href=True)
            titles.append(title.text)
            links.append(f"https://www.pds.com.ph{link['href']}")
            dates.append(dat.text)
            #print(titles,links,dates)

dataframe = pd.DataFrame(itertools.zip_longest(titles, links, dates), columns=['Titles', 'Links', 'Dates'])
print(dataframe)
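To see the difference in isolation, here is a small self-contained illustration (toy lists, not data from the site) of zip stopping at the shortest input versus itertools.zip_longest padding the gaps with None:
import itertools

titles = ['A', 'B', 'C']
links = ['l1', 'l2', 'l3']
dates = ['2021-05-13']  # shorter list, e.g. because the variable was shadowed

# zip stops at the shortest iterable, so only one row survives
print(list(zip(titles, links, dates)))
# [('A', 'l1', '2021-05-13')]

# zip_longest keeps every row and fills the missing values with None
print(list(itertools.zip_longest(titles, links, dates)))
# [('A', 'l1', '2021-05-13'), ('B', 'l2', None), ('C', 'l3', None)]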
How can I save the multiple printed outputs to a CSV? Thanks!
page_html = driver.page_source
soup = BeautifulSoup(page_html, 'lxml')

names = soup.findAll('a', {"class": "a3H7pd r29r0b shntl"})
links = soup.findAll('a', {"class": "a3H7pd r29r0b shntl"})

for index, name in enumerate(names):
    r = name.get_text(), "https://www.google.com" + links[index]['href']
    print(r)
import pandas as pd

# Build one row per result instead of stuffing a (text, link) tuple into a single cell
df = pd.DataFrame(columns=['name', 'link'])

for index, name in enumerate(names):
    df.loc[index] = [name.get_text(), "https://www.google.com" + links[index]['href']]

df.to_csv('your_file_path_and_name.csv')
Or more concisely:
rows = [(name.get_text(), "https://www.google.com" + links[index]['href']) for index, name in enumerate(names)]
df = pd.DataFrame(rows, columns=['name', 'link'])
df.to_csv('your_file_path_and_name.csv')
I have tried to loop through the lists of lists, scrape all the links, and append them to a DataFrame as one table, but in vain.
Help will be appreciated.
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')

company_name = []
company_link = []
company_link_edit = []

company_A_subpg1 = soup.find_all(class_='dataTable')


def convert(url):
    if not url.startswith('http://'):
        return 'http:' + url
    return url


data_df = pd.DataFrame()
for sub_tab in company_A_subpg1:
    for tab in sub_tab:
        sub_table_1 = tab.find_all('a', href=True)
        company_name = [name.text.strip() for name in sub_table_1]
        company_link = [name.get('href') for name in sub_table_1]
        company_link_edit = [convert(name) for name in company_link]
        df = pd.DataFrame(
            {'Name': company_name,
             'Link': company_link_edit
             })
        data_df = pd.concat([data_df, df], sort=False)

data_df.to_csv('results_3.csv')
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')

company_name = []
company_link = []
company_link_edit = []

company_A_subpg1 = soup.find_all(class_='dataTable')


def convert(url):
    if not url.startswith('http://'):
        return 'http:' + url
    return url


for sub_tab in company_A_subpg1:
    temp = sub_tab.find('tbody')
    all_rows = temp.find_all('tr')
    for val in all_rows:
        a_tag = val.find('a', href=True)
        company_name.append(a_tag.text.strip())
        company_link_edit.append(convert(a_tag.get('href')))

print(len(company_name), len(company_link_edit))

data_df = pd.DataFrame()
df = pd.DataFrame(
    {'Name': company_name,
     'Link': company_link_edit
     })
data_df = pd.concat([data_df, df], sort=False)
print(df.shape)
data_df.to_csv('results_3.csv')
You can check the values inside the CSV file; I fetched all 200 names and links mentioned on the page.