How can I save the multiple printed outputs to a CSV? Thanks!
page_html = driver.page_source
soup = BeautifulSoup(page_html, 'lxml')
names = soup.findAll('a', {"class": "a3H7pd r29r0b shntl"})
links = soup.findAll('a', {"class": "a3H7pd r29r0b shntl"})
for index, name in enumerate(names):
    r = name.get_text(), "https://www.google.com" + links[index]['href']
    print(r)
import pandas as pd

data = {'name': [''] * len(names), 'link': [''] * len(names)}
df = pd.DataFrame(data, columns=['name', 'link'])
for index, name in enumerate(names):
    df.loc[index] = [name.get_text(), "https://www.google.com" + links[index]['href']]
df.to_csv('your_file_path_and_name.csv')
Or, more concisely:
rows = [(name.get_text(), "https://www.google.com" + links[index]['href'])
        for index, name in enumerate(names)]
df = pd.DataFrame(rows, columns=['name', 'link'])
df.to_csv('your_file_path_and_name.csv')
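By default to_csv also writes the DataFrame's integer index as an extra first column; if you don't want it in the output file, pass index=False:
df.to_csv('your_file_path_and_name.csv', index=False)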
I am trying to fetch data from 7,000 URLs and save the scraped info to CSV. Rather than going through all 7,000 URLs at once, how can I break the output into, say, 1,000 URLs per CSV?
Below is an example of my current code. For this example I have scaled it down: 10 URLs in total instead of 7,000, and 2 URLs per CSV instead of 1,000.
urls = ['www.1.com', 'www.2.com', 'www.3.com', 'www.4.com', 'www.5.com', 'www.6.com', 'www.7.com', 'www.8.com', 'www.9.com', 'www.10.com']

ranks = []
names = []
prices = []

count = 0
rows_count = 0
total_index = 10
i = 1

while i < total_index:
    for url in urls[rows_count+0:rows_count+2]:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        count += 1
        print('Loop', count, f'started for {url}')

        rank = []
        name = []
        price = []

        # loop for watchlist
        for item in soup.find('div', class_ = 'sc-16r8icm-0 bILTHz'):
            item = item.text
            rank.append(item)
        ranks.append(rank)

        # loop for ticker name
        for ticker in soup.find('h2', class_ = 'sc-1q9q90x-0 jCInrl h1'):
            ticker = ticker.text
            name.append(ticker)
        names.append(name)

        # loop for price
        for price_tag in soup.find('div', class_ = 'sc-16r8icm-0 kjciSH priceTitle'):
            price_tag = price_tag.text
            price.append(price_tag)
        prices.append(price)

        sleep_interval = randint(1, 2)
        print('Sleep interval ', sleep_interval)
        time.sleep(sleep_interval)

    rows_count += 2

    df = pd.DataFrame(ranks)
    df2 = pd.DataFrame(names)
    df3 = pd.DataFrame(prices)
    final_table = pd.concat([df, df2, df3], axis=1)
    final_table.columns = ['rank', 'type', 'watchlist', 'name', 'symbol', 'price', 'changes']
    final_table.to_csv(os.path.join(path, fr'summary_{rows_count}.csv'))

    i += 2
I'm seeking more senior assistance with my problem.
Or is there any other way to do it?
As I understand it you are getting one row of data from scraping each URL. A generic solution for scraping in chunks and writing to CSVs would look something like this:
def scrape_in_chunks(urls, scrape, chunk_size, filename_template):
    """ Apply a scraping function to a list of URLs and save as a series of CSVs with data from
    one URL on each row and chunk_size urls in each CSV file.
    """
    for i in range(0, len(urls), chunk_size):
        df = pd.DataFrame([scrape(url) for url in urls[i:i+chunk_size]])
        df.to_csv(filename_template.format(start=i, end=i+chunk_size-1))
def my_scraper(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    print(f'Scraping started for {url}')
    keys = ['rank', 'type', 'watchlist', 'name', 'symbol', 'price', 'changes']
    data = ([item.text for item in soup.find('div', class_ = 'sc-16r8icm-0 bILTHz')] +
            [item.text for item in soup.find('h2', class_ = 'sc-1q9q90x-0 jCInrl h1')] +
            [item.text for item in soup.find('div', class_ = 'sc-16r8icm-0 kjciSH priceTitle')])
    return dict(zip(keys, data))  # You could alternatively return a dataframe or series here, but a dict seems simpler
scrape_in_chunks(urls, my_scraper, 1000, os.path.join(path, "summary {start}-{end}.csv"))
I have tried to scrape the table at http://www.geonames.org/search.html?q=kwadukuza&country=ZA, however only the last row of the table appears.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'http://www.geonames.org/search.html?q=kwadukuza&country=ZA'
requests.get(url)
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table_data = soup.find('table', class_ = "restable")

headers = []
for i in table_data.find_all('th'):
    title = i.text.strip()
    headers.append(title)

df = pd.DataFrame(columns = headers)

for j in table_data.find_all('tr', class_='odd'):
    row_data = j.find_all('td')
row = [tr.text.strip() for tr in row_data]
You can use a separate list to append the row data to, building a list of lists, and then use that as the rows for your df:
all_rows = []
for j in table_data.find_all('tr', class_="odd"):
    row_data = j.find_all('td')
    row = [tr.text.strip() for tr in row_data]
    all_rows.append(row)
For DataFrame:
df = pd.DataFrame(columns=headers, data=all_rows)
Output:
df.shape
(25, 6)
As the comment already says, you need to put the row = [tr.text.strip() for tr in row_data] line inside the for loop; otherwise you only get the last entry.
In order to add the rows to the DataFrame, build a list of all rows and pass it, together with the headers, to the DataFrame constructor. You could also append the rows to the DataFrame one by one, but that is less efficient.
Solution
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'http://www.geonames.org/search.html?q=kwadukuza&country=ZA'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table_data = soup.find('table', class_ = "restable")

headers = []
for i in table_data.find_all('th'):
    title = i.text.strip()
    headers.append(title)

data = []
for j in table_data.find_all('tr', class_='odd'):
    row_data = j.find_all('td')
    row = [tr.text.strip() for tr in row_data]  # Put into the for loop
    data.append(row)

# DataFrame
df = pd.DataFrame(columns=headers, data=data)
print(df)
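As a side note, when you only need the table's contents, pandas can often parse an HTML table directly. This is a minimal sketch, not part of the answer above, assuming lxml is installed and that geonames serves the same markup to pandas' default fetcher:

import pandas as pd

# read_html returns one DataFrame per matching <table>; attrs narrows it to class="restable"
tables = pd.read_html('http://www.geonames.org/search.html?q=kwadukuza&country=ZA',
                      attrs={'class': 'restable'})
df = tables[0]
print(df.shape)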
I have tried to loop through the lists of lists, scrape all the links, and append them to a DataFrame as one table, but in vain.
Help will be appreciated.
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')

company_name = []
company_link = []
company_link_edit = []

company_A_subpg1 = soup.find_all(class_='dataTable')

def convert(url):
    if not url.startswith('http://'):
        return 'http:' + url
    return url

data_df = pd.DataFrame()
for sub_tab in company_A_subpg1:
    for tab in sub_tab:
        sub_table_1 = tab.find_all('a', href=True)
        company_name = [name.text.strip() for name in sub_table_1]
        company_link = [name.get('href') for name in sub_table_1]
        company_link_edit = [convert(name) for name in company_link]
        df = pd.DataFrame(
            {'Name': company_name,
             'Link': company_link_edit
            })
        data_df = pd.concat([data_df, df], sort=False)
data_df.to_csv('results_3.csv')
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')

company_name = []
company_link = []
company_link_edit = []

company_A_subpg1 = soup.find_all(class_='dataTable')

def convert(url):
    if not url.startswith('http://'):
        return 'http:' + url
    return url

for sub_tab in company_A_subpg1:
    temp = sub_tab.find('tbody')
    all_rows = temp.find_all('tr')
    for val in all_rows:
        a_tag = val.find('a', href=True)
        company_name.append(a_tag.text.strip())
        company_link_edit.append(convert(a_tag.get('href')))

print(len(company_name), len(company_link_edit))

data_df = pd.DataFrame()
df = pd.DataFrame(
    {'Name': company_name,
     'Link': company_link_edit
    })
data_df = pd.concat([data_df, df], sort=False)
print(df.shape)
data_df.to_csv('results_3.csv')
You can check the values inside the CSV file; I fetched all 200 names and links mentioned on the page.
I am having trouble dealing with multiple tags/attributes in one loop and appending them to the DataFrame. More specifically, it concerns the Place loop:
for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
    place = car_item.find('h3', {'class':'heading'}).text.strip()
    places.append(place)
Appending it to the DataFrame yields only 1 result out of the expected 30.
Thank you in advance.
import requests
import bs4
import pandas as pd

frames = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        sub_soup_txt = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = sub_soup_txt.find('div', {'id': 'car-attributes'})
        soup2 = sub_soup_txt.find('div', {'id': 'vip-seller'})
        tmp = []
        places = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            place = car_item.find('h3', {'class':'heading'}).text.strip()
            places.append(place)
        frames.append(pd.DataFrame(tmp).set_index(0))

df_final = pd.concat((tmp_df for tmp_df in frames), axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final['Places'] = pd.Series(places)
df_final.to_csv('auto_database.csv')
As you are adding places to the final df, this line (currently sitting inside for pagenumber in ... for car in ...):
places = []
should go all the way up and out of the main for loop, here:
frames = []
places = []
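A toy illustration of why the placement matters; the page lists below are made up and nothing is scraped, they just mimic the loop structure:

pages = [['dealer A', 'dealer B'], ['dealer C', 'dealer D']]

# Buggy: re-creating the list inside the outer loop keeps only the last iteration's entries
for page in pages:
    places = []
    for item in page:
        places.append(item)
print(places)  # ['dealer C', 'dealer D']

# Fixed: initialise the list once, before the loops, so entries accumulate across iterations
places = []
for page in pages:
    for item in page:
        places.append(item)
print(places)  # ['dealer A', 'dealer B', 'dealer C', 'dealer D']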
Can anyone explain to me why this code does not create a pandas df?
I expect a df named 'sector_tickers', why does none get created?
def scrape_list(site):
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib.request.Request(site, headers=hdr)
    page = urllib.request.urlopen(req)
    soup = BeautifulSoup(page)
    table = soup.find('table', {'class': 'wikitable sortable'})
    sector_tickers = dict()
    for row in table.findAll('tr'):
        col = row.findAll('td')
        if len(col) > 0:
            sector = str(col[3].string.strip()).lower().replace(' ', '_')
            ticker = str(col[0].string.strip())
            if sector not in sector_tickers:
                sector_tickers[sector] = list()
            sector_tickers[sector].append(ticker)
    return sector_tickers
Thank you for your help. This is driving me crazy.
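For what it's worth, nothing in scrape_list ever touches pandas: it builds and returns a plain dict mapping each sector to a list of tickers, and sector_tickers only exists once you call the function and keep the result. A minimal sketch of turning the returned dict into a DataFrame, assuming the usual imports and with a placeholder URL standing in for the actual page being scraped:

import pandas as pd

sector_tickers = scrape_list('https://en.wikipedia.org/wiki/...')  # placeholder, use the page you are scraping
# Flatten the {sector: [tickers]} dict into one (sector, ticker) row per ticker
df = pd.DataFrame(
    [(sector, ticker) for sector, tickers in sector_tickers.items() for ticker in tickers],
    columns=['sector', 'ticker'],
)
print(df.head())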