A script using pandas for web scraping - python

I've been working on a script that reads in a .csv file with a list of websites in the first column. I want to loop through and parse the HTML for every link in the website column of the .csv file; the HTML tags are the same for every link. There are 200 rows, but when I run my code, I only get data for the last row of the column, so row 200 instead of rows 1-200. I believe the issue may stem from my for loop and my use of iterrows, but I'm not experienced enough in Python/pandas.
'''
df = pd.read_csv('~/Documents/websites.csv', usecols=['website'],
                 delimiter=',')
url = df['website']
df.dropna(subset=['website'])
data = []
df.to_csv(header=True, index=False, path_or_buf='/Users/Desktop/scraped_data.csv')
print(df)

for index, row in df.iterrows():
    website = row['website']
    response = requests.get(website)
    content = response.text
    soup = BeautifulSoup(content, 'html.parser')
    result = soup.find('div', class_='KeyStatisticsCard_support-card__fK7N2')
    tag1 = soup.find('span', class_='KeyStatisticsCard_field-text__GtuGd')
    tag2 = soup.find('div', class_='ant-col KeyStatisticsCard_field-info__gYdfV')
    tag3 = soup.find('a', class_='KeyStatisticsCard_ellipsis__TE9tk')
    tag4 = soup.find('span', class_='KeyStatisticsCard_ellipsis__TE9tk')

d = {'tag1': tag1.text, 'tag2': tag2.text, 'tag3': tag3.text,
     'tag4': tag4.text if tag4 else None}
data.append(d)

# convert to a pandas df
data_df = pd.DataFrame(data)
'''

Try moving the d = {...} and data.append(d) lines inside the for loop. Because they aren't part of the loop body, you only keep the data from the most recent iteration of the loop.
for index, row in df.iterrows():
    website = row['website']
    response = requests.get(website)
    content = response.text
    soup = BeautifulSoup(content, 'html.parser')
    result = soup.find('div', class_='KeyStatisticsCard_support-card__fK7N2')
    tag1 = soup.find('span', class_='KeyStatisticsCard_field-text__GtuGd')
    tag2 = soup.find('div', class_='ant-col KeyStatisticsCard_field-info__gYdfV')
    tag3 = soup.find('a', class_='KeyStatisticsCard_ellipsis__TE9tk')
    tag4 = soup.find('span', class_='KeyStatisticsCard_ellipsis__TE9tk')
    d = {'tag1': tag1.text, 'tag2': tag2.text, 'tag3': tag3.text,
         'tag4': tag4.text if tag4 else None}
    data.append(d)

# convert to a pandas df
data_df = pd.DataFrame(data)

Check the indentation. It looks like both of these lines are outside of the for loop:
d = {'tag1': tag1.text, 'tag2': tag2.text, 'tag3': tag3.text,
     'tag4': tag4.text if tag4 else None}
data.append(d)
So you only put the last entry into the list data. Regarding your error message, you can add a check for None before calling the .text attribute; a None result means the class you were looking for does not exist on that URL:
tag3 = soup.find('a', class_='KeyStatisticsCard_ellipsis__TE9tk')
tag4 = soup.find('span', class_='KeyStatisticsCard_ellipsis__TE9tk')

if tag1 is not None:
    tag1 = tag1.text
if tag2 is not None:
    tag2 = tag2.text
if tag3 is not None:
    tag3 = tag3.text
if tag4 is not None:
    tag4 = tag4.text

d = {'tag1': tag1, 'tag2': tag2, 'tag3': tag3,
     'tag4': tag4 if tag4 else None}
data.append(d)

# convert to a pandas df
data_df = pd.DataFrame(data)
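Putting both fixes together (the loop indentation and the None checks), a minimal sketch of the full script could look like this; the class names are copied from the question, and the CSV paths are assumptions:

import pandas as pd
import requests
from bs4 import BeautifulSoup

df = pd.read_csv('~/Documents/websites.csv', usecols=['website'])
df = df.dropna(subset=['website'])   # dropna returns a new frame, so reassign it

data = []
for index, row in df.iterrows():
    response = requests.get(row['website'])
    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when a class is missing, so guard every .text access
    tags = {
        'tag1': soup.find('span', class_='KeyStatisticsCard_field-text__GtuGd'),
        'tag2': soup.find('div', class_='ant-col KeyStatisticsCard_field-info__gYdfV'),
        'tag3': soup.find('a', class_='KeyStatisticsCard_ellipsis__TE9tk'),
        'tag4': soup.find('span', class_='KeyStatisticsCard_ellipsis__TE9tk'),
    }
    # both the dict and the append stay inside the loop: one entry per website
    data.append({k: (v.text if v is not None else None) for k, v in tags.items()})

data_df = pd.DataFrame(data)
data_df.to_csv('scraped_data.csv', index=False)   # output path is an assumption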

Related

Python break loop into several sections

I am trying to fetch data from 7000 URLs and save the scraped info to CSV. Rather than going through all 7000 URLs at once, how can I break the output into, let's say, 1000 URLs per CSV?
Below is an example of my current code. For the example I have changed the total from 7000 URLs to 10, and from 1000 URLs per CSV to 2.
urls = ['www.1.com', 'www.2.com', 'www.3.com', 'www.4.com', 'www.5.com',
        'www.6.com', 'www.7.com', 'www.8.com', 'www.9.com', 'www.10.com']

ranks = []
names = []
prices = []

count = 0
rows_count = 0
total_index = 10
i = 1

while i < total_index:
    for url in urls[rows_count+0:rows_count+2]:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        count += 1
        print('Loop', count, f'started for {url}')

        rank = []
        name = []
        price = []

        # loop for watchlist
        for item in soup.find('div', class_='sc-16r8icm-0 bILTHz'):
            item = item.text
            rank.append(item)
        ranks.append(rank)

        # loop for ticker name
        for ticker in soup.find('h2', class_='sc-1q9q90x-0 jCInrl h1'):
            ticker = ticker.text
            name.append(ticker)
        names.append(name)

        # loop for price
        for price_tag in soup.find('div', class_='sc-16r8icm-0 kjciSH priceTitle'):
            price_tag = price_tag.text
            price.append(price_tag)
        prices.append(price)

        sleep_interval = randint(1, 2)
        print('Sleep interval ', sleep_interval)
        time.sleep(sleep_interval)

    rows_count += 2

    df = pd.DataFrame(ranks)
    df2 = pd.DataFrame(names)
    df3 = pd.DataFrame(prices)
    final_table = pd.concat([df, df2, df3], axis=1)
    final_table.columns = ['rank', 'type', 'watchlist', 'name', 'symbol', 'price', 'changes']
    final_table.to_csv(os.path.join(path, fr'summary_{rows_count}.csv'))

    i += 2
I am seeking some assistance with this problem, or is there another way to do it?
As I understand it, you are getting one row of data from scraping each URL. A generic solution for scraping in chunks and writing the results to CSVs would look something like this:
def scrape_in_chunks(urls, scrape, chunk_size, filename_template):
    """Apply a scraping function to a list of URLs and save the results as a series of CSVs,
    with data from one URL on each row and chunk_size URLs in each CSV file.
    """
    for i in range(0, len(urls), chunk_size):
        df = pd.DataFrame([scrape(url) for url in urls[i:i+chunk_size]])
        df.to_csv(filename_template.format(start=i, end=i+chunk_size-1))

def my_scraper(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    print(f'Scrape started for {url}')
    keys = ['rank', 'type', 'watchlist', 'name', 'symbol', 'price', 'changes']
    data = ([item.text for item in soup.find('div', class_='sc-16r8icm-0 bILTHz')] +
            [item.text for item in soup.find('h2', class_='sc-1q9q90x-0 jCInrl h1')] +
            [item.text for item in soup.find('div', class_='sc-16r8icm-0 kjciSH priceTitle')])
    return dict(zip(keys, data))  # You could alternatively return a dataframe or series here, but a dict seems simpler

scrape_in_chunks(urls, my_scraper, 1000, os.path.join(path, "summary {start}-{end}.csv"))

Web scraping - scraping data for multiple URLs gives None

1) I am trying to scrape data for multiple URLs stored in a CSV, but the result gives None.
2) I want to store the fetched data row by row in a dataframe named df, but it only stores one row.
Here's my code (I have pasted it below from where the data extraction starts):
import csv

df = pd.DataFrame()
with open('test1.csv', newline='', encoding='utf-8-sig') as f:
    reader = csv.reader(f)
    for line in reader:
        link = line[0]
        print(type(link))
        print(link)
        driver.get(link)
        height = driver.execute_script("return document.body.scrollHeight")
        for scrol in range(100, height, 100):
            driver.execute_script(f"window.scrollTo(0,{scrol})")
            time.sleep(0.2)
        src = driver.page_source
        soup = BeautifulSoup(src, 'lxml')
        name_div = soup.find('div', {'class': 'flex-1 mr5'})
        name_loc = name_div.find_all('ul')
        name = name_loc[0].find('li').get_text().strip()
        loc = name_loc[1].find('li').get_text().strip()
        connection = name_loc[1].find_all('li')
        connection = connection[1].get_text().strip()
        exp_section = soup.find('section', {'id': 'experience-section'})
        exp_section = exp_section.find('ul')
        div_tag = exp_section.find('div')
        a_tag = div_tag.find('a')
        job_title = a_tag.find('h3').get_text().strip()
        company_name = a_tag.find_all('p')[1].get_text().strip()
        joining_date = a_tag.find_all('h4')[0].find_all('span')[1].get_text().strip()
        exp = a_tag.find_all('h4')[1].find_all('span')[1].get_text().strip()
        df['name'] = [name]
        df['location'] = [loc]
        df['connection'] = [connection]
        df['company_name'] = [company_name]
        df['job_title'] = [job_title]
        df['joining_date'] = [joining_date]
        df['tenure'] = [exp]
df
output -
name location connection company_name job_title joining_date tenure
0 None None None None None None None
I am not sure whether the for loop goes wrong or what the exact problem is, but for a single URL it works fine.
I am using Beautiful Soup for the first time, so I don't have much experience with it. Please help me make the desired changes. Thanks.
I don't think the end of your code is appending new rows to the dataframe.
Try replacing df['name'] = [name] and the other assignment lines with the following:
new_line = {
    "name": [name],
    "location": [loc],
    "connection": [connection],
    "company_name": [company_name],
    "job_title": [job_title],
    "joining_date": [joining_date],
    "tenure": [exp],
}
temp_df = pd.DataFrame.from_dict(new_line)
df = df.append(temp_df)   # append returns a new frame, so reassign it
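Note that DataFrame.append is deprecated in recent pandas versions (and removed in pandas 2.0). A sketch of the same idea that collects one dictionary per profile in a plain list and builds the dataframe once at the end, with placeholder values standing in for the scraped fields and hypothetical links, would be:

import pandas as pd

rows = []   # one dict per scraped profile
for link in ['https://example.com/profile1', 'https://example.com/profile2']:   # hypothetical links
    # ... run the scraping code from the question here to fill these variables ...
    name, loc, connection = 'Jane Doe', 'Berlin', '500+'        # placeholders for illustration
    company_name, job_title = 'Acme', 'Engineer'
    joining_date, exp = 'Jan 2020', '2 yrs'

    rows.append({
        'name': name,
        'location': loc,
        'connection': connection,
        'company_name': company_name,
        'job_title': job_title,
        'joining_date': joining_date,
        'tenure': exp,
    })

df = pd.DataFrame(rows)   # one row per URL, built in a single call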

Troubles appending list to a DataFrame

I am having trouble dealing with multiple tags/attributes in one loop and appending them to the DataFrame. More specifically, it concerns the place loop:
for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
    place = car_item.find('h3', {'class': 'heading'}).text.strip()
    places.append(place)
Appending it to the DataFrame yields only 1 result instead of the expected 30.
Thank you in advance.
import requests
import bs4
import pandas as pd

frames = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        sub_soup_txt = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = sub_soup_txt.find('div', {'id': 'car-attributes'})
        soup2 = sub_soup_txt.find('div', {'id': 'vip-seller'})
        tmp = []
        places = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            place = car_item.find('h3', {'class': 'heading'}).text.strip()
            places.append(place)
        frames.append(pd.DataFrame(tmp).set_index(0))

df_final = pd.concat((tmp_df for tmp_df in frames), axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final['Places'] = pd.Series(places)
df_final.to_csv('auto_database.csv')
Since you only add places to the final df after all the loops are done, this line (currently sitting inside for pagenumber in ... / for car in ..., where it is reset for every car):

places = []

should move all the way up, out of the main for loop, next to frames:

frames = []
places = []
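To see why the placement matters, here is a tiny, self-contained sketch: re-initialising the list inside the loop throws away everything collected on earlier iterations, while initialising it once before the loop lets it accumulate.

# re-initialised inside the loop: only the last iteration survives
for page in range(3):
    places_wrong = []              # wiped on every pass
    places_wrong.append(f'place-{page}')
print(places_wrong)                # ['place-2']

# initialised once, before the loop: everything accumulates
places = []
for page in range(3):
    places.append(f'place-{page}')
print(places)                      # ['place-0', 'place-1', 'place-2']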

Writing different columns of a Pandas DataFrame in one row?

I have scraped a website to extract the shoe and clothing prices, their image IDs, image URLs, and some other features. I succeeded in writing the dataframe to a CSV file, but I realized that the dataframe writes every feature on a different row, while they should all be gathered in one row. I have shown a sample output from my CSV file below.
Any suggestions on how to change the code?
from bs4 import BeautifulSoup
import requests
import re
import csv
import pandas as pd
import os
import urllib.request

df = pd.DataFrame(columns=['PostID', 'Description', 'Kind', 'Price', 'ImageID', 'ImageURL'])

def scraping():
    global h, df
    with open("/home/user/Documents/file.txt") as f:
        urls = f.readlines()
        urls = [s.strip('\n') for s in urls]
        code_list = []
        for url in urls:
            code = url.split('/')[-1]
            code_list.append(code)
            df = df.append({'PostID': code}, ignore_index=True)
            for br in soup.find_all("br"):
                br.replace_with("\n")
            try:
                description = soup.find('div', attrs={'class': 'ui fluid card post-description'}).find('div', attrs={'class': 'content'})
                print(description.text)
                df = df.append({'Description': description.text}, ignore_index=True)
                item_list = []
                items = soup.find_all('span', attrs={'class': 'item__title'})
                for i in items:
                    item_list.append(i.text)
                item_list.pop(0)
                value_list = []
                values = soup.find_all('div', attrs={'class': 'value'})
                for v in values:
                    value_list.append(v.text)
                my_dictionary = {}
                for i in range(1, 3):
                    my_dictionary[item_list[i]] = value_list[i]
                df = df.append({'Kind': my_dictionary['نوع آگهی']}, ignore_index=True)
                df = df.append({'Price': my_dictionary['قیمت']}, ignore_index=True)
                imageresult = []
                path = '/home/user/images'
                images = soup.find_all('img')
                for img in images:
                    imgID = img.get('src').split('/')[-1]
                    df = df.append({'ImageID': imgID}, ignore_index=True)
                    df = df.append({'ImageURL': img.get('src')}, ignore_index=True)
                    urllib.request.urlretrieve(img.get('src'), os.path.join(my_path, os.path.basename(img.get('src'))))
                    print(imgID + img.get('src'))
                else:
                    break
            except:
                print("your URL is invalid :" + url)

scraping()
df.to_csv('divartest14.csv', index=False, encoding='utf-8')
PostID     Description     Kind      Price   ImageID
QXZ5RjZj
           adidas shoes
                           feminine
                                     100$
                                             QXZ5RjZj.jpg
That will continue to happen because each append call adds a separate row to the dataframe, so every field ends up in its own row. I'd suggest passing all of the items you want in one row as a single dictionary, i.e.:

df = df.append({'c1': 1, 'c2': 2, 'c3': 3}, ignore_index=True)  # ...etc
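Applied to the question's scraping() function, that means collecting all of a post's fields into one dictionary and adding it as a single row per URL. A minimal sketch, with placeholder values standing in for the scraped fields and a plain list of dicts instead of the now-deprecated append:

import pandas as pd

rows = []
for url in ['https://example.com/post/QXZ5RjZj']:   # hypothetical URL list
    # ... scrape the page and extract each field, as in the question ...
    rows.append({
        'PostID': url.split('/')[-1],
        'Description': 'adidas shoes',               # placeholder values for illustration
        'Kind': 'feminine',
        'Price': '100$',
        'ImageID': 'QXZ5RjZj.jpg',
        'ImageURL': 'https://example.com/QXZ5RjZj.jpg',
    })

df = pd.DataFrame(rows)   # every field of a post lands on the same row
df.to_csv('divartest14.csv', index=False, encoding='utf-8')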

Python scrape, skipping a <tr> tag and row

Scraping a webpage and encountering an "IndexError: list index out of range". I'm pretty sure it's because a row in the table I am scraping is being used as a header - http://www.wsj.com/mdc/public/page/2_3022-mfsctrscan-moneyflow-20161205.html?mod=mdc_pastcalenda
from urllib2 import urlopen
import requests
from bs4 import BeautifulSoup
import re
import datetime

date = datetime.datetime.today()
url = "http://www.wsj.com/mdc/public/page/2_3022-mfsctrscan-moneyflow-20161205.html?mod=mdc_pastcalendar"
date_time = urlopen(url.format(date=date.strftime('%Y%m%d')))
address = url

print 'Retrieving information from: ' + address
print '\n'

soup = BeautifulSoup(requests.get(address).content, "lxml")
div_main = soup.find('div', {'id': 'column0'})
table_one = div_main.find('table')
rows = table_one.findAll('tr')
if len(soup.findAll('tr')) > 0:
    rows = rows[2:]
#print rows

for row in rows:
    cells = row.findAll('td')
    name = cells[0].get_text()
    last = cells[1].get_text()
    chg = cells[2].get_text()
    pct_chg = cells[3].get_text()
    money_flow = cells[4].get_text()
    tick_up = cells[5].get_text()
    tick_down = cells[6].get_text()
    up_down_Ratio = cells[7].get_text()
    money_flow = cells[8].get_text()
    tick_up = cells[9].get_text()
    tick_down = cells[10].get_text()
    up_down_Ratio = cells[11].get_text()
The intermediate rows with a single cell, like "Dow Jones U.S. Total Stock Market Sectors", are the reason you are getting this error.
Instead, why not pre-define a list of headers and dynamically create a dictionary by zipping the values of each "data" row with that list of headers:
rows = soup.select('div#column0 table tr')[2:]
headers = ['name', 'last', 'chg', 'pct_chg',
           'total_money_flow', 'total_tick_up', 'total_tick_down', 'total_up_down_ratio',
           'block_money_flow', 'block_tick_up', 'block_tick_down', 'block_up_down_ratio']
for row in rows:
    # skip non-data rows
    if row.find("td", class_="pnum") is None:
        continue
    print(dict(zip(headers, [cell.get_text(strip=True) for cell in row.find_all('td')])))
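If you want those rows in a CSV rather than printed, a sketch that collects the same dictionaries into a pandas DataFrame (reusing rows and headers from the snippet above; the output filename is an assumption) would be:

import pandas as pd

data = []
for row in rows:
    if row.find("td", class_="pnum") is None:   # same non-data-row filter as above
        continue
    data.append(dict(zip(headers, [cell.get_text(strip=True) for cell in row.find_all('td')])))

df = pd.DataFrame(data, columns=headers)
df.to_csv('money_flow.csv', index=False)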
div_main = soup.find('div', {'id': 'column0'})
table_one = div_main.find('table')

# to id the right row
def target_row(tag):
    is_row = len(tag.find_all('td')) > 5
    row_name = tag.name == 'tr'
    return is_row and row_name

rows = table_one.find_all(target_row)
for row in rows:
    cells = row.findAll('td')
    name = cells[0].get_text()
    last = cells[1].get_text()
    chg = cells[2].get_text()
    pct_chg = cells[3].get_text()
    money_flow = cells[4].get_text()
    tick_up = cells[5].get_text()
    tick_down = cells[6].get_text()
    up_down_Ratio = cells[7].get_text()
    money_flow = cells[8].get_text()
    tick_up = cells[9].get_text()
    tick_down = cells[10].get_text()
    up_down_Ratio = cells[11].get_text()
You can use a function that returns a bool as find's parameter; this way, your code is much cleaner and more maintainable.
