Passing Selenium data results into Pandas - python

I am trying to automate a search which returns a table of information. I am able to print the results as .text, but my question is how I can pass the results into a Pandas DataFrame. The reason I am asking is twofold: I want to write the results to a CSV file, and I need them in Pandas for data analysis later on. I'd appreciate it if anyone could help. My code is below:
import time
from selenium import webdriver
import pandas as pd
search = ['0501020210597400','0501020210597500','0501020210597600']
df = pd.DataFrame(search)
chrome_path = [Chrome Path]
driver = webdriver.Chrome(chrome_path)
driver.get('https://enquiry.mpsj.gov.my/v2/service/cuk_search/')
x = 0
while x < len(df.index):
    search_box = driver.find_element_by_name('sel_value')
    new_line = (df[0][x]).format(x)
    search_box.send_keys(new_line)
    search_box.submit()
    time.sleep(5)
    table = driver.find_elements_by_class_name('tr-body')
    for data in table:
        print(data.text)
    driver.find_element_by_name('sel_value').clear()
    x += 1
driver.close()

To load a CSV file to a DataFrame, you can do:
df = pd.read_csv('example.csv')
See the online doc: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html#pandas.read_csv
To write the data to CSV, see the SO question Pandas writing dataframe to CSV file.
The solution is:
df.to_csv(file_name, sep='\t')
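To get the Selenium results themselves into a DataFrame instead of only printing them, one option is to collect each row's cell text into a list and build the frame once the loop finishes. A minimal sketch based on the loop in the question (it assumes each tr-body row holds its values in td cells; column names are left to pandas' defaults):
import pandas as pd

rows = []

# inside the existing while loop, instead of printing each row:
for data in driver.find_elements_by_class_name('tr-body'):
    cells = [td.text for td in data.find_elements_by_tag_name('td')]
    rows.append(cells)

# after the while loop finishes, build the DataFrame and write it out
results = pd.DataFrame(rows)  # pass columns=[...] if you know the header names
results.to_csv('results.csv', index=False)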

You can use requests and do a POST to get the info, rather than using Selenium:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
search = ['0501020210597400','0501020210597500','0501020210597600']
headers = {'Referer' : 'https://enquiry.mpsj.gov.my/v2/service/cuk_search/1',
           'User-Agent' : 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
           }
output = []
dfHeaders = ['No.', 'No. Akaun', 'Nama Di Bil', 'Jumlah Perlu Dibayar', '']

with requests.Session() as s:
    for item in search:
        r = s.get('https://enquiry.mpsj.gov.my/v2/service/cuk_search/1', headers=headers)
        soup = bs(r.content, 'lxml')
        key = soup.select_one('[name=ACCESS_KEY]')['value']
        body = {'sel_input': 'no_akaun', 'sel_value': item, 'ACCESS_KEY': key}
        res = s.post('https://enquiry.mpsj.gov.my/v2/service/cuk_search_submit/', data=body)
        soup = bs(res.content, 'lxml')
        table = soup.select_one('.tbl-list')
        rows = table.select('.tr-body')
        for row in rows:
            cols = row.find_all('td')
            cols = [c.text.strip() for c in cols]
            output.append([c for c in cols if c])
df = pd.DataFrame(output, columns = dfHeaders)
print(df)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )

Related

Looping through the page numbers with Python BeautifulSoup

I'm attempting to update my script so that it searches through not only the URL provided but all of the pages in the range 1-3 and adds them to the CSV. Can anyone spot why my current code isn't working? Pages after the first are addressed in the following format: page-2
from bs4 import BeautifulSoup
import requests
from csv import writer
from random import randint
from time import sleep
#example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url= "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
for page in range(1, 4):
    req = requests.get(url + 'page-' + str(page), headers=headers)
    # print(page)

soup = BeautifulSoup(req.content, 'html.parser')
lists = soup.find_all('li', class_="pp-property-box")

with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Address', 'Price']
    thewriter.writerow(header)

    for list in lists:
        title = list.find('h2').text
        price = list.find('p', class_="pp-property-price").text
        info = [title, price]
        thewriter.writerow(info)
        sleep(randint(2,10))
You are overwriting req multiple times and end up only analyzing the results of page 2. Put everything inside your loop.
Edit: also, the upper limit in range() is not included, so you probably want to do for page in range(1, 4): to get the first three pages.
Edit, full example:
from bs4 import BeautifulSoup
import requests
from csv import writer
url = "https://www.propertypal.com/property-for-sale/ballymena-area/page-"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Address', 'Price']
    thewriter.writerow(header)

    for page in range(1, 4):
        req = requests.get(f"{url}{page}", headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')

        for li in soup.find_all('li', class_="pp-property-box"):
            title = li.find('h2').text
            price = li.find('p', class_="pp-property-price").text
            info = [title, price]
            thewriter.writerow(info)
The solution from bitflip is fine; however, there are a few things I'll point out to help you.
Try to avoid variable names that are predefined functions in Python, for example list.
While the csv writer is a fine package to use, also consider using pandas. You will likely need to do some data manipulation further down the road, so you might as well familiarise yourself with the package now. It's a very powerful tool.
Here's how I would have coded it.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
from time import sleep
from os.path import exists
#example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url= "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
# Check if csv file exists
file_exists = exists('ballymena.csv')
for page in range(1, 4):
    rows = []
    req = requests.get(url + 'page-' + str(page), headers=headers)
    # print(page)
    soup = BeautifulSoup(req.content, 'html.parser')
    lists = soup.find_all('li', class_="pp-property-box")
    for li in lists:
        title = li.find('h2').text
        price = li.find('p', class_="pp-property-price").text
        row = {
            'Address': title,
            'Price': price}
        rows.append(row)
    df = pd.DataFrame(rows)
    # If the file doesn't exist yet, write the initial file
    if not file_exists:
        df.to_csv('ballymena.csv', index=False)
        file_exists = True
    # If it already exists, append to the file
    else:
        df.to_csv('ballymena.csv', mode='a', header=False, index=False)
    sleep(randint(2,10))

Scraping NSE 52 week high Table using Python

I'm relatively new to web scraping. I used Selenium and Beautiful Soup to scrape data, but I'm unable to get it working. Can someone help me get the table data from the following link, or suggest any way to download the CSV file in Python, please?
'''
print("Start")
from nsetools import Nse
import pandas as pd
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import urllib.request
nse_web = "https://www.nseindia.com/market-data/new-52-week-high-low-equity-market"
req = urllib.request.Request(
    nse_web,
    data=None,
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'}
)
#f = urllib.request.urlopen(req)
#nse_web = "https://www.nseindia.com/market-data/new-52-week-high-low-equity-market"
time.sleep(5)
html = urlopen(req)
print("open URL")
time.sleep(10)
bsObj = BeautifulSoup(html, features="lxml")
print("before_table")
time.sleep(15)
data = []
table = bsObj.find('table', attrs={'class':'common_table customHeight-table tableScroll alt_row w-100'})
print(table)
table_body = table.find('tbody')
print(table_body)
rows = table_body.find_all('tr')
print(rows)
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele]) # Get rid of empty values
print(data)
print("process complete")
'''
It seems that you are able to get the data from the table as a list of rows. To write them to a CSV file, you can use Python's csv module.
import csv
with open('result.csv', 'w', newline='') as csv_file:
    csv_obj = csv.writer(csv_file)
    for row in data:
        csv_obj.writerow(row)
You will then find result.csv in your current directory.
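Alternatively, since the rest of this page leans on pandas anyway, a minimal sketch that builds a DataFrame from the same data list and writes it out (no column names are assumed here, so pandas falls back to integer headers):
import pandas as pd

# data is the list of rows collected in the scraping loop above
df = pd.DataFrame(data)  # pass columns=[...] if you know the table's header names
df.to_csv('result.csv', index=False)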

Multiple Page BeautifulSoup Script only Pulling first value

I'm new to screen scraping and this is my first time posting on Stack Overflow. Apologies in advance for any formatting errors in this post. I'm attempting to extract data from multiple pages with URL:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
For instance, page 1 is:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-1
Page 2:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-2
and so on...
My script is running without errors. However, my Pandas exported csv only contains 1 row with the first extracted value. At the time of this posting, the first value is:
14.01 Acres   Vestaburg, Montcalm County, MI$275,000
My intent is to create a spreadsheet with hundreds of rows that pull the property description from the URLs.
Here is my code:
import requests
from requests import get
from bs4 import BeautifulSoup
headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
            }
           )
n_pages = 0
desc = []
for page in range(1,900):
    n_pages += 1
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
    r = get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="propName")
    if house_containers != []:
        for container in house_containers:
            desc = container.getText(strip=True)
    else:
        break
print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))
import pandas as pd
df = pd.DataFrame({'description': [desc]})
df.to_csv('test4.csv', encoding = 'utf-8')
I suspect the problem is with the line reading desc = container.getText(strip=True) and have tried changing the line but keep getting errors when running.
Any help is appreciated.
I believe the mistake is in the line:
desc = container.getText(strip=True)
Every time it loops, the value in desc is replaced, not added on. To add items into the list, do:
desc.append(container.getText(strip=True))
Also, since it is already a list, you can remove the brackets from the DataFrame creation like so:
df = pd.DataFrame({'description': desc})
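Putting both changes together, a minimal sketch of the corrected script (same URL, headers and selectors as in the question):
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}

n_pages = 0
desc = []
for page in range(1, 900):
    n_pages += 1
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
    r = requests.get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="propName")
    if house_containers:
        for container in house_containers:
            # append each description instead of overwriting desc
            desc.append(container.getText(strip=True))
    else:
        break

print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))
df = pd.DataFrame({'description': desc})
df.to_csv('test4.csv', encoding='utf-8')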
The cause is that no data is being accumulated inside the loop, so only the final value is saved. For testing purposes this code only goes up to page 2, so adjust the range back as needed.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
            }
           )
n_pages = 0
desc = []
all_data = pd.DataFrame(index=[], columns=['description'])
for page in range(1,3):
    n_pages += 1
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
    r = get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="propName")
    if house_containers != []:
        for container in house_containers:
            desc = container.getText(strip=True)
            df = pd.DataFrame({'description': [desc]})
            all_data = pd.concat([all_data, df], ignore_index=True)
    else:
        break
all_data.to_csv('test4.csv', encoding = 'utf-8')
print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))

How do I convert a web-scraped table into a csv?

A year ago I learned some python in one of my classes but haven't had to use much since then so this may or may not be a simple question.
I'm trying to web-scrape the top grossing films of all time table from Box Office Mojo and I want to grab the rank, title, and gross for the top 10 films in the 2010s. I've been playing around in python and I can get the entire table into python but I don't know how to manipulate it from there, let alone write out a csv file. Any guidance/tips?
Here is what will print the entire table for me (the first few lines are copied from an old web-scraping assignment to get me started):
import bs4
import requests
from bs4 import BeautifulSoup as soup
url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
page_html = requests.get(url, headers=headers)
page_soup = soup(page_html.text, "html.parser")
boxofficemojo_table = page_soup.find("div", {"class": "a-section imdb-scroll-table-inner"})
complete_table = boxofficemojo_table.get_text()
print(complete_table)`
1. You can use pd.read_html for this:
import pandas as pd
Data = pd.read_html(r'https://www.boxofficemojo.com/chart/top_lifetime_gross/')
for data in Data:
    data.to_csv('Data.csv', sep=',')
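If the goal is just the rank, title, and gross for the first ten rows, a follow-up sketch building on the read_html result (the column labels 'Rank', 'Title' and 'Lifetime Gross' are assumptions about the page's table headers; print df.columns to confirm them):
import pandas as pd

tables = pd.read_html('https://www.boxofficemojo.com/chart/top_lifetime_gross/')
df = tables[0]  # the lifetime-gross chart is expected to be the first table on the page

# the column labels below are assumptions; adjust them to match df.columns
top10 = df[['Rank', 'Title', 'Lifetime Gross']].head(10)
top10.to_csv('top10.csv', index=False)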
2. Using bs4:
import pandas as pd
from bs4 import BeautifulSoup
import requests
URL = r'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
print('\n>> Extracting Data using Beautiful Soup for: ' + URL)
try:
    res = requests.get(URL)
except Exception as e:
    print(repr(e))
print('\n<> URL present status Code = ', (res.status_code))
soup = BeautifulSoup(res.text, "lxml")
table = soup.find('table')
list_of_rows = []
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll(["td"]):
        text = cell.text
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)
for item in list_of_rows:
    ' '.join(item)
Data = pd.DataFrame(list_of_rows)
Data.dropna(axis=0, how='all', inplace=True)
print(Data.head(10))
Data.to_csv('Table.csv')

How to scrape additional pages of a webpage

With some help from the community, I was able to scrape some information off a webpage. However, I am facing some trouble scraping information off the additional pages of the website.
The code shown below is able to obtain the following information: ('date', 'type', 'registration', 'operator', 'fat.', 'location', 'cat') from each year of the webpage (from 1919 - 2019). An example of the URL by year is
https://aviation-safety.net/database/dblist.php?Year=1946
However, I realised that there are additional pages within each year's URL, such as
https://aviation-safety.net/database/dblist.php?Year=1946&lang=&page=2 https://aviation-safety.net/database/dblist.php?Year=1946&lang=&page=3 https://aviation-safety.net/database/dblist.php?Year=1946&lang=&page=4
I was wondering how to scrape the additional pages for each year.
import pandas as pd
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
#start of code
mainurl = "https://aviation-safety.net/database/"
def getAndParseURL(mainurl):
    result = requests.get(mainurl)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.find_all('a', href=True)
    return datatable

datatable = getAndParseURL(mainurl)

#go through the content and grab the URLs
links = []
for link in datatable:
    if 'Year' in link['href']:
        url = link['href']
        links.append(mainurl + url)

#check if links are in dataframe
df = pd.DataFrame(links, columns=['url'])
df.head(10)

#create empty dataframe and empty list to store urls that didn't pull a table
results_df = pd.DataFrame()
no_table = []

#Loop through the URLs retrieved previously and append to results_df
for x in df['url']:
    try:
        html = requests.get(x, headers=headers).text # <----- added headers
        table = pd.read_html(html)[0] # <---- used pandas to read in the html and parse table tags. this will return a list of dataframes and want the dataframe in position 0
        results_df = results_df.append(table, sort=True).reset_index(drop=True)
        print('Processed: %s' % x)
    except:
        print('No table found: %s' % x)
        no_table.append(x)

results_df = results_df[['date', 'type', 'registration', 'operator', 'fat.', 'location', 'cat']]
You can use BeautifulSoup to check for the <div> tag that contains the number of pages, and then it looks like you can just iterate through those. There might be a better way to do it, but I just added another try/except in there to deal with the case where no additional pages are found:
import pandas as pd
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
#start of code
mainurl = "https://aviation-safety.net/database/"
def getAndParseURL(mainurl):
    result = requests.get(mainurl)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.find_all('a', href=True)
    return datatable

datatable = getAndParseURL(mainurl)

#go through the content and grab the URLs
links = []
for link in datatable:
    if 'Year' in link['href']:
        url = link['href']
        links.append(mainurl + url)

#check if links are in dataframe
df = pd.DataFrame(links, columns=['url'])
df.head(10)

#create empty dataframe and empty list to store urls that didn't pull a table
results_df = pd.DataFrame()
no_table = []

#Loop through the URLs retrieved previously and append to results_df
for x in df['url']:
    #Check for additional pages
    try:
        html = requests.get(x, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')
        pages = soup.find('div', {'class':'pagenumbers'}).text.strip().split(' ')[-1]
        for page in range(1, int(pages)+1):
            page_x = x + '&lang=&page=%s' % page
            try:
                html = requests.get(page_x, headers=headers).text # <----- added headers
                table = pd.read_html(html)[0] # <---- used pandas to read in the html and parse table tags. this will return a list of dataframes and want the dataframe in position 0
                results_df = results_df.append(table, sort=True).reset_index(drop=True)
                print('Processed: %s' % page_x)
            except:
                print('No table found: %s' % page_x)
                no_table.append(page_x)
    except:
        try:
            html = requests.get(x, headers=headers).text # <----- added headers
            table = pd.read_html(html)[0] # <---- used pandas to read in the html and parse table tags. this will return a list of dataframes and want the dataframe in position 0
            results_df = results_df.append(table, sort=True).reset_index(drop=True)
            print('Processed: %s' % x)
        except:
            print('No table found: %s' % x)
            no_table.append(x)

results_df = results_df[['date', 'type', 'registration', 'operator', 'fat.', 'location', 'cat']]
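One caveat on both snippets: DataFrame.append was deprecated and has been removed in pandas 2.0, so on current pandas the same accumulation is usually written by collecting the per-page tables in a list and concatenating once at the end. A minimal sketch of that pattern:
import pandas as pd

tables = []  # collect each parsed table here instead of appending to results_df

# inside the scraping loop, replace results_df.append(...) with:
#     tables.append(pd.read_html(html)[0])

# after the loop, build the combined frame in one call
results_df = pd.concat(tables, ignore_index=True, sort=True) if tables else pd.DataFrame()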
