How to export web scraping data into CSV with Python

I have scraped data with BeautifulSoup and I am printing it. Now I want that data exported to Excel/CSV. My program is below. I am new to Python and need help: there are multiple pages that I have scraped, and I now need to export them to CSV/Excel.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs

def scrape_bid_data():
    page_no = 1  # initial page number
    while True:
        print('Hold on, creating URL to fetch data...')
        URL = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)  # create dynamic URL
        print('URL created: ' + URL)
        scraped_data = requests.get(URL, verify=False)  # request the page
        soup_data = bs(scraped_data.text, 'lxml')  # parse the scraped data using lxml
        extracted_data = soup_data.find('div', {'id': 'pagi_content'})  # find the div which contains the required data
        if len(extracted_data) == 0:  # if extracted_data has no children, stop further execution of the script
            break
        else:
            for idx in range(len(extracted_data)):  # loop through all the child divs and extract and print data
                if idx % 2 == 1:  # the required data sits on odd indexes only
                    bid_data = extracted_data.contents[idx].text.strip().split('\n')
                    print('-' * 100)
                    print(bid_data[0])   # BID number
                    print(bid_data[5])   # Items
                    print(bid_data[6])   # Quantity required
                    print(bid_data[10] + bid_data[12].strip())  # Department name and address
                    print(bid_data[16])  # Start date
                    print(bid_data[17])  # End date
                    print('-' * 100)
            page_no += 1  # increment the page number by 1

scrape_bid_data()
The data is coming out in the following form (sample output omitted).

You can use pandas:
pip install pandas
Each obj can be built inside your scraping loop like this:
all_bids = []  # collects one dict per bid

# inside scrape_bid_data(), right after bid_data is built for a listing:
obj = {
    "bid_data_0": bid_data[0],
    "bid_data_5": bid_data[5],
    "bid_data_6": bid_data[6],
    "bid_data_10": bid_data[10],
    "bid_data_12": bid_data[12].strip(),
    "bid_data_17": bid_data[17],
}
all_bids.append(obj)
You can format each bid_data into a dict like this and add only the required fields to it.
import pandas as pd
df = pd.DataFrame(all_bids)
df.to_csv("file_name.csv", index=True, encoding='utf-8')
It is the simplest method I have ever used for exporting data to CSV.
Let me know if you encounter any problems.
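If you'd rather not add a pandas dependency, here is a minimal sketch using the standard library's csv.DictWriter; it assumes the all_bids list and the field names from the snippet above.

import csv

# Minimal sketch, assuming the all_bids list of dicts built above.
fieldnames = ["bid_data_0", "bid_data_5", "bid_data_6",
              "bid_data_10", "bid_data_12", "bid_data_17"]
with open("file_name.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()        # one header row with the field names
    writer.writerows(all_bids)  # one CSV row per scraped bid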

Related

convert multiple strings from selenium and Beautiful soup to CSV file

I have this scraper I am trying to export as a CSV file in Google Colab. I received the scraped information as string values, but I cannot convert them to a CSV. I want each scraped attribute ("title", "size", etc.) to populate a column in a CSV file. I have run the strings through Beautiful Soup to remove the HTML formatting. Please see my code below.
import pandas as pd
import time
import io
from io import StringIO
import csv
#from google.colab import drive
#drive.mount('drive')

# Use the kora.selenium library to run chromedriver in Colab
from kora.selenium import wd
# Import BeautifulSoup to parse HTML formatting
from bs4 import BeautifulSoup

wd.get("https://www.grailed.com/sold/EP8S3v8V_w")  # Get the webpage
ScrollNumber = round(200/40) + 1
for i in range(0, ScrollNumber):
    wd.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)

#--------------#
# Each attribute has to be found by XPath because Grailed's site is rendered with JavaScript (React), not plain HTML.
# Only 39 results will show because the page uses infinite scroll and Selenium must be told to keep scrolling.
follow_loop = range(2, 200)
for x in follow_loop:
    # Title
    title = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    title += str(x)
    title += "]/a/div[3]/div[2]/p"
    title = wd.find_elements_by_xpath(title)
    title = str(title)
    # Price
    price = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    price += str(x)
    price += "]/div/div/p/span"
    price = wd.find_elements_by_xpath(price)
    price = str(price)
    # Size
    size = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    size += str(x)
    size += "]/a/div[3]/div[1]/p[2]"
    size = wd.find_elements_by_xpath(size)
    size = str(size)
    # Sold
    sold = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    sold += str(x)
    sold += "]/a/p/span"
    sold = wd.find_elements_by_xpath(sold)
    sold = str(sold)
    # Clean HTML formatting using Beautiful Soup
    cleantitle = BeautifulSoup(title, "lxml").text
    cleanprice = BeautifulSoup(price, "lxml").text
    cleansize = BeautifulSoup(size, "lxml").text
    cleansold = BeautifulSoup(sold, "lxml").text
This was a lot of work lol
from selenium import webdriver
import time
import csv

driver = webdriver.Chrome()
driver.get("https://www.grailed.com/sold/EP8S3v8V_w")
scroll_count = round(200 / 40) + 1
for i in range(scroll_count):
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)
time.sleep(3)

titles = driver.find_elements_by_css_selector("p.listing-designer")
prices = driver.find_elements_by_css_selector("p.sub-title.sold-price")
sizes = driver.find_elements_by_css_selector("p.listing-size.sub-title")
sold = driver.find_elements_by_css_selector("div.-overlay")
data = [titles, prices, sizes, sold]
data = [list(map(lambda element: element.text, arr)) for arr in data]

with open('sold_shoes.csv', 'w', newline='') as file:  # newline='' stops csv from adding blank lines
    writer = csv.writer(file)
    j = 0
    while j < len(titles):
        row = []
        for i in range(len(data)):
            row.append(data[i][j])
        writer.writerow(row)
        j += 1
My first version put a blank line between every row; passing newline='' to open(), as above, fixes that because csv.writer handles line endings itself. Also, it's a naïve solution in that it assumes every list has the same length; consider selecting the parent listing elements once and building each row from their child elements, as in the sketch below. I just used Selenium without BeautifulSoup because it's easier for me, but you should learn BS too because parsing with it is faster than pulling everything through Selenium. Happy coding.
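A minimal sketch of that parent-element approach, reusing driver and csv from the code above; "div.feed-item" is a hypothetical parent selector, not something taken from the original post, so adjust it to the site's actual markup.

listings = driver.find_elements_by_css_selector("div.feed-item")  # hypothetical parent selector
with open('sold_shoes.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['title', 'price', 'size', 'sold'])
    for listing in listings:
        row = []
        # The child selectors are the ones used in the answer above.
        for selector in ("p.listing-designer",
                         "p.sub-title.sold-price",
                         "p.listing-size.sub-title",
                         "div.-overlay"):
            elements = listing.find_elements_by_css_selector(selector)
            row.append(elements[0].text if elements else '')  # blank cell if a field is missing
        writer.writerow(row)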

How do I write my data along the columns in a CSV file?

When I write to the csv file all of my data is printed in only the first column. Using my loop, how do I iterate along the columns to write the data?
import csv
import bs4
import urllib
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup

# For sites that can't be opened due to urllib blocking, use a Mozilla user agent to get access
pageRequest = Request('https://coronavirusbellcurve.com/', headers={'User-Agent': 'Mozilla/5.0'})
htmlPage = urlopen(pageRequest).read()
page_soup = soup(htmlPage, 'html.parser')
specificDiv = page_soup.find("div", {"class": "table-responsive-xl"})
TbodyStats = specificDiv.table.tbody.tr.contents
TbodyDates = specificDiv.table.thead.tr.contents

with open('CovidHTML.csv', 'w', newline='') as file:
    theWriter = csv.writer(file)
    theWriter.writerow(['5/4', ' 5/5', ' 5/6', ' 5/7', ' 5/8', ' 5/9'])
    for i in range(3, len(TbodyStats)):
        if i % 2 != 0:
            theWriter.writerow([TbodyStats[i].text])
Another method, for reference only.
from simplified_scrapy import SimplifiedDoc, utils, req

html = req.get('https://coronavirusbellcurve.com/')
doc = SimplifiedDoc(html)
specificDiv = doc.select('div.table-responsive-xl')  # Get the first div. To get all divs, use: doc.selects('div.table-responsive-xl')
# TbodyStats = specificDiv.tbody.trs.selects('td|th').text  # Get data
# TbodyDates = specificDiv.thead.trs.selects('td|th').text  # Get dates
data = specificDiv.table.trs.selects('td|th').text  # Get everything
rows = []
for row in data:
    rows.append(row[1:])
utils.save2csv('test.csv', rows)
Result:
5/5,5/6,5/7,5/8,5/9
1213260,1237960,1266822,1294664,1314610
24423,24700,28862,27842,19946
2.05%,2.04%,2.33%,2.20%,1.54%
I think you may be able to do this (I can't test for sure because I don't have your exact data on hand):
row = []
for i in range(3, len(TbodyStats), 2):
    row.append(TbodyStats[i].text)
    if len(row) == 6:
        theWriter.writerow(row)
        row = []
I added the step of 2 to your range so you don't have to use % to find the odd-numbered indices, then built up each row until it has 6 members, flushed it to the CSV file, and emptied the row so the process can repeat. A standalone sketch of the same idea follows.
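For completeness, a minimal sketch that collects the cell texts first and then writes them in rows of six, assuming the same TbodyStats list and header dates as in the question:

import csv

# Gather every other cell's text, then chunk the flat list into rows of 6.
cells = [TbodyStats[i].text for i in range(3, len(TbodyStats), 2)]
rows = [cells[i:i + 6] for i in range(0, len(cells), 6)]
with open('CovidHTML.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['5/4', '5/5', '5/6', '5/7', '5/8', '5/9'])
    writer.writerows(rows)  # each chunk becomes one CSV row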

How to scrape all values of AJAX search table with PYTHON?

I am trying to scrape the CPU Specs Database at TechPowerUp.
I have found the table updates using AJAX and created the following code:
import requests
from bs4 import BeautifulSoup
import csv
import string

cpus = []
base = 'https://www.techpowerup.com/cpu-specs/?ajaxsrch='
letters = list(string.ascii_lowercase)
letters.extend(range(0, 10))
for i in letters:
    URL = base + str(i)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    with open('cpu2.csv', mode='a') as cpu_csv:
        headers = ['name', 'family', 'socket', 'release']
        writer = csv.DictWriter(cpu_csv, fieldnames=headers, lineterminator='\n')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            if tds[0].text.strip() not in cpus:
                writer.writerow({'name': tds[0].text.strip(), 'family': tds[1].text.strip(), 'socket': tds[4].text.strip(), 'release': tds[8].text.strip()})
                cpus.append(tds[0].text.strip())
            else:
                print("duplicate")
This code works in that it loops through a-z and then 0-9 and populates a CSV file while ignoring duplicates; however, I'm hitting a logical error where I only scrape ~600 results even though there are over 2000 entries.
I believe this may be due to a limit on the number of items returned per AJAX search request, so not all entries are discovered. Is there a different approach to fetch all the results?
Thanks
import pandas as pd
import string

items = string.digits + string.ascii_lowercase

def main(url):
    data = []
    for item in items:
        print(f"{item}")
        df = pd.read_html(url.format(item))[0]
        df = df[["Name", "Codename", "Socket", "Released"]]
        data.append(df)
    data = pd.concat(data)
    data.drop_duplicates(subset='Name', keep="first", inplace=True)
    data.to_csv("data.csv", index=False)

main("https://www.techpowerup.com/cpu-specs/?ajaxsrch={}")
The total output is 596 rows after removing duplicates by the Name column.
The easiest way to get the table data is with pandas: read it into a DataFrame and then export it to CSV.
Code:
import string
import pandas as pd

base = 'https://www.techpowerup.com/cpu-specs/?ajaxsrch='
letters = list(string.ascii_lowercase)
letters.extend(range(0, 10))
df = pd.DataFrame()
for i in letters:
    URL = base + str(i)
    df1 = pd.read_html(URL)[0]
    df = pd.concat([df, df1], ignore_index=True)  # DataFrame.append() was removed in pandas 2.0
print(df[['Name', 'Codename', 'Socket', 'Released']])  # This will give you 1739 records

# If you want to delete duplicates, use this
df.drop_duplicates(subset='Name', keep='first', inplace=True)
print(df[['Name', 'Codename', 'Socket', 'Released']])  # This will give you 595 records

# Export to a CSV file
df[['Name', 'Codename', 'Socket', 'Released']].to_csv("cpu_csv.csv", index=False)

Unable to write data into Excel file (multiple tabs) using Python

I am not very familiar with writing data to Excel with Python and need some help writing my output into a single .xlsx (Excel) file with multiple tabs.
My code is given here:
import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter

def write_to_file(file, mode, data, newline=None, with_tab=None):
    with open(file, mode, encoding='utf-8') as l:
        if with_tab == True:
            data = '\t'.join(data)
        if newline == True:
            data = data + '\n'
        l.write(data)

link = ["http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber="+str(var)+"&nextList=Next%C2%A0%3E&selectedPeriods=" for var in range(17500)]  # builds one URL per value of var

start = 1
end = 20
for pagenum, links in enumerate(link[start:end]):
    print(links)
    r = requests.get(links)
    time.sleep(random.randint(2, 5))
    soup = BeautifulSoup(r.content, "lxml")
    # Table 2
    for items in soup.find(id="tblAccountContactInfo").find_all("tr")[:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        write_to_file('Table3.tsv', 'a', dataset, with_tab=True, newline=True)
    write_to_file('Table3.tsv', 'a', links)
    # Table 3
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr"):
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        write_to_file('Table3.tsv', 'a', dataset, with_tab=True, newline=True)
    write_to_file('Table3.tsv', 'a', links)
    #workbook = xlsxwriter.Workbook('Table3.xlsx')
    #worksheet = workbook.add_worksheet("Table 3")
    #worksheet.write(dataset)
    #workbook.close()
I need the output in an .xlsx Excel sheet with multiple tabs, like a Table 1 tab and a Table 2 tab; currently I am fetching the data in .tsv format. I tried xlsxwriter but was unable to get results, so I commented those lines out. Please help.
You need to first create two worksheets, and keep track of the current row to be used for each worksheet. An append_row() function can then add one row of data to the required sheet.
import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter

def append_row(ws, row):
    for col, value in enumerate(row):
        ws.write_string(ws.cur_row, col, value)
    ws.cur_row += 1

workbook = xlsxwriter.Workbook('output.xlsx')
ws_2 = workbook.add_worksheet("Table 2")
ws_3 = workbook.add_worksheet("Table 3")
# Keep track of the next row to use in each worksheet
ws_2.cur_row = 0
ws_3.cur_row = 0

start = 1
end = 3
link = "http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber={}&nextList=Next%C2%A0%3E&selectedPeriods="

for page_number in range(start, end):
    print("Page {}".format(page_number))
    url = link.format(page_number)
    r = requests.get(url)
    time.sleep(random.randint(2, 5))
    soup = BeautifulSoup(r.content, "lxml")
    # Table 2
    for items in soup.find(id="tblAccountContactInfo").find_all("tr")[:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        append_row(ws_2, [url] + dataset)
    # Table 3
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr"):
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        append_row(ws_3, [url] + dataset)

workbook.close()
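If you later collect the rows into lists instead of writing them immediately, pandas can produce the same multi-tab workbook. This is only a sketch under the assumption that table2_rows and table3_rows are lists of the [url] + dataset rows built above:

import pandas as pd

# Sketch: table2_rows and table3_rows are assumed lists of row lists.
with pd.ExcelWriter('output.xlsx', engine='xlsxwriter') as writer:
    pd.DataFrame(table2_rows).to_excel(writer, sheet_name='Table 2', index=False, header=False)
    pd.DataFrame(table3_rows).to_excel(writer, sheet_name='Table 3', index=False, header=False)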

Nested while loop for API json collection

I'm requesting 590 pages from the Meetup API. I've iterated with a while loop to build the page URLs. Now that I have the pages, I need to request them and format the responses correctly in Python so I can place them into a pandas DataFrame.
This is how it looks when you do it for one URL:
url = ('https://api.meetup.com/2/groups?offset=1&format=json&category_id=34&photo-host=public&page=100&radius=200.0&fields=&order=id&desc=false&sig_id=243750775&sig=768bcf78d9c73937fcf2f5d41fe6070424f8d0e3')
r = requests.get(url).json()
data = pd.io.json.json_normalize(r['results'])
But because I have so many pages I want to do this automatically and iterate through them all.
That's how nested while loops came to mind and this is what I tried:
urls = 0
offset = 0
url = 'https://api.meetup.com/2/groups?offset=%d&format=json&category_id=34&photo-host=public&page=100&radius=200.0&fields=&order=id&desc=false&sig_id=243750775&sig=768bcf78d9c73937fcf2f5d41fe6070424f8d0e3'
r = requests.get(urls%d = 'https://api.meetup.com/2/groups?offset=%d&format=json&category_id=34&photo-host=public&page=100&radius=200.0&fields=&order=id&desc=false&sig_id=243750775&sig=768bcf78d9c73937fcf2f5d41fe6070424f8d0e3').json()
while urlx < 591:
    new_url = r % urls % offset
    print(new_url)
    offset += 1
However, it isn't working and I'm receiving many errors including this one:
SyntaxError: keyword can't be an expression
Not sure what you're trying to do, and the code has lots of issues.
But if you just want to loop through 0 to 591 and fetch URLs, then here's the code:
import requests
import pandas as pd

dfs = []
base_url = 'https://api.meetup.com/2/groups?offset=%d&format=json&category_id=34&photo-host=public&page=100&radius=200.0&fields=&order=id&desc=false&sig_id=243750775&sig=768bcf78d9c73937fcf2f5d41fe6070424f8d0e3'
for i in range(0, 592):
    url = base_url % i
    r = requests.get(url).json()
    print("Fetching URL: %s\n" % url)
    # do something with r here
    # here I'll append it to a list of dfs
    dfs.append(pd.json_normalize(r['results']))  # pd.io.json.json_normalize is deprecated in newer pandas
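To finish the export the question is after, a short sketch that combines the collected frames and writes them to CSV (the output file name is just an example):

# Combine every page's results into one DataFrame and export it.
all_groups = pd.concat(dfs, ignore_index=True)
all_groups.to_csv("meetup_groups.csv", index=False, encoding="utf-8")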
