Save web scraped results to txt files by name - python

I scraped a list of professor contact information from a school website, and now I want to save each person individually by name, so that each professor's txt file contains their email, telephone and office.
Currently my code is
from bs4 import BeautifulSoup as bs
import requests
url = 'https://www.cb.cityu.edu.hk/is/people/academic.html'
webpage = requests.get(url)
page = bs(webpage.content, 'html.parser')
#define list
name_list = []
phone_list = []
email_list = []
result = page.find_all('div', attrs = {'class': 'staff-details'})
for person in result:
    print(person.text)

You can use a loop to fetch the data and simultaneously save the data in a text file.
from bs4 import BeautifulSoup as bs
import requests
url = 'https://www.cb.cityu.edu.hk/is/people/academic.html'
webpage = requests.get(url)
page = bs(webpage.content, 'html.parser')
prof_list = page.select(".staff-details")
for i in prof_list:
    name = i.select_one('.name > a').text
    email = i.select_one('.list-info div.value:nth-child(2) > a').text
    phone = i.select_one('.list-info div.value:nth-child(4)').text
    office = i.select_one('.list-info div.value:nth-child(6)').text
    with open(name + '.txt', 'w+') as file:
        file.write("Email:\n")
        file.write(email)
        file.write('\nPhone:\n')
        file.write(phone)
        file.write("\nOffice\n")
        file.write(office)

Open a file with the name you want using a context manager in w+ mode.
Here is a sample for you, to be used inside your for loop:
with open(file_name_goes_here, "w+") as f:
    f.write(content_goes_here_as_string)
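One caveat worth adding (my own note, not part of the original answers): names scraped from a page can contain characters that are invalid or awkward in file names, such as slashes or trailing whitespace, so it can be safer to sanitize the name before opening the file. A minimal sketch, assuming the name variable from the loop above:
import re

def safe_filename(name):
    # Replace anything that is not a letter, digit, underscore, space, dot or hyphen with "_".
    return re.sub(r'[^\w .-]', '_', name).strip()

# inside the for loop:
# with open(safe_filename(name) + '.txt', 'w+', encoding='utf-8') as file:
#     ...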

Related

BeautifulSoup 4 HTML Web Scraping - Find Mailto Links and Export to Spreadsheet

I am trying to scrape all email addresses from this index page - http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL
I modified a Python script to define the URL string, parse the content with BS4 and save each unique address to an xls file:
import requests
from bs4 import BeautifulSoup
import xlwt
wb = xlwt.Workbook()
ws = wb.add_sheet('Emails')
ws.write(0,0,'Emails')
emailList= []
r=0
#add url of the page you want to scrape to urlString
urlString='http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL'
#function that extracts all emails from a page you provided and stores them in a list
def emailExtractor(urlString):
    getH = requests.get(urlString)
    h = getH.content
    soup = BeautifulSoup(h, 'html.parser')
    mailtos = soup.select('a[href^=mailto]')
    for i in mailtos:
        href = i['href']
        try:
            str1, str2 = href.split(':')
        except ValueError:
            break
        emailList.append(str2)

emailExtractor(urlString)

# adding scraped emails to an excel sheet
for email in emailList:
    r = r + 1
    ws.write(r, 0, email)
wb.save('emails.xls')
The xls file exports as expected, but with no email values. If anyone can explain why or how to simplify this solution it would be greatly appreciated!
Because the emails are protected. I am adding only the email scraping part and not the Excel part, since you don't have issues with that. Credit for converting the protected email to text goes to https://stackoverflow.com/a/36913154/7518304
import requests
from bs4 import BeautifulSoup

emailList = []
r = 0

# add url of the page you want to scrape to urlString
urlString = 'http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL'

def decodeEmail(e):  # https://stackoverflow.com/a/36913154/7518304
    de = ""
    k = int(e[:2], 16)
    for i in range(2, len(e)-1, 2):
        de += chr(int(e[i:i+2], 16) ^ k)
    return de

# function that extracts all emails from a page you provided and stores them in a list
def emailExtractor(urlString):
    getH = requests.get(urlString)
    h = getH.content
    soup = BeautifulSoup(h, 'html.parser')
    mailtos = soup.select('a[href]')
    for i in mailtos:
        href = i['href']
        if "email-protect" in href:
            emailList.append(decodeEmail(href.split("#")[1]))

emailExtractor(urlString)
emailList
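To see what the decoder does, here is a small self-contained check with a synthetic Cloudflare-style string (the hex value below is made up for illustration, not taken from the page): the first two hex characters are the XOR key, and every following byte is XORed with that key to recover a character.
def decodeEmail(e):
    de = ""
    k = int(e[:2], 16)  # first byte is the XOR key
    for i in range(2, len(e) - 1, 2):
        de += chr(int(e[i:i+2], 16) ^ k)
    return de

# "a@b.c" encoded with key 0x42 (synthetic example)
print(decodeEmail("422302206c21"))  # prints: a@b.c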
You can use pandas for this. Here is the full code:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
urlString = 'http://www.uschess.org/assets/msa_joomla/AffiliateSearch/clubresultsnew.php?st=AL'
# function that extracts all emails from a page you provided and stores them in a list
def emailExtractor(urlString):
    emailList = []
    getH = requests.get(urlString)
    h = getH.content
    soup = BeautifulSoup(h, 'html.parser')
    mailtos = soup.find_all('a')
    href_lst = []
    for i in mailtos:
        href_lst.append(i['href'])
    for href in href_lst:
        if ':' in href:
            emailList.append(href)
    print(emailList)
    s = pd.Series(emailList)
    s = s.rename('Emails')
    s.to_excel('D:\\Emails.xls', index=False)

emailExtractor(urlString)
Output:
['http://msa.uschess.org/AffDtlMain.php?T6006791', 'https://alabamachess.org', 'http://msa.uschess.org/AffDtlMain.php?A6029262', 'http://www.caesarchess.com/', 'http://msa.uschess.org/AffDtlMain.php?A6045660', 'http://msa.uschess.org/AffDtlMain.php?H6046485', 'http://msa.uschess.org/AffDtlMain.php?A6040580']
If you want the links to be output to the excel sheet as hyperlinks (so you are redirected to the website once you click the link), then change emailList.append(href) to emailList.append('=HYPERLINK("'+href+'")').
At the same time, you should also change the file extension to .xlsx; only then will you get the links as hyperlinks.
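A minimal sketch of that change, assuming openpyxl is installed for the .xlsx output (whether the formula shows up as a clickable link can also depend on the Excel version):
# replace the plain append inside emailExtractor with a HYPERLINK formula
for href in href_lst:
    if ':' in href:
        emailList.append('=HYPERLINK("' + href + '")')

s = pd.Series(emailList, name='Emails')
s.to_excel('Emails.xlsx', index=False)  # .xlsx instead of .xls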
Hope this helps!

Why is my web scraping code not extracting data like it should?

I am trying to get data from an online shopping website. My code runs without any error but the data is not getting extracted to the csv file like it should. Where am I going wrong with the code?
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
driver = webdriver.Chrome("/usr/bin/chromedriver")
products=[] #List to store name of the product
prices=[] #List to store price of the product
ratings=[] #List to store rating of the product
driver.get("https://www.flipkart.com/lenovo-core-i3-6th-gen-4-gb-1-tb-hdd-windows-10-home-ip-320e-laptop/p/itmf3s32ghxrkrhf?pid=COMEWM7FTAQ9EHRF&srno=b_1_2&otracker=browse&lid=LSTCOMEWM7FTAQ9EHRFBL70ZV&fm=organic&iid=90098c10-e53b-49dc-9359-ff04338c0c4e.COMEWM7FTAQ9EHRF.SEARCH&ssid=2d6xzladk00000001572540087124")
content = driver.page_source
soup = BeautifulSoup(content)
for a in soup.findAll('a', href=True, attrs={'class': '_29OxBi'}):
    name = a.find('div', attrs={'class': '_35KyD6'})
    price = a.find('div', attrs={'class': '_1vC4OE _3qQ9m1'})
    rating = a.find('div', attrs={'class': 'hGSR34'})
    products.append(name.text)
    prices.append(price.text)
    ratings.append(rating.text)
df = pd.DataFrame({'Product Name':products,'Price':prices,'Rating':ratings})
df.to_csv('products.csv', index=False, encoding='utf-8')
I expect the code to return data such as name, price and rating of the products available on the website.
flipkart: the data is loaded dynamically from a script tag when the browser executes the javascript in the webpage. You can regex out this info and parse it with a json parser to retrieve the required info using just requests, without the overhead of selenium.
import requests, re, json
p = re.compile(r'window\.__INITIAL_STATE__ = (.*);')
r = requests.get('https://www.flipkart.com/lenovo-core-i3-6th-gen-4-gb-1-tb-hdd-windows-10-home-ip-320e-laptop/p/itmf3s32ghxrkrhf?pid=COMEWM7FTAQ9EHRF&srno=b_1_2&otracker=browse&lid=LSTCOMEWM7FTAQ9EHRFBL70ZV&fm=organic&iid=90098c10-e53b-49dc-9359-ff04338c0c4e.COMEWM7FTAQ9EHRF.SEARCH&ssid=2d6xzladk00000001572540087124')
data = json.loads(p.findall(r.text)[0])['pageDataV4']['page']['data']['10002'][1]['widget']['data']
##data sections:
# data.keys()
##pricing info:
# data['pricing']['value'].keys()
# data['pricing']['value']['mrp'].keys()
##rating info:
# data['ratingsAndReviews']['value']['rating']
price = data['pricing']['value']['mrp']['currency'] + str(data['pricing']['value']['mrp']['value'])
title = ' '.join(reversed([v for k,v in data['titleComponent']['value'].items() if k in ['title', 'subtitle']]))
average_rating = data['ratingsAndReviews']['value']['rating']['average']
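One hedged addition of my own: the window.__INITIAL_STATE__ blob can be absent if Flipkart changes its page layout or serves a bot-check page, so it is safer to guard the regex match before indexing into it. A minimal sketch using the same pattern and response objects as above:
m = p.search(r.text)
if m is None:
    raise RuntimeError("window.__INITIAL_STATE__ not found - the page layout may have changed")
data = json.loads(m.group(1))['pageDataV4']['page']['data']['10002'][1]['widget']['data']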

How to write a csv and insert scraped data

I am designing a scraping project for my research, but I am stuck on writing the scraped data to a csv. Please help me with that.
I have successfully scraped the data, but I want to store it in a csv; below is my code.
I need to write code to pull all of the html from a website and then save it to a csv file.
I believe I somehow need to turn the links into a list and then write the list, but I'm unsure how to do that.
This is what I have so far:
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
print("Wait Scraper is working on ")
time.sleep(10)
if page.status_code != 200:
    print("Error in Scraping check the url")
else:
    print("Successfully scrape the data")
    time.sleep(10)
    print("Loading data in csv")
    file = csv.writer(open('dataminer.csv', 'w'))
    file.writerow(['ProfileName', 'CompanyName', 'Salary', 'Job', 'Location'])
    for pname in soup.find_all(class_="profile-name"):
        # print(pname.text)
        profname = pname.text
        file.writerow([profname, ])
    for cname in soup.find_all(class_="company_name"):
        print(cname.text)
    for salary in soup.find_all(class_="salary"):
        print(salary.text)
    for lpa in soup.find_all(class_="jobText"):
        print(lpa.text)
    for loc in soup.find_all(class_="location"):
        print(loc.text)
Make a dict, save the data into it, and then save it to csv; check the code below!
import requests
import time
from bs4 import BeautifulSoup
import csv

# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
data = []
print("Wait, scraper is working on it")

if page.status_code != 200:
    print("Error in scraping, check the url")
else:
    print("Successfully scraped the data")
    for x in soup.find_all('div', attrs={'class': 'job-page'}):
        data.append({
            'pname': x.find(class_="profile-name").text,
            'cname': x.find(class_="company_name").text,
            'salary': x.find(class_="salary").text,
            'lpa': x.find(class_="jobText").text,
            'loc': x.find(class_="location").text})

print("Loading data in csv")
with open('dataminer.csv', 'w', newline='', encoding='utf-8') as f:
    fields = ['salary', 'loc', 'cname', 'pname', 'lpa']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)
Apart from what you have got in the other answer, you can also scrape and write the content at the same time. I used .select() instead of .find_all() to achieve the same thing.
import csv
import requests
from bs4 import BeautifulSoup
URL = "https://www.myamcat.com/jobs"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'lxml')
with open('myamcat_doc.csv', 'w', newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(['pname', 'cname', 'salary', 'loc'])
    for item in soup.select(".job-listing .content"):
        pname = item.select_one(".profile-name h3").get_text(strip=True)
        cname = item.select_one(".company_name").get_text(strip=True)
        salary = item.select_one(".salary .jobText").get_text(strip=True)
        loc = item.select_one(".location .jobText").get_text(strip=True)
        writer.writerow([pname, cname, salary, loc])

python crawling with beautifulsoup - how to crawl several pages?

Please help.
I want to get all the company names on each page, and there are 12 pages.
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/2
-- this website only changes the number.
So here is my code so far.
Can I get just the title (company name) of all 12 pages?
Thank you in advance.
from bs4 import BeautifulSoup
import requests
maximum = 0
page = 1
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1'
response = requests.get(URL)
source = response.text
soup = BeautifulSoup(source, 'html.parser')
whole_source = ""
for page_number in range(1, maximum+1):
    URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/' + str(page_number)
    response = requests.get(URL)
    whole_source = whole_source + response.text

soup = BeautifulSoup(whole_source, 'html.parser')
find_company = soup.select("#content > div.wrap_analysis_data > div.public_con_box.public_list_wrap > ul > li:nth-child(13) > div > strong")

for company in find_company:
    print(company.text)
So, you want to remove all the headers and get only the string of the company name?
Basically, you can use soup.findAll to find the list of companies in a format like this:
<strong class="company"><span>중소기업진흥공단</span></strong>
Then you use the .find function to extract information from the <span> tag:
<span>중소기업진흥공단</span>
After that, you use the .contents function to get the string from the <span> tag:
'중소기업진흥공단'
So you write a loop to do the same for each page, make a list called company_list to store the results from each page, and append them together.
Here's the code:
from bs4 import BeautifulSoup
import requests
maximum = 12
company_list = [] # List for result storing
for page_number in range(1, maximum+1):
    URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(page_number)
    response = requests.get(URL)
    print(page_number)
    whole_source = response.text
    soup = BeautifulSoup(whole_source, 'html.parser')
    for entry in soup.findAll('strong', attrs={'class': 'company'}):  # Finding all company names in the page
        company_list.append(entry.find('span').contents[0])  # Extracting name from the result
The company_list will give you all the company names you want
I figured it out eventually. Thank you for your answer though!
Here is my final code.
from urllib.request import urlopen
from bs4 import BeautifulSoup
company_list=[]
for n in range(12):
    url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(n+1)
    webpage = urlopen(url)
    source = BeautifulSoup(webpage, 'html.parser', from_encoding='utf-8')
    companys = source.findAll('strong', {'class': 'company'})
    for company in companys:
        company_list.append(company.get_text().strip().replace('\n', '').replace('\t', '').replace('\r', ''))

file = open('company_name1.txt', 'w', encoding='utf-8')
for company in company_list:
    file.write(company + '\n')
file.close()

How can I loop scraping data for multiple pages in a website using python and beautifulsoup4

I am trying to scrape data from the PGA.com website to get a table of all of the golf courses in the United States. In my CSV table I want to include the name of the golf course, address, ownership, website and phone number. With this data I would like to geocode it, place it into a map and have a local copy on my computer.
I utilized Python and Beautiful Soup 4 to extract my data. I have gotten as far as extracting the data and importing it into a CSV, but I am now having a problem scraping data from multiple pages on the PGA website. I want to extract ALL THE GOLF COURSES, but my script is limited to one page; I want to loop it so that it captures all data for golf courses from all pages found on the PGA site. There are about 18000 golf courses and 900 pages of data to capture.
Attached below is my script. I need help creating code that will capture ALL the data from the PGA website, not just one page but multiple. In this manner it will provide me with all the data on golf courses in the United States.
Here is my script below:
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
r = requests.get(url)
soup = BeautifulSoup(r.content)
g_data1=soup.find_all("div",{"class":"views-field-nothing-1"})
g_data2=soup.find_all("div",{"class":"views-field-nothing"})
courses_list=[]
for item in g_data2:
    try:
        name = item.contents[1].find_all("div", {"class": "views-field-title"})[0].text
    except:
        name = ''
    try:
        address1 = item.contents[1].find_all("div", {"class": "views-field-address"})[0].text
    except:
        address1 = ''
    try:
        address2 = item.contents[1].find_all("div", {"class": "views-field-city-state-zip"})[0].text
    except:
        address2 = ''
    try:
        website = item.contents[1].find_all("div", {"class": "views-field-website"})[0].text
    except:
        website = ''
    try:
        Phonenumber = item.contents[1].find_all("div", {"class": "views-field-work-phone"})[0].text
    except:
        Phonenumber = ''

    course = [name, address1, address2, website, Phonenumber]
    courses_list.append(course)

with open('filename5.csv', 'wb') as file:
    writer = csv.writer(file)
    for row in courses_list:
        writer.writerow(row)

#for item in g_data1:
#    try:
#        print item.contents[1].find_all("div",{"class":"views-field-counter"})[0].text
#    except:
#        pass
#    try:
#        print item.contents[1].find_all("div",{"class":"views-field-course-type"})[0].text
#    except:
#        pass

#for item in g_data2:
#    try:
#        print item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
#    except:
#        pass
#    try:
#        print item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
#    except:
#        pass
#    try:
#        print item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
#    except:
#        pass
This script only captures 20 at a time, and I want to capture everything in one script, which accounts for 18000 golf courses and 900 pages to scrape from.
The PGA website's search has multiple pages; the url follows the pattern:
http://www.pga.com/golf-courses/search?page=1 # Additional info after the page parameter here
This means you can read the content of one page, then change the value of page by 1 and read the next page, and so on.
import csv
import requests
from bs4 import BeautifulSoup
for i in range(907):  # Number of pages plus one
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    # Your code for each individual page here
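A minimal sketch of how the parsing from the question could be plugged into that loop, collecting every page into one list and writing the CSV once at the end (the CSS classes come from the question; the helper name and output file name are my own illustration, and this has not been run against the live site):
import csv
import requests
from bs4 import BeautifulSoup

def text_or_blank(item, cls):
    # Return the text of the first matching div, or '' if it is missing.
    div = item.find("div", {"class": cls})
    return div.get_text(strip=True) if div is not None else ''

courses_list = []
for i in range(907):  # pages 0..906
    url = ("http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name"
           "&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0").format(i)
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for item in soup.find_all("div", {"class": "views-field-nothing"}):
        courses_list.append([
            text_or_blank(item, "views-field-title"),
            text_or_blank(item, "views-field-address"),
            text_or_blank(item, "views-field-city-state-zip"),
            text_or_blank(item, "views-field-website"),
            text_or_blank(item, "views-field-work-phone"),
        ])

with open("courses.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows(courses_list)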
If you are still reading this post, you can try this code too....
from urllib.request import urlopen
from bs4 import BeautifulSoup
file = "Details.csv"
f = open(file, "w")
Headers = "Name,Address,City,Phone,Website\n"
f.write(Headers)
for page in range(1, 5):
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(page)
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    Title = soup.find_all("div", {"class": "views-field-nothing"})
    for i in Title:
        try:
            name = i.find("div", {"class": "views-field-title"}).get_text()
            address = i.find("div", {"class": "views-field-address"}).get_text()
            city = i.find("div", {"class": "views-field-city-state-zip"}).get_text()
            phone = i.find("div", {"class": "views-field-work-phone"}).get_text()
            website = i.find("div", {"class": "views-field-website"}).get_text()
            print(name, address, city, phone, website)
            f.write("{}".format(name).replace(",", "|") + ",{}".format(address) + ",{}".format(city).replace(",", " ") + ",{}".format(phone) + ",{}".format(website) + "\n")
        except AttributeError:
            pass
f.close()
Where it is written range(1,5), just change that to 0 and the last page number, and you will get all the details in the CSV. I tried very hard to get your data in the proper format, but it's hard :).
You're putting a link to a single page, it's not going to iterate through each one on its own.
Page 1:
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
Page 2:
http://www.pga.com/golf-courses/search?page=1&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Page 907:
http://www.pga.com/golf-courses/search?page=906&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Since you're running for page 1 you'll only get 20. You'll need to create a loop that'll run through each page.
You can start off by creating a function that does one page then iterate that function.
Right after the search? in the url, a page parameter appears starting from page 2 (page=1) and keeps increasing until page 907, where it's page=906.
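A minimal sketch of that "one function per page, then iterate" structure (the function and variable names are illustrative, not from the original code; the parsing of each block would stay exactly as in the question):
import requests
from bs4 import BeautifulSoup

def scrape_page(page_number):
    # Fetch one results page and return the course blocks found on it.
    url = ("http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name"
           "&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0").format(page_number)
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    return soup.find_all("div", {"class": "views-field-nothing"})

all_items = []
for page_number in range(907):
    all_items.extend(scrape_page(page_number))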
I noticed that the first solution had a repetition of the first instance; that is because page 0 and page 1 are the same page. This is resolved by specifying the start page in the range function. Example below...
for i in range(1, 907):  # Number of pages plus one
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")  # Can use whichever parser you prefer
    # Your code for each individual page here
I had this same exact problem and the solutions above did not work. I solved mine by accounting for cookies. A requests session helps. Create a session and it will pull all the pages you need by sending the cookie along with every numbered-page request.
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
s = requests.Session()
r = s.get(url)
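A minimal sketch of how that session could then be reused for the numbered pages (the URL pattern comes from the other answers; the rest is an illustration, not a verified run against the live site):
base = ("http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name"
        "&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0")

for i in range(1, 907):
    r = s.get(base.format(i))  # the session re-sends the cookies set by the first request
    soup = BeautifulSoup(r.content, "html.parser")
    # parse the 20 results on this page exactly as in the question's loop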
The PGA website has changed since this question was asked.
It seems they organize all courses by: State > City > Course.
In light of this change and the popularity of this question, here's how I'd solve this problem today.
Step 1 - Import everything we'll need:
import time
import random
from gazpacho import Soup # https://github.com/maxhumber/gazpacho
from tqdm import tqdm # to keep track of progress
Step 2 - Scrape all the state URL endpoints:
URL = "https://www.pga.com"
def get_state_urls():
    soup = Soup.get(URL + "/play")
    a_tags = soup.find("ul", {"data-cy": "states"}, mode="first").find("a")
    state_urls = [URL + a.attrs['href'] for a in a_tags]
    return state_urls
state_urls = get_state_urls()
Step 3 - Write a function to scrape all the city links:
def get_state_cities(state_url):
    soup = Soup.get(state_url)
    a_tags = soup.find("ul", {"data-cy": "city-list"}).find("a")
    state_cities = [URL + a.attrs['href'] for a in a_tags]
    return state_cities
state_url = state_urls[0]
city_links = get_state_cities(state_url)
Step 4 - Write a function to scrape all of the courses:
def get_courses(city_link):
    soup = Soup.get(city_link)
    courses = soup.find("div", {"class": "MuiGrid-root MuiGrid-item MuiGrid-grid-xs-12 MuiGrid-grid-md-6"}, mode="all")
    return courses
city_link = city_links[0]
courses = get_courses(city_link)
Step 5 - Write a function to parse all the useful info about a course:
def parse_course(course):
    return {
        "name": course.find("h5", mode="first").text,
        "address": course.find("div", {'class': "jss332"}, mode="first").strip(),
        "url": course.find("a", mode="first").attrs["href"]
    }
course = courses[0]
parse_course(course)
Step 6 - Loop through everything and save:
all_courses = []
for state_url in tqdm(state_urls):
    city_links = get_state_cities(state_url)
    time.sleep(random.uniform(1, 10) / 10)
    for city_link in city_links:
        courses = get_courses(city_link)
        time.sleep(random.uniform(1, 10) / 10)
        for course in courses:
            info = parse_course(course)
            all_courses.append(info)
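The loop above collects everything into all_courses but stops short of writing it anywhere; a minimal sketch of the final save (csv.DictWriter is my choice here, not part of the original answer):
import csv

with open("pga_courses.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["name", "address", "url"])
    writer.writeheader()
    writer.writerows(all_courses)  # the list of dicts built by parse_course above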
