I am currently web scraping with BeautifulSoup, fetching the data as XML and writing it out as shown in the code below. I am just wondering what I could do to make it (near) real time, since the website updates every 5 minutes.
import csv
import requests
from bs4 import BeautifulSoup
url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
res = requests.get(url)
soup = BeautifulSoup(res.content,"xml")
data = []
for item in soup.select("carpark"):
    ditem = {}
    ditem['Name'] = item.get("name")
    ditem['Spaces'] = item.get("spaces")
    data.append(ditem)

with open("xmldocs.csv","w",newline="") as f:
    writer = csv.DictWriter(f,["Name","Spaces"])
    writer.writeheader()
    for info in data:
        writer.writerow(info)
You can wrap everything in a while loop and add a sleep of 5 minutes at the end of each iteration.
Using your example this would be:
import csv
import requests
from bs4 import BeautifulSoup
import time
while True:
    url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
    res = requests.get(url)
    soup = BeautifulSoup(res.content,"xml")

    data = []
    for item in soup.select("carpark"):
        ditem = {}
        ditem['Name'] = item.get("name")
        ditem['Spaces'] = item.get("spaces")
        data.append(ditem)

    with open("xmldocs.csv","w",newline="") as f:
        writer = csv.DictWriter(f,["Name","Spaces"])
        writer.writeheader()
        for info in data:
            writer.writerow(info)

    time.sleep(5 * 60)
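Note the code above overwrites xmldocs.csv on every pass. If you would rather keep a history of each poll, one option (a sketch, not part of the original answer) is to open the file in append mode and stamp each row with the fetch time:

import csv
import time
import requests
from bs4 import BeautifulSoup

url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'

while True:
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "xml")
    fetched_at = time.strftime("%Y-%m-%d %H:%M:%S")

    # append each snapshot instead of overwriting the file
    with open("xmldocs.csv", "a", newline="") as f:
        writer = csv.DictWriter(f, ["Timestamp", "Name", "Spaces"])
        if f.tell() == 0:  # only write the header for a new/empty file
            writer.writeheader()
        for item in soup.select("carpark"):
            writer.writerow({
                "Timestamp": fetched_at,
                "Name": item.get("name"),
                "Spaces": item.get("spaces"),
            })

    time.sleep(5 * 60)  # wait five minutes before polling again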
I am currently working on a project that requires me to extract data from hundreds of pages. However, I notice that the whole extraction takes too long, since the scraper has to process around 800+ pages. I have read about multiprocessing, which I believe can speed things up, but I don't really know how to integrate it into my current code.
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
import time
final_data = []
for i in range(1,8271,10):
    url = (f'https://www.fca.org.uk/news/search-results?np_category=warnings&start={i}')
    req = requests.get(url)
    start = time.process_time()
    page_html = req.content
    page_soup = soup(page_html, "lxml")
    data = page_soup.find_all('li', class_='search-item')
    print(f'Processing {url}')
    for x in data:
        list = {}
        list['name'] = x.find('a','search-item__clickthrough').text.strip()
        try:
            list['published_date'] = x.find('span','meta-item published-date').text
        except:
            list['published_date'] = 'None'
        list['modified_date'] = x.find('span','meta-item modified-date').text
        final_data.append(list)
df = pd.DataFrame(final_data)
TodaysDate = time.strftime("%Y%m%d")
csvfilename = TodaysDate + "_FCA Macro.csv"
df.to_csv(csvfilename, encoding="utf-8-sig")
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
import time
import threading
final_data = []
def scrape(i):
    url = (f'https://www.fca.org.uk/news/search-results?np_category=warnings&start={i}')
    req = requests.get(url)
    start = time.process_time()
    page_html = req.content
    page_soup = soup(page_html, "lxml")
    data = page_soup.find_all('li', class_='search-item')
    print(f'Processing {url}')
    for x in data:
        list = {}
        list['name'] = x.find('a','search-item__clickthrough').text.strip()
        try:
            list['published_date'] = x.find('span','meta-item published-date').text
        except:
            list['published_date'] = 'None'
        list['modified_date'] = x.find('span','meta-item modified-date').text
        final_data.append(list)

threads = []
for i in range(1,8271,10):
    t = threading.Thread(target=scrape, args=(i,))
    t.start()
    threads.append(t)

# wait for every thread to finish before building the DataFrame,
# otherwise final_data may still be (partly) empty
for t in threads:
    t.join()

df = pd.DataFrame(final_data)
TodaysDate = time.strftime("%Y%m%d")
csvfilename = TodaysDate + "_FCA Macro.csv"
df.to_csv(csvfilename, encoding="utf-8-sig")
Just use a for loop to create new threads, passing in i for each thread created; .start() is needed to actually start a thread, and joining the threads at the end makes sure the DataFrame is only built once they have all finished. See https://realpython.com/intro-to-python-threading/ for more documentation.
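Starting 800+ threads at once can overwhelm both your machine and the site. If you want to cap the number of simultaneous requests, the standard library's concurrent.futures gives you a thread pool; here is a minimal sketch reusing the same scrape function defined above (max_workers=20 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

# run at most 20 requests at a time; the with-block waits for all of them
with ThreadPoolExecutor(max_workers=20) as executor:
    executor.map(scrape, range(1, 8271, 10))

df = pd.DataFrame(final_data)
df.to_csv(time.strftime("%Y%m%d") + "_FCA Macro.csv", encoding="utf-8-sig")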
I am new to Python. Does anyone know, in {sum(int(td.text) for td in soup.select('td:last-child')[1:])}, what the use of [1:] is, or of [0] or [1]? I have seen it in many scraping examples with for-in loops. Also, while practising I built the code below and I am not able to scrape all the data into the CSV file. Thanks in advance, and sorry for asking two questions at once.
import requests
from bs4 import BeautifulSoup
import csv
url= "https://iplt20.com/stats/2020/most-runs"
r= requests.get (url)
soup= BeautifulSoup (r.content, 'html5lib')
lst= []
table=soup.find ('div', attrs = {'class':'js-table'})
#for row in table.findAll ('div', attrs= {'class':'top-players__player-name'}):
#    score = {}
#    score['Player'] = row.a.text.strip()
#    lst.append(score)

for row in table.findAll(class_='top-players__m top-players__padded '):
    score = {}
    score['Matches'] = int(row.td.text)
    lst.append(score)

filename = 'iplStat.csv'
with open(filename, 'w', newline='') as f:
    w = csv.DictWriter(f, ['Player', 'Matches'])
    w.writeheader()
    for score in lst:
        w.writerow(score)

print(lst)
All of this is not even needed. Just use pandas:
import requests
import pandas as pd
url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get (url)
df = pd.read_html(r.content)[0]
df.to_csv("iplStats.csv", index = False)
I basically need the text preceding the word edge_followed_by.
I am using this code:
from bs4 import BeautifulSoup
from csv import writer
import requests
res = requests.get('https://www.instagram.com/dualipa/?hl=en')
res_text = res.text
soup = BeautifulSoup(res_text,'lxml')
j = soup.select("script")
k = j
k = str(k)
print ('k')
You can use re/json module to parse the data. For example:
import re
import json
import requests
url = 'https://www.instagram.com/dualipa/?hl=en'
html_data = requests.get(url).text
data = json.loads(re.search(r'window\._sharedData = ({.*?});', html_data).group(1))
# uncomment this to print all data:
# print(json.dumps(data, indent=4))
print(data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_followed_by']['count'])
Prints:
51310036
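Keep in mind that re.search returns None when the pattern is absent (for instance if Instagram serves a login wall instead of the profile page), and calling .group(1) on None raises an AttributeError. A guarded variant of the same lookup:

import re
import json
import requests

html_data = requests.get('https://www.instagram.com/dualipa/?hl=en').text
match = re.search(r'window\._sharedData = ({.*?});', html_data)
if match:
    data = json.loads(match.group(1))
    user = data['entry_data']['ProfilePage'][0]['graphql']['user']
    print(user['edge_followed_by']['count'])
else:
    # no embedded JSON found: login wall, rate limiting, or a layout change
    print('window._sharedData not found in the response')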
Hi, I am creating a small scraper that crawls through TripAdvisor restaurant pages and writes their emails to a CSV file.
Strangely enough, my code gets stuck at around 300 URLs from the list I parsed and stops working. When I run it from the CLI again, it just terminates after writing the header. Below is the code for reference. Apologies in advance if I couldn't give this in better context.
# -*- coding: utf-8 -*-
import pandas as pd
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from fake_useragent import UserAgent
import requests
import time
import re
import logging
#request domain
domain = 'https://www.tripadvisor.com.my'
ua = UserAgent(verify_ssl=False)
header = {'User-Agent':str(ua.chrome)}
#extracting data from excel file that were parsed
df = pd.read_csv('./data/url_parser.csv')
#calculating the length of the total restaurants that were parsed.
total_restaurants = len(df)
debug = False
if debug:
    limit = 100
else:
    limit = None

#writing data into excel file
with open('./data/content_parser.csv', 'a') as csvfile:
    fieldnames = [
        'restaurant_id',
        'email'
    ]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    #for each url in the parsed data, scrape the url website
    for index, u in enumerate(df['url'][:limit]):
        restaurant_id = df['restaurant_id'][index]
        print('process = {}/{}'.format(index+1, total_restaurants))
        r = requests.get(u, headers=header)
        soup = BeautifulSoup(r.text, 'html.parser')

        # block = the block that we wanted to get the website url from
        block = soup.find('div', {'class':"restaurants-detail-overview-cards-LocationOverviewCard__detailLink--iyzJI restaurants-detail-overview-cards-LocationOverviewCard__contactItem--1flT6"})
        email = soup.select('a[href^=mailto]')
        for i in email:
            href = i['href']
            try:
                str1, str2 = href.split(':')
            except ValueError:
                break
            #appended email that removes all the html code.
            email.append(str2)

        #pasting the data scraped and putting into use.
        writer.writerow(
            {
                'restaurant_id': restaurant_id,
                'email': str2
            }
        )
        time.sleep(15)
I'm web scraping with Beautiful Soup and I am getting an error on line 13: for row in table.findAll('tr').
The error comes up in cmd. I hope someone can help.
import csv
import requests
from bs4 import BeautifulSoup
url='http://www.dublincity.ie/dublintraffic/carparks.htm'
response = requests.get(url)
html= response.content
soup=BeautifulSoup(html)
table=soup.find('tbody', attrs={'id' :'itemsBody'})
list_of_rows=[]
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll('td'):
        text = cell.text.replace(' ','')
        list_of_cells.append(text)
    list_of_cells.append(list_of_cells)

outfile = open("./carpark.csv", "wb")
writer = csv.writer(outfile)
writer.writerows(["location","spaces"])
writer.writerows(list_of_rows)
If you wanna stick to BeautifulSoup then you can fetch and write the content using its xml parser along with csv.DictWriter(). Check out the implementation:
import csv
import requests
from bs4 import BeautifulSoup
url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
res = requests.get(url)
soup = BeautifulSoup(res.content,"xml")
data = []
for item in soup.select("carpark"):
    ditem = {}
    ditem['Name'] = item.get("name")
    ditem['Spaces'] = item.get("spaces")
    data.append(ditem)

with open("xmldocs.csv","w",newline="") as f:
    writer = csv.DictWriter(f,["Name","Spaces"])
    writer.writeheader()
    for info in data:
        writer.writerow(info)
You could retrieve the data as an XML document and then parse it. This is just an example of part of the process, which you can tailor.
import requests
from xml.etree import ElementTree
import pandas as pd
url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
xml_data = requests.get(url).content
tree = ElementTree.fromstring(xml_data)
parking = []
for child in tree:
    for nextChild in child:
        parking.append([child.tag, nextChild.attrib['name'], nextChild.attrib['spaces']])
df = pd.DataFrame(parking)
print(df)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8',index = False )
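If you want headers in the CSV as well, pandas accepts column names when building the frame. The names below are only my guess at what the three values represent (the area tag, the car park name, and the free spaces):

df = pd.DataFrame(parking, columns=['Area', 'Name', 'Spaces'])
df.to_csv('Data.csv', sep=',', encoding='utf-8', index=False)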