I am new to Python and attempting to use Python 3 to scrape Billboard chart data. I have looked at the Python billboard API (billboard.py), and I want to get the Hot 100 tracks in CSV format with Billboard Number, Artist Name, Song Title, Last Week Number, Peak Position and Weeks on Chart as headers. I have been looking at this for hours with no luck, so any help would be greatly appreciated!
import requests, csv
from bs4 import BeautifulSoup

url = 'http://www.billboard.com/charts/hot-100'

with open('Billboard_Hot100.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Billboard Number','Artist Name','Song Title','Last Week Number','peak_position','weeks_on_chart'])

    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")

    for container in soup.find_all("article", class_="chart-row"):
        billboard_number = container.find(class_="chart-row__current-week").text
        artist_name_a_tag = container.find(class_="chart-row__artist").text.strip()
        song_title = container.find(class_="chart-row__song").text
        last_week_number_tag = container.find(class_="chart-row__value")
        last_week_number = last_week_number_tag.text
        peak_position_tag = last_week_number_tag.find_parent().find_next_sibling().find(class_="chart-row__value")
        peak_position = peak_position_tag.text
        weeks_on_chart_tag = peak_position_tag.find_parent().find_next_sibling().find(class_="chart-row__value").text

        print(billboard_number, artist_name_a_tag, song_title, last_week_number, peak_position, weeks_on_chart_tag)
        writer.writerow([billboard_number, artist_name_a_tag, song_title, last_week_number, peak_position, weeks_on_chart_tag])
The csv file returned has the headers but the columns contain no information. Am I missing something?
Can you work with this?
from lxml import html
import requests
page = requests.get('https://www.billboard.com/charts/hot-100')
tree = html.fromstring(page.content)
line = tree.xpath('//span[@class="chart-list-item__title-text"]/text()')
line = [t.replace('\n', '') for t in line]  # xpath returns a list of text nodes, so clean each one
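To get that into a CSV, here is a minimal sketch that writes the titles out (it assumes the chart-list-item__title-text spans from the snippet above are still what the page exposes; the other columns you listed would need their own selectors):

from lxml import html
import requests, csv

page = requests.get('https://www.billboard.com/charts/hot-100')
tree = html.fromstring(page.content)
# one text node per chart entry
titles = [t.strip() for t in tree.xpath('//span[@class="chart-list-item__title-text"]/text()')]

with open('Billboard_Hot100.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Song Title'])
    for title in titles:
        writer.writerow([title])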
Here's a link that I bookmarked a long time ago. It has helped me a lot over the course of time.
https://docs.python-guide.org/scenarios/scrape/
I'm a beginner with Python and trying to learn with a BeautifulSoup web-scraping project.
I'm looking to scrape the record item title, item URL and purchase date from the URL below and export them to a CSV.
I've made good progress scraping the title and URL, but I just cannot figure out how to get the purchase date correctly in my for loop (the purchase_date variable below).
What's currently happening is that the purchase-date column in the CSV file (the p_date header) just shows blank cells with no text: no error message, just no data getting put into the CSV. Any guidance is much appreciated.
Thank you!!
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

headers = {"Accept-Language": "en-US, en;q=0.5"}
url = "https://www.popsike.com/php/quicksearch.php?searchtext=metal+-signed+-promo+-beatles+-zeppelin+-acetate+-test+-sinatra&sortord=aprice&pagenum=1&incldescr=1&sprice=100&eprice=&endfrom=2020&endthru=2020&bidsfrom=&bidsthru=&layout=&flabel=&fcatno="
results = requests.get(url, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")

title = []
date = []
URL = []

record_div = soup.find_all('div', class_='col-md-7 add-desc-box')
for container in record_div:
    description = container.a.text
    title.append(description)

    link = container.find('a')
    URL.append(link.get('href'))

    purchase_date = container.find('span', class_='info-row').text
    date.append(purchase_date)

test_data = pd.DataFrame({
    'record_description': title,
    'link': URL,
    'p_date': date
})

test_data['link'] = test_data['link'].str.replace('../', 'https://www.popsike.com/', 1)
print(test_data)
test_data.to_csv('popaaron.csv')
I suggest changing the parser type (this needs the html5lib package installed):
soup = BeautifulSoup(results.text, "html5")
And fixing the search expression for the purchase date:
purchase_date = container.select('span.date > b')[0].text.strip(' \t\n\r')
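For context, here is a minimal sketch of the question's loop with both changes applied (a sketch, not a verified fix: it assumes the span.date > b selector from above actually matches the listing markup, and I've written the parser as "html5lib", the documented name for the same html5lib-based parser):

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.popsike.com/php/quicksearch.php?searchtext=metal+-signed+-promo+-beatles+-zeppelin+-acetate+-test+-sinatra&sortord=aprice&pagenum=1&incldescr=1&sprice=100&eprice=&endfrom=2020&endthru=2020&bidsfrom=&bidsthru=&layout=&flabel=&fcatno="
results = requests.get(url, headers={"Accept-Language": "en-US, en;q=0.5"})
soup = BeautifulSoup(results.text, "html5lib")  # requires the html5lib package

title, date, URL = [], [], []
for container in soup.find_all('div', class_='col-md-7 add-desc-box'):
    title.append(container.a.text)
    URL.append(container.find('a').get('href'))
    # assumption from the answer above: the purchase date sits in span.date > b
    date.append(container.select('span.date > b')[0].text.strip(' \t\n\r'))

test_data = pd.DataFrame({'record_description': title, 'link': URL, 'p_date': date})
test_data.to_csv('popaaron.csv')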
Everything works as expected when I use a single URL for the URL variable to scrape, but I'm not getting any results when attempting to read the links from a CSV. Any help is appreciated.
Info about the CSV:
One column with a header called "Links"
300 rows of links with no spaces, commas, semicolons or other characters before/after the links
One link in each row
import requests  # required to make request
from bs4 import BeautifulSoup  # required to parse html
import pandas as pd
import csv

with open("urls.csv") as infile:
    reader = csv.DictReader(infile)
    for link in reader:
        res = requests.get(link['Links'])
        #print(res.url)

url = res
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

email_elm0 = soup.find_all(class_="app-support-list__item")[0].text.strip()
email_elm1 = soup.find_all(class_="app-support-list__item")[1].text.strip()
email_elm2 = soup.find_all(class_="app-support-list__item")[2].text.strip()
email_elm3 = soup.find_all(class_="app-support-list__item")[3].text.strip()

final_email_elm = (email_elm0, email_elm1, email_elm2, email_elm3)
print(final_email_elm)

df = pd.DataFrame(final_email_elm)
#getting an output in csv format for the dataframe we created
#df.to_csv('draft_part2_scrape.csv')
The problem lies in this part of the code:
with open("urls.csv") as infile:
reader = csv.DictReader(infile)
for link in reader:
res = requests.get(link['Links'])
...
After the loop is executed, res will have the last link. So, this program will only scrape the last link.
To solve this problem, store all the links in a list and iterate over that list to scrape each of the links. You can store each scraped result in a separate dataframe and concatenate them at the end to write a single file:
import requests  # required to make request
from bs4 import BeautifulSoup  # required to parse html
import pandas as pd
import csv

links = []
with open("urls.csv") as infile:
    reader = csv.DictReader(infile)
    for link in reader:
        links.append(link['Links'])

dfs = []
for url in links:
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    email_elm0 = soup.find_all(class_="app-support-list__item")[0].text.strip()
    email_elm1 = soup.find_all(class_="app-support-list__item")[1].text.strip()
    email_elm2 = soup.find_all(class_="app-support-list__item")[2].text.strip()
    email_elm3 = soup.find_all(class_="app-support-list__item")[3].text.strip()

    final_email_elm = (email_elm0, email_elm1, email_elm2, email_elm3)
    print(final_email_elm)

    dfs.append(pd.DataFrame(final_email_elm))

#getting an output in csv format for the dataframe we created
df = pd.concat(dfs)
df.to_csv('draft_part2_scrape.csv')
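One small design note: pd.concat keeps each per-link frame's 0..3 index, so the row index repeats in the output file. If a continuous index is preferred, it can be reset when concatenating, for example:

# renumber rows 0..N-1 instead of repeating 0..3 for every link
df = pd.concat(dfs, ignore_index=True)
df.to_csv('draft_part2_scrape.csv', index=False)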
I am new to web scraping and I am trying to scrape all the video links from each page of this specific site and write them to a CSV file. For starters I am trying to scrape the URLs from this site:
https://search.bilibili.com/all?keyword=%E3%82%A2%E3%83%8B%E3%82%B2%E3%83%A9%EF%BC%81%E3%83%87%E3%82%A3%E3%83%89%E3%82%A5%E3%83%BC%E3%83%BC%E3%83%B3
and go through all 19 pages. The problem I'm encountering is that the same 20 video links are being written 19 times (once for each of the 19 pages), instead of getting (around) 19 distinct sets of URLs.
import requests
from bs4 import BeautifulSoup
from csv import writer

def make_soup(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def scrape_url():
    for video in soup.find_all('a', class_='img-anchor'):
        link = video['href'].replace('//', '')
        csv_writer.writerow([link])

with open("videoLinks.csv", 'w') as csv_file:
    csv_writer = writer(csv_file)

    header = ['URLS']
    csv_writer.writerow(header)

    url = 'https://search.bilibili.com/all?keyword=%E3%82%A2%E3%83%8B%E3%82%B2%E3%83%A9%EF%BC%81%E3%83%87%E3%82%A3%E3%83%89%E3%82%A5%E3%83%BC%E3%83%BC%E3%83%B3'
    soup = make_soup(url)

    lastButton = soup.find_all(class_='page-item last')
    lastPage = lastButton[0].text
    lastPage = int(lastPage)
    #print(lastPage)

    page = 1
    pageExtension = ''
    scrape_url()

    while page < lastPage:
        page = page + 1
        if page == 1:
            pageExtension = ''
        else:
            pageExtension = '&page=' + str(page)
        #print(url+pageExtension)
        fullUrl = url + pageExtension
        make_soup(fullUrl)
        scrape_url()
Any help is much appreciated and I decided to code this specific way so that I can better generalize this throughout the BiliBili site.
A screenshot is linked below showing how the first link repeats a total of 19 times:
Try
soup = make_soup(fullUrl)
in the second-to-last line.
In the second-to-last line, you are not assigning the return value of make_soup. In your scrape_url function, you are using a variable called soup, but that only gets assigned once.
If you change that line to soup = make_soup(fullUrl), it should work.
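For reference, a minimal sketch of the corrected paging loop (this reuses the question's own variables and functions, so it is a fragment rather than a standalone script; only the assignment changes):

page = 1
pageExtension = ''
scrape_url()

while page < lastPage:
    page = page + 1
    pageExtension = '&page=' + str(page)
    fullUrl = url + pageExtension
    soup = make_soup(fullUrl)  # rebind soup so scrape_url() parses the new page
    scrape_url()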
I am new here and a newbie with Python, currently learning some basic stuff, mostly scraping, and I encountered a problem that I hope you can help me solve.
I'm trying to scrape a few details from a website and write them into a CSV file, but I'm only able to write the last results into my CSV; apparently my script just overwrites the data.
Also, if you find any mistakes in my code or any room for improvement (which I'm sure there is), I'd be glad if you would point them out as well.
Also, any recommendations for videos/tutorials that can help me improve my Python and scraping skills would be appreciated.
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

csv_file = open('contacts.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["department", "name", "position", "phone"])

for department in soup.find_all("div", class_="view-content"):
    department_name = department.h3
    print(department_name.text)

for contacts in soup.find_all("div", class_="col-md-7 col-xs-10"):
    contact_name = contacts.strong
    print(contact_name.text)

for position in soup.find_all("div", class_="field-content"):
    print(position.text)

for phone in soup.find_all("div", class_="modal-content"):
    first_phone = phone.h3
    first_phones = first_phone
    print(first_phones)

csv_writer.writerow([department_name, contact_name, position, first_phones])
csv_file.close()
Thanks Thomas,
Actually I tweaked my code a little bit by thinking about how I could make it simpler (four for loops are too much, no?), so with the following code I solved my problem (I dropped the 'department' and 'phone' columns because of some other issues):
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

f = open("contactslot.csv", "w+")
csv_writer = csv.writer(f)
csv_writer.writerow(["Name", "Position"])

information = soup.find_all("div", class_="well profile")
info = information[0]
for info in information:
    contact_name = info.find_all("div", class_="col-md-7 col-xs-10")
    names = contact_name[0].strong
    name = names.text
    print(name)

    position_name = info.find_all("div", class_="field-content")
    position = position_name[0].text
    print(position)
    print("")

    csv_writer.writerow([name, position])

f.close()
Hi Babr, welcome to Python. Your answer is good, and here is one more little thing you could do better:
Use find instead of find_all if you just want a single element.
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

f = open("/Users/mingjunliu/Downloads/contacts.csv", "w+")
csv_writer = csv.writer(f)
csv_writer.writerow(["Name", "Position"])

for info in soup.find_all("div", class_="well profile"):
    contact_name = info.find("div", class_="col-md-7 col-xs-10")
    names = contact_name.strong
    name = names.text
    print(name)

    position_name = info.find("div", class_="field-content")
    position = position_name.text
    print(position)
    print("")

    csv_writer.writerow([name, position])

f.close()
And the reason you need to drop phone and department is because of the bad website structure. It's not your fault.
Two issues.
Goal: ETL a three-column CSV with column headings of date, time, and tweet.
My attempts at extracting the span text (the time) out of each li result in duplicating the span info in both the time and tweet columns.
It's my first week working with Python; I've tried to replace() the 'time' text in the tweet column with "", but I end up removing both instances of 'time'.
The other issue is combining the columns in order, or correctly matching the data columns as they appear: the code I write results in either 30,000 or 1,000 lines, while the correct CSV file should be around 520 lines.
import bs4 as bs
import requests, urllib.request, csv
from urllib.request import urlopen

sauce = urllib.request.urlopen('https://www.washingtonpost.com/graphics/politics/100-days-of-trump-tweets/?utm_term=.0c2052f6d858').read()
soup = bs.BeautifulSoup(sauce, 'html.parser')

lists = soup.find_all('li', class_='visible')
dates = soup.find_all("li", attrs={"data-date": True})

tweet_data = ['date, time, tweets']

for li in dates[1:]:
    date = li['data-date']
    tweet_data.append([date])

for list in lists[1:]:
    time = list.find_all('span', {"class": "gray"})[0].text
    tweets = list.text
    tweet_data.append([time, tweets])

with open('tweets_attempt_8.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(tweet_data)
Here is code that produces the output you need.
I hope you are satisfied with this answer.
# Note: this answer uses Python 2 (urllib2)
import bs4 as bs
import urllib2, csv
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

url = 'https://www.washingtonpost.com/graphics/politics/100-days-of-trump-tweets/?utm_term=.0c2052f6d858'
sauce = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
con = urllib2.urlopen(sauce)
data = con.read()

soup = bs.BeautifulSoup(data, 'html.parser')

lists = soup.find_all('li', class_='visible')
dates = soup.find_all("li", attrs={"data-date": True})

tweet_data = ['date, time, tweets']

for li, list in zip(dates[1:], lists[1:]):
    date = li['data-date']
    time = list.find_all('span', {"class": "gray"})[0].text
    tweets = list.text
    tweet_data.append([date, time, tweets])

with open('/tmp/tweets_attempt_8.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(tweet_data)
This produces the output in the format you wanted.
Try this. There are 504 items on that page to parse, and you will get all of them as rows in the CSV output.
import csv
import requests
from bs4 import BeautifulSoup

with open('tweets_attempt_8.csv', 'w', newline='', encoding='utf8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['date', 'time', 'tweets'])

    sauce = requests.get('https://www.washingtonpost.com/graphics/politics/100-days-of-trump-tweets/?utm_term=.0c2052f6d858', headers={"User-Agent": "Existed"}).text
    soup = BeautifulSoup(sauce, "html.parser")

    for item in soup.select("li.pg-excerpt.visible"):
        date = item.get('data-date')
        time = item.select("span.gray")[0].text
        title = item.text.strip()
        print(date, time, title[10:])
        writer.writerow([date, time, title[10:]])
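As a quick sanity check on the row count, the number of matched items can be printed before writing (a tiny sketch, assuming the same soup and selector as above):

items = soup.select("li.pg-excerpt.visible")
print(len(items))  # should be roughly 504 if the page still carries the full set of tweets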