Python web text list to dataframe - python

I have several values scraped from a website, and I want to convert them to a DataFrame like:
dates        titles  links
2021-05-13   AAA     https
I use:
import pandas as pd
import requests
from bs4 import BeautifulSoup

html_link = 'https://www.ksei.co.id/publications/new-securities-registration?setLocale=en-US'
html = requests.get(html_link).text
soup = BeautifulSoup(html, 'html.parser')

titles = []
links = []
dates = []
for ultag in soup.find_all('ul', {'class': 'list-nostyle'}):
    for litag in ultag.find_all('li'):
        for dates in litag.find_all('small', {'class': 'muted'}):
            title = litag.find('h2', {'class': 'h4 no-margin'})
            link = litag.find('a', href=True)
            titles.append(title.text)
            links.append(f"https://www.pds.com.ph{link['href']}")
            dates.append(dates.text)
#print(titles,links,dates)
dataframe = pd.DataFrame(zip(titles, links, dates), columns=['Titles', 'Links', 'Dates'])
print(dataframe)
But it only returns the first two rows, and I have no idea why. I'm confused by the zip function, and I always get stuck converting lists to a DataFrame...
Thanks for the help in advance!

You're making this more complicated than it needs to be. And by that I mean those nested for loops.
Just grab all the "boxes" and scoop out all the parts you need. Finally, dump the list of lists to a DataFrame and you're all done!
Here's how:
import pandas as pd
import requests
from bs4 import BeautifulSoup

html_link = 'https://www.ksei.co.id/publications/new-securities-registration?setLocale=en-US'
boxes = BeautifulSoup(requests.get(html_link).text, 'html.parser').select(".box--medium")

data = []
for box in boxes:
    title = box.find("h2").getText()
    date = box.find("b").getText().replace(", ", " ")
    name = box.find("p").getText()
    link = f'https://www.pds.com.ph{box.find("a")["href"]}'
    data.append([title, date, name, link])

df = pd.DataFrame(data, columns=['Titles', 'Dates', 'Names', 'Links'])
print(df.head())
df.to_csv("your_data.csv", index=False)
Output:
               Titles  ...                                              Links
0  KSEI-3512/DIR/0521  ...  https://www.pds.com.ph/Announcement/Files/1271...
1  KSEI-3482/DIR/0521  ...  https://www.pds.com.ph/Announcement/Files/1270...
2  KSEI-7362/JKU/0521  ...  https://www.pds.com.ph/Announcement/Files/1270...
3  KSEI-3440/DIR/0521  ...  https://www.pds.com.ph/Announcement/Files/1269...
4  KSEI-3394/DIR/0521  ...  https://www.pds.com.ph/Announcement/Files/1268...

[5 rows x 4 columns]
and a .csv file with the same data is saved alongside.
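As an aside, .select(".box--medium") matches by CSS selector; if you prefer the find_all style, an equivalent call on a parsed soup object would be (assuming box--medium is simply a class on each listing container):

soup = BeautifulSoup(requests.get(html_link).text, 'html.parser')
# matches any tag carrying the box--medium class, same result as soup.select(".box--medium")
boxes = soup.find_all(class_="box--medium")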

The problem is that you have assigned the same variable name (dates) to two different things: there is a list named dates, and the innermost for loop also binds a variable named dates. So the zip function ends up taking the dates variable from the for loop instead of the list.
import pandas as pd
import requests
from bs4 import BeautifulSoup

html_link = 'https://www.ksei.co.id/publications/new-securities-registration?setLocale=en-US'
html = requests.get(html_link).text
soup = BeautifulSoup(html, 'html.parser')

titles = []
links = []
dates_1 = []
for ultag in soup.find_all('ul', {'class': 'list-nostyle'}):
    for litag in ultag.find_all('li'):
        for dates in litag.find_all('small', {'class': 'muted'}):
            title = litag.find('h2', {'class': 'h4 no-margin'})
            link = litag.find('a', href=True)
            titles.append(title.text)
            links.append(f"https://www.pds.com.ph{link['href']}")
            print(dates.text)
            print(dates)
            dates_1.append(dates.text)

dataframe = pd.DataFrame(zip(titles, links, dates_1), columns=['Titles', 'Links', 'Dates'])
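To see the shadowing in isolation, here is a minimal example (made up for illustration):

dates = []                 # the list you meant to fill
for dates in ['a', 'b']:   # rebinds the name 'dates' on every iteration
    pass
print(dates)               # prints 'b' -- the list is gone, only the last loop value remains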

The dates variable is reused, and zip stops at the shortest iterable. Renaming the loop variable and using itertools.zip_longest makes the mismatch visible:
import itertools

import pandas as pd
import requests
from bs4 import BeautifulSoup

html_link = 'https://www.ksei.co.id/publications/new-securities-registration?setLocale=en-US'
html = requests.get(html_link).text
soup = BeautifulSoup(html, 'html.parser')

titles = []
links = []
dates = []
for ultag in soup.find_all('ul', {'class': 'list-nostyle'}):
    for litag in ultag.find_all('li'):
        for dat in litag.find_all('small', {'class': 'muted'}):
            title = litag.find('h2', {'class': 'h4 no-margin'})
            link = litag.find('a', href=True)
            titles.append(title.text)
            links.append(f"https://www.pds.com.ph{link['href']}")
            dates.append(dat.text)
#print(titles,links,dates)
dataframe = pd.DataFrame(itertools.zip_longest(titles, links, dates), columns=['Titles', 'Links', 'Dates'])
print(dataframe)
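The difference between the two is easy to demonstrate with toy lists:

import itertools

a = [1, 2, 3]
b = ['x']                                  # deliberately shorter
print(list(zip(a, b)))                     # [(1, 'x')] -- zip truncates to the shortest input
print(list(itertools.zip_longest(a, b)))   # [(1, 'x'), (2, None), (3, None)] -- pads with None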

Related

Check how many links I have in each page, then place that count in a dataframe column

I'm doing this project to scrape how many links a series of webpages have.
My idea is to add the count of links for each page to a column of a Pandas dataframe, so I end up with something like this:
   title  count links
0  page1            2
1  page2            3
2  page3            0
I wrote this code:
import requests
import pandas as pd
from bs4 import BeautifulSoup

links_bs4 = ['page1', 'page2']

article_title = []
links = []

for item in links_bs4:
    page = requests.get(item)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find('title')
    article_title.append(title.string)
    body_text = soup.find('div', class_='article-body')
    for link in body_text.find_all('a'):
        links.append((link.get('href')))
    count_of_links = len(links)

s1 = pd.Series(article_title, name='title')
s2 = pd.Series(count_of_links, name='count links')
df = pd.concat([s1, s2], axis=1)
It partly works. The line count_of_links = len(links) generates a count of all links across all pages combined.
I want the count for each page, not the total as is happening now. How can I do this? My for loop is counting the whole list. Should I create a new list for each URL I scrape, or use something else in Python?
I'm clearly missing some part of the logic.
You can treat count_of_links the same way as article_title. Below is based on your code, but with my changes.
links_bs4 = ['page1', 'page2']

article_title = []
count_of_links = []  # <------ added
links = []

for item in links_bs4:
    page = requests.get(item)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find('title')
    article_title.append(title.string)
    body_text = soup.find('div', class_='article-body')
    count = 0  # <------- added
    for link in body_text.find_all('a'):
        links.append((link.get('href')))
        # count_of_links = len(links)  # <------- commented out
        count += 1  # <------- added
    count_of_links.append(count)  # <------- added

s1 = pd.Series(article_title, name='title')
s2 = pd.Series(count_of_links, name='count links')
df = pd.concat([s1, s2], axis=1)
Or you can code it this way; then you won't need to create one variable per new column, you only need to expand the dictionary.
links_bs4 = ['page1', 'page2']

data = []
links = []

for item in links_bs4:
    page = requests.get(item)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find('title')
    body_text = soup.find('div', class_='article-body')
    link_temp = [link.get('href') for link in body_text.find_all('a')]
    data.append({'title': title.string, 'count links': len(link_temp)})
    links.extend(link_temp)

df = pd.DataFrame(data)
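The nice property of the dict-per-row approach is that pd.DataFrame grows a column for every key, so adding a column later is just adding a key. A toy example with made-up rows:

import pandas as pd

# stand-ins for the dicts collected per scraped page
data = [
    {'title': 'page1', 'count links': 2, 'links': ['/a', '/b']},
    {'title': 'page2', 'count links': 3, 'links': ['/c', '/d', '/e']},
]
df = pd.DataFrame(data)   # each dict key becomes a column
print(df)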

How to list out all the h2, h3, and p tags then create a dataframe to store them

I was given a website to scrape all of the key items from.
But the output I got covers only one item using BeautifulSoup4, so I wonder if I need to use something like soup.find_all to extract all the key items into a list from the website.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

url = 'https://realpython.github.io/fake-jobs/'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
column = soup.find(class_ = re.compile('columns is-multiline'))
print(column.prettify())

position = column.h2.text
company = column.h3.text
city_state = column.find_all('p')[-2].text
print(position, company, city_state)
Thank you.
Try this:
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

url = 'https://realpython.github.io/fake-jobs/'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

positions = [pos.text for pos in soup.find_all('h2')]
companies = [com.text for com in soup.find_all('h3')]

city_state0 = []
city_state1 = []
for p in soup.find_all('p', {'class' : 'location'}):
    city_state0.append(p.text.split(',')[0].strip())
    city_state1.append(p.text.split(',')[1].strip())

df = pd.DataFrame({
    'city_state1': city_state0,
    'city_state2': city_state1,
    'companies' : companies,
    'positions' : positions
})
print(df)
You need to use find_all to get all the elements like so. find only gets the first element.
titles = soup.find_all('h2', class_='title is-5')
companies = soup.find_all('h3', class_='subtitle is-6 company')
locations = soup.find_all('p', class_='location')

# loop over locations and extract the city and state
for location in locations:
    city = location.text.split(', ')[0]
    state = location.text.split(', ')[1]
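Continuing from the snippet above, the three lists can then be combined into a DataFrame; a sketch (the column names are my own choice):

import pandas as pd

rows = []
for title, company, location in zip(titles, companies, locations):
    # assumes each location is "City, ST" with exactly one comma
    city, state = [part.strip() for part in location.text.split(',')]
    rows.append([title.text.strip(), company.text.strip(), city, state])

df = pd.DataFrame(rows, columns=['position', 'company', 'city', 'state'])
print(df)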

Python issue for crawling multiple page title

I am a marketer and want to conduct some basic market research using Python.
I wrote a simple script to crawl titles across multiple pages, but I cannot get the title text into a list and then into Excel format. What should I do in this case?
I tried to create a list and use the extend() method to put the looped titles on the list, but it did not work:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def content_get(url):
    count = 0
    while count < 4:  # this case was to crawl titles of 4 pages
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        titles = soup.find(id="main-container").find_all("div", class_="r-ent")
        for title in titles:
            print([title.find('div', class_='title').text])
        nextpageurl = soup.find("a", string="‹ 上頁")["href"]
        url = "https://www.ptt.cc" + nextpageurl
        count += 1

firstpage = "https://www.ptt.cc/bbs/movie/index9002.html"
content_get(firstpage)
You need to add the titles to a list outside of the while loop:
def content_get(url):
    count = 0
    titles = []
    while count < 4:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        title_page = [title.text.replace('\n', '') for title in soup.find_all('div', {'class': 'title'})]
        titles.extend(title_page)
        nextpageurl = soup.find("a", string="‹ 上頁")["href"]
        url = "https://www.ptt.cc" + nextpageurl
        count += 1
    return titles
If you don't want the list comprehension that builds title_page, it can be replaced with a traditional for loop:
title_page = []
titles = soup.find_all('div', {'class': 'title'})
for title in titles:
    title_page.append(title.text.replace('\n', ''))
For the excel file:
def to_excel(text):
    df = pd.DataFrame(text, columns=['Title'])
    return df.to_excel('output.xlsx')
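Putting the two together, a usage sketch (output.xlsx as named in the function above; note that writing .xlsx also requires an engine such as openpyxl to be installed):

firstpage = "https://www.ptt.cc/bbs/movie/index9002.html"
titles = content_get(firstpage)   # crawl 4 pages and collect the titles
to_excel(titles)                  # write them to output.xlsx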

Troubles appending list to a DataFrame

I'm having trouble dealing with multiple tags/attributes in one loop and appending them to the DataFrame. More specifically, it concerns the Place loop:
for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
    place = car_item.find('h3', {'class':'heading'}).text.strip()
    places.append(place)
Appending it to the DataFrame yields only 1 result out of the expected 30.
Thank you in advance.
import requests
import bs4
import pandas as pd

frames = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        sub_soup_txt = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = sub_soup_txt.find('div', {'id': 'car-attributes'})
        soup2 = sub_soup_txt.find('div', {'id': 'vip-seller'})
        tmp = []
        places = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            place = car_item.find('h3', {'class':'heading'}).text.strip()
            places.append(place)
        frames.append(pd.DataFrame(tmp).set_index(0))

df_final = pd.concat((tmp_df for tmp_df in frames), axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final['Places'] = pd.Series(places)
df_final.to_csv('auto_database.csv')
As you are adding places to the final df, this line (currently sitting inside both loops, for pagenumber in ... and for car in ...):
    places = []
should move all the way up, out of the main for loop, next to frames:
frames = []
places = []
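The effect is easy to reproduce with a toy loop:

# re-initialising inside the loop keeps only the last iteration's data
places = []
for page in ['p1', 'p2', 'p3']:
    places = []            # wipes everything collected so far
    places.append(page)
print(places)              # ['p3']

# initialising once, before the loop, accumulates across iterations
places = []
for page in ['p1', 'p2', 'p3']:
    places.append(page)
print(places)              # ['p1', 'p2', 'p3']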

requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied

I am working on a web scraping project and have run into the following error.
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Below is my code. I retrieve all of the links from the HTML table and they print out as expected. But when I try to loop through them (links) with requests.get I get the error above.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
    links = (ref['href'])
    print (links)

for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = []
    # Find all the divs we need in one go.
    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        # find all the enclosing a tags.
        anchors = div.find_all('a')
        for anchor in anchors:
            # Now we have groups of 3 list items (li) tags
            lis = anchor.find_all('li')
            # we clean up the text from the group of 3 li tags and add them as a list to our table list.
            table.append([unicodedata.normalize("NFKD",lis[0].text).strip(), lis[1].text, lis[2].text.strip()])

# We have all the data so we add it to a DataFrame.
headers = ['Number', 'Tenant', 'Square Footage']
df = DataFrame(table, columns=headers)
print (df)
Your mistake is the second for loop in this code:
for ref in table.find_all('a', href=True):
    links = (ref['href'])
    print (links)

for link in links:
ref['href'] gives you a single url, but you use it as a list in the next for loop.
So you effectively have
for link in ref['href']:
and it gives you the first char from the url http://properties.kimcore... which is h
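Iterating over a string yields its characters one at a time, which is exactly what happens here:

url = "http://properties.kimcorealty.com/"
for link in url:
    print(link)   # 'h' on the first iteration, then 't', 't', 'p', ...
    break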
Full working code:
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
    link = ref['href']
    print(link)
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = []
    # Find all the divs we need in one go.
    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        # find all the enclosing a tags.
        anchors = div.find_all('a')
        for anchor in anchors:
            # Now we have groups of 3 list items (li) tags
            lis = anchor.find_all('li')
            # we clean up the text from the group of 3 li tags and add them as a list to our table list.
            table.append([unicodedata.normalize("NFKD",lis[0].text).strip(), lis[1].text, lis[2].text.strip()])

# We have all the data so we add it to a DataFrame.
headers = ['Number', 'Tenant', 'Square Footage']
df = DataFrame(table, columns=headers)
print (df)
BTW: if you add a comma, as in (ref['href'], ), then you get a tuple, and the second for loop works correctly.
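The trailing comma is what makes the tuple:

url = "http://example.com"
print(type((url)))    # <class 'str'>   -- parentheses alone change nothing
print(type((url,)))   # <class 'tuple'> -- the comma makes a one-element tuple
for link in (url,):
    print(link)       # prints the full url, not single characters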
EDIT: this version creates the list table_data at the start, adds all the data into it, and converts it into a DataFrame at the end.
But now I see it reads the same page a few times - because in every row the same url appears in every column. You would have to get the url from only one column.
EDIT: now it doesn't read the same url many times.
EDIT: now it gets the text and href from the first link in each row and appends them to every element added to the list.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')

table_data = []

# all rows in table except first ([1:]) - headers
rows = soup.select('table tr')[1:]
for row in rows:
    # link in first column (td[0])
    #link = row.select('td')[0].find('a')
    link = row.find('a')

    link_href = link['href']
    link_text = link.text

    print('text:', link_text)
    print('href:', link_href)

    page = requests.get(link_href)
    soup = BeautifulSoup(page.content, 'html.parser')

    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        anchors = div.find_all('a')
        for anchor in anchors:
            lis = anchor.find_all('li')
            item1 = unicodedata.normalize("NFKD", lis[0].text).strip()
            item2 = lis[1].text
            item3 = lis[2].text.strip()
            table_data.append([item1, item2, item3, link_text, link_href])

print('table_data size:', len(table_data))

headers = ['Number', 'Tenant', 'Square Footage', 'Link Text', 'Link Href']
df = DataFrame(table_data, columns=headers)
print(df)
