I'm trying to extract page and header data from a docx file. The file is several hundred pages, each with a table and a header. The header has pertinent information that needs to be paired with each table. I'm able to extract the header and table data, I just can't reliably pair them together.
Using win32com, this is what I've got so far:
# getting the table page number
app = Dispatch("Word.Application")
doc = app.Documents.Open(filename)
table_1_page = doc.Tables(1).Range.Information(3) # 3 == wdActiveEndPageNumber
The problem occurs because the header's TextFrames are duplicated across multiple pages, so when I call:
# getting the header page number
doc.Sections(1).Headers(1).Shapes(1).TextFrame.TextRange.Information(3)
I get one of the pages that the TextFrame occurs on. The page it chooses seems somewhat arbitrary; sometimes it's the first, other times it's the last, but it's not predictable.
I've spent a bit of time reading over the object model here. Ultimately it would be nice to capture all of the items displayed on each page without reinventing the wheel.
EDIT 10/25/16: per request, here is some minimum working code.
# filename docx_parser.py
import pythoncom
class OpenDoc(object):
def __init__(self, docx_path):
import win32com.client as win32
self.path = docx_path
self.word = win32.Dispatch("Word.Application")
self.word.Visible = 0
        self.word.Documents.Open(self.path)
self.doc = self.word.ActiveDocument
def get_table_count(self):
return self.doc.Tables.Count
def count_table_rows(self, table):
return table.Rows.Count
def count_table_columns(self, table):
return table.Columns.Count
def get_headers(self):
headers = self.doc.Sections(1).Headers(1)
shape_count = headers.Shapes.Count
for shape_num in range(1, shape_count + 1):
t_range = headers.Shapes(shape_num).TextFrame.TextRange
text = t_range.Text
page_num = t_range.Information(3) # 3 == wdActiveEndPageNumber
yield text, page_num
def get_table_text(self, table):
col_count = self.count_table_columns(table)
row_count = self.count_table_rows(table)
for row in range(1, row_count + 1):
row_data = []
for col in range(1, col_count + 1):
try:
row_data.append(table.Cell(Row=row, Column=col).Range.Text.strip(chr(7) + chr(13)))
except pythoncom.com_error as error:
row_data.append("")
yield row_data
def get_all_table_text(self):
for table in self.get_tables():
table_data = []
for row_data in self.get_table_text(table):
table_data.append(row_data)
yield table_data
def get_tables(self):
for table in self.doc.Tables:
yield table
def __del__(self):
self.word.Quit()
if __name__ == "__main__":
try:
path = r"sample.docx"
open_doc = OpenDoc(path)
for table_num, table_text in enumerate(open_doc.get_all_table_text()):
print("\n-------------- Table %s ----------------" % (table_num + 1))
for row_data in table_text:
print(", ".join(row_data))
for header_text, page_num in open_doc.get_headers():
print("header page number: %s, text: %s" % (page_num, header_text))
except Exception as error:
from traceback import format_exc
print(format_exc())
raw_input("")
import pandas as pd
import requests
import json
import datetime
import csv
def get_pushshift_data(after, before, sub):
url = 'https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) + '&before='+ str(before) + '&subreddit='+ str(sub) + '&sort=asc&sort_type=created_utc&size=400'
print(url)
r = requests.get(url).json()
# data = json.loads(r.text, strict=False)
return r['data']
def collect_subData(subm):
subData = list() #list to store data points
title = subm['title']
url = subm['url']
try:
flair = subm['link_flair_text']
except KeyError:
flair = "NaN"
try:
# returns the body of the posts
body = subm['selftext']
except KeyError:
body = ''
author = subm['author']
subId = subm['id']
score = subm['score']
created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
numComms = subm['num_comments']
permalink = subm['permalink']
subData.append((subId,title,body,url,author,score,created,numComms,permalink,flair))
subStats[subId] = subData
def update_subFile():
upload_count = 0
location = "subreddit_data_uncleaned/"
print("Input filename of submission file, please add .csv")
filename = input()
file = location + filename
with open(file, 'w', newline='', encoding='utf-8') as file:
a = csv.writer(file, delimiter=',')
headers = ["Post ID","Title","Body","Url","Author","Score","Publish Date","Total No. of Comments","Permalink","Flair"]
a.writerow(headers)
for sub in subStats:
a.writerow(subStats[sub][0])
upload_count+=1
print(str(upload_count) + " submissions have been uploaded into a csv file")
# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
#Subreddit to query
sub = 'politics'
# Unix timestamp of date to crawl from.
before = int(datetime.datetime(2021,5,17,0,0).timestamp())
after = int(datetime.datetime(2014,1,1,0,0).timestamp())
data = get_pushshift_data(after, before, sub)
while len(data) > 0:
for submission in data:
collect_subData(submission)
subCount+=1
# Calls getPushshiftData() with the created date of the last submission
print(len(data))
print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
after = data[-1]['created_utc']
data = get_pushshift_data(after, before, sub)
print(len(data))
update_subFile()
At line 1: I call the get_pushshift_data(after, before, sub) function to scrape the data and there is no error. But when I do the same thing again at line 11, with a different value for the after variable (type: int), the program raises the error JSONDecodeError: Expecting value: line 1 column 1 (char 0).
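For context, JSONDecodeError: Expecting value: line 1 column 1 (char 0) usually means the response body was empty or not JSON at all (for example a rate-limit or server error page), so r.json() has nothing to parse. A hedged sketch of a more defensive get_pushshift_data, with an arbitrary retry count and back-off:
import time
import requests

def get_pushshift_data(after, before, sub, retries=5):
    # Same query as above; retries and the exponential back-off are placeholders.
    url = ('https://api.pushshift.io/reddit/search/submission/'
           '?after={}&before={}&subreddit={}'
           '&sort=asc&sort_type=created_utc&size=400').format(after, before, sub)
    for attempt in range(retries):
        r = requests.get(url)
        if r.status_code == 200:
            try:
                return r.json()['data']
            except ValueError:  # empty or non-JSON body, try again
                pass
        time.sleep(2 ** attempt)  # back off before retrying
    raise RuntimeError("No valid JSON returned for " + url)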
Since I am going to create a number of dataframes that I know won't fit inside a single Google worksheet (because of the column limit), I want to split the data across multiple worksheets. I'm using set_with_dataframe() and defining which worksheet each dataframe gets imported to, so my first thought was to create and define several worksheets and then use the same method. The problem is that I don't know how to "split" the data when there are no more columns left in the first worksheet (and then the second, the third, and so on...).
I'm quite new at working with Python and I have been stuck with this for days so any kind of help would be appreciated.
My code looks like this:
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
from bs4 import BeautifulSoup
import pandas as pd
import requests
import traceback
import os
class DataScraper():
def __init__(self, sheets):
self.data_worksheet = sheets.data_worksheet
self.total_urls = 0
self.urls = self.getAllUrls(sheets.url_worksheet)
def getAllUrls(self, urlWorkSheet):
urls = urlWorkSheet.get_all_values()
finalUrls = []
for r in urls:
# Get all urls
modifiedUrls = [d for d in r[:14] if "https://" in d]
if len(modifiedUrls) != 0:
self.total_urls += len(modifiedUrls)
finalUrls.append(modifiedUrls)
return finalUrls
def StartScrape(self):
current_column_count = 1
last_data_frame_max_width = 0
current_element = 0
for urlRow in self.urls:
current_row_count = 1
for url in urlRow:
current_element += 1
error = False
try:
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
labels = []
results = []
tbl = soup.find('table')
for tr in tbl.findAll('tr'):
headers = [th.text.strip() for th in tr.findAll('th')]
data = [td.text.strip() for td in tr.findAll('td')]
labels.append(headers)
results.append(data)
final_results = []
for final_labels, final_data in zip(labels, results):
final_results.append({'Labels': final_labels, 'Data': final_data})
df = pd.DataFrame(final_results)
df['Labels'] = df['Labels'].str[0]
df['Data'] = df['Data'].str[0]
indexNames = df[df['Labels'] == 'Links'].index
df.drop(indexNames , inplace=True)
set_with_dataframe(self.data_worksheet, df, col=current_column_count, row=current_row_count, include_column_header=False)
current_row_count += df.shape[0]+2
if df.shape[1] > last_data_frame_max_width:
last_data_frame_max_width = df.shape[1]
except Exception:
error = True
finally:
print(f"Processed page {current_element}/{self.total_urls} with status: {'success' if not error else 'error'}")
current_column_count += last_data_frame_max_width+5
last_data_frame_max_width = 0
class Sheets():
def __init__(self, filename, key):
self.filename = filename
self.key = key
self.data_worksheet = None
self.url_worksheet = None
self.getSheets(self.getCredentials())
def getCredentials(self):
        # sep = separator
_ = os.path.normpath(__file__).split(os.sep)
_.insert(1, "/")
credentials = service_account.Credentials.from_service_account_file(os.path.join(os.path.join(*_[0:-1]), self.filename))
return credentials.with_scopes( ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
def getSheets(self, scoped_credentials):
gc = gspread.Client(auth=scoped_credentials)
gc.session = AuthorizedSession(scoped_credentials)
spreadsheet_key = gc.open_by_key(self.key)
# Get sheet with data import
self.data_worksheet = spreadsheet_key.worksheet("Data")
# Get list with url's
self.url_worksheet = url_worksheet = spreadsheet_key.worksheet("Felix Copy")
# Get sheets
sheets = Sheets("credentials.json", "key_id")
# Start scraping
scraper = DataScraper(sheets)
scraper.StartScrape()
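A minimal sketch of the splitting idea (MAX_COLS and the "Data {n}" worksheet titles are placeholder assumptions, and spreadsheet is the gspread Spreadsheet object opened via gc.open_by_key in getSheets above): slice the wide dataframe into column chunks and write each chunk to its own worksheet with set_with_dataframe, creating a worksheet whenever one doesn't exist yet.
import gspread
from gspread_dataframe import set_with_dataframe

MAX_COLS = 50  # placeholder chunk width; pick whatever fits your column limit

def write_in_column_chunks(spreadsheet, df, max_cols=MAX_COLS):
    # Walk the dataframe in blocks of max_cols columns
    for i, start in enumerate(range(0, df.shape[1], max_cols)):
        chunk = df.iloc[:, start:start + max_cols]
        title = "Data {}".format(i + 1)  # placeholder naming scheme
        try:
            ws = spreadsheet.worksheet(title)
        except gspread.exceptions.WorksheetNotFound:
            ws = spreadsheet.add_worksheet(title=title,
                                           rows=chunk.shape[0] + 1,
                                           cols=chunk.shape[1])
        set_with_dataframe(ws, chunk)
The same pattern would apply row-wise if the row limit, rather than the column limit, turns out to be the constraint.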
I am using Python, Selenium, openpyxl in order to fill a form online.
To fill the form I am taking values from specific cells on excel (.xlsx).
(To test the code you can just create an Excel file with 2 columns, with some names under column A and some ages under column B.)
From the cell A2, I take the NAME of the person and insert it into the online form
From the cell B2, I take the LASTNAME of the person and insert it into the online form
Then I click 'Reset' (This is an example but in the real code I will click save as a draft).
I would like to create a loop in which the code will start again from driver.get("https://www.roboform.com/filling-test-all-fields") to go again to the page where I need to fill out the form, but this time I would like to take:
From the cell A3, the NAME of the person and insert it into the online form
From the cell B3, the LASTNAME of the person and insert it into the online form
And click 'Send as a draft' again
Then again, another loop to insert the data from row 4, so I would like to program to read again my code from driver.get("https://www.roboform.com/filling-test-all-fields") but this time take values from A4 and B4, and so on, until the row on excel is empty.
With the following code I can insert the data to the online form:
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.common.exceptions import NoSuchElementException
import openpyxl
driver: WebDriver = webdriver.Chrome("/Users/HHHHH/PycharmProjects/excel/driver/chromedriver")
driver.maximize_window()
excel_document = openpyxl.load_workbook(r"/Users/XPATH OF THE EXCEL FILE YOU CREATE TO TEST THIS CODE",
data_only=True)
sheet = excel_document["Sheet1"]
driver.get("https://www.roboform.com/filling-test-all-fields")
#Insert in the form the Name of the person
prevsymbol = sheet["A2"].value
if prevsymbol == None:
pass
else:
try:
driver.find_element_by_name("02frstname").send_keys(sheet["A2"].value)
except NoSuchElementException:
print("A2:(name) Not Found")
#Insert in the form the Last Name of the person
prevsymbol = sheet["B2"].value
if prevsymbol == None:
pass
else:
try:
driver.find_element_by_name("04lastname").send_keys(sheet["B2"].value)
except NoSuchElementException:
print("B2:(Lastname) Not Found")
#click Save as a draft
driver.find_element_by_xpath("//*[@value='Reset']").click()
I have created a helper class; please see whether it fulfils your purpose. This code was written against an old version of openpyxl, so please update it if needed.
import re
import xlrd
from openpyxl import load_workbook
from openpyxl import Workbook as openpyxlWorkbook

class OpenpyxlImport(object):
def __init__(self, file):
self.file = file
if self.file.name.endswith('.xls'):
self.wb = self.xls_to_xlsx(self.file)
else:
self.wb = load_workbook(self.file)
self.sheets = self.wb.worksheets
def to_camelcase(self, string):
text = re.sub(r'(?!^)_([a-zA-Z])', lambda m: ' ' + m.group(1).upper(), str(string))
return text.upper()
def to_snake_case(self, string):
text = re.sub(r'\s', '_', str(string))
return text.lower()
def xls_to_xlsx(self, content):
xls_book = xlrd.open_workbook(file_contents=content.read())
workbook = openpyxlWorkbook()
for i in range(0, xls_book.nsheets):
xls_sheet = xls_book.sheet_by_index(i)
sheet = workbook.active if i == 0 else workbook.create_sheet()
sheet.title = xls_sheet.name
for row in range(0, xls_sheet.nrows):
for col in range(0, xls_sheet.ncols):
sheet.cell(row=row + 1, column=col + 1).value = xls_sheet.cell_value(row, col)
return workbook
def tally_header(self, row, fields):
# Strip whitespace in cell value
for cell in row:
cell.value = cell.value.rstrip()
return [cell.value for cell in row] == fields
def row_to_dict(self, row):
dct = {}
for cell in row:
dct[self.to_snake_case(self.get_first_sheet()[cell.column + '1'].value)] = cell.value
return dct
def get_sheets(self):
return self.sheets
def get_first_sheet(self):
return self.sheets[0]
def get_sheet_rows(self):
return tuple(self.get_first_sheet().iter_rows())
# Usage
excel = OpenpyxlImport(file)
rows = excel.get_sheet_rows()
if excel.tally_header(rows[0], self.fields):
for row in rows[1:]:
params = excel.row_to_dict(row)
You can get the number of rows in the sheet using the max_row property. So, the code becomes:
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.common.exceptions import NoSuchElementException
import openpyxl
driver: WebDriver = webdriver.Chrome("/Users/HHHHH/PycharmProjects/excel/driver/chromedriver")
driver.maximize_window()
excel_document = openpyxl.load_workbook(r"/Users/HHHHH/Desktop/testtesttest1.xlsx",
data_only=True)
sheet = excel_document["Sheet1"]
for i in range(1, sheet.max_row+1):
driver.get("https://XXXXXXXXXX")
# Insert in the form the Name of the person
cell = "A" + str(i)
prevsymbol = sheet[cell].value
# Note that instead of doing the work at the else clause, you can negate the term
if prevsymbol is not None:
try:
# Note that we can use prevsymbol here, instead of referring to cell once again
driver.find_element_by_id("name").send_keys(prevsymbol)
except NoSuchElementException:
#
print(cell + ":(name) Not Found")
# Insert in the form the Age of the person
cell = "B" + str(i)
prevsymbol = sheet[cell].value
if prevsymbol is not None:
try:
driver.find_element_by_id("age").send_keys(prevsymbol)
except NoSuchElementException:
print(cell + ":(Age) Not Found")
# Click Save as a draft
driver.find_element_by_xpath("xpath_save_as_draft").click()
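As a possible variation (a sketch, not part of the answer above; values_only needs openpyxl 2.6 or newer): iter_rows can hand you the cell values directly, which avoids building the "A" + str(i) coordinates by hand.
# Hedged alternative: iterate (name, age) value pairs directly.
# min_row=2 skips a header row; drop it if the data really starts at row 1.
for name, age in sheet.iter_rows(min_row=2, max_col=2, values_only=True):
    driver.get("https://XXXXXXXXXX")
    if name is not None:
        driver.find_element_by_id("name").send_keys(name)
    if age is not None:
        driver.find_element_by_id("age").send_keys(age)
    driver.find_element_by_xpath("xpath_save_as_draft").click()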
I am facing an issue with concatenating the data of two lists. I have the page number (an integer value) and the table data (text values), both within lists, and while merging them to print the output I am getting this error:
TypeError: object of type 'int' has no len()
My goal is to print the following output. I also need help saving the data from table 3: rows keyed by the "Aircraft operator ID" header should go into a table named "Table 3 A", and rows with a header value of "Installation ID" into a table named "Table 3 I", in 2 separate tabs of the Excel file. My code is given below:
import time
import requests
import random
from lxml import html # used to use Xpath
from bs4 import BeautifulSoup
import xlsxwriter
def append_row(ws, row):
for col, value in enumerate(row):
ws.write_string(ws.cur_row, col, value)
ws.cur_row += 1
workbook = xlsxwriter.Workbook('Output.xlsx')
ws_3_A = workbook.add_worksheet("Table 3 A")
ws_3_I = workbook.add_worksheet("Table 3 I")
# Keep a track of the row to use in each worksheet
ws_3_A.cur_row = 0
ws_3_I.cur_row = 0
# Code starts from here:
start = 1
end = 3
link = "http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=®istryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber={}&nextList=Next%C2%A0%3E&selectedPeriods="
for page_number in range(start, end):
print("Page {}".format(page_number))
url = link.format(page_number)
r = requests.get(url)
print(url)
serial_no = [int(x) for x in str(page_number)]
print(serial_no)
time.sleep(random.randint(2, 5))
soup = BeautifulSoup(r.content, "lxml")
# Table 3 Aircraft Operator ID data:
for items in soup.find(id="tblChildDetails").find("table").find_all("tr")[1:]:
dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
print(dataset)
append_row(ws_3_A, serial_no + [url] + dataset)
# Table 3 Installation ID data:
for items in soup.find(id="tblChildDetails").find("table").find_all("tr")[1:]:
dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
print(dataset)
append_row(ws_3_I, serial_no + [url] + dataset)
workbook.close()
The current and expected output are attached as screenshots.
Traceback:
Traceback (most recent call last):
File "D:\QRS\Script.py", line 42, in <module>
append_row(ws_3_A, serial_no + [url] + dataset)
File "D:\QRS\Script.py", line 10, in append_row
ws.write_string(ws.cur_row, col, value)
File "C:\Users\varun\AppData\Roaming\Python\Python36\site-packages\xlsxwriter\worksheet.py", line 67, in cell_wrapper
return method(self, *args, **kwargs)
File "C:\Users\varun\AppData\Roaming\Python\Python36\site-packages\xlsxwriter\worksheet.py", line 500, in write_string
return self._write_string(row, col, string, cell_format)
File "C:\Users\varun\AppData\Roaming\Python\Python36\site-packages\xlsxwriter\worksheet.py", line 512, in _write_string
if len(string) > self.xls_strmax:
TypeError: object of type 'int' has no len()
Each element in [int(x) for x in str(page_number)] should be a string, because each element is passed to the value parameter of append_row().
Then, to get the expected output, you need to ignore the first tr when page_number is greater than start.
Use try...finally, so you can close the workbook even when the script raises an error.
import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter
def append_row(ws, row):
for col, value in enumerate(row):
ws.write_string(ws.cur_row, col, value)
ws.cur_row += 1
workbook = xlsxwriter.Workbook('Output.xlsx')
def ws_3(name):
return workbook.add_worksheet("Table 3 {}".format(name))
# Code starts from here:
start = 1
end = 5
link = "http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=®istryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber={}&nextList=Next%C2%A0%3E&selectedPeriods="
coldict = {}
try:
for page_number in [1,2,3,342,343]:
print("Page {}".format(page_number))
url = link.format(page_number)
r = requests.get(url)
serial_no = [str(page_number)]
time.sleep(random.randint(2, 5))
soup = BeautifulSoup(r.content, "lxml")
# Table 3 Aircraft Operator ID data:
tr = soup.find(id="tblChildDetails").find("table").find_all("tr")
dataset = [item.get_text(strip=True) for item in tr[1].find_all("td")]
#select or create new table
if not coldict.get(dataset[0]):
ws = ws_3(dataset[0])
ws.cur_row = 0
coldict[dataset[0]] = ws
append_row(ws, ["Page no","Url"] + dataset)
else:
ws = coldict.get(dataset[0])
for items in tr[2:]:
dataset = [item.get_text(strip=True) for item in items.find_all("td")]
print(url)
print(dataset)
append_row(ws, serial_no + [url] + dataset)
finally:
workbook.close()
I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that the names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines on this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table but the resulting CSV file had missing information.
Full code :
import os
import csv
import time
import urllib2
uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'
def get_data():
try:
req = urllib2.Request(uri)
response = urllib2.urlopen(req, timeout=600)
content = response.read()
return content
except Exception, e:
print "\n[!] Error: " + str(e)
print ''
return False
def extract(data,rk):
print '\n[+] Extracting data.'
start = 0
while True:
store = [rk]
if data.find('nba/player/',start) == -1:
break
with open("data.csv", "ab") as fcsv:
main = data.find('nba/player/',start)
name_start = data.find('>',main) + 1
name_end = data.find('<',name_start)
store.append(data[name_start:name_end])
team_start = data.find('">',name_end) + 2
team_end = data.find('<',team_start)
store.append(data[team_start:team_end])
gp_start = data.find(' >',team_end) + 2
gp_end = data.find('<',gp_start)
store.append(data[gp_start:gp_end])
mpg_start = data.find(' >',gp_end) + 2
mpg_end = data.find('<',mpg_start)
store.append(data[mpg_start:mpg_end])
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
m3_start = data.find(' >',fg_end) + 2
m3_end = data.find('<',m3_start)
store.append(data[m3_start:m3_end])
p3_start = data.find(' >',m3_end) + 2
p3_end = data.find('<',p3_start)
store.append(data[p3_start:p3_end])
ft_start = data.find(' >',p3_end) + 2
ft_end = data.find('<',ft_start)
store.append(data[ft_start:ft_end])
ftp_start = data.find(' >',ft_end) + 2
ftp_end = data.find('<',ftp_start)
store.append(data[ftp_start:ftp_end])
start = name_end
rk = rk + 1
csv.writer(fcsv).writerow(store)
fcsv.close()
def main():
print "\n[+] Initializing..."
if not os.path.exists("data.csv"):
with open("data.csv", "ab") as fcsv:
csv.writer(fcsv).writerow(["RK","PLAYER","TEAM","GP", "MPG","PTS","FGM-FGA","FG%","3PM-3PA","3P%","FTM-FTA","FT%"])
fcsv.close()
rk = 1
global uri
while True:
time.sleep(1)
start = 0
print "\n[+] Getting data, please wait."
data = get_data()
if not data:
break
extract(data,rk)
print "\n[+] Preparing for next page."
time.sleep(1.5)
rk = rk + 40
if rk > 300:
print "\n[+] All Done !\n"
break
uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)
if __name__ == '__main__':
main()
I specifically want to know how to grab info based on the headlines, like TEAM, GP, MPG, PTS, FGM-FGA, FG%, 3PM-3PA, 3P%, FTM-FTA and FT%, so that the script doesn't need to be changed beyond things like pts or mpg in pts_start = data.find('">',mpg_end) + 2.
I don't understand why I can't just use the name of the headline as shown in the table for certain ones. For instance, instead of FTM-FTA, the script uses ft.
Extracting HTML data is rather easy with BeautifulSoup. The following example is for you to get the idea rather than a complete solution to your problem, but you can easily extend it.
from bs4 import BeautifulSoup
import urllib2
def get_html_page_dom(url):
response = urllib2.urlopen(url)
html_doc = response.read()
return BeautifulSoup(html_doc, 'html5lib')
def extract_rows(dom):
table_rows = dom.select('.mod-content tbody tr')
for tr in table_rows:
# skip headers
klass = tr.get('class')
if klass is not None and 'colhead' in klass:
continue
tds = tr.select('td')
yield {'RK': tds[0].string,
'PLAYER': tds[1].select('a')[0].string,
'TEAM': tds[2].string,
'GP': tds[3].string
# you can fetch rest of the indexs for corresponding headers
}
if __name__ == '__main__':
dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
for data in extract_rows(dom):
print(data)
You can simply run it and see the result ;).
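Building on get_html_page_dom above, here is a hedged sketch that reads the column headlines from the header row itself and zips them with each data row, so abbreviations like pts or ft never need to be hard-coded. It assumes the headline row is the tr with class colhead and uses td cells; adjust the selector if the page uses th.
def extract_rows_by_headline(dom):
    # Take the column names straight from the header row(s); zip() below
    # truncates to one row's worth of values, so repeated header rows are harmless.
    headers = [cell.get_text(strip=True)
               for cell in dom.select('.mod-content tr.colhead td')]
    for tr in dom.select('.mod-content tbody tr'):
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            continue  # skip repeated header rows
        values = [td.get_text(strip=True) for td in tr.select('td')]
        yield dict(zip(headers, values))

dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/rebounds')
for row in extract_rows_by_headline(dom):
    print(row)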