Formatting a table from a CSV file - Python

I'm trying to make a table from data in a CSV file using only the csv module. Could anyone tell me what I should do to display the '|' at the end of every row (just after the last element in the row)?
Here's what I have so far:
import csv

def display_playlist(filename):
    if not filename.endswith('.csv'):   # check whether it ends with the .csv extension
        filename = filename + '.csv'    # add .csv if given without the extension
    max_element_length = 0
    # align columns to the longest element
    for row in get_datalist_from_csv(filename):
        for element in row:
            if len(element) > max_element_length:
                max_element_length = len(element)
    # print(max_element_length)
    # return max_element_length
    print('-----------------------------------------------------------------------------')
    for row in get_datalist_from_csv(filename):
        for element in row:
            print('| ', end='')
            if len(element) <= 4 and element.isdigit():
                print(pad_to_length(element, 4), end=' |')  # trying to get '|' at the end
            else:
                print(pad_to_length(element, max_element_length), end=' ')
        print('\n')
    print('-----------------------------------------------------------------------------')
## Read data from a CSV format file
def get_datalist_from_csv(filename):
    ## Create a 'file object' f for accessing the file:
    with open(filename) as f:
        reader = csv.reader(f)   # create a 'csv reader' from the file object
        datalist = list(reader)  # create a list from the reader
    return datalist              # we have a list of lists

## For aligning table columns:
## adds spaces to the end of a string to make it up to length n.
def pad_to_length(string, n):
    return string + " " * (n - len(string))  ## s*n gives an empty string for n < 1
The output I get for now is:
| Track | Artist | Album | Time
| Computer Love | Kraftwerk | Computer World | 7:15
| Paranoid Android | Radiohead | OK Computer | 6:27
| Computer Age | Neil Young | Trans | 5:24
| Digital | Joy Division | Still | 2:50
| Silver Machine | Hawkwind | Roadhawks | 4:39
| Start the Simulator | A-Ha | Foot of the Mountain | 5:11
| Internet Connection | M.I.A. | MAYA | 2:56
| Deep Blue | Arcade Fire | The Suburbs | 4:29
| I Will Derive! | MindofMatthew | You Tube | 3:17
| Lobachevsky | Tom Lehrer | You Tube | 3:04
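For reference, one way to get the closing '|' is to pad every cell, join the cells with ' | ', and wrap the whole row in bars; a minimal sketch reusing pad_to_length from above (display_row is a hypothetical helper name, not the asker's final code):

def display_row(row, width):
    # pad each cell to the column width, then add '|' at both ends
    cells = [pad_to_length(element, width) for element in row]
    print('| ' + ' | '.join(cells) + ' |')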

Related

I'm trying to scrape an infinite-scrolling website. It gets stuck around the 200th record

I scrolled with selenium, grabbed all the URLs, and used them with BeautifulSoup, but there are many duplicates in the scraped data. I tried to remove them with drop_duplicates, but it gets stuck at about the 200th record and I cannot find the problem. I've added the code I use below. I want to grab all prices, areas, rooms, etc.
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver_win32\chromedriver.exe')
driver.get('https://tap.az/elanlar/dasinmaz-emlak/menziller')
time.sleep(1)

price = []
citi = []
elann = []
bina = []
arrea = []
adres = []
roome = []
baxhise = []
mulkayet = []
descript = []
urll = []
zefer = []

# scroll until the page height stops growing
previous_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == previous_height:
        break
    previous_height = new_height

lnks = driver.find_elements(By.CSS_SELECTOR, '#content > div > div > div.categories-products.js-categories-products > div.js-endless-container.products.endless-products > div.products-i')
for itema in lnks:
    urla = itema.find_element(By.TAG_NAME, 'a')
    aae = urla.get_attribute('href')
    urel = aae.split('/bookmark')[0]
    result = requests.get(urel)
    soup = bs(result.text, 'html.parser')
    casee = soup.find_all("div", {"class": "lot-body l-center"})
    for ae in casee:
        c = ae.find_all('table', class_='properties')
        pp = c[0].text
        city = pp.split('Şəhər')[-1].split('Elanın')[0].replace('ş', 'sh').replace('ə', 'e').replace('ü', 'u').replace('ö', 'o').replace('ı', 'i').replace('ğ', 'g').replace('ç', 'ch').replace('Ç', 'ch').replace('Ş', 'sh').replace('Ə', 'e').replace('Ü', 'u').replace('Ö', 'o').replace('İ', 'I')
        cx = c[0].text
        elan_tipi = cx.split('Elanın tipi')[-1].split('Binanın tipi')[0].replace(' verilir', '')
        elane = elan_tipi.replace(' ', '_').replace('ş', 'sh').replace('ə', 'e').replace('ü', 'u').replace('ö', 'o').replace('ı', 'i').replace('ğ', 'g').replace('ç', 'ch').replace('Ç', 'ch').replace('Ş', 'sh').replace('Ə', 'e').replace('Ü', 'u').replace('Ö', 'o').replace('İ', 'I')
        bina_tipi = cx.split('Binanın tipi')[-1].split('Sahə')[0].replace(' ', '_').replace('ş', 'sh').replace('ə', 'e').replace('ü', 'u').replace('ö', 'o').replace('ı', 'i').replace('ğ', 'g').replace('ç', 'ch').replace('Ç', 'ch').replace('Ş', 'sh').replace('Ə', 'e').replace('Ü', 'u').replace('Ö', 'o').replace('İ', 'I')
        area = cx.split('tikiliSahə,')[-1].split('Otaq')[0].replace('m²', '')
        room = cx.split('Otaq sayı')[-1].split('Yerləşmə yeri')[0]
        addresss = cx.split('Yerləşmə yeri')[-1].replace('ş', 'sh').replace('ə', 'e').replace('ü', 'u').replace('ö', 'o').replace('ı', 'i').replace('ğ', 'g').replace('ç', 'ch').replace('Ç', 'ch').replace('Ş', 'sh').replace('Ə', 'e').replace('Ü', 'u').replace('Ö', 'o').replace('İ', 'I')
        d = ae.find_all('p')
        elan_kod = d[0].text.replace('Elanın nömrəsi:', '')
        baxhis = d[1].text.replace('Baxışların sayı: ', '')
        description = d[3].text.replace('Baxışların sayı: ', '').replace('ş', 'sh').replace('ə', 'e').replace('ü', 'u').replace('ö', 'o').replace('ı', 'i').replace('ğ', 'g').replace('ç', 'ch').replace('Ç', 'ch').replace('Ş', 'sh').replace('Ə', 'e').replace('Ü', 'u').replace('Ö', 'o').replace('İ', 'I').replace("\n", '')
        kim = ae.find_all('div', class_='author')
        kime = kim[0].text
        if 'bütün' in kime:
            mulkiyet = 0
        else:
            mulkiyet = 1
    caseee = soup.find_all("div", {"class": "middle"})
    for aecex in caseee:
        pricxxe = aecex.find_all('span', class_='price-val')
        pricef = pricxxe[0].text.replace(' ', '')
    price.append(pricef)
    zefer.append(elane)
    elann.append(elan_kod)
    citi.append(city)
    bina.append(bina_tipi)
    arrea.append(area)
    adres.append(addresss)
    roome.append(room)
    baxhise.append(baxhis)
    mulkayet.append(mulkiyet)
    descript.append(description)

ae = pd.DataFrame({'URL': urel, 'Unique_id': elann, 'Price': price, 'Room': roome, 'Area': arrea, 'Seher': citi, 'Elan_tipi': zefer, 'Description': descript, 'Address': adres, 'Category': bina, 'Mulkiyyet': mulkayet})
aere = ae.drop_duplicates()
aere.to_csv('dde.csv', index=False, encoding='utf-8')
One cause of duplicates is that every time you get lnks, you're also getting the products you scraped before scrolling. You can probably skip duplicate scrapes by initializing scrapedUrls = [] somewhere at the beginning of your code (OUTSIDE of all loops), then checking urel against it and adding to it:
if urel in scrapedUrls: continue ## add this line
result = requests.get(urel) ## from your code
scrapedUrls.append(urel) ## add this line
but I'm not sure it'll solve your issue.
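Integrated into the original loop, the check would sit right before the request; a sketch using the names from the question:

scrapedUrls = []  # initialize once, outside all loops
for itema in lnks:
    urla = itema.find_element(By.TAG_NAME, 'a')
    urel = urla.get_attribute('href').split('/bookmark')[0]
    if urel in scrapedUrls:   # already scraped: skip it
        continue
    result = requests.get(urel)
    scrapedUrls.append(urel)
    # ... rest of the scraping as before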
I don't know why it's happening, but when I try to scrape the links with selenium's find_elements, I get the same URL over and over; so I wrote a function [getUniqLinks] that you can use to get a unique list of links (prodUrls) by scrolling up to a certain number of times and then parsing page_source with BeautifulSoup. Below are two lines from the printed output of prodUrls = getUniqLinks(fullUrl, rootUrl, max_scrolls=250, tmo=1):
WITH SELENIUM found 10957 product links [1 unique]
PARSED PAGE_SOURCE ---> found 12583 product links [12576 unique]
(The full function and printed output are at https://pastebin.com/b3gwUAJZ.)
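A condensed sketch of the idea (the full function at the pastebin differs in details; the products-i selector, and the driver, bs, and time names, are taken from the question's code):

def getUniqLinks(fullUrl, rootUrl, max_scrolls=250, tmo=1):
    driver.get(fullUrl)
    for _ in range(max_scrolls):
        previous_height = driver.execute_script('return document.body.scrollHeight')
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(tmo)
        if driver.execute_script('return document.body.scrollHeight') == previous_height:
            break
    # parse the fully scrolled page once, instead of using find_elements
    soup = bs(driver.page_source, 'html.parser')
    hrefs = [a.get('href', '').split('/bookmark')[0] for a in soup.select('div.products-i a')]
    hrefs = [h if h.startswith('http') else rootUrl + h for h in hrefs if h]
    return list(dict.fromkeys(hrefs))  # de-duplicate while preserving order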
Some notes:
If you increase tmo, you can increase max_scrolls too, but it starts getting quite slow after 100 scrolls.
I used selenium to get links as well, just to print and show the difference, but you can remove all lines that end with # remove to get rid of those unnecessary parts.
I used selenium's WebDriverWait instead of time.sleep because it stops waiting as soon as the relevant elements have loaded; it raises an error if they don't load in the allowed time (tmo), so I found it more convenient and readable to use in a try...except block than driver.implicitly_wait (see the sketch after these notes).
I don't know if this is related to whatever is causing your program to hang [since mine is probably just because of the number of elements being too many], but mine also hangs if I try to use selenium to get all the links after scrolling instead of adding to prodLinks in chunks inside the loop.
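A sketch of that wait (the expected_conditions choice and the div.products-i selector are assumptions based on the question's markup):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

try:
    # stop waiting as soon as the product cards are present, or give up after tmo seconds
    WebDriverWait(driver, tmo).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.products-i')))
except TimeoutException:
    pass  # nothing new loaded within tmo seconds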
Now, you can loop through prodUrls and get the data you want, but I think it's better to build a list with a separate dictionary for each link [i.e., having a dictionary for each row rather than having a separate list for each column].
If you use these two functions, then you just have to prepare a reference dictionary of selectors like
refDict = {
    'title': 'h1.js-lot-title',
    'price_text': 'div.price-container',
    'price_amt': 'div.price-container > .price span.price-val',
    'price_cur': 'div.price-container > .price span.price-cur',
    '.lot-text tr.property': {'k': 'td.property-name', 'v': 'td.property-value'},
    'contact_name': '.author > div.name',
    'contact_phone': '.author > a.phone',
    'lot_warning': 'div.lot-warning',
    'div.lot-info': {'sel': 'p', 'sep': ':'},
    'description': '.lot-text p'
}
that can be passed to fillDict_fromTag like in the code below:
## FIRST PASTE FUNCTION DEFINITIONS FROM https://pastebin.com/hKXYetmj
productDetails = []
puLen = len(prodUrls)
for pi, pUrl in enumerate(prodUrls[:500]):
    print('', end=f'\rScraping [for {pi+1} of {puLen}] {pUrl}')
    pDets = {'prodId': [w for w in pUrl.split('/') if w][-1]}
    resp = requests.get(pUrl)
    if resp.status_code != 200:
        pDets['Error_Message'] = f'{resp.raise_for_status()}'
        pDets['sourceUrl'] = pUrl
        productDetails.append(pDets)
        continue
    pSoup = BeautifulSoup(resp.content, 'html.parser')
    pDets = fillDict_fromTag(pSoup, refDict, pDets, rootUrl)
    pDets['sourceUrl'] = pUrl
    productDetails.append(pDets)
print()

prodDf = pd.DataFrame(productDetails).set_index('prodId')
prodDf.to_csv('ProductDetails.csv')
I have uploaded both 'prodLinks.csv' and 'ProductDetails.csv' here, although they contain only the first 500 scrapes' results, since I manually interrupted after around 20 minutes; I'm also pasting the first 3 rows here (printed with print(prodDf.loc[prodDf.index[:3]].to_markdown())):
| prodId | title | price_text | price_amt | price_cur | Şəhər | Elanın tipi | Elanın tipi [link] | Binanın tipi | Binanın tipi [link] | Sahə, m² | Otaq sayı | Yerləşmə yeri | contact_name | contact_phone | lot_warning | Elanın nömrəsi | Baxışların sayı | Yeniləndi | description | sourceUrl |
|---------:|:---------------------------------------------------------|:-------------|:------------|:------------|:--------|:---------------|:----------------------------------------------------------------|:---------------|:----------------------------------------------------------------|-----------:|------------:|:----------------|:---------------|:----------------|:-----------------------------------------------------------------------------|-----------------:|------------------:|:---------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------|
| 35828514 | 2-otaqlı yeni tikili kirayə verilir, 20 Yanvar m., 45 m² | 600 AZN | 600 | AZN | Bakı | Kirayə verilir | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B740%5D=3724 | Yeni tikili | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B747%5D=3849 | 45 | 2 | 20 Yanvar m. | Elşad Bəy | (055) 568-12-13 | Diqqət! Beh göndərməmişdən öncə sövdələşmənin təhlükəsiz olduğuna əmin olun! | 35828514 | 105 | 22 Noyabr 2022 | 20 Yanvar metrosuna və Inşatcılar metrosuna 8 - 11 dəiqqə arası olan ərazidə, yeni tikili binada 1 otaq 2 otaq təmirli şəraitiynən mənzil kirayə 600 manata, ailiyə və iş adamına verilir. Qabaqçadan 2 ay ödəniş olsa kamendant pulu daxil, ayı 600 manat olaçaq, mənzili götūrən şəxs 1 ayın 20 % vasitəciyə ödəniş etməlidir. Xahìş olunur, rial olmuyan şəxs zəng etməsin. | https://tap.az/elanlar/dasinmaz-emlak/menziller/35828514 |
| 35833080 | 1-otaqlı yeni tikili kirayə verilir, Quba r., 60 m² | 40 AZN | 40 | AZN | Quba | Kirayə verilir | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B740%5D=3724 | Yeni tikili | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B747%5D=3849 | 60 | 1 | Quba r. | Orxan | (050) 604-27-60 | Diqqət! Beh göndərməmişdən öncə sövdələşmənin təhlükəsiz olduğuna əmin olun! | 35833080 | 114 | 22 Noyabr 2022 | Quba merkezde her weraiti olan GUNLUK KIRAYE EV.Daimi isti soyuq su hamam metbex wifi.iwciler ve aile ucun elveriwlidir Təmirli | https://tap.az/elanlar/dasinmaz-emlak/menziller/35833080 |
| 35898353 | 4-otaqlı mənzil, Nizami r., 100 m² | 153 000 AZN | 153 000 | AZN | Bakı | Satılır | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B740%5D=3722 | Köhnə tikili | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B747%5D=3850 | 100 | 4 | Nizami r. | Araz M | (070) 723-54-50 | Diqqət! Beh göndərməmişdən öncə sövdələşmənin təhlükəsiz olduğuna əmin olun! | 35898353 | 71 | 27 Noyabr 2022 | X.Dostluğu metrosuna 2 deq mesafede Leninqrad lahiyeli 9 mərtəbəli binanın 5-ci mərtəbəsində 4 otaqlı yaxsi temirli mənzil satılır.Əmlak ofisinə ödəniş alıcı tərəfindən məbləğin 1%-ni təşkil edir. | https://tap.az/elanlar/dasinmaz-emlak/menziller/35898353 |

Why won't pywinauto detect the control identifiers in a window, except the title bar?

I've been trying to automate the Cinebench window in Python using pywinauto, as this is the best library I came across. I've made a few projects that worked well, but with Cinebench I don't get any control identifiers (except for the title and the usual 3 top buttons). My main objective is to be able to automatically start benchmarks and read the final score.
I didn't come here to bother you all as soon as I hit an issue, so here's everything I've tried:
Switching backend="uia" to backend="win32". Result: code stopped working
Waiting for the window to load, using time.sleep(). Result: no difference
Adding timeout=10 to the .connect() call. Result: no difference
Researching whether Cinebench has an API. Result: of course it doesn't (as far as I found)
Researching whether there is another library to do it. Result: didn't find any
I really don't want to do this by clicking at fixed coordinates, and even then I wouldn't be able to read anything from it, so it would be useless.
The code that I used:
app = Application(backend="uia").start(rf"C:/Users/{os.getlogin()}/Desktop/MasterBench/Benchmarks/Cinebench.exe")
app = Application(backend="uia").connect(title=CINEBENCH_WINDOW_NAME, timeout=10)
app.CINEBENCHR23200.print_control_identifiers()
What I got:
Control Identifiers:
Dialog - 'CINEBENCH R23.200' (L-8, T-8, R1928, B1088)
['CINEBENCH R23.200', 'CINEBENCH R23.200Dialog', 'Dialog']
child_window(title="CINEBENCH R23.200", control_type="Window")
|
| TitleBar - '' (L16, T-5, R1920, B23)
| ['TitleBar']
| |
| | Menu - 'Sistema' (L0, T0, R22, B22)
| | ['SistemaMenu', 'Sistema', 'Menu', 'Sistema0', 'Sistema1']
| | child_window(title="Sistema", auto_id="MenuBar", control_type="MenuBar")
| | |
| | | MenuItem - 'Sistema' (L0, T0, R22, B22)
| | | ['Sistema2', 'SistemaMenuItem', 'MenuItem']
| | | child_window(title="Sistema", control_type="MenuItem")
| |
| | Button - 'Riduci a icona' (L1779, T8, R1826, B22)
| | ['Button', 'Riduci a iconaButton', 'Riduci a icona', 'Button0', 'Button1']
| | child_window(title="Riduci a icona", control_type="Button")
| |
| | Button - 'Ripristino' (L1826, T8, R1872, B22)
| | ['Button2', 'Ripristino', 'RipristinoButton']
| | child_window(title="Ripristino", control_type="Button")
| |
| | Button - 'Chiudi' (L1872, T8, R1928, B22)
| | ['Button3', 'Chiudi', 'ChiudiButton']
| | child_window(title="Chiudi", control_type="Button")
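One way to double-check whether the window really exposes nothing below the title bar is to dump the UIA tree explicitly; a diagnostic sketch (not a fix), reusing CINEBENCH_WINDOW_NAME from the question:

from pywinauto import Application

app = Application(backend="uia").connect(title=CINEBENCH_WINDOW_NAME, timeout=10)
main = app.window(title=CINEBENCH_WINDOW_NAME)
main.print_control_identifiers(depth=20)   # ask for a deeper dump explicitly
for ctrl in main.wrapper_object().descendants():
    # list every element UIA can see, with its control type and text
    print(ctrl.element_info.control_type, '-', ctrl.window_text())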

Python regex: match a pattern that starts with a dot and store it in dict format

#-----------------------------------------------------------------------------------
from pprint import pprint
data = '''
.
.
.
#Long log file
-------------------------------------------------------------------------------
Section Name | Budget | Size | Prev Size | Overflow
--------------------------------+-----------+-----------+-----------+----------
.text.resident | 712924 | 794576 | 832688 | YES
.rodata.resident | 77824 | 77560 | 21496 | YES
.data.resident | 28672 | 28660 | 42308 | NO
.bss.resident | 52672 | 1051632 | 1455728 | YES
.
.
.
'''
Output expected:
MEMDICT = {'.text.resident' : {'Budget':'712924', 'Size':'794576', 'Prev Size': '832688' , 'Overflow': 'YES'},
'.rodata.resident' : {'Budget':'', 'Size':'', 'Prev Size': '' , 'Overflow': 'YES'},
'.data.resident' :{'Budget':'', 'Size':'', 'Prev Size': '' , 'Overflow': 'NO'},
'.bss.resident' :{'Budget':'', 'Size':'', 'Prev Size': '' , 'Overflow': 'YES'}}
I am a beginner in Python. Please suggest some simple steps.
Logic:
Search for a regex pattern and get the headers into a list:
pattern = re.compile(r'\sSection Name\s|\sBudget*')  # This can be improved
if pattern.match(line):
    key_list = (''.join(line.split())).split('|')  # Unable to handle space issues, so trimmed and used
Search for a regex pattern to match .something.resident | \d+ | \d+ | \d+ | **; I need some help getting this into value_list.
Make all the lists into the dict in a loop:
mem_info = {}  # reset the dict
for i in range(0, len(key_list)):
    mem_info[key_list[i]] = value_list[i]
MEMDICT[sta_info[0]] = mem_info
The only thing you haven't shown us is what line ends the section. Other than that, this is what you need:
keeper = False
memdict = {}
for line in open(file):
    if not keeper:
        if 'Section Name' in line:
            keeper = True
        continue
    if '-------------------' in line:
        continue
    if 'whatever ends the section' in line:
        break
    parts = [p.strip() for p in line.split('|')]  # split on the column separator
    memdict[parts[0]] = {
        'Budget': int(parts[1]),
        'Size': int(parts[2]),
        'Prev Size': int(parts[3]),
        'Overflow': parts[4]
    }
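Since the question asked for a regex, here is a minimal regex-based sketch that builds MEMDICT straight from the sample data string above (assuming every data row starts with a dot-prefixed section name):

import re
from pprint import pprint

row_re = re.compile(r'^\s*(\.\S+)\s*\|\s*(\d+)\s*\|\s*(\d+)\s*\|\s*(\d+)\s*\|\s*(\w+)')
MEMDICT = {}
for line in data.splitlines():
    m = row_re.match(line)
    if m:
        name, budget, size, prev, overflow = m.groups()
        MEMDICT[name] = {'Budget': budget, 'Size': size,
                         'Prev Size': prev, 'Overflow': overflow}
pprint(MEMDICT)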

How to disable text wrap in a columnar column?

|---------|------------------|------------------|-----------|------------------|
|serial no|ggggggg name |status |status code|AAAAAAAAAurl |
|==============================================================================|
|1 |ggggggggggg-kkkkkk|Healthy |200 |http://aaaaaaaaaaa|
| |e | | |-service.dev.sdddd|
| | | | |1.cccc.cc/health/l|
| | | | |ive |
|---------|------------------|------------------|-----------|------------------|
|2 |zzzzzzzz-jjjjjj |Healthy |200 |http://ddddddddddd|
| | | | |ader.dev.ffffff.cc|
| | | | |cc.cc/health/live |
|---------|------------------|------------------|-----------|------------------|
I am trying to get the entire URL of the last column onto one row. I am using the following Python library to print this and have tried a few things, but I am unable to get it working. I tried https://pypi.org/project/Columnar/, setting max column width, min column width, and so on as mentioned there, but none of it works.
Edit: headers are simply the names of the columns; you can name them anything you want.
from columnar import columnar

headers = ['serial no', 'service name', 'status', 'status code']
...
tabledata = []
counter = 0
for x in services:
    zzz = requests.get("http://xxx.yyy" + x)
    counter = counter + 1
    i = counter
    myrowdata = [i, x, zzz.text, zzz.status_code]
    tabledata.append(myrowdata)

table = columnar(tabledata, headers, no_borders=True, max_column_width=None)
print(table)
1.) You missed the column name "url" in headers. You should do as follows:
headers = ['serial no', 'service name', 'status', 'status code', 'url']
2.) You have to add the url to myrowdata:
myrowdata = [i, x, zzz.text, zzz.status_code, "http://xxx.yyy" + x]
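Putting both fixes together (a sketch, with services and the base URL as in the question; enumerate replaces the manual counter):

tabledata = []
for i, x in enumerate(services, start=1):
    zzz = requests.get("http://xxx.yyy" + x)
    tabledata.append([i, x, zzz.text, zzz.status_code, "http://xxx.yyy" + x])

headers = ['serial no', 'service name', 'status', 'status code', 'url']
table = columnar(tabledata, headers, no_borders=True, max_column_width=None)
print(table)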
Update:
If you did all the fixes above, you have to run it in an external system terminal to get the real result, as some IDEs' internal consoles constrain the display width.
In Spyder:
SERIAL NO SERVICE NAME STATUS STATUS CODE URL
1 Anyname Anytext Anystatus_code http://aaaaaaaaaaaaaaaaaaa
aadddddddddddddddddddddddd
dddddddaaaaaaaaa.com
In an external system terminal, the full URL prints on a single line.

Getting single letters instead of sentences after applying NLTK's sentence tokenizer in Python 3.5.1

import codecs, os
import re
import string
import mysql
import mysql.connector

y_ = ""

'''Searching for and reading text files from a folder.'''
for root, dirs, files in os.walk("/Users/ultaman/Documents/PAN dataset/Pan Plagiarism dataset 2010/pan-plagiarism-corpus-2010/source-documents/test1"):
    for file in files:
        if file.endswith(".txt"):
            x_ = codecs.open(os.path.join(root, file), "r", "utf-8-sig")
            for lines in x_.readlines():
                y_ = y_ + lines

'''Tokenizing the sentences of the text file.'''
from nltk.tokenize import sent_tokenize
raw_docs = sent_tokenize(y_)
tokenized_docs = [sent_tokenize(y_) for sent in raw_docs]

'''Removing punctuation marks.'''
regex = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = ''
for review in tokenized_docs:
    new_review = ''
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review += new_token
    tokenized_docs_no_punctuation += new_review
print(tokenized_docs_no_punctuation)

'''Connecting to the database and inserting the tokenized documents without punctuation.'''
def connect():
    for i in range(len(tokenized_docs_no_punctuation)):
        conn = mysql.connector.connect(user='root', password='', unix_socket="/tmp/mysql.sock", database='test')
        cursor = conn.cursor()
        cursor.execute("""INSERT INTO splitted_sentences(sentence_id, splitted_sentences) VALUES(%s, %s)""",
                       (cursor.lastrowid, tokenized_docs_no_punctuation[i]))
        conn.commit()
        conn.close()

if __name__ == '__main__':
    connect()
After running the above code, the result in the database looks like:
2 | S | N |
| 3 | S | o |
| 4 | S | |
| 5 | S | d |
| 6 | S | o |
| 7 | S | u |
| 8 | S | b |
| 9 | S | t |
| 10 | S | |
| 11 | S | m |
| 12 | S | y |
| 13 | S |
| 14 | S | d
in the database.
It should be like:
1 | S | No doubt, my dear friend.
2 | S | no doubt.
I suggest making the following edits (use what you like); this is what I used to get your code running. Your issue is that review in for review in tokenized_docs: is already a string, so token in for token in review: iterates over characters. To fix this I tried:
tokenized_docs = ['"No doubt, my dear friend, no doubt; but in the meanwhile suppose we talk of this annuity.', 'Shall we say one thousand francs a year."', '"What!"', 'asked Bonelle, looking at him very fixedly.', '"My dear friend, I mistook; I meant two thousand francs per annum," hurriedly rejoined Ramin.', 'Monsieur Bonelle closed his eyes, and appeared to fall into a gentle slumber.', 'The mercer coughed;\nthe sick man never moved.', '"Monsieur Bonelle."']
'''Removing punctuation marks.'''
regex = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_token = regex.sub(u'', review)
    if not new_token == u'':
        tokenized_docs_no_punctuation.append(new_token)
print(tokenized_docs_no_punctuation)
and got this -
['No doubt my dear friend no doubt but in the meanwhile suppose we talk of this annuity', 'Shall we say one thousand francs a year', 'What', 'asked Bonelle looking at him very fixedly', 'My dear friend I mistook I meant two thousand francs per annum hurriedly rejoined Ramin', 'Monsieur Bonelle closed his eyes and appeared to fall into a gentle slumber', 'The mercer coughed\nthe sick man never moved', 'Monsieur Bonelle']
The final format of the output is up to you. I prefer using lists. But you could concatenate this into a string as well.
nw = []
for review in tokenized_docs[0]:
    new_review = ''
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review += new_token
    nw.append(new_review)

'''Inserting into the database'''
def connect():
    for j in nw:
        conn = mysql.connector.connect(user='root', password='', unix_socket="/tmp/mysql.sock", database='Thesis')
        cursor = conn.cursor()
        cursor.execute("""INSERT INTO splitted_sentences(sentence_id, splitted_sentences) VALUES(%s, %s)""",
                       (cursor.lastrowid, j))
        conn.commit()
        conn.close()

if __name__ == '__main__':
    connect()
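Opening a new connection for every row is also expensive; a sketch of a single-connection variant using executemany (assuming sentence_id is an auto-increment column, an assumption; adjust the column list otherwise):

def connect():
    conn = mysql.connector.connect(user='root', password='',
                                   unix_socket="/tmp/mysql.sock", database='Thesis')
    cursor = conn.cursor()
    # one batched round trip instead of one connection per sentence
    cursor.executemany("""INSERT INTO splitted_sentences(splitted_sentences) VALUES(%s)""",
                       [(j,) for j in nw])
    conn.commit()
    conn.close()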
