Reading File and Printing Output - python

I am reading a file and the output is supposed to look like the one below. Ignoring the actual table layout, the values for my hours, minutes and seconds are off, as is the money, which is supposed to be calculated by rounding each call up to the minute. I have tried many ways to solve this and this is my last resort.
+--------------+------------------------------+---+---------+--------+
| Phone number | Name                         | # |Duration |   Due  |
+--------------+------------------------------+---+---------+--------+
|(780) 123 4567|Ameneh Gholipour Shahraki     |384|55h07m53s|$ 876.97|**
|(780) 123 6789|Stuart Johnson                |132|17h53m19s|$ 288.81|
|(780) 321 4567|Md Toukir Imam                |363|49h52m12s|$ 827.48|++
|(780) 432 1098|Hamman Samuel                 |112|16h05m09s|$ 259.66|
|(780) 492 2860|Osmar Zaiane                  |502|69h27m48s|$1160.52|**
|(780) 789 0123|Elham Ahmadi                  |259|35h56m10s|$ 596.94|
|(780) 876 5432|Amir Hossein Faghih Dinevari  |129|17h22m32s|$ 288.56|
|(780) 890 7654|Weifeng Chen                  |245|33h48m46s|$ 539.41|
|(780) 987 6543|Farrukh Ahmed                 |374|52h50m11s|$ 883.72|**
+--------------+------------------------------+---+---------+--------+
| Total dues   |                                          $   5722.07|
+--------------+-----------------------------------------------------+
This is my code, and I am having the most trouble with the time() and due() functions:
from collections import Counter

customers=open('customers.txt','r')
calls=open('calls.txt.','r')

def main():
    customers=open('customers.txt','r')
    calls=open('calls.txt.','r')
    print("+--------------+------------------------------+---+---------+--------+")
    print("| Phone number | Name                         | # |Duration |   Due  |")
    print("+--------------+------------------------------+---+---------+--------+")
    phone_sorter()
    number_calls()
    time()
    due()

def phone_sorter():
    sorted_no={}
    for line in customers:
        rows=line.split(";")
        sorted_no[rows[1]]=rows[0]
    for value in sorted(sorted_no.values()):
        for key in sorted_no.keys():
            if sorted_no[key] == value:
                print(sorted_no[key],key)

def number_calls():
    no_calls={}
    for line in calls:
        rows=line.split(";")
        if rows[1] not in no_calls:
            no_calls[rows[1]]=1
        else:
            no_calls[rows[1]]+=1
    s=sorted(no_calls.keys())
    for key in s:
        print(no_calls[key])

def time():
    calls=open('calls.txt.','r')
    n_list=[]
    d={}
    for line in calls:
        rows=line.split(";")
        d[rows[1]]=rows[3]
        if rows[1] not in d:
            d[rows[1]]=rows[3]
        else:
            d[rows[1]]+=rows[3]
    x=sorted(d.keys())
    for value in x:
        m, s = divmod(int(value), 60)
        h, m = divmod(m, 60)
        print("%d:%02d:%02d" % (h, m, s))

def due():
    calls=open('calls.txt.','r')
    d2={}
    for line in calls:
        rows=line.split(";")
        d2[rows[1]]=float(rows[3])*float(rows[4])
        if rows[1] not in d2:
            d2[rows[1]]=float(rows[3])*float(rows[4])
        else:
            d2[rows[1]]+=float(rows[3])*float(rows[4])
    x=sorted(d2.keys())
    for key in x:
        print(d2[key])
    print(sum(d2.values()))

main()
This is the link to the file I am reading in pastebin: http://pastebin.com/RSMnXDtq
The first column is for the phone number. This number has to be formatted as (999) 999 9999.
The second column is for the name and it has to be 30 characters wide.
The third column is for the number of calls originating from the phone in question. It should be printed on 3 digits.
The fourth column is for the total duration of the calls originating from the phone in question. This duration should be formatted as follows: 99h99m99s for hours, minutes and seconds. The minutes and seconds should have a prefix of 0 if less than 10.
The fifth column is for the amount paid for the calls calculated based on the rates attached to each call. Note that the duration for each call should be rounded up to the minute in order to use the rate per minute. This amount should be printed with 7 positions and only 2 after the decimal point.
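For the rounding rule, here is a minimal sketch of the per-customer accumulation (assuming the calls file is semicolon-separated with the phone number in field 1, the duration in seconds in field 3 and the per-minute rate in field 4, as in the code above); each call is rounded up to the next minute before the rate is applied, and the totals are accumulated with dict.get instead of being overwritten:
import math

seconds_total = {}   # phone number -> total seconds
dues_total = {}      # phone number -> total amount due

with open('calls.txt', 'r') as calls:
    for line in calls:
        rows = line.strip().split(";")
        phone, secs, rate = rows[1], int(rows[3]), float(rows[4])
        seconds_total[phone] = seconds_total.get(phone, 0) + secs
        # round each call up to the next minute before applying the per-minute rate
        dues_total[phone] = dues_total.get(phone, 0) + math.ceil(secs / 60) * rate

for phone in sorted(seconds_total):
    m, s = divmod(seconds_total[phone], 60)
    h, m = divmod(m, 60)
    print("{:>2d}h{:02d}m{:02d}s ${:7.2f}".format(h, m, s, dues_total[phone]))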

Here is a solution using pandas:
import numpy as np
from pandas import read_csv

#
# Load and process call data
#
def get_data(calls_fname, custs_fname):
    # load call data
    calls = read_csv(
        calls_fname,
        sep = ";",
        names = ["session", "phone", "to", "seconds", "rate"],
        header = None
    )
    # calculate cost per call (time rounded up to the next minute)
    calls["cost"] = np.ceil(calls["seconds"] / 60.) * calls["rate"]
    # add a call-count column
    # (I think there is a better way to do this using np.size in
    #  the .groupby, but I haven't been able to figure it out;
    #  a named-aggregation alternative is sketched after this code)
    calls["count"] = 1
    # find per-cust totals
    calls = calls.groupby(["phone"]).agg({"seconds": np.sum, "cost": np.sum, "count": np.sum})
    # load customer data
    custs = read_csv(
        custs_fname,
        sep = ";",
        names = ["phone", "name"],
        header = None,
        index_col = 0    # index by phone number
    )
    # join to find customer name
    return calls.join(custs, sort=False).reset_index()

#
# output formatting functions
#
def phone_str(i):
    """
    Convert int 1234567890 to str "(123) 456 7890"
    """
    s = str(i).zfill(10)
    return "({}) {} {}".format(s[0:3], s[3:6], s[6:10])

def time_str(i):
    """
    Convert int 3662 to str " 1h01m02s"
    """
    m, s = divmod(i, 60)
    h, m = divmod(m, 60)
    return "{:>2d}h{:02d}m{:02d}s".format(h, m, s)

def make_table(totals):
    header = (
        "+--------------+------------------------------+---+---------+--------+\n"
        "| Phone number | Name                         | # |Duration |   Due  |\n"
        "+--------------+------------------------------+---+---------+--------+\n"
    )
    rows = [
        "|{}|{:<30}|{:>3d}|{}|${:7.2f}|\n"
        .format(
            phone_str(row["phone"]),
            row["name"],
            row["count"],
            time_str(row["seconds"]),
            row["cost"]
        )
        for i, row in totals.iterrows()
    ]
    total_dues = np.sum(totals["cost"])
    footer = (
        "+--------------+------------------------------+---+---------+--------+\n"
        "| Total dues   |                                          ${:10.2f}|\n"
        "+--------------+-----------------------------------------------------+"
        .format(total_dues)
    )
    return header + "".join(rows) + footer

def main():
    totals = get_data("calls.txt", "customers.txt")
    print(make_table(totals))

if __name__ == "__main__":
    main()
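Regarding the comment about the helper count column: named aggregation (available in pandas 0.25+) can produce the count directly in the groupby. This is just an alternative sketch, not part of the code above:
# per-customer totals without the explicit "count" helper column
totals = calls.groupby("phone").agg(
    seconds=("seconds", "sum"),
    cost=("cost", "sum"),
    count=("seconds", "size"),   # number of calls per phone
)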
Using the data from your pastebin link as calls.txt, and the following as customers.txt:
7801236789;Stuart Johnson
7804321098;Hamman Samuel
7803214567;Md Toukir Imam
7804922860;Osmar Zaiane
7801234567;Ameneh Gholipour Shahraki
7807890123;Elham Ahmadi
7808765432;Amir Hossein Faghih Dinevari
7808907654;Weifeng Chen
7809876543;Farrukh Ahmed
it produces
+--------------+------------------------------+---+---------+--------+
| Phone number | Name                         | # |Duration |   Due  |
+--------------+------------------------------+---+---------+--------+
|(780) 123 4567|Ameneh Gholipour Shahraki     |384|55h07m53s|$ 876.97|
|(780) 123 6789|Stuart Johnson                |132|17h53m19s|$ 288.81|
|(780) 321 4567|Md Toukir Imam                |363|49h52m12s|$ 827.48|
|(780) 432 1098|Hamman Samuel                 |112|16h05m09s|$ 259.66|
|(780) 492 2860|Osmar Zaiane                  |502|69h27m48s|$1160.52|
|(780) 789 0123|Elham Ahmadi                  |259|35h56m10s|$ 596.94|
|(780) 876 5432|Amir Hossein Faghih Dinevari  |129|17h22m32s|$ 288.56|
|(780) 890 7654|Weifeng Chen                  |245|33h48m46s|$ 539.41|
|(780) 987 6543|Farrukh Ahmed                 |374|52h50m11s|$ 883.72|
+--------------+------------------------------+---+---------+--------+
| Total dues   |                                          $   5722.07|
+--------------+-----------------------------------------------------+

Related

I'm trying to scrape an infinite scrolling website. It gets stuck at around the 200th record

I scrolled with Selenium, grabbed all the URLs, and used those URLs with BeautifulSoup, but there are many duplicates in the scraped data. I tried to remove them with drop_duplicates, but it gets stuck at around the 200th record. I cannot detect the problem. I have added the code I use. I want to grab all prices, areas, rooms, etc.
import requests
from lxml import html
from bs4 import BeautifulSoup as bs
import bs4
import pandas as pd
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from lxml import html
import pandas as pd
import time

driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver_win32\chromedriver.exe')
driver.get('https://tap.az/elanlar/dasinmaz-emlak/menziller')
time.sleep(1)

price = []
citi = []
elann = []
bina = []
arrea = []
adres = []
roome = []
baxhise = []
mulkayet = []
descript = []
urll = []
zefer = []

previous_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == previous_height:
        break
    previous_height = new_height
    lnks = driver.find_elements(By.CSS_SELECTOR, '#content > div > div > div.categories-products.js-categories-products > div.js-endless-container.products.endless-products > div.products-i')
    for itema in lnks:
        urla = itema.find_element(By.TAG_NAME, 'a')
        aae = (urla.get_attribute('href'))
        urel = aae.split('/bookmark')[0]
        result = requests.get(urel)
        soup = bs(result.text, 'html.parser')
        casee = soup.find_all("div",{"class":"lot-body l-center"})
        for ae in casee:
            c = ae.find_all('table', class_ = 'properties')
            pp = c[0].text
            city = pp.split('Şəhər')[-1].split('Elanın')[0].replace('ş' ,'sh').replace('ə' ,'e').replace('ü' ,'u').replace('ö' ,'o').replace('ı' ,'i').replace('ğ' ,'g').replace('ç' ,'ch').replace('Ç', 'ch').replace('Ş', 'sh').replace('Ə' ,'e').replace('Ü' ,'u').replace('Ö' ,'o').replace('İ', 'I')
            cxe = c[0].text
            elan_tipi = cxe.split('Elanın tipi')[-1].split('Binanın tipi')[0].replace(' verilir','')
            elane = elan_tipi.replace(' ', '_').replace('ş' ,'sh').replace('ə' ,'e').replace('ü' ,'u').replace('ö' ,'o').replace('ı' ,'i').replace('ğ' ,'g').replace('ç' ,'ch').replace('Ç', 'ch').replace('Ş', 'sh').replace('Ə' ,'e').replace('Ü' ,'u').replace('Ö' ,'o').replace('İ', 'I')
            cx = c[0].text
            bina_tipi = cx.split('Binanın tipi')[-1].split('Sahə')[0].replace(' ', '_').replace('ş' ,'sh').replace('ə' ,'e').replace('ü' ,'u').replace('ö' ,'o').replace('ı' ,'i').replace('ğ' ,'g').replace('ç' ,'ch').replace('Ç', 'ch').replace('Ş', 'sh').replace('Ə' ,'e').replace('Ü' ,'u').replace('Ö' ,'o').replace('İ', 'I')
            cx = c[0].text
            area = cx.split('tikiliSahə,')[-1].split('Otaq')[0].replace('m²', '')
            cx = c[0].text
            room = cx.split('Otaq sayı')[-1].split('Yerləşmə yeri')[0]
            cx = c[0].text
            addresss = cx.split('Yerləşmə yeri')[-1].replace('ş' ,'sh').replace('ə' ,'e').replace('ü' ,'u').replace('ö' ,'o').replace('ı' ,'i').replace('ğ' ,'g').replace('ç' ,'ch').replace('Ç', 'ch').replace('Ş', 'sh').replace('Ə' ,'e').replace('Ü' ,'u').replace('Ö' ,'o').replace('İ', 'I')
            d = ae.find_all('p')
            elan_kod = (d[0].text.replace('Elanın nömrəsi:', ''))
            d = ae.find_all('p')
            baxhis = d[1].text.replace('Baxışların sayı: ', '')
            d = ae.find_all('p')
            description = (d[3].text.replace('Baxışların sayı: ', '').replace('ş' ,'sh').replace('ə' ,'e').replace('ü' ,'u').replace('ö' ,'o').replace('ı' ,'i').replace('ğ' ,'g').replace('ç' ,'ch').replace('Ç', 'ch').replace('Ş', 'sh').replace('Ə' ,'e').replace('Ü' ,'u').replace('Ö' ,'o').replace('İ', 'I').replace("\n", ''))
            kim = ae.find_all('div', class_ = 'author')
            kime = kim[0].text
            if 'bütün' in kime:
                mulkiyet = int(0)
            else:
                mulkiyet = int(1)
        caseee = soup.find_all("div",{"class":"middle"})
        for aecex in caseee:
            pricxxe = aecex.find_all('span', class_ = 'price-val')
            pricef = pricxxe[0].text.replace(' ' , '')
        price.append(pricef)
        zefer.append(elane)
        elann.append(elan_kod)
        citi.append(city)
        bina.append(bina_tipi)
        arrea.append(area)
        adres.append(addresss)
        roome.append(room)
        baxhise.append(baxhis)
        mulkayet.append(mulkiyet)
        descript.append(description)

ae = pd.DataFrame({'URL': urel,'Unique_id': elann,'Price': price,'Room': roome,'Area': arrea,'Seher': citi,'Elan_tipi': zefer,'Description': descript,'Address': adres,'Category': bina,'Mulkiyyet': mulkayet})
aere = ae.drop_duplicates()
aere.to_csv('dde.csv', index=False, encoding='utf-8' )
A cause of the duplicates is that every time you get lnks, you're also getting the products you had already scraped before scrolling. You can probably skip duplicate scrapes by initializing scrapedUrls = [] somewhere at the beginning of your code (OUTSIDE of all loops), and then checking urel against it, as well as adding to it:
if urel in scrapedUrls: continue ## add this line
result = requests.get(urel) ## from your code
scrapedUrls.append(urel) ## add this line
but I'm not sure it'll solve your issue.
I don't know why it's happening, but when I try to scrape the links with Selenium's find_elements, I get the same URL over and over; so I wrote a function [getUniqLinks] that you can use to get a unique list of links (prodUrls) by scrolling up to a certain number of times and then parsing page_source with BeautifulSoup. Below are two lines from the printed output of prodUrls = getUniqLinks(fullUrl, rootUrl, max_scrolls=250, tmo=1):
WITH SELENIUM found 10957 product links [1 unique]
PARSED PAGE_SOURCE ---> found 12583 product links [12576 unique]
(The full function and printed output are at https://pastebin.com/b3gwUAJZ.)
Some notes:
If you increase tmo, you can increase max_scrolls too, but it starts getting quite slow after 100 scrolls.
I used selenium to get links as well, just to print and show the difference, but you can remove all lines that end with # remove to get rid of those unnecessary parts.
I used Selenium's WebDriverWait instead of time.sleep because it stops waiting as soon as the relevant elements have loaded, and it raises an error if they don't load within the allowed time (tmo), so I found it more convenient and readable to use in a try...except block than driver.implicitly_wait (a rough sketch of that pattern follows these notes).
I don't know if this is related to whatever is causing your program to hang [since mine is probably just because of the number of elements being too many], but mine also hangs if I try to use selenium to get all the links after scrolling instead of adding to prodLinks in chunks inside the loop.
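For reference, the WebDriverWait pattern mentioned in the third note looks roughly like this (a sketch only; the CSS class is taken from the question's selector and tmo is the timeout described above):
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

try:
    # wait up to tmo seconds for the product containers to be present instead of sleeping blindly
    WebDriverWait(driver, tmo).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.products-i'))
    )
except TimeoutException:
    print('products did not load within the allowed time')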
Now, you can loop through prodUrls and get the data you want, but I think it's better to build a list with a separate dictionary for each link [i.e., having a dictionary for each row rather than having a separate list for each column].
If you use these two functions, then you just have to prepare a reference dictionary of selectors like
refDict = {
    'title': 'h1.js-lot-title',
    'price_text': 'div.price-container',
    'price_amt': 'div.price-container > .price span.price-val',
    'price_cur': 'div.price-container > .price span.price-cur',
    '.lot-text tr.property': {'k': 'td.property-name', 'v': 'td.property-value'},
    'contact_name': '.author > div.name',
    'contact_phone': '.author > a.phone',
    'lot_warning': 'div.lot-warning',
    'div.lot-info': {'sel': 'p', 'sep': ':'},
    'description': '.lot-text p'
}
that can be passed to fillDict_fromTag like in the code below:
## FIRST PASTE FUNCTION DEFINITIONS FROM https://pastebin.com/hKXYetmj
productDetails = []
puLen = len(prodUrls)
for pi, pUrl in enumerate(prodUrls[:500]):
    print('', end=f'\rScraping [for {pi+1} of {puLen}] {pUrl}')
    pDets = {'prodId': [w for w in pUrl.split('/') if w][-1]}
    resp = requests.get(pUrl)
    if resp.status_code != 200:
        pDets['Error_Message'] = f'{resp.raise_for_status()}'
        pDets['sourceUrl'] = pUrl
        productDetails.append(pDets)
        continue
    pSoup = BeautifulSoup(resp.content, 'html.parser')
    pDets = fillDict_fromTag(pSoup, refDict, pDets, rootUrl)
    pDets['sourceUrl'] = pUrl
    productDetails.append(pDets)
print()

prodDf = pd.DataFrame(productDetails).set_index('prodId')
prodDf.to_csv('ProductDetails.csv')
I have uploaded both 'prodLinks.csv' and 'ProductDetails.csv' here, although there are only the first 500 scrapes' results since I manually interrupted after around 20 minutes; I'm also pasting the first 3 rows here (printed with print(prodDf.loc[prodDf.index[:3]].to_markdown()))
| prodId | title | price_text | price_amt | price_cur | Şəhər | Elanın tipi | Elanın tipi [link] | Binanın tipi | Binanın tipi [link] | Sahə, m² | Otaq sayı | Yerləşmə yeri | contact_name | contact_phone | lot_warning | Elanın nömrəsi | Baxışların sayı | Yeniləndi | description | sourceUrl |
|---------:|:---------------------------------------------------------|:-------------|:------------|:------------|:--------|:---------------|:----------------------------------------------------------------|:---------------|:----------------------------------------------------------------|-----------:|------------:|:----------------|:---------------|:----------------|:-----------------------------------------------------------------------------|-----------------:|------------------:|:---------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------|
| 35828514 | 2-otaqlı yeni tikili kirayə verilir, 20 Yanvar m., 45 m² | 600 AZN | 600 | AZN | Bakı | Kirayə verilir | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B740%5D=3724 | Yeni tikili | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B747%5D=3849 | 45 | 2 | 20 Yanvar m. | Elşad Bəy | (055) 568-12-13 | Diqqət! Beh göndərməmişdən öncə sövdələşmənin təhlükəsiz olduğuna əmin olun! | 35828514 | 105 | 22 Noyabr 2022 | 20 Yanvar metrosuna və Inşatcılar metrosuna 8 - 11 dəiqqə arası olan ərazidə, yeni tikili binada 1 otaq 2 otaq təmirli şəraitiynən mənzil kirayə 600 manata, ailiyə və iş adamına verilir. Qabaqçadan 2 ay ödəniş olsa kamendant pulu daxil, ayı 600 manat olaçaq, mənzili götūrən şəxs 1 ayın 20 % vasitəciyə ödəniş etməlidir. Xahìş olunur, rial olmuyan şəxs zəng etməsin. | https://tap.az/elanlar/dasinmaz-emlak/menziller/35828514 |
| 35833080 | 1-otaqlı yeni tikili kirayə verilir, Quba r., 60 m² | 40 AZN | 40 | AZN | Quba | Kirayə verilir | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B740%5D=3724 | Yeni tikili | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B747%5D=3849 | 60 | 1 | Quba r. | Orxan | (050) 604-27-60 | Diqqət! Beh göndərməmişdən öncə sövdələşmənin təhlükəsiz olduğuna əmin olun! | 35833080 | 114 | 22 Noyabr 2022 | Quba merkezde her weraiti olan GUNLUK KIRAYE EV.Daimi isti soyuq su hamam metbex wifi.iwciler ve aile ucun elveriwlidir Təmirli | https://tap.az/elanlar/dasinmaz-emlak/menziller/35833080 |
| 35898353 | 4-otaqlı mənzil, Nizami r., 100 m² | 153 000 AZN | 153 000 | AZN | Bakı | Satılır | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B740%5D=3722 | Köhnə tikili | https://tap.az/elanlar/dasinmaz-emlak/menziller?p%5B747%5D=3850 | 100 | 4 | Nizami r. | Araz M | (070) 723-54-50 | Diqqət! Beh göndərməmişdən öncə sövdələşmənin təhlükəsiz olduğuna əmin olun! | 35898353 | 71 | 27 Noyabr 2022 | X.Dostluğu metrosuna 2 deq mesafede Leninqrad lahiyeli 9 mərtəbəli binanın 5-ci mərtəbəsində 4 otaqlı yaxsi temirli mənzil satılır.Əmlak ofisinə ödəniş alıcı tərəfindən məbləğin 1%-ni təşkil edir. | https://tap.az/elanlar/dasinmaz-emlak/menziller/35898353 |

How to disable text wrap in a columnar column?

|---------|------------------|------------------|-----------|------------------|
|serial no|ggggggg name |status |status code|AAAAAAAAAurl |
|==============================================================================|
|1 |ggggggggggg-kkkkkk|Healthy |200 |http://aaaaaaaaaaa|
| |e | | |-service.dev.sdddd|
| | | | |1.cccc.cc/health/l|
| | | | |ive |
|---------|------------------|------------------|-----------|------------------|
|2 |zzzzzzzz-jjjjjj |Healthy |200 |http://ddddddddddd|
| | | | |ader.dev.ffffff.cc|
| | | | |cc.cc/health/live |
|---------|------------------|------------------|-----------|------------------|
I am trying to get the entire URL on one line in the last column. I am using the following Python library to print this and have tried a few things, but I am unable to get it working. I tried https://pypi.org/project/Columnar/, setting max column width and min column width and such as mentioned there, but none of it works.
Edit: headers are simply the names of the columns; you can name them anything you want.
from columnar import columnar

headers = ['serial no', 'service name', 'status', 'status code']
...
tabledata = []
counter = 0
for x in services:
    zzz = requests.get("http://xxx.yyy"+ x)
    counter = counter + 1
    i = counter
    myrowdata = [i, x, zzz.text, zzz.status_code]
    tabledata.append(myrowdata)

table = columnar(tabledata, headers, no_borders=True, max_column_width=None)
print(table)
1.) You missed the column name "url" in headers.
You should do as follows:
headers = ['serial no', 'service name', 'status', 'status code', 'url']
2.) You have to add url to myrowdata:
myrowdata = [i, x, zzz.text, zzz.status_code, "http://xxx.yyy"+ x]
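Putting both fixes together, the loop could look like this (a sketch using the placeholder names from the question, with enumerate instead of the manual counter):
import requests
from columnar import columnar

headers = ['serial no', 'service name', 'status', 'status code', 'url']
tabledata = []
for i, x in enumerate(services, start=1):
    url = "http://xxx.yyy" + x            # placeholder base URL from the question
    zzz = requests.get(url)
    tabledata.append([i, x, zzz.text, zzz.status_code, url])

table = columnar(tabledata, headers, no_borders=True, max_column_width=None)
print(table)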
Update:
If you did all the fixes above, you have to run it in an external system terminal to get the real result, as some internal IDE consoles constrain the width of the display:
In Spyder:
SERIAL NO SERVICE NAME STATUS STATUS CODE URL
1 Anyname Anytext Anystatus_code http://aaaaaaaaaaaaaaaaaaa
aadddddddddddddddddddddddd
dddddddaaaaaaaaa.com
In an external system terminal, the full URL is displayed on a single line without wrapping.

Formating a table from a csv file

I'm trying to make a table from data in a CSV file using only the csv module. Could anyone tell me what I should do to display the '|' at the end of every row (just after the last element in the row)?
Here's what I have so far:
import csv

def display_playlist( filename ):
    if filename.endswith('.csv')==False:      #check if it ends with CSV extension
        filename = filename + ('.csv')        #adding .csv if given without .csv extension
    max_element_length=0
    #aligning columns to the longest elements
    for row in get_datalist_from_csv( filename ):
        for element in row:
            if len(element)>max_element_length:
                max_element_length=len(element)
    # print(max_element_length)
    #return max_element_length
    print('-----------------------------------------------------------------------------')
    for row in get_datalist_from_csv( filename ):
        for element in row:
            print('| ', end='')
            if (len(element)<=4 and element.isdigit==True):
                print(pad_to_length(element,4), end=' |')   #trying to get '|' at the end
            else:
                print(pad_to_length(element, max_element_length), end=' ')
        print('\n')
    print('-----------------------------------------------------------------------------')

## Read data from a csv format file
def get_datalist_from_csv( filename ):
    ## Create a 'file object' f, for accessing the file:
    with open( filename ) as f:
        reader = csv.reader(f)      # create a 'csv reader' from the file object
        datalist = list( reader )   # create a list from the reader
    return datalist                 # we have a list of lists

## For aligning table columns
## It adds spaces to the end of a string to make it up to length n.
def pad_to_length( string, n):
    return string + " "* (n-len(string))   ## s*n gives empty string for n<1
The output I get for now is:
| Track                | Artist               | Album                | Time
| Computer Love        | Kraftwerk            | Computer World       | 7:15
| Paranoid Android     | Radiohead            | OK Computer          | 6:27
| Computer Age         | Neil Young           | Trans                | 5:24
| Digital              | Joy Division         | Still                | 2:50
| Silver Machine       | Hawkwind             | Roadhawks            | 4:39
| Start the Simulator  | A-Ha                 | Foot of the Mountain | 5:11
| Internet Connection  | M.I.A.               | MAYA                 | 2:56
| Deep Blue            | Arcade Fire          | The Suburbs          | 4:29
| I Will Derive!       | MindofMatthew        | You Tube             | 3:17
| Lobachevsky          | Tom Lehrer           | You Tube             | 3:04
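One way to get the closing '|' (a minimal sketch that reuses pad_to_length, get_datalist_from_csv and max_element_length from the code above) is to join the padded cells with '|' and add the outer bars explicitly, instead of printing each cell with its own end argument:
def format_row(row, width):
    # pad every cell to the same width, then wrap the joined cells in '|'
    cells = [pad_to_length(element, width) for element in row]
    return '| ' + ' | '.join(cells) + ' |'

for row in get_datalist_from_csv(filename):
    print(format_row(row, max_element_length))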

Python: store values into a list and run a grep with an if/else statement?

I'm writing a script that loops over the list of IP addresses in the npat variable. For each address the code runs two things: 1) a grep and 2) a lookup using whois. Each of these tasks has two possible outcomes, match or no match, and the results should end up in a list.
Q's:
How do I store the if/else statement results (the results from grep/whois) into a list?
What pattern should I use to match "route:" followed by spaces in the whois output? So far my regex pattern works for matching the address, but I'm having an issue matching the word "route:" plus the spaces (see the sketch after the sample output below).
Some output:
npat list = ['6.120.0.0/18', '6.120.0.0/17', '13.44.61.0/24', '13.44.62.0/24']
Whois possible output:
1.
RADB: % No entries found for the selected source(s).
RADB: route: 6.120.0.0/18
descr: name.com
origin: AS1111
notify: network#email.com
source: RADB
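For the second question, a pattern that tolerates a variable amount of whitespace after "route:" could look like this (a minimal sketch; the sample text is taken from the whois output above):
import re

who = """route:      6.120.0.0/18
descr:      name.com
origin:     AS1111
notify:     network#email.com
source:     RADB"""

# "route:" then one or more whitespace characters, then capture the prefix
routes = re.findall(r'route:\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/\d+)', who)
print(routes)   # ['6.120.0.0/18']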
Here's the code:
import re, base64, os, sys

#SAMPLE STRING
teststr = """router#sh ip bgp
Status codes: s suppressed, d damped, h history, * valid, > best, i - internal,
r RIB-failure, S Stale, m multipath, b backup-path, x best-external
Origin codes: i - IGP, e - EGP, ? - incomplete
Network Next Hop Metric LocPrf Weight Path
*> 6.120.0.0/18 2.2.2.11 0 3111 2000 2485 43754 i
*> 6.120.0.0/17 2.2.2.11 0 3111 2000 2485 43754 i
*> 13.44.61.0/24 2.2.2.11 0 3111 4559 i
*> 13.44.62.0/24 2.2.2.11 0 3111 4559 i"""
##print (teststr,"\n")

#SEARCH NETWORK ENTRY (Working)
npat = re.findall(r'(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})/\d+',teststr)
print ("List: \n",npat)

for ips in npat:
    ipnet = ips.strip()
    print ("Processing ..... ", ipnet)
    fgen = "grep " +ipnet+ " /mnt/hgfs/IRR/fgen.txt"
    f2pat = re.findall(ipnet,fgen)
    print ("\nCommand: ",fgen)
    os.system(fgen)
    print ("\n NEW NPATH: ",f2pat)
    if ipnet in f2pat:
        flist = "Grep Found"
        print ("Result ", flist)
    else:
        flist = "Grep Not found"
        print ("Result: ",flist)
    f = os.popen('whois -h whois.radb.net ' + ipnet)
    who = f.read()
    radbpat = re.findall(ipnet,who)
    print ("\nRADB: ", who)
    radbpat = re.findall(r'(?<=route: )(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})/\d+',who)
    print ("Radb :",radbpat)
    if ipnet in radbpat:
        rlist = "Found in RADB"
        print ("Result ", rlist)
    else:
        rlist = "Not found in RADB"
        print ("Result: ",rlist)

## OUTPUT
titles = ['RS-SET', 'GREP', 'RADB']
data = [titles] + list(zip(npat, flist, rlist))
for i, d in enumerate(data):
    line = '|'.join(str(x).ljust(15) for x in d)
    print(line)
    if i == 0:
        print('-' * len(line))
My target is to create a loop so I can check every IP address in the npat list and then have the output show the results of tasks 1 and 2.
I have created a table, so my target output should look like this:
RS-SET |Grep |RADB
--------------------------------------------
xx.xx.xx.0/yy |not found |Found
My Current output is like this:
RS-SET         |GREP           |RADB
-----------------------------------------------
27.54.41.0/24  |G              |N
223.253.0.0/20 |r              |o
27.54.41.0/24  |e              |t
27.54.42.0/24  |p              |
27.54.43.0/24  |               |f
The grep and RADB output has been added vertically... my flist and rlist hold only one value each.
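One way to handle the first question (a minimal sketch, reusing npat and the grep/whois commands from the code above) is to append each result to a list inside the loop and zip the lists afterwards, so every address contributes one row instead of the strings being split into characters:
grep_results = []
radb_results = []
for ips in npat:
    ipnet = ips.strip()
    # capture the grep output with os.popen (os.system only prints, it does not return the text)
    grep_out = os.popen("grep " + ipnet + " /mnt/hgfs/IRR/fgen.txt").read()
    flist = "Grep Found" if ipnet in grep_out else "Grep Not found"
    # whois lookup, then collect the prefixes that appear after "route:"
    who = os.popen('whois -h whois.radb.net ' + ipnet).read()
    routes = re.findall(r'route:\s+(\S+)', who)
    rlist = "Found in RADB" if ipnet in routes else "Not found in RADB"
    grep_results.append(flist)
    radb_results.append(rlist)

# one row per address, matching the target table layout
titles = ['RS-SET', 'GREP', 'RADB']
data = [titles] + list(zip(npat, grep_results, radb_results))
for i, d in enumerate(data):
    line = '|'.join(str(x).ljust(15) for x in d)
    print(line)
    if i == 0:
        print('-' * len(line))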

Getting single letters instead of sentences after applying NLTK's sentence tokenizer in Python 3.5.1

import codecs, os
import re
import string
import mysql
import mysql.connector

y_ = ""

'''Searching and reading text files from a folder.'''
for root, dirs, files in os.walk("/Users/ultaman/Documents/PAN dataset/Pan Plagiarism dataset 2010/pan-plagiarism-corpus-2010/source-documents/test1"):
    for file in files:
        if file.endswith(".txt"):
            x_ = codecs.open(os.path.join(root,file),"r", "utf-8-sig")
            for lines in x_.readlines():
                y_ = y_ + lines

'''Tokenizing the sentences of the text file.'''
from nltk.tokenize import sent_tokenize
raw_docs = sent_tokenize(y_)
tokenized_docs = [sent_tokenize(y_) for sent in raw_docs]

'''Removing punctuation marks.'''
regex = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = ''
for review in tokenized_docs:
    new_review = ''
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review+= new_token
    tokenized_docs_no_punctuation += (new_review)
print(tokenized_docs_no_punctuation)

'''Connecting and inserting tokenized documents without punctuation in database field.'''
def connect():
    for i in range(len(tokenized_docs_no_punctuation)):
        conn = mysql.connector.connect(user = 'root', password = '', unix_socket = "/tmp/mysql.sock", database = 'test' )
        cursor = conn.cursor()
        cursor.execute("""INSERT INTO splitted_sentences(sentence_id, splitted_sentences) VALUES(%s, %s)""",(cursor.lastrowid,(tokenized_docs_no_punctuation[i])))
        conn.commit()
        conn.close()

if __name__ == '__main__':
    connect()
After running the above code, the result in the database looks like:
2 | S | N |
| 3 | S | o |
| 4 | S | |
| 5 | S | d |
| 6 | S | o |
| 7 | S | u |
| 8 | S | b |
| 9 | S | t |
| 10 | S | |
| 11 | S | m |
| 12 | S | y |
| 13 | S |
| 14 | S | d
in the database.
It should be like:
1 | S | No doubt, my dear friend.
2 | S | no doubt.
I suggest making the following edits (use what you would like), but this is what I used to get your code running. Your issue is that review in for review in tokenized_docs: is already a string, so token in for token in review: iterates over characters. Therefore, to fix this I tried -
tokenized_docs = ['"No doubt, my dear friend, no doubt; but in the meanwhile suppose we talk of this annuity.', 'Shall we say one thousand francs a year."', '"What!"', 'asked Bonelle, looking at him very fixedly.', '"My dear friend, I mistook; I meant two thousand francs per annum," hurriedly rejoined Ramin.', 'Monsieur Bonelle closed his eyes, and appeared to fall into a gentle slumber.', 'The mercer coughed;\nthe sick man never moved.', '"Monsieur Bonelle."']
'''Removing punctuation marks.'''
regex = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_token = regex.sub(u'', review)
    if not new_token == u'':
        tokenized_docs_no_punctuation.append(new_token)
print(tokenized_docs_no_punctuation)
and got this -
['No doubt my dear friend no doubt but in the meanwhile suppose we talk of this annuity', 'Shall we say one thousand francs a year', 'What', 'asked Bonelle looking at him very fixedly', 'My dear friend I mistook I meant two thousand francs per annum hurriedly rejoined Ramin', 'Monsieur Bonelle closed his eyes and appeared to fall into a gentle slumber', 'The mercer coughed\nthe sick man never moved', 'Monsieur Bonelle']
The final format of the output is up to you. I prefer using lists. But you could concatenate this into a string as well.
nw = []
for review in tokenized_docs[0]:
    new_review = ''
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review += new_token
    nw.append(new_review)

'''Inserting into database'''
def connect():
    for j in nw:
        conn = mysql.connector.connect(user = 'root', password = '', unix_socket = "/tmp/mysql.sock", database = 'Thesis' )
        cursor = conn.cursor()
        cursor.execute("""INSERT INTO splitted_sentences(sentence_id, splitted_sentences) VALUES(%s, %s)""",(cursor.lastrowid,j))
        conn.commit()
        conn.close()

if __name__ == '__main__':
    connect()
