from __future__ import print_function
import os
import sys
import argparse
import numpy as np
from collections import Counter
from itertools import chain
def read_file(filename):
    """Read a text file and return its lines.

    The file is decoded with the ``utf-8-sig`` codec, so a leading byte
    order mark (if any) is dropped and never counted as a character.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of the file's lines, each keeping its trailing line break
        (the result of ``readlines()``).

    Raises:
        SystemExit: With exit status 1 when ``filename`` is not an
            existing regular file.
    """
    # Guard clause: bail out early with a diagnostic on stderr so the
    # message does not get mixed into the regular report on stdout.
    if not os.path.isfile(filename):
        print(f"{filename!r} does not exist", file=sys.stderr)
        sys.exit(1)

    # NOTE: the 'encoding' option of open() works only in python3.
    with open(filename, 'r', encoding='utf-8-sig') as f:
        return f.readlines()
def calc_entropy(text):
    """Print character statistics of *text* and return its Shannon entropy.

    Every character of every string in ``text`` is counted (line breaks
    included, when present in the strings). The function prints the total
    number of characters, the Shannon entropy in bits per character, and a
    frequency table ordered from the most to the least common character.

    Args:
        text: An iterable of strings, e.g. the list of lines returned by
            ``readlines()``.

    Returns:
        The Shannon entropy as a float; ``0.0`` for empty input.
    """
    # Count straight from the flattened character stream instead of
    # building an intermediate list holding every character.
    c = Counter(chain.from_iterable(text))
    # total number of characters
    total = sum(c.values())
    print(f"No. of characters = {total}")

    prob = np.array([n / total for n in c.values()], dtype=float)
    # H = sum(p * log2(1/p)). Written this way (rather than -p * log2(p))
    # so that a text with a single distinct symbol yields +0.0 and is not
    # reported as "-0.0000".
    entropy = float(np.sum(prob * np.log2(1.0 / prob)))
    print(f"Shannon entropy = {entropy:.4f}")

    print("Statistics of characters")
    for key, n in c.most_common():
        p = n / total * 100
        print(f" {key!r:6} {n:7d} {p:8.2f}%")
    return entropy
def main():
    """Command-line entry point.

    Parses the single positional ``filename`` argument, prints the parsed
    argument namespace, reads the file and prints its entropy report.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        'filename',
        help="filename of a text file. The line break must be LF.",
    )
    options = cli.parse_args()
    # Echo the parsed namespace before the report.
    print(options)

    calc_entropy(read_file(options.filename))
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()