"""Report the character-level Shannon entropy of a UTF-8 text file.

Usage: pass the path of a text file as the only positional argument.  The
script prints the number of characters, the Shannon entropy (in bits per
character) and a frequency table of every distinct character.
"""
from __future__ import print_function

import argparse
import os
import sys
from collections import Counter
from itertools import chain

import numpy as np


def read_file(filename):
    """Return the lines of ``filename`` as a list of strings.

    The file is decoded as ``utf-8-sig``, so a leading UTF-8 byte-order mark
    is dropped rather than being counted as a character.  The file is opened
    in text mode with the default newline handling.

    If ``filename`` is not an existing regular file, a message is written to
    stderr and the process exits with status 1 (``SystemExit``).
    """
    if not os.path.isfile(filename):
        # Diagnostics belong on stderr so they never mix with the report.
        print(f"{filename!r} does not exist", file=sys.stderr)
        sys.exit(1)

    # NOTE: the 'encoding' option works only in python3
    with open(filename, 'r', encoding='utf-8-sig') as f:
        return f.readlines()


def calc_entropy(text):
    """Print character statistics for ``text`` and return its entropy.

    Parameters
    ----------
    text : iterable of str
        Typically the list of lines returned by ``read_file``.  Every
        character of every string is counted, line breaks included.

    Returns
    -------
    float
        Shannon entropy in bits per character, ``-sum(p * log2(p))`` over
        the relative frequency ``p`` of each distinct character.  The result
        is ``0.0`` for empty input and for input with one distinct character.
    """
    # Flatten the strings into one list of characters and count them.
    chars = list(chain.from_iterable(text))
    counter = Counter(chars)

    total = len(chars)
    print(f"No. of characters = {total}")
    # Cheap internal sanity check: the counts must add up to the total.
    assert total == sum(counter.values())

    counts = np.array(list(counter.values()), dtype=float)
    # Skip the division for empty input (counts is empty, nothing to scale).
    prob = counts / total if total else counts

    # Each term -p*log2(p) is >= 0, but with a single distinct character the
    # sum evaluates to IEEE negative zero, which would be printed as
    # "-0.0000".  Adding +0.0 normalizes -0.0 to 0.0 and changes nothing else.
    entropy = float(np.sum(-prob * np.log2(prob))) + 0.0
    print(f"Shannon entropy = {entropy:.4f}")

    print("Statistics of characters")
    for key, n in counter.most_common():
        p = n / total * 100
        print(f" {key!r:6} {n:7d} {p:8.2f}%")

    return entropy


def main():
    """Parse the command line and print the entropy report for the file."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'filename',
        help="filename of a text file. The line break must be LF.",
    )
    args = parser.parse_args()
    print(args)

    txt = read_file(args.filename)
    calc_entropy(txt)


if __name__ == '__main__':
    main()