# shannon.py
#
# Report the Shannon entropy of the characters in a text file.

from __future__ import print_function

import os
import sys
import argparse
import numpy as np
from collections import Counter
from itertools import chain


def read_file(filename):
    """Read a UTF-8 text file and return its lines.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one string per line, exactly as produced by
        ``readlines()`` (line terminators are kept).

    Exits the process with status 1, after reporting the problem on
    stderr, when *filename* is not an existing regular file.
    """
    if not os.path.isfile(filename):
        # Diagnostics go to stderr so they never mix with regular output
        # written to stdout.
        print(f"{filename!r} does not exist", file=sys.stderr)
        sys.exit(1)

    # 'utf-8-sig' drops a leading byte-order mark, if present, so the BOM
    # does not show up in the returned text.
    with open(filename, 'r', encoding='utf-8-sig') as f:
        return f.readlines()


def calc_entropy(text):
    """Print character statistics for *text* and return its Shannon entropy.

    Args:
        text: Iterable of strings (e.g. the lines of a file). Every
            character of every string counts as one symbol, including
            any line-break characters the strings contain.

    Returns:
        The Shannon entropy in bits per character, as a float
        (0.0 for empty input).
    """
    # Count straight from the character stream; there is no need to hold a
    # list of every single character in memory just to count them.
    c = Counter(chain.from_iterable(text))

    # total number of characters
    total = sum(c.values())
    print(f"No. of characters = {total}")

    counts = np.fromiter(c.values(), dtype=float, count=len(c))
    # With no characters at all there is nothing to normalise.
    prob = counts / total if total else counts

    # Adding 0.0 turns the negative zero obtained for empty or
    # single-symbol input into a plain 0.0, so it is not shown as "-0.0000".
    entropy = float(-np.sum(prob * np.log2(prob))) + 0.0
    print(f"Shannon entropy = {entropy:.4f}")

    print("Statistics of characters")
    for key, n in c.most_common():
        p = n / total * 100
        print(f" {key!r:6}  {n:7d}  {p:8.2f}%")

    return entropy


def main():
    """Parse the command line and report the entropy of the given file."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        'filename',
        help="filename of a text file. The line break must be LF.",
    )
    options = cli.parse_args()
    # Echo the parsed arguments before doing any work.
    print(options)

    # Read the file, then print its character statistics.
    calc_entropy(read_file(options.filename))


# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()