// ChiSquared.cs
//
// Chi-Squared Data/Byte/Text Test
//
// Updated: Jan 19, 2021

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
/// <summary>
///     Heuristic chi-squared helpers for classifying data: a randomness test for byte
///     buffers (used to guess whether a file is compressed) and an English
///     letter-frequency test for text (used to guess whether a file is text).
/// </summary>
public static class ChiSquared
{
    /// <summary>
    ///     Entropy gate used by <see cref="ChiSquaredTest" />: buffers below this percentage
    ///     of the 8-bit maximum are reported as non-random without running the chi-squared step.
    /// </summary>
    private const float MinimumEntropyPercent = 80f;

    /// <summary>
    ///     Score below which <see cref="IsTextFile" /> treats a file as text.
    /// </summary>
    private const float TextScoreThreshold = 10f;

    /// <summary>
    ///     Expected relative frequency of each letter 'a'..'z' (index 0 is 'a').
    ///     Calculated from an English word dictionary containing over 466,000 words.
    /// </summary>
    private static readonly float[] _expectedPercentages = {.0846f, .0189f, .0420f, .0353f, .1098f, .0125f, .0243f, .0274f, .0864f, .0018f, .0089f, .0574f, .0292f, .0715f, .0709f, .0310f, .0019f, .0704f, .0705f, .0647f, .0363f, .0099f, .0085f, .0028f, .0192f, .0041f};

    /// <summary>
    ///     Reads the whole file into memory and reports whether its bytes pass
    ///     <see cref="ChiSquaredTest" />. This is a heuristic: not accurate 100% of the time.
    /// </summary>
    /// <param name="path">Path to the file to test</param>
    /// <returns>True when the file content looks like a random byte sequence.</returns>
    public static bool IsFileCompressed(this string path)
    {
        return File.ReadAllBytes(path).ChiSquaredTest().isRandom;
    }

    /// <summary>
    ///     Tests a buffer for randomness. The buffer is viewed as 32-bit words that are
    ///     distributed over (wordCount / 10) buckets; R below is (bucket count - 1).
    ///     isRandom - true when the observed chi squared value lies strictly between
    ///     (R - (2*sqrt(R))) and (R + (2*sqrt(R))).
    ///     Quality - observed chi squared value divided by R; values away from 1 are off target.
    ///     Entropy - 8 bit entropy level of the buffer as a percentage of perfect disorder (100%).
    ///     The remaining four values are reported relative to the lower limit and truncated
    ///     to whole numbers:
    ///     ExpectedChiSq - R minus the lower limit, i.e. 2*sqrt(R).
    ///     LowLimit - always 0.
    ///     chiSqValue - observed chi squared value minus the lower limit.
    ///     UpperLimit - upper limit minus the lower limit, i.e. 4*sqrt(R).
    ///     A null buffer yields the default tuple. A buffer whose entropy is below 80%
    ///     yields isRandom = false with only Entropy filled in.
    /// </summary>
    /// <param name="bArr">The byte Array</param>
    public static (bool isRandom, float Quality, float Entropy, int ExpectedChiSq, float LowLimit, float chiSqValue, float UpperLimit) ChiSquaredTest(this byte[] bArr)
    {
        if (bArr == null)
            return default;

        var entropy = Entropy(bArr);
        if (entropy < MinimumEntropyPercent)
            return (false, 0, entropy, 0, 0, 0, 0);

        var words       = ToInt32Array(bArr);
        var n           = words.Length;
        var bucketCount = n / 10;
        // Defensive only: a buffer that clears the entropy gate holds at least 85 bytes,
        // which always gives two or more buckets.
        if (bucketCount < 2)
            return (false, 0, entropy, 0, 0, 0, 0);
        var r = bucketCount - 1;

        // Bucket indices run 1..bucketCount. Counting sequentially into an array keeps the
        // tally exact and deterministic (the read-modify-write on a shared dictionary from
        // parallel workers could lose increments). Widening to long before Math.Abs avoids
        // the OverflowException Math.Abs(int) throws for int.MinValue.
        var counts = new int[bucketCount + 1];
        foreach (var word in words)
            counts[bucketCount - (int) (Math.Abs((long) word) % bucketCount)]++;

        // Accumulate in double so large buffers do not lose precision.
        var sumOfSquares = 0d;
        foreach (var count in counts)
            sumOfSquares += (double) count * count;

        var chiSq    = (float) Math.Abs(r * sumOfSquares / n - n);
        var spread   = 2.0f * (float) Math.Sqrt(r);
        var low      = r - spread;
        var high     = r + spread;
        var isRandom = low < chiSq && chiSq < high;
        var quality  = chiSq / r;

        return (isRandom, quality, entropy, (int) (r - low), 0, (int) (chiSq - low), (int) (high - low));
    }

    /// <summary>
    ///     Copies the bytes into an array of 32-bit integers (platform byte order, as done by
    ///     Buffer.BlockCopy). A trailing partial word is padded with zero bytes.
    /// </summary>
    /// <param name="ba">The byte array to reinterpret</param>
    private static int[] ToInt32Array(byte[] ba)
    {
        var byteCount = ba.Length;
        var words     = new int[(byteCount + 3) / 4];
        Buffer.BlockCopy(ba, 0, words, 0, byteCount);
        return words;
    }

    /// <summary>
    ///     Shannon entropy of the byte values, as a percentage of the 8 bit maximum
    ///     (0 for an empty buffer, 100 when all 256 values are equally frequent).
    /// </summary>
    /// <param name="s">The bytes to measure</param>
    private static float Entropy(byte[] s)
    {
        if (s.Length == 0)
            return 0f;

        var histogram = new int[256];
        foreach (var value in s)
            histogram[value]++;

        // The length stays an integer: a float length is inexact above 2^24 and would make
        // the loop bound and the frequencies wrong for large buffers.
        double total = s.Length;
        var    bits  = 0d;
        foreach (var count in histogram)
        {
            if (count == 0)
                continue;
            var frequency = count / total;
            bits -= frequency * Math.Log(frequency, 2);
        }
        return (float) (bits / 8d * 100d);
    }

    /// <summary>
    ///     Counts how many times a byte value occurs in the buffer.
    /// </summary>
    /// <param name="s">The buffer to scan</param>
    /// <param name="b">The byte value to count</param>
    /// <returns>Number of occurrences of <paramref name="b" />.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s" /> is null.</exception>
    public static int ChiSquaredCount(this byte[] s, byte b)
    {
        if (s == null)
            throw new ArgumentNullException(nameof(s));

        var count = 0;
        foreach (var value in s)
            if (value == b)
                count++;
        return count;
    }

    /// <summary>
    ///     Counts how many times a character occurs in the string. Works for any character,
    ///     including those above U+00FF.
    /// </summary>
    /// <param name="s">The string to scan</param>
    /// <param name="b">The character to count</param>
    /// <returns>Number of occurrences of <paramref name="b" />.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s" /> is null.</exception>
    public static int ChiSquaredCount(this string s, char b)
    {
        if (s == null)
            throw new ArgumentNullException(nameof(s));

        var count = 0;
        foreach (var c in s)
            if (c == b)
                count++;
        return count;
    }

    /// <summary>
    ///     Fraction (0..1) of the characters in the string that equal the given character.
    ///     Despite the name the result is not multiplied by 100.
    /// </summary>
    /// <param name="s">The string to scan</param>
    /// <param name="b">The character to count</param>
    /// <returns>Occurrences divided by string length; 0 for an empty string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s" /> is null.</exception>
    public static float ChiSquaredAsPercent(this string s, char b)
    {
        if (s == null)
            throw new ArgumentNullException(nameof(s));
        if (s.Length == 0)
            return 0f;

        return s.ChiSquaredCount(b) / (float) s.Length;
    }

    /// <summary>
    ///     Compute the letter frequencies within the English language.
    ///     Use a large English language text block for accurate testing.
    ///     Each of 'a'..'z' is reported as a fraction of all letters found in the text
    ///     (case-insensitive); when the text has no letters every fraction is 0.
    /// </summary>
    /// <param name="s">String that contains the large English text</param>
    /// <returns>26 letter/fraction pairs ordered by ascending fraction.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s" /> is null.</exception>
    public static KeyValuePair<char, float>[] ChiSquaredTextAsPercent(this string s)
    {
        if (s == null)
            throw new ArgumentNullException(nameof(s));

        var counts       = new int[26];
        var totalLetters = 0;
        // Invariant lower-casing keeps the counts independent of the current culture.
        // NOTE(review): char.IsLetter stands in for the project's IsLetter() extension,
        // which is assumed to wrap it - confirm.
        foreach (var c in s.ToLowerInvariant())
        {
            if (!char.IsLetter(c))
                continue;
            totalLetters++;
            if (c >= 'a' && c <= 'z')
                counts[c - 'a']++;
        }

        var frequencies = new KeyValuePair<char, float>[26];
        for (var i = 0; i < frequencies.Length; i++)
        {
            var fraction = totalLetters == 0 ? 0f : counts[i] / (float) totalLetters;
            frequencies[i] = new KeyValuePair<char, float>((char) ('a' + i), fraction);
        }
        // OrderBy is stable, so equal fractions stay in alphabetical order.
        return frequencies.OrderBy(e => e.Value).ToArray();
    }

    /// <summary>
    ///     Scores how far the letter distribution of the text is from the expected English
    ///     distribution: the sum of (observed - expected)^2 / expected over the letters
    ///     'a'..'z' that occur at least once, divided by the number of letters and
    ///     multiplied by 100. Lower means closer to English text.
    /// </summary>
    /// <param name="s">The text to score</param>
    /// <returns>The score, or NaN when the text contains no letters.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s" /> is null.</exception>
    public static float ChiSquaredTextTest(this string s)
    {
        if (s == null)
            throw new ArgumentNullException(nameof(s));

        var observed = new int[26];
        var realLen  = 0;
        // NOTE(review): char.IsLetter stands in for the project's IsLetter() extension,
        // which is assumed to wrap it - confirm.
        foreach (var c in s.ToLowerInvariant())
        {
            if (!char.IsLetter(c))
                continue;
            realLen++;
            if (c >= 'a' && c <= 'z')
                observed[c - 'a']++;
        }

        // No letters at all: the score is undefined. NaN compares false against any
        // threshold, so IsTextFile reports such input as "not text".
        if (realLen == 0)
            return float.NaN;

        var total = 0d;
        for (var i = 0; i < observed.Length; i++)
        {
            // Letters that never occur are skipped on purpose: the threshold used by
            // IsTextFile was calibrated against this behaviour.
            if (observed[i] == 0)
                continue;
            var expected = _expectedPercentages[i] * realLen;
            var distance = observed[i] - expected;
            total += distance * distance / expected;
        }
        return (float) total / realLen * 100f;
    }

    /// <summary>
    ///     The value of 10 as a combined chi-squared total distance
    ///     percentage threshold is subjective. Determined from
    ///     about 40 test runs of over 1 million mixed files. Most
    ///     non-text files have readings in the 100's
    /// </summary>
    /// <param name="path">Path to the file to test</param>
    /// <returns>True when the file's text score is below the threshold.</returns>
    public static bool IsTextFile(this string path)
    {
        return File.ReadAllText(path).ChiSquaredTextTest() < TextScoreThreshold;
    }
}

// Leave a Reply
//
// Your email address will not be published. Required fields are marked *