Chi-Squared Data/Byte/Text Test
Updated: Jan 19, 2021
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;

/// <summary>
/// Chi-squared based heuristics for classifying data: a randomness test for byte
/// buffers (used to guess whether a file is compressed) and a letter-frequency
/// test for text (used to guess whether a file contains English text).
/// </summary>
public static class ChiSquared
{
    /// <summary>
    /// Expected relative frequency of the letters 'a'..'z' (index 0 is 'a').
    /// Calculated from an English word dictionary containing over 466,000 words.
    /// </summary>
    private static readonly float[] _expectedPercentages =
    {
        .0846f, .0189f, .0420f, .0353f, .1098f, .0125f, .0243f, .0274f, .0864f,
        .0018f, .0089f, .0574f, .0292f, .0715f, .0709f, .0310f, .0019f, .0704f,
        .0705f, .0647f, .0363f, .0099f, .0085f, .0028f, .0192f, .0041f
    };

    /// <summary>Buffers whose byte entropy is below this percentage are never reported as random.</summary>
    private const float MinimumEntropyPercent = 80f;

    /// <summary>Average number of 32-bit words per chi-squared bin.</summary>
    private const int WordsPerBin = 10;

    /// <summary>Text statistic below which <see cref="IsTextFile"/> reports a text file.</summary>
    private const float TextThreshold = 10f;

    /// <summary>
    /// Guesses whether a file is compressed by testing its bytes for randomness.
    /// Not accurate 100% all of the time. The whole file is read into memory.
    /// </summary>
    /// <param name="path">Path of the file to test.</param>
    /// <returns>True when the file content passes <see cref="ChiSquaredTest"/> as random.</returns>
    public static bool IsFileCompressed(this string path)
    {
        return File.ReadAllBytes(path).ChiSquaredTest().isRandom;
    }

    /// <summary>
    /// Tests a buffer for randomness. The buffer is read as 32-bit words which are
    /// distributed over (wordCount / 10) bins; R below is that bin count minus one.
    /// The reported limits and chi-squared value are shifted so that the low limit is zero,
    /// and are truncated to whole numbers.
    /// isRandom - true when the observed chi squared value lies strictly between
    /// R - 2*sqrt(R) and R + 2*sqrt(R).
    /// Quality - observed chi squared divided by R. Less than 1 or greater than 1 is off target.
    /// Entropy - 8 bit entropy of the buffer as a percentage of perfect disorder (100%).
    /// ExpectedChiSq - R minus the low limit, i.e. 2*sqrt(R).
    /// LowLimit - always 0 (the origin of the shifted scale).
    /// chiSqValue - the observed chi squared value minus the low limit.
    /// UpperLimit - the upper limit minus the low limit, i.e. 4*sqrt(R).
    /// </summary>
    /// <param name="bArr">The byte array to test.</param>
    /// <returns>
    /// The values described above; all defaults when <paramref name="bArr"/> is null;
    /// (false, 0, entropy, 0, 0, 0, 0) when the entropy is below 80%.
    /// </returns>
    public static (bool isRandom, float Quality, float Entropy, int ExpectedChiSq, float LowLimit, float chiSqValue, float UpperLimit) ChiSquaredTest(this byte[] bArr)
    {
        if (bArr == null)
        {
            return default;
        }

        var ent = Entropy(bArr);
        if (ent < MinimumEntropyPercent)
        {
            return (false, 0, ent, 0, 0, 0, 0);
        }

        // Reaching 80% entropy needs at least 85 bytes (22 words), so rLim is >= 2 here
        // and neither the modulus nor the division by r below can divide by zero.
        var iArr = Ia(bArr);
        var wordCount = iArr.Length;
        var rLim = wordCount / WordsPerBin;
        var r = rLim - 1;

        // Sequential counting: the previous parallel ContainsKey/TryAdd/+= sequence was not
        // atomic and lost increments. The absolute value is taken as long because
        // Math.Abs(int.MinValue) throws OverflowException.
        var counts = new int[rLim];
        foreach (var word in iArr)
        {
            var bin = (int)(Math.Abs((long)word) % rLim);
            counts[bin]++;
        }

        long sumOfSquares = 0;
        foreach (var count in counts)
        {
            sumOfSquares += (long)count * count;
        }

        float t = sumOfSquares;
        var cS = Math.Abs(r * t / wordCount - wordCount);
        var spread = 2.0f * (float)Math.Sqrt(r);
        var fL = r - spread;
        var fH = r + spread;
        var iR = fL < cS && cS < fH;
        var q = cS / r;

        // Report everything relative to the low limit so that the low limit reads as 0.
        return (iR, q, ent, (int)(r - fL), 0, (int)(cS - fL), (int)(fH - fL));
    }

    /// <summary>
    /// Reinterprets a byte array as 32-bit integers in machine byte order.
    /// A trailing partial word is padded with zero bytes.
    /// </summary>
    /// <param name="ba">The bytes to copy.</param>
    /// <returns>A new int array holding the same bytes.</returns>
    private static int[] Ia(byte[] ba)
    {
        var words = new int[(ba.Length + 3) / 4];
        Buffer.BlockCopy(ba, 0, words, 0, ba.Length);
        return words;
    }

    /// <summary>
    /// Computes the Shannon entropy of the byte values in a buffer.
    /// </summary>
    /// <param name="s">The buffer to measure.</param>
    /// <returns>Entropy as a percentage of the 8 bit maximum; 0 for an empty buffer.</returns>
    private static float Entropy(byte[] s)
    {
        // The loop bound stays an int: a float cannot hold lengths above 2^24 exactly.
        var length = s.Length;
        var histogram = new int[256];
        for (var i = 0; i < length; i++)
        {
            histogram[s[i]]++;
        }

        float total = length;
        var bits = 0f;
        foreach (var count in histogram)
        {
            if (count == 0)
            {
                continue;
            }

            var frequency = count / total;
            bits -= frequency * (float)Math.Log(frequency, 2);
        }

        return bits / 8f * 100f;
    }

    /// <summary>
    /// Counts how many times a byte value occurs in a buffer.
    /// </summary>
    /// <param name="s">The buffer to search.</param>
    /// <param name="b">The byte value to count.</param>
    /// <returns>The number of occurrences of <paramref name="b"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public static int ChiSquaredCount(this byte[] s, byte b)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }

        var occurrences = 0;
        foreach (var value in s)
        {
            if (value == b)
            {
                occurrences++;
            }
        }

        return occurrences;
    }

    /// <summary>
    /// Counts how many times a character occurs in a string. Works for any character,
    /// including those above U+00FF.
    /// </summary>
    /// <param name="s">The string to search.</param>
    /// <param name="b">The character to count.</param>
    /// <returns>The number of occurrences of <paramref name="b"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public static int ChiSquaredCount(this string s, char b)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }

        var occurrences = 0;
        foreach (var value in s)
        {
            if (value == b)
            {
                occurrences++;
            }
        }

        return occurrences;
    }

    /// <summary>
    /// Computes the share of a string taken up by one character.
    /// </summary>
    /// <param name="s">The string to search.</param>
    /// <param name="b">The character to count.</param>
    /// <returns>
    /// Occurrences of <paramref name="b"/> divided by the string length, as a fraction
    /// between 0 and 1; 0 for an empty string.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public static float ChiSquaredAsPercent(this string s, char b)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }

        if (s.Length == 0)
        {
            return 0f;
        }

        return s.ChiSquaredCount(b) / (float)s.Length;
    }

    /// <summary>
    /// Compute the letter frequencies within the English language.
    /// Use a large English language text block for accurate testing.
    /// </summary>
    /// <param name="s">String that contains the large English text</param>
    /// <returns>
    /// One entry per letter 'a'..'z' holding its share of all letters in the text
    /// (case-insensitive), ordered by ascending share; letters with equal shares stay in
    /// alphabetical order. Every share is 0 when the text contains no letters.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public static KeyValuePair<char, float>[] ChiSquaredTextAsPercent(this string s)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }

        var counts = CountLetters(s, out var totalLetters);
        var shares = new KeyValuePair<char, float>[counts.Length];
        for (var i = 0; i < counts.Length; i++)
        {
            var share = totalLetters == 0 ? 0f : counts[i] / (float)totalLetters;
            shares[i] = new KeyValuePair<char, float>((char)('a' + i), share);
        }

        // OrderBy is a stable sort, so ties keep their alphabetical order.
        return shares.OrderBy(e => e.Value).ToArray();
    }

    /// <summary>
    /// Measures how far the letter distribution of a text is from the expected English
    /// distribution. Lower values mean a closer match.
    /// </summary>
    /// <param name="s">The text to test.</param>
    /// <returns>
    /// The summed chi-squared terms of the letters 'a'..'z' that occur in the text,
    /// divided by the number of letters and multiplied by 100;
    /// <see cref="float.NaN"/> when the text contains no letters.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public static float ChiSquaredTextTest(this string s)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }

        var observed = CountLetters(s, out var realLen);
        if (realLen == 0)
        {
            // 0 / 0: there is no distribution to compare. NaN compares false against any threshold.
            return float.NaN;
        }

        var total = 0.0;
        for (var i = 0; i < observed.Length; i++)
        {
            // Letters that never occur contribute nothing; the threshold used by IsTextFile
            // was calibrated against this formula, so it is kept as is.
            if (observed[i] == 0)
            {
                continue;
            }

            var expected = _expectedPercentages[i] * realLen;
            var difference = observed[i] - expected;
            total += difference * difference / expected;
        }

        return (float)total / realLen * 100f;
    }

    /// <summary>
    /// The value of 10 as a combined chi-squared total distance
    /// percentage threshold is subjective. Determined from
    /// about 40 test runs of over 1 million mixed files. Most
    /// non-text files have readings in the 100's
    /// </summary>
    /// <param name="path">Path to the file to test</param>
    /// <returns>True when the file's text statistic is below the threshold.</returns>
    public static bool IsTextFile(this string path)
    {
        return File.ReadAllText(path).ChiSquaredTextTest() < TextThreshold;
    }

    /// <summary>
    /// Tallies the letters of a text case-insensitively.
    /// </summary>
    /// <param name="s">The text to scan.</param>
    /// <param name="totalLetters">Receives the number of letters of any alphabet found in the text.</param>
    /// <returns>Counts of the letters 'a'..'z' (index 0 is 'a').</returns>
    private static int[] CountLetters(string s, out int totalLetters)
    {
        var counts = new int[26];
        totalLetters = 0;

        // Invariant lower-casing: under Turkish cultures 'I' would become a dotless i and be lost.
        foreach (var c in s.ToLowerInvariant())
        {
            if (!char.IsLetter(c))
            {
                continue;
            }

            totalLetters++;
            if (c >= 'a' && c <= 'z')
            {
                counts[c - 'a']++;
            }
        }

        return counts;
    }
}