{"id":68,"date":"2020-06-11T04:52:50","date_gmt":"2020-06-11T04:52:50","guid":{"rendered":"https:\/\/michaeljohnsteiner.com\/?p=68"},"modified":"2022-03-05T13:00:31","modified_gmt":"2022-03-05T13:00:31","slug":"textbinaryfileid-cs","status":"publish","type":"post","link":"https:\/\/michaeljohnsteiner.com\/index.php\/2020\/06\/11\/textbinaryfileid-cs\/","title":{"rendered":"TextBinaryFileId.cs"},"content":{"rendered":"\n<p>Determine Byte Array Binary or Text<\/p>\n\n\n\n<p>Updated: March-5, 2022<\/p>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"csharp\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">using System;\nusing System.Collections.Generic;\nusing System.IO;\nusing System.Linq;\nusing System.Text;\npublic class TextBinaryFileId\n{\n    private readonly BoyerMooreByte             _boyerMoore4Null = new(new byte[] { 0, 0, 0, 0 });\n    public           Dictionary&lt;string, string> BinaryFiles      = new();\n    public           Dictionary&lt;string, string> EncodingFiles    = new();\n    public           Dictionary&lt;string, double> TextFiles        = new();\n    public bool IsBlockText(byte[] BinData, double ConfidenceThreshold = 25)\n    {\n        if (BinData.Length == 0)\n            return false;\n        if (_boyerMoore4Null.Search(BinData) != -1)\n            return false;\n        var enc     = GetEncoding(BinData);\n        var CcCount = BinData.AsParallel().Count(b => !IsValidTextByte(b, !(enc == Encoding.Unicode || enc == Encoding.UTF8 || enc == Encoding.UTF7 || enc == Encoding.ASCII)));\n        var asp     = (double)CcCount \/ BinData.Length * 100d;\n        return !(asp > ConfidenceThreshold);\n    }\n    public byte[] ReadBytes(string path)\n    {\n        try\n        {\n            var ba  = File.ReadAllBytes(path);\n            var enc = GetEncoding(ba);\n            EncodingFiles.Add(path, enc.EncodingName);\n            if (Equals(enc, Encoding.UTF7) || Equals(enc, Encoding.UTF8) || Equals(enc, Encoding.ASCII)) return ba;\n            if (Equals(enc, Encoding.Unicode))\n            {\n                var len  = ba.Length - 2;\n                var blen = len \/ 2;\n                if (blen * 2 &lt; len) blen++;\n                var b = new byte[blen];\n                for (int i = 2, j = 0; i &lt; ba.Length &amp;&amp; j &lt; blen; i += 2, ++j)\n                    b[j] = ba[i];\n                return b;\n            }\n            if (Equals(enc, Encoding.UTF32))\n            {\n                var len1  = ba.Length - 4;\n                var blen1 = len1 \/ 4;\n                if (blen1 * 4 &lt; len1)\n                    blen1++;\n                var b1 = new byte[blen1];\n                for (int i = 4, j = 0; i &lt; ba.Length &amp;&amp; j &lt; blen1; i += 4, ++j)\n                    b1[j] = ba[i];\n                return b1;\n            }\n            return ba;\n        }\n        catch (Exception ex)\n        {\n            ExceptionLog.ExLog(ex, \"ReadBytes\", \"ReadBytes\");\n        }\n        return null;\n    }\n    public bool IsTextQik(string path, bool TestEntireFile = false)\n    {\n        var    isText = true;\n        double asp    = 0;\n        using (var fileStream = File.OpenRead(path))\n        {\n            var WindowSize = 0l;\n            if (TestEntireFile)\n            {\n                WindowSize = fileStream.Length;\n            }\n            else\n            {\n                WindowSize = 512;\n                if (WindowSize > fileStream.Length)\n                    WindowSize = fileStream.Length;\n            }\n            if (fileStream.Length == 0)\n                return false;\n            var BinData = new byte[WindowSize];\n            fileStream.Read(BinData, 0, BinData.Length);\n            if (fileStream.Length &lt; 4)\n            {\n                foreach (var b in BinData)\n                    if (!IsValidTextByte(b))\n                        return false;\n                return true;\n            }\n            if (_boyerMoore4Null.Search(BinData) != -1)\n                isText = false;\n            if (!ComfirmIsText(BinData))\n                isText = false;\n        }\n        return isText;\n    }\n    public bool IsTextQik(byte[] buffer, bool TestEntireFile = false)\n    {\n        var    isText = true;\n        double asp    = 0;\n        if (buffer.Length == 0)\n            return false;\n        var WindowSize = 0;\n        if (TestEntireFile)\n        {\n            WindowSize = buffer.Length;\n        }\n        else\n        {\n            WindowSize = 512;\n            if (WindowSize > buffer.Length)\n                WindowSize = buffer.Length;\n        }\n        var BinData = buffer.SubArray(0, WindowSize);\n        if (BinData.Count(b => !IsValidTextByte(b)) > 0)\n            return false;\n        if (_boyerMoore4Null.Search(BinData) != -1)\n            return false;\n        return true;\n    }\n    public bool IsText(string path, bool TestEntireFile = false, double ConfidenceThreshold = 100, bool TestEncoding = true)\n    {\n        var    Reason = \"None\";\n        var    isText = true;\n        double asp    = 0;\n        using (var fileStream = File.OpenRead(path))\n        {\n            var WindowSize = 0l;\n            if (TestEntireFile)\n            {\n                WindowSize = fileStream.Length;\n            }\n            else\n            {\n                WindowSize = 512;\n                if (WindowSize > fileStream.Length)\n                    WindowSize = fileStream.Length;\n            }\n            if (fileStream.Length == 0)\n            {\n                BinaryFiles.Add(path, \"Zero Length File.\");\n                return false;\n            }\n            var BinData       = new byte[WindowSize];\n            var BinDataLength = fileStream.Read(BinData, 0, BinData.Length);\n            fileStream.Seek(0, SeekOrigin.Begin);\n            if (fileStream.Length &lt; 4)\n            {\n                var r = BinData.All(b => IsValidTextByte(b));\n                if (!r)\n                    BinaryFiles.Add(path, \"Length 4 file Contains invalid Characters.\");\n                return r;\n            }\n            if (_boyerMoore4Null.Search(BinData) != -1)\n            {\n                Reason = \"4 Sequential Nulls Found within File.\";\n                isText = false;\n            }\n            if (isText)\n            {\n                var enc = GetEncoding(BinData);\n                if (TestEncoding)\n                {\n                    var TextData = new char[WindowSize];\n                    var eMatches = 0;\n                    using (var streamReader = new StreamReader(fileStream))\n                    {\n                        streamReader.Read(TextData, 0, TextData.Length);\n                    }\n                    using (var memoryStream = new MemoryStream())\n                    {\n                        using (var streamWriter = new StreamWriter(memoryStream, enc))\n                        {\n                            streamWriter.Write(TextData);\n                            streamWriter.Flush();\n                            var memoryBuffer = memoryStream.GetBuffer();\n                            for (var i = 0; i &lt; BinDataLength; i++)\n                                if (BinData[i] == memoryBuffer[i])\n                                    eMatches++;\n                            var er = (double)eMatches \/ BinDataLength * 100d;\n                            if ((int)er &lt; 99)\n                            {\n                                isText = false;\n                                Reason = $\"Encoding Mismatch: {er:0.0}\";\n                            }\n                        }\n                    }\n                }\n                if (isText)\n                {\n                    double CcCount = BinData.AsParallel().Count(b =>\n                        !IsValidTextByte(b, !(enc == Encoding.Unicode || enc == Encoding.UTF8 || enc == Encoding.UTF7 || enc == Encoding.ASCII)));\n                    asp = CcCount \/ BinData.Length * 100d;\n                    if (asp > ConfidenceThreshold)\n                    {\n                        Reason = $\"Confidence threshold {ConfidenceThreshold:0.0} Exceeded: {asp:0.0}\";\n                        isText = false;\n                    }\n                }\n            }\n        }\n        if (isText)\n            TextFiles.Add(path, asp.TruncateToDecimalPlace(1));\n        else\n            BinaryFiles.Add(path, Reason);\n        return isText;\n    }\n    public static Encoding GetEncoding(byte[] Data)\n    {\n        if (Data == null)\n            throw new Exception(\"Array cannot be null.\");\n        if (Data.Length &lt; 2)\n            return Encoding.Default;\n        if (Data[0] == 0xff &amp;&amp; Data[1] == 0xfe)\n            return Encoding.Unicode;\n        if (Data[0] == 0xfe &amp;&amp; Data[1] == 0xff)\n            return Encoding.BigEndianUnicode;\n        if (Data.Length &lt; 3)\n            return Encoding.Default;\n        if (Data[0] == 0xef &amp;&amp; Data[1] == 0xbb &amp;&amp; Data[2] == 0xbf)\n            return Encoding.UTF8;\n        if (Data[0] == 0x2b &amp;&amp; Data[1] == 0x2f &amp;&amp; Data[2] == 0x76)\n            return Encoding.UTF7;\n        if (Data.Length &lt; 4)\n            return Encoding.Default;\n        if (Data[0] == 0xff &amp;&amp; Data[1] == 0xfe &amp;&amp; Data[2] == 0 &amp;&amp; Data[3] == 0)\n            return Encoding.UTF32;\n        return Encoding.Default;\n    }\n    public static bool ComfirmIsText(byte[] Data)\n    {\n        if (Data == null)\n            throw new Exception(\"Array cannot be null.\");\n        if (Data.Length &lt; 2)\n            return false;\n        if (Data[0] == 0xff &amp;&amp; Data[1] == 0xfe)\n            return true;\n        if (Data[0] == 0xfe &amp;&amp; Data[1] == 0xff)\n            return true;\n        if (Data.Length &lt; 3)\n            return false;\n        if (Data[0] == 0xef &amp;&amp; Data[1] == 0xbb &amp;&amp; Data[2] == 0xbf)\n            return true;\n        if (Data[0] == 0x2b &amp;&amp; Data[1] == 0x2f &amp;&amp; Data[2] == 0x76)\n            return true;\n        if (Data.Length &lt; 4)\n            return false;\n        if (Data[0] == 0xff &amp;&amp; Data[1] == 0xfe &amp;&amp; Data[2] == 0 &amp;&amp; Data[3] == 0)\n            return true;\n        var ld = Data.SubArray(0, Data.Length >= 6 ? 6 : Data.Length);\n        foreach (var b in ld)\n            if (!IsValidTextByte(b))\n                return false;\n        return true;\n        ;\n    }\n    private static bool IsValidTextByte(byte _byte, bool IncludeNull = false)\n    {\n        if (IncludeNull)\n            if (_byte == 0x00)\n                return true;\n        if (_byte == 0x0A || _byte == 0x0D || _byte == 0x09 || (_byte >= 0x20 &amp;&amp; _byte &lt;= 0x2F) || (_byte >= 0x30 &amp;&amp; _byte &lt;= 0x39) || (_byte >= 0x3A &amp;&amp; _byte &lt;= 0x40) ||\n            (_byte >= 0x41 &amp;&amp; _byte &lt;= 0x5A) || (_byte >= 0x5B &amp;&amp; _byte &lt;= 0x60) || (_byte >= 0x61 &amp;&amp; _byte &lt;= 0x7A) || (_byte >= 0x7B &amp;&amp; _byte &lt;= 0x7E))\n            return true;\n        return false;\n    }\n}<\/pre>\n","protected":false},"excerpt":{"rendered":"<p>Determine Byte Array Binary or Text Updated: March-5, 2022<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[2],"tags":[10,4,8,11,9],"_links":{"self":[{"href":"https:\/\/michaeljohnsteiner.com\/index.php\/wp-json\/wp\/v2\/posts\/68"}],"collection":[{"href":"https:\/\/michaeljohnsteiner.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/michaeljohnsteiner.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/michaeljohnsteiner.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/michaeljohnsteiner.com\/index.php\/wp-json\/wp\/v2\/comments?post=68"}],"version-history":[{"count":2,"href":"https:\/\/michaeljohnsteiner.com\/index.php\/wp-json\/wp\/v2\/posts\/68\/revisions"}],"predecessor-version":[{"id":526,"href":"https:\/\/michaeljohnsteiner.com\/index.php\/wp-json\/wp\/v2\/posts\/68\/revisions\/526"}],"wp:attachment":[{"href":"https:\/\/michaeljohnsteiner.com\/index.php\/wp-json\/wp\/v2\/media?parent=68"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/michaeljohnsteiner.com\/index.php\/wp-json\/wp\/v2\/categories?post=68"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/michaeljohnsteiner.com\/index.php\/wp-json\/wp\/v2\/tags?post=68"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}