Determine Whether a Byte Array Is Binary or Text
Updated: March 5, 2022
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

/// <summary>
/// Heuristically classifies byte arrays and files as text or binary.
/// Classification combines a byte-order-mark (BOM) check, a search for four
/// consecutive null bytes, and the share of bytes outside printable ASCII.
/// The file-based methods record their verdicts in <see cref="BinaryFiles"/>,
/// <see cref="TextFiles"/> and <see cref="EncodingFiles"/>, keyed by path.
/// </summary>
public class TextBinaryFileId
{
    /// <summary>Number of leading bytes inspected when the entire input is not tested.</summary>
    private const int DefaultWindowSize = 512;

    /// <summary>Searcher for a run of four consecutive 0x00 bytes.</summary>
    private readonly BoyerMooreByte _boyerMoore4Null = new(new byte[] { 0, 0, 0, 0 });

    /// <summary>Paths classified as binary by <see cref="IsText"/>, mapped to the reason.</summary>
    public Dictionary<string, string> BinaryFiles = new();

    /// <summary>Paths read by <see cref="ReadBytes"/>, mapped to the detected encoding name.</summary>
    public Dictionary<string, string> EncodingFiles = new();

    /// <summary>Paths classified as text by <see cref="IsText"/>, mapped to the percentage of invalid bytes.</summary>
    public Dictionary<string, double> TextFiles = new();

    /// <summary>
    /// Decides whether an in-memory block looks like text.
    /// </summary>
    /// <param name="BinData">The bytes to classify.</param>
    /// <param name="ConfidenceThreshold">Maximum tolerated percentage (0-100) of non-text bytes.</param>
    /// <returns>
    /// <c>false</c> for an empty block, a block containing four consecutive nulls, or a block whose
    /// percentage of non-text bytes exceeds <paramref name="ConfidenceThreshold"/>; otherwise <c>true</c>.
    /// </returns>
    public bool IsBlockText(byte[] BinData, double ConfidenceThreshold = 25)
    {
        if (BinData.Length == 0)
        {
            return false;
        }

        if (_boyerMoore4Null.Search(BinData) != -1)
        {
            return false;
        }

        // Evaluate the encoding test once instead of once per byte.
        var allowNull = AllowsNullBytes(GetEncoding(BinData));
        var invalidCount = BinData.AsParallel().Count(b => !IsValidTextByte(b, allowNull));
        var invalidPercent = (double)invalidCount / BinData.Length * 100d;
        return !(invalidPercent > ConfidenceThreshold);
    }

    /// <summary>
    /// Reads a file and reduces UTF-16 LE / UTF-32 LE content to one byte per code unit.
    /// The detected encoding name is stored in <see cref="EncodingFiles"/>.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>
    /// The raw file bytes, or for BOM-marked UTF-16 LE / UTF-32 LE files the first byte of every
    /// code unit after the BOM. Returns <c>null</c> when reading fails (the exception is logged).
    /// </returns>
    public byte[] ReadBytes(string path)
    {
        try
        {
            var raw = File.ReadAllBytes(path);
            var enc = GetEncoding(raw);

            // Indexer instead of Add: reading the same path twice used to throw a duplicate-key
            // exception that the catch below swallowed, turning a valid read into null.
            EncodingFiles[path] = enc.EncodingName;

            if (Equals(enc, Encoding.Unicode))
            {
                return ExtractLeadingBytes(raw, 2);
            }

            if (Equals(enc, Encoding.UTF32))
            {
                return ExtractLeadingBytes(raw, 4);
            }

            // Every other encoding is returned untouched.
            return raw;
        }
        catch (Exception ex)
        {
            ExceptionLog.ExLog(ex, "ReadBytes", "ReadBytes");
        }

        return null;
    }

    /// <summary>
    /// Quick text test for a file: looks at a BOM or at the first few bytes only.
    /// </summary>
    /// <param name="path">Path of the file to test.</param>
    /// <param name="TestEntireFile">Load the whole file instead of the first 512 bytes.</param>
    /// <returns><c>true</c> when the inspected window looks like text; <c>false</c> for empty files.</returns>
    public bool IsTextQik(string path, bool TestEntireFile = false)
    {
        using var fileStream = File.OpenRead(path);
        if (fileStream.Length == 0)
        {
            return false;
        }

        var window = ReadWindow(fileStream, TestEntireFile, out _);

        // Files shorter than four bytes cannot hold the null run; test each byte instead.
        if (fileStream.Length < 4)
        {
            return window.All(b => IsValidTextByte(b));
        }

        if (_boyerMoore4Null.Search(window) != -1)
        {
            return false;
        }

        return ComfirmIsText(window);
    }

    /// <summary>
    /// Quick text test for a buffer: every inspected byte must be printable ASCII, tab, CR or LF.
    /// </summary>
    /// <param name="buffer">The bytes to test.</param>
    /// <param name="TestEntireFile">Inspect the whole buffer instead of the first 512 bytes.</param>
    /// <returns><c>true</c> when the inspected window passes; <c>false</c> for an empty buffer.</returns>
    public bool IsTextQik(byte[] buffer, bool TestEntireFile = false)
    {
        if (buffer.Length == 0)
        {
            return false;
        }

        var windowSize = TestEntireFile ? buffer.Length : Math.Min(DefaultWindowSize, buffer.Length);
        var window = buffer.SubArray(0, windowSize);

        if (window.Any(b => !IsValidTextByte(b)))
        {
            return false;
        }

        if (_boyerMoore4Null.Search(window) != -1)
        {
            return false;
        }

        return true;
    }

    /// <summary>
    /// Full text test for a file. The verdict is recorded in <see cref="TextFiles"/> or
    /// <see cref="BinaryFiles"/>, except that a file shorter than four bytes that passes
    /// is not recorded in <see cref="TextFiles"/>.
    /// </summary>
    /// <param name="path">Path of the file to test.</param>
    /// <param name="TestEntireFile">Load the whole file instead of the first 512 bytes.</param>
    /// <param name="ConfidenceThreshold">Maximum tolerated percentage (0-100) of non-text bytes.</param>
    /// <param name="TestEncoding">
    /// Also decode the file with a <see cref="StreamReader"/>, re-encode it with the detected
    /// encoding and require the result to match the original bytes.
    /// </param>
    /// <returns><c>true</c> when the file is classified as text.</returns>
    public bool IsText(string path, bool TestEntireFile = false, double ConfidenceThreshold = 100, bool TestEncoding = true)
    {
        var reason = "None";
        var isText = true;
        double invalidPercent = 0;

        using (var fileStream = File.OpenRead(path))
        {
            if (fileStream.Length == 0)
            {
                BinaryFiles[path] = "Zero Length File.";
                return false;
            }

            var window = ReadWindow(fileStream, TestEntireFile, out var bytesRead);

            // Rewind so the encoding round trip below decodes from the start of the file.
            fileStream.Seek(0, SeekOrigin.Begin);

            if (fileStream.Length < 4)
            {
                var allValid = window.All(b => IsValidTextByte(b));
                if (!allValid)
                {
                    BinaryFiles[path] = "Length 4 file Contains invalid Characters.";
                }

                return allValid;
            }

            if (_boyerMoore4Null.Search(window) != -1)
            {
                reason = "4 Sequential Nulls Found within File.";
                isText = false;
            }
            else
            {
                var enc = GetEncoding(window);

                if (TestEncoding)
                {
                    var matchPercent = RoundTripMatchPercent(fileStream, window, bytesRead, enc);
                    if ((int)matchPercent < 99)
                    {
                        isText = false;
                        reason = $"Encoding Mismatch: {matchPercent:0.0}";
                    }
                }

                if (isText)
                {
                    var allowNull = AllowsNullBytes(enc);
                    double invalidCount = window.AsParallel().Count(b => !IsValidTextByte(b, allowNull));
                    invalidPercent = invalidCount / window.Length * 100d;
                    if (invalidPercent > ConfidenceThreshold)
                    {
                        reason = $"Confidence threshold {ConfidenceThreshold:0.0} Exceeded: {invalidPercent:0.0}";
                        isText = false;
                    }
                }
            }
        }

        // Indexer instead of Add so that re-testing a path overwrites the earlier verdict
        // rather than throwing on the duplicate key.
        if (isText)
        {
            TextFiles[path] = invalidPercent.TruncateToDecimalPlace(1);
        }
        else
        {
            BinaryFiles[path] = reason;
        }

        return isText;
    }

    /// <summary>
    /// Detects an encoding from a leading byte-order mark.
    /// </summary>
    /// <param name="Data">Bytes whose first two to four bytes are inspected.</param>
    /// <returns>
    /// The encoding matching the BOM (UTF-32 LE, UTF-16 LE, UTF-16 BE, UTF-8, or the three-byte
    /// prefix 2B 2F 76 treated as UTF-7), or <see cref="Encoding.Default"/> when no BOM matches.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="Data"/> is <c>null</c>.</exception>
    public static Encoding GetEncoding(byte[] Data)
    {
        if (Data == null)
        {
            throw new ArgumentNullException(nameof(Data), "Array cannot be null.");
        }

        // The UTF-32 LE BOM (FF FE 00 00) starts with the UTF-16 LE BOM (FF FE), so the longer
        // mark must be tested first; otherwise it can never be reached.
        if (HasPrefix(Data, 0xff, 0xfe, 0x00, 0x00))
        {
            return Encoding.UTF32;
        }

        if (HasPrefix(Data, 0xff, 0xfe))
        {
            return Encoding.Unicode;
        }

        if (HasPrefix(Data, 0xfe, 0xff))
        {
            return Encoding.BigEndianUnicode;
        }

        if (HasPrefix(Data, 0xef, 0xbb, 0xbf))
        {
            return Encoding.UTF8;
        }

        if (HasPrefix(Data, 0x2b, 0x2f, 0x76))
        {
            return Encoding.UTF7;
        }

        return Encoding.Default;
    }

    /// <summary>
    /// Cheap text confirmation: a recognised BOM, or up to six leading bytes that are all
    /// printable ASCII, tab, CR or LF.
    /// </summary>
    /// <param name="Data">Bytes to inspect.</param>
    /// <returns>
    /// <c>true</c> when a BOM is present, or when <paramref name="Data"/> has at least four bytes
    /// and its first (up to) six bytes are all valid text bytes; otherwise <c>false</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="Data"/> is <c>null</c>.</exception>
    public static bool ComfirmIsText(byte[] Data)
    {
        if (Data == null)
        {
            throw new ArgumentNullException(nameof(Data), "Array cannot be null.");
        }

        // The UTF-32 LE BOM needs no separate test here: it begins with FF FE.
        if (HasPrefix(Data, 0xff, 0xfe)
            || HasPrefix(Data, 0xfe, 0xff)
            || HasPrefix(Data, 0xef, 0xbb, 0xbf)
            || HasPrefix(Data, 0x2b, 0x2f, 0x76))
        {
            return true;
        }

        if (Data.Length < 4)
        {
            return false;
        }

        var head = Data.SubArray(0, Math.Min(6, Data.Length));
        return head.All(b => IsValidTextByte(b));
    }

    /// <summary>
    /// Tests whether a byte is tab, LF, CR or printable ASCII (0x20-0x7E).
    /// </summary>
    /// <param name="_byte">The byte to test.</param>
    /// <param name="IncludeNull">When <c>true</c>, 0x00 is accepted as well.</param>
    /// <returns><c>true</c> when the byte counts as text.</returns>
    private static bool IsValidTextByte(byte _byte, bool IncludeNull = false)
    {
        if (IncludeNull && _byte == 0x00)
        {
            return true;
        }

        return _byte is 0x09 or 0x0A or 0x0D or (>= 0x20 and <= 0x7E);
    }

    /// <summary>
    /// Tells whether 0x00 bytes are counted as valid text for the given detected encoding.
    /// </summary>
    /// <remarks>
    /// NOTE(review): nulls are tolerated for every encoding except UTF-16 LE, UTF-8, UTF-7 and
    /// ASCII. That looks inverted for UTF-16 LE, but the rule is preserved as found -
    /// confirm the intent before changing it.
    /// </remarks>
    private static bool AllowsNullBytes(Encoding enc)
    {
        return !(enc == Encoding.Unicode || enc == Encoding.UTF8 || enc == Encoding.UTF7 || enc == Encoding.ASCII);
    }

    /// <summary>Returns <c>true</c> when <paramref name="data"/> begins with the given bytes.</summary>
    private static bool HasPrefix(byte[] data, params byte[] prefix)
    {
        if (data.Length < prefix.Length)
        {
            return false;
        }

        for (var i = 0; i < prefix.Length; i++)
        {
            if (data[i] != prefix[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Skips the first <paramref name="unitSize"/> bytes (the BOM) and then keeps the first byte
    /// of every <paramref name="unitSize"/>-byte unit.
    /// </summary>
    private static byte[] ExtractLeadingBytes(byte[] source, int unitSize)
    {
        var payloadLength = source.Length - unitSize;

        // Round up so a trailing partial unit still contributes its first byte.
        var count = (payloadLength + unitSize - 1) / unitSize;
        var result = new byte[count];
        for (int i = unitSize, j = 0; i < source.Length && j < count; i += unitSize, ++j)
        {
            result[j] = source[i];
        }

        return result;
    }

    /// <summary>
    /// Allocates the inspection window (whole stream, or at most 512 bytes) and fills it.
    /// </summary>
    /// <param name="stream">Stream to read from its current position.</param>
    /// <param name="entireFile">Size the window to the whole stream length.</param>
    /// <param name="bytesRead">Number of bytes actually read into the window.</param>
    private static byte[] ReadWindow(Stream stream, bool entireFile, out int bytesRead)
    {
        var windowSize = entireFile ? stream.Length : Math.Min((long)DefaultWindowSize, stream.Length);
        var window = new byte[windowSize];

        // A single Read call may return fewer bytes than requested; loop until full or EOF.
        bytesRead = 0;
        while (bytesRead < window.Length)
        {
            var n = stream.Read(window, bytesRead, window.Length - bytesRead);
            if (n == 0)
            {
                break;
            }

            bytesRead += n;
        }

        return window;
    }

    /// <summary>
    /// Decodes the stream with a <see cref="StreamReader"/>, re-encodes the characters with
    /// <paramref name="enc"/>, and returns the percentage of the first
    /// <paramref name="originalLength"/> bytes that equal <paramref name="original"/>.
    /// The stream is closed when the reader is disposed.
    /// </summary>
    private static double RoundTripMatchPercent(Stream stream, byte[] original, int originalLength, Encoding enc)
    {
        var text = new char[original.Length];
        using (var reader = new StreamReader(stream))
        {
            // ReadBlock keeps reading until the buffer is full or the stream ends.
            reader.ReadBlock(text, 0, text.Length);
        }

        using var memoryStream = new MemoryStream();
        using var writer = new StreamWriter(memoryStream, enc);
        writer.Write(text);
        writer.Flush();

        var encoded = memoryStream.GetBuffer();
        var comparable = Math.Min(originalLength, encoded.Length);
        var matches = 0;
        for (var i = 0; i < comparable; i++)
        {
            if (original[i] == encoded[i])
            {
                matches++;
            }
        }

        return (double)matches / originalLength * 100d;
    }
}