// Determine Whether a Byte Array Is Binary or Text
// Updated: March 5, 2022
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
/// <summary>
/// Heuristic classifier that decides whether a file or byte buffer looks like text or binary.
/// The checks used are: a search for four consecutive NUL bytes, byte-order-mark (BOM)
/// sniffing, an optional decode/re-encode round trip, and the percentage of bytes that fall
/// outside printable ASCII plus TAB/LF/CR.
/// </summary>
/// <remarks>
/// Results of <see cref="IsText"/> and <see cref="ReadBytes"/> are recorded in the public
/// dictionaries keyed by path. Re-processing a path overwrites its previous entry.
/// </remarks>
public class TextBinaryFileId
{
    /// <summary>Number of leading bytes inspected when the whole file/buffer is not requested.</summary>
    private const int DefaultWindowSize = 512;

    /// <summary>Files shorter than this are judged purely by per-byte validity.</summary>
    private const int ShortFileLength = 4;

    /// <summary>Number of leading bytes <see cref="ComfirmIsText"/> inspects when no BOM is present.</summary>
    private const int ConfirmPrefixLength = 6;

    private readonly BoyerMooreByte _boyerMoore4Null = new(new byte[] { 0, 0, 0, 0 });

    /// <summary>Paths classified as binary by <see cref="IsText"/>, mapped to the reason.</summary>
    public Dictionary<string, string> BinaryFiles = new();

    /// <summary>Paths read by <see cref="ReadBytes"/>, mapped to the detected encoding name.</summary>
    public Dictionary<string, string> EncodingFiles = new();

    /// <summary>Paths classified as text by <see cref="IsText"/>, mapped to the invalid-byte percentage.</summary>
    public Dictionary<string, double> TextFiles = new();

    /// <summary>
    /// Classifies an in-memory block as text when it contains no run of four NUL bytes and the
    /// percentage of invalid text bytes does not exceed <paramref name="ConfidenceThreshold"/>.
    /// </summary>
    /// <param name="BinData">The bytes to classify.</param>
    /// <param name="ConfidenceThreshold">Maximum tolerated percentage (0-100) of invalid bytes.</param>
    /// <returns><c>true</c> when the block looks like text; <c>false</c> otherwise (including an empty block).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="BinData"/> is <c>null</c>.</exception>
    public bool IsBlockText(byte[] BinData, double ConfidenceThreshold = 25)
    {
        if (BinData == null)
        {
            throw new ArgumentNullException(nameof(BinData));
        }

        if (BinData.Length == 0)
        {
            return false;
        }

        if (_boyerMoore4Null.Search(BinData) != -1)
        {
            return false;
        }

        var allowNull = !TreatsNullAsInvalid(GetEncoding(BinData));
        var invalidPercent = (double)CountInvalidBytes(BinData, allowNull) / BinData.Length * 100d;
        return !(invalidPercent > ConfidenceThreshold);
    }

    /// <summary>
    /// Reads a file and returns its bytes reduced to one byte per character for BOM-marked
    /// UTF-16 LE and UTF-32 LE content; every other encoding is returned unchanged.
    /// The detected encoding name is stored in <see cref="EncodingFiles"/>.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>
    /// The (possibly reduced) bytes, or <c>null</c> when any exception occurs; the exception is
    /// passed to <c>ExceptionLog.ExLog</c> rather than propagated.
    /// </returns>
    /// <remarks>
    /// The reduction skips the BOM and keeps only the first (low-order) byte of each code unit,
    /// so characters outside the first 256 code points are not preserved.
    /// </remarks>
    public byte[] ReadBytes(string path)
    {
        try
        {
            var content = File.ReadAllBytes(path);
            var enc = GetEncoding(content);

            // Indexer instead of Add: reading the same path twice must not throw (and, via the
            // catch below, silently turn a successful read into a null result).
            EncodingFiles[path] = enc.EncodingName;

            if (Equals(enc, Encoding.Unicode))
            {
                return EveryNthByte(content, 2, 2);
            }

            if (Equals(enc, Encoding.UTF32))
            {
                return EveryNthByte(content, 4, 4);
            }

            return content;
        }
        catch (Exception ex)
        {
            ExceptionLog.ExLog(ex, "ReadBytes", "ReadBytes");
        }

        return null;
    }

    /// <summary>
    /// Quick text test for a file: rejects a run of four NUL bytes and otherwise relies on
    /// <see cref="ComfirmIsText"/> (BOM present, or the leading bytes are valid text bytes).
    /// </summary>
    /// <param name="path">Path of the file to test.</param>
    /// <param name="TestEntireFile">
    /// When <c>true</c> the whole file is loaded; otherwise only the first 512 bytes.
    /// </param>
    /// <returns><c>true</c> when the file looks like text; <c>false</c> otherwise (including an empty file).</returns>
    public bool IsTextQik(string path, bool TestEntireFile = false)
    {
        using (var fileStream = File.OpenRead(path))
        {
            if (fileStream.Length == 0)
            {
                return false;
            }

            var window = new byte[GetWindowSize(fileStream.Length, TestEntireFile)];
            ReadFully(fileStream, window);

            // Too short for the BOM/four-null heuristics: every byte must be a valid text byte.
            if (fileStream.Length < ShortFileLength)
            {
                return window.All(b => IsValidTextByte(b));
            }

            return _boyerMoore4Null.Search(window) == -1 && ComfirmIsText(window);
        }
    }

    /// <summary>
    /// Quick text test for a buffer: every inspected byte must be printable ASCII, TAB, LF or CR.
    /// </summary>
    /// <param name="buffer">The bytes to test.</param>
    /// <param name="TestEntireFile">
    /// When <c>true</c> the whole buffer is inspected; otherwise only the first 512 bytes.
    /// </param>
    /// <returns><c>true</c> when all inspected bytes are valid; <c>false</c> otherwise (including an empty buffer).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is <c>null</c>.</exception>
    public bool IsTextQik(byte[] buffer, bool TestEntireFile = false)
    {
        if (buffer == null)
        {
            throw new ArgumentNullException(nameof(buffer));
        }

        if (buffer.Length == 0)
        {
            return false;
        }

        var windowSize = (int)GetWindowSize(buffer.Length, TestEntireFile);

        // NUL is not a valid text byte here, so a window that passes this loop cannot contain
        // four consecutive NULs; a separate four-null search would be redundant.
        for (var i = 0; i < windowSize; i++)
        {
            if (!IsValidTextByte(buffer[i]))
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Full text test for a file. The outcome is recorded in <see cref="TextFiles"/> (text) or
    /// <see cref="BinaryFiles"/> (binary, with the reason).
    /// </summary>
    /// <param name="path">Path of the file to test.</param>
    /// <param name="TestEntireFile">
    /// When <c>true</c> the whole file is loaded; otherwise only the first 512 bytes.
    /// </param>
    /// <param name="ConfidenceThreshold">Maximum tolerated percentage (0-100) of invalid bytes.</param>
    /// <param name="TestEncoding">
    /// When <c>true</c>, the window is decoded with a <see cref="StreamReader"/>, re-encoded with
    /// the BOM-detected encoding, and rejected when fewer than 99% of the bytes survive.
    /// </param>
    /// <returns><c>true</c> when the file looks like text; <c>false</c> otherwise.</returns>
    /// <remarks>
    /// A file shorter than four bytes whose bytes are all valid returns <c>true</c> without
    /// being added to <see cref="TextFiles"/>.
    /// </remarks>
    public bool IsText(string path, bool TestEntireFile = false, double ConfidenceThreshold = 100, bool TestEncoding = true)
    {
        var reason = "None";
        var isText = true;
        double invalidPercent = 0;

        using (var fileStream = File.OpenRead(path))
        {
            if (fileStream.Length == 0)
            {
                BinaryFiles[path] = "Zero Length File.";
                return false;
            }

            var window = new byte[GetWindowSize(fileStream.Length, TestEntireFile)];
            var bytesRead = ReadFully(fileStream, window);

            if (fileStream.Length < ShortFileLength)
            {
                var allValid = window.All(b => IsValidTextByte(b));
                if (!allValid)
                {
                    BinaryFiles[path] = "Length 4 file Contains invalid Characters.";
                }

                return allValid;
            }

            if (_boyerMoore4Null.Search(window) != -1)
            {
                reason = "4 Sequential Nulls Found within File.";
                isText = false;
            }
            else
            {
                var enc = GetEncoding(window);

                if (TestEncoding)
                {
                    var matchPercent = RoundTripMatchPercent(fileStream, window, bytesRead, enc);
                    if ((int)matchPercent < 99)
                    {
                        isText = false;
                        reason = $"Encoding Mismatch: {matchPercent:0.0}";
                    }
                }

                if (isText)
                {
                    var allowNull = !TreatsNullAsInvalid(enc);
                    invalidPercent = (double)CountInvalidBytes(window, allowNull) / window.Length * 100d;
                    if (invalidPercent > ConfidenceThreshold)
                    {
                        reason = $"Confidence threshold {ConfidenceThreshold:0.0} Exceeded: {invalidPercent:0.0}";
                        isText = false;
                    }
                }
            }
        }

        // Indexer instead of Add so that testing the same path again overwrites the old
        // verdict rather than throwing ArgumentException.
        if (isText)
        {
            TextFiles[path] = invalidPercent.TruncateToDecimalPlace(1);
        }
        else
        {
            BinaryFiles[path] = reason;
        }

        return isText;
    }

    /// <summary>
    /// Detects the encoding of <paramref name="Data"/> from its byte-order mark.
    /// </summary>
    /// <param name="Data">Bytes whose leading BOM (if any) is inspected.</param>
    /// <returns>
    /// <see cref="Encoding.UTF32"/>, <see cref="Encoding.Unicode"/>,
    /// <see cref="Encoding.BigEndianUnicode"/>, <see cref="Encoding.UTF8"/> or
    /// <see cref="Encoding.UTF7"/> when the matching BOM is present; otherwise
    /// <see cref="Encoding.Default"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="Data"/> is <c>null</c>.</exception>
    public static Encoding GetEncoding(byte[] Data)
    {
        if (Data == null)
        {
            throw new ArgumentNullException(nameof(Data), "Array cannot be null.");
        }

        // The UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE), so the longer
        // mark has to be tested first or it can never be reported.
        if (Data.Length >= 4 && Data[0] == 0xff && Data[1] == 0xfe && Data[2] == 0 && Data[3] == 0)
        {
            return Encoding.UTF32;
        }

        if (Data.Length >= 2)
        {
            if (Data[0] == 0xff && Data[1] == 0xfe)
            {
                return Encoding.Unicode;
            }

            if (Data[0] == 0xfe && Data[1] == 0xff)
            {
                return Encoding.BigEndianUnicode;
            }
        }

        if (Data.Length >= 3)
        {
            if (Data[0] == 0xef && Data[1] == 0xbb && Data[2] == 0xbf)
            {
                return Encoding.UTF8;
            }

            if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
            {
                return Encoding.UTF7;
            }
        }

        return Encoding.Default;
    }

    /// <summary>
    /// Confirms that <paramref name="Data"/> starts like text: either it begins with a
    /// UTF-16, UTF-8, UTF-7 or UTF-32 LE byte-order mark, or it is at least four bytes long
    /// and its first six bytes (or fewer, if shorter) are all valid text bytes.
    /// </summary>
    /// <param name="Data">Bytes to inspect.</param>
    /// <returns><c>true</c> when the data starts like text; <c>false</c> otherwise.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="Data"/> is <c>null</c>.</exception>
    public static bool ComfirmIsText(byte[] Data)
    {
        if (Data == null)
        {
            throw new ArgumentNullException(nameof(Data), "Array cannot be null.");
        }

        if (Data.Length < 2)
        {
            return false;
        }

        // UTF-16 LE / BE marks. FF FE also covers the UTF-32 LE mark (FF FE 00 00).
        if ((Data[0] == 0xff && Data[1] == 0xfe) || (Data[0] == 0xfe && Data[1] == 0xff))
        {
            return true;
        }

        if (Data.Length < 3)
        {
            return false;
        }

        // UTF-8 and UTF-7 marks.
        if ((Data[0] == 0xef && Data[1] == 0xbb && Data[2] == 0xbf) ||
            (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76))
        {
            return true;
        }

        if (Data.Length < 4)
        {
            return false;
        }

        // No BOM: fall back to checking the leading bytes one by one.
        var prefixLength = Math.Min(ConfirmPrefixLength, Data.Length);
        for (var i = 0; i < prefixLength; i++)
        {
            if (!IsValidTextByte(Data[i]))
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Returns <c>true</c> for TAB, LF, CR and printable ASCII (0x20-0x7E); NUL is accepted only
    /// when <paramref name="IncludeNull"/> is set.
    /// </summary>
    private static bool IsValidTextByte(byte _byte, bool IncludeNull = false)
    {
        if (_byte == 0x00)
        {
            return IncludeNull;
        }

        return _byte == 0x09 || _byte == 0x0A || _byte == 0x0D || (_byte >= 0x20 && _byte <= 0x7E);
    }

    /// <summary>
    /// Returns <c>true</c> for the encodings in which a NUL byte is counted as an invalid text byte.
    /// </summary>
    private static bool TreatsNullAsInvalid(Encoding enc)
    {
        // NOTE(review): UTF-16 LE (Encoding.Unicode) is in this list, so the NUL high bytes of
        // ASCII-range UTF-16 text are counted as invalid. That looks unintended, but it is the
        // established behaviour and is kept as is - confirm before changing.
        return ReferenceEquals(enc, Encoding.Unicode) ||
               ReferenceEquals(enc, Encoding.UTF8) ||
               ReferenceEquals(enc, Encoding.UTF7) ||
               ReferenceEquals(enc, Encoding.ASCII);
    }

    /// <summary>Counts the bytes in <paramref name="data"/> that are not valid text bytes.</summary>
    private static int CountInvalidBytes(byte[] data, bool allowNull)
    {
        var invalid = 0;
        foreach (var b in data)
        {
            if (!IsValidTextByte(b, allowNull))
            {
                invalid++;
            }
        }

        return invalid;
    }

    /// <summary>Returns the number of bytes to inspect: everything, or at most 512 bytes.</summary>
    private static long GetWindowSize(long available, bool testEntireFile)
    {
        return testEntireFile ? available : Math.Min(DefaultWindowSize, available);
    }

    /// <summary>
    /// Fills <paramref name="buffer"/> from <paramref name="stream"/>, repeating the read because
    /// a single <see cref="Stream.Read(byte[], int, int)"/> call may return fewer bytes than requested.
    /// </summary>
    /// <returns>The number of bytes actually read (less than the buffer length only at end of stream).</returns>
    private static int ReadFully(Stream stream, byte[] buffer)
    {
        var total = 0;
        while (total < buffer.Length)
        {
            var read = stream.Read(buffer, total, buffer.Length - total);
            if (read == 0)
            {
                break;
            }

            total += read;
        }

        return total;
    }

    /// <summary>
    /// Copies every <paramref name="stride"/>-th byte of <paramref name="data"/>, beginning at
    /// <paramref name="start"/>, into a new array.
    /// </summary>
    private static byte[] EveryNthByte(byte[] data, int start, int stride)
    {
        var remaining = data.Length - start;
        var result = new byte[(remaining + stride - 1) / stride];
        for (int i = start, j = 0; j < result.Length; i += stride, ++j)
        {
            result[j] = data[i];
        }

        return result;
    }

    /// <summary>
    /// Decodes the start of <paramref name="source"/> with a <see cref="StreamReader"/>,
    /// re-encodes the characters with <paramref name="enc"/>, and returns the percentage of the
    /// first <paramref name="bytesRead"/> bytes of <paramref name="window"/> that are reproduced.
    /// </summary>
    /// <remarks>Disposing the reader also closes <paramref name="source"/>.</remarks>
    private static double RoundTripMatchPercent(Stream source, byte[] window, int bytesRead, Encoding enc)
    {
        source.Seek(0, SeekOrigin.Begin);

        var text = new char[window.Length];
        using (var reader = new StreamReader(source))
        {
            reader.Read(text, 0, text.Length);
        }

        using (var memory = new MemoryStream())
        using (var writer = new StreamWriter(memory, enc))
        {
            // The whole char array is written (unfilled slots are '\0'), so the encoded output
            // holds at least window.Length bytes and the indexing below stays in range.
            writer.Write(text);
            writer.Flush();

            var encoded = memory.GetBuffer();
            var matches = 0;
            for (var i = 0; i < bytesRead; i++)
            {
                if (window[i] == encoded[i])
                {
                    matches++;
                }
            }

            return (double)matches / bytesRead * 100d;
        }
    }
}