TextBinaryFileId.cs

Determine whether a byte array (or file) is binary or text

Updated: March 5, 2022

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
/// <summary>
///     Heuristically classifies files and byte arrays as text or binary using byte-order-mark
///     detection, a search for four consecutive null bytes, and the percentage of bytes that fall
///     outside the printable ASCII / tab / CR / LF set. Results of the path-based checks are
///     recorded in <see cref="BinaryFiles" />, <see cref="EncodingFiles" /> and <see cref="TextFiles" />.
/// </summary>
public class TextBinaryFileId
{
    /// <summary>Number of leading bytes inspected when the caller does not ask for the entire input.</summary>
    private const int DefaultWindowSize = 512;

    // Searcher for a run of four 0x00 bytes; callers treat a result of -1 as "not found".
    private readonly BoyerMooreByte             _boyerMoore4Null = new(new byte[] { 0, 0, 0, 0 });

    /// <summary>Path -> reason, for every file <see cref="IsText" /> classified as binary.</summary>
    public           Dictionary<string, string> BinaryFiles      = new();

    /// <summary>Path -> encoding name, for every file read through <see cref="ReadBytes" />.</summary>
    public           Dictionary<string, string> EncodingFiles    = new();

    /// <summary>Path -> percentage of invalid bytes, for every file <see cref="IsText" /> classified as text.</summary>
    public           Dictionary<string, double> TextFiles        = new();

    /// <summary>
    ///     Decides whether an in-memory block looks like text.
    /// </summary>
    /// <param name="BinData">The bytes to inspect.</param>
    /// <param name="ConfidenceThreshold">Maximum tolerated percentage (0-100) of invalid bytes.</param>
    /// <returns>
    ///     <c>false</c> for an empty block, a block containing four consecutive nulls, or a block whose
    ///     invalid-byte percentage exceeds <paramref name="ConfidenceThreshold" />; otherwise <c>true</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="BinData" /> is null.</exception>
    public bool IsBlockText(byte[] BinData, double ConfidenceThreshold = 25)
    {
        if (BinData is null)
            throw new ArgumentNullException(nameof(BinData));
        if (BinData.Length == 0)
            return false;
        if (_boyerMoore4Null.Search(BinData) != -1)
            return false;
        var enc     = GetEncoding(BinData);
        var invalid = CountInvalidBytes(BinData, NullCountsAsText(enc));
        var asp     = (double)invalid / BinData.Length * 100d;
        // Written as a negated '>' so a NaN threshold keeps yielding true, as it always has.
        return !(asp > ConfidenceThreshold);
    }

    /// <summary>
    ///     Reads a file and returns its content as single bytes. BOM-marked UTF-16 LE and UTF-32 LE
    ///     content is narrowed to the low byte of each code unit (the BOM is dropped); everything
    ///     else is returned unchanged. The detected encoding name is stored in <see cref="EncodingFiles" />.
    /// </summary>
    /// <param name="path">File to read.</param>
    /// <returns>The (possibly narrowed) bytes, or <c>null</c> when reading failed; the failure is logged.</returns>
    public byte[] ReadBytes(string path)
    {
        try
        {
            var raw = File.ReadAllBytes(path);
            var enc = GetEncoding(raw);
            // Indexer instead of Add: Add throws on a repeated path, which the catch below used to
            // turn into a null result for a perfectly readable file.
            EncodingFiles[path] = enc.EncodingName;
            if (Equals(enc, Encoding.Unicode))
                return TakeLeadingByteOfEachUnit(raw, 2);
            if (Equals(enc, Encoding.UTF32))
                return TakeLeadingByteOfEachUnit(raw, 4);
            return raw;
        }
        catch (Exception ex)
        {
            // Deliberate best effort: log and report failure through the null return value.
            ExceptionLog.ExLog(ex, "ReadBytes", "ReadBytes");
        }
        return null;
    }

    /// <summary>
    ///     Quick text test on a file: no four-null run in the inspected window and the window either
    ///     starts with a recognised BOM or begins with valid text bytes (see <see cref="ComfirmIsText" />).
    /// </summary>
    /// <param name="path">File to inspect.</param>
    /// <param name="TestEntireFile">Inspect the whole file instead of the first 512 bytes.</param>
    /// <returns><c>true</c> when the file looks like text; <c>false</c> otherwise, including for an empty file.</returns>
    public bool IsTextQik(string path, bool TestEntireFile = false)
    {
        using (var fileStream = File.OpenRead(path))
        {
            var fileLength = fileStream.Length;
            var binData    = ReadWindow(fileStream, WindowLength(fileLength, TestEntireFile));
            if (binData.Length == 0)
                return false;
            // Files shorter than four bytes cannot hold a four-null run; every byte must be valid text.
            if (fileLength < 4)
                return binData.All(b => IsValidTextByte(b));
            if (_boyerMoore4Null.Search(binData) != -1)
                return false;
            return ComfirmIsText(binData);
        }
    }

    /// <summary>
    ///     Quick text test on a buffer: every byte in the inspected window must be a valid text byte.
    /// </summary>
    /// <param name="buffer">Bytes to inspect.</param>
    /// <param name="TestEntireFile">Inspect the whole buffer instead of the first 512 bytes.</param>
    /// <returns><c>true</c> when all inspected bytes are valid text bytes; <c>false</c> otherwise or when empty.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="buffer" /> is null.</exception>
    public bool IsTextQik(byte[] buffer, bool TestEntireFile = false)
    {
        if (buffer is null)
            throw new ArgumentNullException(nameof(buffer));
        if (buffer.Length == 0)
            return false;
        var window = (int)WindowLength(buffer.Length, TestEntireFile);
        // A single 0x00 already fails IsValidTextByte here, so a separate search for a run of four
        // nulls could never change the outcome and is not performed.
        for (var i = 0; i < window; i++)
            if (!IsValidTextByte(buffer[i]))
                return false;
        return true;
    }

    /// <summary>
    ///     Full text test on a file. The outcome is recorded in <see cref="TextFiles" /> or
    ///     <see cref="BinaryFiles" />, except that a file shorter than four bytes that passes is
    ///     returned as text without being recorded.
    /// </summary>
    /// <param name="path">File to inspect.</param>
    /// <param name="TestEntireFile">Inspect the whole file instead of the first 512 bytes.</param>
    /// <param name="ConfidenceThreshold">Maximum tolerated percentage (0-100) of invalid bytes.</param>
    /// <param name="TestEncoding">
    ///     When true, the window is decoded and re-encoded with the detected encoding and must
    ///     reproduce at least 99% of the original bytes.
    /// </param>
    /// <returns><c>true</c> when the file is classified as text.</returns>
    public bool IsText(string path, bool TestEntireFile = false, double ConfidenceThreshold = 100, bool TestEncoding = true)
    {
        var    reason = "None";
        var    isText = true;
        double asp    = 0;
        using (var fileStream = File.OpenRead(path))
        {
            var fileLength = fileStream.Length;
            var binData    = ReadWindow(fileStream, WindowLength(fileLength, TestEntireFile));
            if (binData.Length == 0)
            {
                BinaryFiles[path] = "Zero Length File.";
                return false;
            }
            if (fileLength < 4)
            {
                var allValid = binData.All(b => IsValidTextByte(b));
                if (!allValid)
                    BinaryFiles[path] = "Length 4 file Contains invalid Characters.";
                return allValid;
            }
            if (_boyerMoore4Null.Search(binData) != -1)
            {
                reason = "4 Sequential Nulls Found within File.";
                isText = false;
            }
            else
            {
                var enc = GetEncoding(binData);
                if (TestEncoding)
                {
                    // Closes fileStream; it is not needed after this point.
                    var er = EncodingRoundTripMatch(fileStream, binData, enc);
                    if ((int)er < 99)
                    {
                        isText = false;
                        reason = $"Encoding Mismatch: {er:0.0}";
                    }
                }
                if (isText)
                {
                    asp = (double)CountInvalidBytes(binData, NullCountsAsText(enc)) / binData.Length * 100d;
                    if (asp > ConfidenceThreshold)
                    {
                        reason = $"Confidence threshold {ConfidenceThreshold:0.0} Exceeded: {asp:0.0}";
                        isText = false;
                    }
                }
            }
        }
        // Indexer assignment so that classifying the same path twice overwrites the earlier entry
        // instead of throwing ArgumentException.
        if (isText)
            TextFiles[path] = asp.TruncateToDecimalPlace(1);
        else
            BinaryFiles[path] = reason;
        return isText;
    }

    /// <summary>
    ///     Detects the encoding of <paramref name="Data" /> from its byte order mark.
    /// </summary>
    /// <param name="Data">Bytes whose leading BOM, if any, is examined.</param>
    /// <returns>
    ///     <see cref="Encoding.UTF32" />, <see cref="Encoding.Unicode" />, <see cref="Encoding.BigEndianUnicode" />,
    ///     <see cref="Encoding.UTF8" /> or <see cref="Encoding.UTF7" /> when the matching BOM is present;
    ///     <see cref="Encoding.Default" /> otherwise.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="Data" /> is null.</exception>
    public static Encoding GetEncoding(byte[] Data)
    {
        if (Data is null)
            throw new ArgumentNullException(nameof(Data), "Array cannot be null.");
        TryDetectBom(Data, out var encoding);
        return encoding;
    }

    /// <summary>
    ///     Cheap plausibility check: a recognised BOM, or (for four or more bytes) up to the first six
    ///     bytes all being valid text bytes.
    /// </summary>
    /// <param name="Data">Bytes to examine.</param>
    /// <returns>
    ///     <c>true</c> when a BOM is found or the leading bytes are valid text; <c>false</c> otherwise,
    ///     including for BOM-less input shorter than four bytes.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="Data" /> is null.</exception>
    public static bool ComfirmIsText(byte[] Data)
    {
        if (Data is null)
            throw new ArgumentNullException(nameof(Data), "Array cannot be null.");
        if (TryDetectBom(Data, out _))
            return true;
        if (Data.Length < 4)
            return false;
        var probe = Math.Min(Data.Length, 6);
        for (var i = 0; i < probe; i++)
            if (!IsValidTextByte(Data[i]))
                return false;
        return true;
    }

    /// <summary>
    ///     Returns true for tab, LF, CR and printable ASCII (0x20-0x7E); 0x00 is also accepted when
    ///     <paramref name="IncludeNull" /> is set.
    /// </summary>
    private static bool IsValidTextByte(byte _byte, bool IncludeNull = false)
    {
        if (IncludeNull && _byte == 0x00)
            return true;
        return _byte is 0x09 or 0x0A or 0x0D or (>= 0x20 and <= 0x7E);
    }

    /// <summary>
    ///     Matches the leading bytes of <paramref name="data" /> against the known byte order marks.
    ///     On failure <paramref name="encoding" /> is set to <see cref="Encoding.Default" />.
    /// </summary>
    private static bool TryDetectBom(byte[] data, out Encoding encoding)
    {
        // UTF-32 LE must be tested first: its BOM (FF FE 00 00) begins with the UTF-16 LE BOM
        // (FF FE), so testing UTF-16 first would make this case unreachable.
        if (data.Length >= 4 && data[0] == 0xff && data[1] == 0xfe && data[2] == 0 && data[3] == 0)
        {
            encoding = Encoding.UTF32;
            return true;
        }
        if (data.Length >= 2)
        {
            if (data[0] == 0xff && data[1] == 0xfe)
            {
                encoding = Encoding.Unicode;
                return true;
            }
            if (data[0] == 0xfe && data[1] == 0xff)
            {
                encoding = Encoding.BigEndianUnicode;
                return true;
            }
        }
        if (data.Length >= 3)
        {
            if (data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf)
            {
                encoding = Encoding.UTF8;
                return true;
            }
            if (data[0] == 0x2b && data[1] == 0x2f && data[2] == 0x76)
            {
                encoding = Encoding.UTF7;
                return true;
            }
        }
        encoding = Encoding.Default;
        return false;
    }

    /// <summary>
    ///     Whether 0x00 counts as a valid text byte for the given detected encoding: false for
    ///     UTF-16 LE, UTF-8, UTF-7 and ASCII, true for everything else.
    /// </summary>
    /// <remarks>
    ///     NOTE(review): UTF-16 LE sits in the "nulls are invalid" group although ASCII-range UTF-16
    ///     text carries a zero byte per character - confirm this grouping is intended.
    /// </remarks>
    private static bool NullCountsAsText(Encoding enc)
    {
        // Reference comparison on purpose: GetEncoding hands back the shared static instances.
        return !(ReferenceEquals(enc, Encoding.Unicode) || ReferenceEquals(enc, Encoding.UTF8) ||
                 ReferenceEquals(enc, Encoding.UTF7) || ReferenceEquals(enc, Encoding.ASCII));
    }

    /// <summary>Counts the bytes of <paramref name="data" /> that are not valid text bytes.</summary>
    private static int CountInvalidBytes(byte[] data, bool includeNull)
    {
        var invalid = 0;
        foreach (var b in data)
            if (!IsValidTextByte(b, includeNull))
                invalid++;
        return invalid;
    }

    /// <summary>Number of bytes to inspect: everything, or at most <see cref="DefaultWindowSize" />.</summary>
    private static long WindowLength(long available, bool entire)
    {
        return entire ? available : Math.Min(DefaultWindowSize, available);
    }

    /// <summary>
    ///     Reads up to <paramref name="windowSize" /> bytes from the current position of
    ///     <paramref name="stream" />, looping because a single Read call may deliver fewer bytes than
    ///     requested. The returned array is trimmed to the bytes actually read.
    /// </summary>
    private static byte[] ReadWindow(Stream stream, long windowSize)
    {
        var buffer = new byte[windowSize];
        var total  = 0;
        while (total < buffer.Length)
        {
            var read = stream.Read(buffer, total, buffer.Length - total);
            if (read == 0)
                break;
            total += read;
        }
        // Trim so zero padding is never mistaken for null bytes in the data.
        if (total < buffer.Length)
            Array.Resize(ref buffer, total);
        return buffer;
    }

    /// <summary>
    ///     Decodes <paramref name="source" /> from its start with a StreamReader, re-encodes the text
    ///     with <paramref name="enc" />, and returns the percentage (0-100) of positions in
    ///     <paramref name="original" /> that the re-encoded bytes reproduce. Closes <paramref name="source" />.
    /// </summary>
    private static double EncodingRoundTripMatch(Stream source, byte[] original, Encoding enc)
    {
        source.Seek(0, SeekOrigin.Begin);
        var text = new char[original.Length];
        using (var reader = new StreamReader(source))
        {
            // ReadBlock keeps reading until the array is full or the stream ends.
            reader.ReadBlock(text, 0, text.Length);
        }
        using (var memory = new MemoryStream())
        using (var writer = new StreamWriter(memory, enc))
        {
            writer.Write(text);
            writer.Flush();
            var encoded    = memory.GetBuffer();
            var comparable = (int)Math.Min(original.Length, memory.Length);
            var matches    = 0;
            for (var i = 0; i < comparable; i++)
                if (original[i] == encoded[i])
                    matches++;
            return (double)matches / original.Length * 100d;
        }
    }

    /// <summary>
    ///     Skips the first <paramref name="unitSize" /> bytes (the BOM) and returns the first byte of
    ///     every following <paramref name="unitSize" />-byte unit, which is the low byte for
    ///     little-endian data.
    /// </summary>
    private static byte[] TakeLeadingByteOfEachUnit(byte[] data, int unitSize)
    {
        var payload = data.Length - unitSize;
        var count   = (payload + unitSize - 1) / unitSize; // round up so a trailing partial unit is kept
        var result  = new byte[count];
        for (int src = unitSize, dst = 0; src < data.Length && dst < count; src += unitSize, ++dst)
            result[dst] = data[src];
        return result;
    }
}