You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

184 lines
4.2 KiB

  // Copyright (c) 2011 AlphaSierraPapa for the SharpDevelop Team
  //
  // Permission is hereby granted, free of charge, to any person obtaining a copy of this
  // software and associated documentation files (the "Software"), to deal in the Software
  // without restriction, including without limitation the rights to use, copy, modify, merge,
  // publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
  // to whom the Software is furnished to do so, subject to the following conditions:
  //
  // The above copyright notice and this permission notice shall be included in all copies or
  // substantial portions of the Software.
  //
  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
  // INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
  // PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
  // FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  // DEALINGS IN THE SOFTWARE.
  using System;
  using System.IO;
  using System.Text;
  using System.Xml;
  namespace ICSharpCode.ILSpy
  {
      /// <summary>
      /// Static methods for determining the type of a file.
      /// </summary>
      static class GuessFileType
      {
          /// <summary>
          /// Maximum number of bytes examined by <see cref="IsUTF8"/> (500 KB).
          /// </summary>
          const int MaxBytesToInspect = 500000;

          /// <summary>
          /// States of the UTF-8 validation state machine used by <see cref="IsUTF8"/>.
          /// </summary>
          enum Utf8State
          {
              /// <summary>Only bytes below 0x80 have been seen so far.</summary>
              Ascii,
              /// <summary>At least one complete multi-byte sequence has been seen.</summary>
              Utf8,
              /// <summary>Inside a multi-byte sequence; continuation bytes are still expected.</summary>
              Utf8Sequence,
              /// <summary>A byte was found that cannot occur at this point in UTF-8.</summary>
              Error
          }

          /// <summary>
          /// Classifies the content of <paramref name="stream"/> as binary, plain text or XML.
          /// </summary>
          /// <param name="stream">
          /// The stream to inspect, starting at its current position. It must support
          /// <see cref="Stream.Length"/> and <see cref="Stream.Position"/> (i.e. be seekable).
          /// The stream is not closed, and its position after the call is unspecified.
          /// </param>
          /// <returns>
          /// <see cref="FileType.Binary"/> if fewer than two bytes are available, a UTF-8 BOM is
          /// incomplete, or the content is not valid UTF-8; <see cref="FileType.Xml"/> if an XML
          /// reader can advance to the first content node; otherwise <see cref="FileType.Text"/>.
          /// </returns>
          /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
          public static FileType DetectFileType(Stream stream)
          {
              if (stream == null)
              {
                  throw new ArgumentNullException("stream");
              }

              // Everything below is relative to where the caller positioned the stream.
              long start = stream.Position;
              if (stream.Length - start < 2)
              {
                  return FileType.Binary;
              }

              int firstByte = stream.ReadByte();
              int secondByte = stream.ReadByte();
              if (firstByte < 0 || secondByte < 0)
              {
                  // The stream ended earlier than its reported length.
                  return FileType.Binary;
              }

              StreamReader reader;
              switch ((firstByte << 8) | secondByte)
              {
                  case 0xfffe: // UTF-16 LE BOM / UTF-32 LE BOM
                  case 0xfeff: // UTF-16 BE BOM
                      // Rewind so the StreamReader sees the BOM and picks the encoding itself.
                      stream.Position = start;
                      reader = new StreamReader(stream, detectEncodingFromByteOrderMarks: true);
                      break;
                  case 0xefbb: // start of UTF-8 BOM
                      if (stream.ReadByte() != 0xbf)
                      {
                          return FileType.Binary;
                      }
                      // The three BOM bytes are consumed; the reader starts right after them.
                      reader = new StreamReader(stream, Encoding.UTF8);
                      break;
                  default:
                      if (!IsUTF8(stream, (byte)firstByte, (byte)secondByte))
                      {
                          return FileType.Binary;
                      }
                      stream.Position = start;
                      reader = new StreamReader(stream, Encoding.UTF8);
                      break;
              }

              // Now we got a StreamReader with the correct encoding.
              // Check for XML now.
              // Note: the readers are not disposed here; disposing them would also close
              // the caller's stream.
              try
              {
                  XmlTextReader xmlReader = new XmlTextReader(reader);
                  // No resolver: external resources referenced by the document are not loaded.
                  xmlReader.XmlResolver = null;
                  xmlReader.MoveToContent();
                  return FileType.Xml;
              }
              catch (XmlException)
              {
                  return FileType.Text;
              }
          }

          /// <summary>
          /// Checks whether the data looks like UTF-8 (plain ASCII counts as UTF-8).
          /// </summary>
          /// <param name="fs">
          /// Seekable stream positioned directly after the two bytes that were already read.
          /// </param>
          /// <param name="firstByte">First byte of the data (already consumed from the stream).</param>
          /// <param name="secondByte">Second byte of the data (already consumed from the stream).</param>
          /// <returns>
          /// <c>false</c> if a byte violates the UTF-8 lead/continuation structure, or if the data
          /// ends in the middle of a multi-byte sequence; <c>true</c> otherwise. At most
          /// <see cref="MaxBytesToInspect"/> bytes are examined; a sequence cut off by that limit
          /// is not treated as an error.
          /// </returns>
          static bool IsUTF8(Stream fs, byte firstByte, byte secondByte)
          {
              // The two sniffed bytes were already read from fs, so they count as available data.
              long available = fs.Length - fs.Position + 2;
              int max = (int)Math.Min(available, MaxBytesToInspect);
              // True when the loop covers all data, i.e. the end is the real end and not our limit.
              bool reachedEndOfData = available <= MaxBytesToInspect;

              Utf8State state = Utf8State.Ascii;
              int pendingContinuationBytes = 0;

              for (int i = 0; i < max; i++)
              {
                  int b;
                  if (i == 0)
                  {
                      b = firstByte;
                  }
                  else if (i == 1)
                  {
                      b = secondByte;
                  }
                  else
                  {
                      b = fs.ReadByte();
                      if (b < 0)
                      {
                          // Stream ended earlier than its reported length.
                          reachedEndOfData = true;
                          break;
                      }
                  }

                  if (b < 0x80)
                  {
                      // normal ASCII character; not allowed inside a multi-byte sequence
                      if (state == Utf8State.Utf8Sequence)
                      {
                          state = Utf8State.Error;
                          break;
                      }
                  }
                  else if (b < 0xc0)
                  {
                      // 10xxxxxx : continues UTF8 byte sequence
                      if (state != Utf8State.Utf8Sequence)
                      {
                          state = Utf8State.Error;
                          break;
                      }
                      if (--pendingContinuationBytes == 0)
                      {
                          state = Utf8State.Utf8;
                      }
                  }
                  else if (b >= 0xc2 && b < 0xf5)
                  {
                      // beginning of byte sequence; must not start inside another sequence
                      if (state == Utf8State.Utf8Sequence)
                      {
                          state = Utf8State.Error;
                          break;
                      }
                      state = Utf8State.Utf8Sequence;
                      if (b < 0xe0)
                      {
                          pendingContinuationBytes = 1; // one more byte following
                      }
                      else if (b < 0xf0)
                      {
                          pendingContinuationBytes = 2; // two more bytes following
                      }
                      else
                      {
                          pendingContinuationBytes = 3; // three more bytes following
                      }
                  }
                  else
                  {
                      // 0xc0, 0xc1, 0xf5 to 0xff are invalid in UTF-8 (see RFC 3629)
                      state = Utf8State.Error;
                      break;
                  }
              }

              if (state == Utf8State.Error)
              {
                  return false;
              }
              // A sequence left open at the real end of the data is truncated, hence invalid.
              // If we merely stopped at the inspection limit, give it the benefit of the doubt.
              if (state == Utf8State.Utf8Sequence && reachedEndOfData)
              {
                  return false;
              }
              return true;
          }
      }

      /// <summary>
      /// Result of <see cref="GuessFileType.DetectFileType"/>.
      /// </summary>
      enum FileType
      {
          /// <summary>Content that was not recognized as text.</summary>
          Binary,
          /// <summary>Text content that could not be read as XML.</summary>
          Text,
          /// <summary>Text content on which an XML reader could advance to the first content node.</summary>
          Xml
      }
  }