You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1655 lines
66 KiB

4 years ago
4 years ago
4 years ago
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Configuration;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6. namespace MarkdownSharp
  7. {
  8. /// <summary>
  9. /// Markdown is a text-to-HTML conversion tool for web writers.
  10. /// Markdown allows you to write using an easy-to-read, easy-to-write plain text format,
  11. /// then convert it to structurally valid XHTML (or HTML).
  12. /// </summary>
  13. internal class Markdown
  14. {
  15. private const string _version = "1.13";
  16. #region Constructors and Options
  /// <summary>
  /// Creates a Markdown converter that keeps every option at its default value.
  /// </summary>
  public Markdown() { }
  #if NETFX || NETCORE
  /// <summary>
  /// Create a new Markdown instance and optionally load options from a settings collection
  /// (for example the appSettings section of a configuration file). Recognized keys are:
  /// Markdown.StrictBoldItalic (true/false)
  /// Markdown.EmptyElementSuffix (">" or " />" without the quotes)
  /// Markdown.LinkEmails (true/false)
  /// Markdown.AutoNewlines (true/false; the spelling Markdown.AutoNewLines is accepted too)
  /// Markdown.AutoHyperlink (true/false)
  /// Markdown.AsteriskIntraWordEmphasis (true/false)
  /// Keys are matched case-sensitively; any other key is ignored.
  /// </summary>
  /// <param name="loadOptionsFromConfigFile">
  /// Settings to read. When null, nothing is read and every option keeps its default.
  /// </param>
  /// <exception cref="FormatException">
  /// A true/false option holds text that Convert.ToBoolean cannot parse.
  /// </exception>
  public Markdown(System.Collections.Specialized.NameValueCollection loadOptionsFromConfigFile = null)
  {
      if (loadOptionsFromConfigFile == null) return;

      var settings = loadOptionsFromConfigFile; // ConfigurationManager.AppSettings;
      foreach (string key in settings.Keys)
      {
          switch (key)
          {
              case "Markdown.AutoHyperlink":
                  AutoHyperlink = Convert.ToBoolean(settings[key]);
                  break;
              // The documented name used a capital L while the code only matched the
              // lowercase form, so a setting written as documented was silently dropped.
              // Both spellings are honored.
              case "Markdown.AutoNewlines":
              case "Markdown.AutoNewLines":
                  AutoNewLines = Convert.ToBoolean(settings[key]);
                  break;
              case "Markdown.EmptyElementSuffix":
                  EmptyElementSuffix = settings[key];
                  break;
              case "Markdown.LinkEmails":
                  LinkEmails = Convert.ToBoolean(settings[key]);
                  break;
              case "Markdown.StrictBoldItalic":
                  StrictBoldItalic = Convert.ToBoolean(settings[key]);
                  break;
              case "Markdown.AsteriskIntraWordEmphasis":
                  AsteriskIntraWordEmphasis = Convert.ToBoolean(settings[key]);
                  break;
          }
      }
  }
  #endif
  /// <summary>
  /// Create a new Markdown instance and set the options from the MarkdownOptions object.
  /// </summary>
  /// <param name="options">
  /// Option values to copy. An empty or null EmptyElementSuffix is ignored so the
  /// default suffix stays in effect.
  /// </param>
  /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
  public Markdown(MarkdownOptions options)
  {
      // Fail with a descriptive exception instead of a NullReferenceException on first access.
      if (options == null)
      {
          throw new ArgumentNullException(nameof(options));
      }

      AutoHyperlink = options.AutoHyperlink;
      AutoNewLines = options.AutoNewlines;
      if (!string.IsNullOrEmpty(options.EmptyElementSuffix))
      {
          EmptyElementSuffix = options.EmptyElementSuffix;
      }
      LinkEmails = options.LinkEmails;
      StrictBoldItalic = options.StrictBoldItalic;
      AsteriskIntraWordEmphasis = options.AsteriskIntraWordEmphasis;
  }
  /// <summary>
  /// Suffix written at the end of empty elements: ">" gives HTML output, " />" gives XHTML output.
  /// </summary>
  public string EmptyElementSuffix { get; set; } = " />";

  /// <summary>
  /// Set to false to make sure email addresses are never auto-linked.
  /// WARNING: this is a significant deviation from the markdown spec
  /// </summary>
  public bool LinkEmails { get; set; } = true;

  /// <summary>
  /// Set to true to require non-word characters on either side of bold and italic markers.
  /// WARNING: this is a significant deviation from the markdown spec
  /// </summary>
  public bool StrictBoldItalic { get; set; }

  /// <summary>
  /// Set to true to allow asterisks for intraword emphasis;
  /// this does nothing if StrictBoldItalic is false
  /// </summary>
  public bool AsteriskIntraWordEmphasis { get; set; }

  /// <summary>
  /// Set to true to turn RETURN into a literal newline.
  /// WARNING: this is a significant deviation from the markdown spec
  /// </summary>
  public bool AutoNewLines { get; set; }

  /// <summary>
  /// Set to true to auto-hyperlink (most) bare plain URLs.
  /// WARNING: this is a significant deviation from the markdown spec
  /// </summary>
  public bool AutoHyperlink { get; set; }
  106. #endregion
  /// <summary>
  /// maximum nested depth of [] and () supported by the transform; implementation detail
  /// </summary>
  private const int _nestDepth = 6;
  /// <summary>
  /// Tabs are automatically converted to spaces as part of the transform
  /// this constant determines how "wide" those tabs become in spaces
  /// </summary>
  private const int _tabWidth = 4;
  // Regex fragment for an unordered-list marker: one of *, + or -.
  private const string _markerUL = "[*+-]";
  // Regex fragment for an ordered-list marker: one or more digits followed by a literal dot.
  private const string _markerOL = @"\d+[.]";
  // The three tables below are filled once by the static constructor.
  // Escapable character -> its hash key.
  private static readonly Dictionary<string, string> _escapeTable;
  // Hash key -> the escapable character it stands for (reverse of _escapeTable).
  private static readonly Dictionary<string, string> _invertedEscapeTable;
  // Backslash followed by an escapable character -> the same hash key as in _escapeTable.
  private static readonly Dictionary<string, string> _backslashEscapeTable;
  // Per-instance working state; all of it is reset by Setup().
  // Lower-cased link id -> URL, written by LinkEvaluator.
  private readonly Dictionary<string, string> _urls = new Dictionary<string, string>();
  // Lower-cased link id -> title, written by LinkEvaluator when a title is present.
  private readonly Dictionary<string, string> _titles = new Dictionary<string, string>();
  // Hash key -> raw HTML block text, written by HtmlEvaluator.
  private readonly Dictionary<string, string> _htmlBlocks = new Dictionary<string, string>();
  // DoLists picks the nested-list regex while this is greater than zero.
  private int _listLevel;
  private const string AutoLinkPreventionMarker = "\x1AP"; // temporarily replaces "://" where auto-linking shouldn't happen
  /// <summary>
  /// The static constructor initializes everything that stays the same across all
  /// transforms: the escape tables and the backslash-escape regex.
  /// </summary>
  static Markdown()
  {
      // Hash values for escaped characters, plus the reverse lookup.
      _escapeTable = new Dictionary<string, string>();
      _invertedEscapeTable = new Dictionary<string, string>();
      // Hash values for backslash-escaped characters.
      _backslashEscapeTable = new Dictionary<string, string>();

      // One regex alternative per backslash-escaped character.
      var alternatives = new List<string>();

      foreach (char escapable in @"\`*_{}[]()>#+-.!/:")
      {
          string literal = escapable.ToString();
          string escaped = @"\" + literal;
          string hash = GetHashKey(literal, isHtmlBlock: false);

          _escapeTable.Add(literal, hash);
          _invertedEscapeTable.Add(hash, literal);
          _backslashEscapeTable.Add(escaped, hash);
          alternatives.Add(Regex.Escape(escaped));
      }

      _backslashEscapes = new Regex(string.Join("|", alternatives.ToArray()), RegexOptions.Compiled);
  }
  /// <summary>
  /// Version string of this MarkdownSharp build;
  /// see http://code.google.com/p/markdownsharp/ for the latest code or to contribute
  /// </summary>
  public string Version => _version;
  /// <summary>
  /// Converts Markdown-formatted text into HTML;
  /// see http://en.wikipedia.org/wiki/Markdown
  /// </summary>
  /// <param name="text">Markdown source; null or empty input yields an empty string.</param>
  /// <returns>The generated HTML followed by a single newline.</returns>
  /// <remarks>
  /// The order of the stages below is essential. Link and image
  /// substitutions need to happen before EscapeSpecialChars(), so that
  /// any *'s or _'s in the a and img tags get encoded.
  /// </remarks>
  public string Transform(string text)
  {
      if (string.IsNullOrEmpty(text))
      {
          return "";
      }

      Setup();

      string normalized = Normalize(text);
      string withHashedHtml = HashHTMLBlocks(normalized);
      string withoutLinkDefs = StripLinkDefinitions(withHashedHtml);
      string blocks = RunBlockGamut(withoutLinkDefs);
      string html = Unescape(blocks);

      Cleanup();

      return html + "\n";
  }
  /// <summary>
  /// Applies the block-level transformations (headers, horizontal rules, lists, code blocks,
  /// block quotes) and then forms paragraphs.
  /// </summary>
  /// <param name="text">text to transform</param>
  /// <param name="unhash">passed through to FormParagraphs</param>
  /// <param name="createParagraphs">passed through to FormParagraphs</param>
  private string RunBlockGamut(string text, bool unhash = true, bool createParagraphs = true)
  {
      string blocks = DoBlockQuotes(DoCodeBlocks(DoLists(DoHorizontalRules(DoHeaders(text)))));

      // HashHTMLBlocks() already ran once in Transform(), to protect raw HTML from the
      // original Markdown source. This second pass protects the markup generated just
      // above, so that no <p> tags get wrapped around block-level tags.
      string hashed = HashHTMLBlocks(blocks);

      return FormParagraphs(hashed, unhash: unhash, createParagraphs: createParagraphs);
  }
  /// <summary>
  /// Applies the transformations that occur *within* block-level tags like paragraphs,
  /// headers, and list items.
  /// </summary>
  private string RunSpanGamut(string text)
  {
      string span = DoCodeSpans(text);
      span = EscapeSpecialCharsWithinTagAttributes(span);
      span = EscapeBackslashes(span);

      // Images go before anchors, because ![foo][f] looks like an anchor.
      span = DoAnchors(DoImages(span));

      // Auto-links go after DoAnchors(), because < and > can be used as
      // delimiters in inline links like [this](<url>).
      span = DoAutoLinks(span);

      // Restore every "://" that was hidden from the auto-linker.
      span = span.Replace(AutoLinkPreventionMarker, "://");

      span = EncodeAmpsAndAngles(span);
      span = DoItalicsAndBold(span);
      return DoHardBreaks(span);
  }
  // One or more newlines at the very start of the text, or at its very end.
  private static readonly Regex _newlinesLeadingTrailing = new Regex(@"^\n+|\n+\z", RegexOptions.Compiled);
  // Two or more consecutive newlines; FormParagraphs splits the text on this.
  private static readonly Regex _newlinesMultiple = new Regex(@"\n{2,}", RegexOptions.Compiled);
  // The (possibly empty) run of spaces at the start of the text; FormParagraphs replaces it with the opening tag.
  private static readonly Regex _leadingWhitespace = new Regex("^[ ]*", RegexOptions.Compiled);
  // An HTML-block placeholder as produced by GetHashKey: \x1A, 'H', digits, 'H'.
  private static readonly Regex _htmlBlockHash = new Regex("\x1AH\\d+H", RegexOptions.Compiled);
  /// <summary>
  /// Splits the text on runs of two or more newlines to form "paragraphs". A paragraph that
  /// contains an HTML-block placeholder gets its placeholders swapped back for the stored
  /// HTML (unless unhashing is turned off); any other paragraph goes through the span gamut
  /// and is wrapped in a p tag (unless paragraph creation is turned off).
  /// </summary>
  /// <param name="text">block-level text to split</param>
  /// <param name="unhash">when false, placeholder paragraphs are left untouched</param>
  /// <param name="createParagraphs">when false, no p tags are added around plain paragraphs</param>
  /// <returns>the processed paragraphs joined by a blank line</returns>
  private string FormParagraphs(string text, bool unhash = true, bool createParagraphs = true)
  {
      string trimmed = _newlinesLeadingTrailing.Replace(text, "");
      string[] grafs = _newlinesMultiple.Split(trimmed);

      string openTag = createParagraphs ? "<p>" : "";
      string closeTag = createParagraphs ? "</p>" : "";

      for (int i = 0; i < grafs.Length; i++)
      {
          if (!grafs[i].Contains("\x1AH"))
          {
              // Plain paragraph: span-level processing, then the leading spaces give way to the opening tag.
              grafs[i] = _leadingWhitespace.Replace(RunSpanGamut(grafs[i]), openTag) + closeTag;
              continue;
          }

          if (!unhash)
          {
              continue;
          }

          // Stored HTML may itself contain placeholders, so repeat until a pass replaces
          // nothing. The pass budget is only a guard against an endless loop.
          bool replacedAny = true;
          for (int passesLeft = 50; replacedAny && passesLeft > 0; passesLeft--)
          {
              replacedAny = false;
              grafs[i] = _htmlBlockHash.Replace(grafs[i], placeholder =>
              {
                  replacedAny = true;
                  return _htmlBlocks[placeholder.Value];
              });
          }
          // If replacedAny is still true here the budget ran out; that would be the place
          // to log a suspected infinite loop together with the input that caused it.
      }

      return string.Join("\n\n", grafs);
  }
  /// <summary>
  /// Resets the per-transform state. Without this, state left over from a previous
  /// article would conflict when one page is generated from more than one article
  /// (e.g. an index page that shows the N most recent articles).
  /// </summary>
  private void Setup()
  {
      _listLevel = 0;
      _htmlBlocks.Clear();
      _titles.Clear();
      _urls.Clear();
  }
  /// <summary>
  /// Releases the per-transform state once a transform is done; same work as Setup().
  /// </summary>
  private void Cleanup() => Setup();
  // Cache for GetNestedBracketsPattern(); built on first use.
  private static string _nestedBracketsPattern;

  /// <summary>
  /// Returns a reusable pattern fragment that matches balanced [brackets], i.e.
  /// [this] and [this[also]] and [this[also[too]]], up to _nestDepth levels.
  /// See Friedl's "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  /// </summary>
  private static string GetNestedBracketsPattern()
  {
      if (_nestedBracketsPattern != null)
      {
          return _nestedBracketsPattern;
      }

      string openings = RepeatString(@"
          (?>              # Atomic matching
          [^\[\]]+         # Anything other than brackets
          |
          \[
          ", _nestDepth);
      string closings = RepeatString(
          @" \]
          )*"
          , _nestDepth);

      _nestedBracketsPattern = openings + closings;
      return _nestedBracketsPattern;
  }
  // Cache for GetNestedParensPattern(); built on first use.
  private static string _nestedParensPattern;

  /// <summary>
  /// Returns a reusable pattern fragment that matches balanced (parens), i.e.
  /// (this) and (this(also)) and (this(also(too))), up to _nestDepth levels.
  /// See Friedl's "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  /// </summary>
  private static string GetNestedParensPattern()
  {
      if (_nestedParensPattern != null)
      {
          return _nestedParensPattern;
      }

      string openings = RepeatString(@"
          (?>              # Atomic matching
          [^()\s]+         # Anything other than parens or whitespace
          |
          \(
          ", _nestDepth);
      string closings = RepeatString(
          @" \)
          )*"
          , _nestDepth);

      _nestedParensPattern = openings + closings;
      return _nestedParensPattern;
  }
  // Matches one link definition of the form  [id]: url "optional title"  at the start of a
  // line, indented by at most _tabWidth - 1 spaces. Groups: 1 = id, 2 = url, 3 = title.
  // The url may be wrapped in < >, the title in double quotes or parentheses, and either
  // may sit on the line after the id. The doubled braces are string.Format escapes.
  private static readonly Regex _linkDef = new Regex(string.Format(@"
      ^[ ]{{0,{0}}}\[([^\[\]]+)\]:  # id = $1
      [ ]*
      \n?                   # maybe *one* newline
      [ ]*
      <?(\S+?)>?            # url = $2
      [ ]*
      \n?                   # maybe one newline
      [ ]*
      (?:
      (?<=\s)               # lookbehind for whitespace
      [""(]
      (.+?)                 # title = $3
      ["")]
      [ ]*
      )?                    # title is optional
      (?:\n+|\Z)", _tabWidth - 1), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Removes link definitions from the text and records their URLs and titles
  /// in the per-instance lookup tables.
  /// </summary>
  /// <remarks>
  /// ^[id]: url "optional title"
  /// </remarks>
  private string StripLinkDefinitions(string text)
  {
      return _linkDef.Replace(text, LinkEvaluator);
  }
  /// <summary>
  /// Match callback for _linkDef: stores the definition's URL (and its title, when one was
  /// given) under the lower-cased link id, and removes the definition from the text.
  /// </summary>
  private string LinkEvaluator(Match match)
  {
      GroupCollection groups = match.Groups;
      string linkID = groups[1].Value.ToLowerInvariant();

      _urls[linkID] = EncodeAmpsAndAngles(groups[2].Value);

      Group titleGroup = groups[3];
      if (titleGroup.Length > 0)
      {
          _titles[linkID] = titleGroup.Value.Replace("\"", "&quot;");
      }

      return "";
  }
  // Matches block-level HTML (see GetBlockPattern); used by HashHTMLBlocks.
  // RegexOptions.Compiled is left off on purpose:
  // compiling this monster regex results in worse performance. trust me.
  private static readonly Regex _blocksHtml = new Regex(GetBlockPattern(), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
  /// <summary>
  /// Builds the regex pattern text that recognizes block-level HTML in the input;
  /// derived pretty much verbatim from PHP Markdown
  /// </summary>
  /// <returns>
  /// The pattern with every $placeholder expanded. It relies on whitespace and
  /// #-comments being ignored, as _blocksHtml arranges via IgnorePatternWhitespace.
  /// </returns>
  private static string GetBlockPattern()
  {
      // Hashify HTML blocks:
      // We only want to do this for block-level HTML tags, such as headers,
      // lists, and tables. That's because we still want to wrap <p>s around
      // "paragraphs" that are wrapped in non-block-level tags, such as anchors,
      // phrase emphasis, and spans. The list of tags we're looking for is
      // hard-coded:
      //
      // * List "a" is made of tags which can be both inline or block-level.
      // These will be treated block-level when the start tag is alone on
      // its line, otherwise they're not matched here and will be taken as
      // inline later.
      // * List "b" is made of tags which are always block-level;
      //
      const string blockTagsA = "ins|del";
      const string blockTagsB = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|script|noscript|form|fieldset|iframe|math";
      // Regular expression for the content of a block tag.
      // (attr itself matches the optional attribute list that follows a tag name)
      const string attr = @"
      (?>                         # optional tag attributes
        \s                        # starts with whitespace
        (?>
          [^>""/]+                # text outside quotes
        |
          /+(?!>)                 # slash not followed by >
        |
          ""[^""]*""              # text inside double quotes (tolerate >)
        |
          '[^']*'                 # text inside single quotes (tolerate >)
        )*
      )?
      ";
      // Tag content with same-name nesting: the opening fragment is repeated
      // _nestDepth times, followed by a lazy ".*?" for the innermost level and the
      // closing fragment repeated _nestDepth times. \2 refers to the tag name
      // captured by the outer pattern below.
      string content = RepeatString(@"
          (?>
            [^<]+                 # content without tag
          |
            <\2                   # nested opening tag
              " + attr + @"       # attributes
            (?>
                />
            |
                >", _nestDepth) + // end of opening tag
                ".*?" + // last level nested tag content
      RepeatString(@"
                </\2\s*>          # closing nested tag
            )
            |
            <(?!/\2\s*>           # other tags with a different name
            )
          )*", _nestDepth);
      // Same content pattern for the list "a" alternative, whose tag name is group 3.
      string content2 = content.Replace(@"\2", @"\3");
      // First, look for nested blocks, e.g.:
      // <div>
      // <div>
      // tags for inner block must be indented.
      // </div>
      // </div>
      //
      // The outermost tags must start at the left margin for this to match, and
      // the inner nested divs must be indented.
      // We need to do this before the next, more liberal match, because the next
      // match will start at the first `<div>` and stop at the first `</div>`.
      string pattern = @"
      (?>
            (?>
              (?<=\n)     # Starting at the beginning of a line
              |           # or
              \A\n?       # the beginning of the doc
            )
            (             # save in $1
              # Match from `\n<tag>` to `</tag>\n`, handling nested tags
              # in between.
                  <($block_tags_b_re)   # start tag = $2
                  $attr>                # attributes followed by > and \n
                  $content              # content, support nesting
                  </\2>                 # the matching end tag
                  [ ]*                  # trailing spaces
                  (?=\n+|\Z)            # followed by a newline or end of document
            |     # Special version for tags of group a.
                  <($block_tags_a_re)   # start tag = $3
                  $attr>[ ]*\n          # attributes followed by >
                  $content2             # content, support nesting
                  </\3>                 # the matching end tag
                  [ ]*                  # trailing spaces
                  (?=\n+|\Z)            # followed by a newline or end of document
            |     # Special case just for <hr />. It was easier to make a special
                  # case than to make the other regex more complicated.
                  [ ]{0,$less_than_tab}
                  <hr
                  $attr                 # attributes
                  /?>                   # the matching end tag
                  [ ]*
                  (?=\n{2,}|\Z)         # followed by a blank line or end of document
            |     # Special case for standalone HTML comments:
                  (?<=\n\n|\A)          # preceded by a blank line or start of document
                  [ ]{0,$less_than_tab}
                  (?s:
                    <!--(?:|(?:[^>-]|-[^>])(?:[^-]|-[^-])*)-->
                  )
                  [ ]*
                  (?=\n{2,}|\Z)         # followed by a blank line or end of document
            |     # PHP and ASP-style processor instructions (<? and <%)
                  [ ]{0,$less_than_tab}
                  (?s:
                    <([?%])             # $4
                    .*?
                    \4>
                  )
                  [ ]*
                  (?=\n{2,}|\Z)         # followed by a blank line or end of document
            )
      )";
      // Expand the placeholders. "$content2" has to be replaced before "$content",
      // because the shorter name is a prefix of the longer one.
      pattern = pattern.Replace("$less_than_tab", (_tabWidth - 1).ToString());
      pattern = pattern.Replace("$block_tags_b_re", blockTagsB);
      pattern = pattern.Replace("$block_tags_a_re", blockTagsA);
      pattern = pattern.Replace("$attr", attr);
      pattern = pattern.Replace("$content2", content2);
      return pattern.Replace("$content", content);
  }
  /// <summary>
  /// Swaps every block-level HTML block in the text for a hash placeholder,
  /// keeping the original HTML in the per-instance block table.
  /// </summary>
  private string HashHTMLBlocks(string text) => _blocksHtml.Replace(text, HtmlEvaluator);
  /// <summary>
  /// Match callback for _blocksHtml: stores the matched HTML block under its hash key
  /// and returns the key surrounded by blank lines.
  /// </summary>
  private string HtmlEvaluator(Match match)
  {
      string html = match.Groups[1].Value;
      string placeholder = GetHashKey(html, isHtmlBlock: true);

      _htmlBlocks[placeholder] = html;

      return "\n\n" + placeholder + "\n\n";
  }
  /// <summary>
  /// Builds the placeholder key for a piece of text: the \x1A control character, a
  /// delimiter letter, the absolute value of the text's hash code, and the delimiter again.
  /// </summary>
  /// <param name="s">text to derive the key from</param>
  /// <param name="isHtmlBlock">true uses the delimiter 'H' (HTML block), false uses 'E' (escaped character)</param>
  /// <returns>the key, e.g. \x1A + "H1234H"</returns>
  private static string GetHashKey(string s, bool isHtmlBlock)
  {
      char delim = isHtmlBlock ? 'H' : 'E';

      // Math.Abs(int) throws OverflowException for int.MinValue, which GetHashCode() is
      // free to return. Widening to long first keeps the key identical for every other
      // hash value and still yields plain digits for int.MinValue.
      long magnitude = Math.Abs((long)s.GetHashCode());

      return "\x1A" + delim + magnitude.ToString() + delim;
  }
  // Matches a single HTML token: a comment, a <? ... ?> processing instruction, or a tag.
  // The tag alternative is assembled with RepeatString so that a tag may contain further
  // tags (for instance inside an attribute value) up to _nestDepth levels deep.
  // ExplicitCapture turns the plain parentheses into non-capturing groups.
  private static readonly Regex _htmlTokens = new Regex(@"
      (<!--(?:|(?:[^>-]|-[^>])(?:[^-]|-[^-])*)-->)|   # match <!-- foo -->
      (<\?.*?\?>)|                                    # match <?foo?> " +
      RepeatString(@"
      (<[A-Za-z\/!$](?:[^<>]|", _nestDepth - 1) + @"
      (<[A-Za-z\/!$](?:[^<>]"
      + RepeatString(")*>)", _nestDepth) +
      " # match <tag> and </tag>",
      RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Splits the input string into a list of HTML tokens. Each token is either a tag
  /// (possibly with nested tags contained therein, such as
  /// &lt;a href="&lt;MTFoo&gt;"&gt;) or a run of text between tags; the token's type
  /// says which, and the token carries the matched text.
  /// </summary>
  /// <param name="text">text to tokenize</param>
  /// <returns>the tokens in document order; empty text runs are not emitted</returns>
  private List<Token> TokenizeHTML(string text)
  {
      var tokens = new List<Token>();
      int cursor = 0;

      // The regex is derived from the _tokenize() subroutine in Brad Choate's MTRegex plugin.
      // http://www.bradchoate.com/past/mtregex.php
      foreach (Match tag in _htmlTokens.Matches(text))
      {
          int gap = tag.Index - cursor;
          if (gap > 0)
          {
              tokens.Add(new Token(TokenType.Text, text.Substring(cursor, gap)));
          }

          tokens.Add(new Token(TokenType.Tag, tag.Value));
          cursor = tag.Index + tag.Length;
      }

      // Whatever follows the last tag is one final text run.
      if (cursor < text.Length)
      {
          tokens.Add(new Token(TokenType.Text, text.Substring(cursor)));
      }

      return tokens;
  }
  // Reference-style link: [link text][id], with an optional space and/or newline between
  // the two bracket pairs. Groups: 1 = whole match, 2 = link text (balanced brackets
  // allowed), 3 = id.
  private static readonly Regex _anchorRef = new Regex(string.Format(@"
      (                               # wrap whole match in $1
          \[
              ({0})                   # link text = $2
          \]
          [ ]?                        # one optional space
          (?:\n[ ]*)?                 # one optional newline followed by spaces
          \[
              (.*?)                   # id = $3
          \]
      )", GetNestedBracketsPattern()), RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  // Inline link: [link text](url "optional title"). Groups: 1 = whole match, 2 = link text,
  // 3 = href (balanced parens allowed), 5 = quote character, 6 = title.
  private static readonly Regex _anchorInline = new Regex(string.Format(@"
      (                               # wrap whole match in $1
          \[
              ({0})                   # link text = $2
          \]
          \(                          # literal paren
              [ ]*
              ({1})                   # href = $3
              [ ]*
              (                       # $4
              (['""])                 # quote char = $5
              (.*?)                   # title = $6
              \5                      # matching quote
              [ ]*                    # ignore any spaces between closing quote and )
              )?                      # title is optional
          \)
      )", GetNestedBracketsPattern(), GetNestedParensPattern()),
      RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  // Shortcut reference link: a lone [link text]. Groups: 1 = whole match, 2 = link text.
  private static readonly Regex _anchorRefShortcut = new Regex(@"
      (                               # wrap whole match in $1
          \[
              ([^\[\]]+)              # link text = $2; can't contain [ or ]
          \]
      )", RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Converts Markdown link shortcuts into HTML anchor tags.
  /// </summary>
  /// <remarks>
  /// [link text](url "title")
  /// [link text][id]
  /// [id]
  /// </remarks>
  private string DoAnchors(string text)
  {
      // No opening bracket means no link of any kind.
      if (!text.Contains("["))
      {
          return text;
      }

      // 1. Reference-style links: [link text] [id]
      string linked = _anchorRef.Replace(text, AnchorRefEvaluator);

      // 2. Inline-style links: [link text](url "optional title") or [link text](url 'optional title')
      linked = _anchorInline.Replace(linked, AnchorInlineEvaluator);

      // 3. Reference-style shortcuts: [link text]
      // These go last in case there is also [link test][1]
      // or [link test](/foo)
      return _anchorRefShortcut.Replace(linked, AnchorRefShortcutEvaluator);
  }
  /// <summary>
  /// Hides every "://" in the string behind AutoLinkPreventionMarker.
  /// </summary>
  private string SaveFromAutoLinking(string s) => s.Replace("://", AutoLinkPreventionMarker);
  /// <summary>
  /// Match callback for _anchorRef: turns [link text][id] into an anchor tag when the id
  /// has a stored URL.
  /// </summary>
  /// <param name="match">match with groups 1 = whole match, 2 = link text, 3 = id</param>
  /// <returns>the anchor tag, or the matched text unchanged when the id is unknown</returns>
  private string AnchorRefEvaluator(Match match)
  {
      string linkText = SaveFromAutoLinking(match.Groups[2].Value);
      string linkID = match.Groups[3].Value.ToLowerInvariant();

      // for shortcut links like [this][].
      if (linkID.Length == 0)
      {
          linkID = linkText.ToLowerInvariant();
      }

      // Single lookup instead of ContainsKey followed by the indexer.
      string url;
      if (!_urls.TryGetValue(linkID, out url))
      {
          // Unknown id: leave the text exactly as it was written.
          return match.Groups[1].Value;
      }

      string result = "<a href=\"" + AttributeSafeUrl(url) + "\"";

      string title;
      if (_titles.TryGetValue(linkID, out title))
      {
          // Encode once, then escape emphasis markers, as AnchorRefShortcutEvaluator and
          // AnchorInlineEvaluator do; the title used to be attribute-encoded twice here.
          title = EscapeBoldItalic(AttributeEncode(title));
          result += " title=\"" + title + "\"";
      }

      return result + ">" + linkText + "</a>";
  }
  /// <summary>
  /// Match callback for _anchorRefShortcut: turns a lone [link text] into an anchor tag
  /// when the text, used as an id, has a stored URL; otherwise returns the match unchanged.
  /// </summary>
  private string AnchorRefShortcutEvaluator(Match match)
  {
      string linkText = SaveFromAutoLinking(match.Groups[2].Value);
      // lower case and remove newlines / extra spaces
      string linkID = Regex.Replace(linkText.ToLowerInvariant(), @"[ ]*\n[ ]*", " ");

      string url;
      if (!_urls.TryGetValue(linkID, out url))
      {
          return match.Groups[1].Value;
      }

      var html = new StringBuilder("<a href=\"");
      html.Append(AttributeSafeUrl(url)).Append('"');

      string title;
      if (_titles.TryGetValue(linkID, out title))
      {
          html.Append(" title=\"")
              .Append(EscapeBoldItalic(AttributeEncode(title)))
              .Append('"');
      }

      return html.Append('>').Append(linkText).Append("</a>").ToString();
  }
  /// <summary>
  /// Match callback for _anchorInline: turns [link text](url "title") into an anchor tag.
  /// </summary>
  private string AnchorInlineEvaluator(Match match)
  {
      GroupCollection groups = match.Groups;
      string linkText = SaveFromAutoLinking(groups[2].Value);
      string url = groups[3].Value;
      string title = groups[6].Value;

      // remove <>'s surrounding URL, if present
      bool bracketed = url.StartsWith("<") && url.EndsWith(">");
      if (bracketed)
      {
          url = url.Substring(1, url.Length - 2);
      }

      string result = "<a href=\"" + AttributeSafeUrl(url) + "\"";

      if (!string.IsNullOrEmpty(title))
      {
          result += " title=\"" + EscapeBoldItalic(AttributeEncode(title)) + "\"";
      }

      return result + ">" + linkText + "</a>";
  }
  // Reference-style image: ![alt text][id], with an optional space and/or newline between
  // the two bracket pairs. Groups: 1 = whole match, 2 = alt text, 3 = id.
  private static readonly Regex _imagesRef = new Regex(@"
      (               # wrap whole match in $1
      !\[
          (.*?)       # alt text = $2
      \]
      [ ]?            # one optional space
      (?:\n[ ]*)?     # one optional newline followed by spaces
      \[
          (.*?)       # id = $3
      \]
      )", RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  // Inline image: ![alt text](url "optional title"). Groups: 1 = whole match, 2 = alt text,
  // 3 = href (balanced parens allowed), 5 = quote character, 6 = title.
  private static readonly Regex _imagesInline = new Regex(string.Format(@"
      (                   # wrap whole match in $1
      !\[
          (.*?)           # alt text = $2
      \]
      \s?                 # one optional whitespace character
      \(                  # literal paren
          [ ]*
          ({0})           # href = $3
          [ ]*
          (               # $4
          (['""])         # quote char = $5
          (.*?)           # title = $6
          \5              # matching quote
          [ ]*
          )?              # title is optional
      \)
      )", GetNestedParensPattern()),
      RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Converts Markdown image shortcuts into HTML img tags.
  /// </summary>
  /// <remarks>
  /// ![alt text][id]
  /// ![alt text](url "optional title")
  /// </remarks>
  private string DoImages(string text)
  {
      // Without "![" there cannot be any image markup.
      if (!text.Contains("!["))
      {
          return text;
      }

      // Reference-style labeled images first: ![alt text][id]
      string withReferenceImages = _imagesRef.Replace(text, ImageReferenceEvaluator);

      // Then inline images: ![alt text](url "optional title")
      // Don't forget: encode * and _
      return _imagesInline.Replace(withReferenceImages, ImageInlineEvaluator);
  }
  // Escapes emphasis markers, square brackets and parentheses in image alt text.
  // This keeps us from producing horribly broken HTML when some syntax ambiguities
  // collide. The result likely still isn't what the user meant, but at least it is
  // not garbage.
  private string EscapeImageAltText(string s)
  {
      string escaped = EscapeBoldItalic(s);
      return Regex.Replace(escaped, @"[\[\]()]", bracket => _escapeTable[bracket.Value]);
  }
  /// <summary>
  /// Match callback for _imagesRef: turns ![alt text][id] into an img tag when the id has
  /// a stored URL; otherwise returns the match unchanged.
  /// </summary>
  private string ImageReferenceEvaluator(Match match)
  {
      string altText = match.Groups[2].Value;
      string linkID = match.Groups[3].Value.ToLowerInvariant();

      // for shortcut links like ![this][].
      if (linkID.Length == 0)
      {
          linkID = altText.ToLowerInvariant();
      }

      string url;
      if (!_urls.TryGetValue(linkID, out url))
      {
          // If there's no such link ID, leave intact:
          return match.Groups[1].Value;
      }

      // title stays null when no title was stored for this id.
      string title;
      _titles.TryGetValue(linkID, out title);

      return ImageTag(url, altText, title);
  }
  /// <summary>
  /// Match callback for _imagesInline: turns ![alt text](url "title") into an img tag.
  /// </summary>
  private string ImageInlineEvaluator(Match match)
  {
      GroupCollection groups = match.Groups;
      string url = groups[3].Value;

      // Remove <>'s surrounding URL, if present
      bool bracketed = url.StartsWith("<") && url.EndsWith(">");
      if (bracketed)
      {
          url = url.Substring(1, url.Length - 2);
      }

      return ImageTag(url, groups[2].Value, groups[6].Value);
  }
  /// <summary>
  /// Builds an img tag from a URL, alt text and an optional title; the tag is closed
  /// with EmptyElementSuffix.
  /// </summary>
  /// <param name="url">image address</param>
  /// <param name="altText">alternative text</param>
  /// <param name="title">title text; null or empty omits the title attribute</param>
  private string ImageTag(string url, string altText, string title)
  {
      string safeAlt = EscapeImageAltText(AttributeEncode(altText));
      string safeUrl = AttributeSafeUrl(url);

      var tag = new StringBuilder();
      tag.Append("<img src=\"").Append(safeUrl).Append("\" alt=\"").Append(safeAlt).Append('"');

      if (!string.IsNullOrEmpty(title))
      {
          tag.Append(" title=\"").Append(AttributeEncode(EscapeBoldItalic(title))).Append('"');
      }

      return tag.Append(EmptyElementSuffix).ToString();
  }
  // Setext-style header: a line of text followed by a line made of ='s or -'s.
  // Group 1 is the header text and group 2 the underline (SetextHeaderEvaluator reads
  // them that way); the "$1" note inside the pattern on the underline group is stale.
  private static readonly Regex _headerSetext = new Regex(@"
      ^(.+?)
      [ ]*
      \n
      (=+|-+)     # $1 = string of ='s or -'s
      [ ]*
      \n+",
      RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  // Atx-style header: one to six #'s, the header text, and optional closing #'s.
  // Group 1 is the run of #'s and group 2 the header text.
  private static readonly Regex _headerAtx = new Regex(@"
      ^(\#{1,6})  # $1 = string of #'s
      [ ]*
      (.+?)       # $2 = Header text
      [ ]*
      \#*         # optional closing #'s (not counted)
      \n+",
      RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Converts Markdown headers into HTML header tags.
  /// </summary>
  /// <remarks>
  /// <para>
  /// Header 1
  /// ========
  /// </para>
  /// <para>
  /// Header 2
  /// --------
  /// </para>
  /// <para>
  /// # Header 1
  /// ## Header 2
  /// ## Header 2 with closing hashes ##
  /// ...
  /// ###### Header 6
  /// </para>
  /// </remarks>
  private string DoHeaders(string text)
  {
      // Underlined (setext) headers first, then #-prefixed (atx) headers.
      string withSetextHeaders = _headerSetext.Replace(text, SetextHeaderEvaluator);
      return _headerAtx.Replace(withSetextHeaders, AtxHeaderEvaluator);
  }
  /// <summary>
  /// Match callback for _headerSetext: an underline starting with '=' gives an h1,
  /// any other underline an h2.
  /// </summary>
  private string SetextHeaderEvaluator(Match match)
  {
      string header = match.Groups[1].Value;
      string underline = match.Groups[2].Value;

      int level;
      if (underline.StartsWith("="))
      {
          level = 1;
      }
      else
      {
          level = 2;
      }

      return "<h" + level + ">" + RunSpanGamut(header) + "</h" + level + ">\n\n";
  }
  /// <summary>
  /// Match callback for _headerAtx: the number of leading #'s is the header level.
  /// </summary>
  private string AtxHeaderEvaluator(Match match)
  {
      int level = match.Groups[1].Value.Length;
      string content = RunSpanGamut(match.Groups[2].Value);

      return "<h" + level + ">" + content + "</h" + level + ">\n\n";
  }
  // A horizontal-rule line: up to three leading spaces, then the same marker character
  // (-, * or _) at least three times in total, each repeat optionally preceded by up to
  // two spaces, and nothing but spaces after that.
  private static readonly Regex _horizontalRules = new Regex(@"
      ^[ ]{0,3}         # Leading space
      ([-*_])           # $1: First marker
      (?>               # Repeated marker group
          [ ]{0,2}      # Zero, one, or two spaces.
          \1            # Marker character
      ){2,}             # Group repeated at least twice
      [ ]*              # Trailing spaces
      $                 # End of line.
      ", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  828. /// <summary>
  829. /// Turn Markdown horizontal rules into HTML hr tags
  830. /// </summary>
  831. /// <remarks>
  832. /// ***
  833. /// * * *
  834. /// ---
  835. /// - - -
  836. /// </remarks>
  837. private string DoHorizontalRules(string text)
  838. {
  839. return _horizontalRules.Replace(text, "<hr" + EmptyElementSuffix + "\n");
  840. }
  841. private static readonly string _wholeList = string.Format(@"
  842. ( # $1 = whole list
  843. ( # $2
  844. [ ]{{0,{1}}}
  845. ({0}) # $3 = first list item marker
  846. [ ]+
  847. )
  848. (?s:.+?)
  849. ( # $4
  850. \z
  851. |
  852. \n{{2,}}
  853. (?=\S)
  854. (?! # Negative lookahead for another list item marker
  855. [ ]*
  856. {0}[ ]+
  857. )
  858. )
  859. )", string.Format("(?:{0}|{1})", _markerUL, _markerOL), _tabWidth - 1);
  860. private static readonly Regex _listNested = new Regex("^" + _wholeList,
  861. RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  862. private static readonly Regex _listTopLevel = new Regex(@"(?:(?<=\n\n)|\A\n?)" + _wholeList,
  863. RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  864. /// <summary>
  865. /// Turn Markdown lists into HTML ul and ol and li tags
  866. /// </summary>
  867. private string DoLists(string text)
  868. {
  869. // We use a different prefix before nested lists than top-level lists.
  870. // See extended comment in _ProcessListItems().
  871. if (_listLevel > 0)
  872. {
  873. return _listNested.Replace(text, new MatchEvaluator(ListEvaluator));
  874. }
  875. else
  876. {
  877. return _listTopLevel.Replace(text, new MatchEvaluator(ListEvaluator));
  878. }
  879. }
  880. private string ListEvaluator(Match match)
  881. {
  882. string list = match.Groups[1].Value;
  883. string marker = match.Groups[3].Value;
  884. string listType = Regex.IsMatch(marker, _markerUL) ? "ul" : "ol";
  885. string result;
  886. string start = "";
  887. if (listType == "ol")
  888. {
  889. int.TryParse(marker.Substring(0, marker.Length - 1), out int firstNumber);
  890. if (firstNumber != 1 && firstNumber != 0)
  891. start = " start=\"" + firstNumber + "\"";
  892. }
  893. result = ProcessListItems(list, listType == "ul" ? _markerUL : _markerOL);
  894. return string.Format("<{0}{1}>\n{2}</{0}>\n", listType, start, result);
  895. }
  896. /// <summary>
  897. /// Process the contents of a single ordered or unordered list, splitting it
  898. /// into individual list items.
  899. /// </summary>
  900. private string ProcessListItems(string list, string marker)
  901. {
  902. // The listLevel global keeps track of when we're inside a list.
  903. // Each time we enter a list, we increment it; when we leave a list,
  904. // we decrement. If it's zero, we're not in a list anymore.
  905. // We do this because when we're not inside a list, we want to treat
  906. // something like this:
  907. // I recommend upgrading to version
  908. // 8. Oops, now this line is treated
  909. // as a sub-list.
  910. // As a single paragraph, despite the fact that the second line starts
  911. // with a digit-period-space sequence.
  912. // Whereas when we're inside a list (or sub-list), that line will be
  913. // treated as the start of a sub-list. What a kludge, huh? This is
  914. // an aspect of Markdown's syntax that's hard to parse perfectly
  915. // without resorting to mind-reading. Perhaps the solution is to
  916. // change the syntax rules such that sub-lists must start with a
  917. // starting cardinal number; e.g. "1." or "a.".
  918. _listLevel++;
  919. // Trim trailing blank lines:
  920. list = Regex.Replace(list, @"\n{2,}\z", "\n");
  921. string pattern = string.Format(
  922. @"(^[ ]*) # leading whitespace = $1
  923. ({0}) [ ]+ # list marker = $2
  924. ((?s:.+?) # list item text = $3
  925. (\n+))
  926. (?= (\z | \1 ({0}) [ ]+))", marker);
  927. bool lastItemHadADoubleNewline = false;
  928. // has to be a closure, so subsequent invocations can share the bool
  929. string ListItemEvaluator(Match match)
  930. {
  931. string item = match.Groups[3].Value;
  932. bool endsWithDoubleNewline = item.EndsWith("\n\n");
  933. bool containsDoubleNewline = endsWithDoubleNewline || item.Contains("\n\n");
  934. var loose = containsDoubleNewline || lastItemHadADoubleNewline;
  935. // we could correct any bad indentation here..
  936. item = RunBlockGamut(Outdent(item) + "\n", unhash: false, createParagraphs: loose);
  937. lastItemHadADoubleNewline = endsWithDoubleNewline;
  938. return string.Format("<li>{0}</li>\n", item);
  939. }
  940. list = Regex.Replace(list, pattern, new MatchEvaluator(ListItemEvaluator),
  941. RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
  942. _listLevel--;
  943. return list;
  944. }
  945. private static readonly Regex _codeBlock = new Regex(string.Format(@"
  946. (?:\n\n|\A\n?)
  947. ( # $1 = the code block -- one or more lines, starting with a space
  948. (?:
  949. (?:[ ]{{{0}}}) # Lines must start with a tab-width of spaces
  950. .*\n+
  951. )+
  952. )
  953. ((?=^[ ]{{0,{0}}}[^ \t\n])|\Z) # Lookahead for non-space at line-start, or end of doc",
  954. _tabWidth), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  955. /// <summary>
  956. /// /// Turn Markdown 4-space indented code into HTML pre code blocks
  957. /// </summary>
  958. private string DoCodeBlocks(string text)
  959. {
  960. return _codeBlock.Replace(text, new MatchEvaluator(CodeBlockEvaluator));
  961. }
  962. private string CodeBlockEvaluator(Match match)
  963. {
  964. string codeBlock = match.Groups[1].Value;
  965. codeBlock = EncodeCode(Outdent(codeBlock));
  966. codeBlock = _newlinesLeadingTrailing.Replace(codeBlock, "");
  967. return string.Concat("\n\n<pre><code>", codeBlock, "\n</code></pre>\n\n");
  968. }
  969. private static readonly Regex _codeSpan = new Regex(@"
  970. (?<![\\`]) # Character before opening ` can't be a backslash or backtick
  971. (`+) # $1 = Opening run of `
  972. (?!`) # and no more backticks -- match the full run
  973. (.+?) # $2 = The code block
  974. (?<!`)
  975. \1
  976. (?!`)", RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
  977. /// <summary>
  978. /// Turn Markdown `code spans` into HTML code tags
  979. /// </summary>
  980. private string DoCodeSpans(string text)
  981. {
  982. // * You can use multiple backticks as the delimiters if you want to
  983. // include literal backticks in the code span. So, this input:
  984. //
  985. // Just type ``foo `bar` baz`` at the prompt.
  986. //
  987. // Will translate to:
  988. //
  989. // <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
  990. //
  991. // There's no arbitrary limit to the number of backticks you
  992. // can use as delimters. If you need three consecutive backticks
  993. // in your code, use four for delimiters, etc.
  994. //
  995. // * You can use spaces to get literal backticks at the edges:
  996. //
  997. // ... type `` `bar` `` ...
  998. //
  999. // Turns to:
  1000. //
  1001. // ... type <code>`bar`</code> ...
  1002. //
  1003. return _codeSpan.Replace(text, new MatchEvaluator(CodeSpanEvaluator));
  1004. }
  1005. private string CodeSpanEvaluator(Match match)
  1006. {
  1007. string span = match.Groups[2].Value;
  1008. span = Regex.Replace(span, "^[ ]*", ""); // leading whitespace
  1009. span = Regex.Replace(span, "[ ]*$", ""); // trailing whitespace
  1010. span = EncodeCode(span);
  1011. span = SaveFromAutoLinking(span); // to prevent auto-linking. Not necessary in code *blocks*, but in code spans.
  1012. return string.Concat("<code>", span, "</code>");
  1013. }
  1014. private static readonly Regex _bold = new Regex(@"(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1",
  1015. RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
  1016. private static readonly Regex _semiStrictBold = new Regex(@"(?=.[*_]|[*_])(^|(?=\W__|(?!\*)[\W_]\*\*|\w\*\*\w).)(\*\*|__)(?!\2)(?=\S)((?:|.*?(?!\2).)(?=\S_|\w|\S\*\*(?:[\W_]|$)).)(?=__(?:\W|$)|\*\*(?:[^*]|$))\2",
  1017. RegexOptions.Singleline | RegexOptions.Compiled);
  1018. private static readonly Regex _strictBold = new Regex(@"(^|[\W_])(?:(?!\1)|(?=^))(\*|_)\2(?=\S)(.*?\S)\2\2(?!\2)(?=[\W_]|$)",
  1019. RegexOptions.Singleline | RegexOptions.Compiled);
  1020. private static readonly Regex _italic = new Regex(@"(\*|_) (?=\S) (.+?) (?<=\S) \1",
  1021. RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
  1022. private static readonly Regex _semiStrictItalic = new Regex(@"(?=.[*_]|[*_])(^|(?=\W_|(?!\*)(?:[\W_]\*|\D\*(?=\w)\D)).)(\*|_)(?!\2\2\2)(?=\S)((?:(?!\2).)*?(?=[^\s_]_|(?=\w)\D\*\D|[^\s*]\*(?:[\W_]|$)).)(?=_(?:\W|$)|\*(?:[^*]|$))\2",
  1023. RegexOptions.Singleline | RegexOptions.Compiled);
  1024. private static readonly Regex _strictItalic = new Regex(@"(^|[\W_])(?:(?!\1)|(?=^))(\*|_)(?=\S)((?:(?!\2).)*?\S)\2(?!\2)(?=[\W_]|$)",
  1025. RegexOptions.Singleline | RegexOptions.Compiled);
  1026. /// <summary>
  1027. /// Turn Markdown *italics* and **bold** into HTML strong and em tags
  1028. /// </summary>
  1029. private string DoItalicsAndBold(string text)
  1030. {
  1031. if (!(text.Contains("*") || text.Contains("_")))
  1032. return text;
  1033. // <strong> must go first, then <em>
  1034. if (StrictBoldItalic)
  1035. {
  1036. if (AsteriskIntraWordEmphasis)
  1037. {
  1038. text = _semiStrictBold.Replace(text, "$1<strong>$3</strong>");
  1039. text = _semiStrictItalic.Replace(text, "$1<em>$3</em>");
  1040. }
  1041. else
  1042. {
  1043. text = _strictBold.Replace(text, "$1<strong>$3</strong>");
  1044. text = _strictItalic.Replace(text, "$1<em>$3</em>");
  1045. }
  1046. }
  1047. else
  1048. {
  1049. text = _bold.Replace(text, "<strong>$2</strong>");
  1050. text = _italic.Replace(text, "<em>$2</em>");
  1051. }
  1052. return text;
  1053. }
  1054. /// <summary>
  1055. /// Turn markdown line breaks (two space at end of line) into HTML break tags
  1056. /// </summary>
  1057. private string DoHardBreaks(string text)
  1058. {
  1059. if (AutoNewLines)
  1060. {
  1061. return Regex.Replace(text, @"\n", string.Format("<br{0}\n", EmptyElementSuffix));
  1062. }
  1063. else
  1064. {
  1065. return Regex.Replace(text, @" {2,}\n", string.Format("<br{0}\n", EmptyElementSuffix));
  1066. }
  1067. }
  1068. private static readonly Regex _blockquote = new Regex(@"
  1069. ( # Wrap whole match in $1
  1070. (
  1071. ^[ ]*>[ ]? # '>' at the start of a line
  1072. .+\n # rest of the first line
  1073. (.+\n)* # subsequent consecutive lines
  1074. \n* # blanks
  1075. )+
  1076. )", RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.Compiled);
  1077. /// <summary>
  1078. /// Turn Markdown > quoted blocks into HTML blockquote blocks
  1079. /// </summary>
  1080. private string DoBlockQuotes(string text)
  1081. {
  1082. return _blockquote.Replace(text, new MatchEvaluator(BlockQuoteEvaluator));
  1083. }
  1084. private string BlockQuoteEvaluator(Match match)
  1085. {
  1086. string bq = match.Groups[1].Value;
  1087. bq = Regex.Replace(bq, "^[ ]*>[ ]?", "", RegexOptions.Multiline); // trim one level of quoting
  1088. bq = Regex.Replace(bq, "^[ ]+$", "", RegexOptions.Multiline); // trim whitespace-only lines
  1089. bq = RunBlockGamut(bq); // recurse
  1090. bq = Regex.Replace(bq, "^", " ", RegexOptions.Multiline);
  1091. // These leading spaces screw with <pre> content, so we need to fix that:
  1092. bq = Regex.Replace(bq, @"(\s*<pre>.+?</pre>)", new MatchEvaluator(BlockQuoteEvaluator2), RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);
  1093. bq = string.Format("<blockquote>\n{0}\n</blockquote>", bq);
  1094. string key = GetHashKey(bq, isHtmlBlock: true);
  1095. _htmlBlocks[key] = bq;
  1096. return "\n\n" + key + "\n\n";
  1097. }
  1098. private string BlockQuoteEvaluator2(Match match)
  1099. {
  1100. return Regex.Replace(match.Groups[1].Value, "^ ", "", RegexOptions.Multiline);
  1101. }
  1102. private const string _charInsideUrl = @"[-A-Z0-9+&@#/%?=~_|\[\]\(\)!:,\.;" + "\x1a]";
  1103. private const string _charEndingUrl = "[-A-Z0-9+&@#/%=~_|\\[\\])]";
  1104. private static readonly Regex _autolinkBare = new Regex(@"(<|="")?\b(https?|ftp)(://" + _charInsideUrl + "*" + _charEndingUrl + ")(?=$|\\W)",
  1105. RegexOptions.IgnoreCase | RegexOptions.Compiled);
  1106. private static readonly Regex _endCharRegex = new Regex(_charEndingUrl, RegexOptions.IgnoreCase | RegexOptions.Compiled);
  1107. private static string HandleTrailingParens(Match match)
  1108. {
  1109. // The first group is essentially a negative lookbehind -- if there's a < or a =", we don't touch this.
  1110. // We're not using a *real* lookbehind, because of links with in links, like <a href="http://web.archive.org/web/20121130000728/http://www.google.com/">
  1111. // With a real lookbehind, the full link would never be matched, and thus the http://www.google.com *would* be matched.
  1112. // With the simulated lookbehind, the full link *is* matched (just not handled, because of this early return), causing
  1113. // the google link to not be matched again.
  1114. if (match.Groups[1].Success)
  1115. return match.Value;
  1116. var protocol = match.Groups[2].Value;
  1117. var link = match.Groups[3].Value;
  1118. if (!link.EndsWith(")"))
  1119. return "<" + protocol + link + ">";
  1120. var level = 0;
  1121. foreach (Match c in Regex.Matches(link, "[()]"))
  1122. {
  1123. if (c.Value == "(")
  1124. {
  1125. if (level <= 0)
  1126. level = 1;
  1127. else
  1128. level++;
  1129. }
  1130. else
  1131. {
  1132. level--;
  1133. }
  1134. }
  1135. var tail = "";
  1136. if (level < 0)
  1137. {
  1138. link = Regex.Replace(link, @"\){1," + (-level) + "}$", m => { tail = m.Value; return ""; });
  1139. }
  1140. if (tail.Length > 0)
  1141. {
  1142. var lastChar = link[link.Length - 1];
  1143. if (!_endCharRegex.IsMatch(lastChar.ToString()))
  1144. {
  1145. tail = lastChar + tail;
  1146. link = link.Substring(0, link.Length - 1);
  1147. }
  1148. }
  1149. return "<" + protocol + link + ">" + tail;
  1150. }
  1151. private static readonly Regex _autoEmailBare = new Regex(@"(<|="")?(?:mailto:)?([-.\w]+\@[-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
  1152. private static string EmailBareLinkEvaluator(Match match)
  1153. {
  1154. // We matched an opening <, so it's already enclosed
  1155. if (match.Groups[1].Success)
  1156. {
  1157. return match.Value;
  1158. }
  1159. return "<" + match.Value + ">";
  1160. }
  1161. private readonly static Regex _linkEmail = new Regex(@"<
  1162. (?:mailto:)?
  1163. (
  1164. [-.\w]+
  1165. \@
  1166. [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
  1167. )
  1168. >", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);
  1169. /// <summary>
  1170. /// Turn angle-delimited URLs into HTML anchor tags
  1171. /// </summary>
  1172. /// <remarks>
  1173. /// &lt;http://www.example.com&gt;
  1174. /// </remarks>
  1175. private string DoAutoLinks(string text)
  1176. {
  1177. if (AutoHyperlink)
  1178. {
  1179. // fixup arbitrary URLs by adding Markdown < > so they get linked as well
  1180. // note that at this point, all other URL in the text are already hyperlinked as <a href=""></a>
  1181. // *except* for the <http://www.foo.com> case
  1182. text = _autolinkBare.Replace(text, HandleTrailingParens);
  1183. }
  1184. // Hyperlinks: <http://foo.com>
  1185. text = Regex.Replace(text, "<((https?|ftp):[^'\">\\s]+)>", new MatchEvaluator(HyperlinkEvaluator));
  1186. if (LinkEmails)
  1187. {
  1188. // Email addresses: <address@domain.foo> or <mailto:address@domain.foo>
  1189. // Also allow "address@domain.foo" and "mailto:address@domain.foo", without the <>
  1190. //text = _autoEmailBare.Replace(text, EmailBareLinkEvaluator);
  1191. text = _linkEmail.Replace(text, new MatchEvaluator(EmailEvaluator));
  1192. }
  1193. return text;
  1194. }
  1195. private string HyperlinkEvaluator(Match match)
  1196. {
  1197. string link = match.Groups[1].Value;
  1198. string url = AttributeSafeUrl(link);
  1199. return string.Format("<a href=\"{0}\">{1}</a>", url, link);
  1200. }
  1201. private string EmailEvaluator(Match match)
  1202. {
  1203. string email = Unescape(match.Groups[1].Value);
  1204. //
  1205. // Input: an email address, e.g. "foo@example.com"
  1206. //
  1207. // Output: the email address as a mailto link, with each character
  1208. // of the address encoded as either a decimal or hex entity, in
  1209. // the hopes of foiling most address harvesting spam bots. E.g.:
  1210. //
  1211. // <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
  1212. // x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
  1213. // &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
  1214. //
  1215. // Based by a filter by Matthew Wickline, posted to the BBEdit-Talk
  1216. // mailing list: <http://tinyurl.com/yu7ue>
  1217. //
  1218. email = "mailto:" + email;
  1219. // leave ':' alone (to spot mailto: later)
  1220. email = EncodeEmailAddress(email);
  1221. email = string.Format("<a href=\"{0}\">{0}</a>", email);
  1222. // strip the mailto: from the visible part
  1223. return Regex.Replace(email, "\">.+?:", "\">");
  1224. }
  1225. private static readonly Regex _outDent = new Regex("^[ ]{1," + _tabWidth + "}", RegexOptions.Multiline | RegexOptions.Compiled);
  1226. /// <summary>
  1227. /// Remove one level of line-leading spaces
  1228. /// </summary>
  1229. private string Outdent(string block)
  1230. {
  1231. return _outDent.Replace(block, "");
  1232. }
  1233. #region Encoding and Normalization
  1234. /// <summary>
  1235. /// encodes email address randomly
  1236. /// roughly 10% raw, 45% hex, 45% dec
  1237. /// note that @ is always encoded and : never is
  1238. /// </summary>
  1239. private string EncodeEmailAddress(string addr)
  1240. {
  1241. var sb = new StringBuilder(addr.Length * 5);
  1242. var rand = new Random();
  1243. int r;
  1244. foreach (char c in addr)
  1245. {
  1246. r = rand.Next(1, 100);
  1247. if ((r > 90 || c == ':') && c != '@')
  1248. sb.Append(c); // m
  1249. else if (r < 45)
  1250. sb.AppendFormat("&#x{0:x};", (int)c); // &#x6D
  1251. else
  1252. sb.AppendFormat("&#{0};", (int)c); // &#109
  1253. }
  1254. return sb.ToString();
  1255. }
  1256. private static readonly Regex _codeEncoder = new Regex(@"&|<|>|\\|\*|_|\{|\}|\[|\]", RegexOptions.Compiled);
  1257. /// <summary>
  1258. /// Encode/escape certain Markdown characters inside code blocks and spans where they are literals
  1259. /// </summary>
  1260. private string EncodeCode(string code)
  1261. {
  1262. return _codeEncoder.Replace(code, EncodeCodeEvaluator);
  1263. }
  1264. private string EncodeCodeEvaluator(Match match)
  1265. {
  1266. switch (match.Value)
  1267. {
  1268. // Encode all ampersands; HTML entities are not
  1269. // entities within a Markdown code span.
  1270. case "&":
  1271. return "&amp;";
  1272. // Do the angle bracket song and dance
  1273. case "<":
  1274. return "&lt;";
  1275. case ">":
  1276. return "&gt;";
  1277. // escape characters that are magic in Markdown
  1278. default:
  1279. return _escapeTable[match.Value];
  1280. }
  1281. }
  1282. private static readonly Regex _amps = new Regex("&(?!((#[0-9]+)|(#[xX][a-fA-F0-9]+)|([a-zA-Z][a-zA-Z0-9]*));)", RegexOptions.ExplicitCapture | RegexOptions.Compiled);
  1283. private static readonly Regex _angles = new Regex(@"<(?![A-Za-z/?\$!])", RegexOptions.ExplicitCapture | RegexOptions.Compiled);
  1284. /// <summary>
  1285. /// Encode any ampersands (that aren't part of an HTML entity) and left or right angle brackets
  1286. /// </summary>
  1287. private string EncodeAmpsAndAngles(string s)
  1288. {
  1289. s = _amps.Replace(s, "&amp;");
  1290. return _angles.Replace(s, "&lt;");
  1291. }
  1292. private static readonly Regex _backslashEscapes;
  1293. /// <summary>
  1294. /// Encodes any escaped characters such as \`, \*, \[ etc
  1295. /// </summary>
  1296. private string EscapeBackslashes(string s) => _backslashEscapes.Replace(s, new MatchEvaluator(EscapeBackslashesEvaluator));
  1297. private string EscapeBackslashesEvaluator(Match match) => _backslashEscapeTable[match.Value];
  1298. // note: this space MATTERS - do not remove (hex / unicode) \|/
  1299. #pragma warning disable RCS1190 // Join string expressions.
  1300. private static readonly Regex _unescapes = new Regex("\x1A" + "E\\d+E", RegexOptions.Compiled);
  1301. #pragma warning restore RCS1190 // Join string expressions.
  1302. /// <summary>
  1303. /// swap back in all the special characters we've hidden
  1304. /// </summary>
  1305. private string Unescape(string s) => _unescapes.Replace(s, new MatchEvaluator(UnescapeEvaluator));
  1306. private string UnescapeEvaluator(Match match) => _invertedEscapeTable[match.Value];
  1307. /// <summary>
  1308. /// escapes Bold [ * ] and Italic [ _ ] characters
  1309. /// </summary>
  1310. private string EscapeBoldItalic(string s)
  1311. {
  1312. s = s.Replace("*", _escapeTable["*"]);
  1313. return s.Replace("_", _escapeTable["_"]);
  1314. }
  1315. private static string AttributeEncode(string s)
  1316. {
  1317. return s.Replace(">", "&gt;").Replace("<", "&lt;").Replace("\"", "&quot;").Replace("'", "&#39;");
  1318. }
  1319. private static string AttributeSafeUrl(string s)
  1320. {
  1321. s = AttributeEncode(s);
  1322. foreach (var c in "*_:()[]")
  1323. s = s.Replace(c.ToString(), _escapeTable[c.ToString()]);
  1324. return s;
  1325. }
  1326. /// <summary>
  1327. /// Within tags -- meaning between &lt; and &gt; -- encode [\ ` * _] so they
  1328. /// don't conflict with their use in Markdown for code, italics and strong.
  1329. /// We're replacing each such character with its corresponding hash
  1330. /// value; this is likely overkill, but it should prevent us from colliding
  1331. /// with the escape values by accident.
  1332. /// </summary>
  1333. private string EscapeSpecialCharsWithinTagAttributes(string text)
  1334. {
  1335. var tokens = TokenizeHTML(text);
  1336. // now, rebuild text from the tokens
  1337. var sb = new StringBuilder(text.Length);
  1338. foreach (var token in tokens)
  1339. {
  1340. string value = token.Value;
  1341. if (token.Type == TokenType.Tag)
  1342. {
  1343. value = value.Replace(@"\", _escapeTable[@"\"]);
  1344. if (AutoHyperlink && value.StartsWith("<!")) // escape slashes in comments to prevent autolinking there -- https://meta.stackexchange.com/questions/95987/html-comment-containing-url-breaks-if-followed-by-another-html-comment
  1345. value = value.Replace("/", _escapeTable["/"]);
  1346. value = Regex.Replace(value, "(?<=.)</?code>(?=.)", _escapeTable["`"]);
  1347. value = EscapeBoldItalic(value);
  1348. }
  1349. sb.Append(value);
  1350. }
  1351. return sb.ToString();
  1352. }
  1353. /// <summary>
  1354. /// convert all tabs to _tabWidth spaces;
  1355. /// standardizes line endings from DOS (CR LF) or Mac (CR) to UNIX (LF);
  1356. /// makes sure text ends with a couple of newlines;
  1357. /// removes any blank lines (only spaces) in the text
  1358. /// </summary>
  1359. private string Normalize(string text)
  1360. {
  1361. var output = new StringBuilder(text.Length);
  1362. var line = new StringBuilder();
  1363. bool valid = false;
  1364. for (int i = 0; i < text.Length; i++)
  1365. {
  1366. switch (text[i])
  1367. {
  1368. case '\n':
  1369. if (valid) output.Append(line);
  1370. output.Append('\n');
  1371. line.Length = 0; valid = false;
  1372. break;
  1373. case '\r':
  1374. if ((i < text.Length - 1) && (text[i + 1] != '\n'))
  1375. {
  1376. if (valid) output.Append(line);
  1377. output.Append('\n');
  1378. line.Length = 0; valid = false;
  1379. }
  1380. break;
  1381. case '\t':
  1382. int width = (_tabWidth - (line.Length % _tabWidth));
  1383. for (int k = 0; k < width; k++)
  1384. line.Append(' ');
  1385. break;
  1386. case '\x1A':
  1387. break;
  1388. default:
  1389. if (!valid && text[i] != ' ') valid = true;
  1390. line.Append(text[i]);
  1391. break;
  1392. }
  1393. }
  1394. if (valid) output.Append(line);
  1395. output.Append('\n');
  1396. // add two newlines to the end before return
  1397. return output.Append("\n\n").ToString();
  1398. }
  1399. #endregion
  1400. /// <summary>
  1401. /// this is to emulate what's evailable in PHP
  1402. /// </summary>
  1403. private static string RepeatString(string text, int count)
  1404. {
  1405. var sb = new StringBuilder(text.Length * count);
  1406. for (int i = 0; i < count; i++)
  1407. sb.Append(text);
  1408. return sb.ToString();
  1409. }
  1410. }
  1411. }