You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1608 lines
64 KiB

3 years ago
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Configuration;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6. namespace MarkdownSharp
  7. {
  8. /// <summary>
  9. /// Markdown is a text-to-HTML conversion tool for web writers.
  10. /// Markdown allows you to write using an easy-to-read, easy-to-write plain text format,
  11. /// then convert it to structurally valid XHTML (or HTML).
  12. /// </summary>
  13. internal class Markdown
  14. {
  15. private const string _version = "1.13";
  16. #region Constructors and Options
  17. /// <summary>
  18. /// Create a new Markdown instance using default options
  19. /// </summary>
  20. public Markdown() { }
  /// <summary>
  /// Create a new Markdown instance and set the options from the MarkdownOptions object.
  /// </summary>
  /// <param name="options">Option values to copy onto this instance; must not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
  public Markdown(MarkdownOptions options)
  {
      // Fail with a descriptive exception instead of a NullReferenceException on the first property read.
      if (options == null)
          throw new ArgumentNullException(nameof(options));

      AutoHyperlink = options.AutoHyperlink;
      AutoNewLines = options.AutoNewlines;

      // An unset (null or empty) suffix keeps this instance's default value.
      if (!string.IsNullOrEmpty(options.EmptyElementSuffix))
          EmptyElementSuffix = options.EmptyElementSuffix;

      LinkEmails = options.LinkEmails;
      StrictBoldItalic = options.StrictBoldItalic;
      AsteriskIntraWordEmphasis = options.AsteriskIntraWordEmphasis;
  }
  34. /// <summary>
  35. /// use ">" for HTML output, or " />" for XHTML output
  36. /// </summary>
  37. public string EmptyElementSuffix { get; set; } = " />";
  38. /// <summary>
  39. /// when false, email addresses will never be auto-linked
  40. /// WARNING: this is a significant deviation from the markdown spec
  41. /// </summary>
  42. public bool LinkEmails { get; set; } = true;
  43. /// <summary>
  44. /// when true, bold and italic require non-word characters on either side
  45. /// WARNING: this is a significant deviation from the markdown spec
  46. /// </summary>
  47. public bool StrictBoldItalic { get; set; } = false;
  48. /// <summary>
  49. /// when true, asterisks may be used for intraword emphasis
  50. /// this does nothing if StrictBoldItalic is false
  51. /// </summary>
  52. public bool AsteriskIntraWordEmphasis { get; set; } = false;
  53. /// <summary>
  54. /// when true, RETURN becomes a literal newline
  55. /// WARNING: this is a significant deviation from the markdown spec
  56. /// </summary>
  57. public bool AutoNewLines { get; set; } = false;
  58. /// <summary>
  59. /// when true, (most) bare plain URLs are auto-hyperlinked
  60. /// WARNING: this is a significant deviation from the markdown spec
  61. /// </summary>
  62. public bool AutoHyperlink { get; set; } = false;
  63. #endregion
  64. /// <summary>
  65. /// maximum nested depth of [] and () supported by the transform; implementation detail
  66. /// </summary>
  67. private const int _nestDepth = 6;
  68. /// <summary>
  69. /// Tabs are automatically converted to spaces as part of the transform
  70. /// this constant determines how "wide" those tabs become in spaces
  71. /// </summary>
  72. private const int _tabWidth = 4;
  73. private const string _markerUL = "[*+-]";
  74. private const string _markerOL = @"\d+[.]";
  75. private static readonly Dictionary<string, string> _escapeTable;
  76. private static readonly Dictionary<string, string> _invertedEscapeTable;
  77. private static readonly Dictionary<string, string> _backslashEscapeTable;
  78. private readonly Dictionary<string, string> _urls = new Dictionary<string, string>();
  79. private readonly Dictionary<string, string> _titles = new Dictionary<string, string>();
  80. private readonly Dictionary<string, string> _htmlBlocks = new Dictionary<string, string>();
  81. private int _listLevel;
  82. private const string AutoLinkPreventionMarker = "\x1AP"; // temporarily replaces "://" where auto-linking shouldn't happen
  /// <summary>
  /// The static constructor builds the lookup tables and the backslash-escape regex
  /// that stay the same across all transforms.
  /// </summary>
  static Markdown()
  {
      // character -> placeholder, and the reverse mapping
      _escapeTable = new Dictionary<string, string>();
      _invertedEscapeTable = new Dictionary<string, string>();
      // backslash + character -> placeholder
      _backslashEscapeTable = new Dictionary<string, string>();

      // one regex alternative per backslash-escaped character
      var alternatives = new List<string>();
      foreach (char special in @"\`*_{}[]()>#+-.!/:")
      {
          string literal = special.ToString();
          string placeholder = GetHashKey(literal, isHtmlBlock: false);
          string escapedForm = @"\" + literal;

          _escapeTable.Add(literal, placeholder);
          _invertedEscapeTable.Add(placeholder, literal);
          _backslashEscapeTable.Add(escapedForm, placeholder);
          alternatives.Add(Regex.Escape(escapedForm));
      }

      _backslashEscapes = new Regex(string.Join("|", alternatives), RegexOptions.Compiled);
  }
  /// <summary>
  /// The MarkdownSharp version string reported by this class;
  /// see http://code.google.com/p/markdownsharp/ for the latest code or to contribute
  /// </summary>
  public string Version => _version;
  /// <summary>
  /// Transforms the provided Markdown-formatted text to HTML;
  /// see http://en.wikipedia.org/wiki/Markdown
  /// </summary>
  /// <param name="text">Markdown source; may be null or empty.</param>
  /// <returns>
  /// An empty string when <paramref name="text"/> is null or empty; otherwise the
  /// converted text followed by a single trailing newline.
  /// </returns>
  /// <remarks>
  /// The order in which other subs are called here is
  /// essential. Link and image substitutions need to happen before
  /// EscapeSpecialChars(), so that any *'s or _'s in the a
  /// and img tags get encoded.
  /// </remarks>
  public string Transform(string text)
  {
      if (string.IsNullOrEmpty(text)) return "";
      // reset the per-document dictionaries and list level before starting
      Setup();
      text = Normalize(text);
      // raw block-level HTML is swapped for placeholders before any Markdown processing
      text = HashHTMLBlocks(text);
      text = StripLinkDefinitions(text);
      text = RunBlockGamut(text);
      text = Unescape(text);
      // Cleanup() calls Setup() again, so no per-document state outlives the call
      Cleanup();
      return text + "\n";
  }
  /// <summary>
  /// Perform transformations that form block-level tags like paragraphs, headers, and list items.
  /// </summary>
  /// <param name="text">Text to process.</param>
  /// <param name="unhash">Passed through to FormParagraphs.</param>
  /// <param name="createParagraphs">Passed through to FormParagraphs.</param>
  /// <returns>The result of FormParagraphs applied to the block-processed text.</returns>
  private string RunBlockGamut(string text, bool unhash = true, bool createParagraphs = true)
  {
      text = DoHeaders(text);
      text = DoHorizontalRules(text);
      text = DoLists(text);
      text = DoCodeBlocks(text);
      text = DoBlockQuotes(text);
      // We already ran HashHTMLBlocks() before, in Transform(), but that
      // was to escape raw HTML in the original Markdown source. This time,
      // we're escaping the markup we've just created, so that we don't wrap
      // <p> tags around block-level tags.
      text = HashHTMLBlocks(text);
      return FormParagraphs(text, unhash: unhash, createParagraphs: createParagraphs);
  }
  /// <summary>
  /// Perform transformations that occur *within* block-level tags like paragraphs, headers, and list items.
  /// </summary>
  /// <param name="text">Contents of a single block.</param>
  /// <returns>The text after all span-level steps below have run, in this order.</returns>
  private string RunSpanGamut(string text)
  {
      text = DoCodeSpans(text);
      text = EscapeSpecialCharsWithinTagAttributes(text);
      text = EscapeBackslashes(text);
      // Images must come first, because ![foo][f] looks like an anchor.
      text = DoImages(text);
      text = DoAnchors(text);
      // Must come after DoAnchors(), because you can use < and >
      // delimiters in inline links like [this](<url>).
      text = DoAutoLinks(text);
      // auto-linking is done: turn the prevention markers left by the anchor evaluators back into "://"
      text = text.Replace(AutoLinkPreventionMarker, "://");
      text = EncodeAmpsAndAngles(text);
      text = DoItalicsAndBold(text);
      return DoHardBreaks(text);
  }
  171. private static readonly Regex _newlinesLeadingTrailing = new Regex(@"^\n+|\n+\z", RegexOptions.Compiled);
  172. private static readonly Regex _newlinesMultiple = new Regex(@"\n{2,}", RegexOptions.Compiled);
  173. private static readonly Regex _leadingWhitespace = new Regex("^[ ]*", RegexOptions.Compiled);
  174. private static readonly Regex _htmlBlockHash = new Regex("\x1AH\\d+H", RegexOptions.Compiled);
  /// <summary>
  /// splits on two or more newlines, to form "paragraphs";
  /// each paragraph is then unhashed (if it is a hash and unhashing isn't turned off) or wrapped in HTML p tag
  /// </summary>
  /// <param name="text">Block-processed text; leading and trailing newlines are stripped before splitting.</param>
  /// <param name="unhash">When true, HTML block placeholders are replaced with the stored blocks.</param>
  /// <param name="createParagraphs">When true, non-placeholder paragraphs are wrapped in p tags.</param>
  /// <returns>The processed paragraphs joined by blank lines.</returns>
  private string FormParagraphs(string text, bool unhash = true, bool createParagraphs = true)
  {
      // split on two or more newlines
      string[] grafs = _newlinesMultiple.Split(_newlinesLeadingTrailing.Replace(text, ""));
      for (int i = 0; i < grafs.Length; i++)
      {
          if (grafs[i].Contains("\x1AH"))
          {
              // unhashify HTML blocks
              if (unhash)
              {
                  int sanityCheck = 50; // just for safety, guard against an infinite loop
                  bool keepGoing = true; // as long as replacements were made, keep going
                  while (keepGoing && sanityCheck > 0)
                  {
                      keepGoing = false;
                      grafs[i] = _htmlBlockHash.Replace(grafs[i], match =>
                      {
                          // A placeholder-shaped token that was never registered is left
                          // untouched instead of throwing KeyNotFoundException; it does not
                          // count as a replacement, so the loop can still terminate.
                          if (!_htmlBlocks.TryGetValue(match.Value, out string block))
                              return match.Value;
                          keepGoing = true;
                          return block;
                      });
                      sanityCheck--;
                  }
                  // If keepGoing is still true here the 50-pass limit was hit;
                  // nothing is logged for that case.
              }
          }
          else
          {
              // do span level processing inside the block, then wrap result in <p> tags
              grafs[i] = _leadingWhitespace.Replace(RunSpanGamut(grafs[i]), createParagraphs ? "<p>" : "") + (createParagraphs ? "</p>" : "");
          }
      }
      return string.Join("\n\n", grafs);
  }
  private void Setup()
  {
      // Reset all per-document state. Without this, link definitions and HTML
      // blocks from one article would leak into the next when a single instance
      // renders several articles (e.g. an index page showing the N most recent ones).
      _listLevel = 0;
      _htmlBlocks.Clear();
      _titles.Clear();
      _urls.Clear();
  }
  // Cleaning up after a transform is the same reset that runs before one.
  private void Cleanup() => Setup();
  233. private static string _nestedBracketsPattern;
  234. /// <summary>
  235. /// Reusable pattern to match balanced [brackets]. See Friedl's
  236. /// "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  237. /// </summary>
  238. private static string GetNestedBracketsPattern()
  239. {
  240. // in other words [this] and [this[also]] and [this[also[too]]]
  241. // up to _nestDepth
  242. if (_nestedBracketsPattern == null)
  243. {
  244. _nestedBracketsPattern =
  245. RepeatString(@"
  246. (?> # Atomic matching
  247. [^\[\]]+ # Anything other than brackets
  248. |
  249. \[
  250. ", _nestDepth) + RepeatString(
  251. @" \]
  252. )*"
  253. , _nestDepth);
  254. }
  255. return _nestedBracketsPattern;
  256. }
  257. private static string _nestedParensPattern;
  258. /// <summary>
  259. /// Reusable pattern to match balanced (parens). See Friedl's
  260. /// "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  261. /// </summary>
  262. private static string GetNestedParensPattern()
  263. {
  264. // in other words (this) and (this(also)) and (this(also(too)))
  265. // up to _nestDepth
  266. if (_nestedParensPattern == null)
  267. {
  268. _nestedParensPattern =
  269. RepeatString(@"
  270. (?> # Atomic matching
  271. [^()\s]+ # Anything other than parens or whitespace
  272. |
  273. \(
  274. ", _nestDepth) + RepeatString(
  275. @" \)
  276. )*"
  277. , _nestDepth);
  278. }
  279. return _nestedParensPattern;
  280. }
  281. private static readonly Regex _linkDef = new Regex(string.Format(@"
  282. ^[ ]{{0,{0}}}\[([^\[\]]+)\]: # id = $1
  283. [ ]*
  284. \n? # maybe *one* newline
  285. [ ]*
  286. <?(\S+?)>? # url = $2
  287. [ ]*
  288. \n? # maybe one newline
  289. [ ]*
  290. (?:
  291. (?<=\s) # lookbehind for whitespace
  292. [""(]
  293. (.+?) # title = $3
  294. ["")]
  295. [ ]*
  296. )? # title is optional
  297. (?:\n+|\Z)", _tabWidth - 1), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Removes link definitions from the text; LinkEvaluator records each definition's
  /// URL and title as it is removed.
  /// </summary>
  /// <remarks>
  /// ^[id]: url "optional title"
  /// </remarks>
  private string StripLinkDefinitions(string text)
  {
      return _linkDef.Replace(text, LinkEvaluator);
  }
  // Records one link definition (id -> url, and id -> title when a title was
  // captured) and removes the definition from the text by returning "".
  private string LinkEvaluator(Match match)
  {
      string id = match.Groups[1].Value.ToLowerInvariant();
      _urls[id] = EncodeAmpsAndAngles(match.Groups[2].Value);

      Group titleGroup = match.Groups[3];
      if (titleGroup.Length > 0)
      {
          _titles[id] = titleGroup.Value.Replace("\"", "&quot;");
      }

      return "";
  }
  313. // compiling this monster regex results in worse performance. trust me.
  314. private static readonly Regex _blocksHtml = new Regex(GetBlockPattern(), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
  315. /// <summary>
  316. /// derived pretty much verbatim from PHP Markdown
  317. /// </summary>
  318. private static string GetBlockPattern()
  319. {
  320. // Hashify HTML blocks:
  321. // We only want to do this for block-level HTML tags, such as headers,
  322. // lists, and tables. That's because we still want to wrap <p>s around
  323. // "paragraphs" that are wrapped in non-block-level tags, such as anchors,
  324. // phrase emphasis, and spans. The list of tags we're looking for is
  325. // hard-coded:
  326. //
  327. // * List "a" is made of tags which can be both inline or block-level.
  328. // These will be treated block-level when the start tag is alone on
  329. // its line, otherwise they're not matched here and will be taken as
  330. // inline later.
  331. // * List "b" is made of tags which are always block-level;
  332. //
  333. const string blockTagsA = "ins|del";
  334. const string blockTagsB = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|script|noscript|form|fieldset|iframe|math";
  335. // Regular expression for the content of a block tag.
  336. const string attr = @"
  337. (?> # optional tag attributes
  338. \s # starts with whitespace
  339. (?>
  340. [^>""/]+ # text outside quotes
  341. |
  342. /+(?!>) # slash not followed by >
  343. |
  344. ""[^""]*"" # text inside double quotes (tolerate >)
  345. |
  346. '[^']*' # text inside single quotes (tolerate >)
  347. )*
  348. )?
  349. ";
  350. string content = RepeatString(@"
  351. (?>
  352. [^<]+ # content without tag
  353. |
  354. <\2 # nested opening tag
  355. " + attr + @" # attributes
  356. (?>
  357. />
  358. |
  359. >", _nestDepth) + // end of opening tag
  360. ".*?" + // last level nested tag content
  361. RepeatString(@"
  362. </\2\s*> # closing nested tag
  363. )
  364. |
  365. <(?!/\2\s*> # other tags with a different name
  366. )
  367. )*", _nestDepth);
  368. string content2 = content.Replace(@"\2", @"\3");
  369. // First, look for nested blocks, e.g.:
  370. // <div>
  371. // <div>
  372. // tags for inner block must be indented.
  373. // </div>
  374. // </div>
  375. //
  376. // The outermost tags must start at the left margin for this to match, and
  377. // the inner nested divs must be indented.
  378. // We need to do this before the next, more liberal match, because the next
  379. // match will start at the first `<div>` and stop at the first `</div>`.
  380. string pattern = @"
  381. (?>
  382. (?>
  383. (?<=\n) # Starting at the beginning of a line
  384. | # or
  385. \A\n? # the beginning of the doc
  386. )
  387. ( # save in $1
  388. # Match from `\n<tag>` to `</tag>\n`, handling nested tags
  389. # in between.
  390. <($block_tags_b_re) # start tag = $2
  391. $attr> # attributes followed by > and \n
  392. $content # content, support nesting
  393. </\2> # the matching end tag
  394. [ ]* # trailing spaces
  395. (?=\n+|\Z) # followed by a newline or end of document
  396. | # Special version for tags of group a.
  397. <($block_tags_a_re) # start tag = $3
  398. $attr>[ ]*\n # attributes followed by >
  399. $content2 # content, support nesting
  400. </\3> # the matching end tag
  401. [ ]* # trailing spaces
  402. (?=\n+|\Z) # followed by a newline or end of document
  403. | # Special case just for <hr />. It was easier to make a special
  404. # case than to make the other regex more complicated.
  405. [ ]{0,$less_than_tab}
  406. <hr
  407. $attr # attributes
  408. /?> # the matching end tag
  409. [ ]*
  410. (?=\n{2,}|\Z) # followed by a blank line or end of document
  411. | # Special case for standalone HTML comments:
  412. (?<=\n\n|\A) # preceded by a blank line or start of document
  413. [ ]{0,$less_than_tab}
  414. (?s:
  415. <!--(?:|(?:[^>-]|-[^>])(?:[^-]|-[^-])*)-->
  416. )
  417. [ ]*
  418. (?=\n{2,}|\Z) # followed by a blank line or end of document
  419. | # PHP and ASP-style processor instructions (<? and <%)
  420. [ ]{0,$less_than_tab}
  421. (?s:
  422. <([?%]) # $4
  423. .*?
  424. \4>
  425. )
  426. [ ]*
  427. (?=\n{2,}|\Z) # followed by a blank line or end of document
  428. )
  429. )";
  430. pattern = pattern.Replace("$less_than_tab", (_tabWidth - 1).ToString());
  431. pattern = pattern.Replace("$block_tags_b_re", blockTagsB);
  432. pattern = pattern.Replace("$block_tags_a_re", blockTagsA);
  433. pattern = pattern.Replace("$attr", attr);
  434. pattern = pattern.Replace("$content2", content2);
  435. return pattern.Replace("$content", content);
  436. }
  /// <summary>
  /// swaps every block-level HTML block matched by _blocksHtml for a placeholder key
  /// </summary>
  private string HashHTMLBlocks(string text) => _blocksHtml.Replace(text, HtmlEvaluator);
  // Stores the matched HTML block under a placeholder key and returns that key
  // surrounded by blank lines.
  private string HtmlEvaluator(Match match)
  {
      string block = match.Groups[1].Value;
      string placeholder = GetHashKey(block, isHtmlBlock: true);
      _htmlBlocks[placeholder] = block;
      return "\n\n" + placeholder + "\n\n";
  }
  /// <summary>
  /// Builds the placeholder key for a string: the \x1A control character, a delimiter
  /// letter, the absolute value of the string's hash code, and the delimiter again.
  /// </summary>
  /// <param name="s">Text to derive the key from; must not be null.</param>
  /// <param name="isHtmlBlock">true selects the 'H' delimiter, false selects 'E'.</param>
  /// <returns>A key of the form \x1A + delimiter + digits + delimiter.</returns>
  private static string GetHashKey(string s, bool isHtmlBlock)
  {
      var delim = isHtmlBlock ? 'H' : 'E';
      // Widen to long before taking the absolute value: Math.Abs(int.MinValue)
      // throws OverflowException. Every other hash code yields the same digits as before.
      long magnitude = Math.Abs((long)s.GetHashCode());
      return "\x1A" + delim + magnitude.ToString() + delim;
  }
  456. private static readonly Regex _htmlTokens = new Regex(@"
  457. (<!--(?:|(?:[^>-]|-[^>])(?:[^-]|-[^-])*)-->)| # match <!-- foo -->
  458. (<\?.*?\?>)| # match <?foo?> " +
  459. RepeatString(@"
  460. (<[A-Za-z\/!$](?:[^<>]|", _nestDepth - 1) + @"
  461. (<[A-Za-z\/!$](?:[^<>]"
  462. + RepeatString(")*>)", _nestDepth) +
  463. " # match <tag> and </tag>",
  464. RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Splits the input into a list of tokens. Each token is either a tag (possibly
  /// with nested tags contained therein, such as &lt;a href="&lt;MTFoo&gt;"&gt;) or a
  /// run of text between tags; the token's type says which, and its value holds the
  /// matched text.
  /// </summary>
  private List<Token> TokenizeHTML(string text)
  {
      var tokens = new List<Token>();
      int cursor = 0; // index just past the last tag consumed

      // this regex is derived from the _tokenize() subroutine in Brad Choate's MTRegex plugin.
      // http://www.bradchoate.com/past/mtregex.php
      foreach (Match tag in _htmlTokens.Matches(text))
      {
          // text between the previous tag and this one, if any
          if (tag.Index > cursor)
          {
              tokens.Add(new Token(TokenType.Text, text.Substring(cursor, tag.Index - cursor)));
          }
          tokens.Add(new Token(TokenType.Tag, tag.Value));
          cursor = tag.Index + tag.Length;
      }

      // trailing text after the last tag
      if (cursor < text.Length)
      {
          tokens.Add(new Token(TokenType.Text, text.Substring(cursor)));
      }
      return tokens;
  }
  491. private static readonly Regex _anchorRef = new Regex(string.Format(@"
  492. ( # wrap whole match in $1
  493. \[
  494. ({0}) # link text = $2
  495. \]
  496. [ ]? # one optional space
  497. (?:\n[ ]*)? # one optional newline followed by spaces
  498. \[
  499. (.*?) # id = $3
  500. \]
  501. )", GetNestedBracketsPattern()), RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  502. private static readonly Regex _anchorInline = new Regex(string.Format(@"
  503. ( # wrap whole match in $1
  504. \[
  505. ({0}) # link text = $2
  506. \]
  507. \( # literal paren
  508. [ ]*
  509. ({1}) # href = $3
  510. [ ]*
  511. ( # $4
  512. (['""]) # quote char = $5
  513. (.*?) # title = $6
  514. \5 # matching quote
  515. [ ]* # ignore any spaces between closing quote and )
  516. )? # title is optional
  517. \)
  518. )", GetNestedBracketsPattern(), GetNestedParensPattern()),
  519. RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  520. private static readonly Regex _anchorRefShortcut = new Regex(@"
  521. ( # wrap whole match in $1
  522. \[
  523. ([^\[\]]+) # link text = $2; can't contain [ or ]
  524. \]
  525. )", RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Turn Markdown link shortcuts into HTML anchor tags
  /// </summary>
  /// <remarks>
  /// [link text](url "title")
  /// [link text][id]
  /// [id]
  /// </remarks>
  private string DoAnchors(string text)
  {
      if (text.Contains("["))
      {
          // 1. reference-style links: [link text] [id]
          text = _anchorRef.Replace(text, AnchorRefEvaluator);
          // 2. inline-style links: [link text](url "optional title")
          text = _anchorInline.Replace(text, AnchorInlineEvaluator);
          // 3. reference-style shortcuts: [link text]
          //    These run last so they cannot swallow [link test][1] or [link test](/foo).
          text = _anchorRefShortcut.Replace(text, AnchorRefShortcutEvaluator);
      }
      return text;
  }
  // Replaces every "://" with the auto-link prevention marker.
  private string SaveFromAutoLinking(string s) => s.Replace("://", AutoLinkPreventionMarker);
  // Builds the anchor tag for a reference-style link: [link text][id] or [link text][].
  // Returns the matched text unchanged when no definition exists for the id.
  private string AnchorRefEvaluator(Match match)
  {
      string wholeMatch = match.Groups[1].Value;
      string rawLinkText = match.Groups[2].Value;
      string linkText = SaveFromAutoLinking(rawLinkText);
      string linkID = match.Groups[3].Value.ToLowerInvariant();

      // for shortcut links like [this][].
      // The implicit id comes from the raw link text: LinkEvaluator stores definitions
      // under their raw lower-cased id, so an id containing "://" would never be found
      // if it were looked up after "://" had been replaced by the prevention marker.
      if (linkID.Length == 0)
          linkID = rawLinkText.ToLowerInvariant();

      if (!_urls.TryGetValue(linkID, out string url))
          return wholeMatch; // no such definition: leave the text intact

      string result = "<a href=\"" + AttributeSafeUrl(url) + "\"";

      if (_titles.TryGetValue(linkID, out string storedTitle))
      {
          // NOTE(review): AnchorRefShortcutEvaluator calls AttributeEncode only once;
          // confirm whether this second call is intentional. Both calls are kept as found.
          string title = AttributeEncode(storedTitle);
          title = AttributeEncode(EscapeBoldItalic(title));
          result += " title=\"" + title + "\"";
      }

      result += ">" + linkText + "</a>";
      return result;
  }
  // Builds the anchor tag for a reference-style shortcut link: [link text].
  // Returns the matched text unchanged when no definition exists for the id.
  private string AnchorRefShortcutEvaluator(Match match)
  {
      string wholeMatch = match.Groups[1].Value;
      string rawLinkText = match.Groups[2].Value;
      string linkText = SaveFromAutoLinking(rawLinkText);

      // lower case and remove newlines / extra spaces.
      // The id comes from the raw link text: LinkEvaluator stores definitions under
      // their raw lower-cased id, so an id containing "://" would never be found if it
      // were looked up after "://" had been replaced by the prevention marker.
      string linkID = Regex.Replace(rawLinkText.ToLowerInvariant(), @"[ ]*\n[ ]*", " ");

      if (!_urls.TryGetValue(linkID, out string url))
          return wholeMatch; // no such definition: leave the text intact

      string result = "<a href=\"" + AttributeSafeUrl(url) + "\"";

      if (_titles.TryGetValue(linkID, out string storedTitle))
      {
          string title = AttributeEncode(storedTitle);
          title = EscapeBoldItalic(title);
          result += " title=\"" + title + "\"";
      }

      result += ">" + linkText + "</a>";
      return result;
  }
  // Builds the anchor tag for an inline link: [link text](url "optional title").
  private string AnchorInlineEvaluator(Match match)
  {
      string linkText = SaveFromAutoLinking(match.Groups[2].Value);
      string url = match.Groups[3].Value;
      string title = match.Groups[6].Value;
      string result;

      // Ordinal comparison: the delimiters are literal characters, so the check must
      // not depend on the current culture's collation rules.
      if (url.StartsWith("<", StringComparison.Ordinal) && url.EndsWith(">", StringComparison.Ordinal))
          url = url.Substring(1, url.Length - 2); // remove <>'s surrounding URL, if present

      url = AttributeSafeUrl(url);
      result = string.Format("<a href=\"{0}\"", url);

      // the title group is optional; its Value is empty when it did not participate
      if (!string.IsNullOrEmpty(title))
      {
          title = AttributeEncode(title);
          title = EscapeBoldItalic(title);
          result += string.Format(" title=\"{0}\"", title);
      }

      result += string.Format(">{0}</a>", linkText);
      return result;
  }
  623. private static readonly Regex _imagesRef = new Regex(@"
  624. ( # wrap whole match in $1
  625. !\[
  626. (.*?) # alt text = $2
  627. \]
  628. [ ]? # one optional space
  629. (?:\n[ ]*)? # one optional newline followed by spaces
  630. \[
  631. (.*?) # id = $3
  632. \]
  633. )", RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  634. private static readonly Regex _imagesInline = new Regex(string.Format(@"
  635. ( # wrap whole match in $1
  636. !\[
  637. (.*?) # alt text = $2
  638. \]
  639. \s? # one optional whitespace character
  640. \( # literal paren
  641. [ ]*
  642. ({0}) # href = $3
  643. [ ]*
  644. ( # $4
  645. (['""]) # quote char = $5
  646. (.*?) # title = $6
  647. \5 # matching quote
  648. [ ]*
  649. )? # title is optional
  650. \)
  651. )", GetNestedParensPattern()),
  652. RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Turn Markdown image shortcuts into HTML img tags.
  /// </summary>
  /// <remarks>
  /// ![alt text][id]
  /// ![alt text](url "optional title")
  /// </remarks>
  private string DoImages(string text)
  {
      if (text.Contains("!["))
      {
          // reference-style labeled images first: ![alt text][id]
          text = _imagesRef.Replace(text, ImageReferenceEvaluator);
          // then inline images: ![alt text](url "optional title")
          // Don't forget: encode * and _
          text = _imagesInline.Replace(text, ImageInlineEvaluator);
      }
      return text;
  }
  // Guards against horribly broken HTML when some syntax ambiguities collide:
  // after EscapeBoldItalic, every [ ] ( ) in the alt text is replaced by its entry
  // in the escape table. The result may still not be what the user meant, but at
  // least we're not outputting garbage.
  private string EscapeImageAltText(string s)
  {
      string withoutEmphasis = EscapeBoldItalic(s);
      return Regex.Replace(withoutEmphasis, @"[\[\]()]", bracket => _escapeTable[bracket.Value]);
  }
  // Builds the img tag for a reference-style image: ![alt text][id] or ![alt text][].
  private string ImageReferenceEvaluator(Match match)
  {
      string altText = match.Groups[2].Value;
      string linkID = match.Groups[3].Value.ToLowerInvariant();

      // for shortcut links like ![this][].
      if (linkID.Length == 0)
      {
          linkID = altText.ToLowerInvariant();
      }

      // If there's no such link ID, leave intact:
      if (!_urls.TryGetValue(linkID, out string url))
      {
          return match.Groups[1].Value;
      }

      // title stays null when the definition has none
      _titles.TryGetValue(linkID, out string title);
      return ImageTag(url, altText, title);
  }
  // Builds the img tag for an inline image: ![alt text](url "optional title").
  private string ImageInlineEvaluator(Match match)
  {
      string alt = match.Groups[2].Value;
      string url = match.Groups[3].Value;
      string title = match.Groups[6].Value;

      // Ordinal comparison: the delimiters are literal characters, so the check must
      // not depend on the current culture's collation rules.
      if (url.StartsWith("<", StringComparison.Ordinal) && url.EndsWith(">", StringComparison.Ordinal))
          url = url.Substring(1, url.Length - 2); // Remove <>'s surrounding URL, if present

      return ImageTag(url, alt, title);
  }
  // Assembles an img element from the url, alt text and optional title, closing it
  // with EmptyElementSuffix. The title attribute is omitted when title is null or empty.
  private string ImageTag(string url, string altText, string title)
  {
      string safeAlt = EscapeImageAltText(AttributeEncode(altText));
      string safeUrl = AttributeSafeUrl(url);

      var tag = new StringBuilder();
      tag.Append("<img src=\"").Append(safeUrl).Append("\" alt=\"").Append(safeAlt).Append("\"");

      if (!string.IsNullOrEmpty(title))
      {
          string safeTitle = AttributeEncode(EscapeBoldItalic(title));
          tag.Append(" title=\"").Append(safeTitle).Append("\"");
      }

      tag.Append(EmptyElementSuffix);
      return tag.ToString();
  }
  722. private static readonly Regex _headerSetext = new Regex(@"
  723. ^(.+?)
  724. [ ]*
  725. \n
  726. (=+|-+) # $1 = string of ='s or -'s
  727. [ ]*
  728. \n+",
  729. RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  730. private static readonly Regex _headerAtx = new Regex(@"
  731. ^(\#{1,6}) # $1 = string of #'s
  732. [ ]*
  733. (.+?) # $2 = Header text
  734. [ ]*
  735. \#* # optional closing #'s (not counted)
  736. \n+",
  737. RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  738. /// <summary>
  739. /// Turn Markdown headers into HTML header tags
  740. /// </summary>
  741. /// <remarks>
  742. /// <para>
  743. /// Header 1
  744. /// ========
  745. /// </para>
  746. /// <para>
  747. /// Header 2
  748. /// --------
  749. /// </para>
  750. /// <para>
  751. /// # Header 1
  752. /// ## Header 2
  753. /// ## Header 2 with closing hashes ##
  754. /// ...
  755. /// ###### Header 6
  756. /// </para>
  757. /// </remarks>
  private string DoHeaders(string text)
  {
      // underlined (setext) headers first, then #-prefixed (atx) headers
      string afterSetext = _headerSetext.Replace(text, SetextHeaderEvaluator);
      return _headerAtx.Replace(afterSetext, AtxHeaderEvaluator);
  }
  // Underlined header: an underline starting with '=' gives h1, anything else h2.
  private string SetextHeaderEvaluator(Match match)
  {
      string header = match.Groups[1].Value;
      int level;
      if (match.Groups[2].Value.StartsWith("="))
      {
          level = 1;
      }
      else
      {
          level = 2;
      }
      string inner = RunSpanGamut(header);
      return $"<h{level}>{inner}</h{level}>\n\n";
  }
  // #-prefixed header: the number of leading #'s is the header level.
  private string AtxHeaderEvaluator(Match match)
  {
      string header = match.Groups[2].Value;
      int level = match.Groups[1].Value.Length;
      string inner = RunSpanGamut(header);
      return $"<h{level}>{inner}</h{level}>\n\n";
  }
  775. private static readonly Regex _horizontalRules = new Regex(@"
  776. ^[ ]{0,3} # Leading space
  777. ([-*_]) # $1: First marker
  778. (?> # Repeated marker group
  779. [ ]{0,2} # Zero, one, or two spaces.
  780. \1 # Marker character
  781. ){2,} # Group repeated at least twice
  782. [ ]* # Trailing spaces
  783. $ # End of line.
  784. ", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Replace Markdown horizontal rules with an hr tag closed by EmptyElementSuffix
  /// </summary>
  /// <remarks>
  /// ***
  /// * * *
  /// ---
  /// - - -
  /// </remarks>
  private string DoHorizontalRules(string text)
  {
      string rule = string.Concat("<hr", EmptyElementSuffix, "\n");
      return _horizontalRules.Replace(text, rule);
  }
  798. private static readonly string _wholeList = string.Format(@"
  799. ( # $1 = whole list
  800. ( # $2
  801. [ ]{{0,{1}}}
  802. ({0}) # $3 = first list item marker
  803. [ ]+
  804. )
  805. (?s:.+?)
  806. ( # $4
  807. \z
  808. |
  809. \n{{2,}}
  810. (?=\S)
  811. (?! # Negative lookahead for another list item marker
  812. [ ]*
  813. {0}[ ]+
  814. )
  815. )
  816. )", string.Format("(?:{0}|{1})", _markerUL, _markerOL), _tabWidth - 1);
  817. private static readonly Regex _listNested = new Regex("^" + _wholeList,
  818. RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  819. private static readonly Regex _listTopLevel = new Regex(@"(?:(?<=\n\n)|\A\n?)" + _wholeList,
  820. RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Turn Markdown lists into HTML ul and ol and li tags
  /// </summary>
  private string DoLists(string text)
  {
      // We use a different prefix before nested lists than top-level lists.
      // See extended comment in _ProcessListItems().
      Regex listPattern = _listLevel > 0 ? _listNested : _listTopLevel;
      return listPattern.Replace(text, ListEvaluator);
  }
  /// <summary>
  /// Renders one matched list: group 1 is the whole list text, group 3 the marker
  /// of its first item. The marker decides between ul and ol, and an ordered list
  /// whose first number is neither 0 nor 1 receives a start attribute.
  /// </summary>
  private string ListEvaluator(Match match)
  {
      string listBody = match.Groups[1].Value;
      string firstMarker = match.Groups[3].Value;
      bool unordered = Regex.IsMatch(firstMarker, _markerUL);

      string startAttribute = "";
      if (!unordered)
      {
          // Drop the marker's last character and parse what precedes it;
          // a failed parse leaves firstNumber at 0, which emits no attribute.
          string digits = firstMarker.Substring(0, firstMarker.Length - 1);
          int.TryParse(digits, out int firstNumber);
          if (firstNumber != 0 && firstNumber != 1)
              startAttribute = " start=\"" + firstNumber + "\"";
      }

      string tag = unordered ? "ul" : "ol";
      string items = ProcessListItems(listBody, unordered ? _markerUL : _markerOL);
      return string.Format("<{0}{1}>\n{2}</{0}>\n", tag, startAttribute, items);
  }
  /// <summary>
  /// Process the contents of a single ordered or unordered list, splitting it
  /// into individual list items and wrapping each one in an li element.
  /// </summary>
  /// <param name="list">Raw text of the whole list.</param>
  /// <param name="marker">Regex pattern of the item marker for this list type.</param>
  /// <returns>The list text with every item replaced by its rendered li element.</returns>
  private string ProcessListItems(string list, string marker)
  {
      // _listLevel keeps track of when we're inside a list. It is incremented
      // on entry and decremented on exit; zero means we are not in a list.
      // This matters because outside a list we want text such as
      //
      //     I recommend upgrading to version
      //     8. Oops, now this line is treated
      //     as a sub-list.
      //
      // to stay a single paragraph even though the second line starts with a
      // digit-period-space sequence, whereas inside a list (or sub-list) such a
      // line starts a sub-list. It is a kludge: this part of Markdown's syntax
      // is hard to parse perfectly without resorting to mind-reading.
      _listLevel++;
      try
      {
          // Trim trailing blank lines:
          list = Regex.Replace(list, @"\n{2,}\z", "\n");

          string pattern = string.Format(
              @"(^[ ]*) # leading whitespace = $1
              ({0}) [ ]+ # list marker = $2
              ((?s:.+?) # list item text = $3
              (\n+))
              (?= (\z | \1 ({0}) [ ]+))", marker);

          bool lastItemHadADoubleNewline = false;

          // Has to be a closure so that successive invocations share the flag above.
          string ListItemEvaluator(Match match)
          {
              string item = match.Groups[3].Value;

              bool endsWithDoubleNewline = item.EndsWith("\n\n");
              bool containsDoubleNewline = endsWithDoubleNewline || item.Contains("\n\n");

              // An item gets paragraphs when it contains a blank line itself or
              // when the previous item ended with one.
              var loose = containsDoubleNewline || lastItemHadADoubleNewline;
              // we could correct any bad indentation here..
              item = RunBlockGamut(Outdent(item) + "\n", unhash: false, createParagraphs: loose);

              lastItemHadADoubleNewline = endsWithDoubleNewline;
              return string.Format("<li>{0}</li>\n", item);
          }

          return Regex.Replace(list, pattern, new MatchEvaluator(ListItemEvaluator),
              RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
      }
      finally
      {
          // Always undo the increment, even when item processing throws, so the
          // instance is not left believing it is still inside a list.
          _listLevel--;
      }
  }
  // Matches a run of lines indented by _tabWidth spaces that follows a blank line
  // (or starts the document); {0} is replaced with _tabWidth.
  private static readonly Regex _codeBlock = new Regex(string.Format(@"
      (?:\n\n|\A\n?)
      ( # $1 = the code block -- one or more lines, starting with a space
      (?:
      (?:[ ]{{{0}}}) # Lines must start with a tab-width of spaces
      .*\n+
      )+
      )
      ((?=^[ ]{{0,{0}}}[^ \t\n])|\Z) # Lookahead for non-space at line-start, or end of doc",
      _tabWidth), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);

  /// <summary>
  /// Turn Markdown indented code (one tab-width of leading spaces) into HTML pre code blocks
  /// </summary>
  private string DoCodeBlocks(string text)
      => _codeBlock.Replace(text, CodeBlockEvaluator);
  /// <summary>
  /// Renders one matched code block: outdents group 1, encodes it as literal code,
  /// removes the text matched by _newlinesLeadingTrailing and wraps the result in pre/code tags.
  /// </summary>
  private string CodeBlockEvaluator(Match match)
  {
      string code = Outdent(match.Groups[1].Value);
      code = EncodeCode(code);
      code = _newlinesLeadingTrailing.Replace(code, "");

      return "\n\n<pre><code>" + code + "\n</code></pre>\n\n";
  }
  // Matches a code span delimited by equally long runs of backticks.
  private static readonly Regex _codeSpan = new Regex(@"
      (?<![\\`]) # Character before opening ` can't be a backslash or backtick
      (`+) # $1 = Opening run of `
      (?!`) # and no more backticks -- match the full run
      (.+?) # $2 = The code block
      (?<!`)
      \1
      (?!`)", RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);

  /// <summary>
  /// Turn Markdown `code spans` into HTML code tags
  /// </summary>
  private string DoCodeSpans(string text)
  {
      // * Multiple backticks can serve as the delimiters when the code span
      //   itself has to contain literal backticks. This input:
      //
      //       Just type ``foo `bar` baz`` at the prompt.
      //
      //   becomes:
      //
      //       <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
      //
      //   There is no arbitrary limit on the number of backticks used as
      //   delimiters. If the code needs three consecutive backticks, use
      //   four as delimiters, and so on.
      //
      // * Spaces allow literal backticks at the edges of the span:
      //
      //       ... type `` `bar` `` ...
      //
      //   becomes:
      //
      //       ... type <code>`bar`</code> ...
      //
      return _codeSpan.Replace(text, CodeSpanEvaluator);
  }
  /// <summary>
  /// Renders one matched code span (group 2) as an HTML code element.
  /// </summary>
  private string CodeSpanEvaluator(Match match)
  {
      string code = match.Groups[2].Value;
      code = Regex.Replace(code, "^[ ]*", ""); // strip leading spaces
      code = Regex.Replace(code, "[ ]*$", ""); // strip trailing spaces

      // SaveFromAutoLinking prevents auto-linking inside the span;
      // that step is needed for code spans but not for code *blocks*.
      string protectedCode = SaveFromAutoLinking(EncodeCode(code));
      return "<code>" + protectedCode + "</code>";
  }
  // Default emphasis patterns (used when StrictBoldItalic is off).
  private static readonly Regex _bold = new Regex(@"(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1",
      RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
  private static readonly Regex _italic = new Regex(@"(\*|_) (?=\S) (.+?) (?<=\S) \1",
      RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);

  // Patterns used when StrictBoldItalic and AsteriskIntraWordEmphasis are both on.
  private static readonly Regex _semiStrictBold = new Regex(@"(?=.[*_]|[*_])(^|(?=\W__|(?!\*)[\W_]\*\*|\w\*\*\w).)(\*\*|__)(?!\2)(?=\S)((?:|.*?(?!\2).)(?=\S_|\w|\S\*\*(?:[\W_]|$)).)(?=__(?:\W|$)|\*\*(?:[^*]|$))\2",
      RegexOptions.Singleline | RegexOptions.Compiled);
  private static readonly Regex _semiStrictItalic = new Regex(@"(?=.[*_]|[*_])(^|(?=\W_|(?!\*)(?:[\W_]\*|\D\*(?=\w)\D)).)(\*|_)(?!\2\2\2)(?=\S)((?:(?!\2).)*?(?=[^\s_]_|(?=\w)\D\*\D|[^\s*]\*(?:[\W_]|$)).)(?=_(?:\W|$)|\*(?:[^*]|$))\2",
      RegexOptions.Singleline | RegexOptions.Compiled);

  // Patterns used when StrictBoldItalic is on and AsteriskIntraWordEmphasis is off.
  private static readonly Regex _strictBold = new Regex(@"(^|[\W_])(?:(?!\1)|(?=^))(\*|_)\2(?=\S)(.*?\S)\2\2(?!\2)(?=[\W_]|$)",
      RegexOptions.Singleline | RegexOptions.Compiled);
  private static readonly Regex _strictItalic = new Regex(@"(^|[\W_])(?:(?!\1)|(?=^))(\*|_)(?=\S)((?:(?!\2).)*?\S)\2(?!\2)(?=[\W_]|$)",
      RegexOptions.Singleline | RegexOptions.Compiled);

  /// <summary>
  /// Turn Markdown *italics* and **bold** into HTML strong and em tags
  /// </summary>
  private string DoItalicsAndBold(string text)
  {
      // Nothing to do when the text holds neither emphasis character.
      if (!text.Contains("*") && !text.Contains("_"))
          return text;

      // In every mode <strong> is substituted first, then <em>.
      if (!StrictBoldItalic)
      {
          string withStrong = _bold.Replace(text, "<strong>$2</strong>");
          return _italic.Replace(withStrong, "<em>$2</em>");
      }

      bool intraWord = AsteriskIntraWordEmphasis;
      Regex boldPattern = intraWord ? _semiStrictBold : _strictBold;
      Regex italicPattern = intraWord ? _semiStrictItalic : _strictItalic;

      // The strict patterns capture the preceding character in $1, so it is put back.
      string bolded = boldPattern.Replace(text, "$1<strong>$3</strong>");
      return italicPattern.Replace(bolded, "$1<em>$3</em>");
  }
  /// <summary>
  /// Turn Markdown line breaks into HTML break tags: every newline when
  /// AutoNewLines is set, otherwise only newlines preceded by two or more spaces.
  /// </summary>
  private string DoHardBreaks(string text)
  {
      string breakPattern = AutoNewLines ? @"\n" : @" {2,}\n";
      string breakTag = string.Format("<br{0}\n", EmptyElementSuffix);
      return Regex.Replace(text, breakPattern, breakTag);
  }
  // Matches one or more consecutive '>'-prefixed paragraphs, including the
  // blank lines between them.
  private static readonly Regex _blockquote = new Regex(@"
      ( # Wrap whole match in $1
      (
      ^[ ]*>[ ]? # '>' at the start of a line
      .+\n # rest of the first line
      (.+\n)* # subsequent consecutive lines
      \n* # blanks
      )+
      )", RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.Compiled);

  /// <summary>
  /// Turn Markdown > quoted blocks into HTML blockquote blocks
  /// </summary>
  private string DoBlockQuotes(string text)
      => _blockquote.Replace(text, BlockQuoteEvaluator);
  /// <summary>
  /// Renders one matched block quote: strips a level of quoting, runs the block
  /// gamut on the content, wraps it in a blockquote element and stores the result
  /// in _htmlBlocks under a hash key, which is returned surrounded by blank lines.
  /// </summary>
  private string BlockQuoteEvaluator(Match match)
  {
      string quoted = match.Groups[1].Value;

      // Trim one level of quoting, then blank out whitespace-only lines.
      string inner = Regex.Replace(quoted, "^[ ]*>[ ]?", "", RegexOptions.Multiline);
      inner = Regex.Replace(inner, "^[ ]+$", "", RegexOptions.Multiline);

      // Recurse, then indent every line of the rendered content.
      inner = RunBlockGamut(inner);
      inner = Regex.Replace(inner, "^", " ", RegexOptions.Multiline);

      // The indentation just added would corrupt <pre> content, so take it back out there.
      inner = Regex.Replace(inner, @"(\s*<pre>.+?</pre>)", new MatchEvaluator(BlockQuoteEvaluator2),
          RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);

      string html = string.Format("<blockquote>\n{0}\n</blockquote>", inner);
      string hashKey = GetHashKey(html, isHtmlBlock: true);
      _htmlBlocks[hashKey] = html;

      return "\n\n" + hashKey + "\n\n";
  }

  /// <summary>
  /// Removes the leading indent that BlockQuoteEvaluator added from each line of a pre block.
  /// </summary>
  private string BlockQuoteEvaluator2(Match match)
  {
      string preBlock = match.Groups[1].Value;
      return Regex.Replace(preBlock, "^ ", "", RegexOptions.Multiline);
  }
  // Character classes for the body and the final character of a bare URL.
  private const string _charInsideUrl = @"[-A-Z0-9+&@#/%?=~_|\[\]\(\)!:,\.;" + "\x1a]";
  private const string _charEndingUrl = "[-A-Z0-9+&@#/%=~_|\\[\\])]";

  private static readonly Regex _autolinkBare = new Regex(@"(<|="")?\b(https?|ftp)(://" + _charInsideUrl + "*" + _charEndingUrl + ")(?=$|\\W)",
      RegexOptions.IgnoreCase | RegexOptions.Compiled);

  private static readonly Regex _endCharRegex = new Regex(_charEndingUrl, RegexOptions.IgnoreCase | RegexOptions.Compiled);

  /// <summary>
  /// Wraps a bare URL matched by _autolinkBare in angle brackets, leaving
  /// unbalanced closing parentheses at the end of the URL outside of the link.
  /// </summary>
  private static string HandleTrailingParens(Match match)
  {
      // Group 1 acts as a negative lookbehind: a preceding < or =" means the URL is
      // left untouched. A real lookbehind is not used because of links within links, like
      // <a href="http://web.archive.org/web/20121130000728/http://www.google.com/">.
      // A real lookbehind would never match the full link, so the inner
      // http://www.google.com *would* be matched. With the simulated one the full link
      // *is* matched (and skipped by this early return), so the inner link is not
      // matched again.
      if (match.Groups[1].Success)
          return match.Value;

      string scheme = match.Groups[2].Value;
      string rest = match.Groups[3].Value;

      if (!rest.EndsWith(")"))
          return "<" + scheme + rest + ">";

      // Track parenthesis nesting; an opening paren seen at depth <= 0 restarts at 1.
      int depth = 0;
      foreach (char ch in rest)
      {
          if (ch == '(')
              depth = depth <= 0 ? 1 : depth + 1;
          else if (ch == ')')
              depth--;
      }

      // A negative depth means that many closing parens are unmatched: cut up to
      // that many off the end of the URL and keep them as trailing text.
      string trailing = "";
      if (depth < 0)
      {
          rest = Regex.Replace(rest, @"\){1," + (-depth) + "}$", m => { trailing = m.Value; return ""; });
      }

      if (trailing.Length > 0)
      {
          // If the URL now ends with a character that may not end a URL,
          // move that character out of the link as well.
          char finalChar = rest[rest.Length - 1];
          if (!_endCharRegex.IsMatch(finalChar.ToString()))
          {
              trailing = finalChar + trailing;
              rest = rest.Substring(0, rest.Length - 1);
          }
      }

      return "<" + scheme + rest + ">" + trailing;
  }
  private static readonly Regex _autoEmailBare = new Regex(@"(<|="")?(?:mailto:)?([-.\w]+\@[-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

  /// <summary>
  /// Wraps a bare email address in angle brackets, unless the match starts with
  /// an opening &lt; or =" (group 1), in which case it is returned unchanged.
  /// </summary>
  private static string EmailBareLinkEvaluator(Match match)
  {
      bool alreadyEnclosed = match.Groups[1].Success;
      return alreadyEnclosed ? match.Value : "<" + match.Value + ">";
  }
  // Matches <address@domain.foo> and <mailto:address@domain.foo>; $1 is the address.
  private static readonly Regex _linkEmail = new Regex(@"<
      (?:mailto:)?
      (
      [-.\w]+
      \@
      [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
      )
      >", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

  /// <summary>
  /// Turn angle-delimited URLs and email addresses into HTML anchor tags
  /// </summary>
  /// <remarks>
  /// &lt;http://www.example.com&gt;
  /// </remarks>
  private string DoAutoLinks(string text)
  {
      string result = text;

      if (AutoHyperlink)
      {
          // Wrap arbitrary bare URLs in Markdown < > so that the next step links them too.
          // At this point every other URL in the text is already hyperlinked as
          // <a href=""></a>, *except* for the <http://www.foo.com> case.
          result = _autolinkBare.Replace(result, HandleTrailingParens);
      }

      // Hyperlinks: <http://foo.com>
      result = Regex.Replace(result, "<((https?|ftp):[^'\">\\s]+)>", new MatchEvaluator(HyperlinkEvaluator));

      if (!LinkEmails)
          return result;

      // Email addresses: <address@domain.foo> or <mailto:address@domain.foo>.
      // Wrapping bare addresses (without the <>) is currently disabled:
      //result = _autoEmailBare.Replace(result, EmailBareLinkEvaluator);
      return _linkEmail.Replace(result, new MatchEvaluator(EmailEvaluator));
  }
  /// <summary>
  /// Builds an anchor element for a matched URL (group 1): the href holds the
  /// attribute-safe form of the URL, the link text is the URL as matched.
  /// </summary>
  private string HyperlinkEvaluator(Match match)
  {
      string target = match.Groups[1].Value;
      string href = AttributeSafeUrl(target);
      return "<a href=\"" + href + "\">" + target + "</a>";
  }
  /// <summary>
  /// Builds an obfuscated mailto anchor for a matched email address (group 1).
  /// </summary>
  private string EmailEvaluator(Match match)
  {
      //
      // Input: an email address, e.g. "foo@example.com"
      //
      // Output: the address as a mailto link in which the characters are
      // encoded as decimal or hex entities, in the hope of foiling most
      // address-harvesting spam bots. E.g.:
      //
      // <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
      // x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
      // &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
      //
      // Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
      // mailing list: <http://tinyurl.com/yu7ue>
      //
      string address = "mailto:" + Unescape(match.Groups[1].Value);

      // The encoder leaves ':' alone, which lets us spot the mailto: prefix below.
      string encoded = EncodeEmailAddress(address);
      string anchor = string.Format("<a href=\"{0}\">{0}</a>", encoded);

      // Strip the mailto: from the visible part.
      return Regex.Replace(anchor, "\">.+?:", "\">");
  }
  // Between one and _tabWidth spaces at the start of any line.
  private static readonly Regex _outDent = new Regex("^[ ]{1," + _tabWidth + "}", RegexOptions.Multiline | RegexOptions.Compiled);

  /// <summary>
  /// Remove one level of line-leading spaces (at most _tabWidth per line)
  /// </summary>
  private string Outdent(string block) => _outDent.Replace(block, "");
  1190. #region Encoding and Normalization
  /// <summary>
  /// Encodes an email address character by character, choosing at random between
  /// the raw character, a hex entity and a decimal entity
  /// (roughly 10% raw, 45% hex, 45% dec).
  /// '@' is always encoded and ':' never is.
  /// </summary>
  private string EncodeEmailAddress(string addr)
  {
      var encoded = new StringBuilder(addr.Length * 5);
      var rng = new Random();

      foreach (char ch in addr)
      {
          int roll = rng.Next(1, 100);
          bool keepRaw = ch != '@' && (roll > 90 || ch == ':');

          if (keepRaw)
              encoded.Append(ch); // m
          else if (roll < 45)
              encoded.AppendFormat("&#x{0:x};", (int)ch); // &#x6D
          else
              encoded.AppendFormat("&#{0};", (int)ch); // &#109
      }

      return encoded.ToString();
  }
  // Characters that must be neutralised inside code: & < > \ * _ { } [ ]
  private static readonly Regex _codeEncoder = new Regex(@"&|<|>|\\|\*|_|\{|\}|\[|\]", RegexOptions.Compiled);

  /// <summary>
  /// Encode/escape certain Markdown characters inside code blocks and spans where they are literals
  /// </summary>
  private string EncodeCode(string code) => _codeEncoder.Replace(code, EncodeCodeEvaluator);
  /// <summary>
  /// Maps one character matched by _codeEncoder to its replacement.
  /// </summary>
  private string EncodeCodeEvaluator(Match match)
  {
      string matched = match.Value;

      // Encode all ampersands; HTML entities are not
      // entities within a Markdown code span.
      if (matched == "&")
          return "&amp;";

      // Do the angle bracket song and dance
      if (matched == "<")
          return "&lt;";
      if (matched == ">")
          return "&gt;";

      // Everything else is a character that is magic in Markdown: escape it.
      return _escapeTable[matched];
  }
  // An ampersand that does not start a decimal, hex or named entity.
  private static readonly Regex _amps = new Regex("&(?!((#[0-9]+)|(#[xX][a-fA-F0-9]+)|([a-zA-Z][a-zA-Z0-9]*));)", RegexOptions.ExplicitCapture | RegexOptions.Compiled);

  // A left angle bracket that is not followed by a letter, '/', '?', '$' or '!'.
  private static readonly Regex _angles = new Regex(@"<(?![A-Za-z/?\$!])", RegexOptions.ExplicitCapture | RegexOptions.Compiled);

  /// <summary>
  /// Encode any ampersands that aren't part of an HTML entity and any left angle
  /// brackets matched by _angles
  /// </summary>
  private string EncodeAmpsAndAngles(string s)
  {
      string ampsEncoded = _amps.Replace(s, "&amp;");
      return _angles.Replace(ampsEncoded, "&lt;");
  }
  // Declared without an initializer; assigned outside this section.
  private static readonly Regex _backslashEscapes;

  /// <summary>
  /// Encodes any escaped characters such as \`, \*, \[ etc
  /// </summary>
  private string EscapeBackslashes(string s)
  {
      return _backslashEscapes.Replace(s, EscapeBackslashesEvaluator);
  }

  /// <summary>
  /// Looks up the replacement for one matched backslash escape in _backslashEscapeTable.
  /// </summary>
  private string EscapeBackslashesEvaluator(Match match)
  {
      return _backslashEscapeTable[match.Value];
  }
  // NOTE: the literal below is deliberately split in two - do not join it.
  // A C# \x escape consumes up to four hex digits, so "\x1AE" would be read
  // as a single character instead of \x1A followed by the letter E.
  #pragma warning disable RCS1190 // Join string expressions.
  private static readonly Regex _unescapes = new Regex("\x1A" + "E\\d+E", RegexOptions.Compiled);
  #pragma warning restore RCS1190 // Join string expressions.

  /// <summary>
  /// swap back in all the special characters we've hidden
  /// </summary>
  private string Unescape(string s)
  {
      return _unescapes.Replace(s, UnescapeEvaluator);
  }

  /// <summary>
  /// Looks up the original text for one matched escape token in _invertedEscapeTable.
  /// </summary>
  private string UnescapeEvaluator(Match match)
  {
      return _invertedEscapeTable[match.Value];
  }
  /// <summary>
  /// Replaces the emphasis characters * and _ with their _escapeTable entries
  /// </summary>
  private string EscapeBoldItalic(string s)
      => s.Replace("*", _escapeTable["*"])
          .Replace("_", _escapeTable["_"]);
  /// <summary>
  /// Replaces &gt;, &lt;, double quotes and single quotes with HTML entities so
  /// the text can be placed inside an attribute value. Ampersands are left as they are.
  /// </summary>
  private static string AttributeEncode(string s)
  {
      string encoded = s.Replace(">", "&gt;");
      encoded = encoded.Replace("<", "&lt;");
      encoded = encoded.Replace("\"", "&quot;");
      encoded = encoded.Replace("'", "&#39;");
      return encoded;
  }
  /// <summary>
  /// Attribute-encodes a URL and then replaces each of the characters
  /// * _ : ( ) [ ] with its _escapeTable entry.
  /// </summary>
  private static string AttributeSafeUrl(string s)
  {
      string escaped = AttributeEncode(s);
      foreach (string special in new[] { "*", "_", ":", "(", ")", "[", "]" })
      {
          escaped = escaped.Replace(special, _escapeTable[special]);
      }
      return escaped;
  }
  /// <summary>
  /// Within tags -- meaning between &lt; and &gt; -- encode [\ ` * _] so they
  /// don't conflict with their use in Markdown for code, italics and strong.
  /// Each such character is replaced with its corresponding hash value; this is
  /// likely overkill, but it should prevent accidental collisions with the
  /// escape values.
  /// </summary>
  private string EscapeSpecialCharsWithinTagAttributes(string text)
  {
      var tokens = TokenizeHTML(text);

      // Rebuild the text token by token; only tag tokens are modified.
      var rebuilt = new StringBuilder(text.Length);

      foreach (var token in tokens)
      {
          if (token.Type != TokenType.Tag)
          {
              rebuilt.Append(token.Value);
              continue;
          }

          string tag = token.Value.Replace(@"\", _escapeTable[@"\"]);

          // Escape slashes in comments to prevent autolinking there -- see
          // https://meta.stackexchange.com/questions/95987/html-comment-containing-url-breaks-if-followed-by-another-html-comment
          if (AutoHyperlink && tag.StartsWith("<!"))
              tag = tag.Replace("/", _escapeTable["/"]);

          // A <code> or </code> occurring inside the tag (with a character on
          // each side) is swapped for the backtick escape value.
          tag = Regex.Replace(tag, "(?<=.)</?code>(?=.)", _escapeTable["`"]);

          rebuilt.Append(EscapeBoldItalic(tag));
      }

      return rebuilt.ToString();
  }
  /// <summary>
  /// expands tabs to the next multiple of _tabWidth columns using spaces;
  /// standardizes line endings from DOS (CR LF) or Mac (CR) to UNIX (LF);
  /// drops \x1A characters;
  /// empties lines that contain nothing but spaces and tabs;
  /// makes sure text ends with a couple of newlines
  /// </summary>
  private string Normalize(string text)
  {
      var result = new StringBuilder(text.Length);
      var current = new StringBuilder();
      bool hasContent = false; // true once the current line holds a non-space character

      // Emits the buffered line (only if it has content) followed by LF, then resets the buffer.
      void EndLine()
      {
          if (hasContent) result.Append(current);
          result.Append('\n');
          current.Length = 0;
          hasContent = false;
      }

      int last = text.Length - 1;
      for (int pos = 0; pos <= last; pos++)
      {
          char ch = text[pos];

          if (ch == '\n')
          {
              EndLine();
          }
          else if (ch == '\r')
          {
              // A CR directly followed by LF is skipped (the LF ends the line);
              // a CR at the very end of the text is skipped as well.
              if (pos < last && text[pos + 1] != '\n')
                  EndLine();
          }
          else if (ch == '\t')
          {
              // Pad with spaces up to the next tab stop.
              int pad = _tabWidth - (current.Length % _tabWidth);
              current.Append(' ', pad);
          }
          else if (ch == '\x1A')
          {
              // dropped from the output
          }
          else
          {
              if (ch != ' ') hasContent = true;
              current.Append(ch);
          }
      }

      // Flush the final line, then add two more newlines before returning.
      if (hasContent) result.Append(current);
      result.Append('\n');

      return result.Append("\n\n").ToString();
  }
  1356. #endregion
  /// <summary>
  /// Returns <paramref name="text"/> repeated <paramref name="count"/> times;
  /// this is to emulate what's available in PHP
  /// </summary>
  private static string RepeatString(string text, int count)
  {
      var repeated = new StringBuilder(text.Length * count);
      // Insert(index, value, count) writes 'count' copies of 'value' at 'index'.
      repeated.Insert(0, text, count);
      return repeated.ToString();
  }
  1367. }
  1368. }