You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
263 lines
10 KiB
263 lines
10 KiB
using System;
|
|
using System.Text.RegularExpressions;
|
|
using System.Collections.Generic;
|
|
using SiteServer.Utils.Enumerations;
|
|
|
|
namespace SiteServer.Utils
|
|
{
|
|
public class RegexUtils
|
|
{
|
|
|
|
/*
 * Pattern snippets kept for reference:
 *   General-purpose wildcard: .*?
 *   All links: <a\s*.*?href=(?:"(?<url>[^"]*)"|'(?<url>[^']*)'|(?<url>\S+)).*?>
 */

/// <summary>
/// Options used by the helpers in this class whenever they construct a
/// Regex instance: Singleline, IgnoreCase and IgnorePatternWhitespace.
/// </summary>
public static RegexOptions Options =
    RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace;
|
|
|
|
/// <summary>
/// Finds src attribute values that follow "img" or "input" in the markup and
/// returns them through GetUrls, i.e. resolved against <paramref name="baseUrl"/>.
/// </summary>
/// <param name="baseUrl">Base URL handed to GetUrls for resolving each match.</param>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetUrls for the src pattern.</returns>
public static List<string> GetImageSrcs(string baseUrl, string html)
    => GetUrls(
        "(img|input)[^><]*\\s+src\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))",
        html,
        baseUrl);
|
|
|
|
/// <summary>
/// Finds src attribute values that follow "img" or "input" in the markup and
/// returns them exactly as written (the "url" group of each match).
/// </summary>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetContents for the "url" group.</returns>
public static List<string> GetOriginalImageSrcs(string html)
    => GetContents(
        "url",
        "(img|input)[^><]*\\s+src\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))",
        html);
|
|
|
|
/// <summary>
/// Returns the raw href attribute values captured by the link pattern
/// (the "url" group of each match).
/// </summary>
/// <remarks>
/// NOTE(review): the pattern begins at any letter "a" rather than at an opening
/// anchor tag, so href attributes of other elements can match as well; confirm
/// whether callers rely on that.
/// </remarks>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetContents for the "url" group.</returns>
public static List<string> GetOriginalLinkHrefs(string html)
    => GetContents(
        "url",
        "a[^><]*\\s+href\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))",
        html);
|
|
|
|
/// <summary>
/// Finds "embed ... src=" and "param ... value=" attribute values in the markup
/// and returns them through GetUrls, i.e. resolved against <paramref name="baseUrl"/>.
/// </summary>
/// <param name="baseUrl">Base URL handed to GetUrls for resolving each match.</param>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetUrls for the embed/param pattern.</returns>
public static List<string> GetFlashSrcs(string baseUrl, string html)
    => GetUrls(
        "embed\\s+[^><]*src\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))|param\\s+[^><]*value\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))",
        html,
        baseUrl);
|
|
|
|
/// <summary>
/// Finds "embed ... src=" and "param ... value=" attribute values in the markup
/// and returns them exactly as written (the "url" group of each match).
/// </summary>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetContents for the "url" group.</returns>
public static List<string> GetOriginalFlashSrcs(string html)
    => GetContents(
        "url",
        "embed\\s+[^><]*src\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))|param\\s+[^><]*value\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))",
        html);
|
|
|
|
/// <summary>
/// Collects the arguments of CSS-style url(...) expressions via GetUrls
/// (resolved against <paramref name="baseUrl"/>) and keeps only those whose
/// extension EFileSystemTypeUtils.IsImage accepts.
/// </summary>
/// <param name="baseUrl">Base URL handed to GetUrls for resolving each match.</param>
/// <param name="html">Markup or stylesheet text to scan.</param>
/// <returns>Image URLs in first-seen order, without duplicates.</returns>
public static List<string> GetStyleImageUrls(string baseUrl, string html)
{
    var imageUrls = new List<string>();

    foreach (var candidate in GetUrls("url\\((?<url>[^\\(\\)]*)\\)", html, baseUrl))
    {
        // Skip entries already collected before asking whether they are images.
        if (imageUrls.Contains(candidate)) continue;
        if (!EFileSystemTypeUtils.IsImage(PathUtils.GetExtension(candidate))) continue;

        imageUrls.Add(candidate);
    }

    return imageUrls;
}
|
|
|
|
/// <summary>
/// Collects the raw arguments of CSS-style url(...) expressions, for example
/// "background-image: url(../images/leftline.gif);", and keeps only those
/// whose extension EFileSystemTypeUtils.IsImage accepts.
/// </summary>
/// <param name="html">Markup or stylesheet text to scan.</param>
/// <returns>Image URLs as written in the source, in first-seen order, without duplicates.</returns>
public static List<string> GetOriginalStyleImageUrls(string html)
{
    var imageUrls = new List<string>();

    foreach (var candidate in GetContents("url", "url\\((?<url>[^\\(\\)]*)\\)", html))
    {
        // Skip entries already collected before asking whether they are images.
        if (imageUrls.Contains(candidate)) continue;
        if (!EFileSystemTypeUtils.IsImage(PathUtils.GetExtension(candidate))) continue;

        imageUrls.Add(candidate);
    }

    return imageUrls;
}
|
|
|
|
/// <summary>
/// Finds values of "background=" attributes in the markup and returns them
/// through GetUrls, i.e. resolved against <paramref name="baseUrl"/>.
/// </summary>
/// <param name="baseUrl">Base URL handed to GetUrls for resolving each match.</param>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetUrls for the background pattern.</returns>
public static List<string> GetBackgroundImageSrcs(string baseUrl, string html)
    => GetUrls(
        "background\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))",
        html,
        baseUrl);
|
|
|
|
/// <summary>
/// Finds values of "background=" attributes in the markup and returns them
/// exactly as written (the "url" group of each match).
/// </summary>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetContents for the "url" group.</returns>
public static List<string> GetOriginalBackgroundImageSrcs(string html)
    => GetContents(
        "url",
        "background\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))",
        html);
|
|
|
|
/// <summary>
/// Finds stylesheet references in the markup, either "link ... href=" attribute
/// values or the argument of "@import url(...)", and returns them through
/// GetUrls, i.e. resolved against <paramref name="baseUrl"/>.
/// </summary>
/// <param name="baseUrl">Base URL handed to GetUrls for resolving each match.</param>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetUrls for the link/@import pattern.</returns>
public static List<string> GetCssHrefs(string baseUrl, string html)
    => GetUrls(
        "link\\s+[^><]*href\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))|\\@import\\s*url\\s*\\(\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>.*?))\\s*\\)",
        html,
        baseUrl);
|
|
|
|
/// <summary>
/// Finds stylesheet references in the markup, either "link ... href=" attribute
/// values or the argument of "@import url(...)", and returns them exactly as
/// written (the "url" group of each match).
/// </summary>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetContents for the "url" group.</returns>
public static List<string> GetOriginalCssHrefs(string html)
    => GetContents(
        "url",
        "link\\s+[^><]*href\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))|\\@import\\s*url\\s*\\(\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>.*?))\\s*\\)",
        html);
|
|
|
|
/// <summary>
/// Finds "script ... src=" attribute values in the markup and returns them
/// through GetUrls, i.e. resolved against <paramref name="baseUrl"/>.
/// </summary>
/// <param name="baseUrl">Base URL handed to GetUrls for resolving each match.</param>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetUrls for the script pattern.</returns>
public static List<string> GetScriptSrcs(string baseUrl, string html)
    => GetUrls(
        "script\\s+[^><]*src\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))",
        html,
        baseUrl);
|
|
|
|
/// <summary>
/// Finds "script ... src=" attribute values in the markup and returns them
/// exactly as written (the "url" group of each match).
/// </summary>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetContents for the "url" group.</returns>
public static List<string> GetOriginalScriptSrcs(string html)
    => GetContents(
        "url",
        "script\\s+[^><]*src\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))",
        html);
|
|
|
|
/// <summary>
/// Returns the inner content of every element named <paramref name="tagName"/>
/// that the pattern matches. The pattern requires at least one whitespace
/// character after the tag name in the opening tag, and whitespace directly
/// inside the opening and closing tags is matched outside the captured group.
/// </summary>
/// <param name="tagName">Tag name, inserted into the pattern as-is (not escaped).</param>
/// <param name="html">Markup to scan.</param>
/// <returns>The list produced by GetContents for the "content" group.</returns>
public static List<string> GetTagInnerContents(string tagName, string html)
{
    var pattern = $"<{tagName}\\s+[^><]*>\\s*(?<content>[\\s\\S]+?)\\s*</{tagName}>";
    var contents = GetContents("content", pattern, html);
    return contents;
}
|
|
|
|
/// <summary>
/// Returns the full text of every element named <paramref name="tagName"/>
/// found in <paramref name="html"/>, either an opening tag up to its closing
/// tag or a self-closed tag. Matching is case-insensitive and, unlike the
/// helpers that use the shared Options field, does not use Singleline, so
/// the body of a paired element cannot span a line break.
/// </summary>
/// <param name="tagName">Tag name, inserted into the pattern as-is (not escaped).</param>
/// <param name="html">Markup to scan.</param>
/// <returns>Matched element texts in document order; duplicates are kept.</returns>
public static List<string> GetTagContents(string tagName, string html)
{
    var pattern = $@"<({tagName})[^>]*>(.*?)</\1>|<{tagName}[^><]*/>";
    var elements = new List<string>();

    foreach (Match element in Regex.Matches(html, pattern, RegexOptions.IgnoreCase))
    {
        // Every item in a MatchCollection is a successful match; Value is the whole match.
        elements.Add(element.Value);
    }

    return elements;
}
|
|
|
|
/// <summary>
/// Extracts the name of the first tag in <paramref name="html"/>: the run of
/// characters after the first opening angle bracket up to the next whitespace
/// or closing angle bracket.
/// </summary>
/// <param name="html">Markup to inspect; may be null or empty.</param>
/// <returns>The tag name, or an empty string when the input is null/empty or contains no tag.</returns>
public static string GetTagName(string html)
{
    // Regex.Match throws on null input; treat missing markup as "no tag".
    if (string.IsNullOrEmpty(html)) return string.Empty;

    // [\s\S] matches any character, newlines included (the original class
    // carried a stray duplicate "S", which matched the same set).
    var match = Regex.Match(html, "<([^>\\s]+)[\\s\\S]*>", RegexOptions.IgnoreCase);
    return match.Success ? match.Groups[1].Value : string.Empty;
}
|
|
|
|
/// <summary>
/// Returns the inner content of the first element named
/// <paramref name="tagName"/> that the pattern matches in <paramref name="html"/>.
/// </summary>
/// <param name="tagName">Tag name, inserted into the pattern as-is (not escaped).</param>
/// <param name="html">Markup to scan.</param>
/// <returns>The value GetContent yields for the "content" group.</returns>
public static string GetInnerContent(string tagName, string html)
{
    var pattern = $"<{tagName}[^><]*>(?<content>[\\s\\S]+?)</{tagName}>";
    var inner = GetContent("content", pattern, html);
    return inner;
}
|
|
|
|
/// <summary>
/// Returns the value of the attribute named <paramref name="attributeName"/>
/// from the first tag in <paramref name="html"/> that the pattern matches.
/// Double-quoted, single-quoted and unquoted values are recognised.
/// </summary>
/// <param name="attributeName">Attribute name, inserted into the pattern as-is (not escaped).</param>
/// <param name="html">Markup to scan.</param>
/// <returns>The value GetContent yields for the "value" group.</returns>
public static string GetAttributeContent(string attributeName, string html)
{
    var pattern = $"<[^><]+\\s*{attributeName}\\s*=\\s*(?:\"(?<value>[^\"]*)\"|'(?<value>[^']*)'|(?<value>[^>\\s]*)).*?>";
    var attributeValue = GetContent("value", pattern, html);
    return attributeValue;
}
|
|
|
|
/// <summary>
/// Collects the href values of anchor tags in <paramref name="html"/> by
/// delegating to the three-argument overload with the anchor pattern.
/// </summary>
/// <param name="html">Markup to scan.</param>
/// <param name="baseUrl">Base URL handed on for resolving each match.</param>
/// <returns>The list produced by the three-argument GetUrls overload.</returns>
public static List<string> GetUrls(string html, string baseUrl)
    => GetUrls(
        "<a\\s*.*?href\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*)).*?>",
        html,
        baseUrl);
|
|
|
|
/// <summary>
/// Extracts every "url" group captured by <paramref name="regex"/> in
/// <paramref name="html"/>, converts each through PageUtils.GetUrlByBaseUrl
/// with <paramref name="baseUrl"/>, and returns the non-empty results without
/// duplicates, in first-seen order.
/// </summary>
/// <param name="regex">Pattern exposing a "url" named group; when null or empty, the anchor-href pattern is used.</param>
/// <param name="html">Markup to scan.</param>
/// <param name="baseUrl">Base URL passed to PageUtils.GetUrlByBaseUrl.</param>
/// <returns>Distinct, non-empty URLs in the order they were first produced.</returns>
public static List<string> GetUrls(string regex, string html, string baseUrl)
{
    if (string.IsNullOrEmpty(regex))
    {
        regex = "<a\\s*.*?href\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*)).*?>";
    }

    var list = new List<string>();
    // Membership is tracked in a set so de-duplication stays linear; the list keeps the order.
    var seen = new HashSet<string>();

    foreach (var rawUrl in GetContents("url", regex, html))
    {
        var url = PageUtils.GetUrlByBaseUrl(rawUrl, baseUrl);
        if (!string.IsNullOrEmpty(url) && seen.Add(url))
        {
            list.Add(url);
        }
    }

    return list;
}
|
|
|
|
/// <summary>
/// Takes the "url" group of the first match of <paramref name="regex"/> in
/// <paramref name="html"/> (via GetContent) and passes it through
/// PageUtils.GetUrlByBaseUrl with <paramref name="baseUrl"/>.
/// </summary>
/// <param name="regex">Pattern expected to expose a "url" named group.</param>
/// <param name="html">Markup to scan.</param>
/// <param name="baseUrl">Base URL passed to PageUtils.GetUrlByBaseUrl.</param>
/// <returns>Whatever PageUtils.GetUrlByBaseUrl returns for the extracted value.</returns>
public static string GetUrl(string regex, string html, string baseUrl)
{
    var rawUrl = GetContent("url", regex, html);
    return PageUtils.GetUrlByBaseUrl(rawUrl, baseUrl);
}
|
|
|
|
/// <summary>
/// Returns the value of the named group <paramref name="groupName"/> from the
/// first match of <paramref name="regex"/> in <paramref name="html"/>, using
/// the shared Options.
/// </summary>
/// <param name="groupName">Name of the capture group to read.</param>
/// <param name="regex">
/// Pattern to apply. When it is null or empty the result is an empty string.
/// When it does not contain the group name wrapped in angle brackets, the
/// pattern text itself is returned unchanged.
/// </param>
/// <param name="html">Text to search; null is treated as containing no match.</param>
/// <returns>The captured value, the pattern itself (see above), or an empty string.</returns>
public static string GetContent(string groupName, string regex, string html)
{
    if (string.IsNullOrEmpty(regex)) return string.Empty;

    // A pattern without the requested named group is handed back as-is.
    if (regex.IndexOf("<" + groupName + ">", StringComparison.Ordinal) == -1)
    {
        return regex;
    }

    // Regex.Match throws on null input; there is nothing to capture from it.
    if (html == null) return string.Empty;

    var match = new Regex(regex, Options).Match(html);
    return match.Success ? match.Groups[groupName].Value : string.Empty;
}
|
|
|
|
/// <summary>
/// Replaces every match of <paramref name="regex"/> in <paramref name="input"/>
/// with <paramref name="replacement"/>, using the shared Options.
/// </summary>
/// <param name="regex">Pattern to search for.</param>
/// <param name="input">Text to process; null or empty is returned unchanged.</param>
/// <param name="replacement">Replacement pattern passed to Regex.Replace.</param>
/// <returns>The text after replacement, or the original input when it is null or empty.</returns>
public static string Replace(string regex, string input, string replacement)
    => string.IsNullOrEmpty(input)
        ? input
        : new Regex(regex, Options).Replace(input, replacement);
|
|
|
|
/// <summary>
/// Replaces matches of <paramref name="regex"/> in <paramref name="input"/>
/// with <paramref name="replacement"/>, using the shared Options. A
/// <paramref name="count"/> of 0 replaces every match; any other value is
/// passed to Regex.Replace as the maximum number of replacements.
/// </summary>
/// <param name="regex">Pattern to search for.</param>
/// <param name="input">Text to process; null or empty is returned unchanged.</param>
/// <param name="replacement">Replacement pattern passed to Regex.Replace.</param>
/// <param name="count">0 for all matches, otherwise the limit given to Regex.Replace.</param>
/// <returns>The text after replacement, or the original input when it is null or empty.</returns>
public static string Replace(string regex, string input, string replacement, int count)
{
    if (string.IsNullOrEmpty(input)) return input;

    var matcher = new Regex(regex, Options);
    return count == 0
        ? matcher.Replace(input, replacement)
        : matcher.Replace(input, replacement, count);
}
|
|
|
|
/// <summary>
/// Tests whether <paramref name="regex"/> matches anywhere in
/// <paramref name="input"/>, using the shared Options.
/// </summary>
/// <param name="regex">Pattern to test.</param>
/// <param name="input">Text to test; null is reported as no match.</param>
/// <returns>True when the pattern matches; false otherwise, including for null input.</returns>
public static bool IsMatch(string regex, string input)
{
    // Regex.IsMatch throws on null input; report it as "no match", in line with
    // Replace, which also tolerates a missing input.
    if (input == null) return false;

    var reg = new Regex(regex, Options);
    return reg.IsMatch(input);
}
|
|
|
|
/// <summary>
/// Returns the value of the named group <paramref name="groupName"/> for every
/// match of <paramref name="regex"/> in <paramref name="html"/>, using the
/// shared Options. Values are de-duplicated and kept in first-seen order.
/// </summary>
/// <param name="groupName">Name of the capture group to read from each match.</param>
/// <param name="regex">Pattern to apply; null or empty yields an empty list.</param>
/// <param name="html">Text to search; null yields an empty list.</param>
/// <returns>Distinct captured values in the order they were first matched.</returns>
public static List<string> GetContents(string groupName, string regex, string html)
{
    var list = new List<string>();

    // Regex.Match throws on null input; there is nothing to capture from it.
    if (string.IsNullOrEmpty(regex) || html == null) return list;

    // Membership is tracked in a set so de-duplication stays linear; the list keeps the order.
    var seen = new HashSet<string>();
    var reg = new Regex(regex, Options);

    for (var match = reg.Match(html); match.Success; match = match.NextMatch())
    {
        var theValue = match.Groups[groupName].Value;
        if (seen.Add(theValue))
        {
            list.Add(theValue);
        }
    }

    return list;
}
|
|
|
|
/// <summary>
/// Removes every script element (opening tag through closing tag) from
/// <paramref name="html"/> by replacing it with an empty string.
/// </summary>
/// <param name="html">Markup to clean.</param>
/// <returns>The result of Replace for the script pattern.</returns>
public static string RemoveScripts(string html)
    => Replace("<script[^><]*>.*?<\\/script>", html, string.Empty);
|
|
|
|
/// <summary>
/// Removes every img tag from <paramref name="html"/> by replacing it with an
/// empty string.
/// </summary>
/// <param name="html">Markup to clean.</param>
/// <returns>The result of Replace for the img pattern.</returns>
public static string RemoveImages(string html)
    => Replace("<img[^><]*>", html, string.Empty);
|
|
}
|
|
}
|