| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
- using HtmlAgilityPack;
- using System;
- using System.IO;
- namespace PX.Data
- {
- //Source: https://github.com/ceee/ReadSharp/blob/master/ReadSharp/HtmlUtilities.cs
- public class HtmlUtilities
- {
- /// <summary>
- /// Converts HTML to plain text / strips tags.
- /// </summary>
- /// <param name="html">The HTML.</param>
- /// <returns></returns>
- public static string ConvertToPlainText(string html)
- {
- HtmlDocument doc = new HtmlDocument();
- doc.LoadHtml(html);
- StringWriter sw = new StringWriter();
- ConvertTo(doc.DocumentNode, sw);
- sw.Flush();
- return sw.ToString();
- }
- /// <summary>
- /// Count the words.
- /// The content has to be converted to plain text before (using ConvertToPlainText).
- /// </summary>
- /// <param name="plainText">The plain text.</param>
- /// <returns></returns>
- public static int CountWords(string plainText)
- {
- return !String.IsNullOrEmpty(plainText) ? plainText.Split(' ', '\n').Length : 0;
- }
- public static string Cut(string text, int length)
- {
- if (!String.IsNullOrEmpty(text) && text.Length > length)
- {
- text = text.Substring(0, length - 4) + " ...";
- }
- return text;
- }
- private static void ConvertContentTo(HtmlNode node, TextWriter outText)
- {
- foreach (HtmlNode subnode in node.ChildNodes)
- {
- ConvertTo(subnode, outText);
- }
- }
- private static void ConvertTo(HtmlNode node, TextWriter outText)
- {
- string html;
- switch (node.NodeType)
- {
- case HtmlNodeType.Comment:
- // don't output comments
- break;
- case HtmlNodeType.Document:
- ConvertContentTo(node, outText);
- break;
- case HtmlNodeType.Text:
- // script and style must not be output
- string parentName = node.ParentNode.Name;
- if ((parentName == "script") || (parentName == "style"))
- break;
- // get text
- html = ((HtmlTextNode)node).Text;
- // is it in fact a special closing node output as text?
- if (HtmlNode.IsOverlappedClosingElement(html))
- break;
- // check the text is meaningful and not a bunch of whitespaces
- if (html.Trim().Length > 0)
- {
- outText.Write(HtmlEntity.DeEntitize(html));
- }
- break;
- case HtmlNodeType.Element:
- switch (node.Name)
- {
- case "p":
- // treat paragraphs as crlf
- outText.Write("\r\n");
- break;
- case "br":
- outText.Write("\r\n");
- break;
- }
- if (node.HasChildNodes)
- {
- ConvertContentTo(node, outText);
- }
- break;
- }
- }
- }
- }
|