// HTMLUtility.cs
  1. using HtmlAgilityPack;
  2. using System;
  3. using System.IO;
  4. namespace PX.Data
  5. {
  6. //Source: https://github.com/ceee/ReadSharp/blob/master/ReadSharp/HtmlUtilities.cs
  7. public class HtmlUtilities
  8. {
  /// <summary>
  /// Converts HTML to plain text by parsing it and writing out only the text content
  /// (tags are stripped).
  /// </summary>
  /// <param name="html">The HTML markup to convert. May be <c>null</c>.</param>
  /// <returns>
  /// The plain-text rendering of <paramref name="html"/>, or an empty string when
  /// <paramref name="html"/> is <c>null</c>.
  /// </returns>
  public static string ConvertToPlainText(string html)
  {
      // The other helpers in this class tolerate null input; do the same here
      // instead of letting the parser reject a null argument.
      if (html == null)
      {
          return String.Empty;
      }

      HtmlDocument doc = new HtmlDocument();
      doc.LoadHtml(html);

      // Dispose the writer deterministically once the text has been collected.
      using (StringWriter sw = new StringWriter())
      {
          ConvertTo(doc.DocumentNode, sw);
          sw.Flush();
          return sw.ToString();
      }
  }
  /// <summary>
  /// Counts the words in a piece of plain text.
  /// The content has to be converted to plain text before (using ConvertToPlainText).
  /// </summary>
  /// <param name="plainText">The plain text. May be <c>null</c> or empty.</param>
  /// <returns>
  /// The number of whitespace-separated words, or 0 when <paramref name="plainText"/>
  /// is <c>null</c>, empty, or consists only of whitespace.
  /// </returns>
  public static int CountWords(string plainText)
  {
      if (String.IsNullOrEmpty(plainText))
      {
          return 0;
      }

      // A null separator array splits on every whitespace character (space, tab,
      // '\r', '\n', ...). Removing empty entries keeps runs of whitespace, CRLF
      // pairs and leading/trailing whitespace from being counted as words.
      return plainText.Split((char[])null, StringSplitOptions.RemoveEmptyEntries).Length;
  }
  /// <summary>
  /// Shortens a text to at most <paramref name="length"/> characters, marking the
  /// truncation with a trailing " ..." when there is room for it.
  /// </summary>
  /// <param name="text">The text to shorten. May be <c>null</c> or empty.</param>
  /// <param name="length">The maximum length of the returned text.</param>
  /// <returns>
  /// <paramref name="text"/> unchanged when it is <c>null</c>, empty, or not longer than
  /// <paramref name="length"/>; otherwise the first <c>length - 4</c> characters followed
  /// by " ...". When <paramref name="length"/> is smaller than the ellipsis itself the text
  /// is simply truncated to <paramref name="length"/> characters (a negative length is
  /// treated as 0).
  /// </returns>
  public static string Cut(string text, int length)
  {
      const string Ellipsis = " ...";

      if (String.IsNullOrEmpty(text) || text.Length <= length)
      {
          return text;
      }

      // Not enough room for the ellipsis: subtracting its length would produce a
      // negative Substring length and throw, so fall back to a hard truncation.
      if (length < Ellipsis.Length)
      {
          return text.Substring(0, Math.Max(length, 0));
      }

      return text.Substring(0, length - Ellipsis.Length) + Ellipsis;
  }
  /// <summary>
  /// Writes the plain-text rendering of every direct child of <paramref name="node"/>,
  /// in order, by handing each child to <c>ConvertTo</c>.
  /// </summary>
  /// <param name="node">The node whose children are converted.</param>
  /// <param name="outText">The writer receiving the text.</param>
  private static void ConvertContentTo(HtmlNode node, TextWriter outText)
  {
      HtmlNodeCollection children = node.ChildNodes;
      for (int index = 0; index < children.Count; index++)
      {
          ConvertTo(children[index], outText);
      }
  }
  /// <summary>
  /// Writes the plain-text rendering of a single node, recursing into its children.
  /// Comment nodes produce no output; text inside <c>script</c> or <c>style</c> is skipped;
  /// <c>p</c> and <c>br</c> elements emit a CRLF before their content.
  /// </summary>
  /// <param name="node">The node to convert.</param>
  /// <param name="outText">The writer receiving the text.</param>
  private static void ConvertTo(HtmlNode node, TextWriter outText)
  {
      HtmlNodeType kind = node.NodeType;

      if (kind == HtmlNodeType.Document)
      {
          ConvertContentTo(node, outText);
          return;
      }

      if (kind == HtmlNodeType.Text)
      {
          // Script and style bodies must not be output.
          string owner = node.ParentNode.Name;
          if (owner == "script" || owner == "style")
          {
              return;
          }

          string raw = ((HtmlTextNode)node).Text;

          // Skip a special closing node that was output as text.
          if (HtmlNode.IsOverlappedClosingElement(raw))
          {
              return;
          }

          // Only emit text that is more than a run of whitespace.
          if (raw.Trim().Length > 0)
          {
              outText.Write(HtmlEntity.DeEntitize(raw));
          }

          return;
      }

      if (kind == HtmlNodeType.Element)
      {
          // Paragraphs and explicit line breaks both become a CRLF.
          string tag = node.Name;
          if (tag == "p" || tag == "br")
          {
              outText.Write("\r\n");
          }

          if (node.HasChildNodes)
          {
              ConvertContentTo(node, outText);
          }
      }

      // Comment nodes (and anything else) are intentionally not output.
  }
  93. }
  94. }