浏览代码

First Commit

Kitt Parker 4 月之前
当前提交
4bc2f1838c
共有 3 个文件被更改,包括 110 次插入0 次删除
  1. 0 0
      HTMLTextAttribute.cs
  2. 110 0
      HTMLUtility.cs
  3. 0 0
      README.md

+ 0 - 0
HTMLTextAttribute.cs


+ 110 - 0
HTMLUtility.cs

@@ -0,0 +1,110 @@
+using HtmlAgilityPack;
+using System;
+using System.IO;
+
+namespace PX.Data
+{
+  //Source: https://github.com/ceee/ReadSharp/blob/master/ReadSharp/HtmlUtilities.cs
+  public class HtmlUtilities
+  {
+    /// <summary>
+    /// Converts HTML to plain text by parsing the markup and writing out only its text content.
+    /// </summary>
+    /// <param name="html">The HTML markup to convert; may be null or empty.</param>
+    /// <returns>
+    /// The text produced for the parsed document, or an empty string when
+    /// <paramref name="html"/> is null or empty.
+    /// </returns>
+    public static string ConvertToPlainText(string html)
+    {
+      // HtmlDocument.LoadHtml throws on a null argument, so answer trivially
+      // for null/empty input instead of surfacing an exception to the caller.
+      if (String.IsNullOrEmpty(html))
+      {
+        return String.Empty;
+      }
+
+      HtmlDocument doc = new HtmlDocument();
+      doc.LoadHtml(html);
+
+      // StringWriter is IDisposable; scope it so it is released even if the conversion throws.
+      using (StringWriter sw = new StringWriter())
+      {
+        ConvertTo(doc.DocumentNode, sw);
+        sw.Flush();
+        return sw.ToString();
+      }
+    }
+
+    /// <summary>
+    /// Counts the words in a piece of plain text.
+    /// The content has to be converted to plain text before (using ConvertToPlainText).
+    /// </summary>
+    /// <param name="plainText">The plain text; may be null or empty.</param>
+    /// <returns>
+    /// The number of whitespace-separated, non-empty tokens in <paramref name="plainText"/>,
+    /// or 0 when it is null, empty or consists only of whitespace.
+    /// </returns>
+    public static int CountWords(string plainText)
+    {
+      if (String.IsNullOrEmpty(plainText))
+      {
+        return 0;
+      }
+
+      // A null separator array makes Split break on any whitespace character, and
+      // RemoveEmptyEntries keeps runs of separators (double spaces, "\r\n", leading or
+      // trailing whitespace) from being counted as extra words.
+      return plainText.Split((char[])null, StringSplitOptions.RemoveEmptyEntries).Length;
+    }
+
+
+    /// <summary>
+    /// Shortens text to at most <paramref name="length"/> characters, ending it with " ..."
+    /// when it had to be cut and the marker fits.
+    /// </summary>
+    /// <param name="text">The text to shorten; null or empty is returned unchanged.</param>
+    /// <param name="length">The maximum number of characters in the result.</param>
+    /// <returns>
+    /// <paramref name="text"/> itself when it is null, empty or not longer than
+    /// <paramref name="length"/>; otherwise the first <c>length - 4</c> characters followed
+    /// by " ...". When <paramref name="length"/> is below 4 the marker cannot fit, so the
+    /// text is simply truncated to <paramref name="length"/> characters (negative values
+    /// are treated as 0).
+    /// </returns>
+    public static string Cut(string text, int length)
+    {
+      const string ellipsis = " ...";
+
+      if (String.IsNullOrEmpty(text) || text.Length <= length)
+      {
+        return text;
+      }
+
+      // Without this guard, Substring would receive a negative count and throw
+      // ArgumentOutOfRangeException for any length smaller than the marker.
+      if (length < ellipsis.Length)
+      {
+        return text.Substring(0, Math.Max(length, 0));
+      }
+
+      return text.Substring(0, length - ellipsis.Length) + ellipsis;
+    }
+
+
+    /// <summary>
+    /// Writes the plain-text form of each child of <paramref name="node"/> by passing
+    /// every child, in collection order, to <c>ConvertTo</c>.
+    /// </summary>
+    /// <param name="node">The node whose children are converted.</param>
+    /// <param name="outText">The writer that receives the text.</param>
+    private static void ConvertContentTo(HtmlNode node, TextWriter outText)
+    {
+      HtmlNodeCollection children = node.ChildNodes;
+      for (int index = 0; index < children.Count; index++)
+      {
+        ConvertTo(children[index], outText);
+      }
+    }
+
+
+    /// <summary>
+    /// Writes the plain-text form of a single node to <paramref name="outText"/>.
+    /// Document nodes delegate to their children, text nodes are written de-entitized
+    /// (unless they sit inside script/style or are whitespace only), "p" and "br"
+    /// elements emit a CRLF before their children, and every other node type
+    /// (including comments) writes nothing itself.
+    /// </summary>
+    /// <param name="node">The node to convert.</param>
+    /// <param name="outText">The writer that receives the text.</param>
+    private static void ConvertTo(HtmlNode node, TextWriter outText)
+    {
+      HtmlNodeType kind = node.NodeType;
+
+      if (kind == HtmlNodeType.Document)
+      {
+        ConvertContentTo(node, outText);
+        return;
+      }
+
+      if (kind == HtmlNodeType.Text)
+      {
+        // Text that belongs to a script or style element is never emitted.
+        string owner = node.ParentNode.Name;
+        if (owner == "script" || owner == "style")
+        {
+          return;
+        }
+
+        string raw = ((HtmlTextNode)node).Text;
+
+        // Skip text that is really a special closing element surfaced as a text node.
+        if (HtmlNode.IsOverlappedClosingElement(raw))
+        {
+          return;
+        }
+
+        // Only emit text that contains something other than whitespace.
+        if (raw.Trim().Length > 0)
+        {
+          outText.Write(HtmlEntity.DeEntitize(raw));
+        }
+        return;
+      }
+
+      if (kind == HtmlNodeType.Element)
+      {
+        // Paragraphs and line breaks both start a new line in the output.
+        string tag = node.Name;
+        if (tag == "p" || tag == "br")
+        {
+          outText.Write("\r\n");
+        }
+
+        if (node.HasChildNodes)
+        {
+          ConvertContentTo(node, outText);
+        }
+      }
+
+      // Comments (and any other node type) produce no output.
+    }
+  }
+}

+ 0 - 0
README.md