C# 提取html网页内文字

目前已通过 System.IO.Stream respStream 把 HTML 网页读取为 string，想要提取网页中各个标签内的纯文本部分，并循环执行这个操作以遍历整个网页。例如，从 <div class="info-panel"><h2><a href="/chengjiao/BJSJ88768797.html" target="_blank">苹果园小区二区 2室1厅 82平米</a></h2> 中提取其中的中文文字部分，然后对网页中的其余标签重复这个过程。

举报该文章

相关建议 2015-05-11

/// <summary>
        /// Strips HTML markup from <paramref name="Htmlstring"/> and returns the remaining plain text.
        /// </summary>
        /// <remarks>
        /// This is a regex-based, best-effort cleaner, not an HTML parser. Steps, in order:
        /// script and style elements are removed together with their content; complete
        /// comments and then all remaining tags are removed; every line break that is
        /// followed by whitespace is deleted together with that whitespace; leftover comment
        /// delimiters are cleaned up; finally a fixed set of character entities
        /// (quot, amp, lt, gt, nbsp, iexcl, cent, pound, copy and their decimal forms) is
        /// decoded in a single pass, and every other decimal numeric entity is dropped.
        /// </remarks>
        /// <param name="Htmlstring">The HTML text to clean. May be null or empty.</param>
        /// <returns>
        /// The text with markup removed, or an empty string when the input is null or empty.
        /// </returns>
        protected string DelHTML(string Htmlstring)
        {
            // Regex.Replace throws on null input; treat "nothing in" as "nothing out".
            if (string.IsNullOrEmpty(Htmlstring))
            {
                return string.Empty;
            }

            // CultureInvariant keeps case-insensitive matching stable under cultures with
            // special casing rules (e.g. Turkish I/i), where "<SCRIPT>" could otherwise be missed.
            var ignoreCase = System.Text.RegularExpressions.RegexOptions.IgnoreCase
                | System.Text.RegularExpressions.RegexOptions.CultureInvariant;

            // Singleline lets '.' match line breaks, so elements spanning several lines are
            // removed as a whole instead of leaking their content into the result.
            var acrossLines = ignoreCase | System.Text.RegularExpressions.RegexOptions.Singleline;

            // Remove scripts and style sheets including their content (never visible text).
            Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", acrossLines);
            Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"<style[^>]*?>.*?</style>", "", acrossLines);

            // Remove complete comments before stripping tags; otherwise a '>' inside a
            // comment ends the tag match early and the rest of the comment survives as text.
            Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"<!--.*?-->", "", acrossLines);

            // Remove every remaining tag.
            Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", ignoreCase);

            // Delete each line break that is followed by whitespace, together with that whitespace.
            Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", ignoreCase);

            // Clean up what is left of malformed comments: a stray closing delimiter, and an
            // unclosed opening delimiter together with the rest of its line.
            Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"-->", "", ignoreCase);
            Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"<!--.*", "", ignoreCase);

            // Decode entities in ONE pass. Decoding them one after another (with "&amp;"
            // first) turned "&amp;lt;" into "&lt;" and then into "<", i.e. decoded it twice.
            Htmlstring = System.Text.RegularExpressions.Regex.Replace(
                Htmlstring,
                @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
                entity =>
                {
                    string name = entity.Groups[1].Value.ToLowerInvariant();
                    switch (name)
                    {
                        case "quot":
                        case "#34":
                            return "\"";
                        case "amp":
                        case "#38":
                            return "&";
                        case "lt":
                        case "#60":
                            return "<";
                        case "gt":
                        case "#62":
                            return ">";
                        case "nbsp":
                        case "#160":
                            return " ";
                        case "iexcl":
                        case "#161":
                            return "\u00A1";
                        case "cent":
                        case "#162":
                            return "\u00A2";
                        case "pound":
                        case "#163":
                            return "\u00A3";
                        case "copy":
                        case "#169":
                            return "\u00A9";
                        default:
                            // Any other decimal numeric entity is dropped; anything else is
                            // left exactly as it was found.
                            return name.StartsWith("#", System.StringComparison.Ordinal)
                                ? string.Empty
                                : entity.Value;
                    }
                },
                ignoreCase);

            // NOTE: earlier revisions ended with Htmlstring.Replace("<", ""), Replace(">", "")
            // and Replace("\r\n", "") whose results were discarded (strings are immutable),
            // so they never had any effect. They are intentionally not "fixed" by assignment:
            // doing so would delete the '<' and '>' characters just decoded from &lt; / &gt;.

            return Htmlstring;

        }

温馨提示：内容为网友见解，仅供参考

当前网址：https://aolonic.com/aa/adgg4da3141wnwgga54.html

其他看法

无其他回答

C# 正则表达式提取html中的文本
static void Main(string[] args){ String s = @"<Body> <div>这里是要取出的文本A <img src=""/>这里是要取出的文本B <a href="">超链接里的文本不取出 </a>这里是要取出的文本C </div> <body>";Regex regex = new Regex( "(/?\\w+)[^>]*>([^<]*)<", RegexOptions.Ign...

c#正则提取html源码内的信息
第一种，修改你的，因为</strong>后有回车换行空格等空白字符，需要\s 正则表达式：<strong>中文名：</strong>\s*<span>([\S\s]*?)</span> 第二种，直接匹配出名字正则表达式：(?<=中文名：</strong>\s*<span>)[^<]+

C# 解析Html网页里 table 里所有内容
您要排除唯一性，就是说class中的skcx_B只在这里有，如果其他地方有，可以使用这种方法获取 <xsl:value-of select="//div[@class='skcx_B'][1]"/>这个[1]是指<div class="skcx_B"></div>处在第几个位置，记住：这个是从1开始的，并不是从0开始的。

c# 怎样截取HTML标签<BR><BR>中间的文字
string Htmlstring = "<p style=\"margin-top: 0pt; margin-bottom: 0pt\">哈尔滨阵雨转多云西南风2-3级转3-4级 22/28 <BR>加格达奇雷阵雨转阵雨偏南风2-3级 15/24 <BR>黑河雷阵雨偏南风3-4级...

C#如何使用正则表达式提取超链接中的文字部分?就是<a>文字部分<\/a>中 ...
string html = 要匹配的字符串;Regex reg = new Regex(@"<a\s*[^>]*>([\s\S]+?)</a>", RegexOptions.IgnoreCase);Match m = reg.Match(html);while(m.Success){ string innerHTML = m.Result("$1");// 得到正则的括号里的内容，就是a的innerHTML innerHTML = Regex.Replace(...

C# 用html agility pack怎么获取网页上的纯文本
Main(string[] args) { var web=new HtmlWeb(); var doc=web.Load("网址http://开头"); Console.WriteLine(doc.DocumentNode.InnerText);//输出网页的全部文本 } }

c# 利用正则表达式提取html中数据
Match m = Regex.Match(html, "<strong style=\"font-size: 14px\">(?<CompanyName>.*?)</strong>", ...);if (m.Success){ string companyName = m.Groups["CompanyName"].Value;}

C# 提取一段html中特定标签中的内容??
string temp="<a>ggg</a>";int start=temp.IndexOf(">"); //>最先出现的位置 int over=temp.LastIndexOf("<"); //<最后出现的位置 string str=temp.Substring(start+1, over-start-1); //截取从start+1开始,到 //over-1结束的一段子串（Substring的第二个参数是长度） //字串第一个字符位置为0 ...

...http://newgame.17173.com/testing-list.html 上表格里面的信息_百 ...
推荐使用HTML Agility Pack来进行解析。比如 //ul[@id='testInfosList']/li 可以解析出所有行；使用 //ul[@id='testInfosList']/li[1]/span[2] 可以解析出第一行第二列，以此类推可以解析出所有行及列。也可以先解析出所有行，然后再逐一解析列。

相似回答

大家正在搜