C#获取HTML源码
2024年03月23日记录
以前的那个从网上找到的方法, 在一些网站上用不了,如17K,取出来的是乱码,要么就是一坨JS,好像是用JS又重新加载了什么的
cs
using System;
using System.Collections.Generic;
using System.Web;
using System.Net;
using System.IO;
using System.Text;
using System.Net.Security;
using System.Security.Authentication;
using System.Security.Cryptography.X509Certificates;
namespace Niunan.XiaoShuo.Util
{
/// <summary>
/// http连接基础类,负责底层的http通信
/// </summary>
public class HttpService
{
public static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
{
//直接确认,否则打不开
return true;
}
/// <summary>
/// post提交
/// </summary>
/// <param name="xml"></param>
/// <param name="url"></param>
/// <param name="isUseCert"></param>
/// <param name="timeout"></param>
/// <param name="contenttype">如:application/x-www-form-urlencoded,text/xml</param>
/// <param name="Authorization">为空的时候就不用加,用于容联云通讯</param>
/// <returns></returns>
public static string Post(string xml, string url, bool isUseCert, int timeout,string contenttype = "application/x-www-form-urlencoded",string Authorization="")
{
System.GC.Collect();//垃圾回收,回收没有正常关闭的http连接
string result = "";//返回结果
HttpWebRequest request = null;
HttpWebResponse response = null;
Stream reqStream = null;
try
{
//设置最大连接数
ServicePointManager.DefaultConnectionLimit = 200;
//设置https验证方式
if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
{
ServicePointManager.ServerCertificateValidationCallback =
new RemoteCertificateValidationCallback(CheckValidationResult);
}
/***************************************************************
* 下面设置HttpWebRequest的相关属性
* ************************************************************/
request = (HttpWebRequest)WebRequest.Create(url);
request.Method = "POST";
request.Timeout = timeout * 1000;
if (!string.IsNullOrEmpty(Authorization))
{
request.Headers.Add(HttpRequestHeader.Authorization, Authorization);
}
//设置代理服务器
//WebProxy proxy = new WebProxy(); //定义一个网关对象
//proxy.Address = new Uri(WxPayConfig.PROXY_URL); //网关服务器端口:端口
//request.Proxy = proxy;
//设置POST的数据类型和长度
request.ContentType =contenttype;
byte[] data = System.Text.Encoding.UTF8.GetBytes(xml);
request.ContentLength = data.Length;
//是否使用证书
if (isUseCert)
{
//复制微信DEMO的,这里不用证书
//string path = HttpContext.Current.Request.PhysicalApplicationPath;
//X509Certificate2 cert = new X509Certificate2(path + WxPayConfig.SSLCERT_PATH, WxPayConfig.SSLCERT_PASSWORD);
//request.ClientCertificates.Add(cert);
//Log.Debug("WxPayApi", "PostXml used cert");
}
//往服务器写入数据
reqStream = request.GetRequestStream();
reqStream.Write(data, 0, data.Length);
reqStream.Close();
//获取服务端返回
response = (HttpWebResponse)request.GetResponse();
//获取服务端返回数据
StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.UTF8);
result = sr.ReadToEnd().Trim();
sr.Close();
}
catch (Exception e)
{
// Log.Error("HttpService", e.ToString());
throw e;
}
finally
{
//关闭连接和流
if (response != null)
{
response.Close();
}
if(request != null)
{
request.Abort();
}
}
return result;
}
/// <summary>
/// 处理http GET请求,返回数据
/// </summary>
/// <param name="url">请求的url地址</param>
/// <returns>http GET成功后返回的数据,失败抛WebException异常</returns>
public static string Get(string url)
{
System.GC.Collect();
string result = "";
HttpWebRequest request = null;
HttpWebResponse response = null;
//请求url以获取数据
try
{
//设置最大连接数
ServicePointManager.DefaultConnectionLimit = 200;
//设置https验证方式
if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
{
ServicePointManager.ServerCertificateValidationCallback =
new RemoteCertificateValidationCallback(CheckValidationResult);
}
/***************************************************************
* 下面设置HttpWebRequest的相关属性
* ************************************************************/
request = (HttpWebRequest)WebRequest.Create(url);
request.Method = "GET";
//设置代理
//WebProxy proxy = new WebProxy();
//proxy.Address = new Uri(WxPayConfig.PROXY_URL);
//request.Proxy = proxy;
//获取服务器返回
response = (HttpWebResponse)request.GetResponse();
//获取HTTP返回数据
StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.UTF8);
result = sr.ReadToEnd().Trim();
sr.Close();
}
catch (Exception e)
{
throw e;
}
finally
{
//关闭连接和流
if (response != null)
{
response.Close();
}
if (request != null)
{
request.Abort();
}
}
return result;
}
}
}
弄了一上午,到处问人到处查,发现下面的代码可以用于17K网站,
cs
var handler = new HttpClientHandler()
{
AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate,
UseCookies=false,
};
var httpClient = new HttpClient(handler);
var requestMessage = new HttpRequestMessage(HttpMethod.Get, url);
requestMessage.Headers.Add("Accept-encoding", "gzip, deflate, br, zstd");
var message = await httpClient.SendAsync(requestMessage);
var content = await message.Content.ReadAsStringAsync();
//后来发现这段代码前几次可以抓取到,然后又抓不到了。。只能用下面的模拟浏览器打开网页抓取源代码了
后来又来了个更狠的,用PuppeteerSharp, 相当于用代码来控制让系统中的chrome浏览器打开一个网页,然后再来获取这个网页的源代码
cs
using PuppeteerSharp; //nuget引入一下
namespace ConsoleApp2
{
internal class Program
{
static async Task Main(string[] args)
{
await new BrowserFetcher().DownloadAsync(BrowserTag.Stable); //自动下载他提供的无头浏览器,不用这一行就得在下面指定本地的浏览器
var browser = await Puppeteer.LaunchAsync(new LaunchOptions
{
//ExecutablePath= "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
Headless = true
});
var page = await browser.NewPageAsync();
await page.GoToAsync("https://www.17k.com/book/554720.html");
await page.WaitForTimeoutAsync(2000);
string html = await page.GetContentAsync();
Console.WriteLine(html);
await browser.CloseAsync();
}
}
}
然后还有一个playwright的也能实现操作浏览器打开网页的功能,用于自动化测试的,以前有记录过这个名字,不过一直没有时间看。。。主要是"懒"。。。。。