go.mod
module main
go 1.22
require golang.org/x/net v0.25.0 // indirect
main.go
package main
import (
"bufio"
"fmt"
"io/ioutil"
"main/createMap"
"net/http"
"net/url"
"os"
"path/filepath"
"regexp"
"strings"
)
var resourceCache map[string]bool
var cacheFilePath = "resourceCache.txt"
var websitePath = "website"
func main() {
//生成txt地图
createMap.CreateMap("http://127.0.0.1:9090/")
//生成xml地图
createMap.CreateMapXml("http://127.0.0.1:9090/")
//静态化网站
createStatic("http://127.0.0.1:9090/sitemap.xml", false, "C:\\Users\\Administrator\\Desktop\\website", "resourceCache11.txt")
}
// 移除缓存文件的方法
func removeCacheFile() {
err := os.Remove(cacheFilePath)
if err != nil {
if os.IsNotExist(err) {
fmt.Println("缓存文件不存在:", cacheFilePath)
} else {
fmt.Println("移除缓存文件时出错:", err)
}
} else {
fmt.Println("缓存文件已移除:", cacheFilePath)
}
}
/*
mapUrl 网站地图文件xml格式
onlyPage 是否是单页【为true时,mapUrl为单连接】
setWebsitePath 生成的网站地址路径 默认website
setcacheFilePath 缓存下载记录文件名 默认resourceCache.txt
*/
func createStatic(mapUrl string, onlyPage bool, setWebsitePath string, setcacheFilePath string) {
cacheFilePath = setcacheFilePath
websitePath = setWebsitePath
//加载缓存文件
loadCache()
defer saveCache()
var urls []string
//单页
if onlyPage {
urls = append(urls, mapUrl)
} else {
//整站
resp, err := http.Get(mapUrl)
if err != nil {
fmt.Println("获取网站地图时出错:", err)
return
}
defer resp.Body.Close()
sitemapBytes, err := ioutil.ReadAll(resp.Body)
if err != nil {
fmt.Println("读取网站地图时出错:", err)
return
}
sitemap := string(sitemapBytes)
urls = extractURLsFromSitemap(sitemap)
}
for _, url := range urls {
staticizePage(url)
}
fmt.Println("网站静态化完成。")
}
func extractURLsFromSitemap(sitemap string) []string {
var urls []string
startTag := "<loc>"
endTag := "</loc>"
startIdx := strings.Index(sitemap, startTag)
for startIdx != -1 {
endIdx := strings.Index(sitemap[startIdx:], endTag)
if endIdx == -1 {
break
}
endIdx += startIdx
url := sitemap[startIdx+len(startTag) : endIdx]
urls = append(urls, url)
startIdx = strings.Index(sitemap[endIdx:], startTag)
if startIdx != -1 {
startIdx += endIdx
}
}
return urls
}
func staticizePage(urlStr string) {
resp, err := http.Get(urlStr)
if err != nil {
fmt.Println("获取页面", urlStr, "时出错:", err)
return
}
defer resp.Body.Close()
pageBytes, err := ioutil.ReadAll(resp.Body)
if err != nil {
fmt.Println("读取页面", urlStr, "时出错:", err)
return
}
u, err := url.Parse(urlStr)
if err != nil {
fmt.Println("解析URL时出错:", err)
return
}
path := filepath.Join(websitePath, strings.TrimPrefix(u.Path, "/"))
if !strings.HasSuffix(path, "/") {
path = filepath.Dir(path)
}
err = os.MkdirAll(path, os.ModePerm)
if err != nil {
fmt.Println("为", urlStr, "创建目录时出错:", err)
return
}
fileName := filepath.Base(u.Path)
filePath := filepath.Join(path, fileName)
err = ioutil.WriteFile(filePath, pageBytes, 0644)
if err != nil {
fmt.Println("为", urlStr, "写入静态文件时出错:", err)
return
}
fmt.Println("页面已静态化:", urlStr)
links := extractResourceLinks(pageBytes, u)
for _, link := range links {
saveResource(link, websitePath)
}
}
func extractResourceLinks(pageBytes []byte, baseURL *url.URL) []string {
var links []string
imgPattern := regexp.MustCompile(`(?i)src=["']([^"']+)["']`)
cssPattern := regexp.MustCompile(`(?i)href=["']([^"']+\.(css))["']`)
jsPattern := regexp.MustCompile(`(?i)src=["']([^"']+\.(js))["']`)
fontPattern := regexp.MustCompile(`(?i)url\(["']?([^"')]+)\.(woff2|woff|ttf|otf)["']?\)`)
bgPattern := regexp.MustCompile(`(?i)background(?:-image)?:\s*url\(["']?([^"')]+)["']?\)`)
imgMatches := imgPattern.FindAllSubmatch(pageBytes, -1)
for _, match := range imgMatches {
links = append(links, buildAbsoluteURL(string(match[1]), baseURL))
}
cssMatches := cssPattern.FindAllSubmatch(pageBytes, -1)
for _, match := range cssMatches {
links = append(links, buildAbsoluteURL(string(match[1]), baseURL))
}
jsMatches := jsPattern.FindAllSubmatch(pageBytes, -1)
for _, match := range jsMatches {
links = append(links, buildAbsoluteURL(string(match[1]), baseURL))
}
fontMatches := fontPattern.FindAllSubmatch(pageBytes, -1)
for _, match := range fontMatches {
links = append(links, buildAbsoluteURL(string(match[1]), baseURL))
}
bgMatches := bgPattern.FindAllSubmatch(pageBytes, -1)
for _, match := range bgMatches {
links = append(links, buildAbsoluteURL(string(match[1]), baseURL))
}
return links
}
func buildAbsoluteURL(link string, baseURL *url.URL) string {
u, err := url.Parse(link)
if err != nil {
fmt.Println("解析URL时出错:", err)
return ""
}
return baseURL.ResolveReference(u).String()
}
func saveResource(link, basePath string) {
if resourceCache[link] {
fmt.Println("资源已下载:", link)
return
}
resp, err := http.Get(link)
if err != nil {
fmt.Println("获取资源", link, "时出错:", err)
return
}
defer resp.Body.Close()
resourceBytes, err := ioutil.ReadAll(resp.Body)
if err != nil {
fmt.Println("读取资源", link, "时出错:", err)
return
}
u, err := url.Parse(link)
if err != nil {
fmt.Println("解析资源URL时出错:", err)
return
}
var filePath string
if strings.Contains(link, "uploadfile") {
uploadfile := strings.Split(link, "url=")
filePath = filepath.Join(basePath, uploadfile[1])
} else {
filePath = filepath.Join(basePath, u.Path)
}
dirPath := filepath.Dir(filePath)
err = os.MkdirAll(dirPath, os.ModePerm)
if err != nil {
fmt.Println("为资源", link, "创建目录时出错:", err)
return
}
err = ioutil.WriteFile(filePath, resourceBytes, 0644)
if err != nil {
fmt.Println("写入资源文件", filePath, "时出错:", err)
return
}
resourceCache[link] = true
appendCache(link)
fmt.Println("资源已保存:", link)
// 如果是CSS文件,提取并下载背景图资源
if strings.HasSuffix(link, ".css") {
fmt.Println("解析css背景文件")
extractAndSaveCSSResources(resourceBytes, u)
fmt.Println("解析css字体文件:")
extractAndSaveCSSResources(resourceBytes, u)
}
}
func extractAndSaveCSSResources(cssBytes []byte, baseURL *url.URL) {
//backackground
bgPattern := regexp.MustCompile(`(?i)url\(["']?([^"')]+)["']?\)`)
matches := bgPattern.FindAllSubmatch(cssBytes, -1)
for _, match := range matches {
resourceURL := buildAbsoluteURL(string(match[1]), baseURL)
saveResource(resourceURL, websitePath)
}
//fonts
fontPattern := regexp.MustCompile(`(?i)url\(["']?([^"')]+)\.(woff2|woff|ttf|otf)["']?\)`)
matchesFonts := fontPattern.FindAllSubmatch(cssBytes, -1)
for _, match := range matchesFonts {
fontURL := buildAbsoluteURL(string(match[1]), baseURL)
saveResource(fontURL, websitePath)
}
}
func loadCache() {
resourceCache = make(map[string]bool)
file, err := os.Open(cacheFilePath)
if err != nil {
if !os.IsNotExist(err) {
fmt.Println("读取缓存文件时出错:", err)
}
return
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
resourceCache[scanner.Text()] = true
}
if err := scanner.Err(); err != nil {
fmt.Println("扫描缓存文件时出错:", err)
}
}
func appendCache(link string) {
file, err := os.OpenFile(cacheFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
fmt.Println("打开缓存文件时出错:", err)
return
}
defer file.Close()
_, err = file.WriteString(link + "\n")
if err != nil {
fmt.Println("写入缓存文件时出错:", err)
}
}
func saveCache() {
file, err := os.Create(cacheFilePath)
if err != nil {
fmt.Println("创建缓存文件时出错:", err)
return
}
defer file.Close()
writer := bufio.NewWriter(file)
for link := range resourceCache {
_, err := writer.WriteString(link + "\n")
if err != nil {
fmt.Println("写入缓存文件时出错:", err)
return
}
}
writer.Flush()
}
createMap/createMap.go
package createMap
import (
"fmt"
"golang.org/x/net/html"
"net/http"
"net/url"
"os"
"strings"
)
func CreateMap(url string) {
// 指定要抓取的网站地址和最大深度
websiteURL := url
maxDepth := 1000
// 抓取网站所有链接
links := crawlSite(websiteURL, websiteURL, maxDepth)
// 生成网站地图文件
err := generateSitemap(links, "sitemap.txt")
if err != nil {
fmt.Println("Error generating sitemap:", err)
return
}
fmt.Println("Sitemap generated successfully.")
}
// 抓取网站所有链接
func crawlSite(currentURL, rootURL string, maxDepth int) []string {
var links []string
visited := make(map[string]bool)
crawl(currentURL, rootURL, 0, maxDepth, &links, visited)
return links
}
// 递归爬取网站链接
func crawl(currentURL, rootURL string, depth, maxDepth int, links *[]string, visited map[string]bool) {
// 检查深度是否超过最大深度
if depth > maxDepth {
return
}
// 发起 HTTP 请求获取网页内容
resp, err := http.Get(currentURL)
if err != nil {
fmt.Println("Error fetching page", currentURL, ":", err)
return
}
defer resp.Body.Close()
// 解析 HTML 内容,提取链接
tokens := html.NewTokenizer(resp.Body)
for {
tokenType := tokens.Next()
if tokenType == html.ErrorToken {
break
}
token := tokens.Token()
if tokenType == html.StartTagToken && token.Data == "a" {
for _, attr := range token.Attr {
if attr.Key == "href" {
// 解析链接,确保绝对路径
link, err := resolveURL(attr.Val, rootURL)
if err != nil {
continue
}
// 排除包含 "#" 的链接
if !strings.Contains(link, "#") {
// 检查链接是否已经访问过且是否为同一域名下的链接
if !visited[link] && isSameDomain(link, rootURL) {
*links = append(*links, link)
visited[link] = true
// 递归爬取链接
crawl(link, rootURL, depth+1, maxDepth, links, visited)
}
}
}
}
}
}
}
// 解析链接,确保绝对路径
func resolveURL(link, rootURL string) (string, error) {
parsedURL, err := url.Parse(link)
if err != nil {
return "", err
}
baseURL, err := url.Parse(rootURL)
if err != nil {
return "", err
}
resolvedURL := baseURL.ResolveReference(parsedURL)
return resolvedURL.String(), nil
}
// 判断链接是否为同一域名下的链接
func isSameDomain(link, rootURL string) bool {
parsedLink, err := url.Parse(link)
if err != nil {
return false
}
parsedRootURL, err := url.Parse(rootURL)
if err != nil {
return false
}
return parsedLink.Host == parsedRootURL.Host
}
// 生成网站地图文件
func generateSitemap(links []string, filePath string) error {
file, err := os.Create(filePath)
if err != nil {
return err
}
defer file.Close()
for _, link := range links {
// 写入地图文件
_, err := fmt.Fprintln(file, link)
if err != nil {
return err
}
}
return nil
}
createMap/createMapXml.go
package createMap
import (
"encoding/xml"
"fmt"
"golang.org/x/net/html"
"io/ioutil"
"net/http"
"net/url"
"strings"
"time"
)
// URLSet 结构用于表示网站地图中的 urlset
type URLSet struct {
XMLName xml.Name `xml:"urlset"`
Xmlns string `xml:"xmlns,attr"`
URLs []URL `xml:"url"`
}
// URL 结构用于表示网站地图中的 url
type URL struct {
Loc string `xml:"loc"`
Lastmod string `xml:"lastmod"`
Changefreq string `xml:"changefreq"`
Priority string `xml:"priority"`
}
func CreateMapXml(url string) {
// 指定要抓取的网站地址和最大深度
websiteURL := url
maxDepth := 1000
// 抓取网站所有链接
links := crawlXmlSiteXml(websiteURL, websiteURL, maxDepth)
// 生成网站地图文件
err := generateSitemapXml(links, "sitemap.xml")
if err != nil {
fmt.Println("Error generating sitemap:", err)
return
}
fmt.Println("Sitemap generated successfully.")
}
// 抓取网站所有链接
func crawlXmlSiteXml(currentURL, rootURL string, maxDepth int) []string {
var links []string
visited := make(map[string]bool)
crawlXml(currentURL, rootURL, 0, maxDepth, &links, visited)
return links
}
// 递归爬取网站链接
func crawlXml(currentURL, rootURL string, depth, maxDepth int, links *[]string, visited map[string]bool) {
// 检查深度是否超过最大深度
if depth > maxDepth {
return
}
// 发起 HTTP 请求获取网页内容
resp, err := http.Get(currentURL)
if err != nil {
fmt.Println("Error fetching page", currentURL, ":", err)
return
}
defer resp.Body.Close()
// 解析 HTML 内容,提取链接
tokens := html.NewTokenizer(resp.Body)
for {
tokenType := tokens.Next()
if tokenType == html.ErrorToken {
break
}
token := tokens.Token()
if tokenType == html.StartTagToken && token.Data == "a" {
for _, attr := range token.Attr {
if attr.Key == "href" {
// 解析链接,确保绝对路径
link, err := resolveURLXml(attr.Val, rootURL)
if err != nil {
continue
}
// 排除包含 "#" 的链接
if !strings.Contains(link, "#") {
// 检查链接是否已经访问过且是否为同一域名下的链接
if !visited[link] && isSameDomainXml(link, rootURL) {
*links = append(*links, link)
visited[link] = true
// 递归爬取链接
crawlXml(link, rootURL, depth+1, maxDepth, links, visited)
}
}
}
}
}
}
}
// 解析链接,确保绝对路径
func resolveURLXml(link, rootURL string) (string, error) {
parsedURL, err := url.Parse(link)
if err != nil {
return "", err
}
baseURL, err := url.Parse(rootURL)
if err != nil {
return "", err
}
resolvedURL := baseURL.ResolveReference(parsedURL)
return resolvedURL.String(), nil
}
// 判断链接是否为同一域名下的链接
func isSameDomainXml(link, rootURL string) bool {
parsedLink, err := url.Parse(link)
if err != nil {
return false
}
parsedRootURL, err := url.Parse(rootURL)
if err != nil {
return false
}
return parsedLink.Host == parsedRootURL.Host
}
// 生成网站地图文件
func generateSitemapXml(links []string, filePath string) error {
var urls []URL
for _, link := range links {
urls = append(urls, URL{
Loc: link,
Lastmod: time.Now().Format("2006-01-02"),
Changefreq: "always",
Priority: "1.0",
})
}
urlSet := URLSet{
Xmlns: "http://www.sitemaps.org/schemas/sitemap/0.9",
URLs: urls,
}
xmlData, err := xml.MarshalIndent(urlSet, "", " ")
if err != nil {
return err
}
err = ioutil.WriteFile(filePath, xmlData, 0644)
if err != nil {
return err
}
return nil
}
资源文件url网站地图-静态化网站