前两天朋友让我模仿一个网站。刚开始时,我一个页面一个页面地查看源码并复制保存,花了很多时间,一个字:“累”。为了减轻工作量,我写了一个网站“克隆工具”,可以一键克隆;比起人工操作,效率提高了200%以上,精确度也大大提高。下面我将这个“网站克隆工具”的实现方法分享给大家。

一睹为快,先看看界面:

开发工具:vs2012(winform)

1.新建UrlModel模型

/// <summary>
/// Holds the components of a parsed URL, as produced by <c>UrlParser.Parse</c>.
/// </summary>
public class UrlModel
{
    /// <summary>Path portion relative to the site root, e.g. "/a/b.html".</summary>
    public string RelatedPath { get; set; }

    /// <summary>The full absolute URL of the page or resource.</summary>
    public string AbsoluteUri { get; set; }

    /// <summary>Absolute URL of the directory containing the resource (no trailing slash).</summary>
    public string CurrPath { get; set; }

    /// <summary>Site root, e.g. "http://example.com"; the port is appended when it is not 80.</summary>
    public string RootPath { get; set; }

    /// <summary>Host name without the port.</summary>
    public string Host { get; set; }

    /// <summary>TCP port; the parser defaults this to 80 when the URL carries none.</summary>
    public int Port { get; set; }

    /// <summary>URL scheme: "http" or "https".</summary>
    public string Scheme { get; set; }
}

2.新建UrlParser解析器

/// <summary>
/// Parses an http/https URL string into a <see cref="UrlModel"/>.
/// </summary>
public class UrlParser
{
    // Cached and reused: the original re-ran IsMatch/Match three times per call.
    private static readonly Regex UrlRegex =
        new Regex("(?<scheme>(http|https))://(?<host>.+?)/", RegexOptions.Singleline);

    /// <summary>
    /// Splits <paramref name="url"/> into scheme, host, port and path components.
    /// </summary>
    /// <param name="url">Absolute http or https URL.</param>
    /// <returns>A populated <see cref="UrlModel"/>.</returns>
    /// <exception cref="Exception">
    /// Thrown when the URL is shorter than 8 characters, does not start with
    /// "http:"/"https:", or cannot be matched by the parser.
    /// </exception>
    public static UrlModel Parse(string url)
    {
        UrlModel model = new UrlModel();

        if (url.Length < 8)
            throw new Exception("url参数不正确");
        else if (!url.ToLower().StartsWith("http:") && !url.ToLower().StartsWith("https:"))
            throw new Exception("url格式有误");

        // Ensure there is a '/' after the authority ("http://host" -> "http://host/")
        // so the regex and the IndexOf('/', 8) below always succeed.
        if (url.LastIndexOf('/') < 8)
            url = url + "/";

        Match match = UrlRegex.Match(url);
        if (!match.Success)
            throw new Exception("url解析失败!");

        string scheme = match.Groups["scheme"].Value;
        string host = match.Groups["host"].Value;

        if (host.Contains(":"))
        {
            // Host carries an explicit port, e.g. "example.com:8080".
            var parts = host.Split(':');
            if (parts.Length == 2)
            {
                model.Host = parts[0];
                model.Port = int.Parse(parts[1]);
            }
        }
        else
        {
            model.Host = host;
            // NOTE(review): https URLs also default to port 80 here; confirm 443
            // is not expected by callers before changing.
            model.Port = 80;
        }

        int index = url.IndexOf('/', 8); // first '/' after the scheme+authority prefix
        model.RelatedPath = url.Substring(index);
        model.AbsoluteUri = url;
        model.Scheme = scheme;
        model.CurrPath = url.Substring(0, url.LastIndexOf("/"));

        if (80 == model.Port)
        {
            model.RootPath = string.Format("{0}://{1}", model.Scheme, model.Host);
        }
        else
        {
            // BUGFIX: the original format string was "{0}://{1}:{2" (missing the
            // closing brace on the last item), which threw FormatException for
            // every URL with a non-80 port.
            model.RootPath = string.Format("{0}://{1}:{2}", model.Scheme, model.Host, model.Port);
        }

        return model;
    }
}

3.网页处理服务工具

/// <summary>
/// Helpers for extracting same-site link (href) and resource (src / css url())
/// references from a page's HTML source.
/// </summary>
public class WebPageService
{
    // Prefixes that mark a link as external or non-downloadable: other hosts,
    // protocol-relative links, fragments, scripts, query-only links, tel/mailto.
    private static string[] excludekeys = { "http:", "https:", "//", "#", "javascript:", "?", "tel:", "mailto:" };

    /// <summary>
    /// Returns the href links in <paramref name="html"/> that point inside the
    /// site itself; off-site links are discarded.
    /// </summary>
    /// <param name="url">Absolute URL of the page the HTML came from.</param>
    /// <param name="html">Page HTML source.</param>
    public static List<UrlModel> GetLocalHrefs(string url, string html)
    {
        if (string.IsNullOrEmpty(html))
            return new List<UrlModel>();
        return FilterLocal(GetHrefs(url, html));
    }

    /// <summary>
    /// Returns the src / url() resource references in <paramref name="html"/>
    /// that point inside the site itself; off-site references are discarded.
    /// </summary>
    /// <param name="url">Absolute URL of the page the HTML came from.</param>
    /// <param name="html">Page HTML source.</param>
    public static List<UrlModel> GetLocalSrcs(string url, string html)
    {
        if (string.IsNullOrEmpty(html))
            return new List<UrlModel>();
        return FilterLocal(GetSrc(url, html));
    }

    // Shared filter, extracted from the two byte-identical loops that
    // GetLocalHrefs/GetLocalSrcs previously duplicated: keeps only entries whose
    // lower-cased key does not start with one of the excluded prefixes.
    private static List<UrlModel> FilterLocal(Dictionary<string, UrlModel> urls)
    {
        List<UrlModel> newUrls = new List<UrlModel>();
        if (null != urls)
        {
            foreach (string key in urls.Keys)
            {
                string newkey = key.ToLower();
                bool iscontained = false;
                foreach (var exkey in excludekeys)
                {
                    // Ordinal prefix check: the keys are plain ASCII, so avoid the
                    // culture-sensitive IndexOf(string) the original relied on.
                    if (newkey.StartsWith(exkey, StringComparison.Ordinal))
                    {
                        iscontained = true;
                        break;
                    }
                }
                if (!iscontained)
                {
                    // Local (site-relative) path: keep it.
                    newUrls.Add(urls[key]);
                }
            }
        }
        return newUrls;
    }

    // Collects every href="..." value in the page, keyed by its raw value.
    private static Dictionary<string, UrlModel> GetHrefs(string url, string html)
    {
        if (string.IsNullOrEmpty(html))
            return null;
        UrlModel currUrl = UrlParser.Parse(url);
        Dictionary<string, UrlModel> urls = new Dictionary<string, UrlModel>();
        Regex reg = new Regex("href=\"(?<Url>.+?)\"", RegexOptions.IgnoreCase);
        if (currUrl != null)
        {
            AddUrlModel(html, currUrl, urls, reg);
        }
        return urls;
    }

    // Collects every src="..." or css url(...) value in the page, keyed by its raw value.
    private static Dictionary<string, UrlModel> GetSrc(string url, string html)
    {
        if (string.IsNullOrEmpty(html))
            return null;
        UrlModel currUrl = UrlParser.Parse(url);
        Dictionary<string, UrlModel> urls = new Dictionary<string, UrlModel>();
        Regex reg = new Regex("(src=\"(?<Url>.+?)\"|url\\((?<Url>.+?)\\))", RegexOptions.IgnoreCase);
        if (currUrl != null)
        {
            AddUrlModel(html, currUrl, urls, reg);
        }
        return urls;
    }

    // Runs the regex over the html and records one UrlModel per captured "Url"
    // group, resolving relative paths against the current page's directory.
    private static void AddUrlModel(string html, UrlModel currUrl, Dictionary<string, UrlModel> urls, Regex reg)
    {
        if (reg.IsMatch(html))
        {
            MatchCollection matchs = reg.Matches(html);
            foreach (Match item in matchs)
            {
                try
                {
                    string strUrl = item.Groups["Url"].Value;
                    UrlModel model = new UrlModel();
                    model.RelatedPath = strUrl;
                    model.CurrPath = currUrl.CurrPath;
                    model.RootPath = currUrl.RootPath;
                    model.Scheme = currUrl.Scheme;
                    model.Port = currUrl.Port;
                    model.Host = currUrl.Host;
                    if (strUrl.StartsWith("/"))
                    {
                        // Root-relative path: resolve against the site root.
                        model.AbsoluteUri = string.Format("{0}{1}", model.RootPath, model.RelatedPath);
                    }
                    else
                    {
                        // Relative path: resolve against the current page directory,
                        // walking one directory up per leading "../".
                        string currPath = model.CurrPath;
                        int depth = 0;
                        string path = model.RelatedPath;
                        if (path.StartsWith(".."))
                        {
                            try
                            {
                                while (path.StartsWith(".."))
                                {
                                    depth++;
                                    // Assumes each step is exactly "../" (3 chars); a bare ".."
                                    // throws here and is swallowed by the catch below,
                                    // leaving AbsoluteUri unset for that entry.
                                    path = path.Substring(3);
                                    currPath = currPath.Substring(0, currPath.LastIndexOf("/"));
                                }
                                model.AbsoluteUri = string.Format("{0}/{1}", currPath, path);
                            }
                            catch { }
                        }
                        else
                        {
                            model.AbsoluteUri = string.Format("{0}/{1}", currPath, path);
                        }
                    }
                    strUrl = strUrl.Trim().ToLower();
                    // Dictionary.Add throws on duplicate keys; the catch below
                    // deliberately swallows that to deduplicate entries.
                    urls.Add(strUrl, model);
                }
                catch { }
            }
        }
    }
}

4.新建网站克隆接口

/// <summary>
/// Contract for a worker that clones a web site in the background.
/// </summary>
interface IWebCloneWorker
{
    /// <summary>Starts the clone operation.</summary>
    void Start();

    /// <summary>Requests cancellation of a running clone operation.</summary>
    void Cancel();
}

5.新建实现

/// <summary>
/// Clones a web site: collects page and resource URLs up to a configured depth,
/// then downloads each one to a local folder on a BackgroundWorker thread,
/// raising events so the UI can track progress.
/// </summary>
public class WebCloneWorker : IWebCloneWorker
{
    // Clone depth: 0 = home page only, 1 = + category pages, 2 = + detail pages.
    public static int depth = 0;

    // Address of the site to clone.
    public string Url { get; set; }

    // Local directory the cloned files are saved under.
    public string SavePath { get; set; }

    private BackgroundWorker backgroundWorker1 = null;

    public event UrlChangedEventHandler UrlChanged;
    public event FileSavedSuccessEventHandler FileSavedSuccess;
    public event FileSavedFailEventHandler FileSavedFail;
    public event DownloadCompletedEventHandler DownloadCompleted;
    public event CollectingUrlEventHandler CollectingUrl;
    public event CollectedUrlEventHandler CollectedUrl;
    public event ProgressChangedEventHandler ProgressChanged;

    // All collected page / resource URLs, keyed by their related path.
    private Dictionary<string, UrlModel> _Hrefs = new Dictionary<string, UrlModel>();

    /// <summary>All collected page / resource URLs, keyed by related path.</summary>
    public Dictionary<string, UrlModel> Hrefs
    {
        get { return _Hrefs; }
        set { _Hrefs = value; }
    }

    // Encoding used for page requests; defaults to UTF-8.
    private string _Encoding = "utf-8";

    /// <summary>Encoding name used for page requests; defaults to "utf-8".</summary>
    public string Encoding
    {
        get { return _Encoding; }
        set { _Encoding = value; }
    }

    public WebCloneWorker() { }

    /// <summary>
    /// Creates a worker for the given site and target directory and wires up the
    /// BackgroundWorker (progress reporting and cancellation enabled).
    /// </summary>
    /// <param name="url">Site to clone.</param>
    /// <param name="path">Directory the cloned files are saved under.</param>
    /// <exception cref="Exception">Thrown when either argument is empty.</exception>
    public WebCloneWorker(string url, string path)
    {
        this.Url = url;
        this.SavePath = path;
        if (string.IsNullOrEmpty(this.Url))
            throw new Exception("请输入网址");
        if (string.IsNullOrEmpty(this.SavePath))
            throw new Exception("请选择要保存的目录");

        backgroundWorker1 = new BackgroundWorker();
        backgroundWorker1.WorkerReportsProgress = true;
        backgroundWorker1.WorkerSupportsCancellation = true;
        backgroundWorker1.DoWork += backgroundWorker1_DoWork;                     // worker body
        backgroundWorker1.ProgressChanged += backgroundWorker1_ProgressChanged;   // per-URL download + UI update
        backgroundWorker1.RunWorkerCompleted += backgroundWorker1_RunWorkerCompleted;
    }

    // Fired on the UI thread when the worker finishes; forwards to DownloadCompleted
    // unless the run was cancelled.
    void backgroundWorker1_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
    {
        if (e.Cancelled)
        {
            return;
        }
        if (this.DownloadCompleted != null)
        {
            DownloadCompletedEventArgs eventArgs = new DownloadCompletedEventArgs(e.Result, e.Error, e.Cancelled);
            this.DownloadCompleted(this, eventArgs);
        }
    }

    // Fired once per collected URL: notifies listeners, then downloads the file
    // to its mirrored location under SavePath (skipping files that already exist).
    void backgroundWorker1_ProgressChanged(object sender, ProgressChangedEventArgs e)
    {
        // Progress callback.
        if (this.ProgressChanged != null)
            this.ProgressChanged(this, e);

        UrlModel model = (UrlModel)e.UserState;
        if (this.UrlChanged != null)
        {
            // Current-URL-changed callback.
            UrlChangedEventArgs eventArgs = new UrlChangedEventArgs(model);
            this.UrlChanged(this, eventArgs);
        }

        try
        {
            string dir = this.SavePath;
            string url = model.AbsoluteUri;
            // Path part of the URL, e.g. "/a/b.html" (assumes an absolute http(s) URI).
            string AbsolutePath = url.Substring(url.IndexOf('/', 8));
            string fileName = "";
            if (url.IndexOf('?') > 0)
            {
                // BUGFIX: the original truncated AbsolutePath at
                // model.RelatedPath.IndexOf('?') — an index taken from a different
                // string — which threw or mangled the name whenever the two
                // differed. Use AbsolutePath's own '?' position.
                string path = AbsolutePath.Substring(0, AbsolutePath.IndexOf('?'));
                fileName = System.IO.Path.GetFileName(path);
            }
            else
            {
                fileName = System.IO.Path.GetFileName(AbsolutePath);
            }

            // No file name / no extension: treat as a directory's default page.
            if (string.IsNullOrEmpty(fileName) || fileName.IndexOf(".") < 0)
            {
                fileName = "index.html";
                if (!AbsolutePath.EndsWith("/"))
                    AbsolutePath = AbsolutePath + "/";
            }

            fileName = System.Web.HttpUtility.UrlDecode(fileName);
            // Mirror the site's directory structure under the save path.
            string localPath = string.Format("{0}{1}", dir, System.IO.Path.GetDirectoryName(AbsolutePath));
            if (!System.IO.Directory.Exists(localPath))
            {
                System.IO.Directory.CreateDirectory(localPath);
            }

            // Skip files that were already downloaded.
            string path2 = Path.Combine(localPath, fileName);
            if (File.Exists(path2))
            {
                return;
            }

            // Download the page / image / resource file.
            HttpTool.DownFile(url, localPath, fileName);

            // Saved-successfully callback.
            if (this.FileSavedSuccess != null)
            {
                FileSavedSuccessEventArgs eventArgs = new FileSavedSuccessEventArgs(model);
                this.FileSavedSuccess(this, eventArgs);
            }
        }
        catch (Exception ex)
        {
            // Save-failed callback.
            if (this.FileSavedFail != null)
            {
                FileSavedFailEventArgs eventArgs = new FileSavedFailEventArgs(ex);
                this.FileSavedFail(this, eventArgs);
            }
        }
    }

    // Worker body: collect all URLs, then report each one back to the UI thread
    // (ProgressChanged does the actual download).
    void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
    {
        // Collect the resources first.
        GetResource();
        int index = 1;
        if (this.Hrefs.Keys.Count > 0)
        {
            foreach (var k in this.Hrefs.Keys)
            {
                // Honor cancellation between items.
                if (backgroundWorker1.CancellationPending)
                {
                    e.Cancel = true;
                    return;
                }
                backgroundWorker1.ReportProgress(index, this.Hrefs[k]);
                index++;
                // Throttle: pause 200 ms between downloads.
                Thread.Sleep(200);
            }
        }
    }

    /// <summary>Starts the clone on the background thread (no-op if already running).</summary>
    public void Start()
    {
        if (this.backgroundWorker1.IsBusy)
            return;
        this.backgroundWorker1.RunWorkerAsync();
    }

    /// <summary>Requests cancellation (no-op if a cancellation is already pending).</summary>
    public void Cancel()
    {
        if (this.backgroundWorker1.CancellationPending)
            return;
        this.backgroundWorker1.CancelAsync();
    }

    // Fetches the start page and kicks off URL collection, then raises CollectedUrl.
    private void GetResource()
    {
        string url = this.Url;
        string referer = this.Url;
        string msg = "";
        string html = HttpTool.HttpGet(url, referer, this.Encoding, out msg);
        // Collect the page links.
        GetHrefs(0, url, html);
        // Collection finished.
        if (null != CollectedUrl)
        {
            UrlModel urlModel = new UrlModel();
            CollectedUrlEventArgs eventArgs = new CollectedUrlEventArgs(urlModel);
            this.CollectedUrl(this, eventArgs);
        }
    }

    // Recursively collects the current page, its resources (src/url()), and its
    // child pages (href) down to the configured depth. Duplicate keys are
    // deduplicated by the empty catch around Hrefs.Add.
    private void GetHrefs(int level, string url, string html)
    {
        #region Add the current page
        UrlModel currUrl = UrlParser.Parse(url);
        try
        {
            if (backgroundWorker1.CancellationPending)
                return;
            this.Hrefs.Add(currUrl.RelatedPath, currUrl);
            if (null != CollectingUrl)
            {
                CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(currUrl);
                this.CollectingUrl(this, eventArgs);
            }
        }
        catch { }
        #endregion

        // Page links (href attributes).
        List<UrlModel> list1 = WebPageService.GetLocalHrefs(url, html);
        // Images / files / other resources (src attributes, css url()).
        List<UrlModel> listSrcs = WebPageService.GetLocalSrcs(url, html);

        #region Resources of the current page
        if (listSrcs != null)
        {
            for (int i = 0; i < listSrcs.Count; i++)
            {
                UrlModel urlModel = listSrcs[i];
                try
                {
                    if (backgroundWorker1.CancellationPending)
                        return;
                    this.Hrefs.Add(urlModel.RelatedPath, urlModel);
                    if (null != CollectingUrl)
                    {
                        CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(urlModel);
                        this.CollectingUrl(this, eventArgs);
                    }
                }
                catch { }
            }
        }
        #endregion

        #region Child pages and their resources
        if (list1 != null)
        {
            for (int i = 0; i < list1.Count; i++)
            {
                UrlModel urlModel = list1[i];
                try
                {
                    if (backgroundWorker1.CancellationPending)
                        return;
                    this.Hrefs.Add(urlModel.RelatedPath, urlModel);
                    if (null != CollectingUrl)
                    {
                        CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(urlModel);
                        this.CollectingUrl(this, eventArgs);
                    }
                }
                catch { }

                string msg = "";
                html = HttpTool.HttpGet(urlModel.AbsoluteUri, urlModel.AbsoluteUri, this.Encoding, out msg);

                #region Resources of the child page
                listSrcs = WebPageService.GetLocalSrcs(urlModel.AbsoluteUri, html);
                if (listSrcs != null)
                {
                    for (int j = 0; j < listSrcs.Count; j++)
                    {
                        UrlModel urlModel2 = listSrcs[j];
                        try
                        {
                            if (backgroundWorker1.CancellationPending)
                                return;
                            this.Hrefs.Add(urlModel2.RelatedPath, urlModel2);
                            if (null != CollectingUrl)
                            {
                                CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(urlModel2);
                                this.CollectingUrl(this, eventArgs);
                            }
                        }
                        catch { }
                        // Throttle: pause 20 ms between resources.
                        Thread.Sleep(20);
                    }
                }
                #endregion

                // Throttle: pause 20 ms between child pages.
                Thread.Sleep(20);

                // Stop once the configured depth is reached.
                // NOTE(review): this `return` exits the whole loop after the FIRST
                // child page at the depth limit, skipping its siblings; `continue`
                // may have been intended — confirm before changing behavior.
                if (level >= depth)
                    return;

                // Recurse into the child page.
                GetHrefs(level + 1, urlModel.AbsoluteUri, html);
            }
        }
        #endregion
    }
}

6.代码有点多,各位有需要的还是下载源码查看并运行吧。

旧版本有很多问题,有需要新版本的请加我微信:xiaoqiu20121212。

转载于:https://www.cnblogs.com/jonlan/p/9533116.html

一步步教你如何打造一个网站克隆工具仿站相关推荐

  1. 订阅号微信公众号历史文章爬虫php,一步步教你怎么打造微信公众号历史文章爬虫...

    原标题:一步步教你怎么打造微信公众号历史文章爬虫 开篇语: 邓爷爷曾说过:不管黑猫白猫,逮到耗子就是好猫.不管我是凑的还是笨办法堆出来的,确实把批量导出微信公众号所有历史文章的这个功能给做出来了,而且 ...

  2. 教你怎么看一个网站是否存在漏洞!

    教你怎么看一个网站是否存在漏洞!!!  漏洞  近来很多网站受到了各种各样形式的攻击,黑客攻击的动机各不一样,黑客人攻击的目标也有不确定性,作为一家企业的网管.或CEO您是否担心您的网站也遭受同样的命 ...

  3. 【跨境神器】教你如何打造一个Facebook商店

    关键词:Facebook商店  Facebook Shop  SaleSmartly(ss客服) Facebook作为一个有着庞大用户群体的社交平台,许多跨境商家也会希望能在Facebook上扩张自己 ...

  4. 炸了!没有任何HTML/CSS ! 纯Python打造一个网站!

    点上方"菜鸟学Python",选择"星标" 第490篇原创干货,第一时间送达 大家好,我是菜鸟哥! 如果想用Python做网站,在Python世界里面有2大非常 ...

  5. python可以制作网站吗_Python大神带你用30行代码打造一个网站,爬虫+web不一样的玩法...

    首先,先把实际的效果图放上来: 用Python做的个性签名网站-效果图 在开始做之前,我们必须得知道这个用了那些模块: flask:一个轻量级的web开发框架,相信很多人也听说过这个牛逼加简洁的框架 ...

  6. 怎么看网站用的什么服务器,教你如何查看一个网站的服务器,系统和语言,地理位置,IP地址,所属国家,服务器类型及使用程序...

    通过下面这个网址可以查看出大多数服务器的相关信息.比较齐全. 要看一个网站的服务器,系统和语言一般有几种办法 1.在网站的域名下随便打一个网址,对于一般的网站,服务器就会在404的错误页面里告诉你这些 ...

  7. 手把手教你快速打造一个AI识物点读机

    0 项目背景 "六·一"儿童节到了,献上一个识物读英文的AI点读机作为一个节日礼物. 在完成前面几个"点读"相关项目后,我们会发现,其实从pipeline上看, ...

  8. 怎么建一个网站?新手建站教程

    怎么建一个网站?站长网今天来分享一下建站教程. 建站也叫网站搭建,或者网站建设,网站制作.做网站.创立网站等等说法.就是指在互联网上建立一个可以访问的网站.不论是个人网站还是公司网站,要想搭建一个网站 ...

  9. 如何用织梦仿制php网站首页,dedecms仿站如何做

    如果你想了解更多关于DEDECMS的知识,可以点击: 我们在日常浏览网站的过程中,经常会看到UI设计非常好的网站又感觉适合自己的行业,如果让美工按照目标网站进行出图后切图制作程序,那时间上和功夫上都花 ...

最新文章

  1. linux 物理内存用完了_Linux用户空间与内核空间(理解高端内存)
  2. python socket编程
  3. 面试者面试官,双向角度的程序员面试指南!
  4. 标准库中的智能指针shared_ptr
  5. 梯度下降法和随机梯度下降法的区别
  6. android 高度上分权重,安卓自适应布局(关于权重weight的使用技巧!)
  7. Java 8 新特性之Stream API
  8. Android开发如何使用JNA
  9. 根据要求调参用matplotlib做一个一模一样的直方图(以及如何把成图变得更好看)
  10. ubuntu 18.04 vim的安装
  11. 破碎的互联网下,加密技术正在恢复数据主权!
  12. 简单Android app开发_什么方法开发APP最简单?试试0代码开发平台
  13. 螺钉装弹垫平垫机器人_一种批量组装螺钉、弹垫、平垫的工装及使用方法
  14. python—模块-configparser
  15. [GRE] 填空机经 Section51-55
  16. linux yum安装mysql
  17. UML建模:基于智慧校园的二手交易平台
  18. java 播放h264_一个可以解码并实时播放H264的播放器
  19. EV SSL证书对网站的作用
  20. Layui的eleTree树式选择器使用

热门文章

  1. 大数据岗位的面试总结
  2. WordPress CMS百度快速收录 API 提交代码以及配置教程
  3. linux没有ifconfig命令
  4. solidworks与UG价格相差数十倍,功能上有哪些差别?
  5. MATLAB中eps使用
  6. 为了好看删除快捷方式箭头的坏处!!
  7. (一)、音视频相关名词
  8. 龙芯1b(LS1B200)使用LVGL7.0.1组件的滑杆控件控制三色RGB灯的亮度
  9. jQuery斑马条纹表
  10. 部署Zabbix监控平台,配置及使用Zabbix监控系统,自定义Zabbix监控项目