C语言URL解析器(代码分享)

By qianghaohao(Xqiang)

本程序可以解析出URL中每个字段的值,然后存入结构体中;
也可以调用提供的API只获取需要的部分.
本URL解析器代码来自https://github.com/jwerle/url.h
在此基础上进行了大量的修改,修复了很多bug.总体思路
没有改变,就是把很多细节改了下,现在可以正常使用了.
-->可能还存在bug,如果网友发现了可以及时指正.
感受:花了一天多的时间修复,充分地感受到了IDE对编程
的效率的影响,一个很奇葩的越界导致的问题结果调了半天
没找到问题。。。结果人家五分钟搞定。。。
话不多说,直接上源代码:
接口文件:url_parser.h:
#ifndef URL_PARSER
#define URL_PARSER

/**
 * Dependencies
 */
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>

/**
 * url.h version
 */
#define URL_VERSION 0.0.2

/**
 * Max length of a url protocol scheme
 */
#define URL_PROTOCOL_MAX_LENGTH 16

/**
 * Max length of a url host part
 */
#define URL_HOSTNAME_MAX_LENGTH 128

/**
 * Max length of a url tld part
 */
#define URL_TLD_MAX_LENGTH 16

/**
 * Max length of a url auth part
 */
#define URL_AUTH_MAX_LENGTH 32

/**
 * Max length of a url
 */
#define URL_MAX_LENGTH 1024

/**
 * `url_data` struct that defines parts
 * of a parsed URL such as host and protocol
 */
typedef struct url_data {
  char *href;      // the url string that was handed to `url_parse`
  char *protocol;  // scheme, e.g. "http"
  char *host;      // host name without the port
  char *auth;      // text in front of '@', NULL when there is none
  char *hostname;  // host name including ":port" when present
  char *pathname;  // path without the search and hash parts
  char *search;    // query part including the leading '?'
  char *path;      // pathname followed by search and hash
  char *hash;      // fragment including the leading '#'
  char *query;     // search without the leading '?'
  char *port;      // port digits, "" when there is none
} url_data_t;

// prototype

/**
 * Parses a url into parts and returns
 * a `url_data_t *` pointer, or NULL when
 * the url cannot be parsed
 */
url_data_t *
url_parse (char *url);

/**
 * Single-part accessors. Each one parses `url` on its own and
 * returns a newly allocated string that the caller must `free`,
 * or NULL when the part cannot be determined.
 */
char *
url_get_protocol (char *url);

char *
url_get_auth (char *url);

char *
url_get_hostname (char *url);

char *
url_get_host (char *url);

char *
url_get_pathname (char *url);

char *
url_get_path (char *url);

char *
url_get_search (char *url);

char *
url_get_query (char *url);

char *
url_get_hash (char *url);

char *
url_get_port (char *url);

/**
 * Frees component strings held by a `url_data_t`;
 * the `url_data_t` itself is not freed
 */
void
url_free (url_data_t *data);

/**
 * Returns true when `str` is one of the known url schemes
 */
bool
url_is_protocol (char *str);

/**
 * Returns true when `str` is "ssh" or "git"
 */
bool
url_is_ssh (char *str);

/**
 * Parses `url` and prints every part to stdout
 */
void
url_inspect (char *url);

/**
 * Prints every part of an already parsed url to stdout
 */
void
url_data_inspect (url_data_t *data);

#endif

    实现文件:url_parser.c:
#include "url_parser.h"/*** URI Schemes* http://en.wikipedia.org/wiki/URI_scheme*/
/**
 * URL scheme table: a parsed protocol is accepted only when it
 * appears in this list.
 *
 * NOTE(review): a few entries look like misspellings of registered
 * scheme names (e.g. "crome-extension", "lna-playsingle",
 * "paquelocktoken"). They are kept byte-for-byte because changing them
 * changes which URLs are accepted - confirm against the IANA registry.
 */
char *URL_SCHEMES[] = {
  // official IANA registered schemes
  "aaa", "aaas", "about", "acap", "acct", "adiumxtra", "afp", "afs", "aim", "apt", "attachment", "aw",
  "beshare", "bitcoin", "bolo", "callto", "cap", "chrome", "crome-extension", "com-evenbrite-attendee",
  "cid", "coap", "coaps", "content", "crid", "cvs", "data", "dav", "dict", "lna-playsingle", "dln-playcontainer",
  "dns", "dtn", "dvb", "ed2k", "facetime", "fax", "feed", "file", "finger", "fish", "ftp", "geo", "gg", "git",
  "gizmoproject", "go", "gopher", "gtalk", "h323", "hcp", "http", "https", "iax", "icap", "icon", "im",
  "imap", "info", "ipn", "ipp", "irc", "irc6", "ircs", "iris", "iris.beep", "iris.xpc", "iris.xpcs", "iris.lws",
  "itms", "jabber", "jar", "jms", "keyparc", "lastfm", "ldap", "ldaps", "magnet", "mailserver", "mailto",
  "maps", "market", "message", "mid", "mms", "modem", "ms-help", "mssettings-power", "msnim", "msrp",
  "msrps", "mtqp", "mumble", "mupdate", "mvn", "news", "nfs", "ni", "nih", "nntp", "notes", "oid",
  "paquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform", "pop", "pres", "prospero", "proxy",
  "psyc", "query", "reload", "res", "resource", "rmi", "rsync", "rtmp", "rtsp", "secondlife", "service", "session",
  "sftp", "sgn", "shttp", "sieve", "sip", "sips", "skype", "smb", "sms", "snews", "snmp", "soap.beep", "soap.beeps",
  "soldat", "spotify", "ssh", "steam", "svn", "tag", "teamspeak", "tel", "telnet", "tftp", "things", "thismessage",
  "tn3270", "tip", "tv", "udp", "unreal", "urn", "ut2004", "vemmi", "ventrilo", "videotex", "view-source", "wais", "webcal",
  "ws", "wss", "wtai", "wyciwyg", "xcon", "xcon-userid", "xfire", "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri", "ymsgr",

  // unofficial schemes
  "javascript", "jdbc", "doi"
};

/**
 * Returns a newly allocated copy of `str` that the caller must `free`.
 * Returns NULL when `str` is NULL or the allocation fails.
 */
char *
m_strdup (const char *str) {
  if (!str) return NULL;

  // size_t, not int: the length of a long string must not be truncated
  size_t size = strlen(str) + 1;
  char *dup = malloc(size);
  if (dup) memcpy(dup, str, size);
  return dup;
}

/**
 * Extracts one field from `url`, starting `l` characters in.
 *
 * `format` is a `sscanf` format holding exactly one string conversion
 * (e.g. "%[^@]" or "/%s"); it is applied to `url + l`.
 *
 * Returns a newly allocated string that the caller must `free`.
 * Returns NULL when the extracted field is identical to the whole
 * remainder of the url (the delimiter the format looks for is absent),
 * when `l` does not lie inside `url`, or when allocation fails.
 */
static char *
get_part (char *url, const char *format, int l) {
  if (!url || !format || l < 0) return NULL;

  size_t url_len = strlen(url);
  // an offset past the terminator would make sscanf read out of bounds
  if ((size_t) l > url_len) return NULL;

  // move exactly the amount of characters the caller already consumed
  // (protocol, "://", auth, ...)
  const char *rest = url + l;

  // a single conversion can never yield more characters than its input,
  // so sizing the buffer by the input rules out overflow for any url
  char *part = calloc(url_len - (size_t) l + 1, sizeof(char));
  if (!part) return NULL;

  // on a matching failure `part` simply stays ""
  sscanf(rest, format, part);

  if (0 == strcmp(part, rest)) {
    free(part);
    return NULL;
  }

  return part;
}

/************* Parse each part of the URL into a url_data_t struct *************/
url_data_t *
url_parse (char *url) {url_data_t *data = malloc(sizeof(url_data_t));if (!data) return NULL;data->href = url;char *tmp;char *tmp_url = m_strdup(url);bool is_ssh = false;/************ 解析协议头 ***********/char *protocol = url_get_protocol(tmp_url);if (!protocol) return NULL;// length of protocol plus ://int protocol_len = (int) strlen(protocol) + 3;data->protocol = protocol;is_ssh = url_is_ssh(protocol);/************ 解析用户名 ***********/char *auth = NULL;int auth_len = 0;if ((tmp = strstr(tmp_url, "@"))) {auth = get_part(tmp_url, "%[^@]", protocol_len);auth_len = strlen(auth);if (auth) auth_len++;}data->auth = auth;char *hostname = NULL;/************ 解析主机名(包括端口号) ***********/hostname = (is_ssh)? get_part(tmp_url, "%[^:]", protocol_len + auth_len): get_part(tmp_url, "%[^/]", protocol_len + auth_len);if (!hostname) return NULL;int hostname_len = (int) strlen(hostname);char *tmp_hostname = m_strdup(hostname);data->hostname = hostname;/************ 解析主机名 ***********/char *host = malloc(strlen(tmp_hostname) * sizeof(char));memset(host, 0, strlen(tmp_hostname) * sizeof(char));sscanf(tmp_hostname, "%[^:]", host);if (!host) return NULL;int host_len = (int) strlen(host);data->host = host;/************ 解析端口号 ***********/char *port = malloc(URL_PROTOCOL_MAX_LENGTH * (sizeof(char)));memset(port, 0, URL_PROTOCOL_MAX_LENGTH * (sizeof(char)));if (!port) return NULL;tmp_hostname = tmp_hostname + (host_len + 1);sscanf(tmp_hostname, "%s", port);tmp_hostname = tmp_hostname - (host_len + 1);data->port = port;free(tmp_hostname);/************ 解析完整路径名 ***********/char *tmp_path;tmp_path = (is_ssh)? get_part(tmp_url, ":%s", protocol_len + auth_len + hostname_len): get_part(tmp_url, "/%s", protocol_len + auth_len + hostname_len);char *path = malloc(strlen(tmp_path) * sizeof(char) + 1);memset(path, 0, strlen(tmp_path) * sizeof(char) + 1);if (!path) return NULL;char *fmt = (is_ssh)? 
"%s" : "/%s";sprintf(path, fmt, tmp_path);data->path = path;free(tmp_path);/************ 解析路径名(不包括参数) ***********/char *pathname = malloc(strlen(path) * sizeof(char) + 1);memset(pathname, 0, strlen(path) * sizeof(char) + 1);if (!pathname) return NULL;tmp_path = m_strdup(path);sscanf(tmp_path, "%[^? | ^#]", pathname);int pathname_len = strlen(pathname);data->pathname = pathname;/************* 解析搜索参数 ***********/char *search = malloc(URL_AUTH_MAX_LENGTH * sizeof(search));memset(search, 0, URL_AUTH_MAX_LENGTH * sizeof(search));if (!search) return NULL;tmp_path = tmp_path + pathname_len;sscanf(tmp_path, "%[^#]", search);tmp_path = tmp_path - pathname_len;data->search = search;int search_len = strlen(search);free(tmp_path);/************* 解析查询参数 ***********/char *query = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));memset(query, 0, URL_AUTH_MAX_LENGTH * sizeof(char));if (!query) return NULL;sscanf(data->search, "?%s", query);data->query = query;/************* 解析hash值 ***********/char *hash = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));memset(hash, 0, URL_AUTH_MAX_LENGTH * sizeof(char));if (!hash) return NULL;tmp_path = data->path + (pathname_len + search_len);sscanf(tmp_path, "%s", hash);tmp_path = data->path - (pathname_len + search_len);data->hash = hash;free(tmp_path);return data;
}bool
url_is_protocol (char *str) {int count = sizeof(URL_SCHEMES) / sizeof(URL_SCHEMES[0]);for (int i = 0; i < count; ++i) {if (0 == strcmp(URL_SCHEMES[i], str)) {return true;}}return false;
}bool
url_is_ssh (char *str) {str = m_strdup(str);if (0 == strcmp(str, "ssh") ||0 == strcmp(str, "git")) {free(str);return true;}return false;
}char *
url_get_protocol (char *url) {char *protocol = malloc(URL_PROTOCOL_MAX_LENGTH * sizeof(char));if (!protocol) return NULL;sscanf(url, "%[^://]", protocol);if (url_is_protocol(protocol)) return protocol;return NULL;
}char *
url_get_auth (char *url) {char *protocol = url_get_protocol(url);if (!protocol) return NULL;int l = (int) strlen(protocol) + 3;return get_part(url, "%[^@]", l);
}char *
url_get_hostname (char *url) {int l = 3;char *protocol = url_get_protocol(url);char *tmp_protocol = m_strdup(protocol);char *auth = url_get_auth(url);if (!protocol) return NULL;if (auth) l += strlen(auth) + 1; // add one @ symbolif (auth) free(auth);l += (int) strlen(protocol);free(protocol);char * hostname = url_is_ssh(tmp_protocol)? get_part(url, "%[^:]", l): get_part(url, "%[^/]", l);free(tmp_protocol);return hostname;
}char *
url_get_host (char *url) {char *host = malloc(URL_HOSTNAME_MAX_LENGTH * sizeof(char));char *hostname = url_get_hostname(url);memset(host, 0, URL_HOSTNAME_MAX_LENGTH * sizeof(char));if (!host || !hostname) return NULL;sscanf(hostname, "%[^:]", host);free(hostname);return host;
}char *
url_get_pathname (char *url) {char *path = url_get_path(url);char *pathname = malloc(URL_MAX_LENGTH * sizeof(char));memset(pathname, 0, URL_MAX_LENGTH * sizeof(char));if (!path || !pathname) return NULL;sscanf(path, "%[^?]", pathname);free(path);return pathname;
}char *
url_get_path (char *url) {int l = 3;char *tmp_path;char *protocol = url_get_protocol(url);char *auth = url_get_auth(url);char *hostname = url_get_hostname(url);if (!protocol || !hostname)return NULL;bool is_ssh = url_is_ssh(protocol);l += (int) strlen(protocol) + (int) strlen(hostname);if (auth) l+= (int) strlen(auth) +1; // @ symboltmp_path = (is_ssh)? get_part(url, ":%s", l): get_part(url, "/%s", l);char *fmt = (is_ssh)? "%s" : "/%s";char *path = malloc(strlen(tmp_path) * sizeof(char) + 1);memset(path, 0, strlen(tmp_path) * sizeof(char) + 1);sprintf(path, fmt, tmp_path);if (auth) free(auth);free(protocol);free(hostname);free(tmp_path);return path;}char *
url_get_search (char *url) {char *path = url_get_path(url);char *pathname = url_get_pathname(url);char *search = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));memset(search, 0, URL_AUTH_MAX_LENGTH * sizeof(char));if (!path || !search) return NULL;path = path + (int)strlen(pathname);sscanf(path, "%[^#]", search);path = path - (int)strlen(pathname);free(path);free(pathname);return search;
}char *
url_get_query (char *url) {char *search = url_get_search(url);char *query = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));memset(query, 0, URL_AUTH_MAX_LENGTH * sizeof(char));if (!search) return NULL;sscanf(search, "?%s", query);free(search);return query;
}char *
url_get_hash (char *url) {char *hash = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));memset(hash, 0, URL_AUTH_MAX_LENGTH * sizeof(char));if (!hash) return NULL;char *path = url_get_path(url);if (!path) return NULL;char *pathname = url_get_pathname(url);if (!pathname) return NULL;char *search = url_get_search(url);int pathname_len = (int) strlen(pathname);int search_len = (int) strlen(search);path = path + (pathname_len + search_len);sscanf(path, "%s", hash);path = path - (pathname_len + search_len);free(pathname);free(path);if (search) free(search);return hash;
}char *
url_get_port (char *url) {char *port = malloc(URL_PROTOCOL_MAX_LENGTH * sizeof(char));memset(port, 0, URL_PROTOCOL_MAX_LENGTH * sizeof(char));char *hostname = url_get_hostname(url);char *host = url_get_host(url);if (!port || !hostname) return NULL;char *tmp_hostname = hostname;tmp_hostname = hostname + (strlen(host) + 1);sscanf(tmp_hostname, "%s", port);free(hostname);return port;
}void
url_inspect (char *url) {url_data_inspect(url_parse(url));
}void
url_data_inspect (url_data_t *data) {printf("#url =>\n");printf("    .href: \"%s\"\n",     data->href);printf("    .protocol: \"%s\"\n", data->protocol);printf("    .host: \"%s\"\n",     data->host);printf("    .auth: \"%s\"\n",     data->auth);printf("    .hostname: \"%s\"\n", data->hostname);printf("    .pathname: \"%s\"\n", data->pathname);printf("    .search: \"%s\"\n",   data->search);printf("    .path: \"%s\"\n",     data->path);printf("    .hash: \"%s\"\n",     data->hash);printf("    .query: \"%s\"\n",    data->query);printf("    .port: \"%s\"\n",     data->port);
}void
url_free (url_data_t *data) {if (!data) return;if (data->auth) free(data->auth);if (data->protocol) free(data->protocol);if (data->hostname) free(data->hostname);if (data->host) free(data->host);if (data->pathname) free(data->pathname);if (data->path) free(data->path);if (data->hash) free(data->hash);if (data->search) free(data->search);if (data->query) free(data->query);
}
  使用用例:main.c:
/*************************************************************************> File Name: main.c> Author: qianghaohao(Xqiang)> Program: url_parser> Readme: 本URL解析器代码来自https://github.com/jwerle/url.h>         在此基础上进行了大量的修改,修复了很多bug.>         总体思路没有改变,就是把很多细节改了下,现在可以正常使用了.>    -->可能还存在bug,如果网友发现了可以及时指正.> Platform: 可以跨平台使用> Created Time: 2016年06月11日 星期六 18时39分14秒************************************************************************/#include"url_parser.h"
url_data_t *url_info = NULL;

/**
 * Prints "<label>:<value>" on its own line and releases `value`.
 * A NULL value is printed as "(null)" because passing NULL to a "%s"
 * conversion is undefined.
 */
static void
print_and_free (const char *label, char *value) {
  printf("%s:%s\n", label, value ? value : "(null)");
  free(value);
}

/**
 * Demo driver: parses the url given as the only argument, prints the
 * parsed struct and then every part again through the single-part API.
 * Returns 0 on success and -1 on a usage or parse error.
 */
int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "usage:%s <url>", argv[0]);
    return -1;
  }

  url_info = url_parse(argv[1]);
  if (NULL == url_info) {
    fprintf(stderr, "%s error...\n", argv[1]);
    return -1;
  }

  url_data_inspect(url_info);

  // Sample input: http://www.baidu.com:80/cig-bin/index.html?sdkfj#283sjkdf
  printf("\n================= API Demo ================\n");
  printf("herf:%s\n", argv[1]);

  // the "hostname" line must come from url_get_hostname, not url_get_host
  print_and_free("hostname", url_get_hostname(argv[1]));
  print_and_free("path", url_get_path(argv[1]));
  print_and_free("host", url_get_host(argv[1]));
  print_and_free("protocol", url_get_protocol(argv[1]));
  print_and_free("auth", url_get_auth(argv[1]));
  print_and_free("search", url_get_search(argv[1]));
  print_and_free("hash", url_get_hash(argv[1]));
  print_and_free("query", url_get_query(argv[1]));
  print_and_free("port", url_get_port(argv[1]));

  // url_free releases the parts only; the struct is released here
  url_free(url_info);
  free(url_info);
  url_info = NULL;

  return 0;
}
运行结果:

C语言URL解析器(代码分享)相关推荐

  1. 「Python 编程」编码实现网络请求库中的 URL 解析器

    相信各位 Python 开发者都用过 Requests 库,有些朋友还用过 WebSockets 库.这里回顾一下它们的基本用法,例如使用 Requests 库向目标网站发出 GET 请求: impo ...

  2. c语言注释语句执行吗,C语言学习笔记之C语言概念解析(附资料分享)每一个语句都必须以分号结尾但预处理命令函数头和花括号“}”之后不能加分号...

    [[怪兽爱C语言]C语言学习笔记之C语言概念解析(附资料分享)]https://toutiao.com/group/6582429294901854728/?iid=15906422033&a ...

  3. 先弄个XML解析器代码抄一抄 慢慢研究 O(∩_∩)O哈哈~

     出处:http://bbs.csdn.net/topics/390229172 已经自我放逐好几年了.打算去上班得了.在最后的自由日子里,做点有意义的事吧... 先来下载地址    http:/ ...

  4. 李宏毅2023机器学习作业HW03解析和代码分享

    ML2023Spring - HW3 相关信息: 课程主页 课程视频 Kaggle link Sample code HW03 视频 HW03 PDF 个人完整代码分享: GitHub | Gitee ...

  5. c语言xml解析器libxm2

    写这篇文章的原因有如下几点:1)C++标准库中没有操作XML的方法,用C++操作XML文件必须熟悉一种函数库,LIBXML2是其中一种很优秀的XML库,而且它同时支持多种编程语言:2)LIBXML2库 ...

  6. c语言统计字母字符数字字符个数,C语言统计字符个数代码分享

    C语言实现统计字符个数 #include int main() { int sz[10]={0},zm[26]={0},z[26]={0},i,space=0,e=0,t=0; char c; pri ...

  7. java编写文件管理器,java编写的文件管理器代码分享

    比较适合新手.逻辑上仍然有点问题.可以用于学习java文件操作 下面是主要的JAVA文件操作代码 FileHelp.java package self.yy.filesystem.fileutil; ...

  8. Python Django 根路由命名空间URL解析方式代码示例

  9. R语言画克利夫兰点图-代码分享

    使用colnames函数更改列名 使用sort函数获取某一列的排序索引 使用factor函数根据某一列数值大小,更改另一列的levels 尽量少使用coord_flip函数,开始画图就想好谁是X轴,谁 ...

  10. c语言快速排序算法代码,c语言快速排序算法示例代码分享

    #include #include #include #define RANDOM(i) (rand()%i) #define N 9    //设置数组长度 //分区操作 int Partition ...

最新文章

  1. MongoDb优化指南
  2. access哪个速度快 vfp_大学计算机二级考试,报考哪个科目比较好?
  3. windows 常用工具
  4. 系统安装,重装与优化:chapter2 硬盘的分区与格式化
  5. Spring Cloud GatewayAPI网关服务
  6. boost::function模块实现contains的测试程序
  7. 将mnist获得的数据还原成图片形式
  8. C++ Programming with TDD之一:GMOCK框架简介
  9. AWS披露面向Amazon S3的AI监控方案
  10. Tomcat线程池原理
  11. Android项目(完整版+免费版)
  12. 很多网友问那个磁力搜索站好用,就由本君说说吧!
  13. C++实现求复数的模长
  14. 切换无线网卡失败服务器提示,无线网卡切换为AP模式时提示ICS启动失败的解决方法...
  15. 教你 用c语言输出乘法口诀表 一giao我嘞gaiogiao
  16. ShareIntentUtil【调用系统自带的分享的工具类】
  17. 百万前端之js通过链接生成二维码可以保存下载复制
  18. 【HGE引擎】源码解析——常用公共函数(二)
  19. [网络工程师]-防火墙-入侵防护系统IPS
  20. tableau制作日历图学习

热门文章

  1. prometheus如何评估告警策略以及如何推送告警消息到alertmanager?
  2. pyTest官方手册(Release 4.2)之蹩脚翻译(2)
  3. IDEA添加gitlab仓库并上传代码(无需使用任何git指令),报错Ask a project Owner or Maintainer to create a default branch解决方案
  4. PTX-NPs 纳米粒子修饰紫杉醇/与桦木酸PEG/邻硝基苯丙酸紫杉醇偶联物的制备
  5. Solr之Facet与FacetPivot的使用和区别
  6. 导航卫星系统实时可视化平台开发
  7. android如何设置qq邮箱格式,邮件客户端和手机设置QQ邮箱IMAP服务
  8. OpenHarmony更新编译问题及解决办法
  9. 16位院士加盟!“双一流”上海大学成立人工智能研究院
  10. 新手焊接电路板_手把手教您如何掌握焊接电路板基础知识