C语言URL解析器(代码分享)
By qianghaohao(Xqiang)
头文件:url_parser.h:
#ifndef URL_PARSER
#define URL_PARSER

/**
 * Dependencies
 */
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>

/**
 * url.h version
 *
 * NOTE: this expands to the single preprocessing number `0.0.2`, which is
 * not a valid C numeric constant; it is only usable through stringification.
 */
#define URL_VERSION 0.0.2

/**
 * Max length of a url protocol scheme
 */
#define URL_PROTOCOL_MAX_LENGTH 16

/**
 * Max length of a url host part
 */
#define URL_HOSTNAME_MAX_LENGTH 128

/**
 * Max length of a url tld part
 */
#define URL_TLD_MAX_LENGTH 16

/**
 * Max length of a url auth part
 */
#define URL_AUTH_MAX_LENGTH 32

/**
 * Max length of a url
 */
#define URL_MAX_LENGTH 1024

/**
 * `url_data` struct that defines parts
 * of a parsed URL such as host and protocol
 */
typedef struct url_data {
  char *href;      // the string handed to url_parse (stored, not copied)
  char *protocol;  // scheme, without "://"
  char *host;      // host name without the port
  char *auth;      // text before '@', NULL when the url has none
  char *hostname;  // host name including ":port" when a port is present
  char *pathname;  // path without search and hash
  char *search;    // "?..." part
  char *path;      // pathname followed by search and hash
  char *hash;      // "#..." part
  char *query;     // search without the leading '?'
  char *port;      // port digits
} url_data_t;

// prototype

/**
 * Parses a url into parts and returns
 * a `url_data_t *` pointer, or NULL on failure
 */
url_data_t *
url_parse (char *url);

/**
 * Single-part accessors. Each one returns a heap-allocated
 * string that the caller must free(), or NULL on failure.
 */
char *
url_get_protocol (char *url);

char *
url_get_auth (char *url);

char *
url_get_hostname (char *url);

char *
url_get_host (char *url);

char *
url_get_pathname (char *url);

char *
url_get_path (char *url);

char *
url_get_search (char *url);

char *
url_get_query (char *url);

char *
url_get_hash (char *url);

char *
url_get_port (char *url);

/**
 * Frees the part strings held by `data`.
 * `data` itself and `data->href` are not freed.
 */
void
url_free (url_data_t *data);

/**
 * Tells whether `str` is one of the known protocol schemes
 */
bool
url_is_protocol (char *str);

/**
 * Tells whether `str` is the "ssh" or "git" scheme
 */
bool
url_is_ssh (char *str);

/**
 * Parses `url` and prints its parts to stdout
 */
void
url_inspect (char *url);

/**
 * Prints the parts held by `data` to stdout
 */
void
url_data_inspect (url_data_t *data);

#endif
实现文件:url_parser.c:
#include "url_parser.h"/*** URI Schemes* http://en.wikipedia.org/wiki/URI_scheme*/
// URL scheme table: a parsed protocol is accepted only when it is listed here
char *URL_SCHEMES[] = {
  // official IANA registered schemes
  "aaa", "aaas", "about", "acap", "acct", "adiumxtra", "afp", "afs", "aim", "apt", "attachment", "aw",
  "beshare", "bitcoin", "bolo", "callto", "cap", "chrome", "chrome-extension", "com-eventbrite-attendee",
  "cid", "coap", "coaps", "content", "crid", "cvs", "data", "dav", "dict", "dlna-playsingle", "dlna-playcontainer",
  "dns", "dtn", "dvb", "ed2k", "facetime", "fax", "feed", "file", "finger", "fish", "ftp", "geo", "gg", "git",
  "gizmoproject", "go", "gopher", "gtalk", "h323", "hcp", "http", "https", "iax", "icap", "icon", "im",
  "imap", "info", "ipn", "ipp", "irc", "irc6", "ircs", "iris", "iris.beep", "iris.xpc", "iris.xpcs", "iris.lws",
  "itms", "jabber", "jar", "jms", "keyparc", "lastfm", "ldap", "ldaps", "magnet", "mailserver", "mailto",
  "maps", "market", "message", "mid", "mms", "modem", "ms-help", "ms-settings-power", "msnim", "msrp",
  "msrps", "mtqp", "mumble", "mupdate", "mvn", "news", "nfs", "ni", "nih", "nntp", "notes", "oid",
  "opaquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform", "pop", "pres", "prospero", "proxy",
  "psyc", "query", "reload", "res", "resource", "rmi", "rsync", "rtmp", "rtsp", "secondlife", "service", "session",
  "sftp", "sgn", "shttp", "sieve", "sip", "sips", "skype", "smb", "sms", "snews", "snmp", "soap.beep", "soap.beeps",
  "soldat", "spotify", "ssh", "steam", "svn", "tag", "teamspeak", "tel", "telnet", "tftp", "things", "thismessage",
  "tn3270", "tip", "tv", "udp", "unreal", "urn", "ut2004", "vemmi", "ventrilo", "videotex", "view-source", "wais", "webcal",
  "ws", "wss", "wtai", "wyciwyg", "xcon", "xcon-userid", "xfire", "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri", "ymsgr",

  // unofficial schemes
  "javascript", "jdbc", "doi"
};

/**
 * Returns a heap-allocated copy of `str` that the caller must free().
 * Returns NULL when `str` is NULL or the allocation fails.
 */
char *
m_strdup (const char *str) {
  if (!str) return NULL;

  size_t size = strlen(str) + 1;
  char *dup = malloc(size);
  if (dup) memcpy(dup, str, size);
  return dup;
}

/**
 * Scans one token out of `url`, starting `l` bytes into the string.
 *
 * `format` is a sscanf format holding exactly one string conversion
 * (for example "%[^@]" or "/%s"); the token it captures is returned as a
 * heap-allocated string that the caller must free().
 *
 * Returns NULL when:
 *   - `url` or `format` is NULL, `l` is negative or points past the end
 *     of `url`, or the allocation fails;
 *   - the captured token equals the whole remaining input, i.e. the
 *     delimiter named by `format` was never reached, so the part is
 *     considered absent.
 */
static char *
get_part (const char *url, const char *format, int l) {
  if (!url || !format || l < 0) return NULL;

  size_t url_len = strlen(url);
  if ((size_t) l > url_len) return NULL;

  const char *rest = url + l;

  // a token can never be longer than the input it is scanned from,
  // so this buffer is large enough for any url length
  char *token = calloc(url_len - (size_t) l + 1, sizeof(char));
  if (!token) return NULL;

  // nothing converted: treat the token as empty
  if (1 != sscanf(rest, format, token)) token[0] = '\0';

  if (0 == strcmp(token, rest)) {
    free(token);
    return NULL;
  }

  return token;
}
url_data_t *
url_parse (char *url) {url_data_t *data = malloc(sizeof(url_data_t));if (!data) return NULL;data->href = url;char *tmp;char *tmp_url = m_strdup(url);bool is_ssh = false;/************ 解析协议头 ***********/char *protocol = url_get_protocol(tmp_url);if (!protocol) return NULL;// length of protocol plus ://int protocol_len = (int) strlen(protocol) + 3;data->protocol = protocol;is_ssh = url_is_ssh(protocol);/************ 解析用户名 ***********/char *auth = NULL;int auth_len = 0;if ((tmp = strstr(tmp_url, "@"))) {auth = get_part(tmp_url, "%[^@]", protocol_len);auth_len = strlen(auth);if (auth) auth_len++;}data->auth = auth;char *hostname = NULL;/************ 解析主机名(包括端口号) ***********/hostname = (is_ssh)? get_part(tmp_url, "%[^:]", protocol_len + auth_len): get_part(tmp_url, "%[^/]", protocol_len + auth_len);if (!hostname) return NULL;int hostname_len = (int) strlen(hostname);char *tmp_hostname = m_strdup(hostname);data->hostname = hostname;/************ 解析主机名 ***********/char *host = malloc(strlen(tmp_hostname) * sizeof(char));memset(host, 0, strlen(tmp_hostname) * sizeof(char));sscanf(tmp_hostname, "%[^:]", host);if (!host) return NULL;int host_len = (int) strlen(host);data->host = host;/************ 解析端口号 ***********/char *port = malloc(URL_PROTOCOL_MAX_LENGTH * (sizeof(char)));memset(port, 0, URL_PROTOCOL_MAX_LENGTH * (sizeof(char)));if (!port) return NULL;tmp_hostname = tmp_hostname + (host_len + 1);sscanf(tmp_hostname, "%s", port);tmp_hostname = tmp_hostname - (host_len + 1);data->port = port;free(tmp_hostname);/************ 解析完整路径名 ***********/char *tmp_path;tmp_path = (is_ssh)? get_part(tmp_url, ":%s", protocol_len + auth_len + hostname_len): get_part(tmp_url, "/%s", protocol_len + auth_len + hostname_len);char *path = malloc(strlen(tmp_path) * sizeof(char) + 1);memset(path, 0, strlen(tmp_path) * sizeof(char) + 1);if (!path) return NULL;char *fmt = (is_ssh)? 
"%s" : "/%s";sprintf(path, fmt, tmp_path);data->path = path;free(tmp_path);/************ 解析路径名(不包括参数) ***********/char *pathname = malloc(strlen(path) * sizeof(char) + 1);memset(pathname, 0, strlen(path) * sizeof(char) + 1);if (!pathname) return NULL;tmp_path = m_strdup(path);sscanf(tmp_path, "%[^? | ^#]", pathname);int pathname_len = strlen(pathname);data->pathname = pathname;/************* 解析搜索参数 ***********/char *search = malloc(URL_AUTH_MAX_LENGTH * sizeof(search));memset(search, 0, URL_AUTH_MAX_LENGTH * sizeof(search));if (!search) return NULL;tmp_path = tmp_path + pathname_len;sscanf(tmp_path, "%[^#]", search);tmp_path = tmp_path - pathname_len;data->search = search;int search_len = strlen(search);free(tmp_path);/************* 解析查询参数 ***********/char *query = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));memset(query, 0, URL_AUTH_MAX_LENGTH * sizeof(char));if (!query) return NULL;sscanf(data->search, "?%s", query);data->query = query;/************* 解析hash值 ***********/char *hash = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));memset(hash, 0, URL_AUTH_MAX_LENGTH * sizeof(char));if (!hash) return NULL;tmp_path = data->path + (pathname_len + search_len);sscanf(tmp_path, "%s", hash);tmp_path = data->path - (pathname_len + search_len);data->hash = hash;free(tmp_path);return data;
}bool
url_is_protocol (char *str) {int count = sizeof(URL_SCHEMES) / sizeof(URL_SCHEMES[0]);for (int i = 0; i < count; ++i) {if (0 == strcmp(URL_SCHEMES[i], str)) {return true;}}return false;
}bool
url_is_ssh (char *str) {str = m_strdup(str);if (0 == strcmp(str, "ssh") ||0 == strcmp(str, "git")) {free(str);return true;}return false;
}char *
url_get_protocol (char *url) {char *protocol = malloc(URL_PROTOCOL_MAX_LENGTH * sizeof(char));if (!protocol) return NULL;sscanf(url, "%[^://]", protocol);if (url_is_protocol(protocol)) return protocol;return NULL;
}char *
url_get_auth (char *url) {char *protocol = url_get_protocol(url);if (!protocol) return NULL;int l = (int) strlen(protocol) + 3;return get_part(url, "%[^@]", l);
}char *
url_get_hostname (char *url) {int l = 3;char *protocol = url_get_protocol(url);char *tmp_protocol = m_strdup(protocol);char *auth = url_get_auth(url);if (!protocol) return NULL;if (auth) l += strlen(auth) + 1; // add one @ symbolif (auth) free(auth);l += (int) strlen(protocol);free(protocol);char * hostname = url_is_ssh(tmp_protocol)? get_part(url, "%[^:]", l): get_part(url, "%[^/]", l);free(tmp_protocol);return hostname;
}char *
url_get_host (char *url) {char *host = malloc(URL_HOSTNAME_MAX_LENGTH * sizeof(char));char *hostname = url_get_hostname(url);memset(host, 0, URL_HOSTNAME_MAX_LENGTH * sizeof(char));if (!host || !hostname) return NULL;sscanf(hostname, "%[^:]", host);free(hostname);return host;
}char *
url_get_pathname (char *url) {char *path = url_get_path(url);char *pathname = malloc(URL_MAX_LENGTH * sizeof(char));memset(pathname, 0, URL_MAX_LENGTH * sizeof(char));if (!path || !pathname) return NULL;sscanf(path, "%[^?]", pathname);free(path);return pathname;
}char *
url_get_path (char *url) {int l = 3;char *tmp_path;char *protocol = url_get_protocol(url);char *auth = url_get_auth(url);char *hostname = url_get_hostname(url);if (!protocol || !hostname)return NULL;bool is_ssh = url_is_ssh(protocol);l += (int) strlen(protocol) + (int) strlen(hostname);if (auth) l+= (int) strlen(auth) +1; // @ symboltmp_path = (is_ssh)? get_part(url, ":%s", l): get_part(url, "/%s", l);char *fmt = (is_ssh)? "%s" : "/%s";char *path = malloc(strlen(tmp_path) * sizeof(char) + 1);memset(path, 0, strlen(tmp_path) * sizeof(char) + 1);sprintf(path, fmt, tmp_path);if (auth) free(auth);free(protocol);free(hostname);free(tmp_path);return path;}char *
url_get_search (char *url) {char *path = url_get_path(url);char *pathname = url_get_pathname(url);char *search = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));memset(search, 0, URL_AUTH_MAX_LENGTH * sizeof(char));if (!path || !search) return NULL;path = path + (int)strlen(pathname);sscanf(path, "%[^#]", search);path = path - (int)strlen(pathname);free(path);free(pathname);return search;
}char *
url_get_query (char *url) {char *search = url_get_search(url);char *query = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));memset(query, 0, URL_AUTH_MAX_LENGTH * sizeof(char));if (!search) return NULL;sscanf(search, "?%s", query);free(search);return query;
}char *
url_get_hash (char *url) {char *hash = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));memset(hash, 0, URL_AUTH_MAX_LENGTH * sizeof(char));if (!hash) return NULL;char *path = url_get_path(url);if (!path) return NULL;char *pathname = url_get_pathname(url);if (!pathname) return NULL;char *search = url_get_search(url);int pathname_len = (int) strlen(pathname);int search_len = (int) strlen(search);path = path + (pathname_len + search_len);sscanf(path, "%s", hash);path = path - (pathname_len + search_len);free(pathname);free(path);if (search) free(search);return hash;
}char *
url_get_port (char *url) {char *port = malloc(URL_PROTOCOL_MAX_LENGTH * sizeof(char));memset(port, 0, URL_PROTOCOL_MAX_LENGTH * sizeof(char));char *hostname = url_get_hostname(url);char *host = url_get_host(url);if (!port || !hostname) return NULL;char *tmp_hostname = hostname;tmp_hostname = hostname + (strlen(host) + 1);sscanf(tmp_hostname, "%s", port);free(hostname);return port;
}void
url_inspect (char *url) {url_data_inspect(url_parse(url));
}void
url_data_inspect (url_data_t *data) {printf("#url =>\n");printf(" .href: \"%s\"\n", data->href);printf(" .protocol: \"%s\"\n", data->protocol);printf(" .host: \"%s\"\n", data->host);printf(" .auth: \"%s\"\n", data->auth);printf(" .hostname: \"%s\"\n", data->hostname);printf(" .pathname: \"%s\"\n", data->pathname);printf(" .search: \"%s\"\n", data->search);printf(" .path: \"%s\"\n", data->path);printf(" .hash: \"%s\"\n", data->hash);printf(" .query: \"%s\"\n", data->query);printf(" .port: \"%s\"\n", data->port);
}void
url_free (url_data_t *data) {if (!data) return;if (data->auth) free(data->auth);if (data->protocol) free(data->protocol);if (data->hostname) free(data->hostname);if (data->host) free(data->host);if (data->pathname) free(data->pathname);if (data->path) free(data->path);if (data->hash) free(data->hash);if (data->search) free(data->search);if (data->query) free(data->query);
}
/*************************************************************************> File Name: main.c> Author: qianghaohao(Xqiang)> Program: url_parser> Readme: 本URL解析器代码来自https://github.com/jwerle/url.h> 在此基础上进行了大量的修改,修复了很多bug.> 总体思路没有改变,就是把很多细节改了下,现在可以正常使用了.> -->可能还存在bug,如果网友发现了可以及时指正.> Platform: 可以跨平台使用> Created Time: 2016年06月11日 星期六 18时39分14秒************************************************************************/#include"url_parser.h"
url_data_t *url_info = NULL;

// Prints "label:value" on its own line, then releases `value`.
// A NULL value (part absent or lookup failed) is shown as "(null)".
static void
print_and_free (const char *label, char *value) {
  printf("%s:%s\n", label, value ? value : "(null)");
  free(value);
}

/**
 * Demo driver: parses the url given as the only command-line argument,
 * dumps the parsed struct, then exercises each single-part accessor.
 * Returns 0 on success and -1 on a usage or parse error.
 */
int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "usage:%s <url>\n", argc > 0 ? argv[0] : "url_parser");
    return -1;
  }

  url_info = url_parse(argv[1]);
  if (NULL == url_info) {
    fprintf(stderr, "%s error...\n", argv[1]);
    return -1;
  }
  url_data_inspect(url_info);

  // Test case: http://www.baidu.com:80/cig-bin/index.html?sdkfj#283sjkdf
  printf("\n================= API Demo ================\n");
  printf("href:%s\n", argv[1]);
  print_and_free("hostname", url_get_hostname(argv[1]));
  print_and_free("path", url_get_path(argv[1]));
  print_and_free("host", url_get_host(argv[1]));
  print_and_free("protocol", url_get_protocol(argv[1]));
  print_and_free("auth", url_get_auth(argv[1]));
  print_and_free("search", url_get_search(argv[1]));
  print_and_free("hash", url_get_hash(argv[1]));
  print_and_free("query", url_get_query(argv[1]));
  print_and_free("port", url_get_port(argv[1]));

  // url_free releases the part strings only; the struct is freed here
  url_free(url_info);
  free(url_info);
  url_info = NULL;

  return 0;
}
C语言URL解析器(代码分享)相关推荐
- 「Python 编程」编码实现网络请求库中的 URL 解析器
相信各位 Python 开发者都用过 Requests 库,有些朋友还用过 WebSockets 库.这里回顾一下它们的基本用法,例如使用 Requests 库向目标网站发出 GET 请求: impo ...
- c语言注释语句执行吗,C语言学习笔记之C语言概念解析(附资料分享)每一个语句都必须以分号结尾但预处理命令函数头和花括号“}”之后不能加分号...
[[怪兽爱C语言]C语言学习笔记之C语言概念解析(附资料分享)]https://toutiao.com/group/6582429294901854728/?iid=15906422033&a ...
- 先弄个XML解析器代码抄一抄 慢慢研究 O(∩_∩)O哈哈~
出处:http://bbs.csdn.net/topics/390229172 已经自我放逐好几年了.打算去上班得了.在最后的自由日子里,做点有意义的事吧... 先来下载地址 http:/ ...
- 李宏毅2023机器学习作业HW03解析和代码分享
ML2023Spring - HW3 相关信息: 课程主页 课程视频 Kaggle link Sample code HW03 视频 HW03 PDF 个人完整代码分享: GitHub | Gitee ...
- c语言xml解析器libxm2
写这篇文章的原因有如下几点:1)C++标准库中没有操作XML的方法,用C++操作XML文件必须熟悉一种函数库,LIBXML2是其中一种很优秀的XML库,而且它同时支持多种编程语言:2)LIBXML2库 ...
- c语言统计字母字符数字字符个数,C语言统计字符个数代码分享
C语言实现统计字符个数 #include int main() { int sz[10]={0},zm[26]={0},z[26]={0},i,space=0,e=0,t=0; char c; pri ...
- java编写文件管理器,java编写的文件管理器代码分享
比较适合新手.逻辑上仍然有点问题.可以用于学习java文件操作 下面是主要的JAVA文件操作代码 FileHelp.java package self.yy.filesystem.fileutil; ...
- Python Django 根路由命名空间URL解析方式代码示例
- R语言画克利夫兰点图-代码分享
使用colnames函数更改列名 使用sort函数获取某一列的排序索引 使用factor函数根据某一列数值大小,更改另一列的levels 尽量少使用coord_flip函数,开始画图就想好谁是X轴,谁 ...
- c语言快速排序算法代码,c语言快速排序算法示例代码分享
#include #include #include #define RANDOM(i) (rand()%i) #define N 9 //设置数组长度 //分区操作 int Partition ...
最新文章
- MongoDb优化指南
- access哪个速度快 vfp_大学计算机二级考试,报考哪个科目比较好?
- windows 常用工具
- 系统安装,重装与优化:chapter2 硬盘的分区与格式化
- Spring Cloud GatewayAPI网关服务
- boost::function模块实现contains的测试程序
- 将mnist获得的数据还原成图片形式
- C++ Programming with TDD之一:GMOCK框架简介
- AWS披露面向Amazon S3的AI监控方案
- Tomcat线程池原理
- Android项目(完整版+免费版)
- 很多网友问那个磁力搜索站好用,就由本君说说吧!
- C++实现求复数的模长
- 切换无线网卡失败服务器提示,无线网卡切换为AP模式时提示ICS启动失败的解决方法...
- 教你 用c语言输出乘法口诀表 一giao我嘞gaiogiao
- ShareIntentUtil【调用系统自带的分享的工具类】
- 百万前端之js通过链接生成二维码可以保存下载复制
- 【HGE引擎】源码解析——常用公共函数(二)
- [网络工程师]-防火墙-入侵防护系统IPS
- tableau制作日历图学习
热门文章
- prometheus如何评估告警策略以及如何推送告警消息到alertmanager?
- pyTest官方手册(Release 4.2)之蹩脚翻译(2)
- IDEA添加gitlab仓库并上传代码(无需使用任何git指令),报错Ask a project Owner or Maintainer to create a default branch解决方案
- PTX-NPs 纳米粒子修饰紫杉醇/与桦木酸PEG/邻硝基苯丙酸紫杉醇偶联物的制备
- Solr之Facet与FacetPivot的使用和区别
- 导航卫星系统实时可视化平台开发
- android如何设置qq邮箱格式,邮件客户端和手机设置QQ邮箱IMAP服务
- OpenHarmony更新编译问题及解决办法
- 16位院士加盟!“双一流”上海大学成立人工智能研究院
- 新手焊接电路板_手把手教您如何掌握焊接电路板基础知识