select, epoll的个人总结

参考链接

https://www.cnblogs.com/lojunren/p/3856290.html

上面的文章中有几个错误：1.单个poll所能监听的文件描述符的数量是有限制的，因为nfds的类型是：

/* nfds_t: the type the note above cites as the bound on how many descriptors a single poll() can watch. */
typedef unsigned long int nfds_t;

select 的参考链接

https://www.cnblogs.com/jaydenhpj/p/5121030.html

https://blog.csdn.net/diaozhiwa5526/article/details/102152201（简单版）

线程被select阻塞

/*
 * select() prototype. maxfdp is the highest-numbered file descriptor of
 * interest plus 1; readfds/writefds/errorfds are the descriptor sets to
 * watch and timeout bounds the wait.
 */
int select(int maxfdp,fd_set *readfds,fd_set *writefds,fd_set *errorfds,struct timeval *timeout);

maxfdp:最大文件描述符编号+1

2.select的添加和返回判断的fd是一个东西吗？

添加的时候，会将内部的一个数组置位
返回判断的时候，将readset和内部的数组进行比较

3.select什么时候被唤醒

当文件描述符就绪的时候，调用select的线程将会从设备的等待队列中唤醒，

然后do_select函数会重新检查所有关心的文件描述符，并将内核态的fd_set中就绪描述符对应的位置位

之后sys_select会将内核态的fd_set拷贝到用户内存

4.唤醒后会返回到用户态的select()

源码解析：（linux2.6.11）

/*
 * select() system-call entry point (as quoted from Linux 2.6.11).
 *
 * Copies the caller's three fd sets from user space into kernel memory,
 * lets do_select() do the actual polling/sleeping, then copies the three
 * result sets back to user space.
 *
 * n:    number of descriptors to examine; clamped to the current
 *       process's max_fdset, rejected with -EINVAL when negative.
 * inp/outp/exp: user-space read / write / exception sets.
 * tvp:  optional user-space timeout; when NULL the timeout stays at
 *       MAX_SCHEDULE_TIMEOUT.
 *
 * Returns do_select()'s result, or a negative errno (-EINVAL, -ENOMEM,
 * -EFAULT, -ERESTARTNOHAND, or whatever the user-copy helpers report).
 */
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,fd_set __user *exp, struct timeval __user *tvp)
{
	fd_set_bits fds;
	char *bits;
	long timeout;
	int ret, size, max_fdset;

	/* Default used when the caller passes no timeval. */
	timeout = MAX_SCHEDULE_TIMEOUT;
	if (tvp) {
		time_t sec, usec;

		/* Fetch the timeval from user space; any failure is returned as-is. */
		if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp)))
		    || (ret = __get_user(sec, &tvp->tv_sec))
		    || (ret = __get_user(usec, &tvp->tv_usec)))
			goto out_nofds;

		ret = -EINVAL;
		if (sec < 0 || usec < 0)
			goto out_nofds;

		/*
		 * Convert to HZ units (ticks). Values at or above
		 * MAX_SELECT_SECONDS keep the default timeout set above.
		 */
		if ((unsigned long) sec < MAX_SELECT_SECONDS) {
			timeout = ROUND_UP(usec, 1000000/HZ);
			timeout += sec * (unsigned long) HZ;
		}
	}

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fdset can increase, so grab it once to avoid race */
	max_fdset = current->files->max_fdset;
	if (n > max_fdset)
		n = max_fdset;

	ret = -ENOMEM;
	size = FDS_BYTES(n);
	bits = select_bits_alloc(size);
	if (!bits)
		goto out_nofds;
	/*
	 * Six bitmaps of `size` bytes each are laid out back to back in
	 * `bits`: the three input sets followed by the three result sets.
	 */
	fds.in      = (unsigned long *)  bits;
	fds.out     = (unsigned long *) (bits +   size);
	fds.ex      = (unsigned long *) (bits + 2*size);
	fds.res_in  = (unsigned long *) (bits + 3*size);
	fds.res_out = (unsigned long *) (bits + 4*size);
	fds.res_ex  = (unsigned long *) (bits + 5*size);

	/* Copy the read/write/exception bits of every fd of interest from user space into the kernel */
	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	/* The main work happens here */
	ret = do_select(n, &fds, &timeout);

	/*
	 * Unless the STICKY_TIMEOUTS personality bit is set, write the
	 * timeout value left behind by do_select() back to the caller's
	 * timeval, converted from ticks to sec/usec.
	 */
	if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
		time_t sec = 0, usec = 0;
		if (timeout) {
			sec = timeout / HZ;
			usec = timeout % HZ;
			usec *= (1000000/HZ);
		}
		put_user(sec, &tvp->tv_sec);
		put_user(usec, &tvp->tv_usec);
	}

	if (ret < 0)
		goto out;
	/* Nothing ready: -ERESTARTNOHAND if a signal is pending, otherwise 0. */
	if (!ret) {
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	/* Copy the three result bitmaps back to user space. */
	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	select_bits_free(bits, size);
out_nofds:
	return ret;
}

sys_select 负责将用户态数据拷贝到内核态，

返回后将内核态数据拷贝到用户态

do_select

/*
 * Core of select(): scan every descriptor of interest, and sleep/rescan
 * until at least one is ready, the timeout expires, a signal is pending
 * or the poll table reports an error.
 *
 * n:       number of descriptors to consider (replaced by the value
 *          max_select_fd() returns).
 * fds:     kernel-side input bitmaps (in/out/ex) and result bitmaps
 *          (res_in/res_out/res_ex).
 * timeout: in/out, in ticks; on return holds the value last returned by
 *          schedule_timeout() (or the input value if we never slept).
 *
 * Returns the number of ready (fd, condition) pairs counted, or a
 * negative error from max_select_fd() / table.error.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i;
	long __timeout = *timeout;

	spin_lock(&current->files->file_lock);
	retval = max_select_fd(n, fds);
	spin_unlock(&current->files->file_lock);

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	/* Zero timeout: pure poll, so never register on any wait queue. */
	if (!__timeout)
		wait = NULL;
	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

		/*
		 * Mark the current process as interruptibly sleeping; it has
		 * not actually been scheduled out yet.
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* One iteration per bitmap word (__NFDBITS descriptors). */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			struct file_operations *f_op = NULL;
			struct file *file = NULL;

			/*
			 * Bits nobody asked about still have to be stepped
			 * over here, which wastes time.
			 */
			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				i += __NFDBITS;
				continue;
			}

			/* Walk every bit of interest in this word. */
			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				file = fget(i);
				if (file) {
					f_op = file->f_op;
					mask = DEFAULT_POLLMASK;
					/*
					 * Call the file's poll method: it puts
					 * the current process on the wait queue
					 * and installs the wake-up function (the
					 * driver calls it when data arrives), and
					 * returns the current readable / writable
					 * / exception state of the fd. For
					 * sockets this is sock_poll, set up in
					 * socket_file_ops. Once something is
					 * already ready (retval != 0) NULL is
					 * passed instead of the poll table.
					 */
					if (f_op && f_op->poll)
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					fput(file);
					/* readable */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					/* writable */
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					/* exceptional condition */
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
				/* Reschedule if necessary. */
				cond_resched();
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
		}
		/* Wait queues are registered on the first pass only. */
		wait = NULL;
		/*
		 * After the scan: leave the loop if anything was ready
		 * (retval != 0), if the timeout is zero, or if a signal is
		 * pending.
		 */
		if (retval || !__timeout || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}
		/*
		 * Nothing of interest was ready. Schedule the current process
		 * out (indefinitely if select was called with no timeout).
		 * When a driver receives data it calls the wake-up function
		 * installed through poll, which wakes this process so that it
		 * rechecks all bits of interest.
		 */
		__timeout = schedule_timeout(__timeout);
	} /* end of the for (;;) loop */
	__set_current_state(TASK_RUNNING);

	poll_freewait(&table);

	/*
	 * Up-to-date the caller timeout.
	 */
	*timeout = __timeout;
	return retval;
}

如果第一次检测发现某个文件描述符没有就绪，则将调用select的进程挂入该设备的等待队列中，

如果都没有就绪，则阻塞。

其中任何一个就绪的时候，进程将被唤醒，并且重新检查所有的文件描述符，然后把就绪的文件描述符对应的位置位。

epoll 源码：

/*
 * epoll_create() prototype. Per the notes below, it creates an eventpoll
 * structure bound to a file, and the size argument is not actually used.
 */
int epoll_create(int size);

https://blog.csdn.net/justlinux2010/article/details/8506940

size参数是真的没用。。。

epoll_create生成了一个eventpoll结构并将它与一个文件绑定

struct eventpoll
{spin_lock_t lock;            //对本数据结构的访问struct mutex mtx;            //防止使用时被删除wait_queue_head_t wq;        //sys_epoll_wait() 使用的等待队列wait_queue_head_t poll_wait; //file->poll()使用的等待队列struct list_head rdllist;    //事件满足条件的双向链表struct rb_root rbr;          //红黑树struct epitem *ovflist;      //将事件到达的fd进行链接起来发送至用户空间
}

epoll_ctl:

https://blog.csdn.net/Mr_H9527/article/details/99745659

/*
 * epoll_ctl() prototype. Per the note below, insert, delete and modify
 * operations all look the fd up in the eventpoll's red-black tree.
 */
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

当插入，删除，修改的时候，都是查找eventpoll结构的红黑树

事件的数据结构：

struct epitem
{struct rb_node rbn;            //用于主结构管理的红黑树struct list_head rdllink;       //事件就绪队列struct epitem *next;           //用于主结构体中的链表struct epoll_filefd ffd;         //每个fd生成的一个结构int nwait;                 struct list_head pwqlist;     //poll等待队列struct eventpoll *ep;          //该项属于哪个主结构体struct list_head fllink;         //链接fd对应的file链表struct epoll_event event;  //注册的感兴趣的事件,也就是用户空间的epoll_event}

重点是下面的插入函数

static int ep_insert(struct eventpoll *ep, struct epoll_event *event,struct file *tfile, int fd)
{int error, revents, pwake = 0;unsigned long flags;struct epitem *epi;struct ep_pqueue epq;error = -ENOMEM;// 分配一个epitem结构体来保存每个加入的fdif (!(epi = kmem_cache_alloc(epi_cache, SLAB_KERNEL)))goto eexit_1;/* Item initialization follow here ... */// 初始化结构体ep_rb_initnode(&epi->rbn);INIT_LIST_HEAD(&epi->rdllink);INIT_LIST_HEAD(&epi->fllink);INIT_LIST_HEAD(&epi->txlink);INIT_LIST_HEAD(&epi->pwqlist);epi->ep = ep;ep_set_ffd(&epi->ffd, tfile, fd);epi->event = *event;atomic_set(&epi->usecnt, 1);epi->nwait = 0;/* Initialize the poll table using the queue callback */epq.epi = epi;// 安装poll回调函数，这一行是非常重要的。。。。init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);/** Attach the item to the poll hooks and get current event bits.* We can safely use the file* here because its usage count has* been increased by the caller of this function.*/// 将当前item添加至poll hook中，然后获取当前event位revents = tfile->f_op->poll(tfile, &epq.pt);/** We have to check if something went wrong during the poll wait queue* install process. Namely an allocation for a wait queue failed due* high memory pressure.*/if (epi->nwait < 0)goto eexit_2;/* Add the current item to the list of active epoll hook for this file */spin_lock(&tfile->f_ep_lock);list_add_tail(&epi->fllink, &tfile->f_ep_links);spin_unlock(&tfile->f_ep_lock);/* We have to drop the new item inside our item list to keep track of it */write_lock_irqsave(&ep->lock, flags);/* Add the current item to the rb-tree */ep_rbtree_insert(ep, epi);/* If the file is already "ready" we drop it inside the ready list */if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {list_add_tail(&epi->rdllink, &ep->rdllist);/* Notify waiting tasks that events are available */if (waitqueue_active(&ep->wq))wake_up(&ep->wq);if (waitqueue_active(&ep->poll_wait))pwake++;}write_unlock_irqrestore(&ep->lock, flags);/* We have to call this outside the lock */if (pwake)ep_poll_safewake(&psw, &ep->poll_wait);DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",current, ep, 
tfile, fd));return 0;eexit_2:ep_unregister_pollwait(ep, epi);/** We need to do this because an event could have been arrived on some* allocated wait queue.*/write_lock_irqsave(&ep->lock, flags);if (ep_is_linked(&epi->rdllink))ep_list_del(&epi->rdllink);write_unlock_irqrestore(&ep->lock, flags);kmem_cache_free(epi_cache, epi);
eexit_1:return error;
}

最重要的一行是下面这行：

    // 安装poll回调函数，这一行是非常重要的。。。。
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

ep_ptable_queue_proc函数设置了等待队列的ep_poll_callback回调函数。在设备硬件数据到来时，硬件中断函数唤醒该等待队列上等待的进程时，会调用唤醒函数ep_poll_callback。这个函数会将事件添加到rdllist里面去

https://blog.csdn.net/zhaobryant/article/details/80557262，这篇博客基本涵盖了所有关于epoll的问题

下面看看epoll_wait

/*
 * epoll_wait() system-call entry point.
 *
 * epfd:      descriptor of the eventpoll file.
 * events:    user-space array that receives the ready events.
 * maxevents: capacity of `events`; must be in 1..MAX_EVENTS.
 * timeout:   passed through to ep_poll().
 *
 * Validates the arguments, resolves epfd to its eventpoll and delegates
 * to ep_poll(). Returns ep_poll()'s result, or -EINVAL / -EFAULT / -EBADF
 * when validation fails.
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
	int error;
	struct file *file;
	struct eventpoll *ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
		     current, epfd, events, maxevents, timeout));

	/* The maximum number of event must be greater than zero */
	if (maxevents <= 0 || maxevents > MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
		error = -EFAULT;
		goto eexit_1;
	}

	/*
	 * Get the "struct file *" for the eventpoll file (per the author's
	 * note, that file structure is created in epoll_create).
	 */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto eexit_1;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!is_file_epoll(file))
		goto eexit_2;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/* Time to fish for events ... this is the core routine. */
	error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
	fput(file);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
		     current, epfd, events, maxevents, timeout, error));

	return error;
}

下面是ep_poll:

/*
 * Wait for events on an eventpoll and transfer them to user space.
 *
 * ep:        the eventpoll to wait on.
 * events:    user-space array that receives the events.
 * maxevents: capacity of `events`.
 * timeout:   milliseconds; a negative value or one >= EP_MAX_MSTIMEO is
 *            mapped to MAX_SCHEDULE_TIMEOUT.
 *
 * Returns the value of ep_events_transfer(), 0 when nothing was
 * transferred, or -EINTR when a signal is pending while the ready list
 * is still empty.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Calculate the timeout by checking for the "infinite" value ( -1 )
	 * and the overflow condition. The passed timeout is in milliseconds,
	 * that why (t * HZ) / 1000.
	 */
	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
	write_lock_irqsave(&ep->lock, flags);

	res = 0;
	/* If ep->rdllist is empty, block the current process. */
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 */
		init_waitqueue_entry(&wait, current);
		add_wait_queue(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* Drop the lock while sleeping, retake it afterwards. */
			write_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);
			write_lock_irqsave(&ep->lock, flags);
		}
		remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/* Is it worth to try to dig for events ? */
	eavail = !list_empty(&ep->rdllist);

	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}

epoll_wait调用ep_poll，当rdllist为空（无就绪fd）时挂起当前进程，直到rdllist不空时进程才被唤醒。
文件fd状态改变（buffer由不可读变为可读或由不可写变为可写），导致相应fd上的回调函数ep_poll_callback()被调用。
ep_poll_callback将相应fd对应的epitem加入rdllist，导致rdllist不空，进程被唤醒，epoll_wait得以继续执行。
ep_events_transfer函数将rdllist中的epitem拷贝到txlist中，并将rdllist清空。
ep_send_events函数（很关键），它扫描txlist中的每个epitem，调用其关联fd对应的poll方法。此时对poll的调用仅仅是取得fd上较新的events（防止之前events被更新），之后将取得的events和相应的fd发送到用户空间（封装在struct epoll_event，从epoll_wait返回）。（https://blog.csdn.net/zhaobryant/article/details/80557262）

ET 和 LT的区别？？

https://blog.csdn.net/eyucham/article/details/86502186

本质在于：LT模式下每次都会执行（上面链接文章图中）蓝线所示的步骤，将fd重新放回rdllist，而ET模式不放回。

select, epoll的个人总结相关推荐

使用多线程还是用IO复用select/epoll? epoll 或者 kqueue 的原理是什么？
原作者:蓝形参原文:http://www.zhihu.com/question/20114168/answer/14024115 使用多线程还是用IO复用select/epoll? 多线程模型适用于 ...
Python网络编程（4）——异步编程select epoll
在SocketServer模块的学习中,我们了解了多线程和多进程简单Server的实现,使用多线程.多进程技术的服务端为每一个新的client连接创建一个新的进/线程,当client数量较多时,这种技 ...
linux IO多路复用 select epoll
概念 IO多路复用是指内核一旦发现进程指定的一个或者多个IO条件准备读取,它就通知该进程通俗理解(摘自网上一大神) 这些名词比较绕口,理解涵义就好.一个epoll场景:一个酒吧服务员(一个线程),前 ...
Linux之poll/select/epoll代码示例
Linux poll and epoll poll 问题:假如应用需要根据IO的状态来读或写多个IO,如何处理?如果是一个进程处理,一个一个IO的处理,那么就势必会出现阻塞等待某个IO的过程,此时就可 ...
多路IO复用模型 select epoll 等
同步阻塞IO在等待数据就绪上花去太多时间,而传统的同步非阻塞IO虽然不会阻塞进程,但是结合轮询来判断数据是否就绪仍然会耗费大量的CPU时间. 多路IO复用提供了对大量文件描述符进行就绪检查的高性能方案 ...
select,epoll,poll比较
select,poll,epoll简介 select select本质上是通过设置或者检查存放fd标志位的数据结构来进行下一步处理.这样所带来的缺点是: 1 单个进程可监视的fd数量被限制 2 需要维 ...
select,epoll,poll比较（网络资源总结）
select,poll,epoll简介 select select本质上是通过设置或者检查存放fd标志位的数据结构来进行下一步处理.这样所带来的缺点是: 1 单个进程可监视的fd数量被限制 2 需要维 ...
Linux下socket(select,epoll)
1. Linuxsocket的简介在linux支持select模式,poll模式,在内核2.6版本以后支持epoll模式: epoll模式的优点: A:支持进程打开的最大socket数据 B:IO效 ...
IO多路复用器 select, epoll发展历程，工作原理，区别
什么是IO多路复用器首先这里的IO指的是网络IO,也就是网络连接,如果把一个IO想象成一条路,这些路是连接到内核的,就程序自身而言它是不知道这些路上有没有数据到达的. 为了知道这一点,程序就得遍历每 ...

select, epoll的个人总结

当插入，删除，修改的时候，都是查找eventpoll结构的红黑树

select, epoll的个人总结相关推荐

最新文章

热门文章