Linux I/O 复用 poll源码解析

Linux I/O poll 源码分析

poll

poll函数系统调用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/*
 * User-space prototype of poll(2).
 * fds:     array of pollfd entries to watch
 * nfds:    number of entries in fds
 * timeout: wait limit in milliseconds; the kernel entry point treats a
 *          negative value as "no timeout"
 */
int poll(struct pollfd *fds, nfds_t nfds, int timeout);
/* One entry of the array handed to poll(). */
struct pollfd {
	int   fd;       /* file descriptor to check; a negative fd is skipped by do_pollfd */
	short events;   /* event mask requested by the caller */
	short revents;  /* event mask reported back by the kernel */
};

/*
 * poll system-call entry point.
 *
 * ufds:          user-space array of pollfd entries
 * nfds:          number of entries in ufds
 * timeout_msecs: timeout in milliseconds; negative means "no timeout"
 *
 * Returns whatever do_sys_poll() returns, except that -EINTR is replaced
 * by -ERESTART_RESTARTBLOCK after the restart block has been filled in.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		long, timeout_msecs)
{
	struct timespec end_time;
	struct timespec *to = NULL;
	struct restart_block *rb;
	int ret;

	/*
	 * Split the millisecond timeout into whole seconds plus the
	 * remaining nanoseconds and let poll_select_set_timeout() fill
	 * in end_time. With a negative timeout, "to" stays NULL.
	 */
	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	/* The real work: copy in, poll, copy out. */
	ret = do_sys_poll(ufds, nfds, to);
	if (ret != -EINTR)
		return ret;

	/*
	 * Interrupted by a signal: remember the arguments and the already
	 * computed end time so that do_restart_poll() can resume the call.
	 */
	rb = &current_thread_info()->restart_block;
	rb->fn = do_restart_poll;
	rb->poll.ufds = ufds;
	rb->poll.nfds = nfds;

	if (timeout_msecs < 0) {
		rb->poll.has_timeout = 0;
	} else {
		rb->poll.has_timeout = 1;
		rb->poll.tv_sec = end_time.tv_sec;
		rb->poll.tv_nsec = end_time.tv_nsec;
	}

	/*
	 * -ERESTART_RESTARTBLOCK is not meant to be seen by user space
	 * as-is; the signal-return path is expected to act on it and
	 * re-enter through rb->fn.
	 * NOTE(review): whether user space sees a transparent restart or
	 * EINTR is decided by the signal code, not here - confirm there.
	 */
	return -ERESTART_RESTARTBLOCK;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * do_restart_poll - resume a poll call that was interrupted by a signal.
 *
 * Reloads the arguments saved in restart_block->poll, calls do_sys_poll()
 * again and, if that is interrupted once more, re-arms itself as the
 * restart function.
 *
 * Returns the do_sys_poll() result, with -EINTR replaced by
 * -ERESTART_RESTARTBLOCK.
 */
static long do_restart_poll(struct restart_block *restart_block)
{
	struct timespec end_time;
	struct timespec *deadline = NULL;
	struct pollfd __user *ufds = restart_block->poll.ufds;
	int nfds = restart_block->poll.nfds;
	int rc;

	/* Reuse the saved end time so the total wait is not extended. */
	if (restart_block->poll.has_timeout) {
		end_time.tv_sec = restart_block->poll.tv_sec;
		end_time.tv_nsec = restart_block->poll.tv_nsec;
		deadline = &end_time;
	}

	rc = do_sys_poll(ufds, nfds, deadline);
	if (rc != -EINTR)
		return rc;

	/* Interrupted again: ask for another restart through this function. */
	restart_block->fn = do_restart_poll;
	return -ERESTART_RESTARTBLOCK;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/*
 * do_sys_poll - copy the caller's pollfd array into kernel memory, poll it,
 * and write each entry's revents back to user space.
 *
 * ufds:     user-space array of pollfd entries
 * nfds:     number of entries in ufds
 * end_time: timeout handed on to do_poll(); may be NULL
 *
 * Returns the value produced by do_poll() on success, -EINVAL when nfds
 * exceeds the RLIMIT_NOFILE soft limit, -ENOMEM when a list node cannot be
 * allocated, and -EFAULT when copying from or to user space fails.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec *end_time)
{
struct poll_wqueues table;
int err = -EFAULT, fdcount, len, size;
// Use an on-stack buffer for the first chunk so small requests need no heap allocation.
long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; // NOTE(review): the article gives 64 elements; the real count depends on POLL_STACK_ALLOC and sizeof(long)

/*
Layout of one list node, shown for reference:
struct poll_list {
struct poll_list *next;
int len; // number of entries held by this node
struct pollfd entries[0];
};
*/

// The head node lives in the stack buffer; further nodes are kmalloc'ed below.
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
unsigned long todo = nfds;

if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
return -EINVAL; // more entries than the process's descriptor limit allows

len = min_t(unsigned int, nfds, N_STACK_PPS); // number of entries stored in the head node
for (;;) { // copy every user-space pollfd into the kernel-side list
walk->next = NULL;
walk->len = len;
if (!len)
break;
// Every call copies the whole pollfd array into the kernel, so with many descriptors the copying and allocation become a performance bottleneck of poll.
if (copy_from_user(walk->entries, ufds + nfds-todo,
sizeof(struct pollfd) * walk->len)) // copy the next walk->len entries, starting at the first one not yet copied
goto out_fds;

todo -= walk->len; // entries still to be copied
if (!todo)
break; // everything has been copied

len = min(todo, POLLFD_PER_PAGE); // size of the next chunk
size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; // bytes needed for the next node
walk = walk->next = kmalloc(size, GFP_KERNEL); // allocate the next node on the heap and link it in
if (!walk) {
err = -ENOMEM;
goto out_fds;
}
}
// Initialise table; poll_initwait() installs __pollwait as table.pt.qproc.
poll_initwait(&table);
fdcount = do_poll(nfds, head, &table, end_time);
poll_freewait(&table);

// Write the revents of every entry back to the caller, in the original order.
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
int j;

for (j = 0; j < walk->len; j++, ufds++)
if (__put_user(fds[j].revents, &ufds->revents))
goto out_fds; // err is still -EFAULT here
}

err = fdcount;
out_fds:
// Free only the heap nodes; the head node is part of the stack buffer.
walk = head->next;
while (walk) {
struct poll_list *pos = walk;
walk = walk->next;
kfree(pos);
}
return err;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * poll_initwait - prepare a poll_wqueues for a new poll operation.
 *
 * Clears the bookkeeping fields, records the current task as the polling
 * task and installs __pollwait as the queueing callback of pwq->pt.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	/* Bookkeeping starts out empty. */
	pwq->table = NULL;
	pwq->inline_index = 0;
	pwq->error = 0;
	pwq->triggered = 0;

	/* The task that is doing the polling. */
	pwq->polling_task = current;

	init_poll_funcptr(&pwq->pt, __pollwait);
}
/*
 * init_poll_funcptr - set up a poll_table.
 *
 * pt:    table to initialise
 * qproc: callback stored in pt->qproc (poll_initwait() passes __pollwait)
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	/* Start with every event bit enabled in the key. */
	pt->key = ~0UL;
	pt->qproc = qproc;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/*
 * do_poll - scan every pollfd in the list, sleeping between scans until an
 * entry is ready, an error or signal is seen, or the timeout expires.
 *
 * nfds:     number of entries (not used in this body)
 * list:     kernel-side list of pollfd chunks
 * wait:     poll_wqueues whose pt is passed to do_pollfd() during the first scan
 * end_time: timeout; NULL means no timeout, an all-zero value means "do not sleep"
 *
 * Returns the number of entries for which do_pollfd() reported events;
 * if there are none, wait->error, or -EINTR when a signal is pending,
 * or 0 once the wait has timed out.
 */
static int do_poll(unsigned int nfds,  struct poll_list *list,
struct poll_wqueues *wait, struct timespec *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
unsigned long slack = 0;

if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { // zero timeout: already expired
pt = NULL; // do_pollfd() gets no poll_table, so nothing is queued
timed_out = 1; // one scan only, never sleep
}

if (end_time && !timed_out) // NOTE(review): slack is handed to poll_schedule_timeout(); presumably the tolerated timer inaccuracy in nanoseconds - confirm
slack = estimate_accuracy(end_time);

for (;;) {
struct poll_list *walk;

for (walk = list; walk != NULL; walk = walk->next) { // walk the whole list
struct pollfd * pfd, * pfd_end;

pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) { // walk the entries array of this node
// With many descriptors do_pollfd() runs once per entry on every scan, which is another reason poll scales poorly.
if (do_pollfd(pfd, pt)) {
count++; // one more ready entry
pt = NULL; // something is ready, so the task will not sleep; the remaining entries are checked without a poll_table
}
}
}
pt = NULL; // the first full scan is done; later scans never pass a poll_table again
if (!count) { // nothing was ready
count = wait->error; // pick up an error recorded in wait, if any (0 otherwise)
// A pending signal ends the wait with -EINTR so the caller can leave poll promptly.
if (signal_pending(current)) // is a signal pending for this task?
count = -EINTR;
}
if (count || timed_out) // ready entries, an error, a signal, or no more sleeping allowed
break;

if (end_time && !to) { // runs on the first pass only: convert the timeout to ktime_t
expire = timespec_to_ktime(*end_time);
to = &expire;
}
// The task sleeps here until the timeout expires or it is woken up.
// According to the article, the wait-queue entries added to the files are not removed on wake-up but cleaned up together later.
// A zero return from poll_schedule_timeout() is treated as timeout expiry.
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
return count;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/*
 * do_pollfd - evaluate a single pollfd entry.
 *
 * pollfd: entry to check; its revents field is overwritten with the result
 * pwait:  poll_table passed to the file's poll method, or NULL
 *
 * Returns the mask stored in pollfd->revents: 0 for a negative fd,
 * POLLNVAL when fget_light() finds no file, otherwise the file's events
 * restricted to the requested ones plus POLLERR and POLLHUP.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
	const int fd = pollfd->fd;
	unsigned int ready = 0;
	struct file *filp;
	int fput_needed;

	/* A negative descriptor is skipped; revents is just cleared. */
	if (fd < 0)
		goto done;

	/* Look the file up; fput_light() below releases it again. */
	filp = fget_light(fd, &fput_needed);
	if (filp == NULL) {
		/* No file behind this descriptor. */
		ready = POLLNVAL;
		goto done;
	}

	if (filp->f_op && filp->f_op->poll) {
		/* Tell the poll method which events this entry cares about. */
		if (pwait)
			pwait->key = pollfd->events | POLLERR | POLLHUP;
		/* Ask the file for its current state, passing pwait along. */
		ready = filp->f_op->poll(filp, pwait);
	} else {
		/* Files without a poll method report DEFAULT_POLLMASK. */
		ready = DEFAULT_POLLMASK;
	}

	/* Drop events the caller did not ask for; errors and hang-ups stay. */
	ready &= pollfd->events | POLLERR | POLLHUP;
	fput_light(filp, fput_needed);

done:
	/* Record the result for this entry. */
	pollfd->revents = ready;
	return ready;
}
  1. select用位图组织文件描述符和事件掩码,而poll用链表+数组的形式
  2. 要检测的文件描述符多的情况下,select全部用堆上的空间,poll用栈空间和堆空间
  3. select被信号中断后返回给用户EINTR;poll被信号中断时内核返回-ERESTART_RESTARTBLOCK,只有在没有用户信号处理函数需要执行时才会通过do_restart_poll重新启动,否则用户同样会得到EINTR

poll系统调用的函数调用过程

poll系统调用图解