客户端读写操作分析

本节涉及 Librados 以及 Osdc 等操作,相关类(如 RadosClient、Objecter、IoCtxImpl 等)的介绍见上一节:
Ceph学习——Librados与Osdc实现源码解析


1)调用rados_create()创建一个RadosClient对象。
2) 调用rados_config_read()读取配置文件。
3)调用rados_connect()函数,最终它会调用RadosClient::connect()来完成初始化。
4)调用rados_ioctx_create(),它最终调用RadosClient::create_ioctx()创建pool相关的IoCtxImpl类。
5)调用 rados_write 函数向该pool中写入对象,该函数调用了IoCtxImpl::write()。


写操作消息封装

int librados::IoCtxImpl::write(const object_t& oid, bufferlist& bl,size_t len, uint64_t off)
{if (len > UINT_MAX/2)return -E2BIG;::ObjectOperation op;//创建ObjectOperation对象prepare_assert_ops(&op);//封装相关写操作bufferlist mybl;mybl.substr_of(bl, 0, len);op.write(off, mybl);return operate(oid, &op, NULL);//调用operate处理
}

write中调用operate

int librados::IoCtxImpl::operate(const object_t& oid, ::ObjectOperation *o,ceph::real_time *pmtime, int flags)
{ceph::real_time ut = (pmtime ? *pmtime :ceph::real_clock::now());/* can't write to a snapshot */if (snap_seq != CEPH_NOSNAP)return -EROFS;if (!o->size())return 0;Mutex mylock("IoCtxImpl::operate::mylock");Cond cond;bool done;int r;version_t ver;Context *oncommit = new C_SafeCond(&mylock, &cond, &done, &r);int op = o->ops[0].op.op;ldout(client->cct, 10) << ceph_osd_op_name(op) << " oid=" << oid<< " nspace=" << oloc.nspace << dendl;//调用objecter->prepare_mutate_op把ObjectOperation封装为Op类型Objecter::Op *objecter_op = objecter->prepare_mutate_op(oid, oloc,*o, snapc, ut, flags,oncommit, &ver);objecter->op_submit(objecter_op);//发送消息mylock.Lock();while (!done)cond.Wait(mylock);mylock.Unlock();ldout(client->cct, 10) << "Objecter returned from "<< ceph_osd_op_name(op) << " r=" << r << dendl;set_sync_op_version(ver);return r;
}

发送消息 op_submit

该函数将封装好的Op操作通过网络发送出去。在op_submit中调用了_op_submit_with_budget用来处理Throttle相关的流量信息以及超时处理,最后该函数调用 _op_submit用来完成关键地址寻址和发送工作。
相关函数的实现如下:

oid Objecter::op_submit(Op *op, ceph_tid_t *ptid, int *ctx_budget)
{shunique_lock rl(rwlock, ceph::acquire_shared);ceph_tid_t tid = 0;if (!ptid)ptid = &tid;op->trace.event("op submit");_op_submit_with_budget(op, rl, ptid, ctx_budget);//调用_op_submit_with_budget
}void Op *op, shunique_lock& sul,ceph_tid_t *ptid,int *ctx_budget)
{assert(initialized);assert(op->ops.size() == op->out_bl.size());assert(op->ops.size() == op->out_rval.size());assert(op->ops.size() == op->out_handler.size());// throttle.  before we look at any state, because// _take_op_budget() may drop our lock while it blocks.if (!op->ctx_budgeted || (ctx_budget && (*ctx_budget == -1))) {int op_budget = _take_op_budget(op, sul);// take and pass out the budget for the first OP// in the context sessionif (ctx_budget && (*ctx_budget == -1)) {*ctx_budget = op_budget;}}if (osd_timeout > timespan(0)) {if (op->tid == 0)op->tid = ++last_tid;auto tid = op->tid;op->ontimeout = timer.add_event(osd_timeout,[this, tid]() {op_cancel(tid, -ETIMEDOUT); });}_op_submit(op, sul, ptid);
}void Objecter::_op_submit(Op *op, shunique_lock& sul, ceph_tid_t *ptid)
{// rwlock is lockedldout(cct, 10) << __func__ << " op " << op << dendl;// pick targetassert(op->session == NULL);OSDSession *s = NULL;//调用_calc_target 来计算目标OSDbool check_for_latest_map = _calc_target(&op->target, nullptr)== RECALC_OP_TARGET_POOL_DNE;// Try to get a session, including a retry if we need to take write lock//调用函数 _get_session 获取目标OSD的连接,如果返回-EAGAIN,就升级为写锁,重新获取int r = _get_session(op->target.osd, &s, sul);if (r == -EAGAIN ||(check_for_latest_map && sul.owns_lock_shared())) {epoch_t orig_epoch = osdmap->get_epoch();sul.unlock();if (cct->_conf->objecter_debug_inject_relock_delay) {sleep(1);}sul.lock();if (orig_epoch != osdmap->get_epoch()) {// map changed; recalculate mappingldout(cct, 10) << __func__ << " relock raced with osdmap, recalc target"<< dendl;//调用_calc_target 来计算目标OSDcheck_for_latest_map = _calc_target(&op->target, nullptr)== RECALC_OP_TARGET_POOL_DNE;if (s) {put_session(s);s = NULL;r = -EAGAIN;}}}if (r == -EAGAIN) {assert(s == NULL);r = _get_session(op->target.osd, &s, sul);}assert(r == 0);assert(s);  // may be homeless_send_op_account(op);// send?assert(op->target.flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE));if (osdmap_full_try) {op->target.flags |= CEPH_OSD_FLAG_FULL_TRY;}bool need_send = false;//判断当前状态,如果可以发送请求就调用 函数_prepare_osd_op 准备请求消息,调用函数_send_op发送消息if (osdmap->get_epoch() < epoch_barrier) {ldout(cct, 10) << " barrier, paused " << op << " tid " << op->tid<< dendl;op->target.paused = true;_maybe_request_map();} else if ((op->target.flags & CEPH_OSD_FLAG_WRITE) &&osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) {ldout(cct, 10) << " paused modify " << op << " tid " << op->tid<< dendl;op->target.paused = true;_maybe_request_map();} else if ((op->target.flags & CEPH_OSD_FLAG_READ) &&osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) {ldout(cct, 10) << " paused read " << op << " tid " << op->tid<< dendl;op->target.paused = true;_maybe_request_map();} else if (op->respects_full() &&(_osdmap_full_flag() ||_osdmap_pool_full(op->target.base_oloc.pool))) 
{ldout(cct, 0) << " FULL, paused modify " << op << " tid "<< op->tid << dendl;op->target.paused = true;_maybe_request_map();} else if (!s->is_homeless()) {need_send = true;} else {_maybe_request_map();}//如果可以发送请求就调用 函数_prepare_osd_op 准备请求消息MOSDOp *m = NULL;if (need_send) {m = _prepare_osd_op(op);}OSDSession::unique_lock sl(s->lock);if (op->tid == 0)op->tid = ++last_tid;ldout(cct, 10) << "_op_submit oid " << op->target.base_oid<< " '" << op->target.base_oloc << "' '"<< op->target.target_oloc << "' " << op->ops << " tid "<< op->tid << " osd." << (!s->is_homeless() ? s->osd : -1)<< dendl;_session_op_assign(s, op);//如果可以发送请求就调用,调用函数_send_op发送消息if (need_send) {_send_op(op, m);}// Last chance to touch Op here, after giving up session lock it can// be freed at any time by response handler.ceph_tid_t tid = op->tid;if (check_for_latest_map) {_send_op_map_check(op);}if (ptid)*ptid = tid;op = NULL;sl.unlock();put_session(s);ldout(cct, 5) << num_in_flight << " in flight" << dendl;
}

对象寻址 _calc_target

/**
 * Object addressing (excerpt): map the op target's object to a PG and
 * then to the OSDs that serve that PG.
 *
 * This is an abridged listing. The parts marked "omitted in this excerpt"
 * are not shown; `is_read` and `is_write` are not declared in the visible
 * code and presumably come from the omitted part.
 *
 * @param t           target description; target_oid, target_oloc, osd,
 *                    last_force_resend and pool_ever_existed are updated
 * @param con         not used in the visible part of the excerpt
 * @param any_change  not used in the visible part of the excerpt
 * @return RECALC_OP_TARGET_POOL_DNE when the base or tier pool is missing
 *         from the osdmap or the object cannot be mapped to a PG;
 *         RECALC_OP_TARGET_NO_ACTION at the end of the visible path
 */
int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
{
  // ... (code omitted in this excerpt) ...

  const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool);
  if (!pi) {
    t->osd = -1;
    return RECALC_OP_TARGET_POOL_DNE;
  }
  ldout(cct,30) << __func__ << "  base pi " << pi
                << " pg_num " << pi->get_pg_num() << dendl;

  bool force_resend = false;
  if (osdmap->get_epoch() == pi->last_force_op_resend) {
    if (t->last_force_resend < pi->last_force_op_resend) {
      t->last_force_resend = pi->last_force_op_resend;
      force_resend = true;
    } else if (t->last_force_resend == 0) {
      force_resend = true;
    }
  }

  // apply tiering
  t->target_oid = t->base_oid;
  t->target_oloc = t->base_oloc;
  if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
    if (is_read && pi->has_read_tier())
      t->target_oloc.pool = pi->read_tier;
    if (is_write && pi->has_write_tier())
      t->target_oloc.pool = pi->write_tier;
    pi = osdmap->get_pg_pool(t->target_oloc.pool);
    if (!pi) {
      t->osd = -1;
      return RECALC_OP_TARGET_POOL_DNE;
    }
  }

  pg_t pgid;
  if (t->precalc_pgid) {
    assert(t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY);
    assert(t->base_oid.name.empty()); // make sure this is a pg op
    assert(t->base_oloc.pool == (int64_t)t->base_pgid.pool());
    pgid = t->base_pgid;
  } else {
    // KEY STEP: find the PG that holds the target object
    int ret = osdmap->object_locator_to_pg(t->target_oid, t->target_oloc,
                                           pgid);
    if (ret == -ENOENT) {
      t->osd = -1;
      return RECALC_OP_TARGET_POOL_DNE;
    }
  }
  ldout(cct,20) << __func__ << " target " << t->target_oid << " "
                << t->target_oloc << " -> pgid " << pgid << dendl;
  ldout(cct,30) << __func__ << "  target pi " << pi
                << " pg_num " << pi->get_pg_num() << dendl;
  t->pool_ever_existed = true;

  int size = pi->size;
  int min_size = pi->min_size;
  unsigned pg_num = pi->get_pg_num();
  int up_primary, acting_primary;
  vector<int> up, acting;
  // KEY STEP: get the up/acting OSD lists for this PG (the article
  // describes this as the CRUSH computation)
  osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary,
                               &acting, &acting_primary);
  bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
  bool recovery_deletes = osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES);
  unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num,
                                       t->pg_num_mask);
  pg_t prev_pgid(prev_seed, pgid.pool());

  // ... (code omitted in this excerpt) ...

  return RECALC_OP_TARGET_NO_ACTION;
}

Ceph学习——客户端读写操作分析相关推荐

  1. Ceph 学习——OSD读写流程与源码分析(一)

    消息从客户端发送而来,之前几节介绍了 客户端下 对象存储.块存储库的实现以及他们在客户端下API请求的发送过程(Ceph学习--Librados与Osdc实现源码解析 . Ceph学习--客户端读写操 ...

  2. Ceph学习笔记2-在Kolla-Ansible中使用Ceph后端存储

    环境说明 使用 Kolla-Ansible 请参考<使用 Kolla-Ansible 在 CentOS 7 单节点上部署 OpenStack Pike >: 部署 Ceph 服务请参考&l ...

  3. ceph学习--ceph常用运维技巧总结

    ceph常用运维技巧总结 格式 json 数据增强可读性 --format json-pretty -f json-pretty ceph quorum_status -f json-prettyce ...

  4. ceph学习笔记和基础知识

    ceph源码下载: http://ceph.com/download/ 主要文档下载连接: http://download.csdn.net/detail/skdkjxy/8149989 /***** ...

  5. Ceph学习——Librbd块存储库与RBD读写流程源码分析

    Librbd是Ceph提供块存储的库,它实现了RBD接口,基于LIbrados实现了对RBD的基本操作.Librbd对于元数据的相关操作是通过cls_rbd实现的.cls_rbd是Cls的一个扩展模块 ...

  6. StackExchange.Redis客户端读写主从配置,以及哨兵配置

    今天简单分享一下StackExchange.Redis客户端中配置主从分离以及哨兵的配置. 关于哨兵如果有不了解的朋友,可以看我之前的一篇分享,当然主从复制文章也可以找到.http://www.cnb ...

  7. Ceph学习——Librados与Osdc实现源码解析

    Librados RadosClient类 IoctxImpl AioCompletionImpl OSDC ObjectOperation 封装操作 op_target 封装PG信息 Op 封装操作 ...

  8. Ceph学习(1)---Ceph入门

    Ceph操作 1.准备机器 CentOS7 四台虚拟机(4C/4G/50G/50G)这个配置是我联系用的,生产请根据实际情况使用 2.虚拟机分配如下 主机名 角色 ip NAT ceph01 mon+ ...

  9. ceph学习笔记之七 数据平衡

    数据平衡 当在集群中新增一个OSD设备时,整个集群将会发生数据迁移使数据重新分布达到均衡.在Ceph集群中数据迁移的的基本单位是PG.其实在迁移过程中是将PG中的所有对象作为一个整体来进行迁移. 数据 ...

最新文章

  1. 机器学习+优化问题的种类、如何优化、凸优化、非凸优化、对偶问题、KKT条件
  2. python手机版安卓-当python遇到Android手机 那么,万物皆可盘
  3. 【设计模式】状态模式 ( 简介 | 适用场景 | 优缺点 | 代码示例 )
  4. win10电脑亮度无法调节
  5. 启明云端分享| ESP32-S3点480*480分辨率的RGB 2.1寸旋钮屏刷新效果到底会怎么样呢
  6. Serverless 工程实践 | 自建 Apache OpenWhisk 平台
  7. 学生开源项目_吸引学生加入您的开源项目的9种方法
  8. jQuery基础 - 改变CSS样式
  9. 李书福退出吉利汽车集团公司董事
  10. Android安卓|安卓概述、安卓开发、安卓入门、安卓架构
  11. 1075 - Incorrect table definition;there can be only one auto column and it must be defined as a key
  12. 初学者的持续集成和交付(DevOps)
  13. 最大学术出版商妥协!与挪威46所机构签协议,90%出版物免费阅读
  14. 解决 Command “python setup.py egg_info“ failed with error code 1 问题
  15. Atitit 从api的使用区分工程师级别 高级 中级 初级工程师常使用的api与框架类库 目录 1. 初级工程师使用的api和框架类库ssm 1 2. 中级工程师常使用的api和框架类库 1 3.
  16. 软件架构师的12项修炼4
  17. db_create_file_dest
  18. 鬼谷八荒逆天改命存档
  19. Mac用homebrew安装unrar
  20. CSV保存身份证后再打开后4位0000的解决办法

热门文章

  1. 农业大田作物智慧种植科研
  2. 放大电路中的反馈-反馈的基本概念及判断方法
  3. (不重点考)试算平衡的分类
  4. [PCB]PCB光板生产过程中板子弯曲或翘起原因分析
  5. 思成五笔的通俗易懂讲解
  6. Access把每一天的数据累加_SQL 数据库语句大全
  7. Python学习DAY5|数据分析简介与实战
  8. 【轮子1】造一个炫酷的DEBUG输出
  9. 不同性质的公司在英文中不同说法
  10. 【华人学者风采】刘庄 苏州大学