ceph rgw:lifecycle实现

lifecycle policy的存储机制

当我们为一个bucket配置lifecycle policy时,lifecycle相关的数据会存储在2个位置:

  1. 在bucket.instance对象的xattr中写入key为user.rgw.lc value为LifecycleConfig的属性(真正的lifecycle rules列表)。
  2. 在lc.0 - lc.31共32个对象(这个值可配置)中选择其中一个,向其omap写入lifecycle的状态信息。但其实在该omap对应的header中也有lc相关的信息,比如用于记录当前omap的lifecycle遍历进度的marker,但这些数据不是在设置lc时设置的。

下面我们验证下:

首先,我们通过boto向一个名为bucket1的bucket配置lifecycle policy:

bucket1 = conn.get_bucket('bucket1')
expir = Expiration(days=1)
lc = Lifecycle()
lc.add_rule(
    prefix = "test/",
    expiration = expir,
)
bucket1.configure_lifecycle(lc)

之后,去该bucket1对应的bucket.instance对象的xattr中查看。

$ rados -p default.rgw.meta ls --namespace root
bucket1
.bucket.meta.bucket1:38d08ed7-3883-49de-ab89-0dea7c8c960f.4162.1

$ rados -p default.rgw.meta --namespace root listxattr .bucket.meta.bucket1:38d08ed7-3883-49de-ab89-0dea7c8c960f.4162.1
ceph.objclass.version
user.rgw.acl
user.rgw.lc

user.rgw.lc对应的value就是该bucket的lifecycle rule列表。

然后,再去查看lc.xx对象

$ rados -p default.rgw.log --namespace=lc ls
lc.6
lc.14
lc.29
lc.8
lc.10
lc.26
lc.22
lc.17
lc.27
lc.4
lc.11
lc.18
lc.20
lc.7
lc.2
lc.13
lc.16
lc.12
lc.30
lc.24
lc.9
lc.15
lc.19
lc.21
lc.23
lc.31
lc.25
lc.5
lc.3
lc.28
lc.1
lc.0

RGWPutLC::execute()代码

lifecycle的组织方式也可以在put lc操作的代码中窥见一斑。

void RGWPutLC::execute()
{
  bufferlist bl;
  
  RGWLifecycleConfiguration_S3 *config = NULL;
  RGWLCXMLParser_S3 parser(s->cct);
  RGWLifecycleConfiguration_S3 new_config(s->cct);

  // 从http header中取出md5到content_md5
  content_md5 = s->info.env->get("HTTP_CONTENT_MD5");
  if (content_md5 == nullptr) {
    op_ret = -ERR_INVALID_REQUEST;
    s->err.message = "Missing required header for this request: Content-MD5";
    ldout(s->cct, 5) << s->err.message << dendl;
    return;
  }
  // 将取出的md5从base64解码到content_md5_bin
  std::string content_md5_bin;
  try {
    content_md5_bin = rgw::from_base64(boost::string_view(content_md5));
  } catch (...) {
    s->err.message = "Request header Content-MD5 contains character "
                     "that is not base64 encoded.";
    ldout(s->cct, 5) << s->err.message << dendl;
    op_ret = -ERR_BAD_DIGEST;
    return;
  }

  if (!parser.init()) {
    op_ret = -EINVAL;
    return;
  }
  // 从req_state中解析出put lc所需的参数存入RGWPutLC.data长度为RGWPutLC.len
  op_ret = get_params();
  if (op_ret < 0)
    return;

  ldout(s->cct, 15) << "read len=" << len << " data=" << (data ? data : "") << dendl;

  // 计算params的MD5
  MD5 data_hash;
  unsigned char data_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE];
  data_hash.Update(reinterpret_cast<const byte*>(data), len);
  data_hash.Final(data_hash_res);
  // 比较计算出的md5和客户端传入的md5是否一致,以判断数据是否损坏
  if (memcmp(data_hash_res, content_md5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) {
    op_ret = -ERR_BAD_DIGEST;
    s->err.message = "The Content-MD5 you specified did not match what we received.";
    ldout(s->cct, 5) << s->err.message
                     << " Specified content md5: " << content_md5
                     << ", calculated content md5: " << data_hash_res
                     << dendl;
    return;
  }

  // 将data中的参数数据解析到parser对象中
  if (!parser.parse(data, len, 1)) {
    op_ret = -ERR_MALFORMED_XML;
    return;
  }
  // 解析出的xml对象是一颗树结构
  /*
  class XMLObj
  {
    XMLObj *parent;
    ......
    multimap<string, XMLObj *> children;
    ......
  }
  */
  // 如上,每一个标签作为一个节点,分别包含指向其父节点的指针和孩子节点的指针列表
  config = static_cast<RGWLifecycleConfiguration_S3 *>(parser.find_first("LifecycleConfiguration"));
  if (!config) {
    op_ret = -ERR_MALFORMED_XML;
    return;
  }
  // 将config中的rule_map中的rule转存到new_config的rule_map和prefix_map中
  op_ret = config->rebuild(store, new_config);
  if (op_ret < 0)
    return;

  if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
    ldout(s->cct, 15) << "New LifecycleConfiguration:";
    new_config.to_xml(*_dout);
    *_dout << dendl;
  }
  // 将rule_map编码存入bl,并copy一个attrs map,增加RGW_ATTR_LC->bl项,
  new_config.encode(bl);
  map<string, bufferlist> attrs;
  attrs = s->bucket_attrs;
  attrs[RGW_ATTR_LC] = bl;
  // 将新的attrs写入bucket.instance对象的xattr中,
  op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, &s->bucket_info.objv_tracker);
  if (op_ret < 0)
    return;

  string shard_id = s->bucket.tenant + ':' + s->bucket.name + ':' + s->bucket.bucket_id;  
  string oid; 
  // 从default.rgw.log pool中的32个lc.xx对象中选择一个,构造oid,xx表示一个0-31的整数
  // get_lc_oid代码如下:
  /*
  static void get_lc_oid(struct req_state *s, string& oid){
    string shard_id = s->bucket.name + ':' +s->bucket.bucket_id;
    int max_objs = (s->cct->_conf->rgw_lc_max_objs > HASH_PRIME)?HASH_PRIME:s->cct->_conf->rgw_lc_max_objs;
    int index = ceph_str_hash_linux(shard_id.c_str(), shard_id.size()) % HASH_PRIME % max_objs;
    oid = lc_oid_prefix;
    char buf[32];
    snprintf(buf, 32, ".%d", index);
    oid.append(buf);
    return;
  }
  */
  get_lc_oid(s, oid);
  // 构造要写入omap的entry内容
  pair<string, int> entry(shard_id, lc_uninitial);
  int max_lock_secs = s->cct->_conf->rgw_lc_lock_max_time;
  rados::cls::lock::Lock l(lc_index_lock_name); 
  utime_t time(max_lock_secs, 0);
  l.set_duration(time);
  l.set_cookie(cookie);
  librados::IoCtx *ctx = store->get_lc_pool_ctx();
  do {
    op_ret = l.lock_exclusive(ctx, oid);
    if (op_ret == -EBUSY) {
      dout(0) << "RGWLC::RGWPutLC() failed to acquire lock on, sleep 5, try again" << oid << dendl;
      sleep(5);
      continue;
    }
    if (op_ret < 0) {
      dout(0) << "RGWLC::RGWPutLC() failed to acquire lock " << oid << op_ret << dendl;
      break;
    }

    // 在lc.xx对象关联的omap中写入entry
    op_ret = cls_rgw_lc_set_entry(*ctx, oid, entry);
    if (op_ret < 0) {
      dout(0) << "RGWLC::RGWPutLC() failed to set entry " << oid << op_ret << dendl;     
    }
    break;
  }while(1);
  l.unlock(ctx, oid);
  return;
}

lifecycle的作用机制

RGWLC类是负责执行lc的类,它会根据用户的配置开启1个或多个worker线程,这些worker线程的任务是在一个无限循环中,每隔一段时间(生产环境是一天一次,测试环境比较频繁)判断一下当前是否应该执行lifecycle的遍历工作,如果是的话,调用RGWLC类的process方法,随机选择32个lc.xx对象中的一个,根据其header中的标记,取出其未遍历的下一个omap entry,更新header中的标记,更新该entry的状态为processing,然后处理该entry,遍历该条entry对应的bucket中的所有对象,根据lc规则删除或转换bucket中过期的object,并写日志。

代码追踪如下:

下面这个函数是worker线程的的执行内容,可以看到,它在一个while循环中,每隔一段时间判断should_work,如果通过的话,那么就调用lc->process()函数进行遍历,然后设置下一次被唤醒的时间,进入阻塞状态。

void *RGWLC::LCWorker::entry() {
  do {
    utime_t start = ceph_clock_now();
    if (should_work(start)) {
      dout(2) << "life cycle: start" << dendl;
      int r = lc->process();
      if (r < 0) {
        dout(0) << "ERROR: do life cycle process() returned error r=" << r << dendl;
      }
      dout(2) << "life cycle: stop" << dendl;
    }
    if (lc->going_down())
      break;

    utime_t end = ceph_clock_now();
    int secs = schedule_next_start_time(start, end);
    utime_t next;
    next.set_from_double(end + secs);

    dout(5) << "schedule life cycle next start time: " << rgw_to_asctime(next) <<dendl;

    lock.Lock();
    cond.WaitInterval(lock, utime_t(secs, 0));
    lock.Unlock();
  } while (!lc->going_down());

  return NULL;
}

在RGWLC::process函数中,主要做了以下几件事:
1.从lc.xx对象的header中获得omap中要遍历的下一个entry
2.将拿到的entry的状态设为processing(正在处理)
3.更新header中记录的下一个entry
4.调用bucket_lc_process函数处理当前的entry对应的lc规则

int RGWLC::process(int index, int max_lock_secs)
{
  rados::cls::lock::Lock l(lc_index_lock_name);
  do {
    utime_t now = ceph_clock_now();
    pair<string, int > entry;//string = bucket_name:bucket_id ,int = LC_BUCKET_STATUS
    if (max_lock_secs <= 0)
      return -EAGAIN;

    utime_t time(max_lock_secs, 0);
    l.set_duration(time);

    int ret = l.lock_exclusive(&store->lc_pool_ctx, obj_names[index]);
    if (ret == -EBUSY) { /* already locked by another lc processor */
      dout(0) << "RGWLC::process() failed to acquire lock on, sleep 5, try again" << obj_names[index] << dendl;
      sleep(5);
      continue;
    }
    if (ret < 0)
      return 0;
    // 读取lc.xx对象的head
    string marker;
    cls_rgw_lc_obj_head head;
    ret = cls_rgw_lc_get_head(store->lc_pool_ctx, obj_names[index], head);
    if (ret < 0) {
      dout(0) << "RGWLC::process() failed to get obj head " << obj_names[index] << ret << dendl;
      goto exit;
    }

    if(!if_already_run_today(head.start_date)) {
      head.start_date = now;
      head.marker.clear();
      ret = bucket_lc_prepare(index);
      if (ret < 0) {
      dout(0) << "RGWLC::process() failed to update lc object " << obj_names[index] << ret << dendl;
      goto exit;
      }
    }
    // 从lc.xx对象的header中获取下一个要遍历的omap entry
    ret = cls_rgw_lc_get_next_entry(store->lc_pool_ctx, obj_names[index], head.marker, entry);
    if (ret < 0) {
      dout(0) << "RGWLC::process() failed to get obj entry " << obj_names[index] << dendl;
      goto exit;
    }

    if (entry.first.empty())
      goto exit;
    // 将该entry的状态设为processing
    entry.second = lc_processing;
    ret = cls_rgw_lc_set_entry(store->lc_pool_ctx, obj_names[index],  entry);
    if (ret < 0) {
      dout(0) << "RGWLC::process() failed to set obj entry " << obj_names[index] << entry.first << entry.second << dendl;
      goto exit;
    }
    // 更新header中的下一个entry标记
    head.marker = entry.first;
    ret = cls_rgw_lc_put_head(store->lc_pool_ctx, obj_names[index],  head);
    if (ret < 0) {
      dout(0) << "RGWLC::process() failed to put head " << obj_names[index] << dendl;
      goto exit;
    }
    l.unlock(&store->lc_pool_ctx, obj_names[index]);
    // 处理当前的entry对应的lc规则
    ret = bucket_lc_process(entry.first);
    bucket_lc_post(index, max_lock_secs, entry, ret);
  }while(1);

exit:
    l.unlock(&store->lc_pool_ctx, obj_names[index]);
    return 0;
}

bucket_lc_process函数做的则就是最终的处理工作了:遍历某条lc规则对应的bucket的所有objects,根据prefix和tagging找到lc 规则作用的object,然后判断这些objects是否过期,如果过期,做对应的删除处理。

要注意的是,目前L版本的ceph仅支持到期删除的lifecycle,也就是Expiration。不支持Transition。

推荐阅读更多精彩内容

  • 转至元数据结尾创建: 董潇伟,最新修改于: 十二月 23, 2016 转至元数据起始第一章:isa和Class一....
    萌萌的小伟哥阅读 613评论 0 1
  • Spring Cloud为开发人员提供了快速构建分布式系统中一些常见模式的工具(例如配置管理,服务发现,断路器,智...
    卡卡罗2017阅读 86,947评论 14 122
  • 百战程序员_ Java1573题 QQ群:561832648489034603 掌握80%年薪20万掌握50%年薪...
    Albert陈凯阅读 12,239评论 2 32
  • 第一章 Nginx简介 Nginx是什么 没有听过Nginx?那么一定听过它的“同行”Apache吧!Ngi...
    JokerW阅读 28,148评论 24 982
  • 直到你告诉我你要去实习了,我才发现原来一转眼我们都认识那么多年了。也不知道是怎么了眼泪就是不听话的一直往下掉,止也...
    什么难阅读 57评论 0 1