|  | // SPDX-License-Identifier: GPL-2.0 | 
|  |  | 
|  | #include <linux/ceph/ceph_debug.h> | 
|  |  | 
|  | #include <linux/module.h> | 
|  | #include <linux/slab.h> | 
|  |  | 
|  | #include <linux/ceph/libceph.h> | 
|  | #include <linux/ceph/osdmap.h> | 
|  | #include <linux/ceph/decode.h> | 
|  | #include <linux/crush/hash.h> | 
|  | #include <linux/crush/mapper.h> | 
|  |  | 
|  | char *ceph_osdmap_state_str(char *str, int len, u32 state) | 
|  | { | 
|  | if (!len) | 
|  | return str; | 
|  |  | 
|  | if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) | 
|  | snprintf(str, len, "exists, up"); | 
|  | else if (state & CEPH_OSD_EXISTS) | 
|  | snprintf(str, len, "exists"); | 
|  | else if (state & CEPH_OSD_UP) | 
|  | snprintf(str, len, "up"); | 
|  | else | 
|  | snprintf(str, len, "doesn't exist"); | 
|  |  | 
|  | return str; | 
|  | } | 
|  |  | 
|  | /* maps */ | 
|  |  | 
/*
 * Number of bits needed to represent @t: the 1-based position of its
 * highest set bit, or 0 when @t is 0.
 */
static int calc_bits_of(unsigned int t)
{
	int bits;

	for (bits = 0; t != 0; t >>= 1)
		bits++;

	return bits;
}
|  |  | 
|  | /* | 
|  | * the foo_mask is the smallest value 2^n-1 that is >= foo. | 
|  | */ | 
|  | static void calc_pg_masks(struct ceph_pg_pool_info *pi) | 
|  | { | 
|  | pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; | 
|  | pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * decode crush map | 
|  | */ | 
|  | static int crush_decode_uniform_bucket(void **p, void *end, | 
|  | struct crush_bucket_uniform *b) | 
|  | { | 
|  | dout("crush_decode_uniform_bucket %p to %p\n", *p, end); | 
|  | ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); | 
|  | b->item_weight = ceph_decode_32(p); | 
|  | return 0; | 
|  | bad: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static int crush_decode_list_bucket(void **p, void *end, | 
|  | struct crush_bucket_list *b) | 
|  | { | 
|  | int j; | 
|  | dout("crush_decode_list_bucket %p to %p\n", *p, end); | 
|  | b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); | 
|  | if (b->item_weights == NULL) | 
|  | return -ENOMEM; | 
|  | b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); | 
|  | if (b->sum_weights == NULL) | 
|  | return -ENOMEM; | 
|  | ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); | 
|  | for (j = 0; j < b->h.size; j++) { | 
|  | b->item_weights[j] = ceph_decode_32(p); | 
|  | b->sum_weights[j] = ceph_decode_32(p); | 
|  | } | 
|  | return 0; | 
|  | bad: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static int crush_decode_tree_bucket(void **p, void *end, | 
|  | struct crush_bucket_tree *b) | 
|  | { | 
|  | int j; | 
|  | dout("crush_decode_tree_bucket %p to %p\n", *p, end); | 
|  | ceph_decode_8_safe(p, end, b->num_nodes, bad); | 
|  | b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); | 
|  | if (b->node_weights == NULL) | 
|  | return -ENOMEM; | 
|  | ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); | 
|  | for (j = 0; j < b->num_nodes; j++) | 
|  | b->node_weights[j] = ceph_decode_32(p); | 
|  | return 0; | 
|  | bad: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static int crush_decode_straw_bucket(void **p, void *end, | 
|  | struct crush_bucket_straw *b) | 
|  | { | 
|  | int j; | 
|  | dout("crush_decode_straw_bucket %p to %p\n", *p, end); | 
|  | b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); | 
|  | if (b->item_weights == NULL) | 
|  | return -ENOMEM; | 
|  | b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); | 
|  | if (b->straws == NULL) | 
|  | return -ENOMEM; | 
|  | ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); | 
|  | for (j = 0; j < b->h.size; j++) { | 
|  | b->item_weights[j] = ceph_decode_32(p); | 
|  | b->straws[j] = ceph_decode_32(p); | 
|  | } | 
|  | return 0; | 
|  | bad: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static int crush_decode_straw2_bucket(void **p, void *end, | 
|  | struct crush_bucket_straw2 *b) | 
|  | { | 
|  | int j; | 
|  | dout("crush_decode_straw2_bucket %p to %p\n", *p, end); | 
|  | b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); | 
|  | if (b->item_weights == NULL) | 
|  | return -ENOMEM; | 
|  | ceph_decode_need(p, end, b->h.size * sizeof(u32), bad); | 
|  | for (j = 0; j < b->h.size; j++) | 
|  | b->item_weights[j] = ceph_decode_32(p); | 
|  | return 0; | 
|  | bad: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static struct crush_choose_arg_map *alloc_choose_arg_map(void) | 
|  | { | 
|  | struct crush_choose_arg_map *arg_map; | 
|  |  | 
|  | arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO); | 
|  | if (!arg_map) | 
|  | return NULL; | 
|  |  | 
|  | RB_CLEAR_NODE(&arg_map->node); | 
|  | return arg_map; | 
|  | } | 
|  |  | 
|  | static void free_choose_arg_map(struct crush_choose_arg_map *arg_map) | 
|  | { | 
|  | if (arg_map) { | 
|  | int i, j; | 
|  |  | 
|  | WARN_ON(!RB_EMPTY_NODE(&arg_map->node)); | 
|  |  | 
|  | for (i = 0; i < arg_map->size; i++) { | 
|  | struct crush_choose_arg *arg = &arg_map->args[i]; | 
|  |  | 
|  | for (j = 0; j < arg->weight_set_size; j++) | 
|  | kfree(arg->weight_set[j].weights); | 
|  | kfree(arg->weight_set); | 
|  | kfree(arg->ids); | 
|  | } | 
|  | kfree(arg_map->args); | 
|  | kfree(arg_map); | 
|  | } | 
|  | } | 
|  |  | 
/*
 * Instantiate the rbtree helpers for struct crush_choose_arg_map.  The
 * macro is given the ->choose_args_index and ->node members (presumably
 * the lookup key and the tree linkage - confirm against the macro
 * definition).  This file uses the resulting insert_choose_arg_map()
 * and erase_choose_arg_map().
 */
DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index,
		node);
|  |  | 
|  | void clear_choose_args(struct crush_map *c) | 
|  | { | 
|  | while (!RB_EMPTY_ROOT(&c->choose_args)) { | 
|  | struct crush_choose_arg_map *arg_map = | 
|  | rb_entry(rb_first(&c->choose_args), | 
|  | struct crush_choose_arg_map, node); | 
|  |  | 
|  | erase_choose_arg_map(&c->choose_args, arg_map); | 
|  | free_choose_arg_map(arg_map); | 
|  | } | 
|  | } | 
|  |  | 
|  | static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen) | 
|  | { | 
|  | u32 *a = NULL; | 
|  | u32 len; | 
|  | int ret; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | if (len) { | 
|  | u32 i; | 
|  |  | 
|  | a = kmalloc_array(len, sizeof(u32), GFP_NOIO); | 
|  | if (!a) { | 
|  | ret = -ENOMEM; | 
|  | goto fail; | 
|  | } | 
|  |  | 
|  | ceph_decode_need(p, end, len * sizeof(u32), e_inval); | 
|  | for (i = 0; i < len; i++) | 
|  | a[i] = ceph_decode_32(p); | 
|  | } | 
|  |  | 
|  | *plen = len; | 
|  | return a; | 
|  |  | 
|  | e_inval: | 
|  | ret = -EINVAL; | 
|  | fail: | 
|  | kfree(a); | 
|  | return ERR_PTR(ret); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Assumes @arg is zero-initialized. | 
|  | */ | 
|  | static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg) | 
|  | { | 
|  | int ret; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval); | 
|  | if (arg->weight_set_size) { | 
|  | u32 i; | 
|  |  | 
|  | arg->weight_set = kmalloc_array(arg->weight_set_size, | 
|  | sizeof(*arg->weight_set), | 
|  | GFP_NOIO); | 
|  | if (!arg->weight_set) | 
|  | return -ENOMEM; | 
|  |  | 
|  | for (i = 0; i < arg->weight_set_size; i++) { | 
|  | struct crush_weight_set *w = &arg->weight_set[i]; | 
|  |  | 
|  | w->weights = decode_array_32_alloc(p, end, &w->size); | 
|  | if (IS_ERR(w->weights)) { | 
|  | ret = PTR_ERR(w->weights); | 
|  | w->weights = NULL; | 
|  | return ret; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | arg->ids = decode_array_32_alloc(p, end, &arg->ids_size); | 
|  | if (IS_ERR(arg->ids)) { | 
|  | ret = PTR_ERR(arg->ids); | 
|  | arg->ids = NULL; | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | return 0; | 
|  |  | 
|  | e_inval: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static int decode_choose_args(void **p, void *end, struct crush_map *c) | 
|  | { | 
|  | struct crush_choose_arg_map *arg_map = NULL; | 
|  | u32 num_choose_arg_maps, num_buckets; | 
|  | int ret; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval); | 
|  | while (num_choose_arg_maps--) { | 
|  | arg_map = alloc_choose_arg_map(); | 
|  | if (!arg_map) { | 
|  | ret = -ENOMEM; | 
|  | goto fail; | 
|  | } | 
|  |  | 
|  | ceph_decode_64_safe(p, end, arg_map->choose_args_index, | 
|  | e_inval); | 
|  | arg_map->size = c->max_buckets; | 
|  | arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args), | 
|  | GFP_NOIO); | 
|  | if (!arg_map->args) { | 
|  | ret = -ENOMEM; | 
|  | goto fail; | 
|  | } | 
|  |  | 
|  | ceph_decode_32_safe(p, end, num_buckets, e_inval); | 
|  | while (num_buckets--) { | 
|  | struct crush_choose_arg *arg; | 
|  | u32 bucket_index; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, bucket_index, e_inval); | 
|  | if (bucket_index >= arg_map->size) | 
|  | goto e_inval; | 
|  |  | 
|  | arg = &arg_map->args[bucket_index]; | 
|  | ret = decode_choose_arg(p, end, arg); | 
|  | if (ret) | 
|  | goto fail; | 
|  |  | 
|  | if (arg->ids_size && | 
|  | arg->ids_size != c->buckets[bucket_index]->size) | 
|  | goto e_inval; | 
|  | } | 
|  |  | 
|  | insert_choose_arg_map(&c->choose_args, arg_map); | 
|  | } | 
|  |  | 
|  | return 0; | 
|  |  | 
|  | e_inval: | 
|  | ret = -EINVAL; | 
|  | fail: | 
|  | free_choose_arg_map(arg_map); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | static void crush_finalize(struct crush_map *c) | 
|  | { | 
|  | __s32 b; | 
|  |  | 
|  | /* Space for the array of pointers to per-bucket workspace */ | 
|  | c->working_size = sizeof(struct crush_work) + | 
|  | c->max_buckets * sizeof(struct crush_work_bucket *); | 
|  |  | 
|  | for (b = 0; b < c->max_buckets; b++) { | 
|  | if (!c->buckets[b]) | 
|  | continue; | 
|  |  | 
|  | switch (c->buckets[b]->alg) { | 
|  | default: | 
|  | /* | 
|  | * The base case, permutation variables and | 
|  | * the pointer to the permutation array. | 
|  | */ | 
|  | c->working_size += sizeof(struct crush_work_bucket); | 
|  | break; | 
|  | } | 
|  | /* Every bucket has a permutation array. */ | 
|  | c->working_size += c->buckets[b]->size * sizeof(__u32); | 
|  | } | 
|  | } | 
|  |  | 
|  | static struct crush_map *crush_decode(void *pbyval, void *end) | 
|  | { | 
|  | struct crush_map *c; | 
|  | int err; | 
|  | int i, j; | 
|  | void **p = &pbyval; | 
|  | void *start = pbyval; | 
|  | u32 magic; | 
|  |  | 
|  | dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); | 
|  |  | 
|  | c = kzalloc(sizeof(*c), GFP_NOFS); | 
|  | if (c == NULL) | 
|  | return ERR_PTR(-ENOMEM); | 
|  |  | 
|  | c->choose_args = RB_ROOT; | 
|  |  | 
|  | /* set tunables to default values */ | 
|  | c->choose_local_tries = 2; | 
|  | c->choose_local_fallback_tries = 5; | 
|  | c->choose_total_tries = 19; | 
|  | c->chooseleaf_descend_once = 0; | 
|  |  | 
|  | ceph_decode_need(p, end, 4*sizeof(u32), bad); | 
|  | magic = ceph_decode_32(p); | 
|  | if (magic != CRUSH_MAGIC) { | 
|  | pr_err("crush_decode magic %x != current %x\n", | 
|  | (unsigned int)magic, (unsigned int)CRUSH_MAGIC); | 
|  | goto bad; | 
|  | } | 
|  | c->max_buckets = ceph_decode_32(p); | 
|  | c->max_rules = ceph_decode_32(p); | 
|  | c->max_devices = ceph_decode_32(p); | 
|  |  | 
|  | c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); | 
|  | if (c->buckets == NULL) | 
|  | goto badmem; | 
|  | c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); | 
|  | if (c->rules == NULL) | 
|  | goto badmem; | 
|  |  | 
|  | /* buckets */ | 
|  | for (i = 0; i < c->max_buckets; i++) { | 
|  | int size = 0; | 
|  | u32 alg; | 
|  | struct crush_bucket *b; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, alg, bad); | 
|  | if (alg == 0) { | 
|  | c->buckets[i] = NULL; | 
|  | continue; | 
|  | } | 
|  | dout("crush_decode bucket %d off %x %p to %p\n", | 
|  | i, (int)(*p-start), *p, end); | 
|  |  | 
|  | switch (alg) { | 
|  | case CRUSH_BUCKET_UNIFORM: | 
|  | size = sizeof(struct crush_bucket_uniform); | 
|  | break; | 
|  | case CRUSH_BUCKET_LIST: | 
|  | size = sizeof(struct crush_bucket_list); | 
|  | break; | 
|  | case CRUSH_BUCKET_TREE: | 
|  | size = sizeof(struct crush_bucket_tree); | 
|  | break; | 
|  | case CRUSH_BUCKET_STRAW: | 
|  | size = sizeof(struct crush_bucket_straw); | 
|  | break; | 
|  | case CRUSH_BUCKET_STRAW2: | 
|  | size = sizeof(struct crush_bucket_straw2); | 
|  | break; | 
|  | default: | 
|  | goto bad; | 
|  | } | 
|  | BUG_ON(size == 0); | 
|  | b = c->buckets[i] = kzalloc(size, GFP_NOFS); | 
|  | if (b == NULL) | 
|  | goto badmem; | 
|  |  | 
|  | ceph_decode_need(p, end, 4*sizeof(u32), bad); | 
|  | b->id = ceph_decode_32(p); | 
|  | b->type = ceph_decode_16(p); | 
|  | b->alg = ceph_decode_8(p); | 
|  | b->hash = ceph_decode_8(p); | 
|  | b->weight = ceph_decode_32(p); | 
|  | b->size = ceph_decode_32(p); | 
|  |  | 
|  | dout("crush_decode bucket size %d off %x %p to %p\n", | 
|  | b->size, (int)(*p-start), *p, end); | 
|  |  | 
|  | b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); | 
|  | if (b->items == NULL) | 
|  | goto badmem; | 
|  |  | 
|  | ceph_decode_need(p, end, b->size*sizeof(u32), bad); | 
|  | for (j = 0; j < b->size; j++) | 
|  | b->items[j] = ceph_decode_32(p); | 
|  |  | 
|  | switch (b->alg) { | 
|  | case CRUSH_BUCKET_UNIFORM: | 
|  | err = crush_decode_uniform_bucket(p, end, | 
|  | (struct crush_bucket_uniform *)b); | 
|  | if (err < 0) | 
|  | goto fail; | 
|  | break; | 
|  | case CRUSH_BUCKET_LIST: | 
|  | err = crush_decode_list_bucket(p, end, | 
|  | (struct crush_bucket_list *)b); | 
|  | if (err < 0) | 
|  | goto fail; | 
|  | break; | 
|  | case CRUSH_BUCKET_TREE: | 
|  | err = crush_decode_tree_bucket(p, end, | 
|  | (struct crush_bucket_tree *)b); | 
|  | if (err < 0) | 
|  | goto fail; | 
|  | break; | 
|  | case CRUSH_BUCKET_STRAW: | 
|  | err = crush_decode_straw_bucket(p, end, | 
|  | (struct crush_bucket_straw *)b); | 
|  | if (err < 0) | 
|  | goto fail; | 
|  | break; | 
|  | case CRUSH_BUCKET_STRAW2: | 
|  | err = crush_decode_straw2_bucket(p, end, | 
|  | (struct crush_bucket_straw2 *)b); | 
|  | if (err < 0) | 
|  | goto fail; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* rules */ | 
|  | dout("rule vec is %p\n", c->rules); | 
|  | for (i = 0; i < c->max_rules; i++) { | 
|  | u32 yes; | 
|  | struct crush_rule *r; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, yes, bad); | 
|  | if (!yes) { | 
|  | dout("crush_decode NO rule %d off %x %p to %p\n", | 
|  | i, (int)(*p-start), *p, end); | 
|  | c->rules[i] = NULL; | 
|  | continue; | 
|  | } | 
|  |  | 
|  | dout("crush_decode rule %d off %x %p to %p\n", | 
|  | i, (int)(*p-start), *p, end); | 
|  |  | 
|  | /* len */ | 
|  | ceph_decode_32_safe(p, end, yes, bad); | 
|  | #if BITS_PER_LONG == 32 | 
|  | if (yes > (ULONG_MAX - sizeof(*r)) | 
|  | / sizeof(struct crush_rule_step)) | 
|  | goto bad; | 
|  | #endif | 
|  | r = c->rules[i] = kmalloc(sizeof(*r) + | 
|  | yes*sizeof(struct crush_rule_step), | 
|  | GFP_NOFS); | 
|  | if (r == NULL) | 
|  | goto badmem; | 
|  | dout(" rule %d is at %p\n", i, r); | 
|  | r->len = yes; | 
|  | ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ | 
|  | ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); | 
|  | for (j = 0; j < r->len; j++) { | 
|  | r->steps[j].op = ceph_decode_32(p); | 
|  | r->steps[j].arg1 = ceph_decode_32(p); | 
|  | r->steps[j].arg2 = ceph_decode_32(p); | 
|  | } | 
|  | } | 
|  |  | 
|  | ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */ | 
|  | ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */ | 
|  | ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */ | 
|  |  | 
|  | /* tunables */ | 
|  | ceph_decode_need(p, end, 3*sizeof(u32), done); | 
|  | c->choose_local_tries = ceph_decode_32(p); | 
|  | c->choose_local_fallback_tries =  ceph_decode_32(p); | 
|  | c->choose_total_tries = ceph_decode_32(p); | 
|  | dout("crush decode tunable choose_local_tries = %d\n", | 
|  | c->choose_local_tries); | 
|  | dout("crush decode tunable choose_local_fallback_tries = %d\n", | 
|  | c->choose_local_fallback_tries); | 
|  | dout("crush decode tunable choose_total_tries = %d\n", | 
|  | c->choose_total_tries); | 
|  |  | 
|  | ceph_decode_need(p, end, sizeof(u32), done); | 
|  | c->chooseleaf_descend_once = ceph_decode_32(p); | 
|  | dout("crush decode tunable chooseleaf_descend_once = %d\n", | 
|  | c->chooseleaf_descend_once); | 
|  |  | 
|  | ceph_decode_need(p, end, sizeof(u8), done); | 
|  | c->chooseleaf_vary_r = ceph_decode_8(p); | 
|  | dout("crush decode tunable chooseleaf_vary_r = %d\n", | 
|  | c->chooseleaf_vary_r); | 
|  |  | 
|  | /* skip straw_calc_version, allowed_bucket_algs */ | 
|  | ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done); | 
|  | *p += sizeof(u8) + sizeof(u32); | 
|  |  | 
|  | ceph_decode_need(p, end, sizeof(u8), done); | 
|  | c->chooseleaf_stable = ceph_decode_8(p); | 
|  | dout("crush decode tunable chooseleaf_stable = %d\n", | 
|  | c->chooseleaf_stable); | 
|  |  | 
|  | if (*p != end) { | 
|  | /* class_map */ | 
|  | ceph_decode_skip_map(p, end, 32, 32, bad); | 
|  | /* class_name */ | 
|  | ceph_decode_skip_map(p, end, 32, string, bad); | 
|  | /* class_bucket */ | 
|  | ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad); | 
|  | } | 
|  |  | 
|  | if (*p != end) { | 
|  | err = decode_choose_args(p, end, c); | 
|  | if (err) | 
|  | goto fail; | 
|  | } | 
|  |  | 
|  | done: | 
|  | crush_finalize(c); | 
|  | dout("crush_decode success\n"); | 
|  | return c; | 
|  |  | 
|  | badmem: | 
|  | err = -ENOMEM; | 
|  | fail: | 
|  | dout("crush_decode fail %d\n", err); | 
|  | crush_destroy(c); | 
|  | return ERR_PTR(err); | 
|  |  | 
|  | bad: | 
|  | err = -EINVAL; | 
|  | goto fail; | 
|  | } | 
|  |  | 
|  | int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) | 
|  | { | 
|  | if (lhs->pool < rhs->pool) | 
|  | return -1; | 
|  | if (lhs->pool > rhs->pool) | 
|  | return 1; | 
|  | if (lhs->seed < rhs->seed) | 
|  | return -1; | 
|  | if (lhs->seed > rhs->seed) | 
|  | return 1; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs) | 
|  | { | 
|  | int ret; | 
|  |  | 
|  | ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid); | 
|  | if (ret) | 
|  | return ret; | 
|  |  | 
|  | if (lhs->shard < rhs->shard) | 
|  | return -1; | 
|  | if (lhs->shard > rhs->shard) | 
|  | return 1; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len) | 
|  | { | 
|  | struct ceph_pg_mapping *pg; | 
|  |  | 
|  | pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO); | 
|  | if (!pg) | 
|  | return NULL; | 
|  |  | 
|  | RB_CLEAR_NODE(&pg->node); | 
|  | return pg; | 
|  | } | 
|  |  | 
|  | static void free_pg_mapping(struct ceph_pg_mapping *pg) | 
|  | { | 
|  | WARN_ON(!RB_EMPTY_NODE(&pg->node)); | 
|  |  | 
|  | kfree(pg); | 
|  | } | 
|  |  | 
/*
 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
 * to a set of osds) and primary_temp (explicit primary setting)
 *
 * Instantiates the rbtree helpers for struct ceph_pg_mapping; the macro
 * is given the ->pgid member together with ceph_pg_compare() (presumably
 * the key and its ordering - confirm against the macro definition).
 * This file uses the resulting erase_pg_mapping().
 */
DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
		 RB_BYPTR, const struct ceph_pg *, node)
|  |  | 
|  | /* | 
|  | * rbtree of pg pool info | 
|  | */ | 
|  | static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) | 
|  | { | 
|  | struct rb_node **p = &root->rb_node; | 
|  | struct rb_node *parent = NULL; | 
|  | struct ceph_pg_pool_info *pi = NULL; | 
|  |  | 
|  | while (*p) { | 
|  | parent = *p; | 
|  | pi = rb_entry(parent, struct ceph_pg_pool_info, node); | 
|  | if (new->id < pi->id) | 
|  | p = &(*p)->rb_left; | 
|  | else if (new->id > pi->id) | 
|  | p = &(*p)->rb_right; | 
|  | else | 
|  | return -EEXIST; | 
|  | } | 
|  |  | 
|  | rb_link_node(&new->node, parent, p); | 
|  | rb_insert_color(&new->node, root); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) | 
|  | { | 
|  | struct ceph_pg_pool_info *pi; | 
|  | struct rb_node *n = root->rb_node; | 
|  |  | 
|  | while (n) { | 
|  | pi = rb_entry(n, struct ceph_pg_pool_info, node); | 
|  | if (id < pi->id) | 
|  | n = n->rb_left; | 
|  | else if (id > pi->id) | 
|  | n = n->rb_right; | 
|  | else | 
|  | return pi; | 
|  | } | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) | 
|  | { | 
|  | return __lookup_pg_pool(&map->pg_pools, id); | 
|  | } | 
|  |  | 
|  | const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) | 
|  | { | 
|  | struct ceph_pg_pool_info *pi; | 
|  |  | 
|  | if (id == CEPH_NOPOOL) | 
|  | return NULL; | 
|  |  | 
|  | if (WARN_ON_ONCE(id > (u64) INT_MAX)) | 
|  | return NULL; | 
|  |  | 
|  | pi = __lookup_pg_pool(&map->pg_pools, (int) id); | 
|  |  | 
|  | return pi ? pi->name : NULL; | 
|  | } | 
|  | EXPORT_SYMBOL(ceph_pg_pool_name_by_id); | 
|  |  | 
|  | int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) | 
|  | { | 
|  | struct rb_node *rbp; | 
|  |  | 
|  | for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) { | 
|  | struct ceph_pg_pool_info *pi = | 
|  | rb_entry(rbp, struct ceph_pg_pool_info, node); | 
|  | if (pi->name && strcmp(pi->name, name) == 0) | 
|  | return pi->id; | 
|  | } | 
|  | return -ENOENT; | 
|  | } | 
|  | EXPORT_SYMBOL(ceph_pg_poolid_by_name); | 
|  |  | 
|  | static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) | 
|  | { | 
|  | rb_erase(&pi->node, root); | 
|  | kfree(pi->name); | 
|  | kfree(pi); | 
|  | } | 
|  |  | 
|  | static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) | 
|  | { | 
|  | u8 ev, cv; | 
|  | unsigned len, num; | 
|  | void *pool_end; | 
|  |  | 
|  | ceph_decode_need(p, end, 2 + 4, bad); | 
|  | ev = ceph_decode_8(p);  /* encoding version */ | 
|  | cv = ceph_decode_8(p); /* compat version */ | 
|  | if (ev < 5) { | 
|  | pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); | 
|  | return -EINVAL; | 
|  | } | 
|  | if (cv > 9) { | 
|  | pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); | 
|  | return -EINVAL; | 
|  | } | 
|  | len = ceph_decode_32(p); | 
|  | ceph_decode_need(p, end, len, bad); | 
|  | pool_end = *p + len; | 
|  |  | 
|  | pi->type = ceph_decode_8(p); | 
|  | pi->size = ceph_decode_8(p); | 
|  | pi->crush_ruleset = ceph_decode_8(p); | 
|  | pi->object_hash = ceph_decode_8(p); | 
|  |  | 
|  | pi->pg_num = ceph_decode_32(p); | 
|  | pi->pgp_num = ceph_decode_32(p); | 
|  |  | 
|  | *p += 4 + 4;  /* skip lpg* */ | 
|  | *p += 4;      /* skip last_change */ | 
|  | *p += 8 + 4;  /* skip snap_seq, snap_epoch */ | 
|  |  | 
|  | /* skip snaps */ | 
|  | num = ceph_decode_32(p); | 
|  | while (num--) { | 
|  | *p += 8;  /* snapid key */ | 
|  | *p += 1 + 1; /* versions */ | 
|  | len = ceph_decode_32(p); | 
|  | *p += len; | 
|  | } | 
|  |  | 
|  | /* skip removed_snaps */ | 
|  | num = ceph_decode_32(p); | 
|  | *p += num * (8 + 8); | 
|  |  | 
|  | *p += 8;  /* skip auid */ | 
|  | pi->flags = ceph_decode_64(p); | 
|  | *p += 4;  /* skip crash_replay_interval */ | 
|  |  | 
|  | if (ev >= 7) | 
|  | pi->min_size = ceph_decode_8(p); | 
|  | else | 
|  | pi->min_size = pi->size - pi->size / 2; | 
|  |  | 
|  | if (ev >= 8) | 
|  | *p += 8 + 8;  /* skip quota_max_* */ | 
|  |  | 
|  | if (ev >= 9) { | 
|  | /* skip tiers */ | 
|  | num = ceph_decode_32(p); | 
|  | *p += num * 8; | 
|  |  | 
|  | *p += 8;  /* skip tier_of */ | 
|  | *p += 1;  /* skip cache_mode */ | 
|  |  | 
|  | pi->read_tier = ceph_decode_64(p); | 
|  | pi->write_tier = ceph_decode_64(p); | 
|  | } else { | 
|  | pi->read_tier = -1; | 
|  | pi->write_tier = -1; | 
|  | } | 
|  |  | 
|  | if (ev >= 10) { | 
|  | /* skip properties */ | 
|  | num = ceph_decode_32(p); | 
|  | while (num--) { | 
|  | len = ceph_decode_32(p); | 
|  | *p += len; /* key */ | 
|  | len = ceph_decode_32(p); | 
|  | *p += len; /* val */ | 
|  | } | 
|  | } | 
|  |  | 
|  | if (ev >= 11) { | 
|  | /* skip hit_set_params */ | 
|  | *p += 1 + 1; /* versions */ | 
|  | len = ceph_decode_32(p); | 
|  | *p += len; | 
|  |  | 
|  | *p += 4; /* skip hit_set_period */ | 
|  | *p += 4; /* skip hit_set_count */ | 
|  | } | 
|  |  | 
|  | if (ev >= 12) | 
|  | *p += 4; /* skip stripe_width */ | 
|  |  | 
|  | if (ev >= 13) { | 
|  | *p += 8; /* skip target_max_bytes */ | 
|  | *p += 8; /* skip target_max_objects */ | 
|  | *p += 4; /* skip cache_target_dirty_ratio_micro */ | 
|  | *p += 4; /* skip cache_target_full_ratio_micro */ | 
|  | *p += 4; /* skip cache_min_flush_age */ | 
|  | *p += 4; /* skip cache_min_evict_age */ | 
|  | } | 
|  |  | 
|  | if (ev >=  14) { | 
|  | /* skip erasure_code_profile */ | 
|  | len = ceph_decode_32(p); | 
|  | *p += len; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * last_force_op_resend_preluminous, will be overridden if the | 
|  | * map was encoded with RESEND_ON_SPLIT | 
|  | */ | 
|  | if (ev >= 15) | 
|  | pi->last_force_request_resend = ceph_decode_32(p); | 
|  | else | 
|  | pi->last_force_request_resend = 0; | 
|  |  | 
|  | if (ev >= 16) | 
|  | *p += 4; /* skip min_read_recency_for_promote */ | 
|  |  | 
|  | if (ev >= 17) | 
|  | *p += 8; /* skip expected_num_objects */ | 
|  |  | 
|  | if (ev >= 19) | 
|  | *p += 4; /* skip cache_target_dirty_high_ratio_micro */ | 
|  |  | 
|  | if (ev >= 20) | 
|  | *p += 4; /* skip min_write_recency_for_promote */ | 
|  |  | 
|  | if (ev >= 21) | 
|  | *p += 1; /* skip use_gmt_hitset */ | 
|  |  | 
|  | if (ev >= 22) | 
|  | *p += 1; /* skip fast_read */ | 
|  |  | 
|  | if (ev >= 23) { | 
|  | *p += 4; /* skip hit_set_grade_decay_rate */ | 
|  | *p += 4; /* skip hit_set_search_last_n */ | 
|  | } | 
|  |  | 
|  | if (ev >= 24) { | 
|  | /* skip opts */ | 
|  | *p += 1 + 1; /* versions */ | 
|  | len = ceph_decode_32(p); | 
|  | *p += len; | 
|  | } | 
|  |  | 
|  | if (ev >= 25) | 
|  | pi->last_force_request_resend = ceph_decode_32(p); | 
|  |  | 
|  | /* ignore the rest */ | 
|  |  | 
|  | *p = pool_end; | 
|  | calc_pg_masks(pi); | 
|  | return 0; | 
|  |  | 
|  | bad: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | struct ceph_pg_pool_info *pi; | 
|  | u32 num, len; | 
|  | u64 pool; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, num, bad); | 
|  | dout(" %d pool names\n", num); | 
|  | while (num--) { | 
|  | ceph_decode_64_safe(p, end, pool, bad); | 
|  | ceph_decode_32_safe(p, end, len, bad); | 
|  | dout("  pool %llu len %d\n", pool, len); | 
|  | ceph_decode_need(p, end, len, bad); | 
|  | pi = __lookup_pg_pool(&map->pg_pools, pool); | 
|  | if (pi) { | 
|  | char *name = kstrndup(*p, len, GFP_NOFS); | 
|  |  | 
|  | if (!name) | 
|  | return -ENOMEM; | 
|  | kfree(pi->name); | 
|  | pi->name = name; | 
|  | dout("  name is %s\n", pi->name); | 
|  | } | 
|  | *p += len; | 
|  | } | 
|  | return 0; | 
|  |  | 
|  | bad: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * osd map | 
|  | */ | 
|  | struct ceph_osdmap *ceph_osdmap_alloc(void) | 
|  | { | 
|  | struct ceph_osdmap *map; | 
|  |  | 
|  | map = kzalloc(sizeof(*map), GFP_NOIO); | 
|  | if (!map) | 
|  | return NULL; | 
|  |  | 
|  | map->pg_pools = RB_ROOT; | 
|  | map->pool_max = -1; | 
|  | map->pg_temp = RB_ROOT; | 
|  | map->primary_temp = RB_ROOT; | 
|  | map->pg_upmap = RB_ROOT; | 
|  | map->pg_upmap_items = RB_ROOT; | 
|  | mutex_init(&map->crush_workspace_mutex); | 
|  |  | 
|  | return map; | 
|  | } | 
|  |  | 
|  | void ceph_osdmap_destroy(struct ceph_osdmap *map) | 
|  | { | 
|  | dout("osdmap_destroy %p\n", map); | 
|  | if (map->crush) | 
|  | crush_destroy(map->crush); | 
|  | while (!RB_EMPTY_ROOT(&map->pg_temp)) { | 
|  | struct ceph_pg_mapping *pg = | 
|  | rb_entry(rb_first(&map->pg_temp), | 
|  | struct ceph_pg_mapping, node); | 
|  | erase_pg_mapping(&map->pg_temp, pg); | 
|  | free_pg_mapping(pg); | 
|  | } | 
|  | while (!RB_EMPTY_ROOT(&map->primary_temp)) { | 
|  | struct ceph_pg_mapping *pg = | 
|  | rb_entry(rb_first(&map->primary_temp), | 
|  | struct ceph_pg_mapping, node); | 
|  | erase_pg_mapping(&map->primary_temp, pg); | 
|  | free_pg_mapping(pg); | 
|  | } | 
|  | while (!RB_EMPTY_ROOT(&map->pg_upmap)) { | 
|  | struct ceph_pg_mapping *pg = | 
|  | rb_entry(rb_first(&map->pg_upmap), | 
|  | struct ceph_pg_mapping, node); | 
|  | rb_erase(&pg->node, &map->pg_upmap); | 
|  | kfree(pg); | 
|  | } | 
|  | while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) { | 
|  | struct ceph_pg_mapping *pg = | 
|  | rb_entry(rb_first(&map->pg_upmap_items), | 
|  | struct ceph_pg_mapping, node); | 
|  | rb_erase(&pg->node, &map->pg_upmap_items); | 
|  | kfree(pg); | 
|  | } | 
|  | while (!RB_EMPTY_ROOT(&map->pg_pools)) { | 
|  | struct ceph_pg_pool_info *pi = | 
|  | rb_entry(rb_first(&map->pg_pools), | 
|  | struct ceph_pg_pool_info, node); | 
|  | __remove_pg_pool(&map->pg_pools, pi); | 
|  | } | 
|  | kfree(map->osd_state); | 
|  | kfree(map->osd_weight); | 
|  | kfree(map->osd_addr); | 
|  | kfree(map->osd_primary_affinity); | 
|  | kfree(map->crush_workspace); | 
|  | kfree(map); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Adjust max_osd value, (re)allocate arrays. | 
|  | * | 
|  | * The new elements are properly initialized. | 
|  | */ | 
|  | static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) | 
|  | { | 
|  | u32 *state; | 
|  | u32 *weight; | 
|  | struct ceph_entity_addr *addr; | 
|  | int i; | 
|  |  | 
|  | state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); | 
|  | if (!state) | 
|  | return -ENOMEM; | 
|  | map->osd_state = state; | 
|  |  | 
|  | weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); | 
|  | if (!weight) | 
|  | return -ENOMEM; | 
|  | map->osd_weight = weight; | 
|  |  | 
|  | addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); | 
|  | if (!addr) | 
|  | return -ENOMEM; | 
|  | map->osd_addr = addr; | 
|  |  | 
|  | for (i = map->max_osd; i < max; i++) { | 
|  | map->osd_state[i] = 0; | 
|  | map->osd_weight[i] = CEPH_OSD_OUT; | 
|  | memset(map->osd_addr + i, 0, sizeof(*map->osd_addr)); | 
|  | } | 
|  |  | 
|  | if (map->osd_primary_affinity) { | 
|  | u32 *affinity; | 
|  |  | 
|  | affinity = krealloc(map->osd_primary_affinity, | 
|  | max*sizeof(*affinity), GFP_NOFS); | 
|  | if (!affinity) | 
|  | return -ENOMEM; | 
|  | map->osd_primary_affinity = affinity; | 
|  |  | 
|  | for (i = map->max_osd; i < max; i++) | 
|  | map->osd_primary_affinity[i] = | 
|  | CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; | 
|  | } | 
|  |  | 
|  | map->max_osd = max; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) | 
|  | { | 
|  | void *workspace; | 
|  | size_t work_size; | 
|  |  | 
|  | if (IS_ERR(crush)) | 
|  | return PTR_ERR(crush); | 
|  |  | 
|  | work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); | 
|  | dout("%s work_size %zu bytes\n", __func__, work_size); | 
|  | workspace = kmalloc(work_size, GFP_NOIO); | 
|  | if (!workspace) { | 
|  | crush_destroy(crush); | 
|  | return -ENOMEM; | 
|  | } | 
|  | crush_init_workspace(crush, workspace); | 
|  |  | 
|  | if (map->crush) | 
|  | crush_destroy(map->crush); | 
|  | kfree(map->crush_workspace); | 
|  | map->crush = crush; | 
|  | map->crush_workspace = workspace; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | #define OSDMAP_WRAPPER_COMPAT_VER	7 | 
|  | #define OSDMAP_CLIENT_DATA_COMPAT_VER	1 | 
|  |  | 
|  | /* | 
|  | * Return 0 or error.  On success, *v is set to 0 for old (v6) osdmaps, | 
|  | * to struct_v of the client_data section for new (v7 and above) | 
|  | * osdmaps. | 
|  | */ | 
|  | static int get_osdmap_client_data_v(void **p, void *end, | 
|  | const char *prefix, u8 *v) | 
|  | { | 
|  | u8 struct_v; | 
|  |  | 
|  | ceph_decode_8_safe(p, end, struct_v, e_inval); | 
|  | if (struct_v >= 7) { | 
|  | u8 struct_compat; | 
|  |  | 
|  | ceph_decode_8_safe(p, end, struct_compat, e_inval); | 
|  | if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { | 
|  | pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n", | 
|  | struct_v, struct_compat, | 
|  | OSDMAP_WRAPPER_COMPAT_VER, prefix); | 
|  | return -EINVAL; | 
|  | } | 
|  | *p += 4; /* ignore wrapper struct_len */ | 
|  |  | 
|  | ceph_decode_8_safe(p, end, struct_v, e_inval); | 
|  | ceph_decode_8_safe(p, end, struct_compat, e_inval); | 
|  | if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { | 
|  | pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n", | 
|  | struct_v, struct_compat, | 
|  | OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); | 
|  | return -EINVAL; | 
|  | } | 
|  | *p += 4; /* ignore client data struct_len */ | 
|  | } else { | 
|  | u16 version; | 
|  |  | 
|  | *p -= 1; | 
|  | ceph_decode_16_safe(p, end, version, e_inval); | 
|  | if (version < 6) { | 
|  | pr_warn("got v %d < 6 of %s ceph_osdmap\n", | 
|  | version, prefix); | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | /* old osdmap enconding */ | 
|  | struct_v = 0; | 
|  | } | 
|  |  | 
|  | *v = struct_v; | 
|  | return 0; | 
|  |  | 
|  | e_inval: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, | 
|  | bool incremental) | 
|  | { | 
|  | u32 n; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, n, e_inval); | 
|  | while (n--) { | 
|  | struct ceph_pg_pool_info *pi; | 
|  | u64 pool; | 
|  | int ret; | 
|  |  | 
|  | ceph_decode_64_safe(p, end, pool, e_inval); | 
|  |  | 
|  | pi = __lookup_pg_pool(&map->pg_pools, pool); | 
|  | if (!incremental || !pi) { | 
|  | pi = kzalloc(sizeof(*pi), GFP_NOFS); | 
|  | if (!pi) | 
|  | return -ENOMEM; | 
|  |  | 
|  | pi->id = pool; | 
|  |  | 
|  | ret = __insert_pg_pool(&map->pg_pools, pi); | 
|  | if (ret) { | 
|  | kfree(pi); | 
|  | return ret; | 
|  | } | 
|  | } | 
|  |  | 
|  | ret = decode_pool(p, end, pi); | 
|  | if (ret) | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | return 0; | 
|  |  | 
|  | e_inval: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static int decode_pools(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | return __decode_pools(p, end, map, false); | 
|  | } | 
|  |  | 
|  | static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | return __decode_pools(p, end, map, true); | 
|  | } | 
|  |  | 
|  | typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool); | 
|  |  | 
|  | static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root, | 
|  | decode_mapping_fn_t fn, bool incremental) | 
|  | { | 
|  | u32 n; | 
|  |  | 
|  | WARN_ON(!incremental && !fn); | 
|  |  | 
|  | ceph_decode_32_safe(p, end, n, e_inval); | 
|  | while (n--) { | 
|  | struct ceph_pg_mapping *pg; | 
|  | struct ceph_pg pgid; | 
|  | int ret; | 
|  |  | 
|  | ret = ceph_decode_pgid(p, end, &pgid); | 
|  | if (ret) | 
|  | return ret; | 
|  |  | 
|  | pg = lookup_pg_mapping(mapping_root, &pgid); | 
|  | if (pg) { | 
|  | WARN_ON(!incremental); | 
|  | erase_pg_mapping(mapping_root, pg); | 
|  | free_pg_mapping(pg); | 
|  | } | 
|  |  | 
|  | if (fn) { | 
|  | pg = fn(p, end, incremental); | 
|  | if (IS_ERR(pg)) | 
|  | return PTR_ERR(pg); | 
|  |  | 
|  | if (pg) { | 
|  | pg->pgid = pgid; /* struct */ | 
|  | insert_pg_mapping(mapping_root, pg); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | return 0; | 
|  |  | 
|  | e_inval: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end, | 
|  | bool incremental) | 
|  | { | 
|  | struct ceph_pg_mapping *pg; | 
|  | u32 len, i; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | if (len == 0 && incremental) | 
|  | return NULL;	/* new_pg_temp: [] to remove */ | 
|  | if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32)) | 
|  | return ERR_PTR(-EINVAL); | 
|  |  | 
|  | ceph_decode_need(p, end, len * sizeof(u32), e_inval); | 
|  | pg = alloc_pg_mapping(len * sizeof(u32)); | 
|  | if (!pg) | 
|  | return ERR_PTR(-ENOMEM); | 
|  |  | 
|  | pg->pg_temp.len = len; | 
|  | for (i = 0; i < len; i++) | 
|  | pg->pg_temp.osds[i] = ceph_decode_32(p); | 
|  |  | 
|  | return pg; | 
|  |  | 
|  | e_inval: | 
|  | return ERR_PTR(-EINVAL); | 
|  | } | 
|  |  | 
|  | static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, | 
|  | false); | 
|  | } | 
|  |  | 
|  | static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, | 
|  | true); | 
|  | } | 
|  |  | 
|  | static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end, | 
|  | bool incremental) | 
|  | { | 
|  | struct ceph_pg_mapping *pg; | 
|  | u32 osd; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, osd, e_inval); | 
|  | if (osd == (u32)-1 && incremental) | 
|  | return NULL;	/* new_primary_temp: -1 to remove */ | 
|  |  | 
|  | pg = alloc_pg_mapping(0); | 
|  | if (!pg) | 
|  | return ERR_PTR(-ENOMEM); | 
|  |  | 
|  | pg->primary_temp.osd = osd; | 
|  | return pg; | 
|  |  | 
|  | e_inval: | 
|  | return ERR_PTR(-EINVAL); | 
|  | } | 
|  |  | 
|  | static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | return decode_pg_mapping(p, end, &map->primary_temp, | 
|  | __decode_primary_temp, false); | 
|  | } | 
|  |  | 
|  | static int decode_new_primary_temp(void **p, void *end, | 
|  | struct ceph_osdmap *map) | 
|  | { | 
|  | return decode_pg_mapping(p, end, &map->primary_temp, | 
|  | __decode_primary_temp, true); | 
|  | } | 
|  |  | 
|  | u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) | 
|  | { | 
|  | BUG_ON(osd >= map->max_osd); | 
|  |  | 
|  | if (!map->osd_primary_affinity) | 
|  | return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; | 
|  |  | 
|  | return map->osd_primary_affinity[osd]; | 
|  | } | 
|  |  | 
|  | static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) | 
|  | { | 
|  | BUG_ON(osd >= map->max_osd); | 
|  |  | 
|  | if (!map->osd_primary_affinity) { | 
|  | int i; | 
|  |  | 
|  | map->osd_primary_affinity = kmalloc_array(map->max_osd, | 
|  | sizeof(u32), | 
|  | GFP_NOFS); | 
|  | if (!map->osd_primary_affinity) | 
|  | return -ENOMEM; | 
|  |  | 
|  | for (i = 0; i < map->max_osd; i++) | 
|  | map->osd_primary_affinity[i] = | 
|  | CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; | 
|  | } | 
|  |  | 
|  | map->osd_primary_affinity[osd] = aff; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int decode_primary_affinity(void **p, void *end, | 
|  | struct ceph_osdmap *map) | 
|  | { | 
|  | u32 len, i; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | if (len == 0) { | 
|  | kfree(map->osd_primary_affinity); | 
|  | map->osd_primary_affinity = NULL; | 
|  | return 0; | 
|  | } | 
|  | if (len != map->max_osd) | 
|  | goto e_inval; | 
|  |  | 
|  | ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); | 
|  |  | 
|  | for (i = 0; i < map->max_osd; i++) { | 
|  | int ret; | 
|  |  | 
|  | ret = set_primary_affinity(map, i, ceph_decode_32(p)); | 
|  | if (ret) | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | return 0; | 
|  |  | 
|  | e_inval: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static int decode_new_primary_affinity(void **p, void *end, | 
|  | struct ceph_osdmap *map) | 
|  | { | 
|  | u32 n; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, n, e_inval); | 
|  | while (n--) { | 
|  | u32 osd, aff; | 
|  | int ret; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, osd, e_inval); | 
|  | ceph_decode_32_safe(p, end, aff, e_inval); | 
|  |  | 
|  | ret = set_primary_affinity(map, osd, aff); | 
|  | if (ret) | 
|  | return ret; | 
|  |  | 
|  | pr_info("osd%d primary-affinity 0x%x\n", osd, aff); | 
|  | } | 
|  |  | 
|  | return 0; | 
|  |  | 
|  | e_inval: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end, | 
|  | bool __unused) | 
|  | { | 
|  | return __decode_pg_temp(p, end, false); | 
|  | } | 
|  |  | 
|  | static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, | 
|  | false); | 
|  | } | 
|  |  | 
|  | static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, | 
|  | true); | 
|  | } | 
|  |  | 
|  | static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true); | 
|  | } | 
|  |  | 
|  | static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end, | 
|  | bool __unused) | 
|  | { | 
|  | struct ceph_pg_mapping *pg; | 
|  | u32 len, i; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) | 
|  | return ERR_PTR(-EINVAL); | 
|  |  | 
|  | ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval); | 
|  | pg = alloc_pg_mapping(2 * len * sizeof(u32)); | 
|  | if (!pg) | 
|  | return ERR_PTR(-ENOMEM); | 
|  |  | 
|  | pg->pg_upmap_items.len = len; | 
|  | for (i = 0; i < len; i++) { | 
|  | pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p); | 
|  | pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p); | 
|  | } | 
|  |  | 
|  | return pg; | 
|  |  | 
|  | e_inval: | 
|  | return ERR_PTR(-EINVAL); | 
|  | } | 
|  |  | 
|  | static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | return decode_pg_mapping(p, end, &map->pg_upmap_items, | 
|  | __decode_pg_upmap_items, false); | 
|  | } | 
|  |  | 
|  | static int decode_new_pg_upmap_items(void **p, void *end, | 
|  | struct ceph_osdmap *map) | 
|  | { | 
|  | return decode_pg_mapping(p, end, &map->pg_upmap_items, | 
|  | __decode_pg_upmap_items, true); | 
|  | } | 
|  |  | 
|  | static int decode_old_pg_upmap_items(void **p, void *end, | 
|  | struct ceph_osdmap *map) | 
|  | { | 
|  | return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * decode a full map. | 
|  | */ | 
|  | static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) | 
|  | { | 
|  | u8 struct_v; | 
|  | u32 epoch = 0; | 
|  | void *start = *p; | 
|  | u32 max; | 
|  | u32 len, i; | 
|  | int err; | 
|  |  | 
|  | dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); | 
|  |  | 
|  | err = get_osdmap_client_data_v(p, end, "full", &struct_v); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | /* fsid, epoch, created, modified */ | 
|  | ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + | 
|  | sizeof(map->created) + sizeof(map->modified), e_inval); | 
|  | ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); | 
|  | epoch = map->epoch = ceph_decode_32(p); | 
|  | ceph_decode_copy(p, &map->created, sizeof(map->created)); | 
|  | ceph_decode_copy(p, &map->modified, sizeof(map->modified)); | 
|  |  | 
|  | /* pools */ | 
|  | err = decode_pools(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | /* pool_name */ | 
|  | err = decode_pool_names(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | ceph_decode_32_safe(p, end, map->pool_max, e_inval); | 
|  |  | 
|  | ceph_decode_32_safe(p, end, map->flags, e_inval); | 
|  |  | 
|  | /* max_osd */ | 
|  | ceph_decode_32_safe(p, end, max, e_inval); | 
|  |  | 
|  | /* (re)alloc osd arrays */ | 
|  | err = osdmap_set_max_osd(map, max); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | /* osd_state, osd_weight, osd_addrs->client_addr */ | 
|  | ceph_decode_need(p, end, 3*sizeof(u32) + | 
|  | map->max_osd*((struct_v >= 5 ? sizeof(u32) : | 
|  | sizeof(u8)) + | 
|  | sizeof(*map->osd_weight) + | 
|  | sizeof(*map->osd_addr)), e_inval); | 
|  |  | 
|  | if (ceph_decode_32(p) != map->max_osd) | 
|  | goto e_inval; | 
|  |  | 
|  | if (struct_v >= 5) { | 
|  | for (i = 0; i < map->max_osd; i++) | 
|  | map->osd_state[i] = ceph_decode_32(p); | 
|  | } else { | 
|  | for (i = 0; i < map->max_osd; i++) | 
|  | map->osd_state[i] = ceph_decode_8(p); | 
|  | } | 
|  |  | 
|  | if (ceph_decode_32(p) != map->max_osd) | 
|  | goto e_inval; | 
|  |  | 
|  | for (i = 0; i < map->max_osd; i++) | 
|  | map->osd_weight[i] = ceph_decode_32(p); | 
|  |  | 
|  | if (ceph_decode_32(p) != map->max_osd) | 
|  | goto e_inval; | 
|  |  | 
|  | ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); | 
|  | for (i = 0; i < map->max_osd; i++) | 
|  | ceph_decode_addr(&map->osd_addr[i]); | 
|  |  | 
|  | /* pg_temp */ | 
|  | err = decode_pg_temp(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | /* primary_temp */ | 
|  | if (struct_v >= 1) { | 
|  | err = decode_primary_temp(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  | } | 
|  |  | 
|  | /* primary_affinity */ | 
|  | if (struct_v >= 2) { | 
|  | err = decode_primary_affinity(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  | } else { | 
|  | WARN_ON(map->osd_primary_affinity); | 
|  | } | 
|  |  | 
|  | /* crush */ | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end))); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | *p += len; | 
|  | if (struct_v >= 3) { | 
|  | /* erasure_code_profiles */ | 
|  | ceph_decode_skip_map_of_map(p, end, string, string, string, | 
|  | e_inval); | 
|  | } | 
|  |  | 
|  | if (struct_v >= 4) { | 
|  | err = decode_pg_upmap(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | err = decode_pg_upmap_items(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  | } else { | 
|  | WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap)); | 
|  | WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items)); | 
|  | } | 
|  |  | 
|  | /* ignore the rest */ | 
|  | *p = end; | 
|  |  | 
|  | dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); | 
|  | return 0; | 
|  |  | 
|  | e_inval: | 
|  | err = -EINVAL; | 
|  | bad: | 
|  | pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", | 
|  | err, epoch, (int)(*p - start), *p, start, end); | 
|  | print_hex_dump(KERN_DEBUG, "osdmap: ", | 
|  | DUMP_PREFIX_OFFSET, 16, 1, | 
|  | start, end - start, true); | 
|  | return err; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Allocate and decode a full map. | 
|  | */ | 
|  | struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) | 
|  | { | 
|  | struct ceph_osdmap *map; | 
|  | int ret; | 
|  |  | 
|  | map = ceph_osdmap_alloc(); | 
|  | if (!map) | 
|  | return ERR_PTR(-ENOMEM); | 
|  |  | 
|  | ret = osdmap_decode(p, end, map); | 
|  | if (ret) { | 
|  | ceph_osdmap_destroy(map); | 
|  | return ERR_PTR(ret); | 
|  | } | 
|  |  | 
|  | return map; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Encoding order is (new_up_client, new_state, new_weight).  Need to | 
|  | * apply in the (new_weight, new_state, new_up_client) order, because | 
|  | * an incremental map may look like e.g. | 
|  | * | 
|  | *     new_up_client: { osd=6, addr=... } # set osd_state and addr | 
|  | *     new_state: { osd=6, xorstate=EXISTS } # clear osd_state | 
|  | */ | 
|  | static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, | 
|  | struct ceph_osdmap *map) | 
|  | { | 
|  | void *new_up_client; | 
|  | void *new_state; | 
|  | void *new_weight_end; | 
|  | u32 len; | 
|  |  | 
|  | new_up_client = *p; | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | len *= sizeof(u32) + sizeof(struct ceph_entity_addr); | 
|  | ceph_decode_need(p, end, len, e_inval); | 
|  | *p += len; | 
|  |  | 
|  | new_state = *p; | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8)); | 
|  | ceph_decode_need(p, end, len, e_inval); | 
|  | *p += len; | 
|  |  | 
|  | /* new_weight */ | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | while (len--) { | 
|  | s32 osd; | 
|  | u32 w; | 
|  |  | 
|  | ceph_decode_need(p, end, 2*sizeof(u32), e_inval); | 
|  | osd = ceph_decode_32(p); | 
|  | w = ceph_decode_32(p); | 
|  | BUG_ON(osd >= map->max_osd); | 
|  | pr_info("osd%d weight 0x%x %s\n", osd, w, | 
|  | w == CEPH_OSD_IN ? "(in)" : | 
|  | (w == CEPH_OSD_OUT ? "(out)" : "")); | 
|  | map->osd_weight[osd] = w; | 
|  |  | 
|  | /* | 
|  | * If we are marking in, set the EXISTS, and clear the | 
|  | * AUTOOUT and NEW bits. | 
|  | */ | 
|  | if (w) { | 
|  | map->osd_state[osd] |= CEPH_OSD_EXISTS; | 
|  | map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT | | 
|  | CEPH_OSD_NEW); | 
|  | } | 
|  | } | 
|  | new_weight_end = *p; | 
|  |  | 
|  | /* new_state (up/down) */ | 
|  | *p = new_state; | 
|  | len = ceph_decode_32(p); | 
|  | while (len--) { | 
|  | s32 osd; | 
|  | u32 xorstate; | 
|  | int ret; | 
|  |  | 
|  | osd = ceph_decode_32(p); | 
|  | if (struct_v >= 5) | 
|  | xorstate = ceph_decode_32(p); | 
|  | else | 
|  | xorstate = ceph_decode_8(p); | 
|  | if (xorstate == 0) | 
|  | xorstate = CEPH_OSD_UP; | 
|  | BUG_ON(osd >= map->max_osd); | 
|  | if ((map->osd_state[osd] & CEPH_OSD_UP) && | 
|  | (xorstate & CEPH_OSD_UP)) | 
|  | pr_info("osd%d down\n", osd); | 
|  | if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && | 
|  | (xorstate & CEPH_OSD_EXISTS)) { | 
|  | pr_info("osd%d does not exist\n", osd); | 
|  | ret = set_primary_affinity(map, osd, | 
|  | CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); | 
|  | if (ret) | 
|  | return ret; | 
|  | memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr)); | 
|  | map->osd_state[osd] = 0; | 
|  | } else { | 
|  | map->osd_state[osd] ^= xorstate; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* new_up_client */ | 
|  | *p = new_up_client; | 
|  | len = ceph_decode_32(p); | 
|  | while (len--) { | 
|  | s32 osd; | 
|  | struct ceph_entity_addr addr; | 
|  |  | 
|  | osd = ceph_decode_32(p); | 
|  | ceph_decode_copy(p, &addr, sizeof(addr)); | 
|  | ceph_decode_addr(&addr); | 
|  | BUG_ON(osd >= map->max_osd); | 
|  | pr_info("osd%d up\n", osd); | 
|  | map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; | 
|  | map->osd_addr[osd] = addr; | 
|  | } | 
|  |  | 
|  | *p = new_weight_end; | 
|  | return 0; | 
|  |  | 
|  | e_inval: | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * decode and apply an incremental map update. | 
|  | */ | 
|  | struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | 
|  | struct ceph_osdmap *map) | 
|  | { | 
|  | struct ceph_fsid fsid; | 
|  | u32 epoch = 0; | 
|  | struct ceph_timespec modified; | 
|  | s32 len; | 
|  | u64 pool; | 
|  | __s64 new_pool_max; | 
|  | __s32 new_flags, max; | 
|  | void *start = *p; | 
|  | int err; | 
|  | u8 struct_v; | 
|  |  | 
|  | dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); | 
|  |  | 
|  | err = get_osdmap_client_data_v(p, end, "inc", &struct_v); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | /* fsid, epoch, modified, new_pool_max, new_flags */ | 
|  | ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + | 
|  | sizeof(u64) + sizeof(u32), e_inval); | 
|  | ceph_decode_copy(p, &fsid, sizeof(fsid)); | 
|  | epoch = ceph_decode_32(p); | 
|  | BUG_ON(epoch != map->epoch+1); | 
|  | ceph_decode_copy(p, &modified, sizeof(modified)); | 
|  | new_pool_max = ceph_decode_64(p); | 
|  | new_flags = ceph_decode_32(p); | 
|  |  | 
|  | /* full map? */ | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | if (len > 0) { | 
|  | dout("apply_incremental full map len %d, %p to %p\n", | 
|  | len, *p, end); | 
|  | return ceph_osdmap_decode(p, min(*p+len, end)); | 
|  | } | 
|  |  | 
|  | /* new crush? */ | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | if (len > 0) { | 
|  | err = osdmap_set_crush(map, | 
|  | crush_decode(*p, min(*p + len, end))); | 
|  | if (err) | 
|  | goto bad; | 
|  | *p += len; | 
|  | } | 
|  |  | 
|  | /* new flags? */ | 
|  | if (new_flags >= 0) | 
|  | map->flags = new_flags; | 
|  | if (new_pool_max >= 0) | 
|  | map->pool_max = new_pool_max; | 
|  |  | 
|  | /* new max? */ | 
|  | ceph_decode_32_safe(p, end, max, e_inval); | 
|  | if (max >= 0) { | 
|  | err = osdmap_set_max_osd(map, max); | 
|  | if (err) | 
|  | goto bad; | 
|  | } | 
|  |  | 
|  | map->epoch++; | 
|  | map->modified = modified; | 
|  |  | 
|  | /* new_pools */ | 
|  | err = decode_new_pools(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | /* new_pool_names */ | 
|  | err = decode_pool_names(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | /* old_pool */ | 
|  | ceph_decode_32_safe(p, end, len, e_inval); | 
|  | while (len--) { | 
|  | struct ceph_pg_pool_info *pi; | 
|  |  | 
|  | ceph_decode_64_safe(p, end, pool, e_inval); | 
|  | pi = __lookup_pg_pool(&map->pg_pools, pool); | 
|  | if (pi) | 
|  | __remove_pg_pool(&map->pg_pools, pi); | 
|  | } | 
|  |  | 
|  | /* new_up_client, new_state, new_weight */ | 
|  | err = decode_new_up_state_weight(p, end, struct_v, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | /* new_pg_temp */ | 
|  | err = decode_new_pg_temp(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | /* new_primary_temp */ | 
|  | if (struct_v >= 1) { | 
|  | err = decode_new_primary_temp(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  | } | 
|  |  | 
|  | /* new_primary_affinity */ | 
|  | if (struct_v >= 2) { | 
|  | err = decode_new_primary_affinity(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  | } | 
|  |  | 
|  | if (struct_v >= 3) { | 
|  | /* new_erasure_code_profiles */ | 
|  | ceph_decode_skip_map_of_map(p, end, string, string, string, | 
|  | e_inval); | 
|  | /* old_erasure_code_profiles */ | 
|  | ceph_decode_skip_set(p, end, string, e_inval); | 
|  | } | 
|  |  | 
|  | if (struct_v >= 4) { | 
|  | err = decode_new_pg_upmap(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | err = decode_old_pg_upmap(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | err = decode_new_pg_upmap_items(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  |  | 
|  | err = decode_old_pg_upmap_items(p, end, map); | 
|  | if (err) | 
|  | goto bad; | 
|  | } | 
|  |  | 
|  | /* ignore the rest */ | 
|  | *p = end; | 
|  |  | 
|  | dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); | 
|  | return map; | 
|  |  | 
|  | e_inval: | 
|  | err = -EINVAL; | 
|  | bad: | 
|  | pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", | 
|  | err, epoch, (int)(*p - start), *p, start, end); | 
|  | print_hex_dump(KERN_DEBUG, "osdmap: ", | 
|  | DUMP_PREFIX_OFFSET, 16, 1, | 
|  | start, end - start, true); | 
|  | return ERR_PTR(err); | 
|  | } | 
|  |  | 
|  | void ceph_oloc_copy(struct ceph_object_locator *dest, | 
|  | const struct ceph_object_locator *src) | 
|  | { | 
|  | ceph_oloc_destroy(dest); | 
|  |  | 
|  | dest->pool = src->pool; | 
|  | if (src->pool_ns) | 
|  | dest->pool_ns = ceph_get_string(src->pool_ns); | 
|  | else | 
|  | dest->pool_ns = NULL; | 
|  | } | 
|  | EXPORT_SYMBOL(ceph_oloc_copy); | 
|  |  | 
|  | void ceph_oloc_destroy(struct ceph_object_locator *oloc) | 
|  | { | 
|  | ceph_put_string(oloc->pool_ns); | 
|  | } | 
|  | EXPORT_SYMBOL(ceph_oloc_destroy); | 
|  |  | 
|  | void ceph_oid_copy(struct ceph_object_id *dest, | 
|  | const struct ceph_object_id *src) | 
|  | { | 
|  | ceph_oid_destroy(dest); | 
|  |  | 
|  | if (src->name != src->inline_name) { | 
|  | /* very rare, see ceph_object_id definition */ | 
|  | dest->name = kmalloc(src->name_len + 1, | 
|  | GFP_NOIO | __GFP_NOFAIL); | 
|  | } else { | 
|  | dest->name = dest->inline_name; | 
|  | } | 
|  | memcpy(dest->name, src->name, src->name_len + 1); | 
|  | dest->name_len = src->name_len; | 
|  | } | 
|  | EXPORT_SYMBOL(ceph_oid_copy); | 
|  |  | 
|  | static __printf(2, 0) | 
|  | int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap) | 
|  | { | 
|  | int len; | 
|  |  | 
|  | WARN_ON(!ceph_oid_empty(oid)); | 
|  |  | 
|  | len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap); | 
|  | if (len >= sizeof(oid->inline_name)) | 
|  | return len; | 
|  |  | 
|  | oid->name_len = len; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * If oid doesn't fit into inline buffer, BUG. | 
|  | */ | 
|  | void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...) | 
|  | { | 
|  | va_list ap; | 
|  |  | 
|  | va_start(ap, fmt); | 
|  | BUG_ON(oid_printf_vargs(oid, fmt, ap)); | 
|  | va_end(ap); | 
|  | } | 
|  | EXPORT_SYMBOL(ceph_oid_printf); | 
|  |  | 
|  | static __printf(3, 0) | 
|  | int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp, | 
|  | const char *fmt, va_list ap) | 
|  | { | 
|  | va_list aq; | 
|  | int len; | 
|  |  | 
|  | va_copy(aq, ap); | 
|  | len = oid_printf_vargs(oid, fmt, aq); | 
|  | va_end(aq); | 
|  |  | 
|  | if (len) { | 
|  | char *external_name; | 
|  |  | 
|  | external_name = kmalloc(len + 1, gfp); | 
|  | if (!external_name) | 
|  | return -ENOMEM; | 
|  |  | 
|  | oid->name = external_name; | 
|  | WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len); | 
|  | oid->name_len = len; | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * If oid doesn't fit into inline buffer, allocate. | 
|  | */ | 
|  | int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, | 
|  | const char *fmt, ...) | 
|  | { | 
|  | va_list ap; | 
|  | int ret; | 
|  |  | 
|  | va_start(ap, fmt); | 
|  | ret = oid_aprintf_vargs(oid, gfp, fmt, ap); | 
|  | va_end(ap); | 
|  |  | 
|  | return ret; | 
|  | } | 
|  | EXPORT_SYMBOL(ceph_oid_aprintf); | 
|  |  | 
|  | void ceph_oid_destroy(struct ceph_object_id *oid) | 
|  | { | 
|  | if (oid->name != oid->inline_name) | 
|  | kfree(oid->name); | 
|  | } | 
|  | EXPORT_SYMBOL(ceph_oid_destroy); | 
|  |  | 
|  | /* | 
|  | * osds only | 
|  | */ | 
|  | static bool __osds_equal(const struct ceph_osds *lhs, | 
|  | const struct ceph_osds *rhs) | 
|  | { | 
|  | if (lhs->size == rhs->size && | 
|  | !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0]))) | 
|  | return true; | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * osds + primary | 
|  | */ | 
|  | static bool osds_equal(const struct ceph_osds *lhs, | 
|  | const struct ceph_osds *rhs) | 
|  | { | 
|  | if (__osds_equal(lhs, rhs) && | 
|  | lhs->primary == rhs->primary) | 
|  | return true; | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | static bool osds_valid(const struct ceph_osds *set) | 
|  | { | 
|  | /* non-empty set */ | 
|  | if (set->size > 0 && set->primary >= 0) | 
|  | return true; | 
|  |  | 
|  | /* empty can_shift_osds set */ | 
|  | if (!set->size && set->primary == -1) | 
|  | return true; | 
|  |  | 
|  | /* empty !can_shift_osds set - all NONE */ | 
|  | if (set->size > 0 && set->primary == -1) { | 
|  | int i; | 
|  |  | 
|  | for (i = 0; i < set->size; i++) { | 
|  | if (set->osds[i] != CRUSH_ITEM_NONE) | 
|  | break; | 
|  | } | 
|  | if (i == set->size) | 
|  | return true; | 
|  | } | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) | 
|  | { | 
|  | memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0])); | 
|  | dest->size = src->size; | 
|  | dest->primary = src->primary; | 
|  | } | 
|  |  | 
|  | bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num, | 
|  | u32 new_pg_num) | 
|  | { | 
|  | int old_bits = calc_bits_of(old_pg_num); | 
|  | int old_mask = (1 << old_bits) - 1; | 
|  | int n; | 
|  |  | 
|  | WARN_ON(pgid->seed >= old_pg_num); | 
|  | if (new_pg_num <= old_pg_num) | 
|  | return false; | 
|  |  | 
|  | for (n = 1; ; n++) { | 
|  | int next_bit = n << (old_bits - 1); | 
|  | u32 s = next_bit | pgid->seed; | 
|  |  | 
|  | if (s < old_pg_num || s == pgid->seed) | 
|  | continue; | 
|  | if (s >= new_pg_num) | 
|  | break; | 
|  |  | 
|  | s = ceph_stable_mod(s, old_pg_num, old_mask); | 
|  | if (s == pgid->seed) | 
|  | return true; | 
|  | } | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | bool ceph_is_new_interval(const struct ceph_osds *old_acting, | 
|  | const struct ceph_osds *new_acting, | 
|  | const struct ceph_osds *old_up, | 
|  | const struct ceph_osds *new_up, | 
|  | int old_size, | 
|  | int new_size, | 
|  | int old_min_size, | 
|  | int new_min_size, | 
|  | u32 old_pg_num, | 
|  | u32 new_pg_num, | 
|  | bool old_sort_bitwise, | 
|  | bool new_sort_bitwise, | 
|  | bool old_recovery_deletes, | 
|  | bool new_recovery_deletes, | 
|  | const struct ceph_pg *pgid) | 
|  | { | 
|  | return !osds_equal(old_acting, new_acting) || | 
|  | !osds_equal(old_up, new_up) || | 
|  | old_size != new_size || | 
|  | old_min_size != new_min_size || | 
|  | ceph_pg_is_split(pgid, old_pg_num, new_pg_num) || | 
|  | old_sort_bitwise != new_sort_bitwise || | 
|  | old_recovery_deletes != new_recovery_deletes; | 
|  | } | 
|  |  | 
|  | static int calc_pg_rank(int osd, const struct ceph_osds *acting) | 
|  | { | 
|  | int i; | 
|  |  | 
|  | for (i = 0; i < acting->size; i++) { | 
|  | if (acting->osds[i] == osd) | 
|  | return i; | 
|  | } | 
|  |  | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | static bool primary_changed(const struct ceph_osds *old_acting, | 
|  | const struct ceph_osds *new_acting) | 
|  | { | 
|  | if (!old_acting->size && !new_acting->size) | 
|  | return false; /* both still empty */ | 
|  |  | 
|  | if (!old_acting->size ^ !new_acting->size) | 
|  | return true; /* was empty, now not, or vice versa */ | 
|  |  | 
|  | if (old_acting->primary != new_acting->primary) | 
|  | return true; /* primary changed */ | 
|  |  | 
|  | if (calc_pg_rank(old_acting->primary, old_acting) != | 
|  | calc_pg_rank(new_acting->primary, new_acting)) | 
|  | return true; | 
|  |  | 
|  | return false; /* same primary (tho replicas may have changed) */ | 
|  | } | 
|  |  | 
|  | bool ceph_osds_changed(const struct ceph_osds *old_acting, | 
|  | const struct ceph_osds *new_acting, | 
|  | bool any_change) | 
|  | { | 
|  | if (primary_changed(old_acting, new_acting)) | 
|  | return true; | 
|  |  | 
|  | if (any_change && !__osds_equal(old_acting, new_acting)) | 
|  | return true; | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Map an object into a PG. | 
|  | * | 
|  | * Should only be called with target_oid and target_oloc (as opposed to | 
|  | * base_oid and base_oloc), since tiering isn't taken into account. | 
|  | */ | 
|  | void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, | 
|  | const struct ceph_object_id *oid, | 
|  | const struct ceph_object_locator *oloc, | 
|  | struct ceph_pg *raw_pgid) | 
|  | { | 
|  | WARN_ON(pi->id != oloc->pool); | 
|  |  | 
|  | if (!oloc->pool_ns) { | 
|  | raw_pgid->pool = oloc->pool; | 
|  | raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, | 
|  | oid->name_len); | 
|  | dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, | 
|  | raw_pgid->pool, raw_pgid->seed); | 
|  | } else { | 
|  | char stack_buf[256]; | 
|  | char *buf = stack_buf; | 
|  | int nsl = oloc->pool_ns->len; | 
|  | size_t total = nsl + 1 + oid->name_len; | 
|  |  | 
|  | if (total > sizeof(stack_buf)) | 
|  | buf = kmalloc(total, GFP_NOIO | __GFP_NOFAIL); | 
|  | memcpy(buf, oloc->pool_ns->str, nsl); | 
|  | buf[nsl] = '\037'; | 
|  | memcpy(buf + nsl + 1, oid->name, oid->name_len); | 
|  | raw_pgid->pool = oloc->pool; | 
|  | raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total); | 
|  | if (buf != stack_buf) | 
|  | kfree(buf); | 
|  | dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__, | 
|  | oid->name, nsl, oloc->pool_ns->str, | 
|  | raw_pgid->pool, raw_pgid->seed); | 
|  | } | 
|  | } | 
|  |  | 
|  | int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, | 
|  | const struct ceph_object_id *oid, | 
|  | const struct ceph_object_locator *oloc, | 
|  | struct ceph_pg *raw_pgid) | 
|  | { | 
|  | struct ceph_pg_pool_info *pi; | 
|  |  | 
|  | pi = ceph_pg_pool_by_id(osdmap, oloc->pool); | 
|  | if (!pi) | 
|  | return -ENOENT; | 
|  |  | 
|  | __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); | 
|  | return 0; | 
|  | } | 
|  | EXPORT_SYMBOL(ceph_object_locator_to_pg); | 
|  |  | 
|  | /* | 
|  | * Map a raw PG (full precision ps) into an actual PG. | 
|  | */ | 
|  | static void raw_pg_to_pg(struct ceph_pg_pool_info *pi, | 
|  | const struct ceph_pg *raw_pgid, | 
|  | struct ceph_pg *pgid) | 
|  | { | 
|  | pgid->pool = raw_pgid->pool; | 
|  | pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num, | 
|  | pi->pg_num_mask); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Map a raw PG (full precision ps) into a placement ps (placement | 
|  | * seed).  Include pool id in that value so that different pools don't | 
|  | * use the same seeds. | 
|  | */ | 
|  | static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, | 
|  | const struct ceph_pg *raw_pgid) | 
|  | { | 
|  | if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) { | 
|  | /* hash pool id and seed so that pool PGs do not overlap */ | 
|  | return crush_hash32_2(CRUSH_HASH_RJENKINS1, | 
|  | ceph_stable_mod(raw_pgid->seed, | 
|  | pi->pgp_num, | 
|  | pi->pgp_num_mask), | 
|  | raw_pgid->pool); | 
|  | } else { | 
|  | /* | 
|  | * legacy behavior: add ps and pool together.  this is | 
|  | * not a great approach because the PGs from each pool | 
|  | * will overlap on top of each other: 0.5 == 1.4 == | 
|  | * 2.3 == ... | 
|  | */ | 
|  | return ceph_stable_mod(raw_pgid->seed, pi->pgp_num, | 
|  | pi->pgp_num_mask) + | 
|  | (unsigned)raw_pgid->pool; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Magic value used for a "default" fallback choose_args, used if the | 
|  | * crush_choose_arg_map passed to do_crush() does not exist.  If this | 
|  | * also doesn't exist, fall back to canonical weights. | 
|  | */ | 
|  | #define CEPH_DEFAULT_CHOOSE_ARGS	-1 | 
|  |  | 
|  | static int do_crush(struct ceph_osdmap *map, int ruleno, int x, | 
|  | int *result, int result_max, | 
|  | const __u32 *weight, int weight_max, | 
|  | s64 choose_args_index) | 
|  | { | 
|  | struct crush_choose_arg_map *arg_map; | 
|  | int r; | 
|  |  | 
|  | BUG_ON(result_max > CEPH_PG_MAX_SIZE); | 
|  |  | 
|  | arg_map = lookup_choose_arg_map(&map->crush->choose_args, | 
|  | choose_args_index); | 
|  | if (!arg_map) | 
|  | arg_map = lookup_choose_arg_map(&map->crush->choose_args, | 
|  | CEPH_DEFAULT_CHOOSE_ARGS); | 
|  |  | 
|  | mutex_lock(&map->crush_workspace_mutex); | 
|  | r = crush_do_rule(map->crush, ruleno, x, result, result_max, | 
|  | weight, weight_max, map->crush_workspace, | 
|  | arg_map ? arg_map->args : NULL); | 
|  | mutex_unlock(&map->crush_workspace_mutex); | 
|  |  | 
|  | return r; | 
|  | } | 
|  |  | 
|  | static void remove_nonexistent_osds(struct ceph_osdmap *osdmap, | 
|  | struct ceph_pg_pool_info *pi, | 
|  | struct ceph_osds *set) | 
|  | { | 
|  | int i; | 
|  |  | 
|  | if (ceph_can_shift_osds(pi)) { | 
|  | int removed = 0; | 
|  |  | 
|  | /* shift left */ | 
|  | for (i = 0; i < set->size; i++) { | 
|  | if (!ceph_osd_exists(osdmap, set->osds[i])) { | 
|  | removed++; | 
|  | continue; | 
|  | } | 
|  | if (removed) | 
|  | set->osds[i - removed] = set->osds[i]; | 
|  | } | 
|  | set->size -= removed; | 
|  | } else { | 
|  | /* set dne devices to NONE */ | 
|  | for (i = 0; i < set->size; i++) { | 
|  | if (!ceph_osd_exists(osdmap, set->osds[i])) | 
|  | set->osds[i] = CRUSH_ITEM_NONE; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Calculate raw set (CRUSH output) for given PG and filter out | 
|  | * nonexistent OSDs.  ->primary is undefined for a raw set. | 
|  | * | 
|  | * Placement seed (CRUSH input) is returned through @ppps. | 
|  | */ | 
|  | static void pg_to_raw_osds(struct ceph_osdmap *osdmap, | 
|  | struct ceph_pg_pool_info *pi, | 
|  | const struct ceph_pg *raw_pgid, | 
|  | struct ceph_osds *raw, | 
|  | u32 *ppps) | 
|  | { | 
|  | u32 pps = raw_pg_to_pps(pi, raw_pgid); | 
|  | int ruleno; | 
|  | int len; | 
|  |  | 
|  | ceph_osds_init(raw); | 
|  | if (ppps) | 
|  | *ppps = pps; | 
|  |  | 
|  | ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type, | 
|  | pi->size); | 
|  | if (ruleno < 0) { | 
|  | pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", | 
|  | pi->id, pi->crush_ruleset, pi->type, pi->size); | 
|  | return; | 
|  | } | 
|  |  | 
|  | if (pi->size > ARRAY_SIZE(raw->osds)) { | 
|  | pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n", | 
|  | pi->id, pi->crush_ruleset, pi->type, pi->size, | 
|  | ARRAY_SIZE(raw->osds)); | 
|  | return; | 
|  | } | 
|  |  | 
|  | len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size, | 
|  | osdmap->osd_weight, osdmap->max_osd, pi->id); | 
|  | if (len < 0) { | 
|  | pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", | 
|  | len, ruleno, pi->id, pi->crush_ruleset, pi->type, | 
|  | pi->size); | 
|  | return; | 
|  | } | 
|  |  | 
|  | raw->size = len; | 
|  | remove_nonexistent_osds(osdmap, pi, raw); | 
|  | } | 
|  |  | 
|  | /* apply pg_upmap[_items] mappings */ | 
|  | static void apply_upmap(struct ceph_osdmap *osdmap, | 
|  | const struct ceph_pg *pgid, | 
|  | struct ceph_osds *raw) | 
|  | { | 
|  | struct ceph_pg_mapping *pg; | 
|  | int i, j; | 
|  |  | 
|  | pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid); | 
|  | if (pg) { | 
|  | /* make sure targets aren't marked out */ | 
|  | for (i = 0; i < pg->pg_upmap.len; i++) { | 
|  | int osd = pg->pg_upmap.osds[i]; | 
|  |  | 
|  | if (osd != CRUSH_ITEM_NONE && | 
|  | osd < osdmap->max_osd && | 
|  | osdmap->osd_weight[osd] == 0) { | 
|  | /* reject/ignore explicit mapping */ | 
|  | return; | 
|  | } | 
|  | } | 
|  | for (i = 0; i < pg->pg_upmap.len; i++) | 
|  | raw->osds[i] = pg->pg_upmap.osds[i]; | 
|  | raw->size = pg->pg_upmap.len; | 
|  | /* check and apply pg_upmap_items, if any */ | 
|  | } | 
|  |  | 
|  | pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); | 
|  | if (pg) { | 
|  | /* | 
|  | * Note: this approach does not allow a bidirectional swap, | 
|  | * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. | 
|  | */ | 
|  | for (i = 0; i < pg->pg_upmap_items.len; i++) { | 
|  | int from = pg->pg_upmap_items.from_to[i][0]; | 
|  | int to = pg->pg_upmap_items.from_to[i][1]; | 
|  | int pos = -1; | 
|  | bool exists = false; | 
|  |  | 
|  | /* make sure replacement doesn't already appear */ | 
|  | for (j = 0; j < raw->size; j++) { | 
|  | int osd = raw->osds[j]; | 
|  |  | 
|  | if (osd == to) { | 
|  | exists = true; | 
|  | break; | 
|  | } | 
|  | /* ignore mapping if target is marked out */ | 
|  | if (osd == from && pos < 0 && | 
|  | !(to != CRUSH_ITEM_NONE && | 
|  | to < osdmap->max_osd && | 
|  | osdmap->osd_weight[to] == 0)) { | 
|  | pos = j; | 
|  | } | 
|  | } | 
|  | if (!exists && pos >= 0) | 
|  | raw->osds[pos] = to; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Given raw set, calculate up set and up primary.  By definition of an | 
|  | * up set, the result won't contain nonexistent or down OSDs. | 
|  | * | 
|  | * This is done in-place - on return @set is the up set.  If it's | 
|  | * empty, ->primary will remain undefined. | 
|  | */ | 
|  | static void raw_to_up_osds(struct ceph_osdmap *osdmap, | 
|  | struct ceph_pg_pool_info *pi, | 
|  | struct ceph_osds *set) | 
|  | { | 
|  | int i; | 
|  |  | 
|  | /* ->primary is undefined for a raw set */ | 
|  | BUG_ON(set->primary != -1); | 
|  |  | 
|  | if (ceph_can_shift_osds(pi)) { | 
|  | int removed = 0; | 
|  |  | 
|  | /* shift left */ | 
|  | for (i = 0; i < set->size; i++) { | 
|  | if (ceph_osd_is_down(osdmap, set->osds[i])) { | 
|  | removed++; | 
|  | continue; | 
|  | } | 
|  | if (removed) | 
|  | set->osds[i - removed] = set->osds[i]; | 
|  | } | 
|  | set->size -= removed; | 
|  | if (set->size > 0) | 
|  | set->primary = set->osds[0]; | 
|  | } else { | 
|  | /* set down/dne devices to NONE */ | 
|  | for (i = set->size - 1; i >= 0; i--) { | 
|  | if (ceph_osd_is_down(osdmap, set->osds[i])) | 
|  | set->osds[i] = CRUSH_ITEM_NONE; | 
|  | else | 
|  | set->primary = set->osds[i]; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | static void apply_primary_affinity(struct ceph_osdmap *osdmap, | 
|  | struct ceph_pg_pool_info *pi, | 
|  | u32 pps, | 
|  | struct ceph_osds *up) | 
|  | { | 
|  | int i; | 
|  | int pos = -1; | 
|  |  | 
|  | /* | 
|  | * Do we have any non-default primary_affinity values for these | 
|  | * osds? | 
|  | */ | 
|  | if (!osdmap->osd_primary_affinity) | 
|  | return; | 
|  |  | 
|  | for (i = 0; i < up->size; i++) { | 
|  | int osd = up->osds[i]; | 
|  |  | 
|  | if (osd != CRUSH_ITEM_NONE && | 
|  | osdmap->osd_primary_affinity[osd] != | 
|  | CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { | 
|  | break; | 
|  | } | 
|  | } | 
|  | if (i == up->size) | 
|  | return; | 
|  |  | 
|  | /* | 
|  | * Pick the primary.  Feed both the seed (for the pg) and the | 
|  | * osd into the hash/rng so that a proportional fraction of an | 
|  | * osd's pgs get rejected as primary. | 
|  | */ | 
|  | for (i = 0; i < up->size; i++) { | 
|  | int osd = up->osds[i]; | 
|  | u32 aff; | 
|  |  | 
|  | if (osd == CRUSH_ITEM_NONE) | 
|  | continue; | 
|  |  | 
|  | aff = osdmap->osd_primary_affinity[osd]; | 
|  | if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && | 
|  | (crush_hash32_2(CRUSH_HASH_RJENKINS1, | 
|  | pps, osd) >> 16) >= aff) { | 
|  | /* | 
|  | * We chose not to use this primary.  Note it | 
|  | * anyway as a fallback in case we don't pick | 
|  | * anyone else, but keep looking. | 
|  | */ | 
|  | if (pos < 0) | 
|  | pos = i; | 
|  | } else { | 
|  | pos = i; | 
|  | break; | 
|  | } | 
|  | } | 
|  | if (pos < 0) | 
|  | return; | 
|  |  | 
|  | up->primary = up->osds[pos]; | 
|  |  | 
|  | if (ceph_can_shift_osds(pi) && pos > 0) { | 
|  | /* move the new primary to the front */ | 
|  | for (i = pos; i > 0; i--) | 
|  | up->osds[i] = up->osds[i - 1]; | 
|  | up->osds[0] = up->primary; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Get pg_temp and primary_temp mappings for given PG. | 
|  | * | 
|  | * Note that a PG may have none, only pg_temp, only primary_temp or | 
|  | * both pg_temp and primary_temp mappings.  This means @temp isn't | 
|  | * always a valid OSD set on return: in the "only primary_temp" case, | 
|  | * @temp will have its ->primary >= 0 but ->size == 0. | 
|  | */ | 
|  | static void get_temp_osds(struct ceph_osdmap *osdmap, | 
|  | struct ceph_pg_pool_info *pi, | 
|  | const struct ceph_pg *pgid, | 
|  | struct ceph_osds *temp) | 
|  | { | 
|  | struct ceph_pg_mapping *pg; | 
|  | int i; | 
|  |  | 
|  | ceph_osds_init(temp); | 
|  |  | 
|  | /* pg_temp? */ | 
|  | pg = lookup_pg_mapping(&osdmap->pg_temp, pgid); | 
|  | if (pg) { | 
|  | for (i = 0; i < pg->pg_temp.len; i++) { | 
|  | if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { | 
|  | if (ceph_can_shift_osds(pi)) | 
|  | continue; | 
|  |  | 
|  | temp->osds[temp->size++] = CRUSH_ITEM_NONE; | 
|  | } else { | 
|  | temp->osds[temp->size++] = pg->pg_temp.osds[i]; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* apply pg_temp's primary */ | 
|  | for (i = 0; i < temp->size; i++) { | 
|  | if (temp->osds[i] != CRUSH_ITEM_NONE) { | 
|  | temp->primary = temp->osds[i]; | 
|  | break; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /* primary_temp? */ | 
|  | pg = lookup_pg_mapping(&osdmap->primary_temp, pgid); | 
|  | if (pg) | 
|  | temp->primary = pg->primary_temp.osd; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Map a PG to its acting set as well as its up set. | 
|  | * | 
|  | * Acting set is used for data mapping purposes, while up set can be | 
|  | * recorded for detecting interval changes and deciding whether to | 
|  | * resend a request. | 
|  | */ | 
|  | void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, | 
|  | struct ceph_pg_pool_info *pi, | 
|  | const struct ceph_pg *raw_pgid, | 
|  | struct ceph_osds *up, | 
|  | struct ceph_osds *acting) | 
|  | { | 
|  | struct ceph_pg pgid; | 
|  | u32 pps; | 
|  |  | 
|  | WARN_ON(pi->id != raw_pgid->pool); | 
|  | raw_pg_to_pg(pi, raw_pgid, &pgid); | 
|  |  | 
|  | pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); | 
|  | apply_upmap(osdmap, &pgid, up); | 
|  | raw_to_up_osds(osdmap, pi, up); | 
|  | apply_primary_affinity(osdmap, pi, pps, up); | 
|  | get_temp_osds(osdmap, pi, &pgid, acting); | 
|  | if (!acting->size) { | 
|  | memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); | 
|  | acting->size = up->size; | 
|  | if (acting->primary == -1) | 
|  | acting->primary = up->primary; | 
|  | } | 
|  | WARN_ON(!osds_valid(up) || !osds_valid(acting)); | 
|  | } | 
|  |  | 
|  | bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, | 
|  | struct ceph_pg_pool_info *pi, | 
|  | const struct ceph_pg *raw_pgid, | 
|  | struct ceph_spg *spgid) | 
|  | { | 
|  | struct ceph_pg pgid; | 
|  | struct ceph_osds up, acting; | 
|  | int i; | 
|  |  | 
|  | WARN_ON(pi->id != raw_pgid->pool); | 
|  | raw_pg_to_pg(pi, raw_pgid, &pgid); | 
|  |  | 
|  | if (ceph_can_shift_osds(pi)) { | 
|  | spgid->pgid = pgid; /* struct */ | 
|  | spgid->shard = CEPH_SPG_NOSHARD; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting); | 
|  | for (i = 0; i < acting.size; i++) { | 
|  | if (acting.osds[i] == acting.primary) { | 
|  | spgid->pgid = pgid; /* struct */ | 
|  | spgid->shard = i; | 
|  | return true; | 
|  | } | 
|  | } | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Return acting primary for given PG, or -1 if none. | 
|  | */ | 
|  | int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, | 
|  | const struct ceph_pg *raw_pgid) | 
|  | { | 
|  | struct ceph_pg_pool_info *pi; | 
|  | struct ceph_osds up, acting; | 
|  |  | 
|  | pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); | 
|  | if (!pi) | 
|  | return -1; | 
|  |  | 
|  | ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting); | 
|  | return acting.primary; | 
|  | } | 
|  | EXPORT_SYMBOL(ceph_pg_to_acting_primary); |