diff --git a/MAINTAINERS b/MAINTAINERS index cf9c9221c3883..47875c792bde7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3671,7 +3671,6 @@ F: drivers/md/bcache/ BCACHEFS M: Kent Overstreet -R: Brian Foster L: linux-bcachefs@vger.kernel.org S: Supported C: irc://irc.oftc.net/bcache diff --git a/drivers/gpu/drm/ci/xfails/requirements.txt b/drivers/gpu/drm/ci/xfails/requirements.txt index e9994c9db799b..8ca68727a5883 100644 --- a/drivers/gpu/drm/ci/xfails/requirements.txt +++ b/drivers/gpu/drm/ci/xfails/requirements.txt @@ -2,7 +2,7 @@ git+https://gitlab.freedesktop.org/gfx-ci/ci-collate@09e7142715c16f54344ddf97013 termcolor==2.3.0 # ci-collate dependencies -certifi==2023.7.22 +certifi==2024.7.4 charset-normalizer==3.2.0 idna==3.4 pip==23.3 diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 66ca0bbee6394..0ab533a2b03be 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -29,10 +29,11 @@ bcachefs-y := \ clock.o \ compress.o \ darray.o \ + data_update.o \ debug.o \ dirent.o \ + disk_accounting.o \ disk_groups.o \ - data_update.o \ ec.o \ errcode.o \ error.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8dec2c6cbb7eb..8e8aed2a5f99a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -14,6 +14,7 @@ #include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "lru.h" @@ -29,7 +30,7 @@ #include #include -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); +static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); /* Persistent alloc info: */ @@ -267,27 +268,41 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, i == READ ? "read" : "write", a.v->io_time[i], LRU_TIME_MAX); + unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) > + offsetof(struct bch_alloc_v4, stripe_sectors) + ? 
a.v->stripe_sectors + : 0; + switch (a.v->data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe, + bkey_fsck_err_on(stripe_sectors || + a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe, c, err, alloc_key_empty_but_have_data, - "empty data type free but have data"); + "empty data type free but have data %u.%u.%u %u", + stripe_sectors, + a.v->dirty_sectors, + a.v->cached_sectors, + a.v->stripe); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), + bkey_fsck_err_on(!a.v->dirty_sectors && + !stripe_sectors, c, err, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", bch2_data_type_str(a.v->data_type)); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.v->cached_sectors || - bch2_bucket_sectors_dirty(*a.v) || + a.v->dirty_sectors || + stripe_sectors || a.v->stripe, c, err, alloc_key_cached_inconsistency, "data type inconsistency"); @@ -318,6 +333,7 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); a->fragmentation_lru = swab64(a->fragmentation_lru); + a->stripe_sectors = swab32(a->stripe_sectors); bps = alloc_v4_backpointers(a); for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { @@ -342,6 +358,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); + prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); prt_printf(out, "cached_sectors %u\n", a->cached_sectors); prt_printf(out, "stripe %u\n", a->stripe); prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); @@ -459,7 +476,8 @@ bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_i } __flatten -struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos) +struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos, + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); @@ -467,7 +485,7 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + ret = bch2_trans_update(trans, &iter, &a->k_i, flags); bch2_trans_iter_exit(trans, &iter); return unlikely(ret) ? 
ERR_PTR(ret) : a; } @@ -578,8 +596,6 @@ int bch2_alloc_read(struct bch_fs *c) struct bch_dev *ca = NULL; int ret; - down_read(&c->gc_lock); - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_prefetch, k, ({ @@ -628,7 +644,6 @@ int bch2_alloc_read(struct bch_fs *c) bch2_dev_put(ca); bch2_trans_put(trans); - up_read(&c->gc_lock); bch_err_fn(c, ret); return ret; @@ -743,6 +758,61 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, return ret; } +static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca, + enum bch_data_type data_type, + s64 delta_buckets, + s64 delta_sectors, + s64 delta_fragmented, unsigned flags) +{ + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_dev_data_type, + .dev_data_type.dev = ca->dev_idx, + .dev_data_type.data_type = data_type, + }; + s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; + + return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); +} + +int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, + const struct bch_alloc_v4 *old, + const struct bch_alloc_v4 *new, + unsigned flags) +{ + s64 old_sectors = bch2_bucket_sectors(*old); + s64 new_sectors = bch2_bucket_sectors(*new); + if (old->data_type != new->data_type) { + int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, + 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?: + bch2_dev_data_type_accounting_mod(trans, ca, old->data_type, + -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags); + if (ret) + return ret; + } else if (old_sectors != new_sectors) { + int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, + 0, + new_sectors - old_sectors, + bch2_bucket_sectors_fragmented(ca, *new) - + bch2_bucket_sectors_fragmented(ca, *old), flags); + if (ret) + return ret; + } + + s64 old_unstriped = bch2_bucket_sectors_unstriped(*old); + s64 new_unstriped = bch2_bucket_sectors_unstriped(*new); + if (old_unstriped != new_unstriped) { + int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped, + !!new_unstriped - !!old_unstriped, + new_unstriped - old_unstriped, + 0, + flags); + if (ret) + return ret; + } + + return 0; +} + int bch2_trigger_alloc(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, @@ -758,10 +828,9 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); + struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; if (flags & BTREE_TRIGGER_transactional) { - struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; - alloc_data_type_set(new_a, new_a->data_type); if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { @@ -818,22 +887,21 @@ int bch2_trigger_alloc(struct btree_trans *trans, goto err; } - /* - * need to know if we're getting called from the invalidate path or - * not: - */ - if ((flags & BTREE_TRIGGER_bucket_invalidate) && old_a->cached_sectors) { - ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, - -((s64) old_a->cached_sectors)); + ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, + -((s64) old_a->cached_sectors), + flags & BTREE_TRIGGER_gc); if (ret) goto err; } + + ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); + if (ret) + goto err; } if ((flags & 
BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { - struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; u64 journal_seq = trans->journal_res.seq; u64 bucket_journal_seq = new_a->journal_seq; @@ -862,26 +930,22 @@ int bch2_trigger_alloc(struct btree_trans *trans, c->journal.flushed_seq_ondisk, new.k->p.inode, new.k->p.offset, bucket_journal_seq); - if (ret) { - bch2_fs_fatal_error(c, - "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)); + if (bch2_fs_fatal_err_on(ret, c, + "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) goto err; - } } - percpu_down_read(&c->mark_lock); if (new_a->gen != old_a->gen) { + rcu_read_lock(); u8 *gen = bucket_gen(ca, new.k->p.offset); if (unlikely(!gen)) { - percpu_up_read(&c->mark_lock); + rcu_read_unlock(); goto invalid_bucket; } *gen = new_a->gen; + rcu_read_unlock(); } - bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); - percpu_up_read(&c->mark_lock); - #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) #define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) @@ -893,42 +957,27 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (statechange(a->data_type == BCH_DATA_need_discard) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) - bch2_discard_one_bucket_fast(c, new.k->p); + bch2_discard_one_bucket_fast(ca, new.k->p.offset); if (statechange(a->data_type == BCH_DATA_cached) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_gc_gens_async(c); } - if ((flags & BTREE_TRIGGER_gc) && - (flags & BTREE_TRIGGER_bucket_invalidate)) { - struct bch_alloc_v4 new_a_convert; - const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); - - percpu_down_read(&c->mark_lock); + if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { + rcu_read_lock(); struct bucket *g = gc_bucket(ca, new.k->p.offset); if (unlikely(!g)) { - percpu_up_read(&c->mark_lock); + rcu_read_unlock(); goto invalid_bucket; } g->gen_valid = 1; - - bucket_lock(g); - - g->gen_valid = 1; - g->gen = new_a->gen; - g->data_type = new_a->data_type; - g->stripe = new_a->stripe; - g->stripe_redundancy = new_a->stripe_redundancy; - g->dirty_sectors = new_a->dirty_sectors; - g->cached_sectors = new_a->cached_sectors; - - bucket_unlock(g); - percpu_up_read(&c->mark_lock); + g->gen = new_a->gen; + rcu_read_unlock(); } err: printbuf_exit(&buf); @@ -1062,7 +1111,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); if (fsck_err_on(!ca, - c, alloc_key_to_missing_dev_bucket, + trans, alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) ret = bch2_btree_delete_at(trans, alloc_iter, 0); @@ -1082,7 +1131,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; if (fsck_err_on(k.k->type != discard_key_type, - c, need_discard_key_wrong, + trans, need_discard_key_wrong, "incorrect key in need_discard btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], @@ -1112,7 +1161,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; if (fsck_err_on(k.k->type != freespace_key_type, - c, freespace_key_wrong, + trans, 
freespace_key_wrong, "incorrect key in freespace btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], @@ -1143,7 +1192,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), - c, bucket_gens_key_wrong, + trans, bucket_gens_key_wrong, "incorrect gen in bucket_gens btree (got %u should be %u)\n" " %s", alloc_gen(k, gens_offset), a->gen, @@ -1184,7 +1233,6 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, struct bpos *end, struct btree_iter *freespace_iter) { - struct bch_fs *c = trans->c; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; @@ -1202,7 +1250,7 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, *end = bkey_min(k.k->p, *end); if (fsck_err_on(k.k->type != KEY_TYPE_set, - c, freespace_hole_missing, + trans, freespace_hole_missing, "hole in alloc btree missing in freespace btree\n" " device %llu buckets %llu-%llu", freespace_iter->pos.inode, @@ -1238,7 +1286,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, struct bpos *end, struct btree_iter *bucket_gens_iter) { - struct bch_fs *c = trans->c; struct bkey_s_c k; struct printbuf buf = PRINTBUF; unsigned i, gens_offset, gens_end_offset; @@ -1262,7 +1309,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, bkey_reassemble(&g.k_i, k); for (i = gens_offset; i < gens_end_offset; i++) { - if (fsck_err_on(g.v.gens[i], c, + if (fsck_err_on(g.v.gens[i], trans, bucket_gens_hole_wrong, "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", bucket_gens_pos_to_alloc(k.k->p, i).inode, @@ -1320,8 +1367,8 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran if (ret) return ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, - need_discard_freespace_key_to_invalid_dev_bucket, + if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), + trans, need_discard_freespace_key_to_invalid_dev_bucket, "entry in %s btree for nonexistant dev:bucket %llu:%llu", bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) goto delete; @@ -1330,8 +1377,8 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran if (fsck_err_on(a->data_type != state || (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(*a)), c, - need_discard_freespace_key_bad, + genbits != alloc_freespace_genbits(*a)), + trans, need_discard_freespace_key_bad, "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_id_str(iter->btree_id), @@ -1378,7 +1425,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); if (!ca) { - if (fsck_err(c, bucket_gens_to_invalid_dev, + if (fsck_err(trans, bucket_gens_to_invalid_dev, "bucket_gens key for invalid device:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, 0); @@ -1386,8 +1433,8 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } if (fsck_err_on(end <= ca->mi.first_bucket || - start >= ca->mi.nbuckets, c, - bucket_gens_to_invalid_buckets, + start >= ca->mi.nbuckets, + trans, bucket_gens_to_invalid_buckets, "bucket_gens key for invalid buckets:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); @@ -1395,16 +1442,16 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } for (b = start; b < ca->mi.first_bucket; b++) - if (fsck_err_on(g.v.gens[b & 
KEY_TYPE_BUCKET_GENS_MASK], c, - bucket_gens_nonzero_for_invalid_buckets, + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], + trans, bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; } for (b = ca->mi.nbuckets; b < end; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, - bucket_gens_nonzero_for_invalid_buckets, + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], + trans, bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; @@ -1576,8 +1623,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (a->data_type != BCH_DATA_cached) return 0; - if (fsck_err_on(!a->io_time[READ], c, - alloc_key_cached_but_read_time_zero, + if (fsck_err_on(!a->io_time[READ], + trans, alloc_key_cached_but_read_time_zero, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), @@ -1605,8 +1652,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) return ret; - if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, - alloc_key_to_missing_lru_entry, + if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, + trans, alloc_key_to_missing_lru_entry, "missing lru entry\n" " %s", (printbuf_reset(&buf), @@ -1636,34 +1683,38 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } -static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) +static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) { int ret; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { ret = -BCH_ERR_EEXIST_discard_in_flight_add; goto out; } - ret = darray_push(&c->discard_buckets_in_flight, bucket); + ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { + .in_progress = in_progress, + .bucket = bucket, + })); out: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); return ret; } -static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) +static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { - darray_remove_item(&c->discard_buckets_in_flight, i); + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { + BUG_ON(!i->in_progress); + darray_remove_item(&ca->discard_buckets_in_flight, i); goto found; } BUG(); found: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); } struct discard_buckets_state { @@ -1671,26 +1722,11 @@ struct discard_buckets_state { u64 open; u64 need_journal_commit; u64 discarded; - struct bch_dev *ca; u64 need_journal_commit_this_dev; }; -static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) -{ - if (s->ca == ca) - return; - - if (s->ca && s->need_journal_commit_this_dev > - bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) - bch2_journal_flush_async(&c->journal, NULL); - - if (s->ca) - percpu_ref_put(&s->ca->io_ref); - s->ca = ca; - s->need_journal_commit_this_dev = 
0; -} - static int bch2_discard_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, struct discard_buckets_state *s) @@ -1704,16 +1740,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, bool discard_locked = false; int ret = 0; - struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode - ? s->ca - : bch2_dev_get_ioref(c, pos.inode, WRITE); - if (!ca) { - bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); - return 0; - } - - discard_buckets_next_dev(c, s, ca); - if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; goto out; @@ -1773,7 +1799,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) + if (discard_in_flight_add(ca, iter.pos.offset, true)) goto out; discard_locked = true; @@ -1811,7 +1837,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, s->discarded++; out: if (discard_locked) - discard_in_flight_remove(c, iter.pos); + discard_in_flight_remove(ca, iter.pos.offset); s->seen++; bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -1820,7 +1846,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, static void bch2_do_discards_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); + struct bch_fs *c = ca->fs; struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1831,23 +1858,41 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - - discard_buckets_next_dev(c, &s, NULL); + for_each_btree_key_upto(trans, iter, + BTREE_ID_need_discard, + POS(ca->dev_idx, 0), + POS(ca->dev_idx, U64_MAX), 0, k, + bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_write_ref_put(c, BCH_WRITE_REF_discard); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_discards(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_discards(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && - !queue_work(c->write_ref_wq, &c->discard_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + for_each_member_device(c, ca) + bch2_dev_do_discards(ca); } static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) @@ -1876,68 +1921,69 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo static void bch2_do_discards_fast_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); + struct bch_fs *c = ca->fs; while (1) { bool got_bucket = false; - struct bpos bucket; - struct bch_dev *ca; - - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) 
{ - if (i->snapshot) - continue; + u64 bucket; - ca = bch2_dev_get_ioref(c, i->inode, WRITE); - if (!ca) { - darray_remove_item(&c->discard_buckets_in_flight, i); + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) { + if (i->in_progress) continue; - } got_bucket = true; - bucket = *i; - i->snapshot = true; + bucket = i->bucket; + i->in_progress = true; break; } - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); if (!got_bucket) break; if (ca->mi.discard && !c->opts.nochanges) blkdev_issue_discard(ca->disk_sb.bdev, - bucket.offset * ca->mi.bucket_size, + bucket_to_sector(ca, bucket), ca->mi.bucket_size, GFP_KERNEL); int ret = bch2_trans_do(c, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, - bch2_clear_bucket_needs_discard(trans, bucket)); + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, + bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); bch_err_fn(c, ret); - percpu_ref_put(&ca->io_ref); - discard_in_flight_remove(c, bucket); + discard_in_flight_remove(ca, bucket); if (ret) break; } bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + percpu_ref_put(&ca->io_ref); } -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) +static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) { - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode); - bool dead = !ca || percpu_ref_is_dying(&ca->io_ref); - rcu_read_unlock(); + struct bch_fs *c = ca->fs; + + if (discard_in_flight_add(ca, bucket, false)) + return; - if (!dead && - !discard_in_flight_add(c, bucket) && - bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && - !queue_work(c->write_ref_wq, &c->discard_fast_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +put_ioref: + percpu_ref_put(&ca->io_ref); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -1963,7 +2009,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - a = bch2_trans_start_alloc_update(trans, bucket); + a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; @@ -1984,6 +2030,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, a->v.gen++; a->v.data_type = 0; a->v.dirty_sectors = 0; + a->v.stripe_sectors = 0; a->v.cached_sectors = 0; a->v.io_time[READ] = bch2_current_io_time(c, READ); a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); @@ -2038,7 +2085,8 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter static void bch2_do_invalidates_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); + struct bch_fs *c = ca->fs; struct btree_trans *trans = bch2_trans_get(c); int ret = 0; @@ -2046,52 +2094,63 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (ret) goto err; - for_each_member_device(c, ca) { - s64 nr_to_invalidate = - should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - struct btree_iter iter; - bool wrapped = false; - - bch2_trans_iter_init(trans, 
&iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, - ((bch2_current_io_time(c, READ) + U32_MAX) & - LRU_TIME_MAX)), 0); - - while (true) { - bch2_trans_begin(trans); - - struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - if (!k.k) - break; + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + struct btree_iter iter; + bool wrapped = false; - ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); - if (ret) - break; + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, + ((bch2_current_io_time(c, READ) + U32_MAX) & + LRU_TIME_MAX)), 0); - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); + while (true) { + bch2_trans_begin(trans); - if (ret < 0) { - bch2_dev_put(ca); + struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) break; - } + if (!k.k) + break; + + ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + if (ret) + break; + + bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_invalidates(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->invalidate_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_invalidates(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && - !queue_work(c->write_ref_wq, &c->invalidate_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + for_each_member_device(c, ca) + bch2_dev_do_invalidates(ca); } int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, @@ -2407,16 +2466,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_fs_allocator_background_exit(struct bch_fs *c) +void bch2_dev_allocator_background_exit(struct bch_dev *ca) +{ + darray_exit(&ca->discard_buckets_in_flight); +} + +void bch2_dev_allocator_background_init(struct bch_dev *ca) { - darray_exit(&c->discard_buckets_in_flight); + mutex_init(&ca->discard_buckets_in_flight_lock); + INIT_WORK(&ca->discard_work, bch2_do_discards_work); + INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work); + INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work); } void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - mutex_init(&c->discard_buckets_in_flight_lock); - INIT_WORK(&c->discard_work, bch2_do_discards_work); - INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); - INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index c3cc3c5ba5b63..8d2b62c9588e7 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -41,6 +41,7 @@ static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src) { dst->gen = src.gen; dst->data_type = src.data_type; + dst->stripe_sectors = src.stripe_sectors; dst->dirty_sectors = src.dirty_sectors; dst->cached_sectors = src.cached_sectors; dst->stripe = 
src.stripe; @@ -50,6 +51,7 @@ static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket s { dst->gen = src.gen; dst->data_type = src.data_type; + dst->stripe_sectors = src.stripe_sectors; dst->dirty_sectors = src.dirty_sectors; dst->cached_sectors = src.cached_sectors; dst->stripe = src.stripe; @@ -80,30 +82,49 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, bucket_data_type(bucket) != bucket_data_type(ptr); } -static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a) +static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) { - return a.dirty_sectors + a.cached_sectors; + return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; } -static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) +static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) { - return a.dirty_sectors; + return a.stripe_sectors + a.dirty_sectors; } -static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, +static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a) +{ + return a.data_type == BCH_DATA_cached + ? a.cached_sectors + : bch2_bucket_sectors_dirty(a); +} + +static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca, struct bch_alloc_v4 a) { - int d = bch2_bucket_sectors_dirty(a); + int d = bch2_bucket_sectors(a); + + return d ? max(0, ca->mi.bucket_size - d) : 0; +} + +static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a) +{ + int d = a.stripe_sectors + a.dirty_sectors; return d ? max(0, ca->mi.bucket_size - d) : 0; } +static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a) +{ + return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0; +} + static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, enum bch_data_type data_type) { if (a.stripe) return data_type == BCH_DATA_parity ? 
data_type : BCH_DATA_stripe; - if (a.dirty_sectors) + if (bch2_bucket_sectors_dirty(a)) return data_type; if (a.cached_sectors) return BCH_DATA_cached; @@ -185,7 +206,8 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *, struct bpos); +bch2_trans_start_alloc_update(struct btree_trans *, struct bpos, + enum btree_iter_update_trigger_flags); void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); @@ -270,11 +292,15 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); +int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, + const struct bch_alloc_v4 *, + const struct bch_alloc_v4 *, unsigned); int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); +void bch2_dev_do_discards(struct bch_dev *); void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, @@ -289,6 +315,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } +void bch2_dev_do_invalidates(struct bch_dev *); void bch2_do_invalidates(struct bch_fs *); static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) @@ -312,7 +339,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -void bch2_fs_allocator_background_exit(struct bch_fs *); +void bch2_dev_allocator_background_exit(struct bch_dev *); +void bch2_dev_allocator_background_init(struct bch_dev *); + void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h index b4ec20be93b86..47d9d006502cb 100644 --- a/fs/bcachefs/alloc_background_format.h +++ b/fs/bcachefs/alloc_background_format.h @@ -70,6 +70,8 @@ struct bch_alloc_v4 { __u32 stripe; __u32 nr_external_backpointers; __u64 fragmentation_lru; + __u32 stripe_sectors; + __u32 pad; } __packed __aligned(8); #define BCH_ALLOC_V4_U64s_V0 6 diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 927a5f300b30e..73228b25da6fa 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -621,13 +621,13 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, avail = dev_buckets_free(ca, *usage, watermark); if (usage->d[BCH_DATA_need_discard].buckets > avail) - bch2_do_discards(c); + bch2_dev_do_discards(ca); if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) bch2_gc_gens_async(c); if (should_invalidate_buckets(ca, *usage)) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (!avail) { if (cl && !waiting) { @@ -1705,15 +1705,13 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) printbuf_tabstop_push(out, 24); - percpu_down_read(&c->mark_lock); - prt_printf(out, "hidden\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.hidden)); - prt_printf(out, "btree\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.btree)); - prt_printf(out, "data\t%llu\n", 
bch2_fs_usage_read_one(c, &c->usage_base->b.data)); - prt_printf(out, "cached\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.cached)); - prt_printf(out, "reserved\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.reserved)); - prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); - prt_printf(out, "nr_inodes\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes)); - percpu_up_read(&c->mark_lock); + prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden)); + prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree)); + prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data)); + prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached)); + prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved)); + prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); + prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes)); prt_newline(out); prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty"); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 4321f9fb73bd9..ca16fa5de7c8f 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -395,7 +395,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ struct bpos bucket; if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { - if (fsck_err(c, backpointer_to_missing_device, + if (fsck_err(trans, backpointer_to_missing_device, "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, bp_iter, 0); @@ -407,8 +407,8 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ if (ret) goto out; - if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, - backpointer_to_missing_alloc, + if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, + trans, backpointer_to_missing_alloc, "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", alloc_iter.pos.inode, alloc_iter.pos.offset, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -512,7 +512,7 @@ static int check_extent_checksum(struct btree_trans *trans, struct nonce nonce = extent_nonce(extent.k->version, p.crc); struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), - c, dup_backpointer_to_bad_csum_extent, + trans, dup_backpointer_to_bad_csum_extent, "%s", buf.buf)) ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; fsck_err: @@ -671,7 +671,7 @@ static int check_bp_exists(struct btree_trans *trans, prt_printf(&buf, "\n want: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); - if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) + if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); goto out; @@ -932,8 +932,8 @@ static int check_one_backpointer(struct btree_trans *trans, goto out; } - if (fsck_err_on(!k.k, c, - backpointer_to_missing_ptr, + if (fsck_err_on(!k.k, + trans, backpointer_to_missing_ptr, "backpointer for missing %s\n %s", bp.v->level ? 
"btree node" : "extent", (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a6b83ecab7ce5..372bc339c05db 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -205,6 +205,7 @@ #include #include "bcachefs_format.h" +#include "disk_accounting_types.h" #include "errcode.h" #include "fifo.h" #include "nocow_locking_types.h" @@ -266,6 +267,8 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") +void bch2_print_str(struct bch_fs *, const char *); + __printf(2, 3) void bch2_print_opts(struct bch_opts *, const char *, ...); @@ -493,6 +496,11 @@ struct io_count { u64 sectors[2][BCH_DATA_NR]; }; +struct discard_in_flight { + bool in_progress:1; + u64 bucket:63; +}; + struct bch_dev { struct kobject kobj; #ifdef CONFIG_BCACHEFS_DEBUG @@ -530,8 +538,8 @@ struct bch_dev { /* * Buckets: * Per-bucket arrays are protected by c->mark_lock, bucket_lock and - * gc_lock, for device resize - holding any is sufficient for access: - * Or rcu_read_lock(), but only for dev_ptr_stale(): + * gc_gens_lock, for device resize - holding any is sufficient for + * access: Or rcu_read_lock(), but only for dev_ptr_stale(): */ struct bucket_array __rcu *buckets_gc; struct bucket_gens __rcu *bucket_gens; @@ -539,9 +547,7 @@ struct bch_dev { unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; - struct bch_dev_usage *usage_base; - struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; - struct bch_dev_usage __percpu *usage_gc; + struct bch_dev_usage __percpu *usage; /* Allocator: */ u64 new_fs_bucket_idx; @@ -554,6 +560,12 @@ struct bch_dev { size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; + struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct discard_in_flight) discard_buckets_in_flight; + struct work_struct discard_fast_work; + atomic64_t rebalance_work; struct journal_device journal; @@ -581,6 +593,8 @@ struct bch_dev { #define BCH_FS_FLAGS() \ x(new_fs) \ x(started) \ + x(btree_running) \ + x(accounting_replay_done) \ x(may_go_rw) \ x(rw) \ x(was_rw) \ @@ -659,8 +673,6 @@ struct btree_trans_buf { struct btree_trans *trans; }; -#define REPLICAS_DELTA_LIST_MAX (1U << 16) - #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) @@ -730,15 +742,14 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; + struct bch_accounting_mem accounting; + struct bch_replicas_cpu replicas; struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; - mempool_t replicas_delta_pool; struct journal_entry_res btree_root_journal_res; - struct journal_entry_res replicas_journal_res; struct journal_entry_res clock_journal_res; - struct journal_entry_res dev_usage_journal_res; struct bch_disk_groups_cpu __rcu *disk_groups; @@ -878,15 +889,9 @@ struct bch_fs { struct percpu_rw_semaphore mark_lock; seqcount_t usage_lock; - struct bch_fs_usage *usage_base; - struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; - struct bch_fs_usage __percpu *usage_gc; + struct bch_fs_usage_base __percpu *usage; u64 __percpu *online_reserved; - /* single element mempool: */ - struct mutex usage_scratch_lock; - struct bch_fs_usage_online *usage_scratch; - struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ @@ -915,11 +920,6 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct invalidate_work; - struct work_struct discard_work; - 
struct mutex discard_buckets_in_flight_lock; - DARRAY(struct bpos) discard_buckets_in_flight; - struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ struct work_struct gc_gens_work; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e3b1bde489c3b..74a60b1a4ddfa 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -417,7 +417,8 @@ static inline void bkey_init(struct bkey *k) x(bucket_gens, 30) \ x(snapshot_tree, 31) \ x(logged_op_truncate, 32) \ - x(logged_op_finsert, 33) + x(logged_op_finsert, 33) \ + x(accounting, 34) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -467,18 +468,6 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); -/* LRU btree: */ - -struct bch_lru { - struct bch_val v; - __le64 idx; -} __packed __aligned(8); - -#define LRU_ID_STRIPES (1U << 16) - -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -505,6 +494,9 @@ struct bch_sb_field { x(downgrade, 14) #include "alloc_background_format.h" +#include "dirent_format.h" +#include "disk_accounting_format.h" +#include "disk_groups_format.h" #include "extents_format.h" #include "ec_format.h" #include "dirent_format.h" @@ -512,6 +504,7 @@ struct bch_sb_field { #include "inode_format.h" #include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" +#include "lru_format.h" #include "quota_format.h" #include "reflink_format.h" #include "replicas_format.h" @@ -602,48 +595,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -#define BCH_DATA_TYPES() \ - x(free, 0) \ - x(sb, 1) \ - x(journal, 2) \ - x(btree, 3) \ - x(user, 4) \ - x(cached, 5) \ - x(parity, 6) \ - x(stripe, 7) \ - x(need_gc_gens, 8) \ - x(need_discard, 9) - -enum bch_data_type { -#define x(t, n) BCH_DATA_##t, - BCH_DATA_TYPES() -#undef x - BCH_DATA_NR -}; - -static inline bool data_type_is_empty(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - return true; - default: - return false; - } -} - -static inline bool data_type_is_hidden(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_sb: - case BCH_DATA_journal: - return true; - default: - return false; - } -} - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: @@ -722,7 +673,9 @@ struct bch_sb_field_ext { x(member_seq, BCH_VERSION(1, 4)) \ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ x(btree_subvolume_children, BCH_VERSION(1, 6)) \ - x(mi_btree_bitmap, BCH_VERSION(1, 7)) + x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ + x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ + x(disk_accounting_v2, BCH_VERSION(1, 9)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1174,7 +1127,6 @@ static inline bool jset_entry_is_key(struct jset_entry *e) switch (e->type) { case BCH_JSET_ENTRY_btree_keys: case BCH_JSET_ENTRY_btree_root: - case BCH_JSET_ENTRY_overwrite: case BCH_JSET_ENTRY_write_buffer_keys: return true; } @@ -1375,7 +1327,9 @@ enum btree_id_flags { x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ x(subvolume_children, 19, 0, \ - BIT_ULL(KEY_TYPE_set)) + BIT_ULL(KEY_TYPE_set)) \ + x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \ + 
BIT_ULL(KEY_TYPE_accounting)) \ enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 4b8fba754b1c1..3c23bdf788cea 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -5,6 +5,7 @@ #include #include #include "bcachefs_format.h" +#include "bkey_types.h" /* * Flags common to multiple ioctls: @@ -85,6 +86,7 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) +#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) /* ioctl below act on a particular file, not the filesystem as a whole: */ @@ -251,12 +253,18 @@ struct bch_replicas_usage { struct bch_replicas_entry_v1 r; } __packed; +static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u) +{ + return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r); +} + static inline struct bch_replicas_usage * replicas_usage_next(struct bch_replicas_usage *u) { - return (void *) u + replicas_entry_bytes(&u->r) + 8; + return (void *) u + replicas_usage_bytes(u); } +/* Obsolete */ /* * BCH_IOCTL_FS_USAGE: query filesystem disk space usage * @@ -282,6 +290,7 @@ struct bch_ioctl_fs_usage { struct bch_replicas_usage replicas[]; }; +/* Obsolete */ /* * BCH_IOCTL_DEV_USAGE: query device disk space usage * @@ -306,6 +315,7 @@ struct bch_ioctl_dev_usage { } d[10]; }; +/* Obsolete */ struct bch_ioctl_dev_usage_v2 { __u64 dev; __u32 flags; @@ -409,4 +419,28 @@ struct bch_ioctl_fsck_online { __u64 opts; /* string */ }; +/* + * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting + * + * Returns disk space usage broken out by data type, number of replicas, and + * by component device + * + * @replica_entries_bytes - size, in bytes, allocated for replica usage entries + * + * On success, @replica_entries_bytes will be changed to indicate the number of + * bytes actually used. 
+ * + * Returns -ERANGE if @replica_entries_bytes was too small + */ +struct bch_ioctl_query_accounting { + __u64 capacity; + __u64 used; + __u64 online_reserved; + + __u32 accounting_u64s; /* input parameter */ + __u32 accounting_types_mask; /* input parameter */ + + struct bkey_i_accounting accounting[]; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index bd32aac051921..5f07cf853d0c7 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -7,6 +7,7 @@ #include "btree_types.h" #include "alloc_background.h" #include "dirent.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "extents.h" diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 4f5e411771ba1..f5d85b50b6f2f 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -602,8 +602,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure struct btree_cache *bc = &c->btree_cache; struct task_struct *old; - old = cmpxchg(&bc->alloc_lock, NULL, current); - if (old == NULL || old == current) + old = NULL; + if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) goto success; if (!cl) { @@ -614,8 +614,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure closure_wait(&bc->alloc_wait, cl); /* Try again, after adding ourselves to waitlist */ - old = cmpxchg(&bc->alloc_lock, NULL, current); - if (old == NULL || old == current) { + old = NULL; + if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) { /* We raced */ closure_wake_up(&bc->alloc_wait); goto success; @@ -1257,6 +1257,14 @@ const char *bch2_btree_id_str(enum btree_id btree) return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; } +void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) +{ + if (btree < BTREE_ID_NR) + prt_str(out, __bch2_btree_ids[btree]); + else + prt_printf(out, "(unknown btree %u)", btree); +} + void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { prt_printf(out, "%s level %u/%u\n ", diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index fed35de3e4de7..c0eb87a057ccb 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -132,6 +132,8 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) } const char *bch2_btree_id_str(enum btree_id); +void bch2_btree_id_to_text(struct printbuf *, enum btree_id); + void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0e477a926579a..2e9ccb2071ff7 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -20,6 +20,7 @@ #include "buckets.h" #include "clock.h" #include "debug.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "extents.h" @@ -44,6 +45,22 @@ #define DROP_PREV_NODE 11 #define DID_FILL_FROM_SCAN 12 +static const char * const bch2_gc_phase_strs[] = { +#define x(n) #n, + GC_PHASES() +#undef x + NULL +}; + +void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) +{ + prt_str(out, bch2_gc_phase_strs[p->phase]); + prt_char(out, ' '); + bch2_btree_id_to_text(out, p->btree); + prt_printf(out, " l=%u ", p->level); + bch2_bpos_to_text(out, p->pos); +} + static struct bkey_s 
unsafe_bkey_s_c_to_s(struct bkey_s_c k) { return (struct bkey_s) {{{ @@ -174,10 +191,11 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) return 0; } -static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b, +static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b, struct btree *prev, struct btree *cur, struct bpos *pulled_from_scan) { + struct bch_fs *c = trans->c; struct bpos expected_start = !prev ? b->data->min_key : bpos_successor(prev->key.k.p); @@ -215,29 +233,29 @@ static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b, *pulled_from_scan = cur->data->min_key; ret = DID_FILL_FROM_SCAN; } else { - if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, "btree node with incorrect min_key%s", buf.buf)) ret = set_node_min(c, cur, expected_start); } } else { /* overlap */ if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ - if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node, + if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node, "btree node overwritten by next node%s", buf.buf)) ret = DROP_PREV_NODE; } else { - if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, "btree node with incorrect max_key%s", buf.buf)) ret = set_node_max(c, prev, bpos_predecessor(cur->data->min_key)); } } else { if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */ - if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node, + if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node, "btree node overwritten by prev node%s", buf.buf)) ret = DROP_THIS_NODE; } else { - if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, "btree node with incorrect min_key%s", buf.buf)) ret = set_node_min(c, cur, expected_start); } @@ -249,9 +267,10 @@ static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b, return ret; } -static int btree_repair_node_end(struct bch_fs *c, struct btree *b, +static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, struct btree *child, struct bpos *pulled_from_scan) { + struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; @@ -265,7 +284,7 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b, prt_str(&buf, "\n child: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); - if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, "btree node with incorrect max_key%s", buf.buf)) { if (b->c.level == 1 && bpos_lt(*pulled_from_scan, b->key.k.p)) { @@ -324,8 +343,8 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c, - btree_node_unreadable, + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), + trans, btree_node_unreadable, "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", bch2_btree_id_str(b->c.btree_id), @@ -362,7 +381,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct continue; } - ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan); + ret = 
btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan); if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; @@ -403,7 +422,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(c, b, prev, pulled_from_scan); + ret = btree_repair_node_end(trans, b, prev, pulled_from_scan); if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; @@ -461,8 +480,8 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - if (mustfix_fsck_err_on(!have_child, c, - btree_node_topology_interior_node_empty, + if (mustfix_fsck_err_on(!have_child, + trans, btree_node_topology_interior_node_empty, "empty interior btree node at btree %s level %u\n" " %s", bch2_btree_id_str(b->c.btree_id), @@ -509,7 +528,7 @@ int bch2_check_topology(struct bch_fs *c) r->error = 0; if (!bch2_btree_has_scanned_nodes(c, i)) { - mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing, + mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing, "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); bch2_btree_root_alloc_fake_trans(trans, i, 0); } else { @@ -583,8 +602,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, BUG_ON(bch2_journal_seq_verify && k.k->version.lo > atomic64_read(&c->journal.seq)); - if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, - bkey_version_in_future, + if (fsck_err_on(btree_id != BTREE_ID_accounting && + k.k->version.lo > atomic64_read(&c->key_version), + trans, bkey_version_in_future, "key version number higher than recorded %llu\n %s", atomic64_read(&c->key_version), (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) @@ -592,7 +612,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, } if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), - c, btree_bitmap_not_marked, + trans, btree_bitmap_not_marked, "btree ptr not marked in member info btree allocated bitmap\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), @@ -622,7 +642,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, } ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), - BTREE_TRIGGER_gc|flags); + BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags); out: fsck_err: printbuf_exit(&buf); @@ -633,29 +653,14 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) { struct bch_fs *c = trans->c; - int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 0 : 1; + unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 
0 : 1; int ret = 0; /* We need to make sure every leaf node is readable before going RW */ if (initial) target_depth = 0; - /* root */ - mutex_lock(&c->btree_root_lock); - struct btree *b = bch2_btree_id_root(c, btree)->b; - if (!btree_node_fake(b)) { - gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); - ret = lockrestart_do(trans, - bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, - NULL, NULL, bkey_i_to_s_c(&b->key), initial)); - level = b->c.level; - } - mutex_unlock(&c->btree_root_lock); - - if (ret) - return ret; - - for (; level >= target_depth; --level) { + for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) { struct btree *prev = NULL; struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level, @@ -666,9 +671,21 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); })); if (ret) - break; + goto err; } + /* root */ + mutex_lock(&c->btree_root_lock); + struct btree *b = bch2_btree_id_root(c, btree)->b; + if (!btree_node_fake(b)) { + gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); + ret = lockrestart_do(trans, + bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, + NULL, NULL, bkey_i_to_s_c(&b->key), initial)); + } + mutex_unlock(&c->btree_root_lock); +err: + bch_err_fn(c, ret); return ret; } @@ -697,7 +714,7 @@ static int bch2_gc_btrees(struct bch_fs *c) ret = bch2_gc_btree(trans, btree, true); if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - c, btree_node_read_error, + trans, btree_node_read_error, "btree node read error for %s", bch2_btree_id_str(btree))) ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); @@ -720,131 +737,25 @@ static int bch2_mark_superblocks(struct bch_fs *c) static void bch2_gc_free(struct bch_fs *c) { + bch2_accounting_gc_free(c); + genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); for_each_member_device(c, ca) { kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); ca->buckets_gc = NULL; - - free_percpu(ca->usage_gc); - ca->usage_gc = NULL; - } - - free_percpu(c->usage_gc); - c->usage_gc = NULL; -} - -static int bch2_gc_done(struct bch_fs *c) -{ - struct bch_dev *ca = NULL; - struct printbuf buf = PRINTBUF; - unsigned i; - int ret = 0; - - percpu_down_write(&c->mark_lock); - -#define copy_field(_err, _f, _msg, ...) \ - if (fsck_err_on(dst->_f != src->_f, c, _err, \ - _msg ": got %llu, should be %llu" , ##__VA_ARGS__, \ - dst->_f, src->_f)) \ - dst->_f = src->_f -#define copy_dev_field(_err, _f, _msg, ...) \ - copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__) -#define copy_fs_field(_err, _f, _msg, ...) 
\ - copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) - - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - - __for_each_member_device(c, ca) { - struct bch_dev_usage *dst = ca->usage_base; - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, - dev_usage_u64s()); - - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(dev_usage_buckets_wrong, - d[i].buckets, "%s buckets", bch2_data_type_str(i)); - copy_dev_field(dev_usage_sectors_wrong, - d[i].sectors, "%s sectors", bch2_data_type_str(i)); - copy_dev_field(dev_usage_fragmented_wrong, - d[i].fragmented, "%s fragmented", bch2_data_type_str(i)); - } - } - - { - unsigned nr = fs_usage_u64s(c); - struct bch_fs_usage *dst = c->usage_base; - struct bch_fs_usage *src = (void *) - bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); - - copy_fs_field(fs_usage_hidden_wrong, - b.hidden, "hidden"); - copy_fs_field(fs_usage_btree_wrong, - b.btree, "btree"); - - copy_fs_field(fs_usage_data_wrong, - b.data, "data"); - copy_fs_field(fs_usage_cached_wrong, - b.cached, "cached"); - copy_fs_field(fs_usage_reserved_wrong, - b.reserved, "reserved"); - copy_fs_field(fs_usage_nr_inodes_wrong, - b.nr_inodes,"nr_inodes"); - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(fs_usage_persistent_reserved_wrong, - persistent_reserved[i], - "persistent_reserved[%i]", i); - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - printbuf_reset(&buf); - bch2_replicas_entry_to_text(&buf, e); - - copy_fs_field(fs_usage_replicas_wrong, - replicas[i], "%s", buf.buf); - } } - -#undef copy_fs_field -#undef copy_dev_field -#undef copy_stripe_field -#undef copy_field -fsck_err: - bch2_dev_put(ca); - bch_err_fn(c, ret); - percpu_up_write(&c->mark_lock); - printbuf_exit(&buf); - return ret; } static int bch2_gc_start(struct bch_fs *c) { - BUG_ON(c->usage_gc); - - c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!c->usage_gc) { - bch_err(c, "error allocating c->usage_gc"); - return -BCH_ERR_ENOMEM_gc_start; - } - for_each_member_device(c, ca) { - BUG_ON(ca->usage_gc); - - ca->usage_gc = alloc_percpu(struct bch_dev_usage); - if (!ca->usage_gc) { - bch_err(c, "error allocating ca->usage_gc"); + int ret = bch2_dev_usage_init(ca, true); + if (ret) { bch2_dev_put(ca); - return -BCH_ERR_ENOMEM_gc_start; + return ret; } - - this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, - ca->mi.nbuckets - ca->mi.first_bucket); } return 0; @@ -858,6 +769,7 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, l.oldest_gen != r.oldest_gen || l.data_type != r.data_type || l.dirty_sectors != r.dirty_sectors || + l.stripe_sectors != r.stripe_sectors || l.cached_sectors != r.cached_sectors || l.stripe_redundancy != r.stripe_redundancy || l.stripe != r.stripe; @@ -891,6 +803,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gc.data_type = old->data_type; gc.dirty_sectors = old->dirty_sectors; } + percpu_up_read(&c->mark_lock); /* * gc.data_type doesn't yet include need_discard & need_gc_gen states - @@ -899,12 +812,14 @@ static int bch2_alloc_write_key(struct btree_trans *trans, alloc_data_type_set(&gc, gc.data_type); if (gc.data_type != old_gc.data_type || - gc.dirty_sectors != old_gc.dirty_sectors) - bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true); - percpu_up_read(&c->mark_lock); + gc.dirty_sectors != old_gc.dirty_sectors) { + ret = bch2_alloc_key_to_dev_counters(trans, ca, 
&old_gc, &gc, BTREE_TRIGGER_gc); + if (ret) + return ret; + } - if (fsck_err_on(new.data_type != gc.data_type, c, - alloc_key_data_type_wrong, + if (fsck_err_on(new.data_type != gc.data_type, + trans, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" ": got %s, should be %s", iter->pos.inode, iter->pos.offset, @@ -914,7 +829,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ - if (fsck_err_on(new._f != gc._f, c, _errtype, \ + if (fsck_err_on(new._f != gc._f, \ + trans, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ @@ -927,6 +843,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gen); copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors); + copy_bucket_field(alloc_key_stripe_sectors_wrong, + stripe_sectors); copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors); copy_bucket_field(alloc_key_stripe_wrong, @@ -981,14 +899,16 @@ static int bch2_gc_alloc_done(struct bch_fs *c) static int bch2_gc_alloc_start(struct bch_fs *c) { + int ret = 0; + for_each_member_device(c, ca) { struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { bch2_dev_put(ca); - bch_err(c, "error allocating ca->buckets[gc]"); - return -BCH_ERR_ENOMEM_gc_alloc_start; + ret = -BCH_ERR_ENOMEM_gc_alloc_start; + break; } buckets->first_bucket = ca->mi.first_bucket; @@ -998,27 +918,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c) rcu_assign_pointer(ca->buckets_gc, buckets); } - struct bch_dev *ca = NULL; - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch, k, ({ - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - if (bucket_valid(ca, k.k->p.offset)) { - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - - struct bucket *g = gc_bucket(ca, k.k->p.offset); - g->gen_valid = 1; - g->gen = a->gen; - } - 0; - }))); - bch2_dev_put(ca); bch_err_fn(c, ret); return ret; } @@ -1048,8 +947,8 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, return -EINVAL; } - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, - reflink_v_refcount_wrong, + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), + trans, reflink_v_refcount_wrong, "reflink key has wrong refcount:\n" " %s\n" " should be %u", @@ -1147,7 +1046,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, if (bad) bch2_bkey_val_to_text(&buf, c, k); - if (fsck_err_on(bad, c, stripe_sector_count_wrong, + if (fsck_err_on(bad, + trans, stripe_sector_count_wrong, "%s", buf.buf)) { struct bkey_i_stripe *new; @@ -1210,7 +1110,8 @@ int bch2_check_allocations(struct bch_fs *c) bch2_btree_interior_updates_flush(c); - ret = bch2_gc_start(c) ?: + ret = bch2_gc_accounting_start(c) ?: + bch2_gc_start(c) ?: bch2_gc_alloc_start(c) ?: bch2_gc_reflink_start(c); if (ret) @@ -1219,7 +1120,9 @@ int bch2_check_allocations(struct bch_fs *c) gc_pos_set(c, gc_phase(GC_PHASE_start)); ret = bch2_mark_superblocks(c); - BUG_ON(ret); + bch_err_msg(c, ret, "marking superblocks"); + if (ret) + goto out; ret = bch2_gc_btrees(c); if (ret) @@ -1227,15 +1130,11 @@ int bch2_check_allocations(struct bch_fs *c) c->gc_count++; - bch2_journal_block(&c->journal); -out: ret = 
bch2_gc_alloc_done(c) ?: - bch2_gc_done(c) ?: + bch2_gc_accounting_done(c) ?: bch2_gc_stripes_done(c) ?: bch2_gc_reflink_done(c); - - bch2_journal_unblock(&c->journal); - +out: percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); @@ -1330,7 +1229,7 @@ int bch2_gc_gens(struct bch_fs *c) int ret; /* - * Ideally we would be using state_lock and not gc_lock here, but that + * Ideally we would be using state_lock and not gc_gens_lock here, but that * introduces a deadlock in the RO path - we currently take the state * lock at the start of going RO, thus the gc thread may get stuck: */ @@ -1338,7 +1237,8 @@ int bch2_gc_gens(struct bch_fs *c) return 0; trace_and_count(c, gc_gens_start, c); - down_read(&c->gc_lock); + + down_read(&c->state_lock); for_each_member_device(c, ca) { struct bucket_gens *gens = bucket_gens(ca); @@ -1407,7 +1307,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = NULL; } - up_read(&c->gc_lock); + up_read(&c->state_lock); mutex_unlock(&c->gc_gens_lock); if (!bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 876d81e2017d7..8a47e8bd07910 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -47,17 +47,10 @@ static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, }; } -/* - * GC position of the pointers within a btree node: note, _not_ for &b->key - * itself, that lives in the parent node: - */ -static inline struct gc_pos gc_pos_btree_node(struct btree *b) -{ - return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p); -} - static inline int gc_btree_order(enum btree_id btree) { + if (btree == BTREE_ID_alloc) + return -2; if (btree == BTREE_ID_stripes) return -1; return btree; @@ -65,11 +58,11 @@ static inline int gc_btree_order(enum btree_id btree) static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { - return cmp_int(l.phase, r.phase) ?: - cmp_int(gc_btree_order(l.btree), - gc_btree_order(r.btree)) ?: - -cmp_int(l.level, r.level) ?: - bpos_cmp(l.pos, r.pos); + return cmp_int(l.phase, r.phase) ?: + cmp_int(gc_btree_order(l.btree), + gc_btree_order(r.btree)) ?: + cmp_int(l.level, r.level) ?: + bpos_cmp(l.pos, r.pos); } static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) @@ -85,6 +78,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) return ret; } +void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); + int bch2_gc_gens(struct bch_fs *); void bch2_gc_gens_async(struct bch_fs *); void bch2_fs_gc_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h index b82c24bcc0880..c24dd6edf3773 100644 --- a/fs/bcachefs/btree_gc_types.h +++ b/fs/bcachefs/btree_gc_types.h @@ -4,11 +4,16 @@ #include +#define GC_PHASES() \ + x(not_running) \ + x(start) \ + x(sb) \ + x(btree) + enum gc_phase { - GC_PHASE_not_running, - GC_PHASE_start, - GC_PHASE_sb, - GC_PHASE_btree, +#define x(n) GC_PHASE_##n, + GC_PHASES() +#undef x }; struct gc_pos { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 7bca15c604f53..e092f541c4492 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -534,7 +534,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, printbuf_indent_add(out, 2); prt_printf(out, "\nnode offset %u/%u", - b->written, btree_ptr_sectors_written(&b->key)); + b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); if (i) prt_printf(out, " bset u64s %u", 
le16_to_cpu(i->u64s)); if (k) @@ -585,7 +585,7 @@ static int __btree_err(int ret, switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: ret = !silent - ? bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf) + ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf) : -BCH_ERR_fsck_fix; if (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore) @@ -689,6 +689,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); + unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; int ret = 0; @@ -732,11 +733,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (btree_err_on(offset + sectors > btree_sectors(c), + if (!write && + btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)), -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, NULL, bset_past_end_of_btree_node, - "bset past end of btree node")) { + "bset past end of btree node (offset %u len %u but written %zu)", + offset, sectors, ptr_written ?: btree_sectors(c))) { i->u64s = 0; ret = 0; goto out; @@ -1002,7 +1005,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; - unsigned ptr_written = btree_ptr_sectors_written(&b->key); + unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; u64 start_time = local_clock(); @@ -1796,15 +1799,16 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { - unsigned long old, new, v = READ_ONCE(b->will_make_reachable); + unsigned long old, new; + old = READ_ONCE(b->will_make_reachable); do { - old = new = v; + new = old; if (!(old & 1)) break; new &= ~1UL; - } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); + } while (!try_cmpxchg(&b->will_make_reachable, &old, new)); if (old & 1) closure_put(&((struct btree_update *) new)->cl); @@ -1815,14 +1819,14 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, static void __btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); - unsigned long old, new, v; + unsigned long old, new; unsigned type = 0; bch2_btree_complete_write(c, b, w); - v = READ_ONCE(b->flags); + old = READ_ONCE(b->flags); do { - old = new = v; + new = old; if ((old & (1U << BTREE_NODE_dirty)) && (old & (1U << BTREE_NODE_need_write)) && @@ -1842,7 +1846,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) new &= ~(1U << BTREE_NODE_write_in_flight); new &= ~(1U << BTREE_NODE_write_in_flight_inner); } - } while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); if (new & (1U << BTREE_NODE_write_in_flight)) __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); @@ -2014,8 +2018,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) * dirty bit requires a write lock, we can't race with other threads * redirtying it: */ + old = READ_ONCE(b->flags); do { - old = new = READ_ONCE(b->flags); + new = old; if (!(old & (1 << BTREE_NODE_dirty))) return; @@ 
-2046,7 +2051,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) new |= (1 << BTREE_NODE_write_in_flight_inner); new |= (1 << BTREE_NODE_just_written); new ^= (1 << BTREE_NODE_write_idx); - } while (cmpxchg_acquire(&b->flags, old, new) != old); + } while (!try_cmpxchg_acquire(&b->flags, &old, new)); if (new & (1U << BTREE_NODE_need_write)) return; @@ -2133,7 +2138,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) if (!b->written && b->key.k.type == KEY_TYPE_btree_ptr_v2) - BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); + BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write); memset(data + bytes_to_write, 0, (sectors_to_write << 9) - bytes_to_write); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 2b8b564fc5608..63d76f5c64031 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -27,10 +27,10 @@ static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b atomic_dec(&c->btree_cache.dirty); } -static inline unsigned btree_ptr_sectors_written(struct bkey_i *k) +static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k) { - return k->k.type == KEY_TYPE_btree_ptr_v2 - ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written) + return k.k->type == KEY_TYPE_btree_ptr_v2 + ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written) : 0; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0ed9e6574fcd0..80f4a39592b45 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -325,7 +325,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k } void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos, bool key_cache) + struct bpos pos) { bch2_trans_verify_not_unlocked(trans); @@ -336,19 +336,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, btree_trans_sort_paths(trans); trans_for_each_path_inorder(trans, path, iter) { - int cmp = cmp_int(path->btree_id, id) ?: - cmp_int(path->cached, key_cache); - - if (cmp > 0) - break; - if (cmp < 0) - continue; - - if (!btree_node_locked(path, 0) || + if (path->btree_id != id || + !btree_node_locked(path, 0) || !path->should_be_locked) continue; - if (!key_cache) { + if (!path->cached) { if (bkey_ge(pos, path->l[0].b->data->min_key) && bkey_le(pos, path->l[0].b->key.k.p)) return; @@ -361,9 +354,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, bch2_dump_trans_paths_updates(trans); bch2_bpos_to_text(&buf, pos); - panic("not locked: %s %s%s\n", - bch2_btree_id_str(id), buf.buf, - key_cache ? 
" cached" : ""); + panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); } #else @@ -997,6 +988,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) bch2_trans_unlock(trans); cond_resched(); trans->locked = true; + trans->last_unlock_ip = 0; if (unlikely(trans->memory_allocation_failure)) { struct closure cl; @@ -1465,7 +1457,7 @@ void bch2_dump_trans_updates(struct btree_trans *trans) struct printbuf buf = PRINTBUF; bch2_trans_updates_to_text(&buf, trans); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(trans->c, buf.buf); printbuf_exit(&buf); } @@ -1482,6 +1474,14 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra path->level); bch2_bpos_to_text(out, path->pos); + if (!path->cached && btree_node_locked(path, path->level)) { + prt_char(out, ' '); + struct btree *b = path_l(path)->b; + bch2_bpos_to_text(out, b->data->min_key); + prt_char(out, '-'); + bch2_bpos_to_text(out, b->key.k.p); + } + #ifdef TRACK_PATH_ALLOCATED prt_printf(out, " %pS", (void *) path->ip_allocated); #endif @@ -1557,7 +1557,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) __bch2_trans_paths_to_text(&buf, trans, nosort); bch2_trans_updates_to_text(&buf, trans); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(trans->c, buf.buf); printbuf_exit(&buf); } @@ -1801,13 +1801,12 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * goto hole; } else { struct bkey_cached *ck = (void *) path->l[0].b; - - EBUG_ON(ck && - (path->btree_id != ck->key.btree_id || - !bkey_eq(path->pos, ck->key.pos))); - if (!ck || !ck->valid) + if (!ck) return bkey_s_c_null; + EBUG_ON(path->btree_id != ck->key.btree_id || + !bkey_eq(path->pos, ck->key.pos)); + *u = ck->k->k; k = bkey_i_to_s_c(ck->k); } @@ -3090,6 +3089,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->last_begin_ip = _RET_IP_; trans->locked = true; + trans->last_unlock_ip = 0; if (trans->restarted) { bch2_btree_path_traverse_all(trans); @@ -3239,15 +3239,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - if (trans->fs_usage_deltas) { - if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == - REPLICAS_DELTA_LIST_MAX) - mempool_free(trans->fs_usage_deltas, - &c->replicas_delta_pool); - else - kfree(trans->fs_usage_deltas); - } - if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); @@ -3283,6 +3274,20 @@ void bch2_trans_put(struct btree_trans *trans) } } +bool bch2_current_has_btree_trans(struct bch_fs *c) +{ + seqmutex_lock(&c->btree_trans_lock); + struct btree_trans *trans; + bool ret = false; + list_for_each_entry(trans, &c->btree_trans_list, list) + if (trans->locking_wait.task == current) { + ret = true; + break; + } + seqmutex_unlock(&c->btree_trans_lock); + return ret; +} + static void __maybe_unused bch2_btree_bkey_cached_common_to_text(struct printbuf *out, struct btree_bkey_cached_common *b) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 798eb1c47966d..c7725865309c0 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -268,12 +268,11 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); -void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, - struct bpos, bool); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); #else 
static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos, bool key_cache) {} + struct bpos pos) {} #endif void bch2_btree_path_fix_key_modified(struct btree_trans *trans, @@ -866,6 +865,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _p; \ }) +#define bch2_trans_run(_c, _do) \ +({ \ + struct btree_trans *trans = bch2_trans_get(_c); \ + int _ret = (_do); \ + bch2_trans_put(trans); \ + _ret; \ +}) + void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); @@ -875,6 +882,8 @@ void bch2_dump_trans_paths_updates(struct btree_trans *); struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); +bool bch2_current_has_btree_trans(struct bch_fs *); + extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; unsigned bch2_trans_get_fn_idx(const char *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 332dbf164929e..74933490aaba9 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -16,21 +16,6 @@ * operations for the regular btree iter code to use: */ -static int __journal_key_cmp(enum btree_id l_btree_id, - unsigned l_level, - struct bpos l_pos, - const struct journal_key *r) -{ - return (cmp_int(l_btree_id, r->btree_id) ?: - cmp_int(l_level, r->level) ?: - bpos_cmp(l_pos, r->k->k.p)); -} - -static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -{ - return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); -} - static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) { size_t gap_size = keys->size - keys->nr; @@ -548,7 +533,13 @@ static void __journal_keys_sort(struct journal_keys *keys) struct journal_key *dst = keys->data; darray_for_each(*keys, src) { - if (src + 1 < &darray_top(*keys) && + /* + * We don't accumulate accounting keys here because we have to + * compare each individual accounting key against the version in + * the btree during replay: + */ + if (src->k->k.type != KEY_TYPE_accounting && + src + 1 < &darray_top(*keys) && !journal_key_cmp(src, src + 1)) continue; diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 1ba4a79b0ef90..1653de9d609bc 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H #define _BCACHEFS_BTREE_JOURNAL_ITER_H +#include "bkey.h" + struct journal_iter { struct list_head list; enum btree_id btree_id; @@ -26,6 +28,21 @@ struct btree_and_journal_iter { bool prefetch; }; +static inline int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + const struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); +} + +static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +{ + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); +} + struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 
2d3c0d45c37f9..f2f2e525460b5 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -39,6 +39,15 @@ static const struct rhashtable_params bch2_btree_key_cache_params = { .automatic_shrinking = true, }; +static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, + struct bkey_cached *ck, + enum btree_node_locked_type lock_held) +{ + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + path->l[0].b = (void *) ck; + mark_btree_node_locked(trans, path, 0, lock_held); +} + __flatten inline struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) @@ -196,9 +205,22 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc, six_unlock_intent(&ck->c.lock); } +static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) +{ + struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); + if (unlikely(!ck)) + return NULL; + ck->k = kmalloc(key_u64s * sizeof(u64), gfp); + if (unlikely(!ck->k)) { + kmem_cache_free(bch2_key_cache, ck); + return NULL; + } + ck->u64s = key_u64s; + return ck; +} + static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, - bool *was_new) +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; @@ -259,9 +281,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, return ERR_PTR(ret); } - path->l[0].b = (void *) ck; - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); + btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); ret = bch2_btree_node_lock_write(trans, path, &ck->c); if (unlikely(ret)) { @@ -274,8 +294,10 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, } ck = allocate_dropping_locks(trans, ret, - kmem_cache_zalloc(bch2_key_cache, _gfp)); + __bkey_cached_alloc(key_u64s, _gfp)); if (ret) { + if (ck) + kfree(ck->k); kmem_cache_free(bch2_key_cache, ck); return ERR_PTR(ret); } @@ -289,7 +311,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, ck->c.cached = true; BUG_ON(!six_trylock_intent(&ck->c.lock)); BUG_ON(!six_trylock_write(&ck->c.lock)); - *was_new = true; return ck; } @@ -319,71 +340,102 @@ bkey_cached_reuse(struct btree_key_cache *c) return ck; } -static struct bkey_cached * -btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) +static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck; - bool was_new = false; - ck = bkey_cached_alloc(trans, path, &was_new); - if (IS_ERR(ck)) - return ck; + /* + * bch2_varint_decode can read past the end of the buffer by at + * most 7 bytes (it won't be used): + */ + unsigned key_u64s = k.k->u64s + 1; + + /* + * Allocate some extra space so that the transaction commit path is less + * likely to have to reallocate, since that requires a transaction + * restart: + */ + key_u64s = min(256U, (key_u64s * 3) / 2); + key_u64s = roundup_pow_of_two(key_u64s); + + struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); + int ret = PTR_ERR_OR_ZERO(ck); + if (ret) + return ret; if (unlikely(!ck)) { ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", 
bch2_btree_id_str(path->btree_id)); - return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); + return -BCH_ERR_ENOMEM_btree_key_cache_create; } - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); } ck->c.level = 0; ck->c.btree_id = path->btree_id; ck->key.btree_id = path->btree_id; ck->key.pos = path->pos; - ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if (unlikely(rhashtable_lookup_insert_fast(&bc->table, - &ck->hash, - bch2_btree_key_cache_params))) { - /* We raced with another fill: */ - - if (likely(was_new)) { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - kfree(ck); - } else { - bkey_cached_free_fast(bc, ck); + if (unlikely(key_u64s > ck->u64s)) { + mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + + struct bkey_i *new_k = allocate_dropping_locks(trans, ret, + kmalloc(key_u64s * sizeof(u64), _gfp)); + if (unlikely(!new_k)) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(ck->key.btree_id), key_u64s); + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + } else if (ret) { + kfree(new_k); + goto err; } - mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - return NULL; + kfree(ck->k); + ck->k = new_k; + ck->u64s = key_u64s; } - atomic_long_inc(&bc->nr_keys); + bkey_reassemble(ck->k, k); + + ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); + if (unlikely(ret)) /* raced with another fill? */ + goto err; + atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); - return ck; + enum six_lock_type lock_want = __btree_lock_want(path, 0); + if (lock_want == SIX_LOCK_read) + six_lock_downgrade(&ck->c.lock); + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); + path->uptodate = BTREE_ITER_UPTODATE; + return 0; +err: + bkey_cached_free_fast(bc, ck); + mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + + return ret; } -static int btree_key_cache_fill(struct btree_trans *trans, - struct btree_path *ck_path, - struct bkey_cached *ck) +static noinline int btree_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + unsigned flags) { + if (flags & BTREE_ITER_cached_nofill) { + ck_path->uptodate = BTREE_ITER_UPTODATE; + return 0; + } + + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - unsigned new_u64s = 0; - struct bkey_i *new_k = NULL; int ret; - bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, + bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, BTREE_ITER_key_cache_fill| BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; @@ -392,70 +444,15 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (ret) goto err; - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); - goto err; - } - - /* - * bch2_varint_decode can read past the end of the buffer by at - * most 7 bytes (it won't be used): - */ - new_u64s = k.k->u64s + 1; - - /* - * Allocate some extra space so that the transaction commit path is less - * likely to have to reallocate, since that requires a transaction - * restart: - */ - new_u64s = min(256U, (new_u64s * 3) / 2); - - if (new_u64s > ck->u64s) { - new_u64s = roundup_pow_of_two(new_u64s); - new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); - if (!new_k) { - bch2_trans_unlock(trans); - - 
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); - if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(ck->key.btree_id), new_u64s); - ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; - goto err; - } - - ret = bch2_trans_relock(trans); - if (ret) { - kfree(new_k); - goto err; - } - - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - kfree(new_k); - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); - goto err; - } - } - } + /* Recheck after btree lookup, before allocating: */ + ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0; + if (unlikely(ret)) + goto out; - ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); - if (ret) { - kfree(new_k); + ret = btree_key_cache_create(trans, ck_path, k); + if (ret) goto err; - } - - if (new_k) { - kfree(ck->k); - ck->u64s = new_u64s; - ck->k = new_k; - } - - bkey_reassemble(ck->k, k); - ck->valid = true; - bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); - +out: /* We're not likely to need this iterator again: */ bch2_set_btree_iter_dontneed(&iter); err: @@ -463,137 +460,62 @@ static int btree_key_cache_fill(struct btree_trans *trans, return ret; } -static noinline int -bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, - unsigned flags) +static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, + struct btree_path *path) { struct bch_fs *c = trans->c; struct bkey_cached *ck; - int ret = 0; - - BUG_ON(path->level); - - path->l[1].b = NULL; - - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - ck = (void *) path->l[0].b; - goto fill; - } retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) { - ck = btree_key_cache_create(trans, path); - ret = PTR_ERR_OR_ZERO(ck); - if (ret) - goto err; - if (!ck) - goto retry; - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); - path->locks_want = 1; - } else { - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - BUG_ON(ret); - - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; - } - - mark_btree_node_locked(trans, path, 0, - (enum btree_node_locked_type) lock_want); - } + if (!ck) + return -ENOENT; - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; -fill: - path->uptodate = BTREE_ITER_UPTODATE; + enum six_lock_type lock_want = __btree_lock_want(path, 0); - if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) { - ret = bch2_btree_path_upgrade(trans, path, 1) ?: - btree_key_cache_fill(trans, path, ck) ?: - bch2_btree_path_relock(trans, path, _THIS_IP_); - if (ret) - goto err; + int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); + if (ret) + return ret; - path->uptodate = BTREE_ITER_UPTODATE; + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; } if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); - BUG_ON(path->uptodate); - - return ret; -err: - path->uptodate = 
BTREE_ITER_NEED_TRAVERSE; - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = ERR_PTR(ret); - } - return ret; + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); + path->uptodate = BTREE_ITER_UPTODATE; + return 0; } int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, unsigned flags) { - struct bch_fs *c = trans->c; - struct bkey_cached *ck; - int ret = 0; - EBUG_ON(path->level); path->l[1].b = NULL; if (bch2_btree_node_relock_notrace(trans, path, 0)) { - ck = (void *) path->l[0].b; - goto fill; + path->uptodate = BTREE_ITER_UPTODATE; + return 0; } -retry: - ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) { - return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); - } else { - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); - - if (ret) - return ret; - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; + int ret; + do { + ret = btree_path_traverse_cached_fast(trans, path); + if (unlikely(ret == -ENOENT)) + ret = btree_key_cache_fill(trans, path, flags); + } while (ret == -EEXIST); + + if (unlikely(ret)) { + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(ret); } - - mark_btree_node_locked(trans, path, 0, - (enum btree_node_locked_type) lock_want); } - - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; -fill: - if (!ck->valid) - return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); - - if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - - path->uptodate = BTREE_ITER_UPTODATE; - EBUG_ON(!ck->valid); - EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); - return ret; } @@ -632,8 +554,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; } - BUG_ON(!ck->valid); - if (journal_seq && ck->journal.seq != journal_seq) goto out; @@ -755,7 +675,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); bkey_copy(ck->k, insert); - ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); @@ -794,10 +713,9 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = (void *) path->l[0].b; - BUG_ON(!ck->valid); - /* * We just did an update to the btree, bypassing the key cache: the key * cache key is now stale and must be dropped, even if dirty: @@ -808,7 +726,11 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, bch2_journal_pin_drop(&c->journal, &ck->journal); } - ck->valid = false; + bkey_cached_evict(bc, ck); + bkey_cached_free_fast(bc, ck); + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index d66fff22109ae..79057cde4c412 100644 --- a/fs/bcachefs/btree_locking.c +++ 
b/fs/bcachefs/btree_locking.c @@ -793,6 +793,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) } trans->locked = true; + trans->last_unlock_ip = 0; out: bch2_trans_verify_locks(trans); return 0; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 7f41545b9147f..2530fdb29b154 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -136,6 +136,7 @@ static inline void btree_node_unlock(struct btree_trans *trans, int lock_type = btree_node_locked_type(path, level); EBUG_ON(level >= BTREE_MAX_DEPTH); + EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED); if (lock_type != BTREE_NODE_UNLOCKED) { six_unlock_type(&path->l[level].b->c.lock, lock_type); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 74e1ff225674c..cca336fe46e9b 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -10,6 +10,7 @@ #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "disk_accounting.h" #include "errcode.h" #include "error.h" #include "journal.h" @@ -136,7 +137,8 @@ static inline void bch2_trans_unlock_write(struct btree_trans *trans) { if (likely(trans->write_locked)) { trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) + if (btree_node_locked_type(trans->paths + i->path, i->level) == + BTREE_NODE_WRITE_LOCKED) bch2_btree_node_unlock_write_inlined(trans, trans->paths + i->path, insert_l(trans, i)->b); trans->write_locked = false; @@ -228,14 +230,14 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); struct btree_trans *trans = bch2_trans_get(c); - unsigned long old, new, v; + unsigned long old, new; unsigned idx = w - b->writes; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - v = READ_ONCE(b->flags); + old = READ_ONCE(b->flags); do { - old = new = v; + new = old; if (!(old & (1 << BTREE_NODE_dirty)) || !!(old & (1 << BTREE_NODE_write_idx)) != idx || @@ -245,7 +247,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, new &= ~BTREE_WRITE_TYPE_MASK; new |= BTREE_WRITE_journal_reclaim; new |= 1 << BTREE_NODE_need_write; - } while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); @@ -456,34 +458,36 @@ static int run_one_mem_trigger(struct btree_trans *trans, struct btree_insert_entry *i, unsigned flags) { - struct bkey_s_c old = { &i->old_k, i->old_v }; - struct bkey_i *new = i->k; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - int ret; - verify_update_old_key(trans, i); if (unlikely(flags & BTREE_TRIGGER_norun)) return 0; - if (old_ops->trigger == new_ops->trigger) { - ret = bch2_key_trigger(trans, i->btree_id, i->level, + struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + + if (old_ops->trigger == new_ops->trigger) + return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(new), BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags); - } else { - ret = bch2_key_trigger_new(trans, i->btree_id, i->level, + else + return 
bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(new), flags) ?: - bch2_key_trigger_old(trans, i->btree_id, i->level, + bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags); - } - - return ret; } static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, bool overwrite) { + verify_update_old_key(trans, i); + + if ((i->flags & BTREE_TRIGGER_norun) || + !btree_node_type_has_trans_triggers(i->bkey_type)) + return 0; + /* * Transactional triggers create new btree_insert_entries, so we can't * pass them a pointer to a btree_insert_entry, that memory is going to @@ -495,12 +499,6 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); unsigned flags = i->flags|BTREE_TRIGGER_transactional; - verify_update_old_key(trans, i); - - if ((i->flags & BTREE_TRIGGER_norun) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - return 0; - if (!i->insert_trigger_run && !i->overwrite_trigger_run && old_ops->trigger == new_ops->trigger) { @@ -523,10 +521,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, unsigned btree_id_start) { - bool trans_trigger_run; - int ret, overwrite; - - for (overwrite = 1; overwrite >= 0; --overwrite) { + for (int overwrite = 1; overwrite >= 0; --overwrite) { + bool trans_trigger_run; /* * Running triggers will append more updates to the list of updates as @@ -541,7 +537,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, if (trans->updates[i].btree_id != btree_id) continue; - ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); + int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); if (ret < 0) return ret; if (ret) @@ -594,7 +590,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + btree_node_type_has_trans_triggers(i->bkey_type) && (!i->insert_trigger_run || !i->overwrite_trigger_run)); #endif return 0; @@ -602,24 +598,25 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { - trans_for_each_update(trans, i) { - /* - * XXX: synchronization of cached update triggers with gc - * XXX: synchronization of interior node updates with gc - */ - BUG_ON(i->cached || i->level); - - if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && - gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { + trans_for_each_update(trans, i) + if (btree_node_type_has_triggers(i->bkey_type) && + gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) { int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); if (ret) return ret; } - } return 0; } +static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) +{ + return (struct bversion) { + .hi = res->seq >> 32, + .lo = (res->seq << 32) | (res->offset + offset), + }; +} + static inline int bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, @@ -628,7 +625,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct bch_fs *c = trans->c; struct 
btree_trans_commit_hook *h; unsigned u64s = 0; - int ret; + int ret = 0; bch2_trans_verify_not_unlocked(trans); bch2_trans_verify_not_in_restart(trans); @@ -693,23 +690,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, i->k->k.version = MAX_VERSION; } - if (trans->fs_usage_deltas && - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) - return -BCH_ERR_btree_insert_need_mark_replicas; - - /* XXX: we only want to run this if deltas are nonzero */ - bch2_trans_account_disk_usage_change(trans); - h = trans->hooks; while (h) { ret = h->fn(trans, h); if (ret) - goto revert_fs_usage; + return ret; h = h->next; } + struct jset_entry *entry = trans->journal_entries; + + if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { + percpu_down_read(&c->mark_lock); + + for (entry = trans->journal_entries; + entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + entry = vstruct_next(entry)) + if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) { + struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); + + a->k.version = journal_pos_to_bversion(&trans->journal_res, + (u64 *) entry - (u64 *) trans->journal_entries); + BUG_ON(bversion_zero(a->k.version)); + ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false); + if (ret) + goto revert_fs_usage; + } + percpu_up_read(&c->mark_lock); + + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); + } + trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { + if (btree_node_type_has_atomic_triggers(i->bkey_type)) { ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); if (ret) goto fatal_err; @@ -764,29 +778,44 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_update(trans, i) { struct btree_path *path = trans->paths + i->path; - if (!i->cached) { + if (!i->cached) bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); - } else if (!i->key_cache_already_flushed) + else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); - else { + else bch2_btree_key_cache_drop(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - } } return 0; fatal_err: - bch2_fatal_error(c); + bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); + percpu_down_read(&c->mark_lock); revert_fs_usage: - if (trans->fs_usage_deltas) - bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); + for (struct jset_entry *entry2 = trans->journal_entries; + entry2 != entry; + entry2 = vstruct_next(entry2)) + if (jset_entry_is_key(entry2) && entry2->start->k.type == KEY_TYPE_accounting) { + struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); + + bch2_accounting_neg(a); + bch2_accounting_mem_mod_locked(trans, a.c, false); + bch2_accounting_neg(a); + } + percpu_up_read(&c->mark_lock); return ret; } static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { + /* + * Accounting keys aren't deduped in the journal: we have to compare + * each individual update against what's in the btree to see if it has + * been applied yet, and accounting updates also don't overwrite, + * they're deltas that accumulate. 
+ */ trans_for_each_update(trans, i) - bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); + if (i->k->k.type != KEY_TYPE_accounting) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, @@ -922,7 +951,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, break; case -BCH_ERR_btree_insert_need_mark_replicas: ret = drop_locks_do(trans, - bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); + bch2_accounting_update_sb(trans)); break; case -BCH_ERR_journal_res_get_blocked: /* @@ -993,15 +1022,24 @@ static noinline int do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { struct bch_fs *c = trans->c; - int ret = 0; trans_for_each_update(trans, i) { - ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) - break; + return ret; } - return ret; + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) + if (i->type == BCH_JSET_ENTRY_btree_keys || + i->type == BCH_JSET_ENTRY_write_buffer_keys) { + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start); + if (ret) + return ret; + } + + return 0; } int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) @@ -1017,8 +1055,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) !trans->journal_entries_u64s) goto out_reset; - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); - ret = bch2_trans_commit_run_triggers(trans); if (ret) goto out_reset; @@ -1115,6 +1151,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) bch2_trans_verify_not_in_restart(trans); if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 87f485e9c552d..c9c9864a87b0c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -388,7 +388,6 @@ struct bkey_cached { unsigned long flags; unsigned long btree_trans_barrier_seq; u16 u64s; - bool valid; struct bkey_cached_key key; struct rhash_head hash; @@ -522,7 +521,6 @@ struct btree_trans { unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ - struct replicas_delta_list *fs_usage_deltas; /* Entries before this are zeroed out on every bch2_trans_get() call */ @@ -754,9 +752,19 @@ const char *bch2_btree_node_type_str(enum btree_node_type); (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) -static inline bool btree_node_type_needs_gc(enum btree_node_type type) +static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type) { - return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type); + return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS; +} + +static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type) +{ + return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS; +} + +static inline bool btree_node_type_has_triggers(enum btree_node_type type) +{ + return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; } static inline bool btree_node_type_is_extents(enum btree_node_type type) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 
f3c645a43dcba..d6f6df10dcc3d 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -656,14 +656,16 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, * @disk_res: must be non-NULL whenever inserting or potentially * splitting data extents * @flags: transaction commit flags + * @iter_flags: btree iter update trigger flags * * Returns: 0 on success, error code on failure */ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, - struct disk_reservation *disk_res, int flags) + struct disk_reservation *disk_res, int flags, + enum btree_iter_update_trigger_flags iter_flags) { return bch2_trans_do(c, disk_res, NULL, flags, - bch2_btree_insert_trans(trans, id, k, 0)); + bch2_btree_insert_trans(trans, id, k, iter_flags)); } int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index b4894e4d5447f..60393e98084d7 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -29,6 +29,7 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, "pin journal entry referred to by trans->journal_res.seq") \ x(journal_reclaim, "operation required for journal reclaim; may return error" \ "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ + x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied") enum __bch_trans_commit_flags { /* First bits for bch_watermark: */ @@ -56,8 +57,9 @@ int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_iter_update_trigger_flags); -int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, - struct disk_reservation *, int flags); +int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct + disk_reservation *, int flags, enum + btree_iter_update_trigger_flags iter_flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); @@ -130,7 +132,19 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr enum btree_id btree, struct bkey_i *k) { - if (unlikely(trans->journal_replay_not_finished)) + /* + * Most updates skip the btree write buffer until journal replay is + * finished because synchronization with journal replay relies on having + * a btree node locked - if we're overwriting a key in the journal that + * journal replay hasn't yet replayed, we have to mark it as + * overwritten. 
+ * + * But accounting updates don't overwrite, they're deltas, and they have + * to be flushed to the btree strictly in order for journal replay to be + * able to tell which updates need to be applied: + */ + if (k->k.type != KEY_TYPE_accounting && + unlikely(trans->journal_replay_not_finished)) return bch2_btree_insert_clone_trans(trans, btree, k); struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); @@ -178,14 +192,6 @@ static inline int bch2_trans_commit(struct btree_trans *trans, nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) -#define bch2_trans_run(_c, _do) \ -({ \ - struct btree_trans *trans = bch2_trans_get(_c); \ - int _ret = (_do); \ - bch2_trans_put(trans); \ - _ret; \ -}) - #define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) @@ -203,14 +209,6 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->journal_entries_u64s = 0; trans->hooks = NULL; trans->extra_disk_res = 0; - - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } } static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 60b8544cea48c..9575fb65b1f9a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -61,7 +61,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (!bpos_eq(b->data->min_key, POS_MIN)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); - need_fsck_err(c, btree_root_bad_min_key, + need_fsck_err(trans, btree_root_bad_min_key, "btree root with incorrect min_key: %s", buf.buf); goto topology_repair; } @@ -69,7 +69,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (!bpos_eq(b->data->max_key, SPOS_MAX)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->max_key); - need_fsck_err(c, btree_root_bad_max_key, + need_fsck_err(trans, btree_root_bad_max_key, "btree root with incorrect max_key: %s", buf.buf); goto topology_repair; } @@ -105,7 +105,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) prt_str(&buf, "\n next "); bch2_bkey_val_to_text(&buf, c, k); - need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf); + need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); goto topology_repair; } @@ -122,7 +122,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) bch2_btree_id_str(b->c.btree_id), b->c.level); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf); + need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); goto topology_repair; } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { bch2_topology_error(c); @@ -135,7 +135,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) prt_str(&buf, "\n last key "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf); + need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); goto topology_repair; } 
out: @@ -1356,10 +1356,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct bch_fs *c = as->c; struct bkey_packed *k; struct printbuf buf = PRINTBUF; - unsigned long old, new, v; + unsigned long old, new; BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && - !btree_ptr_sectors_written(insert)); + !btree_ptr_sectors_written(bkey_i_to_s_c(insert))); if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); @@ -1395,14 +1395,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty_acct(c, b); - v = READ_ONCE(b->flags); + old = READ_ONCE(b->flags); do { - old = new = v; + new = old; new &= ~BTREE_WRITE_TYPE_MASK; new |= BTREE_WRITE_interior; new |= 1 << BTREE_NODE_need_write; - } while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); printbuf_exit(&buf); } diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 75c8a196b3f63..5e1262a542933 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -5,6 +5,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "disk_accounting.h" #include "error.h" #include "journal.h" #include "journal_io.h" @@ -132,7 +133,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans, static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter, struct btree_write_buffered_key *wb, - bool *write_locked, size_t *fast) + bool *write_locked, + bool *accounting_accumulated, + size_t *fast) { struct btree_path *path; int ret; @@ -145,6 +148,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite if (ret) return ret; + if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) { + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u); + + if (k.k->type == KEY_TYPE_accounting) + bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k), + bkey_s_c_to_accounting(k)); + } + *accounting_accumulated = true; + /* * We can't clone a path that has write locks: unshare it now, before * set_pos and traverse(): @@ -257,8 +270,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; struct btree_iter iter = { NULL }; - size_t skipped = 0, fast = 0, slowpath = 0; + size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; bool write_locked = false; + bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); int ret = 0; bch2_trans_unlock(trans); @@ -299,11 +313,22 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) BUG_ON(!k->journal_seq); + if (!accounting_replay_done && + k->k.k.type == KEY_TYPE_accounting) { + slowpath++; + continue; + } + if (i + 1 < &darray_top(wb->sorted) && wb_key_eq(i, i + 1)) { struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; - skipped++; + if (k->k.k.type == KEY_TYPE_accounting && + n->k.k.type == KEY_TYPE_accounting) + bch2_accounting_accumulate(bkey_i_to_accounting(&n->k), + bkey_i_to_s_c_accounting(&k->k)); + + overwritten++; n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); k->journal_seq = 0; continue; @@ -338,13 +363,15 @@ static int 
bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bch2_btree_iter_set_pos(&iter, k->k.k.p); btree_iter_path(trans, &iter)->preserve = false; + bool accounting_accumulated = false; do { if (race_fault()) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } - ret = wb_flush_one(trans, &iter, k, &write_locked, &fast); + ret = wb_flush_one(trans, &iter, k, &write_locked, + &accounting_accumulated, &fast); if (!write_locked) bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); @@ -385,8 +412,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) if (!i->journal_seq) continue; - bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, - bch2_btree_write_buffer_journal_flush); + if (!accounting_replay_done && + i->k.k.type == KEY_TYPE_accounting) { + could_not_insert++; + continue; + } + + if (!could_not_insert) + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); bch2_trans_begin(trans); @@ -399,13 +433,45 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) btree_write_buffered_insert(trans, i)); if (ret) goto err; + + i->journal_seq = 0; + } + + /* + * If journal replay hasn't finished with accounting keys, we + * can't flush accounting keys at all - condense them and leave + * them for next time. + * + * Q: Can the write buffer overflow? + * A: Shouldn't be any actual risk. It's just new accounting + * updates that the write buffer can't flush, and those are only + * going to be generated by interior btree node updates as + * journal replay has to split/rewrite nodes to make room for + * its updates. + * + * And for those new accounting updates, updates to the same + * counters get accumulated as they're flushed from the journal + * to the write buffer - see the patch for eytzinger tree + * accumulation. So we could only overflow if the number of + * distinct counters touched somehow was very large. 
+ */ + if (could_not_insert) { + struct btree_write_buffered_key *dst = wb->flushing.keys.data; + + darray_for_each(wb->flushing.keys, i) + if (i->journal_seq) + *dst++ = *i; + wb->flushing.keys.nr = dst - wb->flushing.keys.data; } } err: + if (ret || !could_not_insert) { + bch2_journal_pin_drop(j, &wb->flushing.pin); + wb->flushing.keys.nr = 0; + } + bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret)); - trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); - bch2_journal_pin_drop(j, &wb->flushing.pin); - wb->flushing.keys.nr = 0; + trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0); return ret; } @@ -507,6 +573,29 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work) bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); } +static void wb_accounting_sort(struct btree_write_buffer *wb) +{ + eytzinger0_sort(wb->accounting.data, wb->accounting.nr, + sizeof(wb->accounting.data[0]), + wb_key_cmp, NULL); +} + +int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree, + struct bkey_i_accounting *k) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key new = { .btree = btree }; + + bkey_copy(&new.k, &k->k_i); + + int ret = darray_push(&wb->accounting, new); + if (ret) + return ret; + + wb_accounting_sort(wb); + return 0; +} + int bch2_journal_key_to_wb_slowpath(struct bch_fs *c, struct journal_keys_to_wb *dst, enum btree_id btree, struct bkey_i *k) @@ -576,11 +665,35 @@ void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_ke bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, bch2_btree_write_buffer_journal_flush); + + darray_for_each(wb->accounting, i) + memset(&i->k.v, 0, bkey_val_bytes(&i->k.k)); } -void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) +int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) { struct btree_write_buffer *wb = &c->btree_write_buffer; + unsigned live_accounting_keys = 0; + int ret = 0; + + darray_for_each(wb->accounting, i) + if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) { + i->journal_seq = dst->seq; + live_accounting_keys++; + ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k); + if (ret) + break; + } + + if (live_accounting_keys * 2 < wb->accounting.nr) { + struct btree_write_buffered_key *dst = wb->accounting.data; + + darray_for_each(wb->accounting, src) + if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k))) + *dst++ = *src; + wb->accounting.nr = dst - wb->accounting.data; + wb_accounting_sort(wb); + } if (!dst->wb->keys.nr) bch2_journal_pin_drop(&c->journal, &dst->wb->pin); @@ -593,6 +706,8 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys if (dst->wb == &wb->flushing) mutex_unlock(&wb->flushing.lock); mutex_unlock(&wb->inc.lock); + + return ret; } static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) @@ -616,7 +731,7 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu buf->need_flush_to_write_buffer = false; spin_unlock(&c->journal.lock); out: - bch2_journal_keys_to_write_buffer_end(c, &dst); + ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; return ret; } @@ -648,6 +763,7 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && !bch2_journal_error(&c->journal)); + darray_exit(&wb->accounting); darray_exit(&wb->sorted); 
darray_exit(&wb->flushing.keys); darray_exit(&wb->inc.keys); diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index eebcd2b15249a..ee3710eafeac3 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -3,6 +3,7 @@ #define _BCACHEFS_BTREE_WRITE_BUFFER_H #include "bkey.h" +#include "disk_accounting.h" static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) { @@ -29,16 +30,45 @@ struct journal_keys_to_wb { u64 seq; }; +static inline int wb_key_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p); +} + +int bch2_accounting_key_to_wb_slowpath(struct bch_fs *, + enum btree_id, struct bkey_i_accounting *); + +static inline int bch2_accounting_key_to_wb(struct bch_fs *c, + enum btree_id btree, struct bkey_i_accounting *k) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key search; + search.btree = btree; + search.k.k.p = k->k.p; + + unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr, + sizeof(wb->accounting.data[0]), + wb_key_cmp, &search); + + if (idx >= wb->accounting.nr) + return bch2_accounting_key_to_wb_slowpath(c, btree, k); + + struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k); + bch2_accounting_accumulate(dst, accounting_i_to_s_c(k)); + return 0; +} + int bch2_journal_key_to_wb_slowpath(struct bch_fs *, struct journal_keys_to_wb *, enum btree_id, struct bkey_i *); -static inline int bch2_journal_key_to_wb(struct bch_fs *c, +static inline int __bch2_journal_key_to_wb(struct bch_fs *c, struct journal_keys_to_wb *dst, enum btree_id btree, struct bkey_i *k) { - EBUG_ON(!dst->seq); - if (unlikely(!dst->room)) return bch2_journal_key_to_wb_slowpath(c, dst, btree, k); @@ -51,8 +81,19 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c, return 0; } +static inline int bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + EBUG_ON(!dst->seq); + + return k->k.type == KEY_TYPE_accounting + ? 
bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k)) + : __bch2_journal_key_to_wb(c, dst, btree, k); +} + void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); -void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); +int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h index 9b9433de9c368..e9e76e20f43b0 100644 --- a/fs/bcachefs/btree_write_buffer_types.h +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -52,6 +52,8 @@ struct btree_write_buffer { struct btree_write_buffer_keys inc; struct btree_write_buffer_keys flushing; struct work_struct flush_work; + + DARRAY(struct btree_write_buffered_key) accounting; }; #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 743d57eba7607..25549c23106df 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -13,6 +13,7 @@ #include "btree_update.h" #include "buckets.h" #include "buckets_waiting_for_journal.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "inode.h" @@ -25,197 +26,10 @@ #include -static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, - enum bch_data_type data_type, - s64 sectors) -{ - switch (data_type) { - case BCH_DATA_btree: - fs_usage->btree += sectors; - break; - case BCH_DATA_user: - case BCH_DATA_parity: - fs_usage->data += sectors; - break; - case BCH_DATA_cached: - fs_usage->cached += sectors; - break; - default: - break; - } -} - -void bch2_fs_usage_initialize(struct bch_fs *c) -{ - percpu_down_write(&c->mark_lock); - struct bch_fs_usage *usage = c->usage_base; - - for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - - for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) - usage->b.reserved += usage->persistent_reserved[i]; - - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]); - } - - for_each_member_device(c, ca) { - struct bch_dev_usage dev = bch2_dev_usage_read(ca); - - usage->b.hidden += (dev.d[BCH_DATA_sb].buckets + - dev.d[BCH_DATA_journal].buckets) * - ca->mi.bucket_size; - } - - percpu_up_write(&c->mark_lock); -} - -static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, - unsigned journal_seq, - bool gc) -{ - BUG_ON(!gc && !journal_seq); - - return this_cpu_ptr(gc - ? 
ca->usage_gc - : ca->usage[journal_seq & JOURNAL_BUF_MASK]); -} - void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { - struct bch_fs *c = ca->fs; - unsigned seq, i, u64s = dev_usage_u64s(); - - do { - seq = read_seqcount_begin(&c->usage_lock); - memcpy(usage, ca->usage_base, u64s * sizeof(u64)); - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) - acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); - } while (read_seqcount_retry(&c->usage_lock, seq)); -} - -u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) -{ - ssize_t offset = v - (u64 *) c->usage_base; - unsigned i, seq; - u64 ret; - - BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); - percpu_rwsem_assert_held(&c->mark_lock); - - do { - seq = read_seqcount_begin(&c->usage_lock); - ret = *v; - - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); - } while (read_seqcount_retry(&c->usage_lock, seq)); - - return ret; -} - -struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) -{ - struct bch_fs_usage_online *ret; - unsigned nr_replicas = READ_ONCE(c->replicas.nr); - unsigned seq, i; -retry: - ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL); - if (unlikely(!ret)) - return NULL; - - percpu_down_read(&c->mark_lock); - - if (nr_replicas != c->replicas.nr) { - nr_replicas = c->replicas.nr; - percpu_up_read(&c->mark_lock); - kfree(ret); - goto retry; - } - - ret->online_reserved = percpu_u64_get(c->online_reserved); - - do { - seq = read_seqcount_begin(&c->usage_lock); - unsafe_memcpy(&ret->u, c->usage_base, - __fs_usage_u64s(nr_replicas) * sizeof(u64), - "embedded variable length struct"); - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], - __fs_usage_u64s(nr_replicas)); - } while (read_seqcount_retry(&c->usage_lock, seq)); - - return ret; -} - -void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) -{ - unsigned u64s = fs_usage_u64s(c); - - BUG_ON(idx >= ARRAY_SIZE(c->usage)); - - preempt_disable(); - write_seqcount_begin(&c->usage_lock); - - acc_u64s_percpu((u64 *) c->usage_base, - (u64 __percpu *) c->usage[idx], u64s); - percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, NULL) { - u64s = dev_usage_u64s(); - - acc_u64s_percpu((u64 *) ca->usage_base, - (u64 __percpu *) ca->usage[idx], u64s); - percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); - } - rcu_read_unlock(); - - write_seqcount_end(&c->usage_lock); - preempt_enable(); -} - -void bch2_fs_usage_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_fs_usage_online *fs_usage) -{ - unsigned i; - - prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); - - prt_printf(out, "hidden:\t\t\t\t%llu\n", - fs_usage->u.b.hidden); - prt_printf(out, "data:\t\t\t\t%llu\n", - fs_usage->u.b.data); - prt_printf(out, "cached:\t\t\t\t%llu\n", - fs_usage->u.b.cached); - prt_printf(out, "reserved:\t\t\t%llu\n", - fs_usage->u.b.reserved); - prt_printf(out, "nr_inodes:\t\t\t%llu\n", - fs_usage->u.b.nr_inodes); - prt_printf(out, "online reserved:\t\t%llu\n", - fs_usage->online_reserved); - - for (i = 0; - i < ARRAY_SIZE(fs_usage->u.persistent_reserved); - i++) { - prt_printf(out, "%u replicas:\n", i + 1); - prt_printf(out, "\treserved:\t\t%llu\n", - fs_usage->u.persistent_reserved[i]); - } - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - prt_printf(out, "\t"); - 
bch2_replicas_entry_to_text(out, e); - prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); - } + memset(usage, 0, sizeof(*usage)); + acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); } static u64 reserve_factor(u64 r) @@ -223,16 +37,6 @@ static u64 reserve_factor(u64 r) return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) -{ - return min(fs_usage->u.b.hidden + - fs_usage->u.b.btree + - fs_usage->u.b.data + - reserve_factor(fs_usage->u.b.reserved + - fs_usage->online_reserved), - c->capacity); -} - static struct bch_fs_usage_short __bch2_fs_usage_read_short(struct bch_fs *c) { @@ -240,17 +44,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c) u64 data, reserved; ret.capacity = c->capacity - - bch2_fs_usage_read_one(c, &c->usage_base->b.hidden); + percpu_u64_get(&c->usage->hidden); - data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) + - bch2_fs_usage_read_one(c, &c->usage_base->b.btree); - reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) + + data = percpu_u64_get(&c->usage->data) + + percpu_u64_get(&c->usage->btree); + reserved = percpu_u64_get(&c->usage->reserved) + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; - ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes); + ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes); return ret; } @@ -267,11 +71,6 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -void bch2_dev_usage_init(struct bch_dev *ca) -{ - ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; -} - void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) { prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); @@ -285,186 +84,6 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) } } -void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - const struct bch_alloc_v4 *old, - const struct bch_alloc_v4 *new, - u64 journal_seq, bool gc) -{ - struct bch_fs_usage *fs_usage; - struct bch_dev_usage *u; - - preempt_disable(); - fs_usage = fs_usage_ptr(c, journal_seq, gc); - - if (data_type_is_hidden(old->data_type)) - fs_usage->b.hidden -= ca->mi.bucket_size; - if (data_type_is_hidden(new->data_type)) - fs_usage->b.hidden += ca->mi.bucket_size; - - u = dev_usage_ptr(ca, journal_seq, gc); - - u->d[old->data_type].buckets--; - u->d[new->data_type].buckets++; - - u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old); - u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new); - - u->d[BCH_DATA_cached].sectors += new->cached_sectors; - u->d[BCH_DATA_cached].sectors -= old->cached_sectors; - - u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old); - u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new); - - preempt_enable(); -} - -static inline int __update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry_v1 *r, - s64 sectors) -{ - int idx = bch2_replicas_entry_idx(c, r); - - if (idx < 0) - return -1; - - fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); - fs_usage->replicas[idx] += sectors; - return 0; -} - -int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, - struct bch_replicas_entry_v1 *r, s64 sectors, - unsigned journal_seq, bool gc) -{ - struct bch_fs_usage *fs_usage; - int idx, ret = 0; - struct printbuf buf = 
PRINTBUF; - - percpu_down_read(&c->mark_lock); - - idx = bch2_replicas_entry_idx(c, r); - if (idx < 0 && - fsck_err(c, ptr_to_missing_replicas_entry, - "no replicas entry\n while marking %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - percpu_up_read(&c->mark_lock); - ret = bch2_mark_replicas(c, r); - percpu_down_read(&c->mark_lock); - - if (ret) - goto err; - idx = bch2_replicas_entry_idx(c, r); - } - if (idx < 0) { - ret = -1; - goto err; - } - - preempt_disable(); - fs_usage = fs_usage_ptr(c, journal_seq, gc); - fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); - fs_usage->replicas[idx] += sectors; - preempt_enable(); -err: -fsck_err: - percpu_up_read(&c->mark_lock); - printbuf_exit(&buf); - return ret; -} - -static inline int update_cached_sectors(struct bch_fs *c, - struct bkey_s_c k, - unsigned dev, s64 sectors, - unsigned journal_seq, bool gc) -{ - struct bch_replicas_padded r; - - bch2_replicas_entry_cached(&r.e, dev); - - return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc); -} - -static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, - gfp_t gfp) -{ - struct replicas_delta_list *d = trans->fs_usage_deltas; - unsigned new_size = d ? (d->size + more) * 2 : 128; - unsigned alloc_size = sizeof(*d) + new_size; - - WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); - - if (!d || d->used + more > d->size) { - d = krealloc(d, alloc_size, gfp|__GFP_ZERO); - - if (unlikely(!d)) { - if (alloc_size > REPLICAS_DELTA_LIST_MAX) - return -ENOMEM; - - d = mempool_alloc(&trans->c->replicas_delta_pool, gfp); - if (!d) - return -ENOMEM; - - memset(d, 0, REPLICAS_DELTA_LIST_MAX); - - if (trans->fs_usage_deltas) - memcpy(d, trans->fs_usage_deltas, - trans->fs_usage_deltas->size + sizeof(*d)); - - new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); - kfree(trans->fs_usage_deltas); - } - - d->size = new_size; - trans->fs_usage_deltas = d; - } - - return 0; -} - -int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) -{ - return allocate_dropping_locks_errcode(trans, - __replicas_deltas_realloc(trans, more, _gfp)); -} - -int bch2_update_replicas_list(struct btree_trans *trans, - struct bch_replicas_entry_v1 *r, - s64 sectors) -{ - struct replicas_delta_list *d; - struct replicas_delta *n; - unsigned b; - int ret; - - if (!sectors) - return 0; - - b = replicas_entry_bytes(r) + 8; - ret = bch2_replicas_deltas_realloc(trans, b); - if (ret) - return ret; - - d = trans->fs_usage_deltas; - n = (void *) d->d + d->used; - n->delta = sectors; - unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r), - r, replicas_entry_bytes(r), - "flexible array member embedded in strcuct with padding"); - bch2_replicas_entry_sort(&n->r); - d->used += b; - return 0; -} - -int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors) -{ - struct bch_replicas_padded r; - - bch2_replicas_entry_cached(&r.e, dev); - - return bch2_update_replicas_list(trans, &r.e, sectors); -} - static int bch2_check_fix_ptr(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, @@ -477,7 +96,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (!ca) { - if (fsck_err(c, ptr_to_invalid_device, + if (fsck_err(trans, ptr_to_invalid_device, "pointer to missing device %u\n" "while marking %s", p.ptr.dev, @@ -489,7 +108,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); if (!g) { - if (fsck_err(c, 
ptr_to_invalid_device, + if (fsck_err(trans, ptr_to_invalid_device, "pointer to invalid bucket on device %u\n" "while marking %s", p.ptr.dev, @@ -502,7 +121,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); if (fsck_err_on(!g->gen_valid, - c, ptr_to_missing_alloc_key, + trans, ptr_to_missing_alloc_key, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -519,7 +138,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, } if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - c, ptr_gen_newer_than_bucket_gen, + trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -533,6 +152,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, g->gen_valid = true; g->gen = p.ptr.gen; g->data_type = 0; + g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; } else { @@ -541,7 +161,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, } if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - c, ptr_gen_newer_than_bucket_gen, + trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, @@ -552,7 +172,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, *do_update = true; if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - c, stale_dirty_ptr, + trans, stale_dirty_ptr, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -566,7 +186,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, goto out; if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), - c, ptr_bucket_data_type_mismatch, + trans, ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, @@ -578,6 +198,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, g->gen_valid = true; g->gen = p.ptr.gen; g->data_type = data_type; + g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; } else { @@ -589,7 +210,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); if (fsck_err_on(!m || !m->alive, - c, ptr_to_missing_stripe, + trans, ptr_to_missing_stripe, "pointer to nonexistent stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -598,7 +219,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, *do_update = true; if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), - c, ptr_to_incorrect_stripe, + trans, ptr_to_incorrect_stripe, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -766,8 +387,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, BUG_ON(!sectors); if (gen_after(ptr->gen, b_gen)) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + ptr_gen_newer_than_bucket_gen, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -780,8 +401,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { - bch2_fsck_err(c, 
FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_ptr_too_stale, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + ptr_too_stale, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -800,8 +421,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (b_gen != ptr->gen) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_stale_dirty_ptr, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -816,8 +437,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -831,8 +452,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if ((u64) *bucket_sectors + sectors > U32_MAX) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_bucket_sector_count_overflow, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + bucket_sector_count_overflow, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -855,47 +476,6 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, goto out; } -void bch2_trans_fs_usage_revert(struct btree_trans *trans, - struct replicas_delta_list *deltas) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage *dst; - struct replicas_delta *d, *top = (void *) deltas->d + deltas->used; - s64 added = 0; - unsigned i; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); - - /* revert changes: */ - for (d = deltas->d; d != top; d = replicas_delta_next(d)) { - switch (d->r.data_type) { - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - added += d->delta; - } - BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); - } - - dst->b.nr_inodes -= deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - added -= deltas->persistent_reserved[i]; - dst->b.reserved -= deltas->persistent_reserved[i]; - dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; - } - - if (added > 0) { - trans->disk_res->sectors += added; - this_cpu_add(*c->online_reserved, added); - } - - preempt_enable(); - percpu_up_read(&c->mark_lock); -} - void bch2_trans_account_disk_usage_change(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -904,8 +484,6 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) bool warn = false; percpu_down_read(&c->mark_lock); - preempt_disable(); - struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b; struct bch_fs_usage_base *src = &trans->fs_usage_delta; s64 added = src->btree + src->data + src->reserved; @@ -916,13 +494,13 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) */ s64 should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { - u64 old, new, v = atomic64_read(&c->sectors_available); + u64 old, new; + old = atomic64_read(&c->sectors_available); do { - old = v; new = max_t(s64, 0, old - 
should_not_have_added); - } while ((v = atomic64_cmpxchg(&c->sectors_available, - old, new)) != old); + } while (!atomic64_try_cmpxchg(&c->sectors_available, + &old, new)); added -= should_not_have_added; warn = true; @@ -933,13 +511,9 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) this_cpu_sub(*c->online_reserved, added); } - dst->hidden += src->hidden; - dst->btree += src->btree; - dst->data += src->data; - dst->cached += src->cached; - dst->reserved += src->reserved; - dst->nr_inodes += src->nr_inodes; - + preempt_disable(); + struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); + acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); preempt_enable(); percpu_up_read(&c->mark_lock); @@ -949,55 +523,18 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) should_not_have_added, disk_res_sectors); } -int bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) -{ - struct bch_fs *c = trans->c; - struct replicas_delta *d, *d2; - struct replicas_delta *top = (void *) deltas->d + deltas->used; - struct bch_fs_usage *dst; - unsigned i; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); - - for (d = deltas->d; d != top; d = replicas_delta_next(d)) - if (__update_replicas(c, dst, &d->r, d->delta)) - goto need_mark; - - dst->b.nr_inodes += deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - dst->b.reserved += deltas->persistent_reserved[i]; - dst->persistent_reserved[i] += deltas->persistent_reserved[i]; - } - - preempt_enable(); - percpu_up_read(&c->mark_lock); - return 0; -need_mark: - /* revert changes: */ - for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) - BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); - - preempt_enable(); - percpu_up_read(&c->mark_lock); - return -1; -} - /* KEY_TYPE_extent: */ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, - const struct bch_extent_ptr *ptr, + const struct extent_ptr_decoded *p, s64 sectors, enum bch_data_type ptr_data_type, struct bch_alloc_v4 *a) { - u32 *dst_sectors = !ptr->cached - ? &a->dirty_sectors - : &a->cached_sectors; - int ret = bch2_bucket_ref_update(trans, ca, k, ptr, sectors, ptr_data_type, + u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : + !p->ptr.cached ? &a->dirty_sectors : + &a->cached_sectors; + int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type, a->gen, a->data_type, dst_sectors); if (ret) @@ -1032,9 +569,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans, *sectors = insert ? 
bp.bucket_len : -((s64) bp.bucket_len); if (flags & BTREE_TRIGGER_transactional) { - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket); + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &a->v); + __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v); if (ret) goto err; @@ -1057,14 +594,14 @@ static int bch2_trigger_pointer(struct btree_trans *trans, bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new); - if (!ret) { - alloc_to_bucket(g, new); - bch2_dev_usage_update(c, ca, &old, &new, 0, true); - } + ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new); + alloc_to_bucket(g, new); bucket_unlock(g); err_unlock: percpu_up_read(&c->mark_lock); + + if (!ret) + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } err: bch2_dev_put(ca); @@ -1104,10 +641,12 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - struct bch_replicas_padded r; - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); - r.e.data_type = data_type; - ret = bch2_update_replicas_list(trans, &r.e, sectors); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = data_type; + ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1116,8 +655,6 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, if (flags & BTREE_TRIGGER_gc) { struct bch_fs *c = trans->c; - BUG_ON(!(flags & BTREE_TRIGGER_gc)); - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", @@ -1140,11 +677,16 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, m->block_sectors[p.ec.block] += sectors; - struct bch_replicas_padded r = m->r; + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e)); mutex_unlock(&c->ec_stripes_heap_lock); - r.e.data_type = data_type; - bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + acc.replicas.data_type = data_type; + int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, true); + if (ret) + return ret; } return 0; @@ -1156,20 +698,26 @@ static int __trigger_extent(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { bool gc = flags & BTREE_TRIGGER_gc; - struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - struct bch_replicas_padded r; enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? 
BCH_DATA_btree : BCH_DATA_user; s64 replicas_sectors = 0; int ret = 0; - r.e.data_type = data_type; - r.e.nr_devs = 0; - r.e.nr_required = 1; + struct disk_accounting_pos acc_replicas_key = { + .type = BCH_DISK_ACCOUNTING_replicas, + .replicas.data_type = data_type, + .replicas.nr_devs = 0, + .replicas.nr_required = 1, + }; + + struct disk_accounting_pos acct_compression_key = { + .type = BCH_DISK_ACCOUNTING_compression, + }; + u64 compression_acct[3] = { 1, 0, 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = 0; @@ -1179,19 +727,16 @@ static int __trigger_extent(struct btree_trans *trans, bool stale = ret > 0; + if (p.ptr.cached && stale) + continue; + if (p.ptr.cached) { - if (!stale) { - ret = !gc - ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors) - : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true); - bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors", - bch2_err_str(ret)); - if (ret) - return ret; - } + ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc); + if (ret) + return ret; } else if (!p.has_ec) { replicas_sectors += disk_sectors; - r.e.devs[r.e.nr_devs++] = p.ptr.dev; + acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev; } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) @@ -1202,21 +747,72 @@ static int __trigger_extent(struct btree_trans *trans, * if so they're not required for mounting if we have an * erasure coded pointer in this extent: */ - r.e.nr_required = 0; + acc_replicas_key.replicas.nr_required = 0; } - } - if (r.e.nr_devs) { - ret = !gc - ? bch2_update_replicas_list(trans, &r.e, replicas_sectors) - : bch2_update_replicas(c, k, &r.e, replicas_sectors, 0, true); - if (unlikely(ret && gc)) { - struct printbuf buf = PRINTBUF; + if (acct_compression_key.compression.type && + acct_compression_key.compression.type != p.crc.compression_type) { + if (flags & BTREE_TRIGGER_overwrite) + bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf); - printbuf_exit(&buf); + ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, + ARRAY_SIZE(compression_acct), gc); + if (ret) + return ret; + + compression_acct[0] = 1; + compression_acct[1] = 0; + compression_acct[2] = 0; + } + + acct_compression_key.compression.type = p.crc.compression_type; + if (p.crc.compression_type) { + compression_acct[1] += p.crc.uncompressed_size; + compression_acct[2] += p.crc.compressed_size; } + } + + if (acc_replicas_key.replicas.nr_devs) { + ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); + if (ret) + return ret; + } + + if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { + struct disk_accounting_pos acc_snapshot_key = { + .type = BCH_DISK_ACCOUNTING_snapshot, + .snapshot.id = k.k->p.snapshot, + }; + ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc); + if (ret) + return ret; + } + + if (acct_compression_key.compression.type) { + if (flags & BTREE_TRIGGER_overwrite) + bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); + + ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, + ARRAY_SIZE(compression_acct), gc); + if (ret) + return ret; + } + + if (level) { + struct disk_accounting_pos acc_btree_key = { + .type = BCH_DISK_ACCOUNTING_btree, + .btree.id = btree_id, + }; + ret = 
bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc); + if (ret) + return ret; + } + + if (bch2_bkey_rebalance_opts(k)) { + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_rebalance_work, + }; + ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc); if (ret) return ret; } @@ -1269,36 +865,18 @@ static int __trigger_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, enum btree_iter_update_trigger_flags flags) { - struct bch_fs *c = trans->c; - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - s64 sectors = (s64) k.k->size * replicas; - - if (flags & BTREE_TRIGGER_overwrite) - sectors = -sectors; - - if (flags & BTREE_TRIGGER_transactional) { - int ret = bch2_replicas_deltas_realloc(trans, 0); - if (ret) - return ret; + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { + s64 sectors = k.k->size; - struct replicas_delta_list *d = trans->fs_usage_deltas; - replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved)); + if (flags & BTREE_TRIGGER_overwrite) + sectors = -sectors; - d->persistent_reserved[replicas - 1] += sectors; - } - - if (flags & BTREE_TRIGGER_gc) { - percpu_down_read(&c->mark_lock); - preempt_disable(); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_persistent_reserved, + .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, + }; - struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc); - - replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved)); - fs_usage->b.reserved += sectors; - fs_usage->persistent_reserved[replicas - 1] += sectors; - - preempt_enable(); - percpu_up_read(&c->mark_lock); + return bch2_disk_accounting_mod(trans, &acc, §ors, 1, flags & BTREE_TRIGGER_gc); } return 0; @@ -1330,7 +908,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (a->v.data_type && type && a->v.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - BCH_FSCK_ERR_bucket_metadata_type_mismatch, + bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, @@ -1352,10 +930,13 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return ret; } -static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, +static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, u64 b, enum bch_data_type data_type, unsigned sectors, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + int ret = 0; + percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, b); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", @@ -1382,9 +963,10 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, g->data_type = data_type; g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); - bch2_dev_usage_update(c, ca, &old, &new, 0, true); + bucket_unlock(g); percpu_up_read(&c->mark_lock); - return 0; + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + return ret; err: bucket_unlock(g); err_unlock: @@ -1408,7 +990,7 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return 0; if (flags & BTREE_TRIGGER_gc) - return bch2_mark_metadata_bucket(trans->c, ca, b, type, sectors, flags); + return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags); else if (flags & 
BTREE_TRIGGER_transactional) return commit_do(trans, NULL, NULL, 0, __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); @@ -1523,7 +1105,7 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, u64 sectors, int flags) { struct bch_fs_pcpu *pcpu; - u64 old, v, get; + u64 old, get; s64 sectors_available; int ret; @@ -1534,17 +1116,16 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, if (sectors <= pcpu->sectors_available) goto out; - v = atomic64_read(&c->sectors_available); + old = atomic64_read(&c->sectors_available); do { - old = v; get = min((u64) sectors + SECTORS_CACHE, old); if (get < sectors) { preempt_enable(); goto recalculate; } - } while ((v = atomic64_cmpxchg(&c->sectors_available, - old, old - get)) != old); + } while (!atomic64_try_cmpxchg(&c->sectors_available, + &old, old - get)); pcpu->sectors_available += get; @@ -1636,7 +1217,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->nbuckets - bucket_gens->first_bucket; if (resize) { - down_write(&c->gc_lock); down_write(&ca->bucket_lock); percpu_down_write(&c->mark_lock); } @@ -1659,7 +1239,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (resize) { percpu_up_write(&c->mark_lock); up_write(&ca->bucket_lock); - up_write(&c->gc_lock); } ret = 0; @@ -1674,23 +1253,14 @@ void bch2_dev_buckets_free(struct bch_dev *ca) { kvfree(ca->buckets_nouse); kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); - - for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) - free_percpu(ca->usage[i]); - kfree(ca->usage_base); + free_percpu(ca->usage); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); - if (!ca->usage_base) + ca->usage = alloc_percpu(struct bch_dev_usage); + if (!ca->usage) return -BCH_ERR_ENOMEM_usage_init; - for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) { - ca->usage[i] = alloc_percpu(struct bch_dev_usage); - if (!ca->usage[i]) - return -BCH_ERR_ENOMEM_usage_init; - } - return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 80ee0be9793e6..4a14741b8b36a 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -85,7 +85,7 @@ static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) return rcu_dereference_check(ca->buckets_gc, !ca->fs || percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->fs->state_lock) || lockdep_is_held(&ca->bucket_lock)); } @@ -103,7 +103,7 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) return rcu_dereference_check(ca->bucket_gens, !ca->fs || percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->fs->state_lock) || lockdep_is_held(&ca->bucket_lock)); } @@ -204,7 +204,6 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) return ret; } -void bch2_dev_usage_init(struct bch_dev *); void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) @@ -266,73 +265,14 @@ static inline u64 dev_buckets_available(struct bch_dev *ca, /* Filesystem usage: */ -static inline unsigned __fs_usage_u64s(unsigned nr_replicas) -{ - return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas; -} - -static inline unsigned fs_usage_u64s(struct 
bch_fs *c) -{ - return __fs_usage_u64s(READ_ONCE(c->replicas.nr)); -} - -static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas) -{ - return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas; -} - -static inline unsigned fs_usage_online_u64s(struct bch_fs *c) -{ - return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr)); -} - static inline unsigned dev_usage_u64s(void) { return sizeof(struct bch_dev_usage) / sizeof(u64); } -u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); - -struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); - -void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); - -void bch2_fs_usage_to_text(struct printbuf *, - struct bch_fs *, struct bch_fs_usage_online *); - -u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); - struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); -void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, - const struct bch_alloc_v4 *, - const struct bch_alloc_v4 *, u64, bool); - -/* key/bucket marking: */ - -static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, - unsigned journal_seq, - bool gc) -{ - percpu_rwsem_assert_held(&c->mark_lock); - BUG_ON(!gc && !journal_seq); - - return this_cpu_ptr(gc - ? c->usage_gc - : c->usage[journal_seq & JOURNAL_BUF_MASK]); -} - -int bch2_update_replicas(struct bch_fs *, struct bkey_s_c, - struct bch_replicas_entry_v1 *, s64, - unsigned, bool); -int bch2_update_replicas_list(struct btree_trans *, - struct bch_replicas_entry_v1 *, s64); -int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64); -int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); - -void bch2_fs_usage_initialize(struct bch_fs *); - int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *, struct bkey_s_c, const struct bch_extent_ptr *, s64, enum bch_data_type, u8, u8, u32 *); @@ -361,9 +301,6 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, void bch2_trans_account_disk_usage_change(struct btree_trans *); -void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); -int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); - int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64, enum bch_data_type, unsigned, enum btree_iter_update_trigger_flags); @@ -424,13 +361,13 @@ static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reserv #ifdef __KERNEL__ u64 old, new; + old = this_cpu_read(c->pcpu->sectors_available); do { - old = this_cpu_read(c->pcpu->sectors_available); if (sectors > old) return __bch2_disk_reservation_add(c, res, sectors, flags); new = old - sectors; - } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old); + } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new)); this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index f636e17c4cafe..c9698cdf866f9 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -16,7 +16,8 @@ struct bucket { u32 stripe; u32 dirty_sectors; u32 cached_sectors; -}; + u32 stripe_sectors; +} __aligned(sizeof(long)); struct bucket_array { struct rcu_head rcu; @@ -35,7 +36,7 @@ struct bucket_gens { }; struct bch_dev_usage { - struct { + struct bch_dev_usage_type { u64 buckets; u64 sectors; /* _compressed_ sectors: */ /* @@ -56,18 +57,6 @@ struct bch_fs_usage_base { u64 nr_inodes; }; -struct bch_fs_usage { - /* all 
fields are in units of 512 byte sectors: */ - struct bch_fs_usage_base b; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - u64 replicas[]; -}; - -struct bch_fs_usage_online { - u64 online_reserved; - struct bch_fs_usage u; -}; - struct bch_fs_usage_short { u64 capacity; u64 used; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 6d82e1165adc2..72ade3664d7b0 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -5,6 +5,7 @@ #include "bcachefs_ioctl.h" #include "buckets.h" #include "chardev.h" +#include "disk_accounting.h" #include "journal.h" #include "move.h" #include "recovery_passes.h" @@ -213,9 +214,21 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + char *ro, *rest; + + /* + * If passed a "read_only" mount option, remove it because it is + * no longer a valid mount option, and the filesystem will be + * set "read_only" regardless. + */ + ro = strstr(optstr, "read_only"); + if (ro) { + rest = ro + strlen("read_only"); + memmove(ro, rest, strlen(rest) + 1); + } ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(NULL, &thr->opts, optstr); + bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); if (!IS_ERR(optstr)) kfree(optstr); @@ -224,6 +237,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a } opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + opt_set(thr->opts, read_only, 1); /* We need request_key() to be called before we punt to kthread: */ opt_set(thr->opts, nostart, true); @@ -503,11 +517,9 @@ static long bch2_ioctl_data(struct bch_fs *c, static long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_ioctl_fs_usage __user *user_arg) { - struct bch_ioctl_fs_usage *arg = NULL; - struct bch_replicas_usage *dst_e, *dst_end; - struct bch_fs_usage_online *src; + struct bch_ioctl_fs_usage arg = {}; + darray_char replicas = {}; u32 replica_entries_bytes; - unsigned i; int ret = 0; if (!test_bit(BCH_FS_started, &c->flags)) @@ -516,62 +528,60 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) return -EFAULT; - arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL); - if (!arg) - return -ENOMEM; - - src = bch2_fs_usage_read(c); - if (!src) { - ret = -ENOMEM; + ret = bch2_fs_replicas_usage_read(c, &replicas) ?: + (replica_entries_bytes < replicas.nr ? 
-ERANGE : 0) ?: + copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr); + if (ret) goto err; - } - arg->capacity = c->capacity; - arg->used = bch2_fs_sectors_used(c, src); - arg->online_reserved = src->online_reserved; + struct bch_fs_usage_short u = bch2_fs_usage_read_short(c); + arg.capacity = c->capacity; + arg.used = u.used; + arg.online_reserved = percpu_u64_get(c->online_reserved); + arg.replica_entries_bytes = replicas.nr; - for (i = 0; i < BCH_REPLICAS_MAX; i++) - arg->persistent_reserved[i] = src->u.persistent_reserved[i]; - - dst_e = arg->replicas; - dst_end = (void *) arg->replicas + replica_entries_bytes; - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *src_e = - cpu_replicas_entry(&c->replicas, i); - - /* check that we have enough space for one replicas entry */ - if (dst_e + 1 > dst_end) { - ret = -ERANGE; - break; - } - - dst_e->sectors = src->u.replicas[i]; - dst_e->r = *src_e; - - /* recheck after setting nr_devs: */ - if (replicas_usage_next(dst_e) > dst_end) { - ret = -ERANGE; - break; - } - - memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); + for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { + struct disk_accounting_pos k = { + .type = BCH_DISK_ACCOUNTING_persistent_reserved, + .persistent_reserved.nr_replicas = i, + }; - dst_e = replicas_usage_next(dst_e); + bch2_accounting_mem_read(c, + disk_accounting_pos_to_bpos(&k), + &arg.persistent_reserved[i], 1); } - arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; + ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); +err: + darray_exit(&replicas); + return ret; +} - percpu_up_read(&c->mark_lock); - kfree(src); +static long bch2_ioctl_query_accounting(struct bch_fs *c, + struct bch_ioctl_query_accounting __user *user_arg) +{ + struct bch_ioctl_query_accounting arg; + darray_char accounting = {}; + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return -EINVAL; + ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?: + bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?: + (arg.accounting_u64s * sizeof(u64) < accounting.nr ? 
-ERANGE : 0) ?: + copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr); if (ret) goto err; - ret = copy_to_user_errcode(user_arg, arg, - sizeof(*arg) + arg->replica_entries_bytes); + arg.capacity = c->capacity; + arg.used = bch2_fs_usage_read_short(c).used; + arg.online_reserved = percpu_u64_get(c->online_reserved); + arg.accounting_u64s = accounting.nr / sizeof(u64); + + ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); err: - kfree(arg); + darray_exit(&accounting); return ret; } @@ -606,7 +616,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.bucket_size = ca->mi.bucket_size; arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - for (i = 0; i < BCH_DATA_NR; i++) { + for (i = 0; i < ARRAY_SIZE(arg.d); i++) { arg.d[i].buckets = src.d[i].buckets; arg.d[i].sectors = src.d[i].sectors; arg.d[i].fragmented = src.d[i].fragmented; @@ -851,7 +861,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, optstr); + bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); if (!IS_ERR(optstr)) kfree(optstr); @@ -928,6 +938,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); case BCH_IOCTL_FSCK_ONLINE: BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); + case BCH_IOCTL_QUERY_ACCOUNTING: + return bch2_ioctl_query_accounting(c, arg); default: return -ENOTTY; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index c67460d8205db..d743da89308ef 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -534,6 +534,14 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target) { struct qstr name = bch2_dirent_get_name(d); + /* + * Although not required by the kernel code, updating ctx->pos is needed + * for the bcachefs FUSE driver. Without this update, the FUSE + * implementation will be stuck in an infinite loop when reading + * directories (via the bcachefs_fuse_readdir callback). + * In kernel space, ctx->pos is updated by the VFS code. + */ + ctx->pos = d.k->p.offset; bool ret = dir_emit(ctx, name.name, name.len, target.inum, diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c new file mode 100644 index 0000000000000..3703745917242 --- /dev/null +++ b/fs/bcachefs/disk_accounting.c @@ -0,0 +1,788 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bcachefs_ioctl.h" +#include "btree_cache.h" +#include "btree_journal_iter.h" +#include "btree_update.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "compress.h" +#include "disk_accounting.h" +#include "error.h" +#include "journal_io.h" +#include "replicas.h" + +/* + * Notes on disk accounting: + * + * We have two parallel sets of counters to be concerned with, and both must be + * kept in sync. + * + * - Persistent/on disk accounting, stored in the accounting btree and updated + * via btree write buffer updates that treat new accounting keys as deltas to + * apply to existing values. But reading from a write buffer btree is + * expensive, so we also have + * + * - In memory accounting, where accounting is stored as an array of percpu + * counters, indexed by an eytzinger array of disk acounting keys/bpos (which + * are the same thing, excepting byte swabbing on big endian). 
+ * + * Cheap to read, but non persistent. + * + * Disk accounting updates are generated by transactional triggers; these run as + * keys enter and leave the btree, and can compare old and new versions of keys; + * the output of these triggers are deltas to the various counters. + * + * Disk accounting updates are done as btree write buffer updates, where the + * counters in the disk accounting key are deltas that will be applied to the + * counter in the btree when the key is flushed by the write buffer (or journal + * replay). + * + * To do a disk accounting update: + * - initialize a disk_accounting_pos, to specify which counter is being updated + * - initialize counter deltas, as an array of 1-3 s64s + * - call bch2_disk_accounting_mod() + * + * This queues up the accounting update to be done at transaction commit time. + * Underneath, it's a normal btree write buffer update. + * + * The transaction commit path is responsible for propagating updates to the in + * memory counters, with bch2_accounting_mem_mod(). + * + * The commit path also assigns every disk accounting update a unique version + * number, based on the journal sequence number and offset within that journal + * buffer; this is used by journal replay to determine which updates have been + * done. + * + * The transaction commit path also ensures that replicas entry accounting + * updates are properly marked in the superblock (so that we know whether we can + * mount without data being unavailable); it will update the superblock if + * bch2_accounting_mem_mod() tells it to. + */ + +static const char * const disk_accounting_type_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_DISK_ACCOUNTING_TYPES() +#undef x + NULL +}; + +static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, + s64 *d, unsigned nr) +{ + struct bkey_i_accounting *acc = bkey_accounting_init(k); + + acc->k.p = disk_accounting_pos_to_bpos(pos); + set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); + + memcpy_u64s_small(acc->v.d, d, nr); +} + +int bch2_disk_accounting_mod(struct btree_trans *trans, + struct disk_accounting_pos *k, + s64 *d, unsigned nr, bool gc) +{ + /* Normalize: */ + switch (k->type) { + case BCH_DISK_ACCOUNTING_replicas: + bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp); + break; + } + + BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + + accounting_key_init(&k_i.k, k, d, nr); + + return likely(!gc) + ? 
bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k) + : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); +} + +int bch2_mod_dev_cached_sectors(struct btree_trans *trans, + unsigned dev, s64 sectors, + bool gc) +{ + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + + bch2_replicas_entry_cached(&acc.replicas, dev); + + return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); +} + +int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags, + struct printbuf *err) +{ + return 0; +} + +void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) +{ + if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { + prt_printf(out, "unknown type %u", k->type); + return; + } + + prt_str(out, disk_accounting_type_strs[k->type]); + prt_str(out, " "); + + switch (k->type) { + case BCH_DISK_ACCOUNTING_nr_inodes: + break; + case BCH_DISK_ACCOUNTING_persistent_reserved: + prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas); + break; + case BCH_DISK_ACCOUNTING_replicas: + bch2_replicas_entry_to_text(out, &k->replicas); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); + bch2_prt_data_type(out, k->dev_data_type.data_type); + break; + case BCH_DISK_ACCOUNTING_compression: + bch2_prt_compression_type(out, k->compression.type); + break; + case BCH_DISK_ACCOUNTING_snapshot: + prt_printf(out, "id=%u", k->snapshot.id); + break; + case BCH_DISK_ACCOUNTING_btree: + prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id)); + break; + } +} + +void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k); + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + bch2_accounting_key_to_text(out, &acc_k); + + for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) + prt_printf(out, " %lli", acc.v->d[i]); +} + +void bch2_accounting_swab(struct bkey_s k) +{ + for (u64 *p = (u64 *) k.v; + p < (u64 *) bkey_val_end(k); + p++) + *p = swab64(*p); +} + +static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) +{ + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, p); + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_replicas: + unsafe_memcpy(r, &acc_k.replicas, + replicas_entry_bytes(&acc_k.replicas), + "variable length struct"); + return true; + default: + return false; + } +} + +static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) +{ + struct bch_replicas_padded r; + return accounting_to_replicas(&r.e, p) + ? bch2_mark_replicas(c, &r.e) + : 0; +} + +/* + * Ensure accounting keys being updated are present in the superblock, when + * applicable (i.e. 
replicas updates) + */ +int bch2_accounting_update_sb(struct btree_trans *trans) +{ + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) + if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { + int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); + if (ret) + return ret; + } + + return 0; +} + +static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a) +{ + struct bch_accounting_mem *acc = &c->accounting; + + /* raced with another insert, already present: */ + if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p) < acc->k.nr) + return 0; + + struct accounting_mem_entry n = { + .pos = a.k->p, + .version = a.k->version, + .nr_counters = bch2_accounting_counters(a.k), + .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL), + }; + + if (!n.v[0]) + goto err; + + if (acc->gc_running) { + n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!n.v[1]) + goto err; + } + + if (darray_push(&acc->k, n)) + goto err; + + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + return 0; +err: + free_percpu(n.v[1]); + free_percpu(n.v[0]); + return -BCH_ERR_ENOMEM_disk_accounting; +} + +int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) +{ + struct bch_replicas_padded r; + + if (accounting_to_replicas(&r.e, a.k->p) && + !bch2_replicas_marked_locked(c, &r.e)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + percpu_up_read(&c->mark_lock); + percpu_down_write(&c->mark_lock); + int ret = __bch2_accounting_mem_insert(c, a); + percpu_up_write(&c->mark_lock); + percpu_down_read(&c->mark_lock); + return ret; +} + +static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) +{ + for (unsigned i = 0; i < e->nr_counters; i++) + if (percpu_u64_get(e->v[0] + i) || + (e->v[1] && + percpu_u64_get(e->v[1] + i))) + return false; + return true; +} + +void bch2_accounting_mem_gc(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + + percpu_down_write(&c->mark_lock); + struct accounting_mem_entry *dst = acc->k.data; + + darray_for_each(acc->k, src) { + if (accounting_mem_entry_is_zero(src)) { + free_percpu(src->v[0]); + free_percpu(src->v[1]); + } else { + *dst++ = *src; + } + } + + acc->k.nr = dst - acc->k.data; + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + percpu_up_write(&c->mark_lock); +} + +/* + * Read out accounting keys for replicas entries, as an array of + * bch_replicas_usage entries. + * + * Note: this may be deprecated/removed at smoe point in the future and replaced + * with something more general, it exists to support the ioctl used by the + * 'bcachefs fs usage' command. 
+ */ +int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) +{ + struct bch_accounting_mem *acc = &c->accounting; + int ret = 0; + + darray_init(usage); + + percpu_down_read(&c->mark_lock); + darray_for_each(acc->k, i) { + struct { + struct bch_replicas_usage r; + u8 pad[BCH_BKEY_PTRS_MAX]; + } u; + + if (!accounting_to_replicas(&u.r.r, i->pos)) + continue; + + u64 sectors; + bch2_accounting_mem_read_counters(acc, i - acc->k.data, §ors, 1, false); + u.r.sectors = sectors; + + ret = darray_make_room(usage, replicas_usage_bytes(&u.r)); + if (ret) + break; + + memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r)); + usage->nr += replicas_usage_bytes(&u.r); + } + percpu_up_read(&c->mark_lock); + + if (ret) + darray_exit(usage); + return ret; +} + +int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask) +{ + + struct bch_accounting_mem *acc = &c->accounting; + int ret = 0; + + darray_init(out_buf); + + percpu_down_read(&c->mark_lock); + darray_for_each(acc->k, i) { + struct disk_accounting_pos a_p; + bpos_to_disk_accounting_pos(&a_p, i->pos); + + if (!(accounting_types_mask & BIT(a_p.type))) + continue; + + ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) + + sizeof(u64) * i->nr_counters); + if (ret) + break; + + struct bkey_i_accounting *a_out = + bkey_accounting_init((void *) &darray_top(*out_buf)); + set_bkey_val_u64s(&a_out->k, i->nr_counters); + a_out->k.p = i->pos; + bch2_accounting_mem_read_counters(acc, i - acc->k.data, + a_out->v.d, i->nr_counters, false); + + if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out))) + out_buf->nr += bkey_bytes(&a_out->k); + } + + percpu_up_read(&c->mark_lock); + + if (ret) + darray_exit(out_buf); + return ret; +} + +void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + + percpu_down_read(&c->mark_lock); + out->atomic++; + + eytzinger0_for_each(i, acc->k.nr) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos); + + bch2_accounting_key_to_text(out, &acc_k); + + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); + + prt_str(out, ":"); + for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) + prt_printf(out, " %llu", v[j]); + prt_newline(out); + } + + --out->atomic; + percpu_up_read(&c->mark_lock); +} + +static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) +{ + darray_for_each(acc->k, e) { + free_percpu(e->v[gc]); + e->v[gc] = NULL; + } +} + +int bch2_gc_accounting_start(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + int ret = 0; + + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) { + e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!e->v[1]) { + bch2_accounting_free_counters(acc, true); + ret = -BCH_ERR_ENOMEM_disk_accounting; + break; + } + } + + acc->gc_running = !ret; + percpu_up_write(&c->mark_lock); + + return ret; +} + +int bch2_gc_accounting_done(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + struct bpos pos = POS_MIN; + int ret = 0; + + percpu_down_write(&c->mark_lock); + while (1) { + unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &pos); + + if (idx >= acc->k.nr) + break; + + struct accounting_mem_entry *e = 
acc->k.data + idx; + pos = bpos_successor(e->pos); + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, e->pos); + + u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; + u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; + + unsigned nr = e->nr_counters; + bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); + bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); + + if (memcmp(dst_v, src_v, nr * sizeof(u64))) { + printbuf_reset(&buf); + prt_str(&buf, "accounting mismatch for "); + bch2_accounting_key_to_text(&buf, &acc_k); + + prt_str(&buf, ": got"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", dst_v[j]); + + prt_str(&buf, " should be"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", src_v[j]); + + for (unsigned j = 0; j < nr; j++) + src_v[j] -= dst_v[j]; + + if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { + percpu_up_write(&c->mark_lock); + ret = commit_do(trans, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); + percpu_down_write(&c->mark_lock); + if (ret) + goto err; + + if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + + accounting_key_init(&k_i.k, &acc_k, src_v, nr); + bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false); + + preempt_disable(); + struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); + struct bch_fs_usage_base *src = &trans->fs_usage_delta; + acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); + preempt_enable(); + } + } + } + } +err: +fsck_err: + percpu_up_write(&c->mark_lock); + printbuf_exit(&buf); + bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; +} + +static int accounting_read_key(struct bch_fs *c, struct btree_trans *trans, struct bkey_s_c k) +{ + struct printbuf buf = PRINTBUF; + + if (k.k->type != KEY_TYPE_accounting) + return 0; + + percpu_down_read(&c->mark_lock); + int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k), false); + percpu_up_read(&c->mark_lock); + + if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) && + ret == -BCH_ERR_btree_insert_need_mark_replicas) + ret = 0; + + struct disk_accounting_pos acc; + bpos_to_disk_accounting_pos(&acc, k.k->p); + + if (fsck_err_on(ret == -BCH_ERR_btree_insert_need_mark_replicas, + trans, accounting_replicas_not_marked, + "accounting not marked in superblock replicas\n %s", + (bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) + ret = bch2_accounting_update_sb_one(c, k.k->p); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * At startup time, initialize the in memory accounting from the btree (and + * journal) + */ +int bch2_accounting_read(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, + BTREE_ID_accounting, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); + accounting_read_key(c, trans, k); + }))); + if (ret) + goto err; + + struct journal_keys *keys = &c->journal_keys; + struct journal_key *dst = keys->data; + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) { + if (i->k->k.type == KEY_TYPE_accounting) { + struct bkey_s_c k = bkey_i_to_s_c(i->k); + unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, + sizeof(acc->k.data[0]), + accounting_pos_cmp, &k.k->p); + + bool applied = idx < 
acc->k.nr && + bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0; + + if (applied) + continue; + + if (i + 1 < &darray_top(*keys) && + i[1].k->k.type == KEY_TYPE_accounting && + !journal_key_cmp(i, i + 1)) { + BUG_ON(bversion_cmp(i[0].k->k.version, i[1].k->k.version) >= 0); + + i[1].journal_seq = i[0].journal_seq; + + bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k), + bkey_s_c_to_accounting(k)); + continue; + } + + ret = accounting_read_key(c, NULL, k); + if (ret) + goto err; + } + + *dst++ = *i; + } + keys->gap = keys->nr = dst - keys->data; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); + + for (unsigned i = 0; i < acc->k.nr; i++) { + struct disk_accounting_pos k; + bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); + + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); + + switch (k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + usage->reserved += v[0] * k.persistent_reserved.nr_replicas; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); + if (ca) { + struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; + percpu_u64_set(&d->buckets, v[0]); + percpu_u64_set(&d->sectors, v[1]); + percpu_u64_set(&d->fragmented, v[2]); + + if (k.dev_data_type.data_type == BCH_DATA_sb || + k.dev_data_type.data_type == BCH_DATA_journal) + usage->hidden += v[0] * ca->mi.bucket_size; + } + rcu_read_unlock(); + break; + } + } + preempt_enable(); + percpu_up_read(&c->mark_lock); +err: + bch_err_fn(c, ret); + return ret; +} + +int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) +{ + return bch2_trans_run(c, + bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, + BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ + struct disk_accounting_pos acc; + bpos_to_disk_accounting_pos(&acc, k.k->p); + + acc.type == BCH_DISK_ACCOUNTING_dev_data_type && + acc.dev_data_type.dev == dev + ? 
bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) + : 0; + })) ?: + bch2_btree_write_buffer_flush_sync(trans)); +} + +int bch2_dev_usage_init(struct bch_dev *ca, bool gc) +{ + struct bch_fs *c = ca->fs; + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_dev_data_type, + .dev_data_type.dev = ca->dev_idx, + .dev_data_type.data_type = BCH_DATA_free, + }; + u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; + + int ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc)); + bch_err_fn(c, ret); + return ret; +} + +void bch2_verify_accounting_clean(struct bch_fs *c) +{ + bool mismatch = false; + struct bch_fs_usage_base base = {}, base_inmem = {}; + + bch2_trans_run(c, + for_each_btree_key(trans, iter, + BTREE_ID_accounting, POS_MIN, + BTREE_ITER_all_snapshots, k, ({ + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); + unsigned nr = bch2_accounting_counters(k.k); + + bch2_accounting_mem_read(c, k.k->p, v, nr); + + if (memcmp(a.v->d, v, nr * sizeof(u64))) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, " !="); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", v[j]); + + pr_err("%s", buf.buf); + printbuf_exit(&buf); + mismatch = true; + } + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: { + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + if (!ca) { + rcu_read_unlock(); + continue; + } + + v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); + v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); + v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); + rcu_read_unlock(); + + if (memcmp(a.v->d, v, 3 * sizeof(u64))) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, " in mem"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", v[j]); + + pr_err("dev accounting mismatch: %s", buf.buf); + printbuf_exit(&buf); + mismatch = true; + } + } + } + + 0; + }))); + + acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64)); + +#define check(x) \ + if (base.x != base_inmem.x) { \ + pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \ + mismatch = true; \ + } + + //check(hidden); + check(btree); + check(data); + check(cached); + check(reserved); + check(nr_inodes); + + WARN_ON(mismatch); +} + +void bch2_accounting_gc_free(struct bch_fs *c) +{ + lockdep_assert_held(&c->mark_lock); + + struct bch_accounting_mem *acc = &c->accounting; + + bch2_accounting_free_counters(acc, true); + acc->gc_running = false; +} + +void bch2_fs_accounting_exit(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + + bch2_accounting_free_counters(acc, false); + darray_exit(&acc->k); +} diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h new file mode 100644 index 0000000000000..3d3f25e08b696 --- /dev/null +++ b/fs/bcachefs/disk_accounting.h @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_H +#define 
_BCACHEFS_DISK_ACCOUNTING_H + +#include "eytzinger.h" +#include "sb-members.h" + +static inline void bch2_u64s_neg(u64 *v, unsigned nr) +{ + for (unsigned i = 0; i < nr; i++) + v[i] = -v[i]; +} + +static inline unsigned bch2_accounting_counters(const struct bkey *k) +{ + return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64); +} + +static inline void bch2_accounting_neg(struct bkey_s_accounting a) +{ + bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k)); +} + +static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) +{ + for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + if (a.v->d[i]) + return false; + return true; +} + +static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, + struct bkey_s_c_accounting src) +{ + EBUG_ON(dst->k.u64s != src.k->u64s); + + for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) + dst->v.d[i] += src.v->d[i]; + if (bversion_cmp(dst->k.version, src.k->version) < 0) + dst->k.version = src.k->version; +} + +static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, + enum bch_data_type data_type, + s64 sectors) +{ + switch (data_type) { + case BCH_DATA_btree: + fs_usage->btree += sectors; + break; + case BCH_DATA_user: + case BCH_DATA_parity: + fs_usage->data += sectors; + break; + case BCH_DATA_cached: + fs_usage->cached += sectors; + break; + default: + break; + } +} + +static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) +{ + acc->_pad = p; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + bch2_bpos_swab(&acc->_pad); +#endif +} + +static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k) +{ + struct bpos ret = k->_pad; + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + bch2_bpos_swab(&ret); +#endif + return ret; +} + +int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, + s64 *, unsigned, bool); +int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); + +int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags, struct printbuf *); +void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); +void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_accounting_swab(struct bkey_s); + +#define bch2_bkey_ops_accounting ((struct bkey_ops) { \ + .key_invalid = bch2_accounting_invalid, \ + .val_to_text = bch2_accounting_to_text, \ + .swab = bch2_accounting_swab, \ + .min_val_size = 8, \ +}) + +int bch2_accounting_update_sb(struct btree_trans *); + +static inline int accounting_pos_cmp(const void *_l, const void *_r) +{ + const struct bpos *l = _l, *r = _r; + + return bpos_cmp(*l, *r); +} + +int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); +void bch2_accounting_mem_gc(struct bch_fs *); + +static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) +{ + struct bch_accounting_mem *acc = &c->accounting; + unsigned idx; + + EBUG_ON(gc && !acc->gc_running); + + while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { + int ret = bch2_accounting_mem_insert(c, a, gc); + if (ret) + return ret; + } + + struct accounting_mem_entry *e = &acc->k.data[idx]; + + EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); + + for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + this_cpu_add(e->v[gc][i], a.v->d[i]); + return 0; +} + 
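The helper above applies a delta to the in-memory percpu counters; the same delta is what bch2_disk_accounting_mod() queues as a KEY_TYPE_accounting update, following the three steps listed in the notes in disk_accounting.c (fill in a disk_accounting_pos, build an array of 1-3 s64 deltas, call bch2_disk_accounting_mod()). A minimal caller-side sketch, modelled on bch2_mod_dev_cached_sectors() and the stripe trigger in this series; the wrapper name and the way the delta is derived are illustrative only, not part of the patch:

#include "bcachefs.h"
#include "disk_accounting.h"
#include "replicas.h"

/*
 * Illustrative sketch only: queue a replicas-counter delta from a
 * transactional trigger.  example_mod_replicas() is a made-up wrapper; the
 * calling convention matches bch2_mod_dev_cached_sectors() above.
 */
static int example_mod_replicas(struct btree_trans *trans,
				struct bkey_s_c k, s64 sectors, bool gc)
{
	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_replicas,
	};

	/* describe which counter the delta applies to: */
	bch2_bkey_to_replicas(&acc.replicas, k);

	/*
	 * Queue the delta; at commit time it becomes a btree write buffer
	 * update (or, for gc, goes straight to the gc copy of the counters):
	 */
	return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
}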
+/* + * Update in memory counters so they match the btree update we're doing; called + * from transaction commit path + */ +static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) +{ + struct bch_fs *c = trans->c; + + if (!gc) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + if (ca) { + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); + } + rcu_read_unlock(); + break; + } + } + + return __bch2_accounting_mem_mod(c, a, gc); +} + +static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) +{ + percpu_down_read(&trans->c->mark_lock); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc); + percpu_up_read(&trans->c->mark_lock); + return ret; +} + +static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc, + unsigned idx, u64 *v, unsigned nr, bool gc) +{ + memset(v, 0, sizeof(*v) * nr); + + if (unlikely(idx >= acc->k.nr)) + return; + + struct accounting_mem_entry *e = &acc->k.data[idx]; + + nr = min_t(unsigned, nr, e->nr_counters); + + for (unsigned i = 0; i < nr; i++) + v[i] = percpu_u64_get(e->v[gc] + i); +} + +static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, + u64 *v, unsigned nr) +{ + struct bch_accounting_mem *acc = &c->accounting; + unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &p); + + bch2_accounting_mem_read_counters(acc, idx, v, nr, false); +} + +int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); +int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); +void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); + +int bch2_gc_accounting_start(struct bch_fs *); +int bch2_gc_accounting_done(struct bch_fs *); + +int bch2_accounting_read(struct bch_fs *); + +int bch2_dev_usage_remove(struct bch_fs *, unsigned); +int bch2_dev_usage_init(struct bch_dev *, bool); + +void bch2_verify_accounting_clean(struct bch_fs *c); + +void bch2_accounting_gc_free(struct bch_fs *); +void bch2_fs_accounting_exit(struct bch_fs *); + +#endif /* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h new file mode 100644 index 0000000000000..cba417060b333 --- /dev/null +++ b/fs/bcachefs/disk_accounting_format.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H +#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H + +#include "replicas_format.h" + +/* + * Disk accounting - KEY_TYPE_accounting - on disk format: + * + * Here, the key has considerably more structure than a typical key (bpos); an + * accounting key is 'struct disk_accounting_pos', which is a union of bpos. 
+ * + * More specifically: a key is just a multiword integer (where word endianness + * matches native byte order), so we're treating bpos as an opaque 20 byte + * integer and mapping bch_accounting_key to that. + * + * This is a type-tagged union of all our various subtypes; a disk accounting + * key can be device counters, replicas counters, et cetera - it's extensible. + * + * The value is a list of u64s or s64s; the number of counters is specific to a + * given accounting type. + * + * Unlike with other key types, updates are _deltas_, and the deltas are not + * resolved until the update to the underlying btree, done by btree write buffer + * flush or journal replay. + * + * Journal replay in particular requires special handling. The journal tracks a + * range of entries which may possibly have not been applied to the btree + * yet - it does not know definitively whether individual entries are dirty and + * still need to be applied. + * + * To handle this, we use the version field of struct bkey, and give every + * accounting update a unique version number - a total ordering in time; the + * version number is derived from the key's position in the journal. Then + * journal replay can compare the version number of the key from the journal + * with the version number of the key in the btree to determine if a key needs + * to be replayed. + * + * For this to work, we must maintain this strict time ordering of updates as + * they are flushed to the btree, both via write buffer flush and via journal + * replay. This has complications for the write buffer code while journal replay + * is still in progress; the write buffer cannot flush any accounting keys to + * the btree until journal replay has finished replaying its accounting keys, or + * the (newer) version number of the keys from the write buffer will cause + * updates from journal replay to be lost. 
+ */ + +struct bch_accounting { + struct bch_val v; + __u64 d[]; +}; + +#define BCH_ACCOUNTING_MAX_COUNTERS 3 + +#define BCH_DATA_TYPES() \ + x(free, 0) \ + x(sb, 1) \ + x(journal, 2) \ + x(btree, 3) \ + x(user, 4) \ + x(cached, 5) \ + x(parity, 6) \ + x(stripe, 7) \ + x(need_gc_gens, 8) \ + x(need_discard, 9) \ + x(unstriped, 10) + +enum bch_data_type { +#define x(t, n) BCH_DATA_##t, + BCH_DATA_TYPES() +#undef x + BCH_DATA_NR +}; + +static inline bool data_type_is_empty(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + return true; + default: + return false; + } +} + +static inline bool data_type_is_hidden(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_sb: + case BCH_DATA_journal: + return true; + default: + return false; + } +} + +#define BCH_DISK_ACCOUNTING_TYPES() \ + x(nr_inodes, 0) \ + x(persistent_reserved, 1) \ + x(replicas, 2) \ + x(dev_data_type, 3) \ + x(compression, 4) \ + x(snapshot, 5) \ + x(btree, 6) \ + x(rebalance_work, 7) + +enum disk_accounting_type { +#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, + BCH_DISK_ACCOUNTING_TYPES() +#undef x + BCH_DISK_ACCOUNTING_TYPE_NR, +}; + +struct bch_nr_inodes { +}; + +struct bch_persistent_reserved { + __u8 nr_replicas; +}; + +struct bch_dev_data_type { + __u8 dev; + __u8 data_type; +}; + +struct bch_dev_stripe_buckets { + __u8 dev; +}; + +struct bch_acct_compression { + __u8 type; +}; + +struct bch_acct_snapshot { + __u32 id; +}; + +struct bch_acct_btree { + __u32 id; +}; + +struct disk_accounting_pos { + union { + struct { + __u8 type; + union { + struct bch_nr_inodes nr_inodes; + struct bch_persistent_reserved persistent_reserved; + struct bch_replicas_entry_v1 replicas; + struct bch_dev_data_type dev_data_type; + struct bch_dev_stripe_buckets dev_stripe_buckets; + struct bch_acct_compression compression; + struct bch_acct_snapshot snapshot; + struct bch_acct_btree btree; + }; + }; + struct bpos _pad; + }; +}; + +#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */ diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h new file mode 100644 index 0000000000000..1687a45177a7f --- /dev/null +++ b/fs/bcachefs/disk_accounting_types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H +#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H + +#include "darray.h" + +struct accounting_mem_entry { + struct bpos pos; + struct bversion version; + unsigned nr_counters; + u64 __percpu *v[2]; +}; + +struct bch_accounting_mem { + DARRAY(struct accounting_mem_entry) k; + bool gc_running; +}; + +#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 521a86df5e52a..5df8de0b8c021 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -511,7 +511,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, return -EINVAL; if (!c) - return 0; + return -BCH_ERR_option_needs_open_fs; if (!strlen(val) || !strcmp(val, "none")) { *res = 0; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 83e279d41829d..86948d110f6bf 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -13,6 +13,7 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" +#include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "error.h" @@ -282,7 +283,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = - 
bch2_trans_start_alloc_update(trans, bucket); + bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); } @@ -300,13 +301,12 @@ static int mark_stripe_bucket(struct btree_trans *trans, bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); - if (!ret) { - alloc_to_bucket(g, new); - bch2_dev_usage_update(c, ca, &old, &new, 0, true); - } + alloc_to_bucket(g, new); bucket_unlock(g); err_unlock: percpu_up_read(&c->mark_lock); + if (!ret) + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } err: bch2_dev_put(ca); @@ -368,7 +368,12 @@ int bch2_trigger_stripe(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_check_repair)) return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags); - if (flags & BTREE_TRIGGER_transactional) { + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); + + + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { /* * If the pointers aren't changing, we don't need to do anything: */ @@ -379,26 +384,58 @@ int bch2_trigger_stripe(struct btree_trans *trans, new_s->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; - BUG_ON(new_s && old_s && - (new_s->nr_blocks != old_s->nr_blocks || - new_s->nr_redundant != old_s->nr_redundant)); + struct gc_stripe *gc = NULL; + if (flags & BTREE_TRIGGER_gc) { + gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + if (!gc) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); + return -BCH_ERR_ENOMEM_mark_stripe; + } + + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + * + * Also: when we bring back runtime gc, locking + */ + gc->alive = true; + gc->sectors = le16_to_cpu(new_s->sectors); + gc->nr_blocks = new_s->nr_blocks; + gc->nr_redundant = new_s->nr_redundant; + + for (unsigned i = 0; i < new_s->nr_blocks; i++) + gc->ptrs[i] = new_s->ptrs[i]; + + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(gc->block_sectors, 0, sizeof(gc->block_sectors)); + } if (new_s) { - s64 sectors = le16_to_cpu(new_s->sectors); + s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; - struct bch_replicas_padded r; - bch2_bkey_to_replicas(&r.e, new); - int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + bch2_bkey_to_replicas(&acc.replicas, new); + int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); if (ret) return ret; + + if (gc) + memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas)); } if (old_s) { - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant; - struct bch_replicas_padded r; - bch2_bkey_to_replicas(&r.e, old); - int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + bch2_bkey_to_replicas(&acc.replicas, old); + int ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); if (ret) return ret; } @@ -447,52 +484,6 @@ int bch2_trigger_stripe(struct btree_trans *trans, } } - if (flags & BTREE_TRIGGER_gc) { - struct gc_stripe *m = - genradix_ptr_alloc(&c->gc_stripes, idx, 
GFP_KERNEL); - - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - idx); - return -BCH_ERR_ENOMEM_mark_stripe; - } - /* - * This will be wrong when we bring back runtime gc: we should - * be unmarking the old key and then marking the new key - */ - m->alive = true; - m->sectors = le16_to_cpu(new_s->sectors); - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - - for (unsigned i = 0; i < new_s->nr_blocks; i++) - m->ptrs[i] = new_s->ptrs[i]; - - bch2_bkey_to_replicas(&m->r.e, new); - - /* - * gc recalculates this field from stripe ptr - * references: - */ - memset(m->block_sectors, 0, sizeof(m->block_sectors)); - - int ret = mark_stripe_buckets(trans, old, new, flags); - if (ret) - return ret; - - ret = bch2_update_replicas(c, new, &m->r.e, - ((s64) m->sectors * m->nr_redundant), - 0, true); - if (ret) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, new); - bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf); - printbuf_exit(&buf); - return ret; - } - } - return 0; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 58612abf7927a..a268af3e52bfd 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -257,7 +257,8 @@ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ x(0, need_inode_lock) \ - x(0, invalid_snapshot_node) + x(0, invalid_snapshot_node) \ + x(0, option_needs_open_fs) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index d95c40f1b6af8..a62b631088200 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_iter.h" #include "error.h" #include "journal.h" #include "recovery_passes.h" @@ -98,7 +99,7 @@ static enum ask_yn parse_yn_response(char *buf) } #ifdef __KERNEL__ -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) { struct stdio_redirect *stdio = c->stdio; @@ -108,25 +109,44 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) if (!stdio) return YN_NO; - char buf[100]; + if (trans) + bch2_trans_unlock(trans); + + unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0; + darray_char line = {}; int ret; do { + unsigned long t; bch2_print(c, " (y,n, or Y,N for all errors of this type) "); +rewait: + t = unlock_long_at + ? max_t(long, unlock_long_at - jiffies, 0) + : MAX_SCHEDULE_TIMEOUT; + + int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t); + if (r == -ETIME) { + bch2_trans_unlock_long(trans); + unlock_long_at = 0; + goto rewait; + } - int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); - if (r < 0) - return YN_NO; - buf[r] = '\0'; - } while ((ret = parse_yn_response(buf)) < 0); + if (r < 0) { + ret = YN_NO; + break; + } + + darray_last(line) = '\0'; + } while ((ret = parse_yn_response(line.data)) < 0); + darray_exit(&line); return ret; } #else #include "tools-util.h" -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) { char *buf = NULL; size_t buflen = 0; @@ -198,7 +218,8 @@ static const u8 fsck_flags_extra[] = { #undef x }; -int bch2_fsck_err(struct bch_fs *c, +int __bch2_fsck_err(struct bch_fs *c, + struct btree_trans *trans, enum bch_fsck_flags flags, enum bch_sb_error_id err, const char *fmt, ...) 
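With __bch2_fsck_err() now taking a btree_trans as well as a bch_fs, a check running inside a btree transaction can hand its transaction to the prompt path, which unlocks it (and, after about two seconds, drops long-held locks too) while blocked on the y/n answer, then relocks before returning. A minimal caller-side sketch; the function and message are illustrative, while accounting_mismatch and the fsck_err: label convention follow the callers elsewhere in this series:

#include "bcachefs.h"
#include "error.h"

/*
 * Illustrative sketch only: raising a fixable fsck error from inside a
 * btree transaction.  The check itself is made up; passing `trans` (rather
 * than `c`) is what lets the prompt path drop btree locks while waiting
 * for user input.
 */
static int example_check(struct btree_trans *trans, bool inconsistent)
{
	int ret = 0;

	if (fsck_err_on(inconsistent, trans, accounting_mismatch,
			"example inconsistency")) {
		/* a real caller would queue a repair update here */
	}
fsck_err:
	return ret;
}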
@@ -210,9 +231,16 @@ int bch2_fsck_err(struct bch_fs *c, int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; + might_sleep(); + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) flags |= fsck_flags_extra[err]; + if (!c) + c = trans->c; + + WARN_ON(!trans && bch2_current_has_btree_trans(c)); + if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) return -BCH_ERR_fsck_fix; @@ -314,7 +342,15 @@ int bch2_fsck_err(struct bch_fs *c, bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - int ask = bch2_fsck_ask_yn(c); + int ask = bch2_fsck_ask_yn(c, trans); + + if (trans) { + ret = bch2_trans_relock(trans); + if (ret) { + mutex_unlock(&c->fsck_error_msgs_lock); + goto err; + } + } if (ask >= YN_ALLNO && s) s->fix = ask == YN_ALLNO diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 777711504c35c..995e6bba9bad8 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -110,18 +110,21 @@ struct fsck_err_state { #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) -__printf(4, 5) __cold -int bch2_fsck_err(struct bch_fs *, +__printf(5, 6) __cold +int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, enum bch_fsck_flags, enum bch_sb_error_id, const char *, ...); +#define bch2_fsck_err(c, _flags, _err_type, ...) \ + __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\ + type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\ + _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__) + void bch2_flush_fsck_errs(struct bch_fs *); #define __fsck_err(c, _flags, _err_type, ...) \ ({ \ - int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \ - __VA_ARGS__); \ - \ + int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__); \ if (_ret != -BCH_ERR_fsck_fix && \ _ret != -BCH_ERR_fsck_ignore) { \ ret = _ret; \ @@ -136,7 +139,14 @@ void bch2_flush_fsck_errs(struct bch_fs *); /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ #define __fsck_err_on(cond, c, _flags, _err_type, ...) \ - (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false) +({ \ + might_sleep(); \ + \ + if (type_is(c, struct bch_fs *)) \ + WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\ + \ + (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ +}) #define need_fsck_err_on(cond, c, _err_type, ...) 
\ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 24840aee335c0..c6e078f27e862 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -284,6 +284,17 @@ static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, return eytzinger0_next(idx, nr); } +static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) +{ + ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + + if (idx < nr && !cmp(base + idx * size, search)) + return idx; + + return eytzinger0_next(idx, nr); +} + #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ void *_base = (base); \ diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index dde2378595143..c934e807b380f 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_FS_COMMON_H #define _BCACHEFS_FS_COMMON_H +#include "dirent.h" + struct posix_acl; #define BCH_CREATE_TMPFILE (1U << 0) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 54873ecc635cb..cc33d763f7221 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -678,8 +678,8 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_pagecache_add_get(inode); folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, - mapping_gfp_mask(mapping)); + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping)); if (IS_ERR_OR_NULL(folio)) goto err_unlock; @@ -820,9 +820,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_init(&fs); ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, - FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, - mapping_gfp_mask(mapping), - &fs); + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping), &fs); if (ret) goto out; @@ -864,24 +863,26 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, f_pos = pos; f_offset = pos - folio_pos(darray_first(fs)); darray_for_each(fs, fi) { + ssize_t f_reserved; + f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; + f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len); + + if (unlikely(f_reserved != f_len)) { + if (f_reserved < 0) { + if (f == darray_first(fs)) { + ret = f_reserved; + goto out; + } + + folios_trunc(&fs, fi); + end = min(end, folio_end_pos(darray_last(fs))); + } else { + folios_trunc(&fs, fi + 1); + end = f_pos + f_reserved; + } - /* - * XXX: per POSIX and fstests generic/275, on -ENOSPC we're - * supposed to write as much as we have disk space for. 
- * - * On failure here we should still write out a partial page if - * we aren't completely out of disk space - we don't do that - * yet: - */ - ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); - if (unlikely(ret)) { - folios_trunc(&fs, fi); - if (!fs.nr) - goto out; - - end = min(end, folio_end_pos(darray_last(fs))); break; } diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 049b61bc9a5b3..e246b1e05aa2b 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -179,7 +179,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct bch_inode_info *inode = file_bch_inode(file); struct address_space *mapping = file->f_mapping; size_t count = iov_iter_count(iter); - ssize_t ret; + ssize_t ret = 0; if (!count) return 0; /* skip atime */ @@ -205,7 +205,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += ret; } else { bch2_pagecache_add_get(inode); - ret = generic_file_read_iter(iocb, iter); + ret = filemap_read(iocb, iter, ret); bch2_pagecache_add_put(inode); } out: diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 872283e5bd1e2..a9cc5cad9cc99 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -423,7 +423,7 @@ int bch2_folio_reservation_get(struct bch_fs *c, struct bch_inode_info *inode, struct folio *folio, struct bch2_folio_reservation *res, - unsigned offset, unsigned len) + size_t offset, size_t len) { struct bch_folio *s = bch2_folio_create(folio, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; @@ -437,8 +437,7 @@ int bch2_folio_reservation_get(struct bch_fs *c, for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { - disk_sectors += sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); + disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); quota_sectors += s->s[i].state == SECTOR_unallocated; } @@ -449,12 +448,9 @@ int bch2_folio_reservation_get(struct bch_fs *c, } if (quota_sectors) { - ret = bch2_quota_reservation_add(c, inode, &res->quota, - quota_sectors, true); + ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true); if (unlikely(ret)) { - struct disk_reservation tmp = { - .sectors = disk_sectors - }; + struct disk_reservation tmp = { .sectors = disk_sectors }; bch2_disk_reservation_put(c, &tmp); res->disk.sectors -= disk_sectors; @@ -465,6 +461,31 @@ int bch2_folio_reservation_get(struct bch_fs *c, return 0; } +ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + size_t offset, size_t len) +{ + size_t l, reserved = 0; + int ret; + + while ((l = len - reserved)) { + while ((ret = bch2_folio_reservation_get(c, inode, folio, res, offset, l))) { + if ((offset & (block_bytes(c) - 1)) + l <= block_bytes(c)) + return reserved ?: ret; + + len = reserved + l; + l /= 2; + } + + offset += l; + reserved += l; + } + + return reserved; +} + static void bch2_clear_folio_bits(struct folio *folio) { struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 828c3d7c8f199..fd7d692c087e1 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -153,7 +153,12 @@ int bch2_folio_reservation_get(struct bch_fs *, struct bch_inode_info *, struct folio *, struct bch2_folio_reservation *, - unsigned, unsigned); + size_t, 
size_t); +ssize_t bch2_folio_reservation_get_partial(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + size_t, size_t); void bch2_set_folio_dirty(struct bch_fs *, struct bch_inode_info *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ef20b64033e09..189d408ed515a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -192,7 +192,9 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; + int ret, err; + + trace_bch2_fsync(file, datasync); ret = file_write_and_wait_range(file, start, end); if (ret) @@ -205,6 +207,11 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = bch2_err_class(ret); if (ret == -EROFS) ret = -EIO; + + err = file_check_and_advance_wb_err(file); + if (!ret) + ret = err; + return ret; } @@ -860,9 +867,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) return -EINVAL; - if (remap_flags & REMAP_FILE_DEDUP) - return -EOPNOTSUPP; - if ((pos_src & (block_bytes(c) - 1)) || (pos_dst & (block_bytes(c) - 1))) return -EINVAL; @@ -895,7 +899,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret) goto err; - file_update_time(file_dst); + if (!(remap_flags & REMAP_FILE_DEDUP)) + file_update_time(file_dst); bch2_mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 79a0c8732bced..aea8132d2c40e 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -272,6 +272,70 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, return ret; } +static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg) +{ + return put_user(inode->v.i_generation, arg); +} + +static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label) +{ + int ret; + size_t len; + char label[BCH_SB_LABEL_SIZE]; + + BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX); + + mutex_lock(&c->sb_lock); + memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE); + mutex_unlock(&c->sb_lock); + + len = strnlen(label, BCH_SB_LABEL_SIZE); + if (len == BCH_SB_LABEL_SIZE) { + bch_warn(c, + "label is too long, return the first %zu bytes", + --len); + } + + ret = copy_to_user(user_label, label, len); + + return ret ? 
-EFAULT : 0; +} + +static int bch2_ioc_setlabel(struct bch_fs *c, + struct file *file, + struct bch_inode_info *inode, + const char __user *user_label) +{ + int ret; + char label[BCH_SB_LABEL_SIZE]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, user_label, sizeof(label))) + return -EFAULT; + + if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) { + bch_err(c, + "unable to set label with more than %d bytes", + BCH_SB_LABEL_SIZE - 1); + return -EINVAL; + } + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + mutex_lock(&c->sb_lock); + strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); + mutex_unlock(&c->sb_lock); + + ret = bch2_write_super(c); + + mnt_drop_write_file(file); + return ret; +} + static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) { u32 flags; @@ -499,13 +563,21 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; case FS_IOC_GETVERSION: - ret = -ENOTTY; + ret = bch2_ioc_getversion(inode, (u32 __user *) arg); break; case FS_IOC_SETVERSION: ret = -ENOTTY; break; + case FS_IOC_GETFSLABEL: + ret = bch2_ioc_getlabel(c, (void __user *) arg); + break; + + case FS_IOC_SETFSLABEL: + ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg); + break; + case FS_IOC_GOINGDOWN: ret = bch2_ioc_goingdown(c, (u32 __user *) arg); break; @@ -547,6 +619,12 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f9c9a95d7d4ca..fd01a5402c284 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -26,11 +26,13 @@ #include "snapshot.h" #include "super.h" #include "xattr.h" +#include "trace.h" #include #include #include #include +#include #include #include #include @@ -56,9 +58,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans, BUG_ON(bi->bi_inum != inode->v.i_ino); - bch2_assert_pos_locked(trans, BTREE_ID_inodes, - POS(0, bi->bi_inum), - c->opts.inodes_use_key_cache); + bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); @@ -888,6 +888,16 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->subvol = inode->ei_subvol; stat->result_mask |= STATX_SUBVOL; + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { + stat->result_mask |= STATX_DIOALIGN; + /* + * this is incorrect; we should be tracking this in superblock, + * and checking the alignment of open devices + */ + stat->dio_mem_align = SECTOR_SIZE; + stat->dio_offset_align = block_bytes(c); + } + if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); @@ -1689,6 +1699,8 @@ static int bch2_sync_fs(struct super_block *sb, int wait) struct bch_fs *c = sb->s_fs_info; int ret; + trace_bch2_sync_fs(sb, wait); + if (c->opts.journal_flush_disabled) return 0; @@ -1717,15 +1729,11 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static int bch2_remount(struct super_block *sb, int *flags, char *data) +static int bch2_remount(struct super_block *sb, int *flags, + struct bch_opts opts) { struct bch_fs *c = sb->s_fs_info; - struct bch_opts opts = bch2_opts_empty(); - int ret; - - ret = bch2_parse_mount_opts(c, 
&opts, data); - if (ret) - goto err; + int ret = 0; opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); @@ -1852,7 +1860,6 @@ static const struct super_operations bch_super_operations = { .statfs = bch2_statfs, .show_devname = bch2_show_devname, .show_options = bch2_show_options, - .remount_fs = bch2_remount, .put_super = bch2_put_super, .freeze_fs = bch2_freeze, .unfreeze_fs = bch2_unfreeze, @@ -1885,76 +1892,63 @@ static int bch2_test_super(struct super_block *s, void *data) return true; } -static struct dentry *bch2_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int bch2_fs_get_tree(struct fs_context *fc) { struct bch_fs *c; struct super_block *sb; struct inode *vinode; - struct bch_opts opts = bch2_opts_empty(); + struct bch2_opts_parse *opts_parse = fc->fs_private; + struct bch_opts opts = opts_parse->opts; + darray_str devs; + darray_fs devs_to_fs = {}; int ret; - opt_set(opts, read_only, (flags & SB_RDONLY) != 0); + opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); + opt_set(opts, nostart, true); - ret = bch2_parse_mount_opts(NULL, &opts, data); - if (ret) { - ret = bch2_err_class(ret); - return ERR_PTR(ret); - } - - if (!dev_name || strlen(dev_name) == 0) - return ERR_PTR(-EINVAL); + if (!fc->source || strlen(fc->source) == 0) + return -EINVAL; - darray_str devs; - ret = bch2_split_devs(dev_name, &devs); + ret = bch2_split_devs(fc->source, &devs); if (ret) - return ERR_PTR(ret); + return ret; - darray_fs devs_to_fs = {}; darray_for_each(devs, i) { ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); - if (ret) { - sb = ERR_PTR(ret); - goto got_sb; - } + if (ret) + goto err; } - sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs); + sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); if (!IS_ERR(sb)) goto got_sb; c = bch2_fs_open(devs.data, devs.nr, opts); - if (IS_ERR(c)) { - sb = ERR_CAST(c); - goto got_sb; - } + ret = PTR_ERR_OR_ZERO(c); + if (ret) + goto err; /* Some options can't be parsed until after the fs is started: */ - ret = bch2_parse_mount_opts(c, &opts, data); - if (ret) { - bch2_fs_stop(c); - sb = ERR_PTR(ret); - goto got_sb; - } + opts = bch2_opts_empty(); + ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); + if (ret) + goto err_stop_fs; bch2_opts_apply(&c->opts, opts); - sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); - if (IS_ERR(sb)) - bch2_fs_stop(c); -got_sb: - darray_exit(&devs_to_fs); - bch2_darray_str_exit(&devs); - - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - goto err; - } + ret = bch2_fs_start(c); + if (ret) + goto err_stop_fs; + sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); + ret = PTR_ERR_OR_ZERO(sb); + if (ret) + goto err_stop_fs; +got_sb: c = sb->s_fs_info; if (sb->s_root) { - if ((flags ^ sb->s_flags) & SB_RDONLY) { + if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) { ret = -EBUSY; goto err_put_super; } @@ -2020,12 +2014,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_flags |= SB_ACTIVE; out: - return dget(sb->s_root); - -err_put_super: - __bch2_fs_stop(c); - deactivate_locked_super(sb); + fc->root = dget(sb->s_root); err: + darray_exit(&devs_to_fs); + bch2_darray_str_exit(&devs); /* * On an inconsistency error in recovery we might see an -EROFS derived * errorcode (from the journal), but we don't want to return that to @@ -2034,7 +2026,16 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, */ if 
(bch2_err_matches(ret, EROFS) && ret != -EROFS) ret = -EIO; - return ERR_PTR(bch2_err_class(ret)); + return bch2_err_class(ret); + +err_stop_fs: + bch2_fs_stop(c); + goto err; + +err_put_super: + __bch2_fs_stop(c); + deactivate_locked_super(sb); + goto err; } static void bch2_kill_sb(struct super_block *sb) @@ -2045,12 +2046,76 @@ static void bch2_kill_sb(struct super_block *sb) bch2_fs_free(c); } +static void bch2_fs_context_free(struct fs_context *fc) +{ + struct bch2_opts_parse *opts = fc->fs_private; + + if (opts) { + printbuf_exit(&opts->parse_later); + kfree(opts); + } +} + +static int bch2_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + /* + * the "source" param, i.e., the name of the device(s) to mount, + * is handled by the VFS layer. + */ + if (!strcmp(param->key, "source")) + return -ENOPARAM; + + struct bch2_opts_parse *opts = fc->fs_private; + struct bch_fs *c = NULL; + + /* for reconfigure, we already have a struct bch_fs */ + if (fc->root) + c = fc->root->d_sb->s_fs_info; + + int ret = bch2_parse_one_mount_opt(c, &opts->opts, + &opts->parse_later, param->key, + param->string); + + return bch2_err_class(ret); +} + +static int bch2_fs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct bch2_opts_parse *opts = fc->fs_private; + + return bch2_remount(sb, &fc->sb_flags, opts->opts); +} + +static const struct fs_context_operations bch2_context_ops = { + .free = bch2_fs_context_free, + .parse_param = bch2_fs_parse_param, + .get_tree = bch2_fs_get_tree, + .reconfigure = bch2_fs_reconfigure, +}; + +static int bch2_init_fs_context(struct fs_context *fc) +{ + struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); + + if (!opts) + return -ENOMEM; + + opts->parse_later = PRINTBUF; + + fc->ops = &bch2_context_ops; + fc->fs_private = opts; + + return 0; +} + static struct file_system_type bcache_fs_type = { - .owner = THIS_MODULE, - .name = "bcachefs", - .mount = bch2_mount, - .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "bcachefs", + .init_fs_context = bch2_init_fs_context, + .kill_sb = bch2_kill_sb, + .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("bcachefs"); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 921bcdb3e5e4e..cc4f0963c0c50 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -455,31 +455,42 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub return 0; } -static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode) +static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum) { struct bch_fs *c = trans->c; - struct bch_inode_unpacked new_inode; + unsigned i_mode = S_IFREG; + u64 i_size = 0; - bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL); - new_inode.bi_size = size; - new_inode.bi_inum = inum; + switch (btree) { + case BTREE_ID_extents: { + struct btree_iter iter = {}; - return __bch2_fsck_write_inode(trans, &new_inode, snapshot); -} + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); + struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); + bch2_trans_iter_exit(trans, &iter); + int ret = bkey_err(k); + if (ret) + return ret; -static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum) -{ - struct btree_iter iter = {}; + i_size = k.k->p.offset << 9; + break; + } + case BTREE_ID_dirents: + 
i_mode = S_IFDIR; + break; + case BTREE_ID_xattrs: + break; + default: + BUG(); + } - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); - bch2_trans_iter_exit(trans, &iter); - int ret = bkey_err(k); - if (ret) - return ret; + struct bch_inode_unpacked new_inode; + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); + new_inode.bi_size = i_size; + new_inode.bi_inum = inum; - return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG); + return __bch2_fsck_write_inode(trans, &new_inode, snapshot); } struct snapshots_seen { @@ -824,8 +835,8 @@ static int hash_check_key(struct btree_trans *trans, break; if (fsck_err_on(k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k), c, - hash_table_key_duplicate, + !desc.cmp_bkey(k, hash_k), + trans, hash_table_key_duplicate, "duplicate hash table keys:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), @@ -844,7 +855,7 @@ static int hash_check_key(struct btree_trans *trans, printbuf_exit(&buf); return ret; bad_hash: - if (fsck_err(c, hash_table_key_wrong_offset, + if (fsck_err(trans, hash_table_key_wrong_offset, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), @@ -919,11 +930,11 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i return ret; if (fsck_err_on(ret, - c, inode_points_to_missing_dirent, + trans, inode_points_to_missing_dirent, "inode points to missing dirent\n%s", (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || fsck_err_on(!ret && !dirent_points_to_inode(d, inode), - c, inode_points_to_wrong_dirent, + trans, inode_points_to_wrong_dirent, "inode points to dirent that does not point back:\n%s", (bch2_bkey_val_to_text(&buf, c, inode_k), prt_newline(&buf), @@ -986,7 +997,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || inode_d_type(prev) != inode_d_type(&u), - c, inode_snapshot_mismatch, + trans, inode_snapshot_mismatch, "inodes in different snapshots don't match")) { bch_err(c, "repair not implemented yet"); return -BCH_ERR_fsck_repair_unimplemented; @@ -1018,7 +1029,8 @@ static int check_inode(struct btree_trans *trans, if (ret < 0) return ret; - fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list, + fsck_err_on(!ret, + trans, unlinked_inode_not_on_deleted_list, "inode %llu:%u unlinked, but not on deleted list", u.bi_inum, k.k->p.snapshot); ret = 0; @@ -1026,7 +1038,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_flags & BCH_INODE_unlinked && (!c->sb.clean || - fsck_err(c, inode_unlinked_but_clean, + fsck_err(trans, inode_unlinked_but_clean, "filesystem marked clean, but inode %llu unlinked", u.bi_inum))) { ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); @@ -1036,7 +1048,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_flags & BCH_INODE_i_size_dirty && (!c->sb.clean || - fsck_err(c, inode_i_size_dirty_but_clean, + fsck_err(trans, inode_i_size_dirty_but_clean, "filesystem marked clean, but inode %llu has i_size dirty", u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); @@ -1066,7 +1078,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_flags & BCH_INODE_i_sectors_dirty && (!c->sb.clean || - fsck_err(c, 
inode_i_sectors_dirty_but_clean, + fsck_err(trans, inode_i_sectors_dirty_but_clean, "filesystem marked clean, but inode %llu has i_sectors dirty", u.bi_inum))) { s64 sectors; @@ -1101,7 +1113,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(u.bi_parent_subvol && (u.bi_subvol == 0 || u.bi_subvol == BCACHEFS_ROOT_SUBVOL), - c, inode_bi_parent_nonzero, + trans, inode_bi_parent_nonzero, "inode %llu:%u has subvol %u but nonzero parent subvol %u", u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { u.bi_parent_subvol = 0; @@ -1121,13 +1133,13 @@ static int check_inode(struct btree_trans *trans, } if (fsck_err_on(ret, - c, inode_bi_subvol_missing, + trans, inode_bi_subvol_missing, "inode %llu:%u bi_subvol points to missing subvolume %u", u.bi_inum, k.k->p.snapshot, u.bi_subvol) || fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), k.k->p.snapshot), - c, inode_bi_subvol_wrong, + trans, inode_bi_subvol_wrong, "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", u.bi_inum, k.k->p.snapshot, u.bi_subvol, le64_to_cpu(s.inode), @@ -1170,6 +1182,71 @@ int bch2_check_inodes(struct bch_fs *c) return ret; } +static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) +{ + switch (btree) { + case BTREE_ID_extents: + return S_ISREG(mode) || S_ISLNK(mode); + case BTREE_ID_dirents: + return S_ISDIR(mode); + case BTREE_ID_xattrs: + return true; + default: + BUG(); + } +} + +static int check_key_has_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct inode_walker *inode, + struct inode_walker_entry *i, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = PTR_ERR_OR_ZERO(i); + if (ret) + return ret; + + if (k.k->type == KEY_TYPE_whiteout) + goto out; + + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + inode->last_pos.inode--; + ret = -BCH_ERR_transaction_restart_nested; + goto err; + } + + if (fsck_err_on(!i, + trans, key_in_missing_inode, + "key in missing inode:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; + + if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), + trans, key_in_wrong_inode_type, + "key for wrong inode mode %o:\n %s", + i->inode.bi_mode, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; +out: +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +delete: + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); + goto out; +} + static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1192,7 +1269,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), - c, inode_i_sectors_wrong, + trans, inode_i_sectors_wrong, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { @@ -1340,7 +1417,7 @@ static int overlapping_extents_found(struct btree_trans *trans, prt_printf(&buf, "\n overwriting %s extent", pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); - if (fsck_err(c, extent_overlapping, + if (fsck_err(trans, extent_overlapping, "overlapping extents%s", buf.buf)) { struct btree_iter *old_iter = &iter1; struct disk_reservation res = { 0 }; @@ -1476,43 +1553,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; } + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) goto err; - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + ret = check_key_has_inode(trans, iter, inode, i, k); if (ret) goto err; if (k.k->type != KEY_TYPE_whiteout) { - if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { - ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - inode->last_pos.inode--; - ret = -BCH_ERR_transaction_restart_nested; - goto err; - } - - if (fsck_err_on(!i, c, extent_in_missing_inode, - "extent in missing inode:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; - - if (fsck_err_on(i && - !S_ISREG(i->inode.bi_mode) && - !S_ISLNK(i->inode.bi_mode), - c, extent_in_non_reg_inode, - "extent in non regular inode mode %o:\n %s", - i->inode.bi_mode, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; - ret = check_overlapping_extents(trans, s, extent_ends, k, iter, &inode->recalculate_sums); if (ret) @@ -1525,7 +1579,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, * didn't have one, iterate over all inodes: */ if (!i) - i = inode->inodes.data + inode->inodes.nr - 1; + i = &darray_last(inode->inodes); for (; inode->inodes.data && i >= inode->inodes.data; @@ -1538,7 +1592,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), - c, extent_past_end_of_inode, + trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n %s", i->inode.bi_inum, i->snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1574,9 +1628,6 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, printbuf_exit(&buf); bch_err_fn(c, ret); return ret; -delete: - ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - goto out; } /* @@ -1656,7 +1707,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ } if (fsck_err_on(i->inode.bi_nlink != i->count, - c, inode_dir_wrong_nlink, + trans, inode_dir_wrong_nlink, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; @@ -1692,7 +1743,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, return 0; if (bch2_inode_should_have_bp(target) && - !fsck_err(c, inode_wrong_backpointer, + !fsck_err(trans, inode_wrong_backpointer, "dirent points to inode that does not point back:\n %s", (bch2_bkey_val_to_text(&buf, c, d.s_c), prt_printf(&buf, "\n "), @@ -1718,7 +1769,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, ret = 0; if (fsck_err_on(!backpointer_exists, - c, inode_wrong_backpointer, + trans, inode_wrong_backpointer, "inode %llu:%u has wrong backpointer:\n" "got %llu:%llu\n" "should be %llu:%llu", 
@@ -1741,7 +1792,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, if (fsck_err_on(backpointer_exists && (S_ISDIR(target->bi_mode) || target->bi_subvol), - c, inode_dir_multiple_links, + trans, inode_dir_multiple_links, "%s %llu:%u with multiple links\n%s", S_ISDIR(target->bi_mode) ? "directory" : "subvolume", target->bi_inum, target_snapshot, buf.buf)) { @@ -1755,7 +1806,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, * it up, it ignores inodes with nlink 0 */ if (fsck_err_on(backpointer_exists && !target->bi_nlink, - c, inode_multiple_links_but_nlink_0, + trans, inode_multiple_links_but_nlink_0, "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { target->bi_nlink++; @@ -1791,7 +1842,7 @@ static int check_dirent_target(struct btree_trans *trans, goto err; if (fsck_err_on(d.v->d_type != inode_d_type(target), - c, dirent_d_type_wrong, + trans, dirent_d_type_wrong, "incorrect d_type: got %s, should be %s:\n%s", bch2_d_type_str(d.v->d_type), bch2_d_type_str(inode_d_type(target)), @@ -1889,11 +1940,12 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * parent_snapshot = d.k->p.snapshot; } - if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, + if (fsck_err_on(ret, + trans, dirent_to_missing_parent_subvol, "dirent parent_subvol points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), - c, dirent_not_visible_in_parent_subvol, + trans, dirent_not_visible_in_parent_subvol, "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", parent_snapshot, (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { @@ -1919,7 +1971,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * return ret; if (ret) { - if (fsck_err(c, dirent_to_missing_subvol, + if (fsck_err(trans, dirent_to_missing_subvol, "dirent points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) return __remove_dirent(trans, d.k->p); @@ -1928,7 +1980,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * } if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, - c, subvol_fs_path_parent_wrong, + trans, subvol_fs_path_parent_wrong, "subvol with wrong fs_path_parent, should be be %u\n%s", parent_subvol, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { @@ -1956,7 +2008,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * } if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, - c, inode_bi_parent_wrong, + trans, inode_bi_parent_wrong, "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", target_inum, subvol_root.bi_parent_subvol, parent_subvol)) { @@ -2009,49 +2061,21 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); - i = walk_inode(trans, dir, k); ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; - if (dir->first_this_inode && dir->inodes.nr) - *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); - dir->first_this_inode = false; - - if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { - ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - 
dir->last_pos.inode--; - ret = -BCH_ERR_transaction_restart_nested; + ret = check_key_has_inode(trans, iter, dir, i, k); + if (ret) goto err; - } - - if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, - "dirent in nonexisting directory:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - goto out; - } if (!i) goto out; - if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), - c, dirent_in_non_dir_inode, - "dirent in non directory inode type %s:\n%s", - bch2_d_type_str(inode_d_type(&i->inode)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; - } + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &i->inode); + dir->first_this_inode = false; ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) @@ -2077,7 +2101,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; if (fsck_err_on(!target->inodes.nr, - c, dirent_to_missing_inode, + trans, dirent_to_missing_inode, "dirent points to missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), @@ -2156,20 +2180,18 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - if (inode->first_this_inode && inode->inodes.nr) - *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); - inode->first_this_inode = false; - - if (fsck_err_on(!i, c, xattr_in_missing_inode, - "xattr for missing inode %llu", - k.k->p.inode)) - return bch2_btree_delete_at(trans, iter, 0); + ret = check_key_has_inode(trans, iter, inode, i, k); + if (ret) + return ret; if (!i) return 0; + if (inode->first_this_inode) + *hash_info = bch2_hash_info_init(c, &i->inode); + inode->first_this_inode = false; + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); -fsck_err: bch_err_fn(c, ret); return ret; } @@ -2207,7 +2229,7 @@ static int check_root_trans(struct btree_trans *trans) if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (mustfix_fsck_err_on(ret, c, root_subvol_missing, + if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, "root subvol missing")) { struct bkey_i_subvolume *root_subvol = bch2_trans_kmalloc(trans, sizeof(*root_subvol)); @@ -2233,10 +2255,11 @@ static int check_root_trans(struct btree_trans *trans) if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (mustfix_fsck_err_on(ret, c, root_dir_missing, + if (mustfix_fsck_err_on(ret, + trans, root_dir_missing, "root directory missing") || mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), - c, root_inode_not_dir, + trans, root_inode_not_dir, "root inode not a directory")) { bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); @@ -2308,7 +2331,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, break; if (fsck_err_on(!ret, - c, subvol_unreachable, + trans, subvol_unreachable, "unreachable subvolume %s", (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { @@ -2333,7 +2356,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, goto err; if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, - c, subvol_unreachable, + trans, subvol_unreachable, "unreachable subvolume %s", (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { @@ -2412,7 +2435,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino if (bch2_err_matches(ret, ENOENT)) { ret = 0; - if 
(fsck_err(c, inode_unreachable, + if (fsck_err(trans, inode_unreachable, "unreachable inode\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, inode_k), @@ -2458,7 +2481,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode.bi_inum, snapshot); - if (fsck_err(c, dir_loop, "directory structure loop")) { + if (fsck_err(trans, dir_loop, "directory structure loop")) { ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); if (ret) @@ -2664,7 +2687,6 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite struct nlink_table *links, size_t *idx, u64 range_end) { - struct bch_fs *c = trans->c; struct bch_inode_unpacked u; struct nlink *link = &links->d[*idx]; int ret = 0; @@ -2690,7 +2712,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite } if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, - c, inode_wrong_nlink, + trans, inode_wrong_nlink, "inode %llu type %s has wrong i_nlink (%u, should be %u)", u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index aafa79fa63514..1e20020eadd1f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -8,6 +8,7 @@ #include "buckets.h" #include "compress.h" #include "dirent.h" +#include "disk_accounting.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -534,12 +535,13 @@ int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { + prt_printf(out, "\n"); printbuf_indent_add(out, 2); prt_printf(out, "mode=%o\n", inode->bi_mode); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); - prt_printf(out, " (%x)\n", inode->bi_flags); + prt_printf(out, "(%x)\n", inode->bi_flags); prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); prt_printf(out, "bi_size=%llu\n", inode->bi_size); @@ -550,6 +552,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, prt_printf(out, #_name "=%llu\n", (u64) inode->_name); BCH_INODE_FIELDS_v3() #undef x + + bch2_printbuf_strip_trailing_newline(out); printbuf_indent_sub(out, 2); } @@ -596,39 +600,26 @@ int bch2_trigger_inode(struct btree_trans *trans, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { - s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k); - - if (flags & BTREE_TRIGGER_transactional) { - if (nr) { - int ret = bch2_replicas_deltas_realloc(trans, 0); - if (ret) - return ret; - - trans->fs_usage_deltas->nr_inodes += nr; - } - - bool old_deleted = bkey_is_deleted_inode(old); - bool new_deleted = bkey_is_deleted_inode(new.s_c); - if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, - new.k->p, new_deleted); - if (ret) - return ret; - } - } - if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { BUG_ON(!trans->journal_res.seq); - bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - if (flags & BTREE_TRIGGER_gc) { - struct bch_fs *c = trans->c; + s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); + if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { + struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; + int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & 
BTREE_TRIGGER_gc); + if (ret) + return ret; + } - percpu_down_read(&c->mark_lock); - this_cpu_add(c->usage_gc->b.nr_inodes, nr); - percpu_up_read(&c->mark_lock); + int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) - + (int) bkey_is_deleted_inode(old); + if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) { + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, deleted_delta > 0); + if (ret) + return ret; } return 0; @@ -1096,8 +1087,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans, return ret; ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; - if (fsck_err_on(!bkey_is_inode(k.k), c, - deleted_inode_missing, + if (fsck_err_on(!bkey_is_inode(k.k), + trans, deleted_inode_missing, "nonexistent inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; @@ -1109,7 +1100,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (S_ISDIR(inode.bi_mode)) { ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), - c, deleted_inode_is_dir, + trans, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; @@ -1117,15 +1108,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; } - if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c, - deleted_inode_not_unlinked, + if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), + trans, deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; if (c->sb.clean && - !fsck_err(c, - deleted_inode_but_clean, + !fsck_err(trans, deleted_inode_but_clean, "filesystem marked as clean but have deleted inode %llu:%u", pos.offset, pos.snapshot)) { ret = 0; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 05e0cbef420bc..c6197e6aa0b8c 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -69,11 +69,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) u64 io_latency = time_after64(now, submit_time) ? 
now - submit_time : 0; - u64 old, new, v = atomic64_read(latency); + u64 old, new; + old = atomic64_read(latency); do { - old = v; - /* * If the io latency was reasonably close to the current * latency, skip doing the update and atomic operation - most of @@ -84,7 +83,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) break; new = ewma_add(old, io_latency, 5); - } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + } while (!atomic64_try_cmpxchg(latency, &old, new)); bch2_congested_acct(ca, io_latency, now, rw); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 13669dd0e3756..6209d7787c6dd 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -230,7 +230,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); unsigned sectors; BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && @@ -238,15 +237,16 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t lockdep_assert_held(&j->lock); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; new.cur_entry_offset = closed_val; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || old.cur_entry_offset == new.cur_entry_offset) return; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); if (!__journal_entry_is_open(old)) return; @@ -353,7 +353,6 @@ static int journal_entry_open(struct journal *j) ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); union journal_res_state old, new; int u64s; - u64 v; lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); @@ -432,9 +431,9 @@ static int journal_entry_open(struct journal *j) */ j->cur_entry_u64s = u64s; - v = atomic64_read(&j->reservations.counter); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); @@ -446,8 +445,8 @@ static int journal_entry_open(struct journal *j) /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); if (nr_unwritten_journal_entries(j) == 1) mod_delayed_work(j->wq, diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index fd1f7cdaa8bc6..00d0f4fc63a1e 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -327,10 +327,10 @@ static inline int journal_res_get_fast(struct journal *j, unsigned flags) { union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; /* * Check if there is still room in the current journal @@ -356,8 +356,8 @@ static inline int journal_res_get_fast(struct journal *j, if (flags & JOURNAL_RES_GET_CHECK) return 1; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); res->ref = true; res->idx = old.idx; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index db24ce21b2acf..ff832d20ea603 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ 
-722,13 +722,16 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); + printbuf_indent_add(out, 2); for (i = 0; i < nr_types; i++) { + prt_newline(out); bch2_prt_data_type(out, i); prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].fragmented)); } + printbuf_indent_sub(out, 2); } static int journal_entry_log_validate(struct bch_fs *c, @@ -1583,7 +1586,7 @@ static CLOSURE_CALLBACK(journal_write_done) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 v, seq = le64_to_cpu(w->data->seq); + u64 seq = le64_to_cpu(w->data->seq); int err = 0; bch2_time_stats_update(!JSET_NO_FLUSH(w->data) @@ -1642,14 +1645,15 @@ static CLOSURE_CALLBACK(journal_write_done) if (j->watermark != BCH_WATERMARK_stripe) journal_reclaim_kick(&c->journal); - v = atomic64_read(&j->reservations.counter); + old.v = atomic64_read(&j->reservations.counter); do { - old.v = new.v = v; + new.v = old.v; BUG_ON(journal_state_count(new, new.unwritten_idx)); BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); new.unwritten_idx++; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + } while (!atomic64_try_cmpxchg(&j->reservations.counter, + &old.v, new.v)); closure_wake_up(&w->wait); completed = true; @@ -1854,8 +1858,14 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) } } - if (wb.wb) - bch2_journal_keys_to_write_buffer_end(c, &wb); + if (wb.wb) { + ret = bch2_journal_keys_to_write_buffer_end(c, &wb); + if (ret) { + bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", + bch2_err_str(ret)); + return ret; + } + } spin_lock(&c->journal.lock); w->need_flush_to_write_buffer = false; @@ -2020,8 +2030,9 @@ CLOSURE_CALLBACK(bch2_journal_write) struct printbuf buf = PRINTBUF; buf.atomic++; - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write: %s"), - bch2_err_str(ret)); + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), + le64_to_cpu(w->data->seq), + bch2_err_str(ret)); __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); bch2_print_string_as_lines(KERN_ERR, buf.buf); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index a40d116224edd..c535ba272b68b 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -94,8 +94,8 @@ static int bch2_check_lru_key(struct btree_trans *trans, u64 idx; int ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, - lru_entry_to_invalid_bucket, + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), + trans, lru_entry_to_invalid_bucket, "lru key points to nonexistent device:bucket %llu:%llu", alloc_pos.inode, alloc_pos.offset)) return bch2_btree_delete_at(trans, lru_iter, 0); @@ -125,7 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, goto out; } - if (fsck_err(c, lru_entry_bad, + if (fsck_err(trans, lru_entry_bad, "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index bd71ba77de078..425ba732c9650 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -24,18 +24,6 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) return pos; } -#define BCH_LRU_TYPES() \ - x(read) \ - x(fragmentation) - -enum bch_lru_type { -#define x(n) BCH_LRU_##n, - BCH_LRU_TYPES() -#undef x -}; - -#define 
BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) - static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h new file mode 100644 index 0000000000000..f372cb3b8cda0 --- /dev/null +++ b/fs/bcachefs/lru_format.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_FORMAT_H +#define _BCACHEFS_LRU_FORMAT_H + +struct bch_lru { + struct bch_val v; + __le64 idx; +} __packed __aligned(8); + +#define BCH_LRU_TYPES() \ + x(read) \ + x(fragmentation) + +enum bch_lru_type { +#define x(n) BCH_LRU_##n, + BCH_LRU_TYPES() +#undef x +}; + +#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) + +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + +#endif /* _BCACHEFS_LRU_FORMAT_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 6e477fadaa2a5..60e4ef9449586 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -805,7 +805,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (!b) goto next; - unsigned sectors = btree_ptr_sectors_written(&b->key); + unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); ret = bch2_btree_node_rewrite(trans, &iter, b, 0); bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index bb068fd724656..e10fc1da71b19 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -378,6 +378,10 @@ int bch2_opt_parse(struct bch_fs *c, break; case BCH_OPT_FN: ret = opt->fn.parse(c, val, res, err); + + if (ret == -BCH_ERR_option_needs_open_fs) + return ret; + if (ret < 0) { if (err) prt_printf(err, "%s: parse error", @@ -460,14 +464,81 @@ int bch2_opts_check_may_set(struct bch_fs *c) return 0; } +int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, + struct printbuf *parse_later, + const char *name, const char *val) +{ + struct printbuf err = PRINTBUF; + u64 v; + int ret, id; + + id = bch2_mount_opt_lookup(name); + + /* Check for the form "noopt", negation of a boolean opt: */ + if (id < 0 && + !val && + !strncmp("no", name, 2)) { + id = bch2_mount_opt_lookup(name + 2); + val = "0"; + } + + /* Unknown options are ignored: */ + if (id < 0) + return 0; + + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) + goto bad_opt; + + if (id == Opt_acl && + !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) + goto bad_opt; + + if ((id == Opt_usrquota || + id == Opt_grpquota) && + !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) + goto bad_opt; + + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret == -BCH_ERR_option_needs_open_fs && parse_later) { + prt_printf(parse_later, "%s=%s,", name, val); + if (parse_later->allocation_failure) { + ret = -ENOMEM; + goto out; + } + + ret = 0; + goto out; + } + + if (ret < 0) + goto bad_val; + + if (opts) + bch2_opt_set_by_id(opts, id, v); + + ret = 0; + goto out; + +bad_opt: + pr_err("Bad mount option %s", name); + ret = -BCH_ERR_option_name; + goto out; + +bad_val: + pr_err("Invalid mount option %s", err.buf); + ret = -BCH_ERR_option_value; + +out: + printbuf_exit(&err); + return ret; +} + int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, - char *options) + struct printbuf *parse_later, char *options) { char *copied_opts, *copied_opts_start; char *opt, *name, *val; - int ret, id; - struct printbuf err = PRINTBUF; - u64 v; + int ret; if (!options) return 0; @@ -488,53 +559,16 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, name = strsep(&opt, "="); val = opt; - id = bch2_mount_opt_lookup(name); - 
- /* Check for the form "noopt", negation of a boolean opt: */ - if (id < 0 && - !val && - !strncmp("no", name, 2)) { - id = bch2_mount_opt_lookup(name + 2); - val = "0"; - } - - /* Unknown options are ignored: */ - if (id < 0) - continue; - - if (!(bch2_opt_table[id].flags & OPT_MOUNT)) - goto bad_opt; - - if (id == Opt_acl && - !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) - goto bad_opt; - - if ((id == Opt_usrquota || - id == Opt_grpquota) && - !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) - goto bad_opt; - - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); if (ret < 0) - goto bad_val; - - bch2_opt_set_by_id(opts, id, v); + goto out; } ret = 0; goto out; -bad_opt: - pr_err("Bad mount option %s", name); - ret = -BCH_ERR_option_name; - goto out; -bad_val: - pr_err("Invalid mount option %s", err.buf); - ret = -BCH_ERR_option_value; - goto out; out: kfree(copied_opts_start); - printbuf_exit(&err); return ret; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index b197ec90d4cb0..840dfd7567606 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -406,7 +406,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ @@ -488,6 +488,13 @@ struct bch_opts { #undef x }; +struct bch2_opts_parse { + struct bch_opts opts; + + /* to save opts that can't be parsed before the FS is opened: */ + struct printbuf parse_later; +}; + static const __maybe_unused struct bch_opts bch2_opts_default = { #define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ._name##_defined = true, \ @@ -566,7 +573,10 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, int bch2_opt_check_may_set(struct bch_fs *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); -int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); +int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, + struct printbuf *, const char *, const char *); +int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, + char *); /* inode opts: */ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index 9f529e4c1b16a..4cf5a2af1e6ff 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -316,6 +316,20 @@ void bch2_prt_newline(struct printbuf *buf) buf->cur_tabstop = 0; } +void bch2_printbuf_strip_trailing_newline(struct printbuf *out) +{ + for (int p = out->pos - 1; p >= 0; --p) { + if (out->buf[p] == '\n') { + out->pos = p; + break; + } + if (out->buf[p] != ' ') + break; + } + + printbuf_nul_terminate_reserved(out); +} + static void __prt_tab(struct printbuf *out) { int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 9ecc56bc96359..1d570387b77f1 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -115,6 +115,7 @@ void bch2_printbuf_indent_add(struct printbuf *, unsigned); void bch2_printbuf_indent_sub(struct printbuf *, unsigned); void bch2_prt_newline(struct printbuf *); +void bch2_printbuf_strip_trailing_newline(struct printbuf *); void bch2_prt_tab(struct printbuf *); void bch2_prt_tab_rjust(struct printbuf *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1f9d044ed9207..d89eb43c5ce95 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -10,6 +10,7 @@ #include "btree_io.h" #include "buckets.h" 
#include "dirent.h" +#include "disk_accounting.h" #include "errcode.h" #include "error.h" #include "fs-common.h" @@ -90,6 +91,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); bch2_write_super(c); @@ -134,6 +136,45 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } +static int bch2_journal_replay_accounting_key(struct btree_trans *trans, + struct journal_key *k) +{ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + BTREE_ITER_intent); + int ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + struct bkey u; + struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); + + /* Has this delta already been applied to the btree? */ + if (bversion_cmp(old.k->version, k->k->k.version) >= 0) { + ret = 0; + goto out; + } + + struct bkey_i *new = k->k; + if (old.k->type == KEY_TYPE_accounting) { + new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + bch2_accounting_accumulate(bkey_i_to_accounting(new), + bkey_s_c_to_accounting(old)); + } + + trans->journal_res.seq = k->journal_seq; + + ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int bch2_journal_replay_key(struct btree_trans *trans, struct journal_key *k) { @@ -184,6 +225,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (k->overwritten) goto out; + if (k->k->k.type == KEY_TYPE_accounting) { + ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k); + goto out; + } + ret = bch2_trans_update(trans, &iter, k->k, update_flags); out: bch2_trans_iter_exit(trans, &iter); @@ -221,6 +267,30 @@ int bch2_journal_replay(struct bch_fs *c) move_gap(keys, keys->nr); trans = bch2_trans_get(c); + /* + * Replay accounting keys first: we can't allow the write buffer to + * flush accounting keys until we're done + */ + darray_for_each(*keys, k) { + if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated)) + continue; + + cond_resched(); + + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_skip_accounting_apply| + BCH_TRANS_COMMIT_no_journal_res, + bch2_journal_replay_accounting_key(trans, k)); + if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) + goto err; + + k->overwritten = true; + } + + set_bit(BCH_FS_accounting_replay_done, &c->flags); + /* * First, attempt to replay keys in sorted order. This is more * efficient - better locality of btree access - but some might fail if @@ -241,9 +311,10 @@ int bch2_journal_replay(struct bch_fs *c) commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_skip_accounting_apply| (!k->allocated ? 
BCH_TRANS_COMMIT_no_journal_res : 0), bch2_journal_replay_key(trans, k)); - BUG_ON(!ret && !k->overwritten); + BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting); if (ret) { ret = darray_push(&keys_sorted, k); if (ret) @@ -271,6 +342,7 @@ int bch2_journal_replay(struct bch_fs *c) ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_skip_accounting_apply| (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim : 0), @@ -280,7 +352,7 @@ int bch2_journal_replay(struct bch_fs *c) if (ret) goto err; - BUG_ON(!k->overwritten); + BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); } /* @@ -355,45 +427,10 @@ static int journal_replay_entry_early(struct bch_fs *c, container_of(entry, struct jset_entry_usage, entry); switch (entry->btree_id) { - case BCH_FS_USAGE_reserved: - if (entry->level < BCH_REPLICAS_MAX) - c->usage_base->persistent_reserved[entry->level] = - le64_to_cpu(u->v); - break; - case BCH_FS_USAGE_inodes: - c->usage_base->b.nr_inodes = le64_to_cpu(u->v); - break; case BCH_FS_USAGE_key_version: - atomic64_set(&c->key_version, - le64_to_cpu(u->v)); + atomic64_set(&c->key_version, le64_to_cpu(u->v)); break; } - - break; - } - case BCH_JSET_ENTRY_data_usage: { - struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); - - ret = bch2_replicas_set_usage(c, &u->r, - le64_to_cpu(u->v)); - break; - } - case BCH_JSET_ENTRY_dev_usage: { - struct jset_entry_dev_usage *u = - container_of(entry, struct jset_entry_dev_usage, entry); - unsigned nr_types = jset_entry_dev_usage_nr_types(u); - - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, le32_to_cpu(u->dev)); - if (ca) - for (unsigned i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { - ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); - ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); - ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); - } - rcu_read_unlock(); - break; } case BCH_JSET_ENTRY_blacklist: { @@ -454,8 +491,6 @@ static int journal_replay_early(struct bch_fs *c, } } - bch2_fs_usage_initialize(c); - return 0; } @@ -810,6 +845,10 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; + set_bit(BCH_FS_btree_running, &c->flags); + + ret = bch2_sb_set_upgrade_extra(c); + ret = bch2_run_recovery_passes(c); if (ret) goto err; @@ -969,14 +1008,12 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; + set_bit(BCH_FS_btree_running, &c->flags); set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc_fake(c, i, 0); - for_each_member_device(c, ca) - bch2_dev_usage_init(ca); - ret = bch2_fs_journal_alloc(c); if (ret) goto err; @@ -986,12 +1023,21 @@ int bch2_fs_initialize(struct bch_fs *c) * set up the journal.pin FIFO and journal.cur pointer: */ bch2_fs_journal_start(&c->journal, 1); + set_bit(BCH_FS_accounting_replay_done, &c->flags); bch2_journal_set_replay_done(&c->journal); ret = bch2_fs_read_write_early(c); if (ret) goto err; + for_each_member_device(c, ca) { + ret = bch2_dev_usage_init(ca, false); + if (ret) { + bch2_dev_put(ca); + goto err; + } + } + /* * Write out the superblock and journal buckets, now that we can do * btree updates @@ -1025,7 +1071,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_pack(&packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); + 
ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0); bch_err_msg(c, ret, "creating root directory"); if (ret) goto err; diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 4a9eb9582b6e5..73339a0a31111 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -5,6 +5,7 @@ #include "backpointers.h" #include "btree_gc.h" #include "btree_node_scan.h" +#include "disk_accounting.h" #include "ec.h" #include "fsck.h" #include "inode.h" @@ -192,6 +193,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c) { int ret = 0; + down_read(&c->state_lock); + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { struct recovery_pass_fn *p = recovery_pass_fns + i; @@ -207,6 +210,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c) break; } + up_read(&c->state_lock); + return ret; } diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 773aea9a0080f..8c7dee5983d27 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -15,6 +15,7 @@ #define BCH_RECOVERY_PASSES() \ x(scan_for_btree_nodes, 37, 0) \ x(check_topology, 4, 0) \ + x(accounting_read, 39, PASS_ALWAYS) \ x(alloc_read, 0, PASS_ALWAYS) \ x(stripes_read, 1, PASS_ALWAYS) \ x(initialize_subvolumes, 2, 0) \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 9ac6cf21cfbf7..5f92715e1525a 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -171,7 +171,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, not_found: BUG_ON(!(flags & BTREE_TRIGGER_check_repair)); - if (fsck_err(c, reflink_p_to_missing_reflink_v, + if (fsck_err(trans, reflink_p_to_missing_reflink_v, "pointer to missing indirect extent\n" " %s\n" " missing range %llu-%llu", diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 57a1f09cca096..10c96cb2047ae 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "buckets.h" +#include "disk_accounting.h" #include "journal.h" #include "replicas.h" #include "super-io.h" @@ -243,145 +244,25 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r, return __replicas_entry_idx(r, search) >= 0; } -bool bch2_replicas_marked(struct bch_fs *c, +bool bch2_replicas_marked_locked(struct bch_fs *c, struct bch_replicas_entry_v1 *search) { - bool marked; - - if (!search->nr_devs) - return true; - verify_replicas_entry(search); - percpu_down_read(&c->mark_lock); - marked = __replicas_has_entry(&c->replicas, search) && - (likely((!c->replicas_gc.entries)) || - __replicas_has_entry(&c->replicas_gc, search)); - percpu_up_read(&c->mark_lock); - - return marked; -} - -static void __replicas_table_update(struct bch_fs_usage *dst, - struct bch_replicas_cpu *dst_r, - struct bch_fs_usage *src, - struct bch_replicas_cpu *src_r) -{ - int src_idx, dst_idx; - - *dst = *src; - - for (src_idx = 0; src_idx < src_r->nr; src_idx++) { - if (!src->replicas[src_idx]) - continue; - - dst_idx = __replicas_entry_idx(dst_r, - cpu_replicas_entry(src_r, src_idx)); - BUG_ON(dst_idx < 0); - - dst->replicas[dst_idx] = src->replicas[src_idx]; - } -} - -static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, - struct bch_replicas_cpu *dst_r, - struct bch_fs_usage __percpu *src_p, - struct bch_replicas_cpu *src_r) -{ - unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; - struct bch_fs_usage *dst, *src = (void *) - bch2_acc_percpu_u64s((u64 __percpu *) src_p, 
src_nr); - - preempt_disable(); - dst = this_cpu_ptr(dst_p); - preempt_enable(); - - __replicas_table_update(dst, dst_r, src, src_r); + return !search->nr_devs || + (__replicas_has_entry(&c->replicas, search) && + (likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search))); } -/* - * Resize filesystem accounting: - */ -static int replicas_table_update(struct bch_fs *c, - struct bch_replicas_cpu *new_r) +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry_v1 *search) { - struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; - struct bch_fs_usage_online *new_scratch = NULL; - struct bch_fs_usage __percpu *new_gc = NULL; - struct bch_fs_usage *new_base = NULL; - unsigned i, bytes = sizeof(struct bch_fs_usage) + - sizeof(u64) * new_r->nr; - unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + - sizeof(u64) * new_r->nr; - int ret = 0; - - memset(new_usage, 0, sizeof(new_usage)); - - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - if (!(new_usage[i] = __alloc_percpu_gfp(bytes, - sizeof(u64), GFP_KERNEL))) - goto err; + percpu_down_read(&c->mark_lock); + bool ret = bch2_replicas_marked_locked(c, search); + percpu_up_read(&c->mark_lock); - if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || - !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || - (c->usage_gc && - !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) - goto err; - - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - if (c->usage[i]) - __replicas_table_update_pcpu(new_usage[i], new_r, - c->usage[i], &c->replicas); - if (c->usage_base) - __replicas_table_update(new_base, new_r, - c->usage_base, &c->replicas); - if (c->usage_gc) - __replicas_table_update_pcpu(new_gc, new_r, - c->usage_gc, &c->replicas); - - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - swap(c->usage[i], new_usage[i]); - swap(c->usage_base, new_base); - swap(c->usage_scratch, new_scratch); - swap(c->usage_gc, new_gc); - swap(c->replicas, *new_r); -out: - free_percpu(new_gc); - kfree(new_scratch); - for (i = 0; i < ARRAY_SIZE(new_usage); i++) - free_percpu(new_usage[i]); - kfree(new_base); return ret; -err: - bch_err(c, "error updating replicas table: memory allocation failure"); - ret = -BCH_ERR_ENOMEM_replicas_table; - goto out; -} - -static unsigned reserve_journal_replicas(struct bch_fs *c, - struct bch_replicas_cpu *r) -{ - struct bch_replicas_entry_v1 *e; - unsigned journal_res_u64s = 0; - - /* nr_inodes: */ - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); - - /* key_version: */ - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); - - /* persistent_reserved: */ - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * - BCH_REPLICAS_MAX; - - for_each_cpu_replicas_entry(r, e) - journal_res_u64s += - DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + - e->nr_devs, sizeof(u64)); - return journal_res_u64s; } noinline @@ -417,10 +298,6 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); if (ret) goto err; - - bch2_journal_entry_res_resize(&c->journal, - &c->replicas_journal_res, - reserve_journal_replicas(c, &new_r)); } if (!new_r.entries && @@ -435,7 +312,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, /* don't update in memory replicas until changes are persistent */ percpu_down_write(&c->mark_lock); if (new_r.entries) - ret = replicas_table_update(c, &new_r); + swap(c->replicas, new_r); if (new_gc.entries) swap(new_gc, c->replicas_gc); 
percpu_up_write(&c->mark_lock); @@ -457,20 +334,6 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) ? 0 : bch2_mark_replicas_slowpath(c, r); } -/* replicas delta list: */ - -int bch2_replicas_delta_list_mark(struct bch_fs *c, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - struct replicas_delta *top = (void *) r->d + r->used; - int ret = 0; - - for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) - ret = bch2_mark_replicas(c, &d->r); - return ret; -} - /* * Old replicas_gc mechanism: only used for journal replicas entries now, should * die at some point: @@ -484,8 +347,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) percpu_down_write(&c->mark_lock); ret = ret ?: - bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?: - replicas_table_update(c, &c->replicas_gc); + bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); + if (!ret) + swap(c->replicas, c->replicas_gc); kfree(c->replicas_gc.entries); c->replicas_gc.entries = NULL; @@ -556,10 +420,10 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) int bch2_replicas_gc2(struct bch_fs *c) { struct bch_replicas_cpu new = { 0 }; - unsigned i, nr; + unsigned nr; int ret = 0; - bch2_journal_meta(&c->journal); + bch2_accounting_mem_gc(c); retry: nr = READ_ONCE(c->replicas.nr); new.entry_size = READ_ONCE(c->replicas.entry_size); @@ -580,24 +444,33 @@ int bch2_replicas_gc2(struct bch_fs *c) goto retry; } - for (i = 0; i < c->replicas.nr; i++) { + for (unsigned i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - if (e->data_type == BCH_DATA_journal || - c->usage_base->replicas[i] || - percpu_u64_get(&c->usage[0]->replicas[i]) || - percpu_u64_get(&c->usage[1]->replicas[i]) || - percpu_u64_get(&c->usage[2]->replicas[i]) || - percpu_u64_get(&c->usage[3]->replicas[i])) + struct disk_accounting_pos k = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + + memcpy(&k.replicas, e, replicas_entry_bytes(e)); + + struct bpos p = disk_accounting_pos_to_bpos(&k); + + struct bch_accounting_mem *acc = &c->accounting; + bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &p) >= acc->k.nr; + + if (e->data_type == BCH_DATA_journal || !kill) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } bch2_cpu_replicas_sort(&new); - ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?: - replicas_table_update(c, &new); + ret = bch2_cpu_replicas_to_sb_replicas(c, &new); + + if (!ret) + swap(c->replicas, new); kfree(new.entries); @@ -611,34 +484,6 @@ int bch2_replicas_gc2(struct bch_fs *c) return ret; } -int bch2_replicas_set_usage(struct bch_fs *c, - struct bch_replicas_entry_v1 *r, - u64 sectors) -{ - int ret, idx = bch2_replicas_entry_idx(c, r); - - if (idx < 0) { - struct bch_replicas_cpu n; - - n = cpu_replicas_add_entry(c, &c->replicas, r); - if (!n.entries) - return -BCH_ERR_ENOMEM_cpu_replicas; - - ret = replicas_table_update(c, &n); - if (ret) - return ret; - - kfree(n.entries); - - idx = bch2_replicas_entry_idx(c, r); - BUG_ON(ret < 0); - } - - c->usage_base->replicas[idx] = sectors; - - return 0; -} - /* Replicas tracking - superblock: */ static int @@ -724,8 +569,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) bch2_cpu_replicas_sort(&new_r); percpu_down_write(&c->mark_lock); - - ret = replicas_table_update(c, &new_r); + swap(c->replicas, new_r); percpu_up_write(&c->mark_lock); kfree(new_r.entries); @@ -1027,10 +871,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, 
unsigned dev) unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { - unsigned ret; - mutex_lock(&c->sb_lock); - ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); + unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); mutex_unlock(&c->sb_lock); return ret; @@ -1038,25 +880,6 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) void bch2_fs_replicas_exit(struct bch_fs *c) { - unsigned i; - - kfree(c->usage_scratch); - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - free_percpu(c->usage[i]); - kfree(c->usage_base); kfree(c->replicas.entries); kfree(c->replicas_gc.entries); - - mempool_exit(&c->replicas_delta_pool); -} - -int bch2_fs_replicas_init(struct bch_fs *c) -{ - bch2_journal_entry_res_resize(&c->journal, - &c->replicas_journal_res, - reserve_journal_replicas(c, &c->replicas)); - - return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, - REPLICAS_DELTA_LIST_MAX) ?: - replicas_table_update(c, &c->replicas); } diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 654a4b26d3a3c..622482559c3db 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -25,18 +25,13 @@ int bch2_replicas_entry_idx(struct bch_fs *, void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, enum bch_data_type, struct bch_devs_list); + +bool bch2_replicas_marked_locked(struct bch_fs *, + struct bch_replicas_entry_v1 *); bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry_v1 *); -static inline struct replicas_delta * -replicas_delta_next(struct replicas_delta *d) -{ - return (void *) d + replicas_entry_bytes(&d->r) + 8; -} - -int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); - void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c); static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, @@ -58,10 +53,6 @@ int bch2_replicas_gc_end(struct bch_fs *, int); int bch2_replicas_gc_start(struct bch_fs *, unsigned); int bch2_replicas_gc2(struct bch_fs *); -int bch2_replicas_set_usage(struct bch_fs *, - struct bch_replicas_entry_v1 *, - u64); - #define for_each_cpu_replicas_entry(_r, _i) \ for (_i = (_r)->entries; \ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ @@ -88,6 +79,5 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; void bch2_fs_replicas_exit(struct bch_fs *); -int bch2_fs_replicas_init(struct bch_fs *); #endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h index ac90d142c4e87..fed71c861fe76 100644 --- a/fs/bcachefs/replicas_types.h +++ b/fs/bcachefs/replicas_types.h @@ -8,20 +8,4 @@ struct bch_replicas_cpu { struct bch_replicas_entry_v1 *entries; }; -struct replicas_delta { - s64 delta; - struct bch_replicas_entry_v1 r; -} __packed; - -struct replicas_delta_list { - unsigned size; - unsigned used; - - struct {} memset_start; - u64 nr_inodes; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - struct {} memset_end; - struct replicas_delta d[]; -}; - #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 47f10ab57f40c..c57d42bb8d1b8 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -183,25 +183,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) { - percpu_down_read(&c->mark_lock); - - if 
(!journal_seq) { - for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - } else { - bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); - } - - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->b.nr_inodes); - } - { struct jset_entry_usage *u = container_of(jset_entry_init(end, sizeof(*u)), @@ -212,49 +193,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->v = cpu_to_le64(atomic64_read(&c->key_version)); } - for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_reserved; - u->entry.level = i; - u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - } - - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - struct jset_entry_data_usage *u = - container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), - struct jset_entry_data_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_data_usage; - u->v = cpu_to_le64(c->usage_base->replicas[i]); - unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), - "embedded variable length struct"); - } - - for_each_member_device(c, ca) { - unsigned b = sizeof(struct jset_entry_dev_usage) + - sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; - struct jset_entry_dev_usage *u = - container_of(jset_entry_init(end, b), - struct jset_entry_dev_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_dev_usage; - u->dev = cpu_to_le32(ca->dev_idx); - - for (unsigned i = 0; i < BCH_DATA_NR; i++) { - u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); - u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); - u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); - } - } - - percpu_up_read(&c->mark_lock); - for (unsigned i = 0; i < 2; i++) { struct jset_entry_clock *clock = container_of(jset_entry_init(end, sizeof(*clock)), diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 4710b61631f0f..dfbbd33c8731b 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -54,9 +54,32 @@ BCH_FSCK_ERR_subvol_children_not_set) \ x(mi_btree_bitmap, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_btree_bitmap_not_marked) - -#define DOWNGRADE_TABLE() + BCH_FSCK_ERR_btree_bitmap_not_marked) \ + x(disk_accounting_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_bkey_version_in_future, \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_accounting_mismatch) + +#define DOWNGRADE_TABLE() \ + x(bucket_stripe_sectors, \ + 0) \ + x(disk_accounting_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_fs_usage_hidden_wrong, \ + BCH_FSCK_ERR_fs_usage_btree_wrong, \ + BCH_FSCK_ERR_fs_usage_data_wrong, \ + BCH_FSCK_ERR_fs_usage_cached_wrong, \ + BCH_FSCK_ERR_fs_usage_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ + BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + BCH_FSCK_ERR_bkey_version_in_future) 
struct upgrade_downgrade_entry { u64 recovery_passes; @@ -80,6 +103,37 @@ UPGRADE_TABLE() #undef x }; +static int have_stripes(struct bch_fs *c) +{ + return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b); +} + +int bch2_sb_set_upgrade_extra(struct bch_fs *c) +{ + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = c->sb.version; + bool write_sb = false; + int ret = 0; + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (old_version < bcachefs_metadata_version_bucket_stripe_sectors && + new_version >= bcachefs_metadata_version_bucket_stripe_sectors && + (ret = have_stripes(c) > 0)) { + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent); + write_sb = true; + } + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret < 0 ? ret : 0; +} + void bch2_sb_set_upgrade(struct bch_fs *c, unsigned old_version, unsigned new_version) @@ -101,16 +155,12 @@ void bch2_sb_set_upgrade(struct bch_fs *c, ext->recovery_passes_required[0] |= cpu_to_le64(bch2_recovery_passes_to_stable(passes)); - for (const u16 *e = i->errors; - e < i->errors + i->nr_errors; - e++) { - __set_bit(*e, c->sb.errors_silent); - ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64)); - } + for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++) + __set_bit_le64(*e, ext->errors_silent); } } -#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ }; +#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ }; DOWNGRADE_TABLE() #undef x @@ -125,6 +175,37 @@ DOWNGRADE_TABLE() #undef x }; +static int downgrade_table_extra(struct bch_fs *c, darray_char *table) +{ + struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table); + unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); + int ret = 0; + + unsigned nr_errors = le16_to_cpu(dst->nr_errors); + + switch (le16_to_cpu(dst->version)) { + case bcachefs_metadata_version_bucket_stripe_sectors: + if (have_stripes(c)) { + bytes += sizeof(dst->errors[0]) * 2; + + ret = darray_make_room(table, bytes); + if (ret) + return ret; + + /* open coded __set_bit_le64, as dst is packed and + * dst->recovery_passes is misaligned */ + unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations; + dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64)); + + dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong); + } + break; + } + + dst->nr_errors = cpu_to_le16(nr_errors); + return ret; +} + static inline const struct bch_sb_field_downgrade_entry * downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) { @@ -210,6 +291,9 @@ const struct bch_sb_field_ops bch_sb_field_ops_downgrade = { int bch2_sb_downgrade_update(struct bch_fs *c) { + if (!test_bit(BCH_FS_btree_running, &c->flags)) + return 0; + darray_char table = {}; int ret = 0; @@ -234,7 +318,14 @@ int bch2_sb_downgrade_update(struct bch_fs *c) for (unsigned i = 0; i < src->nr_errors; i++) dst->errors[i] = cpu_to_le16(src->errors[i]); - table.nr += bytes; + downgrade_table_extra(c, &table); + + if (!dst->recovery_passes[0] && + !dst->recovery_passes[1] && + !dst->nr_errors) + continue; + + table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * 
le16_to_cpu(dst->nr_errors); } struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h index 57e6c916fc738..095b7cc9bb473 100644 --- a/fs/bcachefs/sb-downgrade.h +++ b/fs/bcachefs/sb-downgrade.h @@ -6,6 +6,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; int bch2_sb_downgrade_update(struct bch_fs *); void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); +int bch2_sb_set_upgrade_extra(struct bch_fs *); void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_SB_DOWNGRADE_H */ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d6f35a99c4291..67648b77613f0 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -227,8 +227,8 @@ enum bch_fsck_flags { x(deleted_inode_is_dir, 213, 0) \ x(deleted_inode_not_unlinked, 214, 0) \ x(extent_overlapping, 215, 0) \ - x(extent_in_missing_inode, 216, 0) \ - x(extent_in_non_reg_inode, 217, 0) \ + x(key_in_missing_inode, 216, 0) \ + x(key_in_wrong_inode_type, 217, 0) \ x(extent_past_end_of_inode, 218, 0) \ x(dirent_empty_name, 219, 0) \ x(dirent_val_too_big, 220, 0) \ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 4ef98e696673f..994489a05ce48 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -549,7 +549,7 @@ static int check_snapshot_tree(struct btree_trans *trans, if (fsck_err_on(ret || root_id != bch2_snapshot_root(c, root_id) || st.k->p.offset != le32_to_cpu(s.tree), - c, snapshot_tree_to_missing_snapshot, + trans, snapshot_tree_to_missing_snapshot, "snapshot tree points to missing/incorrect snapshot:\n %s", (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); @@ -562,19 +562,19 @@ static int check_snapshot_tree(struct btree_trans *trans, goto err; if (fsck_err_on(ret, - c, snapshot_tree_to_missing_subvol, + trans, snapshot_tree_to_missing_subvol, "snapshot tree points to missing subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || fsck_err_on(!bch2_snapshot_is_ancestor(c, le32_to_cpu(subvol.snapshot), root_id), - c, snapshot_tree_to_wrong_subvol, + trans, snapshot_tree_to_wrong_subvol, "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), - c, snapshot_tree_to_snapshot_subvol, + trans, snapshot_tree_to_snapshot_subvol, "snapshot tree points to snapshot subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { @@ -811,7 +811,7 @@ static int check_snapshot(struct btree_trans *trans, } } else { if (fsck_err_on(s.subvol, - c, snapshot_should_not_have_subvol, + trans, snapshot_should_not_have_subvol, "snapshot should not point to subvol:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); @@ -828,7 +828,8 @@ static int check_snapshot(struct btree_trans *trans, if (ret < 0) goto err; - if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree, + if (fsck_err_on(!ret, + trans, snapshot_to_bad_snapshot_tree, "snapshot points to missing/incorrect tree:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = snapshot_tree_ptr_repair(trans, iter, k, &s); @@ -840,7 +841,7 @@ static int check_snapshot(struct btree_trans *trans, real_depth = bch2_snapshot_depth(c, parent_id); if 
(fsck_err_on(le32_to_cpu(s.depth) != real_depth, - c, snapshot_bad_depth, + trans, snapshot_bad_depth, "snapshot with incorrect depth field, should be %u:\n %s", real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); @@ -856,7 +857,8 @@ static int check_snapshot(struct btree_trans *trans, if (ret < 0) goto err; - if (fsck_err_on(!ret, c, snapshot_bad_skiplist, + if (fsck_err_on(!ret, + trans, snapshot_bad_skiplist, "snapshot with bad skiplist field:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); @@ -1018,7 +1020,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) darray_for_each(*t, id) { if (fsck_err_on(!bch2_snapshot_equiv(c, *id), - c, snapshot_node_missing, + trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); @@ -1050,8 +1052,8 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, - bkey_in_missing_snapshot, + if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), + trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index dfc9cf305756f..f56720b558626 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -57,7 +57,7 @@ static int check_subvol(struct btree_trans *trans, if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && subvol.v->fs_path_parent, - c, subvol_root_fs_path_parent_nonzero, + trans, subvol_root_fs_path_parent_nonzero, "root subvolume has nonzero fs_path_parent\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { struct bkey_i_subvolume *n = @@ -80,7 +80,7 @@ static int check_subvol(struct btree_trans *trans, goto err; if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, - c, subvol_children_not_set, + trans, subvol_children_not_set, "subvolume not set in subvolume_children btree at %llu:%llu\n%s", pos.inode, pos.offset, (printbuf_reset(&buf), @@ -101,7 +101,8 @@ static int check_subvol(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (fsck_err_on(ret, c, subvol_to_missing_root, + if (fsck_err_on(ret, + trans, subvol_to_missing_root, "subvolume %llu points to missing subvolume root %llu:%u", k.k->p.offset, le64_to_cpu(subvol.v->inode), le32_to_cpu(subvol.v->snapshot))) { @@ -111,7 +112,7 @@ static int check_subvol(struct btree_trans *trans, } if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, - c, subvol_root_wrong_bi_subvol, + trans, subvol_root_wrong_bi_subvol, "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", inode.bi_inum, inode_iter.k.p.snapshot, inode.bi_subvol, subvol.k->p.offset)) { @@ -139,7 +140,7 @@ static int check_subvol(struct btree_trans *trans, return ret; if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, - c, subvol_not_master_and_not_snapshot, + trans, subvol_not_master_and_not_snapshot, "subvolume %llu is not set as snapshot but is not master subvolume", k.k->p.offset)) { struct bkey_i_subvolume *s = @@ -173,7 +174,6 @@ static int check_subvol_child(struct btree_trans *trans, struct btree_iter *child_iter, struct bkey_s_c child_k) { - struct bch_fs *c = trans->c; struct bch_subvolume s; int 
ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), 0, subvolume, &s); @@ -182,7 +182,7 @@ static int check_subvol_child(struct btree_trans *trans, if (fsck_err_on(ret || le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, - c, subvol_children_bad, + trans, subvol_children_bad, "incorrect entry in subvolume_children btree %llu:%llu", child_k.k->p.inode, child_k.k->p.offset)) { ret = bch2_btree_delete_at(trans, child_iter, 0); @@ -630,9 +630,9 @@ int bch2_initialize_subvolumes(struct bch_fs *c) root_volume.v.snapshot = cpu_to_le32(U32_MAX); root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 641f2975177b6..bf7e998603f11 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -25,6 +25,7 @@ #include "clock.h" #include "compress.h" #include "debug.h" +#include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" @@ -88,6 +89,19 @@ const char * const bch2_fs_flag_strs[] = { NULL }; +void bch2_print_str(struct bch_fs *c, const char *str) +{ +#ifdef __KERNEL__ + struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); + + if (unlikely(stdio)) { + bch2_stdio_redirect_printf(stdio, true, "%s", str); + return; + } +#endif + bch2_print_string_as_lines(KERN_ERR, str); +} + __printf(2, 0) static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) { @@ -222,22 +236,6 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) return c; } -static void bch2_dev_usage_journal_reserve(struct bch_fs *c) -{ - unsigned nr = 0, u64s = - ((sizeof(struct jset_entry_dev_usage) + - sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / - sizeof(u64); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, NULL) - nr++; - rcu_read_unlock(); - - bch2_journal_entry_res_resize(&c->journal, - &c->dev_usage_journal_res, u64s * nr); -} - /* Filesystem RO/RW: */ /* @@ -376,6 +374,7 @@ void bch2_fs_read_only(struct bch_fs *c) BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr); + bch2_verify_accounting_clean(c); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); @@ -536,7 +535,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); - bch2_fs_allocator_background_exit(c); + bch2_fs_accounting_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); @@ -569,6 +568,7 @@ static void __bch2_fs_free(struct bch_fs *c) darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); + free_percpu(c->usage); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); @@ -787,8 +787,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->list); - mutex_init(&c->usage_scratch_lock); - mutex_init(&c->bio_bounce_pages_lock); mutex_init(&c->snapshot_table_lock); 
init_rwsem(&c->snapshot_create_lock); @@ -896,6 +894,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + !(c->usage = alloc_percpu(struct bch_fs_usage_base)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || @@ -911,7 +910,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: - bch2_fs_replicas_init(c) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: @@ -942,7 +940,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); - bch2_dev_usage_journal_reserve(c); bch2_journal_entry_res_resize(&c->journal, &c->clock_journal_res, (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); @@ -1195,6 +1192,7 @@ static void bch2_dev_free(struct bch_dev *ca) kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); + bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); @@ -1317,6 +1315,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, atomic_long_set(&ca->ref, 1); #endif + bch2_dev_allocator_background_init(ca); + if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || @@ -1541,6 +1541,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + bch2_dev_do_discards(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1609,7 +1610,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_norun, NULL); + BTREE_TRIGGER_norun, NULL) ?: + bch2_dev_usage_remove(c, ca->dev_idx); bch_err_msg(c, ret, "removing dev alloc info"); return ret; } @@ -1646,6 +1648,16 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (ret) goto err; + /* + * We need to flush the entire journal to get rid of keys that reference + * the device being removed before removing the superblock entry + */ + bch2_journal_flush_all_pins(&c->journal); + + /* + * this is really just needed for the bch2_replicas_gc_(start|end) + * calls, and could be cleaned up: + */ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); if (ret) @@ -1688,17 +1700,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) bch2_dev_free(ca); - /* - * At this point the device object has been removed in-core, but the - * on-disk journal might still refer to the device index via sb device - * usage entries. Recovery fails if it sees usage information for an - * invalid device. Flush journal pins to push the back of the journal - * past now invalid device index references before we update the - * superblock, but after the device object has been removed so any - * further journal writes elide usage info for the device. 
- */ - bch2_journal_flush_all_pins(&c->journal); - /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: @@ -1711,8 +1712,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_unlock(&c->sb_lock); up_write(&c->state_lock); - - bch2_dev_usage_journal_reserve(c); return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_rw && @@ -1760,8 +1759,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err; } - bch2_dev_usage_init(ca); - ret = __bch2_dev_attach_bdev(ca, &sb); if (ret) goto err; @@ -1843,7 +1840,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) bch2_write_super(c); mutex_unlock(&c->sb_lock); - bch2_dev_usage_journal_reserve(c); + ret = bch2_dev_usage_init(ca, false); + if (ret) + goto err_late; ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(ca, ret, "marking new superblock"); @@ -2015,15 +2014,18 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_dev_data_type, + .dev_data_type.dev = ca->dev_idx, + .dev_data_type.data_type = BCH_DATA_free, + }; + u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; + + ret = bch2_trans_do(ca->fs, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: + bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; - - /* - * XXX: this is all wrong transactionally - we'll be able to do - * this correctly after the disk space accounting rewrite - */ - ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets; } bch2_recalc_capacity(c); @@ -2035,6 +2037,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { + if (!strncmp(name, "/dev/", strlen("/dev/"))) + name += strlen("/dev/"); + for_each_member_device(c, ca) if (!strcmp(name, ca->name)) return ca; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 93ca74d108b17..8df52a232737c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -21,6 +21,7 @@ #include "buckets.h" #include "clock.h" #include "compress.h" +#include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" @@ -198,6 +199,8 @@ read_attribute(disk_groups); read_attribute(has_data); read_attribute(alloc_debug); +read_attribute(accounting); +read_attribute(usage_base); #define x(t, n, ...) 
read_attribute(t); BCH_PERSISTENT_COUNTERS() @@ -251,91 +254,42 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { - struct btree_trans *trans; - enum btree_id id; - struct compression_type_stats { - u64 nr_extents; - u64 sectors_compressed; - u64 sectors_uncompressed; - } s[BCH_COMPRESSION_TYPE_NR]; - u64 compressed_incompressible = 0; - int ret = 0; - - memset(s, 0, sizeof(s)); - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EPERM; - - trans = bch2_trans_get(c); - - for (id = 0; id < BTREE_ID_NR; id++) { - if (!btree_type_has_ptrs(id)) - continue; - - ret = for_each_btree_key(trans, iter, id, POS_MIN, - BTREE_ITER_all_snapshots, k, ({ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *entry; - bool compressed = false, incompressible = false; - - bkey_for_each_crc(k.k, ptrs, crc, entry) { - incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible; - compressed |= crc_is_compressed(crc); - - if (crc_is_compressed(crc)) { - s[crc.compression_type].nr_extents++; - s[crc.compression_type].sectors_compressed += crc.compressed_size; - s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size; - } - } - - compressed_incompressible += compressed && incompressible; - - if (!compressed) { - unsigned t = incompressible ? BCH_COMPRESSION_TYPE_incompressible : 0; - - s[t].nr_extents++; - s[t].sectors_compressed += k.k->size; - s[t].sectors_uncompressed += k.k->size; - } - 0; - })); - } - - bch2_trans_put(trans); - - if (ret) - return ret; - + prt_str(out, "type"); printbuf_tabstop_push(out, 12); printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 24); prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); - for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { + for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) { + struct disk_accounting_pos a = { + .type = BCH_DISK_ACCOUNTING_compression, + .compression.type = i, + }; + struct bpos p = disk_accounting_pos_to_bpos(&a); + u64 v[3]; + bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v)); + + u64 nr_extents = v[0]; + u64 sectors_uncompressed = v[1]; + u64 sectors_compressed = v[2]; + bch2_prt_compression_type(out, i); prt_tab(out); - prt_human_readable_u64(out, s[i].sectors_compressed << 9); + prt_human_readable_u64(out, sectors_compressed << 9); prt_tab_rjust(out); - prt_human_readable_u64(out, s[i].sectors_uncompressed << 9); + prt_human_readable_u64(out, sectors_uncompressed << 9); prt_tab_rjust(out); - prt_human_readable_u64(out, s[i].nr_extents - ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents) + prt_human_readable_u64(out, nr_extents + ? 
div_u64(sectors_uncompressed << 9, nr_extents) : 0); prt_tab_rjust(out); prt_newline(out); } - if (compressed_incompressible) { - prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible); - prt_newline(out); - } - return 0; } @@ -346,6 +300,20 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "\n"); } +static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_usage_base b = {}; + + acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64)); + + prt_printf(out, "hidden:\t\t%llu\n", b.hidden); + prt_printf(out, "btree:\t\t%llu\n", b.btree); + prt_printf(out, "data:\t\t%llu\n", b.data); + prt_printf(out, "cached:\t%llu\n", b.cached); + prt_printf(out, "reserved:\t\t%llu\n", b.reserved); + prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -429,6 +397,12 @@ SHOW(bch2_fs) if (attr == &sysfs_alloc_debug) bch2_fs_alloc_debug_to_text(out, c); + if (attr == &sysfs_accounting) + bch2_fs_accounting_to_text(out, c); + + if (attr == &sysfs_usage_base) + bch2_fs_usage_base_to_text(out, c); + return 0; } @@ -633,6 +607,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_disk_groups, &sysfs_alloc_debug, + &sysfs_accounting, + &sysfs_usage_base, NULL }; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 68104b2056d94..01b768c9b7675 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -121,7 +121,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) ck.k.p.offset = i; ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -176,7 +176,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ck.k.p.snapshot = U32_MAX; ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -232,7 +232,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ck.k.p.offset = i * 2; ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -292,7 +292,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ck.k.p.snapshot = U32_MAX; ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); bch_err_msg(c, ret, "insert error"); if (ret) return ret; @@ -396,7 +396,7 @@ static int insert_test_extent(struct bch_fs *c, k.k_i.k.size = end - start; k.k_i.k.version.lo = test_version++; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); bch_err_fn(c, ret); return ret; } @@ -481,7 +481,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = snapid_hi; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); if (ret) return ret; @@ -506,7 +506,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = U32_MAX; - ret = 
bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); if (ret) return ret; diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index b1af7ac430f66..0807ce9b171ac 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -67,9 +67,14 @@ int bch2_run_thread_with_file(struct thread_with_file *thr, /* stdio_redirect */ +static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen) +{ + return stdio->input.buf.nr > seen || stdio->done; +} + static bool stdio_redirect_has_input(struct stdio_redirect *stdio) { - return stdio->input.buf.nr || stdio->done; + return stdio_redirect_has_more_input(stdio, 0); } static bool stdio_redirect_has_output(struct stdio_redirect *stdio) @@ -181,9 +186,13 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu } spin_lock(&buf->lock); - if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE) - darray_make_room_gfp(&buf->buf, - min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT); + size_t makeroom = b; + if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr)) + makeroom = min_t(ssize_t, makeroom, + max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr, + 0)); + darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT); + b = min(len, darray_room(buf->buf)); if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { @@ -355,43 +364,67 @@ int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t le return ret; } -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) +int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio, + darray_char *line, + unsigned long timeout) { + unsigned long until = jiffies + timeout, t; struct stdio_buf *buf = &stdio->input; - size_t copied = 0; - ssize_t ret = 0; + size_t seen = 0; again: - do { - wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), - sysctl_hung_task_timeout_secs * HZ / 2); - } while (!stdio_redirect_has_input(stdio)); + t = timeout != MAX_SCHEDULE_TIMEOUT + ? 
max_t(long, until - jiffies, 0) + : timeout; - if (stdio->done) { - ret = -1; - goto out; - } + t = min(t, sysctl_hung_task_timeout_secs * HZ / 2); + + wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t); + + if (stdio->done) + return -1; spin_lock(&buf->lock); - size_t b = min(len, buf->buf.nr); - char *n = memchr(buf->buf.data, '\n', b); - if (n) - b = min_t(size_t, b, n + 1 - buf->buf.data); + seen = buf->buf.nr; + char *n = memchr(buf->buf.data, '\n', seen); + + if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) { + spin_unlock(&buf->lock); + return -ETIME; + } + + if (!n) { + buf->waiting_for_line = true; + spin_unlock(&buf->lock); + goto again; + } + + size_t b = n + 1 - buf->buf.data; + if (b > line->size) { + spin_unlock(&buf->lock); + int ret = darray_resize(line, b); + if (ret) + return ret; + seen = 0; + goto again; + } + buf->buf.nr -= b; - memcpy(ubuf, buf->buf.data, b); + memcpy(line->data, buf->buf.data, b); memmove(buf->buf.data, buf->buf.data + b, buf->buf.nr); - ubuf += b; - len -= b; - copied += b; + line->nr = b; + + buf->waiting_for_line = false; spin_unlock(&buf->lock); wake_up(&buf->wait); + return 0; +} - if (!n && len) - goto again; -out: - return copied ?: ret; +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line) +{ + return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT); } __printf(3, 0) diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 1d63d14d7dcae..72497b9219113 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -71,7 +71,9 @@ int bch2_run_thread_with_stdio(struct thread_with_stdio *, int bch2_run_thread_with_stdout(struct thread_with_stdio *, const struct thread_with_stdio_ops *); int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); -int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); + +int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long); +int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *); __printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); __printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h index e0daf4eec341e..f4d484d44f633 100644 --- a/fs/bcachefs/thread_with_file_types.h +++ b/fs/bcachefs/thread_with_file_types.h @@ -8,15 +8,12 @@ struct stdio_buf { spinlock_t lock; wait_queue_head_t wait; darray_char buf; + bool waiting_for_line; }; struct stdio_redirect { struct stdio_buf input; struct stdio_buf output; - - spinlock_t input_lock; - wait_queue_head_t input_wait; - darray_char input_buf; bool done; }; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 84fcf26e306e6..d0e6b9deb6cb4 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -200,6 +200,56 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* fs.c: */ +TRACE_EVENT(bch2_sync_fs, + TP_PROTO(struct super_block *sb, int wait), + + TP_ARGS(sb, wait), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, wait ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->wait = wait; + ), + + TP_printk("dev %d,%d wait %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait) +); + +/* fs-io.c: */ +TRACE_EVENT(bch2_fsync, + TP_PROTO(struct file *file, int datasync), + + 
TP_ARGS(file, datasync), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, parent ) + __field( int, datasync ) + ), + + TP_fast_assign( + struct dentry *dentry = file->f_path.dentry; + + __entry->dev = dentry->d_sb->s_dev; + __entry->ino = d_inode(dentry)->i_ino; + __entry->parent = d_inode(dentry->d_parent)->i_ino; + __entry->datasync = datasync; + ), + + TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long) __entry->parent, __entry->datasync) +); + /* super-io.c: */ TRACE_EVENT(write_super, TP_PROTO(struct bch_fs *c, unsigned long ip), diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h index 9058017720025..7f647846b511f 100644 --- a/fs/bcachefs/two_state_shared_lock.h +++ b/fs/bcachefs/two_state_shared_lock.h @@ -36,15 +36,14 @@ static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) { long i = s ? 1 : -1; - long v = atomic_long_read(&lock->v), old; + long old; + old = atomic_long_read(&lock->v); do { - old = v; - - if (i > 0 ? v < 0 : v > 0) + if (i > 0 ? old < 0 : old > 0) return false; - } while ((v = atomic_long_cmpxchg_acquire(&lock->v, - old, old + i)) != old); + } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i)); + return true; } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 5d2c470a49ac9..76ffe08e71b56 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -697,14 +697,19 @@ do { \ } \ } while (0) +#define per_cpu_sum(_p) \ +({ \ + typeof(*_p) _ret = 0; \ + \ + int cpu; \ + for_each_possible_cpu(cpu) \ + _ret += *per_cpu_ptr(_p, cpu); \ + _ret; \ +}) + static inline u64 percpu_u64_get(u64 __percpu *src) { - u64 ret = 0; - int cpu; - - for_each_possible_cpu(cpu) - ret += *per_cpu_ptr(src, cpu); - return ret; + return per_cpu_sum(src); } static inline void percpu_u64_set(u64 __percpu *dst, u64 src) @@ -718,9 +723,7 @@ static inline void percpu_u64_set(u64 __percpu *dst, u64 src) static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) { - unsigned i; - - for (i = 0; i < nr; i++) + for (unsigned i = 0; i < nr; i++) acc[i] += src[i]; }
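
Side note on the two_state_shared_lock.h hunk above: atomic_long_try_cmpxchg_acquire() writes the observed value back into 'old' when the exchange fails, which is what lets the rewritten bch2_two_state_trylock() drop the explicit re-read of lock->v. Below is a minimal userspace sketch of the same retry pattern, using C11 atomics as a stand-in for the kernel atomic_long helpers; the demo_* names and the main() driver are illustrative only and are not part of the patch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace stand-in for two_state_lock_t: >0 means held in state 1,
 * <0 means held in state 0, 0 means unlocked. */
typedef atomic_long demo_two_state_lock_t;

static bool demo_two_state_trylock(demo_two_state_lock_t *lock, int s)
{
	long i = s ? 1 : -1;
	long old = atomic_load_explicit(lock, memory_order_relaxed);

	do {
		/* Already held in the conflicting state? */
		if (i > 0 ? old < 0 : old > 0)
			return false;
		/* On failure, 'old' is refreshed with the current value,
		 * so the loop retries without an explicit re-read. */
	} while (!atomic_compare_exchange_weak_explicit(lock, &old, old + i,
							memory_order_acquire,
							memory_order_relaxed));
	return true;
}

int main(void)
{
	demo_two_state_lock_t lock = 0;

	printf("take state 1: %d\n", demo_two_state_trylock(&lock, 1)); /* succeeds */
	printf("take state 1 again: %d\n", demo_two_state_trylock(&lock, 1)); /* succeeds, shared */
	printf("take state 0: %d\n", demo_two_state_trylock(&lock, 0)); /* fails, conflicting state */
	return 0;
}

The kernel's atomic_long_try_cmpxchg_acquire() is a strong compare-exchange; the weak form is used in this sketch only because the loop retries on spurious failure anyway.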