path: root/libglusterfs/src/mem-pool.h
author     Shehjar Tikoo <shehjart@gluster.com>       2009-05-21 01:37:05 +0000
committer  Anand V. Avati <avati@dev.gluster.com>     2009-05-21 02:58:22 -0700
commit     c7fd9f1df36a01ac9a1509d1001b0d259005b0f9 (patch)
tree       4ed27b1cddb973536313fc1817dd32828a37c87f /libglusterfs/src/mem-pool.h
parent     ef7fcc0d7c2695ab3b10477b3e1290e43831da99 (diff)
mem-pool,io-threads: Destroy mem-pool on deallocation
Signed-off-by: Anand V. Avati <avati@dev.gluster.com>
Diffstat (limited to 'libglusterfs/src/mem-pool.h')
-rw-r--r--  libglusterfs/src/mem-pool.h  2
1 file changed, 2 insertions, 0 deletions
diff --git a/libglusterfs/src/mem-pool.h b/libglusterfs/src/mem-pool.h
index 627945e1fe6..152c17bb412 100644
--- a/libglusterfs/src/mem-pool.h
+++ b/libglusterfs/src/mem-pool.h
@@ -52,4 +52,6 @@ mem_pool_new_fn (unsigned long sizeof_type, unsigned long count);
 void mem_put (struct mem_pool *pool, void *ptr);
 void *mem_get (struct mem_pool *pool);
+void mem_pool_destroy (struct mem_pool *pool);
+
 #endif /* _MEM_POOL_H */
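
The patch itself only adds the declaration, so a short usage sketch of the mem-pool API follows for context. It is illustrative only: it relies on the declarations visible in this header (mem_pool_new_fn, mem_get, mem_put and the newly added mem_pool_destroy), assumes mem_pool_new_fn returns a struct mem_pool pointer, and the demo_obj type and the pool size of 64 are invented for the example rather than taken from the patch.

#include "mem-pool.h"

struct demo_obj {
        int id;
};

static int
mem_pool_demo (void)
{
        struct mem_pool *pool = NULL;
        struct demo_obj *obj  = NULL;

        /* Pre-allocate a pool of 64 fixed-size objects
           (assumed return type: struct mem_pool *). */
        pool = mem_pool_new_fn (sizeof (struct demo_obj), 64);
        if (!pool)
                return -1;

        obj = mem_get (pool);          /* take one object out of the pool */
        if (obj) {
                obj->id = 1;
                mem_put (pool, obj);   /* hand the object back to the pool */
        }

        /* New in this patch: release the pool itself once it is no
           longer needed. */
        mem_pool_destroy (pool);

        return 0;
}

Per the commit subject, the io-threads xlator is the caller this change targets, destroying its request mem-pool when the pool is deallocated.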
-rw-r--r--xlators/cluster/afr/src/Makefile.am32
-rw-r--r--xlators/cluster/afr/src/afr-common.c9518
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c934
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.h49
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c2674
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.h60
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c2400
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.h54
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c3233
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.h104
-rw-r--r--xlators/cluster/afr/src/afr-lk-common.c2616
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h64
-rw-r--r--xlators/cluster/afr/src/afr-messages.h167
-rw-r--r--xlators/cluster/afr/src/afr-open.c864
-rw-r--r--xlators/cluster/afr/src/afr-read-txn.c494
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.c1090
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.h60
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c4426
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.h113
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c1887
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c3197
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c1028
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-name.c616
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h393
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c1716
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.h75
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c3475
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h84
-rw-r--r--xlators/cluster/afr/src/afr.c1968
-rw-r--r--xlators/cluster/afr/src/afr.h2003
-rw-r--r--xlators/cluster/afr/src/pump.c2536
-rw-r--r--xlators/cluster/afr/src/pump.h96
-rw-r--r--xlators/cluster/dht/src/Makefile.am40
-rw-r--r--xlators/cluster/dht/src/dht-common.c14157
-rw-r--r--xlators/cluster/dht/src/dht-common.h1585
-rw-r--r--xlators/cluster/dht/src/dht-diskusage.c627
-rw-r--r--xlators/cluster/dht/src/dht-hashfn.c147
-rw-r--r--xlators/cluster/dht/src/dht-helper.c2453
-rw-r--r--xlators/cluster/dht/src/dht-inode-read.c1658
-rw-r--r--xlators/cluster/dht/src/dht-inode-write.c1404
-rw-r--r--xlators/cluster/dht/src/dht-layout.c1189
-rw-r--r--xlators/cluster/dht/src/dht-linkfile.c415
-rw-r--r--xlators/cluster/dht/src/dht-lock.c1392
-rw-r--r--xlators/cluster/dht/src/dht-lock.h91
-rw-r--r--xlators/cluster/dht/src/dht-mem-types.h54
-rw-r--r--xlators/cluster/dht/src/dht-messages.h386
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c4702
-rw-r--r--xlators/cluster/dht/src/dht-rename.c2435
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c2846
-rw-r--r--xlators/cluster/dht/src/dht-shared.c1104
-rw-r--r--xlators/cluster/dht/src/dht.c678
-rw-r--r--xlators/cluster/dht/src/nufa.c1183
-rw-r--r--xlators/cluster/dht/src/switch.c1647
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_mock.c73
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_unittest.c127
-rw-r--r--xlators/cluster/ec/Makefile.am (renamed from xlators/cluster/ha/Makefile.am)0
-rw-r--r--xlators/cluster/ec/src/Makefile.am83
-rw-r--r--xlators/cluster/ec/src/ec-code-avx.c109
-rw-r--r--xlators/cluster/ec/src/ec-code-avx.h18
-rw-r--r--xlators/cluster/ec/src/ec-code-c.c11679
-rw-r--r--xlators/cluster/ec/src/ec-code-c.h27
-rw-r--r--xlators/cluster/ec/src/ec-code-intel.c594
-rw-r--r--xlators/cluster/ec/src/ec-code-intel.h191
-rw-r--r--xlators/cluster/ec/src/ec-code-sse.c101
-rw-r--r--xlators/cluster/ec/src/ec-code-sse.h18
-rw-r--r--xlators/cluster/ec/src/ec-code-x64.c144
-rw-r--r--xlators/cluster/ec/src/ec-code-x64.h18
-rw-r--r--xlators/cluster/ec/src/ec-code.c1060
-rw-r--r--xlators/cluster/ec/src/ec-code.h44
-rw-r--r--xlators/cluster/ec/src/ec-combine.c995
-rw-r--r--xlators/cluster/ec/src/ec-combine.h44
-rw-r--r--xlators/cluster/ec/src/ec-common.c3042
-rw-r--r--xlators/cluster/ec/src/ec-common.h234
-rw-r--r--xlators/cluster/ec/src/ec-data.c288
-rw-r--r--xlators/cluster/ec/src/ec-data.h35
-rw-r--r--xlators/cluster/ec/src/ec-dir-read.c647
-rw-r--r--xlators/cluster/ec/src/ec-dir-write.c1487
-rw-r--r--xlators/cluster/ec/src/ec-fops.h254
-rw-r--r--xlators/cluster/ec/src/ec-galois.c183
-rw-r--r--xlators/cluster/ec/src/ec-galois.h32
-rw-r--r--xlators/cluster/ec/src/ec-generic.c1591
-rw-r--r--xlators/cluster/ec/src/ec-gf8.c5882
-rw-r--r--xlators/cluster/ec/src/ec-gf8.h18
-rw-r--r--xlators/cluster/ec/src/ec-heal.c3367
-rw-r--r--xlators/cluster/ec/src/ec-heald.c681
-rw-r--r--xlators/cluster/ec/src/ec-heald.h30
-rw-r--r--xlators/cluster/ec/src/ec-helpers.c867
-rw-r--r--xlators/cluster/ec/src/ec-helpers.h200
-rw-r--r--xlators/cluster/ec/src/ec-inode-read.c2046
-rw-r--r--xlators/cluster/ec/src/ec-inode-write.c2369
-rw-r--r--xlators/cluster/ec/src/ec-locks.c1128
-rw-r--r--xlators/cluster/ec/src/ec-mem-types.h30
-rw-r--r--xlators/cluster/ec/src/ec-messages.h61
-rw-r--r--xlators/cluster/ec/src/ec-method.c433
-rw-r--r--xlators/cluster/ec/src/ec-method.h48
-rw-r--r--xlators/cluster/ec/src/ec-types.h690
-rw-r--r--xlators/cluster/ec/src/ec.c1873
-rw-r--r--xlators/cluster/ec/src/ec.h34
-rw-r--r--xlators/cluster/ha/src/Makefile.am15
-rw-r--r--xlators/cluster/ha/src/ha-helpers.c204
-rw-r--r--xlators/cluster/ha/src/ha-mem-types.h37
-rw-r--r--xlators/cluster/ha/src/ha.c4023
-rw-r--r--xlators/cluster/ha/src/ha.h63
-rw-r--r--xlators/cluster/map/src/Makefile.am15
-rw-r--r--xlators/cluster/map/src/map-helper.c358
-rw-r--r--xlators/cluster/map/src/map-mem-types.h35
-rw-r--r--xlators/cluster/map/src/map.c2577
-rw-r--r--xlators/cluster/map/src/map.h77
-rw-r--r--xlators/cluster/stripe/src/Makefile.am17
-rw-r--r--xlators/cluster/stripe/src/stripe-mem-types.h41
-rw-r--r--xlators/cluster/stripe/src/stripe.c4923
-rw-r--r--xlators/cluster/stripe/src/stripe.h205
-rw-r--r--xlators/cluster/unify/src/Makefile.am16
-rw-r--r--xlators/cluster/unify/src/unify-mem-types.h41
-rw-r--r--xlators/cluster/unify/src/unify-self-heal.c1239
-rw-r--r--xlators/cluster/unify/src/unify.c4589
-rw-r--r--xlators/cluster/unify/src/unify.h146
-rw-r--r--xlators/debug/Makefile.am2
-rw-r--r--xlators/debug/delay-gen/Makefile.am (renamed from xlators/features/access-control/Makefile.am)0
-rw-r--r--xlators/debug/delay-gen/src/Makefile.am11
-rw-r--r--xlators/debug/delay-gen/src/delay-gen-mem-types.h21
-rw-r--r--xlators/debug/delay-gen/src/delay-gen-messages.h26
-rw-r--r--xlators/debug/delay-gen/src/delay-gen.c697
-rw-r--r--xlators/debug/delay-gen/src/delay-gen.h27
-rw-r--r--xlators/debug/error-gen/src/Makefile.am10
-rw-r--r--xlators/debug/error-gen/src/error-gen-mem-types.h20
-rw-r--r--xlators/debug/error-gen/src/error-gen.c3075
-rw-r--r--xlators/debug/error-gen/src/error-gen.h60
-rw-r--r--xlators/debug/io-stats/src/Makefile.am10
-rw-r--r--xlators/debug/io-stats/src/io-stats-mem-types.h38
-rw-r--r--xlators/debug/io-stats/src/io-stats.c5801
-rw-r--r--xlators/debug/sink/Makefile.am (renamed from xlators/cluster/stripe/Makefile.am)1
-rw-r--r--xlators/debug/sink/src/Makefile.am14
-rw-r--r--xlators/debug/sink/src/sink.c94
-rw-r--r--xlators/debug/trace/src/Makefile.am9
-rw-r--r--xlators/debug/trace/src/trace-mem-types.h20
-rw-r--r--xlators/debug/trace/src/trace.c4911
-rw-r--r--xlators/debug/trace/src/trace.h55
-rw-r--r--xlators/encryption/Makefile.am3
-rw-r--r--xlators/encryption/rot-13/src/rot-13.c208
-rw-r--r--xlators/encryption/rot-13/src/rot-13.h33
-rw-r--r--xlators/features/Makefile.am13
-rw-r--r--xlators/features/access-control/src/Makefile.am13
-rw-r--r--xlators/features/arbiter/Makefile.am (renamed from xlators/cluster/unify/Makefile.am)2
-rw-r--r--xlators/features/arbiter/src/Makefile.am19
-rw-r--r--xlators/features/arbiter/src/arbiter-mem-types.h18
-rw-r--r--xlators/features/arbiter/src/arbiter.c380
-rw-r--r--xlators/features/arbiter/src/arbiter.h21
-rw-r--r--xlators/features/barrier/Makefile.am (renamed from xlators/features/filter/Makefile.am)2
-rw-r--r--xlators/features/barrier/src/Makefile.am17
-rw-r--r--xlators/features/barrier/src/barrier-mem-types.h20
-rw-r--r--xlators/features/barrier/src/barrier.c809
-rw-r--r--xlators/features/barrier/src/barrier.h89
-rw-r--r--xlators/features/bit-rot/Makefile.am (renamed from xlators/protocol/legacy/transport/ib-verbs/Makefile.am)0
-rw-r--r--xlators/features/bit-rot/src/Makefile.am1
-rw-r--r--xlators/features/bit-rot/src/bitd/Makefile.am23
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h101
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c78
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h50
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub.c2070
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub.h46
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-ssm.c124
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-ssm.h38
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c2232
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.h302
-rw-r--r--xlators/features/bit-rot/src/stub/Makefile.am20
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-common.h178
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-object-version.h30
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c796
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h36
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h117
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.c3590
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.h515
-rw-r--r--xlators/features/changelog/Makefile.am3
-rw-r--r--xlators/features/changelog/lib/Makefile.am (renamed from xlators/encryption/rot-13/Makefile.am)2
-rw-r--r--xlators/features/changelog/lib/examples/c/get-changes-multi.c90
-rw-r--r--xlators/features/changelog/lib/examples/c/get-changes.c93
-rw-r--r--xlators/features/changelog/lib/examples/c/get-history.c116
-rwxr-xr-xxlators/features/changelog/lib/examples/python/changes.py34
-rw-r--r--xlators/features/changelog/lib/examples/python/libgfchangelog.py71
-rw-r--r--xlators/features/changelog/lib/src/Makefile.am35
-rw-r--r--xlators/features/changelog/lib/src/changelog-lib-messages.h74
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-api.c224
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.c170
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.h255
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-journal-handler.c1029
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-journal.h116
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-reborp.c413
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-rpc.c98
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-rpc.h28
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog.c652
-rw-r--r--xlators/features/changelog/lib/src/gf-history-changelog.c1020
-rw-r--r--xlators/features/changelog/src/Makefile.am29
-rw-r--r--xlators/features/changelog/src/changelog-barrier.c131
-rw-r--r--xlators/features/changelog/src/changelog-encoders.c232
-rw-r--r--xlators/features/changelog/src/changelog-encoders.h50
-rw-r--r--xlators/features/changelog/src/changelog-ev-handle.c412
-rw-r--r--xlators/features/changelog/src/changelog-ev-handle.h136
-rw-r--r--xlators/features/changelog/src/changelog-helpers.c1977
-rw-r--r--xlators/features/changelog/src/changelog-helpers.h716
-rw-r--r--xlators/features/changelog/src/changelog-mem-types.h34
-rw-r--r--xlators/features/changelog/src/changelog-messages.h172
-rw-r--r--xlators/features/changelog/src/changelog-misc.h131
-rw-r--r--xlators/features/changelog/src/changelog-rpc-common.c359
-rw-r--r--xlators/features/changelog/src/changelog-rpc-common.h85
-rw-r--r--xlators/features/changelog/src/changelog-rpc.c440
-rw-r--r--xlators/features/changelog/src/changelog-rpc.h31
-rw-r--r--xlators/features/changelog/src/changelog-rt.c66
-rw-r--r--xlators/features/changelog/src/changelog-rt.h33
-rw-r--r--xlators/features/changelog/src/changelog.c2989
-rw-r--r--xlators/features/cloudsync/Makefile.am3
-rw-r--r--xlators/features/cloudsync/src/Makefile.am46
-rw-r--r--xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.c30
-rw-r--r--xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.h24
-rw-r--r--xlators/features/cloudsync/src/cloudsync-common.c60
-rw-r--r--xlators/features/cloudsync/src/cloudsync-common.h134
-rwxr-xr-xxlators/features/cloudsync/src/cloudsync-fops-c.py324
-rwxr-xr-xxlators/features/cloudsync/src/cloudsync-fops-h.py31
-rw-r--r--xlators/features/cloudsync/src/cloudsync-mem-types.h22
-rw-r--r--xlators/features/cloudsync/src/cloudsync-messages.h16
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/Makefile.am3
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/Makefile.am11
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/Makefile.am3
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/Makefile.am12
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3-mem-types.h19
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c584
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.h50
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym1
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/Makefile.am3
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/Makefile.am12
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/archivestore.h203
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/cvlt-messages.h30
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym1
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt-mem-types.h19
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.c842
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.h84
-rw-r--r--xlators/features/cloudsync/src/cloudsync.c2076
-rw-r--r--xlators/features/cloudsync/src/cloudsync.h123
-rw-r--r--xlators/features/compress/Makefile.am3
-rw-r--r--xlators/features/compress/src/Makefile.am19
-rw-r--r--xlators/features/compress/src/cdc-helper.c527
-rw-r--r--xlators/features/compress/src/cdc-mem-types.h23
-rw-r--r--xlators/features/compress/src/cdc.c348
-rw-r--r--xlators/features/compress/src/cdc.h99
-rw-r--r--xlators/features/filter/src/Makefile.am15
-rw-r--r--xlators/features/filter/src/filter-mem-types.h30
-rw-r--r--xlators/features/filter/src/filter.c1744
-rw-r--r--xlators/features/gfid-access/Makefile.am (renamed from xlators/bindings/python/Makefile.am)0
-rw-r--r--xlators/features/gfid-access/src/Makefile.am16
-rw-r--r--xlators/features/gfid-access/src/gfid-access-mem-types.h22
-rw-r--r--xlators/features/gfid-access/src/gfid-access.c1420
-rw-r--r--xlators/features/gfid-access/src/gfid-access.h107
-rw-r--r--xlators/features/index/Makefile.am3
-rw-r--r--xlators/features/index/src/Makefile.am19
-rw-r--r--xlators/features/index/src/index-mem-types.h23
-rw-r--r--xlators/features/index/src/index-messages.h33
-rw-r--r--xlators/features/index/src/index.c2682
-rw-r--r--xlators/features/index/src/index.h86
-rw-r--r--xlators/features/leases/Makefile.am3
-rw-r--r--xlators/features/leases/src/Makefile.am20
-rw-r--r--xlators/features/leases/src/leases-internal.c1412
-rw-r--r--xlators/features/leases/src/leases-mem-types.h27
-rw-r--r--xlators/features/leases/src/leases-messages.h33
-rw-r--r--xlators/features/leases/src/leases.c1168
-rw-r--r--xlators/features/leases/src/leases.h259
-rw-r--r--xlators/features/locks/src/Makefile.am23
-rw-r--r--xlators/features/locks/src/clear.c460
-rw-r--r--xlators/features/locks/src/clear.h73
-rw-r--r--xlators/features/locks/src/common.c1949
-rw-r--r--xlators/features/locks/src/common.h265
-rw-r--r--xlators/features/locks/src/entrylk.c1484
-rw-r--r--xlators/features/locks/src/inodelk.c1491
-rw-r--r--xlators/features/locks/src/locks-mem-types.h44
-rw-r--r--xlators/features/locks/src/locks.h336
-rw-r--r--xlators/features/locks/src/pl-messages.h29
-rw-r--r--xlators/features/locks/src/posix.c5801
-rw-r--r--xlators/features/locks/src/reservelk.c606
-rw-r--r--xlators/features/locks/tests/unit-test.c118
-rw-r--r--xlators/features/mac-compat/Makefile.am3
-rw-r--r--xlators/features/mac-compat/src/Makefile.am13
-rw-r--r--xlators/features/mac-compat/src/mac-compat.c247
-rw-r--r--xlators/features/marker/Makefile.am2
-rw-r--r--xlators/features/marker/src/Makefile.am19
-rw-r--r--xlators/features/marker/src/marker-common.c109
-rw-r--r--xlators/features/marker/src/marker-common.h37
-rw-r--r--xlators/features/marker/src/marker-mem-types.h43
-rw-r--r--xlators/features/marker/src/marker-quota-helper.c611
-rw-r--r--xlators/features/marker/src/marker-quota-helper.h109
-rw-r--r--xlators/features/marker/src/marker-quota.c4035
-rw-r--r--xlators/features/marker/src/marker-quota.h235
-rw-r--r--xlators/features/marker/src/marker.c4417
-rw-r--r--xlators/features/marker/src/marker.h226
-rw-r--r--xlators/features/marker/utils/Makefile.am7
-rwxr-xr-xxlators/features/marker/utils/gsyncd.in55
-rw-r--r--xlators/features/marker/utils/syncdaemon/Makefile.am5
-rw-r--r--xlators/features/marker/utils/syncdaemon/README.md81
-rw-r--r--xlators/features/marker/utils/syncdaemon/__init__.py0
-rw-r--r--xlators/features/marker/utils/syncdaemon/configinterface.py185
-rw-r--r--xlators/features/marker/utils/syncdaemon/gconf.py15
-rw-r--r--xlators/features/marker/utils/syncdaemon/gsyncd.py302
-rw-r--r--xlators/features/marker/utils/syncdaemon/libcxattr.py62
-rw-r--r--xlators/features/marker/utils/syncdaemon/master.py366
-rw-r--r--xlators/features/marker/utils/syncdaemon/monitor.py80
-rw-r--r--xlators/features/marker/utils/syncdaemon/repce.py162
-rw-r--r--xlators/features/marker/utils/syncdaemon/resource.py467
-rw-r--r--xlators/features/marker/utils/syncdaemon/syncdutils.py160
-rw-r--r--xlators/features/metadisp/Makefile.am3
-rw-r--r--xlators/features/metadisp/src/Makefile.am38
-rw-r--r--xlators/features/metadisp/src/backend.c45
-rw-r--r--xlators/features/metadisp/src/fops-tmpl.c10
-rw-r--r--xlators/features/metadisp/src/gen-fops.py160
-rw-r--r--xlators/features/metadisp/src/metadisp-create.c101
-rw-r--r--xlators/features/metadisp/src/metadisp-fops.h51
-rw-r--r--xlators/features/metadisp/src/metadisp-fsync.c54
-rw-r--r--xlators/features/metadisp/src/metadisp-lookup.c90
-rw-r--r--xlators/features/metadisp/src/metadisp-open.c70
-rw-r--r--xlators/features/metadisp/src/metadisp-readdir.c65
-rw-r--r--xlators/features/metadisp/src/metadisp-setattr.c90
-rw-r--r--xlators/features/metadisp/src/metadisp-stat.c124
-rw-r--r--xlators/features/metadisp/src/metadisp-unlink.c160
-rw-r--r--xlators/features/metadisp/src/metadisp.c46
-rw-r--r--xlators/features/metadisp/src/metadisp.h45
-rw-r--r--xlators/features/namespace/Makefile.am3
-rw-r--r--xlators/features/namespace/src/Makefile.am17
-rw-r--r--xlators/features/namespace/src/namespace.c1344
-rw-r--r--xlators/features/namespace/src/namespace.h23
-rw-r--r--xlators/features/path-convertor/Makefile.am3
-rw-r--r--xlators/features/path-convertor/src/Makefile.am14
-rw-r--r--xlators/features/path-convertor/src/path-mem-types.h32
-rw-r--r--xlators/features/path-convertor/src/path.c1239
-rw-r--r--xlators/features/quiesce/src/Makefile.am10
-rw-r--r--xlators/features/quiesce/src/quiesce-mem-types.h27
-rw-r--r--xlators/features/quiesce/src/quiesce-messages.h28
-rw-r--r--xlators/features/quiesce/src/quiesce.c3532
-rw-r--r--xlators/features/quiesce/src/quiesce.h89
-rw-r--r--xlators/features/quota/src/Makefile.am29
-rw-r--r--xlators/features/quota/src/quota-enforcer-client.c503
-rw-r--r--xlators/features/quota/src/quota-mem-types.h46
-rw-r--r--xlators/features/quota/src/quota-messages.h39
-rw-r--r--xlators/features/quota/src/quota.c6712
-rw-r--r--xlators/features/quota/src/quota.h393
-rw-r--r--xlators/features/quota/src/quotad-aggregator.c494
-rw-r--r--xlators/features/quota/src/quotad-aggregator.h38
-rw-r--r--xlators/features/quota/src/quotad-helpers.c107
-rw-r--r--xlators/features/quota/src/quotad-helpers.h24
-rw-r--r--xlators/features/quota/src/quotad.c245
-rw-r--r--xlators/features/read-only/src/Makefile.am20
-rw-r--r--xlators/features/read-only/src/read-only-common.c406
-rw-r--r--xlators/features/read-only/src/read-only-common.h121
-rw-r--r--xlators/features/read-only/src/read-only-mem-types.h20
-rw-r--r--xlators/features/read-only/src/read-only.c374
-rw-r--r--xlators/features/read-only/src/read-only.h37
-rw-r--r--xlators/features/read-only/src/worm-helper.c395
-rw-r--r--xlators/features/read-only/src/worm-helper.h44
-rw-r--r--xlators/features/read-only/src/worm.c722
-rw-r--r--xlators/features/sdfs/Makefile.am3
-rw-r--r--xlators/features/sdfs/src/Makefile.am19
-rw-r--r--xlators/features/sdfs/src/sdfs-messages.h67
-rw-r--r--xlators/features/sdfs/src/sdfs.c1479
-rw-r--r--xlators/features/sdfs/src/sdfs.h49
-rw-r--r--xlators/features/selinux/Makefile.am3
-rw-r--r--xlators/features/selinux/src/Makefile.am20
-rw-r--r--xlators/features/selinux/src/selinux-mem-types.h19
-rw-r--r--xlators/features/selinux/src/selinux-messages.h30
-rw-r--r--xlators/features/selinux/src/selinux.c323
-rw-r--r--xlators/features/selinux/src/selinux.h24
-rw-r--r--xlators/features/shard/Makefile.am3
-rw-r--r--xlators/features/shard/src/Makefile.am17
-rw-r--r--xlators/features/shard/src/shard-mem-types.h24
-rw-r--r--xlators/features/shard/src/shard-messages.h39
-rw-r--r--xlators/features/shard/src/shard.c7382
-rw-r--r--xlators/features/shard/src/shard.h348
-rw-r--r--xlators/features/snapview-client/Makefile.am (renamed from xlators/performance/stat-prefetch/Makefile.am)0
-rw-r--r--xlators/features/snapview-client/src/Makefile.am16
-rw-r--r--xlators/features/snapview-client/src/snapview-client-mem-types.h24
-rw-r--r--xlators/features/snapview-client/src/snapview-client-messages.h71
-rw-r--r--xlators/features/snapview-client/src/snapview-client.c2791
-rw-r--r--xlators/features/snapview-client/src/snapview-client.h101
-rw-r--r--xlators/features/snapview-server/Makefile.am1
-rw-r--r--xlators/features/snapview-server/src/Makefile.am25
-rw-r--r--xlators/features/snapview-server/src/snapview-server-helpers.c715
-rw-r--r--xlators/features/snapview-server/src/snapview-server-mem-types.h25
-rw-r--r--xlators/features/snapview-server/src/snapview-server-messages.h54
-rw-r--r--xlators/features/snapview-server/src/snapview-server-mgmt.c524
-rw-r--r--xlators/features/snapview-server/src/snapview-server.c2720
-rw-r--r--xlators/features/snapview-server/src/snapview-server.h255
-rw-r--r--xlators/features/thin-arbiter/Makefile.am3
-rw-r--r--xlators/features/thin-arbiter/src/Makefile.am22
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter-mem-types.h19
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter-messages.h28
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter.c661
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter.h59
-rw-r--r--xlators/features/trash/src/Makefile.am12
-rw-r--r--xlators/features/trash/src/trash-mem-types.h33
-rw-r--r--xlators/features/trash/src/trash.c3761
-rw-r--r--xlators/features/trash/src/trash.h128
-rw-r--r--xlators/features/upcall/Makefile.am3
-rw-r--r--xlators/features/upcall/src/Makefile.am23
-rw-r--r--xlators/features/upcall/src/upcall-cache-invalidation.h18
-rw-r--r--xlators/features/upcall/src/upcall-internal.c689
-rw-r--r--xlators/features/upcall/src/upcall-mem-types.h23
-rw-r--r--xlators/features/upcall/src/upcall-messages.h29
-rw-r--r--xlators/features/upcall/src/upcall.c2505
-rw-r--r--xlators/features/upcall/src/upcall.h131
-rw-r--r--xlators/features/utime/Makefile.am3
-rw-r--r--xlators/features/utime/src/Makefile.am41
-rw-r--r--xlators/features/utime/src/utime-autogen-fops-tmpl.c28
-rw-r--r--xlators/features/utime/src/utime-autogen-fops-tmpl.h22
-rwxr-xr-xxlators/features/utime/src/utime-gen-fops-c.py147
-rwxr-xr-xxlators/features/utime/src/utime-gen-fops-h.py35
-rw-r--r--xlators/features/utime/src/utime-helpers.c110
-rw-r--r--xlators/features/utime/src/utime-helpers.h25
-rw-r--r--xlators/features/utime/src/utime-mem-types.h21
-rw-r--r--xlators/features/utime/src/utime-messages.h29
-rw-r--r--xlators/features/utime/src/utime.c392
-rw-r--r--xlators/features/utime/src/utime.h23
-rw-r--r--xlators/lib/src/libxlator.c779
-rw-r--r--xlators/lib/src/libxlator.h188
-rw-r--r--xlators/meta/Makefile.am2
-rw-r--r--xlators/meta/src/Makefile.am48
-rw-r--r--xlators/meta/src/active-link.c34
-rw-r--r--xlators/meta/src/cmdline-file.c39
-rw-r--r--xlators/meta/src/frames-file.c107
-rw-r--r--xlators/meta/src/graph-dir.c98
-rw-r--r--xlators/meta/src/graphs-dir.c67
-rw-r--r--xlators/meta/src/history-file.c44
-rw-r--r--xlators/meta/src/logfile-link.c34
-rw-r--r--xlators/meta/src/logging-dir.c44
-rw-r--r--xlators/meta/src/loglevel-file.c50
-rw-r--r--xlators/meta/src/mallinfo-file.c36
-rw-r--r--xlators/meta/src/measure-file.c49
-rw-r--r--xlators/meta/src/meminfo-file.c44
-rw-r--r--xlators/meta/src/meta-defaults.c655
-rw-r--r--xlators/meta/src/meta-helpers.c332
-rw-r--r--xlators/meta/src/meta-hooks.h48
-rw-r--r--xlators/meta/src/meta-mem-types.h36
-rw-r--r--xlators/meta/src/meta.c1355
-rw-r--r--xlators/meta/src/meta.h162
-rw-r--r--xlators/meta/src/misc.c67
-rw-r--r--xlators/meta/src/misc.h31
-rw-r--r--xlators/meta/src/name-file.c44
-rw-r--r--xlators/meta/src/option-file.c45
-rw-r--r--xlators/meta/src/options-dir.c65
-rw-r--r--xlators/meta/src/private-file.c44
-rw-r--r--xlators/meta/src/process_uuid-file.c37
-rw-r--r--xlators/meta/src/profile-file.c44
-rw-r--r--xlators/meta/src/root-dir.c77
-rw-r--r--xlators/meta/src/subvolume-link.c56
-rw-r--r--xlators/meta/src/subvolumes-dir.c62
-rw-r--r--xlators/meta/src/top-link.c40
-rw-r--r--xlators/meta/src/tree.c179
-rw-r--r--xlators/meta/src/tree.h35
-rw-r--r--xlators/meta/src/type-file.c44
-rw-r--r--xlators/meta/src/version-file.c37
-rw-r--r--xlators/meta/src/view-dir.c33
-rw-r--r--xlators/meta/src/view.c258
-rw-r--r--xlators/meta/src/view.h32
-rw-r--r--xlators/meta/src/volfile-file.c79
-rw-r--r--xlators/meta/src/xlator-dir.c97
-rw-r--r--xlators/mgmt/glusterd/src/Makefile.am86
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitd-svc.c206
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitd-svc.h40
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitrot.c822
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-brick-ops.c2796
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-helper.c21
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-helper.h21
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-mgmt.c191
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-mgmt.h53
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-errno.h33
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-ganesha.c927
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.c6782
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.h52
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.c235
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.h51
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c478
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.h47
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handler.c9220
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handshake.c2769
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.c641
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.h88
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.c870
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.h57
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-log-ops.c290
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mem-types.h104
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-messages.h451
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c1144
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.c3114
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.h97
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mountbroker.c721
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mountbroker.h37
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-nfs-svc.c228
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-nfs-svc.h27
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.c14467
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.h408
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-peer-utils.c1058
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-peer-utils.h82
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.c903
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.h83
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c152
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-proc-mgmt.h44
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quota.c2259
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quota.h17
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quotad-svc.c217
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quotad-svc.h31
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rcu.h36
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rebalance.c2049
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-replace-brick.c716
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-reset-brick.c376
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rpc-ops.c3828
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-scrub-svc.c207
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-scrub-svc.h45
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-server-quorum.c486
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-server-quorum.h46
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.c153
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.h42
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc.c796
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc.h45
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.c2185
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.h270
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.c75
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.h32
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc.c478
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc.h42
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c4290
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot-utils.h169
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot.c10087
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-statedump.c243
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-statedump.h18
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.c6264
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.h263
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-helper.c1047
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-helper.h72
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c536
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-mgmt.h112
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.c2043
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.h93
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-tierd-svc-helper.c207
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.c16622
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.h896
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.c7998
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.h351
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-ops.c3033
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c3146
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.c2706
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.h1484
-rw-r--r--xlators/mount/fuse/src/Makefile.am28
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.c9058
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.h714
-rw-r--r--xlators/mount/fuse/src/fuse-helpers.c874
-rw-r--r--xlators/mount/fuse/src/fuse-mem-types.h42
-rw-r--r--xlators/mount/fuse/src/fuse-resolve.c1133
-rw-r--r--xlators/mount/fuse/utils/Makefile.am9
-rwxr-xr-xxlators/mount/fuse/utils/mount.glusterfs.in895
-rwxr-xr-xxlators/mount/fuse/utils/mount_glusterfs.in617
-rw-r--r--xlators/nfs/lib/src/auth-null.c72
-rw-r--r--xlators/nfs/lib/src/auth-unix.c97
-rw-r--r--xlators/nfs/lib/src/msg-nfs3.c554
-rw-r--r--xlators/nfs/lib/src/msg-nfs3.h186
-rw-r--r--xlators/nfs/lib/src/rpc-socket.c361
-rw-r--r--xlators/nfs/lib/src/rpc-socket.h65
-rw-r--r--xlators/nfs/lib/src/rpcsvc-auth.c400
-rw-r--r--xlators/nfs/lib/src/rpcsvc.c2943
-rw-r--r--xlators/nfs/lib/src/rpcsvc.h731
-rw-r--r--xlators/nfs/lib/src/xdr-common.h48
-rw-r--r--xlators/nfs/lib/src/xdr-nfs3.c1897
-rw-r--r--xlators/nfs/lib/src/xdr-nfs3.h1206
-rw-r--r--xlators/nfs/lib/src/xdr-rpc.c229
-rw-r--r--xlators/nfs/lib/src/xdr-rpc.h82
-rw-r--r--xlators/nfs/server/src/Makefile.am41
-rw-r--r--xlators/nfs/server/src/acl3.c933
-rw-r--r--xlators/nfs/server/src/acl3.h42
-rw-r--r--xlators/nfs/server/src/auth-cache.c496
-rw-r--r--xlators/nfs/server/src/auth-cache.h52
-rw-r--r--xlators/nfs/server/src/exports.c1484
-rw-r--r--xlators/nfs/server/src/exports.h93
-rw-r--r--xlators/nfs/server/src/mount3-auth.c642
-rw-r--r--xlators/nfs/server/src/mount3-auth.h59
-rw-r--r--xlators/nfs/server/src/mount3.c5122
-rw-r--r--xlators/nfs/server/src/mount3.h213
-rw-r--r--xlators/nfs/server/src/mount3udp_svc.c238
-rw-r--r--xlators/nfs/server/src/netgroups.c1161
-rw-r--r--xlators/nfs/server/src/netgroups.h53
-rw-r--r--xlators/nfs/server/src/nfs-common.c635
-rw-r--r--xlators/nfs/server/src/nfs-common.h77
-rw-r--r--xlators/nfs/server/src/nfs-fops.c2216
-rw-r--r--xlators/nfs/server/src/nfs-fops.h319
-rw-r--r--xlators/nfs/server/src/nfs-generics.c354
-rw-r--r--xlators/nfs/server/src/nfs-generics.h169
-rw-r--r--xlators/nfs/server/src/nfs-inodes.c788
-rw-r--r--xlators/nfs/server/src/nfs-inodes.h82
-rw-r--r--xlators/nfs/server/src/nfs-mem-types.h71
-rw-r--r--xlators/nfs/server/src/nfs-messages.h102
-rw-r--r--xlators/nfs/server/src/nfs.c2700
-rw-r--r--xlators/nfs/server/src/nfs.h171
-rw-r--r--xlators/nfs/server/src/nfs3-fh.c349
-rw-r--r--xlators/nfs/server/src/nfs3-fh.h137
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.c5416
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.h301
-rw-r--r--xlators/nfs/server/src/nfs3.c8897
-rw-r--r--xlators/nfs/server/src/nfs3.h388
-rw-r--r--xlators/nfs/server/src/nfsserver.sym12
-rw-r--r--xlators/nfs/server/src/nlm4.c2786
-rw-r--r--xlators/nfs/server/src/nlm4.h111
-rw-r--r--xlators/nfs/server/src/nlmcbk_svc.c134
-rw-r--r--xlators/performance/Makefile.am3
-rw-r--r--xlators/performance/io-cache/src/Makefile.am11
-rw-r--r--xlators/performance/io-cache/src/io-cache-messages.h69
-rw-r--r--xlators/performance/io-cache/src/io-cache.c3074
-rw-r--r--xlators/performance/io-cache/src/io-cache.h425
-rw-r--r--xlators/performance/io-cache/src/ioc-inode.c305
-rw-r--r--xlators/performance/io-cache/src/ioc-mem-types.h44
-rw-r--r--xlators/performance/io-cache/src/page.c1452
-rw-r--r--xlators/performance/io-threads/src/Makefile.am10
-rw-r--r--xlators/performance/io-threads/src/io-threads-messages.h41
-rw-r--r--xlators/performance/io-threads/src/io-threads.c3114
-rw-r--r--xlators/performance/io-threads/src/io-threads.h122
-rw-r--r--xlators/performance/io-threads/src/iot-mem-types.h30
-rw-r--r--xlators/performance/md-cache/Makefile.am1
-rw-r--r--xlators/performance/md-cache/src/Makefile.am29
-rw-r--r--xlators/performance/md-cache/src/md-cache-mem-types.h23
-rw-r--r--xlators/performance/md-cache/src/md-cache-messages.h29
-rw-r--r--xlators/performance/md-cache/src/md-cache.c4020
-rw-r--r--xlators/performance/nl-cache/Makefile.am3
-rw-r--r--xlators/performance/nl-cache/src/Makefile.am12
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-helper.c1201
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-mem-types.h27
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-messages.h29
-rw-r--r--xlators/performance/nl-cache/src/nl-cache.c840
-rw-r--r--xlators/performance/nl-cache/src/nl-cache.h175
-rw-r--r--xlators/performance/open-behind/Makefile.am1
-rw-r--r--xlators/performance/open-behind/src/Makefile.am16
-rw-r--r--xlators/performance/open-behind/src/open-behind-mem-types.h22
-rw-r--r--xlators/performance/open-behind/src/open-behind-messages.h32
-rw-r--r--xlators/performance/open-behind/src/open-behind.c1101
-rw-r--r--xlators/performance/quick-read/src/Makefile.am10
-rw-r--r--xlators/performance/quick-read/src/quick-read-mem-types.h34
-rw-r--r--xlators/performance/quick-read/src/quick-read-messages.h31
-rw-r--r--xlators/performance/quick-read/src/quick-read.c4511
-rw-r--r--xlators/performance/quick-read/src/quick-read.h135
-rw-r--r--xlators/performance/read-ahead/src/Makefile.am10
-rw-r--r--xlators/performance/read-ahead/src/page.c844
-rw-r--r--xlators/performance/read-ahead/src/read-ahead-mem-types.h37
-rw-r--r--xlators/performance/read-ahead/src/read-ahead-messages.h31
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.c1796
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.h200
-rw-r--r--xlators/performance/readdir-ahead/Makefile.am3
-rw-r--r--xlators/performance/readdir-ahead/src/Makefile.am18
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h24
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-messages.h30
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.c1382
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.h98
-rw-r--r--xlators/performance/stat-prefetch/src/Makefile.am14
-rw-r--r--xlators/performance/stat-prefetch/src/stat-prefetch-mem-types.h36
-rw-r--r--xlators/performance/stat-prefetch/src/stat-prefetch.c4135
-rw-r--r--xlators/performance/stat-prefetch/src/stat-prefetch.h106
-rw-r--r--xlators/performance/symlink-cache/Makefile.am3
-rw-r--r--xlators/performance/symlink-cache/src/Makefile.am12
-rw-r--r--xlators/performance/symlink-cache/src/symlink-cache.c407
-rw-r--r--xlators/performance/write-behind/src/Makefile.am10
-rw-r--r--xlators/performance/write-behind/src/write-behind-mem-types.h37
-rw-r--r--xlators/performance/write-behind/src/write-behind-messages.h31
-rw-r--r--xlators/performance/write-behind/src/write-behind.c5245
-rw-r--r--xlators/playground/Makefile.am2
-rw-r--r--xlators/playground/rot-13/Makefile.am (renamed from xlators/cluster/map/Makefile.am)0
-rw-r--r--xlators/playground/rot-13/src/Makefile.am (renamed from xlators/encryption/rot-13/src/Makefile.am)8
-rw-r--r--xlators/playground/rot-13/src/rot-13.c166
-rw-r--r--xlators/playground/rot-13/src/rot-13.h18
-rw-r--r--xlators/playground/template/Makefile.am2
-rw-r--r--xlators/playground/template/src/Makefile.am17
-rw-r--r--xlators/playground/template/src/template.c186
-rw-r--r--xlators/playground/template/src/template.h43
-rw-r--r--xlators/protocol/auth/addr/src/Makefile.am11
-rw-r--r--xlators/protocol/auth/addr/src/addr.c500
-rw-r--r--xlators/protocol/auth/login/src/Makefile.am10
-rw-r--r--xlators/protocol/auth/login/src/login.c302
-rw-r--r--xlators/protocol/client/src/Makefile.am18
-rw-r--r--xlators/protocol/client/src/client-callback.c335
-rw-r--r--xlators/protocol/client/src/client-common.c3589
-rw-r--r--xlators/protocol/client/src/client-common.h630
-rw-r--r--xlators/protocol/client/src/client-handshake.c2378
-rw-r--r--xlators/protocol/client/src/client-helpers.c964
-rw-r--r--xlators/protocol/client/src/client-lk.c1134
-rw-r--r--xlators/protocol/client/src/client-mem-types.h38
-rw-r--r--xlators/protocol/client/src/client-messages.h174
-rw-r--r--xlators/protocol/client/src/client-rpc-fops.c6079
-rw-r--r--xlators/protocol/client/src/client-rpc-fops_v2.c6177
-rw-r--r--xlators/protocol/client/src/client.c4502
-rw-r--r--xlators/protocol/client/src/client.h512
-rw-r--r--xlators/protocol/client/src/client3_1-fops.c5545
-rw-r--r--xlators/protocol/legacy/Makefile.am3
-rw-r--r--xlators/protocol/legacy/client/Makefile.am3
-rw-r--r--xlators/protocol/legacy/client/src/Makefile.am21
-rw-r--r--xlators/protocol/legacy/client/src/client-mem-types.h43
-rw-r--r--xlators/protocol/legacy/client/src/client-protocol.c6683
-rw-r--r--xlators/protocol/legacy/client/src/client-protocol.h178
-rw-r--r--xlators/protocol/legacy/client/src/saved-frames.c196
-rw-r--r--xlators/protocol/legacy/client/src/saved-frames.h79
-rw-r--r--xlators/protocol/legacy/lib/Makefile.am3
-rw-r--r--xlators/protocol/legacy/lib/src/Makefile.am14
-rw-r--r--xlators/protocol/legacy/lib/src/protocol.c108
-rw-r--r--xlators/protocol/legacy/lib/src/protocol.h1118
-rw-r--r--xlators/protocol/legacy/lib/src/transport.c422
-rw-r--r--xlators/protocol/legacy/lib/src/transport.h106
-rw-r--r--xlators/protocol/legacy/server/Makefile.am3
-rw-r--r--xlators/protocol/legacy/server/src/Makefile.am27
-rw-r--r--xlators/protocol/legacy/server/src/authenticate.c249
-rw-r--r--xlators/protocol/legacy/server/src/authenticate.h60
-rw-r--r--xlators/protocol/legacy/server/src/server-helpers.c622
-rw-r--r--xlators/protocol/legacy/server/src/server-helpers.h48
-rw-r--r--xlators/protocol/legacy/server/src/server-mem-types.h39
-rw-r--r--xlators/protocol/legacy/server/src/server-protocol.c6587
-rw-r--r--xlators/protocol/legacy/server/src/server-protocol.h191
-rw-r--r--xlators/protocol/legacy/server/src/server-resolve.c658
-rw-r--r--xlators/protocol/legacy/transport/Makefile.am3
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/Makefile.am19
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/ib-verbs-mem-types.h39
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/ib-verbs.c2625
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/ib-verbs.h220
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/name.c712
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/name.h47
-rw-r--r--xlators/protocol/legacy/transport/socket/Makefile.am1
-rw-r--r--xlators/protocol/legacy/transport/socket/src/Makefile.am19
-rw-r--r--xlators/protocol/legacy/transport/socket/src/name.c740
-rw-r--r--xlators/protocol/legacy/transport/socket/src/name.h44
-rw-r--r--xlators/protocol/legacy/transport/socket/src/socket-mem-types.h36
-rw-r--r--xlators/protocol/legacy/transport/socket/src/socket.c1625
-rw-r--r--xlators/protocol/legacy/transport/socket/src/socket.h129
-rw-r--r--xlators/protocol/server/src/Makefile.am31
-rw-r--r--xlators/protocol/server/src/authenticate.c377
-rw-r--r--xlators/protocol/server/src/authenticate.h58
-rw-r--r--xlators/protocol/server/src/server-common.c842
-rw-r--r--xlators/protocol/server/src/server-common.h199
-rw-r--r--xlators/protocol/server/src/server-handshake.c1323
-rw-r--r--xlators/protocol/server/src/server-helpers.c2478
-rw-r--r--xlators/protocol/server/src/server-helpers.h131
-rw-r--r--xlators/protocol/server/src/server-mem-types.h41
-rw-r--r--xlators/protocol/server/src/server-messages.h168
-rw-r--r--xlators/protocol/server/src/server-resolve.c1023
-rw-r--r--xlators/protocol/server/src/server-rpc-fops.c6084
-rw-r--r--xlators/protocol/server/src/server-rpc-fops_v2.c6031
-rw-r--r--xlators/protocol/server/src/server.c2352
-rw-r--r--xlators/protocol/server/src/server.h329
-rw-r--r--xlators/protocol/server/src/server3_1-fops.c5196
-rw-r--r--xlators/storage/Makefile.am2
-rw-r--r--xlators/storage/bdb/Makefile.am3
-rw-r--r--xlators/storage/bdb/src/Makefile.am18
-rw-r--r--xlators/storage/bdb/src/bctx.c341
-rw-r--r--xlators/storage/bdb/src/bdb-ll.c1464
-rw-r--r--xlators/storage/bdb/src/bdb-mem-types.h42
-rw-r--r--xlators/storage/bdb/src/bdb.c3603
-rw-r--r--xlators/storage/bdb/src/bdb.h530
-rw-r--r--xlators/storage/posix/src/Makefile.am25
-rw-r--r--xlators/storage/posix/src/posix-aio.c556
-rw-r--r--xlators/storage/posix/src/posix-aio.h34
-rw-r--r--xlators/storage/posix/src/posix-common.c1524
-rw-r--r--xlators/storage/posix/src/posix-entry-ops.c2496
-rw-r--r--xlators/storage/posix/src/posix-gfid-path.c243
-rw-r--r--xlators/storage/posix/src/posix-gfid-path.h28
-rw-r--r--xlators/storage/posix/src/posix-handle.c1020
-rw-r--r--xlators/storage/posix/src/posix-handle.h221
-rw-r--r--xlators/storage/posix/src/posix-helpers.c3666
-rw-r--r--xlators/storage/posix/src/posix-inode-fd-ops.c6004
-rw-r--r--xlators/storage/posix/src/posix-inode-handle.h118
-rw-r--r--xlators/storage/posix/src/posix-mem-types.h39
-rw-r--r--xlators/storage/posix/src/posix-messages.h74
-rw-r--r--xlators/storage/posix/src/posix-metadata-disk.h31
-rw-r--r--xlators/storage/posix/src/posix-metadata.c916
-rw-r--r--xlators/storage/posix/src/posix-metadata.h71
-rw-r--r--xlators/storage/posix/src/posix.c5013
-rw-r--r--xlators/storage/posix/src/posix.h702
-rw-r--r--xlators/system/posix-acl/src/Makefile.am17
-rw-r--r--xlators/system/posix-acl/src/posix-acl-mem-types.h23
-rw-r--r--xlators/system/posix-acl/src/posix-acl-messages.h28
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.c253
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.h55
-rw-r--r--xlators/system/posix-acl/src/posix-acl.c3092
-rw-r--r--xlators/system/posix-acl/src/posix-acl.h103
-rw-r--r--xlators/xlator.sym1
787 files changed, 419613 insertions, 196569 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am
index b1643d26ca0..ef20cbb64fa 100644
--- a/xlators/Makefile.am
+++ b/xlators/Makefile.am
@@ -1,3 +1,13 @@
-SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt system
+if BUILD_GNFS
+ GNFS_DIR = nfs
+endif
+
+DIST_SUBDIRS = cluster storage protocol performance debug features \
+ mount nfs mgmt system playground meta
+
+SUBDIRS = cluster storage protocol performance debug features \
+ mount ${GNFS_DIR} mgmt system playground meta
+
+EXTRA_DIST = xlator.sym
 CLEANFILES =
diff --git a/xlators/bindings/Makefile.am b/xlators/bindings/Makefile.am
deleted file mode 100644
index f7766580257..00000000000
--- a/xlators/bindings/Makefile.am
+++ /dev/null
@@ -1 +0,0 @@
-SUBDIRS = $(BINDINGS_SUBDIRS)
diff --git a/xlators/bindings/python/src/Makefile.am b/xlators/bindings/python/src/Makefile.am
deleted file mode 100644
index c0b9141c667..00000000000
--- a/xlators/bindings/python/src/Makefile.am
+++ /dev/null
@@ -1,19 +0,0 @@
-
-xlator_PROGRAMS = python.so
-
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/bindings
-
-python_PYTHON = gluster.py glustertypes.py glusterstack.py
-
-pythondir = $(xlatordir)/python
-
-python_so_SOURCES = python.c
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \
- $(PYTHON_CPPLAGS) -DGLUSTER_PYTHON_PATH=\"$(pythondir)\"
-
-AM_LDFLAGS = $(PYTHON_LDFLAGS)
-
-CLEANFILES =
-
diff --git a/xlators/bindings/python/src/gluster.py b/xlators/bindings/python/src/gluster.py
deleted file mode 100644
index ee0eb131011..00000000000
--- a/xlators/bindings/python/src/gluster.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-# This file is part of GlusterFS.
-#
-# GlusterFS is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published
-# by the Free Software Foundation; either version 3 of the License,
-# or (at your option) any later version.
-#
-# GlusterFS is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see
-# <http://www.gnu.org/licenses/>.
-from ctypes import *
-from glustertypes import *
-from glusterstack import *
-import sys
-import inspect
-
-libglusterfs = CDLL("libglusterfs.so")
-_gf_log = libglusterfs._gf_log
-_gf_log.restype = c_int32
-_gf_log.argtypes = [c_char_p, c_char_p, c_char_p, c_int32, c_int, c_char_p]
-
-gf_log_loglevel = c_int.in_dll(libglusterfs, "gf_log_loglevel")
-
-GF_LOG_NONE = 0
-GF_LOG_CRITICAL = 1
-GF_LOG_ERROR = 2
-GF_LOG_WARNING = 3
-GF_LOG_DEBUG = 4
-
-def gf_log(module, level, fmt, *params):
- if level <= gf_log_loglevel:
- frame = sys._getframe(1)
- _gf_log(module, frame.f_code.co_filename, frame.f_code.co_name,
- frame.f_lineno, level, fmt, *params)
-
-class ComplexTranslator(object):
- def __init__(self, xlator):
- self.xlator = xlator_t.from_address(xlator)
-
- def __getattr__(self, item):
- return getattr(self.xlator, item)
diff --git a/xlators/bindings/python/src/glusterstack.py b/xlators/bindings/python/src/glusterstack.py
deleted file mode 100644
index ba24c81652e..00000000000
--- a/xlators/bindings/python/src/glusterstack.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-# This file is part of GlusterFS.
-#
-# GlusterFS is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published
-# by the Free Software Foundation; either version 3 of the License,
-# or (at your option) any later version.
-#
-# GlusterFS is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see
-# <http://www.gnu.org/licenses/>.
-from ctypes import *
-from glustertypes import *
-
-libc = CDLL("libc.so.6")
-calloc = libc.calloc
-calloc.argtypes = [c_int, c_int]
-calloc.restype = c_void_p
-
-# TODO: Can these be done in C somehow?
-def stack_wind(frame, rfn, obj, fn, *params):
- """Frame is a frame object"""
- _new = cast(calloc(1, sizeof(call_frame_t)), POINTER(call_frame_t))
- _new[0].root = frame.root
- _new[0].next = frame.root[0].frames.next
- _new[0].prev = pointer(frame.root[0].frames)
- if frame.root[0].frames.next:
- frame.root[0].frames.next[0].prev = _new
- frame.root[0].frames.next = _new
- _new[0].this = obj
- # TODO: Type checking like tmp_cbk?
- _new[0].ret = rfn
- _new[0].parent = pointer(frame)
- _new[0].cookie = cast(_new, c_void_p)
- # TODO: Initialize lock
- #_new.lock.init()
- frame.ref_count += 1
- fn(_new, obj, *params)
-
-def stack_unwind(frame, *params):
- """Frame is a frame object"""
- fn = frame[0].ret
- parent = frame[0].parent[0]
- parent.ref_count -= 1
-
- op_ret = params[0]
- op_err = params[1]
- params = params[2:]
- fn(parent, call_frame_t.from_address(frame[0].cookie), parent.this,
- op_ret, op_err, *params)
diff --git a/xlators/bindings/python/src/glustertypes.py b/xlators/bindings/python/src/glustertypes.py
deleted file mode 100644
index e9069d07c72..00000000000
--- a/xlators/bindings/python/src/glustertypes.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-# This file is part of GlusterFS.
-#
-# GlusterFS is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published
-# by the Free Software Foundation; either version 3 of the License,
-# or (at your option) any later version.
-#
-# GlusterFS is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see
-# <http://www.gnu.org/licenses/>.
-from ctypes import *
-import collections
-
-#
-# Forward declaration of some gluster types
-#
-class call_frame_t(Structure):
- pass
-
-class call_ctx_t(Structure):
- pass
-
-class call_pool_t(Structure):
- pass
-
-class xlator_t(Structure):
- def _getFirstChild(self):
- return self.children[0].xlator
- firstChild = property(_getFirstChild)
-
-class xlator_list_t(Structure):
- pass
-
-class xlator_fops(Structure):
- pass
-
-class xlator_mops(Structure):
- pass
-
-class glusterfs_ctx_t(Structure):
- pass
-
-class list_head(Structure):
- pass
-
-class dict_t(Structure):
- pass
-
-class inode_table_t(Structure):
- pass
-
-class fd_t(Structure):
- pass
-
-class iovec(Structure):
- _fields_ = [
- ("iov_base", c_void_p),
- ("iov_len", c_size_t),
- ]
-
- def __init__(self, s):
- self.iov_base = cast(c_char_p(s), c_void_p)
- self.iov_len = len(s)
-
- def getBytes(self):
- return string_at(self.iov_base, self.iov_len)
-
-# This is a pthread_spinlock_t
-# TODO: what happens to volatile-ness?
-gf_lock_t = c_int
-
-uid_t = c_uint32
-gid_t = c_uint32
-pid_t = c_int32
-
-off_t = c_int64
-
-#
-# Function pointer types
-#
-ret_fn_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(call_frame_t),
- POINTER(xlator_t), c_int32, c_int32)
-
-fini_fn_t = CFUNCTYPE(None, POINTER(xlator_t))
-init_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t))
-event_notify_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t), c_int32, c_void_p)
-
-list_head._fields_ = [
- ("next", POINTER(list_head)),
- ("prev", POINTER(list_head)),
- ]
-
-call_frame_t._fields_ = [
- ("root", POINTER(call_ctx_t)),
- ("parent", POINTER(call_frame_t)),
- ("next", POINTER(call_frame_t)),
- ("prev", POINTER(call_frame_t)),
- ("local", c_void_p),
- ("this", POINTER(xlator_t)),
- ("ret", ret_fn_t),
- ("ref_count", c_int32),
- ("lock", gf_lock_t),
- ("cookie", c_void_p),
- ("op", c_int32),
- ("type", c_int8),
- ]
-
-call_ctx_t._fields_ = [
- ("all_frames", list_head),
- ("trans", c_void_p),
- ("pool", call_pool_t),
- ("unique", c_uint64),
- ("state", c_void_p),
- ("uid", uid_t),
- ("gid", gid_t),
- ("pid", pid_t),
- ("frames", call_frame_t),
- ("req_refs", POINTER(dict_t)),
- ("rsp_refs", POINTER(dict_t)),
- ]
-
-xlator_t._fields_ = [
- ("name", c_char_p),
- ("type", c_char_p),
- ("next", POINTER(xlator_t)),
- ("prev", POINTER(xlator_t)),
- ("parent", POINTER(xlator_t)),
- ("children", POINTER(xlator_list_t)),
- ("fops", POINTER(xlator_fops)),
- ("mops", POINTER(xlator_mops)),
- ("fini", fini_fn_t),
- ("init", init_fn_t),
- ("notify", event_notify_fn_t),
- ("options", POINTER(dict_t)),
- ("ctx", POINTER(glusterfs_ctx_t)),
- ("itable", POINTER(inode_table_t)),
- ("ready", c_char),
- ("private", c_void_p),
- ]
-
-xlator_list_t._fields_ = [
- ("xlator", POINTER(xlator_t)),
- ("next", POINTER(xlator_list_t)),
- ]
-
-fop_functions = collections.defaultdict(lambda: c_void_p)
-fop_function_names = ['lookup', 'forget', 'stat', 'fstat', 'chmod', 'fchmod',
- 'chown', 'fchown', 'truncate', 'ftruncate', 'utimens', 'access',
- 'readlink', 'mknod', 'mkdir', 'unlink', 'rmdir', 'symlink',
- 'rename', 'link', 'create', 'open', 'readv', 'writev', 'flush',
- 'close', 'fsync', 'opendir', 'readdir', 'closedir', 'fsyncdir',
- 'statfs', 'setxattr', 'getxattr', 'removexattr', 'lk', 'writedir',
- # TODO: Call backs?
- ]
-
-fop_writev_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(xlator_t),
- POINTER(fd_t), POINTER(iovec), c_int32,
- off_t)
-
-fop_functions['writev'] = fop_writev_t
-xlator_fops._fields_ = [(f, fop_functions[f]) for f in fop_function_names]
diff --git a/xlators/bindings/python/src/python.c b/xlators/bindings/python/src/python.c
deleted file mode 100644
index 3310a211513..00000000000
--- a/xlators/bindings/python/src/python.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- Copyright (c) 2007-2010 Chris AtLee <chris@atlee.ca>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <Python.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-#include "defaults.h"
-
-typedef struct
-{
- char *scriptname;
- PyObject *pXlator;
- PyObject *pScriptModule;
- PyObject *pGlusterModule;
- PyThreadState *pInterp;
-
- PyObject *pFrameType, *pVectorType, *pFdType;
-} python_private_t;
-
-int32_t
-python_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t offset)
-{
- python_private_t *priv = (python_private_t *)this->private;
- gf_log("python", GF_LOG_DEBUG, "In writev");
- if (PyObject_HasAttrString(priv->pXlator, "writev"))
- {
-
- PyObject *retval = PyObject_CallMethod(priv->pXlator, "writev",
- "O O O i l",
- PyObject_CallMethod(priv->pFrameType, "from_address", "O&", PyLong_FromVoidPtr, frame),
- PyObject_CallMethod(priv->pFdType, "from_address", "O&", PyLong_FromVoidPtr, fd),
- PyObject_CallMethod(priv->pVectorType, "from_address", "O&", PyLong_FromVoidPtr, vector),
- count,
- offset);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- }
- Py_XDECREF(retval);
- }
- else
- {
- return default_writev(frame, this, fd, vector, count, offset);
- }
- return 0;
-}
-
-struct xlator_fops fops = {
- .writev = python_writev
-};
-
-static PyObject *
-AnonModule_FromFile (const char* fname)
-{
- // Get the builtins
- PyThreadState* pThread = PyThreadState_Get();
- PyObject *pBuiltins = pThread->interp->builtins;
-
- if (PyErr_Occurred())
- {
- PyErr_Print();
- return NULL;
- }
-
- // Create a new dictionary for running code in
- PyObject *pModuleDict = PyDict_New();
- PyDict_SetItemString(pModuleDict, "__builtins__", pBuiltins);
- Py_INCREF(pBuiltins);
-
- // Run the file in the new context
- FILE* fp = fopen(fname, "r");
- PyRun_File(fp, fname, Py_file_input, pModuleDict, pModuleDict);
- fclose(fp);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- return NULL;
- }
-
- // Create an object to hold the new context
- PyRun_String("class ModuleWrapper(object):\n\tpass\n", Py_single_input, pModuleDict, pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- return NULL;
- }
- PyObject *pModule = PyRun_String("ModuleWrapper()", Py_eval_input, pModuleDict, pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- Py_XDECREF(pModule);
- return NULL;
- }
-
- // Set the new context's dictionary to the one we used to run the code
- // inside
- PyObject_SetAttrString(pModule, "__dict__", pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- Py_DECREF(pModule);
- return NULL;
- }
-
- return pModule;
-}
-
-int32_t
-init (xlator_t *this)
-{
- // This is ok to call more than once per process
- Py_InitializeEx(0);
-
- if (!this->children) {
- gf_log ("python", GF_LOG_ERROR,
- "FATAL: python should have exactly one child");
- return -1;
- }
-
- python_private_t *priv = CALLOC (sizeof (python_private_t), 1);
- ERR_ABORT (priv);
-
- data_t *scriptname = dict_get (this->options, "scriptname");
- if (scriptname) {
- priv->scriptname = data_to_str(scriptname);
- } else {
- gf_log("python", GF_LOG_ERROR,
- "FATAL: python requires the scriptname parameter");
- return -1;
- }
-
- priv->pInterp = Py_NewInterpreter();
-
- // Adjust python's path
- PyObject *syspath = PySys_GetObject("path");
- PyObject *path = PyString_FromString(GLUSTER_PYTHON_PATH);
- PyList_Append(syspath, path);
- Py_DECREF(path);
-
- gf_log("python", GF_LOG_DEBUG,
- "Loading gluster module");
-
- priv->pGlusterModule = PyImport_ImportModule("gluster");
- if (PyErr_Occurred())
- {
- PyErr_Print();
- return -1;
- }
-
- priv->pFrameType = PyObject_GetAttrString(priv->pGlusterModule, "call_frame_t");
- priv->pFdType = PyObject_GetAttrString(priv->pGlusterModule, "fd_t");
- priv->pVectorType = PyObject_GetAttrString(priv->pGlusterModule, "iovec");
-
- gf_log("python", GF_LOG_DEBUG, "Loading script...%s", priv->scriptname);
-
- priv->pScriptModule = AnonModule_FromFile(priv->scriptname);
- if (!priv->pScriptModule || PyErr_Occurred())
- {
- gf_log("python", GF_LOG_ERROR, "Error loading %s", priv->scriptname);
- PyErr_Print();
- return -1;
- }
-
- if (!PyObject_HasAttrString(priv->pScriptModule, "xlator"))
- {
- gf_log("python", GF_LOG_ERROR, "%s does not have a xlator attribute", priv->scriptname);
- return -1;
- }
- gf_log("python", GF_LOG_DEBUG, "Instantiating translator");
- priv->pXlator = PyObject_CallMethod(priv->pScriptModule, "xlator", "O&",
- PyLong_FromVoidPtr, this);
- if (PyErr_Occurred() || !priv->pXlator)
- {
- PyErr_Print();
- return -1;
- }
-
- this->private = priv;
-
- gf_log ("python", GF_LOG_DEBUG, "python xlator loaded");
- return 0;
-}
-
-void
-fini (xlator_t *this)
-{
- python_private_t *priv = (python_private_t*)(this->private);
- Py_DECREF(priv->pXlator);
- Py_DECREF(priv->pScriptModule);
- Py_DECREF(priv->pGlusterModule);
- Py_DECREF(priv->pFrameType);
- Py_DECREF(priv->pFdType);
- Py_DECREF(priv->pVectorType);
- Py_EndInterpreter(priv->pInterp);
- return;
-}
diff --git a/xlators/bindings/python/src/testxlator.py b/xlators/bindings/python/src/testxlator.py
deleted file mode 100644
index 507455c856a..00000000000
--- a/xlators/bindings/python/src/testxlator.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-# This file is part of GlusterFS.
-#
-# GlusterFS is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published
-# by the Free Software Foundation; either version 3 of the License,
-# or (at your option) any later version.
-#
-# GlusterFS is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see
-# <http://www.gnu.org/licenses/>.
-
-"""
-This is a test translator written in python.
-
-Important things to note:
- This file must be import-able from glusterfsd. This probably means
- setting PYTHONPATH to where this file is located.
-
- This file must have a top-level xlator class object that will be
- used to instantiate individual translators.
-"""
-from gluster import *
-
-class MyXlator(ComplexTranslator):
- name = "MyXlator"
- def writev_cbk(self, frame, cookie, op_ret, op_errno, buf):
- stack_unwind(frame, op_ret, op_errno, buf)
- return 0
-
- def writev(self, frame, fd, vector, count, offset):
- gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
- # TODO: Use cookie to pass this to writev_cbk
- old_count = vector.iov_len
-
- data = vector.getBytes().encode("zlib")
-
- vector = iovec(data)
- gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
-
- @ret_fn_t
- def rfn(frame, prev, this, op_ret, op_errno, *params):
- if len(params) == 0:
- params = [0]
- return self.writev_cbk(frame, prev, old_count, op_errno, *params)
-
- stack_wind(frame, rfn, self.firstChild,
- self.firstChild[0].fops[0].writev, fd, vector, count, offset)
- return 0
-
-xlator = MyXlator
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
index 0990822a7d3..8e067d5ab58 100644
--- a/xlators/cluster/Makefile.am
+++ b/xlators/cluster/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = stripe afr dht
+SUBDIRS = afr dht ec
CLEANFILES =
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
index e192b599bf4..610819b28fc 100644
--- a/xlators/cluster/afr/src/Makefile.am
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -1,27 +1,35 @@
-xlator_LTLIBRARIES = afr.la pump.la
+xlator_LTLIBRARIES = afr.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c afr-lk-common.c $(top_builddir)/xlators/lib/src/libxlator.c
+afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \
+ afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \
+ afr-read-txn.c \
+ $(top_builddir)/xlators/lib/src/libxlator.c
-afr_la_LDFLAGS = -module -avoidversion
-afr_la_SOURCES = $(afr_common_source) afr.c
+AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \
+ afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \
+ afr-self-heal-name.c
+
+afr_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c
afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-pump_la_LDFLAGS = -module -avoidversion
-pump_la_SOURCES = $(afr_common_source) pump.c
-pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \
+ afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \
+ afr-common.c afr-self-heald.h \
+ $(top_builddir)/xlators/lib/src/libxlator.h afr-messages.h
-noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c $(top_builddir)/xlators/lib/src/libxlator.h
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
- -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/contrib/md5 -shared -nostartfiles $(GF_CFLAGS) \
- -I$(top_srcdir)/xlators/lib/src
+AM_CFLAGS = -Wall $(GF_CFLAGS)
CLEANFILES =
uninstall-local:
rm -f $(DESTDIR)$(xlatordir)/replicate.so
- rm -f $(DESTDIR)$(xlatordir)/pump.so
install-data-hook:
ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index ba451a0e89a..032ab5c8001 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#include <libgen.h>
@@ -24,28 +15,20 @@
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "statedump.h"
-
-#include "fd.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/hashfn.h>
+#include <glusterfs/list.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/events.h>
+#include <glusterfs/upcall-utils.h>
#include "afr-inode-read.h"
#include "afr-inode-write.h"
@@ -53,1380 +36,3085 @@
#include "afr-dir-write.h"
#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "pump.h"
+#include "afr-self-heald.h"
+#include "afr-messages.h"
-#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL
-#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL
-#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL
+int32_t
+afr_quorum_errno(afr_private_t *priv)
+{
+ return ENOTCONN;
+}
-int
-afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this,
- gf_boolean_t fail_conflict);
gf_boolean_t
-afr_is_child_present (int32_t *success_children, int32_t child_count,
- int32_t child)
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid)
{
- gf_boolean_t success_child = _gf_false;
- int i = 0;
+ if (!__is_root_gfid(pargfid)) {
+ return _gf_false;
+ }
- GF_ASSERT (child < child_count);
+ if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) {
+ /*For backward compatibility /.landfill is private*/
+ return _gf_true;
+ }
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- if (child == success_children[i]) {
- success_child = _gf_true;
- break;
- }
+ if (pid == GF_CLIENT_PID_GSYNCD) {
+ /*geo-rep needs to create/sync private directory on slave because
+ * it appears in changelog*/
+ return _gf_false;
+ }
+
+ if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) {
+ if (strcmp(name, priv->anon_inode_name) == 0) {
+ /* anonymous-inode dir is private*/
+ return _gf_true;
}
- return success_child;
+ } else {
+ if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) ==
+ 0) {
+ /* anonymous-inode dir prefix is private for geo-rep to work*/
+ return _gf_true;
+ }
+ }
+
+ return _gf_false;
}
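
A condensed view of the rules implemented above, as a sketch in table form (illustrative only; the symbolic names are the ones used in the function):

    /*
     * entry under the volume root      client                private (hidden)?
     * ------------------------------   -------------------   -----------------
     * GF_REPLICATE_TRASH_DIR           any                   yes
     * priv->anon_inode_name            gsyncd (geo-rep)      no  (must sync it)
     * priv->anon_inode_name            glfs-heal / shd       yes (exact match)
     * AFR_ANON_DIR_PREFIX*             other clients         yes (prefix match)
     * anything else / non-root parent  any                   no
     */
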
-gf_boolean_t
-afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child)
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies)
{
- gf_boolean_t source_xattrs = _gf_false;
-
- GF_ASSERT (child < child_count);
+ int i = 0;
- if ((child >= 0) && (child < child_count) &&
- sources[child]) {
- source_xattrs = _gf_true;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ replies[i] = 1;
+ } else {
+ replies[i] = 0;
}
- return source_xattrs;
+ }
}
-gf_boolean_t
-afr_is_read_child (int32_t *success_children, int32_t *sources,
- int32_t child_count, int32_t child)
-{
- gf_boolean_t success_child = _gf_false;
- gf_boolean_t source = _gf_false;
+int
+afr_fav_child_reset_sink_xattrs(void *opaque);
- GF_ASSERT (success_children);
- GF_ASSERT (child_count > 0);
+int
+afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque);
- success_child = afr_is_child_present (success_children, child_count,
- child);
- if (!success_child)
- goto out;
- if (NULL == sources) {
- source = _gf_true;
- goto out;
- }
- source = afr_is_source_child (sources, child_count, child);
-out:
- return (success_child && source);
+static void
+afr_discover_done(call_frame_t *frame, xlator_t *this);
+
+int
+afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ local->cont.lk.dom_lock_op_ret[i] = op_ret;
+ local->cont.lk.dom_lock_op_errno[i] = op_errno;
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to acquire %s on %s",
+ uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM,
+ priv->children[i]->name);
+ } else {
+ local->cont.lk.dom_locked_nodes[i] = 1;
+ }
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
}
-void
-afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count)
+int
+afr_dom_lock_acquire(call_frame_t *frame)
{
- int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ int i = 0;
+
+ priv = frame->this->private;
+ local = frame->local;
+ local->cont.lk.dom_locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.locked_nodes),
+ gf_afr_mt_char);
+ if (!local->cont.lk.dom_locked_nodes) {
+ return -ENOMEM;
+ }
+ local->cont.lk.dom_lock_op_ret = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_ret) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ local->cont.lk.dom_lock_op_errno = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_errno) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ flock.l_type = F_WRLCK;
+
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLK, &flock, NULL);
+
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL))
+ goto blocking_lock;
+
+ /*If any of the bricks returned EAGAIN, we still need blocking locks.*/
+ if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) !=
+ priv->child_count) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.dom_lock_op_ret[i] == -1 &&
+ local->cont.lk.dom_lock_op_errno[i] == EAGAIN)
+ goto blocking_lock;
+ }
+ }
- for (i = 0; i < child_count; i++)
- dst[i] = src[i];
+ return 0;
+
+blocking_lock:
+ afr_dom_lock_release(frame);
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLKW, &flock, NULL);
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) {
+ afr_dom_lock_release(frame);
+ return -afr_quorum_errno(priv);
+ }
+
+ return 0;
}
-void
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path)
+int
+afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- int i = 0;
- afr_private_t *priv = NULL;
- int ret = 0;
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
- priv = this->private;
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to release %s on %s", local->loc.path,
+ AFR_LK_HEAL_DOM, priv->children[i]->name);
+ }
+ local->cont.lk.dom_locked_nodes[i] = 0;
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req, priv->pending_key[i],
- 3 * sizeof(int32_t));
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- path, priv->pending_key[i]);
- /* 3 = data+metadata+entry */
- }
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
}
void
-afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this,
- dict_t *xattr_req, loc_t *loc, void **gfid_req)
+afr_dom_lock_release(call_frame_t *frame)
{
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = frame->this->private;
+ locked_on = local->cont.lk.dom_locked_nodes;
+ if (AFR_COUNT(locked_on, priv->child_count) == 0)
+ return;
+ flock.l_type = F_UNLCK;
- GF_ASSERT (gfid_req);
- *gfid_req = NULL;
- local->xattr_req = dict_new ();
- if (xattr_req)
- dict_copy (xattr_req, local->xattr_req);
+ AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk,
+ AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL);
- afr_xattr_req_prepare (this, local->xattr_req, loc->path);
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_INODELK_COUNT);
- }
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_ENTRYLK_COUNT);
- }
+ return;
+}
- ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: failed to get the gfid from dict", loc->path);
- *gfid_req = NULL;
- } else {
- if (loc->parent != NULL)
- dict_del (local->xattr_req, "gfid-req");
- }
+static void
+afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info)
+{
+ if (!info)
+ return;
+ if (info->xdata_req)
+ dict_unref(info->xdata_req);
+ if (info->fd)
+ fd_unref(info->fd);
+ GF_FREE(info->locked_nodes);
+ GF_FREE(info->child_up_event_gen);
+ GF_FREE(info->child_down_event_gen);
+ GF_FREE(info);
}
-void
-afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc)
+static int
+afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this)
{
- inode_t *inode = NULL;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -ENOMEM;
+
+ info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t);
+ if (!info) {
+ goto cleanup;
+ }
+ INIT_LIST_HEAD(&info->pos);
+ info->fd = fd_ref(local->fd);
+ info->cmd = local->cont.lk.cmd;
+ info->pid = frame->root->pid;
+ info->flock = local->cont.lk.user_flock;
+ info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL);
+ if (!info->xdata_req) {
+ goto cleanup;
+ }
+ info->lk_owner = frame->root->lk_owner;
+ info->locked_nodes = GF_MALLOC(
+ sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char);
+ if (!info->locked_nodes) {
+ goto cleanup;
+ }
+ memcpy(info->locked_nodes, local->cont.lk.locked_nodes,
+ sizeof(*info->locked_nodes) * priv->child_count);
+ info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen),
+ priv->child_count, gf_afr_mt_int32_t);
+ if (!info->child_up_event_gen) {
+ goto cleanup;
+ }
+ info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!info->child_down_event_gen) {
+ goto cleanup;
+ }
+
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(local->fd, this);
+ if (fd_ctx)
+ fd_ctx->lk_heal_info = info;
+ }
+ UNLOCK(&local->fd->lock);
+ if (!fd_ctx) {
+ goto cleanup;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&info->pos, &priv->saved_locks);
+ }
+ UNLOCK(&priv->lock);
+
+ return 0;
+cleanup:
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to add lock to healq",
+ uuid_utoa(local->fd->inode->gfid));
+ if (info) {
+ afr_lk_heal_info_cleanup(info);
+ if (fd_ctx) {
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx->lk_heal_info = NULL;
+ }
+ UNLOCK(&local->fd->lock);
+ }
+ }
+ return ret;
+}
- GF_ASSERT (loc);
- inode = loc->inode;
- if (inode && !uuid_is_null (inode->gfid)) {
- uuid_copy (dst, inode->gfid);
- } else if (!uuid_is_null (loc->gfid)) {
- uuid_copy (dst, loc->gfid);
- } else {
- GF_ASSERT (new && !uuid_is_null (new));
- uuid_copy (dst, new);
- }
+static int
+afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ struct gf_flock flock = local->cont.lk.user_flock;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -EINVAL;
+
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx || !fd_ctx->lk_heal_info) {
+ goto out;
+ }
+
+ info = fd_ctx->lk_heal_info;
+ if ((info->flock.l_start != flock.l_start) ||
+ (info->flock.l_whence != flock.l_whence) ||
+ (info->flock.l_len != flock.l_len)) {
+ /*TODO: Compare lkowners too.*/
+ goto out;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ UNLOCK(&priv->lock);
+
+ afr_lk_heal_info_cleanup(info);
+ fd_ctx->lk_heal_info = NULL;
+ ret = 0;
+out:
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to remove lock from healq",
+ uuid_utoa(local->fd->inode->gfid));
+ return ret;
}
int
-afr_errno_count (int32_t *children, int *child_errno,
- unsigned int child_count, int32_t op_errno)
+afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
{
- int i = 0;
- int errno_count = 0;
- int child = 0;
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed to heal lock on child %d for %s", i,
+ uuid_utoa(local->fd->inode->gfid));
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
- for (i = 0; i < child_count; i++) {
- if (children) {
- child = children[i];
- if (child == -1)
- break;
- } else {
- child = i;
- }
- if (child_errno[child] == op_errno)
- errno_count++;
- }
- return errno_count;
+int
+afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid));
+ } else {
+ local->cont.lk.getlk_rsp[i] = *lock;
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
}
-int32_t
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid)
+static gf_boolean_t
+afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
{
- int ret = 0;
- uuid_t *pgfid = NULL;
+ int i = 0;
+ afr_local_t *local = frame->local;
+ struct gf_flock flock = {
+ 0,
+ };
+ gf_boolean_t ret = _gf_true;
+ char *wind_on = alloca0(priv->child_count);
+ unsigned char *success_replies = alloca0(priv->child_count);
+ local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp),
+ priv->child_count, gf_afr_mt_gf_lock);
+
+ flock = info->flock;
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ wind_on[i] = 1;
+ }
+
+ AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock,
+ info->xdata_req);
+
+ afr_fill_success_replies(local, priv, success_replies);
+ if (AFR_COUNT(success_replies, priv->child_count) == 0) {
+ ret = _gf_false;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid || local->replies[i].op_ret != 0)
+ continue;
+ if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK)
+ continue;
+ /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/
+ if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner,
+ &info->lk_owner)) {
+ ret = _gf_false;
+ break;
+ }
+ }
+out:
+ afr_local_replies_wipe(local, priv);
+ GF_FREE(local->cont.lk.getlk_rsp);
+ local->cont.lk.getlk_rsp = NULL;
+ return ret;
+}
- GF_ASSERT (gfid);
+static void
+afr_mark_fd_bad(fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
- pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char);
- if (!pgfid) {
- ret = -1;
- gf_log (THIS->name, GF_LOG_ERROR, "Out of memory");
- goto out;
+ if (!fd)
+ return;
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(fd, this);
+ if (fd_ctx) {
+ fd_ctx->is_fd_bad = _gf_true;
+ fd_ctx->lk_heal_info = NULL;
}
- uuid_copy (*pgfid, gfid);
+ }
+ UNLOCK(&fd->lock);
+}
- ret = dict_set_dynptr (dict, "gfid-req", pgfid, 16);
- if (ret) {
- GF_FREE (pgfid);
- gf_log (THIS->name, GF_LOG_DEBUG, "gfid set failed");
- }
+static void
+afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info)
+{
+ LOCK(&priv->lock);
+ {
+ list_del(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+}
-out:
- return ret;
+static void
+afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ int op_errno = 0;
+ int32_t *current_event_gen = NULL;
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ char *wind_on = alloca0(priv->child_count);
+ gf_boolean_t retry = _gf_true;
+
+ frame->root->pid = info->pid;
+ lk_owner_copy(&frame->root->lk_owner, &info->lk_owner);
+
+ op_errno = -afr_dom_lock_acquire(frame);
+ if ((op_errno != 0)) {
+ goto release;
+ }
+
+ if (!afr_does_lk_owner_match(frame, priv, info)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM,
+ "Ignoring lock heal for %s since lk-onwers mismatch. "
+ "Lock possibly pre-empted by another client.",
+ uuid_utoa(info->fd->inode->gfid));
+ goto release;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ continue;
+ wind_on[i] = 1;
+ }
+
+ current_event_gen = alloca(priv->child_count);
+ memcpy(current_event_gen, info->child_up_event_gen,
+ priv->child_count * sizeof *current_event_gen);
+ AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd,
+ &info->flock, info->xdata_req);
+
+ LOCK(&priv->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) {
+ continue;
+ }
+
+ if ((current_event_gen[i] == info->child_up_event_gen[i]) &&
+ (current_event_gen[i] > info->child_down_event_gen[i])) {
+ info->locked_nodes[i] = 1;
+ retry = _gf_false;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->saved_locks);
+ } else {
+ /*We received subsequent child up/down events while heal was in
+ * progress; don't mark child as healed. Attempt again on the
+ * new child up*/
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM,
+ "Event gen mismatch: skipped healing lock on child %d "
+ "for %s.",
+ i, uuid_utoa(info->fd->inode->gfid));
+ }
+ }
+ }
+ UNLOCK(&priv->lock);
+
+release:
+ afr_dom_lock_release(frame);
+ if (retry)
+ afr_add_lock_to_lkhealq(priv, info);
+ return;
}
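
The event-generation check above is the heart of the heal: a re-acquired lock is only trusted if no further child-up event arrived for that brick while the heal was in flight, and the recorded child-up generation is newer than the last child-down. A minimal standalone sketch of that predicate (names and types are illustrative, not the actual helpers in this file):

    #include <stdbool.h>
    #include <stdint.h>

    /* up_gen_snapshot: info->child_up_event_gen[i] captured before winding
     * the lk call; up_gen_now / down_gen_now: the generations afterwards. */
    static bool
    lock_heal_still_valid(int32_t up_gen_snapshot, int32_t up_gen_now,
                          int32_t down_gen_now)
    {
        return (up_gen_now == up_gen_snapshot) && /* no new child-up meanwhile */
               (up_gen_snapshot > down_gen_now);  /* up is newer than last down */
    }
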
-uint64_t
-afr_is_split_brain (xlator_t *this, inode_t *inode)
+static int
+afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque)
{
- int ret = 0;
+ STACK_DESTROY(frame->root);
+ return 0;
+}
- uint64_t ctx = 0;
- uint64_t split_brain = 0;
+static int
+afr_lock_heal(void *opaque)
+{
+ call_frame_t *frame = (call_frame_t *)opaque;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+ struct list_head healq = {
+ 0,
+ };
+ int ret = 0;
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame) {
+ return ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&healq);
+ LOCK(&priv->lock);
+ {
+ list_splice_init(&priv->lk_healq, &healq);
+ }
+ UNLOCK(&priv->lock);
+
+ list_for_each_entry_safe(info, tmp, &healq, pos)
+ {
+ GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) <
+ priv->child_count));
+ ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd);
+ afr_lock_heal_do(iter_frame, priv, info);
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = ENOTCONN;
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN,
+ AFR_MSG_LK_HEAL_DOM,
+ "Aborting processing of lk_healq."
+ "Healing will be reattempted on next child up for locks "
+ "that are still in quorum.");
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&healq, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+ break;
+ }
+ }
+
+ AFR_STACK_DESTROY(iter_frame);
+ return ret;
+}
+
+static int
+__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
- VALIDATE_OR_GOTO (inode, out);
+ if (priv->shd.iamshd)
+ return 0;
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_up_event_gen[child] = priv->event_generation;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
- if (ret < 0)
- goto unlock;
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ return -1;
- split_brain = ctx & AFR_ICTX_SPLIT_BRAIN_MASK;
- }
-unlock:
- UNLOCK (&inode->lock);
+ ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame,
+ frame);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM,
+ "Failed to launch lock heal synctask");
-out:
- return split_brain;
+ return ret;
}
-
-void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set)
+static int
+__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child)
{
- uint64_t ctx = 0;
- int ret = 0;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
- VALIDATE_OR_GOTO (inode, out);
+ if (priv->shd.iamshd)
+ return 0;
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_down_event_gen[child] = priv->event_generation;
+ if (info->locked_nodes[child] == 1)
+ info->locked_nodes[child] = 0;
+ if (!afr_has_quorum(info->locked_nodes, this, NULL)) {
+ /* Since the lock was lost on quorum no. of nodes, we should
+ * not attempt to heal it anymore. Some other client could have
+ * acquired the lock, modified data and released it and this
+ * client wouldn't know about it if we heal it.*/
+ afr_mark_fd_bad(info->fd, this);
+ list_del(&info->pos);
+ afr_lk_heal_info_cleanup(info);
+ /* We're not winding an unlock on the node where the lock is still
+ * present because when fencing logic switches over to the new
+ * client (since we marked the fd bad), it should preempt any
+ * existing lock. */
+ }
+ }
+ return 0;
+}
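
In short, the child-down handler above applies the following rule (restated here for clarity):

    /* lock still held on a quorum of bricks -> keep it queued for healing
     * lock lost on a quorum of bricks       -> mark the fd bad and forget
     *     the lock: another client may have acquired it and modified data,
     *     so silently re-acquiring it later would be unsafe. */
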
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+gf_boolean_t
+afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
+ int32_t *op_errno)
+{
+ if (priv->consistent_io && local->call_count != priv->child_count) {
+ gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN,
+ "All subvolumes are not up");
+ if (op_errno)
+ *op_errno = ENOTCONN;
+ return _gf_false;
+ }
+ return _gf_true;
+}
- if (ret < 0) {
- ctx = 0;
- }
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata)
+{
+ int ret = 0;
+ uint32_t lk_mode = GF_LK_ADVISORY;
- if (set) {
- ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx)
- | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK);
- } else {
- ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx);
- }
+ ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode);
+ if (!ret && lk_mode == GF_LK_MANDATORY)
+ return _gf_true;
- ret = __inode_ctx_put (inode, this, ctx);
- if (ret) {
- gf_log_callingfn (this->name, GF_LOG_INFO,
- "failed to set the inode ctx (%s)",
- uuid_utoa (inode->gfid));
- }
- }
- UNLOCK (&inode->lock);
-out:
- return;
+ return _gf_false;
}
+call_frame_t *
+afr_copy_frame(call_frame_t *base)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int op_errno = 0;
+
+ frame = copy_frame(base);
+ if (!frame)
+ return NULL;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local) {
+ AFR_STACK_DESTROY(frame);
+ return NULL;
+ }
+
+ return frame;
+}
-uint64_t
-afr_is_opendir_done (xlator_t *this, inode_t *inode)
+/* Check if an entry or inode could be undergoing a transaction. */
+gf_boolean_t
+afr_is_possibly_under_txn(afr_transaction_type type, afr_local_t *local,
+ xlator_t *this)
{
- int ret = 0;
- uint64_t ctx = 0;
- uint64_t opendir_done = 0;
+ int i = 0;
+ int tmp = 0;
+ afr_private_t *priv = NULL;
+ GF_UNUSED char *key = NULL;
+ int keylen = 0;
+
+ priv = this->private;
+
+ if (type == AFR_ENTRY_TRANSACTION) {
+ key = GLUSTERFS_PARENT_ENTRYLK;
+ keylen = SLEN(GLUSTERFS_PARENT_ENTRYLK);
+ } else if (type == AFR_DATA_TRANSACTION) {
+ /*FIXME: Use GLUSTERFS_INODELK_DOM_COUNT etc. once
+ * pl_inodelk_xattr_fill supports separate keys for different
+ * domains.*/
+ key = GLUSTERFS_INODELK_COUNT;
+ keylen = SLEN(GLUSTERFS_INODELK_COUNT);
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].xdata)
+ continue;
+ if (dict_get_int32n(local->replies[i].xdata, key, keylen, &tmp) == 0)
+ if (tmp)
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
- VALIDATE_OR_GOTO (inode, out);
+static void
+afr_inode_ctx_destroy(afr_inode_ctx_t *ctx)
+{
+ int i = 0;
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+ if (!ctx)
+ return;
- if (ret < 0)
- goto unlock;
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ GF_FREE(ctx->pre_op_done[i]);
+ }
- opendir_done = ctx & AFR_ICTX_OPENDIR_DONE_MASK;
- }
-unlock:
- UNLOCK (&inode->lock);
+ GF_FREE(ctx);
+}
+int
+__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
+{
+ uint64_t ctx_int = 0;
+ int ret = -1;
+ int i = -1;
+ int num_locks = -1;
+ afr_inode_ctx_t *ictx = NULL;
+ afr_lock_t *lock = NULL;
+ afr_private_t *priv = this->private;
+
+ ret = __inode_ctx_get(inode, this, &ctx_int);
+ if (ret == 0) {
+ *ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int;
+ return 0;
+ }
+
+ ictx = GF_CALLOC(1, sizeof(afr_inode_ctx_t), gf_afr_mt_inode_ctx_t);
+ if (!ictx)
+ goto out;
+
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ ictx->pre_op_done[i] = GF_CALLOC(sizeof *ictx->pre_op_done[i],
+ priv->child_count, gf_afr_mt_int32_t);
+ if (!ictx->pre_op_done[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ num_locks = sizeof(ictx->lock) / sizeof(afr_lock_t);
+ for (i = 0; i < num_locks; i++) {
+ lock = &ictx->lock[i];
+ INIT_LIST_HEAD(&lock->post_op);
+ INIT_LIST_HEAD(&lock->frozen);
+ INIT_LIST_HEAD(&lock->waiting);
+ INIT_LIST_HEAD(&lock->owners);
+ }
+
+ ctx_int = (uint64_t)(uintptr_t)ictx;
+ ret = __inode_ctx_set(inode, this, &ctx_int);
+ if (ret) {
+ goto out;
+ }
+
+ ictx->spb_choice = -1;
+ ictx->read_subvol = 0;
+ ictx->write_subvol = 0;
+ ictx->lock_count = 0;
+ ret = 0;
+ *ctx = ictx;
out:
- return opendir_done;
+ if (ret) {
+ afr_inode_ctx_destroy(ictx);
+ }
+ return ret;
}
+/*
+ * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS:
+ *
+ * |<---------- 64bit ------------>|
+ * 63 32 31 16 15 0
+ * | EVENT_GEN | DATA | METADATA |
+ *
+ *
+ * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which
+ * metadata can be attempted to be read.
+ *
+ * bit-0 => priv->subvolumes[0]
+ * bit-1 => priv->subvolumes[1]
+ * ... etc. till bit-15
+ *
+ * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data
+ * can be attempted to be read.
+ *
+ * bit-16 => priv->subvolumes[0]
+ * bit-17 => priv->subvolumes[1]
+ * ... etc. till bit-31
+ *
+ * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation)
+ * when DATA and METADATA was last updated.
+ *
+ * If EVENT_GEN is < priv->event_generation,
+ * or is 0, it means afr_inode_refresh() needs
+ * to be called to recalculate the bitmaps.
+ */
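
As a worked illustration of the layout described above, the 64-bit value is assembled and decoded with plain shifts and masks. This is only a sketch with hypothetical helper names; the real accessors are __afr_inode_read_subvol_get_small() and __afr_inode_read_subvol_set_small() further down in this file, and __afr_set_in_flight_sb_status() uses the same masks:

    #include <stdint.h>

    static uint64_t
    pack_read_subvol(uint16_t metadatamap, uint16_t datamap, uint32_t event)
    {
        return ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
               (((uint64_t)event) << 32);
    }

    static void
    unpack_read_subvol(uint64_t val, uint16_t *metadatamap, uint16_t *datamap,
                       uint32_t *event)
    {
        *metadatamap = val & 0x000000000000ffffULL;     /* bits 0..15  */
        *datamap = (val & 0x00000000ffff0000ULL) >> 16; /* bits 16..31 */
        *event = (val & 0xffffffff00000000ULL) >> 32;   /* bits 32..63 */
    }
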
-void
-afr_set_opendir_done (xlator_t *this, inode_t *inode)
+int
+__afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
+ inode_t *inode)
{
- uint64_t ctx = 0;
- int ret = 0;
+ int i = 0;
+ int txn_type = 0;
+ int count = 0;
+ int index = -1;
+ uint16_t datamap_old = 0;
+ uint16_t metadatamap_old = 0;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint16_t tmp_map = 0;
+ uint16_t mask = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ txn_type = local->transaction.type;
+
+ if (txn_type == AFR_DATA_TRANSACTION)
+ val = local->inode_ctx->write_subvol;
+ else
+ val = local->inode_ctx->read_subvol;
+
+ metadatamap_old = metadatamap = (val & 0x000000000000ffff);
+ datamap_old = datamap = (val & 0x00000000ffff0000) >> 16;
+ event = (val & 0xffffffff00000000) >> 32;
+
+ if (txn_type == AFR_DATA_TRANSACTION)
+ tmp_map = datamap;
+ else if (txn_type == AFR_METADATA_TRANSACTION)
+ tmp_map = metadatamap;
+
+ count = gf_bits_count(tmp_map);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i])
+ continue;
+
+ mask = 1 << i;
+ if (txn_type == AFR_METADATA_TRANSACTION)
+ metadatamap &= ~mask;
+ else if (txn_type == AFR_DATA_TRANSACTION)
+ datamap &= ~mask;
+ }
+
+ switch (txn_type) {
+ case AFR_METADATA_TRANSACTION:
+ if ((metadatamap_old != 0) && (metadatamap == 0) && (count == 1)) {
+ index = gf_bits_index(tmp_map);
+ local->transaction.in_flight_sb_errno = local->replies[index]
+ .op_errno;
+ local->transaction.in_flight_sb = _gf_true;
+ metadatamap |= (1 << index);
+ }
+ if (metadatamap_old != metadatamap) {
+ __afr_inode_need_refresh_set(inode, this);
+ }
+ break;
+
+ case AFR_DATA_TRANSACTION:
+ if ((datamap_old != 0) && (datamap == 0) && (count == 1)) {
+ index = gf_bits_index(tmp_map);
+ local->transaction.in_flight_sb_errno = local->replies[index]
+ .op_errno;
+ local->transaction.in_flight_sb = _gf_true;
+ datamap |= (1 << index);
+ }
+ if (datamap_old != datamap)
+ __afr_inode_need_refresh_set(inode, this);
+ break;
- VALIDATE_OR_GOTO (inode, out);
+ default:
+ break;
+ }
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+ val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+ (((uint64_t)event) << 32);
- if (ret < 0) {
- ctx = 0;
- }
+ if (txn_type == AFR_DATA_TRANSACTION)
+ local->inode_ctx->write_subvol = val;
+ local->inode_ctx->read_subvol = val;
- ctx = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx)
- | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK);
-
- ret = __inode_ctx_put (inode, this, ctx);
- if (ret) {
- gf_log_callingfn (this->name, GF_LOG_INFO,
- "failed to set the inode ctx (%s)",
- uuid_utoa (inode->gfid));
- }
- }
- UNLOCK (&inode->lock);
-out:
- return;
+ return 0;
}
-
-uint64_t
-afr_read_child (xlator_t *this, inode_t *inode)
+gf_boolean_t
+afr_is_symmetric_error(call_frame_t *frame, xlator_t *this)
{
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int op_errno = 0;
+ int i_errno = 0;
+ gf_boolean_t matching_errors = _gf_true;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret != -1) {
+ /* Operation succeeded on at least one subvol,
+ so it is not a failed-everywhere situation.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
+ i_errno = local->replies[i].op_errno;
+
+ if (i_errno == ENOTCONN) {
+ /* ENOTCONN is not a symmetric error. We do not
+ know if the operation was performed on the
+ backend or not.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
+
+ if (!op_errno) {
+ op_errno = i_errno;
+ } else if (op_errno != i_errno) {
+ /* Mismatching op_errno's */
+ matching_errors = _gf_false;
+ break;
+ }
+ }
+
+ return matching_errors;
+}
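
A few worked examples of the classification performed above may help (illustrative only):

    /* per-brick op_errno values          symmetric?
     * { EEXIST, EEXIST, EEXIST }         yes - the fop failed identically
     *                                      everywhere, nothing to record
     * { EEXIST, ENOTCONN, EEXIST }       no  - the disconnected brick may or
     *                                      may not have performed the fop
     * { 0 (success), EEXIST, EEXIST }    no  - it succeeded somewhere, so the
     *                                      failures must be accounted for */
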
- uint64_t ctx = 0;
- uint64_t read_child = 0;
+int
+afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame, inode_t *inode)
+{
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- VALIDATE_OR_GOTO (inode, out);
+ priv = this->private;
+ local = frame->local;
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+ /* If this transaction saw no failures, then exit. */
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) == 0)
+ return 0;
- if (ret < 0)
- goto unlock;
+ if (afr_is_symmetric_error(frame, this))
+ return 0;
- read_child = ctx & AFR_ICTX_READ_CHILD_MASK;
- }
-unlock:
- UNLOCK (&inode->lock);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_set_in_flight_sb_status(this, local, inode);
+ }
+ UNLOCK(&inode->lock);
-out:
- return read_child;
+ return ret;
}
-
-void
-afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child)
+int
+__afr_inode_read_subvol_get_small(inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int *event_p)
{
- uint64_t ctx = 0;
- int ret = 0;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ int i = 0;
+ afr_inode_ctx_t *ctx = NULL;
+
+ priv = this->private;
+
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret < 0)
+ return ret;
- VALIDATE_OR_GOTO (inode, out);
+ val = ctx->read_subvol;
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+ metadatamap = (val & 0x000000000000ffff);
+ datamap = (val & 0x00000000ffff0000) >> 16;
+ event = (val & 0xffffffff00000000) >> 32;
- if (ret < 0) {
- ctx = 0;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (metadata)
+ metadata[i] = (metadatamap >> i) & 1;
+ if (data)
+ data[i] = (datamap >> i) & 1;
+ }
- ctx = (~AFR_ICTX_READ_CHILD_MASK & ctx)
- | (AFR_ICTX_READ_CHILD_MASK & read_child);
+ if (event_p)
+ *event_p = event;
+ return ret;
+}
- ret = __inode_ctx_put (inode, this, ctx);
- if (ret) {
- gf_log_callingfn (this->name, GF_LOG_INFO,
- "failed to set the inode ctx (%s)",
- uuid_utoa (inode->gfid));
- }
- }
- UNLOCK (&inode->lock);
+int
+__afr_inode_read_subvol_set_small(inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int event)
+{
+ afr_private_t *priv = NULL;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint64_t val = 0;
+ int i = 0;
+ int ret = -1;
+ afr_inode_ctx_t *ctx = NULL;
-out:
- return;
-}
+ priv = this->private;
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret)
+ goto out;
-void
-afr_reset_xattr (dict_t **xattr, unsigned int child_count)
-{
- unsigned int i = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (data[i])
+ datamap |= (1 << i);
+ if (metadata[i])
+ metadatamap |= (1 << i);
+ }
- if (!xattr)
- goto out;
- for (i = 0; i < child_count; i++) {
- if (xattr[i]) {
- dict_unref (xattr[i]);
- xattr[i] = NULL;
- }
- }
+ val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+ (((uint64_t)event) << 32);
+
+ ctx->read_subvol = val;
+
+ ret = 0;
out:
- return;
+ return ret;
}
-void
-afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
+int
+__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int *event_p)
{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ priv = this->private;
- sh = &local->self_heal;
- priv = this->private;
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_get_small(inode, this, data, metadata,
+ event_p);
+ else
+ /* TBD: allocate structure with array and read from it */
+ ret = -1;
- if (sh->buf)
- GF_FREE (sh->buf);
+ return ret;
+}
- if (sh->parentbufs)
- GF_FREE (sh->parentbufs);
+int
+__afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
+ int *spb_choice)
+{
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- if (sh->inode)
- inode_unref (sh->inode);
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret < 0)
+ return ret;
- if (sh->xattr) {
- afr_reset_xattr (sh->xattr, priv->child_count);
- GF_FREE (sh->xattr);
- }
+ *spb_choice = ctx->spb_choice;
+ return 0;
+}
- if (sh->child_errno)
- GF_FREE (sh->child_errno);
+int
+__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
- if (sh->pending_matrix) {
- for (i = 0; i < priv->child_count; i++) {
- GF_FREE (sh->pending_matrix[i]);
- }
- GF_FREE (sh->pending_matrix);
- }
+ priv = this->private;
- if (sh->delta_matrix) {
- for (i = 0; i < priv->child_count; i++) {
- GF_FREE (sh->delta_matrix[i]);
- }
- GF_FREE (sh->delta_matrix);
- }
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_set_small(inode, this, data, metadata,
+ event);
+ else
+ ret = -1;
- if (sh->sources)
- GF_FREE (sh->sources);
+ return ret;
+}
- if (sh->success)
- GF_FREE (sh->success);
+int
+__afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
+ int spb_choice)
+{
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- if (sh->locked_nodes)
- GF_FREE (sh->locked_nodes);
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret)
+ goto out;
- if (sh->healing_fd && !sh->healing_fd_opened) {
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
- }
+ ctx->spb_choice = spb_choice;
- if (sh->linkname)
- GF_FREE ((char *)sh->linkname);
- if (sh->child_success)
- GF_FREE (sh->child_success);
+ ret = 0;
+out:
+ return ret;
+}
- if (sh->fresh_children)
- GF_FREE (sh->fresh_children);
- if (sh->fresh_parent_dirs)
- GF_FREE (sh->fresh_parent_dirs);
+int
+afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int *event_p)
+{
+ int ret = -1;
- loc_wipe (&sh->parent_loc);
- loc_wipe (&sh->lookup_loc);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_get(inode, this, data, metadata, event_p);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
}
-
-void
-afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
+int
+afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type)
{
- int i = 0;
- afr_private_t * priv = NULL;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+ unsigned char *data = alloca0(priv->child_count);
+ unsigned char *metadata = alloca0(priv->child_count);
+ int data_count = 0;
+ int metadata_count = 0;
+ int event_generation = 0;
+ int ret = 0;
+
+ ret = afr_inode_read_subvol_get(inode, this, data, metadata,
+ &event_generation);
+ if (ret == -1)
+ return -EIO;
+
+ data_count = AFR_COUNT(data, priv->child_count);
+ metadata_count = AFR_COUNT(metadata, priv->child_count);
+
+ if (inode->ia_type == IA_IFDIR) {
+ /* For directories, allow even if it is in data split-brain. */
+ if (type == AFR_METADATA_TRANSACTION || local->op == GF_FOP_STAT ||
+ local->op == GF_FOP_FSTAT) {
+ if (!metadata_count)
+ return -EIO;
+ }
+ } else {
+ /* For files, abort in case of data/metadata split-brain. */
+ if (!data_count || !metadata_count) {
+ return -EIO;
+ }
+ }
+
+ if (type == AFR_METADATA_TRANSACTION && readable)
+ memcpy(readable, metadata, priv->child_count * sizeof *metadata);
+ if (type == AFR_DATA_TRANSACTION && readable) {
+ if (!data_count)
+ memcpy(readable, local->child_up,
+ priv->child_count * sizeof *readable);
+ else
+ memcpy(readable, data, priv->child_count * sizeof *data);
+ }
+ if (event_p)
+ *event_p = event_generation;
+ return 0;
+}
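
The split-brain policy encoded above can be condensed into a small standalone predicate; this is a sketch under the same assumptions (readable() is a hypothetical name, and the fall-back to child_up for data reads with an empty data map is omitted):

    #include <stdbool.h>

    /* Returns true if the fop may proceed, false if it should fail with EIO. */
    static bool
    readable(bool is_dir, bool metadata_fop, int data_count, int metadata_count)
    {
        if (is_dir)
            /* Directories: only a metadata fop with no metadata-readable
             * subvolume is refused; data split-brain does not block them. */
            return !(metadata_fop && metadata_count == 0);

        /* Regular files: either kind of split-brain blocks the fop. */
        return (data_count != 0) && (metadata_count != 0);
    }
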
- priv = this->private;
+static int
+afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
+ int *spb_choice)
+{
+ int ret = -1;
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_split_brain_choice_get(inode, this, spb_choice);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (local->pending && local->pending[i])
- GF_FREE (local->pending[i]);
- }
+/*
+ * frame is used to get the favourite policy. Since
+ * afr_inode_split_brain_choice_get was called with afr_open, it is possible to
+ * have a frame without local->replies. So in that case, frame is passed as
+ * null, hence this function will handle the frame NULL case.
+ */
+int
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol)
+{
+ int ret = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- GF_FREE (local->pending);
+ GF_VALIDATE_OR_GOTO("afr", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out);
- if (local->internal_lock.locked_nodes)
- GF_FREE (local->internal_lock.locked_nodes);
+ priv = this->private;
- if (local->internal_lock.inode_locked_nodes)
- GF_FREE (local->internal_lock.inode_locked_nodes);
+ ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol);
+ if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) {
+ local = frame->local;
+ *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+ NULL);
+ if (*spb_subvol >= 0) {
+ ret = 0;
+ }
+ }
- if (local->internal_lock.entry_locked_nodes)
- GF_FREE (local->internal_lock.entry_locked_nodes);
+out:
+ return ret;
+}
+int
+afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
+{
+ int ret = -1;
- if (local->internal_lock.lower_locked_nodes)
- GF_FREE (local->internal_lock.lower_locked_nodes);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_set(inode, this, data, metadata, event);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
+}
- GF_FREE (local->transaction.child_errno);
- GF_FREE (local->child_errno);
+int
+afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice)
+{
+ int ret = -1;
- GF_FREE (local->transaction.basename);
- GF_FREE (local->transaction.new_basename);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- loc_wipe (&local->transaction.parent_loc);
- loc_wipe (&local->transaction.new_parent_loc);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_split_brain_choice_set(inode, this, spb_choice);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
}
-
-/**
- * afr_local_cleanup - cleanup everything in frame->local
+/* The caller of this should perform afr_inode_refresh if this function
+ * returns _gf_true
*/
-
-void
-afr_local_cleanup (afr_local_t *local, xlator_t *this)
+gf_boolean_t
+afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1,
+ int event_gen2)
{
- afr_private_t * priv = NULL;
-
- if (!local)
- return;
-
- afr_local_sh_cleanup (local, this);
+ gf_boolean_t need_refresh = _gf_false;
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- afr_local_transaction_cleanup (local, this);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- priv = this->private;
-
- loc_wipe (&local->loc);
- loc_wipe (&local->newloc);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret)
+ goto unlock;
- if (local->fd)
- fd_unref (local->fd);
+ need_refresh = ctx->need_refresh;
+ /* Hoping that the caller will do inode_refresh followed by
+ * this, hence setting the need_refresh to false */
+ ctx->need_refresh = _gf_false;
+ }
+unlock:
+ UNLOCK(&inode->lock);
- if (local->xattr_req)
- dict_unref (local->xattr_req);
+ if (event_gen1 != event_gen2)
+ need_refresh = _gf_true;
+out:
+ return need_refresh;
+}
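
Per the comment at the top of this function, a true return is expected to be followed by an inode refresh before the read is served. A hedged sketch of that calling pattern (the names and signatures below are assumptions based on the surrounding AFR code; the actual caller lives in afr-read-txn.c):

    /*
     *   if (afr_is_inode_refresh_reqd(inode, this, local->event_generation,
     *                                 event_generation))
     *       afr_inode_refresh(frame, this, inode, NULL, refresh_done_cbk);
     *   else
     *       refresh_done_cbk(frame, this, 0);
     */
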
- GF_FREE (local->child_up);
+int
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
+{
+ int ret = -1;
+ afr_inode_ctx_t *ctx = NULL;
- { /* lookup */
- if (local->cont.lookup.xattrs) {
- afr_reset_xattr (local->cont.lookup.xattrs,
- priv->child_count);
- GF_FREE (local->cont.lookup.xattrs);
- local->cont.lookup.xattrs = NULL;
- }
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret == 0) {
+ ctx->need_refresh = _gf_true;
+ }
- if (local->cont.lookup.xattr) {
- dict_unref (local->cont.lookup.xattr);
- }
+ return ret;
+}
- if (local->cont.lookup.inode) {
- inode_unref (local->cont.lookup.inode);
- }
+int
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
+{
+ int ret = -1;
- if (local->cont.lookup.postparents)
- GF_FREE (local->cont.lookup.postparents);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- if (local->cont.lookup.bufs)
- GF_FREE (local->cont.lookup.bufs);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_need_refresh_set(inode, this);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
+}
- if (local->cont.lookup.child_success)
- GF_FREE (local->cont.lookup.child_success);
+int
+afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode)
+{
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- if (local->cont.lookup.sources)
- GF_FREE (local->cont.lookup.sources);
- }
+ if (!inode)
+ return ret;
- { /* getxattr */
- if (local->cont.getxattr.name)
- GF_FREE (local->cont.getxattr.name);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret < 0 || !ctx) {
+ UNLOCK(&inode->lock);
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+ "Failed to cancel split-brain choice timer.");
+ goto out;
+ }
+ ctx->spb_choice = -1;
+ if (ctx->timer) {
+ gf_timer_call_cancel(this->ctx, ctx->timer);
+ ctx->timer = NULL;
}
+ ret = 0;
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
+}
- { /* lk */
- if (local->cont.lk.locked_nodes)
- GF_FREE (local->cont.lk.locked_nodes);
- }
+void
+afr_set_split_brain_choice_cbk(void *data)
+{
+ inode_t *inode = data;
+ xlator_t *this = THIS;
- { /* create */
- if (local->cont.create.fd)
- fd_unref (local->cont.create.fd);
- if (local->cont.create.params)
- dict_unref (local->cont.create.params);
- }
+ afr_spb_choice_timeout_cancel(this, inode);
+ inode_invalidate(inode);
+ inode_unref(inode);
+ return;
+}
- { /* mknod */
- if (local->cont.mknod.params)
- dict_unref (local->cont.mknod.params);
+int
+afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque)
+{
+ int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+ inode_t *inode = NULL;
+ loc_t *loc = NULL;
+ xlator_t *this = NULL;
+ afr_spbc_timeout_t *data = opaque;
+ struct timespec delta = {
+ 0,
+ };
+ gf_boolean_t timer_set = _gf_false;
+ gf_boolean_t timer_cancelled = _gf_false;
+ gf_boolean_t timer_reset = _gf_false;
+ int old_spb_choice = -1;
+
+ frame = data->frame;
+ loc = data->loc;
+ this = frame->this;
+ priv = this->private;
+
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ delta.tv_sec = priv->spb_choice_timeout;
+ delta.tv_nsec = 0;
+
+ if (!loc->inode) {
+ ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ if (!(data->d_spb || data->m_spb)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+ "Cannot set "
+ "replica.split-brain-choice on %s. File is"
+ " not in data/metadata split-brain.",
+ uuid_utoa(loc->gfid));
+ ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ /*
+ * we're ref'ing the inode before LOCK like it is done elsewhere in the
+ * code. If we ref after LOCK, coverity complains of possible deadlocks.
+ */
+ inode = inode_ref(loc->inode);
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret) {
+ UNLOCK(&inode->lock);
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+ "Failed to get inode_ctx for %s", loc->name);
+ goto post_unlock;
}
- { /* mkdir */
- if (local->cont.mkdir.params)
- dict_unref (local->cont.mkdir.params);
- }
+ old_spb_choice = ctx->spb_choice;
+ ctx->spb_choice = data->spb_child_index;
- { /* symlink */
- if (local->cont.symlink.params)
- dict_unref (local->cont.symlink.params);
- }
+ /* Possible changes in spb-choice :
+ * valid to -1 : cancel timer and unref
+ * valid to valid : cancel timer and inject new one
+ * -1 to -1 : unref and do not do anything
+ * -1 to valid : inject timer
+ */
- { /* writev */
- GF_FREE (local->cont.writev.vector);
- }
+ /* ctx->timer is NULL iff previous value of
+ * ctx->spb_choice is -1
+ */
+ if (ctx->timer) {
+ if (ctx->spb_choice == -1) {
+ if (!gf_timer_call_cancel(this->ctx, ctx->timer)) {
+ ctx->timer = NULL;
+ timer_cancelled = _gf_true;
+ }
+ /* If timer cancel failed here it means that the
+ * previous cbk will be executed which will set
+ * spb_choice to -1. So we can consider the
+ * 'valid to -1' case to be a success
+ * (i.e. ret = 0) and goto unlock.
+ */
+ goto unlock;
+ }
+ goto reset_timer;
+ } else {
+ if (ctx->spb_choice == -1)
+ goto unlock;
+ goto set_timer;
+ }
+
+ reset_timer:
+ ret = gf_timer_call_cancel(this->ctx, ctx->timer);
+ if (ret != 0) {
+ /* We need to bail out now instead of launching a new
+ * timer. Otherwise the cbk of the previous timer event
+ * will cancel the new ctx->timer.
+ */
+ ctx->spb_choice = old_spb_choice;
+ ret = -1;
+ op_errno = EAGAIN;
+ goto unlock;
+ }
+ ctx->timer = NULL;
+ timer_reset = _gf_true;
+
+ set_timer:
+ ctx->timer = gf_timer_call_after(this->ctx, delta,
+ afr_set_split_brain_choice_cbk, inode);
+ if (!ctx->timer) {
+ ctx->spb_choice = old_spb_choice;
+ ret = -1;
+ op_errno = ENOMEM;
+ }
+ if (!timer_reset && ctx->timer)
+ timer_set = _gf_true;
+ if (timer_reset && !ctx->timer)
+ timer_cancelled = _gf_true;
+ }
+unlock:
+ UNLOCK(&inode->lock);
+post_unlock:
+ if (!timer_set)
+ inode_unref(inode);
+ if (timer_cancelled)
+ inode_unref(inode);
+ /*
+ * We need to invalidate the inode to prevent the kernel from serving
+ * reads from an older cached value despite a change in spb_choice to
+ * a new value.
+ */
+ inode_invalidate(inode);
+out:
+ GF_FREE(data);
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ return 0;
+}
- { /* setxattr */
- if (local->cont.setxattr.dict)
- dict_unref (local->cont.setxattr.dict);
- }
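+/* Scan the pending-changelog xattrs in @xdata and set accused[i] = 1 for
+ * every brick whose counter for the given transaction type is non-zero. */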
+int
+afr_accused_fill(xlator_t *this, dict_t *xdata, unsigned char *accused,
+ afr_transaction_type type)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int idx = afr_index_for_transaction_type(type);
+ void *pending_raw = NULL;
+ int pending[3];
+ int ret = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_get_ptr(xdata, priv->pending_key[i], &pending_raw);
+ if (ret) /* no pending flags */
+ continue;
+ memcpy(pending, pending_raw, sizeof(pending));
+
+ if (ntoh32(pending[idx]))
+ accused[i] = 1;
+ }
+
+ return 0;
+}
- { /* removexattr */
- GF_FREE (local->cont.removexattr.name);
- }
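+/* Find the largest file size reported by the non-accused bricks and accuse
+ * any non-arbiter brick whose reported size is smaller than that. */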
+int
+afr_accuse_smallfiles(xlator_t *this, struct afr_reply *replies,
+ unsigned char *data_accused)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t maxsize = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].xdata &&
+ dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))
+ continue;
+ if (data_accused[i])
+ continue;
+ if (replies[i].poststat.ia_size > maxsize)
+ maxsize = replies[i].poststat.ia_size;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i])
+ continue;
+ if (AFR_IS_ARBITER_BRICK(priv, i))
+ continue;
+ if (replies[i].poststat.ia_size < maxsize)
+ data_accused[i] = 1;
+ }
+
+ return 0;
+}
- { /* symlink */
- GF_FREE (local->cont.symlink.linkpath);
- }
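+/* Derive the per-brick data/metadata readability of @inode from either
+ * lookup replies or pre-op xattrop responses; bricks accused by pending
+ * changelogs (or holding a smaller file) are marked unreadable. Returns
+ * non-zero if any brick ended up accused. */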
+int
+afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *data_accused, unsigned char *metadata_accused,
+ unsigned char *data_readable,
+ unsigned char *metadata_readable, struct afr_reply *replies)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+ int ret = 0;
+ ia_type_t ia_type = IA_INVAL;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ data_readable[i] = 1;
+ metadata_readable[i] = 1;
+ }
+ if (AFR_IS_ARBITER_BRICK(priv, ARBITER_BRICK_INDEX)) {
+ data_readable[ARBITER_BRICK_INDEX] = 0;
+ metadata_readable[ARBITER_BRICK_INDEX] = 0;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies) { /* Lookup */
+ if (!replies[i].valid || replies[i].op_ret == -1 ||
+ (replies[i].xdata &&
+ dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))) {
+ data_readable[i] = 0;
+ metadata_readable[i] = 0;
+ continue;
+ }
+
+ xdata = replies[i].xdata;
+ ia_type = replies[i].poststat.ia_type;
+ } else { /* pre-op xattrop */
+ xdata = local->transaction.changelog_xdata[i];
+ ia_type = inode->ia_type;
+ }
+
+ if (!xdata)
+ continue; /* mkdir_cbk sends NULL xdata_rsp. */
+ afr_accused_fill(this, xdata, data_accused,
+ (ia_type == IA_IFDIR) ? AFR_ENTRY_TRANSACTION
+ : AFR_DATA_TRANSACTION);
+
+ afr_accused_fill(this, xdata, metadata_accused,
+ AFR_METADATA_TRANSACTION);
+ }
+
+ if (replies && ia_type != IA_INVAL && ia_type != IA_IFDIR &&
+ /* We want to accuse small files only when we know for
+ * sure that there is no IO happening. Otherwise, the
+ * ia_sizes obtained in post-refresh replies may
+ * mismatch due to a race between inode-refresh and
+ * ongoing writes, causing spurious heal launches. */
+ !afr_is_possibly_under_txn(AFR_DATA_TRANSACTION, local, this)) {
+ afr_accuse_smallfiles(this, replies, data_accused);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i]) {
+ data_readable[i] = 0;
+ ret = 1;
+ }
+ if (metadata_accused[i]) {
+ metadata_readable[i] = 0;
+ ret = 1;
+ }
+ }
+ return ret;
+}
- { /* opendir */
- if (local->cont.opendir.checksum)
- GF_FREE (local->cont.opendir.checksum);
- }
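+/* Interpret local->replies: compute the data/metadata readable bricks, store
+ * them in the inode context along with the event generation, and report via
+ * @start_heal whether a self-heal should be launched. */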
+int
+afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ gf_boolean_t *start_heal)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int event_generation = 0;
+ int i = 0;
+ unsigned char *data_accused = NULL;
+ unsigned char *metadata_accused = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+ event_generation = local->event_generation;
+
+ data_accused = alloca0(priv->child_count);
+ data_readable = alloca0(priv->child_count);
+ metadata_accused = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+
+ ret = afr_readables_fill(frame, this, inode, data_accused, metadata_accused,
+ data_readable, metadata_readable, replies);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (start_heal && priv->child_up[i] &&
+ (data_accused[i] || metadata_accused[i])) {
+ *start_heal = _gf_true;
+ break;
+ }
+ }
+ afr_inode_read_subvol_set(inode, this, data_readable, metadata_readable,
+ event_generation);
+ return ret;
}
+int
+afr_refresh_selfheal_done(int ret, call_frame_t *heal, void *opaque)
+{
+ if (heal)
+ AFR_STACK_DESTROY(heal);
+ return 0;
+}
int
-afr_frame_return (call_frame_t *frame)
+afr_inode_refresh_err(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- int call_count = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int err = 0;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- LOCK (&frame->lock);
- {
- call_count = --local->call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && !local->replies[i].op_ret) {
+ err = 0;
+ goto ret;
}
- UNLOCK (&frame->lock);
+ }
- return call_count;
+ err = afr_final_errno(local, priv);
+ret:
+ return err;
}
-
-/**
- * up_children_count - return the number of children that are up
- */
-
-int
-afr_up_children_count (int child_count, unsigned char *child_up)
+gf_boolean_t
+afr_selfheal_enabled(const xlator_t *this)
{
- int i = 0;
- int ret = 0;
+ const afr_private_t *priv = this->private;
- for (i = 0; i < child_count; i++)
- if (child_up[i])
- ret++;
- return ret;
+ return priv->data_self_heal || priv->metadata_self_heal ||
+ priv->entry_self_heal;
}
-gf_boolean_t
-afr_is_fresh_lookup (loc_t *loc, xlator_t *this)
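+/* Invoked when an inode refresh issued for a transaction completes:
+ * recompute the readable subvolumes and, if the file is in split-brain with
+ * a favorite-child policy configured, launch a synctask to reset the sink
+ * xattrs before resuming the fop through local->refreshfn. */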
+int
+afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
{
- uint64_t ctx = 0;
- int32_t ret = 0;
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *heal_local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ int event_generation = 0;
+ int read_subvol = -1;
+ int ret = 0;
+
+ local = frame->local;
+ inode = local->inode;
+ priv = this->private;
+
+ if (err)
+ goto refresh_done;
+
+ if (local->op == GF_FOP_LOOKUP)
+ goto refresh_done;
+
+ ret = afr_inode_get_readable(frame, inode, this, local->readable,
+ &event_generation, local->transaction.type);
+
+ if (ret == -EIO) {
+ /* No readable subvolume even after refresh ==> splitbrain.*/
+ if (!priv->fav_child_policy) {
+ err = EIO;
+ goto refresh_done;
+ }
+ read_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+ NULL);
+ if (read_subvol == -1) {
+ err = EIO;
+ goto refresh_done;
+ }
+
+ heal_frame = afr_frame_create(this, NULL);
+ if (!heal_frame) {
+ err = EIO;
+ goto refresh_done;
+ }
+ heal_local = heal_frame->local;
+ heal_local->xdata_req = dict_new();
+ if (!heal_local->xdata_req) {
+ err = EIO;
+ AFR_STACK_DESTROY(heal_frame);
+ goto refresh_done;
+ }
+ heal_local->heal_frame = frame;
+ ret = synctask_new(this->ctx->env, afr_fav_child_reset_sink_xattrs,
+ afr_fav_child_reset_sink_xattrs_cbk, heal_frame,
+ heal_frame);
+ return 0;
+ }
- GF_ASSERT (loc);
- GF_ASSERT (this);
- GF_ASSERT (loc->inode);
+refresh_done:
+ afr_local_replies_wipe(local, this->private);
+ local->refreshfn(frame, this, err);
- ret = inode_ctx_get (loc->inode, this, &ctx);
- if (0 == ret)
- return _gf_false;
- return _gf_true;
+ return 0;
}
-void
-afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)
+int
+afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
{
- GF_ASSERT (loc);
- GF_ASSERT (buf);
-
- uuid_copy (loc->gfid, buf->ia_gfid);
- if (postparent)
- uuid_copy (loc->pargfid, postparent->ia_gfid);
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t start_heal = _gf_false;
+ afr_local_t *heal_local = NULL;
+ unsigned char *success_replies = NULL;
+ int ret = 0;
+
+ if (error != 0) {
+ goto refresh_done;
+ }
+
+ local = frame->local;
+ priv = this->private;
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+
+ if (priv->thin_arbiter_count && local->is_read_txn &&
+ AFR_COUNT(success_replies, priv->child_count) != priv->child_count) {
+ /* We need to query the good bricks and/or thin-arbiter.*/
+ if (success_replies[0]) {
+ local->read_txn_query_child = AFR_CHILD_ZERO;
+ } else if (success_replies[1]) {
+ local->read_txn_query_child = AFR_CHILD_ONE;
+ }
+ error = EINVAL;
+ goto refresh_done;
+ }
+
+ if (!afr_has_quorum(success_replies, this, frame)) {
+ error = afr_final_errno(frame->local, this->private);
+ if (!error)
+ error = afr_quorum_errno(priv);
+ goto refresh_done;
+ }
+
+ ret = afr_replies_interpret(frame, this, local->refreshinode, &start_heal);
+
+ if (ret && afr_selfheal_enabled(this) && start_heal) {
+ heal_frame = afr_frame_create(this, NULL);
+ if (!heal_frame)
+ goto refresh_done;
+ heal_local = heal_frame->local;
+ heal_local->refreshinode = inode_ref(local->refreshinode);
+ heal_local->heal_frame = heal_frame;
+ if (!afr_throttled_selfheal(heal_frame, this)) {
+ AFR_STACK_DESTROY(heal_frame);
+ goto refresh_done;
+ }
+ }
+
+refresh_done:
+ afr_txn_refresh_done(frame, this, error);
+
+ return 0;
}
-ino64_t
-afr_itransform (ino64_t ino, int child_count, int child_index)
+void
+afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *buf,
+ dict_t *xdata, struct iatt *par)
{
- ino64_t scaled_ino = -1;
+ afr_local_t *local = NULL;
+ int call_child = (long)cookie;
+ int8_t need_heal = 1;
+ int call_count = 0;
+ int ret = 0;
+
+ local = frame->local;
+ local->replies[call_child].valid = 1;
+ local->replies[call_child].op_ret = op_ret;
+ local->replies[call_child].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[call_child].poststat = *buf;
+ if (par)
+ local->replies[call_child].postparent = *par;
+ if (xdata)
+ local->replies[call_child].xdata = dict_ref(xdata);
+ }
+
+ if (xdata) {
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to get link count");
+ }
+ }
- if (ino == ((uint64_t) -1)) {
- scaled_ino = ((uint64_t) -1);
- goto out;
+ local->replies[call_child].need_heal = need_heal;
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ afr_set_need_heal(this, local);
+ ret = afr_inode_refresh_err(frame, this);
+ if (ret) {
+ gf_msg_debug(this->name, ret, "afr_inode_refresh_err failed");
}
+ afr_inode_refresh_done(frame, this, ret);
+ }
+}
- scaled_ino = (ino * child_count) + child_index;
+int
+afr_inode_refresh_subvol_with_lookup_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata,
+ struct iatt *par)
+{
+ afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ xdata, par);
+ return 0;
+}
-out:
- return scaled_ino;
+int
+afr_inode_refresh_subvol_with_lookup(call_frame_t *frame, xlator_t *this, int i,
+ inode_t *inode, uuid_t gfid, dict_t *xdata)
+{
+ loc_t loc = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ loc.inode = inode;
+ if (gf_uuid_is_null(inode->gfid) && gfid) {
+ /* To handle setattr/setxattr on yet to be linked inode from
+ * dht */
+ gf_uuid_copy(loc.gfid, gfid);
+ } else {
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ }
+
+ STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_lookup_cbk,
+ (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->lookup, &loc, xdata);
+ return 0;
}
+int
+afr_inode_refresh_subvol_with_fstat_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ xdata, NULL);
+ return 0;
+}
int
-afr_deitransform_orig (ino64_t ino, int child_count)
+afr_inode_refresh_subvol_with_fstat(call_frame_t *frame, xlator_t *this, int i,
+ dict_t *xdata)
{
- int index = -1;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- index = ino % child_count;
+ priv = this->private;
+ local = frame->local;
- return index;
+ STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_fstat_cbk,
+ (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->fstat, local->fd, xdata);
+ return 0;
}
-
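+/* Perform the refresh proper: wind a lookup (or an fstat when an fd is
+ * available) carrying the pending-changelog xattr request to every eligible
+ * brick; afr_inode_refresh_done() runs once all replies are in. */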
int
-afr_deitransform (ino64_t ino, int child_count)
+afr_inode_refresh_do(call_frame_t *frame, xlator_t *this)
{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ int ret = 0;
+ dict_t *xdata = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ unsigned char *wind_subvols = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ wind_subvols = alloca0(priv->child_count);
+
+ afr_local_replies_wipe(local, priv);
+
+ if (local->fd) {
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx) {
+ afr_inode_refresh_done(frame, this, EINVAL);
+ return 0;
+ }
+ }
+
+ xdata = dict_new();
+ if (!xdata) {
+ afr_inode_refresh_done(frame, this, ENOMEM);
return 0;
-}
+ }
+ ret = afr_xattr_req_prepare(this, xdata);
+ if (ret != 0) {
+ dict_unref(xdata);
+ afr_inode_refresh_done(frame, this, -ret);
+ return 0;
+ }
-void
-afr_lookup_build_response_params (afr_local_t *local, xlator_t *this)
-{
- int32_t read_child = -1;
- struct iatt *buf = NULL;
- struct iatt *postparent = NULL;
- dict_t **xattr = NULL;
+ ret = dict_set_sizen_str_sizen(xdata, "link-count", GF_XATTROP_INDEX_COUNT);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+ }
- GF_ASSERT (local);
+ ret = dict_set_str_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT, this->name);
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "Unable to set inodelk-dom-count in dict ");
+ }
- buf = &local->cont.lookup.buf;
- postparent = &local->cont.lookup.postparent;
- xattr = &local->cont.lookup.xattr;
+ if (local->fd) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] && fd_ctx->opened_on[i] == AFR_FD_OPENED)
+ wind_subvols[i] = 1;
+ }
+ } else {
+ memcpy(wind_subvols, local->child_up,
+ sizeof(*local->child_up) * priv->child_count);
+ }
- read_child = afr_read_child (this, local->cont.lookup.inode);
- gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d",
- read_child);
- *xattr = dict_ref (local->cont.lookup.xattrs[read_child]);
- *buf = local->cont.lookup.bufs[read_child];
- *postparent = local->cont.lookup.postparents[read_child];
+ local->call_count = AFR_COUNT(wind_subvols, priv->child_count);
- if (IA_INVAL == local->cont.lookup.inode->ia_type) {
- /* fix for RT #602 */
- local->cont.lookup.inode->ia_type = buf->ia_type;
- }
-}
+ call_count = local->call_count;
+ if (!call_count) {
+ dict_unref(xdata);
+ if (local->fd && AFR_COUNT(local->child_up, priv->child_count))
+ afr_inode_refresh_done(frame, this, EBADFD);
+ else
+ afr_inode_refresh_done(frame, this, ENOTCONN);
+ return 0;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_subvols[i])
+ continue;
-static void
-afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this,
- int child_index, dict_t *xattr)
-{
- uint32_t inodelk_count = 0;
- uint32_t entrylk_count = 0;
- int ret = -1;
+ if (local->fd)
+ afr_inode_refresh_subvol_with_fstat(frame, this, i, xdata);
+ else
+ afr_inode_refresh_subvol_with_lookup(
+ frame, this, i, local->refreshinode, local->refreshgfid, xdata);
- GF_ASSERT (local);
- GF_ASSERT (this);
- GF_ASSERT (xattr);
- GF_ASSERT (child_index >= 0);
+ if (!--call_count)
+ break;
+ }
- ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT,
- &inodelk_count);
- if (ret == 0)
- local->inodelk_count += inodelk_count;
+ dict_unref(xdata);
- ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT,
- &entrylk_count);
- if (ret == 0)
- local->entrylk_count += entrylk_count;
+ return 0;
}
-static void
-afr_lookup_detect_self_heal_by_xattr (afr_local_t *local, xlator_t *this,
- dict_t *xattr)
+int
+afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, afr_inode_refresh_cbk_t refreshfn)
{
- GF_ASSERT (local);
- GF_ASSERT (this);
- GF_ASSERT (xattr);
+ afr_local_t *local = NULL;
- if (afr_sh_has_metadata_pending (xattr, this)) {
- local->self_heal.need_metadata_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "metadata self-heal is pending for %s.",
- local->loc.path);
- }
+ local = frame->local;
- if (afr_sh_has_entry_pending (xattr, this)) {
- local->self_heal.need_entry_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "entry self-heal is pending for %s.", local->loc.path);
- }
+ local->refreshfn = refreshfn;
- if (afr_sh_has_data_pending (xattr, this)) {
- local->self_heal.need_data_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "data self-heal is pending for %s.", local->loc.path);
- }
-}
+ if (local->refreshinode) {
+ inode_unref(local->refreshinode);
+ local->refreshinode = NULL;
+ }
-static void
-afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this,
- struct iatt *buf, struct iatt *lookup_buf)
-{
- if (PERMISSION_DIFFERS (buf, lookup_buf)) {
- /* mismatching permissions */
- gf_log (this->name, GF_LOG_INFO,
- "permissions differ for %s ", local->loc.path);
- local->self_heal.need_metadata_self_heal = _gf_true;
- }
+ local->refreshinode = inode_ref(inode);
- if (OWNERSHIP_DIFFERS (buf, lookup_buf)) {
- /* mismatching permissions */
- local->self_heal.need_metadata_self_heal = _gf_true;
- gf_log (this->name, GF_LOG_INFO,
- "ownership differs for %s ", local->loc.path);
- }
+ if (gfid)
+ gf_uuid_copy(local->refreshgfid, gfid);
+ else
+ gf_uuid_clear(local->refreshgfid);
- if (SIZE_DIFFERS (buf, lookup_buf)
- && IA_ISREG (buf->ia_type)) {
- gf_log (this->name, GF_LOG_INFO,
- "size differs for %s ", local->loc.path);
- local->self_heal.need_data_self_heal = _gf_true;
- }
+ afr_inode_refresh_do(frame, this);
- if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) {
- /* mismatching gfid */
- gf_log (this->name, GF_LOG_WARNING,
- "%s: gfid different on subvolume", local->loc.path);
- }
+ return 0;
}
int
-afr_is_valid_read_child (int *sources, int32_t child_count, int32_t read_child)
+afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req)
{
- int valid = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
- if (!sources)
- goto out;
+ priv = this->private;
- if ((read_child < 0) ||
- (read_child >= child_count))
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_uint64(xattr_req, priv->pending_key[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret < 0)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Unable to set dict value for %s", priv->pending_key[i]);
+ /* 3 = data+metadata+entry */
+ }
+ ret = dict_set_uint64(xattr_req, AFR_DIRTY,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "failed to set dirty "
+ "query flag");
+ }
+
+ ret = dict_set_int32_sizen(xattr_req, "list-xattr", 1);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set list-xattr in dict ");
+ }
+
+ return ret;
+}
- valid = sources[read_child];
+int
+afr_lookup_xattr_req_prepare(afr_local_t *local, xlator_t *this,
+ dict_t *xattr_req, loc_t *loc)
+{
+ int ret = -ENOMEM;
+
+ if (!local->xattr_req)
+ local->xattr_req = dict_new();
+
+ if (!local->xattr_req)
+ goto out;
+
+ if (xattr_req && (xattr_req != local->xattr_req))
+ dict_copy(xattr_req, local->xattr_req);
+
+ ret = afr_xattr_req_prepare(this, local->xattr_req);
+
+ ret = dict_set_uint64(local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Unable to set dict value for %s", loc->path,
+ GLUSTERFS_INODELK_COUNT);
+ }
+ ret = dict_set_uint64(local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Unable to set dict value for %s", loc->path,
+ GLUSTERFS_ENTRYLK_COUNT);
+ }
+
+ ret = dict_set_uint32(local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Unable to set dict value for %s", loc->path,
+ GLUSTERFS_PARENT_ENTRYLK);
+ }
+
+ ret = dict_set_sizen_str_sizen(local->xattr_req, "link-count",
+ GF_XATTROP_INDEX_COUNT);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+ }
+
+ ret = 0;
out:
- return valid;
+ return ret;
}
-static void
-afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this)
-{
- GF_ASSERT (local);
- GF_ASSERT (this);
-
- if ((local->success_count > 0) && (local->enoent_count > 0)) {
- local->self_heal.need_metadata_self_heal = _gf_true;
- local->self_heal.need_data_self_heal = _gf_true;
- local->self_heal.need_entry_self_heal = _gf_true;
- local->self_heal.need_gfid_self_heal = _gf_true;
- local->self_heal.need_missing_entry_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_INFO,
- "entries are missing in lookup of %s.",
- local->loc.path);
- //If all self-heals are needed no need to check for other rules
- goto out;
- }
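+/* Pick the readable, non-arbiter brick with the fewest in-flight reads;
+ * used by the less-load read policy. */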
+int
+afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable)
+{
+ int i = 0;
+ int child = -1;
+ int64_t read_iter = -1;
+ int64_t pending_read = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i])
+ continue;
+ read_iter = GF_ATOMIC_GET(priv->pending_reads[i]);
+ if (child == -1 || read_iter < pending_read) {
+ pending_read = read_iter;
+ child = i;
+ }
+ }
+
+ return child;
+}
- if ((local->success_count > 0) &&
- afr_is_split_brain (this, local->cont.lookup.inode) &&
- IA_ISREG (local->cont.lookup.inode->ia_type)) {
- local->self_heal.need_data_self_heal = _gf_true;
- local->self_heal.need_gfid_self_heal = _gf_true;
- local->self_heal.need_missing_entry_self_heal = _gf_true;
- gf_log (this->name, GF_LOG_WARNING,
- "split brain detected during lookup of %s.",
- local->loc.path);
+static int32_t
+afr_least_latency_child(afr_private_t *priv, unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
+
+ if (child == -1 ||
+ priv->child_latency[i] < priv->child_latency[child]) {
+ child = i;
}
+ }
+ return child;
+}
-out:
- return;
+static int32_t
+afr_least_latency_times_pending_reads_child(afr_private_t *priv,
+ unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
+ int64_t pending_read = 0;
+ int64_t latency = -1;
+ int64_t least_latency = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
+
+ pending_read = GF_ATOMIC_GET(priv->pending_reads[i]);
+ latency = (pending_read + 1) * priv->child_latency[i];
+
+ if (child == -1 || latency < least_latency) {
+ least_latency = latency;
+ child = i;
+ }
+ }
+ return child;
}
-gf_boolean_t
-afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv)
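+/* Choose a read subvolume according to priv->hash_mode: first-up, gfid
+ * hash, gfid+pid hash, least pending reads, least latency, or the
+ * load/latency hybrid. Returns -1 when the policy makes no selection. */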
+int
+afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv,
+ unsigned char *readable)
{
- GF_ASSERT (sh);
- GF_ASSERT (priv);
+ uuid_t gfid_copy = {
+ 0,
+ };
+ pid_t pid;
+ int child = -1;
+
+ switch (priv->hash_mode) {
+ case AFR_READ_POLICY_FIRST_UP:
+ break;
+ case AFR_READ_POLICY_GFID_HASH:
+ gf_uuid_copy(gfid_copy, args->gfid);
+ child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
+ priv->child_count;
+ break;
+ case AFR_READ_POLICY_GFID_PID_HASH:
+ if (args->ia_type != IA_IFDIR) {
+ /*
+ * Why getpid? Because it's one of the cheapest calls
+ * available - faster than gethostname etc. - and
+ * returns a constant-length value that's sure to be
+ * shorter than a UUID. It's still very unlikely to be
+ * the same across clients, so it still provides good
+ * mixing. We're not trying for perfection here. All we
+ * need is a low probability that multiple clients
+ * converge on the same subvolume.
+ */
+ gf_uuid_copy(gfid_copy, args->gfid);
+ pid = getpid();
+ *(pid_t *)gfid_copy ^= pid;
+ }
+ child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
+ priv->child_count;
+ break;
+ case AFR_READ_POLICY_LESS_LOAD:
+ child = afr_least_pending_reads_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LEAST_LATENCY:
+ child = afr_least_latency_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LOAD_LATENCY_HYBRID:
+ child = afr_least_latency_times_pending_reads_child(priv, readable);
+ break;
+ }
+
+ return child;
+}
- return (sh->need_gfid_self_heal
- || sh->need_missing_entry_self_heal
- || (priv->data_self_heal && sh->need_data_self_heal)
- || (priv->metadata_self_heal && sh->need_metadata_self_heal)
- || (priv->entry_self_heal && sh->need_entry_self_heal));
+int
+afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
+ unsigned char *readable,
+ afr_read_subvol_args_t *args)
+{
+ int i = 0;
+ int read_subvol = -1;
+ afr_private_t *priv = NULL;
+ afr_read_subvol_args_t local_args = {
+ 0,
+ };
+
+ priv = this->private;
+
+ /* first preference - explicitly specified or local subvolume */
+ if (priv->read_child >= 0 && readable[priv->read_child])
+ return priv->read_child;
+
+ if (inode_is_linked(inode)) {
+ gf_uuid_copy(local_args.gfid, inode->gfid);
+ local_args.ia_type = inode->ia_type;
+ } else if (args) {
+ local_args = *args;
+ }
+
+ /* second preference - use hashed mode */
+ read_subvol = afr_hash_child(&local_args, priv, readable);
+ if (read_subvol >= 0 && readable[read_subvol])
+ return read_subvol;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (readable[i])
+ return i;
+ }
+
+ /* no readable subvolumes, either split brain or all subvols down */
+
+ return -1;
}
-afr_transaction_type
-afr_transaction_type_get (ia_type_t ia_type)
+int
+afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type)
{
- afr_transaction_type type = AFR_METADATA_TRANSACTION;
+ int ret = -1;
- GF_ASSERT (ia_type != IA_INVAL);
+ if (type == AFR_METADATA_TRANSACTION)
+ ret = afr_inode_read_subvol_get(inode, this, 0, readable, event_p);
+ else
+ ret = afr_inode_read_subvol_get(inode, this, readable, 0, event_p);
+ return ret;
+}
- if (IA_ISDIR (ia_type)) {
- type = AFR_ENTRY_TRANSACTION;
- } else if (IA_ISREG (ia_type)) {
- type = AFR_DATA_TRANSACTION;
- }
- return type;
+void
+afr_readables_intersect_get(inode_t *inode, xlator_t *this, int *event,
+ unsigned char *intersection)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ unsigned char *intersect = NULL;
+
+ priv = this->private;
+ data_readable = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+ intersect = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable,
+ event);
+
+ AFR_INTERSECT(intersect, data_readable, metadata_readable,
+ priv->child_count);
+ if (intersection)
+ memcpy(intersection, intersect,
+ sizeof(*intersection) * priv->child_count);
}
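+/* Decide which subvolume to read from: fetch the readable bitmaps from the
+ * inode context, prefer bricks that are both data- and metadata-readable,
+ * and apply the configured read policy. Optionally returns the readable map
+ * and event generation to the caller. */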
int
-afr_lookup_select_read_child (afr_local_t *local, xlator_t *this,
- int32_t *read_child)
+afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p,
+ unsigned char *readables, int *event_p,
+ afr_transaction_type type, afr_read_subvol_args_t *args)
{
- ia_type_t ia_type = IA_INVAL;
- int32_t source = -1;
- int ret = -1;
- dict_t **xattrs = NULL;
- int32_t *success_children = NULL;
- struct iatt *bufs = NULL;
- afr_transaction_type type = AFR_METADATA_TRANSACTION;
+ afr_private_t *priv = NULL;
+ unsigned char *readable = NULL;
+ unsigned char *intersection = NULL;
+ int subvol = -1;
+ int event = 0;
+
+ priv = this->private;
+
+ readable = alloca0(priv->child_count);
+ intersection = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_type_get(inode, this, readable, &event, type);
+
+ afr_readables_intersect_get(inode, this, &event, intersection);
+
+ if (AFR_COUNT(intersection, priv->child_count) > 0)
+ subvol = afr_read_subvol_select_by_policy(inode, this, intersection,
+ args);
+ else
+ subvol = afr_read_subvol_select_by_policy(inode, this, readable, args);
+ if (subvol_p)
+ *subvol_p = subvol;
+ if (event_p)
+ *event_p = event;
+ if (readables)
+ memcpy(readables, readable, sizeof(*readables) * priv->child_count);
+ return subvol;
+}
- GF_ASSERT (local);
- GF_ASSERT (this);
+void
+afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
- bufs = local->cont.lookup.bufs;
- success_children = local->cont.lookup.child_success;
- /*We can take the success_children[0] only because we already
- *handle the conflicting children other wise, we could select the
- *read_child based on wrong file type
- */
- ia_type = local->cont.lookup.bufs[success_children[0]].ia_type;
- type = afr_transaction_type_get (ia_type);
- xattrs = local->cont.lookup.xattrs;
- source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs,
- type);
- if (source < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "failed to select source "
- "for %s", local->loc.path);
- goto out;
+ priv = this->private;
+
+ afr_matrix_cleanup(local->pending, priv->child_count);
+
+ GF_FREE(local->internal_lock.lower_locked_nodes);
+
+ afr_lockees_cleanup(&local->internal_lock);
+
+ GF_FREE(local->transaction.pre_op);
+
+ GF_FREE(local->transaction.pre_op_sources);
+ if (local->transaction.changelog_xdata) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.changelog_xdata[i])
+ continue;
+ dict_unref(local->transaction.changelog_xdata[i]);
}
+ GF_FREE(local->transaction.changelog_xdata);
+ }
- gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s",
- source, local->loc.path);
- *read_child = source;
- ret = 0;
-out:
- return ret;
+ GF_FREE(local->transaction.failed_subvols);
+
+ GF_FREE(local->transaction.basename);
+ GF_FREE(local->transaction.new_basename);
+
+ loc_wipe(&local->transaction.parent_loc);
+ loc_wipe(&local->transaction.new_parent_loc);
}
-static inline gf_boolean_t
-afr_is_self_heal_running (afr_local_t *local)
+void
+afr_reply_wipe(struct afr_reply *reply)
{
- GF_ASSERT (local);
- return ((local->inodelk_count > 0) || (local->entrylk_count > 0));
+ if (reply->xdata) {
+ dict_unref(reply->xdata);
+ reply->xdata = NULL;
+ }
+
+ if (reply->xattr) {
+ dict_unref(reply->xattr);
+ reply->xattr = NULL;
+ }
}
-static void
-afr_launch_self_heal (call_frame_t *frame, xlator_t *this,
- gf_boolean_t is_background, ia_type_t ia_type,
- inode_t *inode,
- void (*gfid_sh_success_cbk) (call_frame_t*, xlator_t*),
- int (*unwind) (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno))
+void
+afr_replies_wipe(struct afr_reply *replies, int count)
{
- afr_local_t *local = NULL;
- char sh_type_str[256] = {0,};
+ int i = 0;
- GF_ASSERT (frame);
- GF_ASSERT (this);
+ for (i = 0; i < count; i++) {
+ afr_reply_wipe(&replies[i]);
+ }
+}
- local = frame->local;
- local->self_heal.background = is_background;
- local->self_heal.type = ia_type;
- local->self_heal.unwind = unwind;
- local->self_heal.gfid_sh_success_cbk = gfid_sh_success_cbk;
-
- afr_self_heal_type_str_get (&local->self_heal,
- sh_type_str,
- sizeof (sh_type_str));
-
- gf_log (this->name, GF_LOG_INFO,
- "background %s self-heal triggered. path: %s",
- sh_type_str, local->loc.path);
-
- afr_self_heal (frame, this, inode);
-}
-
-unsigned int
-afr_gfid_missing_count (const char *xlator_name, int32_t *success_children,
- struct iatt *bufs, unsigned int child_count,
- const char *path)
-{
- unsigned int gfid_miss_count = 0;
- int i = 0;
- struct iatt *child1 = NULL;
-
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- child1 = &bufs[success_children[i]];
- if (uuid_is_null (child1->ia_gfid)) {
- gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null"
- " on subvolume %d", path, success_children[i]);
- gfid_miss_count++;
- }
- }
+void
+afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv)
+{
+ if (!local->replies)
+ return;
+
+ afr_replies_wipe(local->replies, priv->child_count);
- return gfid_miss_count;
+ memset(local->replies, 0, sizeof(*local->replies) * priv->child_count);
}
-static int
-afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this)
+static gf_boolean_t
+afr_fop_lock_is_unlock(call_frame_t *frame)
+{
+ afr_local_t *local = frame->local;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ if ((F_UNLCK == local->cont.inodelk.in_flock.l_type) &&
+ (local->cont.inodelk.in_cmd == F_SETLKW ||
+ local->cont.inodelk.in_cmd == F_SETLK))
+ return _gf_true;
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ if (ENTRYLK_UNLOCK == local->cont.entrylk.in_cmd)
+ return _gf_true;
+ break;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}
+
+static gf_boolean_t
+afr_lk_is_unlock(int32_t cmd, struct gf_flock *flock)
{
- int32_t *success_children = NULL;
- afr_private_t *priv = NULL;
- struct iatt *bufs = NULL;
- int miss_count = 0;
+ switch (cmd) {
+ case F_RESLK_UNLCK:
+ return _gf_true;
+ break;
- priv = this->private;
- bufs = local->cont.lookup.bufs;
- success_children = local->cont.lookup.child_success;
+#if F_SETLKW != F_SETLKW64
+ case F_SETLKW64:
+#endif
+ case F_SETLKW:
- miss_count = afr_gfid_missing_count (this->name, success_children,
- bufs, priv->child_count,
- local->loc.path);
- return miss_count;
+#if F_SETLK != F_SETLK64
+ case F_SETLK64:
+#endif
+ case F_SETLK:
+ if (F_UNLCK == flock->l_type)
+ return _gf_true;
+ break;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
}
-gf_boolean_t
-afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
- unsigned int child_count, const char *path,
- const char *xlator_name)
-{
- gf_boolean_t conflicting = _gf_false;
- int i = 0;
- struct iatt *child1 = NULL;
- struct iatt *child2 = NULL;
- uuid_t *gfid = NULL;
- char gfid_str[64] = {0};
-
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- child1 = &bufs[success_children[i]];
- if ((!gfid) && (!uuid_is_null (child1->ia_gfid)))
- gfid = &child1->ia_gfid;
-
- if (i == 0)
- continue;
-
- child2 = &bufs[success_children[i-1]];
- if (FILETYPE_DIFFERS (child1, child2)) {
- gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype "
- "differs on subvolumes (%d, %d)", path,
- success_children[i-1], success_children[i]);
- conflicting = _gf_true;
- goto out;
- }
- if (!gfid || uuid_is_null (child1->ia_gfid))
- continue;
- if (uuid_compare (*gfid, child1->ia_gfid)) {
- uuid_utoa_r (*gfid, gfid_str);
- gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs"
- " on subvolume %d (%s, %s)", path,
- success_children[i], gfid_str,
- uuid_utoa (child1->ia_gfid));
- conflicting = _gf_true;
- goto out;
- }
- }
-out:
- return conflicting;
+void
+afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret,
+ int32_t *op_errno)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ if (!frame || !frame->this || !frame->local || !frame->this->private)
+ return;
+
+ if (*op_ret < 0)
+ return;
+
+ /* Failing inodelk/entrylk/lk here is not a good idea because we would
+ * need to clean up the locks on the other bricks if we chose to fail
+ * the fop here. The brick may also go down just after the unwind happens,
+ * in which case the next fop will fail anyway, so leave it like this
+ * for now. */
+ local = frame->local;
+ switch (local->op) {
+ case GF_FOP_LOOKUP:
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ case GF_FOP_LK:
+ return;
+ default:
+ break;
+ }
+
+ priv = frame->this->private;
+ if (!priv->consistent_io)
+ return;
+
+ if (local->event_generation &&
+ (local->event_generation != priv->event_generation))
+ goto inconsistent;
+
+ return;
+inconsistent:
+ *op_ret = -1;
+ *op_errno = ENOTCONN;
}
-/* afr_update_gfid_from_iatts: This function should be called only if the
- * iatts are not conflicting.
- */
void
-afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs,
- int32_t *success_children, unsigned int child_count)
-{
- uuid_t *gfid = NULL;
- int i = 0;
- int child = 0;
-
- for (i = 0; i < child_count; i++) {
- child = success_children[i];
- if (child == -1)
- break;
- if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) {
- gfid = &bufs[child].ia_gfid;
- } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) {
- if (uuid_compare (*gfid, bufs[child].ia_gfid)) {
- GF_ASSERT (0);
- goto out;
- }
- }
- }
- if (gfid && (!uuid_is_null (*gfid)))
- uuid_copy (uuid, *gfid);
-out:
+afr_local_cleanup(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+
+ if (!local)
return;
+
+ syncbarrier_destroy(&local->barrier);
+
+ afr_local_transaction_cleanup(local, this);
+
+ priv = this->private;
+
+ loc_wipe(&local->loc);
+ loc_wipe(&local->newloc);
+
+ if (local->fd)
+ fd_unref(local->fd);
+
+ if (local->xattr_req)
+ dict_unref(local->xattr_req);
+
+ if (local->xattr_rsp)
+ dict_unref(local->xattr_rsp);
+
+ if (local->dict)
+ dict_unref(local->dict);
+
+ afr_local_replies_wipe(local, priv);
+ GF_FREE(local->replies);
+
+ GF_FREE(local->child_up);
+
+ GF_FREE(local->read_attempted);
+
+ GF_FREE(local->readable);
+ GF_FREE(local->readable2);
+
+ if (local->inode)
+ inode_unref(local->inode);
+
+ if (local->parent)
+ inode_unref(local->parent);
+
+ if (local->parent2)
+ inode_unref(local->parent2);
+
+ if (local->refreshinode)
+ inode_unref(local->refreshinode);
+
+ { /* getxattr */
+ GF_FREE(local->cont.getxattr.name);
+ }
+
+ { /* lk */
+ GF_FREE(local->cont.lk.locked_nodes);
+ GF_FREE(local->cont.lk.dom_locked_nodes);
+ GF_FREE(local->cont.lk.dom_lock_op_ret);
+ GF_FREE(local->cont.lk.dom_lock_op_errno);
+ }
+
+ { /* create */
+ if (local->cont.create.fd)
+ fd_unref(local->cont.create.fd);
+ if (local->cont.create.params)
+ dict_unref(local->cont.create.params);
+ }
+
+ { /* mknod */
+ if (local->cont.mknod.params)
+ dict_unref(local->cont.mknod.params);
+ }
+
+ { /* mkdir */
+ if (local->cont.mkdir.params)
+ dict_unref(local->cont.mkdir.params);
+ }
+
+ { /* symlink */
+ if (local->cont.symlink.params)
+ dict_unref(local->cont.symlink.params);
+ }
+
+ { /* writev */
+ GF_FREE(local->cont.writev.vector);
+ if (local->cont.writev.iobref)
+ iobref_unref(local->cont.writev.iobref);
+ }
+
+ { /* setxattr */
+ if (local->cont.setxattr.dict)
+ dict_unref(local->cont.setxattr.dict);
+ }
+
+ { /* fsetxattr */
+ if (local->cont.fsetxattr.dict)
+ dict_unref(local->cont.fsetxattr.dict);
+ }
+
+ { /* removexattr */
+ GF_FREE(local->cont.removexattr.name);
+ }
+ { /* xattrop */
+ if (local->cont.xattrop.xattr)
+ dict_unref(local->cont.xattrop.xattr);
+ }
+ { /* symlink */
+ GF_FREE(local->cont.symlink.linkpath);
+ }
+
+ { /* opendir */
+ GF_FREE(local->cont.opendir.checksum);
+ }
+
+ { /* open */
+ if (local->cont.open.fd)
+ fd_unref(local->cont.open.fd);
+ }
+
+ { /* readdirp */
+ if (local->cont.readdir.dict)
+ dict_unref(local->cont.readdir.dict);
+ }
+
+ { /* inodelk */
+ GF_FREE(local->cont.inodelk.volume);
+ if (local->cont.inodelk.xdata)
+ dict_unref(local->cont.inodelk.xdata);
+ }
+
+ { /* entrylk */
+ GF_FREE(local->cont.entrylk.volume);
+ GF_FREE(local->cont.entrylk.basename);
+ if (local->cont.entrylk.xdata)
+ dict_unref(local->cont.entrylk.xdata);
+ }
+
+ if (local->xdata_req)
+ dict_unref(local->xdata_req);
+
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
}
-static gf_boolean_t
-afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this)
+int
+afr_frame_return(call_frame_t *frame)
{
- afr_private_t *priv = NULL;
- gf_boolean_t conflict = _gf_false;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+
+ local = frame->local;
- priv = this->private;
- conflict = afr_conflicting_iattrs (local->cont.lookup.bufs,
- local->cont.lookup.child_success,
- priv->child_count, local->loc.path,
- this->name);
- return conflict;
+ LOCK(&frame->lock);
+ {
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
+
+ return call_count;
}
-static void
-afr_lookup_set_self_heal_data (afr_local_t *local, xlator_t *this)
+static char *afr_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
+
+gf_boolean_t
+afr_is_xattr_ignorable(char *key)
{
- int i = 0;
- struct iatt *bufs = NULL;
- dict_t **xattr = NULL;
- afr_private_t *priv = NULL;
- int32_t child1 = -1;
- int32_t child2 = -1;
+ int i = 0;
- priv = this->private;
- afr_detect_self_heal_by_lookup_status (local, this);
+ if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX)))
+ return _gf_true;
+ for (i = 0; afr_ignore_xattrs[i]; i++) {
+ if (!strcmp(key, afr_ignore_xattrs[i]))
+ return _gf_true;
+ }
+ return _gf_false;
+}
- if (afr_lookup_gfid_missing_count (local, this))
- local->self_heal.need_gfid_self_heal = _gf_true;
+static gf_boolean_t
+afr_xattr_match_needed(dict_t *this, char *key1, data_t *value1, void *data)
+{
+ /* Ignore all non-disk (i.e. virtual) xattrs right away. */
+ if (!gf_is_valid_xattr_namespace(key1))
+ return _gf_false;
- if (_gf_true == afr_lookup_conflicting_entries (local, this))
- local->self_heal.need_missing_entry_self_heal = _gf_true;
- else
- afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req,
- local->cont.lookup.bufs,
- local->cont.lookup.child_success,
- priv->child_count);
+ /* Ignore on-disk xattrs that AFR doesn't need to heal. */
+ if (!afr_is_xattr_ignorable(key1))
+ return _gf_true;
- bufs = local->cont.lookup.bufs;
- for (i = 1; i < local->success_count; i++) {
- child1 = local->cont.lookup.child_success[i-1];
- child2 = local->cont.lookup.child_success[i];;
- afr_detect_self_heal_by_iatt (local, this,
- &bufs[child1], &bufs[child2]);
- }
+ return _gf_false;
+}
- xattr = local->cont.lookup.xattrs;
- for (i = 0; i < local->success_count; i++) {
- child1 = local->cont.lookup.child_success[i];;
- afr_lookup_detect_self_heal_by_xattr (local, this,
- xattr[child1]);
- }
+gf_boolean_t
+afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2)
+{
+ return are_dicts_equal(dict1, dict2, afr_xattr_match_needed, NULL);
}
-int
-afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static int
+afr_get_parent_read_subvol(xlator_t *this, inode_t *parent,
+ struct afr_reply *replies, unsigned char *readable)
{
- afr_local_t *local = NULL;
+ int i = 0;
+ int par_read_subvol = -1;
+ int par_read_subvol_iter = -1;
+ afr_private_t *priv = NULL;
- local = frame->local;
+ priv = this->private;
- if (op_ret == -1) {
- local->op_ret = -1;
- if (afr_error_more_important (local->op_errno, op_errno))
- local->op_errno = op_errno;
+ if (parent)
+ par_read_subvol = afr_data_subvol_get(parent, this, NULL, NULL, NULL,
+ NULL);
- goto out;
- } else {
- local->op_ret = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret < 0)
+ continue;
+
+ if (par_read_subvol_iter == -1) {
+ par_read_subvol_iter = i;
+ continue;
}
- afr_lookup_done_success_action (frame, this, _gf_true);
-out:
- AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->cont.lookup.inode, &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
- return 0;
+ if ((par_read_subvol_iter != par_read_subvol) && readable[i])
+ par_read_subvol_iter = i;
+
+ if (i == par_read_subvol)
+ par_read_subvol_iter = i;
+ }
+ /* At the end of the for-loop, the only reason why @par_read_subvol_iter
+ * could be -1 is when this LOOKUP has failed on all sub-volumes.
+ * So it is okay to send an arbitrary subvolume (0 in this case)
+ * as parent read subvol.
+ */
+ if (par_read_subvol_iter == -1)
+ par_read_subvol_iter = 0;
+
+ return par_read_subvol_iter;
}
-//TODO: At the moment only lookup needs this, so not doing any checks, in the
-// future we will have to do fop specific operations
-void
-afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this)
+int
+afr_read_subvol_decide(inode_t *inode, xlator_t *this,
+ afr_read_subvol_args_t *args, unsigned char *readable)
{
- afr_local_t *local = NULL;
- afr_local_t *sh_local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- struct iatt *lookup_bufs = NULL;
- struct iatt *lookup_parentbufs = NULL;
+ int event = 0;
+ afr_private_t *priv = NULL;
+ unsigned char *intersection = NULL;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- local = sh->orig_frame->local;
- lookup_bufs = local->cont.lookup.bufs;
- lookup_parentbufs = local->cont.lookup.postparents;
- priv = this->private;
+ priv = this->private;
+ intersection = alloca0(priv->child_count);
- memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf));
- memcpy (lookup_parentbufs, sh->parentbufs,
- priv->child_count * sizeof (*sh->parentbufs));
+ afr_readables_intersect_get(inode, this, &event, intersection);
- afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count);
- if (local->cont.lookup.xattr) {
- dict_unref (local->cont.lookup.xattr);
- local->cont.lookup.xattr = NULL;
- }
+ if (AFR_COUNT(intersection, priv->child_count) <= 0) {
+ /* TODO: If we have one brick with valid data_readable and
+ * another with metadata_readable, try to send an iatt with
+ * valid bits from both.*/
+ return -1;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i])
- local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]);
- }
- afr_reset_children (local->cont.lookup.child_success,
- priv->child_count);
- afr_children_copy (local->cont.lookup.child_success,
- sh->fresh_children, priv->child_count);
+ memcpy(readable, intersection, sizeof(*readable) * priv->child_count);
+
+ return afr_read_subvol_select_by_policy(inode, this, intersection, args);
}
-static void
-afr_lookup_perform_self_heal_if_needed (call_frame_t *frame, xlator_t *this,
- gf_boolean_t *sh_launched)
-{
- size_t up_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- GF_ASSERT (sh_launched);
- *sh_launched = _gf_false;
- priv = this->private;
- local = frame->local;
-
- up_count = afr_up_children_count (priv->child_count, local->child_up);
- if (up_count == 1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Only 1 child up - do not attempt to detect self heal");
- goto out;
- }
+static inline int
+afr_first_up_child(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- afr_lookup_set_self_heal_data (local, this);
- if (afr_can_self_heal_proceed (&local->self_heal, priv)) {
- if (afr_is_self_heal_running (local))
- goto out;
+ local = frame->local;
+ priv = this->private;
- afr_launch_self_heal (frame, this, _gf_true,
- local->cont.lookup.buf.ia_type,
- local->cont.lookup.inode,
- afr_post_gfid_sh_success,
- afr_self_heal_lookup_unwind);
- *sh_launched = _gf_true;
- }
-out:
- return;
+ for (i = 0; i < priv->child_count; i++)
+ if (local->replies[i].valid && local->replies[i].op_ret == 0)
+ return i;
+ return -1;
}
-static int
-afr_lookup_set_read_child (afr_local_t *local, xlator_t *this, int32_t read_child)
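+/* Fallback read-subvol selection for lookup when the replies cannot be
+ * interpreted cleanly: honor a recorded split-brain choice if every brick
+ * replied, otherwise fall back to the first up child subject to quorum. */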
+static void
+afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
+ unsigned char *success_replies,
+ unsigned char *data_readable, int *read_subvol)
{
- GF_ASSERT (read_child >= 0);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int spb_subvol = -1;
+ int child_count = -1;
- afr_set_read_child (this, local->cont.lookup.inode, read_child);
- local->cont.lookup.read_child = read_child;
+ if (*read_subvol != -1)
+ return;
- return 0;
+ priv = this->private;
+ local = frame->local;
+ child_count = priv->child_count;
+
+ afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol);
+ if ((spb_subvol >= 0) &&
+ (AFR_COUNT(success_replies, child_count) == child_count)) {
+ *read_subvol = spb_subvol;
+ } else if (!priv->quorum_count ||
+ frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) {
+ *read_subvol = afr_first_up_child(frame, this);
+ } else if (priv->quorum_count &&
+ afr_has_quorum(data_readable, this, NULL)) {
+ /* read_subvol is guaranteed to be valid if we hit this path. */
+ *read_subvol = afr_first_up_child(frame, this);
+ } else {
+ /* If quorum is enabled and we do not have a
+ readable yet, it means all good copies are down.
+ */
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR,
+ "no read "
+ "subvols for %s",
+ local->loc.path);
+ }
+ if (*read_subvol >= 0)
+ dict_del_sizen(local->replies[*read_subvol].xdata, GF_CONTENT_KEY);
}
-int
-afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this,
- gf_boolean_t fail_conflict)
+static void
+afr_lookup_done(call_frame_t *frame, xlator_t *this)
{
- int32_t read_child = -1;
- int32_t ret = -1;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ int op_errno = 0;
+ int read_subvol = 0;
+ int par_read_subvol = 0;
+ int ret = -1;
+ unsigned char *readable = NULL;
+ unsigned char *success_replies = NULL;
+ int event = 0;
+ struct afr_reply *replies = NULL;
+ uuid_t read_gfid = {
+ 0,
+ };
+ gf_boolean_t locked_entry = _gf_false;
+ gf_boolean_t in_flight_create = _gf_false;
+ gf_boolean_t can_interpret = _gf_true;
+ inode_t *parent = NULL;
+ ia_type_t ia_type = IA_INVAL;
+ afr_read_subvol_args_t args = {
+ 0,
+ };
+ char *gfid_heal_msg = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ replies = local->replies;
+ parent = local->loc.parent;
+
+ locked_entry = afr_is_possibly_under_txn(AFR_ENTRY_TRANSACTION, local,
+ this);
+
+ readable = alloca0(priv->child_count);
+ success_replies = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(parent, this, readable, NULL, &event);
+ par_read_subvol = afr_get_parent_read_subvol(this, parent, replies,
+ readable);
+
+ /* First, check if we have a gfid-change from somewhere.
+ If so, propagate that so that a fresh lookup can be
+ issued.
+ */
+ if (local->cont.lookup.needs_fresh_lookup) {
+ local->op_ret = -1;
+ local->op_errno = ESTALE;
+ goto error;
+ }
- local = frame->local;
- priv = this->private;
+ op_errno = afr_final_errno(frame->local, this->private);
+ local->op_errno = op_errno;
- if (local->loc.parent == NULL)
- fail_conflict = _gf_true;
+ read_subvol = -1;
+ afr_fill_success_replies(local, priv, success_replies);
- if (afr_conflicting_iattrs (local->cont.lookup.bufs,
- local->cont.lookup.child_success,
- priv->child_count, local->loc.path,
- this->name)) {
- if (fail_conflict == _gf_false) {
- ret = 0;
- } else {
- local->op_ret = -1;
- local->op_errno = EIO;
- }
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret == -1) {
+ if (locked_entry && replies[i].op_errno == ENOENT) {
+ in_flight_create = _gf_true;
+ }
+ continue;
}
- ret = afr_lookup_select_read_child (local, this, &read_child);
- if (ret) {
- local->op_ret = -1;
- local->op_errno = EIO;
- goto out;
+ if (read_subvol == -1 || !readable[read_subvol]) {
+ read_subvol = i;
+ gf_uuid_copy(read_gfid, replies[i].poststat.ia_gfid);
+ ia_type = replies[i].poststat.ia_type;
+ local->op_ret = 0;
}
+ }
- ret = afr_lookup_set_read_child (local, this, read_child);
+ if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ goto error;
+ }
+
+ if (read_subvol == -1)
+ goto error;
+ /* We now have a read_subvol, which is in readable[] if any
+ readable subvols exist. Next we look for GFID mismatches. We don't
+ consider a GFID mismatch an error if read_subvol is in
+ readable[] but the mismatching-GFID subvol is not.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1) {
+ continue;
+ }
+
+ if (!gf_uuid_compare(replies[i].poststat.ia_gfid, read_gfid))
+ continue;
+
+ can_interpret = _gf_false;
+
+ if (locked_entry)
+ continue;
+
+ /* Now GFIDs mismatch. It's OK as long as this subvol
+ is not readable[] but read_subvol is */
+ if (readable[read_subvol] && !readable[i])
+ continue;
+
+ /* If we were called from glfsheal and there is still a gfid
+ * mismatch, succeed the lookup and let glfsheal print the
+ * response via gfid-heal-msg.*/
+ if (!dict_get_str_sizen(local->xattr_req, "gfid-heal-msg",
+ &gfid_heal_msg))
+ goto cant_interpret;
+
+ /* LOG ERROR */
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto error;
+ }
+
+ /* Fourth, for the finalized GFID, pick the best subvolume
+ to return stats from.
+ */
+ read_subvol = -1;
+ memset(readable, 0, sizeof(*readable) * priv->child_count);
+ if (can_interpret) {
+ if (!afr_has_quorum(success_replies, this, NULL))
+ goto cant_interpret;
+ /* It is safe to call afr_replies_interpret() because we have
+ a response from all the UP subvolumes and all of them resolved
+ to the same GFID
+ */
+ gf_uuid_copy(args.gfid, read_gfid);
+ args.ia_type = ia_type;
+ ret = afr_replies_interpret(frame, this, local->inode, NULL);
+ read_subvol = afr_read_subvol_decide(local->inode, this, &args,
+ readable);
+ if (read_subvol == -1)
+ goto cant_interpret;
if (ret) {
- local->op_ret = -1;
- local->op_errno = EIO;
- goto out;
+ afr_inode_need_refresh_set(local->inode, this);
+ dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY);
}
-
- afr_lookup_build_response_params (local, this);
- if (afr_is_fresh_lookup (&local->loc, this)) {
- afr_update_loc_gfids (&local->loc,
- &local->cont.lookup.buf,
- &local->cont.lookup.postparent);
+ } else {
+ cant_interpret:
+ afr_attempt_readsubvol_set(frame, this, success_replies, readable,
+ &read_subvol);
+ if (read_subvol == -1) {
+ goto error;
}
+ }
- ret = 0;
-out:
- return ret;
-}
-
-static void
-afr_lookup_done (call_frame_t *frame, xlator_t *this)
-{
- int unwind = 1;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- gf_boolean_t sh_launched = _gf_false;
- int gfid_miss_count = 0;
- int enotconn_count = 0;
- int up_children_count = 0;
-
- priv = this->private;
- local = frame->local;
+ afr_handle_quota_size(frame, this);
- if (local->op_ret < 0)
- goto unwind;
- gfid_miss_count = afr_lookup_gfid_missing_count (local, this);
- up_children_count = afr_up_children_count (priv->child_count,
- local->child_up);
- enotconn_count = priv->child_count - up_children_count;
- if ((gfid_miss_count == local->success_count) &&
- (enotconn_count > 0)) {
- local->op_ret = -1;
- local->op_errno = EIO;
- gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, "
- "LOOKUP on a file without gfid is not allowed when "
- "some of the children are down", local->loc.path);
- goto unwind;
- }
-
- ret = afr_lookup_done_success_action (frame, this, _gf_false);
- if (ret)
- goto unwind;
- uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req);
-
- afr_lookup_perform_self_heal_if_needed (frame, this, &sh_launched);
- if (sh_launched) {
- unwind = 0;
- goto unwind;
- }
-
- unwind:
- if (unwind) {
- AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->cont.lookup.inode,
- &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
- }
+ afr_set_need_heal(this, local);
+ if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg_debug(this->name, 0,
+ "Arbiter cannot be a read subvol "
+ "for %s",
+ local->loc.path);
+ goto error;
+ }
+
+ ret = dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg);
+ if (!ret) {
+ ret = dict_set_str_sizen(local->replies[read_subvol].xdata,
+ "gfid-heal-msg", gfid_heal_msg);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "Error setting gfid-heal-msg dict");
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ }
+ }
+
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[par_read_subvol].postparent);
+ return;
+
+error:
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL,
+ NULL, NULL);
}
@@ -1434,1867 +3122,4757 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
/*
* others in that they must be given higher priority while
* returning to the user.
*
- * The hierarchy is ESTALE > ENOENT > others
- *
+ * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC > others
*/
-gf_boolean_t
-afr_error_more_important (int32_t old_errno, int32_t new_errno)
+int
+afr_higher_errno(int32_t old_errno, int32_t new_errno)
{
- gf_boolean_t ret = _gf_true;
-
- /* Nothing should ever overwrite ESTALE */
- if (old_errno == ESTALE)
- ret = _gf_false;
-
- /* Nothing should overwrite ENOENT, except ESTALE */
- else if ((old_errno == ENOENT) && (new_errno != ESTALE))
- ret = _gf_false;
+ if (old_errno == ENODATA || new_errno == ENODATA)
+ return ENODATA;
+ if (old_errno == ENOENT || new_errno == ENOENT)
+ return ENOENT;
+ if (old_errno == ESTALE || new_errno == ESTALE)
+ return ESTALE;
+ if (old_errno == ENOSPC || new_errno == ENOSPC)
+ return ENOSPC;
+
+ return new_errno;
+}
- return ret;
+int
+afr_final_errno(afr_local_t *local, afr_private_t *priv)
+{
+ int i = 0;
+ int op_errno = 0;
+ int tmp_errno = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret >= 0)
+ continue;
+ tmp_errno = local->replies[i].op_errno;
+ op_errno = afr_higher_errno(op_errno, tmp_errno);
+ }
+
+ return op_errno;
}
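
As an aside for reviewers, here is a minimal, self-contained sketch (not part of the patch) of how this errno precedence plays out. higher_errno() below simply mirrors the afr_higher_errno() logic above, and the per-child errno values fed to it are hypothetical:

#include <errno.h>
#include <stdio.h>

static int
higher_errno(int old_errno, int new_errno)
{
        /* Same precedence as afr_higher_errno():
           ENODATA > ENOENT > ESTALE > ENOSPC > others */
        if (old_errno == ENODATA || new_errno == ENODATA)
                return ENODATA;
        if (old_errno == ENOENT || new_errno == ENOENT)
                return ENOENT;
        if (old_errno == ESTALE || new_errno == ESTALE)
                return ESTALE;
        if (old_errno == ENOSPC || new_errno == ENOSPC)
                return ENOSPC;
        return new_errno;
}

int
main(void)
{
        /* Hypothetical per-child errnos, as afr_final_errno() would see them. */
        int errnos[] = { ENOTCONN, ESTALE, ENOENT };
        int final = 0;
        int i;

        for (i = 0; i < 3; i++)
                final = higher_errno(final, errnos[i]);
        printf("final errno: %d (ENOENT outranks ESTALE and ENOTCONN)\n", final);
        return 0;
}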
-int32_t
-afr_resultant_errno_get (int32_t *children,
- int *child_errno, unsigned int child_count)
-{
- int i = 0;
- int32_t op_errno = 0;
- int child = 0;
-
- for (i = 0; i < child_count; i++) {
- if (children) {
- child = children[i];
- if (child == -1)
- break;
- } else {
- child = i;
- }
- if (afr_error_more_important (op_errno, child_errno[child]))
- op_errno = child_errno[child];
- }
- return op_errno;
+static int32_t
+afr_local_discovery_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ int ret = 0;
+ char *pathinfo = NULL;
+ gf_boolean_t is_local = _gf_false;
+ afr_private_t *priv = NULL;
+ int32_t child_index = -1;
+
+ if (op_ret != 0) {
+ goto out;
+ }
+
+ priv = this->private;
+ child_index = (int32_t)(long)cookie;
+
+ ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
+ if (ret != 0) {
+ goto out;
+ }
+
+ ret = glusterfs_is_local_pathinfo(pathinfo, &is_local);
+ if (ret) {
+ goto out;
+ }
+
+ /*
+ * Note that one local subvolume will override another here. The only
+ * way to avoid that would be to retain extra information about whether
+ * the previous read_child is local, and it's just not worth it. Even
+ * the slowest local subvolume is far preferable to a remote one.
+ */
+ if (is_local) {
+ priv->local[child_index] = 1;
+ /* Don't set arbiter as read child. */
+ if (AFR_IS_ARBITER_BRICK(priv, child_index))
+ goto out;
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_LOCAL_CHILD,
+ "selecting local read_child %s",
+ priv->children[child_index]->name);
+
+ priv->read_child = child_index;
+ }
+out:
+ STACK_DESTROY(frame->root);
+ return 0;
}
static void
-afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno)
+afr_attempt_local_discovery(xlator_t *this, int32_t child_index)
{
- GF_ASSERT (local);
- if (op_errno == ENOENT)
- local->enoent_count++;
+ call_frame_t *newframe = NULL;
+ loc_t tmploc = {
+ 0,
+ };
+ afr_private_t *priv = this->private;
+
+ newframe = create_frame(this, this->ctx->pool);
+ if (!newframe) {
+ return;
+ }
- if (afr_error_more_important (local->op_errno, op_errno))
- local->op_errno = op_errno;
- if (local->op_errno == ESTALE) {
- local->op_ret = -1;
- }
+ tmploc.gfid[sizeof(tmploc.gfid) - 1] = 1;
+ STACK_WIND_COOKIE(newframe, afr_local_discovery_cbk,
+ (void *)(long)child_index, priv->children[child_index],
+ priv->children[child_index]->fops->getxattr, &tmploc,
+ GF_XATTR_PATHINFO_KEY, NULL);
}
-static void
-afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this,
- inode_t *inode)
+int
+afr_lookup_sh_metadata_wrap(void *opaque)
{
- afr_private_t *priv = NULL;
- GF_ASSERT (inode);
-
- if (inode->ino != 1)
- goto out;
- if (!afr_is_fresh_lookup (&local->loc, this))
- goto out;
- priv = this->private;
- if ((priv->first_lookup)) {
- gf_log (this->name, GF_LOG_INFO, "added root inode");
- priv->root_inode = inode_ref (inode);
- priv->first_lookup = 0;
- }
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ inode_t *inode = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0, first = -1;
+ int ret = -1;
+ dict_t *dict = NULL;
+
+ local = frame->local;
+ this = frame->this;
+ priv = this->private;
+ replies = local->replies;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ first = i;
+ break;
+ }
+ if (first == -1)
+ goto out;
+
+ if (afr_selfheal_metadata_by_stbuf(this, &replies[first].poststat))
+ goto out;
+
+ afr_local_replies_wipe(local, this->private);
+
+ dict = dict_new();
+ if (!dict)
+ goto out;
+ if (local->xattr_req) {
+ dict_copy(local->xattr_req, dict);
+ }
+
+ ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+ }
+
+ if (loc_is_nameless(&local->loc)) {
+ ret = afr_selfheal_unlocked_discover_on(frame, local->inode,
+ local->loc.gfid, local->replies,
+ local->child_up, dict);
+ } else {
+ inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
+ local->loc.name, local->replies,
+ local->child_up, dict);
+ }
+ if (inode)
+ inode_unref(inode);
out:
- return;
+ if (loc_is_nameless(&local->loc))
+ afr_discover_done(frame, this);
+ else
+ afr_lookup_done(frame, this);
+
+ if (dict)
+ dict_unref(dict);
+
+ return 0;
}
-static void
-afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr,
- struct iatt *buf, struct iatt *postparent)
+gf_boolean_t
+afr_is_pending_set(xlator_t *this, dict_t *xdata, int type)
{
- GF_ASSERT (child_index >= 0);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparents[child_index] = *postparent;
- local->cont.lookup.bufs[child_index] = *buf;
+ int idx = -1;
+ afr_private_t *priv = NULL;
+ void *pending_raw = NULL;
+ int *pending_int = NULL;
+ int i = 0;
+
+ priv = this->private;
+ idx = afr_index_for_transaction_type(type);
+
+ if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw) == 0) {
+ if (pending_raw) {
+ pending_int = pending_raw;
+
+ if (ntoh32(pending_int[idx]))
+ return _gf_true;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+ continue;
+ if (!pending_raw)
+ continue;
+ pending_int = pending_raw;
+
+ if (ntoh32(pending_int[idx]))
+ return _gf_true;
+ }
+
+ return _gf_false;
}
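
For reviewers, a small standalone sketch (not part of the patch) of the decoding that afr_is_pending_set() performs on each pending value. It assumes the usual AFR layout of three 32-bit counters in network byte order (index 0 = data, 1 = metadata, 2 = entry, i.e. what afr_index_for_transaction_type() indexes into); the raw[] blob and its contents are made up for illustration:

#include <arpa/inet.h> /* ntohl, htonl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        /* Hypothetical raw changelog xattr value as it would arrive in xdata:
           one pending metadata operation, nothing else. */
        uint32_t raw[3] = { htonl(0), htonl(1), htonl(0) };
        uint32_t pending[3];

        memcpy(pending, raw, sizeof(pending));
        printf("data=%u metadata=%u entry=%u\n", (unsigned)ntohl(pending[0]),
               (unsigned)ntohl(pending[1]), (unsigned)ntohl(pending[2]));
        /* A non-zero counter at the index for the transaction type means
           "pending", i.e. afr_is_pending_set() would return _gf_true. */
        return 0;
}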
-static void
-afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this,
- inode_t *inode, struct iatt *buf)
+static gf_boolean_t
+afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this)
{
- local->cont.lookup.inode = inode_ref (inode);
- local->cont.lookup.buf = *buf;
- afr_set_root_inode_on_first_lookup (local, this, inode);
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0, first = -1;
+ gf_boolean_t start = _gf_false;
+ struct iatt stbuf = {
+ 0,
+ };
+
+ local = frame->local;
+ replies = local->replies;
+ priv = this->private;
+
+ if (!priv->metadata_self_heal)
+ return _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (first == -1) {
+ first = i;
+ stbuf = replies[i].poststat;
+ continue;
+ }
+
+ if (afr_is_pending_set(this, replies[i].xdata,
+ AFR_METADATA_TRANSACTION)) {
+ /* Let shd do the heal so that lookup is not blocked
+ * on getting metadata lock/doing the heal */
+ start = _gf_false;
+ break;
+ }
+
+ if (gf_uuid_compare(stbuf.ia_gfid, replies[i].poststat.ia_gfid)) {
+ start = _gf_false;
+ break;
+ }
+ if (!IA_EQUAL(stbuf, replies[i].poststat, type)) {
+ start = _gf_false;
+ break;
+ }
+
+ /*Check if iattrs need heal*/
+ if ((!IA_EQUAL(stbuf, replies[i].poststat, uid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, gid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, prot))) {
+ start = _gf_true;
+ continue;
+ }
+
+ /*Check if xattrs need heal*/
+ if (!afr_xattrs_are_equal(replies[first].xdata, replies[i].xdata))
+ start = _gf_true;
+ }
+
+ return start;
}
-static void
-afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- if (local->success_count == 0) {
- if (local->op_errno != ESTALE) {
- local->op_ret = op_ret;
- local->op_errno = 0;
- }
- afr_lookup_handle_first_success (local, this, inode, buf);
- }
- afr_lookup_update_lk_counts (local, this,
- child_index, xattr);
+int
+afr_lookup_metadata_heal_check(call_frame_t *frame, xlator_t *this)
- afr_lookup_cache_args (local, child_index, xattr,
- buf, postparent);
- local->cont.lookup.child_success[local->success_count] = child_index;
- local->success_count++;
+{
+ call_frame_t *heal = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ if (!afr_can_start_metadata_self_heal(frame, this))
+ goto out;
+
+ heal = afr_frame_create(this, &ret);
+ if (!heal) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = synctask_new(this->ctx->env, afr_lookup_sh_metadata_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret)
+ goto out;
+ return ret;
+out:
+ if (loc_is_nameless(&local->loc))
+ afr_discover_done(frame, this);
+ else
+ afr_lookup_done(frame, this);
+ if (heal)
+ AFR_STACK_DESTROY(heal);
+ return ret;
}
int
-afr_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
+afr_lookup_selfheal_wrap(void *opaque)
{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = -1;
+ int ret = 0;
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ inode_t *inode = NULL;
+ uuid_t pargfid = {
+ 0,
+ };
+
+ local = frame->local;
+ this = frame->this;
+ loc_pargfid(&local->loc, pargfid);
+
+ ret = afr_selfheal_name(frame->this, pargfid, local->loc.name,
+ &local->cont.lookup.gfid_req, local->xattr_req);
+ if (ret == -EIO)
+ goto unwind;
+
+ afr_local_replies_wipe(local, this->private);
+
+ inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
+ local->loc.name, local->replies,
+ local->child_up, local->xattr_req);
+ if (inode)
+ inode_unref(inode);
+
+ afr_lookup_metadata_heal_check(frame, this);
+ return 0;
+
+unwind:
+ AFR_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+ return 0;
+}
- child_index = (long) cookie;
+int
+afr_lookup_entry_heal(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *heal = NULL;
+ int i = 0, first = -1;
+ gf_boolean_t name_state_mismatch = _gf_false;
+ struct afr_reply *replies = NULL;
+ int ret = 0;
+ unsigned char *par_readables = NULL;
+ unsigned char *success = NULL;
+ int32_t op_errno = 0;
+ uuid_t gfid = {0};
+
+ local = frame->local;
+ replies = local->replies;
+ priv = this->private;
+ par_readables = alloca0(priv->child_count);
+ success = alloca0(priv->child_count);
+
+ ret = afr_inode_read_subvol_get(local->loc.parent, this, par_readables,
+ NULL, NULL);
+ if (ret < 0 || AFR_COUNT(par_readables, priv->child_count) == 0) {
+ /* In this case set par_readables to all 1 so that the name-heal
+ * checks at the end of this function will flag a missing
+ * entry when the name state mismatches */
+ memset(par_readables, 1, priv->child_count);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret == 0) {
+ if (gf_uuid_is_null(gfid)) {
+ gf_uuid_copy(gfid, replies[i].poststat.ia_gfid);
+ }
+ success[i] = 1;
+ } else {
+ if ((replies[i].op_errno != ENOTCONN) &&
+ (replies[i].op_errno != ENOENT) &&
+ (replies[i].op_errno != ESTALE)) {
+ op_errno = replies[i].op_errno;
+ }
+ }
- LOCK (&frame->lock);
- {
- local = frame->local;
+ /*gfid is missing, needs heal*/
+ if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) {
+ goto name_heal;
+ }
- if (op_ret == -1) {
- afr_lookup_handle_error (local, op_ret, op_errno);
- goto unlock;
- }
- afr_lookup_handle_success (local, this, child_index, op_ret,
- op_errno, inode, buf, xattr,
- postparent);
+ if (first == -1) {
+ first = i;
+ continue;
+ }
- }
-unlock:
- UNLOCK (&frame->lock);
+ if (replies[i].op_ret != replies[first].op_ret) {
+ name_state_mismatch = _gf_true;
+ }
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- afr_lookup_done (frame, this);
+ if (replies[i].op_ret == 0) {
+ /* A rename after this lookup may succeed if we don't do
+ * a name-heal, and the destination may not have pending xattrs
+ * to indicate which name is good and which is bad, so always do
+ * this heal */
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid, gfid)) {
+ goto name_heal;
+ }
}
+ }
- return 0;
+ if (name_state_mismatch) {
+ if (!priv->quorum_count)
+ goto name_heal;
+ if (!afr_has_quorum(success, this, NULL))
+ goto name_heal;
+ if (op_errno)
+ goto name_heal;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (par_readables[i] && replies[i].op_ret < 0 &&
+ replies[i].op_errno != ENOTCONN) {
+ goto name_heal;
+ }
+ }
+ }
+
+ goto metadata_heal;
+
+name_heal:
+ heal = afr_frame_create(this, NULL);
+ if (!heal)
+ goto metadata_heal;
+
+ ret = synctask_new(this->ctx->env, afr_lookup_selfheal_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret) {
+ AFR_STACK_DESTROY(heal);
+ goto metadata_heal;
+ }
+ return ret;
+
+metadata_heal:
+ ret = afr_lookup_metadata_heal_check(frame, this);
+
+ return ret;
}
int
-afr_lookup_cont_init (afr_local_t *local, unsigned int child_count)
+afr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
{
- int ret = -ENOMEM;
- int32_t *child_success = NULL;
- struct iatt *iatts = NULL;
- int i = 0;
- int32_t *sources = NULL;
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = -1;
+ GF_UNUSED int ret = 0;
+ int8_t need_heal = 1;
+
+ child_index = (long)cookie;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ /*
+ * On revalidate lookup if the gfid-changed, afr should unwind the fop
+ * with ESTALE so that a fresh lookup will be sent by the top xlator.
+ * So remember it.
+ */
+ if (xdata && dict_get_sizen(xdata, "gfid-changed"))
+ local->cont.lookup.needs_fresh_lookup = _gf_true;
+
+ if (xdata) {
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ local->replies[child_index].need_heal = need_heal;
+ } else {
+ local->replies[child_index].need_heal = need_heal;
+ }
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+ }
+
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ afr_set_need_heal(this, local);
+ afr_lookup_entry_heal(frame, this);
+ }
+
+ return 0;
+}
- GF_ASSERT (local);
- local->cont.lookup.xattrs = GF_CALLOC (child_count,
- sizeof (*local->cont.lookup.xattr),
- gf_afr_mt_dict_t);
- if (NULL == local->cont.lookup.xattrs)
- goto out;
+static void
+afr_discover_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int read_subvol = -1;
+ int ret = 0;
+ unsigned char *data_readable = NULL;
+ unsigned char *success_replies = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ data_readable = alloca0(priv->child_count);
+ success_replies = alloca0(priv->child_count);
+
+ afr_fill_success_replies(local, priv, success_replies);
+ if (AFR_COUNT(success_replies, priv->child_count) > 0)
+ local->op_ret = 0;
+
+ if (local->op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(frame->local, this->private);
+ goto error;
+ }
- iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt);
- if (NULL == iatts)
- goto out;
- local->cont.lookup.postparents = iatts;
+ if (!afr_has_quorum(success_replies, this, frame))
+ goto unwind;
- iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt);
- if (NULL == iatts)
- goto out;
- local->cont.lookup.bufs = iatts;
+ ret = afr_replies_interpret(frame, this, local->inode, NULL);
+ if (ret) {
+ afr_inode_need_refresh_set(local->inode, this);
+ }
- child_success = GF_CALLOC (child_count, sizeof (*child_success),
- gf_afr_mt_char);
- if (NULL == child_success)
- goto out;
- for (i = 0; i < child_count; i++)
- child_success[i] = -1;
+ read_subvol = afr_read_subvol_decide(local->inode, this, NULL,
+ data_readable);
- local->cont.lookup.child_success = child_success;
+unwind:
+ afr_attempt_readsubvol_set(frame, this, success_replies, data_readable,
+ &read_subvol);
+ if (read_subvol == -1)
+ goto error;
- sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t);
- if (NULL == sources)
- goto out;
+ if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg_debug(this->name, 0,
+ "Arbiter cannot be a read subvol "
+ "for %s",
+ local->loc.path);
+ }
+
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[read_subvol].postparent);
+ return;
+
+error:
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL,
+ NULL, NULL);
+}
+
+static int
+afr_ta_id_file_check(void *opaque)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *this = NULL;
+ loc_t loc = {
+ 0,
+ };
+ struct iatt stbuf = {
+ 0,
+ };
+ dict_t *dict = NULL;
+ uuid_t gfid = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ int ret = 0;
+
+ this = opaque;
+ priv = this->private;
+
+ ret = afr_fill_ta_loc(this, &loc, _gf_false);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ goto out;
+ }
+
+ ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, &stbuf,
+ 0, 0, 0);
+ if (ret == 0) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ fd = fd_create(loc.inode, getpid());
+ if (!fd)
+ goto out;
+ dict = dict_new();
+ if (!dict)
+ goto out;
+ gf_uuid_generate(gfid);
+ ret = dict_set_gfuuid(dict, "gfid-req", gfid, true);
+ ret = syncop_create(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ O_RDWR, 0664, fd, &stbuf, dict, NULL);
+ }
- local->cont.lookup.sources = sources;
- ret = 0;
out:
- return ret;
+ if (ret == 0) {
+ gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to lookup/create thin-arbiter id file.");
+ }
+ if (dict)
+ dict_unref(dict);
+ if (fd)
+ fd_unref(fd);
+ loc_wipe(&loc);
+
+ return 0;
+}
+
+static int
+afr_ta_id_file_check_cbk(int ret, call_frame_t *ta_frame, void *opaque)
+{
+ return 0;
+}
+
+static void
+afr_discover_done(call_frame_t *frame, xlator_t *this)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ if (!priv->thin_arbiter_count)
+ goto unwind;
+ if (!gf_uuid_is_null(priv->ta_gfid))
+ goto unwind;
+
+ ret = synctask_new(this->ctx->env, afr_ta_id_file_check,
+ afr_ta_id_file_check_cbk, NULL, this);
+ if (ret)
+ goto unwind;
+unwind:
+ afr_discover_unwind(frame, this);
}
int
-afr_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
+afr_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- void *gfid_req = NULL;
- int ret = -1;
- int i = 0;
- int call_count = 0;
- uint64_t ctx = 0;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = -1;
+ GF_UNUSED int ret = 0;
+ int8_t need_heal = 1;
+
+ child_index = (long)cookie;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+ }
+
+ if (local->do_discovery && (op_ret == 0))
+ afr_attempt_local_discovery(this, child_index);
+
+ if (xdata) {
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ local->replies[child_index].need_heal = need_heal;
+ } else {
+ local->replies[child_index].need_heal = need_heal;
+ }
+
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ afr_set_need_heal(this, local);
+ afr_lookup_metadata_heal_check(frame, this);
+ }
+
+ return 0;
+}
- priv = this->private;
+int
+afr_discover_do(call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err) {
+ local->op_errno = err;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT(local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req,
+ &local->loc);
+ if (ret) {
+ local->op_errno = -ret;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(
+ frame, afr_discover_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->lookup, &local->loc, local->xattr_req);
+ if (!--call_count)
+ break;
+ }
+ }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ return 0;
+out:
+ AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
+}
- local->op_ret = -1;
+int
+afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int event = 0;
- frame->local = local;
+ priv = this->private;
- if (!strcmp (loc->path, "/" GF_REPLICATE_TRASH_DIR)) {
- op_errno = ENOENT;
- goto out;
- }
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- loc_copy (&local->loc, loc);
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- ret = inode_ctx_get (loc->inode, this, &ctx);
- if (ret == 0) {
- /* lookup is a revalidate */
+ if (__is_root_gfid(loc->inode->gfid)) {
+ if (!priv->root_inode)
+ priv->root_inode = inode_ref(loc->inode);
- local->read_child_index = afr_read_child (this,
- loc->inode);
- } else {
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ if (priv->choose_local && !priv->did_discovery) {
+ /* Logic to detect which subvolumes of AFR are
+ local, in order to prefer them for reads
+ */
+ local->do_discovery = _gf_true;
+ priv->did_discovery = _gf_true;
}
+ }
- if (loc->parent)
- local->cont.lookup.parent_ino = loc->parent->ino;
+ local->op = GF_FOP_LOOKUP;
- local->child_up = memdup (priv->child_up, priv->child_count);
- if (NULL == local->child_up) {
- op_errno = ENOMEM;
- goto out;
- }
+ loc_copy(&local->loc, loc);
- ret = afr_lookup_cont_init (local, priv->child_count);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->inode = inode_ref(loc->inode);
- local->call_count = afr_up_children_count (priv->child_count,
- local->child_up);
- call_count = local->call_count;
- if (local->call_count == 0) {
- ret = -1;
- op_errno = ENOTCONN;
- goto out;
+ if (xattr_req) {
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_copy_with_ref(xattr_req, NULL);
+ if (!local->xattr_req) {
+ op_errno = ENOMEM;
+ goto out;
}
+ }
- /* By default assume ENOTCONN. On success it will be set to 0. */
- local->op_errno = ENOTCONN;
+ if (gf_uuid_is_null(loc->inode->gfid)) {
+ afr_discover_do(frame, this, 0);
+ return 0;
+ }
- afr_lookup_xattr_req_prepare (local, this, xattr_req, loc, &gfid_req);
- local->call_count = afr_up_children_count (priv->child_count,
- local->child_up);
- afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, loc);
- local->fop = GF_FOP_LOOKUP;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- loc, local->xattr_req);
- if (!--call_count)
- break;
- }
- }
+ afr_read_subvol_get(loc->inode, this, NULL, NULL, &event,
+ AFR_DATA_TRANSACTION, NULL);
- ret = 0;
-out:
- if (ret == -1)
- AFR_STACK_UNWIND (lookup, frame, -1, op_errno,
- NULL, NULL, NULL, NULL);
+ afr_discover_do(frame, this, 0);
- return 0;
+ return 0;
+out:
+ AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
}
+int
+afr_lookup_do(call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err < 0) {
+ local->op_errno = err;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT(local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req,
+ &local->loc);
+ if (ret) {
+ local->op_errno = -ret;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(
+ frame, afr_lookup_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->lookup, &local->loc, local->xattr_req);
+ if (!--call_count)
+ break;
+ }
+ }
+ return 0;
+out:
+ AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
+}
-/* {{{ open */
+/*
+ * afr_lookup()
+ *
+ * The goal here is to figure out what the element getting looked up is.
+ * i.e., what the GFID and inode type are, and a conservative estimate
+ * of the inode attributes.
+ *
+ * As we lookup, operations may be underway on the entry name and the
+ * inode. In lookup() we are primarily concerned only with the entry
+ * operations. If the entry is getting unlinked or renamed, we detect
+ * what operation is underway by querying for on-going transactions and
+ * pending self-healing on the entry through xdata.
+ *
+ * If the entry is a file/dir, it may need self-heal and/or be in a
+ * split-brain condition. Lookup is not the place to worry about these
+ * conditions. Outcast marking will naturally handle them in the read
+ * paths.
+ *
+ * Here is a brief summary of what we are trying to achieve:
+ *
+ * - LOOKUP on all subvolumes concurrently, querying on-going transaction
+ * and pending self-heal info from the servers.
+ *
+ * - If all servers reply the same inode type and GFID, the overall call
+ * MUST be a success.
+ *
+ * - If inode types or GFIDs mismatch, and there IS either an on-going
+ * transaction or pending self-heal, inspect what the nature of the
+ * transaction or pending heal is, and select the appropriate subvolume's
+ * reply as the winner.
+ *
+ * - If inode types or GFIDs mismatch, and there are no on-going transactions
+ * or pending self-heal on the entry name on any of the servers, fail the
+ * lookup with EIO. Something has gone wrong beyond reasonable action.
+ */
int
-afr_fd_ctx_set (xlator_t *this, fd_t *fd)
+afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
{
- afr_private_t * priv = NULL;
- int ret = -1;
- uint64_t ctx = 0;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (fd, out);
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int event = 0;
+ int ret = 0;
- priv = this->private;
+ if (loc_is_nameless(loc)) {
+ if (xattr_req)
+ dict_del_sizen(xattr_req, "gfid-req");
+ afr_discover(frame, this, loc, xattr_req);
+ return 0;
+ }
- LOCK (&fd->lock);
- {
- ret = __fd_ctx_get (fd, this, &ctx);
+ if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name,
+ frame->root->pid)) {
+ op_errno = EPERM;
+ goto out;
+ }
- if (ret == 0)
- goto unlock;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t),
- gf_afr_mt_afr_fd_ctx_t);
- if (!fd_ctx) {
- ret = -ENOMEM;
- goto unlock;
- }
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->pre_op_done) {
- ret = -ENOMEM;
- goto unlock;
- }
+ local->op = GF_FOP_LOOKUP;
- fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->pre_op_piggyback) {
- ret = -ENOMEM;
- goto unlock;
- }
+ loc_copy(&local->loc, loc);
- fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->opened_on) {
- ret = -ENOMEM;
- goto unlock;
- }
+ local->inode = inode_ref(loc->inode);
- fd_ctx->up_count = priv->up_count;
- fd_ctx->down_count = priv->down_count;
+ if (xattr_req) {
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_copy_with_ref(xattr_req, NULL);
+ if (!local->xattr_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ ret = dict_get_gfuuid(local->xattr_req, "gfid-req",
+ &local->cont.lookup.gfid_req);
+ if (ret == 0) {
+ dict_del_sizen(local->xattr_req, "gfid-req");
+ }
+ }
- fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->locked_on) {
- ret = -ENOMEM;
- goto unlock;
- }
+ afr_read_subvol_get(loc->parent, this, NULL, NULL, &event,
+ AFR_DATA_TRANSACTION, NULL);
- ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set fd ctx (%p)", fd);
+ afr_lookup_do(frame, this, 0);
- INIT_LIST_HEAD (&fd_ctx->entries);
- }
-unlock:
- UNLOCK (&fd->lock);
+ return 0;
out:
- return ret;
+ AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+ return 0;
}
-/* {{{ flush */
+void
+_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx)
+{
+ afr_private_t *priv = this->private;
+
+ if (fd_ctx->lk_heal_info) {
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info);
+ fd_ctx->lk_heal_info = NULL;
+ }
+ GF_FREE(fd_ctx->opened_on);
+ GF_FREE(fd_ctx);
+ return;
+}
int
-afr_flush_unwind (call_frame_t *frame, xlator_t *this)
+afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ uint64_t ctx = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = 0;
- local = frame->local;
+ ret = fd_ctx_get(fd, this, &ctx);
+ if (ret < 0)
+ goto out;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ fd_ctx = (afr_fd_ctx_t *)(long)ctx;
- if (main_frame) {
- AFR_STACK_UNWIND (flush, main_frame,
- local->op_ret, local->op_errno);
- }
+ if (fd_ctx) {
+ _afr_cleanup_fd_ctx(this, fd_ctx);
+ }
- return 0;
+out:
+ return 0;
}
-
int
-afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_release(xlator_t *this, fd_t *fd)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
- int need_unwind = 0;
+ afr_cleanup_fd_ctx(this, fd);
- local = frame->local;
- priv = this->private;
+ return 0;
+}
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
+afr_fd_ctx_t *
+__afr_fd_ctx_get(fd_t *fd, xlator_t *this)
+{
+ uint64_t ctx = 0;
+ int ret = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ ret = __fd_ctx_get(fd, this, &ctx);
- if (need_unwind)
- afr_flush_unwind (frame, this);
+ if (ret < 0) {
+ ret = __afr_fd_ctx_set(this, fd);
+ if (ret < 0)
+ goto out;
- call_count = afr_frame_return (frame);
+ ret = __fd_ctx_get(fd, this, &ctx);
+ if (ret < 0)
+ goto out;
+ }
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ fd_ctx = (afr_fd_ctx_t *)(long)ctx;
+out:
+ return fd_ctx;
+}
- return 0;
+afr_fd_ctx_t *
+afr_fd_ctx_get(fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(fd, this);
+ }
+ UNLOCK(&fd->lock);
+
+ return fd_ctx;
}
+int
+__afr_fd_ctx_set(xlator_t *this, fd_t *fd)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ uint64_t ctx = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int i = 0;
+
+ VALIDATE_OR_GOTO(this->private, out);
+ VALIDATE_OR_GOTO(fd, out);
+
+ priv = this->private;
+
+ ret = __fd_ctx_get(fd, this, &ctx);
+
+ if (ret == 0)
+ goto out;
+
+ fd_ctx = GF_CALLOC(1, sizeof(afr_fd_ctx_t), gf_afr_mt_afr_fd_ctx_t);
+ if (!fd_ctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ fd_ctx->opened_on = GF_CALLOC(sizeof(*fd_ctx->opened_on), priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!fd_ctx->opened_on) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_is_anonymous(fd))
+ fd_ctx->opened_on[i] = AFR_FD_OPENED;
+ else
+ fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
+ }
+
+ fd_ctx->readdir_subvol = -1;
+ fd_ctx->lk_heal_info = NULL;
+
+ ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx);
+ if (ret)
+ gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd);
+out:
+ if (ret && fd_ctx)
+ _afr_cleanup_fd_ctx(this, fd_ctx);
+ return ret;
+}
+
+/* {{{ flush */
int
-afr_flush_wind (call_frame_t *frame, xlator_t *this)
+afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = -1;
+ afr_local_t *local = NULL;
+ int call_count = -1;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ LOCK(&frame->lock);
+ {
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ } else {
+ local->op_errno = op_errno;
+ }
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
+ return 0;
+}
+
+static int
+afr_flush_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+
+ priv = this->private;
+ local = frame->local;
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, afr_flush_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->flush,
+ local->fd, xdata);
+ if (!--call_count)
+ break;
}
+ }
- local->call_count = call_count;
+ return 0;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_flush_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- local->fd);
-
- if (!--call_count)
- break;
- }
+afr_local_t *
+afr_wakeup_same_fd_delayed_op(xlator_t *this, afr_lock_t *lock, fd_t *fd)
+{
+ afr_local_t *local = NULL;
+
+ if (lock->delay_timer) {
+ local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ if (fd == local->fd) {
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+ local = NULL;
+ } else {
+ lock->delay_timer = NULL;
+ }
+ } else {
+ local = NULL;
}
+ }
- return 0;
+ return local;
}
+void
+afr_delayed_changelog_wake_resume(xlator_t *this, inode_t *inode,
+ call_stub_t *stub)
+{
+ afr_inode_ctx_t *ctx = NULL;
+ afr_lock_t *lock = NULL;
+ afr_local_t *metadata_local = NULL;
+ afr_local_t *data_local = NULL;
+ LOCK(&inode->lock);
+ {
+ (void)__afr_inode_ctx_get(this, inode, &ctx);
+ lock = &ctx->lock[AFR_DATA_TRANSACTION];
+ data_local = afr_wakeup_same_fd_delayed_op(this, lock, stub->args.fd);
+ lock = &ctx->lock[AFR_METADATA_TRANSACTION];
+ metadata_local = afr_wakeup_same_fd_delayed_op(this, lock,
+ stub->args.fd);
+ }
+ UNLOCK(&inode->lock);
+
+ if (data_local) {
+ data_local->transaction.resume_stub = stub;
+ } else if (metadata_local) {
+ metadata_local->transaction.resume_stub = stub;
+ } else {
+ call_resume(stub);
+ }
+ if (data_local) {
+ afr_delayed_changelog_wake_up_cbk(data_local);
+ }
+ if (metadata_local) {
+ afr_delayed_changelog_wake_up_cbk(metadata_local);
+ }
+}
int
-afr_flush_done (call_frame_t *frame, xlator_t *this)
+afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int op_errno = ENOMEM;
- local = frame->local;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local->transaction.unwind (frame, this);
+ local->op = GF_FOP_FLUSH;
+ if (!afr_is_consistent_io_possible(local, this->private, &op_errno))
+ goto out;
- AFR_STACK_DESTROY (frame);
+ local->fd = fd_ref(fd);
- return 0;
-}
+ stub = fop_flush_stub(frame, afr_flush_wrapper, fd, xdata);
+ if (!stub)
+ goto out;
+ afr_delayed_changelog_wake_resume(this, fd->inode, stub);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+ return 0;
+}
int
-afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
- int call_count = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
+ afr_local_t *local = NULL;
+ int call_count = -1;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local = frame->local;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ LOCK(&frame->lock);
+ {
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ } else {
+ local->op_errno = op_errno;
}
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ if (call_count == 0)
+ AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
- goto out;
+ return 0;
+}
+
+int
+afr_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_FSYNCDIR;
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+
+ call_count = local->call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND(frame, afr_fsyncdir_cbk, priv->children[i],
+ priv->children[i]->fops->fsyncdir, fd, datasync, xdata);
+ if (!--call_count)
+ break;
}
+ }
- transaction_frame->local = local;
+ return 0;
+out:
+ AFR_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL);
- local->op = GF_FOP_FLUSH;
+ return 0;
+}
- local->transaction.fop = afr_flush_wind;
- local->transaction.done = afr_flush_done;
- local->transaction.unwind = afr_flush_unwind;
+/* }}} */
- local->fd = fd_ref (fd);
+static int
+afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this);
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+static gf_boolean_t
+afr_is_conflicting_lock_present(int32_t op_ret, int32_t op_errno)
+{
+ if (op_ret == -1 && op_errno == EAGAIN)
+ return _gf_true;
+ return _gf_false;
+}
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+static void
+afr_fop_lock_unwind(call_frame_t *frame, glusterfs_fop_t op, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ switch (op) {
+ case GF_FOP_INODELK:
+ AFR_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata);
+ break;
+ case GF_FOP_FINODELK:
+ AFR_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata);
+ break;
+ case GF_FOP_ENTRYLK:
+ AFR_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata);
+ break;
+ case GF_FOP_FENTRYLK:
+ AFR_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, xdata);
+ break;
+ default:
+ break;
+ }
+}
+static void
+afr_fop_lock_wind(call_frame_t *frame, xlator_t *this, int child_index,
+ int32_t (*lock_cbk)(call_frame_t *, void *, xlator_t *,
+ int32_t, int32_t, dict_t *))
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = child_index;
+
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->inodelk,
+ (const char *)local->cont.inodelk.volume, &local->loc,
+ local->cont.inodelk.cmd, &local->cont.inodelk.flock,
+ local->cont.inodelk.xdata);
+ break;
+ case GF_FOP_FINODELK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->finodelk,
+ (const char *)local->cont.inodelk.volume, local->fd,
+ local->cont.inodelk.cmd, &local->cont.inodelk.flock,
+ local->cont.inodelk.xdata);
+ break;
+ case GF_FOP_ENTRYLK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->entrylk, local->cont.entrylk.volume,
+ &local->loc, local->cont.entrylk.basename,
+ local->cont.entrylk.cmd, local->cont.entrylk.type,
+ local->cont.entrylk.xdata);
+ break;
+ case GF_FOP_FENTRYLK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->fentrylk, local->cont.entrylk.volume,
+ local->fd, local->cont.entrylk.basename,
+ local->cont.entrylk.cmd, local->cont.entrylk.type,
+ local->cont.entrylk.xdata);
+ break;
+ default:
+ break;
+ }
+}
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+void
+afr_fop_lock_proceed(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- AFR_STACK_UNWIND (flush, frame, op_ret, op_errno);
- }
+ local = frame->local;
+ priv = frame->this->private;
- return 0;
+ if (local->fop_lock_state != AFR_FOP_LOCK_PARALLEL) {
+ afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return;
+ }
+ /* At least one child is up */
+ /*
+ * Non-blocking locks also need to be serialized. Otherwise there is
+ * a chance that both mounts, having issued the same non-blocking
+ * inodelk, may end up not acquiring the lock on any brick.
+ * Ex: Mount1 and Mount2 both request a full-length lock on file f1.
+ * Mount1's afr may acquire the partial lock on brick-1 but fail to
+ * acquire it on brick-2 because Mount2 already got the lock on
+ * brick-2, and vice versa. Since both mounts only got partial locks,
+ * afr treats them as failures to gain the locks and unwinds with
+ * EAGAIN.
+ */
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+ local->fop_lock_state = AFR_FOP_LOCK_SERIAL;
+ afr_local_replies_wipe(local, priv);
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.cmd = local->cont.inodelk.in_cmd;
+ local->cont.inodelk.flock = local->cont.inodelk.in_flock;
+ if (local->cont.inodelk.xdata)
+ dict_unref(local->cont.inodelk.xdata);
+ local->cont.inodelk.xdata = NULL;
+ if (local->xdata_req)
+ local->cont.inodelk.xdata = dict_ref(local->xdata_req);
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.cmd = local->cont.entrylk.in_cmd;
+ if (local->cont.entrylk.xdata)
+ dict_unref(local->cont.entrylk.xdata);
+ local->cont.entrylk.xdata = NULL;
+ if (local->xdata_req)
+ local->cont.entrylk.xdata = dict_ref(local->xdata_req);
+ break;
+ default:
+ break;
+ }
+ afr_serialized_lock_wind(frame, frame->this);
}
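
To make the comment above concrete, here is a small standalone simulation (not part of the patch; try_lock() and brick_owner[] are made-up names) of the parallel non-blocking case in which both mounts end up with only partial locks and must unwind with EAGAIN, which is exactly what the serialized retry avoids:

#include <stdbool.h>
#include <stdio.h>

static int brick_owner[2]; /* 0 = unlocked, otherwise the owning client id */

static bool
try_lock(int client, int brick)
{
        /* Non-blocking try-lock: fail immediately if someone else holds it. */
        if (brick_owner[brick] != 0)
                return false;
        brick_owner[brick] = client;
        return true;
}

int
main(void)
{
        /* Parallel attempt: each mount wins the brick it reaches first. */
        bool m1_b1 = try_lock(1, 0); /* Mount1 locks brick-1 */
        bool m2_b2 = try_lock(2, 1); /* Mount2 locks brick-2 */
        bool m1_b2 = try_lock(1, 1); /* fails: Mount2 holds brick-2 */
        bool m2_b1 = try_lock(2, 0); /* fails: Mount1 holds brick-1 */

        printf("Mount1 full lock: %d, Mount2 full lock: %d\n",
               m1_b1 && m1_b2, m2_b1 && m2_b2); /* prints 0, 0 -> both EAGAIN */
        return 0;
}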
-/* }}} */
-
+static int32_t
+afr_unlock_partial_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
-int
-afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
{
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int child_index = (long)cookie;
+ uuid_t gfid = {0};
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0)
- goto out;
+ local = frame->local;
+ priv = this->private;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ if (op_ret < 0 && op_errno != ENOTCONN) {
+ if (local->fd)
+ gf_uuid_copy(gfid, local->fd->inode->gfid);
+ else
+ loc_gfid(&local->loc, gfid);
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "%s: Failed to unlock %s on %s "
+ "with lk_owner: %s",
+ uuid_utoa(gfid), gf_fop_list[local->op],
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
+
+ call_count = afr_frame_return(frame);
+ if (call_count == 0)
+ afr_fop_lock_proceed(frame);
+
+ return 0;
+}
- if (fd_ctx) {
- if (fd_ctx->pre_op_done)
- GF_FREE (fd_ctx->pre_op_done);
+static int32_t
+afr_unlock_locks_and_proceed(call_frame_t *frame, xlator_t *this,
+ int call_count)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ if (call_count == 0) {
+ afr_fop_lock_proceed(frame);
+ goto out;
+ }
+
+ local = frame->local;
+ priv = this->private;
+ local->call_count = call_count;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.flock.l_type = F_UNLCK;
+ local->cont.inodelk.cmd = F_SETLK;
+ if (local->cont.inodelk.xdata)
+ dict_unref(local->cont.inodelk.xdata);
+ local->cont.inodelk.xdata = NULL;
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.cmd = ENTRYLK_UNLOCK;
+ if (local->cont.entrylk.xdata)
+ dict_unref(local->cont.entrylk.xdata);
+ local->cont.entrylk.xdata = NULL;
+ break;
+ default:
+ break;
+ }
- if (fd_ctx->opened_on)
- GF_FREE (fd_ctx->opened_on);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
- if (fd_ctx->locked_on)
- GF_FREE (fd_ctx->locked_on);
+ if (local->replies[i].op_ret == -1)
+ continue;
- if (fd_ctx->pre_op_piggyback)
- GF_FREE (fd_ctx->pre_op_piggyback);
+ afr_fop_lock_wind(frame, this, i, afr_unlock_partial_lock_cbk);
- GF_FREE (fd_ctx);
- }
+ if (!--call_count)
+ break;
+ }
out:
- return 0;
+ return 0;
}
-
-int
-afr_release (xlator_t *this, fd_t *fd)
+int32_t
+afr_fop_lock_done(call_frame_t *frame, xlator_t *this)
{
- afr_locked_fd_t *locked_fd = NULL;
- afr_locked_fd_t *tmp = NULL;
- afr_private_t *priv = NULL;
+ int i = 0;
+ int lock_count = 0;
+ unsigned char *success = NULL;
- priv = this->private;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- afr_cleanup_fd_ctx (this, fd);
+ local = frame->local;
+ priv = this->private;
+ success = alloca0(priv->child_count);
- list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds,
- list) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
- if (locked_fd->fd == fd) {
- list_del_init (&locked_fd->list);
- GF_FREE (locked_fd);
- }
+ if (local->replies[i].op_ret == 0) {
+ lock_count++;
+ success[i] = 1;
+ }
+ if (local->op_ret == -1 && local->op_errno == EAGAIN)
+ continue;
+
+ if ((local->replies[i].op_ret == -1) &&
+ (local->replies[i].op_errno == EAGAIN)) {
+ local->op_ret = -1;
+ local->op_errno = EAGAIN;
+ continue;
}
- return 0;
-}
+ if (local->replies[i].op_ret == 0)
+ local->op_ret = 0;
+ local->op_errno = local->replies[i].op_errno;
+ }
-/* {{{ fsync */
+ if (afr_fop_lock_is_unlock(frame))
+ goto unwind;
-int
-afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- afr_local_t *local = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
- int read_child = 0;
+ if (afr_is_conflicting_lock_present(local->op_ret, local->op_errno)) {
+ afr_unlock_locks_and_proceed(frame, this, lock_count);
+ } else if (priv->quorum_count && !afr_has_quorum(success, this, NULL)) {
+ local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED;
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ if (local->op_errno == 0)
+ local->op_errno = afr_quorum_errno(priv);
+ afr_unlock_locks_and_proceed(frame, this, lock_count);
+ } else {
+ goto unwind;
+ }
+
+ return 0;
+unwind:
+ afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
- local = frame->local;
+static int
+afr_common_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
- read_child = afr_read_child (this, local->fd->inode);
+ local = frame->local;
- LOCK (&frame->lock);
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret == 0 && xdata) {
+ local->replies[child_index].xdata = dict_ref(xdata);
+ LOCK(&frame->lock);
{
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ if (!local->xdata_rsp)
+ local->xdata_rsp = dict_ref(xdata);
+ }
+ UNLOCK(&frame->lock);
+ }
+ return 0;
+}
- if (op_ret == 0) {
- local->op_ret = 0;
+static int32_t
+afr_serialized_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
- if (local->success_count == 0) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
- }
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = (long)cookie;
+ int next_child = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+
+ for (next_child = child_index + 1; next_child < priv->child_count;
+ next_child++) {
+ if (local->child_up[next_child])
+ break;
+ }
+
+ if (afr_is_conflicting_lock_present(op_ret, op_errno) ||
+ (next_child == priv->child_count)) {
+ afr_fop_lock_done(frame, this);
+ } else {
+ afr_fop_lock_wind(frame, this, next_child, afr_serialized_lock_cbk);
+ }
+
+ return 0;
+}
- if (child_index == read_child) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
- }
+static int
+afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- local->success_count++;
- }
+ priv = this->private;
+ local = frame->local;
- local->op_errno = op_errno;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ afr_fop_lock_wind(frame, this, i, afr_serialized_lock_cbk);
+ break;
}
- UNLOCK (&frame->lock);
+ }
+ return 0;
+}
- call_count = afr_frame_return (frame);
+static int32_t
+afr_parallel_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
- if (call_count == 0) {
- local->cont.fsync.prebuf.ia_ino = local->cont.fsync.ino;
- local->cont.fsync.postbuf.ia_ino = local->cont.fsync.ino;
+{
+ int call_count = 0;
- AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno,
- &local->cont.fsync.prebuf,
- &local->cont.fsync.postbuf);
- }
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
- return 0;
-}
+ call_count = afr_frame_return(frame);
+ if (call_count == 0)
+ afr_fop_lock_done(frame, this);
+ return 0;
+}
-int
-afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync)
+static int
+afr_parallel_lock_wind(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+ afr_fop_lock_wind(frame, this, i, afr_parallel_lock_cbk);
+ if (!--call_count)
+ break;
+ }
+ return 0;
+}
- priv = this->private;
+static int
+afr_fop_handle_lock(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = frame->local;
+ int op_errno = 0;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ if (!afr_fop_lock_is_unlock(frame)) {
+ if (!afr_is_consistent_io_possible(local, this->private, &op_errno))
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.cmd = F_SETLK;
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.cmd = ENTRYLK_LOCK_NB;
+ break;
+ default:
+ break;
}
+ }
- call_count = local->call_count;
- frame->local = local;
-
- local->fd = fd_ref (fd);
- local->cont.fsync.ino = fd->inode->ino;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_fsync_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fsync,
- fd, datasync);
- if (!--call_count)
- break;
- }
+ if (local->xdata_req) {
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.xdata = dict_ref(local->xdata_req);
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.xdata = dict_ref(local->xdata_req);
+ break;
+ default:
+ break;
}
+ }
- op_ret = 0;
+ local->fop_lock_state = AFR_FOP_LOCK_PARALLEL;
+ afr_parallel_lock_wind(frame, this);
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL);
- }
- return 0;
+ return -op_errno;
}
-/* }}} */
+static int32_t
+afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = ENOMEM;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = fop;
+ if (loc)
+ loc_copy(&local->loc, loc);
+ if (fd && (flock->l_type != F_UNLCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local->fd = fd_ref(fd);
+ }
+
+ local->cont.inodelk.volume = gf_strdup(volume);
+ if (!local->cont.inodelk.volume) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->cont.inodelk.in_cmd = cmd;
+ local->cont.inodelk.cmd = cmd;
+ local->cont.inodelk.in_flock = *flock;
+ local->cont.inodelk.flock = *flock;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ op_errno = -afr_fop_handle_lock(frame, frame->this);
+ if (op_errno)
+ goto out;
+ return 0;
+out:
+ afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL);
-/* {{{ fsync */
+ return 0;
+}
int32_t
-afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret,
- local->op_errno);
+ afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd,
+ flock, xdata);
+ return 0;
+}
- return 0;
+int32_t
+afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd,
+ flock, xdata);
+ return 0;
}
+static int
+afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = ENOMEM;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = fop;
+ if (loc)
+ loc_copy(&local->loc, loc);
+ if (fd && (cmd != ENTRYLK_UNLOCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local->fd = fd_ref(fd);
+ }
+ local->cont.entrylk.cmd = cmd;
+ local->cont.entrylk.in_cmd = cmd;
+ local->cont.entrylk.type = type;
+ local->cont.entrylk.volume = gf_strdup(volume);
+ local->cont.entrylk.basename = gf_strdup(basename);
+ if (!local->cont.entrylk.volume || !local->cont.entrylk.basename) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+ op_errno = -afr_fop_handle_lock(frame, frame->this);
+ if (op_errno)
+ goto out;
+
+ return 0;
+out:
+ afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL);
+ return 0;
+}
-int32_t
-afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync)
+int
+afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename,
+ cmd, type, xdata);
+ return 0;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+int
+afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename,
+ cmd, type, xdata);
+ return 0;
+}
- priv = this->private;
+int
+afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ struct statvfs *buf = NULL;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local = frame->local;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ LOCK(&frame->lock);
+ {
+ if (op_ret != 0) {
+ local->op_errno = op_errno;
+ goto unlock;
}
- call_count = local->call_count;
- frame->local = local;
+ local->op_ret = op_ret;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fsyncdir_cbk,
- priv->children[i],
- priv->children[i]->fops->fsyncdir,
- fd, datasync);
- if (!--call_count)
- break;
+ buf = &local->cont.statfs.buf;
+ if (local->cont.statfs.buf_set) {
+ if (statvfs->f_bavail < buf->f_bavail) {
+ *buf = *statvfs;
+ if (xdata) {
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = dict_ref(xdata);
}
+ }
+ } else {
+ *buf = *statvfs;
+ local->cont.statfs.buf_set = 1;
+ if (xdata)
+ local->xdata_rsp = dict_ref(xdata);
}
+ }
+unlock:
+ call_count = --local->call_count;
+ UNLOCK(&frame->lock);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno);
- }
- return 0;
-}
-
-/* }}} */
+ if (call_count == 0)
+ AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
+ &local->cont.statfs.buf, local->xdata_rsp);
-/* {{{ xattrop */
+ return 0;
+}
-int32_t
-afr_xattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr)
+int
+afr_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_STATFS;
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+
+ if (priv->arbiter_count == 1 && local->child_up[ARBITER_BRICK_INDEX])
+ local->call_count--;
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ if (AFR_IS_ARBITER_BRICK(priv, i))
+ continue;
+ STACK_WIND(frame, afr_statfs_cbk, priv->children[i],
+ priv->children[i]->fops->statfs, loc, xdata);
+ if (!--call_count)
+ break;
}
- UNLOCK (&frame->lock);
+ }
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno,
- xattr);
+ return 0;
+out:
+ AFR_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
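afr_statfs_cbk above keeps whichever reply reports the least available space, so the aggregated view never overstates free capacity. A standalone sketch of that reduction (plain C, hypothetical sample values, not the xlator code itself):

#include <stdio.h>
#include <sys/statvfs.h>

/* Keep the reply with the smallest f_bavail, as afr_statfs_cbk does, so
 * the aggregate never reports more space than the fullest brick has. */
static struct statvfs aggregate_statfs(const struct statvfs *replies, int n) {
    struct statvfs out = replies[0];
    int i;

    for (i = 1; i < n; i++)
        if (replies[i].f_bavail < out.f_bavail)
            out = replies[i];
    return out;
}

int main(void) {
    struct statvfs r[2] = {{0}, {0}};
    r[0].f_bavail = 1000;
    r[1].f_bavail = 250;
    printf("aggregated f_bavail = %lu\n",
           (unsigned long)aggregate_statfs(r, 2).f_bavail);
    return 0;
}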
+int32_t
+afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ int call_count = -1;
+ int child_index = (long)cookie;
+
+ local = frame->local;
+
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "gfid=%s: unlock failed on subvolume %s "
+ "with lock owner %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
+
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
+ local->xdata_rsp);
+ }
+
+ return 0;
+}
int32_t
-afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t optype, dict_t *xattr)
+afr_lk_unlock(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = frame->local;
+ priv = this->private;
- priv = this->private;
+ call_count = afr_locked_nodes_count(local->cont.lk.locked_nodes,
+ priv->child_count);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ if (call_count == 0) {
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
+ local->xdata_rsp);
+ return 0;
+ }
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->call_count = call_count;
- call_count = local->call_count;
- frame->local = local;
+ local->cont.lk.user_flock.l_type = F_UNLCK;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_xattrop_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- loc, optype, xattr);
- if (!--call_count)
- break;
- }
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.locked_nodes[i]) {
+ STACK_WIND_COOKIE(frame, afr_lk_unlock_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->lk,
+ local->fd, F_SETLK, &local->cont.lk.user_flock,
+ NULL);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (xattrop, frame, op_ret, op_errno, NULL);
+ if (!--call_count)
+ break;
}
- return 0;
-}
-
-/* }}} */
+ }
-/* {{{ fxattrop */
+ return 0;
+}
int32_t
-afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr)
+afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = -1;
- int call_count = -1;
+ local = frame->local;
+ priv = this->private;
- local = frame->local;
+ child_index = (long)cookie;
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret < 0 && op_errno == EAGAIN) {
+ local->op_ret = -1;
+ local->op_errno = EAGAIN;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ afr_lk_unlock(frame, this);
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ local->cont.lk.ret_flock = *lock;
+ }
+
+ child_index++;
+
+ if (child_index < priv->child_count) {
+ STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->lk, local->fd,
+ local->cont.lk.cmd, &local->cont.lk.user_flock,
+ local->xdata_req);
+ } else if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
- call_count = afr_frame_return (frame);
+ afr_lk_unlock(frame, this);
+ } else {
+ if (local->op_ret < 0)
+ local->op_errno = afr_final_errno(local, priv);
- if (call_count == 0)
- AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno,
- xattr);
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, local->xdata_rsp);
+ }
- return 0;
+ return 0;
}
+int
+afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+ return 0;
+}
-int32_t
-afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t optype, dict_t *xattr)
+int
+afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
+ int child_index = -1;
+
+ local = frame->local;
+ child_index = (long)cookie;
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ local->cont.lk.ret_flock = *lock;
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+int
+afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int child_index = (long)cookie;
+
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "gfid=%s: unlock failed on subvolume %s "
+ "with lock owner %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
+ return 0;
+}
+int
+afr_lk_transaction(void *opaque)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ char *wind_on = NULL;
+ int op_errno = 0;
+ int i = 0;
+ int ret = 0;
+
+ frame = (call_frame_t *)opaque;
+ local = frame->local;
+ this = frame->this;
+ priv = this->private;
+ wind_on = alloca0(priv->child_count);
+
+ if (priv->arbiter_count || priv->child_count != 3) {
+ op_errno = ENOTSUP;
+ gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Lock healing supported only for replica 3 volumes.",
+ uuid_utoa(local->fd->inode->gfid));
+ goto err;
+ }
+
+ op_errno = -afr_dom_lock_acquire(frame); // Released during
+ // AFR_STACK_UNWIND
+ if (op_errno != 0) {
+ goto err;
+ }
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) {
+ op_errno = afr_final_errno(local, priv);
+ goto err;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i])
+ wind_on[i] = 1;
+ }
+ AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd,
+ local->cont.lk.cmd, &local->cont.lk.user_flock,
+ local->xdata_req);
+
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ goto unlock;
+ } else {
+ if (local->cont.lk.user_flock.l_type == F_UNLCK)
+ ret = afr_remove_lock_from_saved_locks(local, this);
+ else
+ ret = afr_add_lock_to_saved_locks(frame, this);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ goto unlock;
+ }
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, local->xdata_rsp);
+ }
- priv = this->private;
+ return 0;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+unlock:
+ local->cont.lk.user_flock.l_type = F_UNLCK;
+ AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk,
+ local->fd, F_SETLK, &local->cont.lk.user_flock, NULL);
+err:
+ AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+ return -1;
+}
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+int
+afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ int i = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_LK;
+ if (!afr_lk_is_unlock(cmd, flock)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+ }
+
+ local->cont.lk.locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.locked_nodes),
+ gf_afr_mt_char);
+
+ if (!local->cont.lk.locked_nodes) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->fd = fd_ref(fd);
+ local->cont.lk.cmd = cmd;
+ local->cont.lk.user_flock = *flock;
+ local->cont.lk.ret_flock = *flock;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ if (afr_is_lock_mode_mandatory(xdata)) {
+ ret = synctask_new(this->ctx->env, afr_lk_transaction,
+ afr_lk_transaction_cbk, frame, frame);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto out;
}
+ return 0;
+ }
- call_count = local->call_count;
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fxattrop_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- fd, optype, xattr);
- if (!--call_count)
- break;
- }
- }
+ STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i],
+ priv->children[i]->fops->lk, fd, cmd, flock,
+ local->xdata_req);
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, NULL);
- }
- return 0;
+ AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
}
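afr_lk_cbk above acquires the posix lock one child at a time and, on EAGAIN from any child, rolls back by unlocking everything it already holds. A simplified standalone model of that acquire-or-roll-back loop, using hypothetical try_lock()/unlock() stubs rather than the real fops, is sketched below:

#include <errno.h>
#include <stdio.h>

#define CHILD_COUNT 3

/* Hypothetical stub: returns 0 on success, -EAGAIN if the child already
 * holds a conflicting lock. Child 1 conflicts in this example. */
static int try_lock(int child) {
    return (child == 1) ? -EAGAIN : 0;
}

static void unlock(int child) {
    printf("unlocking child %d\n", child);
}

/* Mirror of the afr_lk_cbk strategy: lock children in order; if any of
 * them returns EAGAIN, release whatever was acquired and fail. */
static int lk_all_or_nothing(void) {
    unsigned char locked[CHILD_COUNT] = {0};
    int i, j;

    for (i = 0; i < CHILD_COUNT; i++) {
        if (try_lock(i) == -EAGAIN) {
            /* Roll back: release every lock acquired so far. */
            for (j = 0; j < i; j++)
                if (locked[j])
                    unlock(j);
            return -EAGAIN;
        }
        locked[i] = 1;
    }
    return 0;
}

int main(void) {
    printf("lk result: %d\n", lk_all_or_nothing());
    return 0;
}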
-/* }}} */
+int32_t
+afr_lease_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_lease *lease,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ local = frame->local;
+ call_count = afr_frame_return(frame);
-int32_t
-afr_inodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ if (call_count == 0)
+ AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, lease,
+ xdata);
+
+ return 0;
+}
+int32_t
+afr_lease_unlock(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- int call_count = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+ call_count = afr_locked_nodes_count(local->cont.lease.locked_nodes,
+ priv->child_count);
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ if (call_count == 0) {
+ AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno,
+ &local->cont.lease.ret_lease, NULL);
+ return 0;
+ }
- call_count = afr_frame_return (frame);
+ local->call_count = call_count;
- if (call_count == 0)
- AFR_STACK_UNWIND (inodelk, frame, local->op_ret,
- local->op_errno);
+ local->cont.lease.user_lease.cmd = GF_UNLK_LEASE;
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lease.locked_nodes[i]) {
+ STACK_WIND(frame, afr_lease_unlock_cbk, priv->children[i],
+ priv->children[i]->fops->lease, &local->loc,
+ &local->cont.lease.user_lease, NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+ return 0;
+}
int32_t
-afr_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock)
+afr_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_lease *lease, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = -1;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = frame->local;
+ priv = this->private;
- priv = this->private;
+ child_index = (long)cookie;
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret < 0 && op_errno == EAGAIN) {
+ local->op_ret = -1;
+ local->op_errno = EAGAIN;
- call_count = local->call_count;
- frame->local = local;
+ afr_lease_unlock(frame, this);
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lease.locked_nodes[child_index] = 1;
+ local->cont.lease.ret_lease = *lease;
+ }
+
+ child_index++;
+ if (child_index < priv->child_count) {
+ STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->lease, &local->loc,
+ &local->cont.lease.user_lease, xdata);
+ } else if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lease.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_inodelk_cbk,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- volume, loc, cmd, flock);
-
- if (!--call_count)
- break;
- }
- }
+ afr_lease_unlock(frame, this);
+ } else {
+ if (local->op_ret < 0)
+ local->op_errno = afr_final_errno(local, priv);
+ AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno,
+ &local->cont.lease.ret_lease, NULL);
+ }
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (inodelk, frame, op_ret, op_errno);
- }
- return 0;
+ return 0;
}
+int
+afr_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct gf_lease *lease, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int32_t op_errno = ENOMEM;
-int32_t
-afr_finodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ priv = this->private;
-{
- afr_local_t *local = NULL;
- int call_count = -1;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local = frame->local;
+ local->op = GF_FOP_LEASE;
+ local->cont.lease.locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lease.locked_nodes),
+ gf_afr_mt_char);
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+ if (!local->cont.lease.locked_nodes) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ loc_copy(&local->loc, loc);
+ local->cont.lease.user_lease = *lease;
+ local->cont.lease.ret_lease = *lease;
- call_count = afr_frame_return (frame);
+ STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)0, priv->children[0],
+ priv->children[0]->fops->lease, loc, lease, xdata);
- if (call_count == 0)
- AFR_STACK_UNWIND (finodelk, frame, local->op_ret,
- local->op_errno);
+ return 0;
+out:
+ AFR_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
+int
+afr_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
+ int call_count = 0;
+ gf_boolean_t failed = _gf_false;
+ gf_boolean_t succeeded = _gf_false;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+
+ call_count = afr_frame_return(frame);
+ if (call_count)
+ goto out;
+    /* If any of the subvolumes failed with an errno other than ENOTCONN,
+     * return that error; otherwise return success, unless all the
+     * subvolumes failed.
+     * TODO: In case of failure, we need to unregister the xattrs
+     * from the other subvolumes where it succeeded (once upcall
+     * fixes Bz-1371622). */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno != ENOTCONN) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ if (local->replies[i].xdata) {
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ failed = _gf_true;
+ break;
+ }
+ if (local->replies[i].op_ret == 0) {
+ succeeded = _gf_true;
+ local->op_ret = 0;
+ local->op_errno = 0;
+ if (!local->xdata_rsp && local->replies[i].xdata) {
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ }
+ }
+
+ if (!succeeded && !failed) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ }
+
+ AFR_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
-int32_t
-afr_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock)
+out:
+ return 0;
+}
+
+int
+afr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
+ int32_t op_errno = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_cnt = -1;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
- priv = this->private;
+ if (op != GF_IPC_TARGET_UPCALL)
+ goto wind_default;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ VALIDATE_OR_GOTO(this->private, err);
+ priv = this->private;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto err;
- call_count = local->call_count;
- frame->local = local;
+ call_cnt = local->call_count;
+ if (xdata) {
for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_finodelk_cbk,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- volume, fd, cmd, flock);
-
- if (!--call_count)
- break;
- }
- }
+ if (dict_set_int8(xdata, priv->pending_key[i], 0) < 0)
+ goto err;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ STACK_WIND_COOKIE(frame, afr_ipc_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->ipc, op,
+ xdata);
+ if (!--call_cnt)
+ break;
+ }
+ return 0;
+
+err:
+ if (op_errno == -1)
+ op_errno = errno;
+ AFR_STACK_UNWIND(ipc, frame, -1, op_errno, NULL);
+
+ return 0;
+
+wind_default:
+ STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ipc, op, xdata);
+ return 0;
+}
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno);
- }
+int
+afr_forget(xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx_int = 0;
+ afr_inode_ctx_t *ctx = NULL;
+
+ afr_spb_choice_timeout_cancel(this, inode);
+ inode_ctx_del(inode, this, &ctx_int);
+ if (!ctx_int)
return 0;
+
+ ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int;
+ afr_inode_ctx_destroy(ctx);
+ return 0;
}
+int
+afr_priv_dump(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ char key[GF_DUMP_MAX_BUF_LEN];
+ int i = 0;
+
+ GF_ASSERT(this);
+ priv = this->private;
+
+ GF_ASSERT(priv);
+ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+ gf_proc_dump_add_section("%s", key_prefix);
+ gf_proc_dump_write("child_count", "%u", priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ sprintf(key, "child_up[%d]", i);
+ gf_proc_dump_write(key, "%d", priv->child_up[i]);
+ sprintf(key, "pending_key[%d]", i);
+ gf_proc_dump_write(key, "%s", priv->pending_key[i]);
+ sprintf(key, "pending_reads[%d]", i);
+ gf_proc_dump_write(key, "%" PRId64,
+ GF_ATOMIC_GET(priv->pending_reads[i]));
+ sprintf(key, "child_latency[%d]", i);
+ gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]);
+ sprintf(key, "halo_child_up[%d]", i);
+ gf_proc_dump_write(key, "%d", priv->halo_child_up[i]);
+ }
+ gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal);
+ gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
+ gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal);
+ gf_proc_dump_write("read_child", "%d", priv->read_child);
+ gf_proc_dump_write("wait_count", "%u", priv->wait_count);
+ gf_proc_dump_write("heal-wait-queue-length", "%d", priv->heal_wait_qlen);
+ gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters);
+ gf_proc_dump_write("background-self-heal-count", "%d",
+ priv->background_self_heal_count);
+ gf_proc_dump_write("healers", "%d", priv->healers);
+ gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
+ gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode);
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ gf_proc_dump_write("quorum-type", "auto");
+ } else if (priv->quorum_count == 0) {
+ gf_proc_dump_write("quorum-type", "none");
+ } else {
+ gf_proc_dump_write("quorum-type", "fixed");
+ gf_proc_dump_write("quorum-count", "%d", priv->quorum_count);
+ }
+ gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this, NULL));
+ if (priv->thin_arbiter_count) {
+ gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up);
+ gf_proc_dump_write("ta_bad_child_index", "%d",
+ priv->ta_bad_child_index);
+ gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64,
+ priv->ta_notify_dom_lock_offset);
+ }
+
+ return 0;
+}
-int32_t
-afr_entrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+/**
+ * find_child_index - find the child's index in the array of subvolumes
+ * @this: AFR
+ * @child: child
+ */
+static int
+afr_find_child_index(xlator_t *this, xlator_t *child)
{
- afr_local_t *local = NULL;
- int call_count = -1;
-
- local = frame->local;
+ afr_private_t *priv = NULL;
+ int child_count = -1;
+ int i = -1;
+
+ priv = this->private;
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ child_count++;
+ }
+
+ for (i = 0; i < child_count; i++) {
+ if ((xlator_t *)child == priv->children[i])
+ break;
+ }
+
+ return i;
+}
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+int
+__afr_get_up_children_count(afr_private_t *priv)
+{
+ int up_children = 0;
+ int i = 0;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 1)
+ up_children++;
- call_count = afr_frame_return (frame);
+ return up_children;
+}
- if (call_count == 0)
- AFR_STACK_UNWIND (entrylk, frame, local->op_ret,
- local->op_errno);
+static int
+__get_heard_from_all_status(xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ int i;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->last_event[i]) {
+ return 0;
+ }
+ }
+ if (priv->thin_arbiter_count && !priv->ta_child_up) {
return 0;
+ }
+ return 1;
}
+glusterfs_event_t
+__afr_transform_event_from_state(xlator_t *this)
+{
+ int i = 0;
+ int up_children = 0;
+ afr_private_t *priv = this->private;
+
+ if (__get_heard_from_all_status(this))
+ /* have_heard_from_all. Let afr_notify() do the propagation. */
+ return GF_EVENT_MAXVAL;
+
+ up_children = __afr_get_up_children_count(priv);
+    /* Treat the children with pending notification as having sent a
+     * GF_EVENT_CHILD_DOWN, i.e. set the event to GF_EVENT_SOME_DESCENDENT_DOWN,
+     * as done in afr_notify(). */
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->last_event[i])
+ continue;
+ priv->last_event[i] = GF_EVENT_SOME_DESCENDENT_DOWN;
+ priv->child_up[i] = 0;
+ }
+
+ if (up_children)
+ /* We received at least one child up */
+ return GF_EVENT_CHILD_UP;
+ else
+ return GF_EVENT_CHILD_DOWN;
+
+ return GF_EVENT_MAXVAL;
+}
-int32_t
-afr_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc,
- const char *basename, entrylk_cmd cmd, entrylk_type type)
+static void
+afr_notify_cbk(void *data)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ xlator_t *this = data;
+ afr_private_t *priv = this->private;
+ glusterfs_event_t event = GF_EVENT_MAXVAL;
+ gf_boolean_t propagate = _gf_false;
+
+ LOCK(&priv->lock);
+ {
+ if (!priv->timer) {
+ /*
+ * Either child_up/child_down is already sent to parent.
+ * This is a spurious wake up.
+ */
+ goto unlock;
+ }
+ priv->timer = NULL;
+ event = __afr_transform_event_from_state(this);
+ if (event != GF_EVENT_MAXVAL)
+ propagate = _gf_true;
+ }
+unlock:
+ UNLOCK(&priv->lock);
+ if (propagate)
+ default_notify(this, event, NULL);
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+static void
+__afr_launch_notify_timer(xlator_t *this, afr_private_t *priv)
+{
+ struct timespec delay = {
+ 0,
+ };
+
+ gf_msg_debug(this->name, 0, "Initiating child-down timer");
+ delay.tv_sec = 10;
+ delay.tv_nsec = 0;
+ priv->timer = gf_timer_call_after(this->ctx, delay, afr_notify_cbk, this);
+ if (priv->timer == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL,
+ "Cannot create timer for delayed initialization");
+ }
+}
- priv = this->private;
+static int
+find_best_down_child(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int32_t best_child = -1;
+ int64_t best_latency = INT64_MAX;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i] && priv->child_latency[i] >= 0 &&
+ priv->child_latency[i] < best_latency) {
+ best_child = i;
+ best_latency = priv->child_latency[i];
+ }
+ }
+ if (best_child >= 0) {
+ gf_msg_debug(this->name, 0,
+ "Found best down child (%d) @ %" PRId64 " ms latency",
+ best_child, best_latency);
+ }
+ return best_child;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+int
+find_worst_up_child(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int32_t worst_child = -1;
+ int64_t worst_latency = INT64_MIN;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] && priv->child_latency[i] >= 0 &&
+ priv->child_latency[i] > worst_latency) {
+ worst_child = i;
+ worst_latency = priv->child_latency[i];
+ }
+ }
+ if (worst_child >= 0) {
+ gf_msg_debug(this->name, 0,
+ "Found worst up child (%d) @ %" PRId64 " ms latency",
+ worst_child, worst_latency);
+ }
+ return worst_child;
+}
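find_best_down_child() and find_worst_up_child() above are symmetric scans over the recorded per-child latencies. The same selection can be sketched in isolation (plain C, hypothetical data):

#include <stdint.h>
#include <stdio.h>

/* Scan mirroring find_worst_up_child(): among children that are up and
 * have a valid (>= 0) latency, pick the one with the highest latency. */
static int worst_up_child(const unsigned char *up, const int64_t *lat, int n) {
    int64_t worst = INT64_MIN;
    int child = -1;
    int i;

    for (i = 0; i < n; i++) {
        if (up[i] && lat[i] >= 0 && lat[i] > worst) {
            child = i;
            worst = lat[i];
        }
    }
    return child;
}

int main(void) {
    unsigned char up[] = {1, 1, 0};
    int64_t lat[] = {12, 340, 5};
    printf("worst up child: %d\n", worst_up_child(up, lat, 3));
    return 0;
}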
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+void
+__afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx,
+ int64_t halo_max_latency_msec, int32_t *event,
+ int64_t child_latency_msec)
+{
+ afr_private_t *priv = NULL;
+ int up_children = 0;
- call_count = local->call_count;
- frame->local = local;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_entrylk_cbk,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- volume, loc, basename, cmd, type);
-
- if (!--call_count)
- break;
- }
- }
+ priv->child_latency[idx] = child_latency_msec;
+ gf_msg_debug(child_xlator->name, 0, "Client ping @ %" PRId64 " ms",
+ child_latency_msec);
+ if (priv->shd.iamshd)
+ return;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno);
+ up_children = __afr_get_up_children_count(priv);
+
+ if (child_latency_msec > halo_max_latency_msec &&
+ priv->child_up[idx] == 1 && up_children > priv->halo_min_replicas) {
+ if ((up_children - 1) < priv->halo_min_replicas) {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Overriding halo threshold, "
+ "min replicas: %d",
+ priv->halo_min_replicas);
+ } else {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Child latency (%" PRId64
+ " ms) "
+ "exceeds halo threshold (%" PRId64
+ "), "
+ "marking child down.",
+ child_latency_msec, halo_max_latency_msec);
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_DOWN;
+ }
+ }
+ } else if (child_latency_msec < halo_max_latency_msec &&
+ priv->child_up[idx] == 0) {
+ if (up_children < priv->halo_max_replicas) {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Child latency (%" PRId64
+ " ms) "
+ "below halo threshold (%" PRId64
+ "), "
+ "marking child up.",
+ child_latency_msec, halo_max_latency_msec);
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_UP;
+ }
+ } else {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Not marking child %d up, "
+ "max replicas (%d) reached.",
+ idx, priv->halo_max_replicas);
}
- return 0;
+ }
}
+static int64_t
+afr_get_halo_latency(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int64_t halo_max_latency_msec = 0;
+
+ priv = this->private;
+
+ if (priv->shd.iamshd) {
+ halo_max_latency_msec = priv->shd.halo_max_latency_msec;
+ } else if (priv->nfsd.iamnfsd) {
+ halo_max_latency_msec = priv->nfsd.halo_max_latency_msec;
+ } else {
+ halo_max_latency_msec = priv->halo_max_latency_msec;
+ }
+ gf_msg_debug(this->name, 0, "Using halo latency %" PRId64,
+ halo_max_latency_msec);
+ return halo_max_latency_msec;
+}
+void
+__afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
+ const int idx, int64_t child_latency_msec,
+ int32_t *event, int32_t *call_psh,
+ int32_t *up_child)
+{
+ afr_private_t *priv = NULL;
+ int up_children = 0;
+ int worst_up_child = -1;
+ int64_t halo_max_latency_msec = afr_get_halo_latency(this);
+
+ priv = this->private;
+
+ /*
+ * This only really counts if the child was never up
+ * (value = -1) or had been down (value = 0). See
+ * comment at GF_EVENT_CHILD_DOWN for a more detailed
+ * explanation.
+ */
+ if (priv->child_up[idx] != 1) {
+ priv->event_generation++;
+ }
+ priv->child_up[idx] = 1;
+
+ *call_psh = 1;
+ *up_child = idx;
+ up_children = __afr_get_up_children_count(priv);
+ /*
+ * If this is an _actual_ CHILD_UP event, we
+ * want to set the child_latency to MAX to indicate
+ * the child needs ping data to be available before doing child-up
+ */
+ if (!priv->halo_enabled)
+ goto out;
+
+ if (child_latency_msec < 0) {
+        /* Keep a valid (non-negative) latency, AFR_HALO_MAX_LATENCY, so
+         * that this child can still be found by find_best_down_child(). */
+ priv->halo_child_up[idx] = 1;
+ if (priv->child_latency[idx] < 0) {
+ priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
+ }
+ }
+
+ /*
+ * Handle the edge case where we exceed
+ * halo_min_replicas and we've got a child which is
+ * marked up as it was helping to satisfy the
+     * halo_min_replicas even though its latency exceeds
+ * halo_max_latency_msec.
+ */
+ if (up_children > priv->halo_min_replicas) {
+ worst_up_child = find_worst_up_child(this);
+ if (worst_up_child >= 0 &&
+ priv->child_latency[worst_up_child] > halo_max_latency_msec) {
+ gf_msg_debug(this->name, 0,
+ "Marking child %d down, "
+ "doesn't meet halo threshold (%" PRId64
+ "), and > "
+ "halo_min_replicas (%d)",
+ worst_up_child, halo_max_latency_msec,
+ priv->halo_min_replicas);
+ priv->child_up[worst_up_child] = 0;
+ up_children--;
+ }
+ }
+
+ if (up_children > priv->halo_max_replicas && !priv->shd.iamshd) {
+ worst_up_child = find_worst_up_child(this);
+ if (worst_up_child < 0) {
+ worst_up_child = idx;
+ }
+ priv->child_up[worst_up_child] = 0;
+ up_children--;
+ gf_msg_debug(this->name, 0,
+ "Marking child %d down, "
+ "up_children (%d) > halo_max_replicas (%d)",
+ worst_up_child, up_children, priv->halo_max_replicas);
+ }
+out:
+ if (up_children == 1) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,
+ "Subvolume '%s' came back up; "
+ "going online.",
+ child_xlator->name);
+ gf_event(EVENT_AFR_SUBVOL_UP, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ } else {
+ *event = GF_EVENT_SOME_DESCENDENT_UP;
+ }
+
+ priv->last_event[idx] = *event;
+}
-int32_t
-afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+void
+__afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
+ int64_t child_latency_msec, int32_t *event,
+ int32_t *call_psh, int32_t *up_child)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int up_children = 0;
+ int down_children = 0;
+ int best_down_child = -1;
+
+ priv = this->private;
+
+ /*
+ * If a brick is down when we start, we'll get a
+ * CHILD_DOWN to indicate its initial state. There
+ * was never a CHILD_UP in this case, so if we
+ * increment "down_count" the difference between than
+ * and "up_count" will no longer be the number of
+ * children that are currently up. This has serious
+ * implications e.g. for quorum enforcement, so we
+ * don't increment these values unless the event
+ * represents an actual state transition between "up"
+ * (value = 1) and anything else.
+ */
+ if (priv->child_up[idx] == 1) {
+ priv->event_generation++;
+ }
+
+ /*
+ * If this is an _actual_ CHILD_DOWN event, we
+ * want to set the child_latency to < 0 to indicate
+ * the child is really disconnected.
+ */
+ if (child_latency_msec < 0) {
+ priv->child_latency[idx] = child_latency_msec;
+ priv->halo_child_up[idx] = 0;
+ }
+ priv->child_up[idx] = 0;
+
+ up_children = __afr_get_up_children_count(priv);
+ /*
+ * Handle the edge case where we need to find the
+ * next best child (to mark up) as marking this child
+ * down would cause us to fall below halo_min_replicas.
+ * We will also force the SHD to heal this child _now_
+ * as we want it to be up to date if we are going to
+ * begin using it synchronously.
+ */
+ if (priv->halo_enabled && up_children < priv->halo_min_replicas) {
+ best_down_child = find_best_down_child(this);
+ if (best_down_child >= 0) {
+ gf_msg_debug(this->name, 0,
+ "Swapping out child %d for "
+ "child %d to satisfy halo_min_replicas (%d).",
+ idx, best_down_child, priv->halo_min_replicas);
+ priv->child_up[best_down_child] = 1;
+ *call_psh = 1;
+ *up_child = best_down_child;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 0)
+ down_children++;
+ if (down_children == priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN,
+ "All subvolumes are down. Going "
+ "offline until at least one of them "
+ "comes back up.");
+ gf_event(EVENT_AFR_SUBVOLS_DOWN, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ } else {
+ *event = GF_EVENT_SOME_DESCENDENT_DOWN;
+ }
+ priv->last_event[idx] = *event;
+}
+void
+afr_ta_lock_release_synctask(xlator_t *this)
{
- afr_local_t *local = NULL;
- int call_count = -1;
+ call_frame_t *ta_frame = NULL;
+ int ret = 0;
- local = frame->local;
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ return;
+ }
+
+ ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta,
+ afr_ta_lock_release_done, ta_frame, this);
+ if (ret) {
+ STACK_DESTROY(ta_frame->root);
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to release "
+ "AFR_TA_DOM_NOTIFY lock.");
+ }
+}
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+static void
+afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)
+{
+ struct gf_upcall_inodelk_contention *lc = NULL;
+ unsigned int inmem_count = 0;
+ unsigned int onwire_count = 0;
+ afr_private_t *priv = this->private;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ lc = upcall->data;
- call_count = afr_frame_return (frame);
+ if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0)
+ return;
- if (call_count == 0)
- AFR_STACK_UNWIND (fentrylk, frame, local->op_ret,
- local->op_errno);
+ if (priv->shd.iamshd) {
+ /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */
+ return;
+ }
+ LOCK(&priv->lock);
+ {
+ if (priv->release_ta_notify_dom_lock == _gf_true) {
+ /* Ignore multiple release requests from shds.*/
+ UNLOCK(&priv->lock);
+ return;
+ }
+ priv->release_ta_notify_dom_lock = _gf_true;
+ inmem_count = priv->ta_in_mem_txn_count;
+ onwire_count = priv->ta_on_wire_txn_count;
+ }
+ UNLOCK(&priv->lock);
+ if (inmem_count || onwire_count)
+ /* lock release will happen in txn code path after
+ * in-memory or on-wire txns are over.*/
+ return;
- return 0;
+ afr_ta_lock_release_synctask(this);
}
+static void
+afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall)
+{
+ struct gf_upcall_cache_invalidation *up_ci = NULL;
+ afr_private_t *priv = this->private;
+ inode_t *inode = NULL;
+ inode_table_t *itable = NULL;
+ int i = 0;
+
+ switch (upcall->event_type) {
+ case GF_UPCALL_INODELK_CONTENTION:
+ afr_handle_inodelk_contention(this, upcall);
+ break;
+ case GF_UPCALL_CACHE_INVALIDATION:
+ up_ci = (struct gf_upcall_cache_invalidation *)upcall->data;
+
+ /* Since md-cache will be aggressively filtering
+ * lookups, the stale read issue will be more
+             * pronounced. Hence, when a pending xattr is set, notify
+             * all the md-cache clients to invalidate the existing
+             * stat cache and send the lookup next time. */
+ if (!up_ci->dict)
+ break;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!dict_get(up_ci->dict, priv->pending_key[i]))
+ continue;
+ up_ci->flags |= UP_INVAL_ATTR;
+ itable = ((xlator_t *)this->graph->top)->itable;
+                /* Internal processes may not have itable for
+                 * top xlator */
+ if (itable)
+ inode = inode_find(itable, upcall->gfid);
+ if (inode)
+ afr_inode_need_refresh_set(inode, this);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+}
int32_t
-afr_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd,
- const char *basename, entrylk_cmd cmd, entrylk_type type)
+afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_private_t *priv = NULL;
+ xlator_t *child_xlator = NULL;
+ int i = -1;
+ int propagate = 0;
+ int had_heard_from_all = 0;
+ int have_heard_from_all = 0;
+ int idx = -1;
+ int ret = -1;
+ int call_psh = 0;
+ int up_child = -1;
+ dict_t *input = NULL;
+ dict_t *output = NULL;
+ gf_boolean_t had_quorum = _gf_false;
+ gf_boolean_t has_quorum = _gf_false;
+ int64_t halo_max_latency_msec = 0;
+ int64_t child_latency_msec = -1;
+
+ child_xlator = (xlator_t *)data;
+
+ priv = this->private;
+
+ if (!priv)
+ return 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ /*
+ * We need to reset this in case children come up in "staggered"
+ * fashion, so that we discover a late-arriving local subvolume. Note
+ * that we could end up issuing N lookups to the first subvolume, and
+ * O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
+ */
+ priv->did_discovery = _gf_false;
+
+ /* parent xlators don't need to know about every child_up, child_down
+ * because of afr ha. If all subvolumes go down, child_down has
+ * to be triggered. In that state when 1 subvolume comes up child_up
+ * needs to be triggered. dht optimizes revalidate lookup by sending
+ * it only to one of its subvolumes. When child up/down happens
+ * for afr's subvolumes dht should be notified by child_modified. The
+ * subsequent revalidate lookup happens on all the dht's subvolumes
+ * which triggers afr self-heals if any.
+ */
+ idx = afr_find_child_index(this, child_xlator);
+ if (idx < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
+ "Received child_up from invalid subvolume");
+ goto out;
+ }
+
+ had_quorum = priv->quorum_count &&
+ afr_has_quorum(priv->child_up, this, NULL);
+ if (event == GF_EVENT_CHILD_PING) {
+ child_latency_msec = (int64_t)(uintptr_t)data2;
+ if (priv->halo_enabled) {
+ halo_max_latency_msec = afr_get_halo_latency(this);
+
+ /* Calculates the child latency and sets event
+ */
+ LOCK(&priv->lock);
+ {
+ __afr_handle_ping_event(this, child_xlator, idx,
+ halo_max_latency_msec, &event,
+ child_latency_msec);
+ }
+ UNLOCK(&priv->lock);
+ } else {
+ LOCK(&priv->lock);
+ {
+ priv->child_latency[idx] = child_latency_msec;
+ }
+ UNLOCK(&priv->lock);
+ }
+ }
- priv = this->private;
+ if (event == GF_EVENT_CHILD_PING) {
+ /* This is the only xlator that handles PING, no reason to
+ * propagate.
+ */
+ goto out;
+ }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ if (event == GF_EVENT_TRANSLATOR_OP) {
+ LOCK(&priv->lock);
+ {
+ had_heard_from_all = __get_heard_from_all_status(this);
+ }
+ UNLOCK(&priv->lock);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ if (!had_heard_from_all) {
+ ret = -1;
+ } else {
+ input = data;
+ output = data2;
+ ret = afr_xl_op(this, input, output);
}
+ goto out;
+ }
- call_count = local->call_count;
- frame->local = local;
+ if (event == GF_EVENT_UPCALL) {
+ afr_handle_upcall_event(this, data);
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fentrylk_cbk,
- priv->children[i],
- priv->children[i]->fops->fentrylk,
- volume, fd, basename, cmd, type);
-
- if (!--call_count)
- break;
+ LOCK(&priv->lock);
+ {
+ had_heard_from_all = __get_heard_from_all_status(this);
+ switch (event) {
+ case GF_EVENT_PARENT_UP:
+ __afr_launch_notify_timer(this, priv);
+ propagate = 1;
+ break;
+ case GF_EVENT_CHILD_UP:
+ if (priv->thin_arbiter_count &&
+ (idx == AFR_CHILD_THIN_ARBITER)) {
+ priv->ta_child_up = 1;
+ priv->ta_event_gen++;
+ break;
+ }
+ __afr_handle_child_up_event(this, child_xlator, idx,
+ child_latency_msec, &event,
+ &call_psh, &up_child);
+ __afr_lock_heal_synctask(this, priv, idx);
+ break;
+
+ case GF_EVENT_CHILD_DOWN:
+ if (priv->thin_arbiter_count &&
+ (idx == AFR_CHILD_THIN_ARBITER)) {
+ priv->ta_child_up = 0;
+ priv->ta_event_gen++;
+ afr_ta_locked_priv_invalidate(priv);
+ break;
}
+ __afr_handle_child_down_event(this, child_xlator, idx,
+ child_latency_msec, &event,
+ &call_psh, &up_child);
+ __afr_mark_pending_lk_heal(this, priv, idx);
+ break;
+
+ case GF_EVENT_CHILD_CONNECTING:
+ priv->last_event[idx] = event;
+
+ break;
+
+ case GF_EVENT_SOME_DESCENDENT_DOWN:
+ priv->last_event[idx] = event;
+ break;
+ default:
+ propagate = 1;
+ break;
}
+ have_heard_from_all = __get_heard_from_all_status(this);
+ if (!had_heard_from_all && have_heard_from_all) {
+ if (priv->timer) {
+ gf_timer_call_cancel(this->ctx, priv->timer);
+ priv->timer = NULL;
+ }
+ /* This is the first event which completes aggregation
+ of events from all subvolumes. If at least one subvol
+ had come up, propagate CHILD_UP, but only this time
+ */
+ event = GF_EVENT_CHILD_DOWN;
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
+ event = GF_EVENT_CHILD_UP;
+ break;
+ }
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno);
+ if (priv->last_event[i] == GF_EVENT_CHILD_CONNECTING) {
+ event = GF_EVENT_CHILD_CONNECTING;
+ /* continue to check other events for CHILD_UP */
+ }
+ }
+ }
+ }
+ UNLOCK(&priv->lock);
+
+ if (priv->quorum_count) {
+ has_quorum = afr_has_quorum(priv->child_up, this, NULL);
+ if (!had_quorum && has_quorum) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET,
+ "Client-quorum is met");
+ gf_event(EVENT_AFR_QUORUM_MET, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ }
+ if (had_quorum && !has_quorum) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
+ "Client-quorum is not met");
+ gf_event(EVENT_AFR_QUORUM_FAIL, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ }
+ }
+
+ /* if all subvols have reported status, no need to hide anything
+ or wait for anything else. Just propagate blindly */
+ if (have_heard_from_all)
+ propagate = 1;
+
+ ret = 0;
+ if (propagate)
+ ret = default_notify(this, event, data);
+
+ if ((!had_heard_from_all) || call_psh) {
+ /* Launch self-heal on all local subvolumes if:
+ * a) We have_heard_from_all for the first time
+ * b) Already heard from everyone, but we now got a child-up
+ * event.
+ */
+ if (have_heard_from_all) {
+ afr_selfheal_childup(this, priv);
}
- return 0;
+ }
+out:
+ return ret;
}
-int32_t
-afr_statfs_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct statvfs *statvfs)
+int
+afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
{
- afr_local_t *local = NULL;
- int call_count = 0;
+ int __ret = -1;
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+
+ __ret = syncbarrier_init(&local->barrier);
+ if (__ret) {
+ if (op_errno)
+ *op_errno = __ret;
+ goto out;
+ }
+
+ local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up),
+ gf_afr_mt_char);
+ if (!local->child_up) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ memcpy(local->child_up, priv->child_up,
+ sizeof(*local->child_up) * priv->child_count);
+ local->call_count = AFR_COUNT(local->child_up, priv->child_count);
+ if (local->call_count == 0) {
+ gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN,
+ "no subvolumes up");
+ if (op_errno)
+ *op_errno = ENOTCONN;
+ goto out;
+ }
+
+ local->event_generation = priv->event_generation;
+
+ local->read_attempted = GF_CALLOC(priv->child_count, sizeof(char),
+ gf_afr_mt_char);
+ if (!local->read_attempted) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->readable = GF_CALLOC(priv->child_count, sizeof(char),
+ gf_afr_mt_char);
+ if (!local->readable) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->readable2 = GF_CALLOC(priv->child_count, sizeof(char),
+ gf_afr_mt_char);
+ if (!local->readable2) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->read_subvol = -1;
+
+ local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
+ gf_afr_mt_reply_t);
+ if (!local->replies) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->need_full_crawl = _gf_false;
+ if (priv->thin_arbiter_count) {
+ local->ta_child_up = priv->ta_child_up;
+ local->ta_failed_subvol = AFR_CHILD_UNKNOWN;
+ local->read_txn_query_child = AFR_CHILD_UNKNOWN;
+ local->ta_event_gen = priv->ta_event_gen;
+ local->fop_state = TA_SUCCESS;
+ }
+ local->is_new_entry = _gf_false;
+
+ INIT_LIST_HEAD(&local->healer);
+ return 0;
+out:
+ return -1;
+}
- LOCK (&frame->lock);
- {
- local = frame->local;
-
- if (op_ret == 0) {
- local->op_ret = op_ret;
-
- if (local->cont.statfs.buf_set) {
- if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail)
- local->cont.statfs.buf = *statvfs;
- } else {
- local->cont.statfs.buf = *statvfs;
- local->cont.statfs.buf_set = 1;
- }
- }
+int
+afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count)
+{
+ int ret = -ENOMEM;
- if (op_ret == -1)
- local->op_errno = op_errno;
+ lk->lower_locked_nodes = GF_CALLOC(sizeof(*lk->lower_locked_nodes),
+ child_count, gf_afr_mt_char);
+ if (NULL == lk->lower_locked_nodes)
+ goto out;
- }
- UNLOCK (&frame->lock);
+ lk->lock_op_ret = -1;
+ lk->lock_op_errno = EUCLEAN;
+
+ ret = 0;
+out:
+ return ret;
+}
- call_count = afr_frame_return (frame);
+void
+afr_matrix_cleanup(int32_t **matrix, unsigned int m)
+{
+ int i = 0;
- if (call_count == 0)
- AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno,
- &local->cont.statfs.buf);
+ if (!matrix)
+ goto out;
+ for (i = 0; i < m; i++) {
+ GF_FREE(matrix[i]);
+ }
- return 0;
+ GF_FREE(matrix);
+out:
+ return;
}
-
-int32_t
-afr_statfs (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+int32_t **
+afr_matrix_create(unsigned int m, unsigned int n)
{
- afr_private_t * priv = NULL;
- int child_count = 0;
- afr_local_t * local = NULL;
- int i = 0;
- int ret = -1;
- int call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
+ int32_t **matrix = NULL;
+ int i = 0;
+
+ matrix = GF_CALLOC(sizeof(*matrix), m, gf_afr_mt_int32_t);
+ if (!matrix)
+ goto out;
+
+ for (i = 0; i < m; i++) {
+ matrix[i] = GF_CALLOC(sizeof(*matrix[i]), n, gf_afr_mt_int32_t);
+ if (!matrix[i])
+ goto out;
+ }
+ return matrix;
+out:
+ afr_matrix_cleanup(matrix, m);
+ return NULL;
+}
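afr_matrix_create()/afr_matrix_cleanup() above implement the usual allocate-rows-then-columns pattern, with the cleanup path tolerating a partially built matrix. The same pattern in plain C (standard malloc/free instead of the GF_ wrappers) looks like:

#include <stdint.h>
#include <stdlib.h>

static void matrix_cleanup(int32_t **matrix, unsigned int m) {
    unsigned int i;

    if (!matrix)
        return;
    for (i = 0; i < m; i++)
        free(matrix[i]);          /* free(NULL) is a no-op */
    free(matrix);
}

/* Allocate an m x n matrix of int32_t, zero-filled; on any allocation
 * failure, release whatever was already allocated and return NULL. */
static int32_t **matrix_create(unsigned int m, unsigned int n) {
    int32_t **matrix = calloc(m, sizeof(*matrix));
    unsigned int i;

    if (!matrix)
        return NULL;
    for (i = 0; i < m; i++) {
        matrix[i] = calloc(n, sizeof(*matrix[i]));
        if (!matrix[i]) {
            matrix_cleanup(matrix, m);
            return NULL;
        }
    }
    return matrix;
}

int main(void) {
    int32_t **mat = matrix_create(4, 3);
    matrix_cleanup(mat, 4);
    return 0;
}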
- priv = this->private;
- child_count = priv->child_count;
+int
+afr_transaction_local_init(afr_local_t *local, xlator_t *this)
+{
+ int ret = -ENOMEM;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ INIT_LIST_HEAD(&local->transaction.wait_list);
+ INIT_LIST_HEAD(&local->transaction.owner_list);
+ INIT_LIST_HEAD(&local->ta_waitq);
+ INIT_LIST_HEAD(&local->ta_onwireq);
+ ret = afr_internal_lock_init(&local->internal_lock, priv->child_count);
+ if (ret < 0)
+ goto out;
+
+ ret = -ENOMEM;
+ local->pre_op_compat = priv->pre_op_compat;
+
+ local->transaction.pre_op = GF_CALLOC(sizeof(*local->transaction.pre_op),
+ priv->child_count, gf_afr_mt_char);
+ if (!local->transaction.pre_op)
+ goto out;
+
+ local->transaction.changelog_xdata = GF_CALLOC(
+ sizeof(*local->transaction.changelog_xdata), priv->child_count,
+ gf_afr_mt_dict_t);
+ if (!local->transaction.changelog_xdata)
+ goto out;
+
+ if (priv->arbiter_count == 1) {
+ local->transaction.pre_op_sources = GF_CALLOC(
+ sizeof(*local->transaction.pre_op_sources), priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.pre_op_sources)
+ goto out;
+ }
+
+ local->transaction.failed_subvols = GF_CALLOC(
+ sizeof(*local->transaction.failed_subvols), priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.failed_subvols)
+ goto out;
+
+ local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!local->pending)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+void
+afr_set_low_priority(call_frame_t *frame)
+{
+ frame->root->pid = LOW_PRIO_PROC_PID;
+}
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+void
+afr_priv_destroy(afr_private_t *priv)
+{
+ int i = 0;
+ int child_count = -1;
- frame->local = local;
- call_count = local->call_count;
+ if (!priv)
+ goto out;
- for (i = 0; i < child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_statfs_cbk,
- priv->children[i],
- priv->children[i]->fops->statfs,
- loc);
- if (!--call_count)
- break;
- }
- }
+ GF_FREE(priv->sh_domain);
+ GF_FREE(priv->last_event);
- op_ret = 0;
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ child_count++;
+ }
+ if (priv->pending_key) {
+ for (i = 0; i < child_count; i++)
+ GF_FREE(priv->pending_key[i]);
+ }
+
+ GF_FREE(priv->pending_reads);
+ GF_FREE(priv->local);
+ GF_FREE(priv->pending_key);
+ GF_FREE(priv->children);
+ GF_FREE(priv->anon_inode);
+ GF_FREE(priv->child_up);
+ GF_FREE(priv->halo_child_up);
+ GF_FREE(priv->child_latency);
+ LOCK_DESTROY(&priv->lock);
+
+ GF_FREE(priv);
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (statfs, frame, op_ret, op_errno, NULL);
- }
- return 0;
+ return;
}
+int **
+afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending,
+ dict_t *xattr, ia_type_t iat)
+{
+ int i = 0;
+ int **changelog = NULL;
+ int idx = -1;
+ int m_idx = 0;
+ int d_idx = 0;
+ int ret = 0;
+
+ m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION);
+ d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+ idx = afr_index_from_ia_type(iat);
+
+ changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!changelog)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!pending[i])
+ continue;
+
+ changelog[i][m_idx] = hton32(1);
+ if (idx != -1)
+ changelog[i][idx] = hton32(1);
+ /* If the newentry marking is on a newly created directory,
+ * then mark it with the full-heal indicator.
+ */
+ if ((IA_ISDIR(iat)) && (priv->esh_granular))
+ changelog[i][d_idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict(priv, xattr, changelog);
+ if (ret < 0) {
+ afr_matrix_cleanup(changelog, priv->child_count);
+ return NULL;
+ }
+out:
+ return changelog;
+}
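A minimal standalone sketch (not part of this commit) of the byte-order handling used above: afr_mark_pending_changelog() stores each changelog count via hton32(), which on common platforms is assumed here to correspond to the standard htonl(); ntohl() recovers the host-order value.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* One pending operation, converted to network byte order the way the
     * changelog matrix entries above are, then converted back. */
    uint32_t on_wire = htonl(1);

    printf("wire: %#x, host: %u\n", on_wire, ntohl(on_wire));
    return 0;
}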
-int32_t
-afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
+static dict_t *
+afr_set_heal_info(char *status)
{
- afr_local_t * local = NULL;
- int call_count = -1;
+ dict_t *dict = NULL;
+ int ret = -1;
+
+ dict = dict_new();
+ if (!dict) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_dynstr_sizen(dict, "heal-info", status);
+ if (ret)
+ gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set heal-info key to "
+ "%s",
+ status);
+out:
+    /* For any error other than EINVAL, dict_set_dynstr frees status. */
+ if (ret == -ENOMEM || ret == -EINVAL) {
+ GF_FREE(status);
+ }
+
+ if (ret && dict) {
+ dict_unref(dict);
+ dict = NULL;
+ }
+ return dict;
+}
- local = frame->local;
- call_count = afr_frame_return (frame);
+static gf_boolean_t
+afr_is_dirty_count_non_unary_for_txn(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ afr_private_t *priv = this->private;
+ int *dirty = alloca0(priv->child_count * sizeof(int));
+ int i = 0;
- if (call_count == 0)
- AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
- lock);
+ afr_selfheal_extract_xattr(this, replies, type, dirty, NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (dirty[i] > 1)
+ return _gf_true;
+ }
- return 0;
+ return _gf_false;
}
-
-int32_t
-afr_lk_unlock (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies,
+ ia_type_t ia_type)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int i = 0;
- int call_count = 0;
+ gf_boolean_t data_chk = _gf_false;
+ gf_boolean_t mdata_chk = _gf_false;
+ gf_boolean_t entry_chk = _gf_false;
+
+ switch (ia_type) {
+ case IA_IFDIR:
+ mdata_chk = _gf_true;
+ entry_chk = _gf_true;
+ break;
+ case IA_IFREG:
+ mdata_chk = _gf_true;
+ data_chk = _gf_true;
+ break;
+ default:
+ /*IA_IFBLK, IA_IFCHR, IA_IFLNK, IA_IFIFO, IA_IFSOCK*/
+ mdata_chk = _gf_true;
+ break;
+ }
- local = frame->local;
- priv = this->private;
+ if (data_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_DATA_TRANSACTION)) {
+ return _gf_true;
+ } else if (mdata_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_METADATA_TRANSACTION)) {
+ return _gf_true;
+ } else if (entry_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_ENTRY_TRANSACTION)) {
+ return _gf_true;
+ }
- call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes,
- priv->child_count);
+ return _gf_false;
+}
- if (call_count == 0) {
- AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
- &local->cont.lk.ret_flock);
- return 0;
+static int
+afr_update_heal_status(xlator_t *this, struct afr_reply *replies,
+ ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh,
+ gf_boolean_t *msh, unsigned char pending)
+{
+ int ret = -1;
+ GF_UNUSED int ret1 = 0;
+ int i = 0;
+ int io_domain_lk_count = 0;
+ int shd_domain_lk_count = 0;
+ afr_private_t *priv = NULL;
+ char *key1 = NULL;
+ char *key2 = NULL;
+
+ priv = this->private;
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((replies[i].valid != 1) || (replies[i].op_ret != 0))
+ continue;
+ if (!io_domain_lk_count) {
+ ret1 = dict_get_int32(replies[i].xdata, key1, &io_domain_lk_count);
+ }
+ if (!shd_domain_lk_count) {
+ ret1 = dict_get_int32(replies[i].xdata, key2, &shd_domain_lk_count);
+ }
+ }
+
+ if (!pending) {
+ if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) ||
+ (!io_domain_lk_count)) {
+ /* Needs heal. */
+ ret = 0;
+ } else {
+ /* No heal needed. */
+ *dsh = *esh = *msh = 0;
}
+ } else {
+ if (shd_domain_lk_count) {
+ ret = -EAGAIN; /*For 'possibly-healing'. */
+ } else {
+            ret = 0; /* Needs heal. Just set a non-negative value so that
+                        it is treated as the source index. */
+ }
+ }
+ return ret;
+}
- local->call_count = call_count;
+/*return EIO, EAGAIN or pending*/
+int
+afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **inode, gf_boolean_t *entry_selfheal,
+ gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal, unsigned char *pending)
+{
+ int ret = -1;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ gf_boolean_t dsh = _gf_false;
+ gf_boolean_t msh = _gf_false;
+ gf_boolean_t esh = _gf_false;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *valid_on = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+ sources = alloca0(sizeof(*sources) * priv->child_count);
+ sinks = alloca0(sizeof(*sinks) * priv->child_count);
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ valid_on = alloca0(sizeof(*valid_on) * priv->child_count);
+
+ ret = afr_selfheal_unlocked_inspect(frame, this, gfid, inode, &dsh, &msh,
+ &esh, replies);
+ if (ret)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ valid_on[i] = 1;
+ }
+ }
+ if (msh) {
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_METADATA_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
+ }
+ if (dsh) {
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_DATA_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
+ }
+ if (esh) {
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
+ }
- local->cont.lk.user_flock.l_type = F_UNLCK;
+ ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh,
+ &msh, *pending);
+out:
+ *data_selfheal = dsh;
+ *entry_selfheal = esh;
+ *metadata_selfheal = msh;
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ return ret;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (local->cont.lk.locked_nodes[i]) {
- STACK_WIND (frame, afr_lk_unlock_cbk,
- priv->children[i],
- priv->children[i]->fops->lk,
- local->fd, F_SETLK,
- &local->cont.lk.user_flock);
-
- if (!--call_count)
- break;
- }
+int
+afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ gf_boolean_t data_selfheal = _gf_false;
+ gf_boolean_t metadata_selfheal = _gf_false;
+ gf_boolean_t entry_selfheal = _gf_false;
+ unsigned char pending = 0;
+ dict_t *dict = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+ inode_t *inode = NULL;
+ char *substr = NULL;
+ char *status = NULL;
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *heal_local = NULL;
+
+    /* Use a frame with the lk-owner set. */
+ heal_frame = afr_frame_create(frame->this, &op_errno);
+ if (!heal_frame) {
+ ret = -1;
+ goto out;
+ }
+ heal_local = heal_frame->local;
+ heal_frame->local = frame->local;
+
+ ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode,
+ &entry_selfheal, &data_selfheal,
+ &metadata_selfheal, &pending);
+
+ if (ret == -ENOMEM) {
+ ret = -1;
+ goto out;
+ }
+
+ if (pending & PFLAG_PENDING) {
+ gf_asprintf(&substr, "-pending");
+ if (!substr)
+ goto out;
+ }
+
+ if (ret == -EIO) {
+ ret = gf_asprintf(&status, "split-brain%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ } else if (ret == -EAGAIN) {
+ ret = gf_asprintf(&status, "possibly-healing%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ } else if (ret >= 0) {
+        /* ret holds the source index, so ret >= 0 with at least one
+         * of the three booleans set to true means a source was
+         * identified and heal is required.
+         */
+ if (!data_selfheal && !entry_selfheal && !metadata_selfheal) {
+ status = gf_strdup("no-heal");
+ if (!status) {
+ ret = -1;
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = gf_asprintf(&status, "heal%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ }
+ } else if (ret < 0) {
+        /* Apart from the negative ret values checked above, other
+         * values such as ENOTCONN (returned when the number of valid
+         * replies received is less than 2) are possible, in which
+         * case heal is required if one of the selfheal booleans is
+         * set.
+         */
+ if (data_selfheal || entry_selfheal || metadata_selfheal) {
+ ret = gf_asprintf(&status, "heal%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
}
+ }
- return 0;
-}
+ ret = 0;
+ op_errno = 0;
+out:
+ if (heal_frame) {
+ heal_frame->local = heal_local;
+ AFR_STACK_DESTROY(heal_frame);
+ }
+ AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+ if (dict)
+ dict_unref(dict);
+ if (inode)
+ inode_unref(inode);
+ GF_FREE(substr);
+ return ret;
+}
-int32_t
-afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
+int
+_afr_is_split_brain(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies, afr_transaction_type type,
+ gf_boolean_t *spb)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int child_index = -1;
-/* int ret = 0; */
+ afr_private_t *priv = NULL;
+ uint64_t *witness = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ int sources_count = 0;
+ int ret = 0;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ witness = alloca0(priv->child_count * sizeof(*witness));
+
+ ret = afr_selfheal_find_direction(frame, this, replies, type,
+ priv->child_up, sources, sinks, witness,
+ NULL);
+ if (ret)
+ return ret;
+ sources_count = AFR_COUNT(sources, priv->child_count);
+ if (!sources_count)
+ *spb = _gf_true;
- local = frame->local;
- priv = this->private;
+ return ret;
+}
- child_index = (long) cookie;
+int
+afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb)
+{
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
- if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) {
- local->op_ret = -1;
- local->op_errno = op_errno;
+ priv = this->private;
- afr_lk_unlock (frame, this);
- return 0;
- }
+ replies = alloca0(sizeof(*replies) * priv->child_count);
- if (op_ret == 0) {
- local->op_ret = 0;
- local->op_errno = 0;
- local->cont.lk.locked_nodes[child_index] = 1;
- local->cont.lk.ret_flock = *lock;
- }
+ ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies);
+ if (ret)
+ goto out;
- child_index++;
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ ret = -EAGAIN;
+ goto out;
+ }
- if (child_index < priv->child_count) {
- STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->lk,
- local->fd, local->cont.lk.cmd,
- &local->cont.lk.user_flock);
- } else if (local->op_ret == -1) {
- /* all nodes have gone down */
+ ret = _afr_is_split_brain(frame, this, replies, AFR_DATA_TRANSACTION,
+ d_spb);
+ if (ret)
+ goto out;
- AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN,
- &local->cont.lk.ret_flock);
- } else {
- /* locking has succeeded on all nodes that are up */
+ ret = _afr_is_split_brain(frame, this, replies, AFR_METADATA_TRANSACTION,
+ m_spb);
+out:
+ if (replies) {
+ afr_replies_wipe(replies, priv->child_count);
+ replies = NULL;
+ }
+ return ret;
+}
- /* temporarily
- ret = afr_mark_locked_nodes (this, local->fd,
- local->cont.lk.locked_nodes);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "Could not save locked nodes info in fdctx");
+int
+afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+ GF_FREE(opaque);
+ return 0;
+}
- ret = afr_save_locked_fd (this, local->fd);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "Could not save locked fd");
+int
+afr_get_split_brain_status(void *opaque)
+{
+ gf_boolean_t d_spb = _gf_false;
+ gf_boolean_t m_spb = _gf_false;
+ int ret = -1;
+ int op_errno = 0;
+ int i = 0;
+ char *choices = NULL;
+ char *status = NULL;
+ dict_t *dict = NULL;
+ inode_t *inode = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ loc_t *loc = NULL;
+ afr_spb_status_t *data = NULL;
+
+ data = opaque;
+ frame = data->frame;
+ this = frame->this;
+ loc = data->loc;
+ priv = this->private;
+ children = priv->children;
+
+ inode = afr_inode_find(this, loc->gfid);
+ if (!inode)
+ goto out;
+
+ dict = dict_new();
+ if (!dict) {
+ op_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+    /* Calculation of the string length:
+     * (child_count x length of child-name) + SLEN(" Choices:")
+     * where child-name consists of:
+     * a) 251 = max characters for a volname, per GD_VOLUME_NAME_MAX
+     * b) strlen("-client-00,"), assuming 16 replicas
+     */
+ choices = alloca0(priv->child_count * (256 + SLEN("-client-00,")) +
+ SLEN(" Choices:"));
+
+ ret = afr_is_split_brain(frame, this, inode, loc->gfid, &d_spb, &m_spb);
+ if (ret) {
+ op_errno = -ret;
+ if (ret == -EAGAIN) {
+ ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS,
+ SBRAIN_HEAL_NO_GO_MSG);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ AFR_MSG_DICT_SET_FAILED,
+ "Failed to set GF_AFR_SBRAIN_STATUS in dict");
+ }
+ }
+ ret = -1;
+ goto out;
+ }
+
+ if (d_spb || m_spb) {
+ sprintf(choices, " Choices:");
+ for (i = 0; i < priv->child_count; i++) {
+ strcat(choices, children[i]->name);
+ strcat(choices, ",");
+ }
+ choices[strlen(choices) - 1] = '\0';
- */
- AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
- &local->cont.lk.ret_flock);
+ ret = gf_asprintf(&status,
+ "data-split-brain:%s "
+ "metadata-split-brain:%s%s",
+ (d_spb) ? "yes" : "no", (m_spb) ? "yes" : "no",
+ choices);
+
+ if (-1 == ret) {
+ op_errno = ENOMEM;
+ goto out;
}
+ ret = dict_set_dynstr_sizen(dict, GF_AFR_SBRAIN_STATUS, status);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS,
+ SFILE_NOT_UNDER_DATA);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ }
- return 0;
+ ret = 0;
+out:
+ AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+ if (dict)
+ dict_unref(dict);
+ if (inode)
+ inode_unref(inode);
+ return ret;
}
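A back-of-the-envelope check (illustrative only, not GlusterFS code) of the 'choices' buffer sizing described in the comment above; the child_count here is a hypothetical replica count and SLEN() is replaced by plain strlen().

#include <stdio.h>
#include <string.h>

int
main(void)
{
    unsigned int child_count = 3; /* hypothetical replica count */

    /* 256 covers the 251-character volname limit plus slack; each child
     * name is assumed to carry a "-client-NN," suffix as in the comment. */
    size_t per_child = 256 + strlen("-client-00,");
    size_t needed = child_count * per_child + strlen(" Choices:") + 1; /* +1 for NUL */

    printf("choices buffer for %u children: %zu bytes\n", child_count, needed);
    return 0;
}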
+int32_t
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ int ret = 0;
+ int op_errno = 0;
+ dict_t *dict = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *heal_local = NULL;
+ call_frame_t *heal_frame = NULL;
+
+ local = frame->local;
+ dict = dict_new();
+ if (!dict) {
+ op_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ heal_frame = afr_frame_create(this, &op_errno);
+ if (!heal_frame) {
+ ret = -1;
+ goto out;
+ }
+ heal_local = heal_frame->local;
+ heal_frame->local = frame->local;
+    /* Initiate heal with heal_frame, which has its lk-owner set, so that
+     * inodelk/entrylk work correctly. */
+ ret = afr_selfheal_do(heal_frame, this, loc->gfid);
+
+ if (ret == 1 || ret == 2) {
+ ret = dict_set_sizen_str_sizen(dict, "sh-fail-msg",
+ SFILE_NOT_IN_SPLIT_BRAIN);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set sh-fail-msg in dict");
+ ret = 0;
+ goto out;
+ } else {
+ if (local->xdata_rsp) {
+ /* 'sh-fail-msg' has been set in the dict during self-heal.*/
+ dict_copy(local->xdata_rsp, dict);
+ ret = 0;
+ } else if (ret < 0) {
+ op_errno = -ret;
+ ret = -1;
+ }
+ }
+
+out:
+ if (heal_frame) {
+ heal_frame->local = heal_local;
+ AFR_STACK_DESTROY(heal_frame);
+ }
+ if (local->op == GF_FOP_GETXATTR)
+ AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+ else if (local->op == GF_FOP_SETXATTR)
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ if (dict)
+ dict_unref(dict);
+ return ret;
+}
int
-afr_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t cmd, struct gf_flock *flock)
+afr_get_child_index_from_name(xlator_t *this, char *name)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_private_t *priv = this->private;
+ int index = -1;
+
+ for (index = 0; index < priv->child_count; index++) {
+ if (!strcmp(priv->children[index]->name, name))
+ goto out;
+ }
+ index = -1;
+out:
+ return index;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+void
+afr_priv_need_heal_set(afr_private_t *priv, gf_boolean_t need_heal)
+{
+ LOCK(&priv->lock);
+ {
+ priv->need_heal = need_heal;
+ }
+ UNLOCK(&priv->lock);
+}
- priv = this->private;
+void
+afr_set_need_heal(xlator_t *this, afr_local_t *local)
+{
+ int i = 0;
+ afr_private_t *priv = this->private;
+ gf_boolean_t need_heal = _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].need_heal) {
+ need_heal = _gf_true;
+ break;
+ }
+ }
+ afr_priv_need_heal_set(priv, need_heal);
+ return;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
- AFR_LOCAL_INIT (local, priv);
+gf_boolean_t
+afr_get_need_heal(xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ gf_boolean_t need_heal = _gf_true;
+
+ LOCK(&priv->lock);
+ {
+ need_heal = priv->need_heal;
+ }
+ UNLOCK(&priv->lock);
+ return need_heal;
+}
- frame->local = local;
+int
+afr_get_msg_id(char *op_type)
+{
+ if (!strcmp(op_type, GF_AFR_REPLACE_BRICK))
+ return AFR_MSG_REPLACE_BRICK_STATUS;
+ else if (!strcmp(op_type, GF_AFR_ADD_BRICK))
+ return AFR_MSG_ADD_BRICK_STATUS;
+ return -1;
+}
- local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count,
- sizeof (*local->cont.lk.locked_nodes),
- gf_afr_mt_char);
+int
+afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *heal_frame,
+ void *opaque)
+{
+ call_frame_t *txn_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *heal_local = NULL;
+ xlator_t *this = NULL;
- if (!local->cont.lk.locked_nodes) {
- op_errno = ENOMEM;
- goto out;
- }
+ heal_local = heal_frame->local;
+ txn_frame = heal_local->heal_frame;
+ local = txn_frame->local;
+ this = txn_frame->this;
- local->fd = fd_ref (fd);
- local->cont.lk.cmd = cmd;
- local->cont.lk.user_flock = *flock;
- local->cont.lk.ret_flock = *flock;
+    /* Refresh the inode again and proceed with the transaction. */
+ afr_inode_refresh(txn_frame, this, local->inode, NULL, local->refreshfn);
- STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0,
- priv->children[i],
- priv->children[i]->fops->lk,
- fd, cmd, flock);
+ AFR_STACK_DESTROY(heal_frame);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL);
- }
- return 0;
+ return 0;
}
int
-afr_priv_dump (xlator_t *this)
+afr_fav_child_reset_sink_xattrs(void *opaque)
{
- afr_private_t *priv = NULL;
- char key_prefix[GF_DUMP_MAX_BUF_LEN];
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
+ call_frame_t *heal_frame = NULL;
+ call_frame_t *txn_frame = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t d_spb = _gf_false;
+ gf_boolean_t m_spb = _gf_false;
+ afr_local_t *heal_local = NULL;
+ afr_local_t *txn_local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ unsigned char *locked_on = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int ret = 0;
+
+ heal_frame = (call_frame_t *)opaque;
+ heal_local = heal_frame->local;
+ txn_frame = heal_local->heal_frame;
+ txn_local = txn_frame->local;
+ this = txn_frame->this;
+ inode = txn_local->inode;
+ priv = this->private;
+ locked_on = alloca0(priv->child_count);
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = _afr_is_split_brain(txn_frame, this, txn_local->replies,
+ AFR_DATA_TRANSACTION, &d_spb);
+
+ ret = _afr_is_split_brain(txn_frame, this, txn_local->replies,
+ AFR_METADATA_TRANSACTION, &m_spb);
+
+ /* Take appropriate locks and reset sink xattrs. */
+ if (d_spb) {
+ ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0,
+ locked_on);
+ {
+ if (ret < priv->child_count)
+ goto data_unlock;
+ ret = __afr_selfheal_data_prepare(
+ heal_frame, this, inode, locked_on, sources, sinks,
+ healed_sinks, undid_pending, locked_replies, NULL);
+ }
+ data_unlock:
+ afr_selfheal_uninodelk(heal_frame, this, inode, this->name, 0, 0,
+ locked_on);
+ }
+
+ if (m_spb) {
+ memset(locked_on, 0, sizeof(*locked_on) * priv->child_count);
+ memset(undid_pending, 0, sizeof(*undid_pending) * priv->child_count);
+ ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name,
+ LLONG_MAX - 1, 0, locked_on);
+ {
+ if (ret < priv->child_count)
+ goto mdata_unlock;
+ ret = __afr_selfheal_metadata_prepare(
+ heal_frame, this, inode, locked_on, sources, sinks,
+ healed_sinks, undid_pending, locked_replies, NULL);
+ }
+ mdata_unlock:
+ afr_selfheal_uninodelk(heal_frame, this, inode, this->name,
+ LLONG_MAX - 1, 0, locked_on);
+ }
+
+ return ret;
+}
+/*
+ * Concatenates the xattrs in local->replies separated by a delimiter.
+ */
+int
+afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
+ char *buf, const char *default_str,
+ int32_t *serz_len, char delimiter)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ char *xattr = NULL;
+ int i = 0;
+ int len = 0;
+ int keylen = 0;
+ size_t str_len = 0;
+ int ret = -1;
+
+ priv = this->private;
+ local = frame->local;
+
+ keylen = strlen(local->cont.getxattr.name);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid || local->replies[i].op_ret) {
+ str_len = strlen(default_str);
+ buf = strncat(buf, default_str, str_len);
+ len += str_len;
+ buf[len++] = delimiter;
+ buf[len] = '\0';
+ } else {
+ ret = dict_get_strn(local->replies[i].xattr,
+ local->cont.getxattr.name, keylen, &xattr);
+ if (ret) {
+ gf_msg("TEST", GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "Failed to get the node_uuid of brick "
+ "%d",
+ i);
+ goto out;
+ }
+ str_len = strlen(xattr);
+ buf = strncat(buf, xattr, str_len);
+ len += str_len;
+ buf[len++] = delimiter;
+ buf[len] = '\0';
+ }
+ }
+ buf[--len] = '\0'; /*remove the last delimiter*/
+ if (serz_len)
+ *serz_len = ++len;
+ ret = 0;
- GF_ASSERT (this);
- priv = this->private;
+out:
+ return ret;
+}
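A minimal sketch of the serialization pattern implemented above, assuming three hypothetical per-brick replies of which one brick did not answer; it is illustrative only and does not use the AFR reply structures.

#include <stdio.h>
#include <string.h>

int
main(void)
{
    const char *replies[] = {"uuid-brick-0", NULL, "uuid-brick-2"};
    const char *default_str = "N/A"; /* placeholder for a missing reply */
    char buf[256] = "";
    size_t len = 0;
    char delimiter = ' ';
    int i;

    for (i = 0; i < 3; i++) {
        const char *src = replies[i] ? replies[i] : default_str;
        len += (size_t)snprintf(buf + len, sizeof(buf) - len, "%s%c", src,
                                delimiter);
    }
    if (len)
        buf[--len] = '\0'; /* drop the trailing delimiter, as the fop does */

    printf("%s (serialized length %zu)\n", buf, len + 1);
    return 0;
}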
- GF_ASSERT (priv);
- snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
- gf_proc_dump_add_section(key_prefix);
- gf_proc_dump_build_key(key, key_prefix, "child_count");
- gf_proc_dump_write(key, "%u", priv->child_count);
- gf_proc_dump_build_key(key, key_prefix, "read_child_rr");
- gf_proc_dump_write(key, "%u", priv->read_child_rr);
- for (i = 0; i < priv->child_count; i++) {
- gf_proc_dump_build_key(key, key_prefix, "child_up[%d]", i);
- gf_proc_dump_write(key, "%d", priv->child_up[i]);
- gf_proc_dump_build_key(key, key_prefix,
- "pending_key[%d]", i);
- gf_proc_dump_write(key, "%s", priv->pending_key[i]);
- }
- gf_proc_dump_build_key(key, key_prefix, "data_self_heal");
- gf_proc_dump_write(key, "%d", priv->data_self_heal);
- gf_proc_dump_build_key(key, key_prefix, "metadata_self_heal");
- gf_proc_dump_write(key, "%d", priv->metadata_self_heal);
- gf_proc_dump_build_key(key, key_prefix, "entry_self_heal");
- gf_proc_dump_write(key, "%d", priv->entry_self_heal);
- gf_proc_dump_build_key(key, key_prefix, "data_change_log");
- gf_proc_dump_write(key, "%d", priv->data_change_log);
- gf_proc_dump_build_key(key, key_prefix, "metadata_change_log");
- gf_proc_dump_write(key, "%d", priv->metadata_change_log);
- gf_proc_dump_build_key(key, key_prefix, "entry_change_log");
- gf_proc_dump_write(key, "%d", priv->entry_change_log);
- gf_proc_dump_build_key(key, key_prefix, "read_child");
- gf_proc_dump_write(key, "%d", priv->read_child);
- gf_proc_dump_build_key(key, key_prefix, "favorite_child");
- gf_proc_dump_write(key, "%u", priv->favorite_child);
- gf_proc_dump_build_key(key, key_prefix, "data_lock_server_count");
- gf_proc_dump_write(key, "%u", priv->data_lock_server_count);
- gf_proc_dump_build_key(key, key_prefix, "metadata_lock_server_count");
- gf_proc_dump_write(key, "%u", priv->metadata_lock_server_count);
- gf_proc_dump_build_key(key, key_prefix, "entry_lock_server_count");
- gf_proc_dump_write(key, "%u", priv->entry_lock_server_count);
- gf_proc_dump_build_key(key, key_prefix, "wait_count");
- gf_proc_dump_write(key, "%u", priv->wait_count);
+uint64_t
+afr_write_subvol_get(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ uint64_t write_subvol = 0;
- return 0;
-}
+ local = frame->local;
+ LOCK(&local->inode->lock);
+ write_subvol = local->inode_ctx->write_subvol;
+ UNLOCK(&local->inode->lock);
+ return write_subvol;
+}
-/**
- * find_child_index - find the child's index in the array of subvolumes
- * @this: AFR
- * @child: child
- */
+int
+afr_write_subvol_set(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *data_accused = NULL;
+ unsigned char *metadata_accused = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint64_t val = 0;
+ int event = 0;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+ data_accused = alloca0(priv->child_count);
+ metadata_accused = alloca0(priv->child_count);
+ data_readable = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+ event = local->event_generation;
+
+ afr_readables_fill(frame, this, local->inode, data_accused,
+ metadata_accused, data_readable, metadata_readable,
+ NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_readable[i])
+ datamap |= (1 << i);
+ if (metadata_readable[i])
+ metadatamap |= (1 << i);
+ }
+
+ val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+ (((uint64_t)event) << 32);
+
+ LOCK(&local->inode->lock);
+ {
+ if (local->inode_ctx->write_subvol == 0 &&
+ local->transaction.type == AFR_DATA_TRANSACTION) {
+ local->inode_ctx->write_subvol = val;
+ }
+ }
+ UNLOCK(&local->inode->lock);
+
+ return 0;
+}
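An illustrative, self-contained sketch (not GlusterFS code) of the 64-bit packing performed above: the metadata-readable map occupies bits 0-15, the data-readable map bits 16-31, and the event generation the upper 32 bits; the maps and generation below are made-up values.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint16_t metadatamap = 0x0003; /* bricks 0 and 1 metadata-readable */
    uint16_t datamap = 0x0001;     /* only brick 0 data-readable */
    uint32_t event = 7;            /* hypothetical event generation */

    /* Same packing expression as in afr_write_subvol_set(). */
    uint64_t val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
                   (((uint64_t)event) << 32);

    /* Unpacking follows the same layout. */
    printf("metadatamap=%#x datamap=%#x event=%u\n",
           (unsigned)(val & 0xffff), (unsigned)((val >> 16) & 0xffff),
           (unsigned)(val >> 32));
    return 0;
}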
-static int
-find_child_index (xlator_t *this, xlator_t *child)
+int
+afr_write_subvol_reset(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- int i = -1;
+ afr_local_t *local = NULL;
- priv = this->private;
+ local = frame->local;
+ LOCK(&local->inode->lock);
+ {
+ GF_ASSERT(local->inode_ctx->lock_count > 0);
+ local->inode_ctx->lock_count--;
- for (i = 0; i < priv->child_count; i++) {
- if ((xlator_t *) child == priv->children[i])
- break;
- }
+ if (!local->inode_ctx->lock_count)
+ local->inode_ctx->write_subvol = 0;
+ }
+ UNLOCK(&local->inode->lock);
- return i;
+ return 0;
}
-int32_t
-afr_notify (xlator_t *this, int32_t event,
- void *data, ...)
+int
+afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode)
{
- afr_private_t *priv = NULL;
- int i = -1;
- int up_children = 0;
- int down_children = 0;
- int propagate = 0;
+ int ret = 0;
+
+ local->inode = inode_ref(inode);
+ LOCK(&local->inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, local->inode, &local->inode_ctx);
+ }
+ UNLOCK(&local->inode->lock);
+ if (ret < 0) {
+ gf_msg_callingfn(
+ this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_INODE_CTX_GET_FAILED,
+ "Error getting inode ctx %s", uuid_utoa(local->inode->gfid));
+ }
+ return ret;
+}
- int had_heard_from_all = 0;
- int have_heard_from_all = 0;
- int idx = -1;
- int ret = -1;
+gf_boolean_t
+afr_ta_is_fop_called_from_synctask(xlator_t *this)
+{
+ struct synctask *task = NULL;
+ gf_lkowner_t tmp_owner = {
+ 0,
+ };
- priv = this->private;
+ task = synctask_get();
+ if (!task)
+ return _gf_false;
- if (!priv)
- return 0;
+ set_lk_owner_from_ptr(&tmp_owner, (void *)this);
- had_heard_from_all = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->last_event[i]) {
- had_heard_from_all = 0;
- }
- }
+ if (!is_same_lkowner(&tmp_owner, &task->frame->root->lk_owner))
+ return _gf_false;
- /* parent xlators dont need to know about every child_up, child_down
- * because of afr ha. If all subvolumes go down, child_down has
- * to be triggered. In that state when 1 subvolume comes up child_up
- * needs to be triggered. dht optimises revalidate lookup by sending
- * it only to one of its subvolumes. When child up/down happens
- * for afr's subvolumes dht should be notified by child_modified. The
- * subsequent revalidate lookup happens on all the dht's subvolumes
- * which triggers afr self-heals if any.
- */
- idx = find_child_index (this, data);
- if (idx < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Received child_up "
- "from invalid subvolume");
- goto out;
- }
+ return _gf_true;
+}
- switch (event) {
- case GF_EVENT_CHILD_UP:
- LOCK (&priv->lock);
- {
- priv->child_up[idx] = 1;
- priv->up_count++;
-
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i] == 1)
- up_children++;
- if (up_children == 1) {
- gf_log (this->name, GF_LOG_INFO,
- "Subvolume '%s' came back up; "
- "going online.", ((xlator_t *)data)->name);
- } else {
- event = GF_EVENT_CHILD_MODIFIED;
- }
-
- priv->last_event[idx] = event;
- }
- UNLOCK (&priv->lock);
+int
+afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
+{
+ int ret = 0;
+ uuid_t gfid = {
+ 0,
+ };
+ afr_private_t *priv = this->private;
+ gf_boolean_t locked = _gf_false;
+ struct gf_flock flock1 = {
+ 0,
+ };
+ struct gf_flock flock2 = {
+ 0,
+ };
+ int32_t cmd = 0;
+
+ /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock
+ * has been released in afr_notify due to upcall notification from shd.
+ */
+ GF_ASSERT(priv->ta_notify_dom_lock_offset == 0);
+
+ if (!priv->shd.iamshd)
+ GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+ flock1.l_type = F_WRLCK;
+
+ while (!locked) {
+ if (priv->shd.iamshd) {
+ cmd = F_SETLKW;
+ flock1.l_start = 0;
+ flock1.l_len = 0;
+ } else {
+ cmd = F_SETLK;
+ gf_uuid_generate(gfid);
+ flock1.l_start = gfid_to_ino(gfid);
+ if (flock1.l_start < 0)
+ flock1.l_start = -flock1.l_start;
+ flock1.l_len = 1;
+ }
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, loc, cmd, &flock1, NULL, NULL);
+ if (!ret) {
+ locked = _gf_true;
+ priv->ta_notify_dom_lock_offset = flock1.l_start;
+ } else if (ret == -EAGAIN) {
+ continue;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to get "
+ "AFR_TA_DOM_NOTIFY lock on %s.",
+ loc->name);
+ goto out;
+ }
+ }
+
+ flock2.l_type = F_WRLCK;
+ flock2.l_start = 0;
+ flock2.l_len = 0;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name);
+ flock1.l_type = F_UNLCK;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL,
+ NULL);
+ }
+out:
+ return ret;
+}
- break;
+int
+afr_ta_post_op_unlock(xlator_t *this, loc_t *loc)
+{
+ afr_private_t *priv = this->private;
+ struct gf_flock flock = {
+ 0,
+ };
+ int ret = 0;
+
+ if (!priv->shd.iamshd)
+ GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+ flock.l_type = F_UNLCK;
+ flock.l_start = 0;
+ flock.l_len = 0;
+
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_MODIFY lock.");
+ goto out;
+ }
+
+ if (!priv->shd.iamshd)
+        /* Mounts (clients) will not release the AFR_TA_DOM_NOTIFY lock
+         * in post-op as they use it as a notification mechanism. When
+         * the shd sends a lock request on the TA during heal, the clients
+         * will receive a lock-contention upcall notification, upon which
+         * they will release the AFR_TA_DOM_NOTIFY lock after completing
+         * the in-flight I/O. */
+ goto out;
+
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+ }
+out:
+ return ret;
+}
- case GF_EVENT_CHILD_DOWN:
- LOCK (&priv->lock);
- {
- priv->child_up[idx] = 0;
- priv->down_count++;
-
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i] == 0)
- down_children++;
- if (down_children == priv->child_count) {
- gf_log (this->name, GF_LOG_ERROR,
- "All subvolumes are down. Going offline "
- "until atleast one of them comes back up.");
- } else {
- event = GF_EVENT_CHILD_MODIFIED;
- }
-
- priv->last_event[idx] = event;
- }
- UNLOCK (&priv->lock);
+call_frame_t *
+afr_ta_frame_create(xlator_t *this)
+{
+ call_frame_t *frame = NULL;
+ void *lk_owner = NULL;
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ return NULL;
+ lk_owner = (void *)this;
+ afr_set_lk_owner(frame, this, lk_owner);
+ return frame;
+}
- break;
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local)
+{
+ int data_count = 0;
- case GF_EVENT_CHILD_CONNECTING:
- LOCK (&priv->lock);
- {
- priv->last_event[idx] = event;
- }
- UNLOCK (&priv->lock);
- break;
- default:
- propagate = 1;
- break;
- }
+ data_count = AFR_COUNT(local->child_up, priv->child_count);
+ if (data_count == 2) {
+ return _gf_true;
+ } else if (data_count == 1 && local->ta_child_up) {
+ return _gf_true;
+ }
- /* have all subvolumes reported status once by now? */
- have_heard_from_all = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->last_event[i])
- have_heard_from_all = 0;
- }
+ return _gf_false;
+}
- /* if all subvols have reported status, no need to hide anything
- or wait for anything else. Just propagate blindly */
- if (have_heard_from_all)
- propagate = 1;
+static gf_boolean_t
+afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
- if (!had_heard_from_all && have_heard_from_all) {
- /* This is the first event which completes aggregation
- of events from all subvolumes. If at least one subvol
- had come up, propagate CHILD_UP, but only this time
- */
- event = GF_EVENT_CHILD_DOWN;
-
- LOCK (&priv->lock);
- {
- for (i = 0; i < priv->child_count; i++) {
- if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
- event = GF_EVENT_CHILD_UP;
- break;
- }
-
- if (priv->last_event[i] ==
- GF_EVENT_CHILD_CONNECTING) {
- event = GF_EVENT_CHILD_CONNECTING;
- /* continue to check other events for CHILD_UP */
- }
- }
- }
- UNLOCK (&priv->lock);
- }
+ if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT)
+ return _gf_false;
- ret = 0;
- if (propagate)
- ret = default_notify (this, event, data);
+ local = frame->local;
-out:
- return ret;
-}
-void
-afr_reset_children (int32_t *fresh_children, int32_t child_count)
-{
- unsigned int i = 0;
- for (i = 0; i < child_count; i++)
- fresh_children[i] = -1;
+ if (local->op != GF_FOP_LOOKUP)
+        /* TODO: If the replica count is being increased on a plain distribute
+         * volume that was never mounted, we need to allow setxattr on '/' with
+         * GF_CLIENT_PID_NO_ROOT_SQUASH to accommodate the DHT layout setting */
+ return _gf_false;
+
+ if (local->inode == NULL)
+ return _gf_false;
+
+ if (!__is_root_gfid(local->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
}
-int32_t*
-afr_children_create (unsigned int child_count)
+gf_boolean_t
+afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count)
{
- int32_t *children = NULL;
- int i = 0;
+ if (frame && (up_children_count > 0) &&
+ afr_is_add_replica_mount_lookup_on_root(frame))
+ return _gf_true;
- children = GF_CALLOC (child_count, sizeof (*children),
- gf_afr_mt_int32_t);
- if (NULL == children)
- goto out;
- for (i = 0; i < child_count; i++)
- children[i] = -1;
-out:
- return children;
+ return _gf_false;
}
void
-afr_fresh_children_add_child (int32_t *fresh_children, int32_t child,
- int32_t child_count)
-{
- gf_boolean_t child_found = _gf_false;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (fresh_children[i] == -1)
- break;
- if (fresh_children[i] == child) {
- child_found = _gf_true;
- break;
- }
- }
- if (!child_found) {
- GF_ASSERT (i < child_count);
- fresh_children[i] = child;
- }
-}
-
-int
-afr_get_children_count (int32_t *fresh_children, unsigned int child_count)
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this)
{
- int count = 0;
- int i = 0;
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ unsigned char *success_replies = NULL;
- for (i = 0; i < child_count; i++) {
- if (fresh_children[i] == -1)
- break;
- count++;
- }
- return count;
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+
+ if (priv->quorum_count && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_errno = afr_final_errno(local, priv);
+ if (!local->op_errno)
+ local->op_errno = afr_quorum_errno(priv);
+ local->op_ret = -1;
+ }
}
-void
-afr_get_fresh_children (int32_t *success_children, int32_t *sources,
- int32_t *fresh_children, unsigned int child_count)
-{
- unsigned int i = 0;
- unsigned int j = 0;
-
- GF_ASSERT (success_children);
- GF_ASSERT (sources);
- GF_ASSERT (fresh_children);
-
- afr_reset_children (fresh_children, child_count);
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- if (afr_is_read_child (success_children, sources, child_count,
- success_children[i])) {
- fresh_children[j] = success_children[i];
- j++;
- }
- }
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child)
+{
+ int *pending = NULL;
+ int ret = 0;
+ int i = 0;
+
+ ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending);
+ if (ret == 0) {
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+            /* Not doing an ntoh32(pending) as we just want to check
+ * if it is non-zero or not. */
+ if (pending[i]) {
+ return _gf_true;
+ }
+ }
+ }
+
+ return _gf_false;
}
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 9e799b85813..f8bf8340dab 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -1,768 +1,346 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "checksum.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/list.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
#include "afr.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-
-int
-afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret,
- int32_t op_errno)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- afr_set_opendir_done (this, local->fd->inode);
-
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd);
-
- return 0;
-}
-
-
-gf_boolean_t
-__checksums_differ (uint32_t *checksum, int child_count,
- unsigned char *child_up)
-{
- int ret = _gf_false;
- int i = 0;
- uint32_t cksum = 0;
- gf_boolean_t activate_check = _gf_false;
-
- for (i = 0; i < child_count; i++) {
- if (!child_up[i])
- continue;
- if (_gf_false == activate_check) {
- cksum = checksum[i];
- activate_check = _gf_true;
- continue;
- }
-
- if (cksum != checksum[i]) {
- ret = _gf_true;
- break;
- }
-
- cksum = checksum[i];
- }
-
- return ret;
-}
-
+#include "afr-transaction.h"
int32_t
-afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries)
+afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- gf_dirent_t * entry = NULL;
- gf_dirent_t * tmp = NULL;
- int child_index = 0;
- uint32_t entry_cksum = 0;
- int call_count = 0;
- off_t last_offset = 0;
- char sh_type_str[256] = {0,};
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to do opendir on %s",
- local->loc.path, priv->children[child_index]->name);
- local->op_ret = -1;
- local->op_ret = op_errno;
- goto out;
- }
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int32_t child_index = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: no entries found in %s",
- local->loc.path, priv->children[child_index]->name);
- goto out;
- }
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+ child_index = (long)cookie;
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- entry_cksum = gf_rsync_weak_checksum (entry->d_name,
- strlen (entry->d_name));
- local->cont.opendir.checksum[child_index] ^= entry_cksum;
- }
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
+ local->op_ret = op_ret;
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
}
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- /* read more entries */
-
- STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->readdir,
- local->fd, 131072, last_offset);
-
- return 0;
-
-out:
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (__checksums_differ (local->cont.opendir.checksum,
- priv->child_count,
- local->child_up)) {
-
- sh->need_entry_self_heal = _gf_true;
- sh->forced_merge = _gf_true;
- sh->type = local->fd->inode->ia_type;
- sh->background = _gf_false;
- sh->unwind = afr_examine_dir_sh_unwind;
-
- afr_self_heal_type_str_get(&local->self_heal,
- sh_type_str,
- sizeof(sh_type_str));
- gf_log (this->name, GF_LOG_INFO,
- "%s self-heal triggered. path: %s, "
- "reason: checksums of directory differ,"
- " forced merge option set",
- sh_type_str, local->loc.path);
-
- afr_self_heal (frame, this, local->fd->inode);
- } else {
- afr_set_opendir_done (this, local->fd->inode);
-
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd);
- }
- }
+ if (call_count == 0) {
+ afr_handle_replies_quorum(frame, this);
+ AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno,
+ local->fd, NULL);
+ }
- return 0;
+ return 0;
}
-
int
-afr_examine_dir (call_frame_t *frame, xlator_t *this)
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- priv = this->private;
-
- local->cont.opendir.checksum = GF_CALLOC (priv->child_count,
- sizeof (*local->cont.opendir.checksum),
- gf_afr_mt_int32_t);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = -1;
+ int32_t op_errno = ENOMEM;
+ afr_fd_ctx_t *fd_ctx = NULL;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ priv = this->private;
- local->call_count = call_count;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->readdir,
- local->fd, 131072, 0);
+ local->op = GF_FOP_OPENDIR;
- if (!--call_count)
- break;
- }
- }
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ op_errno = afr_quorum_errno(priv);
+ goto out;
+ }
- return 0;
-}
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx)
+ goto out;
-int32_t
-afr_opendir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int32_t up_children_count = 0;
- int ret = -1;
- int call_count = -1;
+ loc_copy(&local->loc, loc);
- priv = this->private;
- local = frame->local;
+ local->fd = fd_ref(fd);
+ local->fd_ctx = fd_ctx;
- up_children_count = afr_up_children_count (priv->child_count,
- local->child_up);
+ call_count = local->call_count;
- LOCK (&frame->lock);
- {
- if (op_ret >= 0)
- local->op_ret = op_ret;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, afr_opendir_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->opendir, loc, fd, NULL);
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (local->op_ret != 0)
- goto out;
-
- ret = afr_fd_ctx_set (this, local->fd);
- if (ret) {
- local->op_ret = -1;
- local->op_errno = -1;
- gf_log (this->name, GF_LOG_ERROR,
- "failed to set fd ctx for fd %p",
- local->fd);
- goto out;
- }
- if (!afr_is_opendir_done (this, local->fd->inode) &&
- up_children_count > 1) {
-
- /*
- * This is the first opendir on this inode. We need
- * to check if the directory's entries are the same
- * on all subvolumes. This is needed in addition
- * to regular entry self-heal because the readdir
- * call is sent only to the first subvolume, and
- * thus files that exist only there will never be healed
- * otherwise (assuming changelog shows no anamolies).
- */
-
- gf_log (this->name, GF_LOG_TRACE,
- "reading contents of directory %s looking for mismatch",
- local->loc.path);
-
- afr_examine_dir (frame, this);
-
- } else {
- /* do the unwind */
- goto out;
- }
+ if (!--call_count)
+ break;
}
+ }
- return 0;
-
+ return 0;
out:
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd);
-
- return 0;
+ AFR_STACK_UNWIND(opendir, frame, -1, op_errno, fd, NULL);
+ return 0;
}
-
-int32_t
-afr_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd)
+static int
+afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int child_count = 0;
- int i = 0;
- int ret = -1;
- int call_count = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- child_count = priv->child_count;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- loc_copy (&local->loc, loc);
-
- frame->local = local;
- local->fd = fd_ref (fd);
-
- call_count = local->call_count;
-
- for (i = 0; i < child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_opendir_cbk,
- priv->children[i],
- priv->children[i]->fops->opendir,
- loc, fd);
-
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (opendir, frame, op_ret, op_errno, fd);
- }
-
+ int gen = 0;
+ int entry_read_subvol = 0;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ data_readable = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable,
+ &gen);
+
+ if (gen != priv->event_generation || !data_readable[par_read_subvol] ||
+ !metadata_readable[par_read_subvol])
+ return -1;
+
+ /* Once the control reaches the following statement, it means that the
+ * parent's read subvol is perfectly readable. So calling
+ * either afr_data_subvol_get() or afr_metadata_subvol_get() would
+ * yield the same result. Hence, choosing afr_data_subvol_get() below.
+ */
+
+ if (!priv->consistent_metadata)
return 0;
-}
-
-
-/**
- * Common algorithm for directory read calls:
- *
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- * try the next child
- *
- * Applicable to: readdir
- */
-
-struct entry_name {
- char *name;
- struct list_head list;
-};
-
-
-static gf_boolean_t
-remembered_name (const char *name, struct list_head *entries)
-{
- struct entry_name *e = NULL;
- gf_boolean_t ret = _gf_false;
-
- list_for_each_entry (e, entries, list) {
- if (!strcmp (name, e->name)) {
- ret = _gf_true;
- goto out;
- }
- }
-
-out:
- return ret;
+ /* For an inode fetched through readdirp which is yet to be linked,
+ * inode ctx would not be initialised (yet). So this function returns
+ * -1 above due to gen being 0, which is why it is OK to pass NULL for
+ * read_subvol_args here.
+ */
+ entry_read_subvol = afr_data_subvol_get(inode, this, NULL, NULL, NULL,
+ NULL);
+ if (entry_read_subvol != par_read_subvol)
+ return -1;
+
+ return 0;
}
-
static void
-afr_remember_entries (gf_dirent_t *entries, fd_t *fd)
+afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries,
+ int subvol, gf_dirent_t *entries, fd_t *fd)
{
- struct entry_name *n = NULL;
- gf_dirent_t *entry = NULL;
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
- return;
+ int ret = -1;
+ gf_dirent_t *entry = NULL;
+ gf_dirent_t *tmp = NULL;
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t need_heal = _gf_false;
+ gf_boolean_t validate_subvol = _gf_false;
+
+ this = THIS;
+ priv = this->private;
+
+ need_heal = afr_get_need_heal(this);
+ validate_subvol = need_heal | priv->consistent_metadata;
+
+ list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list)
+ {
+ if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name,
+ frame->root->pid)) {
+ continue;
}
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ list_del_init(&entry->list);
+ list_add_tail(&entry->list, &entries->list);
- list_for_each_entry (entry, &entries->list, list) {
- n = GF_CALLOC (1, sizeof (*n), gf_afr_mt_entry_name);
- n->name = gf_strdup (entry->d_name);
- INIT_LIST_HEAD (&n->list);
+ if (!validate_subvol)
+ continue;
- list_add (&n->list, &fd_ctx->entries);
+ if (entry->inode) {
+ ret = afr_validate_read_subvol(entry->inode, this, subvol);
+ if (ret == -1) {
+ inode_unref(entry->inode);
+ entry->inode = NULL;
+ continue;
+ }
}
+ }
}
-
-static off_t
-afr_filter_entries (gf_dirent_t *entries, fd_t *fd)
+int32_t
+afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries,
+ dict_t *xdata)
{
- gf_dirent_t *entry = NULL;
- gf_dirent_t *tmp = NULL;
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- off_t offset = 0;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
- return -1;
- }
+ afr_local_t *local = NULL;
+ gf_dirent_t entries;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ INIT_LIST_HEAD(&entries.list);
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- offset = entry->d_off;
+ local = frame->local;
- if (remembered_name (entry->d_name, &fd_ctx->entries)) {
- list_del (&entry->list);
- GF_FREE (entry);
- }
- }
+ if (op_ret < 0 && !local->cont.readdir.offset) {
+        /* Fail over only if this was the first readdir, detected
+           by offset == 0. */
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- return offset;
-}
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
+ if (op_ret >= 0)
+ afr_readdir_transform_entries(frame, subvol_entries, (long)cookie,
+ &entries, local->fd);
-static void
-afr_forget_entries (fd_t *fd)
-{
- struct entry_name *entry = NULL;
- struct entry_name *tmp = NULL;
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
- return;
- }
+ AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata);
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ gf_dirent_free(&entries);
- list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) {
- GF_FREE (entry->name);
- list_del (&entry->list);
- GF_FREE (entry);
- }
+ return 0;
}
-
-int32_t
-afr_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries)
+int
+afr_readdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- gf_dirent_t * entry = NULL;
- gf_dirent_t * tmp = NULL;
- int child_index = -1;
-
- priv = this->private;
- local = frame->local;
- child_index = (long) cookie;
-
- if (op_ret == -1)
- goto out;
-
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- entry->d_ino = afr_itransform (entry->d_ino,
- priv->child_count,
- child_index);
-
- if ((local->fd->inode == local->fd->inode->table->root)
- && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
- list_del_init (&entry->list);
- GF_FREE (entry);
- }
- }
-
-out:
- AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries);
-
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx) {
+ local->op_errno = EINVAL;
+ local->op_ret = -1;
+ }
+
+ if (subvol == -1 || !fd_ctx) {
+ AFR_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, 0, 0);
return 0;
+ }
+
+ fd_ctx->readdir_subvol = subvol;
+
+ if (local->op == GF_FOP_READDIR)
+ STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdir, local->fd,
+ local->cont.readdir.size, local->cont.readdir.offset,
+ local->xdata_req);
+ else
+ STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdirp, local->fd,
+ local->cont.readdir.size, local->cont.readdir.offset,
+ local->xdata_req);
+ return 0;
}
-
-int32_t
-afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries)
+int
+afr_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, int whichop, dict_t *dict)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- ino_t inum = 0;
- int call_child = 0;
- int first_call_child = 0;
- int ret = 0;
- gf_dirent_t * entry = NULL;
- gf_dirent_t * tmp = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- off_t offset = 0;
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
-
- first_call_child = (long) cookie;
- if (local->cont.readdir.last_tried == -1) {
- call_child = (long) cookie;
- } else {
- call_child = local->cont.readdir.last_tried;
- }
-
- if (priv->strict_readdir) {
- ret = fd_ctx_get (local->fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", local->fd);
- op_ret = -1;
- op_errno = -ret;
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if (child_went_down (op_ret, op_errno)) {
- if ((call_child + 1) % priv->child_count
- == first_call_child) {
- gf_log (this->name, GF_LOG_INFO,
- "all options tried going out");
- goto out;
- }
-
- call_child = (call_child + 1) % priv->child_count;
- local->cont.readdir.last_tried = call_child;
-
- gf_log (this->name, GF_LOG_TRACE,
- "starting readdir afresh on child %d, offset %"PRId64,
- call_child, (uint64_t) 0);
-
- fd_ctx->failed_over = _gf_true;
-
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) first_call_child,
- children[call_child],
- children[call_child]->fops->readdirp, local->fd,
- local->cont.readdir.size, 0);
- return 0;
- }
- }
-
- if (op_ret != -1) {
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- inum = afr_itransform (entry->d_ino, priv->child_count,
- call_child);
- entry->d_ino = inum;
- inum = afr_itransform (entry->d_stat.ia_ino,
- priv->child_count, call_child);
- entry->d_stat.ia_ino = inum;
-
- if ((local->fd->inode == local->fd->inode->table->root)
- && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
- list_del_init (&entry->list);
- GF_FREE (entry);
- }
- }
- }
-
- if (priv->strict_readdir) {
- if (fd_ctx->failed_over) {
- if (list_empty (&entries->list)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no entries found");
- goto out;
- }
-
- offset = afr_filter_entries (entries, local->fd);
-
- afr_remember_entries (entries, local->fd);
-
- if (list_empty (&entries->list)) {
- /* All the entries we got were duplicate. We
- shouldn't send an empty list now, because
- that'll make the application stop reading. So
- try to get more entries */
-
- gf_log (this->name, GF_LOG_TRACE,
- "trying to fetch non-duplicate entries "
- "from offset %"PRId64", child %s",
- offset, children[call_child]->name);
-
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) first_call_child,
- children[call_child],
- children[call_child]->fops->readdirp,
- local->fd, local->cont.readdir.size, offset);
- return 0;
- }
- } else {
- afr_remember_entries (entries, local->fd);
- }
- }
-
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int subvol = -1;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ local->op = whichop;
+ local->fd = fd_ref(fd);
+ local->cont.readdir.size = size;
+ local->cont.readdir.offset = offset;
+ local->xdata_req = (dict) ? dict_ref(dict) : NULL;
+
+ subvol = fd_ctx->readdir_subvol;
+
+ if (offset == 0 || subvol == -1) {
+ /* First readdir has option of failing over and selecting
+ an appropriate read subvolume */
+ afr_read_txn(frame, this, fd->inode, afr_readdir_wind,
+ AFR_DATA_TRANSACTION);
+ } else {
+ /* But continued readdirs MUST stick to the same subvolume
+ without an option to failover */
+ afr_readdir_wind(frame, this, subvol);
+ }
+
+ return 0;
out:
- AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries);
-
- return 0;
+ AFR_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
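/* Editor's illustration (not part of the patch): the readdir stickiness
 * rule implemented in afr_do_readdir() above, reduced to a standalone
 * sketch. The helper name below is hypothetical and does not exist in
 * the tree; it only restates the decision made above: a first readdir
 * (offset 0) or an fd with no cached subvolume goes through the read
 * transaction and may fail over, while every continued readdir must
 * reuse the subvolume remembered in fd_ctx so its d_off cookies remain
 * meaningful. */

#include <sys/types.h>

static int
afr_readdir_pick_subvol_sketch(off_t offset, int cached_subvol)
{
    if (offset == 0 || cached_subvol == -1)
        return -1; /* caller runs afr_read_txn() to (re)select a subvol */

    return cached_subvol; /* continued readdir: same subvol, no failover */
}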
-
int32_t
-afr_do_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, int whichop)
+afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t read_child = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- children = priv->children;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- frame->local = local;
-
- read_child = afr_read_child (this, fd->inode);
-
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
- local->cont.readdir.last_tried = -1;
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_INFO,
- "no child is up");
- goto out;
- }
- local->cont.readdir.last_tried = call_child;
- }
-
- local->fd = fd_ref (fd);
- local->cont.readdir.size = size;
-
- if (priv->strict_readdir) {
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
- op_errno = -ret;
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if (fd_ctx->last_tried != call_child) {
- gf_log (this->name, GF_LOG_TRACE,
- "first up child has changed from %d to %d, "
- "restarting readdir from offset 0",
- fd_ctx->last_tried, call_child);
+ afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIR, xdata);
- fd_ctx->failed_over = _gf_true;
- offset = 0;
- }
-
- fd_ctx->last_tried = call_child;
- }
-
- if (whichop == GF_FOP_READDIR)
- STACK_WIND_COOKIE (frame, afr_readdir_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readdir, fd,
- size, offset);
- else
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readdirp, fd,
- size, offset);
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, NULL);
- }
- return 0;
+ return 0;
}
-
int32_t
-afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
+afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *dict)
{
- afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR);
- return 0;
-}
+ afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIRP, dict);
-
-int32_t
-afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
-{
- afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP);
- return 0;
+ return 0;
}
-
int32_t
-afr_releasedir (xlator_t *this, fd_t *fd)
+afr_releasedir(xlator_t *this, fd_t *fd)
{
- afr_forget_entries (fd);
- afr_cleanup_fd_ctx (this, fd);
+ afr_cleanup_fd_ctx(this, fd);
- return 0;
+ return 0;
}
diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h
index ae853c9d852..773e925ec6c 100644
--- a/xlators/cluster/afr/src/afr-dir-read.h
+++ b/xlators/cluster/afr/src/afr-dir-read.h
@@ -1,50 +1,33 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __DIR_READ_H__
#define __DIR_READ_H__
-
int32_t
-afr_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd);
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata);
int32_t
-afr_releasedir (xlator_t *this, fd_t *fd);
+afr_releasedir(xlator_t *this, fd_t *fd);
int32_t
-afr_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset);
-
+afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata);
int32_t
-afr_readdirp (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset);
+afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *dict);
int32_t
-afr_getdents (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, int32_t flag);
-
-
-int32_t
-afr_checksum (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags);
-
+afr_checksum(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ dict_t *xdata);
#endif /* __DIR_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index c6f68a072c3..b7cceb79158 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -1,1318 +1,937 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
#include "afr.h"
#include "afr-transaction.h"
-
void
-afr_build_parent_loc (loc_t *parent, loc_t *child)
-{
- char *tmp = NULL;
-
- if (!child->parent) {
- //this should never be called with root as the child
- GF_ASSERT (0);
- loc_copy (parent, child);
- return;
- }
-
- tmp = gf_strdup (child->path);
- parent->path = gf_strdup (dirname (tmp));
- GF_FREE (tmp);
-
- parent->name = strrchr (parent->path, '/');
- if (parent->name)
- parent->name++;
-
- parent->inode = inode_ref (child->parent);
- parent->parent = inode_parent (parent->inode, 0, NULL);
- parent->ino = parent->inode->ino;
-
- if (!uuid_is_null (child->pargfid))
- uuid_copy (parent->gfid, child->pargfid);
-}
-
-/* {{{ create */
+afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this);
int
-afr_create_unwind (call_frame_t *frame, xlator_t *this)
+afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.create.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.create.read_child_buf;
- } else {
- unwind_buf = &local->cont.create.buf;
- }
-
- unwind_buf->ia_ino = local->cont.create.ino;
-
- local->cont.create.preparent.ia_ino = local->cont.create.parent_ino;
- local->cont.create.postparent.ia_ino = local->cont.create.parent_ino;
-
- AFR_STACK_UNWIND (create, main_frame,
- local->op_ret, local->op_errno,
- local->cont.create.fd,
- local->cont.create.inode,
- unwind_buf, &local->cont.create.preparent,
- &local->cont.create.postparent);
- }
+ int ret = -1;
+ char *child_path = NULL;
+
+ if (!child->parent) {
+ if (op_errno)
+ *op_errno = EINVAL;
+ goto out;
+ }
+
+ child_path = gf_strdup(child->path);
+ if (!child_path) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ parent->path = gf_strdup(dirname(child_path));
+ if (!parent->path) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ parent->inode = inode_ref(child->parent);
+ gf_uuid_copy(parent->gfid, child->pargfid);
+
+ ret = 0;
+out:
+ GF_FREE(child_path);
- return 0;
+ return ret;
}
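/* Editor's illustration (not part of the patch): the parent-path
 * derivation in afr_build_parent_loc() above copies the child path
 * before calling dirname(3), because POSIX allows dirname() to modify
 * its argument. Below is a minimal libc-only sketch of that piece; the
 * function name is hypothetical and the NULL-on-failure return mirrors
 * the ENOMEM handling of op_errno above. */

#include <libgen.h>
#include <stdlib.h>
#include <string.h>

static char *
parent_path_sketch(const char *child_path)
{
    char *copy = strdup(child_path); /* dirname() may scribble on this */
    char *parent = NULL;

    if (!copy)
        return NULL; /* maps to ENOMEM in the fop path */

    parent = strdup(dirname(copy)); /* "/dir/file" -> "/dir" */
    free(copy);

    return parent; /* caller frees; NULL again means ENOMEM */
}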
-
-int
-afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- fd_t *fd, inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+static void
+__afr_dir_write_finalize(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = 0;
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- ret = afr_fd_ctx_set (this, fd);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not set ctx on fd=%p", fd);
-
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not get fd ctx for fd=%p", fd);
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->opened_on[child_index] = 1;
- fd_ctx->flags = local->cont.create.flags;
-
- if (local->success_count == 0) {
- local->cont.create.buf = *buf;
-
- local->cont.create.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->first_up_child) {
- local->cont.create.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- local->first_up_child);
- }
-
- if (child_index == local->read_child_index) {
- local->cont.create.read_child_buf = *buf;
- local->cont.create.preparent = *preparent;
- local->cont.create.postparent = *postparent;
- }
-
- local->cont.create.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
-
-unlock:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int inode_read_subvol = -1;
+ int parent_read_subvol = -1;
+ int parent2_read_subvol = -1;
+ int i = 0;
+ afr_read_subvol_args_t args = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == -1)
+ continue;
+ gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid);
+ args.ia_type = local->replies[i].poststat.ia_type;
+ break;
+ }
+
+ if (local->inode) {
+ if (local->op != GF_FOP_RENAME && local->op != GF_FOP_LINK)
+ afr_replies_interpret(frame, this, local->inode, NULL);
+
+ inode_read_subvol = afr_data_subvol_get(local->inode, this, NULL, NULL,
+ NULL, &args);
+ }
+
+ if (local->parent)
+ parent_read_subvol = afr_data_subvol_get(local->parent, this, NULL,
+ local->readable, NULL, NULL);
+
+ if (local->parent2)
+ parent2_read_subvol = afr_data_subvol_get(local->parent2, this, NULL,
+ local->readable2, NULL, NULL);
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ afr_pick_error_xdata(local, priv, local->parent, local->readable,
+ local->parent2, local->readable2);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0) {
+ if (local->inode)
+ afr_inode_need_refresh_set(local->inode, this);
+ if (local->parent)
+ afr_inode_need_refresh_set(local->parent, this);
+ if (local->parent2)
+ afr_inode_need_refresh_set(local->parent2, this);
+ continue;
+ }
+
+ if (local->op_ret == -1) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.dir_fop.buf = local->replies[i].poststat;
+ local->cont.dir_fop.preparent = local->replies[i].preparent;
+ local->cont.dir_fop.postparent = local->replies[i].postparent;
+ local->cont.dir_fop.prenewparent = local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent = local->replies[i].postparent2;
+ if (local->xdata_rsp) {
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ }
+
+ if (local->replies[i].xdata)
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ continue;
+ }
+
+ if (i == inode_read_subvol) {
+ local->cont.dir_fop.buf = local->replies[i].poststat;
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ }
+
+ if (i == parent_read_subvol) {
+ local->cont.dir_fop.preparent = local->replies[i].preparent;
+ local->cont.dir_fop.postparent = local->replies[i].postparent;
+ }
+
+ if (i == parent2_read_subvol) {
+ local->cont.dir_fop.prenewparent = local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent = local->replies[i].postparent2;
+ }
+ }
}
+static void
+__afr_dir_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno, struct iatt *poststat,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+
+ if (op_ret >= 0) {
+ if (poststat)
+ local->replies[child_index].poststat = *poststat;
+ if (preparent)
+ local->replies[child_index].preparent = *preparent;
+ if (postparent)
+ local->replies[child_index].postparent = *postparent;
+ if (preparent2)
+ local->replies[child_index].preparent2 = *preparent2;
+ if (postparent2)
+ local->replies[child_index].postparent2 = *postparent2;
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ } else {
+ if (op_errno != ENOTEMPTY)
+ afr_transaction_fop_failed(frame, this, child_index);
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ }
+
+ return;
+}
-int
-afr_create_wind (call_frame_t *frame, xlator_t *this)
+static int
+__afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ priv = this->private;
+ local = frame->local;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ LOCK(&frame->lock);
+ {
+ __afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf,
+ preparent, postparent, preparent2, postparent2,
+ xdata);
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ if (call_count == 0) {
+ __afr_dir_write_finalize(frame, this);
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_create_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->create,
- &local->loc,
- local->cont.create.flags,
- local->cont.create.mode,
- local->cont.create.fd,
- local->cont.create.params);
- if (!--call_count)
- break;
- }
+ if (afr_txn_nothing_failed(frame, this)) {
+ /*if it did pre-op, it will do post-op changing ctime*/
+ if (priv->consistent_metadata && afr_needs_changelog_update(local))
+ afr_zero_fill_stat(local);
+ local->transaction.unwind(frame, this);
}
- return 0;
-}
+ afr_mark_entry_pending_changelog(frame, this);
+
+ afr_transaction_resume(frame, this);
+ }
+ return 0;
+}
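/* Editor's illustration (not part of the patch): the fan-out completion
 * pattern used by __afr_dir_write_cbk() above, in a generic form with
 * hypothetical names (a pthread mutex stands in for frame->lock). Each
 * per-subvolume callback records its reply under the lock and decrements
 * the outstanding-call counter; only the callback that drops the counter
 * to zero runs the finalize / unwind / resume sequence, so that sequence
 * executes exactly once. */

#include <pthread.h>

struct fanout_sketch {
    pthread_mutex_t lock; /* stands in for frame->lock */
    int outstanding;      /* set to the number of STACK_WINDs issued */
};

/* Returns 1 only for the reply that completes the fan-out. */
static int
fanout_reply_sketch(struct fanout_sketch *f)
{
    int last = 0;

    pthread_mutex_lock(&f->lock);
    last = (--f->outstanding == 0);
    pthread_mutex_unlock(&f->lock);

    return last; /* caller finalizes and resumes only when this is 1 */
}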
int
-afr_create_done (call_frame_t *frame, xlator_t *this)
+afr_mark_new_entry_changelog_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
{
- afr_local_t * local = NULL;
+ int call_count = 0;
- local = frame->local;
+ call_count = afr_frame_return(frame);
- local->transaction.unwind (frame, this);
+ if (call_count == 0)
+ AFR_STACK_DESTROY(frame);
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ return 0;
}
-
-int
-afr_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd, dict_t *params)
+void
+afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
+ call_frame_t *new_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *new_local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int32_t **changelog = NULL;
+ int i = 0;
+ int op_errno = ENOMEM;
+ unsigned char *pending = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ new_frame = copy_frame(frame);
+ if (!new_frame)
+ goto out;
+
+ new_local = AFR_FRAME_INIT(new_frame, op_errno);
+ if (!new_local)
+ goto out;
+
+ xattr = dict_new();
+ if (!xattr)
+ goto out;
+
+ pending = alloca0(priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ !local->transaction.failed_subvols[i]) {
+ call_count++;
+ continue;
+ }
+ pending[i] = 1;
+ }
+
+ changelog = afr_mark_pending_changelog(priv, pending, xattr,
+ local->cont.dir_fop.buf.ia_type);
+ if (!changelog)
+ goto out;
+
+ new_local->pending = changelog;
+ gf_uuid_copy(new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid);
+ new_local->loc.inode = inode_ref(local->inode);
+
+ new_local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (pending[i])
+ continue;
+
+ STACK_WIND_COOKIE(new_frame, afr_mark_new_entry_changelog_cbk,
+ (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->xattrop, &new_local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL);
+ if (!--call_count)
+ break;
+ }
+
+ new_frame = NULL;
+out:
+ if (new_frame)
+ AFR_STACK_DESTROY(new_frame);
+ if (xattr)
+ dict_unref(xattr);
+ return;
+}
- loc_copy (&local->loc, loc);
+void
+afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int pre_op_count = 0;
+ int failed_count = 0;
+ unsigned char *success_replies = NULL;
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ local = frame->local;
+ priv = this->private;
- local->cont.create.flags = flags;
- local->cont.create.mode = mode;
- local->cont.create.fd = fd_ref (fd);
- if (params)
- local->cont.create.params = dict_ref (params);
+ if (local->op_ret < 0)
+ return;
- if (loc->parent)
- local->cont.create.parent_ino = loc->parent->ino;
+ if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD &&
+ local->op != GF_FOP_MKDIR)
+ return;
- local->transaction.fop = afr_create_wind;
- local->transaction.done = afr_create_done;
- local->transaction.unwind = afr_create_unwind;
+ pre_op_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+ failed_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
+ /* FOP succeeded on all bricks. */
+ if (pre_op_count == priv->child_count && !failed_count)
+ return;
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
+ /* FOP did not succeed on quorum no. of bricks. */
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+ if (!afr_has_quorum(success_replies, this, NULL))
+ return;
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (priv->thin_arbiter_count) {
+ /*Mark new entry using ta file*/
+ local->is_new_entry = _gf_true;
+ return;
+ }
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (create, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ afr_mark_new_entry_changelog(frame, this);
- return 0;
+ return;
}
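/* Editor's illustration (not part of the patch): the core counting
 * checks of afr_mark_entry_pending_changelog() above, as a standalone
 * sketch with hypothetical names. `have_quorum' stands in for
 * afr_has_quorum() on the success replies. The new entry is marked
 * pending (or flagged for the thin-arbiter file) only when the fop
 * missed some brick yet still met quorum, so self-heal can recreate it
 * on the missing brick later. */

static int
needs_new_entry_mark_sketch(int child_count, const unsigned char *pre_op,
                            const unsigned char *failed, int have_quorum)
{
    int pre_op_count = 0;
    int failed_count = 0;
    int i = 0;

    for (i = 0; i < child_count; i++) {
        pre_op_count += !!pre_op[i];
        failed_count += !!failed[i];
    }

    if (pre_op_count == child_count && !failed_count)
        return 0; /* succeeded on all bricks: nothing to mark */

    return have_quorum; /* mark only if the fop still met quorum */
}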
-/* }}} */
-
-/* {{{ mknod */
+/* {{{ create */
int
-afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
+afr_create_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- if (main_frame) {
- if (local->cont.mknod.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.mknod.read_child_buf;
- } else {
- unwind_buf = &local->cont.mknod.buf;
- }
+ local = frame->local;
- unwind_buf->ia_ino = local->cont.mknod.ino;
-
- local->cont.mknod.preparent.ia_ino = local->cont.mknod.parent_ino;
- local->cont.mknod.postparent.ia_ino = local->cont.mknod.parent_ino;
-
- AFR_STACK_UNWIND (mknod, main_frame,
- local->op_ret, local->op_errno,
- local->cont.mknod.inode,
- unwind_buf, &local->cont.mknod.preparent,
- &local->cont.mknod.postparent);
- }
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(create, main_frame, local->op_ret, local->op_errno,
+ local->cont.create.fd, local->inode,
+ &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
+afr_create_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0){
- local->cont.mknod.buf = *buf;
- local->cont.mknod.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->first_up_child) {
- local->cont.mknod.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- local->first_up_child);
- }
-
- if (child_index == local->read_child_index) {
- local->cont.mknod.read_child_buf = *buf;
- local->cont.mknod.preparent = *preparent;
- local->cont.mknod.postparent = *postparent;
- }
-
- local->cont.mknod.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
-int32_t
-afr_mknod_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mknod,
- &local->loc, local->cont.mknod.mode,
- local->cont.mknod.dev,
- local->cont.mknod.params);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
int
-afr_mknod_done (call_frame_t *frame, xlator_t *this)
+afr_create_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_create_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->create, &local->loc,
+ local->cont.create.flags, local->cont.create.mode,
+ local->umask, local->cont.create.fd, local->xdata_req);
+ return 0;
}
-
int
-afr_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t dev, dict_t *params)
+afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
-
- local->cont.mknod.mode = mode;
- local->cont.mknod.dev = dev;
- if (params)
- local->cont.mknod.params = dict_ref (params);
-
- if (loc->parent)
- local->cont.mknod.parent_ino = loc->parent->ino;
-
- local->transaction.fop = afr_mknod_wind;
- local->transaction.done = afr_mknod_done;
- local->transaction.unwind = afr_mknod_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+
+ local->fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!local->fd_ctx)
+ goto out;
+
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->op = GF_FOP_CREATE;
+ local->cont.create.flags = flags;
+ local->fd_ctx->flags = flags;
+ local->cont.create.mode = mode;
+ local->cont.create.fd = fd_ref(fd);
+ local->umask = umask;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_create_wind;
+ local->transaction.unwind = afr_create_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (mknod, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
-/* {{{ mkdir */
-
+/* {{{ mknod */
int
-afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
+afr_mknod_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.mkdir.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.mkdir.read_child_buf;
- } else {
- unwind_buf = &local->cont.mkdir.buf;
- }
-
- unwind_buf->ia_ino = local->cont.mkdir.ino;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local->cont.mkdir.preparent.ia_ino = local->cont.mkdir.parent_ino;
- local->cont.mkdir.postparent.ia_ino = local->cont.mkdir.parent_ino;
-
- AFR_STACK_UNWIND (mkdir, main_frame,
- local->op_ret, local->op_errno,
- local->cont.mkdir.inode,
- unwind_buf, &local->cont.mkdir.preparent,
- &local->cont.mkdir.postparent);
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
-
-int
-afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0) {
- local->cont.mkdir.buf = *buf;
-
- local->cont.mkdir.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->first_up_child) {
- local->cont.mkdir.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- local->first_up_child);
- }
-
- if (child_index == local->read_child_index) {
- local->cont.mkdir.read_child_buf = *buf;
- local->cont.mkdir.preparent = *preparent;
- local->cont.mkdir.postparent = *postparent;
- }
-
- local->cont.mkdir.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND(mknod, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
-
int
-afr_mkdir_wind (call_frame_t *frame, xlator_t *this)
+afr_mknod_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mkdir,
- &local->loc, local->cont.mkdir.mode,
- local->cont.mkdir.params);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_mkdir_done (call_frame_t *frame, xlator_t *this)
+afr_mknod_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_mknod_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mknod, &local->loc,
+ local->cont.mknod.mode, local->cont.mknod.dev,
+ local->umask, local->xdata_req);
+ return 0;
}
-
int
-afr_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dict_t *params)
+afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t dev, mode_t umask, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
-
- local->cont.mkdir.mode = mode;
- if (params)
- local->cont.mkdir.params = dict_ref (params);
-
- if (loc->parent)
- local->cont.mkdir.parent_ino = loc->parent->ino;
-
- local->transaction.fop = afr_mkdir_wind;
- local->transaction.done = afr_mkdir_done;
- local->transaction.unwind = afr_mkdir_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->op = GF_FOP_MKNOD;
+ local->cont.mknod.mode = mode;
+ local->cont.mknod.dev = dev;
+ local->umask = umask;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_mknod_wind;
+ local->transaction.unwind = afr_mknod_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
-
- AFR_STACK_UNWIND (mkdir, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
-/* {{{ link */
-
+/* {{{ mkdir */
int
-afr_link_unwind (call_frame_t *frame, xlator_t *this)
+afr_mkdir_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.link.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.link.read_child_buf;
- } else {
- unwind_buf = &local->cont.link.buf;
- }
-
- unwind_buf->ia_ino = local->cont.link.ino;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local->cont.link.preparent.ia_ino = local->cont.link.parent_ino;
- local->cont.link.postparent.ia_ino = local->cont.link.parent_ino;
-
- AFR_STACK_UNWIND (link, main_frame,
- local->op_ret, local->op_errno,
- local->cont.link.inode,
- unwind_buf, &local->cont.link.preparent,
- &local->cont.link.postparent);
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(mkdir, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+afr_mkdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, inode_t *inode,
struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0) {
- local->cont.link.buf = *buf;
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->read_child_index) {
- local->cont.link.read_child_buf = *buf;
- local->cont.link.preparent = *preparent;
- local->cont.link.postparent = *postparent;
- }
-
- local->cont.link.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
+}
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
+int
+afr_mkdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local->transaction.resume (frame, this);
- }
+ local = frame->local;
+ priv = this->private;
- return 0;
+ STACK_WIND_COOKIE(frame, afr_mkdir_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mkdir, &local->loc,
+ local->cont.mkdir.mode, local->umask, local->xdata_req);
+ return 0;
}
-
int
-afr_link_wind (call_frame_t *frame, xlator_t *this)
+afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->cont.mkdir.mode = mode;
+ local->umask = umask;
+
+ if (!xdata || !dict_get_sizen(xdata, "gfid-req")) {
+ op_errno = EPERM;
+ gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_GFID_NULL,
+ "mkdir: %s is received "
+ "without gfid-req %p",
+ loc->path, xdata);
+ goto out;
+ }
+
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ if (!local->xdata_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->op = GF_FOP_MKDIR;
+ local->transaction.wind = afr_mkdir_wind;
+ local->transaction.unwind = afr_mkdir_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ AFR_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ return 0;
+}
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+/* }}} */
- local->call_count = call_count;
+/* {{{ link */
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->link,
- &local->loc,
- &local->newloc);
+int
+afr_link_unwind(call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- if (!--call_count)
- break;
- }
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(link, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_link_done (call_frame_t *frame, xlator_t *this)
+afr_link_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = frame->local;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
+}
- local->transaction.unwind (frame, this);
+int
+afr_link_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ priv = this->private;
- return 0;
+ STACK_WIND_COOKIE(frame, afr_link_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->link, &local->loc,
+ &local->newloc, local->xdata_req);
+ return 0;
}
-
int
-afr_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
+afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- loc_copy (&local->loc, oldloc);
- loc_copy (&local->newloc, newloc);
+ loc_copy(&local->loc, oldloc);
+ loc_copy(&local->newloc, newloc);
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ local->inode = inode_ref(oldloc->inode);
+ local->parent = inode_ref(newloc->parent);
- local->cont.link.ino = oldloc->inode->ino;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (oldloc->parent)
- local->cont.link.parent_ino = newloc->parent->ino;
+ if (!local->xdata_req)
+ goto out;
- local->transaction.fop = afr_link_wind;
- local->transaction.done = afr_link_done;
- local->transaction.unwind = afr_link_unwind;
+ local->op = GF_FOP_LINK;
- afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
+ local->transaction.wind = afr_link_wind;
+ local->transaction.unwind = afr_link_unwind;
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (oldloc->path);
- local->transaction.new_basename = AFR_BASENAME (newloc->path);
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, newloc,
+ &op_errno);
+ if (ret)
+ goto out;
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(newloc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (link, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
/* {{{ symlink */
-
int
-afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
+afr_symlink_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.symlink.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.symlink.read_child_buf;
- } else {
- unwind_buf = &local->cont.symlink.buf;
- }
-
- unwind_buf->ia_ino = local->cont.symlink.ino;
-
- local->cont.symlink.preparent.ia_ino = local->cont.symlink.parent_ino;
- local->cont.symlink.postparent.ia_ino = local->cont.symlink.parent_ino;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- AFR_STACK_UNWIND (symlink, main_frame,
- local->op_ret, local->op_errno,
- local->cont.symlink.inode,
- unwind_buf, &local->cont.symlink.preparent,
- &local->cont.symlink.postparent);
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
-
-
-int
-afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0) {
- local->cont.symlink.buf = *buf;
- local->cont.symlink.ino =
- afr_itransform (buf->ia_ino, priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->first_up_child) {
- local->cont.symlink.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- local->first_up_child);
- }
-
- if (child_index == local->read_child_index) {
- local->cont.symlink.read_child_buf = *buf;
- local->cont.symlink.preparent = *preparent;
- local->cont.symlink.postparent = *postparent;
- }
-
- local->cont.symlink.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND(symlink, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
-
int
-afr_symlink_wind (call_frame_t *frame, xlator_t *this)
+afr_symlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->symlink,
- local->cont.symlink.linkpath,
- &local->loc,
- local->cont.symlink.params);
-
- if (!--call_count)
- break;
-
- }
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_symlink_done (call_frame_t *frame, xlator_t *this)
+afr_symlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_symlink_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->symlink,
+ local->cont.symlink.linkpath, &local->loc, local->umask,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *loc, dict_t *params)
+afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
-
- local->cont.symlink.linkpath = gf_strdup (linkpath);
- if (params)
- local->cont.symlink.params = dict_ref (params);
-
- if (loc->parent)
- local->cont.symlink.parent_ino = loc->parent->ino;
-
- local->transaction.fop = afr_symlink_wind;
- local->transaction.done = afr_symlink_done;
- local->transaction.unwind = afr_symlink_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->cont.symlink.linkpath = gf_strdup(linkpath);
+ local->umask = umask;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_SYMLINK;
+ local->transaction.wind = afr_symlink_wind;
+ local->transaction.unwind = afr_symlink_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (symlink, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
@@ -1320,223 +939,118 @@ out:
/* {{{ rename */
int
-afr_rename_unwind (call_frame_t *frame, xlator_t *this)
+afr_rename_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.rename.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.rename.read_child_buf;
- } else {
- unwind_buf = &local->cont.rename.buf;
- }
-
- unwind_buf->ia_ino = local->cont.rename.ino;
-
- local->cont.rename.preoldparent.ia_ino = local->cont.rename.oldparent_ino;
- local->cont.rename.postoldparent.ia_ino = local->cont.rename.oldparent_ino;
- local->cont.rename.prenewparent.ia_ino = local->cont.rename.newparent_ino;
- local->cont.rename.postnewparent.ia_ino = local->cont.rename.newparent_ino;
-
- AFR_STACK_UNWIND (rename, main_frame,
- local->op_ret, local->op_errno,
- unwind_buf,
- &local->cont.rename.preoldparent,
- &local->cont.rename.postoldparent,
- &local->cont.rename.prenewparent,
- &local->cont.rename.postnewparent);
- }
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(rename, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent,
+ &local->cont.dir_fop.prenewparent,
+ &local->cont.dir_fop.postnewparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent)
+afr_rename_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY)
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
-
- if (buf) {
- local->cont.rename.buf = *buf;
- }
-
- local->success_count++;
- }
-
- if (child_index == local->read_child_index) {
- local->cont.rename.read_child_buf = *buf;
-
- local->cont.rename.preoldparent = *preoldparent;
- local->cont.rename.postoldparent = *postoldparent;
- local->cont.rename.prenewparent = *prenewparent;
- local->cont.rename.postnewparent = *postnewparent;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preoldparent, postoldparent, prenewparent,
+ postnewparent, xdata);
}
-
-int32_t
-afr_rename_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_rename_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rename,
- &local->loc,
- &local->newloc);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
int
-afr_rename_done (call_frame_t *frame, xlator_t *this)
+afr_rename_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
+ priv = this->private;
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ STACK_WIND_COOKIE(frame, afr_rename_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rename, &local->loc,
+ &local->newloc, local->xdata_req);
+ return 0;
}
-
int
-afr_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
+afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, oldloc);
- loc_copy (&local->newloc, newloc);
-
- local->read_child_index = afr_read_child (this, oldloc->inode);
-
- local->cont.rename.ino = oldloc->inode->ino;
-
- if (oldloc->parent)
- local->cont.rename.oldparent_ino = oldloc->parent->ino;
- if (newloc->parent)
- local->cont.rename.newparent_ino = newloc->parent->ino;
-
- local->transaction.fop = afr_rename_wind;
- local->transaction.done = afr_rename_done;
- local->transaction.unwind = afr_rename_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
- afr_build_parent_loc (&local->transaction.new_parent_loc, newloc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (oldloc->path);
- local->transaction.new_basename = AFR_BASENAME (newloc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, oldloc);
+ loc_copy(&local->newloc, newloc);
+
+ local->inode = inode_ref(oldloc->inode);
+ local->parent = inode_ref(oldloc->parent);
+ local->parent2 = inode_ref(newloc->parent);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_RENAME;
+ local->transaction.wind = afr_rename_wind;
+ local->transaction.unwind = afr_rename_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, oldloc,
+ &op_errno);
+ if (ret)
+ goto out;
+ ret = afr_build_parent_loc(&local->transaction.new_parent_loc, newloc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(oldloc->path);
+ local->transaction.new_basename = AFR_BASENAME(newloc->path);
+ ret = afr_transaction(transaction_frame, this,
+ AFR_ENTRY_RENAME_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
-
- AFR_STACK_UNWIND (rename, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
@@ -1544,399 +1058,205 @@ out:
/* {{{ unlink */
int
-afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
+afr_unlink_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- if (main_frame) {
- local->cont.unlink.preparent.ia_ino = local->cont.unlink.parent_ino;
- local->cont.unlink.postparent.ia_ino = local->cont.unlink.parent_ino;
-
- AFR_STACK_UNWIND (unlink, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.unlink.preparent,
- &local->cont.unlink.postparent);
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(unlink, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent)
+afr_unlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (child_index == local->read_child_index) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.unlink.preparent = *preparent;
- local->cont.unlink.postparent = *postparent;
- }
-
- if (child_index == local->read_child_index) {
- local->cont.unlink.preparent = *preparent;
- local->cont.unlink.postparent = *postparent;
- }
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
-
-int32_t
-afr_unlink_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->unlink,
- &local->loc);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int32_t
-afr_unlink_done (call_frame_t *frame, xlator_t *this)
+int
+afr_unlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ priv = this->private;
- return 0;
+ STACK_WIND_COOKIE(frame, afr_unlink_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->unlink, &local->loc,
+ local->xflag, local->xdata_req);
+ return 0;
}
-
-int32_t
-afr_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+int
+afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- if (loc->parent)
- local->cont.unlink.parent_ino = loc->parent->ino;
-
- local->transaction.fop = afr_unlink_wind;
- local->transaction.done = afr_unlink_done;
- local->transaction.unwind = afr_unlink_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->xflag = xflag;
+
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_UNLINK;
+ local->transaction.wind = afr_unlink_wind;
+ local->transaction.unwind = afr_unlink_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (unlink, frame, op_ret, op_errno,
- NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
/* {{{ rmdir */
-
-
int
-afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
+afr_rmdir_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- local->cont.rmdir.preparent.ia_ino = local->cont.rmdir.parent_ino;
- local->cont.rmdir.postparent.ia_ino = local->cont.rmdir.parent_ino;
-
- AFR_STACK_UNWIND (rmdir, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.rmdir.preparent,
- &local->cont.rmdir.postparent);
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
-
-
-int
-afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
- int read_child = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY))
- afr_transaction_fop_failed (frame, this, child_index);
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.rmdir.preparent = *preparent;
- local->cont.rmdir.postparent = *postparent;
-
- }
-
- if (child_index == read_child) {
- local->cont.rmdir.preparent = *preparent;
- local->cont.rmdir.postparent = *postparent;
- }
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
-
int
-afr_rmdir_wind (call_frame_t *frame, xlator_t *this)
+afr_rmdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rmdir,
- &local->loc, local->cont.rmdir.flags);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_rmdir_done (call_frame_t *frame, xlator_t *this)
+afr_rmdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ priv = this->private;
- return 0;
+ STACK_WIND_COOKIE(frame, afr_rmdir_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rmdir, &local->loc,
+ local->cont.rmdir.flags, local->xdata_req);
+ return 0;
}
-
int
-afr_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags)
+afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- local->cont.rmdir.flags = flags;
- loc_copy (&local->loc, loc);
-
- if (loc->parent)
- local->cont.rmdir.parent_ino = loc->parent->ino;
-
- local->transaction.fop = afr_rmdir_wind;
- local->transaction.done = afr_rmdir_done;
- local->transaction.unwind = afr_rmdir_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->cont.rmdir.flags = flags;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_RMDIR;
+ local->transaction.wind = afr_rmdir_wind;
+ local->transaction.unwind = afr_rmdir_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (rmdir, frame, op_ret, op_errno,
- NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h
index 0290c6350b3..1d88c3b9b26 100644
--- a/xlators/cluster/afr/src/afr-dir-write.h
+++ b/xlators/cluster/afr/src/afr-dir-write.h
@@ -1,60 +1,46 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __DIR_WRITE_H__
#define __DIR_WRITE_H__
int32_t
-afr_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd, dict_t *params);
+afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
int32_t
-afr_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t dev, dict_t *params);
+afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t dev, mode_t umask, dict_t *xdata);
int32_t
-afr_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dict_t *params);
+afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata);
int32_t
-afr_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc);
+afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata);
int32_t
-afr_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags);
+afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata);
int32_t
-afr_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc);
+afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
int32_t
-afr_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc);
+afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
int
-afr_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *oldloc, dict_t *params);
-
-int32_t
-afr_setdents (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count);
+afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *oldloc, mode_t umask, dict_t *params);
#endif /* __DIR_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index 0f934f8028f..c5521704de2 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -1,23 +1,13 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#include <libgen.h>
#include <unistd.h>
#include <fnmatch.h>
@@ -25,1058 +15,1880 @@
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-
+#include <glusterfs/glusterfs.h>
#include "afr.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/quota-common-utils.h>
+
+#include "afr-transaction.h"
+#include "afr-messages.h"
-
-/**
- * Common algorithm for inode read calls:
- *
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- * try the next child
- *
- * Applicable to: access, stat, fstat, readlink, getxattr
- */
+/*
+ * Quota size xattrs are not maintained by afr, so they may differ across
+ * bricks even when the directory changelog xattrs suggest everything is in
+ * sync. If there is at least one 'source', check among the sources for the
+ * one with the maximum quota size; otherwise check among all the available
+ * replies for the maximum quota size. This way, if there are a source and
+ * stale copies, the result always comes from the 'source'.
+ */
+
+int
+afr_handle_quota_size(call_frame_t *frame, xlator_t *this)
+{
+ unsigned char *readable = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0;
+ int ret = 0;
+ quota_meta_t size = {
+ 0,
+ };
+ quota_meta_t max_size = {
+ 0,
+ };
+ int readable_cnt = 0;
+ int read_subvol = -1;
+
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+
+ readable = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(local->inode, this, readable, 0, 0);
+
+ readable_cnt = AFR_COUNT(readable, priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ ret = quota_dict_get_meta(replies[i].xdata, QUOTA_SIZE_KEY,
+ SLEN(QUOTA_SIZE_KEY), &size);
+ if (ret == -1)
+ continue;
+ if (read_subvol == -1)
+ read_subvol = i;
+ if (size.size > max_size.size ||
+ (size.file_count + size.dir_count) >
+ (max_size.file_count + max_size.dir_count))
+ read_subvol = i;
+
+ if (size.size > max_size.size)
+ max_size.size = size.size;
+ if (size.file_count > max_size.file_count)
+ max_size.file_count = size.file_count;
+ if (size.dir_count > max_size.dir_count)
+ max_size.dir_count = size.dir_count;
+ }
+
+ if (max_size.size == 0 && max_size.file_count == 0 &&
+ max_size.dir_count == 0)
+ return read_subvol;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ quota_dict_set_meta(replies[i].xdata, QUOTA_SIZE_KEY, &max_size,
+ IA_IFDIR);
+ }
+
+ return read_subvol;
+}
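
The selection done by afr_handle_quota_size() above is easier to follow in isolation. The following is a minimal, self-contained sketch of the same idea, using made-up stand-in types rather than the real quota_meta_t and struct afr_reply: restrict the candidate set to 'readable' replies when any exist, vote for the reply advertising the largest size or entry count, and track the component-wise maximum alongside it.

    #include <stdio.h>

    /* Illustrative stand-ins for quota_meta_t and struct afr_reply. */
    struct meta { long long size, file_count, dir_count; };
    struct reply { int valid; int op_ret; struct meta m; };

    /* Return the index of the reply to serve, or -1 if none qualifies.
     * 'readable' marks the known-good (source) subvolumes; if at least
     * one reply is readable, only readable replies are considered. */
    static int pick_quota_reply(const struct reply *r,
                                const unsigned char *readable, int count,
                                struct meta *max_out)
    {
        struct meta max = {0, 0, 0};
        int readable_cnt = 0, pick = -1, i;

        for (i = 0; i < count; i++)
            readable_cnt += readable[i] ? 1 : 0;

        for (i = 0; i < count; i++) {
            if (!r[i].valid || r[i].op_ret == -1)
                continue;
            if (readable_cnt && !readable[i])
                continue;
            if (pick == -1)
                pick = i;
            if (r[i].m.size > max.size ||
                (r[i].m.file_count + r[i].m.dir_count) >
                    (max.file_count + max.dir_count))
                pick = i;
            if (r[i].m.size > max.size)
                max.size = r[i].m.size;
            if (r[i].m.file_count > max.file_count)
                max.file_count = r[i].m.file_count;
            if (r[i].m.dir_count > max.dir_count)
                max.dir_count = r[i].m.dir_count;
        }
        *max_out = max;
        return pick;
    }

    int main(void)
    {
        /* Two replicas: the stale copy (index 0) reports a smaller size. */
        struct reply r[2] = {
            {1, 0, {100, 3, 1}},
            {1, 0, {250, 5, 2}},
        };
        unsigned char readable[2] = {0, 1}; /* only index 1 is a 'source' */
        struct meta max;
        int pick = pick_quota_reply(r, readable, 2, &max);

        printf("serve reply %d, max size %lld\n", pick, max.size);
        return 0;
    }

Compiled standalone, this prints "serve reply 1, max size 250": the 'source' copy wins over the stale one, matching the intent described in the comment above.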
/* {{{ access */
-int32_t
-afr_access_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+int
+afr_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ afr_local_t *local = NULL;
- priv = this->private;
- children = priv->children;
+ local = frame->local;
- local = frame->local;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- read_child = (long) cookie;
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.access.last_tried;
+ AFR_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
- if (all_tried (last_tried, priv->child_count)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: all subvolumes tried, going out",
- local->loc.path);
- goto out;
- }
- this_try = ++local->cont.access.last_tried;
+ return 0;
+}
- if (this_try == read_child) {
- goto retry;
- }
+int
+afr_access_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- unwind = 0;
+ priv = this->private;
+ local = frame->local;
- STACK_WIND_COOKIE (frame, afr_access_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->access,
- &local->loc, local->cont.access.mask);
- }
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(access, frame, local->op_ret, local->op_errno, 0);
+ return 0;
+ }
+ STACK_WIND_COOKIE(frame, afr_access_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->access, &local->loc,
+ local->cont.access.mask, local->xdata_req);
+ return 0;
+}
+
+int
+afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int mask,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = 0;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_ACCESS;
+ loc_copy(&local->loc, loc);
+ local->cont.access.mask = mask;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ afr_read_txn(frame, this, loc->inode, afr_access_wind,
+ AFR_METADATA_TRANSACTION);
+
+ return 0;
out:
- if (unwind) {
- AFR_STACK_UNWIND (access, frame, op_ret, op_errno);
- }
+ AFR_STACK_UNWIND(access, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
+/* }}} */
-int32_t
-afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask)
+/* {{{ stat */
+
+int
+afr_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
- int32_t read_child = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = frame->local;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- children = priv->children;
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ AFR_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
- frame->local = local;
+ return 0;
+}
- read_child = afr_read_child (this, loc->inode);
+int
+afr_stat_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+ priv = this->private;
+ local = frame->local;
- local->cont.access.last_tried = -1;
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, 0, 0);
+ return 0;
+ }
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_INFO,
- "%s: no child is up", loc->path);
- goto out;
- }
+ STACK_WIND_COOKIE(
+ frame, afr_stat_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->stat, &local->loc, local->xdata_req);
+ return 0;
+}
- local->cont.access.last_tried = call_child;
- }
+int
+afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = 0;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- loc_copy (&local->loc, loc);
- local->cont.access.mask = mask;
+ local->op = GF_FOP_STAT;
+ loc_copy(&local->loc, loc);
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- STACK_WIND_COOKIE (frame, afr_access_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->access,
- loc, mask);
+ afr_read_txn(frame, this, loc->inode, afr_stat_wind, AFR_DATA_TRANSACTION);
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (access, frame, op_ret, op_errno);
- }
+ AFR_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ fstat */
+
+int
+afr_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ afr_read_txn_continue(frame, this, (long)cookie);
return 0;
+ }
+
+ AFR_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata);
+
+ return 0;
}
+int
+afr_fstat_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
-/* }}} */
+ priv = this->private;
+ local = frame->local;
-/* {{{ stat */
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(
+ frame, afr_fstat_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->fstat, local->fd, local->xdata_req);
+ return 0;
+}
int32_t
-afr_stat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf)
+afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- priv = this->private;
- children = priv->children;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- read_child = (long) cookie;
+ local->op = GF_FOP_FSTAT;
+ local->fd = fd_ref(fd);
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- local = frame->local;
+ afr_fix_open(fd, this);
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.stat.last_tried;
+ afr_read_txn(frame, this, fd->inode, afr_fstat_wind, AFR_DATA_TRANSACTION);
- if (all_tried (last_tried, priv->child_count)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: all subvolumes tried, going out",
- local->loc.path);
- goto out;
- }
- this_try = ++local->cont.stat.last_tried;
+ return 0;
+out:
+ AFR_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
- if (this_try == read_child) {
- goto retry;
- }
+ return 0;
+}
- unwind = 0;
+/* }}} */
- STACK_WIND_COOKIE (frame, afr_stat_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->stat,
- &local->loc);
- }
+/* {{{ readlink */
-out:
- if (unwind) {
- if (buf)
- buf->ia_ino = local->cont.stat.ino;
+int
+afr_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, const char *buf,
+ struct iatt *sbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf);
- }
+ local = frame->local;
+
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ afr_read_txn_continue(frame, this, (long)cookie);
return 0;
-}
+ }
+ AFR_STACK_UNWIND(readlink, frame, op_ret, op_errno, buf, sbuf, xdata);
+ return 0;
+}
-int32_t
-afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc)
+int
+afr_readlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int32_t read_child = -1;
- int call_child = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = frame->local;
+ priv = this->private;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(readlink, frame, local->op_ret, local->op_errno, 0, 0,
+ 0);
+ return 0;
+ }
- children = priv->children;
+ STACK_WIND_COOKIE(frame, afr_readlink_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readlink, &local->loc,
+ local->cont.readlink.size, local->xdata_req);
+ return 0;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+int
+afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
- frame->local = local;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- read_child = afr_read_child (this, loc->inode);
+ local->op = GF_FOP_READLINK;
+ loc_copy(&local->loc, loc);
+ local->cont.readlink.size = size;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+ afr_read_txn(frame, this, loc->inode, afr_readlink_wind,
+ AFR_DATA_TRANSACTION);
- local->cont.stat.last_tried = -1;
+ return 0;
+out:
+ AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0);
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_INFO,
- "%s: no child is up", loc->path);
- goto out;
- }
+ return 0;
+}
- local->cont.stat.last_tried = call_child;
- }
+/* }}} */
- loc_copy (&local->loc, loc);
+/* {{{ getxattr */
- local->cont.stat.ino = loc->inode->ino;
+struct _xattr_key {
+ char *key;
+ struct list_head list;
+};
- STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->stat,
- loc);
+int
+__gather_xattr_keys(dict_t *dict, char *key, data_t *value, void *data)
+{
+ struct list_head *list = data;
+ struct _xattr_key *xkey = NULL;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, NULL);
- }
+ if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) {
+ xkey = GF_MALLOC(sizeof(*xkey), gf_afr_mt_xattr_key);
+ if (!xkey)
+ return -1;
- return 0;
+ xkey->key = key;
+ INIT_LIST_HEAD(&xkey->list);
+
+ list_add_tail(&xkey->list, list);
+ }
+ return 0;
}
+void
+afr_filter_xattrs(dict_t *dict)
+{
+ struct list_head keys = {
+ 0,
+ };
+ struct _xattr_key *key = NULL;
+ struct _xattr_key *tmp = NULL;
-/* }}} */
+ INIT_LIST_HEAD(&keys);
-/* {{{ fstat */
+ dict_foreach(dict, __gather_xattr_keys, (void *)&keys);
-int32_t
-afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ list_for_each_entry_safe(key, tmp, &keys, list)
+ {
+ dict_del(dict, key->key);
- priv = this->private;
- children = priv->children;
+ list_del_init(&key->list);
- local = frame->local;
+ GF_FREE(key);
+ }
+}
+
+static gf_boolean_t
+afr_getxattr_ignorable_errnos(int32_t op_errno)
+{
+ if (op_errno == ENODATA || op_errno == ENOTSUP || op_errno == ERANGE ||
+ op_errno == ENAMETOOLONG)
+ return _gf_true;
- read_child = (long) cookie;
+ return _gf_false;
+}
+int
+afr_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.fstat.last_tried;
+ local = frame->local;
- if (all_tried (last_tried, priv->child_count)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%p: all subvolumes tried, going out",
- local->fd);
- goto out;
- }
- this_try = ++local->cont.fstat.last_tried;
+ if (op_ret < 0 && !afr_getxattr_ignorable_errnos(op_errno)) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- if (this_try == read_child) {
- goto retry;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- unwind = 0;
+ if (dict)
+ afr_filter_xattrs(dict);
- STACK_WIND_COOKIE (frame, afr_fstat_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->fstat,
- local->fd);
- }
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
-out:
- if (unwind) {
- if (buf)
- buf->ia_ino = local->cont.fstat.ino;
+ return 0;
+}
- AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf);
- }
+int
+afr_getxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_getxattr_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->getxattr, &local->loc,
+ local->cont.getxattr.name, local->xdata_req);
+ return 0;
}
+int32_t
+afr_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict,
+ dict_t *xdata)
+
+{
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
int32_t
-afr_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd)
+afr_fgetxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- int32_t read_child = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ dict_t *xattr = NULL;
+ char *tmp_report = NULL;
+ char lk_summary[1024] = {
+ 0,
+ };
+ int serz_len = 0;
+ int32_t callcnt = 0;
+ long int cky = 0;
+ int ret = 0;
+ int keylen = 0;
+ int children_keylen = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+ cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ children_keylen = strlen(children[cky]->name);
+
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == -1)
+ local->replies[cky].op_errno = op_errno;
+
+ if (!local->dict)
+ local->dict = dict_new();
+ if (local->dict) {
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen,
+ &tmp_report);
+ if (ret)
+ goto unlock;
+ ret = dict_set_dynstrn(local->dict, children[cky]->name,
+ children_keylen, gf_strdup(tmp_report));
+ if (ret)
+ goto unlock;
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (!callcnt) {
+ xattr = dict_new();
+ if (!xattr) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ ret = dict_serialize_value_with_delim(local->dict, lk_summary,
+ &serz_len, '\n');
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ if (serz_len == -1)
+ snprintf(lk_summary, sizeof(lk_summary), "No locks cleared.");
+ ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen,
+ gf_strdup(lk_summary));
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED,
+ "Error setting dictionary");
+ goto unwind;
+ }
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
- VALIDATE_OR_GOTO (this->private, out);
+ op_errno = afr_final_errno(local, priv);
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ unwind:
+ AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata);
+ if (xattr)
+ dict_unref(xattr);
+ }
- children = priv->children;
+ return ret;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+int32_t
+afr_getxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ dict_t *xattr = NULL;
+ char *tmp_report = NULL;
+ char lk_summary[1024] = {
+ 0,
+ };
+ int serz_len = 0;
+ int32_t callcnt = 0;
+ long int cky = 0;
+ int ret = 0;
+ int keylen = 0;
+ int children_keylen = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+ cky = (long)cookie;
+
+ keylen = strlen(local->cont.getxattr.name);
+ children_keylen = strlen(children[cky]->name);
+
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == -1)
+ local->replies[cky].op_errno = op_errno;
+
+ if (!local->dict)
+ local->dict = dict_new();
+ if (local->dict) {
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen,
+ &tmp_report);
+ if (ret)
+ goto unlock;
+ ret = dict_set_dynstrn(local->dict, children[cky]->name,
+ children_keylen, gf_strdup(tmp_report));
+ if (ret)
+ goto unlock;
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (!callcnt) {
+ xattr = dict_new();
+ if (!xattr) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ ret = dict_serialize_value_with_delim(local->dict, lk_summary,
+ &serz_len, '\n');
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ if (serz_len == -1)
+ snprintf(lk_summary, sizeof(lk_summary), "No locks cleared.");
+ ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen,
+ gf_strdup(lk_summary));
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED,
+ "Error setting dictionary");
+ goto unwind;
+ }
- frame->local = local;
+ op_errno = afr_final_errno(local, priv);
- VALIDATE_OR_GOTO (fd->inode, out);
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
- read_child = afr_read_child (this, fd->inode);
+ if (xattr)
+ dict_unref(xattr);
+ }
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+ return ret;
+}
- local->cont.fstat.last_tried = -1;
- } else {
- call_child = afr_first_up_child (priv);
+/**
+ * node-uuid cbk: if the current child fails, the query is re-wound to the
+ * next child until one succeeds or all children have been tried.
+ */
+int32_t
+afr_getxattr_node_uuid_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ xlator_t **children = NULL;
+ int unwind = 1;
+ int curr_call_child = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+
+ if (op_ret == -1) { /** query the _next_ child */
+
+        /**
+         * _current_ becomes _next_.
+         * If all children have been tried without success, give up.
+         */
+ curr_call_child = (int)((long)cookie);
+ if (++curr_call_child == priv->child_count)
+ goto unwind;
+
+ gf_msg_debug(this->name, op_errno,
+ "op_ret (-1): Re-querying afr-child (%d/%d)",
+ curr_call_child, priv->child_count);
+
+ unwind = 0;
+ STACK_WIND_COOKIE(
+ frame, afr_getxattr_node_uuid_cbk, (void *)(long)curr_call_child,
+ children[curr_call_child],
+ children[curr_call_child]->fops->getxattr, &local->loc,
+ local->cont.getxattr.name, local->xdata_req);
+ }
+
+unwind:
+ if (unwind)
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+
+ return 0;
+}
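
The re-wind in afr_getxattr_node_uuid_cbk() above expresses, through an asynchronous callback carrying the child index as its cookie, what the following sequential sketch does with a plain loop. query_child() is a made-up stand-in for winding getxattr to a single child; it is not part of the AFR code.

    #include <stdio.h>

    /* Illustrative stand-in for a per-child getxattr call: returns 0 on
     * success, -1 on failure. Here, only child 2 succeeds. */
    static int query_child(int child)
    {
        return (child == 2) ? 0 : -1;
    }

    /* Sequential equivalent of the next-child retry in the callback above:
     * try children in order and stop at the first one that answers. */
    static int query_node_uuid(int child_count)
    {
        int child;

        for (child = 0; child < child_count; child++) {
            if (query_child(child) == 0)
                return child; /* would unwind the successful reply */
        }
        return -1; /* all children failed: give up */
    }

    int main(void)
    {
        printf("answered by child %d\n", query_node_uuid(3));
        return 0;
    }

The callback form is needed because each child call is asynchronous; the loop only illustrates the order in which the children are consulted.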
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_INFO,
- "%p: no child is up", fd);
- goto out;
- }
+/**
+ * list-node-uuids cbk returns the list of node_uuids for the subvolume.
+ */
+int32_t
+afr_getxattr_list_node_uuids_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr_serz = NULL;
+ long cky = 0;
+ int32_t tlen = 0;
+
+ local = frame->local;
+ priv = this->private;
+ cky = (long)cookie;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+ local->replies[cky].valid = 1;
+ local->replies[cky].op_ret = op_ret;
+ local->replies[cky].op_errno = op_errno;
+
+ if (op_ret < 0)
+ goto unlock;
+
+ local->op_ret = 0;
+
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ local->replies[cky].xattr = dict_ref(dict);
+ }
+
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (!callcnt) {
+ if (local->op_ret != 0) {
+ /* All bricks gave an error. */
+ local->op_errno = afr_final_errno(local, priv);
+ goto unwind;
+ }
- local->cont.fstat.last_tried = call_child;
+        /* Since UUID0_STR is stored as the node uuid for down bricks and
+         * for bricks that returned a non-zero op_ret, size the buffer for
+         * priv->child_count uuids. */
+ local->cont.getxattr.xattr_len = (SLEN(UUID0_STR) + 2) *
+ priv->child_count;
+
+ if (!local->dict)
+ local->dict = dict_new();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
}
- local->cont.fstat.ino = fd->inode->ino;
- local->fd = fd_ref (fd);
+ xattr_serz = GF_CALLOC(local->cont.getxattr.xattr_len, sizeof(char),
+ gf_common_mt_char);
- STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->fstat,
- fd);
+ if (!xattr_serz) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, NULL);
+ ret = afr_serialize_xattrs_with_delimiter(frame, this, xattr_serz,
+ UUID0_STR, &tlen, ' ');
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ GF_FREE(xattr_serz);
+ goto unwind;
+ }
+ ret = dict_set_dynstr_sizen(local->dict, GF_XATTR_LIST_NODE_UUIDS_KEY,
+ xattr_serz);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set node_uuid key in dict");
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
+ } else {
+ local->op_ret = local->cont.getxattr.xattr_len - 1;
+ local->op_errno = 0;
}
- return 0;
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ local->dict, local->xdata_rsp);
+ }
+
+ return ret;
}
-/* }}} */
+int32_t
+afr_getxattr_quota_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ int idx = (long)cookie;
+ int call_count = 0;
+ afr_local_t *local = frame->local;
+ int read_subvol = -1;
+
+ local->replies[idx].valid = 1;
+ local->replies[idx].op_ret = op_ret;
+ local->replies[idx].op_errno = op_errno;
+ if (dict)
+ local->replies[idx].xdata = dict_ref(dict);
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ local->inode = inode_ref(local->loc.inode);
+ read_subvol = afr_handle_quota_size(frame, this);
+ if (read_subvol != -1) {
+ op_ret = local->replies[read_subvol].op_ret;
+ op_errno = local->replies[read_subvol].op_errno;
+ dict = local->replies[read_subvol].xdata;
+ }
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+ }
-/* {{{ readlink */
+ return 0;
+}
int32_t
-afr_readlink_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- const char *buf, struct iatt *sbuf)
+afr_getxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
-
- priv = this->private;
- children = priv->children;
+ int call_cnt = 0, len = 0;
+ char *lockinfo_buf = NULL;
+ dict_t *lockinfo = NULL, *newdict = NULL;
+ afr_local_t *local = NULL;
+ LOCK(&frame->lock);
+ {
local = frame->local;
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.readlink.last_tried;
+ call_cnt = --local->call_count;
- if (all_tried (last_tried, priv->child_count)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: all subvolumes tried, going out",
- local->loc.path);
- goto out;
- }
- this_try = ++local->cont.readlink.last_tried;
+ if ((op_ret < 0) || (!dict && !xdata)) {
+ goto unlock;
+ }
- if (this_try == read_child) {
- goto retry;
+ if (xdata) {
+ if (!local->xdata_rsp) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
}
+ }
+ }
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_readlink_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->readlink,
- &local->loc,
- local->cont.readlink.size);
+ if (!dict) {
+ goto unlock;
}
-out:
- if (unwind) {
- if (sbuf)
- sbuf->ia_ino = local->cont.readlink.ino;
+ op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY,
+ (void **)&lockinfo_buf, &len);
- AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf);
+ if (!lockinfo_buf) {
+ goto unlock;
}
- return 0;
-}
-
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (lockinfo_buf != NULL) {
+ lockinfo = dict_new();
+ if (lockinfo == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ } else {
+ op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo);
-int32_t
-afr_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size)
-{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
- int32_t read_child = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ if (lockinfo && local->dict) {
+ dict_copy(lockinfo, local->dict);
+ }
+ }
+ }
+
+ if (xdata && local->xdata_rsp) {
+ dict_copy(xdata, local->xdata_rsp);
+ }
+
+ if (!call_cnt) {
+ newdict = dict_new();
+ if (!newdict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ op_ret = dict_allocate_and_serialize(
+ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+ if (op_ret != 0) {
+ local->op_ret = -1;
+ goto unwind;
+ }
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
+ (void *)lockinfo_buf, len);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto unwind;
+ }
- children = priv->children;
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, newdict,
+ local->xdata_rsp);
+ }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ dict_unref(lockinfo);
- frame->local = local;
+ return 0;
+}
- read_child = afr_read_child (this, loc->inode);
+int32_t
+afr_fgetxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ int call_cnt = 0, len = 0;
+ char *lockinfo_buf = NULL;
+ dict_t *lockinfo = NULL, *newdict = NULL;
+ afr_local_t *local = NULL;
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+ LOCK(&frame->lock);
+ {
+ local = frame->local;
- local->cont.readlink.last_tried = -1;
+ call_cnt = --local->call_count;
- } else {
- call_child = afr_first_up_child (priv);
+ if ((op_ret < 0) || (!dict && !xdata)) {
+ goto unlock;
+ }
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_INFO,
- "%s: no child is up", loc->path);
- goto out;
+ if (xdata) {
+ if (!local->xdata_rsp) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
}
+ }
+ }
- local->cont.readlink.last_tried = call_child;
+ if (!dict) {
+ goto unlock;
}
- loc_copy (&local->loc, loc);
+ op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY,
+ (void **)&lockinfo_buf, &len);
- local->cont.readlink.size = size;
- local->cont.readlink.ino = loc->inode->ino;
+ if (!lockinfo_buf) {
+ goto unlock;
+ }
- STACK_WIND_COOKIE (frame, afr_readlink_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->readlink,
- loc, size);
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (lockinfo_buf != NULL) {
+ lockinfo = dict_new();
+ if (lockinfo == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ } else {
+ op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, NULL, NULL);
+ if (lockinfo && local->dict) {
+ dict_copy(lockinfo, local->dict);
+ }
+ }
+ }
+
+ if (xdata && local->xdata_rsp) {
+ dict_copy(xdata, local->xdata_rsp);
+ }
+
+ if (!call_cnt) {
+ newdict = dict_new();
+ if (!newdict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
}
- return 0;
-}
+ op_ret = dict_allocate_and_serialize(
+ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+ if (op_ret != 0) {
+ local->op_ret = -1;
+ goto unwind;
+ }
-/* }}} */
+ op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
+ (void *)lockinfo_buf, len);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto unwind;
+ }
-/* {{{ getxattr */
+ unwind:
+ AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, newdict,
+ local->xdata_rsp);
+ }
-struct _xattr_key {
- char *key;
- struct list_head list;
-};
+ dict_unref(lockinfo);
+ return 0;
+}
-void
-__gather_xattr_keys (dict_t *dict, char *key, data_t *value,
- void *data)
+int32_t
+afr_fgetxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- struct list_head * list = data;
- struct _xattr_key * xkey = NULL;
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr = NULL;
+ char *xattr_serz = NULL;
+ int keylen = 0;
+ char xattr_cky[1024] = {
+ 0,
+ };
+ int xattr_cky_len = 0;
+ dict_t *nxattr = NULL;
+ long cky = 0;
+ int32_t padding = 0;
+ int32_t tlen = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+ cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
+ local->cont.getxattr.name, cky);
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ }
- if (!strncmp (key, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
+ if (!dict || (op_ret < 0))
+ goto unlock;
- xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key);
- if (!xkey)
- return;
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict)
+ goto unlock;
+ }
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr);
+ if (ret)
+ goto unlock;
+
+ xattr = gf_strdup(xattr);
+
+ ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr);
+ if (ret) {
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set xattr cookie key");
+ goto post_unlock;
+ }
- xkey->key = key;
- INIT_LIST_HEAD (&xkey->list);
+ local->cont.getxattr.xattr_len += strlen(xattr) + 1;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!callcnt) {
+ if (!local->cont.getxattr.xattr_len)
+ goto unwind;
+
+ nxattr = dict_new();
+ if (!nxattr)
+ goto unwind;
+
+ /* extra bytes for decorations (brackets and <>'s) */
+ padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4;
+ local->cont.getxattr.xattr_len += (padding + 2);
+
+ xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len,
+ gf_common_mt_char);
+
+ if (!xattr_serz)
+ goto unwind;
+
+ /* the xlator info */
+ int xattr_serz_len = sprintf(
+ xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name);
+
+ /* actual series of pathinfo */
+ ret = dict_serialize_value_with_delim(
+ local->dict, xattr_serz + xattr_serz_len, &tlen, ' ');
+ if (ret) {
+ GF_FREE(xattr_serz);
+ goto unwind;
+ }
- list_add_tail (&xkey->list, list);
+ /* closing part */
+ *(xattr_serz + padding + tlen) = ')';
+ *(xattr_serz + padding + tlen + 1) = '\0';
+
+ ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen,
+ xattr_serz);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set pathinfo key in dict");
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
}
-}
+ unwind:
+ AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno,
+ nxattr, local->xdata_rsp);
-void
-__filter_xattrs (dict_t *dict)
-{
- struct list_head keys = {0,};
- struct _xattr_key *key = NULL;
- struct _xattr_key *tmp = NULL;
+ if (nxattr)
+ dict_unref(nxattr);
+ }
- INIT_LIST_HEAD (&keys);
+out:
+ return ret;
+}
- dict_foreach (dict, __gather_xattr_keys,
- (void *) &keys);
+int32_t
+afr_getxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr = NULL;
+ char *xattr_serz = NULL;
+ char xattr_cky[1024] = {
+ 0,
+ };
+ int keylen = 0;
+ int xattr_cky_len = 0;
+ dict_t *nxattr = NULL;
+ long cky = 0;
+ int32_t padding = 0;
+ int32_t tlen = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+ cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
+ local->cont.getxattr.name, cky);
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ }
- list_for_each_entry_safe (key, tmp, &keys, list) {
- dict_del (dict, key->key);
+ if (!dict || (op_ret < 0))
+ goto unlock;
- list_del_init (&key->list);
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict)
+ goto unlock;
+ }
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr);
+ if (ret)
+ goto unlock;
+
+ xattr = gf_strdup(xattr);
+
+ ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr);
+ if (ret) {
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set xattr cookie key");
+ goto post_unlock;
+ }
- GF_FREE (key);
+ local->cont.getxattr.xattr_len += strlen(xattr) + 1;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!callcnt) {
+ if (!local->cont.getxattr.xattr_len)
+ goto unwind;
+
+ nxattr = dict_new();
+ if (!nxattr)
+ goto unwind;
+
+ /* extra bytes for decorations (brackets and <>'s) */
+ padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4;
+ local->cont.getxattr.xattr_len += (padding + 2);
+
+ xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len,
+ gf_common_mt_char);
+
+ if (!xattr_serz)
+ goto unwind;
+
+ /* the xlator info */
+ int xattr_serz_len = sprintf(
+ xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name);
+
+ /* actual series of pathinfo */
+ ret = dict_serialize_value_with_delim(
+ local->dict, xattr_serz + xattr_serz_len, &tlen, ' ');
+ if (ret) {
+ GF_FREE(xattr_serz);
+ goto unwind;
}
+
+ /* closing part */
+ *(xattr_serz + padding + tlen) = ')';
+ *(xattr_serz + padding + tlen + 1) = '\0';
+
+ ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen,
+ xattr_serz);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set pathinfo key in dict");
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
+ }
+
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ nxattr, local->xdata_rsp);
+
+ if (nxattr)
+ dict_unref(nxattr);
+ }
+
+out:
+ return ret;
}
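The callback above serializes the replicated pathinfo as "(<" AFR_PATHINFO_HEADER this->name "> ", followed by the space-delimited per-child values and a closing ")". Below is a minimal standalone sketch of that layout only; the header and brick strings are made-up placeholders, not values taken from this patch.

#include <stdio.h>

/* Hypothetical stand-ins purely for illustration; in the real code the
 * header is AFR_PATHINFO_HEADER and the child strings come from the
 * per-child getxattr replies gathered in local->dict. */
#define PATHINFO_HEADER "REPLICATE:"

int
main(void)
{
    const char *xl_name = "testvol-replicate-0";              /* stands in for this->name */
    const char *children = "<POSIX:/brick1> <POSIX:/brick2>"; /* serialized child replies */
    char buf[256];

    /* Mirrors the layout built above: "(<HEADER name> child child ...)" */
    snprintf(buf, sizeof(buf), "(<%s%s> %s)", PATHINFO_HEADER, xl_name, children);
    printf("%s\n", buf);
    return 0;
}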
+static int
+afr_aggregate_stime_xattr(dict_t *this, char *key, data_t *value, void *data)
+{
+ int ret = 0;
+
+ if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
+ ret = gf_get_max_stime(THIS, data, key, value);
+ return ret;
+}
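gf_get_max_stime() itself is not shown in this patch; going by its name, the aggregation presumably keeps the newest stime seen for each matching key. The sketch below illustrates that assumed comparison on a (seconds, nanoseconds) pair and is not the helper's actual implementation.

#include <stdio.h>

/* Assumed semantics only: pick the later of two stime values. */
struct stime_val {
    long sec;
    long nsec;
};

static struct stime_val
stime_max(struct stime_val a, struct stime_val b)
{
    if (a.sec != b.sec)
        return (a.sec > b.sec) ? a : b;
    return (a.nsec > b.nsec) ? a : b;
}

int
main(void)
{
    struct stime_val x = {1600000000, 100};
    struct stime_val y = {1600000000, 500};
    struct stime_val m = stime_max(x, y);

    printf("aggregated stime = %ld.%09ld\n", m.sec, m.nsec);
    return 0;
}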
int32_t
-afr_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict)
+afr_common_getxattr_stime_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
- priv = this->private;
- children = priv->children;
+ if (!frame || !frame->local || !this) {
+ gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+ goto out;
+ }
- local = frame->local;
+ local = frame->local;
- read_child = (long) cookie;
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.getxattr.last_tried;
+ if (!dict || (op_ret < 0)) {
+ local->op_errno = op_errno;
+ goto cleanup;
+ }
- if (all_tried (last_tried, priv->child_count)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: all subvolumes tried, going out",
- local->loc.path);
- goto out;
- }
- this_try = ++local->cont.getxattr.last_tried;
+ if (!local->dict)
+ local->dict = dict_copy_with_ref(dict, NULL);
+ else
+ dict_foreach(dict, afr_aggregate_stime_xattr, local->dict);
+ local->op_ret = 0;
+ }
- if (this_try == read_child) {
- goto retry;
- }
+cleanup:
+ UNLOCK(&frame->lock);
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->getxattr,
- &local->loc,
- local->cont.getxattr.name);
- }
+ if (!callcnt) {
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ local->dict, xdata);
+ }
out:
- if (unwind) {
- if (op_ret >= 0 && dict)
- __filter_xattrs (dict);
+ return 0;
+}
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict);
+static gf_boolean_t
+afr_is_special_xattr(const char *name, fop_getxattr_cbk_t *cbk,
+ gf_boolean_t is_fgetxattr)
+{
+ gf_boolean_t is_spl = _gf_true;
+
+ GF_ASSERT(cbk);
+ if (!cbk || !name) {
+ is_spl = _gf_false;
+ goto out;
+ }
+
+ if (!strcmp(name, GF_XATTR_PATHINFO_KEY) ||
+ !strcmp(name, GF_XATTR_USER_PATHINFO_KEY)) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_pathinfo_cbk;
+ } else {
+ *cbk = afr_getxattr_pathinfo_cbk;
+ }
+ } else if (!strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD))) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_clrlk_cbk;
+ } else {
+ *cbk = afr_getxattr_clrlk_cbk;
}
+ } else if (!strncmp(name, GF_XATTR_LOCKINFO_KEY,
+ SLEN(GF_XATTR_LOCKINFO_KEY))) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_lockinfo_cbk;
+ } else {
+ *cbk = afr_getxattr_lockinfo_cbk;
+ }
+ } else if (fnmatch(GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) {
+ *cbk = afr_common_getxattr_stime_cbk;
+ } else if (strcmp(name, QUOTA_SIZE_KEY) == 0) {
+ *cbk = afr_getxattr_quota_size_cbk;
+ } else if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) {
+ *cbk = afr_getxattr_list_node_uuids_cbk;
+ } else {
+ is_spl = _gf_false;
+ }
- return 0;
+out:
+ return is_spl;
}
-int32_t
-afr_getxattr_unwind (call_frame_t *frame,
- int op_ret, int op_errno, dict_t *dict)
-
+static void
+afr_getxattr_all_subvols(xlator_t *this, call_frame_t *frame, const char *name,
+ loc_t *loc, fop_getxattr_cbk_t cbk)
{
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict);
- return 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ priv = this->private;
+
+ local = frame->local;
+ // local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+ if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) {
+ GF_FREE(local->cont.getxattr.name);
+ local->cont.getxattr.name = gf_strdup(GF_XATTR_NODE_UUID_KEY);
+ }
+
+    // If the up-children count is 0, afr_local_init would have failed and
+    // the call would already have been unwound, so that case is not handled here.
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->getxattr, loc,
+ local->cont.getxattr.name, NULL);
+ if (!--call_count)
+ break;
+ }
+ }
+ return;
}
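afr_getxattr_all_subvols() and the special-xattr callbacks above all follow the same fan-out discipline: wind one call per up child, decrement call_count under frame->lock in every reply, and let only the last reply aggregate and unwind. The standalone sketch below shows just that counting pattern, with a pthread mutex standing in for frame->lock.

#include <pthread.h>
#include <stdio.h>

/* Each reply decrements the counter under the lock; only the reply that
 * drives it to zero performs the final aggregation/unwind. */
typedef struct {
    pthread_mutex_t lock;
    int call_count;
} fanout_t;

static void
reply_arrived(fanout_t *f, int child)
{
    int remaining;

    pthread_mutex_lock(&f->lock);
    remaining = --f->call_count;
    /* ... record this child's op_ret/op_errno/dict here ... */
    pthread_mutex_unlock(&f->lock);

    if (remaining == 0)
        printf("child %d was the last reply: aggregate and unwind\n", child);
}

int
main(void)
{
    fanout_t f = {PTHREAD_MUTEX_INITIALIZER, 3};
    int i;

    for (i = 0; i < 3; i++)
        reply_arrived(&f, i);
    return 0;
}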
-int32_t
-afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict)
+int
+afr_marker_populate_args(call_frame_t *frame, int type, int *gauge,
+ xlator_t **subvols)
{
- afr_local_t *local = NULL;
- int32_t callcnt = 0;
- int ret = 0;
- char *pathinfo = NULL;
- char *pathinfo_serz = NULL;
- char pathinfo_cky[1024] = {0,};
- dict_t *xattr = NULL;
- long cky = 0;
- int32_t padding = 0;
- int32_t tlen = 0;
-
- if (!frame || !frame->local || !this) {
- gf_log (this->name, GF_LOG_ERROR, "possible NULL deref");
- goto unlock;
- }
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
- local = frame->local;
- cky = (long) cookie;
+ memcpy(subvols, priv->children, sizeof(*subvols) * priv->child_count);
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
+ if (type == MARKER_XTIME_TYPE) {
+        /* Don't error out on ENOENT/ENOTCONN. */
+ gauge[MCNT_NOTFOUND] = 0;
+ gauge[MCNT_ENOTCONN] = 0;
+ }
+ return priv->child_count;
+}
- if (!dict || (op_ret < 0))
- goto unlock;
+static int
+afr_handle_heal_xattrs(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *heal_op)
+{
+ int ret = -1;
+ afr_spb_status_t *data = NULL;
+
+ if (!strcmp(heal_op, GF_HEAL_INFO)) {
+ afr_get_heal_info(frame, this, loc);
+ ret = 0;
+ goto out;
+ }
+
+ if (!strcmp(heal_op, GF_AFR_HEAL_SBRAIN)) {
+ afr_heal_splitbrain_file(frame, this, loc);
+ ret = 0;
+ goto out;
+ }
+
+ if (!strcmp(heal_op, GF_AFR_SBRAIN_STATUS)) {
+ data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spb_status_t);
+ if (!data) {
+ ret = 1;
+ goto out;
+ }
+ data->frame = frame;
+ data->loc = loc;
+ ret = synctask_new(this->ctx->env, afr_get_split_brain_status,
+ afr_get_split_brain_status_cbk, NULL, data);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ "Failed to create"
+ " synctask. Unable to fetch split-brain status"
+ " for %s.",
+ loc->name);
+ ret = 1;
+ goto out;
+ }
+ goto out;
+ }
- if (!local->dict)
- local->dict = dict_new ();
+out:
+ if (ret == 1) {
+ AFR_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL);
+ if (data)
+ GF_FREE(data);
+ ret = 0;
+ }
+ return ret;
+}
- if (local->dict) {
- ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
- if (ret)
- goto unlock;
+int32_t
+afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ xlator_t **children = NULL;
+ int i = 0;
+ int32_t op_errno = 0;
+ int ret = -1;
+ fop_getxattr_cbk_t cbk = NULL;
- pathinfo = gf_strdup (pathinfo);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- snprintf (pathinfo_cky, 1024, "%s-%ld", GF_XATTR_PATHINFO_KEY, cky);
- ret = dict_set_dynstr (local->dict, pathinfo_cky, pathinfo);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo cookie key");
- goto unlock;
- }
+ priv = this->private;
- local->cont.getxattr.pathinfo_len += strlen (pathinfo) + 1;
- }
- }
- unlock:
- if (frame)
- UNLOCK (&frame->lock);
+ children = priv->children;
- if (!callcnt) {
- if (!local->cont.getxattr.pathinfo_len)
- goto unwind;
+ loc_copy(&local->loc, loc);
- xattr = dict_new ();
- if (!xattr)
- goto unwind;
+ local->op = GF_FOP_GETXATTR;
- /* extra bytes for decorations (brackets and <>'s) */
- padding = strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4;
- local->cont.getxattr.pathinfo_len += (padding + 2);
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- pathinfo_serz = GF_CALLOC (local->cont.getxattr.pathinfo_len, sizeof (char),
- gf_common_mt_char);
+ if (!name)
+ goto no_name;
- if (!pathinfo_serz)
- goto unwind;
+ local->cont.getxattr.name = gf_strdup(name);
- /* the xlator info */
- sprintf (pathinfo_serz, "(<"AFR_PATHINFO_HEADER"%s> ", this->name);
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- /* actual series of pathinfo */
- ret = dict_serialize_value_with_delim (local->dict, pathinfo_serz + strlen (pathinfo_serz),
- &tlen, ' ');
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Error serializing dictionary");
- goto unwind;
- }
+ if (!strncmp(name, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) {
+ op_errno = ENODATA;
+ goto out;
+ }
- /* closing part */
- *(pathinfo_serz + padding + tlen) = ')';
- *(pathinfo_serz + padding + tlen + 1) = '\0';
+ if (cluster_handle_marker_getxattr(frame, loc, name, priv->vol_uuid,
+ afr_getxattr_unwind,
+ afr_marker_populate_args) == 0)
+ return 0;
- ret = dict_set_dynstr (xattr, GF_XATTR_PATHINFO_KEY, pathinfo_serz);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo key in dict");
+ ret = afr_handle_heal_xattrs(frame, this, &local->loc, name);
+ if (ret == 0)
+ return 0;
+
+    /*
+     * Heal daemons don't have IO threads, so this getxattr would be wound
+     * all the way down and would eventually crash them; bail out here instead.
+     */
+ op_errno = -1;
+ GF_CHECK_XATTR_KEY_AND_GOTO(name, IO_THREADS_QUEUE_SIZE_KEY, op_errno, out);
+
+ /*
+ * Special xattrs which need responses from all subvols
+ */
+ if (afr_is_special_xattr(name, &cbk, 0)) {
+ afr_getxattr_all_subvols(this, frame, name, loc, cbk);
+ return 0;
+ }
- unwind:
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr);
+ if (XATTR_IS_NODE_UUID(name)) {
+ i = 0;
+ STACK_WIND_COOKIE(frame, afr_getxattr_node_uuid_cbk, (void *)(long)i,
+ children[i], children[i]->fops->getxattr, loc, name,
+ xdata);
+ return 0;
+ }
- if (local->dict)
- dict_unref (local->dict);
+no_name:
- if (xattr)
- dict_unref (xattr);
- }
+ afr_read_txn(frame, this, local->loc.inode, afr_getxattr_wind,
+ AFR_METADATA_TRANSACTION);
- return ret;
+ ret = 0;
+out:
+ if (ret < 0)
+ AFR_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
+/* {{{ fgetxattr */
+
int32_t
-afr_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
+afr_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t * local = NULL;
- xlator_list_t * trav = NULL;
- xlator_t ** sub_volumes = NULL;
- int read_child = -1;
- int i = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- frame->local = local;
-
- loc_copy (&local->loc, loc);
- if (name)
- local->cont.getxattr.name = gf_strdup (name);
-
-
- if (name) {
- if (!strncmp (name, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
- gf_log (this->name, GF_LOG_INFO,
- "%s: no data present for key %s",
- loc->path, name);
- op_errno = ENODATA;
- goto out;
- }
- if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0)
- && (-1 == frame->root->pid)) {
+ afr_local_t *local = NULL;
- local->marker.call_count = priv->child_count;
+ local = frame->local;
- sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *));
- for (i = 0, trav = this->children; trav ;
- trav = trav->next, i++) {
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- *(sub_volumes + i) = trav->xlator;
- }
-
- if (cluster_getmarkerattr (frame, this, loc, name,
- local, afr_getxattr_unwind,
- sub_volumes,
- priv->child_count,
- MARKER_UUID_TYPE,
- priv->vol_uuid)) {
-
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to get marker attr (%s)",
- loc->path, name);
- op_errno = EINVAL;
- goto out;
- }
-
- return 0;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- /*
- * if we are doing getxattr with pathinfo as the key then we
- * collect information from all childs
- */
- if (strncmp (name, GF_XATTR_PATHINFO_KEY,
- strlen (GF_XATTR_PATHINFO_KEY)) == 0) {
-
- local->call_count = priv->child_count;
- for (i = 0; i < priv->child_count; i++) {
- STACK_WIND_COOKIE (frame, afr_getxattr_pathinfo_cbk,
- (void *) (long) i,
- children[i], children[i]->fops->getxattr,
- loc, name);
- }
-
- return 0;
- }
+ if (dict)
+ afr_filter_xattrs(dict);
- if (*priv->vol_uuid) {
- if ((match_uuid_local (name, priv->vol_uuid) == 0)
- && (-1 == frame->root->pid)) {
+ AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata);
- local->marker.call_count = priv->child_count;
+ return 0;
+}
- sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *));
- for (i = 0, trav = this->children; trav ;
- trav = trav->next, i++) {
+int
+afr_fgetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- *(sub_volumes + i) = trav->xlator;
+ local = frame->local;
+ priv = this->private;
- }
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
+ return 0;
+ }
- if (cluster_getmarkerattr (frame, this, loc,
- name, local,
- afr_getxattr_unwind,
- sub_volumes,
- priv->child_count,
- MARKER_XTIME_TYPE,
- priv->vol_uuid)) {
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to get marker attr (%s)",
- loc->path, name);
- op_errno = EINVAL;
- goto out;
- }
+ STACK_WIND_COOKIE(frame, afr_fgetxattr_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fgetxattr, local->fd,
+ local->cont.getxattr.name, local->xdata_req);
+ return 0;
+}
- return 0;
- }
- }
+static void
+afr_fgetxattr_all_subvols(xlator_t *this, call_frame_t *frame,
+ fop_fgetxattr_cbk_t cbk)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ priv = this->private;
+
+ local = frame->local;
+ // local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+    // If the up-children count is 0, afr_local_init would have failed and
+    // the call would already have been unwound, so that case is not handled here.
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->fgetxattr, local->fd,
+ local->cont.getxattr.name, NULL);
+ if (!--call_count)
+ break;
}
+ }
- read_child = afr_read_child (this, loc->inode);
-
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
-
- local->cont.getxattr.last_tried = -1;
- } else {
- call_child = afr_first_up_child (priv);
-
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_INFO,
- "%s: no child is up", loc->path);
- goto out;
- }
+ return;
+}
- local->cont.getxattr.last_tried = call_child;
+int
+afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ fop_fgetxattr_cbk_t cbk = NULL;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_FGETXATTR;
+ local->fd = fd_ref(fd);
+ if (name) {
+ local->cont.getxattr.name = gf_strdup(name);
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
}
+ }
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+    /* pathinfo is handled only in getxattr(), but lockinfo must be handled
+     * here as well: if the fgetxattr key is lockinfo, collect the
+     * information from all children.
+     */
+ if (afr_is_special_xattr(name, &cbk, 1)) {
+ afr_fgetxattr_all_subvols(this, frame, cbk);
+ return 0;
+ }
+ afr_fix_open(fd, this);
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->getxattr,
- loc, name);
+ afr_read_txn(frame, this, fd->inode, afr_fgetxattr_wind,
+ AFR_METADATA_TRANSACTION);
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL);
- }
- return 0;
-}
+ AFR_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
/* }}} */
/* {{{ readv */
-/**
- * read algorithm:
- *
- * if the user has specified a read subvolume, use it
- * otherwise -
- * use the inode number to hash it to one of the subvolumes, and
- * read from there (to balance read load)
- *
- * if any of the above read's fail, try the children in sequence
- * beginning at the beginning
- */
-
-int32_t
-afr_readv_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct iatt *buf,
- struct iobref *iobref)
+int
+afr_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iovec *vector, int32_t count,
+ struct iatt *buf, struct iobref *iobref, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ afr_local_t *local = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = frame->local;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- children = priv->children;
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- local = frame->local;
+ AFR_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf, iobref,
+ xdata);
+ return 0;
+}
- read_child = (long) cookie;
+int
+afr_readv_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.readv.last_tried;
+ local = frame->local;
+ priv = this->private;
- if (all_tried (last_tried, priv->child_count)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%p: all subvolumes tried, going out",
- local->fd);
- goto out;
- }
- this_try = ++local->cont.readv.last_tried;
-
- if (this_try == read_child) {
- /*
- skip the read child since if we are here
- we must have already tried that child
- */
- goto retry;
- }
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, 0, 0, 0,
+ 0, 0);
+ return 0;
+ }
- unwind = 0;
+ STACK_WIND_COOKIE(
+ frame, afr_readv_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->readv, local->fd, local->cont.readv.size,
+ local->cont.readv.offset, local->cont.readv.flags, local->xdata_req);
+ return 0;
+}
- STACK_WIND_COOKIE (frame, afr_readv_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->readv,
- local->fd, local->cont.readv.size,
- local->cont.readv.offset);
- }
+int
+afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
-out:
- if (unwind) {
- if (buf && local)
- buf->ia_ino = local->cont.readv.ino;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- AFR_STACK_UNWIND (readv, frame, op_ret, op_errno,
- vector, count, buf, iobref);
- }
+ local->op = GF_FOP_READ;
+ local->fd = fd_ref(fd);
+ local->cont.readv.size = size;
+ local->cont.readv.offset = offset;
+ local->cont.readv.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- return 0;
+ afr_fix_open(fd, this);
+
+ afr_read_txn(frame, this, fd->inode, afr_readv_wind, AFR_DATA_TRANSACTION);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+
+ return 0;
}
+/* }}} */
-int32_t
-afr_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset)
+/* {{{ seek */
+
+int
+afr_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, off_t offset, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int32_t read_child = -1;
- int call_child = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (fd, out);
+ local = frame->local;
- priv = this->private;
- children = priv->children;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- frame->local = local;
+ AFR_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata);
+ return 0;
+}
+
+int
+afr_seek_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- read_child = afr_read_child (this, fd->inode);
+ local = frame->local;
+ priv = this->private;
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(seek, frame, local->op_ret, local->op_errno, 0, NULL);
+ return 0;
+ }
- /*
- if read fails from the read child, we try
- all children starting with the first one
- */
- local->cont.readv.last_tried = -1;
+ STACK_WIND_COOKIE(
+ frame, afr_seek_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->seek, local->fd, local->cont.seek.offset,
+ local->cont.seek.what, local->xdata_req);
+ return 0;
+}
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_DEBUG,
- "%p: no child is up", fd);
- goto out;
- }
+int
+afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
- local->cont.readv.last_tried = call_child;
- }
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local->fd = fd_ref (fd);
+ local->op = GF_FOP_SEEK;
+ local->fd = fd_ref(fd);
+ local->cont.seek.offset = offset;
+ local->cont.seek.what = what;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- local->cont.readv.ino = fd->inode->ino;
- local->cont.readv.size = size;
- local->cont.readv.offset = offset;
+ afr_fix_open(fd, this);
- STACK_WIND_COOKIE (frame, afr_readv_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readv,
- fd, size, offset);
+ afr_read_txn(frame, this, fd->inode, afr_seek_wind, AFR_DATA_TRANSACTION);
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, NULL, 0, NULL,
- NULL);
- }
- return 0;
-}
+ AFR_STACK_UNWIND(seek, frame, -1, op_errno, 0, NULL);
+ return 0;
+}
/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h
index 5479cfbd562..8c982bc7e6f 100644
--- a/xlators/cluster/afr/src/afr-inode-read.h
+++ b/xlators/cluster/afr/src/afr-inode-read.h
@@ -1,47 +1,45 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __INODE_READ_H__
#define __INODE_READ_H__
int32_t
-afr_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask);
+afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata);
int32_t
-afr_stat (call_frame_t *frame, xlator_t *this,
- loc_t *loc);
+afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
int32_t
-afr_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd);
+afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
int32_t
-afr_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size);
+afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata);
int32_t
-afr_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset);
+afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
int32_t
-afr_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name);
+afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+ dict_t *xdata);
+int32_t
+afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata);
+
+int
+afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata);
+int
+afr_handle_quota_size(call_frame_t *frame, xlator_t *this);
#endif /* __INODE_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index d1e03bdb871..1d6e4f3570a 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -1,1548 +1,2565 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
-#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-
+#include <glusterfs/glusterfs.h>
#include "afr.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include "protocol-common.h"
+#include <glusterfs/byte-order.h>
#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-messages.h"
-/* {{{ writev */
+static void
+__afr_inode_write_finalize(call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int ret = 0;
+ int read_subvol = 0;
+ struct iatt *stbuf = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_read_subvol_args_t args = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, local->inode, out);
+
+    /* This code needs to stay until DHT sends fops on linked
+     * inodes. */
+ if (!inode_is_linked(local->inode)) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == -1)
+ continue;
+ if (!gf_uuid_is_null(local->replies[i].poststat.ia_gfid)) {
+ gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid);
+ args.ia_type = local->replies[i].poststat.ia_type;
+ break;
+ } else {
+ ret = dict_get_bin(local->replies[i].xdata,
+ DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+ if (ret)
+ continue;
+ gf_uuid_copy(args.gfid, stbuf->ia_gfid);
+ args.ia_type = stbuf->ia_type;
+ break;
+ }
+ }
+ }
+
+ if (local->transaction.type == AFR_METADATA_TRANSACTION) {
+ read_subvol = afr_metadata_subvol_get(local->inode, this, NULL,
+ local->readable, NULL, &args);
+ } else {
+ read_subvol = afr_data_subvol_get(local->inode, this, NULL,
+ local->readable, NULL, &args);
+ }
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ afr_pick_error_xdata(local, priv, local->inode, local->readable, NULL,
+ NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0)
+ continue;
+
+ /* Order of checks in the compound conditional
+ below is important.
+
+ - Highest precedence: largest op_ret
+ - Next precedence: if all op_rets are equal, read subvol
+ - Least precedence: any succeeded subvol
+ */
+ if ((local->op_ret < local->replies[i].op_ret) ||
+ ((local->op_ret == local->replies[i].op_ret) &&
+ (i == read_subvol))) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.inode_wfop.prebuf = local->replies[i].prestat;
+ local->cont.inode_wfop.postbuf = local->replies[i].poststat;
+
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ if (local->replies[i].xattr) {
+ if (local->xattr_rsp)
+ dict_unref(local->xattr_rsp);
+ local->xattr_rsp = dict_ref(local->replies[i].xattr);
+ }
+ }
+ }
+
+ afr_set_in_flight_sb_status(this, frame, local->inode);
+out:
+ return;
+}
-int
-afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+static void
+__afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xattr, dict_t *xdata)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ local->replies[child_index].valid = 1;
+
+ if (AFR_IS_ARBITER_BRICK(priv, child_index) && op_ret == 1)
+ op_ret = iov_length(local->cont.writev.vector,
+ local->cont.writev.count);
+
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+
+ if (op_ret >= 0) {
+ if (prebuf)
+ local->replies[child_index].prestat = *prebuf;
+ if (postbuf)
+ local->replies[child_index].poststat = *postbuf;
+ if (xattr)
+ local->replies[child_index].xattr = dict_ref(xattr);
+ } else {
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
+
+ return;
+}
- local = frame->local;
+static int
+__afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+ afr_private_t *priv = NULL;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ priv = this->private;
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno,
+ prebuf, postbuf, xattr, xdata);
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- if (main_frame) {
- local->cont.writev.prebuf.ia_ino = local->cont.writev.ino;
- local->cont.writev.postbuf.ia_ino = local->cont.writev.ino;
+ if (call_count == 0) {
+ __afr_inode_write_finalize(frame, this);
- AFR_STACK_UNWIND (writev, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.writev.prebuf,
- &local->cont.writev.postbuf);
+ if (afr_txn_nothing_failed(frame, this)) {
+            /* If it did the pre-op, it will do the post-op, changing ctime. */
+ if (priv->consistent_metadata && afr_needs_changelog_update(local))
+ afr_zero_fill_stat(local);
+ local->transaction.unwind(frame, this);
}
- return 0;
+
+ afr_transaction_resume(frame, this);
+ }
+
+ return 0;
}
+/* {{{ writev */
-int
-afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
+void
+afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame)
{
- afr_local_t * local = NULL;
- int child_index = (long) cookie;
- int call_count = -1;
- int read_child = 0;
-
- local = frame->local;
+ afr_local_t *src_local = NULL;
+ afr_local_t *dst_local = NULL;
+
+ src_local = src_frame->local;
+ dst_local = dst_frame->local;
+
+ dst_local->op_ret = src_local->op_ret;
+ dst_local->op_errno = src_local->op_errno;
+ dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;
+ dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf;
+ if (src_local->xdata_rsp)
+ dst_local->xdata_rsp = dict_ref(src_local->xdata_rsp);
+}
- read_child = afr_read_child (this, local->fd->inode);
+void
+afr_writev_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ local = frame->local;
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
+ if (priv->consistent_metadata)
+ afr_zero_fill_stat(local);
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
+ AFR_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+}
- if (child_index == read_child) {
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
- }
+int
+afr_transaction_writev_unwind(call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *fop_frame = NULL;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ fop_frame = afr_transaction_detach_fop_frame(frame);
- call_count = afr_frame_return (frame);
+ if (fop_frame) {
+ afr_writev_copy_outvars(frame, fop_frame);
+ afr_writev_unwind(fop_frame, this);
+ }
+ return 0;
+}
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
+static void
+afr_writev_handle_short_writes(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+ /*
+ * We already have the best case result of the writev calls staged
+ * as the return value. Any writev that returns some value less
+ * than the best case is now out of sync, so mark the fop as
+ * failed. Note that fops that have returned with errors have
+ * already been marked as failed.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!local->replies[i].valid) || (local->replies[i].op_ret == -1))
+ continue;
+
+ if (local->replies[i].op_ret < local->op_ret)
+ afr_transaction_fop_failed(frame, this, i);
+ }
+}
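A quick worked example of the short-write rule implemented above, with made-up byte counts: the largest op_ret has already been staged as the fop result, so any child that wrote fewer bytes is out of sync and gets marked failed for the transaction to heal later.

#include <stdio.h>

int
main(void)
{
    int op_ret[3] = {65536, 65536, 4096}; /* hypothetical bytes written per child */
    int best = 0;
    int i;

    for (i = 0; i < 3; i++)
        if (op_ret[i] > best)
            best = op_ret[i]; /* best-case result, as staged by the finalize step */

    for (i = 0; i < 3; i++)
        if (op_ret[i] >= 0 && op_ret[i] < best)
            printf("child %d short-wrote (%d < %d): mark fop failed\n",
                   i, op_ret[i], best);
    return 0;
}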
- local->transaction.resume (frame, this);
- }
- return 0;
+void
+afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int ret = 0;
+ afr_local_t *local = frame->local;
+ uint32_t open_fd_count = 0;
+ uint32_t write_is_append = 0;
+ int32_t num_inodelks = 0;
+
+ LOCK(&frame->lock);
+ {
+ __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno,
+ prebuf, postbuf, NULL, xdata);
+ if (op_ret == -1 || !xdata)
+ goto unlock;
+
+ write_is_append = 0;
+ ret = dict_get_uint32(xdata, GLUSTERFS_WRITE_IS_APPEND,
+ &write_is_append);
+ if (ret || !write_is_append)
+ local->append_write = _gf_false;
+
+ ret = dict_get_uint32(xdata, GLUSTERFS_ACTIVE_FD_COUNT, &open_fd_count);
+ if (ret < 0)
+ goto unlock;
+ if (open_fd_count > local->open_fd_count) {
+ local->open_fd_count = open_fd_count;
+ local->update_open_fd_count = _gf_true;
+ }
+
+ ret = dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT,
+ &num_inodelks);
+ if (ret < 0)
+ goto unlock;
+ if (num_inodelks > local->num_inodelks) {
+ local->num_inodelks = num_inodelks;
+ local->update_num_inodelks = _gf_true;
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
}
+void
+afr_process_post_writev(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+
+ local = frame->local;
+
+ if (!local->stable_write && !local->append_write)
+        /* An appended write removes the need to fsync() the file,
+           because self-heal has the logic to pick the larger file
+           when the xattrs do not reliably point at a stale file.
+         */
+ afr_fd_report_unstable_write(this, local);
+
+ __afr_inode_write_finalize(frame, this);
+
+ afr_writev_handle_short_writes(frame, this);
+
+ if (local->update_open_fd_count)
+ local->inode_ctx->open_fd_count = local->open_fd_count;
+ if (local->update_num_inodelks &&
+ local->transaction.type == AFR_DATA_TRANSACTION) {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ lock->num_inodelks = local->num_inodelks;
+ }
+}
int
-afr_writev_wind (call_frame_t *frame, xlator_t *this)
+afr_writev_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = -1;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ call_frame_t *fop_frame = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, prebuf,
+ postbuf, xdata);
- local->call_count = call_count;
+ call_count = afr_frame_return(frame);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->writev,
- local->fd,
- local->cont.writev.vector,
- local->cont.writev.count,
- local->cont.writev.offset,
- local->cont.writev.iobref);
-
- if (!--call_count)
- break;
- }
- }
+ if (call_count == 0) {
+ afr_process_post_writev(frame, this);
- return 0;
+ if (!afr_txn_nothing_failed(frame, this)) {
+ // Don't unwind until post-op is complete
+ afr_transaction_resume(frame, this);
+ } else {
+            /*
+             * Generally inode-write fops do transaction.unwind and then
+             * transaction.resume, but writev must make sure that its
+             * delayed post-op frame is placed in fdctx before the unwind
+             * happens. This prevents a race where flush does the
+             * changelog wakeup in the fuse thread before this writev
+             * has placed its delayed post-op frame in fdctx, and so it
+             * lets flush ensure all the delayed post-ops are completed.
+             */
+
+ fop_frame = afr_transaction_detach_fop_frame(frame);
+ afr_writev_copy_outvars(frame, fop_frame);
+ afr_transaction_resume(frame, this);
+ afr_writev_unwind(fop_frame, this);
+ }
+ }
+ return 0;
}
+static int
+afr_arbiter_writev_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ static char byte = 0xFF;
+ static struct iovec vector = {&byte, 1};
+ int32_t count = 1;
+
+ STACK_WIND_COOKIE(
+ frame, afr_writev_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol], priv->children[subvol]->fops->writev, local->fd,
+ &vector, count, local->cont.writev.offset, local->cont.writev.flags,
+ local->cont.writev.iobref, local->xdata_req);
+
+ return 0;
+}
int
-afr_writev_done (call_frame_t *frame, xlator_t *this)
+afr_writev_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- iobref_unref (local->cont.writev.iobref);
- local->cont.writev.iobref = NULL;
+ if (AFR_IS_ARBITER_BRICK(priv, subvol)) {
+ afr_arbiter_writev_wind(frame, this, subvol);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_writev_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->writev, local->fd,
+ local->cont.writev.vector, local->cont.writev.count,
+ local->cont.writev.offset, local->cont.writev.flags,
+ local->cont.writev.iobref, local->xdata_req);
+ return 0;
+}
+
+int
+afr_do_writev(call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = frame->local;
+ transaction_frame->local = local;
+ frame->local = NULL;
+
+ if (!AFR_FRAME_INIT(frame, op_errno))
+ goto out;
+
+ local->op = GF_FOP_WRITE;
+
+ local->transaction.wind = afr_writev_wind;
+ local->transaction.unwind = afr_transaction_writev_unwind;
+
+ local->transaction.main_frame = frame;
+
+ if (local->fd->flags & O_APPEND) {
+ /*
+         * The backend vfs ignores 'offset' for an append-mode fd, so
+         * locking just the region provided in the writev gives no
+         * consistency guarantee: the actual write may land at a
+         * completely different range than the (offset, len) in the
+         * fop. So lock the entire file.
+ */
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ } else {
+ local->transaction.start = local->cont.writev.offset;
+ local->transaction.len = iov_length(local->cont.writev.vector,
+ local->cont.writev.count);
+ }
+
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- local->transaction.unwind (frame, this);
+ AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
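A standalone sketch of the lock-range choice made in afr_do_writev() above: O_APPEND writes take the whole-file range (start 0, len 0, which this code treats as the entire file), while ordinary writes lock exactly the region being written. The values in main() are arbitrary.

#include <fcntl.h>
#include <stdio.h>

static void
pick_lock_range(int fd_flags, off_t offset, size_t iov_len,
                off_t *start, size_t *len)
{
    if (fd_flags & O_APPEND) {
        /* append: the backend decides the final offset, so lock it all */
        *start = 0;
        *len = 0;
    } else {
        /* plain write: lock only the region being written */
        *start = offset;
        *len = iov_len;
    }
}

int
main(void)
{
    off_t start;
    size_t len;

    pick_lock_range(O_APPEND, 4096, 512, &start, &len);
    printf("append: start=%lld len=%zu\n", (long long)start, len);

    pick_lock_range(0, 4096, 512, &start, &len);
    printf("normal: start=%lld len=%zu\n", (long long)start, len);
    return 0;
}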
- AFR_STACK_DESTROY (frame);
+int
+afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = ENOMEM;
+ int ret = -1;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->cont.writev.vector = iov_dup(vector, count);
+ if (!local->cont.writev.vector)
+ goto out;
+ local->cont.writev.count = count;
+ local->cont.writev.offset = offset;
+ local->cont.writev.flags = flags;
+ local->cont.writev.iobref = iobref_ref(iobref);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
+
+ if (dict_set_uint32(local->xdata_req, GLUSTERFS_ACTIVE_FD_COUNT, 4)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (dict_set_str_sizen(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT,
+ this->name)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (dict_set_uint32(local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+    /* Set append_write to true speculatively. If on any
+       server it turns out not to be true, we unset it in the
+       callback.
+ */
+ local->append_write = _gf_true;
+
+ /* detect here, but set it in writev_wind_cbk *after* the unstable
+ write is performed
+ */
+ local->stable_write = !!((fd->flags | flags) & (O_SYNC | O_DSYNC));
+
+ afr_fix_open(fd, this);
+
+ afr_do_writev(frame, this);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
+/* }}} */
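
The writev path above derives the byte range to lock from the incoming iovec (via iov_length) and falls back to a whole-file lock for O_APPEND fds, since the backend ignores the offset in append mode. The standalone sketch below is not the AFR code itself; it only mirrors that range computation using plain POSIX types, and the struct and function names are illustrative.

#include <stdio.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Illustrative only: mirrors the idea of afr_do_writev's range selection.
 * start == 0 && len == 0 is used here, as in the diff above, to mean
 * "lock the whole file". */
struct lock_range { off_t start; size_t len; };

static size_t
total_iov_length(const struct iovec *vector, int count)
{
    size_t len = 0;
    for (int i = 0; i < count; i++)
        len += vector[i].iov_len;
    return len;
}

static struct lock_range
writev_lock_range(int open_flags, off_t offset,
                  const struct iovec *vector, int count)
{
    struct lock_range r = {0, 0};
    if (open_flags & O_APPEND)
        return r;               /* offset is ignored by the backend: lock all */
    r.start = offset;
    r.len = total_iov_length(vector, count);
    return r;
}

int
main(void)
{
    char a[10], b[6];
    struct iovec v[2] = {{a, sizeof(a)}, {b, sizeof(b)}};
    struct lock_range r = writev_lock_range(O_RDWR, 4096, v, 2);
    printf("lock [%lld, +%zu)\n", (long long)r.start, r.len);
    return 0;
}
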
+
+/* {{{ truncate */
int
-afr_do_writev (call_frame_t *frame, xlator_t *this)
+afr_truncate_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t * transaction_frame = NULL;
- afr_local_t * local = NULL;
- int op_ret = -1;
- int op_errno = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
+ local = frame->local;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
- goto out;
- }
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- transaction_frame->local = local;
- frame->local = NULL;
+ AFR_STACK_UNWIND(truncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
- local->op = GF_FOP_WRITE;
+int
+afr_truncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- local->success_count = 0;
+ local = frame->local;
- local->transaction.fop = afr_writev_wind;
- local->transaction.done = afr_writev_done;
- local->transaction.unwind = afr_writev_unwind;
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- local->transaction.main_frame = frame;
- if (local->fd->flags & O_APPEND) {
- local->transaction.start = 0;
- local->transaction.len = 0;
- } else {
- local->transaction.start = local->cont.writev.offset;
- local->transaction.len = iov_length (local->cont.writev.vector,
- local->cont.writev.count);
- }
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
+}
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+int
+afr_truncate_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL);
- }
+ local = frame->local;
+ priv = this->private;
- return 0;
+ STACK_WIND_COOKIE(frame, afr_truncate_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->truncate, &local->loc,
+ local->cont.truncate.offset, local->xdata_req);
+ return 0;
}
-
int
-afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- struct iobref *iobref)
+afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- priv = this->private;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->cont.truncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ if (!local->xdata_req)
+ goto out;
- frame->local = local;
+ local->transaction.wind = afr_truncate_wind;
+ local->transaction.unwind = afr_truncate_unwind;
- local->cont.writev.vector = iov_dup (vector, count);
- local->cont.writev.count = count;
- local->cont.writev.offset = offset;
- local->cont.writev.ino = fd->inode->ino;
- local->cont.writev.iobref = iobref_ref (iobref);
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- local->fd = fd_ref (fd);
+ local->op = GF_FOP_TRUNCATE;
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- goto out;
- }
+ local->transaction.main_frame = frame;
+ local->transaction.start = offset;
+ local->transaction.len = 0;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ /* Set it to true speculatively; it will get reset in
+ afr_truncate_wind_cbk if the truncate was not a no-op. */
+ local->stable_write = _gf_true;
- if (fd_ctx->up_count < priv->up_count) {
- local->openfd_flush_cbk = afr_do_writev;
- afr_openfd_flush (frame, this, fd);
- } else {
- afr_do_writev (frame, this);
- }
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-
/* }}} */
-/* {{{ truncate */
+/* {{{ ftruncate */
int
-afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
+afr_ftruncate_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- local->cont.truncate.prebuf.ia_ino = local->cont.truncate.ino;
- local->cont.truncate.postbuf.ia_ino = local->cont.truncate.ino;
+ AFR_STACK_UNWIND(ftruncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
- AFR_STACK_UNWIND (truncate, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.truncate.prebuf,
- &local->cont.truncate.postbuf);
- }
+int
+afr_ftruncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- return 0;
+ local = frame->local;
+
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
+
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
}
+int
+afr_ftruncate_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_ftruncate_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->ftruncate, local->fd,
+ local->cont.ftruncate.offset, local->xdata_req);
+ return 0;
+}
int
-afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_read_child (this, local->loc.inode);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG)
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- call_count = afr_frame_return (frame);
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ local->cont.ftruncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- return 0;
-}
+ if (!local->xdata_req)
+ goto out;
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
-int32_t
-afr_truncate_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
+ local->op = GF_FOP_FTRUNCATE;
- local = frame->local;
- priv = this->private;
+ local->transaction.wind = afr_ftruncate_wind;
+ local->transaction.unwind = afr_ftruncate_unwind;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local->transaction.main_frame = frame;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ local->transaction.start = local->cont.ftruncate.offset;
+ local->transaction.len = 0;
- local->call_count = call_count;
+ afr_fix_open(fd, this);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->truncate,
- &local->loc,
- local->cont.truncate.offset);
-
- if (!--call_count)
- break;
- }
- }
+ /* Set it to true speculatively; it will get reset in
+ afr_ftruncate_wind_cbk if the truncate was not a no-op. */
+ local->stable_write = _gf_true;
- return 0;
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
}
+/* }}} */
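
The writev and (f)truncate paths above both track a "stable write" flag: writev marks it up front from the O_SYNC/O_DSYNC open or call flags, while the truncate wind callbacks start optimistic and clear it when the pre-op and post-op sizes differ, i.e. the truncate actually changed something. Below is a minimal, hypothetical sketch of those two decisions using plain off_t sizes and open flags rather than GlusterFS iatt buffers; the helper names are placeholders.

#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

/* A writev is "stable" up front when the fd or the call asks for
 * synchronous semantics (O_SYNC/O_DSYNC), as in afr_writev above. */
static bool
write_is_stable(int fd_flags, int call_flags)
{
    return ((fd_flags | call_flags) & (O_SYNC | O_DSYNC)) != 0;
}

/* A truncate is assumed stable speculatively and downgraded when the
 * pre-op/post-op sizes differ (it was not a no-op), mirroring
 * afr_truncate_wind_cbk / afr_ftruncate_wind_cbk. */
static void
update_stable_after_truncate(bool *stable_write, int op_ret,
                             off_t pre_size, off_t post_size)
{
    if (op_ret == 0 && pre_size != post_size)
        *stable_write = false;
}

int
main(void)
{
    bool stable = true;                     /* speculative, as in the diff */
    update_stable_after_truncate(&stable, 0, 4096, 1024);
    printf("stable=%d sync-writev=%d\n", stable,
           write_is_stable(O_RDWR, O_DSYNC));
    return 0;
}
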
+
+/* {{{ setattr */
int
-afr_truncate_done (call_frame_t *frame, xlator_t *this)
+afr_setattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(setattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset)
+afr_setattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preop,
+ struct iatt *postop, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop,
+ postop, NULL, xdata);
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+int
+afr_setattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_setattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setattr, &local->loc,
+ &local->cont.setattr.in_buf, local->cont.setattr.valid,
+ local->xdata_req);
+ return 0;
+}
- priv = this->private;
+int
+afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+ int32_t valid, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->cont.setattr.in_buf = *buf;
+ local->cont.setattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_setattr_wind;
+ local->transaction.unwind = afr_setattr_unwind;
+
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
+
+ local->op = GF_FOP_SETATTR;
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
+
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ AFR_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+/* {{{ fsetattr */
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+int
+afr_fsetattr_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- transaction_frame->local = local;
+ local = frame->local;
- local->op_ret = -1;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- local->cont.truncate.offset = offset;
- local->cont.truncate.ino = loc->inode->ino;
+ AFR_STACK_UNWIND(fsetattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
- local->transaction.fop = afr_truncate_wind;
- local->transaction.done = afr_truncate_done;
- local->transaction.unwind = afr_truncate_unwind;
+int
+afr_fsetattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preop,
+ struct iatt *postop, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop,
+ postop, NULL, xdata);
+}
- loc_copy (&local->loc, loc);
+int
+afr_fsetattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_fsetattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetattr, local->fd,
+ &local->cont.fsetattr.in_buf, local->cont.fsetattr.valid,
+ local->xdata_req);
+ return 0;
+}
- local->transaction.main_frame = frame;
- local->transaction.start = offset;
- local->transaction.len = 0;
+int
+afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
+ int32_t valid, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (truncate, frame, op_ret, op_errno, NULL, NULL);
- }
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- return 0;
-}
+ local->cont.fsetattr.in_buf = *buf;
+ local->cont.fsetattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+ if (!local->xdata_req)
+ goto out;
-/* }}} */
+ local->transaction.wind = afr_fsetattr_wind;
+ local->transaction.unwind = afr_fsetattr_unwind;
-/* {{{ ftruncate */
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
+ local->op = GF_FOP_FSETATTR;
-int
-afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_fix_open(fd, this);
- local = frame->local;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- if (main_frame) {
- local->cont.ftruncate.prebuf.ia_ino = local->cont.ftruncate.ino;
- local->cont.ftruncate.postbuf.ia_ino = local->cont.ftruncate.ino;
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.ftruncate.prebuf,
- &local->cont.ftruncate.postbuf);
- }
- return 0;
+ AFR_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
+/* {{{ setxattr */
int
-afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
+afr_setxattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int call_count = -1;
- int need_unwind = 0;
- int read_child = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- read_child = afr_read_child (this, local->fd->inode);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ AFR_STACK_UNWIND(setxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
+int
+afr_setxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
+}
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
+int
+afr_setxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_setxattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setxattr, &local->loc,
+ local->cont.setxattr.dict, local->cont.setxattr.flags,
+ local->xdata_req);
+ return 0;
+}
- if (child_index == read_child) {
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
+int
+afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
- local->success_count++;
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i, ret = 0;
+ char *op_type = NULL;
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local = frame->local;
+ priv = this->private;
+ i = (long)cookie;
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- call_count = afr_frame_return (frame);
+ ret = dict_get_str_sizen(local->xdata_req, "replicate-brick-op", &op_type);
+ if (ret)
+ goto out;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO,
+ op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s",
+ priv->children[i]->name, "op_ret=%s",
+ op_ret ? "failed" : "succeeded", NULL);
- return 0;
+out:
+ syncbarrier_wake(&local->barrier);
+ return 0;
}
-
int
-afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
+afr_emptyb_set_pending_changelog(call_frame_t *frame, xlator_t *this,
+ unsigned char *locked_nodes)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int ret = 0, i = 0;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ AFR_ONLIST(locked_nodes, frame, afr_emptyb_set_pending_changelog_cbk,
+ xattrop, &local->loc, GF_XATTROP_ADD_ARRAY, local->xattr_req,
+ NULL);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
+ /* It is sufficient if xattrop was successful on one child */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+
+ if (local->replies[i].op_ret == 0) {
+ ret = 0;
+ goto out;
+ } else {
+ ret = afr_higher_errno(ret, local->replies[i].op_errno);
}
+ }
+out:
+ return -ret;
+}
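
afr_emptyb_set_pending_changelog treats the xattrop as successful if any one child succeeded, and otherwise folds the per-child errnos into a single error. The sketch below shows that aggregation pattern over a plain reply array; the priority rule used here (keep the numerically larger errno) is only an assumption standing in for afr_higher_errno, whose real ordering is defined elsewhere in AFR.

#include <errno.h>
#include <stdio.h>

struct reply { int valid; int op_ret; int op_errno; };

/* Assumption: stand-in for afr_higher_errno(); the real priority
 * ordering lives in AFR and is not reproduced here. */
static int
higher_errno(int a, int b)
{
    return a > b ? a : b;
}

/* Success if at least one child replied with op_ret == 0;
 * otherwise return the negated aggregated errno, matching the
 * "return -ret" convention used in the diff above. */
static int
aggregate_replies(const struct reply *r, int count)
{
    int err = 0;
    for (int i = 0; i < count; i++) {
        if (!r[i].valid)
            continue;
        if (r[i].op_ret == 0)
            return 0;
        err = higher_errno(err, r[i].op_errno);
    }
    return -err;
}

int
main(void)
{
    struct reply r[3] = {{1, -1, ENOTCONN}, {1, 0, 0}, {0, 0, 0}};
    printf("result: %d\n", aggregate_replies(r, 3)); /* prints 0 */
    return 0;
}
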
- local->call_count = call_count;
+static int
+_afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ int empty_index, afr_transaction_type type,
+ char *op_type, const int op_type_len)
+{
+ int count = 0;
+ int ret = -ENOMEM;
+ int idx = -1;
+ int d_idx = -1;
+ unsigned char *locked_nodes = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ locked_nodes = alloca0(priv->child_count);
+
+ idx = afr_index_for_transaction_type(type);
+ d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+ local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!local->pending)
+ goto out;
+
+ local->pending[empty_index][idx] = hton32(1);
+
+ if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION))
+ local->pending[empty_index][d_idx] = hton32(1);
+
+ local->xdata_req = dict_new();
+ if (!local->xdata_req)
+ goto out;
+
+ ret = dict_set_nstrn(local->xdata_req, "replicate-brick-op",
+ SLEN("replicate-brick-op"), op_type, op_type_len);
+ if (ret)
+ goto out;
+
+ local->xattr_req = dict_new();
+ if (!local->xattr_req)
+ goto out;
+
+ ret = afr_set_pending_dict(priv, local->xattr_req, local->pending);
+ if (ret < 0)
+ goto out;
+
+ if (AFR_ENTRY_TRANSACTION == type) {
+ count = afr_selfheal_entrylk(frame, this, loc->inode, this->name, NULL,
+ locked_nodes);
+ } else {
+ count = afr_selfheal_inodelk(frame, this, loc->inode, this->name,
+ LLONG_MAX - 1, 0, locked_nodes);
+ }
+
+ if (!count) {
+ gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS,
+ NULL);
+ ret = -EAGAIN;
+ goto unlock;
+ }
+
+ ret = afr_emptyb_set_pending_changelog(frame, this, locked_nodes);
+ if (ret)
+ goto unlock;
+ ret = 0;
+unlock:
+ if (AFR_ENTRY_TRANSACTION == type) {
+ afr_selfheal_unentrylk(frame, this, loc->inode, this->name, NULL,
+ locked_nodes, NULL);
+ } else {
+ afr_selfheal_uninodelk(frame, this, loc->inode, this->name,
+ LLONG_MAX - 1, 0, locked_nodes);
+ }
+out:
+ return ret;
+}
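
_afr_handle_empty_brick_type marks the empty brick as needing heal by writing a non-zero pending count, in network byte order, into a child_count x AFR_NUM_CHANGE_LOGS matrix that is then applied via xattrop under locks. The sketch below only illustrates building such a matrix with the standard htonl(); the NUM_CHANGE_LOGS value and everything except the index names taken from the diff are assumptions.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NUM_CHANGE_LOGS 3   /* data, metadata, entry: assumption for this sketch */

/* Allocate a child_count x NUM_CHANGE_LOGS matrix of pending counters. */
static uint32_t **
pending_matrix_new(int child_count)
{
    uint32_t **m = calloc(child_count, sizeof(*m));
    if (!m)
        return NULL;
    for (int i = 0; i < child_count; i++) {
        m[i] = calloc(NUM_CHANGE_LOGS, sizeof(**m));
        if (!m[i]) {
            while (i--)
                free(m[i]);
            free(m);
            return NULL;
        }
    }
    return m;
}

int
main(void)
{
    int child_count = 2, empty_index = 1, idx = 0;
    uint32_t **pending = pending_matrix_new(child_count);
    if (!pending)
        return 1;

    /* Mark the newly added/replaced brick as having one pending operation;
     * changelog counters are stored in network byte order. */
    pending[empty_index][idx] = htonl(1);

    printf("pending[%d][%d] = 0x%08x\n", empty_index, idx,
           (unsigned)pending[empty_index][idx]);

    for (int i = 0; i < child_count; i++)
        free(pending[i]);
    free(pending);
    return 0;
}
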
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- local->fd, local->cont.ftruncate.offset);
-
- if (!--call_count)
- break;
- }
- }
+void
+afr_brick_args_cleanup(void *opaque)
+{
+ afr_empty_brick_args_t *data = NULL;
- return 0;
+ data = opaque;
+ loc_wipe(&data->loc);
+ GF_FREE(data);
}
+int
+_afr_handle_empty_brick_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+ afr_brick_args_cleanup(opaque);
+ return 0;
+}
int
-afr_ftruncate_done (call_frame_t *frame, xlator_t *this)
+_afr_handle_empty_brick(void *opaque)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int empty_index = -1;
+ int ret = -1;
+ int op_errno = ENOMEM;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ char *op_type = NULL;
+ int op_type_len = 0;
+ afr_empty_brick_args_t *data = NULL;
+ call_frame_t *op_frame = NULL;
+
+ data = opaque;
+ frame = data->frame;
+ empty_index = data->empty_index;
+ if (!data->op_type)
+ goto out;
+
+ op_frame = copy_frame(frame);
+ if (!op_frame) {
+ ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_type = data->op_type;
+ op_type_len = strlen(op_type);
+ this = op_frame->this;
+ priv = this->private;
+
+ afr_set_lk_owner(op_frame, this, op_frame->root);
+ local = AFR_FRAME_INIT(op_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, &data->loc);
+
+ gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s",
+ priv->children[empty_index]->name, NULL);
+
+ ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
+ AFR_METADATA_TRANSACTION, op_type,
+ op_type_len);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ dict_unref(local->xdata_req);
+ dict_unref(local->xattr_req);
+ afr_matrix_cleanup(local->pending, priv->child_count);
+ local->pending = NULL;
+ local->xattr_req = NULL;
+ local->xdata_req = NULL;
+
+ ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
+ AFR_ENTRY_TRANSACTION, op_type,
+ op_type_len);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+out:
+ if (op_frame) {
+ AFR_STACK_DESTROY(op_frame);
+ }
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ return 0;
+}
- local = frame->local;
+int
+afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ char *data)
+{
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ local = frame->local;
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_int32_sizen(local->xdata_req, "heal-op",
+ GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_str_sizen(local->xdata_req, "child-name", data);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ /* Set spb-choice to -1 whether the heal succeeds or not:
+ * If the heal succeeds, spb-choice is no longer valid because
+ * the file is not in split-brain anymore.
+ * If the heal fails, leaving spb-choice set would keep serving
+ * reads from that choice, which is misleading.
+ */
+ ret = afr_inode_split_brain_choice_set(loc->inode, this, -1);
+ if (ret)
+ gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+ NULL);
+ afr_heal_splitbrain_file(frame, this, loc);
+ ret = 0;
+out:
+ if (ret < 0)
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+ return 0;
+}
+
+int
+afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len)
+{
+ int spb_child_index = -1;
+ char *spb_child_str = NULL;
- local->transaction.unwind (frame, this);
+ spb_child_str = alloca0(len + 1);
+ memcpy(spb_child_str, value, len);
- AFR_STACK_DESTROY (frame);
+ if (!strcmp(spb_child_str, "none"))
+ return -2;
- return 0;
+ spb_child_index = afr_get_child_index_from_name(this, spb_child_str);
+ if (spb_child_index < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "subvol=%s", spb_child_str, NULL);
+ }
+ return spb_child_index;
}
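
afr_get_split_brain_child_index has to deal with an xattr value that arrives as a (pointer, length) blob rather than a NUL-terminated string: it copies the value into a zeroed buffer, special-cases the literal "none" as -2, and otherwise maps the brick name to a child index. The standalone sketch below follows that shape; the child-name table, buffer size, and return conventions are illustrative only.

#include <stdio.h>
#include <string.h>

/* Illustrative child-name table; in AFR this comes from priv->children. */
static const char *children[] = { "r2-client-0", "r2-client-1" };

/* Returns the child index, -2 for the literal value "none",
 * and -1 when the name does not match any child. */
static int
split_brain_child_index(const void *value, size_t len)
{
    char name[256] = {0};

    if (len >= sizeof(name))
        return -1;
    memcpy(name, value, len);           /* value need not be NUL-terminated */

    if (strcmp(name, "none") == 0)
        return -2;

    for (size_t i = 0; i < sizeof(children) / sizeof(children[0]); i++)
        if (strcmp(name, children[i]) == 0)
            return (int)i;
    return -1;
}

int
main(void)
{
    const char v[] = "r2-client-1";     /* as it would arrive in a setxattr */
    printf("%d\n", split_brain_child_index(v, strlen(v)));  /* prints 1 */
    return 0;
}
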
+int
+afr_can_set_split_brain_choice(void *opaque)
+{
+ afr_spbc_timeout_t *data = opaque;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ loc_t *loc = NULL;
+ int ret = -1;
+
+ frame = data->frame;
+ loc = data->loc;
+ this = frame->this;
+
+ ret = afr_is_split_brain(frame, this, loc->inode, loc->gfid, &data->d_spb,
+ &data->m_spb);
+
+ if (ret)
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, "gfid=%s",
+ uuid_utoa(loc->gfid), NULL);
+ return ret;
+}
int
-afr_do_ftruncate (call_frame_t *frame, xlator_t *this)
+afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ dict_t *dict)
{
- call_frame_t * transaction_frame = NULL;
- afr_local_t * local = NULL;
- int op_ret = -1;
- int op_errno = 0;
+ void *choice_value = NULL;
+ void *resolve_value = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_spbc_timeout_t *data = NULL;
+ int len = 0;
+ int spb_child_index = -1;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ priv = this->private;
+
+ ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &choice_value, &len);
+ ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &resolve_value,
+ &len);
+ if (!choice_value && !resolve_value) {
+ ret = -1;
+ goto out;
+ }
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local) {
+ ret = 1;
+ goto out;
+ }
+
+ local->op = GF_FOP_SETXATTR;
+
+ if (choice_value) {
+ spb_child_index = afr_get_split_brain_child_index(this, choice_value,
+ len);
+ if (spb_child_index < 0) {
+ /* Case where value was "none" */
+ if (spb_child_index == -2)
+ spb_child_index = -1;
+ else {
+ ret = 1;
+ op_errno = EINVAL;
+ goto out;
+ }
+ }
+
+ data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spbc_timeout_t);
+ if (!data) {
+ ret = 1;
+ goto out;
+ }
+ data->spb_child_index = spb_child_index;
+ data->frame = frame;
+ loc_copy(&local->loc, loc);
+ data->loc = &local->loc;
+ ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice,
+ afr_set_split_brain_choice, NULL, data);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ "name=%s", loc->name, NULL);
+ ret = 1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+ ret = 0;
+ goto out;
+ }
+
+ if (resolve_value) {
+ spb_child_index = afr_get_split_brain_child_index(this, resolve_value,
+ len);
+ if (spb_child_index < 0) {
+ ret = 1;
+ goto out;
+ }
+
+ afr_split_brain_resolve_do(frame, this, loc,
+ priv->children[spb_child_index]->name);
+ ret = 0;
+ }
+out:
+ /* ret == 1 means the key was correct but the value was invalid */
+ if (ret == 1) {
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+ if (data)
+ GF_FREE(data);
+ ret = 0;
+ }
+ return ret;
+}
- local = frame->local;
+int
+afr_handle_spb_choice_timeout(xlator_t *this, call_frame_t *frame, dict_t *dict)
+{
+ int ret = -1;
+ int op_errno = 0;
+ uint64_t timeout = 0;
+ afr_private_t *priv = NULL;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ priv = this->private;
- transaction_frame->local = local;
- frame->local = NULL;
+ ret = dict_get_uint64(dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout);
+ if (!ret) {
+ priv->spb_choice_timeout = timeout * 60;
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ }
- local->op = GF_FOP_FTRUNCATE;
+ return ret;
+}
- local->transaction.fop = afr_ftruncate_wind;
- local->transaction.done = afr_ftruncate_done;
- local->transaction.unwind = afr_ftruncate_unwind;
+int
+afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ dict_t *dict)
+{
+ int ret = -1;
+ int ab_ret = -1;
+ int empty_index = -1;
+ int op_errno = EPERM;
+ char *empty_brick = NULL;
+ char *op_type = NULL;
+ afr_empty_brick_args_t *data = NULL;
+
+ ret = dict_get_str_sizen(dict, GF_AFR_REPLACE_BRICK, &empty_brick);
+ if (!ret)
+ op_type = GF_AFR_REPLACE_BRICK;
+
+ ab_ret = dict_get_str_sizen(dict, GF_AFR_ADD_BRICK, &empty_brick);
+ if (!ab_ret)
+ op_type = GF_AFR_ADD_BRICK;
+
+ if (ret && ab_ret)
+ goto out;
+
+ if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) {
+ gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR,
+ "op_type=%s", op_type, NULL);
+ ret = 1;
+ goto out;
+ }
+ empty_index = afr_get_child_index_from_name(this, empty_brick);
+
+ if (empty_index < 0) {
+ /* Didn't belong to this replica pair
+ * Just do a no-op
+ */
+ AFR_STACK_UNWIND(setxattr, frame, 0, 0, NULL);
+ return 0;
+ } else {
+ data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_empty_brick_t);
+ if (!data) {
+ ret = 1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+ data->frame = frame;
+ loc_copy(&data->loc, loc);
+ data->empty_index = empty_index;
+ data->op_type = op_type;
+ ret = synctask_new(this->ctx->env, _afr_handle_empty_brick,
+ _afr_handle_empty_brick_cbk, NULL, data);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ NULL);
+ ret = 1;
+ op_errno = ENOMEM;
+ afr_brick_args_cleanup(data);
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ if (ret == 1) {
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+ ret = 0;
+ }
+ return ret;
+}
- local->transaction.main_frame = frame;
+static int
+afr_handle_special_xattr(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ dict_t *dict)
+{
+ int ret = -1;
- local->transaction.start = local->cont.ftruncate.offset;
- local->transaction.len = 0;
+ ret = afr_handle_split_brain_commands(this, frame, loc, dict);
+ if (ret == 0)
+ goto out;
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ ret = afr_handle_spb_choice_timeout(this, frame, dict);
+ if (ret == 0)
+ goto out;
- op_ret = 0;
+ /* Applicable for replace-brick and add-brick commands */
+ ret = afr_handle_empty_brick(this, frame, loc, dict);
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL);
- }
-
- return 0;
+ return ret;
}
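
afr_handle_special_xattr is a small dispatcher: each handler returns 0 when it has recognised and consumed the request, and the next one is tried otherwise, with the plain setxattr transaction used only when none of them claim it. The sketch below shows the same chain-of-responsibility shape with plain function pointers; the handler names and key strings are placeholders, not AFR or GlusterFS API.

#include <stdio.h>
#include <string.h>

/* Placeholder request type standing in for (loc, dict) in the diff. */
struct request { const char *key; };

/* Each handler returns 0 when it has taken ownership of the request
 * (including unwinding it), non-zero to let the next handler try. */
typedef int (*xattr_handler)(struct request *req);

static int handle_split_brain(struct request *r)
{ return strcmp(r->key, "example.split-brain-choice") ? -1 : 0; }

static int handle_spb_timeout(struct request *r)
{ return strcmp(r->key, "example.split-brain-choice-timeout") ? -1 : 0; }

static int handle_empty_brick(struct request *r)
{ return strcmp(r->key, "example.replace-brick") ? -1 : 0; }

static int
handle_special_xattr(struct request *req)
{
    static const xattr_handler handlers[] = {
        handle_split_brain, handle_spb_timeout, handle_empty_brick,
    };
    for (size_t i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++)
        if (handlers[i](req) == 0)
            return 0;       /* handled: caller must not start a transaction */
    return -1;              /* fall through to the normal setxattr path */
}

int
main(void)
{
    struct request r = { "user.some.key" };
    printf("%d\n", handle_special_xattr(&r));   /* prints -1: not special */
    return 0;
}
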
-
int
-afr_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset)
+afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out);
- priv = this->private;
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ ret = afr_handle_special_xattr(this, frame, loc, dict);
+ if (ret == 0)
+ return 0;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- frame->local = local;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.ftruncate.offset = offset;
- local->cont.ftruncate.ino = fd->inode->ino;
+ local->cont.setxattr.dict = dict_ref(dict);
+ local->cont.setxattr.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- local->fd = fd_ref (fd);
+ if (!local->xdata_req)
+ goto out;
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- goto out;
- }
+ local->transaction.wind = afr_setxattr_wind;
+ local->transaction.unwind = afr_setxattr_unwind;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- if (fd_ctx->up_count < priv->up_count) {
- local->openfd_flush_cbk = afr_do_ftruncate;
- afr_openfd_flush (frame, this, fd);
- } else {
- afr_do_ftruncate (frame, this);
- }
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
+
+ local->op = GF_FOP_SETXATTR;
- op_ret = 0;
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
-}
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
-/* }}} */
+ return 0;
+}
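
afr_setxattr (and afr_fsetxattr below) first reject attempts to touch replication-internal xattrs via the GF_IF_INTERNAL_XATTR_GOTO guards on the "trusted.afr.*" and "trusted.glusterfs.afr.*" patterns. That macro is defined elsewhere in the tree and is not reproduced here; the sketch below only shows an equivalent namespace check for a single key using POSIX fnmatch().

#include <errno.h>
#include <fnmatch.h>
#include <stdio.h>

/* Reject client attempts to modify replication-internal xattrs.
 * Returns 0 when the key is allowed, -EPERM when it matches one of
 * the reserved patterns (mirroring the guard in the diff above). */
static int
check_internal_xattr(const char *key)
{
    static const char *reserved[] = {
        "trusted.afr.*",
        "trusted.glusterfs.afr.*",
    };
    for (size_t i = 0; i < sizeof(reserved) / sizeof(reserved[0]); i++)
        if (fnmatch(reserved[i], key, 0) == 0)
            return -EPERM;
    return 0;
}

int
main(void)
{
    printf("%d\n", check_internal_xattr("trusted.afr.r2-client-0")); /* -EPERM */
    printf("%d\n", check_internal_xattr("user.comment"));            /* 0 */
    return 0;
}
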
-/* {{{ setattr */
+/* {{{ fsetxattr */
int
-afr_setattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_fsetxattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- local->cont.setattr.preop_buf.ia_ino = local->cont.setattr.ino;
- local->cont.setattr.postop_buf.ia_ino = local->cont.setattr.ino;
-
- AFR_STACK_UNWIND (setattr, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.setattr.preop_buf,
- &local->cont.setattr.postop_buf);
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
+
+ AFR_STACK_UNWIND(fsetxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
}
+int
+afr_fsetxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
+}
int
-afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
+afr_fsetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_fsetxattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetxattr, local->fd,
+ local->cont.fsetxattr.dict, local->cont.fsetxattr.flags,
+ local->xdata_req);
+ return 0;
+}
- local = frame->local;
- priv = this->private;
+int
+afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- read_child = afr_read_child (this, local->loc.inode);
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out);
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- if (child_index == read_child) {
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
+ local->cont.fsetxattr.dict = dict_ref(dict);
+ local->cont.fsetxattr.flags = flags;
- local->success_count++;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ if (!local->xdata_req)
+ goto out;
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ local->transaction.wind = afr_fsetxattr_wind;
+ local->transaction.unwind = afr_fsetxattr_unwind;
- call_count = afr_frame_return (frame);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ local->op = GF_FOP_FSETXATTR;
- return 0;
-}
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
-int32_t
-afr_setattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- local = frame->local;
- priv = this->private;
+ AFR_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
+ return 0;
+}
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+/* }}} */
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+/* {{{ removexattr */
- local->call_count = call_count;
+int
+afr_removexattr_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc,
- &local->cont.setattr.in_buf,
- local->cont.setattr.valid);
-
- if (!--call_count)
- break;
- }
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(removexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
int
-afr_setattr_done (call_frame_t *frame, xlator_t *this)
+afr_removexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
-
- local = frame->local;
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
+}
- local->transaction.unwind (frame, this);
+int
+afr_removexattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ priv = this->private;
- return 0;
+ STACK_WIND_COOKIE(frame, afr_removexattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->removexattr, &local->loc,
+ local->cont.removexattr.name, local->xdata_req);
+ return 0;
}
-
int
-afr_setattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct iatt *buf, int32_t valid)
+afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out);
- priv = this->private;
+ GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->cont.removexattr.name = gf_strdup(name);
- transaction_frame->local = local;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- local->op_ret = -1;
+ if (!local->xdata_req)
+ goto out;
- local->cont.setattr.ino = loc->inode->ino;
+ local->transaction.wind = afr_removexattr_wind;
+ local->transaction.unwind = afr_removexattr_unwind;
- local->cont.setattr.in_buf = *buf;
- local->cont.setattr.valid = valid;
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- local->transaction.fop = afr_setattr_wind;
- local->transaction.done = afr_setattr_done;
- local->transaction.unwind = afr_setattr_unwind;
+ local->op = GF_FOP_REMOVEXATTR;
- loc_copy (&local->loc, loc);
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
-
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (setattr, frame, op_ret, op_errno, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+ return 0;
}
-/* {{{ fsetattr */
-
+/* {{{ fremovexattr */
int
-afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_fremovexattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- local->cont.fsetattr.preop_buf.ia_ino =
- local->cont.fsetattr.ino;
- local->cont.fsetattr.postop_buf.ia_ino =
- local->cont.fsetattr.ino;
-
- AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.fsetattr.preop_buf,
- &local->cont.fsetattr.postop_buf);
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(fremovexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
int
-afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
+afr_fremovexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_read_child (this, local->fd->inode);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
+}
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+int
+afr_fremovexattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
+ local = frame->local;
+ priv = this->private;
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
+ STACK_WIND_COOKIE(frame, afr_fremovexattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fremovexattr, local->fd,
+ local->cont.removexattr.name, local->xdata_req);
+ return 0;
+}
- if (child_index == read_child) {
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
+int
+afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- local->success_count++;
+ GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out);
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- call_count = afr_frame_return (frame);
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ local->cont.removexattr.name = gf_strdup(name);
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- return 0;
-}
+ if (!local->xdata_req)
+ goto out;
+ local->transaction.wind = afr_fremovexattr_wind;
+ local->transaction.unwind = afr_fremovexattr_unwind;
-int32_t
-afr_fsetattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- local = frame->local;
- priv = this->private;
+ local->op = GF_FOP_FREMOVEXATTR;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- local->call_count = call_count;
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fsetattr,
- local->fd,
- &local->cont.fsetattr.in_buf,
- local->cont.fsetattr.valid);
-
- if (!--call_count)
- break;
- }
- }
+ AFR_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
int
-afr_fsetattr_done (call_frame_t *frame, xlator_t *this)
+afr_fallocate_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
+
+ AFR_STACK_UNWIND(fallocate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
}
+int
+afr_fallocate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
+}
int
-afr_fsetattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iatt *buf, int32_t valid)
+afr_fallocate_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_fallocate_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fallocate, local->fd,
+ local->cont.fallocate.mode, local->cont.fallocate.offset,
+ local->cont.fallocate.len, local->xdata_req);
+ return 0;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+int
+afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- priv = this->private;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->cont.fallocate.mode = mode;
+ local->cont.fallocate.offset = offset;
+ local->cont.fallocate.len = len;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- transaction_frame->local = local;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- local->op_ret = -1;
+ if (!local->xdata_req)
+ goto out;
- local->cont.fsetattr.ino = fd->inode->ino;
+ local->op = GF_FOP_FALLOCATE;
- local->cont.fsetattr.in_buf = *buf;
- local->cont.fsetattr.valid = valid;
+ local->transaction.wind = afr_fallocate_wind;
+ local->transaction.unwind = afr_fallocate_unwind;
- local->transaction.fop = afr_fsetattr_wind;
- local->transaction.done = afr_fsetattr_done;
- local->transaction.unwind = afr_fsetattr_unwind;
+ local->transaction.main_frame = frame;
- local->fd = fd_ref (fd);
+ local->transaction.start = local->cont.fallocate.offset;
+ local->transaction.len = 0;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ afr_fix_open(fd, this);
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
+/* }}} */
-/* {{{ setxattr */
-
+/* {{{ discard */
int
-afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_discard_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (setxattr, main_frame,
- local->op_ret, local->op_errno)
- }
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_discard_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
+}
- local = frame->local;
- priv = this->private;
+int
+afr_discard_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_discard_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->discard, local->fd,
+ local->cont.discard.offset, local->cont.discard.len,
+ local->xdata_req);
+ return 0;
+}
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
+int
+afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
- }
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ local->cont.discard.offset = offset;
+ local->cont.discard.len = len;
- call_count = afr_frame_return (frame);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- return 0;
-}
+ if (!local->xdata_req)
+ goto out;
+ local->op = GF_FOP_DISCARD;
-int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
+ local->transaction.wind = afr_discard_wind;
+ local->transaction.unwind = afr_discard_unwind;
- local = frame->local;
- priv = this->private;
+ local->transaction.main_frame = frame;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local->transaction.start = local->cont.discard.offset;
+ local->transaction.len = 0;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ afr_fix_open(fd, this);
- local->call_count = call_count;
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc,
- local->cont.setxattr.dict,
- local->cont.setxattr.flags);
-
- if (!--call_count)
- break;
- }
- }
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
+/* {{{ zerofill */
int
-afr_setxattr_done (call_frame_t *frame, xlator_t *this)
+afr_zerofill_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
+
+    AFR_STACK_UNWIND(zerofill, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
}
int
-afr_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags)
+afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+int
+afr_zerofill_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_zerofill_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->zerofill, local->fd,
+ local->cont.zerofill.offset, local->cont.zerofill.len,
+ local->xdata_req);
+ return 0;
+}
- priv = this->private;
+int
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ local->cont.zerofill.offset = offset;
+ local->cont.zerofill.len = len;
+
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- transaction_frame->local = local;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- local->op_ret = -1;
+ if (!local->xdata_req)
+ goto out;
- local->cont.setxattr.dict = dict_ref (dict);
- local->cont.setxattr.flags = flags;
+ local->op = GF_FOP_ZEROFILL;
- local->transaction.fop = afr_setxattr_wind;
- local->transaction.done = afr_setxattr_done;
- local->transaction.unwind = afr_setxattr_unwind;
+ local->transaction.wind = afr_zerofill_wind;
+ local->transaction.unwind = afr_zerofill_unwind;
- loc_copy (&local->loc, loc);
+ local->transaction.main_frame = frame;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local->transaction.start = local->cont.zerofill.offset;
+ local->transaction.len = len;
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ afr_fix_open(fd, this);
- op_ret = 0;
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
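All of the fops in this hunk (fallocate, discard, zerofill, and the xattrop/fsync variants below) share one scaffold: copy the frame, initialise the AFR local, record the fop arguments plus wind/unwind callbacks, then hand off to afr_transaction(); any setup failure destroys the copied frame and unwinds the original frame with -1/op_errno. The stand-alone sketch below mirrors that control flow with toy types; every name in it is a stand-in, not the real AFR API.

/* Standalone sketch (stand-in types) of the setup/cleanup pattern shared by
 * the AFR inode-write fops above. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_frame {
    void *local;
};

static struct toy_frame *
toy_copy_frame(void)
{
    return calloc(1, sizeof(struct toy_frame));
}

static void
toy_destroy_frame(struct toy_frame *frame)
{
    free(frame->local);
    free(frame);
}

static int
toy_transaction(struct toy_frame *frame)
{
    (void)frame;
    return 0; /* pretend the hand-off to the transaction machinery worked */
}

static int
toy_fop(void)
{
    struct toy_frame *txn_frame = NULL;
    int op_errno = ENOMEM;
    int ret = -1;

    txn_frame = toy_copy_frame();      /* copy_frame()      */
    if (!txn_frame)
        goto out;

    txn_frame->local = calloc(1, 64);  /* AFR_FRAME_INIT()  */
    if (!txn_frame->local)
        goto out;

    ret = toy_transaction(txn_frame);  /* afr_transaction() */
    if (ret < 0) {
        op_errno = -ret;
        goto out;
    }

    return 0; /* success: the caller is unwound later from the fop unwind */

out:
    if (txn_frame)
        toy_destroy_frame(txn_frame); /* AFR_STACK_DESTROY() */
    printf("unwind fop with op_ret=-1, op_errno=%d\n", op_errno);
    return 0;
}

int
main(void)
{
    return toy_fop();
}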
-/* {{{ removexattr */
-
+int32_t
+afr_xattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, xattr, xdata);
+}
int
-afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_xattrop_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (removexattr, main_frame,
- local->op_ret, local->op_errno)
- }
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_xattrop_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->xattrop, &local->loc,
+ local->cont.xattrop.optype, local->cont.xattrop.xattr,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_xattrop_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
+ AFR_STACK_UNWIND(xattrop, main_frame, local->op_ret, local->op_errno,
+ local->xattr_rsp, local->xdata_rsp);
+ return 0;
+}
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+int32_t
+afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- call_count = afr_frame_return (frame);
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ local->cont.xattrop.xattr = dict_ref(xattr);
+ local->cont.xattrop.optype = optype;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- return 0;
-}
+ local->transaction.wind = afr_xattrop_wind;
+ local->transaction.unwind = afr_xattrop_unwind;
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
-int32_t
-afr_removexattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
+ local->op = GF_FOP_XATTROP;
- local = frame->local;
- priv = this->private;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->removexattr,
- &local->loc,
- local->cont.removexattr.name);
-
- if (!--call_count)
- break;
- }
- }
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
+int32_t
+afr_fxattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, xattr, xdata);
+}
int
-afr_removexattr_done (call_frame_t *frame, xlator_t *this)
+afr_fxattrop_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_fxattrop_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fxattrop, local->fd,
+ local->cont.xattrop.optype, local->cont.xattrop.xattr,
+ local->xdata_req);
+ return 0;
+}
- local->transaction.unwind (frame, this);
+int
+afr_fxattrop_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(fxattrop, main_frame, local->op_ret, local->op_errno,
+ local->xattr_rsp, local->xdata_rsp);
+ return 0;
+}
-int
-afr_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
+int32_t
+afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->cont.xattrop.xattr = dict_ref(xattr);
+ local->cont.xattrop.optype = optype;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ local->transaction.wind = afr_fxattrop_wind;
+ local->transaction.unwind = afr_fxattrop_unwind;
+
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
+
+ local->op = GF_FOP_FXATTROP;
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
+
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ AFR_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+int
+afr_fsync_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = frame->local;
- transaction_frame->local = local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- local->op_ret = -1;
+ AFR_STACK_UNWIND(fsync, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
- local->cont.removexattr.name = gf_strdup (name);
+ return 0;
+}
- local->transaction.fop = afr_removexattr_wind;
- local->transaction.done = afr_removexattr_done;
- local->transaction.unwind = afr_removexattr_unwind;
+int
+afr_fsync_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
+}
- loc_copy (&local->loc, loc);
+int
+afr_fsync_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local = frame->local;
+ priv = this->private;
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ STACK_WIND_COOKIE(frame, afr_fsync_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsync, local->fd,
+ local->cont.fsync.datasync, local->xdata_req);
+ return 0;
+}
- op_ret = 0;
+int
+afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int32_t op_errno = ENOMEM;
+ int8_t last_fsync = 0;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (xdata) {
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) {
+ if (last_fsync) {
+ local->transaction.disable_delayed_post_op = _gf_true;
+ }
+ }
+ } else {
+ local->xdata_req = dict_new();
+ }
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
+
+ local->op = GF_FOP_FSYNC;
+ local->cont.fsync.datasync = datasync;
+
+ if (afr_fd_has_witnessed_unstable_write(this, fd->inode)) {
+ /* don't care. we only wanted to CLEAR the bit */
+ }
+
+ local->transaction.wind = afr_fsync_wind;
+ local->transaction.unwind = afr_fsync_unwind;
+
+ local->transaction.main_frame = frame;
+
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (removexattr, frame, op_ret, op_errno);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
}
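afr_fsync() above reads an optional int8 "last-fsync" key from the request xdata and, when it is set, disables the delayed post-op for the transaction. A hedged sketch of how a layer above AFR might tag its final fsync with that hint, assuming the usual libglusterfs dict helpers (dict_new, dict_set_int8, dict_unref) are available; it pairs with the dict_get_int8 check in afr_fsync() above.

/* Sketch only: tagging the final fsync of a sequence so AFR skips the
 * delayed post-op.  Assumes libglusterfs dict.h is on the include path. */
#include <glusterfs/dict.h>

static dict_t *
build_last_fsync_xdata(void)
{
    dict_t *xdata = dict_new();

    if (!xdata)
        return NULL;

    /* afr_fsync() reads this with dict_get_int8 (see above). */
    if (dict_set_int8(xdata, "last-fsync", 1) < 0) {
        dict_unref(xdata);
        return NULL;
    }

    return xdata;
}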
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
index f9aa7bd36f6..a787069b7a1 100644
--- a/xlators/cluster/afr/src/afr-inode-write.h
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -1,72 +1,94 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __INODE_WRITE_H__
#define __INODE_WRITE_H__
int32_t
-afr_chmod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode);
+afr_chmod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dict_t *xdata);
int32_t
-afr_chown (call_frame_t *frame, xlator_t *this,
- loc_t *loc, uid_t uid, gid_t gid);
+afr_chown(call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, gid_t gid,
+ dict_t *xdata);
int
-afr_fchown (call_frame_t *frame, xlator_t *this,
- fd_t *fd, uid_t uid, gid_t gid);
+afr_fchown(call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, gid_t gid,
+ dict_t *xdata);
+
+int32_t
+afr_fchmod(call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode,
+ dict_t *xdata);
+
+int32_t
+afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata);
+
+int32_t
+afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata);
+
+int32_t
+afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata);
int32_t
-afr_fchmod (call_frame_t *frame, xlator_t *this,
- fd_t *fd, mode_t mode);
+afr_utimens(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct timespec tv[2], dict_t *xdata);
+
+int
+afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+ int32_t valid, dict_t *xdata);
+
+int
+afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
+ int32_t valid, dict_t *xdata);
int32_t
-afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- struct iobref *iobref);
+afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata);
int32_t
-afr_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset);
+afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata);
int32_t
-afr_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset);
+afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata);
int32_t
-afr_utimens (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct timespec tv[2]);
+afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata);
+
+int
+afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
int
-afr_setattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct iatt *buf, int32_t valid);
+afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata);
int
-afr_fsetattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iatt *buf, int32_t valid);
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata);
int32_t
-afr_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags);
+afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
int32_t
-afr_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name);
+afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
+int
+afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata);
#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index c059a6b9e5b..bc8eabe0f43 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -1,2165 +1,791 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include "dict.h"
-#include "byte-order.h"
-#include "common-utils.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
#include "afr.h"
#include "afr-transaction.h"
+#include "afr-messages.h"
#include <signal.h>
-
-#define LOCKED_NO 0x0 /* no lock held */
-#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */
-#define LOCKED_LOWER 0x2 /* for lower path */
-
-int
-afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
-
-static uint64_t afr_lock_number = 1;
-
-static uint64_t
-get_afr_lock_number ()
-{
- return (++afr_lock_number);
-}
-
-int
-afr_set_lock_number (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_number = get_afr_lock_number ();
-
- return 0;
-}
+#define LOCKED_NO 0x0 /* no lock held */
+#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */
+#define LOCKED_LOWER 0x2 /* for lower path */
void
-afr_set_lk_owner (call_frame_t *frame, xlator_t *this)
-{
- if (!frame->root->lk_owner) {
- gf_log (this->name, GF_LOG_TRACE,
- "Setting lk-owner=%llu",
- (unsigned long long) (unsigned long)frame->root);
- frame->root->lk_owner = (uint64_t) (unsigned long)frame->root;
- }
-}
-
-static int
-is_afr_lock_selfheal (afr_local_t *local)
-{
- afr_internal_lock_t *int_lock = NULL;
- int ret = -1;
-
- int_lock = &local->internal_lock;
-
- switch (int_lock->selfheal_lk_type) {
- case AFR_DATA_SELF_HEAL_LK:
- case AFR_METADATA_SELF_HEAL_LK:
- ret = 1;
- break;
- case AFR_ENTRY_SELF_HEAL_LK:
- ret = 0;
- break;
- }
-
- return ret;
-
-}
-
-int32_t
-internal_lock_count (call_frame_t *frame, xlator_t *this,
- afr_fd_ctx_t *fd_ctx)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int32_t call_count = 0;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- if (fd_ctx) {
- GF_ASSERT (local->fd);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] && fd_ctx->opened_on[i])
- ++call_count;
- }
- } else {
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i])
- ++call_count;
- }
- }
-
- return call_count;
-}
-
-static void
-afr_print_inodelk (char *str, int size, int cmd,
- struct gf_flock *flock, uint64_t owner)
+afr_lockee_cleanup(afr_lockee_t *lockee)
{
- char *cmd_str = NULL;
- char *type_str = NULL;
-
- switch (cmd) {
-#if F_GETLK != F_GETLK64
- case F_GETLK64:
-#endif
- case F_GETLK:
- cmd_str = "GETLK";
- break;
+ if (lockee->fd) {
+ fd_unref(lockee->fd);
+ lockee->fd = NULL;
+ } else {
+ loc_wipe(&lockee->loc);
+ }
-#if F_SETLK != F_SETLK64
- case F_SETLK64:
-#endif
- case F_SETLK:
- cmd_str = "SETLK";
- break;
-
-#if F_SETLKW != F_SETLKW64
- case F_SETLKW64:
-#endif
- case F_SETLKW:
- cmd_str = "SETLKW";
- break;
-
- default:
- cmd_str = "<null>";
- break;
- }
-
- switch (flock->l_type) {
- case F_RDLCK:
- type_str = "READ";
- break;
- case F_WRLCK:
- type_str = "WRITE";
- break;
- case F_UNLCK:
- type_str = "UNLOCK";
- break;
- default:
- type_str = "UNKNOWN";
- break;
- }
-
- snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, "
- "start=%llu, len=%llu, pid=%llu, lk-owner=%llu",
- cmd_str, type_str, (unsigned long long) flock->l_start,
- (unsigned long long) flock->l_len,
- (unsigned long long) flock->l_pid,
- (unsigned long long) owner);
+ GF_FREE(lockee->basename);
+ lockee->basename = NULL;
+ GF_FREE(lockee->locked_nodes);
+ lockee->locked_nodes = NULL;
-}
-
-static void
-afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd,
- int child_index)
-{
- snprintf (str, size, "path=%s, fd=%p, child=%d",
- loc->path ? loc->path : "<nul>",
- fd ? fd : NULL,
- child_index);
+ return;
}
void
-afr_print_entrylk (char *str, int size, const char *basename,
- uint64_t owner)
+afr_lockees_cleanup(afr_internal_lock_t *int_lock)
{
- snprintf (str, size, "Basename=%s, lk-owner=%llu",
- basename ? basename : "<nul>",
- (unsigned long long)owner);
-}
-
-static void
-afr_print_verdict (int op_ret, int op_errno, char *str)
-{
- if (op_ret < 0) {
- if (op_errno == EAGAIN)
- strcpy (str, "EAGAIN");
- else
- strcpy (str, "FAILED");
- }
- else
- strcpy (str, "GRANTED");
-}
+ int i = 0;
-static void
-afr_set_lock_call_type (afr_lock_call_type_t lock_call_type,
- char *lock_call_type_str,
- afr_internal_lock_t *int_lock)
-{
- switch (lock_call_type) {
- case AFR_INODELK_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL");
- break;
- case AFR_INODELK_NB_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL");
- break;
- case AFR_ENTRYLK_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL");
- break;
- case AFR_ENTRYLK_NB_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL");
- break;
- default:
- strcpy (lock_call_type_str, "UNKNOWN");
- break;
- }
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ afr_lockee_cleanup(&int_lock->lockee[i]);
+ }
+ return;
}
-
-static void
-afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, struct gf_flock *flock,
- int op_ret, int op_errno, int32_t child_index)
-{
- xlator_t *this = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- char lockee[256];
- char lock_call_type_str[256];
- char verdict[16];
-
- this = THIS;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (!priv->inodelk_trace) {
- return;
- }
-
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- afr_print_verdict (op_ret, op_errno, verdict);
-
- gf_log (this->name, GF_LOG_INFO,
- "[%s %s] [%s] Lockee={%s} Number={%llu}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY",
- verdict,
- lockee,
- (unsigned long long) int_lock->lock_number);
-
-}
-
-static void
-afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, struct gf_flock *flock,
- int32_t cmd, int32_t child_index)
-{
- xlator_t *this = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
-
- char lock[256];
- char lockee[256];
- char lock_call_type_str[256];
-
- this = THIS;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (!priv->inodelk_trace) {
- return;
- }
-
- afr_print_inodelk (lock, 256, cmd, flock, frame->root->lk_owner);
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- gf_log (this->name, GF_LOG_INFO,
- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST",
- lock, lockee,
- (unsigned long long) int_lock->lock_number);
-
-}
-
-static void
-afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, const char *basename,
- int32_t child_index)
-{
- xlator_t *this = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
-
- char lock[256];
- char lockee[256];
- char lock_call_type_str[256];
-
- this = THIS;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (!priv->entrylk_trace) {
- return;
- }
-
- afr_print_entrylk (lock, 256, basename, frame->root->lk_owner);
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- gf_log (this->name, GF_LOG_INFO,
- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST",
- lock, lockee,
- (unsigned long long) int_lock->lock_number);
-}
-
-static void
-afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, const char *basename, int op_ret,
- int op_errno, int32_t child_index)
-{
- xlator_t *this = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- char lock[256];
- char lockee[256];
- char lock_call_type_str[256];
- char verdict[16];
-
- this = THIS;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (!priv->entrylk_trace) {
- return;
- }
-
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- afr_print_verdict (op_ret, op_errno, verdict);
-
- gf_log (this->name, GF_LOG_INFO,
- "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY",
- verdict,
- lock, lockee,
- (unsigned long long) int_lock->lock_number);
-
-}
-
-static int
-transaction_lk_op (afr_local_t *local)
-{
- afr_internal_lock_t *int_lock = NULL;
- int ret = -1;
-
- int_lock = &local->internal_lock;
-
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "lk op is for a transaction");
- ret = 1;
- }
- else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "lk op is for a self heal");
-
- ret = 0;
- }
-
- if (ret == -1)
- gf_log (THIS->name, GF_LOG_DEBUG,
- "lk op is not set");
-
- return ret;
-
-}
-
-static int
-is_afr_lock_transaction (afr_local_t *local)
-{
- int ret = 0;
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- ret = 1;
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- ret = 0;
- break;
-
- }
-
- return ret;
+int
+afr_entry_lockee_cmp(const void *l1, const void *l2)
+{
+ const afr_lockee_t *r1 = l1;
+ const afr_lockee_t *r2 = l2;
+ int ret = 0;
+ uuid_t gfid1 = {0};
+ uuid_t gfid2 = {0};
+
+ loc_gfid((loc_t *)&r1->loc, gfid1);
+ loc_gfid((loc_t *)&r2->loc, gfid2);
+ ret = gf_uuid_compare(gfid1, gfid2);
+ /*Entrylks with NULL basename are the 'smallest'*/
+ if (ret == 0) {
+ if (!r1->basename)
+ return -1;
+ if (!r2->basename)
+ return 1;
+ ret = strcmp(r1->basename, r2->basename);
+ }
+
+ if (ret <= 0)
+ return -1;
+ else
+ return 1;
}
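afr_entry_lockee_cmp() orders entry lockees by parent gfid and then basename, with a NULL basename sorting lowest; notably it never returns 0, presumably because two identical lockees are not expected in one transaction. Sorting lockees into a single global order is what lets every client acquire multi-entry locks in the same sequence and avoid deadlocks. A stand-alone illustration of the same idea with qsort and plain strings in place of gfids:

/* Standalone illustration: sort lock targets into one global order before
 * acquiring them, so two clients can never lock in opposite orders. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_lockee {
    const char *parent;   /* stands in for the parent gfid */
    const char *basename; /* NULL means "the whole directory" */
};

static int
toy_lockee_cmp(const void *a, const void *b)
{
    const struct toy_lockee *l = a;
    const struct toy_lockee *r = b;
    int ret = strcmp(l->parent, r->parent);

    if (ret)
        return ret;
    if (!l->basename && !r->basename)
        return 0;
    if (!l->basename) /* NULL basename sorts lowest, as in afr_entry_lockee_cmp */
        return -1;
    if (!r->basename)
        return 1;
    return strcmp(l->basename, r->basename);
}

int
main(void)
{
    struct toy_lockee lockees[] = {
        {"gfid-b", "newname"},
        {"gfid-a", "oldname"},
        {"gfid-a", NULL},
    };
    size_t i, n = sizeof(lockees) / sizeof(lockees[0]);

    qsort(lockees, n, sizeof(lockees[0]), toy_lockee_cmp);

    for (i = 0; i < n; i++)
        printf("%zu: %s/%s\n", i, lockees[i].parent,
               lockees[i].basename ? lockees[i].basename : "(null)");
    return 0;
}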
-static int
-initialize_entrylk_variables (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
-
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->entrylk_lock_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
-
- for (i = 0; i < priv->child_count; i++) {
- int_lock->entry_locked_nodes[i] = 0;
- }
-
- return 0;
-}
+int
+afr_lock_blocking(call_frame_t *frame, xlator_t *this, int child_index);
-static int
-initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
+void
+afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner)
{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
+ gf_msg_trace(this->name, 0, "Setting lk-owner=%llu",
+ (unsigned long long)(unsigned long)lk_owner);
- int_lock->inodelk_lock_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
-
- for (i = 0; i < priv->child_count; i++) {
- int_lock->inode_locked_nodes[i] = 0;
- }
-
- return 0;
+ set_lk_owner_from_ptr(&frame->root->lk_owner, lk_owner);
}
-loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2)
+int32_t
+internal_lock_count(call_frame_t *frame, xlator_t *this)
{
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int32_t call_count = 0;
+ int i = 0;
- ret = strcmp (l1->path, l2->path);
+ local = frame->local;
+ priv = this->private;
- if (ret == 0)
- ret = strcmp (b1, b2);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i])
+ ++call_count;
+ }
- if (ret <= 0)
- return l1;
- else
- return l2;
+ return call_count;
}
int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count)
-
+afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
+ int child_count)
{
- int i = 0;
- int call_count = 0;
+ int ret = -ENOMEM;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count];
- for (i = 0; i < child_count; i++) {
- if (locked_nodes[i] & LOCKED_YES)
- call_count++;
- }
+ GF_ASSERT(int_lock->lockee_count < AFR_LOCKEE_COUNT_MAX);
+ loc_copy(&lockee->loc, loc);
+ lockee->basename = (basename) ? gf_strdup(basename) : NULL;
+ if (basename && !lockee->basename)
+ goto out;
- return call_count;
-}
+ lockee->locked_count = 0;
+ lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes),
+ gf_afr_mt_afr_node_character);
-/* FIXME: What if UNLOCK fails */
-static int32_t
-afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- int call_count = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK (&frame->lock);
- {
- call_count = --int_lock->lk_call_count;
- }
- UNLOCK (&frame->lock);
+ if (!lockee->locked_nodes)
+ goto out;
- if (call_count == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "All internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- }
-
- return 0;
+ ret = 0;
+ int_lock->lockee_count++;
+out:
+ if (ret) {
+ afr_lockee_cleanup(lockee);
+ }
+ return ret;
}
-static int32_t
-afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+int
+afr_add_inode_lockee(afr_local_t *local, int child_count)
{
- afr_local_t *local = NULL;
-
- local = frame->local;
+ int ret = -ENOMEM;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count];
- afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION,
- AFR_UNLOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
-
- if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: unlock failed %s",
- local->loc.path, strerror (op_errno));
- }
+ if (local->fd) {
+ lockee->fd = fd_ref(local->fd);
+ } else {
+ loc_copy(&lockee->loc, &local->loc);
+ }
- afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno);
+ lockee->locked_count = 0;
+ lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes),
+ gf_afr_mt_afr_node_character);
- return 0;
+ if (!lockee->locked_nodes)
+ goto out;
+ ret = 0;
+ int_lock->lockee_count++;
+out:
+ if (ret) {
+ afr_lockee_cleanup(lockee);
+ }
+ return ret;
}
static int
-afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
+initialize_internal_lock_variables(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- struct gf_flock flock = {0,};
- int call_count = 0;
- int i = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = F_UNLCK;
-
- call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes,
- priv->child_count);
-
- int_lock->lk_call_count = call_count;
-
- if (!call_count) {
- gf_log (this->name, GF_LOG_TRACE,
- "No internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->inode_locked_nodes[i] & LOCKED_YES) {
- if (local->fd) {
- afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION,
- AFR_UNLOCK_OP, &flock, F_SETLK, i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
- (void *) (long)i,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- this->name, local->fd,
- F_SETLK, &flock);
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
- if (!--call_count)
- break;
+ int i = 0;
- } else {
- afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION,
- AFR_UNLOCK_OP, &flock, F_SETLK, i);
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
- (void *) (long)i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- this->name, &local->loc,
- F_SETLK, &flock);
+ int_lock->lock_count = 0;
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_op_errno = 0;
+ int_lock->lk_attempted_count = 0;
- if (!--call_count)
- break;
+ for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) {
+ if (!int_lock->lockee[i].locked_nodes)
+ break;
+ int_lock->lockee[i].locked_count = 0;
+ memset(int_lock->lockee[i].locked_nodes, 0,
+ sizeof(*int_lock->lockee[i].locked_nodes) * priv->child_count);
+ }
- }
-
- }
-
- }
-
-out:
- return 0;
+ return 0;
}
-static int32_t
-afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+int
+afr_lockee_locked_nodes_count(afr_internal_lock_t *int_lock)
{
- afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_UNLOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
+ int call_count = 0;
+ int i = 0;
- afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno);
+ for (i = 0; i < int_lock->lockee_count; i++)
+ call_count += int_lock->lockee[i].locked_count;
- return 0;
+ return call_count;
}
-static int
-afr_unlock_entrylk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
- int call_count = 0;
- int i = -1;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
-
- call_count = afr_locked_nodes_count (int_lock->entry_locked_nodes,
- priv->child_count);
- int_lock->lk_call_count = call_count;
-
- if (!call_count){
- gf_log (this->name, GF_LOG_TRACE,
- "No internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->entry_locked_nodes[i] & LOCKED_YES) {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_UNLOCK_OP, basename, i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- loc, basename,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
-
- if (!--call_count)
- break;
- }
- }
-
-out:
- return 0;
-
-}
+int
+afr_locked_nodes_count(unsigned char *locked_nodes, int child_count)
-static int32_t
-afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int done = 0;
- int child_index = (long) cookie;
+ int i = 0;
+ int call_count = 0;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
+ for (i = 0; i < child_count; i++) {
+ if (locked_nodes[i] & LOCKED_YES)
+ call_count++;
+ }
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
- done = 1;
- }
-
- local->child_up[child_index] = 0;
- local->op_errno = op_errno;
- int_lock->lock_op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- if ((op_ret == -1) &&
- (op_errno == ENOSYS)) {
- afr_unlock (frame, this);
- } else {
- if (op_ret == 0) {
- int_lock->locked_nodes[child_index]
- |= LOCKED_YES;
- int_lock->lock_count++;
- }
- afr_lock_blocking (frame, this, child_index + 1);
- }
-
- return 0;
+ return call_count;
}
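Per-child lock state in this file is one byte per child carrying the LOCKED_NO/LOCKED_YES/LOCKED_LOWER bits defined near the top, and afr_locked_nodes_count() simply counts the children whose LOCKED_YES bit is set. A minimal stand-alone version of that bookkeeping:

/* Standalone sketch of the per-child lock bookkeeping: one byte per child,
 * tested against the LOCKED_YES bit exactly as afr_locked_nodes_count does. */
#include <stdio.h>

#define LOCKED_NO 0x0
#define LOCKED_YES 0x1
#define LOCKED_LOWER 0x2

static int
count_locked(const unsigned char *locked_nodes, int child_count)
{
    int i, count = 0;

    for (i = 0; i < child_count; i++)
        if (locked_nodes[i] & LOCKED_YES)
            count++;
    return count;
}

int
main(void)
{
    unsigned char nodes[4] = {LOCKED_YES, LOCKED_NO, LOCKED_YES, LOCKED_LOWER};

    printf("locked on %d of 4 children\n", count_locked(nodes, 4));
    return 0;
}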
-static int32_t
-afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static void
+afr_log_locks_failure(call_frame_t *frame, char *where, char *what,
+ int op_errno)
{
- afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
+ xlator_t *this = frame->this;
+ gf_lkowner_t *lk_owner = &frame->root->lk_owner;
+ afr_local_t *local = frame->local;
+ const char *fop = NULL;
+ char *gfid = NULL;
+ const char *name = NULL;
- afr_lock_cbk (frame, cookie, this, op_ret, op_errno);
- return 0;
+ fop = gf_fop_list[local->op];
+ switch (local->transaction.type) {
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ case AFR_ENTRY_TRANSACTION:
+ switch (local->op) {
+ case GF_FOP_LINK:
+ gfid = uuid_utoa(local->newloc.pargfid);
+ name = local->newloc.name;
+ break;
+ default:
+ gfid = uuid_utoa(local->loc.pargfid);
+ name = local->loc.name;
+ break;
+ }
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_INTERNAL_LKS_FAILED,
+ "Unable to do entry %s with lk-owner:%s on %s "
+ "while attempting %s on {pgfid:%s, name:%s}.",
+ what, lkowner_utoa(lk_owner), where, fop, gfid, name);
+ break;
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ gfid = uuid_utoa(local->inode->gfid);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_INTERNAL_LKS_FAILED,
+ "Unable to do inode %s with lk-owner:%s on %s "
+ "while attempting %s on gfid:%s.",
+ what, lkowner_utoa(lk_owner), where, fop, gfid);
+ break;
+ }
}
static int32_t
-afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_unlock_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *lower_name = NULL;
- const char *higher_name = NULL;
- int child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
-
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
-
- local->op_ret = op_ret;
- }
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ int lockee_num = 0;
+ int call_count = 0;
+ int child_index = 0;
+ int ret = 0;
- local->child_up[child_index] = 0;
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- if (op_ret != 0) {
- afr_unlock (frame, this);
- goto out;
- } else {
- int_lock->lower_locked_nodes[child_index] |= LOCKED_LOWER;
- int_lock->lock_count++;
- }
-
- /* The lower path has been locked. Now lock the higher path */
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+ lockee_num = (int)((long)cookie) / priv->child_count;
+ child_index = (int)((long)cookie) % priv->child_count;
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ afr_log_locks_failure(frame, priv->children[child_index]->name,
+ "unlock", op_errno);
+ }
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
+ int_lock->lockee[lockee_num].locked_nodes[child_index] &= LOCKED_NO;
+ if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1)
+ ret = afr_write_subvol_reset(frame, this);
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
+ LOCK(&frame->lock);
+ {
+ call_count = --int_lock->lk_call_count;
+ }
+ UNLOCK(&frame->lock);
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, higher_name, child_index);
+ if (call_count == 0) {
+ int_lock->lock_cbk(frame, this);
+ }
-
- STACK_WIND_COOKIE (frame, afr_lock_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name, higher, higher_name,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
-
-out:
- return 0;
+ return ret;
}
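afr_unlock_common_cbk() above (and afr_lock_cbk() below) recover two indices from a single cookie: lockee_num = cookie / child_count and child_index = cookie % child_count, which implies the winding side packs the cookie as lockee_num * child_count + child_index. A tiny stand-alone round-trip of that packing:

/* Standalone round-trip of the cookie packing used by the lock/unlock
 * callbacks above: one integer encodes (lockee_num, child_index). */
#include <assert.h>
#include <stdio.h>

static long
pack_cookie(int lockee_num, int child_index, int child_count)
{
    return (long)lockee_num * child_count + child_index;
}

int
main(void)
{
    int child_count = 3;
    int lockee_num, child_index;

    for (lockee_num = 0; lockee_num < 2; lockee_num++) {
        for (child_index = 0; child_index < child_count; child_index++) {
            long cookie = pack_cookie(lockee_num, child_index, child_count);

            assert((int)(cookie / child_count) == lockee_num);
            assert((int)(cookie % child_count) == child_index);
            printf("cookie=%ld -> lockee=%d child=%d\n", cookie,
                   (int)(cookie / child_count), (int)(cookie % child_count));
        }
    }
    return 0;
}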
-static int32_t
-afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long)cookie);
-
- afr_lock_cbk (frame, cookie, this, op_ret, op_errno);
- return 0;
-}
-
-static int
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- memcpy (int_lock->inode_locked_nodes,
- int_lock->locked_nodes,
- priv->child_count);
- int_lock->inodelk_lock_count = int_lock->lock_count;
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
+void
+afr_internal_lock_wind(call_frame_t *frame,
+ int32_t (*cbk)(call_frame_t *, void *, xlator_t *,
+ int32_t, int32_t, dict_t *),
+ void *cookie, int child, int lockee_num,
+ gf_boolean_t blocking, gf_boolean_t unlock)
+{
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ entrylk_cmd cmd = ENTRYLK_LOCK_NB;
+ int32_t cmd1 = F_SETLK;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ switch (local->transaction.type) {
case AFR_ENTRY_TRANSACTION:
- memcpy (int_lock->entry_locked_nodes,
- int_lock->locked_nodes,
- priv->child_count);
- int_lock->entrylk_lock_count = int_lock->lock_count;
- break;
- }
-
- return 0;
-
-}
-
-int
-afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *lower_name = NULL;
- const char *higher_name = NULL;
- struct gf_flock flock = {0,};
- uint64_t ctx = 0;
- int ret = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
-
- if (local->fd) {
- ret = fd_ctx_get (local->fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "unable to get fd ctx for fd=%p",
- local->fd);
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
-
- afr_copy_locked_nodes (frame, this);
-
- afr_unlock (frame, this);
-
- return 0;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- /* skip over children that or down
- or don't have the fd open */
-
- while ((child_index < priv->child_count)
- && (!local->child_up[child_index]
- || !fd_ctx->opened_on[child_index]))
-
- child_index++;
- } else {
- /* skip over children that are down */
- while ((child_index < priv->child_count)
- && !local->child_up[child_index])
- child_index++;
- }
-
- if ((child_index == priv->child_count) &&
- int_lock->lock_count == 0) {
-
- gf_log (this->name, GF_LOG_INFO,
- "unable to lock on even one child");
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
-
- afr_copy_locked_nodes (frame, this);
-
- afr_unlock(frame, this);
-
- return 0;
-
- }
-
- if ((child_index == priv->child_count)
- || (int_lock->lock_count ==
- afr_up_children_count (priv->child_count,
- local->child_up))) {
-
- /* we're done locking */
-
- gf_log (this->name, GF_LOG_DEBUG,
- "we're done locking");
-
- afr_copy_locked_nodes (frame, this);
-
- int_lock->lock_op_ret = 0;
- int_lock->lock_cbk (frame, this);
- return 0;
- }
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
-
- if (local->fd) {
- afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLKW,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->finodelk,
- this->name, local->fd,
- F_SETLKW, &flock);
-
- } else {
- afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLKW,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->inodelk,
- this->name, &local->loc,
- F_SETLKW, &flock);
- }
-
- break;
-
case AFR_ENTRY_RENAME_TRANSACTION:
- {
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, lower_name, child_index);
-
-
- STACK_WIND_COOKIE (frame, afr_lock_lower_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name, lower, lower_name,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
-
- break;
- }
-
- case AFR_ENTRY_TRANSACTION:
- if (local->fd) {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, local->transaction.basename,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->fentrylk,
- this->name, local->fd,
- local->transaction.basename,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
- } else {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, local->transaction.basename,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name,
- &local->transaction.parent_loc,
- local->transaction.basename,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
- }
-
- break;
- }
-
- return 0;
-
-
-}
+ if (unlock) {
+ cmd = ENTRYLK_UNLOCK;
+ } else if (blocking) { /*Doesn't make sense to have blocking
+ unlock*/
+ cmd = ENTRYLK_LOCK;
+ }
+
+ if (local->fd) {
+ STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->fentrylk,
+ int_lock->domain,
+ int_lock->lockee[lockee_num].fd,
+ int_lock->lockee[lockee_num].basename, cmd,
+ ENTRYLK_WRLCK, NULL);
+ } else {
+ STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->entrylk,
+ int_lock->domain,
+ &int_lock->lockee[lockee_num].loc,
+ int_lock->lockee[lockee_num].basename, cmd,
+ ENTRYLK_WRLCK, NULL);
+ }
+ break;
-int32_t
-afr_blocking_lock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- initialize_inodelk_variables (frame, this);
- break;
+ flock = int_lock->lockee[lockee_num].flock;
+ if (unlock) {
+ flock.l_type = F_UNLCK;
+ } else if (blocking) { /*Doesn't make sense to have blocking
+ unlock*/
+ cmd1 = F_SETLKW;
+ }
+
+ if (local->fd) {
+ STACK_WIND_COOKIE(
+ frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->finodelk, int_lock->domain,
+ int_lock->lockee[lockee_num].fd, cmd1, &flock, NULL);
+ } else {
+ STACK_WIND_COOKIE(
+ frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->inodelk, int_lock->domain,
+ &int_lock->lockee[lockee_num].loc, cmd1, &flock, NULL);
+ }
+ break;
+ }
+}
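afr_internal_lock_wind() derives the entrylk command and the inodelk cmd/lock type from just two flags: unlock takes precedence, blocking upgrades the non-blocking default, and a blocking unlock is never issued (as the inline comments note). A compact stand-alone table of that decision, with the GlusterFS enums replaced by strings so it builds on its own; in the real code the inodelk type comes from the lockee's stored flock rather than a constant.

/* Standalone sketch of the command selection in afr_internal_lock_wind(). */
#include <stdio.h>

struct lock_choice {
    const char *entrylk_cmd; /* ENTRYLK_* in the real code   */
    const char *inodelk_cmd; /* F_SETLK / F_SETLKW           */
    const char *lock_type;   /* flock.l_type for the inodelk */
};

static struct lock_choice
choose(int blocking, int unlock)
{
    struct lock_choice c = {"ENTRYLK_LOCK_NB", "F_SETLK", "F_WRLCK"};

    if (unlock) {
        c.entrylk_cmd = "ENTRYLK_UNLOCK";
        c.lock_type = "F_UNLCK";
    } else if (blocking) { /* a blocking unlock is never requested */
        c.entrylk_cmd = "ENTRYLK_LOCK";
        c.inodelk_cmd = "F_SETLKW";
    }
    return c;
}

int
main(void)
{
    int blocking, unlock;

    for (unlock = 0; unlock <= 1; unlock++)
        for (blocking = 0; blocking <= 1; blocking++) {
            struct lock_choice c = choose(blocking, unlock);

            printf("blocking=%d unlock=%d -> entrylk=%s inodelk=%s type=%s\n",
                   blocking, unlock, c.entrylk_cmd, c.inodelk_cmd, c.lock_type);
        }
    return 0;
}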
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- initialize_entrylk_variables (frame, this);
+static int
+afr_unlock_now(call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int child_index = 0;
+ int lockee_num = 0;
+ int i = -1;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ call_count = afr_lockee_locked_nodes_count(int_lock);
+
+ int_lock->lk_call_count = call_count;
+
+ if (!call_count) {
+ gf_msg_trace(this->name, 0, "No internal locks unlocked");
+ int_lock->lock_cbk(frame, this);
+ goto out;
+ }
+
+ for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
+ lockee_num = i / priv->child_count;
+ child_index = i % priv->child_count;
+ if (int_lock->lockee[lockee_num].locked_nodes[child_index] &
+ LOCKED_YES) {
+ afr_internal_lock_wind(frame, afr_unlock_common_cbk,
+ (void *)(long)i, child_index, lockee_num,
+ _gf_false, _gf_true);
+ if (!--call_count)
break;
}
+ }
- afr_lock_blocking (frame, this, 0);
-
- return 0;
+out:
+ return 0;
}
static int32_t
-afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
-
- LOCK (&frame->lock);
- {
- call_count = --int_lock->lk_call_count;
- }
- UNLOCK (&frame->lock);
-
- if (op_ret < 0 ) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
-
- local->child_up[child_index] = 0;
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- } else if (op_ret == 0) {
- int_lock->entry_locked_nodes[child_index]
- |= LOCKED_YES;
- int_lock->entrylk_lock_count++;
- }
-
- if (call_count == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "Last locking reply received");
- /* all locks successfull. Proceed to call FOP */
- if (int_lock->entrylk_lock_count ==
- afr_up_children_count (priv->child_count, local->child_up)) {
- gf_log (this->name, GF_LOG_TRACE,
- "All servers locked. Calling the cbk");
- int_lock->lock_op_ret = 0;
- int_lock->lock_cbk (frame, this);
- }
- /* Not all locks were successfull. Unlock and try locking
- again, this time with serially blocking locks */
- else {
- gf_log (this->name, GF_LOG_TRACE,
- "%d servers locked. Trying again with blocking calls",
- int_lock->lock_count);
-
- afr_unlock(frame, this);
+afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int cky = (long)cookie;
+ int child_index = 0;
+ int lockee_num = 0;
+
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ child_index = ((int)cky) % priv->child_count;
+ lockee_num = ((int)cky) / priv->child_count;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_msg(this->name, GF_LOG_ERROR, ENOSYS,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED,
+ "subvolume does not support locking. "
+ "please load features/locks xlator on server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+ }
+
+ local->op_errno = op_errno;
+ int_lock->lock_op_errno = op_errno;
+ }
+
+ int_lock->lk_attempted_count++;
+ }
+ UNLOCK(&frame->lock);
+
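+    /* ENOSYS is fatal: the locks xlator is missing, so release whatever
+     * has been acquired. Any other reply, success or failure, is recorded
+     * and the serial walk moves on to the next (lockee, child) pair. */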
+ if ((op_ret == -1) && (op_errno == ENOSYS)) {
+ afr_unlock_now(frame, this);
+ } else {
+ if (op_ret == 0) {
+ int_lock->lockee[lockee_num]
+ .locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_num].locked_count++;
+ int_lock->lock_count++;
+ if (local->transaction.type == AFR_DATA_TRANSACTION) {
+ LOCK(&local->inode->lock);
+ {
+ local->inode_ctx->lock_count++;
}
+ UNLOCK(&local->inode->lock);
+ }
}
+ afr_lock_blocking(frame, this, cky + 1);
+ }
- return 0;
+ return 0;
}
-int
-afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+_is_lock_wind_needed(afr_local_t *local, int child_index)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
- int32_t call_count = 0;
- int i = 0;
- uint64_t ctx = 0;
- int ret = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- initialize_entrylk_variables (frame, this);
-
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
-
- if (local->fd) {
- ret = fd_ctx_get (local->fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "unable to get fd ctx for fd=%p",
- local->fd);
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
- local->op_errno = EINVAL;
- int_lock->lock_op_errno = EINVAL;
-
- return -1;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ if (!local->child_up[child_index])
+ return _gf_false;
- call_count = internal_lock_count (frame, this, fd_ctx);
- int_lock->lk_call_count = call_count;
-
- if (!call_count) {
- gf_log (this->name, GF_LOG_INFO,
- "fd not open on any subvolumes. aborting.");
- afr_unlock (frame, this);
- goto out;
- }
-
- /* Send non-blocking entrylk calls only on up children
- and where the fd has been opened */
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] && fd_ctx->opened_on[i]) {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_LOCK_OP, basename, i);
-
- STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fentrylk,
- this->name, local->fd,
- basename,
- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
- }
- }
- } else {
- GF_ASSERT (loc);
-
- call_count = internal_lock_count (frame, this, NULL);
- int_lock->lk_call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_LOCK_OP, basename, i);
-
- STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name, loc, basename,
- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
-
- if (!--call_count)
- break;
-
- }
- }
- }
-out:
- return 0;
+ return _gf_true;
}
-int32_t
-afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static gf_boolean_t
+is_blocking_locks_count_sufficient(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ int child = 0;
+ int nlockee = 0;
+ int lockee_count = 0;
+ gf_boolean_t ret = _gf_true;
- afr_trace_inodelk_out (frame, AFR_INODELK_NB_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
+ local = frame->local;
+ priv = this->private;
+ int_lock = &local->internal_lock;
+ lockee_count = int_lock->lockee_count;
- LOCK (&frame->lock);
- {
- call_count = --int_lock->lk_call_count;
- }
- UNLOCK (&frame->lock);
-
- if (op_ret < 0 ) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
- local->child_up[child_index] = 0;
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- } else if (op_ret == 0) {
- int_lock->inode_locked_nodes[child_index]
- |= LOCKED_YES;
- int_lock->inodelk_lock_count++;
- }
-
- if (call_count == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "Last inode locking reply received");
- /* all locks successfull. Proceed to call FOP */
- if (int_lock->inodelk_lock_count ==
- afr_up_children_count (priv->child_count, local->child_up)) {
- gf_log (this->name, GF_LOG_TRACE,
- "All servers locked. Calling the cbk");
- int_lock->lock_op_ret = 0;
- int_lock->lock_cbk (frame, this);
- }
- /* Not all locks were successfull. Unlock and try locking
- again, this time with serially blocking locks */
- else {
- gf_log (this->name, GF_LOG_TRACE,
- "%d servers locked. Trying again with blocking calls",
- int_lock->lock_count);
-
- afr_unlock(frame, this);
- }
+ if (int_lock->lock_count == 0) {
+ afr_log_locks_failure(frame, "any subvolume", "lock",
+ int_lock->lock_op_errno);
+ return _gf_false;
+ }
+ /* For FOPS that take multiple sets of locks (mkdir, rename),
+ * there must be at least one brick on which the locks from
+ * all lock sets were successful. */
+ for (child = 0; child < priv->child_count; child++) {
+ ret = _gf_true;
+ for (nlockee = 0; nlockee < lockee_count; nlockee++) {
+ if (!(int_lock->lockee[nlockee].locked_nodes[child] & LOCKED_YES))
+ ret = _gf_false;
}
+ if (ret)
+ return ret;
+ }
+ if (!ret)
+ afr_log_locks_failure(frame, "all", "lock", int_lock->lock_op_errno);
- return 0;
+ return ret;
}
int
-afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
+afr_lock_blocking(call_frame_t *frame, xlator_t *this, int cookie)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- int32_t call_count = 0;
- uint64_t ctx = 0;
- int i = 0;
- int ret = 0;
- struct gf_flock flock = {0,};
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
-
- initialize_inodelk_variables (frame, this);
-
- if (local->fd) {
- ret = fd_ctx_get (local->fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "unable to get fd ctx for fd=%p",
- local->fd);
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
- local->op_errno = EINVAL;
- int_lock->lock_op_errno = EINVAL;
-
- ret = -1;
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- call_count = internal_lock_count (frame, this, fd_ctx);
- int_lock->lk_call_count = call_count;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ uint64_t ctx = 0;
+ int ret = 0;
+ int child_index = 0;
+ int lockee_num = 0;
- if (!call_count) {
- gf_log (this->name, GF_LOG_INFO,
- "fd not open on any subvolumes. aborting.");
- afr_unlock (frame, this);
- goto out;
- }
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
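+    /* Blocking locks are acquired serially; the cookie encodes the current
+     * position in the (lockee, child) walk. */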
+ child_index = cookie % priv->child_count;
+ lockee_num = cookie / priv->child_count;
- /* Send non-blocking inodelk calls only on up children
- and where the fd has been opened */
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] && fd_ctx->opened_on[i]) {
- afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLK, i);
+ if (local->fd) {
+ ret = fd_ctx_get(local->fd, this, &ctx);
- STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- this->name, local->fd,
- F_SETLK, &flock);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED,
+ "unable to get fd ctx for fd=%p", local->fd);
- if (!--call_count)
- break;
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
- }
+ afr_unlock_now(frame, this);
- }
- } else {
- call_count = internal_lock_count (frame, this, NULL);
- int_lock->lk_call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLK, i);
-
- STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- this->name, &local->loc,
- F_SETLK, &flock);
-
- if (!--call_count)
- break;
-
- }
- }
+ return 0;
}
+ }
-out:
- return ret;
-}
-
-static int
-__is_lower_locked (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int count = 0;
- int i = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER)
- count++;
- }
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+ if (!is_blocking_locks_count_sufficient(frame, this)) {
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
- return count;
+ afr_unlock_now(frame, this);
-}
-
-static int
-__is_higher_locked (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int count = 0;
- int i = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->locked_nodes[i] & LOCKED_YES)
- count++;
+ return 0;
}
+ }
- return count;
-
-}
-
-static int
-afr_unlock_lower_entrylk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
- int call_count = 0;
- int i = -1;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
-
- call_count = __is_lower_locked (frame, this);
- int_lock->lk_call_count = call_count;
-
- if (!call_count){
- gf_log (this->name, GF_LOG_TRACE,
- "No internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- goto out;
- }
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+ /* we're done locking */
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_UNLOCK_OP, basename, i);
+ gf_msg_debug(this->name, 0, "we're done locking");
- STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- loc, basename,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
-
- if (!--call_count)
- break;
-
- }
- }
-
-out:
+ int_lock->lock_op_ret = 0;
+ int_lock->lock_cbk(frame, this);
return 0;
+ }
-}
-
-
-static int
-afr_post_unlock_higher_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.done (frame, this);
+ if (!_is_lock_wind_needed(local, child_index)) {
+ afr_lock_blocking(frame, this, cookie + 1);
return 0;
-}
+ }
-static int
-afr_post_unlock_lower_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *lower_name = NULL;
- const char *higher_name = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- if (__is_higher_locked (frame, this)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking higher");
- int_lock->lk_basename = higher_name;
- int_lock->lk_loc = higher;
- int_lock->lock_cbk = afr_post_unlock_higher_cbk;
-
- afr_unlock_entrylk (frame, this);
- } else
- local->transaction.done (frame, this);
-
- return 0;
-}
-
-static int
-afr_rename_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *lower_name = NULL;
- const char *higher_name = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
-
- if (__is_lower_locked (frame, this)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking lower");
- int_lock->lk_basename = lower_name;
- int_lock->lk_loc = lower;
- int_lock->lock_cbk = afr_post_unlock_lower_cbk;
-
- afr_unlock_lower_entrylk (frame, this);
- } else
- afr_post_unlock_lower_cbk (frame, this);
-
- return 0;
-}
-
-static int
-afr_rename_transaction (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- return (local->transaction.type ==
- AFR_ENTRY_RENAME_TRANSACTION);
+ afr_internal_lock_wind(frame, afr_lock_cbk, (void *)(long)cookie,
+ child_index, lockee_num, _gf_true, _gf_false);
+ return 0;
}
int32_t
-afr_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- if (transaction_lk_op (local)) {
- if (is_afr_lock_transaction (local))
- afr_unlock_inodelk (frame, this);
- else
- if (!afr_rename_transaction (frame, this))
- afr_unlock_entrylk (frame, this);
- else
- afr_rename_unlock (frame, this);
- } else {
- if (is_afr_lock_selfheal (local))
- afr_unlock_inodelk (frame, this);
- else
- afr_unlock_entrylk (frame, this);
- }
-
- return 0;
-}
-
-int
-afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
- unsigned char *locked_nodes)
+afr_blocking_lock(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int up_count = 0;
- priv = this->private;
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- ret = afr_fd_ctx_set (this, fd);
- if (ret)
- goto out;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "failed to get the fd ctx");
- goto out;
- }
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- GF_ASSERT (fdctx->locked_on);
-
- memcpy (fdctx->locked_on, locked_nodes,
- priv->child_count);
-
-out:
- return ret;
-}
-
-static int
-__is_fd_saved (xlator_t *this, fd_t *fd)
-{
- afr_locked_fd_t *locked_fd = NULL;
- afr_private_t *priv = NULL;
- int found = 0;
-
- priv = this->private;
+ up_count = AFR_COUNT(local->child_up, priv->child_count);
+ int_lock->lk_call_count = int_lock->lk_expected_count =
+ (int_lock->lockee_count * up_count);
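+    /* Every up child of every lockee counts as one expected blocking
+     * attempt; the attempts are wound one at a time by afr_lock_blocking(). */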
+ initialize_internal_lock_variables(frame, this);
- list_for_each_entry (locked_fd, &priv->saved_fds, list) {
- if (locked_fd->fd == fd) {
- found = 1;
- break;
- }
- }
+ afr_lock_blocking(frame, this, 0);
- return found;
+ return 0;
}
-static int
-__afr_save_locked_fd (xlator_t *this, fd_t *fd)
+static int32_t
+afr_nb_internal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_locked_fd_t *locked_fd = NULL;
- int ret = 0;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ int child_index = 0;
+ int lockee_num = 0;
+ afr_private_t *priv = NULL;
- priv = this->private;
-
- locked_fd = GF_CALLOC (1, sizeof (*locked_fd),
- gf_afr_mt_locked_fd);
- if (!locked_fd) {
- ret = -1;
- goto out;
- }
+ priv = this->private;
- locked_fd->fd = fd;
- INIT_LIST_HEAD (&locked_fd->list);
+ child_index = ((long)cookie) % priv->child_count;
+ lockee_num = ((long)cookie) / priv->child_count;
- list_add_tail (&locked_fd->list, &priv->saved_fds);
+ local = frame->local;
+ int_lock = &local->internal_lock;
-out:
- return ret;
-}
-
-int
-afr_save_locked_fd (xlator_t *this, fd_t *fd)
-{
- afr_private_t *priv = NULL;
- int ret = 0;
-
- priv = this->private;
-
- pthread_mutex_lock (&priv->mutex);
+ if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) {
+ LOCK(&local->inode->lock);
{
- if (__is_fd_saved (this, fd)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "fd=%p already saved", fd);
- goto unlock;
- }
-
- ret = __afr_save_locked_fd (this, fd);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "fd=%p could not be saved", fd);
- goto unlock;
- }
+ local->inode_ctx->lock_count++;
}
-unlock:
- pthread_mutex_unlock (&priv->mutex);
-
- return ret;
-}
-
-static int
-afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_locked_fd_t *locked_fd = NULL;
-
- local = frame->local;
-
- locked_fd = local->locked_fd;
-
- STACK_DESTROY (frame->root);
- afr_local_cleanup (local, this);
-
- afr_save_locked_fd (this, locked_fd->fd);
-
- return 0;
-
-}
-
-static int
-afr_get_source_lock_recovery (xlator_t *this, fd_t *fd)
-{
- afr_fd_ctx_t *fdctx = NULL;
- afr_private_t *priv = NULL;
- uint64_t tmp = 0;
- int i = 0;
- int source_child = -1;
- int ret = 0;
-
- priv = this->private;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- for (i = 0; i < priv->child_count; i++) {
- if (fdctx->locked_on[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Found lock recovery source=%d", i);
- source_child = i;
- break;
- }
- }
-
-out:
- return source_child;
-
-}
+ UNLOCK(&local->inode->lock);
+ }
-int32_t
-afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock);
-int32_t
-afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int32_t source_child = 0;
- struct gf_flock flock = {0,};
-
- local = frame->local;
- priv = this->private;
-
- if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
- "lock recovery failed");
- goto cleanup;
+ LOCK(&frame->lock);
+ {
+ if (op_ret < 0) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_msg(this->name, GF_LOG_ERROR, ENOSYS,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED,
+ "subvolume does not support "
+ "locking. please load features/locks"
+ " xlator on server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+
+ int_lock->lock_op_errno = op_errno;
+ local->op_errno = op_errno;
+ }
+ } else if (op_ret == 0) {
+ int_lock->lockee[lockee_num]
+ .locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_num].locked_count++;
+ int_lock->lock_count++;
}
- source_child = local->source_child;
-
- memcpy (&flock, lock, sizeof (*lock));
-
- STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk,
- (void *) (long) source_child,
- priv->children[source_child],
- priv->children[source_child]->fops->lk,
- local->fd, F_GETLK_FD, &flock);
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
- return 0;
-}
-
-int
-afr_recover_lock (call_frame_t *frame, xlator_t *this,
- struct gf_flock *flock)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int32_t lock_recovery_child = 0;
-
- priv = this->private;
- local = frame->local;
-
- lock_recovery_child = local->lock_recovery_child;
-
- frame->root->lk_owner = flock->l_owner;
+ call_count = --int_lock->lk_call_count;
+ }
+ UNLOCK(&frame->lock);
- STACK_WIND_COOKIE (frame, afr_recover_lock_cbk,
- (void *) (long) lock_recovery_child,
- priv->children[lock_recovery_child],
- priv->children[lock_recovery_child]->fops->lk,
- local->fd, F_SETLK, flock);
-
- return 0;
-}
-
-static int
-is_afr_lock_eol (struct gf_flock *lock)
-{
- int ret = 0;
-
- if ((lock->l_type == GF_LK_EOL))
- ret = 1;
-
- return ret;
-}
-
-int32_t
-afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
-{
- if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Failed to get locks on fd");
- goto cleanup;
+ if (call_count == 0) {
+ gf_msg_trace(this->name, 0, "Last locking reply received");
+ /* all locks successful. Proceed to call FOP */
+ if (int_lock->lock_count == int_lock->lk_expected_count) {
+ gf_msg_trace(this->name, 0, "All servers locked. Calling the cbk");
+ int_lock->lock_op_ret = 0;
+ int_lock->lock_cbk(frame, this);
}
+ /* Not all locks were successful. Unlock and try locking
+ again, this time with serially blocking locks */
+ else {
+ gf_msg_trace(this->name, 0,
+ "%d servers locked. Trying again "
+ "with blocking calls",
+ int_lock->lock_count);
- gf_log (this->name, GF_LOG_DEBUG,
- "Got a lock on fd");
-
- if (is_afr_lock_eol (lock)) {
- gf_log (this->name, GF_LOG_INFO,
- "Reached EOL on locks on fd");
- goto cleanup;
+ afr_unlock_now(frame, this);
}
+ }
- afr_recover_lock (frame, this, lock);
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
-
- return 0;
+ return 0;
}
-static int
-afr_lock_recovery (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- int ret = 0;
- int32_t source_child = 0;
- struct gf_flock flock = {0,};
-
- priv = this->private;
- local = frame->local;
-
- fd = local->fd;
-
- source_child = afr_get_source_lock_recovery (this, fd);
- if (source_child < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not recover locks due to lock "
- "split brain");
- ret = -1;
- goto out;
+int
+afr_lock_nonblocking(call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int child = 0;
+ int lockee_num = 0;
+ int32_t call_count = 0;
+ int i = 0;
+ int ret = 0;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ initialize_internal_lock_variables(frame, this);
+
+ if (local->fd) {
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED,
+ "unable to get fd ctx for fd=%p", local->fd);
+
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
+ local->op_errno = EINVAL;
+ int_lock->lock_op_errno = EINVAL;
+
+ afr_unlock_now(frame, this);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ call_count = int_lock->lockee_count * internal_lock_count(frame, this);
+ int_lock->lk_call_count = call_count;
+ int_lock->lk_expected_count = call_count;
+
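+    /* Each lockee is locked on all eligible children in a single
+     * non-blocking sweep; with nothing to wind, give up and release. */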
+ if (!call_count) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON,
+ "fd not open on any subvolumes. aborting.");
+ afr_unlock_now(frame, this);
+ goto out;
+ }
+
+ /* Send non-blocking lock calls only on up children
+ and where the fd has been opened */
+ for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
+ child = i % priv->child_count;
+ lockee_num = i / priv->child_count;
+ if (local->child_up[child]) {
+ afr_internal_lock_wind(frame, afr_nb_internal_lock_cbk,
+ (void *)(long)i, child, lockee_num,
+ _gf_false, _gf_false);
+ if (!--call_count)
+ break;
}
-
- local->source_child = source_child;
-
- /* the flock can be zero filled as we're querying incrementally
- the locks held on the fd.
- */
- STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk,
- (void *) (long) source_child,
- priv->children[source_child],
- priv->children[source_child]->fops->lk,
- local->fd, F_GETLK_FD, &flock);
-
+ }
out:
- return ret;
-}
-
-
-static int
-afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index)
-{
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- fdctx->opened_on[child_index] = 1;
-
-out:
- return ret;
+ return ret;
}
int32_t
-afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- int32_t child_index = (long )cookie;
- int ret = 0;
-
- if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Reopen during lock-recovery failed");
- goto cleanup;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Open succeeded => proceed to recover locks");
-
- ret = afr_lock_recovery (frame, this);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Lock recovery failed");
- goto cleanup;
- }
-
- ret = afr_mark_fd_opened (this, fd, child_index);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Marking fd open failed");
- goto cleanup;
- }
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
- return 0;
-}
-
-static int
-afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- uint64_t tmp = 0;
- afr_fd_ctx_t *fdctx = NULL;
- loc_t loc = {0,};
- int32_t child_index = 0;
- int ret = 0;
-
- priv = this->private;
- local = frame->local;
-
- GF_ASSERT (local && local->fd);
-
- ret = fd_ctx_get (local->fd, this, &tmp);
- fdctx = (afr_fd_ctx_t *) (long) tmp;
- GF_ASSERT (fdctx);
-
- child_index = local->lock_recovery_child;
-
- inode_path (local->fd->inode, NULL, (char **)&loc.path);
- loc.name = strrchr (loc.path, '/');
- loc.inode = inode_ref (local->fd->inode);
- loc.parent = inode_parent (local->fd->inode, 0, NULL);
-
-
- STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk,
- (void *)(long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->open,
- &loc, fdctx->flags, local->fd,
- fdctx->wbflags);
-
- return 0;
-}
-
-static int
-is_fd_opened (fd_t *fd, int32_t child_index)
-{
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
-
- ret = fd_ctx_get (fd, THIS, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- if (fdctx->opened_on[child_index])
- ret = 1;
-
-out:
- return ret;
-}
-
-int
-afr_attempt_lock_recovery (xlator_t *this, int32_t child_index)
-{
- call_frame_t *frame = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_locked_fd_t *locked_fd = NULL;
- afr_locked_fd_t *tmp = NULL;
- int ret = 0;
- struct list_head locks_list = {0,};
-
-
- priv = this->private;
-
- if (list_empty (&priv->saved_fds))
- goto out;
-
- frame = create_frame (this, this->ctx->pool);
- if (!frame) {
- ret = -1;
- goto out;
- }
-
- local = GF_CALLOC (1, sizeof (*local),
- gf_afr_mt_afr_local_t);
- if (!local) {
- ret = -1;
- goto out;
- }
-
- AFR_LOCAL_INIT (local, priv);
- if (!local) {
- ret = -1;
- goto out;
- }
-
- frame->local = local;
-
- INIT_LIST_HEAD (&locks_list);
-
- pthread_mutex_lock (&priv->mutex);
- {
- list_splice_init (&priv->saved_fds, &locks_list);
- }
- pthread_mutex_unlock (&priv->mutex);
-
- list_for_each_entry_safe (locked_fd, tmp,
- &locks_list, list) {
-
- list_del_init (&locked_fd->list);
-
- local->fd = fd_ref (locked_fd->fd);
- local->lock_recovery_child = child_index;
- local->locked_fd = locked_fd;
-
- if (!is_fd_opened (locked_fd->fd, child_index)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "attempting open before lock "
- "recovery");
- afr_lock_recovery_preopen (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "attempting lock recovery "
- "without a preopen");
- afr_lock_recovery (frame, this);
- }
- }
+afr_unlock(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+
+ local = frame->local;
+
+ if (!local->transaction.eager_lock_on)
+ goto out;
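+    /* With eager-lock, the on-disk lock is shared by transactions on this
+     * inode; release it only when no other owner or pending post-op is
+     * left; otherwise skip the unlock and just invoke the stored callback. */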
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ list_del_init(&local->transaction.owner_list);
+ if (list_empty(&lock->owners) && list_empty(&lock->post_op)) {
+ local->transaction.do_eager_unlock = _gf_true;
+ /*TODO: Need to get metadata use on_disk and inherit/uninherit
+ *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]);
+ *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]);
+ */
+ GF_ASSERT(lock->release);
+ }
+ }
+ UNLOCK(&local->inode->lock);
+ if (!local->transaction.do_eager_unlock) {
+ local->internal_lock.lock_cbk(frame, this);
+ return 0;
+ }
out:
- return ret;
+ afr_unlock_now(frame, this);
+ return 0;
}
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index 35a73646c82..816065fb57a 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -1,48 +1,38 @@
/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef __AFR_MEM_TYPES_H__
#define __AFR_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_afr_mem_types_ {
- gf_afr_mt_iovec = gf_common_mt_end + 1,
- gf_afr_mt_afr_fd_ctx_t,
- gf_afr_mt_afr_local_t,
- gf_afr_mt_afr_private_t,
- gf_afr_mt_int32_t,
- gf_afr_mt_char,
- gf_afr_mt_xattr_key,
- gf_afr_mt_dict_t,
- gf_afr_mt_xlator_t,
- gf_afr_mt_iatt,
- gf_afr_mt_int,
- gf_afr_mt_afr_node_character,
- gf_afr_mt_sh_diff_loop_state,
- gf_afr_mt_uint8_t,
- gf_afr_mt_loc_t,
- gf_afr_mt_entry_name,
- gf_afr_mt_pump_priv,
- gf_afr_mt_locked_fd,
- gf_afr_mt_end
+ gf_afr_mt_afr_fd_ctx_t = gf_common_mt_end + 1,
+ gf_afr_mt_afr_private_t,
+ gf_afr_mt_int32_t,
+ gf_afr_mt_char,
+ gf_afr_mt_xattr_key,
+ gf_afr_mt_dict_t,
+ gf_afr_mt_xlator_t,
+ gf_afr_mt_afr_node_character,
+ gf_afr_mt_inode_ctx_t,
+ gf_afr_mt_shd_event_t,
+ gf_afr_mt_reply_t,
+ gf_afr_mt_subvol_healer_t,
+ gf_afr_mt_spbc_timeout_t,
+ gf_afr_mt_spb_status_t,
+ gf_afr_mt_empty_brick_t,
+ gf_afr_mt_child_latency_t,
+ gf_afr_mt_atomic_t,
+ gf_afr_mt_lk_heal_info_t,
+ gf_afr_mt_gf_lock,
+ gf_afr_mt_end
};
#endif
-
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
new file mode 100644
index 00000000000..e73fd997765
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -0,0 +1,167 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _AFR_MESSAGES_H_
+#define _AFR_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+ AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE,
+ AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN,
+ AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL,
+ AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL,
+ AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS,
+ AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED,
+ AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED,
+ AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO,
+ AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA,
+ AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED,
+ AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED,
+ AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG,
+ AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB,
+ AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED,
+ AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, AFR_MSG_UNKNOWN_SET,
+ AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START,
+ AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT,
+ AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED,
+ AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG,
+ AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED,
+ AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH,
+ AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED,
+ AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA,
+ SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE,
+ SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED,
+ AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED,
+ AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED,
+ AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED,
+ AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED,
+ AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN,
+ AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN,
+ AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP,
+ AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+ AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR,
+ AFR_MSG_INTERNAL_ATTR);
+
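+/* Usage sketch (AFR_MSG_NEW_ID and its *_STR define are hypothetical):
+ * append the new ID as the last argument of GLFS_MSGID() above, add a
+ * matching *_STR define below, then log it with something like
+ *   gf_smsg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_NEW_ID,
+ *           "path=%s", path, NULL);
+ */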
+#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed"
+#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed"
+#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed"
+#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed"
+#define AFR_MSG_INVALID_ARG_STR "Invalid argument"
+#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on "
+#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file"
+#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict."
+#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed."
+#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR \
+ "Failed to populate loc for thin-arbiter"
+#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending"
+#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the god shd. skipping."
+#define AFR_MSG_UNKNOWN_SET_STR "Unknown set"
+#define AFR_MSG_NO_XL_ID_STR "xl does not have id"
+#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on"
+#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on"
+#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter."
+#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output"
+#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time"
+#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict"
+#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR \
+ "No majority to resolve gfid split brain"
+#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch detected"
+#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal"
+#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch"
+#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR \
+ "Size policy is not applicable to directories."
+#define AFR_MSG_NO_CHILD_SELECTED_STR \
+ "No child selected by favorite-child policy"
+#define AFR_MSG_INVALID_CHILD_STR "Invalid child"
+#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR \
+ "selected as authentic to resolve conflicting data"
+#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick"
+#define SNO_DIFF_IN_MTIME_STR "No difference in mtime"
+#define SNO_BIGGER_FILE_STR "No bigger file"
+#define SALL_BRICKS_UP_TO_RESOLVE_STR \
+ "All the bricks should be up to resolve the gfid split brain"
+#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock"
+#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed"
+#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame"
+#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop"
+#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed"
+#define AFR_MSG_FSYNC_FAILED_STR "fsync failed"
+#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met"
+#define AFR_MSG_FOP_FAILED_STR "Failing Fop"
+#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume"
+#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling"
+#define AFR_MSG_CHILD_MISCONFIGURED_STR \
+ "replicate translator needs more than one subvolume defined"
+#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads"
+#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count"
+#define AFR_MSG_UNABLE_TO_FETCH_STR \
+ "Unable to fetch afr-pending-xattr option from volfile. Falling back to " \
+ "using client translator names"
+#define AFR_MSG_NULL_DEREF_STR "possible NULL deref"
+#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key"
+#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask"
+#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up"
+#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR \
+ "Failed to cancel split-brain choice"
+#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR \
+ "Cannot set replica. File is not in data/metadata split-brain"
+#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx"
+#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols"
+#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child"
+#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file"
+#define AFR_MSG_TIMER_CREATE_FAIL_STR \
+ "Cannot create timer for delayed initialization"
+#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online"
+#define AFR_MSG_ALL_SUBVOLS_DOWN_STR \
+ "All subvolumes are down. Going offline until atleast one of them is up"
+#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock"
+#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume"
+#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met"
+#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir"
+#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid"
+#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative merge on the file"
+#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain"
+#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down"
+#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask"
+#define AFR_MSG_READ_SUBVOL_NOT_UP_STR \
+ "read subvolume in this generation is not up"
+#define AFR_MSG_INTERNAL_LKS_FAILED_STR \
+ "Unable to work with lk-owner while attempting fop"
+#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR \
+ "subvolume does not support locking. please load features/locks xlator " \
+ "on server."
+#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx"
+#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting."
+#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child."
+#define AFR_MSG_NEW_BRICK_STR "New brick"
+#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR \
+ "Failed to set split-brain choice to -1"
+#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR \
+ "Failed to determine split-brain. Aborting split-brain-choice set"
+#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume"
+#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr"
+#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute"
+#endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index 41db67d7f81..64856042b65 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -1,647 +1,353 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "statedump.h"
-
-#include "fd.h"
-
-#include "afr-inode-read.h"
-#include "afr-inode-write.h"
-#include "afr-dir-read.h"
-#include "afr-dir-write.h"
-#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+#include "afr-transaction.h"
-int
-afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
+gf_boolean_t
+afr_is_fd_fixable(fd_t *fd)
{
- afr_local_t * local = frame->local;
-
- AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
- local->fd);
- return 0;
+ if (!fd || !fd->inode)
+ return _gf_false;
+ else if (fd_is_anonymous(fd))
+ return _gf_false;
+ else if (gf_uuid_is_null(fd->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
}
-
int
-afr_open_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd)
+afr_open_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = 0;
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- local->success_count++;
-
- ret = afr_fd_ctx_set (this, fd);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not set fd ctx for fd=%p", fd);
-
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not get fd ctx for fd=%p", fd);
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->opened_on[child_index] = 1;
- fd_ctx->flags = local->cont.open.flags;
- fd_ctx->wbflags = local->cont.open.wbflags;
- }
- }
-unlock:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if ((local->cont.open.flags & O_TRUNC)
- && (local->op_ret >= 0)) {
- STACK_WIND (frame, afr_open_ftruncate_cbk,
- this, this->fops->ftruncate,
- fd, 0);
- } else {
- AFR_STACK_UNWIND (open, frame, local->op_ret,
- local->op_errno, local->fd);
- }
- }
+ afr_local_t *local = frame->local;
- return 0;
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno,
+ local->cont.open.fd, xdata);
+ return 0;
}
-
int
-afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, int32_t wbflags)
+afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int i = 0;
- int ret = -1;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t wind_flags = flags & (~O_TRUNC);
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
-
- if (afr_is_split_brain (this, loc->inode)) {
- /* self-heal failed */
- gf_log (this->name, GF_LOG_WARNING,
- "failed to open as split brain seen, returning EIO");
- op_errno = EIO;
- goto out;
- }
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = (long)cookie;
+ afr_fd_ctx_t *fd_ctx = NULL;
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
- frame->local = local;
- call_count = local->call_count;
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
- loc_copy (&local->loc, loc);
-
- local->cont.open.flags = flags;
- local->cont.open.wbflags = wbflags;
-
- local->fd = fd_ref (fd);
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- loc, wind_flags, fd, wbflags);
-
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
+ LOCK(&frame->lock);
+ {
if (op_ret == -1) {
- AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd);
- }
-
- return 0;
-}
-
-
-int
-afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- uint64_t ctx = 0;
- int ret = 0;
- int call_count = 0;
- int child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK (&frame->lock);
- {
- if (op_ret >= 0) {
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to get fd context, %p", fd);
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->opened_on[child_index] = 1;
-
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened successfully on subvolume %s",
- local->loc.path, priv->children[child_index]->name);
- }
+ local->op_errno = op_errno;
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
+ local->op_ret = op_ret;
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
}
-out:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- int_lock->lock_cbk = local->transaction.done;
- local->transaction.resume (frame, this);
- }
-
- return 0;
-}
-
-
-static int
-__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up)
-{
- int i = 0;
- int count = 0;
-
- for (i = 0; i < child_count; i++) {
- if (!opened_on[i] && child_up[i])
- count++;
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
+
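+    /* On the last reply, apply quorum to the collected replies; a
+     * successful open that carried O_TRUNC finishes with an explicit
+     * ftruncate. */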
+ if (call_count == 0) {
+ afr_handle_replies_quorum(frame, this);
+ if (local->op_ret == -1) {
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
+ } else if (fd_ctx->flags & O_TRUNC) {
+ STACK_WIND(frame, afr_open_ftruncate_cbk, this,
+ this->fops->ftruncate, fd, 0, NULL);
+ } else {
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno,
+ local->cont.open.fd, local->xdata_rsp);
}
+ }
- return count;
+ return 0;
}
-
int
-afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret,
- int32_t op_errno)
+afr_open_continue(call_frame_t *frame, xlator_t *this, int err)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int abandon = 0;
- int ret = 0;
- int i = 0;
- int call_count = 0;
-
- priv = this->private;
- local = frame->local;
-
- /*
- * Some subvolumes might have come up on which we never
- * opened this fd in the first place. Re-open fd's on those
- * subvolumes now.
- */
-
- ret = fd_ctx_get (local->fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to get fd context %p (%s)",
- local->fd, local->loc.path);
- abandon = 1;
- goto out;
- }
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- LOCK (&local->fd->lock);
- {
- call_count = __unopened_count (priv->child_count,
- fd_ctx->opened_on,
- local->child_up);
- for (i = 0; i < priv->child_count; i++) {
- fd_ctx->pre_op_done[i] = 0;
- fd_ctx->pre_op_piggyback[i] = 0;
- }
- }
- UNLOCK (&local->fd->lock);
+ local = frame->local;
+ priv = this->private;
- if (call_count == 0) {
- abandon = 1;
- goto out;
- }
-
- local->call_count = call_count;
+ if (err) {
+ AFR_STACK_UNWIND(open, frame, -1, err, NULL, NULL);
+ } else {
+ local->call_count = AFR_COUNT(local->child_up, priv->child_count);
+ call_count = local->call_count;
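+        /* Wind the open on every up child, masking O_TRUNC out of the
+         * flags; truncation is applied separately once the opens return. */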
for (i = 0; i < priv->child_count; i++) {
- if (!fd_ctx->opened_on[i] && local->child_up[i]) {
- gf_log (this->name, GF_LOG_TRACE,
- "opening fd for %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk,
- (void *)(long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc, fd_ctx->flags, local->fd,
- fd_ctx->wbflags);
-
- if (!--call_count)
- break;
- }
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, afr_open_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->open, &local->loc,
+ (local->cont.open.flags & ~O_TRUNC),
+ local->cont.open.fd, local->xdata_req);
+ if (!--call_count)
+ break;
+ }
}
-
-out:
- if (abandon)
- local->transaction.resume (frame, this);
-
- return 0;
+ }
+ return 0;
}
-
-static int
-afr_prepare_loc (call_frame_t *frame, fd_t *fd)
-{
- afr_local_t *local = NULL;
- char *name = NULL;
- char *path = NULL;
- int ret = 0;
-
- if ((!fd) || (!fd->inode))
- return -1;
-
- local = frame->local;
- ret = inode_path (fd->inode, NULL, (char **)&path);
- if (ret <= 0) {
- gf_log (frame->this->name, GF_LOG_DEBUG,
- "Unable to get path for gfid: %s",
- uuid_utoa (fd->inode->gfid));
- return -1;
- }
-
- if (local->loc.path) {
- if (strcmp (path, local->loc.path))
- gf_log (frame->this->name, GF_LOG_DEBUG,
- "overwriting old loc->path %s with %s",
- local->loc.path, path);
- GF_FREE ((char *)local->loc.path);
- }
- local->loc.path = path;
-
- name = strrchr (local->loc.path, '/');
- if (name)
- name++;
- local->loc.name = name;
-
- if (local->loc.inode) {
- inode_unref (local->loc.inode);
- }
- local->loc.inode = inode_ref (fd->inode);
-
- if (local->loc.parent) {
- inode_unref (local->loc.parent);
- }
-
- local->loc.parent = inode_parent (local->loc.inode, 0, NULL);
-
- return 0;
-}
-
-
int
-afr_openfd_sh (call_frame_t *frame, xlator_t *this)
+afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- char sh_type_str[256] = {0,};
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (local->loc.path);
- /* forcibly trigger missing-entries self-heal */
-
- sh->need_missing_entry_self_heal = _gf_true;
- sh->need_gfid_self_heal = _gf_true;
- sh->data_lock_held = _gf_true;
- sh->need_data_self_heal = _gf_true;
- sh->type = local->fd->inode->ia_type;
- sh->background = _gf_false;
- sh->unwind = afr_openfd_sh_unwind;
-
- afr_self_heal_type_str_get(&local->self_heal,
- sh_type_str,
- sizeof(sh_type_str));
- gf_log (this->name, GF_LOG_INFO, "%s self-heal triggered. "
- "path: %s, reason: Replicate up down flush, data lock is held",
- sh_type_str, local->loc.path);
-
- afr_self_heal (frame, this, local->fd->inode);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int spb_subvol = 0;
+ int event_generation = 0;
+ int ret = 0;
+ int32_t op_errno = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+    // We can't let truncation happen outside a transaction.
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_OPEN;
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ op_errno = afr_quorum_errno(priv);
+ goto out;
+ }
+
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+
+ local->inode = inode_ref(loc->inode);
+ loc_copy(&local->loc, loc);
+ local->fd_ctx = fd_ctx;
+ fd_ctx->flags = flags;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ local->cont.open.flags = flags;
+ local->cont.open.fd = fd_ref(fd);
+
+ ret = afr_inode_get_readable(frame, local->inode, this, NULL,
+ &event_generation, AFR_DATA_TRANSACTION);
+ if ((ret < 0) &&
+ (afr_split_brain_read_subvol_get(local->inode, this, NULL,
+ &spb_subvol) == 0) &&
+ spb_subvol < 0) {
+ afr_inode_refresh(frame, this, local->inode, local->inode->gfid,
+ afr_open_continue);
+ } else {
+ afr_open_continue(frame, this, 0);
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(open, frame, -1, op_errno, fd, NULL);
- return 0;
+ return 0;
}
-
int
-afr_openfd_flush_done (call_frame_t *frame, xlator_t *this)
+afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- int _ret = -1;
-
- priv = this->private;
- local = frame->local;
-
- LOCK (&local->fd->lock);
- {
- _ret = __fd_ctx_get (local->fd, this, &ctx);
- if (_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to get fd context %p (%s)",
- local->fd, local->loc.path);
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->down_count = priv->down_count;
- fd_ctx->up_count = priv->up_count;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int call_count = 0;
+ int child_index = (long)cookie;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (op_ret >= 0) {
+ gf_msg_debug(this->name, 0,
+ "fd for %s opened "
+ "successfully on subvolume %s",
+ local->loc.path, priv->children[child_index]->name);
+ } else {
+ gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno,
+ AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s",
+ priv->children[child_index]->name, NULL);
+ }
+
+ fd_ctx = local->fd_ctx;
+
+ LOCK(&local->fd->lock);
+ {
+ if (op_ret >= 0) {
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ } else {
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
}
-out:
- UNLOCK (&local->fd->lock);
-
- afr_local_transaction_cleanup (local, this);
-
- gf_log (this->name, GF_LOG_TRACE,
- "The up/down flush is over");
+ }
+ UNLOCK(&local->fd->lock);
- fd_unref (local->fd);
- local->openfd_flush_cbk (frame, this);
+ call_count = afr_frame_return(frame);
+ if (call_count == 0)
+ AFR_STACK_DESTROY(frame);
- return 0;
+ return 0;
}
-
-
-int
-afr_openfd_xaction (call_frame_t *frame, xlator_t *this, fd_t *fd)
+static int
+afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open)
{
- afr_local_t * local = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- local = frame->local;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int count = 0;
- local->op = GF_FOP_FLUSH;
+ priv = this->private;
- local->transaction.fop = afr_openfd_sh;
- local->transaction.done = afr_openfd_flush_done;
-
- local->transaction.start = 0;
- local->transaction.len = 0;
-
- gf_log (this->name, GF_LOG_TRACE,
- "doing up/down flush on fd=%p", fd);
-
- afr_transaction (frame, this, AFR_DATA_TRANSACTION);
-
-out:
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx)
return 0;
-}
-
-
-int
-afr_openfd_xaction_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK (&frame->lock);
- {
- if (op_ret >= 0) {
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to get fd context %p (%s)",
- fd, local->loc.path);
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->opened_on[child_index] = 1;
-
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened successfully on subvolume %s",
- local->loc.path, priv->children[child_index]->name);
- }
- }
-out:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_openfd_xaction (frame, this, local->fd);
+ LOCK(&fd->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED &&
+ priv->child_up[i]) {
+ fd_ctx->opened_on[i] = AFR_FD_OPENING;
+ need_open[i] = 1;
+ count++;
+ } else {
+ need_open[i] = 0;
+ }
}
+ }
+ UNLOCK(&fd->lock);
- return 0;
+ return count;
}
-
-int
-afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
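+/* Re-open @fd on every child that is currently up but on which the fd is
+   not yet opened (for example, bricks that came up after the original
+   open). Candidate children are marked AFR_FD_OPENING under the fd lock
+   first, so concurrent fixers do not attempt the same reopen twice. */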
+void
+afr_fix_open(fd_t *fd, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int no_open = 0;
- int ret = 0;
- int i = 0;
- int call_count = 0;
-
- priv = this->private;
- local = frame->local;
-
- /*
- * If the file is already deleted while the fd is open, no need to
- * perform the openfd flush, call the flush_cbk and get out.
- */
- ret = afr_prepare_loc (frame, fd);
- if (ret < 0) {
- local->openfd_flush_cbk (frame, this);
- goto out;
- }
-
- /*
- * Some subvolumes might have come up on which we never
- * opened this fd in the first place. Re-open fd's on those
- * subvolumes now.
- */
-
- local->fd = fd_ref (fd);
-
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to get fd context %p (%s)",
- fd, local->loc.path);
- no_open = 1;
- goto out;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int32_t op_errno = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ unsigned char *need_open = NULL;
+ int call_count = 0;
+
+ priv = this->private;
+
+ if (!afr_is_fd_fixable(fd))
+ goto out;
+
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx)
+ goto out;
+
+ need_open = alloca0(priv->child_count);
+
+ call_count = afr_fd_ctx_need_open(fd, this, need_open);
+ if (!call_count)
+ goto out;
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->loc.inode = inode_ref(fd->inode);
+ ret = loc_path(&local->loc, NULL);
+ if (ret < 0)
+ goto out;
+
+ local->fd = fd_ref(fd);
+ local->fd_ctx = fd_ctx;
+
+ local->call_count = call_count;
+
+ gf_msg_debug(this->name, 0, "need open count: %d", call_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!need_open[i])
+ continue;
+
+ if (IA_IFDIR == fd->inode->ia_type) {
+ gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->opendir, &local->loc,
+ local->fd, NULL);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "opening fd for file %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->open,
+ &local->loc, fd_ctx->flags & (~O_TRUNC),
+ local->fd, NULL);
}
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- LOCK (&local->fd->lock);
- {
- call_count = __unopened_count (priv->child_count,
- fd_ctx->opened_on,
- local->child_up);
- }
- UNLOCK (&local->fd->lock);
-
- if (call_count == 0) {
- goto out;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!fd_ctx->opened_on[i] && local->child_up[i]) {
- gf_log (this->name, GF_LOG_TRACE,
- "opening fd for %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_openfd_xaction_open_cbk,
- (void *)(long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc, fd_ctx->flags, fd,
- fd_ctx->wbflags);
-
- if (!--call_count)
- break;
- }
- }
+ if (!--call_count)
+ break;
+ }
+ return;
out:
- if (no_open)
- afr_openfd_xaction (frame, this, fd);
-
- return 0;
+ if (frame)
+ AFR_STACK_DESTROY(frame);
}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
new file mode 100644
index 00000000000..6fc2c75145c
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -0,0 +1,494 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "afr.h"
+#include "afr-transaction.h"
+#include "afr-messages.h"
+
+void
+afr_pending_read_increment(afr_private_t *priv, int child_index)
+{
+ if (child_index < 0 || child_index > priv->child_count)
+ return;
+
+ GF_ATOMIC_INC(priv->pending_reads[child_index]);
+}
+
+void
+afr_pending_read_decrement(afr_private_t *priv, int child_index)
+{
+ if (child_index < 0 || child_index > priv->child_count)
+ return;
+
+ GF_ATOMIC_DEC(priv->pending_reads[child_index]);
+}
+
+void
+afr_read_txn_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ afr_pending_read_decrement(priv, local->read_subvol);
+ local->read_subvol = subvol;
+ afr_pending_read_increment(priv, subvol);
+ local->readfn(frame, this, subvol);
+}
+
+int
+afr_read_txn_next_subvol(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int subvol = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->readable[i]) {
+ /* don't even bother trying here.
+ just mark as attempted and move on. */
+ local->read_attempted[i] = 1;
+ continue;
+ }
+
+ if (!local->read_attempted[i]) {
+ subvol = i;
+ break;
+ }
+ }
+
+    /* If no more subvols are available for reading, @subvol stays -1,
+       indicating that we have run out of readable subvols. */
+ if (subvol != -1)
+ local->read_attempted[subvol] = 1;
+ afr_read_txn_wind(frame, this, subvol);
+
+ return 0;
+}
+
+static int
+afr_ta_read_txn_done(int ret, call_frame_t *ta_frame, void *opaque)
+{
+ STACK_DESTROY(ta_frame->root);
+ return 0;
+}
+
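+/* Overview: when one of the two data bricks is not reachable, the read is
+   routed through the brick that is up (query_child). It may serve the read
+   only if either (a) it holds pending marks against the other data brick,
+   or (b) the thin-arbiter, queried under the AFR_TA_DOM_MODIFY lock, does
+   not blame query_child. Otherwise the read fails with EIO, because the
+   good copy is on the brick that is down. */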
+static int
+afr_ta_read_txn(void *opaque)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ int read_subvol = -1;
+ int query_child = AFR_CHILD_UNKNOWN;
+ int possible_bad_child = AFR_CHILD_UNKNOWN;
+ int ret = 0;
+ int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ dict_t *xdata_req = NULL;
+ dict_t *xdata_rsp = NULL;
+ int **pending = NULL;
+ loc_t loc = {
+ 0,
+ };
+
+ frame = (call_frame_t *)opaque;
+ this = frame->this;
+ local = frame->local;
+ priv = this->private;
+ query_child = local->read_txn_query_child;
+
+ if (query_child == AFR_CHILD_ZERO) {
+ possible_bad_child = AFR_CHILD_ONE;
+ } else if (query_child == AFR_CHILD_ONE) {
+ possible_bad_child = AFR_CHILD_ZERO;
+ } else {
+ /*read_txn_query_child is AFR_CHILD_UNKNOWN*/
+ goto out;
+ }
+
+ /* Ask the query_child to see if it blames the possibly bad one. */
+ xdata_req = dict_new();
+ if (!xdata_req)
+ goto out;
+
+ pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!pending)
+ goto out;
+
+ ret = afr_set_pending_dict(priv, xdata_req, pending);
+ if (ret < 0)
+ goto out;
+
+ if (local->fd) {
+ ret = syncop_fxattrop(priv->children[query_child], local->fd,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+ NULL);
+ } else {
+ ret = syncop_xattrop(priv->children[query_child], &local->loc,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+ NULL);
+ }
+ if (ret || !xdata_rsp) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed xattrop for gfid %s on %s",
+ uuid_utoa(local->inode->gfid),
+ priv->children[query_child]->name);
+ op_errno = -ret;
+ goto out;
+ }
+
+ if (afr_ta_dict_contains_pending_xattr(xdata_rsp, priv,
+ possible_bad_child)) {
+ read_subvol = query_child;
+ goto out;
+ }
+ dict_unref(xdata_rsp);
+ xdata_rsp = NULL;
+
+ /* It doesn't. So query thin-arbiter to see if it blames any data brick. */
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ goto out;
+ }
+ flock.l_type = F_WRLCK; /*start and length are already zero. */
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.",
+ uuid_utoa(local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ op_errno = -ret;
+ goto out;
+ }
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+ NULL);
+ if (ret || !xdata_rsp) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed xattrop on %s.", uuid_utoa(local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ op_errno = -ret;
+ goto unlock;
+ }
+
+ if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, query_child)) {
+ read_subvol = query_child;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+ "Failing read for gfid %s since good brick %s is down",
+ uuid_utoa(local->inode->gfid),
+ priv->children[possible_bad_child]->name);
+ op_errno = EIO;
+ }
+
+unlock:
+ flock.l_type = F_UNLCK;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on "
+ "%s.",
+ uuid_utoa(local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ }
+out:
+ if (xdata_req)
+ dict_unref(xdata_req);
+ if (xdata_rsp)
+ dict_unref(xdata_rsp);
+ if (pending)
+ afr_matrix_cleanup(pending, priv->child_count);
+ loc_wipe(&loc);
+
+ if (read_subvol == -1) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ }
+ afr_read_txn_wind(frame, this, read_subvol);
+ return ret;
+}
+
+void
+afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *ta_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ goto out;
+ }
+ ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done,
+ ta_frame, frame);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to launch "
+ "afr_ta_read_txn synctask for gfid %s.",
+ uuid_utoa(local->inode->gfid));
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ STACK_DESTROY(ta_frame->root);
+ goto out;
+ }
+ return;
+out:
+ afr_read_txn_wind(frame, this, -1);
+}
+
+int
+afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int read_subvol = -1;
+ inode_t *inode = NULL;
+ int ret = -1;
+ int spb_subvol = -1;
+
+ local = frame->local;
+ inode = local->inode;
+ priv = this->private;
+
+ if (err) {
+ if (!priv->thin_arbiter_count)
+ goto readfn;
+ if (err != EINVAL)
+ goto readfn;
+ /* We need to query the good bricks and/or thin-arbiter.*/
+ afr_ta_read_txn_synctask(frame, this);
+ return 0;
+ }
+
+ read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable,
+ NULL);
+ if (read_subvol == -1) {
+ err = EIO;
+ goto readfn;
+ }
+
+ if (local->read_attempted[read_subvol]) {
+ afr_read_txn_next_subvol(frame, this);
+ return 0;
+ }
+
+ local->read_attempted[read_subvol] = 1;
+readfn:
+ if (read_subvol == -1) {
+ ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol);
+ if ((ret == 0) && spb_subvol >= 0)
+ read_subvol = spb_subvol;
+ }
+
+ if (read_subvol == -1) {
+ AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
+ }
+ afr_read_txn_wind(frame, this, read_subvol);
+
+ return 0;
+}
+
+int
+afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (!local->refreshed) {
+ local->refreshed = _gf_true;
+ afr_inode_refresh(frame, this, local->inode, NULL,
+ afr_read_txn_refresh_done);
+ } else {
+ afr_read_txn_next_subvol(frame, this);
+ }
+
+ return 0;
+}
+
+/* afr_read_txn_wipe:
+
+ clean internal variables in @local in order to make
+ it possible to call afr_read_txn() multiple times from
+ the same frame
+*/
+
+void
+afr_read_txn_wipe(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ local->readfn = NULL;
+
+ if (local->inode)
+ inode_unref(local->inode);
+
+ for (i = 0; i < priv->child_count; i++) {
+ local->read_attempted[i] = 0;
+ local->readable[i] = 0;
+ }
+}
+
+/*
+ afr_read_txn:
+
+ This is the read transaction function. The way it works:
+
+ - Determine read-subvolume from inode ctx.
+
+ - If read-subvolume's generation was stale, refresh ctx once by
+ calling afr_inode_refresh()
+
+ Else make an attempt to read on read-subvolume.
+
+ - If attempted read on read-subvolume fails, refresh ctx once
+ by calling afr_inode_refresh()
+
+ - After ctx refresh, query read-subvolume freshly and attempt
+ read once.
+
+   - If the read fails, try every other readable[] subvolume before
+     finally giving up. readable[] elements are set by afr_inode_refresh()
+     based on dirty and pending flags (see the sketch after this comment).
+
+   - If the file is in split-brain on the backend, the generation will be
+     kept 0 by afr_inode_refresh() and readable[] will be set to 0 for all
+     elements. Therefore reads always fail.
+*/
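+
+/* A minimal illustrative sketch (simplified, hypothetical helper; not part
+   of the AFR code above) of the retry step described in this comment: the
+   readable[] and read_attempted[] arrays decide which copy the next read
+   attempt is wound to.
+
+       static int
+       pick_next_read_subvol(const unsigned char *readable,
+                             unsigned char *attempted, int child_count)
+       {
+           int i;
+
+           for (i = 0; i < child_count; i++) {
+               if (!readable[i]) {
+                   attempted[i] = 1;   // never retry an unreadable copy
+                   continue;
+               }
+               if (!attempted[i]) {
+                   attempted[i] = 1;   // claim this copy for the next attempt
+                   return i;
+               }
+           }
+           return -1;                  // every readable copy has been tried
+       }
+*/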
+
+int
+afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *data = NULL;
+ unsigned char *metadata = NULL;
+ int read_subvol = -1;
+ int event_generation = 0;
+ int ret = -1;
+
+ priv = this->private;
+ local = frame->local;
+ data = alloca0(priv->child_count);
+ metadata = alloca0(priv->child_count);
+
+ afr_read_txn_wipe(frame, this);
+
+ local->readfn = readfn;
+ local->inode = inode_ref(inode);
+ local->is_read_txn = _gf_true;
+ local->transaction.type = type;
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_quorum_errno(priv);
+ goto read;
+ }
+
+ if (!afr_is_consistent_io_possible(local, priv, &local->op_errno)) {
+ local->op_ret = -1;
+ goto read;
+ }
+
+ if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+ local->op_ret = -1;
+ local->op_errno = -afr_quorum_errno(priv);
+ goto read;
+ }
+
+ if (priv->thin_arbiter_count &&
+ AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ if (local->child_up[0]) {
+ local->read_txn_query_child = AFR_CHILD_ZERO;
+ } else if (local->child_up[1]) {
+ local->read_txn_query_child = AFR_CHILD_ONE;
+ }
+ afr_ta_read_txn_synctask(frame, this);
+ return 0;
+ }
+
+ ret = afr_inode_read_subvol_get(inode, this, data, metadata,
+ &event_generation);
+ if (ret == -1)
+ /* very first transaction on this inode */
+ goto refresh;
+ AFR_INTERSECT(local->readable, data, metadata, priv->child_count);
+
+ gf_msg_debug(this->name, 0,
+ "%s: generation now vs cached: %d, "
+ "%d",
+ uuid_utoa(inode->gfid), local->event_generation,
+ event_generation);
+ if (afr_is_inode_refresh_reqd(inode, this, local->event_generation,
+ event_generation))
+ /* servers have disconnected / reconnected, and possibly
+ rebooted, very likely changing the state of freshness
+ of copies */
+ goto refresh;
+
+ read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable,
+ NULL);
+
+ if (read_subvol < 0 || read_subvol > priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "Unreadable subvolume %d found "
+ "with event generation %d for gfid %s.",
+ read_subvol, event_generation, uuid_utoa(inode->gfid));
+ goto refresh;
+ }
+
+ if (!local->child_up[read_subvol]) {
+ /* should never happen, just in case */
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR,
+ "subvolume %d is the "
+ "read subvolume in this generation, but is not up",
+ read_subvol);
+ goto refresh;
+ }
+
+ local->read_attempted[read_subvol] = 1;
+
+read:
+ afr_read_txn_wind(frame, this, read_subvol);
+
+ return 0;
+
+refresh:
+ afr_inode_refresh(frame, this, inode, NULL, afr_read_txn_refresh_done);
+
+ return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
deleted file mode 100644
index 857b9f499cc..00000000000
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ /dev/null
@@ -1,1090 +0,0 @@
-/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-#include "glusterfs.h"
-#include "afr.h"
-#include "xlator.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "md5.h"
-
-#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "afr-self-heal-algorithm.h"
-
-/*
- This file contains the various self-heal algorithms
-*/
-
-
-/*
- The "full" algorithm. Copies the entire file from
- source to sinks.
-*/
-
-
-static void
-sh_full_private_cleanup (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_full_private_t *sh_priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = sh->private;
-
- if (sh_priv)
- GF_FREE (sh_priv);
-}
-
-
-static int
-sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_call);
-
-static int
-sh_full_loop_driver_done (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_full_private_t *sh_priv = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- sh_priv = sh->private;
-
- sh_full_private_cleanup (frame, this);
- if (sh->op_failed) {
- gf_log (this->name, GF_LOG_INFO,
- "full self-heal aborting on %s",
- local->loc.path);
-
- local->self_heal.algo_abort_cbk (frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "full self-heal completed on %s",
- local->loc.path);
-
- local->self_heal.algo_completion_cbk (frame, this);
- }
- return 0;
-}
-
-static int
-sh_full_loop_return (call_frame_t *rw_frame, xlator_t *this, off_t offset)
-{
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * rw_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_full_private_t *sh_priv = NULL;
-
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- sh_priv = sh->private;
-
- AFR_STACK_DESTROY (rw_frame);
-
- sh_full_loop_driver (sh_frame, this, _gf_false);
-
- return 0;
-}
-
-
-static int
-sh_full_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t *rw_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- int child_index = (long) cookie;
- int call_count = 0;
-
- priv = this->private;
-
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- gf_log (this->name, GF_LOG_TRACE,
- "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
- op_ret, sh_local->loc.path, child_index,
- rw_sh->offset - op_ret);
-
- LOCK (&sh_frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "write to %s failed on subvolume %s (%s)",
- sh_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->op_failed = 1;
- }
- }
- UNLOCK (&sh_frame->lock);
-
- call_count = afr_frame_return (rw_frame);
-
- if (call_count == 0) {
- sh_full_loop_return (rw_frame, this, rw_sh->offset - op_ret);
- }
-
- return 0;
-}
-
-
-static int
-sh_full_read_cbk (call_frame_t *rw_frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct iatt *buf,
- struct iobref *iobref)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t *rw_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
- off_t offset = (long) cookie;
-
- priv = this->private;
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- call_count = sh->active_sinks;
-
- rw_local->call_count = call_count;
-
- gf_log (this->name, GF_LOG_TRACE,
- "read %d bytes of data from %s, offset %"PRId64"",
- op_ret, sh_local->loc.path, offset);
-
- if (op_ret <= 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "read from %s failed on subvolume %s (%s)",
- sh_local->loc.path,
- priv->children[sh->source]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- sh_full_loop_return (rw_frame, this, offset);
- return 0;
- }
-
- rw_sh->offset += op_ret;
-
- if (sh->file_has_holes) {
- if (iov_0filled (vector, count) == 0) {
- /* the iter function depends on the
- sh->offset already being updated
- above
- */
- gf_log (this->name, GF_LOG_DEBUG,
- "block has all 0 filled");
- sh_full_loop_return (rw_frame, this, offset);
- goto out;
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !sh_local->child_up[i])
- continue;
-
- /* this is a sink, so write to it */
-
- STACK_WIND_COOKIE (rw_frame, sh_full_write_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->writev,
- sh->healing_fd, vector, count, offset,
- iobref);
-
- if (!--call_count)
- break;
- }
-
-out:
- return 0;
-}
-
-
-static int
-sh_full_read_write (call_frame_t *frame, xlator_t *this, off_t offset)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t *rw_sh = NULL;
- afr_self_heal_t *sh = NULL;
- call_frame_t *rw_frame = NULL;
- int32_t op_errno = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- rw_frame = copy_frame (frame);
- if (!rw_frame)
- goto out;
-
- ALLOC_OR_GOTO (rw_local, afr_local_t, out);
-
- rw_frame->local = rw_local;
- rw_sh = &rw_local->self_heal;
-
- rw_sh->offset = offset;
- rw_sh->sh_frame = frame;
-
- STACK_WIND_COOKIE (rw_frame, sh_full_read_cbk,
- (void *) (long) offset,
- priv->children[sh->source],
- priv->children[sh->source]->fops->readv,
- sh->healing_fd, sh->block_size,
- offset);
- return 0;
-
-out:
- sh->op_failed = 1;
-
- sh_full_loop_driver (frame, this, _gf_false);
-
- return 0;
-}
-
-
-static int
-sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_call)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_full_private_t *sh_priv = NULL;
- gf_boolean_t is_driver_done = _gf_false;
- blksize_t block_size = 0;
- off_t offset = 0;
- int loop = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- sh_priv = sh->private;
-
- LOCK (&sh_priv->lock);
- {
- if (_gf_false == is_first_call)
- sh_priv->loops_running--;
- offset = sh_priv->offset;
- block_size = sh->block_size;
- while ((sh->op_failed == 0) &&
- (sh_priv->loops_running < priv->data_self_heal_window_size)
- && (sh_priv->offset < sh->file_size)) {
-
- loop++;
- gf_log (this->name, GF_LOG_TRACE,
- "spawning a loop for offset %"PRId64,
- sh_priv->offset);
-
- sh_priv->offset += sh->block_size;
- sh_priv->loops_running++;
-
- if (_gf_false == is_first_call)
- break;
-
- }
- if (0 == sh_priv->loops_running) {
- is_driver_done = _gf_true;
- }
- }
- UNLOCK (&sh_priv->lock);
-
- while (loop--) {
- if (sh->op_failed) {
- // op failed in other loop, stop spawning more loops
- sh_full_loop_driver (frame, this, _gf_false);
- } else {
- sh_full_read_write (frame, this, offset);
- offset += block_size;
- }
- }
-
- if (is_driver_done) {
- sh_full_loop_driver_done (frame, this);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_algo_full (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_full_private_t *sh_priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = GF_CALLOC (1, sizeof (*sh_priv),
- gf_afr_mt_afr_private_t);
- if (!sh_priv)
- goto out;
-
- LOCK_INIT (&sh_priv->lock);
-
- sh->private = sh_priv;
-
- local->call_count = 0;
-
- sh_full_loop_driver (frame, this, _gf_true);
-out:
- return 0;
-}
-
-
-/*
- * The "diff" algorithm. Copies only those blocks whose checksums
- * don't match with those of source.
- */
-
-
-static void
-sh_diff_private_cleanup (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_diff_private_t *sh_priv = NULL;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = sh->private;
-
- for (i = 0; i < priv->data_self_heal_window_size; i++) {
- if (sh_priv->loops[i]) {
- if (sh_priv->loops[i]->write_needed)
- GF_FREE (sh_priv->loops[i]->write_needed);
-
- if (sh_priv->loops[i]->checksum)
- GF_FREE (sh_priv->loops[i]->checksum);
-
- GF_FREE (sh_priv->loops[i]);
- }
- }
-
- if (sh_priv) {
- if (sh_priv->loops)
- GF_FREE (sh_priv->loops);
-
- GF_FREE (sh_priv);
- }
-
-
-}
-
-
-static uint32_t
-__make_cookie (int loop_index, int child_index)
-{
- uint32_t ret = ((loop_index << 16) | child_index);
- return ret;
-}
-
-
-static int
-__loop_index (uint32_t cookie)
-{
- return ((cookie & 0xFFFF0000) >> 16);
-}
-
-
-static int
-__child_index (uint32_t cookie)
-{
- return (cookie & 0x0000FFFF);
-}
-
-
-static void
-sh_diff_loop_state_reset (struct sh_diff_loop_state *loop_state, int child_count)
-{
- loop_state->active = _gf_false;
-// loop_state->offset = 0;
-
- memset (loop_state->write_needed,
- 0, sizeof (*loop_state->write_needed) * child_count);
-
- memset (loop_state->checksum,
- 0, MD5_DIGEST_LEN * child_count);
-}
-
-
-static int
-sh_diff_number_of_writes_needed (unsigned char *write_needed, int child_count)
-{
- int writes = 0;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (write_needed[i])
- writes++;
- }
-
- return writes;
-}
-
-
-static int
-sh_diff_loop_driver_done (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_diff_private_t *sh_priv = NULL;
- int32_t total_blocks = 0;
- int32_t diff_blocks = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- sh_priv = sh->private;
- total_blocks = sh_priv->total_blocks;
- diff_blocks = sh_priv->diff_blocks;
-
- sh_diff_private_cleanup (frame, this);
- if (sh->op_failed) {
- gf_log (this->name, GF_LOG_INFO,
- "diff self-heal aborting on %s",
- local->loc.path);
-
- local->self_heal.algo_abort_cbk (frame, this);
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "diff self-heal on %s: completed. "
- "(%d blocks of %d were different (%.2f%%))",
- local->loc.path, diff_blocks, total_blocks,
- ((diff_blocks * 1.0)/total_blocks) * 100);
-
- local->self_heal.algo_completion_cbk (frame, this);
- }
-
- return 0;
-}
-
-static int
-sh_diff_loop_driver (call_frame_t *frame, xlator_t *this,
- gf_boolean_t is_first_call,
- struct sh_diff_loop_state *loop_state);
-
-static int
-sh_diff_loop_return (call_frame_t *rw_frame, xlator_t *this,
- struct sh_diff_loop_state *loop_state)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * rw_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_diff_private_t *sh_priv = NULL;
-
- priv = this->private;
-
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- sh_priv = sh->private;
-
- gf_log (this->name, GF_LOG_TRACE,
- "loop for offset %"PRId64" returned", loop_state->offset);
-
- AFR_STACK_DESTROY (rw_frame);
-
- sh_diff_loop_driver (sh_frame, this, _gf_false, loop_state);
-
- return 0;
-}
-
-
-static int
-sh_diff_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *postbuf)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * rw_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_diff_private_t *sh_priv = NULL;
- struct sh_diff_loop_state *loop_state = NULL;
- int call_count = 0;
- int child_index = 0;
- int loop_index = 0;
-
- priv = this->private;
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- sh_priv = sh->private;
-
- child_index = __child_index ((uint32_t) (long) cookie);
- loop_index = __loop_index ((uint32_t) (long) cookie);
- loop_state = sh_priv->loops[loop_index];
-
- gf_log (this->name, GF_LOG_TRACE,
- "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
- op_ret, sh_local->loc.path, child_index,
- loop_state->offset);
-
- LOCK (&sh_frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "write to %s failed on subvolume %s (%s)",
- sh_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->op_failed = 1;
- }
- }
- UNLOCK (&sh_frame->lock);
-
- call_count = afr_frame_return (rw_frame);
-
- if (call_count == 0) {
- sh_diff_loop_return (rw_frame, this, loop_state);
- }
-
- return 0;
-}
-
-
-static int
-sh_diff_read_cbk (call_frame_t *rw_frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct iatt *buf,
- struct iobref *iobref)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * rw_sh = NULL;
- afr_sh_algo_diff_private_t * sh_priv = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- int loop_index = 0;
- struct sh_diff_loop_state *loop_state = NULL;
- uint32_t wcookie = 0;
- int i = 0;
- int call_count = 0;
-
- priv = this->private;
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- sh_priv = sh->private;
-
- loop_index = __loop_index ((uint32_t) (long) cookie);
- loop_state = sh_priv->loops[loop_index];
-
- call_count = sh_diff_number_of_writes_needed (loop_state->write_needed,
- priv->child_count);
-
- rw_local->call_count = call_count;
-
- gf_log (this->name, GF_LOG_TRACE,
- "read %d bytes of data from %s, offset %"PRId64"",
- op_ret, sh_local->loc.path, loop_state->offset);
-
- if ((op_ret <= 0) ||
- (call_count == 0)) {
- sh_diff_loop_return (rw_frame, this, loop_state);
-
- return 0;
- }
-
- if (sh->file_has_holes) {
- if (iov_0filled (vector, count) == 0) {
- gf_log (this->name, GF_LOG_DEBUG, "0 filled block");
- sh_diff_loop_return (rw_frame, this, loop_state);
- goto out;
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (loop_state->write_needed[i]) {
- wcookie = __make_cookie (loop_index, i);
-
- STACK_WIND_COOKIE (rw_frame, sh_diff_write_cbk,
- (void *) (long) wcookie,
- priv->children[i],
- priv->children[i]->fops->writev,
- sh->healing_fd, vector, count,
- loop_state->offset, iobref);
-
- if (!--call_count)
- break;
- }
- }
-
-out:
- return 0;
-}
-
-
-static int
-sh_diff_read (call_frame_t *rw_frame, xlator_t *this,
- int loop_index)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * rw_sh = NULL;
- afr_sh_algo_diff_private_t * sh_priv = NULL;
- struct sh_diff_loop_state *loop_state = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- uint32_t cookie = 0;
-
- priv = this->private;
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- sh_priv = sh->private;
-
- loop_state = sh_priv->loops[loop_index];
-
- cookie = __make_cookie (loop_index, sh->source);
-
- STACK_WIND_COOKIE (rw_frame, sh_diff_read_cbk,
- (void *) (long) cookie,
- priv->children[sh->source],
- priv->children[sh->source]->fops->readv,
- sh->healing_fd, sh_priv->block_size,
- loop_state->offset);
-
- return 0;
-}
-
-
-static int
-sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- uint32_t weak_checksum, uint8_t *strong_checksum)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t *rw_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_diff_private_t * sh_priv = NULL;
- int loop_index = 0;
- int child_index = 0;
- struct sh_diff_loop_state *loop_state = NULL;
- int call_count = 0;
- int i = 0;
- int write_needed = 0;
-
- priv = this->private;
-
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- sh_priv = sh->private;
-
- child_index = __child_index ((uint32_t) (long) cookie);
- loop_index = __loop_index ((uint32_t) (long) cookie);
-
- loop_state = sh_priv->loops[loop_index];
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "checksum on %s failed on subvolume %s (%s)",
- sh_local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->op_failed = 1;
- } else {
- memcpy (loop_state->checksum + child_index * MD5_DIGEST_LEN,
- strong_checksum,
- MD5_DIGEST_LEN);
- }
-
- call_count = afr_frame_return (rw_frame);
-
- if (call_count == 0) {
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !sh_local->child_up[i])
- continue;
-
- if (memcmp (loop_state->checksum + (i * MD5_DIGEST_LEN),
- loop_state->checksum + (sh->source * MD5_DIGEST_LEN),
- MD5_DIGEST_LEN)) {
- /*
- Checksums differ, so this block
- must be written to this sink
- */
-
- gf_log (this->name, GF_LOG_DEBUG,
- "checksum on subvolume %s at offset %"
- PRId64" differs from that on source",
- priv->children[i]->name, loop_state->offset);
-
- write_needed = loop_state->write_needed[i] = 1;
- }
- }
-
- LOCK (&sh_priv->lock);
- {
- sh_priv->total_blocks++;
- if (write_needed)
- sh_priv->diff_blocks++;
- }
- UNLOCK (&sh_priv->lock);
-
- if (write_needed && !sh->op_failed) {
- sh_diff_read (rw_frame, this, loop_index);
- } else {
- sh->offset += sh_priv->block_size;
-
- sh_diff_loop_return (rw_frame, this, loop_state);
- }
- }
-
- return 0;
-}
-
-
-static int
-sh_diff_find_unused_loop (afr_sh_algo_diff_private_t *sh_priv, int max)
-{
- int i = 0;
-
- LOCK (&sh_priv->lock);
- {
- for (i = 0; i < max; i++) {
- if (sh_priv->loops[i]->active == _gf_false) {
- sh_priv->loops[i]->active = _gf_true;
- break;
- }
- }
- }
- UNLOCK (&sh_priv->lock);
-
- if (i == max) {
- gf_log ("[sh-diff]", GF_LOG_ERROR,
- "no free loops found! This shouldn't happen. Please"
- " report this to gluster-devel@nongnu.org");
- }
-
- return i;
-}
-
-
-static int
-sh_diff_checksum (call_frame_t *frame, xlator_t *this, off_t offset)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_self_heal_t * rw_sh = NULL;
- afr_sh_algo_diff_private_t * sh_priv = NULL;
- call_frame_t *rw_frame = NULL;
- uint32_t cookie = 0;
- int loop_index = 0;
- struct sh_diff_loop_state *loop_state = NULL;
- int32_t op_errno = 0;
- int call_count = 0;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = sh->private;
-
- rw_frame = copy_frame (frame);
- if (!rw_frame)
- goto out;
-
- ALLOC_OR_GOTO (rw_local, afr_local_t, out);
-
- rw_frame->local = rw_local;
- rw_sh = &rw_local->self_heal;
-
- rw_sh->offset = sh->offset;
- rw_sh->sh_frame = frame;
-
- call_count = sh->active_sinks + 1; /* sinks and source */
-
- rw_local->call_count = call_count;
-
- loop_index = sh_diff_find_unused_loop (sh_priv, priv->data_self_heal_window_size);
-
- loop_state = sh_priv->loops[loop_index];
- loop_state->offset = offset;
-
- /* we need to send both the loop index and child index,
- so squeeze them both into a 32-bit number */
-
- cookie = __make_cookie (loop_index, sh->source);
-
- STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk,
- (void *) (long) cookie,
- priv->children[sh->source],
- priv->children[sh->source]->fops->rchecksum,
- sh->healing_fd,
- offset, sh_priv->block_size);
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !local->child_up[i])
- continue;
-
- cookie = __make_cookie (loop_index, i);
-
- STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk,
- (void *) (long) cookie,
- priv->children[i],
- priv->children[i]->fops->rchecksum,
- sh->healing_fd,
- offset, sh_priv->block_size);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-
-out:
- sh->op_failed = 1;
-
- sh_diff_loop_driver (frame, this, _gf_false, loop_state);
-
- return 0;
-}
-
-
-static int
-sh_diff_loop_driver (call_frame_t *frame, xlator_t *this,
- gf_boolean_t is_first_call,
- struct sh_diff_loop_state *loop_state)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_diff_private_t *sh_priv = NULL;
- gf_boolean_t is_driver_done = _gf_false;
- blksize_t block_size = 0;
- int loop = 0;
- off_t offset = 0;
- char sh_type_str[256] = {0,};
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- sh_priv = sh->private;
-
- afr_self_heal_type_str_get(sh, sh_type_str, sizeof(sh_type_str));
-
- LOCK (&sh_priv->lock);
- {
- if (loop_state)
- sh_diff_loop_state_reset (loop_state, priv->child_count);
- if (_gf_false == is_first_call)
- sh_priv->loops_running--;
- offset = sh_priv->offset;
- block_size = sh_priv->block_size;
- while ((0 == sh->op_failed) &&
- (sh_priv->loops_running < priv->data_self_heal_window_size)
- && (sh_priv->offset < sh->file_size)) {
-
- loop++;
- gf_log (this->name, GF_LOG_TRACE,
- "spawning a loop for offset %"PRId64,
- sh_priv->offset);
-
- sh_priv->offset += sh_priv->block_size;
- sh_priv->loops_running++;
-
- if (_gf_false == is_first_call)
- break;
-
- }
- if (0 == sh_priv->loops_running) {
- is_driver_done = _gf_true;
- }
- }
- UNLOCK (&sh_priv->lock);
-
- while (loop--) {
- if (sh->op_failed) {
- // op failed in other loop, stop spawning more loops
- sh_diff_loop_driver (frame, this, _gf_false, NULL);
- } else {
- sh_diff_checksum (frame, this, offset);
- offset += block_size;
- }
- }
-
- if (is_driver_done) {
- sh_diff_loop_driver_done (frame, this);
- }
- return 0;
-}
-
-
-int
-afr_sh_algo_diff (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_diff_private_t *sh_priv = NULL;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = GF_CALLOC (1, sizeof (*sh_priv),
- gf_afr_mt_afr_private_t);
- if (!sh_priv)
- goto err;
-
- sh_priv->block_size = this->ctx->page_size;
-
- sh->private = sh_priv;
-
- LOCK_INIT (&sh_priv->lock);
-
- local->call_count = 0;
-
- sh_priv->loops = GF_CALLOC (priv->data_self_heal_window_size,
- sizeof (*sh_priv->loops),
- gf_afr_mt_sh_diff_loop_state);
- if (!sh_priv->loops)
- goto err;
-
- for (i = 0; i < priv->data_self_heal_window_size; i++) {
- sh_priv->loops[i] = GF_CALLOC (1, sizeof (*sh_priv->loops[i]),
- gf_afr_mt_sh_diff_loop_state);
- if (!sh_priv->loops[i])
- goto err;
-
- sh_priv->loops[i]->checksum = GF_CALLOC (priv->child_count,
- MD5_DIGEST_LEN, gf_afr_mt_uint8_t);
- if (!sh_priv->loops[i]->checksum)
- goto err;
-
- sh_priv->loops[i]->write_needed = GF_CALLOC (priv->child_count,
- sizeof (*sh_priv->loops[i]->write_needed),
- gf_afr_mt_char);
- if (!sh_priv->loops[i]->write_needed)
- goto err;
-
- }
-
- sh_diff_loop_driver (frame, this, _gf_true, NULL);
-
- return 0;
-err:
- if (sh_priv) {
- if (sh_priv->loops) {
- for (i = 0; i < priv->data_self_heal_window_size; i++) {
- if (sh_priv->loops[i]->write_needed)
- GF_FREE (sh_priv->loops[i]->write_needed);
- if (sh_priv->loops[i]->checksum)
- GF_FREE (sh_priv->loops[i]->checksum);
- if (sh_priv->loops[i])
- GF_FREE (sh_priv->loops[i]);
- }
-
- GF_FREE (sh_priv->loops);
- }
-
- GF_FREE (sh_priv);
- }
- return 0;
-}
-
-
-struct afr_sh_algorithm afr_self_heal_algorithms[] = {
- {.name = "full", .fn = afr_sh_algo_full},
- {.name = "diff", .fn = afr_sh_algo_diff},
- {0, 0},
-};
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h
deleted file mode 100644
index 2790dbc6a52..00000000000
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __AFR_SELF_HEAL_ALGORITHM_H__
-#define __AFR_SELF_HEAL_ALGORITHM_H__
-
-
-typedef int (*afr_sh_algo_fn) (call_frame_t *frame,
- xlator_t *this);
-
-struct afr_sh_algorithm {
- const char *name;
- afr_sh_algo_fn fn;
-};
-
-extern struct afr_sh_algorithm afr_self_heal_algorithms[3];
-
-typedef struct {
- gf_lock_t lock;
- unsigned int loops_running;
- off_t offset;
-} afr_sh_algo_full_private_t;
-
-struct sh_diff_loop_state {
- off_t offset;
- unsigned char *write_needed;
- uint8_t *checksum;
- gf_boolean_t active;
-};
-
-typedef struct {
- size_t block_size;
-
- gf_lock_t lock;
- unsigned int loops_running;
- off_t offset;
-
- int32_t total_blocks;
- int32_t diff_blocks;
-
- struct sh_diff_loop_state **loops;
-} afr_sh_algo_diff_private_t;
-
-#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index b925b0adcef..a580a1584cc 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1,2288 +1,2934 @@
/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
-#include "byte-order.h"
-
#include "afr.h"
-#include "afr-transaction.h"
-#include "afr-self-heal-common.h"
#include "afr-self-heal.h"
-#include "pump.h"
+#include <glusterfs/byte-order.h>
+#include "protocol-common.h"
+#include "afr-messages.h"
+#include <glusterfs/events.h>
-//Intersection[child]=1 if child is part of intersection
void
-afr_children_intersection_get (int32_t *set1, int32_t *set2,
- int *intersection, unsigned int child_count)
-{
- int i = 0;
+afr_heal_synctask(xlator_t *this, afr_local_t *local);
- memset (intersection, 0, sizeof (*intersection) * child_count);
- for (i = 0; i < child_count; i++) {
- intersection[i] = afr_is_child_present (set1, child_count, i)
- && afr_is_child_present (set2, child_count,
- i);
+int
+afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
+ inode_t *inode, struct afr_reply *replies, int source,
+ unsigned char *sources, void *gfid, int *gfid_idx)
+{
+ afr_private_t *priv = NULL;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_on = NULL;
+ ia_type_t ia_type = IA_INVAL;
+ dict_t *xdata = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
+ int i = 0;
+
+ priv = this->private;
+ wind_on = alloca0(priv->child_count);
+ if (source >= 0 && replies[source].valid && replies[source].op_ret == 0)
+ ia_type = replies[source].poststat.ia_type;
+
+ if (ia_type != IA_INVAL)
+ goto heal;
+
+ /* If ia_type is still invalid, it means either
+     * (a) 'source' was -1, i.e. the parent dir's pending xattrs are in
+     * split-brain, or (b) the parent dir's pending xattrs are all zeroes
+     * (i.e. all bricks are sources) and the 'source' we selected earlier
+     * might be one where the file is not actually present.
+ *
+ * In both cases, let us pick a brick with a successful reply and use its
+ * ia_type.
+ * */
+ for (i = 0; i < priv->child_count; i++) {
+ if (source == -1) {
+ /* case (a) above. */
+ if (replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
+ ia_type = replies[i].poststat.ia_type;
+ break;
+ }
+ } else {
+ /* case (b) above. */
+ if (i == source)
+ continue;
+ if (sources[i] && replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
+ ia_type = replies[i].poststat.ia_type;
+ break;
+ }
}
-}
+ }
-/**
- * select_source - select a source and return it
- */
+heal:
+ /* gfid heal on those subvolumes that do not have gfid associated
+ * with the inode and update those replies.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
-int
-afr_sh_select_source (int sources[], int child_count)
-{
- int i = 0;
- for (i = 0; i < child_count; i++)
- if (sources[i])
- return i;
+ if (gf_uuid_is_null(gfid) &&
+ !gf_uuid_is_null(replies[i].poststat.ia_gfid) &&
+ replies[i].poststat.ia_type == ia_type)
+ gfid = replies[i].poststat.ia_gfid;
- return -1;
-}
+ if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) ||
+ replies[i].poststat.ia_type != ia_type)
+ continue;
+ wind_on[i] = 1;
+ }
-/**
- * sink_count - return number of sinks in sources array
- */
+ if (AFR_COUNT(wind_on, priv->child_count) == 0)
+ return 0;
-int
-afr_sh_sink_count (int sources[], int child_count)
-{
- int i = 0;
- int sinks = 0;
- for (i = 0; i < child_count; i++)
- if (!sources[i])
- sinks++;
- return sinks;
+ xdata = dict_new();
+ if (!xdata) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_gfuuid(xdata, "gfid-req", gfid, true);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ local = frame->local;
+ loc.parent = inode_ref(parent);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
+
+ AFR_ONLIST(wind_on, frame, afr_selfheal_discover_cbk, lookup, &loc, xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ afr_reply_wipe(&replies[i]);
+ afr_reply_copy(&replies[i], &local->replies[i]);
+ }
+ if (gfid_idx && (*gfid_idx == -1)) {
+        /* Pick a brick where the gfid heal was successful. */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ if (replies[i].valid && replies[i].op_ret == 0 &&
+ !gf_uuid_is_null(replies[i].poststat.ia_gfid)) {
+ *gfid_idx = i;
+ break;
+ }
+ }
+ }
+out:
+ if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) {
+ ret = -afr_final_errno(local, priv);
+ }
+ loc_wipe(&loc);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ if (xdata)
+ dict_unref(xdata);
+
+ return ret;
}
int
-afr_sh_source_count (int sources[], int child_count)
+afr_gfid_sbrain_source_from_src_brick(xlator_t *this, struct afr_reply *replies,
+ char *src_brick)
{
- int i = 0;
- int nsource = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
- for (i = 0; i < child_count; i++)
- if (sources[i])
- nsource++;
- return nsource;
-}
-
-void
-afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno)
-{
- sh->op_ret = -1;
- if (afr_error_more_important (sh->op_errno, op_errno))
- sh->op_errno = op_errno;
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (strcmp(priv->children[i]->name, src_brick) == 0)
+ return i;
+ }
+ return -1;
}
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
+int
+afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies,
+ int child_count)
{
- afr_private_t * priv = this->private;
- char *buf = NULL;
- char *ptr = NULL;
- int i = 0;
- int j = 0;
+ int j = 0;
+ int i = 0;
+ int votes;
- /* 10 digits per entry + 1 space + '[' and ']' */
- buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char);
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
- for (i = 0; i < priv->child_count; i++) {
- ptr = buf;
- ptr += sprintf (ptr, "[ ");
- for (j = 0; j < priv->child_count; j++) {
- ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
- }
- sprintf (ptr, "]");
- gf_log (this->name, GF_LOG_TRACE,
- "pending_matrix: %s", buf);
+ votes = 1;
+ for (j = i + 1; j < child_count; j++) {
+ if ((!gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[j].poststat.ia_gfid)))
+ votes++;
+ if (votes > child_count / 2)
+ return i;
}
+ }
- GF_FREE (buf);
+ return -1;
}
-void
-afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count)
+int
+afr_gfid_sbrain_source_from_bigger_file(struct afr_reply *replies,
+ int child_count)
{
- int i = 0;
- int j = 0;
-
- GF_ASSERT (pending_matrix);
+ int i = 0;
+ int src = -1;
+ uint64_t size = 0;
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- pending_matrix[i][j] = 0;
- }
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (size < replies[i].poststat.ia_size) {
+ src = i;
+ size = replies[i].poststat.ia_size;
+ } else if (replies[i].poststat.ia_size == size) {
+ src = -1;
}
+ }
+ return src;
}
-void
-afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix,
- unsigned char *ignorant_subvols,
- size_t child_count)
-{
- int i = 0;
- int j = 0;
-
- GF_ASSERT (pending_matrix);
- GF_ASSERT (ignorant_subvols);
-
- for (i = 0; i < child_count; i++) {
- if (ignorant_subvols[i]) {
- for (j = 0; j < child_count; j++) {
- if (!ignorant_subvols[j])
- pending_matrix[j][i] += 1;
- }
- }
- }
+int
+afr_gfid_sbrain_source_from_latest_mtime(struct afr_reply *replies,
+ int child_count)
+{
+ int i = 0;
+ int src = -1;
+ uint32_t mtime = 0;
+ uint32_t mtime_nsec = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+ if ((mtime < replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
+ src = i;
+ mtime = replies[i].poststat.ia_mtime;
+ mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ } else if ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) {
+ src = -1;
+ }
+ }
+ return src;
}
int
-afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
- dict_t *xattr[], afr_transaction_type type,
- size_t child_count)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
- int k = 0;
- unsigned char *ignorant_subvols = NULL;
-
- ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count,
- gf_afr_mt_char);
- if (NULL == ignorant_subvols)
- goto out;
-
- afr_init_pending_matrix (pending_matrix, child_count);
-
- for (i = 0; i < child_count; i++) {
- pending_raw = NULL;
+afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, uuid_t pargfid, const char *bname,
+ int src_idx, int child_idx,
+ unsigned char *locked_on, int *src, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ char g1[64] = {
+ 0,
+ };
+ char g2[64] = {
+ 0,
+ };
+ int up_count = 0;
+ int heal_op = -1;
+ int ret = -1;
+ char *src_brick = NULL;
+
+ *src = -1;
+ priv = this->private;
+ up_count = AFR_COUNT(locked_on, priv->child_count);
+ if (up_count != priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "All the bricks should be up to resolve the gfid split "
+               "brain");
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SALL_BRICKS_UP_TO_RESOLVE);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "Error setting"
+ " gfid-heal-msg dict");
+ }
+ goto out;
+ }
+
+ if (xdata) {
+ ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op);
+ if (ret)
+ goto fav_child;
+ } else {
+ goto fav_child;
+ }
+
+ switch (heal_op) {
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ *src = afr_gfid_sbrain_source_from_bigger_file(replies,
+ priv->child_count);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SNO_BIGGER_FILE);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SNO_BIGGER_FILE);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ " setting gfid-heal-msg dict");
+ }
+ }
+ break;
+
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+ *src = afr_gfid_sbrain_source_from_latest_mtime(replies,
+ priv->child_count);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SNO_DIFF_IN_MTIME);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SNO_DIFF_IN_MTIME);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+                           " setting gfid-heal-msg dict");
+ }
+ }
+ break;
+
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+ ret = dict_get_str_sizen(xdata, "child-name", &src_brick);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Error getting the source "
+ "brick");
+ break;
+ }
+ *src = afr_gfid_sbrain_source_from_src_brick(this, replies,
+ src_brick);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SERROR_GETTING_SRC_BRICK);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SERROR_GETTING_SRC_BRICK);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ " setting gfid-heal-msg dict");
+ }
+ }
+ break;
- for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], pending_key[j],
- &pending_raw);
+ default:
+ break;
+ }
+ goto out;
+
+fav_child:
+ switch (priv->fav_child_policy) {
+ case AFR_FAV_CHILD_BY_SIZE:
+ *src = afr_sh_fav_by_size(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_MTIME:
+ *src = afr_sh_fav_by_mtime(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_CTIME:
+ *src = afr_sh_fav_by_ctime(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_MAJORITY:
+ if (priv->child_count != 2)
+ *src = afr_selfheal_gfid_mismatch_by_majority(
+ replies, priv->child_count);
+ else
+ *src = -1;
+
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "No majority to resolve "
+ "gfid split brain");
+ }
+ break;
+ default:
+ break;
+ }
- if (ret != 0) {
- /*
- * There is no xattr present. This means this
- * subvolume should be considered an 'ignorant'
- * subvolume.
- */
+out:
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+               "Gfid mismatch detected for <gfid:%s>/%s, %s on %s and"
+ " %s on %s.",
+ uuid_utoa(pargfid), bname,
+ uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1),
+ priv->children[child_idx]->name,
+ uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2),
+ priv->children[src_idx]->name);
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=gfid;file="
+                 "<gfid:%s>/%s;count=2;child-%d=%s;gfid-%d=%s;"
+ "child-%d=%s;gfid-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid),
+ bname, child_idx, priv->children[child_idx]->name, child_idx,
+ uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx,
+ priv->children[src_idx]->name, src_idx,
+ uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2));
+ return -1;
+ }
+ return 0;
+}
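+/* Editor's sketch (hypothetical usage, not part of the upstream change): a
+ * caller that wants the gfid split-brain resolved from a particular brick is
+ * expected to pass the request through xdata, along the lines of
+ *
+ *     dict_t *xdata = dict_new();
+ *     dict_set_int32(xdata, "heal-op", GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK);
+ *     dict_set_str(xdata, "child-name", "myvol-client-0");  (brick name made up)
+ *
+ * On failure the routine above answers through the "gfid-heal-msg" key of the
+ * same xdata; when no "heal-op" key is present it falls back to the configured
+ * favorite-child policy. */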
- ignorant_subvols[i] = 1;
- continue;
- }
+int
+afr_selfheal_post_op_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- memcpy (pending, pending_raw, sizeof(pending));
- k = afr_index_for_transaction_type (type);
+ local = frame->local;
- pending_matrix[i][j] = ntoh32 (pending[k]);
- }
- }
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ syncbarrier_wake(&local->barrier);
- afr_mark_ignorant_subvols_as_pending (pending_matrix,
- ignorant_subvols,
- child_count);
- GF_FREE (ignorant_subvols);
-out:
- return ret;
+ return 0;
}
-typedef enum {
- AFR_NODE_INNOCENT,
- AFR_NODE_FOOL,
- AFR_NODE_WISE,
- AFR_NODE_INVALID = -1,
-} afr_node_type;
+int
+afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
-typedef struct {
- afr_node_type type;
- int wisdom;
-} afr_node_character;
+ priv = this->private;
+ local = frame->local;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
-static int
-afr_sh_is_innocent (int32_t *array, int child_count)
-{
- int i = 0;
- int ret = 1; /* innocent until proven guilty */
+ local->op_ret = 0;
- for (i = 0; i < child_count; i++) {
- if (array[i]) {
- ret = 0;
- break;
- }
- }
+ STACK_WIND(frame, afr_selfheal_post_op_cbk, priv->children[subvol],
+ priv->children[subvol]->fops->xattrop, &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, xdata);
- return ret;
-}
+ syncbarrier_wait(&local->barrier, 1);
+ if (local->op_ret < 0)
+ ret = -local->op_errno;
+ loc_wipe(&loc);
+ local->op_ret = 0;
-static int
-afr_sh_is_fool (int32_t *array, int i, int child_count)
-{
- return array[i]; /* fool if accuses itself */
+ return ret;
}
-
-static int
-afr_sh_is_wise (int32_t *array, int i, int child_count)
-{
- return !array[i]; /* wise if does not accuse itself */
+int
+afr_check_stale_error(struct afr_reply *replies, afr_private_t *priv)
+{
+ int i = 0;
+ int op_errno = 0;
+ int tmp_errno = 0;
+ int stale_count = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ tmp_errno = replies[i].op_errno;
+ if (tmp_errno == ENOENT || tmp_errno == ESTALE) {
+ op_errno = afr_higher_errno(op_errno, tmp_errno);
+ stale_count++;
+ }
+ }
+ if (stale_count != priv->child_count)
+ return -ENOTCONN;
+ else
+ return -op_errno;
}
-
-static int
-afr_sh_all_nodes_innocent (afr_node_character *characters,
- int child_count)
+int
+afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
{
- int i = 0;
- int ret = 1;
+ int i = (long)cookie;
+ afr_local_t *local = NULL;
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_INNOCENT) {
- ret = 0;
- break;
- }
- }
+ local = frame->local;
- return ret;
-}
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (pre)
+ local->replies[i].prestat = *pre;
+ if (post)
+ local->replies[i].poststat = *post;
+ if (xdata)
+ local->replies[i].xdata = dict_ref(xdata);
+ syncbarrier_wake(&local->barrier);
-static int
-afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count)
-{
- int i = 0;
- int ret = 0;
+ return 0;
+}
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- ret = 1;
- break;
- }
+int
+afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies)
+{
+ loc_t loc = {
+ 0,
+ };
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc,
+ &replies[source].poststat,
+ (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME),
+ NULL);
+
+ loc_wipe(&loc);
+
+ return 0;
+}
+
+dict_t *
+afr_selfheal_output_xattr(xlator_t *this, gf_boolean_t is_full_crawl,
+ afr_transaction_type type, int *output_dirty,
+ int **output_matrix, int subvol,
+ int **full_heal_mtx_out)
+{
+ int j = 0;
+ int idx = 0;
+ int d_idx = 0;
+ int ret = 0;
+ int *raw = 0;
+ dict_t *xattr = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ idx = afr_index_for_transaction_type(type);
+ d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+ xattr = dict_new();
+ if (!xattr)
+ return NULL;
+
+ /* clear dirty */
+ raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
+
+ raw[idx] = hton32(output_dirty[subvol]);
+ ret = dict_set_bin(xattr, AFR_DIRTY, raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ GF_FREE(raw);
+ goto err;
+ }
+
+ /* clear/set pending */
+ for (j = 0; j < priv->child_count; j++) {
+ raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
+
+ raw[idx] = hton32(output_matrix[subvol][j]);
+ if (is_full_crawl)
+ raw[d_idx] = hton32(full_heal_mtx_out[subvol][j]);
+
+ ret = dict_set_bin(xattr, priv->pending_key[j], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ GF_FREE(raw);
+ goto err;
}
+ }
- return ret;
+ return xattr;
+err:
+ if (xattr)
+ dict_unref(xattr);
+ return NULL;
}
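+/* Editor's note (illustrative, not part of the upstream change): the dict
+ * built above is the delta that callers such as afr_selfheal_undo_pending()
+ * hand to afr_selfheal_post_op(), which applies it via an xattrop with
+ * GF_XATTROP_ADD_ARRAY. It carries one AFR_DIRTY key plus one
+ * priv->pending_key[j] key per brick, each an array of AFR_NUM_CHANGE_LOGS
+ * network-order int32 counters in which only the slot picked by
+ * afr_index_for_transaction_type() (and, on a full crawl, the data slot) is
+ * non-zero. */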
+int
+afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type, struct afr_reply *replies,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int j = 0;
+ unsigned char *pending = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int **full_heal_mtx_in = NULL;
+ int **full_heal_mtx_out = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ pending = alloca0(priv->child_count);
+
+ input_dirty = alloca0(priv->child_count * sizeof(int));
+ input_matrix = ALLOC_MATRIX(priv->child_count, int);
+ full_heal_mtx_in = ALLOC_MATRIX(priv->child_count, int);
+ full_heal_mtx_out = ALLOC_MATRIX(priv->child_count, int);
+ output_dirty = alloca0(priv->child_count * sizeof(int));
+ output_matrix = ALLOC_MATRIX(priv->child_count, int);
+
+ xdata = dict_new();
+ if (!xdata)
+ return -1;
-/*
- * The 'wisdom' of a wise node is 0 if any other wise node accuses it.
- * It is 1 if no other wise node accuses it.
- * Only wise nodes with wisdom 1 are sources.
- *
- * If no nodes with wisdom 1 exist, a split-brain has occured.
- */
-
-static void
-afr_sh_compute_wisdom (int32_t *pending_matrix[],
- afr_node_character characters[], int child_count)
-{
- int i = 0;
- int j = 0;
+ afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- characters[i].wisdom = 1;
+ if (local->need_full_crawl)
+ afr_selfheal_extract_xattr(this, replies, AFR_DATA_TRANSACTION, NULL,
+ full_heal_mtx_in);
- for (j = 0; j < child_count; j++) {
- if ((characters[j].type == AFR_NODE_WISE)
- && pending_matrix[j][i]) {
+ for (i = 0; i < priv->child_count; i++)
+ if (sinks[i] && !healed_sinks[i])
+ pending[i] = 1;
- characters[i].wisdom = 0;
- }
- }
- }
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (pending[j]) {
+ output_matrix[i][j] = 1;
+ if (type == AFR_ENTRY_TRANSACTION)
+ full_heal_mtx_out[i][j] = 1;
+ } else if (locked_on[j]) {
+ output_matrix[i][j] = -input_matrix[i][j];
+ if (type == AFR_ENTRY_TRANSACTION)
+ full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j];
+ }
}
-}
-
+ }
-static int
-afr_sh_wise_nodes_conflict (afr_node_character *characters,
- int child_count)
-{
- int i = 0;
- int ret = 1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!pending[i])
+ output_dirty[i] = -input_dirty[i];
+ }
- for (i = 0; i < child_count; i++) {
- if ((characters[i].type == AFR_NODE_WISE)
- && characters[i].wisdom == 1) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!locked_on[i])
+            /* Perform the post-op only on subvols that we had locked
+             * and inspected. */
+ continue;
+ if (undid_pending[i])
+ /* We already unset the pending xattrs in
+ * _afr_fav_child_reset_sink_xattrs(). */
+ continue;
- /* There is atleast one bona-fide wise node */
- ret = 0;
- break;
- }
+ xattr = afr_selfheal_output_xattr(this, local->need_full_crawl, type,
+ output_dirty, output_matrix, i,
+ full_heal_mtx_out);
+ if (!xattr) {
+ continue;
}
- return ret;
-}
+ if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) {
+ if (xdata && dict_set_int8(xdata, GF_XATTROP_PURGE_INDEX, 1))
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set"
+ " dict value for %s",
+ GF_XATTROP_PURGE_INDEX);
+ }
+ afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
+ dict_unref(xattr);
+ }
-static int
-afr_sh_mark_wisest_as_sources (int sources[],
- afr_node_character *characters,
- int child_count)
-{
- int nsources = 0;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].wisdom == 1) {
- sources[i] = 1;
- nsources++;
- }
- }
+ if (xdata)
+ dict_unref(xdata);
- return nsources;
+ return 0;
}
-static void
-afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix,
- afr_node_character *characters,
- int32_t child_count)
-{
- int i = 0;
- int j = 0;
- int witness = 0;
-
- GF_ASSERT (witnesses);
- GF_ASSERT (pending_matrix);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
-
- witness = 0;
- for (j = 0; j < child_count; j++) {
- if (i == j)
- continue;
- witness += pending_matrix[i][j];
- }
- witnesses[i] = witness;
- }
+void
+afr_reply_copy(struct afr_reply *dst, struct afr_reply *src)
+{
+ dict_t *xdata = NULL;
+
+ dst->valid = src->valid;
+ dst->op_ret = src->op_ret;
+ dst->op_errno = src->op_errno;
+ dst->prestat = src->prestat;
+ dst->poststat = src->poststat;
+ dst->preparent = src->preparent;
+ dst->postparent = src->postparent;
+ dst->preparent2 = src->preparent2;
+ dst->postparent2 = src->postparent2;
+ if (src->xdata)
+ xdata = dict_ref(src->xdata);
+ else
+ xdata = NULL;
+ if (dst->xdata)
+ dict_unref(dst->xdata);
+ dst->xdata = xdata;
+ if (xdata && dict_get_str_boolean(xdata, "fips-mode-rchecksum",
+ _gf_false) == _gf_true) {
+ memcpy(dst->checksum, src->checksum, SHA256_DIGEST_LENGTH);
+ } else {
+ memcpy(dst->checksum, src->checksum, MD5_DIGEST_LENGTH);
+ }
+ dst->fips_mode_rchecksum = src->fips_mode_rchecksum;
}
-static int32_t
-afr_find_biggest_witness_among_fools (int32_t *witnesses,
- afr_node_character *characters,
- int32_t child_count)
+void
+afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count)
{
- int i = 0;
- int biggest_witness = -1;
+ int i = 0;
- GF_ASSERT (witnesses);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
+ if (dst == src)
+ return;
- if (biggest_witness < witnesses[i])
- biggest_witness = witnesses[i];
- }
- return biggest_witness;
+ for (i = 0; i < count; i++) {
+ afr_reply_copy(&dst[i], &src[i]);
+ }
}
int
-afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses,
- afr_node_character *characters,
- int32_t child_count, int32_t witness)
+afr_selfheal_fill_dirty(xlator_t *this, int *dirty, int subvol, int idx,
+ dict_t *xdata)
{
- int i = 0;
- int nsources = 0;
-
- GF_ASSERT (sources);
- GF_ASSERT (witnesses);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
+ void *pending_raw = NULL;
+ int pending[3] = {
+ 0,
+ };
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
-
- if (witness == witnesses[i]) {
- sources[i] = 1;
- nsources++;
- }
- }
- return nsources;
-}
+ if (!dirty)
+ return 0;
-static int
-afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
- afr_node_character *characters,
- int child_count)
-{
- int32_t biggest_witness = 0;
- int nsources = 0;
- int32_t *witnesses = NULL;
+ if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw))
+ return -1;
- GF_ASSERT (child_count > 0);
+ if (!pending_raw)
+ return -1;
- witnesses = GF_CALLOC (child_count, sizeof (*witnesses),
- gf_afr_mt_int32_t);
- if (NULL == witnesses) {
- nsources = -1;
- goto out;
- }
+ memcpy(pending, pending_raw, sizeof(pending));
- afr_compute_witness_of_fools (witnesses, pending_matrix, characters,
- child_count);
- biggest_witness = afr_find_biggest_witness_among_fools (witnesses,
- characters,
- child_count);
- nsources = afr_mark_fool_as_source_by_witness (sources, witnesses,
- characters, child_count,
- biggest_witness);
-out:
- if (witnesses)
- GF_FREE (witnesses);
- return nsources;
-}
+ dirty[subvol] = ntoh32(pending[idx]);
-int
-afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs,
- int32_t *valid_children, int child_count,
- uint32_t uid)
-{
- int i = 0;
- int nsources = 0;
- int child = 0;
-
- GF_ASSERT (bufs);
- GF_ASSERT (valid_children);
- GF_ASSERT (sources);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (-1 == valid_children[i])
- continue;
-
- child = valid_children[i];
- if (uid == bufs[child].ia_uid) {
- sources[child] = 1;
- nsources++;
- }
- }
- return nsources;
+ return 0;
}
int
-afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children,
- int child_count)
-{
- int i = 0;
- int smallest = -1;
- int child = 0;
-
- GF_ASSERT (bufs);
- GF_ASSERT (valid_children);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (-1 == valid_children[i])
- continue;
- child = valid_children[i];
- if ((smallest == -1) ||
- (bufs[child].ia_uid < bufs[smallest].ia_uid)) {
- smallest = child;
- }
- }
- return smallest;
-}
-
-static int
-afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children,
- int child_count, int32_t *sources)
+afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx,
+ dict_t *xdata)
{
- int nsources = 0;
- int smallest = 0;
+ int i = 0;
+ void *pending_raw = NULL;
+ int pending[3] = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
- smallest = afr_get_child_with_lowest_uid (bufs, valid_children,
- child_count);
- if (smallest < 0) {
- nsources = -1;
- goto out;
- }
- nsources = afr_mark_child_as_source_by_uid (sources, bufs,
- valid_children, child_count,
- bufs[smallest].ia_uid);
-out:
- return nsources;
-}
+ priv = this->private;
-char *
-afr_get_character_str (afr_node_type type)
-{
- char *character = NULL;
+ if (!matrix)
+ return 0;
- switch (type) {
- case AFR_NODE_INNOCENT:
- character = "innocent";
- break;
- case AFR_NODE_FOOL:
- character = "fool";
- break;
- case AFR_NODE_WISE:
- character = "wise";
- break;
- default:
- character = "invalid";
- break;
- }
- return character;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+ continue;
-afr_node_type
-afr_find_child_character_type (int32_t *pending_row, int32_t child,
- int32_t child_count, const char *xlator_name)
-{
- afr_node_type type = AFR_NODE_INVALID;
+ if (!pending_raw)
+ continue;
- GF_ASSERT (pending_row);
- GF_ASSERT (child_count > 0);
- GF_ASSERT ((child >= 0) && (child < child_count));
+ memcpy(pending, pending_raw, sizeof(pending));
- if (afr_sh_is_innocent (pending_row, child_count))
- type = AFR_NODE_INNOCENT;
- else if (afr_sh_is_fool (pending_row, child, child_count))
- type = AFR_NODE_FOOL;
- else if (afr_sh_is_wise (pending_row, child, child_count))
- type = AFR_NODE_WISE;
- else
- GF_ASSERT (0);
+ matrix[subvol][i] = ntoh32(pending[idx]);
+ }
- gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s",
- child, afr_get_character_str (type));
- return type;
+ return 0;
}
int
-afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs,
- int32_t **pending_matrix, int32_t *sources,
- int32_t *success_children, afr_transaction_type type)
+afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix)
{
- afr_private_t *priv = NULL;
- afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
- int nsources = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ dict_t *xdata = NULL;
+ int idx = -1;
- priv = this->private;
+ idx = afr_index_for_transaction_type(type);
- if (afr_get_children_count (success_children, priv->child_count) == 0)
- goto out;
+ priv = this->private;
- afr_build_pending_matrix (priv->pending_key, pending_matrix,
- xattr, type, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
- sh_type = afr_self_heal_type_for_transaction (type);
- if (AFR_SELF_HEAL_INVALID == sh_type)
- goto out;
+ if (!replies[i].xdata)
+ continue;
- afr_sh_print_pending_matrix (pending_matrix, this);
+ xdata = replies[i].xdata;
- nsources = afr_mark_sources (sources, pending_matrix, bufs,
- priv->child_count, sh_type,
- success_children, this->name);
-out:
- return nsources;
+ afr_selfheal_fill_dirty(this, dirty, i, idx, xdata);
+ afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
+ }
+
+ return 0;
}
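+/* Editor's note (illustrative, not part of the upstream change): in the
+ * extraction above, for every brick i that answered, the changelog xattr
+ * named priv->pending_key[j] in replies[i].xdata is decoded into
+ * matrix[i][j], i.e. matrix[i][j] holds the number of operations brick i has
+ * recorded as pending against brick j for the given transaction type. A
+ * fully zero matrix means no brick accuses any other for that type. */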
-/**
- * mark_sources: Mark all 'source' nodes and return number of source
- * nodes found
- *
- * A node (a row in the pending matrix) belongs to one of
- * three categories:
- *
- * M is the pending matrix.
- *
- * 'innocent' - M[i] is all zeroes
- * 'fool' - M[i] has i'th element = 1 (self-reference)
- * 'wise' - M[i] has i'th element = 0, others are 1 or 0.
- *
- * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is
- * needed.
- *
- * A 'wise' node can be a source. If two 'wise' nodes conflict, it is
- * a split-brain. If one wise node refers to the other but the other doesn't
- * refer back, the referrer is a source.
+/*
+ * If by chance there are multiple sources with differing sizes, select
+ * the largest file as the source.
*
- * All fools are sinks, unless there are no 'wise' nodes. In that case,
- * one of the fools is made a source.
+ * This can happen if the data was modified directly on the backend, or in the case of snapshots.
*/
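+/* Editor's example (illustrative, not part of the upstream change): with
+ * three bricks reporting sizes {8192, 8192, 512} and sources = {1, 1, 1} on
+ * entry, the routine below keeps 8192 as the largest size and clears
+ * sources[2], leaving sources = {1, 1, 0}. */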
-
-int
-afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs,
- int32_t child_count, afr_self_heal_type type,
- int32_t *valid_children, const char *xlator_name)
+void
+afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
{
- /* stores the 'characters' (innocent, fool, wise) of the nodes */
-
- afr_node_character *characters = NULL;
- int i = 0;
- int nsources = -1;
- xlator_t *this = NULL;
-
- characters = GF_CALLOC (sizeof (afr_node_character),
- child_count, gf_afr_mt_afr_node_character);
- if (!characters)
- goto out;
-
- this = THIS;
-
- /* start clean */
- for (i = 0; i < child_count; i++) {
- sources[i] = 0;
- }
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t size = 0;
- nsources = 0;
- for (i = 0; i < child_count; i++) {
- characters[i].type =
- afr_find_child_character_type (pending_matrix[i], i,
- child_count,
- xlator_name);
- if (AFR_NODE_INVALID == characters[i].type)
- gf_log (xlator_name, GF_LOG_WARNING,
- "child %d had invalid xattrs", i);
+ /* Find source with biggest file size */
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (!replies[i].valid || replies[i].op_ret != 0) {
+ sources[i] = 0;
+ continue;
}
-
- if ((type == AFR_SELF_HEAL_METADATA)
- && afr_sh_all_nodes_innocent (characters, child_count)) {
-
- nsources = afr_sh_mark_lowest_uid_as_source (bufs,
- valid_children,
- child_count,
- sources);
- goto out;
+ if (size <= replies[i].poststat.ia_size) {
+ size = replies[i].poststat.ia_size;
}
+ }
- if (afr_sh_wise_nodes_exist (characters, child_count)) {
- afr_sh_compute_wisdom (pending_matrix, characters, child_count);
+ /* Mark sources with less size as not source */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (size > replies[i].poststat.ia_size)
+ sources[i] = 0;
+ }
+}
- if (afr_sh_wise_nodes_conflict (characters, child_count)) {
- /* split-brain */
- gf_log (this->name, GF_LOG_INFO,
- "split-brain possible, no source detected");
- nsources = -1;
+void
+afr_mark_latest_mtime_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint32_t mtime = 0;
+ uint32_t mtime_nsec = 0;
+
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (!replies[i].valid || replies[i].op_ret != 0) {
+ sources[i] = 0;
+ continue;
+ }
+ if ((mtime < replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
+ mtime = replies[i].poststat.ia_mtime;
+ mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if ((mtime > replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) {
+ sources[i] = 0;
+ }
+ }
+}
- } else {
- nsources = afr_sh_mark_wisest_as_sources (sources,
- characters,
- child_count);
- }
- } else {
- nsources = afr_mark_biggest_of_fools_as_source (sources,
- pending_matrix,
- characters,
- child_count);
- }
+void
+afr_mark_active_sinks(xlator_t *this, unsigned char *sources,
+ unsigned char *locked_on, unsigned char *sinks)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
-out:
- if (nsources == 0) {
- for (i = 0; i < child_count; i++) {
- if (valid_children[i] != -1)
- sources[valid_children[i]] = 1;
- }
- }
- if (characters)
- GF_FREE (characters);
+ priv = this->private;
- gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources);
- return nsources;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] && locked_on[i])
+ sinks[i] = 1;
+ else
+ sinks[i] = 0;
+ }
}
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], int success[],
- int child_count, afr_transaction_type type)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- int ret = 0;
- int i = 0;
- int j = 0;
- int k = 0;
-
- /* start clean */
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- delta_matrix[i][j] = 0;
- }
- }
+gf_boolean_t
+afr_dict_contains_heal_op(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ dict_t *xdata_req = NULL;
+ int ret = 0;
+ int heal_op = -1;
- for (i = 0; i < child_count; i++) {
- if (pending_raw)
- pending_raw = NULL;
+ local = frame->local;
+ xdata_req = local->xdata_req;
+ ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
+ if (ret)
+ return _gf_false;
+ if (local->xdata_rsp == NULL) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp)
+ return _gf_true;
+ }
+ ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg",
+ SFILE_NOT_IN_SPLIT_BRAIN);
- for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], priv->pending_key[j],
- &pending_raw);
- if (ret < 0)
- gf_log (THIS->name, GF_LOG_DEBUG,
- "Unable to get dict value.");
- if (!success[j])
- continue;
+ return _gf_true;
+}
- k = afr_index_for_transaction_type (type);
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks(struct afr_reply *replies,
+ int child_count)
+{
+ int i = 0;
- if (pending_raw != NULL) {
- memcpy (pending, pending_raw, sizeof(pending));
- delta_matrix[i][j] = -(ntoh32 (pending[k]));
- } else {
- delta_matrix[i][j] = 0;
- }
+ for (i = 0; i < child_count; i++)
+ if (replies[i].valid != 1 || replies[i].op_ret != 0)
+ return _gf_false;
- }
- }
+ return _gf_true;
}
+int
+afr_mark_split_brain_source_sinks_by_heal_op(
+ call_frame_t *frame, xlator_t *this, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type, int heal_op)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata_req = NULL;
+ dict_t *xdata_rsp = NULL;
+ int ret = 0;
+ int i = 0;
+ char *name = NULL;
+ int source = -1;
+
+ local = frame->local;
+ priv = this->private;
+ xdata_req = local->xdata_req;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i])
+ if (sources[i] || !sinks[i] || !healed_sinks[i]) {
+ ret = -1;
+ goto out;
+ }
+ }
+ if (local->xdata_rsp == NULL) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ ret = -1;
+ goto out;
+ }
+ }
+ xdata_rsp = local->xdata_rsp;
+
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SBRAIN_HEAL_NO_GO_MSG);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++)
+ if (locked_on[i])
+ sources[i] = 1;
+ switch (heal_op) {
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ if (type == AFR_METADATA_TRANSACTION) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SUSE_SOURCE_BRICK_TO_HEAL);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ afr_mark_largest_file_as_source(this, sources, replies);
+ if (AFR_COUNT(sources, priv->child_count) != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SNO_BIGGER_FILE);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ break;
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+ if (type == AFR_METADATA_TRANSACTION) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SUSE_SOURCE_BRICK_TO_HEAL);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ afr_mark_latest_mtime_file_as_source(this, sources, replies);
+ if (AFR_COUNT(sources, priv->child_count) != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SNO_DIFF_IN_MTIME);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ break;
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+ ret = dict_get_str_sizen(xdata_req, "child-name", &name);
+ if (ret)
+ goto out;
+ source = afr_get_child_index_from_name(this, name);
+ if (source < 0) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SINVALID_BRICK_NAME);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ if (locked_on[source] != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SBRICK_IS_NOT_UP);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ sources[source] = 1;
+ break;
+ default:
+ ret = -1;
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+ sinks[source] = 0;
+ healed_sinks[source] = 0;
+ ret = source;
+out:
+ if (ret < 0)
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ return ret;
+}
int
-afr_sh_delta_to_xattr (afr_private_t *priv,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type)
-{
- int i = 0;
- int j = 0;
- int k = 0;
- int ret = 0;
- int32_t *pending = NULL;
-
- for (i = 0; i < child_count; i++) {
- if (!xattr[i])
- continue;
-
- for (j = 0; j < child_count; j++) {
- pending = GF_CALLOC (sizeof (int32_t), 3,
- gf_afr_mt_int32_t);
-
- if (!pending)
- continue;
- /* 3 = data+metadata+entry */
-
- k = afr_index_for_transaction_type (type);
-
- pending[k] = hton32 (delta_matrix[i][j]);
-
- ret = dict_set_bin (xattr[i], priv->pending_key[j],
- pending,
- 3 * sizeof (int32_t));
- if (ret < 0)
- gf_log (THIS->name, GF_LOG_WARNING,
- "Unable to set dict value.");
+afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode)
+{
+ afr_private_t *priv;
+ int vote_count = -1;
+ int fav_child = -1;
+ int i = 0;
+ int k = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64
+ " for gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_mtime,
+ replies[i].poststat.ia_size, uuid_utoa(inode->gfid));
+ vote_count = 0;
+ for (k = 0; k < priv->child_count; k++) {
+ if ((replies[k].poststat.ia_mtime ==
+ replies[i].poststat.ia_mtime) &&
+ (replies[k].poststat.ia_size ==
+ replies[i].poststat.ia_size)) {
+ vote_count++;
}
+ }
+ if (vote_count > priv->child_count / 2) {
+ fav_child = i;
+ break;
+ }
}
- return 0;
+ }
+ return fav_child;
}
-
+/*
+ * afr_sh_fav_by_mtime: Choose favorite child by mtime.
+ */
int
-afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
-
- if (ret != 0)
- return 0;
-
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
-
- if (pending[j])
- return 1;
- }
-
- return 0;
+afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint32_t cmp_mtime = 0;
+ uint32_t cmp_mtime_nsec = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s mtime = %" PRId64
+ ", mtime_nsec = %d for "
+ "gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_mtime,
+ replies[i].poststat.ia_mtime_nsec,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_mtime > cmp_mtime) {
+ cmp_mtime = replies[i].poststat.ia_mtime;
+ cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ fav_child = i;
+ } else if ((replies[i].poststat.ia_mtime == cmp_mtime) &&
+ (replies[i].poststat.ia_mtime_nsec > cmp_mtime_nsec)) {
+ cmp_mtime = replies[i].poststat.ia_mtime;
+ cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ fav_child = i;
+ }
+ }
+ }
+ return fav_child;
}
-
+/*
+ * afr_sh_fav_by_ctime: Choose favorite child by ctime.
+ */
int
-afr_sh_has_data_pending (dict_t *xattr, xlator_t *this)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
-
- if (ret != 0)
- return 0;
-
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
-
- if (pending[j])
- return 1;
- }
-
- return 0;
+afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint32_t cmp_ctime = 0;
+ uint32_t cmp_ctime_nsec = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s ctime = %" PRId64
+ ", ctime_nsec = %d for "
+ "gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_ctime,
+ replies[i].poststat.ia_ctime_nsec,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_ctime > cmp_ctime) {
+ cmp_ctime = replies[i].poststat.ia_ctime;
+ cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
+ fav_child = i;
+ } else if ((replies[i].poststat.ia_ctime == cmp_ctime) &&
+ (replies[i].poststat.ia_ctime_nsec > cmp_ctime_nsec)) {
+ cmp_ctime = replies[i].poststat.ia_ctime;
+ cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
+ fav_child = i;
+ }
+ }
+ }
+ return fav_child;
}
+/*
+ * afr_sh_fav_by_size: Choose favorite child by size
+ * when not all files are of zero size.
+ */
+int
+afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint64_t cmp_sz = 0;
+
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid) {
+ continue;
+ }
+ gf_msg_debug(this->name, 0,
+ "Child:%s file size = %" PRIu64 " for gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_size,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_type == IA_IFDIR) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Cannot perform selfheal on %s. "
+ "Size policy is not applicable to directories.",
+ uuid_utoa(inode->gfid));
+ break;
+ }
+ if (replies[i].poststat.ia_size > cmp_sz) {
+ cmp_sz = replies[i].poststat.ia_size;
+ fav_child = i;
+ } else if (replies[i].poststat.ia_size == cmp_sz) {
+ fav_child = -1;
+ }
+ }
+ if (fav_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "No bigger file");
+ }
+ return fav_child;
+}
int
-afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this)
+afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, char **policy_str)
{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
+ afr_private_t *priv = NULL;
+ int fav_child = -1;
- priv = this->private;
+ priv = this->private;
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ return -1;
+ }
+
+ switch (priv->fav_child_policy) {
+ case AFR_FAV_CHILD_BY_SIZE:
+ fav_child = afr_sh_fav_by_size(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "SIZE";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_CTIME:
+ fav_child = afr_sh_fav_by_ctime(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "CTIME";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_MTIME:
+ fav_child = afr_sh_fav_by_mtime(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "MTIME";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_MAJORITY:
+ fav_child = afr_sh_fav_by_majority(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "MAJORITY";
+ }
+ break;
+ case AFR_FAV_CHILD_NONE:
+ default:
+ break;
+ }
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
+ return fav_child;
+}
- if (ret != 0)
- return 0;
+int
+afr_mark_split_brain_source_sinks_by_policy(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type)
+{
+ afr_private_t *priv = NULL;
+ int fav_child = -1;
+ char mtime_str[256];
+ char ctime_str[256];
+ char *policy_str = NULL;
+ struct tm *tm_ptr;
+ time_t time;
+
+ priv = this->private;
+
+ fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str);
+ if (fav_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "No child selected by favorite-child policy.");
+ } else if (fav_child > priv->child_count - 1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Invalid child (%d) "
+ "selected by policy %s.",
+ fav_child, policy_str);
+ } else if (fav_child >= 0) {
+ time = replies[fav_child].poststat.ia_mtime;
+ tm_ptr = localtime(&time);
+ strftime(mtime_str, sizeof(mtime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
+ time = replies[fav_child].poststat.ia_ctime;
+ tm_ptr = localtime(&time);
+ strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Source %s selected as authentic to resolve conflicting data "
+ "in file (gfid:%s) by %s (%" PRIu64
+ " bytes @ %s mtime, %s "
+ "ctime).",
+ priv->children[fav_child]->name, uuid_utoa(inode->gfid),
+ policy_str, replies[fav_child].poststat.ia_size, mtime_str,
+ ctime_str);
+
+ sources[fav_child] = 1;
+ sinks[fav_child] = 0;
+ healed_sinks[fav_child] = 0;
+ }
+ return fav_child;
+}
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION);
+gf_boolean_t
+afr_is_file_empty_on_all_children(afr_private_t *priv,
+ struct afr_reply *replies)
+{
+ int i = 0;
- if (pending[j])
- return 1;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!replies[i].valid) || (replies[i].op_ret != 0) ||
+ (replies[i].poststat.ia_size != 0))
+ return _gf_false;
+ }
- return 0;
+ return _gf_true;
}
+int
+afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ int source = -1;
+ int i = 0;
+ afr_private_t *priv = this->private;
+ struct iatt stbuf = {
+ 0,
+ };
+
+ if ((AFR_COUNT(locked_on, priv->child_count) < priv->child_count) ||
+ (afr_success_count(replies, priv->child_count) < priv->child_count))
+ return -1;
-/**
- * is_matrix_zero - return true if pending matrix is all zeroes
+ if (type == AFR_DATA_TRANSACTION) {
+ if (!afr_is_file_empty_on_all_children(priv, replies))
+ return -1;
+ goto mark;
+ }
+
+    /* For AFR_METADATA_TRANSACTION, the metadata must be the same on all
+     * bricks. */
+ stbuf = replies[0].poststat;
+ for (i = 1; i < priv->child_count; i++) {
+ if ((!IA_EQUAL(stbuf, replies[i].poststat, type)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, uid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, gid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, prot)))
+ return -1;
+ }
+ for (i = 1; i < priv->child_count; i++) {
+ if (!afr_xattrs_are_equal(replies[0].xdata, replies[i].xdata))
+ return -1;
+ }
+
+mark:
+    /* The data/metadata is the same on all bricks. Pick one of them as the
+     * source; the rest are sinks. */
+ for (i = 0; i < priv->child_count; i++) {
+ if (source == -1) {
+ source = i;
+ sources[i] = 1;
+ sinks[i] = 0;
+ healed_sinks[i] = 0;
+ continue;
+ }
+ sources[i] = 0;
+ sinks[i] = 1;
+ healed_sinks[i] = 1;
+ }
+
+ return source;
+}
+
+/* Return a source depending on the type of heal_op, and set sources[source],
+ * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
+ * only if the following condition is met:
+ * ∀i (locked_on[i]=1 ==> (sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
+ * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
+ * sinks[node] are 1. This should be the case if the file is in split-brain.
*/
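+/* Editor's example (illustrative, not part of the upstream change): on a
+ * 2-brick replica in split-brain the caller arrives with locked_on = {1, 1},
+ * sources = {0, 0}, sinks = {1, 1} and healed_sinks = {1, 1}. A
+ * GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK request naming brick 0 then yields
+ * sources = {1, 0}, sinks = {0, 1}, healed_sinks = {0, 1} and returns 0 as
+ * the source. */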
+int
+afr_mark_split_brain_source_sinks(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata_req = NULL;
+ int heal_op = -1;
+ int ret = -1;
+ int source = -1;
+
+ local = frame->local;
+ priv = this->private;
+ xdata_req = local->xdata_req;
+
+ source = afr_mark_source_sinks_if_file_empty(
+ this, sources, sinks, healed_sinks, locked_on, replies, type);
+ if (source >= 0)
+ return source;
+
+ ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
+ if (ret)
+ goto autoheal;
+
+ source = afr_mark_split_brain_source_sinks_by_heal_op(
+ frame, this, sources, sinks, healed_sinks, locked_on, replies, type,
+ heal_op);
+ return source;
+
+autoheal:
+ /* Automatically heal if fav_child_policy is set. */
+ if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
+ source = afr_mark_split_brain_source_sinks_by_policy(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ replies, type);
+ if (source != -1) {
+ ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1);
+ if (ret)
+ return -1;
+ }
+ }
+
+ return source;
+}
int
-afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count)
-{
- int i = 0;
- int j = 0;
+_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (!dict_get_sizen(local->xdata_req, "fav-child-policy"))
+ return 0;
- for (i = 0; i < child_count; i++)
- for (j = 0; j < child_count; j++)
- if (pending_matrix[i][j])
- return 0;
- return 1;
-}
+ xdata = dict_new();
+ if (!xdata)
+ return -1;
+ input_dirty = alloca0(priv->child_count * sizeof(int));
+ input_matrix = ALLOC_MATRIX(priv->child_count, int);
+ output_dirty = alloca0(priv->child_count * sizeof(int));
+ output_matrix = ALLOC_MATRIX(priv->child_count, int);
-int
-afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
+ output_dirty[i] = -input_dirty[i];
+ output_matrix[i][source] = -input_matrix[i][source];
+ }
-// memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
- memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] || !locked_on[i])
+ continue;
+ xattr = afr_selfheal_output_xattr(this, _gf_false, type, output_dirty,
+ output_matrix, i, NULL);
- for (i = 0; i < priv->child_count; i++) {
- sh->locked_nodes[i] = 0;
- }
+ afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i])
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
+ undid_pending[i] = 1;
+ dict_unref(xattr);
+ }
- if (local->govinda_gOvinda || sh->op_failed) {
- gf_log (this->name, GF_LOG_INFO,
- "split brain found, aborting selfheal of %s",
- local->loc.path);
- sh->op_failed = 1;
- sh->completion_cbk (frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to metadata check on %s",
- local->loc.path);
- afr_self_heal_metadata (frame, this);
- }
+ if (xdata)
+ dict_unref(xdata);
- return 0;
+ return 0;
}
-
-static int
-afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_does_witness_exist(xlator_t *this, uint64_t *witness)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ priv = this->private;
- int_lock->lock_cbk = afr_sh_missing_entries_done;
- afr_unlock (frame, this);
-
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (witness[i])
+ return _gf_true;
+ }
+ return _gf_false;
}
-int
-afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count)
+unsigned int
+afr_get_quorum_count(afr_private_t *priv)
{
- int ret = -ENOMEM;
- sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf),
- gf_afr_mt_iatt);
- if (!sh->buf)
- goto out;
- sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs),
- gf_afr_mt_iatt);
- if (!sh->parentbufs)
- goto out;
- sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno),
- gf_afr_mt_int);
- if (!sh->child_errno)
- goto out;
- sh->child_success = afr_children_create (child_count);
- if (!sh->child_success)
- goto out;
- sh->fresh_children = afr_children_create (child_count);
- if (!sh->fresh_children)
- goto out;
- sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr),
- gf_afr_mt_dict_t);
- if (!sh->xattr)
- goto out;
- ret = 0;
-out:
- return ret;
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ return priv->child_count / 2 + 1;
+ } else {
+ return priv->quorum_count;
+ }
}
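+/* Editor's note (illustrative, not part of the upstream change): with the
+ * default AFR_QUORUM_AUTO a replica-3 volume yields a quorum of
+ * 3 / 2 + 1 = 2 bricks, while an explicitly configured quorum-count is
+ * returned as-is. */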
void
-afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- dict_t *xattr, struct iatt *postparent,
- loc_t *loc)
-{
- int child_index = 0;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0) {
- sh->buf[child_index] = *buf;
- sh->parentbuf = *postparent;
- sh->parentbufs[child_index] = *postparent;
- sh->child_success[sh->success_count] = child_index;
- sh->success_count++;
- sh->xattr[child_index] = dict_ref (xattr);
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "path %s on subvolume %s => -1 (%s)",
- loc->path, priv->children[child_index]->name,
- strerror (op_errno));
- local->self_heal.child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
+afr_selfheal_post_op_failure_accounting(afr_private_t *priv, char *accused,
+ unsigned char *sources,
+ unsigned char *locked_on)
+{
+ int i = 0;
+ unsigned int quorum_count = 0;
+
+ if (AFR_COUNT(sources, priv->child_count) != 0)
return;
-}
-gf_boolean_t
-afr_valid_ia_type (ia_type_t ia_type)
-{
- switch (ia_type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- case IA_IFLNK:
- case IA_IFDIR:
- return _gf_true;
- default:
- return _gf_false;
+ quorum_count = afr_get_quorum_count(priv);
+ for (i = 0; i < priv->child_count; i++) {
+ if ((accused[i] < quorum_count) && locked_on[i]) {
+ sources[i] = 1;
}
- return _gf_false;
+ }
+ return;
}
-int
-afr_impunge_frame_create (call_frame_t *frame, xlator_t *this,
- int active_source, int ret_child, mode_t entry_mode,
- call_frame_t **impunge_frame)
-{
- afr_local_t *local = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int32_t op_errno = 0;
- afr_private_t *priv = NULL;
- int ret = 0;
- call_frame_t *new_frame = NULL;
+/*
+ * This function determines if a self-heal is required for a given inode,
+ * and if needed, in what direction.
+ *
+ * locked_on[] is the array representing servers which have been locked and
+ * from which xattrs have been fetched for analysis.
+ *
+ * The function reports its result by filling the arrays sources[] and sinks[].
+ *
+ * sources[i] is set if i'th server is an eligible source for a selfheal.
+ *
+ * sinks[i] is set if i'th server needs to be healed.
+ *
+ * if sources[0..N] are all set, there is no need for a selfheal.
+ *
+ * if sinks[0..N] are all set, the inode is in split brain.
+ *
+ */
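+/* Editor's example (illustrative, not part of the upstream change): on a
+ * 3-brick replica with locked_on = {1, 1, 1}, a changelog matrix of
+ *
+ *        [ 0 0 1 ]
+ *        [ 0 0 1 ]
+ *        [ 0 0 0 ]
+ *
+ * (bricks 0 and 1 accuse brick 2, nobody accuses itself) yields
+ * sources = {1, 1, 0} and sinks = {0, 0, 1}, i.e. brick 2 needs healing.
+ * If instead the bricks only accuse each other without any self-accusal and
+ * no source survives, every locked brick becomes a sink and PFLAG_SBRAIN is
+ * reported when pflag is supplied. */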
- op_errno = ENOMEM;
- priv = this->private;
- new_frame = copy_frame (frame);
- if (!new_frame) {
- goto out;
+int
+afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ uint64_t *witness, unsigned char *pflag)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int j = 0;
+ int *dirty = NULL; /* Denotes if dirty xattr is set */
+ int **matrix = NULL; /* Changelog matrix */
+ char *accused = NULL; /* Accused others without any self-accusal */
+ char *pending = NULL; /* Have pending operations on others */
+ char *self_accused = NULL; /* Accused itself */
+
+ priv = this->private;
+
+ dirty = alloca0(priv->child_count * sizeof(int));
+ accused = alloca0(priv->child_count);
+ pending = alloca0(priv->child_count);
+ self_accused = alloca0(priv->child_count);
+ matrix = ALLOC_MATRIX(priv->child_count, int);
+ memset(witness, 0, sizeof(*witness) * priv->child_count);
+
+ /* First construct the pending matrix for further analysis */
+ afr_selfheal_extract_xattr(this, replies, type, dirty, matrix);
+
+ if (pflag) {
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++)
+ if (matrix[i][j])
+ *pflag |= PFLAG_PENDING;
+ if (*pflag)
+ break;
+ }
+ }
+
+ if (afr_success_count(replies, priv->child_count) < priv->child_count) {
+ /* Treat this just like locks not being acquired */
+ return -ENOTCONN;
+ }
+
+ /* short list all self-accused */
+ for (i = 0; i < priv->child_count; i++) {
+ if (matrix[i][i])
+ self_accused[i] = 1;
+ }
+
+ /* Next short list all accused to exclude them from being sources */
+ /* Self-accused can't accuse others as they are FOOLs */
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j]) {
+ if (!self_accused[i])
+ accused[j] += 1;
+ if (i != j)
+ pending[i] += 1;
+ }
+ }
+ }
+
+ /* Short list all non-accused as sources */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!accused[i] && locked_on[i])
+ sources[i] = 1;
+ else
+ sources[i] = 0;
+ }
+
+ /* Everyone accused by non-self-accused sources are sinks */
+ memset(sinks, 0, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (self_accused[i])
+ continue;
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j])
+ sinks[j] = 1;
+ }
+ }
+
+    /* For breaking ties, record the number of fops each brick witnessed */
+
+ /*
+ * count the pending fops witnessed from itself to others when it is
+ * self-accused
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!self_accused[i])
+ continue;
+ for (j = 0; j < priv->child_count; j++) {
+ if (i == j)
+ continue;
+ witness[i] += matrix[i][j];
+ }
+ }
+
+ if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
+ afr_selfheal_post_op_failure_accounting(priv, accused, sources,
+ locked_on);
+
+ /* If no sources, all locked nodes are sinks - split brain */
+ if (AFR_COUNT(sources, priv->child_count) == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i])
+ sinks[i] = 1;
+ }
+ if (pflag)
+ *pflag |= PFLAG_SBRAIN;
+ }
+
+    /* One more class of witness, similar to dirty in v2, is where no pending
+     * count on others exists but we have self-accusing markers. This can
+     * happen in afr-v1 if the brick crashes just after the pre-op xattrop
+     * updates its own changelog xattr but before it updates the xattrs for
+     * the other bricks on that same brick. */
+ if (AFR_COUNT(pending, priv->child_count) == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (self_accused[i])
+ witness[i] += matrix[i][i];
}
+ } else {
+ /* In afr-v1 if a file is self-accused and has pending
+ * operations on others then it is similar to 'dirty' in afr-v2.
+ * Consider such cases as witness.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (self_accused[i] && pending[i])
+ witness[i] += matrix[i][i];
+ }
+ }
- ALLOC_OR_GOTO (impunge_local, afr_local_t, out);
-
- local = frame->local;
- sh = &local->self_heal;
- new_frame->local = impunge_local;
- impunge_sh = &impunge_local->self_heal;
- impunge_sh->sh_frame = frame;
- impunge_sh->active_source = active_source;
- impunge_sh->impunge_ret_child = ret_child;
- impunge_sh->impunging_entry_mode = entry_mode;
- impunge_local->child_up = memdup (local->child_up,
- sizeof (*local->child_up) *
- priv->child_count);
- if (!impunge_local->child_up)
- goto out;
+ /* count the number of dirty fops witnessed */
+ for (i = 0; i < priv->child_count; i++)
+ witness[i] += dirty[i];
- ret = afr_sh_common_create (impunge_sh, priv->child_count);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
- op_errno = 0;
- *impunge_frame = new_frame;
-out:
- if (op_errno && new_frame)
- AFR_STACK_DESTROY (new_frame);
- return -op_errno;
+ return 0;
}
void
-afr_sh_call_entry_impunge_recreate (call_frame_t *frame, xlator_t *this,
- int child_index, struct iatt *buf,
- struct iatt *postparent,
- afr_impunge_done_cbk_t impunge_done)
-{
- call_frame_t *impunge_frame = NULL;
- afr_local_t *local = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = 0;
- mode_t mode = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- mode = st_mode_from_ia (buf->ia_prot, buf->ia_type);
- ret = afr_impunge_frame_create (frame, this, sh->source, child_index,
- mode, &impunge_frame);
- if (ret)
- goto out;
- impunge_local = impunge_frame->local;
- loc_copy (&impunge_local->loc, &local->loc);
- sh->impunge_done = impunge_done;
- impunge_local->call_count = 1;
- afr_sh_entry_impunge_create (impunge_frame, this, child_index, buf,
- postparent);
- return;
-out:
- gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, reason: %s",
- local->loc.path, strerror (-ret));
- impunge_done (frame, this, child_index, -1, -ret);
+afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ char *status = NULL;
+ char *sinks_str = NULL;
+ char *p = NULL;
+ char *sources_str = NULL;
+ char *q = NULL;
+ afr_private_t *priv = NULL;
+ gf_loglevel_t loglevel = GF_LOG_NONE;
+ int i = 0;
+
+ priv = this->private;
+ sinks_str = alloca0(priv->child_count * 8);
+ p = sinks_str;
+ sources_str = alloca0(priv->child_count * 8);
+ q = sources_str;
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i])
+ p += sprintf(p, "%d ", i);
+ if (sources[i]) {
+ if (source == i) {
+ q += sprintf(q, "[%d] ", i);
+ } else {
+ q += sprintf(q, "%d ", i);
+ }
+ }
+ }
+
+ if (ret < 0) {
+ status = "Failed";
+ loglevel = GF_LOG_DEBUG;
+ } else {
+ status = "Completed";
+ loglevel = GF_LOG_INFO;
+ }
+
+ gf_msg(this->name, loglevel, 0, AFR_MSG_SELF_HEAL_INFO,
+ "%s %s selfheal on %s. "
+ "sources=%s sinks=%s",
+ status, type, uuid_utoa(gfid), sources_str, sinks_str);
}
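
The logging helper above prints the chosen source inside square brackets. A tiny sketch of that formatting, with a hypothetical 3-brick source array:

#include <stdio.h>

int
main(void)
{
    unsigned char sources[3] = {1, 1, 0};
    int chosen = 1, i;
    char buf[64] = "", *q = buf;

    for (i = 0; i < 3; i++) {
        if (!sources[i])
            continue;
        if (i == chosen)
            q += sprintf(q, "[%d] ", i); /* bracket the source that was used */
        else
            q += sprintf(q, "%d ", i);
    }
    printf("sources=%s\n", buf); /* prints: sources=0 [1] */
    return 0;
}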
int
-afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, int child,
- int32_t op_ret, int32_t op_errno)
+afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *parbuf)
{
- int call_count = 0;
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ GF_UNUSED int ret = -1;
+ int8_t need_heal = 1;
- local = frame->local;
+ local = frame->local;
+ i = (long)cookie;
- if (op_ret == -1)
- gf_log (this->name, GF_LOG_ERROR,
- "create entry %s failed, on child %d reason, %s",
- local->loc.path, child, strerror (op_errno));
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- afr_sh_missing_entries_finish (frame, this);
- return 0;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (buf)
+ local->replies[i].poststat = *buf;
+ if (parbuf)
+ local->replies[i].postparent = *parbuf;
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ }
+
+ local->replies[i].need_heal = need_heal;
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
}
-static int
-sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int type = 0;
- afr_private_t *priv = NULL;
- int enoent_count = 0;
- int i = 0;
- struct iatt *buf = NULL;
- struct iatt *postparent = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- enoent_count = afr_errno_count (NULL, sh->child_errno,
- priv->child_count, ENOENT);
- if (enoent_count == 0) {
- gf_log (this->name, GF_LOG_INFO,
- "no missing files - %s. proceeding to metadata check",
- local->loc.path);
- /* proceed to next step - metadata self-heal */
- afr_sh_missing_entries_finish (frame, this);
- return 0;
- }
+inode_t *
+afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on, dict_t *xattr)
+{
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
- buf = &sh->buf[sh->source];
- postparent = &sh->parentbufs[sh->source];
+ local = frame->local;
+ priv = frame->this->private;
- type = buf->ia_type;
- if (!afr_valid_ia_type (type)) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: unknown file type: 0%o", local->loc.path, type);
- local->govinda_gOvinda = 1;
- afr_sh_missing_entries_finish (frame, this);
- goto out;
- }
+ xattr_req = dict_new();
+ if (!xattr_req)
+ return NULL;
- local->call_count = enoent_count;
- for (i = 0; i < priv->child_count; i++) {
- //If !child_up errno will be zero
- if (sh->child_errno[i] != ENOENT)
- continue;
- afr_sh_call_entry_impunge_recreate (frame, this, i,
- buf, postparent,
- afr_sh_create_entry_cbk);
- enoent_count--;
- }
- GF_ASSERT (enoent_count == 0);
-out:
- return 0;
-}
+ if (xattr)
+ dict_copy(xattr, xattr_req);
-void
-afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- ia_type_t ia_type = IA_INVAL;
- int32_t nsources = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0) {
- if (op_errno == EIO)
- local->govinda_gOvinda = 1;
- // EIO can happen if finding the fresh parent dir failed
- goto out;
- }
+ if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
+ dict_unref(xattr_req);
+ return NULL;
+ }
- //now No chance for the ia_type to conflict
- ia_type = sh->buf[sh->child_success[0]].ia_type;
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->child_success,
- afr_transaction_type_get (ia_type));
- if (nsources < 0) {
- gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s,"
- " in missing entry self-heal, continuing with the rest"
- " of the self-heals", local->loc.path);
- op_errno = EIO;
- goto out;
- }
+ inode = inode_new(parent->table);
+ if (!inode) {
+ dict_unref(xattr_req);
+ return NULL;
+ }
- afr_get_fresh_children (sh->child_success, sh->sources,
- sh->fresh_children, priv->child_count);
- sh->source = sh->fresh_children[0];
- if (sh->source == -1) {
- gf_log (this->name, GF_LOG_DEBUG, "No active sources found.");
- op_errno = EIO;
- goto out;
- }
+ loc.parent = inode_ref(parent);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
- if (sh->gfid_sh_success_cbk)
- sh->gfid_sh_success_cbk (frame, this);
- sh_missing_entries_create (frame, this);
- return;
-out:
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_missing_entries_finish (frame, this);
- return;
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
+
+ afr_replies_copy(replies, local->replies, priv->child_count);
+
+ loc_wipe(&loc);
+ dict_unref(xattr_req);
+
+ return inode;
}
static int
-afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret,
- op_errno, inode, buf, xattr,
- postparent, &sh->lookup_loc);
- call_count = afr_frame_return (frame);
-
- if (call_count)
- goto out;
- op_ret = -1;
- if (!sh->success_count) {
- op_errno = afr_resultant_errno_get (NULL, sh->child_errno,
- priv->child_count);
- gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, "
- "reason %s", sh->lookup_loc.path,
- strerror (op_errno));
- goto done;
- }
+afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ char *key1 = NULL;
+ char *key2 = NULL;
- if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) &&
- (afr_conflicting_iattrs (sh->buf, sh->child_success,
- priv->child_count,
- sh->lookup_loc.path, this->name))) {
- op_errno = EIO;
- gf_log (this->name, GF_LOG_ERROR, "Conflicting entries "
- "for %s", sh->lookup_loc.path);
- goto done;
- }
+ priv = this->private;
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
- if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) &&
- (afr_gfid_missing_count (this->name, sh->child_success,
- sh->buf, priv->child_count,
- sh->lookup_loc.path))) {
- op_errno = ENODATA;
- gf_log (this->name, GF_LOG_ERROR, "Missing Gfids "
- "for %s", sh->lookup_loc.path);
- goto done;
- }
- op_ret = 0;
+ ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1);
+ if (ret)
+ return ret;
-done:
- sh->lookup_done (frame, this, op_ret, op_errno);
-out:
- return 0;
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ ret = dict_set_uint32(dict, key1, 1);
+ if (ret)
+ return ret;
+
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
+ ret = dict_set_uint32(dict, key2, 1);
+ if (ret)
+ return ret;
+
+ return 0;
}
int
-afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child,
- int32_t op_ret, int32_t op_errno)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->post_remove_call);
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- gf_log (this->name, GF_LOG_ERROR,
- "purge entry %s failed, on child %d reason, %s",
- local->loc.path, child, strerror (op_errno));
- LOCK (&frame->lock);
- {
- afr_sh_set_error (sh, EIO);
- sh->op_failed = 1;
- }
- UNLOCK (&frame->lock);
- }
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- sh->post_remove_call (frame, this);
- return 0;
-}
+afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on, dict_t *dict)
+{
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = frame->this->private;
+
+ xattr_req = dict_new();
+ if (!xattr_req)
+ return -ENOMEM;
+ if (dict)
+ dict_copy(dict, xattr_req);
+
+ if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
+ dict_unref(xattr_req);
+ return -ENOMEM;
+ }
+
+ if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) {
+ dict_unref(xattr_req);
+ return -1;
+ }
-void
-afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this,
- int child_index, struct iatt *buf,
- afr_expunge_done_cbk_t expunge_done)
-{
- call_frame_t *expunge_frame = NULL;
- afr_local_t *local = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int32_t op_errno = 0;
-
- expunge_frame = copy_frame (frame);
- if (!expunge_frame) {
- goto out;
- }
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, gfid);
- ALLOC_OR_GOTO (expunge_local, afr_local_t, out);
+ AFR_ONLIST(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
- local = frame->local;
- sh = &local->self_heal;
- expunge_frame->local = expunge_local;
- expunge_sh = &expunge_local->self_heal;
- expunge_sh->sh_frame = frame;
- loc_copy (&expunge_local->loc, &local->loc);
- sh->expunge_done = expunge_done;
- afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf);
- return;
-out:
- gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s",
- local->loc.path, strerror (op_errno));
- expunge_done (frame, this, child_index, -1, op_errno);
-}
+ afr_replies_copy(replies, local->replies, priv->child_count);
-void
-afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children,
- int32_t *fresh_children,
- unsigned int child_count)
-{
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (afr_is_child_present (success_children, child_count, i) &&
- !afr_is_child_present (fresh_children, child_count, i)) {
- sh->child_errno[i] = ENOENT;
- GF_ASSERT (sh->xattr[i]);
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
- }
+ loc_wipe(&loc);
+ dict_unref(xattr_req);
+
+ return 0;
}
int
-afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this)
+afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ dict_t *dict = NULL;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ local = frame->local;
- if (sh->op_failed) {
- afr_sh_missing_entries_finish (frame, this);
- } else {
- if (afr_gfid_missing_count (this->name, sh->fresh_children,
- sh->buf, priv->child_count,
- local->loc.path)) {
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_missing_entries_lookup_done,
- sh->sh_gfid_req,
- AFR_LOOKUP_FAIL_CONFLICTS|
- AFR_LOOKUP_FAIL_MISSING_GFIDS);
- } else {
- //No need to set gfid so goto missing entries lookup done
- //Behave as if you have done the lookup
- afr_sh_remove_stale_lookup_info (sh,
- sh->child_success,
- sh->fresh_children,
- priv->child_count);
- afr_children_copy (sh->child_success,
- sh->fresh_children,
- priv->child_count);
- afr_sh_missing_entries_lookup_done (frame, this, 0, 0);
- }
- }
- return 0;
+ if (local->xattr_req)
+ dict = local->xattr_req;
+
+ return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
+ local->child_up, dict);
}
-gf_boolean_t
-afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv,
- int child)
+unsigned int
+afr_success_count(struct afr_reply *replies, unsigned int count)
{
- afr_self_heal_t *sh = NULL;
-
- sh = &local->self_heal;
+ int i = 0;
+ unsigned int success = 0;
- if (local->child_up[child] &&
- (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count,
- child))
- && (sh->child_errno[child] != ENOENT))
- return _gf_true;
-
- return _gf_false;
+ for (i = 0; i < count; i++)
+ if (replies[i].valid && replies[i].op_ret == 0)
+ success++;
+ return success;
}
-gf_boolean_t
-afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv,
- int child)
+int
+afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- afr_self_heal_t *sh = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- sh = &local->self_heal;
+ local = frame->local;
+ i = (long)cookie;
- if (local->child_up[child] &&
- (!afr_is_child_present (sh->fresh_children, priv->child_count,
- child))
- && (sh->child_errno[child] != ENOENT))
- return _gf_true;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- return _gf_false;
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
}
-void
-afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this,
- gf_boolean_t purge_condition (afr_local_t *local,
- afr_private_t *priv,
- int child))
+int
+afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int count = 0;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (purge_condition (local, priv, i))
- call_count++;
- }
-
- if (call_count == 0) {
- sh->post_remove_call (frame, this);
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ locked_on[i] = 1;
+ count++;
+ } else {
+ locked_on[i] = 0;
}
+ }
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!purge_condition (local, priv, i))
- continue;
- afr_sh_call_entry_expunge_remove (frame, this,
- (long) i, &sh->buf[i],
- afr_sh_remove_entry_cbk);
- }
-out:
- return;
+ return count;
}
-void
-afr_sh_purge_entry (call_frame_t *frame, xlator_t *this)
+int
+afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
+
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
- local = frame->local;
- sh = &local->self_heal;
- sh->post_remove_call = afr_sh_missing_entries_finish;
+ loc_wipe(&loc);
- afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition);
+ return afr_locked_fill(frame, this, locked_on);
}
-void
-afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this)
+int
+afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ priv = this->private;
+ local = frame->local;
- sh->post_remove_call = afr_sh_purge_stale_entries_done;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- for (i = 0; i < priv->child_count; i++) {
- if (afr_is_child_present (sh->fresh_children,
- priv->child_count, i))
- continue;
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- if ((!local->child_up[i]) || sh->child_errno[i] != 0)
- continue;
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
- GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) ||
- uuid_is_null (sh->buf[i].ia_gfid));
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_uninodelk(frame, this, inode, dom, off, size,
+ locked_on);
- if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) ||
- (uuid_compare (sh->buf[i].ia_gfid,
- sh->entrybuf.ia_gfid)))
- continue;
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
+ &flock, NULL);
+ break;
+ }
+ }
- afr_fresh_children_add_child (sh->fresh_children,
- i, priv->child_count);
+ loc_wipe(&loc);
- }
- afr_sh_purge_entry_common (frame, this,
- afr_sh_purge_stale_entry_condition);
+ return afr_locked_fill(frame, this, locked_on);
}
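
afr_selfheal_inodelk above first tries non-blocking locks on all bricks and, if any brick returns EAGAIN, releases whatever it got and retries with blocking locks issued one brick at a time. A rough single-file analogy using plain POSIX record locks — the real code works per brick and must drop the partial lock set before blocking, which a one-file demo cannot show:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static int
lock_region(int fd, off_t off, off_t len)
{
    struct flock fl = {0};

    fl.l_type = F_WRLCK;
    fl.l_whence = SEEK_SET;
    fl.l_start = off;
    fl.l_len = len; /* 0 means "to end of file", as in the inodelk calls */

    if (fcntl(fd, F_SETLK, &fl) == 0)
        return 0;                    /* granted without waiting */
    if (errno != EAGAIN && errno != EACCES)
        return -1;                   /* real failure, not contention */
    return fcntl(fd, F_SETLKW, &fl); /* contended: wait for it */
}

int
main(void)
{
    int fd = open("/tmp/lock-demo", O_RDWR | O_CREAT, 0600);

    if (fd < 0 || lock_region(fd, 0, 0) < 0)
        perror("lock");
    else
        printf("locked\n");
    if (fd >= 0)
        close(fd);
    return 0;
}

The sequential blocking pass (AFR_SEQ with F_SETLKW) plays the role of the blocking fcntl here, presumably ordered across bricks so that two healers contending for the same ranges do not deadlock each other.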
-void
-afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs,
- struct iatt *save,
- unsigned int child_count)
-{
- int i = 0;
- int child = 0;
- gf_boolean_t saved = _gf_false;
-
- GF_ASSERT (save);
- //if iatt buf with gfid exists sets it
- for (i = 0; i < child_count; i++) {
- child = children[i];
- if (child == -1)
- break;
- *save = bufs[child];
- saved = _gf_true;
- if (!uuid_is_null (save->ia_gfid))
- break;
+static void
+afr_get_lock_and_eagain_counts(afr_private_t *priv, struct afr_reply *replies,
+ int *lock_count, int *eagain_count)
+{
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == 0) {
+ (*lock_count)++;
+ } else if (replies[i].op_ret == -1 && replies[i].op_errno == EAGAIN) {
+ (*eagain_count)++;
}
- GF_ASSERT (saved);
+ }
}
-void
-afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh,
- unsigned int child_count)
+/* Do blocking locks if the number of locks acquired is a majority and there
+ * were some EAGAINs. Useful for odd-way (e.g. 3-way) replication. */
+int
+afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, off_t off,
+ size_t size, unsigned char *locked_on)
{
- afr_children_intersection_get (sh->child_success,
- sh->fresh_parent_dirs,
- sh->sources, child_count);
- afr_get_fresh_children (sh->child_success, sh->sources,
- sh->fresh_children, child_count);
- memset (sh->sources, 0, sizeof (*sh->sources) * child_count);
-}
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lock_count = 0;
+ int eagain_count = 0;
-void
-afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int32_t fresh_child_enoents = 0;
- int32_t fresh_parent_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0)
- goto fail;
- afr_get_children_of_fresh_parent_dirs (sh, priv->child_count);
- fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs,
- priv->child_count);
- //we need the enoent count of the subvols present in fresh_parent_dirs
- fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs,
- sh->child_errno,
- priv->child_count, ENOENT);
- if (fresh_child_enoents == fresh_parent_count) {
- gf_log (this->name, GF_LOG_INFO, "Deleting stale file %s",
- local->loc.path);
- afr_sh_set_error (sh, ENOENT);
- sh->op_failed = 1;
- afr_sh_purge_entry (frame, this);
- } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children,
- priv->child_count, local->loc.path,
- this->name)) {
- afr_sh_save_child_iatts_from_policy (sh->fresh_children,
- sh->buf, &sh->entrybuf,
- priv->child_count);
- afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf,
- sh->fresh_children,
- priv->child_count);
- afr_sh_purge_stale_entry (frame, this);
- } else {
- op_errno = EIO;
- local->govinda_gOvinda = 1;
- goto fail;
- }
+ priv = this->private;
+ local = frame->local;
- return;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
-fail:
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_missing_entries_finish (frame, this);
- return;
-}
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
-static void
-afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int enoent_count = 0;
- int nsources = 0;
- int source = -1;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- /* If We can't find a fresh parent directory here,
- * we wont know which subvol is correct without finding a parent dir
- * upwards which has correct xattrs, for that we may have to
- * do lookups till root, we dont wanna do that,
- * instead make sure that if there are conflicting gfid
- * parent dirs, self-heal thus lookup is failed with EIO.
- * if there are missing entries we dont know whether to delete or
- * create so fail with EIO,
- * If there are conflicting xattr fail with EIO.
- */
- if (op_ret < 0)
- goto out;
- enoent_count = afr_errno_count (NULL, sh->child_errno,
- priv->child_count, ENOENT);
- if (enoent_count > 0) {
- gf_log (this->name, GF_LOG_ERROR, "Parent dir missing for %s,"
- " in missing entry self-heal, aborting self-heal",
- local->loc.path);
- goto out;
- }
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->child_success,
- AFR_ENTRY_TRANSACTION);
- if (nsources < 0) {
- gf_log (this->name, GF_LOG_ERROR, "No sources for dir of %s,"
- " in missing entry self-heal, aborting self-heal",
- local->loc.path);
- goto out;
- }
+ afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
+ &eagain_count);
- source = afr_sh_select_source (sh->sources, priv->child_count);
- if (source == -1) {
- GF_ASSERT (0);
- gf_log (this->name, GF_LOG_DEBUG, "No active sources found.");
- goto out;
- }
- afr_get_fresh_children (sh->child_success, sh->sources,
- sh->fresh_parent_dirs, priv->child_count);
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_children_lookup_done, NULL, 0);
- return;
+ if (lock_count > priv->child_count / 2 && eagain_count) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_uninodelk(frame, this, inode, dom, off, size, locked_on);
-out:
- afr_sh_set_error (sh, EIO);
- sh->op_failed = 1;
- afr_sh_missing_entries_finish (frame, this);
- return;
-}
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
+ &flock, NULL);
+ }
-void
-afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count)
-{
- int i = 0;
+ loc_wipe(&loc);
- for (i = 0; i < child_count; i++) {
- memset (&sh->buf[i], 0, sizeof (sh->buf[i]));
- memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i]));
- sh->child_errno[i] = 0;
- }
- memset (&sh->parentbuf, 0, sizeof (sh->parentbuf));
- sh->success_count = 0;
- afr_reset_children (sh->child_success, child_count);
- afr_reset_children (sh->fresh_children, child_count);
- afr_reset_xattr (sh->xattr, child_count);
- loc_wipe (&sh->lookup_loc);
+ return afr_locked_fill(frame, this, locked_on);
}
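
The tie-breaker variant above escalates to blocking locks only when a majority of bricks granted the non-blocking lock and at least one brick returned EAGAIN. A small sketch of just that decision, with hypothetical counts:

#include <stdbool.h>
#include <stdio.h>

static bool
should_block(int child_count, int lock_count, int eagain_count)
{
    /* strictly more than half the bricks locked, and someone said EAGAIN */
    return (lock_count > child_count / 2) && (eagain_count > 0);
}

int
main(void)
{
    printf("%d\n", should_block(3, 2, 1)); /* 1: majority held, escalate */
    printf("%d\n", should_block(3, 1, 2)); /* 0: no majority, back off */
    return 0;
}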
-/* afr self-heal state will be lost if this call is made
- * please check the afr_sh_common_reset that is called in this function
- */
int
-afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
- afr_lookup_done_cbk_t lookup_done , uuid_t gfid,
- int32_t flags)
-{
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_private_t *priv = NULL;
- dict_t *xattr_req = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
-
- local->call_count = call_count;
-
- xattr_req = dict_new();
-
- if (xattr_req) {
- afr_xattr_req_prepare (this, xattr_req, loc->path);
- if (gfid) {
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s with gfid: %s",
- loc->path, uuid_utoa (gfid));
- GF_ASSERT (!uuid_is_null (gfid));
- afr_set_dict_gfid (xattr_req, gfid);
- }
- }
+afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on)
+{
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
- afr_sh_common_reset (sh, priv->child_count);
- sh->lookup_done = lookup_done;
- loc_copy (&sh->lookup_loc, loc);
- sh->lookup_flags = flags;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame,
- afr_sh_common_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- loc, xattr_req);
-
- if (!--call_count)
- break;
- }
- }
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- if (xattr_req)
- dict_unref (xattr_req);
+ flock.l_type = F_UNLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- return 0;
-}
+ AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, inodelk, dom, &loc,
+ F_SETLK, &flock, NULL);
+ loc_wipe(&loc);
+ return 0;
+}
int
-afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this)
+afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {
+ 0,
+ };
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Non blocking entrylks failed.");
- afr_sh_missing_entries_done (frame, this);
- } else {
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- afr_sh_common_lookup (frame, this, &sh->parent_loc,
- afr_sh_find_fresh_parents,
- NULL, AFR_LOOKUP_FAIL_CONFLICTS);
- }
+ loc_wipe(&loc);
- return 0;
+ return afr_locked_fill(frame, this, locked_on);
}
int
-afr_sh_post_nb_entrylk_gfid_sh_cbk (call_frame_t *frame, xlator_t *this)
+afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- local = frame->local;
- sh = &local->self_heal;
- int_lock = &local->internal_lock;
+ priv = this->private;
+ local = frame->local;
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Non blocking entrylks failed.");
- afr_sh_missing_entries_done (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_missing_entries_lookup_done,
- sh->sh_gfid_req, AFR_LOOKUP_FAIL_CONFLICTS|
- AFR_LOOKUP_FAIL_MISSING_GFIDS);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on,
+ NULL);
+
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ break;
}
+ }
- return 0;
+ loc_wipe(&loc);
+
+ return afr_locked_fill(frame, this, locked_on);
}
int
-afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
- char *base_name, afr_lock_cbk_t lock_cbk)
+afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, const char *name,
+ unsigned char *locked_on)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lock_count = 0;
+ int eagain_count = 0;
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
+ priv = this->private;
+ local = frame->local;
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- afr_set_lock_number (frame, this);
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
- int_lock->lk_basename = base_name;
- int_lock->lk_loc = loc;
- int_lock->lock_cbk = lock_cbk;
+ afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
+ &eagain_count);
- afr_nonblocking_entrylk (frame, this);
+ if (lock_count > priv->child_count / 2 && eagain_count) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, NULL);
- return 0;
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ }
+
+ loc_wipe(&loc);
+
+ return afr_locked_fill(frame, this, locked_on);
}
-static int
-afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this,
- afr_lock_cbk_t lock_cbk)
+int
+afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on,
+ dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
- priv = this->private;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- gf_log (this->name, GF_LOG_TRACE,
- "attempting to recreate missing entries for path=%s",
- local->loc.path);
+ AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, entrylk, dom, &loc,
+ name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
- GF_ASSERT (local->loc.parent);
- afr_build_parent_loc (&sh->parent_loc, &local->loc);
+ loc_wipe(&loc);
- afr_sh_entrylk (frame, this, &sh->parent_loc, NULL,
- lock_cbk);
- return 0;
+ return 0;
}
-static int
-afr_self_heal_conflicting_entries (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_is_data_set(xlator_t *this, dict_t *xdata)
{
- afr_self_heal_parent_entrylk (frame, this,
- afr_sh_post_nb_entrylk_conflicting_sh_cbk);
- return 0;
+ return afr_is_pending_set(this, xdata, AFR_DATA_TRANSACTION);
}
-static int
-afr_self_heal_gfids (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_is_metadata_set(xlator_t *this, dict_t *xdata)
{
- afr_self_heal_parent_entrylk (frame, this,
- afr_sh_post_nb_entrylk_gfid_sh_cbk);
- return 0;
+ return afr_is_pending_set(this, xdata, AFR_METADATA_TRANSACTION);
}
-afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)
+gf_boolean_t
+afr_is_entry_set(xlator_t *this, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *lc = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *shc = NULL;
+ return afr_is_pending_set(this, xdata, AFR_ENTRY_TRANSACTION);
+}
- priv = this->private;
+/*
+ * This function inspects the looked up replies (in an unlocked manner)
+ * and decides whether a locked verification and possible healing is
+ * required or not. It updates the three booleans for each type
+ * of healing. If the boolean flag gets set to FALSE, then we are sure
+ * no healing is required. If the boolean flag gets set to TRUE then
+ * we have to proceed with locked reinspection.
+ */
- sh = &l->self_heal;
+int
+afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **link_inode, gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal,
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies_dst)
+{
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ int i = 0;
+ int valid_cnt = 0;
+ struct iatt first = {
+ 0,
+ };
+ int first_idx = 0;
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ inode = afr_inode_find(this, gfid);
+ if (!inode)
+ goto out;
+
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+
+ ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == -1)
+ continue;
+
+ /* The data segment of the changelog can be non-zero to indicate
+ * the directory needs a full heal. So the check below ensures
+ * it's not a directory before setting the data_selfheal boolean.
+ */
+ if (data_selfheal && !IA_ISDIR(replies[i].poststat.ia_type) &&
+ afr_is_data_set(this, replies[i].xdata))
+ *data_selfheal = _gf_true;
+
+ if (metadata_selfheal && afr_is_metadata_set(this, replies[i].xdata))
+ *metadata_selfheal = _gf_true;
+
+ if (entry_selfheal && afr_is_entry_set(this, replies[i].xdata))
+ *entry_selfheal = _gf_true;
+
+ valid_cnt++;
+ if (valid_cnt == 1) {
+ first = replies[i].poststat;
+ first_idx = i;
+ continue;
+ }
+
+ if (!IA_EQUAL(first, replies[i].poststat, type)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "TYPE mismatch %d vs %d on %s for gfid:%s",
+ (int)first.ia_type, (int)replies[i].poststat.ia_type,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;"
+ "type=file;gfid=%s;"
+ "ia_type-%d=%s;ia_type-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(replies[i].poststat.ia_gfid), first_idx,
+ gf_inode_type_to_str(first.ia_type), i,
+ gf_inode_type_to_str(replies[i].poststat.ia_type));
+ ret = -EIO;
+ goto out;
+ }
+
+ if (!IA_EQUAL(first, replies[i].poststat, uid)) {
+ gf_msg_debug(this->name, 0,
+ "UID mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+
+ if (!IA_EQUAL(first, replies[i].poststat, gid)) {
+ gf_msg_debug(this->name, 0,
+ "GID mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+
+ if (!IA_EQUAL(first, replies[i].poststat, prot)) {
+ gf_msg_debug(this->name, 0,
+ "MODE mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)st_mode_from_ia(first.ia_prot, 0),
+ (int)st_mode_from_ia(replies[i].poststat.ia_prot, 0),
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+
+ if (IA_ISREG(first.ia_type) &&
+ !IA_EQUAL(first, replies[i].poststat, size)) {
+ gf_msg_debug(this->name, 0,
+ "SIZE mismatch "
+ "%lld vs %lld on %s for gfid:%s",
+ (long long)first.ia_size,
+ (long long)replies[i].poststat.ia_size,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (data_selfheal)
+ *data_selfheal = _gf_true;
+ }
+ }
+
+ if (valid_cnt > 0 && link_inode) {
+ *link_inode = inode_link(inode, NULL, NULL, &first);
+ if (!*link_inode) {
+ ret = -EINVAL;
+ goto out;
+ }
+ } else if (valid_cnt < 2) {
+ ret = afr_check_stale_error(replies, priv);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (replies && replies_dst)
+ afr_replies_copy(replies_dst, replies, priv->child_count);
+ if (inode)
+ inode_unref(inode);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
- lc = GF_CALLOC (1, sizeof (afr_local_t),
- gf_afr_mt_afr_local_t);
- if (!lc)
- goto out;
+ return ret;
+}
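
The unlocked inspection above compares the per-brick iatts and flips the heal booleans on mismatches, while a type mismatch is treated as split-brain and fails with -EIO. A simplified sketch of the mismatch checks, using an invented reply struct rather than the real afr_reply layout:

#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

struct reply {
    bool valid;
    uid_t uid;
    gid_t gid;
    mode_t mode;
    off_t size;
    bool is_dir;
};

int
main(void)
{
    struct reply r[2] = {
        {true, 0, 0, 0644, 4096, false},
        {true, 0, 0, 0600, 1024, false}, /* mode and size differ */
    };
    bool data_selfheal = false, metadata_selfheal = false;

    if (r[0].uid != r[1].uid || r[0].gid != r[1].gid || r[0].mode != r[1].mode)
        metadata_selfheal = true; /* ownership/permission mismatch */
    if (!r[0].is_dir && r[0].size != r[1].size)
        data_selfheal = true;     /* size mismatch on a regular file */

    printf("data=%d metadata=%d\n", data_selfheal, metadata_selfheal);
    return 0;
}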
- shc = &lc->self_heal;
-
- shc->unwind = sh->unwind;
- shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk;
- shc->need_missing_entry_self_heal = sh->need_missing_entry_self_heal;
- shc->need_gfid_self_heal = sh->need_gfid_self_heal;
- shc->need_data_self_heal = sh->need_data_self_heal;
- shc->need_metadata_self_heal = sh->need_metadata_self_heal;
- shc->need_entry_self_heal = sh->need_entry_self_heal;
- shc->forced_merge = sh->forced_merge;
- shc->healing_fd_opened = sh->healing_fd_opened;
- shc->data_lock_held = sh->data_lock_held;
- if (sh->healing_fd && !sh->healing_fd_opened)
- shc->healing_fd = fd_ref (sh->healing_fd);
- else
- shc->healing_fd = sh->healing_fd;
- shc->background = sh->background;
- shc->type = sh->type;
-
- uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req);
- if (l->loc.path)
- loc_copy (&lc->loc, &l->loc);
-
- lc->child_up = memdup (l->child_up, priv->child_count);
- if (l->xattr_req)
- lc->xattr_req = dict_ref (l->xattr_req);
-
- if (l->cont.lookup.inode)
- lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode);
- if (l->cont.lookup.xattr)
- lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr);
- if (l->internal_lock.inode_locked_nodes)
- lc->internal_lock.inode_locked_nodes =
- memdup (l->internal_lock.inode_locked_nodes,
- priv->child_count);
- else
- lc->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- if (l->internal_lock.entry_locked_nodes)
- lc->internal_lock.entry_locked_nodes =
- memdup (l->internal_lock.entry_locked_nodes,
- priv->child_count);
- else
- lc->internal_lock.entry_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- if (l->internal_lock.locked_nodes)
- lc->internal_lock.locked_nodes =
- memdup (l->internal_lock.locked_nodes,
- priv->child_count);
- else
- lc->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+inode_t *
+afr_inode_find(xlator_t *this, uuid_t gfid)
+{
+ inode_table_t *table = NULL;
+ inode_t *inode = NULL;
- lc->internal_lock.inodelk_lock_count =
- l->internal_lock.inodelk_lock_count;
- lc->internal_lock.entrylk_lock_count =
- l->internal_lock.entrylk_lock_count;
+ table = this->itable;
+ if (!table)
+ return NULL;
-out:
- return lc;
-}
+ inode = inode_find(table, gfid);
+ if (inode)
+ return inode;
-int
-afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- char sh_type_str[256] = {0,};
- gf_boolean_t split_brain = _gf_false;
+ inode = inode_new(table);
+ if (!inode)
+ return NULL;
- priv = this->private;
- local = bgsh_frame->local;
- sh = &local->self_heal;
+ gf_uuid_copy(inode->gfid, gfid);
- if (local->govinda_gOvinda)
- split_brain = _gf_true;
+ return inode;
+}
- afr_set_split_brain (this, sh->inode, split_brain);
+call_frame_t *
+afr_frame_create(xlator_t *this, int32_t *op_errno)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ pid_t pid = GF_CLIENT_PID_SELF_HEALD;
- afr_self_heal_type_str_get (sh, sh_type_str,
- sizeof(sh_type_str));
- if (sh->op_failed) {
- gf_log (this->name, GF_LOG_ERROR, "background %s self-heal "
- "failed on %s", sh_type_str, local->loc.path);
- } else {
- gf_log (this->name, GF_LOG_INFO, "background %s self-heal "
- "completed on %s", sh_type_str, local->loc.path);
- }
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ return NULL;
+ }
- FRAME_SU_UNDO (bgsh_frame, afr_local_t);
+ local = AFR_FRAME_INIT(frame, (*op_errno));
+ if (!local) {
+ STACK_DESTROY(frame->root);
+ return NULL;
+ }
- if (!sh->unwound) {
- sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno);
- }
+ syncopctx_setfspid(&pid);
- if (sh->background) {
- LOCK (&priv->lock);
- {
- priv->background_self_heals_started--;
- }
- UNLOCK (&priv->lock);
- }
+ frame->root->pid = pid;
- AFR_STACK_DESTROY (bgsh_frame);
+ afr_set_lk_owner(frame, this, frame->root);
- return 0;
+ return frame;
}
int
-afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)
+afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, struct afr_reply *replies,
+ unsigned char *sources, unsigned char *newentry)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int32_t op_errno = 0;
- int ret = 0;
- afr_self_heal_t *orig_sh = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int **changelog = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t *sh_local = NULL;
+ priv = this->private;
- local = frame->local;
- orig_sh = &local->self_heal;
- priv = this->private;
+ gf_uuid_copy(inode->gfid, replies[source].poststat.ia_gfid);
- GF_ASSERT (local->loc.path);
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
- gf_log (this->name, GF_LOG_TRACE,
- "performing self heal on %s (metadata=%d data=%d entry=%d)",
- local->loc.path,
- local->self_heal.need_metadata_self_heal,
- local->self_heal.need_data_self_heal,
- local->self_heal.need_entry_self_heal);
+ changelog = afr_mark_pending_changelog(priv, newentry, xattr,
+ replies[source].poststat.ia_type);
- op_errno = ENOMEM;
- sh_frame = copy_frame (frame);
- sh_frame->root->pid = SELF_HEAL_PID;
- if (!sh_frame)
- goto out;
- afr_set_lk_owner (sh_frame, this);
-
- sh_local = afr_local_copy (local, this);
- if (!sh_local)
- goto out;
- sh_frame->local = sh_local;
- sh = &sh_local->self_heal;
+ if (!changelog) {
+ ret = -ENOMEM;
+ goto out;
+ }
- sh->orig_frame = frame;
- sh->inode = inode_ref (inode);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ ret |= afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+ }
+out:
+ if (changelog)
+ afr_matrix_cleanup(changelog, priv->child_count);
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
+}
- sh->completion_cbk = afr_self_heal_completion_cbk;
+int
+afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid)
+{
+ int ret = -1;
+ int entry_ret = 1;
+ int metadata_ret = 1;
+ int data_ret = 1;
+ int or_ret = 0;
+ inode_t *inode = NULL;
+ fd_t *fd = NULL;
+ gf_boolean_t data_selfheal = _gf_false;
+ gf_boolean_t metadata_selfheal = _gf_false;
+ gf_boolean_t entry_selfheal = _gf_false;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode,
+ &data_selfheal, &metadata_selfheal,
+ &entry_selfheal, NULL);
+ if (ret)
+ goto out;
+
+ if (!(data_selfheal || metadata_selfheal || entry_selfheal)) {
+ ret = 2;
+ goto out;
+ }
+
+ if (inode->ia_type == IA_IFREG) {
+ ret = afr_selfheal_data_open(this, inode, &fd);
+ if (!fd) {
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+ if (data_selfheal && priv->data_self_heal)
+ data_ret = afr_selfheal_data(frame, this, fd);
+
+ if (metadata_selfheal && priv->metadata_self_heal)
+ metadata_ret = afr_selfheal_metadata(frame, this, inode);
+
+ if (entry_selfheal && priv->entry_self_heal)
+ entry_ret = afr_selfheal_entry(frame, this, inode);
+
+ or_ret = (data_ret | metadata_ret | entry_ret);
+
+ if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO)
+ ret = -EIO;
+ else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1)
+ ret = 1;
+ else if (or_ret < 0)
+ ret = or_ret;
+ else
+ ret = 0;
- sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success),
- gf_afr_mt_char);
- if (!sh->success)
- goto out;
- sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count,
- gf_afr_mt_int);
- if (!sh->sources)
- goto out;
- sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes),
- priv->child_count,
- gf_afr_mt_int);
- if (!sh->locked_nodes)
- goto out;
+out:
+ if (inode)
+ inode_unref(inode);
+ if (fd)
+ fd_unref(fd);
+ return ret;
+}
+/*
+ * This is the entry point for healing a given GFID. The return values for this
+ * function are as follows:
+ * '0' if the self-heal is successful
+ * '1' if the afr-xattrs are non-zero (due to on-going IO) and no heal is needed
+ * '2' if the afr-xattrs are all-zero and no heal is needed
+ * $errno if the heal on the gfid failed.
+ */
- sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count,
- gf_afr_mt_int32_t);
- if (!sh->pending_matrix)
- goto out;
+int
+afr_selfheal(xlator_t *this, uuid_t gfid)
+{
+ int ret = -1;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
- for (i = 0; i < priv->child_count; i++) {
- sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t),
- priv->child_count,
- gf_afr_mt_int32_t);
- if (!sh->pending_matrix[i])
- goto out;
- }
+ frame = afr_frame_create(this, NULL);
+ if (!frame)
+ return ret;
- sh->delta_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count,
- gf_afr_mt_int32_t);
- if (!sh->delta_matrix)
- goto out;
- for (i = 0; i < priv->child_count; i++) {
- sh->delta_matrix[i] = GF_CALLOC (sizeof (int32_t),
- priv->child_count,
- gf_afr_mt_int32_t);
- if (!sh->delta_matrix)
- goto out;
- }
- sh->fresh_parent_dirs = afr_children_create (priv->child_count);
- if (!sh->fresh_parent_dirs)
- goto out;
- ret = afr_sh_common_create (sh, priv->child_count);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
+ local = frame->local;
+ local->xdata_req = dict_new();
- if (local->self_heal.background) {
- LOCK (&priv->lock);
- {
- if (priv->background_self_heals_started
- < priv->background_self_heal_count) {
- priv->background_self_heals_started++;
+ ret = afr_selfheal_do(frame, this, gfid);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
- } else {
- local->self_heal.background = _gf_false;
- }
- }
- UNLOCK (&priv->lock);
- }
+ return ret;
+}
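
A compact sketch of how afr_selfheal_do folds the three per-type results into the single return value documented above; the helper name and sample values are invented:

#include <errno.h>
#include <stdio.h>

static int
fold(int data_ret, int metadata_ret, int entry_ret)
{
    int or_ret = data_ret | metadata_ret | entry_ret;

    if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO)
        return -EIO;   /* any split-brain wins */
    if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1)
        return 1;      /* nothing to heal after inspection (e.g. in-flight IO) */
    if (or_ret < 0)
        return or_ret; /* some other error */
    return 0;          /* at least one heal ran and succeeded */
}

int
main(void)
{
    printf("%d\n", fold(0, 1, 1));    /* 0 */
    printf("%d\n", fold(-EIO, 0, 1)); /* -EIO (prints -5 on Linux) */
    printf("%d\n", fold(1, 1, 1));    /* 1 */
    return 0;
}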
- FRAME_SU_DO (sh_frame, afr_local_t);
- if (sh->need_missing_entry_self_heal) {
- afr_self_heal_conflicting_entries (sh_frame, this);
- } else if (sh->need_gfid_self_heal) {
- GF_ASSERT (!uuid_is_null (sh->sh_gfid_req));
- afr_self_heal_gfids (sh_frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to metadata check on %s",
- local->loc.path);
+afr_local_t *
+__afr_dequeue_heals(afr_private_t *priv)
+{
+ afr_local_t *local = NULL;
- afr_sh_missing_entries_done (sh_frame, this);
- }
- op_errno = 0;
+ if (list_empty(&priv->heal_waiting))
+ goto none;
+ if ((priv->background_self_heal_count > 0) &&
+ (priv->healers >= priv->background_self_heal_count))
+ goto none;
-out:
- if (op_errno) {
- orig_sh->unwind (frame, this, -1, op_errno);
- }
- return 0;
+ local = list_entry(priv->heal_waiting.next, afr_local_t, healer);
+ priv->heal_waiters--;
+ GF_ASSERT(priv->heal_waiters >= 0);
+ list_del_init(&local->healer);
+ list_add(&local->healer, &priv->healing);
+ priv->healers++;
+ return local;
+none:
+ gf_msg_debug(THIS->name, 0,
+ "Nothing dequeued. "
+ "Num healers: %d, Num Waiters: %d",
+ priv->healers, priv->heal_waiters);
+ return NULL;
}
-void
-afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
- size_t size)
+int
+afr_refresh_selfheal_wrap(void *opaque)
{
- GF_ASSERT (str && (size > strlen (" missing-entry gfid "
- "meta-data data entry")));
+ call_frame_t *heal_frame = opaque;
+ afr_local_t *local = heal_frame->local;
+ int ret = 0;
- if (self_heal_p->need_metadata_self_heal) {
- snprintf (str, size, " meta-data");
- }
+ ret = afr_selfheal(heal_frame->this, local->refreshinode->gfid);
+ return ret;
+}
- if (self_heal_p->need_data_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " data");
- }
+int
+afr_refresh_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+ call_frame_t *heal_frame = opaque;
+ xlator_t *this = heal_frame->this;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = heal_frame->local;
- if (self_heal_p->need_entry_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " entry");
- }
+ LOCK(&priv->lock);
+ {
+ list_del_init(&local->healer);
+ priv->healers--;
+ GF_ASSERT(priv->healers >= 0);
+ local = __afr_dequeue_heals(priv);
+ }
+ UNLOCK(&priv->lock);
- if (self_heal_p->need_missing_entry_self_heal) {
- snprintf (str + strlen(str), size - strlen(str),
- " missing-entry");
- }
+ AFR_STACK_DESTROY(heal_frame);
- if (self_heal_p->need_gfid_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " gfid");
- }
+ if (local)
+ afr_heal_synctask(this, local);
+ return 0;
}
-afr_self_heal_type
-afr_self_heal_type_for_transaction (afr_transaction_type type)
+void
+afr_heal_synctask(xlator_t *this, afr_local_t *local)
{
- afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
+ int ret = 0;
+ call_frame_t *heal_frame = NULL;
- switch (type) {
- case AFR_DATA_TRANSACTION:
- sh_type = AFR_SELF_HEAL_DATA;
- break;
- case AFR_METADATA_TRANSACTION:
- sh_type = AFR_SELF_HEAL_METADATA;
- break;
- case AFR_ENTRY_TRANSACTION:
- sh_type = AFR_SELF_HEAL_ENTRY;
- break;
- case AFR_ENTRY_RENAME_TRANSACTION:
- GF_ASSERT (0);
- break;
+ heal_frame = local->heal_frame;
+ ret = synctask_new(this->ctx->env, afr_refresh_selfheal_wrap,
+ afr_refresh_heal_done, heal_frame, heal_frame);
+ if (ret < 0)
+ /* Heal not launched. Will be queued when the next inode
+ * refresh happens and shd hasn't healed it yet. */
+ afr_refresh_heal_done(ret, heal_frame, heal_frame);
+}
+
+gf_boolean_t
+afr_throttled_selfheal(call_frame_t *frame, xlator_t *this)
+{
+ gf_boolean_t can_heal = _gf_true;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+
+ LOCK(&priv->lock);
+ {
+ if ((priv->background_self_heal_count > 0) &&
+ (priv->heal_wait_qlen + priv->background_self_heal_count) >
+ (priv->heal_waiters + priv->healers)) {
+ list_add_tail(&local->healer, &priv->heal_waiting);
+ priv->heal_waiters++;
+ local = __afr_dequeue_heals(priv);
+ } else {
+ can_heal = _gf_false;
}
- return sh_type;
+ }
+ UNLOCK(&priv->lock);
+
+ if (can_heal) {
+ if (local)
+ afr_heal_synctask(this, local);
+ else
+ gf_msg_debug(this->name, 0,
+ "Max number of heals are "
+ "pending, background self-heal rejected.");
+ }
+
+ return can_heal;
}
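
The throttling above admits a heal only while healers plus waiters stay inside the configured window (background heal count plus wait-queue length). A sketch of just that admission check; the counts are illustrative and the option defaults may differ between versions:

#include <stdbool.h>
#include <stdio.h>

static bool
can_queue_heal(int background_self_heal_count, int heal_wait_qlen, int healers,
               int heal_waiters)
{
    return background_self_heal_count > 0 &&
           (heal_wait_qlen + background_self_heal_count) >
               (heal_waiters + healers);
}

int
main(void)
{
    printf("%d\n", can_queue_heal(8, 128, 8, 100)); /* 1: room left, queue it */
    printf("%d\n", can_queue_heal(8, 128, 8, 128)); /* 0: window full, reject */
    return 0;
}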
int
-afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
+ afr_transaction_type type)
{
- int ret = -1;
+ int source = -1;
+ int i = 0;
- if (!child) {
- goto out;
- }
+ /* Give preference to local child to save on bandwidth */
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->local[i] && sources[i]) {
+ if ((type == AFR_DATA_TRANSACTION) && AFR_IS_ARBITER_BRICK(priv, i))
+ continue;
- if (strcmp (parent->path, "/") == 0)
- ret = gf_asprintf ((char **)&child->path, "/%s", name);
- else
- ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path,
- name);
+ source = i;
+ goto out;
+ }
+ }
- if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed while setting child path");
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ goto out;
}
+ }
+out:
+ return source;
+}
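
A sketch of the source-selection order implemented above: prefer a local source brick to save bandwidth, skip the arbiter for data transactions, otherwise take the first source. The arrays and arbiter index are hypothetical:

#include <stdbool.h>
#include <stdio.h>

static int
choose_source(int n, const bool *local, const unsigned char *sources,
              int arbiter_idx, bool data_txn)
{
    int i;

    for (i = 0; i < n; i++) {
        if (!local[i] || !sources[i])
            continue;
        if (data_txn && i == arbiter_idx)
            continue; /* the arbiter brick holds no file data */
        return i;     /* local source found */
    }
    for (i = 0; i < n; i++)
        if (sources[i])
            return i; /* fall back to the first source */
    return -1;        /* no source at all */
}

int
main(void)
{
    bool local[3] = {false, true, false};
    unsigned char sources[3] = {1, 1, 0};

    /* data heal on an arbiter volume where brick 2 is the arbiter */
    printf("%d\n", choose_source(3, local, sources, 2, true)); /* 1 */
    return 0;
}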
- if (!child->path) {
- goto out;
+static int
+afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->replies[i].poststat = *buf;
+ local->replies[i].preparent = *preparent;
+ local->replies[i].postparent = *postparent;
+ }
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ unsigned char *mkdir_on = alloca0(priv->child_count);
+ unsigned char *lookup_on = alloca0(priv->child_count);
+ loc_t loc = {0};
+ int32_t op_errno = 0;
+ int32_t child_op_errno = 0;
+ struct iatt iatt = {0};
+ dict_t *xdata = NULL;
+ uuid_t anon_inode_gfid = {0};
+ int mkdir_count = 0;
+ int i = 0;
+
+ /*Try to mkdir everywhere and return success if the dir exists on 'child'
+ */
+
+ if (!priv->use_anon_inode) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &op_errno);
+ if (op_errno) {
+ goto out;
+ }
+ local = frame->local;
+ if (!local->child_up[child]) {
+ /*Other bricks may need mkdir so don't error out yet*/
+ child_op_errno = ENOTCONN;
+ }
+ gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ if (priv->anon_inode[i]) {
+ mkdir_on[i] = 0;
+ } else {
+ mkdir_on[i] = 1;
+ mkdir_count++;
}
+ }
- child->name = strrchr (child->path, '/');
- if (child->name)
- child->name++;
+ if (mkdir_count == 0) {
+ *linked_inode = inode_find(this->itable, anon_inode_gfid);
+ if (*linked_inode) {
+ op_errno = 0;
+ goto out;
+ }
+ }
- child->parent = inode_ref (parent->inode);
- child->inode = inode_new (parent->inode->table);
+ loc.parent = inode_ref(this->itable->root);
+ loc.name = priv->anon_inode_name;
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- if (!child->inode) {
- ret = -1;
- goto out;
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true);
+ if (op_errno) {
+ goto out;
+ }
+
+ if (mkdir_count == 0) {
+ memcpy(lookup_on, local->child_up, priv->child_count);
+ goto lookup;
+ }
+
+ AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0,
+ xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!mkdir_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno == EEXIST) {
+ lookup_on[i] = 1;
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+
+ if (AFR_COUNT(lookup_on, priv->child_count) == 0) {
+ goto link;
+ }
+
+lookup:
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xdata);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!lookup_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ if (gf_uuid_compare(anon_inode_gfid,
+ local->replies[i].poststat.ia_gfid) == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else {
+ if (i == child)
+ child_op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA,
+ "%s has gfid: %s", priv->anon_inode_name,
+ uuid_utoa(local->replies[i].poststat.ia_gfid));
+ }
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+link:
+ if (!gf_uuid_is_null(iatt.ia_gfid)) {
+ *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt);
+ if (*linked_inode) {
+ op_errno = 0;
+ inode_lookup(*linked_inode);
+ } else {
+ op_errno = ENOMEM;
}
+ goto out;
+ }
- ret = 0;
out:
- if (ret == -1)
- loc_wipe (child);
-
- return ret;
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ /*child_op_errno takes precedence*/
+ if (child_op_errno == 0) {
+ child_op_errno = op_errno;
+ }
+
+ if (child_op_errno && *linked_inode) {
+ inode_unref(*linked_inode);
+ *linked_inode = NULL;
+ }
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return -child_op_errno;
}
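
A minimal sketch of how a caller might consume the new helper; the wrapper name and the link step are illustrative assumptions, not part of this change:

/* Hypothetical caller: park an entry under the per-volume anon-inode
 * directory of brick 'child' so its gfid stays reachable, then drop the
 * reference obtained via inode_link()/inode_lookup(). */
static int
example_use_anon_inode_dir(xlator_t *this, int child)
{
    inode_t *anon_dir = NULL;
    int ret = afr_anon_inode_create(this, child, &anon_dir);

    if (ret < 0)
        return ret; /* negative errno, e.g. -ENOTCONN or -EINVAL */

    /* ... issue the actual link()/rename() of the entry under anon_dir ... */

    inode_unref(anon_dir);
    return 0;
}
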
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
deleted file mode 100644
index 7a06c2f4b13..00000000000
--- a/xlators/cluster/afr/src/afr-self-heal-common.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __AFR_SELF_HEAL_COMMON_H__
-#define __AFR_SELF_HEAL_COMMON_H__
-
-#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512))
-
-typedef enum {
- AFR_SELF_HEAL_ENTRY,
- AFR_SELF_HEAL_METADATA,
- AFR_SELF_HEAL_DATA,
- AFR_SELF_HEAL_INVALID = -1,
-} afr_self_heal_type;
-
-typedef enum {
- AFR_LOOKUP_FAIL_CONFLICTS = 1,
- AFR_LOOKUP_FAIL_MISSING_GFIDS = 2,
-} afr_lookup_flags_t;
-
-int
-afr_sh_select_source (int sources[], int child_count);
-
-int
-afr_sh_sink_count (int sources[], int child_count);
-
-int
-afr_sh_source_count (int sources[], int child_count);
-
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
-
-int
-afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
- dict_t *xattr[], afr_transaction_type type,
- size_t child_count);
-
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], int success[],
- int child_count, afr_transaction_type type);
-
-int
-afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs,
- int32_t child_count, afr_self_heal_type type,
- int32_t *valid_children, const char *xlator_name);
-
-int
-afr_sh_delta_to_xattr (afr_private_t *priv,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type);
-
-int
-afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count);
-
-void
-afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
- size_t size);
-
-afr_self_heal_type
-afr_self_heal_type_for_transaction (afr_transaction_type type);
-
-int
-afr_build_sources (xlator_t *xlator, dict_t **xattr, struct iatt *bufs,
- int32_t **pending_matrix, int32_t *sources,
- int32_t *success_children, afr_transaction_type type);
-void
-afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count);
-int
-afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
- afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid,
- int32_t flags);
-int
-afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
- int active_src, struct iatt *buf);
-int
-afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
- char *base_name, afr_lock_cbk_t lock_cbk);
-int
-afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *buf,
- struct iatt *postparent);
-afr_local_t *
-afr_local_copy (afr_local_t *l, xlator_t *this);
-void
-afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno);
-typedef int
-(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr);
-int
-afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name);
-int
-afr_impunge_frame_create (call_frame_t *frame, xlator_t *this,
- int active_source, int ret_child, mode_t entry_mode,
- call_frame_t **impunge_frame);
-#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 5bd5edd668b..37bcc2b3f9e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -1,1262 +1,891 @@
/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "afr-self-heal-algorithm.h"
-
-
-int
-afr_sh_data_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- /*
- TODO: cleanup sh->*
- */
+#include <glusterfs/byte-order.h>
+#include "protocol-common.h"
+#include "afr-messages.h"
+#include <glusterfs/events.h>
- if (sh->healing_fd && !sh->healing_fd_opened) {
- /* unref only if we created the fd ourselves */
-
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
- }
-
- /* for (i = 0; i < priv->child_count; i++) */
- /* sh->locked_nodes[i] = 0; */
-
- sh->completion_cbk (frame, this);
-
- return 0;
+#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size))
+static int
+__checksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, uint32_t weak, uint8_t *strong, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ struct afr_reply *replies = NULL;
+ int i = (long)cookie;
+
+ local = frame->local;
+ replies = local->replies;
+
+ replies[i].valid = 1;
+ replies[i].op_ret = op_ret;
+ replies[i].op_errno = op_errno;
+ if (xdata) {
+ replies[i].buf_has_zeroes = dict_get_str_boolean(
+ xdata, "buf-has-zeroes", _gf_false);
+ replies[i].fips_mode_rchecksum = dict_get_str_boolean(
+ xdata, "fips-mode-rchecksum", _gf_false);
+ }
+ if (strong) {
+ if (replies[i].fips_mode_rchecksum) {
+ memcpy(local->replies[i].checksum, strong, SHA256_DIGEST_LENGTH);
+ } else {
+ memcpy(local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
+ }
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+static gf_boolean_t
+__afr_can_skip_data_block_heal(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size, struct iatt *poststat)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_subvols = NULL;
+ gf_boolean_t checksum_match = _gf_true;
+ struct afr_reply *replies = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+ replies = local->replies;
+
+ xdata = dict_new();
+ if (!xdata)
+ goto out;
+ if (dict_set_int32_sizen(xdata, "check-zero-filled", 1)) {
+ dict_unref(xdata);
+ goto out;
+ }
+
+ wind_subvols = alloca0(priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || healed_sinks[i])
+ wind_subvols[i] = 1;
+ }
+
+ AFR_ONLIST(wind_subvols, frame, __checksum_cbk, rchecksum, fd, offset, size,
+ xdata);
+ if (xdata)
+ dict_unref(xdata);
+
+ if (!replies[source].valid || replies[source].op_ret != 0)
+ return _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source)
+ continue;
+ if (replies[i].valid) {
+ if (memcmp(replies[source].checksum, replies[i].checksum,
+ replies[source].fips_mode_rchecksum
+ ? SHA256_DIGEST_LENGTH
+ : MD5_DIGEST_LENGTH)) {
+ checksum_match = _gf_false;
+ break;
+ }
+ }
+ }
+
+ if (checksum_match) {
+ if (HAS_HOLES(poststat))
+ return _gf_true;
+
+ /* For non-sparse files, we might be better off writing the
+ * zeroes to sinks to avoid mismatch of disk-usage in bricks. */
+ if (local->replies[source].buf_has_zeroes)
+ return _gf_false;
+ else
+ return _gf_true;
+ }
+out:
+ return _gf_false;
+}
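
The skip decision above, condensed into a stand-alone predicate; this helper is a sketch for illustration only and does not exist in the tree:

static gf_boolean_t
example_should_skip_matching_block(gf_boolean_t checksums_match,
                                   gf_boolean_t source_is_sparse,
                                   gf_boolean_t source_buf_has_zeroes)
{
    if (!checksums_match)
        return _gf_false; /* contents differ: heal the block */
    if (source_is_sparse)
        return _gf_true;  /* keep holes as holes on the sinks */
    /* Dense source: write matching zero-filled blocks anyway so that
     * disk usage stays comparable across bricks. */
    return source_buf_has_zeroes ? _gf_false : _gf_true;
}
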
+
+static gf_boolean_t
+__afr_is_sink_zero_filled(xlator_t *this, fd_t *fd, size_t size, off_t offset,
+ int sink)
+{
+ afr_private_t *priv = NULL;
+ struct iobref *iobref = NULL;
+ struct iovec *iovec = NULL;
+ int count = 0;
+ int ret = 0;
+ gf_boolean_t zero_filled = _gf_false;
+
+ priv = this->private;
+ ret = syncop_readv(priv->children[sink], fd, size, offset, 0, &iovec,
+ &count, &iobref, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ ret = iov_0filled(iovec, count);
+ if (!ret)
+ zero_filled = _gf_true;
+out:
+ if (iovec)
+ GF_FREE(iovec);
+ if (iobref)
+ iobref_unref(iobref);
+ return zero_filled;
}
+static int
+__afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size,
+ struct afr_reply *replies, int type)
+{
+ struct iovec *iovec = NULL;
+ int count = 0;
+ struct iobref *iobref = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ ret = syncop_readv(priv->children[source], fd, size, offset, 0, &iovec,
+ &count, &iobref, NULL, NULL, NULL);
+ if (ret <= 0)
+ return ret;
-int
-afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "flush or setattr failed on %s on subvolume %s: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_done (frame, this);
- }
-
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+
+ /*
+ * TODO: Use fiemap() and discard() to heal holes
+ * in the future.
+ *
+ * For now,
+ *
+ * - if the source had any holes at all,
+ * AND
+ * - if we are writing past the original file size
+ * of the sink
+ * AND
+ * - is NOT the last block of the source file. if
+ * the block contains EOF, it has to be written
+ * in order to set the file size even if the
+ * last block is 0-filled.
+ * AND
+ * - if the read buffer is filled with only 0's
+ *
+ * then, skip writing to this sink. We don't depend
+ * on the write to happen to update the size as we
+ * have performed an ftruncate() upfront anyways.
+ */
+#define is_last_block(o, b, s) ((s >= o) && (s <= (o + b)))
+ if (HAS_HOLES((&replies[source].poststat)) &&
+ offset >= replies[i].poststat.ia_size &&
+ !is_last_block(offset, size, replies[source].poststat.ia_size) &&
+ (iov_0filled(iovec, count) == 0))
+ continue;
+
+ /* Avoid filling up sparse regions of the sink with 0-filled
+ * writes.*/
+ if (type == AFR_SELFHEAL_DATA_FULL &&
+ HAS_HOLES((&replies[source].poststat)) &&
+ ((offset + size) <= replies[i].poststat.ia_size) &&
+ (iov_0filled(iovec, count) == 0) &&
+ __afr_is_sink_zero_filled(this, fd, size, offset, i)) {
+ continue;
+ }
+
+ ret = syncop_writev(priv->children[i], fd, iovec, count, offset, iobref,
+ 0, NULL, NULL, NULL, NULL);
+ if (ret != iov_length(iovec, count)) {
+ /* write() failed on this sink. unset the corresponding
+ member in sinks[] (which is healed_sinks[] in the
+ caller) so that this server does NOT get considered
+ as successfully healed.
+ */
+ healed_sinks[i] = 0;
+ }
+ }
+ if (iovec)
+ GF_FREE(iovec);
+ if (iobref)
+ iobref_unref(iobref);
+
+ return ret;
+}
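
A worked check of the is_last_block() rule above, with purely illustrative sizes (1 MiB source, 128 KiB heal blocks); the helper is hypothetical:

static void
example_is_last_block_check(void)
{
    uint64_t src_size = 1048576; /* source ia_size: 1 MiB */
    uint64_t block = 131072;     /* heal block: 128 KiB */

    /* The block at offset 917504 contains EOF, so it is always written,
     * even when zero-filled, to establish the sink's file size. */
    GF_ASSERT(src_size >= 917504 && src_size <= 917504 + block);

    /* The block at offset 786432 does not contain EOF; if the source is
     * sparse and the read buffer is all zeroes, it may be skipped. */
    GF_ASSERT(!(src_size >= 786432 && src_size <= 786432 + block));
}
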
+
+static gf_boolean_t
+afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source,
+ unsigned char *healed_sinks)
+{
+ afr_private_t *priv = this->private;
+ int i = 0;
+
+ if (!locked_on[source])
+ return _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i] && locked_on[i])
+ return _gf_true;
+ }
+
+ return _gf_false;
}
-
-int
-afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *statpre,
- struct iatt *statpost)
-{
- afr_sh_data_flush_cbk (frame, cookie, this, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_data_close (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks, off_t offset,
+ size_t size, int type, struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
- int source = 0;
- int32_t valid = 0;
- struct iatt stbuf = {0,};
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME);
-
- stbuf.ia_atime = sh->buf[source].ia_atime;
- stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec;
- stbuf.ia_mtime = sh->buf[source].ia_mtime;
- stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec;
-
- if (sh->healing_fd_opened) {
- /* not our job to close the fd */
-
- afr_sh_data_done (frame, this);
- return 0;
- }
-
- if (!sh->healing_fd) {
- afr_sh_data_done (frame, this);
- return 0;
- }
-
- call_count = (sh->active_sinks + 1) * 2;
- local->call_count = call_count;
-
- /* closed source */
- gf_log (this->name, GF_LOG_TRACE,
- "closing fd of %s on %s",
- local->loc.path, priv->children[sh->source]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
- (void *) (long) sh->source,
- priv->children[sh->source],
- priv->children[sh->source]->fops->flush,
- sh->healing_fd);
- call_count--;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk,
- (void *) (long) sh->source,
- priv->children[sh->source],
- priv->children[sh->source]->fops->setattr,
- &local->loc, &stbuf, valid);
-
- call_count--;
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ unsigned char *data_lock = NULL;
- if (call_count == 0)
- return 0;
+ priv = this->private;
+ data_lock = alloca0(priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "closing fd of %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- sh->healing_fd);
-
- call_count--;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc, &stbuf, valid);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
-
-int
-afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t * local = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "locking inode of %s on child %d failed: %s",
- local->loc.path, child_index,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "inode of %s on child %d locked",
- local->loc.path, child_index);
- }
+ ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size,
+ data_lock);
+ {
+ if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) {
+ ret = -ENOTCONN;
+ goto unlock;
}
- UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_close (frame, this);
+ if (type == AFR_SELFHEAL_DATA_DIFF &&
+ __afr_can_skip_data_block_heal(frame, this, fd, source,
+ healed_sinks, offset, size,
+ &replies[source].poststat)) {
+ ret = 0;
+ goto unlock;
}
- return 0;
+ ret = __afr_selfheal_data_read_write(
+ frame, this, fd, source, healed_sinks, offset, size, replies, type);
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, fd->inode, this->name, offset, size,
+ data_lock);
+ return ret;
}
-
-int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- GF_ASSERT (!sh->data_lock_held);
-
- int_lock->lock_cbk = afr_sh_data_close;
- afr_unlock (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_data_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *healed_sinks)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- gf_log (this->name, GF_LOG_DEBUG,
- "finishing data selfheal of %s", local->loc.path);
-
- if (!sh->data_lock_held)
- afr_sh_data_unlock (frame, this);
- else
- afr_sh_data_close (frame, this);
+ local = frame->local;
+ priv = this->private;
+ if (!priv->ensure_durability)
return 0;
-}
-
-
-int
-afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
-{
- int call_count = 0;
-
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- afr_sh_data_finish (frame, this);
+ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, fsync, fd, 0, NULL);
- return 0;
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret != 0)
+ /* fsync() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+ return 0;
}
-
-int
-afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
+static int
+afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks,
+ int source, struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success,
- priv->child_count, AFR_DATA_TRANSACTION);
-
- erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count,
- gf_afr_mt_dict_t);
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int i = 0;
+ if (priv->data_self_heal_algorithm == AFR_SELFHEAL_DATA_DYNAMIC) {
+ type = AFR_SELFHEAL_DATA_FULL;
for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
-
- erase_xattr[i] = get_new_dict();
- dict_ref (erase_xattr[i]);
- }
- }
-
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
- priv->child_count, AFR_DATA_TRANSACTION);
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "erasing pending flags from %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- sh->healing_fd,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
- if (!--call_count)
- break;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
- }
- }
- GF_FREE (erase_xattr);
-
- return 0;
-}
-
-
-int
-afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int call_count = 0;
- int child_index = 0;
-
- priv = this->private;
- local = frame->local;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1)
- gf_log (this->name, GF_LOG_INFO,
- "ftruncate of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- else
- gf_log (this->name, GF_LOG_TRACE,
- "ftruncate of %s on subvolume %s completed",
- local->loc.path,
- priv->children[child_index]->name);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_erase_pending (frame, this);
+ if (!healed_sinks[i] && i != source)
+ continue;
+ if (replies[i].poststat.ia_size) {
+ type = AFR_SELFHEAL_DATA_DIFF;
+ break;
+ }
}
-
- return 0;
+ } else {
+ type = priv->data_self_heal_algorithm;
+ }
+ return type;
}
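
Restated as a minimal sketch (hypothetical helper, not part of this change): with the dynamic algorithm, FULL is chosen only when the source and every healed sink are empty; any non-zero size switches to DIFF.

static int
example_dynamic_algo_choice(uint64_t source_size, uint64_t largest_sink_size)
{
    if (source_size == 0 && largest_sink_size == 0)
        return AFR_SELFHEAL_DATA_FULL; /* nothing worth checksumming yet */
    return AFR_SELFHEAL_DATA_DIFF;     /* rewrite only mismatching blocks */
}
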
-
-int
-afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
+ unsigned char *healed_sinks, struct afr_reply *replies)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
- int *sources = NULL;
- int call_count = 0;
- int i = 0;
-
+ afr_private_t *priv = NULL;
+ off_t off = 0;
+ size_t block = 0;
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int ret = -1;
+ call_frame_t *iter_frame = NULL;
+ unsigned char arbiter_sink_status = 0;
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing data selfheal on %s", uuid_utoa(fd->inode->gfid));
- sources = sh->sources;
- call_count = sh->active_sinks;
+ priv = this->private;
+ if (priv->arbiter_count) {
+ arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
+ healed_sinks[ARBITER_BRICK_INDEX] = 0;
+ }
- local->call_count = call_count;
+ block = 128 * 1024 * priv->data_self_heal_window_size;
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
+ type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies);
- STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- sh->healing_fd, sh->file_size);
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame) {
+ ret = -ENOMEM;
+ goto out;
+ }
- if (!--call_count)
- break;
+ for (off = 0; off < replies[source].poststat.ia_size; off += block) {
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ ret = -ENOTCONN;
+ goto out;
}
- return 0;
-}
-
+ ret = afr_selfheal_data_block(iter_frame, this, fd, source,
+ healed_sinks, off, block, type, replies);
+ if (ret < 0)
+ goto out;
-static struct afr_sh_algorithm *
-sh_algo_from_name (xlator_t *this, char *name)
-{
- int i = 0;
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+ }
- while (afr_self_heal_algorithms[i].name) {
- if (!strcmp (name, afr_self_heal_algorithms[i].name)) {
- return &afr_self_heal_algorithms[i];
- }
+ ret = afr_selfheal_data_fsync(frame, this, fd, healed_sinks);
- i++;
- }
+out:
+ if (arbiter_sink_status)
+ healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status;
- return NULL;
+ if (iter_frame)
+ AFR_STACK_DESTROY(iter_frame);
+ return ret;
}
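
A sketch of the chunk-size arithmetic used above; the window size of 1 is an assumption about the volume configuration, and the helper is illustrative:

static uint64_t
example_heal_block_size(uint64_t data_self_heal_window_size)
{
    /* 128 KiB per heal unit, scaled by the window option; a window of 1
     * gives 131072-byte chunks, so a 1 MiB file is healed in 8 passes of
     * afr_selfheal_data_block(). */
    return 128 * 1024 * data_self_heal_window_size;
}
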
-
static int
-sh_zero_byte_files_exist (afr_self_heal_t *sh, int child_count)
+__afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *healed_sinks, uint64_t size)
{
- int i;
- int ret = 0;
-
- for (i = 0; i < child_count; i++) {
- if (sh->buf[i].ia_size == 0) {
- ret = 1;
- break;
- }
- }
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- return ret;
-}
+ local = frame->local;
+ priv = this->private;
+ /* This will send truncate on the arbiter brick as well if it is marked as
+ * a sink. If changelog is enabled on the volume, it captures the truncate as
+ * a data transaction on the arbiter brick. This helps geo-rep sync the data
+ * properly from master to slave when the arbiter is the ACTIVE brick during
+ * syncing and some of its entries had been healed for data as part of self
+ * heal.
+ */
+ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size,
+ NULL);
-struct afr_sh_algorithm *
-afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- struct afr_sh_algorithm * algo = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- algo = sh_algo_from_name (this, priv->data_self_heal_algorithm);
-
- if (algo == NULL) {
- /* option not set, so fall back on heuristics */
-
- if ((local->enoent_count != 0)
- || sh_zero_byte_files_exist (sh, priv->child_count)
- || (sh->file_size <= (priv->data_self_heal_window_size *
- this->ctx->page_size))) {
-
- /*
- * If the file does not exist on one of the subvolumes,
- * or a zero-byte file exists (created by entry self-heal)
- * the entire content has to be copied anyway, so there
- * is no benefit from using the "diff" algorithm.
- *
- * If the file size is about the same as page size,
- * the entire file can be read and written with a few
- * (pipelined) STACK_WINDs, which will be faster
- * than "diff" which has to read checksums and then
- * read and write.
- */
-
- algo = sh_algo_from_name (this, "full");
-
- } else {
- algo = sh_algo_from_name (this, "diff");
- }
- }
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret == -1)
+ /* truncate() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
- return algo;
+ return 0;
}
-
-int
-afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_has_source_witnesses(xlator_t *this, unsigned char *sources,
+ uint64_t *witness)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
- int source = 0;
- int i = 0;
- struct afr_sh_algorithm *sh_algo = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ int i = 0;
+ afr_private_t *priv = NULL;
- source = sh->source;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
- }
- }
- sh->success[source] = 1;
-
- if (active_sinks == 0) {
- gf_log (this->name, GF_LOG_INFO,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_data_finish (frame, this);
- return 0;
- }
- sh->active_sinks = active_sinks;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing file %s from subvolume %s to %d other",
- local->loc.path, priv->children[source]->name, active_sinks);
-
- sh->algo_completion_cbk = afr_sh_data_trim_sinks;
- sh->algo_abort_cbk = afr_sh_data_finish;
-
- sh_algo = afr_sh_data_pick_algo (frame, this);
-
- sh_algo->fn (frame, this);
-
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] && witness[i])
+ return _gf_true;
+ }
+ return _gf_false;
}
-
-int
-afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+afr_does_size_mismatch(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_local_t * orig_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_build_pending_matrix (priv->pending_key, sh->pending_matrix,
- sh->xattr, AFR_DATA_TRANSACTION,
- priv->child_count);
-
- afr_sh_print_pending_matrix (sh->pending_matrix, this);
-
- nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf,
- priv->child_count, AFR_SELF_HEAL_DATA,
- sh->child_success, this->name);
-
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Picking favorite child %s as authentic source to "
- "resolve conflicting data of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
- }
-
- if (nsources == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to self-heal contents of '%s' (possible "
- "split-brain). Please delete the file from all but "
- "the preferred subvolume.", local->loc.path);
-
- local->govinda_gOvinda = 1;
-
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
-
- if (source == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No active sources found.");
-
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- sh->source = source;
- sh->block_size = 65536; /* TODO: make it configurable or use macro */
- sh->file_size = sh->buf[source].ia_size;
-
- if (FILE_HAS_HOLES (&sh->buf[source]))
- sh->file_has_holes = 1;
-
- orig_local = sh->orig_frame->local;
- orig_local->cont.lookup.buf.ia_size = sh->buf[source].ia_size;
-
- /* detect changes not visible through pending flags -- JIC */
- for (i = 0; i < priv->child_count; i++) {
- if (i == source || sh->child_errno[i])
- continue;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct iatt *min = NULL;
+ struct iatt *max = NULL;
- if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
- }
+ priv = this->private;
- afr_set_read_child (this, local->loc.inode, sh->source);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
- /*
- quick-read might have read the file, so send xattr from
- the source subvolume (http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=815)
- */
+ if (replies[i].op_ret < 0)
+ continue;
- dict_unref (orig_local->cont.lookup.xattr);
- if (orig_local->cont.lookup.xattrs)
- orig_local->cont.lookup.xattr = dict_ref (orig_local->cont.lookup.xattrs[sh->source]);
+ if (!sources[i])
+ continue;
- if (sh->background) {
- sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno);
- sh->unwound = _gf_true;
- }
+ if (AFR_IS_ARBITER_BRICK(priv, i) && (replies[i].poststat.ia_size == 0))
+ continue;
- afr_sh_data_sync_prepare (frame, this);
+ if (!min)
+ min = &replies[i].poststat;
- return 0;
-}
+ if (!max)
+ max = &replies[i].poststat;
-gf_boolean_t
-afr_is_fresh_read_child (int32_t *sources, int32_t child_count,
- int32_t read_child)
-{
- gf_boolean_t is_fresh_child = _gf_false;
+ if (min->ia_size > replies[i].poststat.ia_size)
+ min = &replies[i].poststat;
- GF_ASSERT (read_child < child_count);
-
- if ((read_child >= 0) && (read_child < child_count) &&
- sources[read_child]) {
- is_fresh_child = _gf_true;
- }
- return is_fresh_child;
-}
+ if (max->ia_size < replies[i].poststat.ia_size)
+ max = &replies[i].poststat;
+ }
-static int
-afr_select_read_child_from_policy (int32_t *sources, int32_t child_count,
- int32_t prev_read_child,
- int32_t config_read_child,
- int32_t *valid_children)
-{
- int32_t read_child = -1;
- int i = 0;
-
- GF_ASSERT (sources);
-
- read_child = prev_read_child;
- if (_gf_true == afr_is_fresh_read_child (sources, child_count,
- read_child))
- goto out;
-
- read_child = config_read_child;
- if (_gf_true == afr_is_fresh_read_child (sources, child_count,
- read_child))
- goto out;
-
- for (i = 0; i < child_count; i++) {
- read_child = valid_children[i];
- if (read_child < 0)
- break;
- if (_gf_true == afr_is_fresh_read_child (sources, child_count,
- read_child))
- goto out;
- }
- read_child = -1;
+ if (min && max) {
+ if (min->ia_size != max->ia_size)
+ return _gf_true;
+ }
-out:
- return read_child;
+ return _gf_false;
}
static void
-afr_destroy_pending_matrix (int32_t **pending_matrix, int32_t child_count)
+afr_mark_biggest_witness_as_source(xlator_t *this, unsigned char *sources,
+ uint64_t *witness)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t biggest_witness = 0;
+
+ priv = this->private;
+ /* Find source with biggest witness count */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (biggest_witness < witness[i])
+ biggest_witness = witness[i];
+ }
+
+ /* Mark bricks with a lower witness count as non-sources */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (witness[i] < biggest_witness)
+ sources[i] = 0;
+ }
+
+ return;
+}
+
+/* This is a tie-breaker function. Only one source will be assigned here */
+static void
+afr_mark_newest_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
{
- int i = 0;
- GF_ASSERT (child_count > 0);
- if (pending_matrix) {
- for (i = 0; i < child_count; i++) {
- if (pending_matrix[i])
- GF_FREE (pending_matrix[i]);
- }
- GF_FREE (pending_matrix);
- }
-}
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ uint32_t max_ctime = 0;
-static int32_t**
-afr_create_pending_matrix (int32_t child_count)
-{
- gf_boolean_t cleanup = _gf_false;
- int32_t **pending_matrix = NULL;
- int i = 0;
-
- GF_ASSERT (child_count > 0);
-
- pending_matrix = GF_CALLOC (sizeof (*pending_matrix), child_count,
- gf_afr_mt_int32_t);
- if (NULL == pending_matrix)
- goto out;
- for (i = 0; i < child_count; i++) {
- pending_matrix[i] = GF_CALLOC (sizeof (**pending_matrix),
- child_count,
- gf_afr_mt_int32_t);
- if (NULL == pending_matrix[i]) {
- cleanup = _gf_true;
- goto out;
- }
- }
-out:
- if (_gf_true == cleanup) {
- afr_destroy_pending_matrix (pending_matrix, child_count);
- pending_matrix = NULL;
- }
- return pending_matrix;
-}
+ priv = this->private;
+ /* Find source with latest ctime */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
-int
-afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
- dict_t **xattr,
- afr_transaction_type txn_type)
-{
- afr_private_t *priv = NULL;
- int read_child = -1;
- int ret = -1;
- int32_t **pending_matrix = NULL;
- int32_t *sources = NULL;
- int32_t *valid_children = NULL;
- struct iatt *bufs = NULL;
- int32_t nsources = 0;
- int32_t prev_read_child = -1;
- int32_t config_read_child = -1;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
-
- priv = this->private;
- bufs = local->cont.lookup.bufs;
- valid_children = local->cont.lookup.child_success;
- sh = &local->self_heal;
-
- pending_matrix = afr_create_pending_matrix (priv->child_count);
- if (NULL == pending_matrix)
- goto out;
-
- sources = local->cont.lookup.sources;
- memset (sources, 0, sizeof (*sources) * priv->child_count);
- afr_build_pending_matrix (priv->pending_key, pending_matrix,
- xattr, txn_type, priv->child_count);
-
- sh_type = afr_self_heal_type_for_transaction (txn_type);
- if (AFR_SELF_HEAL_INVALID == sh_type)
- goto out;
-
- nsources = afr_mark_sources (sources, pending_matrix, bufs,
- priv->child_count, sh_type,
- valid_children, this->name);
- if (nsources < 0) {
- ret = -1;
- goto out;
+ if (max_ctime <= replies[i].poststat.ia_ctime) {
+ source = i;
+ max_ctime = replies[i].poststat.ia_ctime;
}
+ }
- prev_read_child = local->read_child_index;
- config_read_child = priv->read_child;
- read_child = afr_select_read_child_from_policy (sources,
- priv->child_count,
- prev_read_child,
- config_read_child,
- valid_children);
- ret = 0;
-out:
- afr_destroy_pending_matrix (pending_matrix, priv->child_count);
- gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", read_child);
- return read_child;
+ /* Only mark one of the files as source to break ties */
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ sources[source] = 1;
}
-int
-afr_sh_data_special_file_fix (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++)
- if (1 == local->child_up[i])
- sh->success[i] = 1;
-
- afr_sh_data_erase_pending (frame, this);
-
- return 0;
-}
+static int
+__afr_selfheal_data_finalize_source(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ unsigned char *undid_pending, struct afr_reply *replies, uint64_t *witness)
+{
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int sources_count = 0;
+ priv = this->private;
+
+ sources_count = AFR_COUNT(sources, priv->child_count);
+
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count) {
+ /* split brain */
+ source = afr_mark_split_brain_source_sinks(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ replies, AFR_DATA_TRANSACTION);
+ if (source < 0) {
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=data;"
+ "file=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(inode->gfid));
+ return -EIO;
+ }
+
+ _afr_fav_child_reset_sink_xattrs(
+ frame, this, inode, source, healed_sinks, undid_pending,
+ AFR_DATA_TRANSACTION, locked_on, replies);
+ goto out;
+ }
+
+ /* No split brain at this point. If we were called from
+ * afr_heal_splitbrain_file(), abort.*/
+ if (afr_dict_contains_heal_op(frame))
+ return -EIO;
+
+ /* If there are no witnesses/size-mismatches on sources we are done*/
+ if (!afr_does_size_mismatch(this, sources, replies) &&
+ !afr_has_source_witnesses(this, sources, witness))
+ goto out;
+
+ afr_mark_largest_file_as_source(this, sources, replies);
+ afr_mark_biggest_witness_as_source(this, sources, witness);
+ afr_mark_newest_file_as_source(this, sources, replies);
+ if (priv->arbiter_count)
+ /* Choose non-arbiter brick as source for empty files. */
+ afr_mark_source_sinks_if_file_empty(this, sources, sinks, healed_sinks,
+ locked_on, replies,
+ AFR_DATA_TRANSACTION);
-int
-afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "fstat of %s on %s succeeded",
- local->loc.path,
- priv->children[child_index]->name);
-
- sh->buf[child_index] = *buf;
- sh->child_success[sh->success_count] = child_index;
- sh->success_count++;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- /* Previous versions of glusterfs might have set
- * the pending data xattrs which need to be erased
- */
- if (IA_ISREG (buf->ia_type))
- afr_sh_data_fix (frame, this);
- else
- afr_sh_data_special_file_fix (frame, this);
- }
+out:
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ source = afr_choose_source_by_policy(priv, sources, AFR_DATA_TRANSACTION);
- return 0;
+ return source;
}
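
The source-selection cascade implemented above, summarized as a sketch (details such as the arbiter empty-file handling are omitted):

/* 1. No source, or every locked brick is a sink  -> split-brain resolution
 *    (afr_mark_split_brain_source_sinks), -EIO if that also fails.
 * 2. Sizes agree and no source carries witnesses -> keep computed sources.
 * 3. Otherwise narrow sources to the largest files, then the biggest
 *    witness count, then the newest ctime as the single tie-breaker.
 * 4. Finally pick one source among the survivors by read policy
 *    (afr_choose_source_by_policy). */
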
-
+/*
+ * __afr_selfheal_data_prepare:
+ *
+ * This function inspects the on-disk xattrs and determines which subvols
+ * are sources and sinks.
+ *
+ * The return value is the index of the subvolume to be used as the source
+ * for self-healing, or a negative errno (-EIO for split brain) if healing
+ * cannot proceed.
+ */
int
-afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)
+__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *pflag)
{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ uint64_t *witness = NULL;
- local->call_count = call_count;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++)
- sh->child_success[i] = -1;
- sh->success_count = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fstat,
- sh->healing_fd);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+ if (ret)
+ return ret;
-int
-afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "fxattrop of %s on %s succeeded",
- local->loc.path,
- priv->children[child_index]->name);
-
- sh->xattr[child_index] = dict_ref (xattr);
- }
- }
- UNLOCK (&frame->lock);
+ witness = alloca0(priv->child_count * sizeof(*witness));
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_DATA_TRANSACTION, locked_on, sources,
+ sinks, witness, pflag);
+ if (ret)
+ return ret;
- call_count = afr_frame_return (frame);
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
- if (call_count == 0) {
- afr_sh_data_fstat (frame, this);
- }
+ source = __afr_selfheal_data_finalize_source(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ undid_pending, replies, witness);
+ if (source < 0)
+ return -EIO;
- return 0;
+ return source;
}
-
-int
-afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this)
-{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t *xattr_req = NULL;
- int32_t *zero_pending = NULL;
- int call_count = 0;
- int i = 0;
- int ret = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
-
- local->call_count = call_count;
-
- xattr_req = dict_new();
- if (!xattr_req) {
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- zero_pending = GF_CALLOC (3, sizeof (int32_t), gf_common_mt_int32_t);
- if (!zero_pending) {
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
- goto out;
- }
- ret = dict_set_dynptr (xattr_req, priv->pending_key[i],
- zero_pending, 3 * sizeof (int32_t));
- if (ret < 0) {
- GF_FREE (zero_pending);
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value");
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- sh->healing_fd, GF_XATTROP_ADD_ARRAY,
- xattr_req);
-
- if (!--call_count)
- break;
- }
+static int
+__afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+ gf_boolean_t did_sh = _gf_true;
+ gf_boolean_t is_arbiter_the_only_sink = _gf_false;
+ gf_boolean_t empty_file = _gf_false;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ data_lock = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "self-heal as only %d number "
+ "of subvolumes "
+ "could be locked",
+ uuid_utoa(fd->inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_data_prepare(frame, this, fd->inode, data_lock,
+ sources, sinks, healed_sinks,
+ undid_pending, locked_replies, NULL);
+ if (ret < 0)
+ goto unlock;
+
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ did_sh = _gf_false;
+ goto unlock;
+ }
+
+ source = ret;
+
+ if (AFR_IS_ARBITER_BRICK(priv, source)) {
+ empty_file = afr_is_file_empty_on_all_children(priv,
+ locked_replies);
+ if (empty_file)
+ goto restore_time;
+
+ did_sh = _gf_false;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_truncate_sinks(
+ frame, this, fd, healed_sinks,
+ locked_replies[source].poststat.ia_size);
+ if (ret < 0)
+ goto unlock;
+
+ if (priv->arbiter_count &&
+ AFR_COUNT(healed_sinks, priv->child_count) == 1 &&
+ healed_sinks[ARBITER_BRICK_INDEX]) {
+ is_arbiter_the_only_sink = _gf_true;
+ goto restore_time;
}
-
- dict_unref (xattr_req);
- return 0;
-
+ ret = 0;
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock);
+ if (ret < 0)
+ goto out;
+
+ if (!did_sh)
+ goto out;
+
+ ret = afr_selfheal_data_do(frame, this, fd, source, healed_sinks,
+ locked_replies);
+ if (ret)
+ goto out;
+restore_time:
+ afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
+ locked_replies);
+
+ if (!is_arbiter_the_only_sink && !empty_file) {
+ ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ if (ret < priv->child_count) {
+ ret = -ENOTCONN;
+ did_sh = _gf_false;
+ goto skip_undo_pending;
+ }
+ }
+ ret = afr_selfheal_undo_pending(
+ frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
+ AFR_DATA_TRANSACTION, locked_replies, data_lock);
+skip_undo_pending:
+ afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock);
out:
- if (xattr_req)
- dict_unref (xattr_req);
- sh->op_failed = 1;
- afr_sh_data_done (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Non Blocking data inodelks "
- "failed for %s.", local->loc.path);
- sh->op_failed = 1;
- afr_sh_data_done (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks "
- "done for %s. Proceeding to FOP", local->loc.path);
- afr_sh_data_fxattrop (frame, this);
- }
-
- return 0;
-}
-
-int
-afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK;
+ if (did_sh)
+ afr_log_selfheal(fd->inode->gfid, this, ret, "data", source, sources,
+ healed_sinks);
+ else
+ ret = 1;
- afr_set_lock_number (frame, this);
+ if (locked_replies)
+ afr_replies_wipe(locked_replies, priv->child_count);
- int_lock->lk_flock.l_start = 0;
- int_lock->lk_flock.l_len = 0;
- int_lock->lk_flock.l_type = F_WRLCK;
- int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk;
-
- afr_nonblocking_inodelk (frame, this);
-
- return 0;
+ return ret;
}
-
int
-afr_sh_data_lock (call_frame_t *frame, xlator_t *this)
+afr_selfheal_data_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t * sh = NULL;
+ afr_local_t *local = NULL;
+ int i = (long)cookie;
+ local = frame->local;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- if (sh->data_lock_held) {
- /* caller has held the lock already,
- so skip locking */
+ syncbarrier_wake(&local->barrier);
- afr_sh_data_fxattrop (frame, this);
- return 0;
- }
-
- return afr_sh_data_lock_rec (frame, this);
+ return 0;
}
-
int
-afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
+afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "open of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "open of %s succeeded on child %s",
- local->loc.path,
- priv->children[child_index]->name);
- }
- UNLOCK (&frame->lock);
+ int ret = 0;
+ fd_t *fd_tmp = NULL;
+ loc_t loc = {
+ 0,
+ };
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- call_count = afr_frame_return (frame);
+ priv = this->private;
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_data_finish (frame, this);
- return 0;
- }
+ fd_tmp = fd_create(inode, 0);
+ if (!fd_tmp)
+ return -ENOMEM;
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened, commencing sync",
- local->loc.path);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- afr_sh_data_lock (frame, this);
- }
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ fd_unref(fd_tmp);
+ goto out;
+ }
+ local = frame->local;
- return 0;
-}
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_data_open_cbk, open, &loc,
+ O_RDWR | O_LARGEFILE, fd_tmp, NULL);
+ ret = -ENOTCONN;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
-int
-afr_sh_data_open (call_frame_t *frame, xlator_t *this)
-{
- int i = 0;
- int call_count = 0;
- fd_t *fd = NULL;
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (sh->healing_fd_opened) {
- /* caller has opened the fd for us already, so skip open */
-
- afr_sh_data_lock (frame, this);
- return 0;
+ if (local->replies[i].op_ret < 0) {
+ ret = -local->replies[i].op_errno;
+ continue;
}
- call_count = afr_up_children_count (priv->child_count, local->child_up);
- local->call_count = call_count;
-
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
+ ret = 0;
+ break;
+ }
- /* open sinks */
- for (i = 0; i < priv->child_count; i++) {
- if(!local->child_up[i])
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc,
- O_RDWR|O_LARGEFILE, fd, 0);
-
- if (!--call_count)
- break;
- }
+ if (ret < 0) {
+ fd_unref(fd_tmp);
+ goto out;
+ } else {
+ fd_bind(fd_tmp);
+ }
- return 0;
+ *fd = fd_tmp;
+out:
+ loc_wipe(&loc);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return ret;
}
-
int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = this->private;
-
- local = frame->local;
- sh = &local->self_heal;
-
- if (sh->need_data_self_heal && priv->data_self_heal) {
- afr_sh_data_open (frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "not doing data self heal on %s",
- local->loc.path);
- afr_sh_data_done (frame, this);
- }
-
- return 0;
+afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ int ret = 0;
+ inode_t *inode = fd->inode;
+
+ priv = this->private;
+
+ locked_on = alloca0(priv->child_count);
+
+ ret = afr_selfheal_tie_breaker_inodelk(frame, this, inode, priv->sh_domain,
+ 0, 0, locked_on);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "self-heal as only %d number of "
+ "subvolumes could be locked",
+ uuid_utoa(fd->inode->gfid), ret);
+ /* Either fewer than two subvols are available, or another
+ selfheal (from another server) is in progress. Skip for
+ now; in either case there is nothing to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_data(frame, this, fd, locked_on);
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, inode, priv->sh_domain, 0, 0,
+ locked_on);
+
+ return ret;
}
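
Call-path summary of the data self-heal entry point ending here, as a sketch:

/* afr_selfheal_data()         - tie-breaker lock in priv->sh_domain
 * __afr_selfheal_data()       - full-file data lock: __afr_selfheal_data_prepare()
 *                               picks source/sinks, sinks get ftruncate'd,
 *                               then the full lock is dropped
 * afr_selfheal_data_do()      - per-block region locks: checksum/skip,
 *                               read from source, write to sinks, fsync
 * afr_selfheal_restore_time() - copy atime/mtime from the source
 * afr_selfheal_undo_pending() - clear pending xattrs under a fresh
 *                               full-file lock */
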
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 21b248f496c..64893f441e3 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -1,2229 +1,1276 @@
/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "inode.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-#define AFR_INIT_SH_FRAME_VALS(_frame, _local, _sh, _sh_frame, _sh_local, _sh_sh)\
- do {\
- _local = _frame->local;\
- _sh = &_local->self_heal;\
- _sh_frame = _sh->sh_frame;\
- _sh_local = _sh_frame->local;\
- _sh_sh = &_sh_local->self_heal;\
- } while (0);
-
-int
-afr_sh_entry_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- /*
- TODO: cleanup sh->*
- */
-
- if (sh->healing_fd)
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
-
- /* for (i = 0; i < priv->child_count; i++) { */
- /* sh->locked_nodes[i] = 0; */
- /* } */
-
- sh->completion_cbk (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_cbk = afr_sh_entry_done;
- afr_unlock (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "finishing entry selfheal of %s", local->loc.path);
-
- afr_sh_entry_unlock (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
-{
- long i = 0;
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *orig_local = NULL;
- call_frame_t *orig_frame = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
-
- if (op_ret == -1) {
- i = (long)cookie;
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to erase pending xattrs on %s (%s)",
- local->loc.path, priv->children[i]->name,
- strerror (op_errno));
- }
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- sh = &local->self_heal;
-
- orig_frame = sh->orig_frame;
- orig_local = orig_frame->local;
-
- if (sh->source != -1) {
- orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink;
- }
-
- afr_sh_entry_finish (frame, this);
- }
-
- return 0;
-}
-
+#include <glusterfs/byte-order.h>
+#include "afr-transaction.h"
+#include "afr-messages.h"
+#include <glusterfs/syncop-utils.h>
+#include <glusterfs/events.h>
int
-afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
- int need_unwind = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success,
- priv->child_count, AFR_ENTRY_TRANSACTION);
-
- erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count,
- gf_afr_mt_dict_t);
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
-
- erase_xattr[i] = get_new_dict();
- dict_ref (erase_xattr[i]);
- }
- }
-
- if (call_count == 0)
- need_unwind = 1;
-
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
- priv->child_count, AFR_ENTRY_TRANSACTION);
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "erasing pending flags from %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
- if (!--call_count)
- break;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
- }
- }
- GF_FREE (erase_xattr);
-
- if (need_unwind)
- afr_sh_entry_finish (frame, this);
-
- return 0;
-}
-
-
-
-static int
-next_active_source (call_frame_t *frame, xlator_t *this,
- int current_active_source)
+afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child,
+ struct afr_reply *replies,
+ gf_boolean_t *anon_inode)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int source = -1;
- int next_active_source = -1;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- source = sh->source;
-
- if (source != -1) {
- if (current_active_source != source)
- next_active_source = source;
- goto out;
- }
-
- /*
- the next active sink becomes the source for the
- 'conservative decision' of merging all entries
- */
-
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0)
- && (local->child_up[i] == 1)
- && (i > current_active_source)) {
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ int i = 0;
+ char g[64] = {0};
+ unsigned char *lookup_success = NULL;
+ call_frame_t *frame = NULL;
+ loc_t loc2 = {
+ 0,
+ };
+ loc_t loc = {
+ 0,
+ };
+
+ priv = this->private;
+ subvol = priv->children[child];
+ lookup_success = alloca0(priv->child_count);
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (replies[child].poststat.ia_type == IA_IFDIR) {
+ /* This directory may have sub-directory hierarchy which may need to
+ * be preserved for subsequent heals. So unconditionally move the
+ * directory to anonymous-inode directory*/
+ *anon_inode = _gf_true;
+ goto anon_inode;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid);
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ lookup_success[i] = 1;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ ret = -local->replies[i].op_errno;
+ }
+ }
+
+ if (priv->quorum_count) {
+ if (afr_has_quorum(lookup_success, this, NULL)) {
+ *anon_inode = _gf_true;
+ }
+ } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) {
+ *anon_inode = _gf_true;
+ } else if (ret) {
+ goto out;
+ }
+
+anon_inode:
+ if (!*anon_inode) {
+ ret = 0;
+ goto out;
+ }
+
+ loc.parent = inode_ref(dir);
+ gf_uuid_copy(loc.pargfid, dir->gfid);
+ loc.name = name;
+
+ ret = afr_anon_inode_create(this, child, &loc2.parent);
+ if (ret < 0)
+ goto out;
+
+ loc2.name = g;
+ ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s failed",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s successful",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
+ }
- next_active_source = i;
- break;
- }
- }
out:
- return next_active_source;
-}
-
-
-
-static int
-next_active_sink (call_frame_t *frame, xlator_t *this,
- int current_active_sink)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int next_active_sink = -1;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- /*
- the next active sink becomes the source for the
- 'conservative decision' of merging all entries
- */
+ loc_wipe(&loc);
+ loc_wipe(&loc2);
+ if (frame) {
+ AFR_STACK_DESTROY(frame);
+ }
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0)
- && (local->child_up[i] == 1)
- && (i > current_active_sink)) {
-
- next_active_sink = i;
- break;
- }
- }
-
- return next_active_sink;
+ return ret;
}
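
afr_selfheal_entry_anon_inode() above decides whether a mismatching entry is worth preserving: directories are always moved into the anonymous-inode directory, while other files are preserved only if enough bricks still resolve the gfid (a quorum when quorum-count is configured, otherwise more than one brick). A simplified model of that decision follows; the quorum test is approximated by a plain count, whereas the real code goes through afr_has_quorum().

#include <stdbool.h>

static int
count_set(const unsigned char *arr, int n)
{
    int c = 0, i;
    for (i = 0; i < n; i++)
        if (arr[i])
            c++;
    return c;
}

static bool
should_preserve_in_anon_dir(const unsigned char *lookup_success,
                            int child_count, int quorum_count,
                            bool is_directory)
{
    /* A directory may carry a subtree needed by later heals: always keep it. */
    if (is_directory)
        return true;

    if (quorum_count > 0)
        /* approximation of the real quorum check */
        return count_set(lookup_success, child_count) >= quorum_count;

    /* Without explicit quorum, keep the entry only if more than one brick
       still resolves its gfid. */
    return count_set(lookup_success, child_count) > 1;
}
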
int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src);
-
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src);
-
-int
-afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this,
- int active_src, int32_t op_ret,
- int32_t op_errno)
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies)
{
- int call_count = 0;
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_entry_expunge_subvol (frame, this, active_src);
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = (long) cookie;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- local = frame->local;
- sh = &local->self_heal;
-
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "setattr on parent directory of %s on subvolume %s failed: %s",
- expunge_local->loc.path,
- priv->children[active_src]->name, strerror (op_errno));
- }
-
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int active_src = 0;
- call_frame_t *frame = NULL;
- int32_t valid = 0;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
-
- active_src = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "removed %s on %s",
- expunge_local->loc.path,
- priv->children[active_src]->name);
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "removing %s on %s failed (%s)",
- expunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- }
-
- valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
- afr_build_parent_loc (&expunge_sh->parent_loc, &expunge_local->loc);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->setattr,
- &expunge_sh->parent_loc,
- &expunge_sh->parentbuf,
- valid);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "expunging file %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->unlink,
- &expunge_local->loc);
-
- return 0;
-}
-
-
-
-int
-afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "expunging directory %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->rmdir,
- &expunge_local->loc, 1);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
- int active_src, struct iatt *buf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int type = 0;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- local = frame->local;
- sh = &local->self_heal;
-
- type = buf->ia_type;
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- case IA_IFLNK:
- afr_sh_entry_expunge_unlink (expunge_frame, this, active_src);
- break;
+ char g[64] = {0};
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ gf_boolean_t anon_inode = _gf_false;
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ if ((!replies[child].valid) || (replies[child].op_ret < 0)) {
+ /*Nothing to do*/
+ ret = 0;
+ goto out;
+ }
+
+ if (priv->use_anon_inode) {
+ ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child,
+ replies, &anon_inode);
+ if (ret < 0 || anon_inode)
+ goto out;
+ }
+
+ loc.parent = inode_ref(dir);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ loc.name = name;
+ switch (replies[child].poststat.ia_type) {
case IA_IFDIR:
- afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src);
- break;
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+ break;
default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- expunge_local->loc.path,
- priv->children[active_src]->name, type);
- goto out;
- break;
- }
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
+ name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_unlink(subvol, &loc, NULL, NULL);
+ break;
+ }
- return 0;
out:
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, -1, EINVAL);
-
- return 0;
+ loc_wipe(&loc);
+ return ret;
}
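
afr_selfheal_entry_delete() dispatches purely on the entry's type: directories are removed with rmdir and everything else with unlink. The same dispatch in plain POSIX terms, using ordinary paths instead of gfid-based loc_t locations, so this is only a sketch of the idea:

#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>

static int
remove_entry_by_type(const char *path, mode_t type)
{
    int ret;

    if (S_ISDIR(type))
        ret = rmdir(path);   /* directories need rmdir */
    else
        ret = unlink(path);  /* regular files, symlinks, devices, ... */

    return ret == 0 ? 0 : -errno;
}
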
-
int
-afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *x,
- struct iatt *postparent)
+afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
+ unsigned char *sources, inode_t *dir,
+ const char *name, inode_t *inode,
+ struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- active_src = (long) cookie;
- local = frame->local;
- sh = &local->self_heal;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "lookup of %s on %s failed (%s)",
- expunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ loc_t srcloc = {
+ 0,
+ };
+ loc_t anonloc = {
+ 0,
+ };
+ xlator_t *this = frame->this;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ struct iatt *iatt = NULL;
+ char *linkname = NULL;
+ mode_t mode = 0;
+ struct iatt newent = {
+ 0,
+ };
+ unsigned char *newentry = NULL;
+ char iatt_uuid_str[64] = {0};
+ char dir_uuid_str[64] = {0};
+
+ priv = this->private;
+ iatt = &replies[source].poststat;
+ uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str);
+ if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED,
+ "Invalid ia_type (%d) or gfid(%s). source brick=%d, "
+ "pargfid=%s, name=%s",
+ iatt->ia_type, iatt_uuid_str, source,
+ uuid_utoa_r(dir->gfid, dir_uuid_str), name);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata)
+ return -ENOMEM;
+ newentry = alloca0(priv->child_count);
+ loc.parent = inode_ref(dir);
+ gf_uuid_copy(loc.pargfid, dir->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
+
+ ret = afr_selfheal_entry_delete(this, dir, name, inode, dst, replies);
+ if (ret)
+ goto out;
+
+ ret = dict_set_gfuuid(xdata, "gfid-req", replies[source].poststat.ia_gfid,
+ true);
+ if (ret)
+ goto out;
+
+ srcloc.inode = inode_ref(inode);
+ gf_uuid_copy(srcloc.gfid, iatt->ia_gfid);
+ ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
+ if (ret == -ENOENT || ret == -ESTALE) {
+ newentry[dst] = 1;
+ ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies,
+ sources, newentry);
+ if (ret)
+ goto out;
+ } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) {
+ // Try rename from hidden directory
+ ret = afr_anon_inode_create(this, dst, &anonloc.parent);
+ if (ret < 0)
+ goto out;
+ anonloc.inode = inode_ref(inode);
+ anonloc.name = iatt_uuid_str;
+ ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = -1; /*This sets 'mismatch' to true*/
+ goto out;
+ }
+
+ mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type);
+
+ switch (iatt->ia_type) {
+ case IA_IFDIR:
+ ret = syncop_mkdir(priv->children[dst], &loc, mode, 0, xdata, NULL);
+ break;
+ case IA_IFLNK:
+ if (!newentry[dst]) {
+ ret = syncop_link(priv->children[dst], &srcloc, &loc, &newent,
+ NULL, NULL);
+ } else {
+ ret = syncop_readlink(priv->children[source], &srcloc,
+ &linkname, 4096, NULL, NULL);
+ if (ret <= 0)
+ goto out;
+ ret = syncop_symlink(priv->children[dst], &loc, linkname, NULL,
+ xdata, NULL);
+ }
+ break;
+ default:
+ ret = dict_set_int32_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+ if (ret)
goto out;
- }
-
- afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf);
+ ret = syncop_mknod(
+ priv->children[dst], &loc, mode,
+ makedev(ia_major(iatt->ia_rdev), ia_minor(iatt->ia_rdev)),
+ &newent, xdata, NULL);
+ break;
+ }
- return 0;
out:
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->lookup,
- &expunge_local->loc, 0);
-
- return 0;
+ if (xdata)
+ dict_unref(xdata);
+ GF_FREE(linkname);
+ loc_wipe(&loc);
+ loc_wipe(&srcloc);
+ loc_wipe(&anonloc);
+ return ret;
}
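
afr_selfheal_recreate_entry() rebuilds the missing entry on the sink from the source's stat: mkdir for directories, link or symlink for symlinks, and mknod for every other type (regular files get their contents later from data self-heal). A plain-POSIX sketch of the same idea; src_path and dst_path are illustrative, the real code operates on gfids through syncops and also handles the anonymous-inode rename case.

#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <limits.h>
#include <errno.h>

static int
recreate_entry_posix(const char *src_path, const char *dst_path,
                     const struct stat *src_st)
{
    char target[PATH_MAX];
    ssize_t len;

    switch (src_st->st_mode & S_IFMT) {
    case S_IFDIR:
        return mkdir(dst_path, src_st->st_mode & 07777) ? -errno : 0;
    case S_IFLNK:
        len = readlink(src_path, target, sizeof(target) - 1);
        if (len < 0)
            return -errno;
        target[len] = '\0';
        return symlink(target, dst_path) ? -errno : 0;
    default:
        /* regular files, devices, fifos, sockets: create the inode now,
           contents (if any) are healed separately */
        return mknod(dst_path, src_st->st_mode, src_st->st_rdev) ? -errno : 0;
    }
}
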
-int
-afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *x,
- struct iatt *postparent)
+static int
+__afr_selfheal_heal_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int source = 0;
- call_frame_t *frame = NULL;
- int active_src = 0;
- int need_expunge = 0;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- active_src = expunge_sh->active_source;
- source = (long) cookie;
- local = frame->local;
- sh = &local->self_heal;
-
- if (op_ret == -1 && op_errno == ENOENT)
- need_expunge = 1;
- else if (op_ret == -1)
- goto out;
-
- if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) &&
- !uuid_is_null (buf->ia_gfid) &&
- (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) {
- char uuidbuf1[64];
- char uuidbuf2[64];
- gf_log (this->name, GF_LOG_DEBUG,
- "entry %s found on %s with mismatching gfid (%s/%s)",
- expunge_local->loc.path,
- priv->children[source]->name,
- uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1),
- uuid_utoa_r (buf->ia_gfid, uuidbuf2));
- need_expunge = 1;
- }
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
- if (need_expunge) {
- gf_log (this->name, GF_LOG_INFO,
- "missing entry %s on %s",
- expunge_local->loc.path,
- priv->children[source]->name);
+ priv = this->private;
- if (postparent)
- expunge_sh->parentbuf = *postparent;
+ if (!replies[source].valid)
+ return -EIO;
- afr_sh_entry_expunge_purge (expunge_frame, this, active_src);
+ /* Skip healing this entry if the last lookup on it failed for reasons
+ * other than ENOENT.
+ */
+ if ((replies[source].op_ret < 0) && (replies[source].op_errno != ENOENT))
+ return -replies[source].op_errno;
- return 0;
- }
-
-out:
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "%s exists under %s",
- expunge_local->loc.path,
- priv->children[source]->name);
+ if (replies[source].op_ret == 0) {
+ ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
+ source, sources,
+ &replies[source].poststat.ia_gfid, NULL);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+ if (replies[source].op_ret == -1 &&
+ replies[source].op_errno == ENOENT) {
+ ret = afr_selfheal_entry_delete(this, fd->inode, name, inode, i,
+ replies);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "looking up %s under %s failed (%s)",
- expunge_local->loc.path,
- priv->children[source]->name,
- strerror (op_errno));
- }
-
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
- gf_dirent_t *entry)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = -1;
- call_frame_t *expunge_frame = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int active_src = 0;
- int source = 0;
- int op_errno = 0;
- char *name = NULL;
- int op_ret = -1;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
- source = sh->source;
- sh->expunge_done = afr_sh_entry_expunge_entry_done;
-
- name = entry->d_name;
-
- if ((strcmp (name, ".") == 0)
- || (strcmp (name, "..") == 0)
- || ((strcmp (local->loc.path, "/") == 0)
- && (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0))) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- name, local->loc.path);
- op_ret = 0;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "inspecting existance of %s under %s",
- name, local->loc.path);
+ if (!gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[source].poststat.ia_gfid))
+ continue;
- expunge_frame = copy_frame (frame);
- if (!expunge_frame) {
- op_errno = ENOMEM;
- goto out;
- }
-
- ALLOC_OR_GOTO (expunge_local, afr_local_t, out);
-
- expunge_frame->local = expunge_local;
- expunge_sh = &expunge_local->self_heal;
- expunge_sh->sh_frame = frame;
- expunge_sh->active_source = active_src;
- expunge_sh->entrybuf = entry->d_stat;
-
- ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc, name);
- if (ret != 0) {
- op_errno = EINVAL;
- goto out;
+ ret = afr_selfheal_recreate_entry(frame, i, source, sources,
+ fd->inode, name, inode, replies);
}
+ if (ret < 0)
+ break;
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s", expunge_local->loc.path,
- priv->children[source]->name);
-
- STACK_WIND_COOKIE (expunge_frame,
- afr_sh_entry_expunge_entry_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->lookup,
- &expunge_local->loc, 0);
-
- ret = 0;
-out:
- if (ret == -1)
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
+ return ret;
}
-
-int
-afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- gf_dirent_t *entry = NULL;
- off_t last_offset = 0;
- int active_src = 0;
- int entry_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "readdir of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "readdir of %s on subvolume %s complete",
- local->loc.path,
- priv->children[active_src]->name);
- }
-
- afr_sh_entry_expunge_all (frame, this);
- return 0;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- entry_count++;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "readdir'ed %d entries from %s",
- entry_count, priv->children[active_src]->name);
-
- sh->offset = last_offset;
- local->call_count = entry_count;
-
- list_for_each_entry (entry, &entries->list, list) {
- afr_sh_entry_expunge_entry (frame, this, entry);
- }
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src)
+static int
+afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this,
+ struct afr_reply *replies,
+ inode_t *inode, uuid_t pargfid,
+ char *bname, int src_idx,
+ unsigned char *locked_on, int *src)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk,
- priv->children[active_src],
- priv->children[active_src]->fops->readdirp,
- sh->healing_fd, sh->block_size, sh->offset);
-
- return 0;
+ int i = 0;
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ void *gfid = NULL;
+ ia_type_t ia_type = IA_INVAL;
+
+ priv = this->private;
+ gfid = &replies[src_idx].poststat.ia_gfid;
+ ia_type = replies[src_idx].poststat.ia_type;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == src_idx)
+ continue;
+
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret != 0)
+ continue;
+
+ if (gf_uuid_is_null(replies[i].poststat.ia_gfid))
+ continue;
+
+ if (replies[i].poststat.ia_type == IA_INVAL)
+ continue;
+
+ if (ia_type == IA_INVAL || gf_uuid_is_null(gfid)) {
+ src_idx = i;
+ ia_type = replies[src_idx].poststat.ia_type;
+ gfid = &replies[src_idx].poststat.ia_gfid;
+ continue;
+ }
+
+ if (gf_uuid_compare(gfid, replies[i].poststat.ia_gfid) &&
+ (ia_type == replies[i].poststat.ia_type)) {
+ ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
+ bname, src_idx, i, locked_on, src,
+ NULL);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Skipping conservative merge on the "
+ "file.");
+ return ret;
+ }
+
+ if (ia_type != replies[i].poststat.ia_type) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Type mismatch detected "
+ "for <gfid:%s>/%s>, %s on %s and %s on %s. "
+ "Skipping conservative merge on the file.",
+ uuid_utoa(pargfid), bname,
+ gf_inode_type_to_str(replies[i].poststat.ia_type),
+ priv->children[i]->name,
+ gf_inode_type_to_str(replies[src_idx].poststat.ia_type),
+ priv->children[src_idx]->name);
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=file;"
+ "file=<gfid:%s>/%s>;count=2;child-%d=%s;type-"
+ "%d=%s;child-%d=%s;type-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(pargfid), bname, i, priv->children[i]->name, i,
+ gf_inode_type_to_str(replies[i].poststat.ia_type), src_idx,
+ priv->children[src_idx]->name, src_idx,
+ gf_inode_type_to_str(replies[src_idx].poststat.ia_type));
+ return -1;
+ }
+ }
+
+ return 0;
}
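
The scan above compares every brick's view of the entry against a reference copy: a differing file type aborts the conservative merge outright, while a differing gfid with the same type is handed to split-brain resolution. A reduced model of that comparison; the reply struct and return codes here are illustrative, not the AFR types.

#include <string.h>

struct entry_view {
    int valid;              /* brick replied */
    unsigned char gfid[16]; /* entry gfid as seen by this brick */
    int ia_type;            /* file type as seen by this brick */
};

enum { MERGE_OK = 0, MERGE_SPLIT_BRAIN = -1, MERGE_TYPE_MISMATCH = -2 };

static int
detect_entry_mismatch(const struct entry_view *views, int count, int ref_idx)
{
    const struct entry_view *ref = &views[ref_idx];
    int i;

    for (i = 0; i < count; i++) {
        if (i == ref_idx || !views[i].valid)
            continue;
        if (views[i].ia_type != ref->ia_type)
            return MERGE_TYPE_MISMATCH;   /* type split-brain: give up */
        if (memcmp(views[i].gfid, ref->gfid, sizeof(ref->gfid)) != 0)
            return MERGE_SPLIT_BRAIN;     /* gfid split-brain: needs a source */
    }
    return MERGE_OK;
}
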
-
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_merge_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
+ int ret = 0;
+ int i = 0;
+ int source = -1;
+ int src = -1;
+ afr_private_t *priv = NULL;
- sh->offset = 0;
-
- if (sh->source == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sources for %s to expunge entries",
- local->loc.path);
- goto out;
- }
-
- active_src = next_active_sink (frame, this, sh->active_source);
- sh->active_source = active_src;
-
- if (sh->op_failed) {
- goto out;
- }
+ priv = this->private;
- if (active_src == -1) {
- /* completed creating missing files on all subvolumes */
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ source = i;
+ break;
}
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "expunging entries of %s on %s to other sinks",
- local->loc.path, priv->children[active_src]->name);
-
- afr_sh_entry_expunge_subvol (frame, this, active_src);
-
- return 0;
-out:
- afr_sh_entry_impunge_all (frame, this);
- return 0;
-
-}
-
-
-int
-afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this,
- int active_src, int32_t op_ret,
- int32_t op_errno)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- call_count = afr_frame_return (frame);
- if (op_ret == -1)
- sh->op_failed = 1;
-
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- goto out;
- }
- afr_sh_entry_impunge_subvol (frame, this, active_src);
- }
-out:
+ if (source == -1) {
+ /* entry got deleted in the mean time? */
return 0;
-}
-
-void
-afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *impunge_local = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- int32_t impunge_ret_child = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
-
- impunge_ret_child = impunge_sh->impunge_ret_child;
- AFR_STACK_DESTROY (impunge_frame);
- sh->impunge_done (frame, this, impunge_ret_child, op_ret,
- op_errno);
-}
+ }
-int
-afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
-{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- child_index = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "setattr done for %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "setattr (%s) on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
+ /* Set all the sources as 1, otherwise newentry_mark won't be set */
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ sources[i] = 1;
}
+ }
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
-
- return 0;
-}
+ ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
+ source, sources,
+ &replies[source].poststat.ia_gfid, NULL);
+ if (ret)
+ return ret;
+ /* In case of type mismatch / unable to resolve gfid mismatch on the
+ * entry, return -1.*/
+ ret = afr_selfheal_detect_gfid_and_type_mismatch(
+ this, replies, inode, fd->inode->gfid, name, source, locked_on, &src);
-int
-afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- dict_t *xattr)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = 0;
- struct iatt stbuf = {0};
- int32_t valid = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to perform xattrop on %s (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
+ if (ret < 0)
+ return ret;
+ if (src != -1) {
+ source = src;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i != src && replies[i].valid &&
+ gf_uuid_compare(replies[src].poststat.ia_gfid,
+ replies[i].poststat.ia_gfid)) {
+ sources[i] = 0;
+ }
}
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "setting ownership of %s on %s to %d/%d",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- impunge_local->cont.lookup.buf.ia_uid,
- impunge_local->cont.lookup.buf.ia_gid);
-
- stbuf.ia_atime = impunge_local->cont.lookup.buf.ia_atime;
- stbuf.ia_atime_nsec = impunge_local->cont.lookup.buf.ia_atime_nsec;
- stbuf.ia_mtime = impunge_local->cont.lookup.buf.ia_mtime;
- stbuf.ia_mtime_nsec = impunge_local->cont.lookup.buf.ia_mtime_nsec;
-
- stbuf.ia_uid = impunge_local->cont.lookup.buf.ia_uid;
- stbuf.ia_gid = impunge_local->cont.lookup.buf.ia_gid;
-
- valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_setattr_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->setattr,
- &impunge_local->loc,
- &stbuf, valid);
- return 0;
-}
-
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
-int
-afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
-{
- loc_t *parent_loc = cookie;
-
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_INFO,
- "setattr on parent directory (%s) failed: %s",
- parent_loc->path, strerror (op_errno));
+ if (src != -1) {
+ if (!gf_uuid_compare(replies[src].poststat.ia_gfid,
+ replies[i].poststat.ia_gfid))
+ continue;
+ } else if (replies[i].op_errno != ENOENT) {
+ continue;
}
- loc_wipe (parent_loc);
+ ret |= afr_selfheal_recreate_entry(frame, i, source, sources, fd->inode,
+ name, inode, replies);
+ }
- GF_FREE (parent_loc);
-
- AFR_STACK_DESTROY (setattr_frame);
- return 0;
+ return ret;
}
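
For a conservative merge, __afr_selfheal_merge_dirent() picks the first brick that still has the entry as the working source and then treats every brick holding it as a source, so newentry marking covers all of them. A compact sketch of that selection; the struct and its fields are illustrative stand-ins.

struct dirent_reply {
    int valid;   /* brick replied at all */
    int op_ret;  /* 0 when the entry exists on this brick */
};

static int
pick_merge_source(const struct dirent_reply *replies, unsigned char *sources,
                  int count)
{
    int source = -1;
    int i;

    for (i = 0; i < count; i++) {
        if (replies[i].valid && replies[i].op_ret == 0) {
            if (source == -1)
                source = i;  /* first brick with the entry drives the merge */
            sources[i] = 1;  /* every brick holding it counts as a source */
        }
    }

    return source; /* -1: entry disappeared meanwhile, nothing to merge */
}
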
-int
-afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent,
- struct iatt *postparent)
+static int
+__afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = 0;
- int child_index = 0;
- int32_t *pending_array = NULL;
- dict_t *xattr = NULL;
- int ret = 0;
- int idx = 0;
- call_frame_t *setattr_frame = NULL;
- int32_t valid = 0;
- loc_t *parent_loc = NULL;
- struct iatt parentbuf = {0,};
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "creation of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
- inode->ia_type = stbuf->ia_type;
-
- xattr = dict_new ();
- if (!xattr) {
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
- goto out;
- }
-
- pending_array = (int32_t*) GF_CALLOC (3, sizeof (int32_t),
- gf_common_mt_int32_t);
-
- if (!pending_array) {
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
- goto out;
- }
- /* Pending data xattrs shouldn't be set for special files
- */
- idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
- pending_array[idx] = hton32 (1);
- if (IA_ISDIR (stbuf->ia_type))
- idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION);
- else if (IA_ISREG (stbuf->ia_type))
- idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
- else
- goto cont;
- pending_array[idx] = hton32 (1);
-
-cont:
- ret = dict_set_dynptr (xattr, priv->pending_key[child_index],
- pending_array, 3 * sizeof (int32_t));
- if (ret < 0) {
- GF_FREE (pending_array);
- pending_array = NULL;
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- }
-
- valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
- parentbuf = impunge_sh->parentbuf;
- setattr_frame = copy_frame (impunge_frame);
-
- parent_loc = GF_CALLOC (1, sizeof (*parent_loc),
- gf_afr_mt_loc_t);
- if (!parent_loc) {
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
- goto out;
- }
- afr_build_parent_loc (parent_loc, &impunge_local->loc);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk,
- (void *) (long) child_index,
- priv->children[active_src],
- priv->children[active_src]->fops->xattrop,
- &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr);
-
- STACK_WIND_COOKIE (setattr_frame, afr_sh_entry_impunge_parent_setattr_cbk,
- (void *) (long) parent_loc,
- priv->children[child_index],
- priv->children[child_index]->fops->setattr,
- parent_loc, &parentbuf, valid);
-
- dict_unref (xattr);
-
- return 0;
-
-out:
- if (xattr)
- dict_unref (xattr);
-
- if (pending_array)
- GF_FREE (pending_array);
-
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- -1, op_errno);
-
- return 0;
+ int ret = -1;
+
+ if (source < 0)
+ ret = __afr_selfheal_merge_dirent(frame, this, fd, name, inode, sources,
+ healed_sinks, locked_on, replies);
+ else
+ ret = __afr_selfheal_heal_dirent(frame, this, fd, name, inode, source,
+ sources, healed_sinks, locked_on,
+ replies);
+ return ret;
}
-
-int
-afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
+static gf_boolean_t
+is_full_heal_marker_present(xlator_t *this, dict_t *xdata, int idx)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
- int ret = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing file %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- dict = dict_new ();
- if (!dict)
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
-
- GF_ASSERT (!uuid_is_null (stbuf->ia_gfid));
- ret = afr_set_dict_gfid (dict, stbuf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed",
- impunge_local->loc.path);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->mknod,
- &impunge_local->loc,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type),
- makedev (ia_major (stbuf->ia_rdev),
- ia_minor (stbuf->ia_rdev)), dict);
-
- if (dict)
- dict_unref (dict);
-
- return 0;
+ int i = 0;
+ int pending[3] = {
+ 0,
+ };
+ void *pending_raw = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ if (!xdata)
+ return _gf_false;
+
+ /* Iterate over each of the priv->pending_keys[] elements and then
+ * see if any of them have data segment non-zero. If they do, return
+ * true. Else return false.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+ continue;
+
+ if (!pending_raw)
+ continue;
+
+ memcpy(pending, pending_raw, sizeof(pending));
+ if (ntoh32(pending[idx]))
+ return _gf_true;
+ }
+
+ return _gf_false;
}
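
is_full_heal_marker_present() treats the changelog xattr value as an array of 32-bit counters stored in network byte order and reports the marker as set when the counter at the requested index is non-zero. The byte-order handling in isolation; the raw blob here stands in for the dict value pulled out of xdata.

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

static bool
pending_counter_set(const void *pending_raw, size_t raw_len, int idx)
{
    uint32_t counters[3] = {0}; /* data, metadata and entry counters */

    if (!pending_raw || raw_len < sizeof(counters) || idx < 0 || idx > 2)
        return false;

    memcpy(counters, pending_raw, sizeof(counters));
    return ntohl(counters[idx]) != 0; /* counters are stored big-endian */
}
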
-
-
-int
-afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
+static gf_boolean_t
+afr_need_full_heal(xlator_t *this, struct afr_reply *replies, int source,
+ unsigned char *healed_sinks, afr_transaction_type type)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
-
- int ret = 0;
+ int i = 0;
+ int idx = 0;
+ afr_private_t *priv = NULL;
- priv = this->private;
- impunge_local = impunge_frame->local;
+ priv = this->private;
- dict = dict_new ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- return 0;
- }
+ if (!priv->esh_granular)
+ return _gf_true;
- GF_ASSERT (!uuid_is_null (stbuf->ia_gfid));
- ret = afr_set_dict_gfid (dict, stbuf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed",
- impunge_local->loc.path);
+ if (type != AFR_ENTRY_TRANSACTION)
+ return _gf_true;
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing directory %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
+ priv = this->private;
+ idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->mkdir,
- &impunge_local->loc,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type),
- dict);
+ /* If there is a clear source, check whether the full-heal-indicator
+ * is present in its xdata. Otherwise, we need to examine all the
+ * participating bricks and then figure if *even* one of them has a
+ * full-heal-indicator.
+ */
- if (dict)
- dict_unref (dict);
+ if (source != -1) {
+ if (is_full_heal_marker_present(this, replies[source].xdata, idx))
+ return _gf_true;
+ }
- return 0;
-}
+ /* else ..*/
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
-int
-afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, const char *linkname)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
- struct iatt *buf = NULL;
- int ret = 0;
- afr_self_heal_t *impunge_sh = NULL;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- buf = &impunge_local->cont.symlink.buf;
-
- dict = dict_new ();
- if (!dict) {
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- -1, ENOMEM);
- goto out;
- }
+ if (is_full_heal_marker_present(this, replies[i].xdata, idx))
+ return _gf_true;
+ }
- GF_ASSERT (!uuid_is_null (buf->ia_gfid));
- ret = afr_set_dict_gfid (dict, buf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_INFO,
- "%s: dict set gfid failed",
- impunge_local->loc.path);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing symlink %s -> %s on %s",
- impunge_local->loc.path, linkname,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->symlink,
- linkname, &impunge_local->loc, dict);
-
- if (dict)
- dict_unref (dict);
-out:
- return 0;
+ return _gf_false;
}
-
-int
-afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
+static int
+__afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ uint64_t *witness)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- int call_count = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "unlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_impunge_symlink (impunge_frame, this, child_index,
- impunge_sh->linkname);
-
- return 0;
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int sources_count = 0;
+ int i = 0;
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
+ priv = this->private;
- return 0;
-}
+ sources_count = AFR_COUNT(sources, priv->child_count);
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count || afr_does_witness_exist(this, witness)) {
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ return -1;
+ }
-int
-afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
+ source = afr_choose_source_by_policy(priv, sources, AFR_ENTRY_TRANSACTION);
- gf_log (this->name, GF_LOG_DEBUG,
- "unlinking symlink %s with wrong target on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->unlink,
- &impunge_local->loc);
+ /*If the selected source does not blame any other brick, then mark
+ * everything as sink to trigger conservative merge.
+ */
+ if (source != -1 && !AFR_COUNT(healed_sinks, priv->child_count)) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i]) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+ return -1;
+ }
- return 0;
+ return source;
}
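
__afr_selfheal_entry_finalize_source() returns -1 whenever nobody can be trusted as a source (no sources at all, a witness brick, or a chosen source that blames no one), and the caller then falls back to a conservative merge with every locked brick treated as a sink. A simplified model of those rules; choose_source() is a stand-in for the policy-based selection, and the sink marking is reduced to plain array bookkeeping where the real code uses afr_mark_active_sinks().

static int
finalize_entry_source(unsigned char *sources, unsigned char *healed_sinks,
                      const unsigned char *locked_on, int child_count,
                      int witness_exists,
                      int (*choose_source)(const unsigned char *srcs, int n))
{
    int nsources = 0, nsinks = 0, src, i;

    for (i = 0; i < child_count; i++) {
        nsources += sources[i] ? 1 : 0;
        nsinks += healed_sinks[i] ? 1 : 0;
    }

    if (!nsources || witness_exists) {
        /* no authority at all: everything locked becomes a sink */
        for (i = 0; i < child_count; i++) {
            sources[i] = 0;
            healed_sinks[i] = locked_on[i] ? 1 : 0;
        }
        return -1;
    }

    src = choose_source(sources, child_count);

    if (src != -1 && nsinks == 0) {
        /* the source blames nobody: also degrade to conservative merge */
        for (i = 0; i < child_count; i++) {
            if (locked_on[i]) {
                sources[i] = 0;
                healed_sinks[i] = 1;
            }
        }
        return -1;
    }

    return src;
}

Returning -1 instead of an error is deliberate: a merge that creates a superset of entries on all bricks is always safe for directories, unlike data heal where a source is mandatory.
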
-
int
-afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *linkname, struct iatt *sbuf)
+__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ struct afr_reply *replies, int *source_p,
+ unsigned char *pflag)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- int call_count = -1;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- gf_log (this->name, GF_LOG_INFO,
- "readlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ uint64_t *witness = NULL;
- /* symlink doesn't exist on the sink */
-
- if ((op_ret == -1) && (op_errno == ENOENT)) {
- afr_sh_entry_impunge_symlink (impunge_frame, this,
- child_index, impunge_sh->linkname);
- return 0;
- }
+ priv = this->private;
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+ if (ret)
+ return ret;
- /* symlink exists on the sink, so check if targets match */
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, locked_on, sources,
+ sinks, witness, pflag);
+ if (ret)
+ return ret;
- if (strcmp (linkname, impunge_sh->linkname) == 0) {
- /* targets match, nothing to do */
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
- goto out;
- } else {
- /*
- * Hah! Sneaky wolf in sheep's clothing!
- */
- afr_sh_entry_impunge_symlink_unlink (impunge_frame, this,
- child_index);
- return 0;
- }
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
+ source = __afr_selfheal_entry_finalize_source(this, sources, healed_sinks,
+ locked_on, replies, witness);
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source;
- return 0;
+ return ret;
}
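
The AFR_INTERSECT step above seeds healed_sinks[] with the bricks that are both marked as sinks and actually locked; failures later in the heal clear individual entries again. Spelled out as a plain loop:

static void
intersect_sinks(unsigned char *healed_sinks, const unsigned char *sinks,
                const unsigned char *locked_on, int child_count)
{
    int i;

    for (i = 0; i < child_count; i++)
        /* heal only bricks that both need healing and could be locked */
        healed_sinks[i] = (sinks[i] && locked_on[i]) ? 1 : 0;
}
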
-
-int
-afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
+static int
+afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *parent_idx_inode,
+ xlator_t *subvol, gf_boolean_t full_crawl)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "checking symlink target of %s on %s",
- impunge_local->loc.path, priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->readlink,
- &impunge_local->loc, 4096);
-
+ int ret = 0;
+ int source = -1;
+ unsigned char *locked_on = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ inode_t *inode = NULL;
+ struct afr_reply *replies = NULL;
+ struct afr_reply *par_replies = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+
+ priv = this->private;
+
+ if (afr_is_private_directory(priv, fd->inode->gfid, name,
+ GF_CLIENT_PID_SELF_HEALD)) {
return 0;
+ }
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+ ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
+ if (ret) {
+ dict_unref(xattr);
+ return -1;
+ }
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ locked_on = alloca0(priv->child_count);
+
+ replies = alloca0(priv->child_count * sizeof(*replies));
+ par_replies = alloca0(priv->child_count * sizeof(*par_replies));
+
+ ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+ locked_on);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "entry self-heal as only %d sub-volumes "
+ " could be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret, this->name);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, locked_on,
+ sources, sinks, healed_sinks,
+ par_replies, &source, NULL);
+ if (ret < 0)
+ goto unlock;
+
+ inode = afr_selfheal_unlocked_lookup_on(frame, fd->inode, name, replies,
+ locked_on, xattr);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry_dirent(frame, this, fd, name, inode, source,
+ sources, healed_sinks, locked_on,
+ replies);
+
+ if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) {
+ ret = afr_shd_entry_purge(subvol, parent_idx_inode, name,
+ inode->ia_type);
+ /* Why is ret force-set to 0? We do not care about
+ * index purge failing for full heal as it is quite
+ * possible during replace-brick that not all files
+ * and directories have their name indices present in
+ * entry-changes/.
+ */
+ ret = 0;
+ }
+ }
+
+unlock:
+ afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, locked_on,
+ NULL);
+ if (inode)
+ inode_unref(inode);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ if (par_replies)
+ afr_replies_wipe(par_replies, priv->child_count);
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
}
-
-int
-afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *linkname, struct iatt *sbuf)
+static inode_t *
+afr_shd_entry_changes_index_inode(xlator_t *this, xlator_t *subvol,
+ uuid_t pargfid)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- int call_count = -1;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "readlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- impunge_sh->linkname = gf_strdup (linkname);
- afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index);
-
- return 0;
+ int ret = -1;
+ void *index_gfid = NULL;
+ loc_t rootloc = {
+ 0,
+ };
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+ inode_t *inode = NULL;
+ struct iatt iatt = {
+ 0,
+ };
+
+ rootloc.inode = inode_ref(this->itable->root);
+ gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr(subvol, &rootloc, &xattr,
+ GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL);
+ if (ret || !xattr) {
+ errno = -ret;
+ goto out;
+ }
+
+ ret = dict_get_ptr(xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid);
+ if (ret) {
+ errno = EINVAL;
+ goto out;
+ }
+
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ gf_uuid_copy(loc.pargfid, index_gfid);
+ loc.name = gf_strdup(uuid_utoa(pargfid));
+
+ ret = syncop_lookup(subvol, &loc, &iatt, NULL, NULL, NULL);
+ if (ret < 0) {
+ errno = -ret;
+ goto out;
+ }
+
+ inode = inode_link(loc.inode, NULL, NULL, &iatt);
out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
+ if (xattr)
+ dict_unref(xattr);
+ loc_wipe(&rootloc);
+ GF_FREE((char *)loc.name);
+ loc_wipe(&loc);
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
-
- return 0;
+ return inode;
}
-
-int
-afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
+static int
+afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int child)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
- impunge_local->cont.symlink.buf = *stbuf;
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk,
- (void *) (long) child_index,
- priv->children[active_src],
- priv->children[active_src]->fops->readlink,
- &impunge_local->loc, 4096);
+ int ret = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ off_t offset = 0;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t mismatch = _gf_false;
+ afr_local_t *local = NULL;
+ loc_t loc = {
+ 0,
+ };
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ INIT_LIST_HEAD(&entries.list);
+
+ local = frame->local;
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame)
+ return -ENOMEM;
+
+ loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
+ fd->inode->gfid);
+
+ while ((ret = syncop_readdir(subvol, fd, 131072, offset, &entries, NULL,
+ NULL))) {
+ if (ret > 0)
+ ret = 0;
+ list_for_each_entry(entry, &entries.list, list)
+ {
+ offset = entry->d_off;
- return 0;
-}
+ if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+ continue;
-int
-afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *buf,
- struct iatt *postparent)
-{
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- afr_private_t *priv = NULL;
- ia_type_t type = IA_INVAL;
- int ret = 0;
- int active_src = 0;
-
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- impunge_sh->parentbuf = *postparent;
- active_src = impunge_sh->active_source;
- impunge_local->cont.lookup.buf = *buf;
- afr_update_loc_gfids (&impunge_local->loc, buf, postparent);
-
- type = buf->ia_type;
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- afr_sh_entry_impunge_mknod (impunge_frame, this,
- child_index, buf);
- break;
- case IA_IFLNK:
- afr_sh_entry_impunge_readlink (impunge_frame, this,
- child_index, buf);
- break;
- case IA_IFDIR:
- afr_sh_entry_impunge_mkdir (impunge_frame, this,
- child_index, buf);
+ ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name,
+ loc.inode, subvol,
+ local->need_full_crawl);
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = -ENOTCONN;
break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- impunge_local->loc.path,
- priv->children[active_src]->name, type);
- ret = -1;
+ }
+
+ if (ret == -1) {
+ /* gfid or type mismatch. */
+ mismatch = _gf_true;
+ ret = 0;
+ }
+ if (ret)
break;
}
- return ret;
-}
-
-gf_boolean_t
-afr_sh_need_recreate (afr_self_heal_t *impunge_sh, int *sources,
- unsigned int child, unsigned int child_count)
-{
- int32_t *success_children = NULL;
- gf_boolean_t recreate = _gf_false;
-
- GF_ASSERT (impunge_sh->impunging_entry_mode);
- GF_ASSERT (impunge_sh->child_errno);
- GF_ASSERT (sources);
-
- success_children = impunge_sh->child_success;
- if (sources[child] || (child == impunge_sh->active_source)) {
- GF_ASSERT (afr_is_child_present (success_children,
- child_count, child));
- goto out;
- }
-
- if (IA_ISLNK (impunge_sh->impunging_entry_mode)) {
- recreate = _gf_true;
- goto out;
- }
-
- if (impunge_sh->child_errno[child] == ENOENT)
- recreate = _gf_true;
-out:
- return recreate;
-}
-
-unsigned int
-afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources,
- unsigned int child_count)
-{
- int count = 0;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (afr_sh_need_recreate (impunge_sh, sources, i, child_count))
- count++;
- }
-
- return count;
-}
-
-int
-afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame,
- xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- struct iatt *buf = NULL;
- struct iatt *postparent = NULL;
- unsigned int recreate_count = 0;
- int i = 0;
- int active_src = 0;
-
- priv = this->private;
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- active_src = impunge_sh->active_source;
- buf = &impunge_sh->buf[active_src];
- postparent = &impunge_sh->parentbufs[active_src];
-
- recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources,
- priv->child_count);
- GF_ASSERT (recreate_count);
- impunge_local->call_count = recreate_count;
- for (i = 0; i < priv->child_count; i++) {
- if (afr_sh_need_recreate (impunge_sh, sh->sources, i,
- priv->child_count)) {
- (void)afr_sh_entry_impunge_create (impunge_frame, this,
- i, buf,
- postparent);
- recreate_count--;
- }
- }
- GF_ASSERT (!recreate_count);
- return 0;
-}
-
-void
-afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- unsigned int recreate_count = 0;
- unsigned int gfid_miss_count = 0;
- unsigned int children_up_count = 0;
- uuid_t gfid = {0};
-
- priv = this->private;
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- active_src = impunge_sh->active_source;
-
- if (op_ret < 0)
- goto done;
- if (impunge_sh->child_errno[active_src]) {
- op_ret = -1;
- op_errno = impunge_sh->child_errno[active_src];
- goto done;
- }
+ gf_dirent_free(&entries);
+ if (ret)
+ break;
+ }
- gfid_miss_count = afr_gfid_missing_count (this->name,
- impunge_sh->child_success,
- impunge_sh->buf, priv->child_count,
- impunge_local->loc.path);
- children_up_count = afr_up_children_count (priv->child_count,
- impunge_local->child_up);
- if ((gfid_miss_count == children_up_count) &&
- (children_up_count < priv->child_count)) {
- op_ret = -1;
- op_errno = ENODATA;
- gf_log (this->name, GF_LOG_ERROR, "Not all children are up, "
- "gfid should not be assigned in this state for %s",
- impunge_local->loc.path);
- goto done;
- }
+ loc_wipe(&loc);
- if (gfid_miss_count) {
- afr_update_gfid_from_iatts (gfid, impunge_sh->buf,
- impunge_sh->child_success,
- priv->child_count);
- if (uuid_is_null (gfid))
- uuid_generate (gfid);
- afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc,
- afr_sh_entry_common_lookup_done, gfid,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS);
- } else {
- recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources,
- priv->child_count);
- if (!recreate_count) {
- op_ret = 0;
- op_errno = 0;
- goto done;
- }
- afr_sh_entry_call_impunge_recreate (impunge_frame, this);
- }
- return;
-done:
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
- return;
+ AFR_STACK_DESTROY(iter_frame);
+ if (mismatch == _gf_true)
+ /* undo pending will be skipped */
+ ret = -1;
+ return ret;
}
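
The readdir loop above pages through the directory by carrying forward each entry's d_off as the offset for the next syncop_readdir() call, and skips the "." and ".." entries before handing names to the per-entry heal. Below is a minimal, self-contained C sketch of that offset-carrying pattern; fetch_entries() and BATCH are hypothetical stand-ins for syncop_readdir() and its buffer size, not AFR or syncop APIs.

/* Sketch of the offset-carrying readdir loop used above. fetch_entries()
 * is hypothetical: it fills `names`, reports the resume offset, and
 * returns how many entries it produced (0 at end-of-directory). */
#include <stdio.h>
#include <string.h>

#define BATCH 4

static int fetch_entries(long offset, const char *names[BATCH], long *next_off)
{
    static const char *all[] = {".", "..", "a", "b", "c", "d", "e"};
    const long total = (long)(sizeof(all) / sizeof(all[0]));
    int n = 0;

    while (offset + n < total && n < BATCH) {
        names[n] = all[offset + n];
        n++;
    }
    *next_off = offset + n; /* analogous to the last entry->d_off */
    return n;
}

int main(void)
{
    const char *names[BATCH];
    long offset = 0, next = 0;
    int n;

    while ((n = fetch_entries(offset, names, &next)) > 0) {
        for (int i = 0; i < n; i++) {
            if (!strcmp(names[i], ".") || !strcmp(names[i], ".."))
                continue;       /* skip pseudo entries, as the heal loop does */
            printf("heal candidate: %s\n", names[i]);
        }
        offset = next;          /* resume the next batch where this one ended */
    }
    return 0;
}
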
-int
-afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
- gf_dirent_t *entry)
+static int
+afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry,
+ loc_t *parent, void *data)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = -1;
- call_frame_t *impunge_frame = NULL;
- afr_local_t *impunge_local = NULL;
- int active_src = 0;
- int op_errno = 0;
- int op_ret = -1;
- mode_t entry_mode = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
- sh->impunge_done = afr_sh_entry_impunge_entry_done;
-
- if ((strcmp (entry->d_name, ".") == 0)
- || (strcmp (entry->d_name, "..") == 0)
- || ((strcmp (local->loc.path, "/") == 0)
- && (strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR) == 0))) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- entry->d_name, local->loc.path);
- op_ret = 0;
- goto out;
- }
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ struct iatt iatt = {
+ 0,
+ };
+ afr_granular_esh_args_t *args = data;
+
+ /* Look up the actual inode associated with entry. If the lookup returns
+ * ESTALE or ENOENT, then it means we have a stale index. Remove it.
+ * This is analogous to the check in afr_shd_index_heal() except that
+ * here it is achieved through LOOKUP and in afr_shd_index_heal() through
+ * a GETXATTR.
+ */
+
+ loc.inode = inode_new(args->xl->itable);
+ loc.parent = inode_ref(args->heal_fd->inode);
+ gf_uuid_copy(loc.pargfid, loc.parent->gfid);
+ loc.name = entry->d_name;
+
+ ret = syncop_lookup(args->xl, &loc, &iatt, NULL, NULL, NULL);
+ if ((ret == -ENOENT) || (ret == -ESTALE)) {
+ /* The name indices under the pgfid index dir are guaranteed
+ * to be regular files. Hence the hardcoding.
+ */
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
+ ret = 0;
+ goto out;
+ }
+ /* TBD: afr_shd_zero_xattrop? */
- gf_log (this->name, GF_LOG_TRACE,
- "inspecting existance of %s under %s",
- entry->d_name, local->loc.path);
+ ret = afr_selfheal_entry_dirent(args->frame, args->xl, args->heal_fd,
+ entry->d_name, parent->inode, subvol,
+ _gf_false);
+ AFR_STACK_RESET(args->frame);
+ if (args->frame->local == NULL)
+ ret = -ENOTCONN;
- entry_mode = st_mode_from_ia (entry->d_stat.ia_prot,
- entry->d_stat.ia_type);
- ret = afr_impunge_frame_create (frame, this, active_src, active_src,
- entry_mode, &impunge_frame);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
+ if (ret == -1)
+ args->mismatch = _gf_true;
- impunge_local = impunge_frame->local;
- ret = afr_build_child_loc (this, &impunge_local->loc, &local->loc,
- entry->d_name);
- if (ret != 0) {
- op_errno = ENOMEM;
- goto out;
- }
-
- afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc,
- afr_sh_entry_common_lookup_done, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS);
-
- op_ret = 0;
out:
- if (ret) {
- if (impunge_frame)
- AFR_STACK_DESTROY (impunge_frame);
- sh->impunge_done (frame, this, active_src, op_ret, op_errno);
- }
-
- return 0;
+ loc_wipe(&loc);
+ return 0;
}
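
The check above treats a lookup that fails with ENOENT or ESTALE as proof that the name index is stale, purges it, and moves on; any other outcome falls through to the actual entry heal. A small standalone sketch of that decision, with illustrative names only:

/* Sketch of the stale-index decision above. The enum and function names
 * are illustrative, not part of AFR. */
#include <errno.h>
#include <stdio.h>

enum index_action { INDEX_PURGE, INDEX_HEAL };

static enum index_action classify_index(int lookup_ret)
{
    if (lookup_ret == -ENOENT || lookup_ret == -ESTALE)
        return INDEX_PURGE;     /* stale name index: remove it, nothing to heal */
    return INDEX_HEAL;          /* entry still referenced: run entry heal on it */
}

int main(void)
{
    printf("%d %d %d\n", classify_index(-ENOENT), classify_index(-ESTALE),
           classify_index(0)); /* prints 0 0 1 */
    return 0;
}
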
-
-int
-afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- gf_dirent_t *entry = NULL;
- off_t last_offset = 0;
- int active_src = 0;
- int entry_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "readdir of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "readdir of %s on subvolume %s complete",
- local->loc.path,
- priv->children[active_src]->name);
- }
-
- afr_sh_entry_impunge_all (frame, this);
- return 0;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- entry_count++;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "readdir'ed %d entries from %s",
- entry_count, priv->children[active_src]->name);
-
- sh->offset = last_offset;
- local->call_count = entry_count;
-
- list_for_each_entry (entry, &entries->list, list) {
- afr_sh_entry_impunge_entry (frame, this, entry);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk,
- priv->children[active_src],
- priv->children[active_src]->fops->readdirp,
- sh->healing_fd, sh->block_size, sh->offset);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh->offset = 0;
-
- active_src = next_active_source (frame, this, sh->active_source);
- sh->active_source = active_src;
-
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
-
- if (active_src == -1) {
- /* completed creating missing files on all subvolumes */
- afr_sh_entry_erase_pending (frame, this);
- return 0;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "impunging entries of %s on %s to other sinks",
- local->loc.path, priv->children[active_src]->name);
-
- afr_sh_entry_impunge_subvol (frame, this, active_src);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
+static int
+afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int subvol_idx, gf_boolean_t is_src)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "opendir of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ afr_granular_esh_args_t args = {
+ 0,
+ };
+
+ priv = this->private;
+ subvol = priv->children[subvol_idx];
+
+ args.frame = afr_copy_frame(frame);
+ if (!args.frame)
+ goto out;
+ args.xl = this;
+ /* args.heal_fd represents the fd associated with the original directory
+ * on which entry heal is being attempted.
+ */
+ args.heal_fd = fd;
+
+ /* @subvol here represents the subvolume of AFR where
+ * indices/entry-changes/<pargfid> will be processed
+ */
+ loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
+ fd->inode->gfid);
+ if (!loc.inode) {
+ /* If the granular index could not be opened on a sink (which can
+ * happen, since it is usually the source that holds the granular
+ * changelogs and a sink's entry-changes index may be empty or absent),
+ * do not treat the heal as a failure.
+ */
+ if (is_src)
+ ret = -errno;
+ else
+ ret = 0;
+ goto out;
+ }
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened, commencing sync",
- local->loc.path);
+ ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args,
+ afr_selfheal_entry_granular_dirent);
- sh->active_source = -1;
- afr_sh_entry_expunge_all (frame, this);
- }
+ loc_wipe(&loc);
- return 0;
+ if (args.mismatch == _gf_true)
+ ret = -1;
+out:
+ if (args.frame)
+ AFR_STACK_DESTROY(args.frame);
+ return ret;
}
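
The early return above encodes an asymmetric error policy: a missing entry-changes index is tolerated on a sink but fatal on the source, because only the source is expected to carry granular changelogs. A tiny standalone sketch of that policy (names are illustrative, not AFR code):

/* Sketch of the error policy above: missing index is fatal only for the
 * source-side crawl. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int granular_index_open_result(bool opened, int saved_errno, bool is_src)
{
    if (opened)
        return 0;                       /* index found, proceed with the crawl */
    return is_src ? -saved_errno : 0;   /* tolerate a missing index on sinks */
}

int main(void)
{
    printf("%d\n", granular_index_open_result(false, ENOENT, true));  /* nonzero: fatal on source */
    printf("%d\n", granular_index_open_result(false, ENOENT, false)); /* 0: tolerated on a sink */
    return 0;
}
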
-
-int
-afr_sh_entry_open (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
+ unsigned char *sources, unsigned char *healed_sinks)
{
- int i = 0;
- int call_count = 0;
-
- int source = -1;
- int *sources = NULL;
-
- fd_t *fd = NULL;
-
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = local->self_heal.source;
- sources = local->self_heal.sources;
-
- sh->block_size = 65536; //131072
- sh->offset = 0;
-
- call_count = sh->active_sinks;
- if (source != -1)
- call_count++;
-
- local->call_count = call_count;
-
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
-
- if (source != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "opening directory %s on subvolume %s (source)",
- local->loc.path, priv->children[source]->name);
-
- /* open source */
- STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->opendir,
- &local->loc, fd);
- call_count--;
- }
-
- /* open sinks */
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "opening directory %s on subvolume %s (sink)",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->opendir,
- &local->loc, fd);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
+ int i = 0;
+ int ret = 0;
+ gf_boolean_t mismatch = _gf_false;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing entry selfheal on %s", uuid_utoa(fd->inode->gfid));
+
+ for (i = 0; i < priv->child_count; i++) {
+ /* Expunge */
+ if (!healed_sinks[i])
+ continue;
+
+ if (!local->need_full_crawl)
+ /* Why call afr_selfheal_entry_granular() on a "healed sink",
+ * given that it is the source that contains the granular
+ * indices?
+ * If the index for this directory is non-existent or empty on
+ * this subvol (=> clear sink), then it will return early
+ * without failure status.
+ * If the index is non-empty and the subvol is nevertheless a
+ * 'healed sink', that indicates a split-brain, in which case we
+ * need to crawl the indices/entry-changes/<pargfid> directory anyway.
+ */
+ ret = afr_selfheal_entry_granular(frame, this, fd, i, _gf_false);
+ else
+ ret = afr_selfheal_entry_do_subvol(frame, this, fd, i);
-int
-afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
- }
- }
- if (source != -1)
- sh->success[source] = 1;
-
- sh->active_sinks = active_sinks;
- if (active_sinks == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "no active sinks for self-heal on dir %s",
- local->loc.path);
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- if (source == -1 && active_sinks < 2) {
- gf_log (this->name, GF_LOG_TRACE,
- "cannot sync with 0 sources and 1 sink on dir %s",
- local->loc.path);
- afr_sh_entry_finish (frame, this);
- return 0;
+ if (ret == -1) {
+ /* gfid or type mismatch. */
+ mismatch = _gf_true;
+ ret = 0;
}
+ if (ret)
+ break;
+ }
- if (source != -1)
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing directory %s from subvolume %s to "
- "%d other",
- local->loc.path, priv->children[source]->name,
- active_sinks);
+ if (!ret && source != -1) {
+ /* Impunge */
+ if (local->need_full_crawl)
+ ret = afr_selfheal_entry_do_subvol(frame, this, fd, source);
else
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sources for %s found. "
- "merging all entries as a conservative decision",
- local->loc.path);
-
- afr_sh_entry_open (frame, this);
-
- return 0;
+ ret = afr_selfheal_entry_granular(frame, this, fd, source,
+ _gf_true);
+ }
+
+ if (mismatch == _gf_true)
+ /* undo pending will be skipped */
+ ret = -1;
+ return ret;
}
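
afr_selfheal_entry_do() above records per-subvolume gfid/type mismatches without aborting the walk, and only converts them into an overall failure at the end so that undo-pending is skipped. A self-contained sketch of that bookkeeping, with heal_one() as a hypothetical stand-in for the per-subvolume heal call:

/* Sketch of the mismatch bookkeeping above: a per-child result of -1
 * (gfid/type mismatch) is remembered but does not stop the loop; any other
 * error aborts immediately; a remembered mismatch forces -1 at the end. */
#include <stdbool.h>
#include <stdio.h>

static int heal_one(int child) { return (child == 1) ? -1 : 0; } /* fake result */

static int heal_all(const bool *healed_sinks, int child_count)
{
    bool mismatch = false;
    int ret = 0;

    for (int i = 0; i < child_count; i++) {
        if (!healed_sinks[i])
            continue;
        ret = heal_one(i);
        if (ret == -1) {        /* mismatch: note it, keep healing the rest */
            mismatch = true;
            ret = 0;
        }
        if (ret)                /* any other error aborts the walk */
            break;
    }
    if (mismatch)
        ret = -1;               /* callers then skip undo-pending */
    return ret;
}

int main(void)
{
    bool sinks[3] = {true, true, false};
    printf("%d\n", heal_all(sinks, 3)); /* -1: one child reported a mismatch */
    return 0;
}
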
-
-void
-afr_sh_entry_fix (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static int
+__afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- int nsources = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0) {
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_entry_finish (frame, this);
- goto out;
- }
-
- if (sh->forced_merge) {
- sh->source = -1;
- goto heal;
- }
-
- afr_build_pending_matrix (priv->pending_key, sh->pending_matrix,
- sh->xattr, AFR_ENTRY_TRANSACTION,
- priv->child_count);
-
- afr_sh_print_pending_matrix (sh->pending_matrix, this);
-
- nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf,
- priv->child_count, AFR_SELF_HEAL_ENTRY,
- sh->child_success, this->name);
-
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_entry_finish (frame, this);
- return;
- }
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
-
- sh->source = source;
-
-
-heal:
- afr_sh_entry_sync_prepare (frame, this);
+ int ret = -1;
+ int source = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *postop_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t did_sh = _gf_true;
+
+ priv = this->private;
+ local = frame->local;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+ data_lock = alloca0(priv->child_count);
+ postop_lock = alloca0(priv->child_count);
+
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+ data_lock);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "entry self-heal as only %d sub-volumes could "
+ "be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret, this->name);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies, &source, NULL);
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ did_sh = _gf_false;
+ goto unlock;
+ }
+
+ local->need_full_crawl = afr_need_full_heal(
+ this, locked_replies, source, healed_sinks, AFR_ENTRY_TRANSACTION);
+ }
+unlock:
+ afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, data_lock,
+ NULL);
+ if (ret < 0)
+ goto out;
+
+ if (!did_sh)
+ goto out;
+
+ ret = afr_selfheal_entry_do(frame, this, fd, source, sources, healed_sinks);
+ if (ret)
+ goto out;
+
+ /* Take entrylks in xlator domain before doing post-op (undo-pending) in
+ * entry self-heal. This is to prevent a parallel name self-heal on
+ * an entry under @fd->inode from reading pending xattrs while it is
+ * being modified by SHD after entry sh below, given that
+ * name self-heal takes locks ONLY in xlator domain and is free to read
+ * pending changelog in the absence of the following locking.
+ */
+ ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+ postop_lock);
+ {
+ if (AFR_CMP(data_lock, postop_lock, priv->child_count) != 0) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "post-op after entry self-heal as %d "
+ "sub-volumes, as opposed to %d, "
+ "could be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret,
+ AFR_COUNT(data_lock, priv->child_count), this->name);
+ ret = -ENOTCONN;
+ goto postop_unlock;
+ }
+
+ afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
+ locked_replies);
+ ret = afr_selfheal_undo_pending(
+ frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
+ AFR_ENTRY_TRANSACTION, locked_replies, postop_lock);
+ }
+postop_unlock:
+ afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL,
+ postop_lock, NULL);
out:
- return;
+ if (did_sh)
+ afr_log_selfheal(fd->inode->gfid, this, ret, "entry", source, sources,
+ healed_sinks);
+ else
+ ret = 1;
+
+ if (locked_replies)
+ afr_replies_wipe(locked_replies, priv->child_count);
+ return ret;
}
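
__afr_selfheal_entry() above only attempts the post-op if the second round of entrylks covers exactly the same set of children as the first; otherwise a brick joined or left mid-heal and it bails out with -ENOTCONN. A standalone sketch of that gating check; same_lock_set() mirrors how the AFR_CMP macro is used here and is not the real macro:

/* Sketch of the post-op gating above: proceed only when both lock rounds
 * landed on identical child sets. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool same_lock_set(const unsigned char *a, const unsigned char *b, int n)
{
    return memcmp(a, b, (size_t)n) == 0;
}

int main(void)
{
    unsigned char data_lock[3] = {1, 1, 0};
    unsigned char postop_lock_ok[3] = {1, 1, 0};
    unsigned char postop_lock_bad[3] = {1, 0, 0};

    printf("%d\n", same_lock_set(data_lock, postop_lock_ok, 3));  /* 1: do post-op */
    printf("%d\n", same_lock_set(data_lock, postop_lock_bad, 3)); /* 0: skip, -ENOTCONN */
    return 0;
}
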
-int
-afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this)
+static fd_t *
+afr_selfheal_data_opendir(xlator_t *this, inode_t *inode)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks "
- "failed for %s.", local->loc.path);
- sh->op_failed = 1;
- afr_sh_entry_done (frame, this);
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking entrylks done "
- "for %s. Proceeding to FOP", local->loc.path);
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_entry_fix, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS);
- }
-
- return 0;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
+ fd_t *fd = NULL;
+
+ fd = fd_create(inode, 0);
+ if (!fd)
+ return NULL;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ ret = syncop_opendir(this, &loc, fd, NULL, NULL);
+ if (ret) {
+ fd_unref(fd);
+ fd = NULL;
+ } else {
+ fd_bind(fd);
+ }
+
+ loc_wipe(&loc);
+ return fd;
}
int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
+afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
-
- priv = this->private;
- local = frame->local;
-
- if (local->self_heal.need_entry_self_heal && priv->entry_self_heal) {
- afr_sh_entrylk (frame, this, &local->loc, NULL,
- afr_sh_post_nonblocking_entry_cbk);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to completion on %s",
- local->loc.path);
- afr_sh_entry_done (frame, this);
- }
-
- return 0;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ fd_t *fd = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ fd = afr_selfheal_data_opendir(this, inode);
+ if (!fd)
+ return -EIO;
+
+ locked_on = alloca0(priv->child_count);
+
+ ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
+ NULL, locked_on);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "entry self-heal as only %d sub-volumes could "
+ "be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret, priv->sh_domain);
+ /* Either fewer than two subvols are available, or another
+ selfheal (from another server) is in progress. Skip for
+ now; in either case there is nothing to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry(frame, this, fd, locked_on);
+ }
+unlock:
+ afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL, locked_on,
+ NULL);
+
+ if (fd)
+ fd_unref(fd);
+
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 5727209c58e..03f43bad16e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -1,620 +1,546 @@
/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
+#include <glusterfs/byte-order.h>
+#include "protocol-common.h"
+#include <glusterfs/events.h>
+#define AFR_HEAL_ATTR (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE)
-int
-afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+_afr_ignorable_key_match(dict_t *d, char *k, data_t *val, void *mdata)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
-// memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
- memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count);
- memset (sh->success, 0, sizeof (int) * priv->child_count);
-
-/* for (i = 0; i < priv->child_count; i++) { */
-/* sh->locked_nodes[i] = 1; */
-/* } */
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i])
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
-
- if (local->govinda_gOvinda) {
- gf_log (this->name, GF_LOG_INFO,
- "split-brain detected, aborting selfheal of %s",
- local->loc.path);
- sh->op_failed = 1;
- sh->completion_cbk (frame, this);
- } else {
- if (IA_ISDIR (sh->type)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to entry check on %s",
- local->loc.path);
- afr_self_heal_entry (frame, this);
- return 0;
- }
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to data check on %s",
- local->loc.path);
- afr_self_heal_data (frame, this);
- }
-
- return 0;
+ return afr_is_xattr_ignorable(k);
}
-
-int
-afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- int call_count = 0;
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_metadata_done (frame, this);
-
- return 0;
-}
-
-int
-afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_cbk = afr_sh_metadata_done;
- afr_unlock (frame, this);
-
- return 0;
-}
-
-int
-afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
+void
+afr_delete_ignorable_xattrs(dict_t *xattr)
{
- afr_sh_inode_unlock (frame, this);
-
- return 0;
+ dict_foreach_match(xattr, _afr_ignorable_key_match, NULL,
+ dict_remove_foreach_fn, NULL);
}
-
int
-afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
+__afr_selfheal_metadata_do(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *locked_replies)
{
- afr_local_t *local = NULL;
- int call_count = 0;
-
- local = frame->local;
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_metadata_finish (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix,
- sh->success, priv->child_count,
- AFR_METADATA_TRANSACTION);
-
- erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count,
- gf_afr_mt_dict_t);
- if (!erase_xattr)
- return -ENOMEM;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
-
- erase_xattr[i] = get_new_dict();
- dict_ref (erase_xattr[i]);
- }
- }
-
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
- priv->child_count, AFR_METADATA_TRANSACTION);
-
- local->call_count = call_count;
-
- if (call_count == 0) {
- gf_log (this->name, GF_LOG_INFO,
- "metadata of %s not healed on any subvolume",
- local->loc.path);
-
- afr_sh_metadata_finish (frame, this);
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "erasing pending flags from %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
- if (!--call_count)
- break;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
- }
+ int ret = -1;
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+ dict_t *old_xattr = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing metadata selfheal on %s", uuid_utoa(inode->gfid));
+
+ ret = syncop_getxattr(priv->children[source], &loc, &xattr, NULL, NULL,
+ NULL);
+ if (ret < 0) {
+ ret = -EIO;
+ goto out;
+ }
+
+ afr_delete_ignorable_xattrs(xattr);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (old_xattr) {
+ dict_unref(old_xattr);
+ old_xattr = NULL;
}
- GF_FREE (erase_xattr);
- return 0;
-}
-
-
-int
-afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "setting attributes failed for %s on %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->success[child_index] = 0;
- }
+ if (!healed_sinks[i])
+ continue;
+
+ ret = syncop_setattr(priv->children[i], &loc,
+ &locked_replies[source].poststat, AFR_HEAL_ATTR,
+ NULL, NULL, NULL, NULL);
+ if (ret)
+ healed_sinks[i] = 0;
+
+ ret = syncop_getxattr(priv->children[i], &loc, &old_xattr, 0, NULL,
+ NULL);
+ if (old_xattr) {
+ afr_delete_ignorable_xattrs(old_xattr);
+ ret = syncop_removexattr(priv->children[i], &loc, "", old_xattr,
+ NULL);
+ if (ret)
+ healed_sinks[i] = 0;
}
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_metadata_erase_pending (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
-{
- afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
-
- return 0;
-}
+ ret = syncop_setxattr(priv->children[i], &loc, xattr, 0, NULL, NULL);
+ if (ret)
+ healed_sinks[i] = 0;
+ }
+ ret = 0;
-int
-afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
+out:
+ loc_wipe(&loc);
+ if (xattr)
+ dict_unref(xattr);
+ if (old_xattr)
+ dict_unref(old_xattr);
- return 0;
+ return ret;
}
-
-int
-afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
+static uint64_t
+mtime_ns(struct iatt *ia)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
- int active_sinks = 0;
- int call_count = 0;
- int i = 0;
-
- struct iatt stbuf = {0,};
- int32_t valid = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
- active_sinks = sh->active_sinks;
+ uint64_t ret;
- /*
- * 2 calls per sink - setattr, setxattr
- */
- if (xattr)
- call_count = active_sinks * 2;
- else
- call_count = active_sinks;
+ ret = (((uint64_t)(ia->ia_mtime)) * 1000000000) +
+ (uint64_t)(ia->ia_mtime_nsec);
- local->call_count = call_count;
-
- stbuf.ia_atime = sh->buf[source].ia_atime;
- stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec;
- stbuf.ia_mtime = sh->buf[source].ia_mtime;
- stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec;
-
- stbuf.ia_uid = sh->buf[source].ia_uid;
- stbuf.ia_gid = sh->buf[source].ia_gid;
-
- stbuf.ia_type = sh->buf[source].ia_type;
- stbuf.ia_prot = sh->buf[source].ia_prot;
-
- valid = GF_SET_ATTR_MODE |
- GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
-
- for (i = 0; i < priv->child_count; i++) {
- if (call_count == 0) {
- break;
- }
- if (sh->sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing metadata of %s from %s to %s",
- local->loc.path, priv->children[source]->name,
- priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc, &stbuf, valid);
-
- call_count--;
-
- if (!xattr)
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc, xattr, 0);
- call_count--;
- }
-
- return 0;
+ return ret;
}
-
-int
-afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
+/*
+ * When directory content is modified, [mc]time is updated. On
+ * Linux, the filesystem does it, while at least on NetBSD, the
+ * kernel file-system independent code does it. This means that
+ * when entries are added while bricks are down, the kernel sends
+ * a SETATTR [mc]time which will cause metadata split brain for
+ * the directory. In this case, clear the split brain by finding
+ * the source with the most recent modification date.
+ */
+static int
+afr_dirtime_splitbrain_source(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- int i;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "getxattr of %s failed on subvolume %s (%s). proceeding without xattr",
- local->loc.path, priv->children[source]->name,
- strerror (op_errno));
-
- afr_sh_metadata_sync (frame, this, NULL);
- } else {
- for (i = 0; i < priv->child_count; i++) {
- dict_del (xattr, priv->pending_key[i]);
- }
-
- afr_sh_metadata_sync (frame, this, xattr);
- }
-
- return 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ struct iatt source_ia;
+ struct iatt child_ia;
+ uint64_t mtime = 0;
+ int i;
+ int ret = -1;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!locked_on[i])
+ continue;
+
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret != 0)
+ continue;
+
+ if (mtime_ns(&replies[i].poststat) <= mtime)
+ continue;
+
+ mtime = mtime_ns(&replies[i].poststat);
+ source = i;
+ }
+
+ if (source == -1)
+ goto out;
+
+ source_ia = replies[source].poststat;
+ if (source_ia.ia_type != IA_IFDIR)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source)
+ continue;
+
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret != 0)
+ continue;
+
+ child_ia = replies[i].poststat;
+
+ if (!IA_EQUAL(source_ia, child_ia, gfid) ||
+ !IA_EQUAL(source_ia, child_ia, type) ||
+ !IA_EQUAL(source_ia, child_ia, prot) ||
+ !IA_EQUAL(source_ia, child_ia, uid) ||
+ !IA_EQUAL(source_ia, child_ia, gid) ||
+ !afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata))
+ goto out;
+ }
+
+ /*
+ * Metadata split brain is just about [amc]time
+ * We return our source.
+ */
+ ret = source;
+out:
+ return ret;
}
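
afr_dirtime_splitbrain_source() above picks the reply with the most recent modification time by folding seconds and nanoseconds into a single 64-bit value and taking the argmax over the valid replies, before verifying that everything else about the directory matches. A self-contained sketch of that selection; struct stat_like stands in for struct iatt and is not an AFR type:

/* Sketch of the "most recent mtime wins" selection above. */
#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

struct stat_like {
    bool valid;
    uint64_t mtime;      /* seconds */
    uint32_t mtime_nsec; /* nanoseconds */
};

static uint64_t mtime_ns(const struct stat_like *s)
{
    return s->mtime * 1000000000ULL + s->mtime_nsec;
}

static int most_recent(const struct stat_like *replies, int n)
{
    uint64_t best = 0;
    int source = -1;

    for (int i = 0; i < n; i++) {
        if (!replies[i].valid)
            continue;
        if (mtime_ns(&replies[i]) <= best)
            continue;
        best = mtime_ns(&replies[i]);
        source = i;
    }
    return source; /* -1 if no usable reply was found */
}

int main(void)
{
    struct stat_like r[3] = {
        {true, 100, 5}, {true, 100, 7}, {false, 999, 0},
    };
    printf("%d\n", most_recent(r, 3)); /* 1: same second, newer nanoseconds */
    return 0;
}
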
-
-int
-afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode,
+ struct afr_reply *replies,
+ unsigned char *sources)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
- }
+ int ret = 0;
+ int i = 0;
+ int m_idx = 0;
+ afr_private_t *priv = NULL;
+ int raw[AFR_NUM_CHANGE_LOGS] = {0};
+ dict_t *xattr = NULL;
+
+ priv = this->private;
+ m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION);
+ raw[m_idx] = 1;
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i])
+ continue;
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ ret = -1;
+ goto out;
}
- sh->success[source] = 1;
-
- if (active_sinks == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_metadata_finish (frame, this);
- return 0;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO,
+ "Failed to set pending metadata xattr on child %d for %s", i,
+ uuid_utoa(inode->gfid));
+ goto out;
}
- sh->active_sinks = active_sinks;
-
- gf_log (this->name, GF_LOG_TRACE,
- "syncing metadata of %s from subvolume %s to %d active sinks",
- local->loc.path, priv->children[source]->name, active_sinks);
+ }
- STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
- priv->children[source],
- priv->children[source]->fops->getxattr,
- &local->loc, NULL);
+ afr_replies_wipe(replies, priv->child_count);
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
- return 0;
+out:
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
}
-
-void
-afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+/*
+ * Look for mismatching uid/gid or mode or user xattrs even if
+ * AFR xattrs don't say so, and pick one arbitrarily as winner. */
+
+static int
+__afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0) {
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_metadata_finish (frame, this);
- goto out;
- }
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->child_success,
- AFR_METADATA_TRANSACTION);
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_metadata_finish (frame, this);
- goto out;
- }
-
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_WARNING,
- "Picking favorite child %s as authentic source to resolve conflicting metadata of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct iatt srcstat = {
+ 0,
+ };
+ int source = -1;
+ int sources_count = 0;
+ int ret = 0;
+
+ priv = this->private;
+
+ sources_count = AFR_COUNT(sources, priv->child_count);
+
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count) {
+ source = afr_mark_split_brain_source_sinks(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ replies, AFR_METADATA_TRANSACTION);
+ if (source >= 0) {
+ _afr_fav_child_reset_sink_xattrs(
+ frame, this, inode, source, healed_sinks, undid_pending,
+ AFR_METADATA_TRANSACTION, locked_on, replies);
+ goto out;
}
- if (nsources == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to self-heal permissions/ownership of '%s' "
- "(possible split-brain). Please fix the file on "
- "all backend volumes", local->loc.path);
-
- local->govinda_gOvinda = 1;
-
- afr_sh_metadata_finish (frame, this);
- goto out;
+ /* If this is a directory mtime/ctime only split brain
+ use the most recent */
+ source = afr_dirtime_splitbrain_source(frame, this, replies, locked_on);
+ if (source != -1) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SPLIT_BRAIN,
+ "clear time "
+ "split brain on %s",
+ uuid_utoa(replies[source].poststat.ia_gfid));
+ sources[source] = 1;
+ healed_sinks[source] = 0;
+ goto out;
}
- source = afr_sh_select_source (sh->sources, priv->child_count);
-
- if (source == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No active sources found.");
-
- afr_sh_metadata_finish (frame, this);
- goto out;
+ if (!priv->metadata_splitbrain_forced_heal) {
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;"
+ "type=metadata;file=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(inode->gfid));
+ return -EIO;
}
- sh->source = source;
-
- /* detect changes not visible through pending flags -- JIC */
+ /* Metadata split brain, select one subvol
+ arbitrarily */
for (i = 0; i < priv->child_count; i++) {
- if (i == source || sh->child_errno[i])
- continue;
-
- if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
-
- if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
+ if (locked_on[i] && healed_sinks[i]) {
+ sources[i] = 1;
+ healed_sinks[i] = 0;
+ break;
+ }
}
-
- afr_sh_metadata_sync_prepare (frame, this);
+ }
+
+ /* No split brain at this point. If we were called from
+ * afr_heal_splitbrain_file(), abort.*/
+ if (afr_dict_contains_heal_op(frame))
+ return -EIO;
+
+ source = afr_choose_source_by_policy(priv, sources,
+ AFR_METADATA_TRANSACTION);
+ srcstat = replies[source].poststat;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] || i == source)
+ continue;
+ if (!IA_EQUAL(srcstat, replies[i].poststat, type) ||
+ !IA_EQUAL(srcstat, replies[i].poststat, uid) ||
+ !IA_EQUAL(srcstat, replies[i].poststat, gid) ||
+ !IA_EQUAL(srcstat, replies[i].poststat, prot)) {
+ gf_msg_debug(this->name, 0,
+ "%s: iatt mismatch "
+ "for source(%d) vs (%d)",
+ uuid_utoa(replies[source].poststat.ia_gfid), source,
+ i);
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] || i == source)
+ continue;
+ if (!afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) {
+ gf_msg_debug(this->name, 0,
+ "%s: xattr mismatch "
+ "for source(%d) vs (%d)",
+ uuid_utoa(replies[source].poststat.ia_gfid), source,
+ i);
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+ if ((sources_count == priv->child_count) && (source > -1) &&
+ (AFR_COUNT(healed_sinks, priv->child_count) != 0)) {
+ ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode,
+ replies, sources);
+ if (ret < 0)
+ return ret;
+ }
out:
- return;
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ return source;
}
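
The finalize-source pass above demotes any other claimed source whose stat or xattrs differ from the chosen one into a healed sink, so it gets overwritten rather than trusted. A standalone sketch of that demotion loop; metadata_equal() is a placeholder for the IA_EQUAL/afr_xattrs_are_equal checks, not a real AFR helper:

/* Sketch of the demotion pass above: disagreeing "sources" become sinks. */
#include <stdbool.h>
#include <stdio.h>

static bool metadata_equal(int a, int b) { return a == b; } /* fake comparison */

static void demote_mismatching_sources(const int *meta, unsigned char *sources,
                                        unsigned char *healed_sinks, int n,
                                        int source)
{
    for (int i = 0; i < n; i++) {
        if (!sources[i] || i == source)
            continue;
        if (!metadata_equal(meta[source], meta[i])) {
            sources[i] = 0;      /* no longer trusted as a source */
            healed_sinks[i] = 1; /* heal it from the chosen source instead */
        }
    }
}

int main(void)
{
    int meta[3] = {7, 7, 9};
    unsigned char sources[3] = {1, 1, 1};
    unsigned char healed_sinks[3] = {0, 0, 0};

    demote_mismatching_sources(meta, sources, healed_sinks, 3, 0);
    printf("%d %d %d\n", healed_sinks[0], healed_sinks[1], healed_sinks[2]); /* 0 0 1 */
    return 0;
}
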
int
-afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame,
- xlator_t *this)
+__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *pflag)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Non Blocking metadata "
- "inodelks failed for %s.", local->loc.path);
- gf_log (this->name, GF_LOG_ERROR, "Metadata self-heal "
- "failed for %s.", local->loc.path);
- afr_sh_metadata_done (frame, this);
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata "
- "inodelks done for %s. Proceeding to FOP",
- local->loc.path);
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_metadata_fix, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS);
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+ if (ret)
+ return ret;
+
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_METADATA_TRANSACTION, locked_on,
+ sources, sinks, witness, pflag);
+ if (ret)
+ return ret;
+
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+ /* If any source has witness, pick first
+ * witness source and make everybody else sinks */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] && witness[i]) {
+ source = i;
+ break;
}
+ }
- return 0;
-}
-
-int
-afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK;
-
- afr_set_lock_number (frame, this);
+ if (source != -1) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (i != source && sources[i]) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+ }
- int_lock->lk_flock.l_start = 0;
- int_lock->lk_flock.l_len = 0;
- int_lock->lk_flock.l_type = F_WRLCK;
- int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk;
+ source = __afr_selfheal_metadata_finalize_source(
+ frame, this, inode, sources, sinks, healed_sinks, undid_pending,
+ locked_on, replies);
- afr_nonblocking_inodelk (frame, this);
+ if (source < 0)
+ return -EIO;
- return 0;
+ return source;
}
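
The prepare step above initializes healed_sinks[] optimistically to the intersection of sinks[] and locked_on[], and later clears entries as individual heals fail. A self-contained sketch of that intersection; intersect() mirrors how AFR_INTERSECT is used here and is not the real macro:

/* Sketch of the optimistic healed_sinks initialization above. */
#include <stdio.h>

static void intersect(unsigned char *dst, const unsigned char *a,
                      const unsigned char *b, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = a[i] && b[i];
}

int main(void)
{
    unsigned char sinks[4]     = {1, 1, 0, 1};
    unsigned char locked_on[4] = {1, 0, 1, 1};
    unsigned char healed_sinks[4];

    intersect(healed_sinks, sinks, locked_on, 4);
    for (int i = 0; i < 4; i++)
        printf("%d ", healed_sinks[i]); /* 1 0 0 1 */
    printf("\n");
    return 0;
}
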
-
int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
+afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = this->private;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ gf_boolean_t did_sh = _gf_true;
+ int source = -1;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+ data_lock = alloca0(priv->child_count);
+
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
+ data_lock);
+ {
+ if (ret < priv->child_count) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __afr_selfheal_metadata_prepare(
+ frame, this, inode, data_lock, sources, sinks, healed_sinks,
+ undid_pending, locked_replies, NULL);
+ if (ret < 0)
+ goto unlock;
- local = frame->local;
+ source = ret;
- if (local->self_heal.need_metadata_self_heal && priv->metadata_self_heal) {
- afr_sh_metadata_lock (frame, this);
- } else {
- afr_sh_metadata_done (frame, this);
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ did_sh = _gf_false;
+ goto unlock;
}
- return 0;
+ ret = __afr_selfheal_metadata_do(frame, this, inode, source,
+ healed_sinks, locked_replies);
+ if (ret)
+ goto unlock;
+
+ afr_selfheal_restore_time(frame, this, inode, source, healed_sinks,
+ locked_replies);
+
+ ret = afr_selfheal_undo_pending(
+ frame, this, inode, sources, sinks, healed_sinks, undid_pending,
+ AFR_METADATA_TRANSACTION, locked_replies, data_lock);
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
+ data_lock);
+
+ if (did_sh)
+ afr_log_selfheal(inode->gfid, this, ret, "metadata", source, sources,
+ healed_sinks);
+ else
+ ret = 1;
+
+ if (locked_replies)
+ afr_replies_wipe(locked_replies, priv->child_count);
+ return ret;
+}
+
+int
+afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf)
+{
+ inode_t *inode = NULL;
+ inode_t *link_inode = NULL;
+ call_frame_t *frame = NULL;
+ int ret = 0;
+
+ if (gf_uuid_is_null(stbuf->ia_gfid)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ inode = inode_new(this->itable);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ link_inode = inode_link(inode, NULL, NULL, stbuf);
+ if (!link_inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = afr_selfheal_metadata(frame, this, link_inode);
+out:
+ if (inode)
+ inode_unref(inode);
+ if (link_inode)
+ inode_unref(link_inode);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
new file mode 100644
index 00000000000..834aac86d48
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -0,0 +1,616 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/events.h>
+#include "afr.h"
+#include "afr-self-heal.h"
+#include "afr-messages.h"
+
+int
+__afr_selfheal_assign_gfid(xlator_t *this, inode_t *parent, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ struct afr_reply *replies, void *gfid,
+ unsigned char *locked_on, int source,
+ unsigned char *sources, gf_boolean_t is_gfid_absent,
+ int *gfid_idx)
+{
+ int ret = 0;
+ int up_count = 0;
+ int locked_count = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ gf_uuid_copy(parent->gfid, pargfid);
+
+ if (is_gfid_absent) {
+ /* Ensure all children of AFR are up before performing gfid heal, to
+ * guard against the possibility of gfid split brain. */
+
+ up_count = AFR_COUNT(priv->child_up, priv->child_count);
+ if (up_count != priv->child_count) {
+ ret = -EIO;
+ goto out;
+ }
+
+ locked_count = AFR_COUNT(locked_on, priv->child_count);
+ if (locked_count != priv->child_count) {
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+ ret = afr_lookup_and_heal_gfid(this, parent, bname, inode, replies, source,
+ sources, gfid, gfid_idx);
+
+out:
+ return ret;
+}
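
__afr_selfheal_assign_gfid() above refuses to mint a new gfid unless every child is both up and locked, so two bricks can never assign different gfids to the same name. A small standalone sketch of that guard (illustrative names only, not AFR code):

/* Sketch of the gfid-assignment guard above. */
#include <stdbool.h>
#include <stdio.h>

static int count_set(const unsigned char *v, int n)
{
    int c = 0;
    for (int i = 0; i < n; i++)
        if (v[i])
            c++;
    return c;
}

static bool may_assign_new_gfid(const unsigned char *child_up,
                                const unsigned char *locked_on, int child_count)
{
    return count_set(child_up, child_count) == child_count &&
           count_set(locked_on, child_count) == child_count;
}

int main(void)
{
    unsigned char up[3] = {1, 1, 1}, locked[3] = {1, 1, 0};
    printf("%d\n", may_assign_new_gfid(up, locked, 3)); /* 0: would risk gfid split-brain */
    return 0;
}
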
+
+int
+__afr_selfheal_name_impunge(call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid, const char *bname,
+ inode_t *inode, struct afr_reply *replies,
+ int gfid_idx)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ unsigned char *sources = NULL;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+
+ gf_uuid_copy(parent->gfid, pargfid);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[gfid_idx].poststat.ia_gfid) == 0) {
+ sources[i] = 1;
+ continue;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i])
+ continue;
+
+ ret |= afr_selfheal_recreate_entry(frame, i, gfid_idx, sources, parent,
+ bname, inode, replies);
+ }
+
+ return ret;
+}
+
+int
+__afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret)
+ continue;
+
+ ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i,
+ replies);
+ }
+
+ return ret;
+}
+
+static gf_boolean_t
+afr_selfheal_name_need_heal_check(xlator_t *this, struct afr_reply *replies)
+{
+ int i = 0;
+ int first_idx = -1;
+ gf_boolean_t need_heal = _gf_false;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA))
+ need_heal = _gf_true;
+
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first_idx].op_ret)
+ need_heal = _gf_true;
+
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid))
+ need_heal = _gf_true;
+
+ if ((replies[i].op_ret == 0) &&
+ (gf_uuid_is_null(replies[i].poststat.ia_gfid)))
+ need_heal = _gf_true;
+ }
+
+ return need_heal;
+}
+
+static int
+afr_selfheal_name_type_mismatch_check(xlator_t *this, struct afr_reply *replies,
+ int source, unsigned char *sources,
+ uuid_t pargfid, const char *bname)
+{
+ int i = 0;
+ int type_idx = -1;
+ ia_type_t inode_type = IA_INVAL;
+ ia_type_t inode_type1 = IA_INVAL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+
+ if (replies[i].poststat.ia_type == IA_INVAL)
+ continue;
+
+ if (inode_type == IA_INVAL) {
+ inode_type = replies[i].poststat.ia_type;
+ type_idx = i;
+ continue;
+ }
+ inode_type1 = replies[i].poststat.ia_type;
+ if (sources[i] || source == -1) {
+ if ((sources[type_idx] || source == -1) &&
+ (inode_type != inode_type1)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN,
+ "Type mismatch for <gfid:%s>/%s: "
+ "%s on %s and %s on %s",
+ uuid_utoa(pargfid), bname,
+ gf_inode_type_to_str(inode_type1),
+ priv->children[i]->name,
+ gf_inode_type_to_str(inode_type),
+ priv->children[type_idx]->name);
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=file;"
+ "file=<gfid:%s>/%s;count=2;"
+ "child-%d=%s;type-%d=%s;child-%d=%s;"
+ "type-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(pargfid), bname, i, priv->children[i]->name,
+ i, gf_inode_type_to_str(inode_type1), type_idx,
+ priv->children[type_idx]->name, type_idx,
+ gf_inode_type_to_str(inode_type));
+ return -EIO;
+ }
+ inode_type = replies[i].poststat.ia_type;
+ type_idx = i;
+ }
+ }
+ return 0;
+}
+
+static int
+afr_selfheal_name_gfid_mismatch_check(xlator_t *this, struct afr_reply *replies,
+ int source, unsigned char *sources,
+ int *gfid_idx, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ unsigned char *locked_on, dict_t *xdata)
+{
+ int i = 0;
+ int gfid_idx_iter = -1;
+ int ret = -1;
+ void *gfid = NULL;
+ void *gfid1 = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+
+ if (gf_uuid_is_null(replies[i].poststat.ia_gfid))
+ continue;
+
+ if (!gfid) {
+ gfid = &replies[i].poststat.ia_gfid;
+ gfid_idx_iter = i;
+ continue;
+ }
+
+ gfid1 = &replies[i].poststat.ia_gfid;
+ if (sources[i] || source == -1) {
+ if ((sources[gfid_idx_iter] || source == -1) &&
+ gf_uuid_compare(gfid, gfid1)) {
+ ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
+ bname, gfid_idx_iter, i,
+ locked_on, gfid_idx, xdata);
+ if (!ret && *gfid_idx >= 0) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ "GFID split-brain resolved");
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error setting gfid-"
+ "heal-msg dict");
+ }
+ return ret;
+ }
+ gfid = &replies[i].poststat.ia_gfid;
+ gfid_idx_iter = i;
+ }
+ }
+
+ *gfid_idx = gfid_idx_iter;
+ return 0;
+}
+
+static gf_boolean_t
+afr_selfheal_name_source_empty_check(xlator_t *this, struct afr_reply *replies,
+ unsigned char *sources, int source)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ gf_boolean_t source_is_empty = _gf_true;
+
+ priv = this->private;
+
+ if (source == -1) {
+ source_is_empty = _gf_false;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+
+ if (replies[i].op_ret == -1 && replies[i].op_errno == ENOENT)
+ continue;
+
+ source_is_empty = _gf_false;
+ break;
+ }
+out:
+ return source_is_empty;
+}
+
+int
+__afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, int source,
+ unsigned char *locked_on, struct afr_reply *replies,
+ void *gfid_req, dict_t *xdata)
+{
+ int gfid_idx = -1;
+ int ret = -1;
+ void *gfid = NULL;
+ gf_boolean_t source_is_empty = _gf_true;
+ gf_boolean_t need_heal = _gf_false;
+ gf_boolean_t is_gfid_absent = _gf_false;
+
+ need_heal = afr_selfheal_name_need_heal_check(this, replies);
+ if (!need_heal)
+ return 0;
+
+ source_is_empty = afr_selfheal_name_source_empty_check(this, replies,
+ sources, source);
+ if (source_is_empty) {
+ ret = __afr_selfheal_name_expunge(this, parent, pargfid, bname, inode,
+ replies);
+ if (ret == -EIO)
+ ret = -1;
+ return ret;
+ }
+
+ ret = afr_selfheal_name_type_mismatch_check(this, replies, source, sources,
+ pargfid, bname);
+ if (ret)
+ return ret;
+
+ ret = afr_selfheal_name_gfid_mismatch_check(this, replies, source, sources,
+ &gfid_idx, pargfid, bname,
+ inode, locked_on, xdata);
+ if (ret)
+ return ret;
+
+ if (gfid_idx == -1) {
+ if (!gfid_req || gf_uuid_is_null(gfid_req))
+ return -1;
+ gfid = gfid_req;
+ } else {
+ gfid = &replies[gfid_idx].poststat.ia_gfid;
+ if (source == -1)
+ /* Either entry split-brain or dirty xattrs are present on parent.*/
+ source = gfid_idx;
+ }
+
+ is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false;
+ ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode,
+ replies, gfid, locked_on, source, sources,
+ is_gfid_absent, &gfid_idx);
+ if (ret || (gfid_idx < 0))
+ return ret;
+
+ ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname,
+ inode, replies, gfid_idx);
+ if (ret == -EIO)
+ ret = -1;
+
+ return ret;
+}
+
+int
+__afr_selfheal_name_finalize_source(xlator_t *this, unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on, uint64_t *witness)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int sources_count = 0;
+
+ priv = this->private;
+
+ sources_count = AFR_COUNT(sources, priv->child_count);
+
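+    /* No usable source: every locked-on brick is a (healed) sink, there are
+       no sources at all, or a witness exists. Clear sources[] and mark all
+       locked-on bricks as active sinks so the caller falls back to a
+       conservative merge (signalled by returning -1). */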
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count || afr_does_witness_exist(this, witness)) {
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ return -1;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+
+ return source;
+}
+
+int
+__afr_selfheal_name_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ int *source_p)
+{
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+
+ replies = alloca0(priv->child_count * sizeof(*replies));
+
+ ret = afr_selfheal_unlocked_discover(frame, parent, pargfid, replies);
+ if (ret)
+ goto out;
+
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, locked_on, sources,
+ sinks, witness, NULL);
+ if (ret)
+ goto out;
+
+    /* Initialize the healed_sinks[] array optimistically to
+       the intersection of to-be-healed (i.e. sinks[]) and
+       the list of servers which are up (i.e. locked_on[]).
+
+       As we encounter failures in the healing process, we
+       will unmark the respective servers in the healed_sinks[]
+       array.
+    */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
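+    /* Illustrative example: with sinks[] = {1, 0, 1} and
+       locked_on[] = {1, 1, 0}, healed_sinks[] starts out as {1, 0, 0}. */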
+
+ source = __afr_selfheal_name_finalize_source(this, sources, healed_sinks,
+ locked_on, witness);
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source;
+
+out:
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+
+ return ret;
+}
+
+int
+afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, void *gfid_req,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *locked_on = NULL;
+ int source = -1;
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+ inode_t *inode = NULL;
+ dict_t *xattr = NULL;
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+
+ ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
+ if (ret) {
+ dict_unref(xattr);
+ return -1;
+ }
+
+ priv = this->private;
+
+ locked_on = alloca0(priv->child_count);
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+
+ replies = alloca0(priv->child_count * sizeof(*replies));
+
+ ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname,
+ locked_on);
+ {
+ if (ret < priv->child_count) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_name_prepare(frame, this, parent, pargfid,
+ locked_on, sources, sinks,
+ healed_sinks, &source);
+ if (ret)
+ goto unlock;
+
+ inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
+ locked_on, xattr);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_name_do(frame, this, parent, pargfid, bname, inode,
+ sources, sinks, healed_sinks, source,
+ locked_on, replies, gfid_req, xdata);
+ }
+unlock:
+ afr_selfheal_unentrylk(frame, this, parent, this->name, bname, locked_on,
+ NULL);
+ if (inode)
+ inode_unref(inode);
+
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
+}
+
+int
+afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid,
+ const char *bname, gf_boolean_t *need_heal)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ struct afr_reply *replies = NULL;
+ inode_t *inode = NULL;
+ int first_idx = -1;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+
+ inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
+ local->child_up, NULL);
+ if (!inode)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) {
+ *need_heal = _gf_true;
+ break;
+ }
+
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first_idx].op_ret) {
+ *need_heal = _gf_true;
+ break;
+ }
+
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid)) {
+ *need_heal = _gf_true;
+ break;
+ }
+ }
+
+ if (inode)
+ inode_unref(inode);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ return 0;
+}
+
+int
+afr_selfheal_name(xlator_t *this, uuid_t pargfid, const char *bname,
+ void *gfid_req, dict_t *xdata)
+{
+ inode_t *parent = NULL;
+ call_frame_t *frame = NULL;
+ int ret = -1;
+ gf_boolean_t need_heal = _gf_false;
+
+ parent = afr_inode_find(this, pargfid);
+ if (!parent)
+ goto out;
+
+ frame = afr_frame_create(this, NULL);
+ if (!frame)
+ goto out;
+
+ ret = afr_selfheal_name_unlocked_inspect(frame, this, parent, pargfid,
+ bname, &need_heal);
+ if (ret)
+ goto out;
+
+ if (need_heal) {
+ ret = afr_selfheal_name_do(frame, this, parent, pargfid, bname,
+ gfid_req, xdata);
+ if (ret)
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (parent)
+ inode_unref(parent);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+
+ return ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 5b211499acc..48e6dbcfb18 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -1,62 +1,377 @@
/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#ifndef __AFR_SELF_HEAL_H__
-#define __AFR_SELF_HEAL_H__
+#ifndef _AFR_SELFHEAL_H
+#define _AFR_SELFHEAL_H
+
+/* Perform fop on all UP subvolumes and wait for all callbacks to return */
+
+#define AFR_ONALL(frame, rfn, fop, args...) \
+ do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0, __count = 0; \
+ unsigned char *__child_up = alloca(__priv->child_count); \
+ \
+ memcpy(__child_up, __priv->child_up, \
+ sizeof(*__child_up) * __priv->child_count); \
+ __count = AFR_COUNT(__child_up, __priv->child_count); \
+ \
+ __local->barrier.waitfor = __count; \
+ afr_local_replies_wipe(__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__child_up[__i]) \
+ continue; \
+ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ } \
+ syncbarrier_wait(&__local->barrier, __count); \
+ } while (0)
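+/* Illustrative usage (assuming an AFR frame whose local was set up via
+   afr_frame_create(), as at the AFR_ONLIST() call sites further below):
+
+       AFR_ONALL(frame, afr_selfheal_discover_cbk, lookup, &loc, NULL);
+
+   winds the lookup on every up child and blocks on the syncbarrier until all
+   replies have been collected into local->replies[]. */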
+
+/* Perform fop on all subvolumes represented by list[] array and wait
+ for all callbacks to return */
+
+#define AFR_ONLIST(list, frame, rfn, fop, args...) \
+ do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0; \
+ int __count = 0; \
+ unsigned char *__list = alloca(__priv->child_count); \
+ \
+ memcpy(__list, list, sizeof(*__list) * __priv->child_count); \
+ __count = AFR_COUNT(__list, __priv->child_count); \
+ __local->barrier.waitfor = __count; \
+ afr_local_replies_wipe(__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__list[__i]) \
+ continue; \
+ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ } \
+ syncbarrier_wait(&__local->barrier, __count); \
+ } while (0)
+
+#define AFR_SEQ(frame, rfn, fop, args...) \
+ do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0; \
+ \
+ afr_local_replies_wipe(__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__priv->child_up[__i]) \
+ continue; \
+ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ syncbarrier_wait(&__local->barrier, 1); \
+ } \
+ } while (0)
+
+#define ALLOC_MATRIX(n, type) \
+ ({ \
+ int __i; \
+ type **__ptr = alloca(n * sizeof(type *)); \
+ \
+ for (__i = 0; __i < n; __i++) \
+ __ptr[__i] = alloca0(n * sizeof(type)); \
+ __ptr; \
+ })
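+/* Illustrative usage: ALLOC_MATRIX(priv->child_count, int) yields an
+   alloca()-backed child_count x child_count matrix of ints (e.g. for pending
+   changelog counts), valid only for the lifetime of the calling function. */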
+
+#define IA_EQUAL(f, s, field) \
+ (memcmp(&(f.ia_##field), &(s.ia_##field), sizeof(s.ia_##field)) == 0)
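+/* Illustrative usage: IA_EQUAL(replies[i].poststat, replies[j].poststat, uid)
+   compares the ia_uid members of two struct iatt values byte-for-byte. */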
+
+#define SBRAIN_HEAL_NO_GO_MSG \
+ "Failed to obtain replies from all bricks of " \
+ "the replica (are they up?). Cannot resolve split-brain."
+#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain"
+#define SNO_BIGGER_FILE "No bigger file"
+#define SNO_DIFF_IN_MTIME "No difference in mtime"
+#define SUSE_SOURCE_BRICK_TO_HEAL \
+ "Use source-brick option to heal metadata" \
+ " split-brain"
+#define SINVALID_BRICK_NAME "Invalid brick name"
+#define SBRICK_IS_NOT_UP "Brick is not up"
+#define SBRICK_NOT_CONNECTED "Brick is not connected"
+#define SLESS_THAN2_BRICKS_in_REP "< 2 bricks in replica are up"
+#define SBRICK_IS_REMOTE "Brick is remote"
+#define SSTARTED_SELF_HEAL "Started self-heal"
+#define SOP_NOT_SUPPORTED "Operation Not Supported"
+#define SFILE_NOT_UNDER_DATA \
+ "The file is not under data or metadata " \
+ "split-brain"
+#define SALL_BRICKS_UP_TO_RESOLVE \
+ "All the bricks should be up to resolve the" \
+ " gfid split brain"
+#define SERROR_GETTING_SRC_BRICK "Error getting the source brick"
+int
+afr_selfheal(xlator_t *this, uuid_t gfid);
+
+gf_boolean_t
+afr_throttled_selfheal(call_frame_t *frame, xlator_t *this);
+
+int
+afr_selfheal_name(xlator_t *this, uuid_t gfid, const char *name, void *gfid_req,
+ dict_t *xdata);
+
+int
+afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd);
+
+int
+afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+int
+afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+int
+afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
+ inode_t *inode, struct afr_reply *replies, int source,
+ unsigned char *sources, void *gfid, int *gfid_idx);
+
+int
+afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
+
+int
+afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
+
+int
+afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, off_t off,
+ size_t size, unsigned char *locked_on);
+
+int
+afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on);
+
+int
+afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
-#include <sys/stat.h>
+int
+afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
-#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type)
-#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type))
-#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid))
-#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size)
+int
+afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, const char *name,
+ unsigned char *locked_on);
-#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size)
+int
+afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on,
+ dict_t *xdata);
+
+int
+afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
+ struct afr_reply *replies);
+
+int
+afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on, dict_t *dict);
+inode_t *
+afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on, dict_t *xattr);
+
+int
+afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ uint64_t *witness, unsigned char *flag);
+int
+afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx,
+ dict_t *xdata);
+
+int
+afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix);
+
+int
+afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata);
int
-afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this);
+afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies);
int
-afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this);
+afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type, struct afr_reply *replies,
+ unsigned char *locked_on);
+
int
-afr_sh_has_data_pending (dict_t *xattr, xlator_t *this);
+afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
+ unsigned char *sources, inode_t *dir,
+ const char *name, inode_t *inode,
+ struct afr_reply *replies);
int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this);
+afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr, dict_t *xdata);
+
+call_frame_t *
+afr_frame_create(xlator_t *this, int32_t *op_errno);
+
+inode_t *
+afr_inode_find(xlator_t *this, uuid_t gfid);
int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this);
+afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *parbuf);
+void
+afr_reply_copy(struct afr_reply *dst, struct afr_reply *src);
+
+void
+afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count);
int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this);
+afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, struct afr_reply *replies,
+ unsigned char *sources, unsigned char *newentry);
+
+unsigned int
+afr_success_count(struct afr_reply *replies, unsigned int count);
+
+void
+afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source,
+ unsigned char *sources, unsigned char *healed_sinks);
void
-afr_self_heal_find_sources (xlator_t *this, afr_local_t *local, dict_t **xattr,
- afr_transaction_type transaction_type);
+afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies);
+void
+afr_mark_active_sinks(xlator_t *this, unsigned char *sources,
+ unsigned char *locked_on, unsigned char *sinks);
+
+gf_boolean_t
+afr_dict_contains_heal_op(call_frame_t *frame);
+
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks(struct afr_reply *replies,
+ int child_count);
+int
+afr_mark_split_brain_source_sinks(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type);
+
+int
+afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, char **policy_str);
+
+int
+_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type,
+ unsigned char *locked_on,
+ struct afr_reply *replies);
+
+int
+afr_get_child_index_from_name(xlator_t *this, char *name);
+
+gf_boolean_t
+afr_does_witness_exist(xlator_t *this, uint64_t *witness);
int
-afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode);
+__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *flag);
+
+int
+__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *flag);
+int
+__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ struct afr_reply *replies, int *source_p,
+ unsigned char *flag);
+
+int
+afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **link_inode, gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal,
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies);
+
+int
+afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid);
+
+int
+afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
+
+int
+afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on);
+int
+afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
+ afr_transaction_type type);
+
+int
+afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf);
+
+int
+afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+int
+afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+int
+afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+
+int
+afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, uuid_t pargfid, const char *bname,
+ int src_idx, int child_idx,
+ unsigned char *locked_on, int *src, dict_t *xdata);
+int
+afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type);
gf_boolean_t
-afr_is_fresh_read_child (int32_t *sources, int32_t child_count,
- int32_t read_child);
+afr_is_file_empty_on_all_children(afr_private_t *priv,
+ struct afr_reply *replies);
+
+int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies);
int
-afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
- dict_t **xattr,
- afr_transaction_type txn_type);
-#endif /* __AFR_SELF_HEAL_H__ */
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode);
+#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
new file mode 100644
index 00000000000..109fd4b7421
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -0,0 +1,1716 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "afr.h"
+#include "afr-self-heal.h"
+#include "afr-self-heald.h"
+#include "protocol-common.h"
+#include <glusterfs/syncop-utils.h>
+#include "afr-messages.h"
+#include <glusterfs/byte-order.h>
+
+#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
+#define AFR_STATISTICS_HISTORY_SIZE 50
+
+#define ASSERT_LOCAL(this, healer) \
+ if (!afr_shd_is_subvol_local(this, healer->subvol)) { \
+ healer->local = _gf_false; \
+ if (safe_break(healer)) { \
+ break; \
+ } else { \
+ continue; \
+ } \
+ } else { \
+ healer->local = _gf_true; \
+ }
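+/* Note: ASSERT_LOCAL() expands to break/continue statements, so it must be
+   used inside a loop; see the for (;;) loops in afr_shd_index_healer() and
+   afr_shd_full_healer() below. */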
+
+#define NTH_INDEX_HEALER(this, n) \
+ &((((afr_private_t *)this->private))->shd.index_healers[n])
+#define NTH_FULL_HEALER(this, n) \
+ &((((afr_private_t *)this->private))->shd.full_healers[n])
+
+char *
+afr_subvol_name(xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+    if (subvol < 0 || subvol >= priv->child_count)
+ return NULL;
+
+ return priv->children[subvol]->name;
+}
+
+void
+afr_destroy_crawl_event_data(void *data)
+{
+ return;
+}
+
+void
+afr_destroy_shd_event_data(void *data)
+{
+ shd_event_t *shd_event = data;
+
+ if (!shd_event)
+ return;
+ GF_FREE(shd_event->path);
+
+ return;
+}
+
+gf_boolean_t
+afr_shd_is_subvol_local(xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ gf_boolean_t is_local = _gf_false;
+ loc_t loc = {
+ 0,
+ };
+
+ loc.inode = this->itable->root;
+ gf_uuid_copy(loc.gfid, loc.inode->gfid);
+ priv = this->private;
+ syncop_is_subvol_local(priv->children[subvol], &loc, &is_local);
+ return is_local;
+}
+
+int
+__afr_shd_healer_wait(struct subvol_healer *healer)
+{
+ afr_private_t *priv = NULL;
+ struct timespec wait_till = {
+ 0,
+ };
+ int ret = 0;
+
+ priv = healer->this->private;
+
+disabled_loop:
+ wait_till.tv_sec = gf_time() + priv->shd.timeout;
+
+ while (!healer->rerun) {
+ ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till);
+ if (ret == ETIMEDOUT)
+ break;
+ }
+
+ ret = healer->rerun;
+ healer->rerun = 0;
+
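+    /* If the self-heal daemon is disabled, keep waiting instead of returning,
+       so that enabling it later simply resumes this healer thread. */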
+ if (!priv->shd.enabled)
+ goto disabled_loop;
+
+ return ret;
+}
+
+int
+afr_shd_healer_wait(struct subvol_healer *healer)
+{
+ int ret = 0;
+
+ pthread_mutex_lock(&healer->mutex);
+ {
+ ret = __afr_shd_healer_wait(healer);
+ }
+ pthread_mutex_unlock(&healer->mutex);
+
+ return ret;
+}
+
+gf_boolean_t
+safe_break(struct subvol_healer *healer)
+{
+ gf_boolean_t ret = _gf_false;
+
+ pthread_mutex_lock(&healer->mutex);
+ {
+ if (healer->rerun)
+ goto unlock;
+
+ healer->running = _gf_false;
+ ret = _gf_true;
+ }
+unlock:
+ pthread_mutex_unlock(&healer->mutex);
+
+ return ret;
+}
+
+inode_t *
+afr_shd_inode_find(xlator_t *this, xlator_t *subvol, uuid_t gfid)
+{
+ int ret = 0;
+ uint64_t val = IA_INVAL;
+ dict_t *xdata = NULL;
+ dict_t *rsp_dict = NULL;
+ inode_t *inode = NULL;
+
+ xdata = dict_new();
+ if (!xdata)
+ goto out;
+
+ ret = dict_set_int8(xdata, GF_INDEX_IA_TYPE_GET_REQ, 1);
+ if (ret)
+ goto out;
+
+ ret = syncop_inode_find(this, subvol, gfid, &inode, xdata, &rsp_dict);
+ if (ret < 0)
+ goto out;
+
+ if (rsp_dict) {
+ ret = dict_get_uint64(rsp_dict, GF_INDEX_IA_TYPE_GET_RSP, &val);
+ if (ret)
+ goto out;
+ }
+ ret = inode_ctx_set2(inode, subvol, 0, &val);
+out:
+ if (ret && inode) {
+ inode_unref(inode);
+ inode = NULL;
+ }
+ if (xdata)
+ dict_unref(xdata);
+ if (rsp_dict)
+ dict_unref(rsp_dict);
+ return inode;
+}
+
+inode_t *
+afr_shd_index_inode(xlator_t *this, xlator_t *subvol, char *vgfid)
+{
+ loc_t rootloc = {
+ 0,
+ };
+ inode_t *inode = NULL;
+ int ret = 0;
+ dict_t *xattr = NULL;
+ void *index_gfid = NULL;
+
+ rootloc.inode = inode_ref(this->itable->root);
+ gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr(subvol, &rootloc, &xattr, vgfid, NULL, NULL);
+ if (ret || !xattr) {
+ errno = -ret;
+ goto out;
+ }
+
+ ret = dict_get_ptr(xattr, vgfid, &index_gfid);
+ if (ret)
+ goto out;
+
+ gf_msg_debug(this->name, 0, "%s dir gfid for %s: %s", vgfid, subvol->name,
+ uuid_utoa(index_gfid));
+
+ inode = afr_shd_inode_find(this, subvol, index_gfid);
+
+out:
+ loc_wipe(&rootloc);
+
+ if (xattr)
+ dict_unref(xattr);
+
+ return inode;
+}
+
+int
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
+ ia_type_t type)
+{
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+
+ loc.parent = inode_ref(inode);
+ loc.name = name;
+
+ if (IA_ISDIR(type))
+ ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+ else
+ ret = syncop_unlink(subvol, &loc, NULL, NULL);
+
+ loc_wipe(&loc);
+ return ret;
+}
+
+void
+afr_shd_zero_xattrop(xlator_t *this, uuid_t gfid)
+{
+ call_frame_t *frame = NULL;
+ inode_t *inode = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int ret = 0;
+ int i = 0;
+ int raw[AFR_NUM_CHANGE_LOGS] = {0};
+
+ priv = this->private;
+ frame = afr_frame_create(this, NULL);
+ if (!frame)
+ goto out;
+ inode = afr_inode_find(this, gfid);
+ if (!inode)
+ goto out;
+ xattr = dict_new();
+ if (!xattr)
+ goto out;
+ ret = dict_set_static_bin(xattr, AFR_DIRTY, raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret)
+ goto out;
+ }
+
+    /* Send xattrop to all bricks. Doing a lookup to see if bricks are up or
+     * have valid replies for this gfid seems a bit of an overkill. */
+ for (i = 0; i < priv->child_count; i++)
+ afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ if (inode)
+ inode_unref(inode);
+ if (xattr)
+ dict_unref(xattr);
+ return;
+}
+
+int
+afr_shd_selfheal_name(struct subvol_healer *healer, int child, uuid_t parent,
+ const char *bname)
+{
+ int ret = -1;
+
+ ret = afr_selfheal_name(THIS, parent, bname, NULL, NULL);
+
+ return ret;
+}
+
+int
+afr_shd_selfheal(struct subvol_healer *healer, int child, uuid_t gfid)
+{
+ int ret = 0;
+ eh_t *eh = NULL;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ shd_event_t *shd_event = NULL;
+ char *path = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ crawl_event_t *crawl_event = NULL;
+
+ this = healer->this;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = &healer->crawl_event;
+
+ subvol = priv->children[child];
+
+    // If this fails with ENOENT/ESTALE, the index is stale
+ ret = syncop_gfid_to_path(this->itable, subvol, gfid, &path);
+ if (ret < 0)
+ return ret;
+
+ ret = afr_selfheal(this, gfid);
+
+ LOCK(&priv->lock);
+ {
+ if (ret == -EIO) {
+ eh = shd->split_brain;
+ crawl_event->split_brain_count++;
+ } else if (ret < 0) {
+ crawl_event->heal_failed_count++;
+ } else if (ret == 0) {
+ crawl_event->healed_count++;
+ }
+ }
+ UNLOCK(&priv->lock);
+
+ if (eh) {
+ shd_event = GF_CALLOC(1, sizeof(*shd_event), gf_afr_mt_shd_event_t);
+ if (!shd_event)
+ goto out;
+
+ shd_event->child = child;
+ shd_event->path = path;
+
+ if (eh_save_history(eh, shd_event) < 0)
+ goto out;
+
+ shd_event = NULL;
+ path = NULL;
+ }
+out:
+ GF_FREE(shd_event);
+ GF_FREE(path);
+ return ret;
+}
+
+void
+afr_shd_sweep_prepare(struct subvol_healer *healer)
+{
+ crawl_event_t *event = NULL;
+
+ event = &healer->crawl_event;
+
+ event->healed_count = 0;
+ event->split_brain_count = 0;
+ event->heal_failed_count = 0;
+
+ event->start_time = gf_time();
+ event->end_time = 0;
+ _mask_cancellation();
+}
+
+void
+afr_shd_sweep_done(struct subvol_healer *healer)
+{
+ crawl_event_t *event = NULL;
+ crawl_event_t *history = NULL;
+ afr_self_heald_t *shd = NULL;
+
+ event = &healer->crawl_event;
+ shd = &(((afr_private_t *)healer->this->private)->shd);
+
+ event->end_time = gf_time();
+ history = gf_memdup(event, sizeof(*event));
+ event->start_time = 0;
+
+ if (!history)
+ return;
+
+ if (eh_save_history(shd->statistics[healer->subvol], history) < 0)
+ GF_FREE(history);
+ _unmask_cancellation();
+}
+
+int
+afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ afr_private_t *priv = NULL;
+ uuid_t gfid = {0};
+ int ret = 0;
+ uint64_t val = IA_INVAL;
+
+ priv = healer->this->private;
+ if (!priv->shd.enabled)
+ return -EBUSY;
+
+ gf_msg_debug(healer->this->name, 0, "got entry: %s from %s", entry->d_name,
+ priv->children[healer->subvol]->name);
+
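+    /* Index entries are named by gfid; skip anything that does not parse as
+       one. */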
+ ret = gf_uuid_parse(entry->d_name, gfid);
+ if (ret)
+ return 0;
+
+ inode_ctx_get2(parent->inode, subvol, NULL, &val);
+
+ ret = afr_shd_selfheal(healer, healer->subvol, gfid);
+
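+    /* The gfid no longer resolves on the brick (stale index link); purge the
+       corresponding index entry so it is not crawled again. */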
+ if (ret == -ENOENT || ret == -ESTALE)
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val);
+
+ if (ret == 2)
+ /* If bricks crashed in pre-op after creating indices/xattrop
+ * link but before setting afr changelogs, we end up with stale
+ * xattrop links but zero changelogs. Remove such entries by
+ * sending a post-op with zero changelogs.
+ */
+ afr_shd_zero_xattrop(healer->this, gfid);
+
+ return 0;
+}
+
+int
+afr_shd_index_sweep(struct subvol_healer *healer, char *vgfid)
+{
+ loc_t loc = {0};
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ xlator_t *subvol = NULL;
+ dict_t *xdata = NULL;
+ call_frame_t *frame = NULL;
+
+ priv = healer->this->private;
+ subvol = priv->children[healer->subvol];
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ loc.inode = afr_shd_index_inode(healer->this, subvol, vgfid);
+ if (!loc.inode) {
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_INDEX_DIR_GET_FAILED, "unable to get index-dir on %s",
+ subvol->name);
+ ret = -errno;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata || dict_set_int32_sizen(xdata, "get-gfid-type", 1)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_mt_dir_scan(frame, subvol, &loc, GF_CLIENT_PID_SELF_HEALD,
+ healer, afr_shd_index_heal, xdata,
+ priv->shd.max_threads, priv->shd.wait_qlength);
+
+ if (ret == 0)
+ ret = healer->crawl_event.healed_count;
+
+out:
+ loc_wipe(&loc);
+
+ if (xdata)
+ dict_unref(xdata);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return ret;
+}
+
+int
+afr_shd_index_sweep_all(struct subvol_healer *healer)
+{
+ int ret = 0;
+ int count = 0;
+
+ ret = afr_shd_index_sweep(healer, GF_XATTROP_INDEX_GFID);
+ if (ret < 0)
+ goto out;
+ count = ret;
+
+ ret = afr_shd_index_sweep(healer, GF_XATTROP_DIRTY_GFID);
+ if (ret < 0)
+ goto out;
+ count += ret;
+
+ ret = afr_shd_index_sweep(healer, GF_XATTROP_ENTRY_CHANGES_GFID);
+ if (ret < 0)
+ goto out;
+ count += ret;
+out:
+ if (ret < 0)
+ return ret;
+ else
+ return count;
+}
+
+int
+afr_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ xlator_t *this = healer->this;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ if (this->cleanup_starting) {
+ return -ENOTCONN;
+ }
+
+ if (!priv->shd.enabled)
+ return -EBUSY;
+
+ afr_shd_selfheal_name(healer, healer->subvol, parent->inode->gfid,
+ entry->d_name);
+
+ afr_shd_selfheal(healer, healer->subvol, entry->d_stat.ia_gfid);
+
+ return 0;
+}
+
+int
+afr_shd_full_sweep(struct subvol_healer *healer, inode_t *inode)
+{
+ afr_private_t *priv = NULL;
+ loc_t loc = {0};
+
+ priv = healer->this->private;
+ loc.inode = inode;
+ return syncop_ftw(priv->children[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer, afr_shd_full_heal);
+}
+
+int
+afr_shd_fill_ta_loc(xlator_t *this, loc_t *loc)
+{
+ afr_private_t *priv = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ int ret = -1;
+
+ priv = this->private;
+ loc->parent = inode_ref(this->itable->root);
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+ loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ loc->inode = inode_new(loc->parent->table);
+ GF_CHECK_ALLOC(loc->inode, ret, out);
+
+ if (!gf_uuid_is_null(priv->ta_gfid))
+ goto assign_gfid;
+
+ ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf,
+ 0, 0, 0);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed lookup on file %s.", loc->name);
+ goto out;
+ }
+
+ gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+
+assign_gfid:
+ gf_uuid_copy(loc->gfid, priv->ta_gfid);
+ ret = 0;
+
+out:
+ if (ret)
+ loc_wipe(loc);
+
+ return ret;
+}
+
+int
+_afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
+{
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int raw[AFR_NUM_CHANGE_LOGS] = {
+ 0,
+ };
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+
+ xattr = dict_new();
+ if (!xattr) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_GET_FAILED,
+ "Failed to create dict.");
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], &raw,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret)
+ goto out;
+ }
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL);
+ if (ret || !(*xdata)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Xattrop failed on %s.", loc->name);
+ }
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
+}
+
+void
+afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer,
+ dict_t **xdata)
+{
+ int ret = 0;
+
+ loc_wipe(loc);
+ if (afr_shd_fill_ta_loc(this, loc)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc->name);
+ ret = -1;
+ goto out;
+ }
+
+ ret = afr_ta_post_op_lock(this, loc);
+ if (ret)
+ goto out;
+
+ ret = _afr_shd_ta_get_xattrs(this, loc, xdata);
+ if (ret) {
+ if (*xdata) {
+ dict_unref(*xdata);
+ *xdata = NULL;
+ }
+ }
+
+ afr_ta_post_op_unlock(this, loc);
+
+out:
+ if (ret)
+ healer->rerun = 1;
+}
+
+int
+afr_shd_ta_unset_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
+{
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ gf_boolean_t need_xattrop = _gf_false;
+ void *pending_raw = NULL;
+ int *raw = NULL;
+ int pending[AFR_NUM_CHANGE_LOGS] = {
+ 0,
+ };
+ int i = 0;
+ int j = 0;
+ int val = 0;
+ int ret = -1;
+
+ priv = this->private;
+
+ xattr = dict_new();
+ if (!xattr) {
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
+ if (!raw) {
+ goto out;
+ }
+
+ ret = dict_get_ptr(*xdata, priv->pending_key[i], &pending_raw);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "Error getting value "
+ "of pending key %s",
+ priv->pending_key[i]);
+ GF_FREE(raw);
+ goto out;
+ }
+
+ memcpy(pending, pending_raw, sizeof(pending));
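+        /* Queue the negation of every non-zero pending count; the
+           GF_XATTROP_ADD_ARRAY below then adds it back on the thin-arbiter
+           brick, bringing the count down to zero. */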
+ for (j = 0; j < AFR_NUM_CHANGE_LOGS; j++) {
+ val = ntoh32(pending[j]);
+ if (val) {
+ if (i == healer) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_THIN_ARB,
+ "I am "
+ "not the good shd. Skipping. "
+ "SHD = %d.",
+ healer);
+ ret = 0;
+ GF_FREE(raw);
+ goto out;
+ }
+ need_xattrop = _gf_true;
+ raw[j] = hton32(-val);
+ }
+ }
+
+ ret = dict_set_bin(xattr, priv->pending_key[i], raw,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret) {
+ GF_FREE(raw);
+ goto out;
+ }
+
+ if (need_xattrop)
+ break;
+ }
+
+ if (!need_xattrop) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Xattrop failed.");
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
+}
+
+void
+afr_shd_ta_check_and_unset_xattrs(xlator_t *this, loc_t *loc,
+ struct subvol_healer *healer,
+ dict_t *pre_crawl_xdata)
+{
+ int ret_lock = 0;
+ int ret = 0;
+ dict_t *post_crawl_xdata = NULL;
+
+ ret_lock = afr_ta_post_op_lock(this, loc);
+ if (ret_lock)
+ goto unref;
+
+ ret = _afr_shd_ta_get_xattrs(this, loc, &post_crawl_xdata);
+ if (ret)
+ goto unref;
+
+ if (!are_dicts_equal(pre_crawl_xdata, post_crawl_xdata, NULL, NULL)) {
+ ret = -1;
+ goto unref;
+ }
+
+ ret = afr_shd_ta_unset_xattrs(this, loc, &post_crawl_xdata, healer->subvol);
+
+unref:
+ if (post_crawl_xdata) {
+ dict_unref(post_crawl_xdata);
+ post_crawl_xdata = NULL;
+ }
+
+ if (ret || ret_lock)
+ healer->rerun = 1;
+
+ if (!ret_lock)
+ afr_ta_post_op_unlock(this, loc);
+}
+
+gf_boolean_t
+afr_bricks_available_for_heal(afr_private_t *priv)
+{
+ int up_children = 0;
+
+ up_children = __afr_get_up_children_count(priv);
+ if (up_children < 2) {
+ return _gf_false;
+ }
+ return _gf_true;
+}
+
+static gf_boolean_t
+afr_shd_ta_needs_heal(xlator_t *this, struct subvol_healer *healer)
+{
+ dict_t *xdata = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = -1;
+ int i = 0;
+ gf_boolean_t need_heal = _gf_false;
+
+ priv = this->private;
+
+ ret = afr_shd_fill_ta_loc(this, &loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ healer->rerun = 1;
+ goto out;
+ }
+
+ if (_afr_shd_ta_get_xattrs(this, &loc, &xdata)) {
+ healer->rerun = 1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (afr_ta_dict_contains_pending_xattr(xdata, priv, i)) {
+ need_heal = _gf_true;
+ break;
+ }
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+
+ return need_heal;
+}
+
+static int
+afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ afr_private_t *priv = healer->this->private;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ loc_t loc = {0};
+ int count = 0;
+ int i = 0;
+ int op_errno = 0;
+ struct iatt *iatt = NULL;
+ gf_boolean_t multiple_links = _gf_false;
+ unsigned char *gfid_present = alloca0(priv->child_count);
+ unsigned char *entry_present = alloca0(priv->child_count);
+ char *type = "file";
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ gf_msg_debug(healer->this->name, 0,
+ "Not all bricks are up. Skipping "
+ "cleanup of %s on %s",
+ entry->d_name, subvol->name);
+ ret = 0;
+ goto out;
+ }
+
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = gf_uuid_parse(entry->d_name, loc.gfid);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ gfid_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ if (iatt->ia_type == IA_IFDIR) {
+ type = "dir";
+ }
+
+ if (i == healer->subvol) {
+ if (local->replies[i].poststat.ia_nlink > 1) {
+ multiple_links = _gf_true;
+ }
+ }
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+            /* We don't have a complete view. Skip the entry. */
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ /*Inode is deleted from subvol*/
+ if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) {
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type,
+ priv->anon_inode_name, entry->d_name, subvol->name);
+ ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name,
+ iatt->ia_type);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = 0;
+ } else if (count > 1) {
+ loc_wipe(&loc);
+ loc.parent = inode_ref(parent->inode);
+ loc.name = entry->d_name;
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup,
+ &loc, NULL);
+ count = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ entry_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+                /* We don't have a complete view. Skip the entry. */
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (gfid_present[i] && !entry_present[i]) {
+ /*Entry is not anonymous on at least one subvol*/
+ gf_msg_debug(healer->this->name, 0,
+ "Valid entry present on %s "
+ "Skipping cleanup of %s on %s",
+ priv->children[i]->name, entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging %s %s/%s on all subvols", type, priv->anon_inode_name,
+ entry->d_name);
+ ret = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent,
+ entry->d_name, iatt->ia_type);
+ if (op_errno != ENOENT && op_errno != ESTALE) {
+ ret |= -op_errno;
+ }
+ }
+ }
+
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return ret;
+}
+
+static void
+afr_cleanup_anon_inode_dir(struct subvol_healer *healer)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_private_t *priv = healer->this->private;
+ loc_t loc = {0};
+
+ ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode);
+ if (ret)
+ goto out;
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer,
+ afr_shd_anon_inode_cleaner, NULL,
+ priv->shd.max_threads, priv->shd.wait_qlength);
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return;
+}
+
+void *
+afr_shd_index_healer(void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ dict_t *pre_crawl_xdata = NULL;
+ loc_t loc = {
+ 0,
+ };
+
+ healer = data;
+ THIS = this = healer->this;
+ priv = this->private;
+
+ for (;;) {
+ afr_shd_healer_wait(healer);
+
+ if (!afr_bricks_available_for_heal(priv))
+ continue;
+
+ ASSERT_LOCAL(this, healer);
+ priv->local[healer->subvol] = healer->local;
+
+ if (priv->thin_arbiter_count) {
+ if (afr_shd_ta_needs_heal(this, healer))
+ afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
+ }
+
+ do {
+ gf_msg_debug(this->name, 0, "starting index sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+
+ afr_shd_sweep_prepare(healer);
+
+ ret = afr_shd_index_sweep_all(healer);
+
+ afr_shd_sweep_done(healer);
+            /*
+              As long as at least one gfid was healed, keep retrying. We may
+              have just healed a directory and thereby created entries for
+              other gfids which could not be healed thus far.
+            */
+
+ gf_msg_debug(this->name, 0, "finished index sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+            /*
+              Give a pause before retrying to avoid a busy loop in case the
+              only entry in the index is due to an ongoing I/O.
+            */
+ sleep(1);
+ } while (ret > 0);
+
+ if (ret == 0) {
+ afr_cleanup_anon_inode_dir(healer);
+ }
+
+ if (ret == 0 && pre_crawl_xdata &&
+ !healer->crawl_event.heal_failed_count) {
+ afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
+ pre_crawl_xdata);
+ }
+
+ if (pre_crawl_xdata) {
+ dict_unref(pre_crawl_xdata);
+ pre_crawl_xdata = NULL;
+ }
+ }
+
+ return NULL;
+}
+
+void *
+afr_shd_full_healer(void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int run = 0;
+
+ healer = data;
+ THIS = this = healer->this;
+
+ for (;;) {
+ pthread_mutex_lock(&healer->mutex);
+ {
+ run = __afr_shd_healer_wait(healer);
+ if (!run)
+ healer->running = _gf_false;
+ }
+ pthread_mutex_unlock(&healer->mutex);
+
+ if (!run)
+ break;
+
+ ASSERT_LOCAL(this, healer);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "starting full sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+
+ afr_shd_sweep_prepare(healer);
+
+ afr_shd_full_sweep(healer, this->itable->root);
+
+ afr_shd_sweep_done(healer);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "finished full sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+ }
+
+ return NULL;
+}
+
+int
+afr_shd_healer_init(xlator_t *this, struct subvol_healer *healer)
+{
+ int ret = 0;
+
+ ret = pthread_mutex_init(&healer->mutex, NULL);
+ if (ret)
+ goto out;
+
+ ret = pthread_cond_init(&healer->cond, NULL);
+ if (ret)
+ goto out;
+
+ healer->this = this;
+ healer->running = _gf_false;
+ healer->rerun = _gf_false;
+ healer->local = _gf_false;
+out:
+ return ret;
+}
+
+int
+afr_shd_healer_spawn(xlator_t *this, struct subvol_healer *healer,
+ void *(threadfn)(void *))
+{
+ int ret = 0;
+
+ pthread_mutex_lock(&healer->mutex);
+ {
+ if (healer->running) {
+ pthread_cond_signal(&healer->cond);
+ } else {
+ ret = gf_thread_create(&healer->thread, NULL, threadfn, healer,
+ "shdheal");
+ if (ret)
+ goto unlock;
+ healer->running = 1;
+ }
+
+ healer->rerun = 1;
+ }
+unlock:
+ pthread_mutex_unlock(&healer->mutex);
+
+ return ret;
+}
+
+int
+afr_shd_full_healer_spawn(xlator_t *this, int subvol)
+{
+ return afr_shd_healer_spawn(this, NTH_FULL_HEALER(this, subvol),
+ afr_shd_full_healer);
+}
+
+int
+afr_shd_index_healer_spawn(xlator_t *this, int subvol)
+{
+ return afr_shd_healer_spawn(this, NTH_INDEX_HEALER(this, subvol),
+ afr_shd_index_healer);
+}
+
+int
+afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
+ crawl_event_t *crawl_event)
+{
+ int ret = 0;
+ uint64_t count = 0;
+ char key[128] = {0};
+ int keylen = 0;
+ char suffix[64] = {0};
+ int xl_id = 0;
+ uint64_t healed_count = 0;
+ uint64_t split_brain_count = 0;
+ uint64_t heal_failed_count = 0;
+    char *start_time_str = NULL;
+ char *end_time_str = NULL;
+ char *crawl_type = NULL;
+ int progress = -1;
+ int child = -1;
+
+ child = crawl_event->child;
+ healed_count = crawl_event->healed_count;
+ split_brain_count = crawl_event->split_brain_count;
+ heal_failed_count = crawl_event->heal_failed_count;
+ crawl_type = crawl_event->crawl_type;
+
+ if (!crawl_event->start_time)
+ goto out;
+
+ start_time_str = gf_strdup(ctime(&crawl_event->start_time));
+
+ if (crawl_event->end_time)
+ end_time_str = gf_strdup(ctime(&crawl_event->end_time));
+
+ ret = dict_get_int32(output, this->name, &xl_id);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "xl does not have id");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_get_uint64(output, key, &count);
+
+ snprintf(suffix, sizeof(suffix), "%d-%d-%" PRIu64, xl_id, child, count);
+ snprintf(key, sizeof(key), "statistics_healed_cnt-%s", suffix);
+ ret = dict_set_uint64(output, key, healed_count);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_healed_count to output");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics_sb_cnt-%s", suffix);
+ ret = dict_set_uint64(output, key, split_brain_count);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_split_brain_count to output");
+ goto out;
+ }
+
+ keylen = snprintf(key, sizeof(key), "statistics_crawl_type-%s", suffix);
+ ret = dict_set_strn(output, key, keylen, crawl_type);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_crawl_type to output");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics_heal_failed_cnt-%s", suffix);
+ ret = dict_set_uint64(output, key, heal_failed_count);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Could not add statistics_heal_failed_count to output");
+ goto out;
+ }
+
+ keylen = snprintf(key, sizeof(key), "statistics_strt_time-%s", suffix);
+ ret = dict_set_dynstrn(output, key, keylen, start_time_str);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_crawl_start_time to output");
+ goto out;
+ } else {
+ start_time_str = NULL;
+ }
+
+ if (!end_time_str)
+ progress = 1;
+ else
+ progress = 0;
+
+ keylen = snprintf(key, sizeof(key), "statistics_end_time-%s", suffix);
+ if (!end_time_str)
+ end_time_str = gf_strdup("Could not determine the end time");
+ ret = dict_set_dynstrn(output, key, keylen, end_time_str);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_crawl_end_time to output");
+ goto out;
+ } else {
+ end_time_str = NULL;
+ }
+
+ keylen = snprintf(key, sizeof(key), "statistics_inprogress-%s", suffix);
+
+ ret = dict_set_int32n(output, key, keylen, progress);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_inprogress to output");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_set_uint64(output, key, count + 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not increment the counter.");
+ goto out;
+ }
+out:
+ GF_FREE(start_time_str);
+ GF_FREE(end_time_str);
+ return ret;
+}
+
+int
+afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path,
+ struct timeval *tv)
+{
+ int ret = -1;
+ uint64_t count = 0;
+ char key[64] = {0};
+ int keylen = 0;
+ char xl_id_child_str[32] = {0};
+ int xl_id = 0;
+
+ ret = dict_get_int32(output, this->name, &xl_id);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "xl does not have id");
+ goto out;
+ }
+
+ snprintf(xl_id_child_str, sizeof(xl_id_child_str), "%d-%d", xl_id, child);
+ snprintf(key, sizeof(key), "%s-count", xl_id_child_str);
+ ret = dict_get_uint64(output, key, &count);
+
+ keylen = snprintf(key, sizeof(key), "%s-%" PRIu64, xl_id_child_str, count);
+ ret = dict_set_dynstrn(output, key, keylen, path);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Could not add to output", path);
+ goto out;
+ }
+
+ if (tv) {
+ snprintf(key, sizeof(key), "%s-%" PRIu64 "-time", xl_id_child_str,
+ count);
+ ret = dict_set_uint32(output, key, tv->tv_sec);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Could not set time", path);
+ goto out;
+ }
+ }
+
+ snprintf(key, sizeof(key), "%s-count", xl_id_child_str);
+
+ ret = dict_set_uint64(output, key, count + 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not increment count");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+afr_add_shd_event(circular_buffer_t *cb, void *data)
+{
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ shd_event_t *shd_event = NULL;
+ char *path = NULL;
+
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ shd_event = cb->data;
+
+ if (!shd->index_healers[shd_event->child].local)
+ return 0;
+
+ path = gf_strdup(shd_event->path);
+ if (!path)
+ return -ENOMEM;
+
+ afr_shd_dict_add_path(this, output, shd_event->child, path, &cb->tv);
+ return 0;
+}
+
+int
+afr_add_crawl_event(circular_buffer_t *cb, void *data)
+{
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ crawl_event_t *crawl_event = NULL;
+
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = cb->data;
+
+ if (!shd->index_healers[crawl_event->child].local)
+ return 0;
+
+ afr_shd_dict_add_crawl_event(this, output, crawl_event);
+
+ return 0;
+}
+
+int
+afr_selfheal_daemon_init(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ shd->index_healers = GF_CALLOC(sizeof(*shd->index_healers),
+ priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->index_healers)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ shd->index_healers[i].subvol = i;
+ ret = afr_shd_healer_init(this, &shd->index_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->full_healers = GF_CALLOC(sizeof(*shd->full_healers), priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->full_healers)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ shd->full_healers[i].subvol = i;
+ ret = afr_shd_healer_init(this, &shd->full_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->split_brain = eh_new(AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->split_brain)
+ goto out;
+
+ shd->statistics = GF_CALLOC(sizeof(eh_t *), priv->child_count,
+ gf_common_mt_eh_t);
+ if (!shd->statistics)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ shd->statistics[i] = eh_new(AFR_STATISTICS_HISTORY_SIZE, _gf_false,
+ afr_destroy_crawl_event_data);
+ if (!shd->statistics[i])
+ goto out;
+ shd->full_healers[i].crawl_event.child = i;
+ shd->full_healers[i].crawl_event.crawl_type = "FULL";
+ shd->index_healers[i].crawl_event.child = i;
+ shd->index_healers[i].crawl_event.crawl_type = "INDEX";
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv)
+{
+ int subvol = 0;
+
+ if (!priv->shd.iamshd)
+ return;
+ for (subvol = 0; subvol < priv->child_count; subvol++)
+ if (priv->child_up[subvol])
+ afr_shd_index_healer_spawn(this, subvol);
+
+ return;
+}
+
+int
+afr_shd_get_index_count(xlator_t *this, int i, uint64_t *count)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ loc_t rootloc = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+ int ret = -1;
+
+ priv = this->private;
+ subvol = priv->children[i];
+
+ rootloc.inode = inode_ref(this->itable->root);
+ gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr(subvol, &rootloc, &xattr, GF_XATTROP_INDEX_COUNT,
+ NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ ret = dict_get_uint64(xattr, GF_XATTROP_INDEX_COUNT, count);
+ if (ret)
+ goto out;
+
+ ret = 0;
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+ loc_wipe(&rootloc);
+
+ return ret;
+}
+
+int
+afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
+{
+ gf_xl_afr_op_t op = GF_SHD_OP_INVALID;
+ int ret = 0;
+ int xl_id = 0;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ struct subvol_healer *healer = NULL;
+ int i = 0;
+ char key[64];
+ int keylen = 0;
+ int this_name_len = 0;
+ int op_ret = 0;
+ uint64_t cnt = 0;
+
+#define AFR_SET_DICT_AND_LOG(name, output, key, keylen, dict_str, \
+ dict_str_len) \
+ { \
+ int ret; \
+ \
+ ret = dict_set_nstrn(output, key, keylen, dict_str, dict_str_len); \
+ if (ret) { \
+ gf_smsg(name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, \
+ "key=%s", key, "value=%s", dict_str, NULL); \
+ } \
+ }
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ ret = dict_get_int32_sizen(input, "xl-op", (int32_t *)&op);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "key=xl-op", NULL);
+ goto out;
+ }
+ this_name_len = strlen(this->name);
+ ret = dict_get_int32n(input, this->name, this_name_len, &xl_id);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "key=%s", this->name, NULL);
+ goto out;
+ }
+ ret = dict_set_int32n(output, this->name, this_name_len, xl_id);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "key=%s", this->name, NULL);
+ goto out;
+ }
+ switch (op) {
+ case GF_SHD_OP_HEAL_INDEX:
+ op_ret = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
+ op_ret = -1;
+ } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SLESS_THAN2_BRICKS_in_REP,
+ SLEN(SLESS_THAN2_BRICKS_in_REP));
+ op_ret = -1;
+ } else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_IS_REMOTE,
+ SLEN(SBRICK_IS_REMOTE));
+ } else {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SSTARTED_SELF_HEAL,
+ SLEN(SSTARTED_SELF_HEAL));
+
+ ret = afr_shd_index_healer_spawn(this, i);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+ }
+ }
+ }
+ break;
+ case GF_SHD_OP_HEAL_FULL:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->full_healers[i];
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
+ } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SLESS_THAN2_BRICKS_in_REP,
+ SLEN(SLESS_THAN2_BRICKS_in_REP));
+ } else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_IS_REMOTE,
+ SLEN(SBRICK_IS_REMOTE));
+ } else {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SSTARTED_SELF_HEAL,
+ SLEN(SSTARTED_SELF_HEAL));
+
+ ret = afr_shd_full_healer_spawn(this, i);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+ }
+ op_ret = 0;
+ }
+ }
+ break;
+ case GF_SHD_OP_INDEX_SUMMARY:
+ /* this case has been handled in glfs-heal.c */
+ break;
+ case GF_SHD_OP_SPLIT_BRAIN_FILES:
+ eh_dump(shd->split_brain, output, afr_add_shd_event);
+ break;
+ case GF_SHD_OP_STATISTICS:
+ for (i = 0; i < priv->child_count; i++) {
+ eh_dump(shd->statistics[i], output, afr_add_crawl_event);
+ ret = afr_shd_dict_add_crawl_event(
+ this, output, &shd->index_healers[i].crawl_event);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+ }
+
+ ret = afr_shd_dict_add_crawl_event(
+ this, output, &shd->full_healers[i].crawl_event);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+ }
+ }
+ break;
+ case GF_SHD_OP_STATISTICS_HEAL_COUNT:
+ case GF_SHD_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i]) {
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id,
+ i);
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
+ } else {
+ snprintf(key, sizeof(key), "%d-%d-hardlinks", xl_id, i);
+ ret = afr_shd_get_index_count(this, i, &cnt);
+ if (ret == 0) {
+ ret = dict_set_uint64(output, key, cnt);
+ }
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_DICT_SET_FAILED, NULL);
+ }
+ op_ret = 0;
+ }
+ }
+
+ break;
+
+ default:
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "op=%d",
+ op, NULL);
+ break;
+ }
+out:
+ dict_deln(output, this->name, this_name_len);
+ return op_ret;
+
+#undef AFR_SET_DICT_AND_LOG
+}
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
new file mode 100644
index 00000000000..18db728ea7b
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -0,0 +1,75 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _AFR_SELF_HEALD_H
+#define _AFR_SELF_HEALD_H
+
+#include <pthread.h>
+
+typedef struct {
+ char *path;
+ int child;
+} shd_event_t;
+
+typedef struct {
+ uint64_t healed_count;
+ uint64_t split_brain_count;
+ uint64_t heal_failed_count;
+
+ /* If start_time is 0, it means crawler is not in progress
+ and stats are not valid */
+ time_t start_time;
+ /* If start_time is NOT 0 and end_time is 0, it means
+       crawler is in progress */
+ time_t end_time;
+ char *crawl_type;
+ int child;
+} crawl_event_t;
+
+struct subvol_healer {
+ xlator_t *this;
+ crawl_event_t crawl_event;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ pthread_t thread;
+ int subvol;
+ gf_boolean_t local;
+ gf_boolean_t running;
+ gf_boolean_t rerun;
+};
+
+typedef struct {
+ struct subvol_healer *index_healers;
+ struct subvol_healer *full_healers;
+
+ eh_t *split_brain;
+ eh_t **statistics;
+ int timeout;
+ uint32_t max_threads;
+ uint32_t wait_qlength;
+ uint32_t halo_max_latency_msec;
+ gf_boolean_t iamshd;
+ gf_boolean_t enabled;
+} afr_self_heald_t;
+
+int
+afr_selfheal_daemon_init(xlator_t *this);
+
+int
+afr_xl_op(xlator_t *this, dict_t *input, dict_t *output);
+
+int
+afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid,
+ char **path_p);
+
+int
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
+ ia_type_t type);
+#endif /* !_AFR_SELF_HEALD_H */
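
    The start_time/end_time convention documented in crawl_event_t can be read
    back with a tiny helper; this is an illustrative sketch only, not part of
    the header:

        /* Illustrative: interprets the crawl_event_t time fields exactly as
         * described in the struct comments above. */
        static const char *
        crawl_state_str(const crawl_event_t *ev)
        {
            if (ev->start_time == 0)
                return "not running (stats not valid)";
            if (ev->end_time == 0)
                return "in progress";
            return "completed";
        }
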
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index ac329958d2a..a51f79b1f43 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -1,1238 +1,2927 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include "dict.h"
-#include "byte-order.h"
-#include "common-utils.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/timer.h>
#include "afr.h"
#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-messages.h"
#include <signal.h>
+typedef enum {
+ AFR_TRANSACTION_PRE_OP,
+ AFR_TRANSACTION_POST_OP,
+} afr_xattrop_type_t;
-#define LOCKED_NO 0x0 /* no lock held */
-#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path
- of RENAME */
-#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */
+static void
+afr_lock_resume_shared(struct list_head *list);
+static void
+afr_post_op_handle_success(call_frame_t *frame, xlator_t *this);
-afr_fd_ctx_t *
-afr_fd_ctx_get (fd_t *fd, xlator_t *this)
-{
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = 0;
+static void
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno);
- ret = fd_ctx_get (fd, this, &ctx);
+void
+__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared);
- if (ret < 0)
- goto out;
+void
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this);
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+int
+afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this);
-out:
- return fd_ctx;
-}
+gf_boolean_t
+afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this);
+gf_boolean_t
+afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this);
+
+int
+afr_changelog_call_count(afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned char *failed_subvols,
+ unsigned int child_count);
+int
+afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume,
+ afr_xattrop_type_t op);
static void
-afr_pid_save (call_frame_t *frame)
-{
- afr_local_t * local = NULL;
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this);
- local = frame->local;
+static int
+afr_ta_post_op_do(void *opaque);
- local->saved_pid = frame->root->pid;
-}
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local);
+static int
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this);
static void
-afr_pid_restore (call_frame_t *frame)
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno);
+
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv)
{
- afr_local_t * local = NULL;
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ priv->ta_notify_dom_lock_offset = 0;
+}
- local = frame->local;
+static void
+afr_ta_process_waitq(xlator_t *this)
+{
+ afr_local_t *entry = NULL;
+ afr_private_t *priv = this->private;
+ struct list_head waitq = {
+ 0,
+ };
+
+ INIT_LIST_HEAD(&waitq);
+ LOCK(&priv->lock);
+ list_splice_init(&priv->ta_waitq, &waitq);
+ UNLOCK(&priv->lock);
+ list_for_each_entry(entry, &waitq, ta_waitq)
+ {
+ afr_ta_decide_post_op_state(entry->transaction.frame, this);
+ }
+}
- frame->root->pid = local->saved_pid;
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque)
+{
+ afr_ta_process_waitq(ta_frame->this);
+ STACK_DESTROY(ta_frame->root);
+ return 0;
}
+int
+afr_release_notify_lock_for_ta(void *opaque)
+{
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ int ret = -1;
+
+ this = (xlator_t *)opaque;
+ priv = this->private;
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate loc for thin-arbiter.");
+ goto out;
+ }
+ flock.l_type = F_UNLCK;
+ flock.l_start = priv->ta_notify_dom_lock_offset;
+ flock.l_len = 1;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+ }
+
+ LOCK(&priv->lock);
+ {
+ afr_ta_locked_priv_invalidate(priv);
+ }
+ UNLOCK(&priv->lock);
+out:
+ loc_wipe(&loc);
+ return ret;
+}
-static void
-__mark_all_pending (int32_t *pending[], int child_count,
- afr_transaction_type type)
+void
+afr_zero_fill_stat(afr_local_t *local)
{
- int i = 0;
- int j = 0;
+ if (!local)
+ return;
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION) {
+ gf_zero_fill_stat(&local->cont.inode_wfop.prebuf);
+ gf_zero_fill_stat(&local->cont.inode_wfop.postbuf);
+ } else if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ gf_zero_fill_stat(&local->cont.dir_fop.buf);
+ gf_zero_fill_stat(&local->cont.dir_fop.preparent);
+ gf_zero_fill_stat(&local->cont.dir_fop.postparent);
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION)
+ return;
+ gf_zero_fill_stat(&local->cont.dir_fop.prenewparent);
+ gf_zero_fill_stat(&local->cont.dir_fop.postnewparent);
+ }
+}
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
- pending[i][j] = hton32 (1);
+/* In case of errors, afr needs to choose which xdata from the lower xlators to
+ * unwind with. It does so by checking whether any good subvols failed, giving
+ * preference to errnos other than ENOTCONN even if the child is a source. */
+void
+afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1,
+ unsigned char *readable1, inode_t *inode2,
+ unsigned char *readable2)
+{
+ int s = -1; /*selection*/
+ int i = 0;
+ unsigned char *readable = NULL;
+
+ if (local->xdata_rsp) {
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ }
+
+ readable = alloca0(priv->child_count * sizeof(*readable));
+ if (inode2 && readable2) { /*rename fop*/
+ AFR_INTERSECT(readable, readable1, readable2, priv->child_count);
+ } else {
+ memcpy(readable, readable1, sizeof(*readable) * priv->child_count);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+
+ if (local->replies[i].op_ret >= 0)
+ continue;
+
+ if (local->replies[i].op_errno == ENOTCONN)
+ continue;
+
+ /*Order is important in the following condition*/
+ if ((s < 0) || (!readable[s] && readable[i]))
+ s = i;
+ }
+
+ if (s != -1 && local->replies[s].xdata) {
+ local->xdata_rsp = dict_ref(local->replies[s].xdata);
+ } else if (s == -1) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+
+ if (local->replies[i].op_ret >= 0)
+ continue;
+
+ if (!local->replies[i].xdata)
+ continue;
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ break;
}
+ }
}
+gf_boolean_t
+afr_needs_changelog_update(afr_local_t *local)
+{
+ if (local->transaction.type == AFR_DATA_TRANSACTION)
+ return _gf_true;
+ if (!local->optimistic_change_log)
+ return _gf_true;
+ return _gf_false;
+}
-static void
-__mark_child_dead (int32_t *pending[], int child_count, int child,
- afr_transaction_type type)
+gf_boolean_t
+afr_changelog_has_quorum(afr_local_t *local, xlator_t *this)
{
- int j = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ unsigned char *success_children = NULL;
+
+ priv = this->private;
+ success_children = alloca0(priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i]) {
+ success_children[i] = 1;
+ }
+ }
- j = afr_index_for_transaction_type (type);
+ if (afr_has_quorum(success_children, this, NULL)) {
+ return _gf_true;
+ }
- pending[child][j] = 0;
+ return _gf_false;
}
+gf_boolean_t
+afr_is_write_subvol_valid(call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ uint64_t write_subvol = 0;
+ unsigned char *writable = NULL;
+ uint16_t datamap = 0;
+
+ local = frame->local;
+ priv = this->private;
+ writable = alloca0(priv->child_count);
+
+ write_subvol = afr_write_subvol_get(frame, this);
+ datamap = (write_subvol & 0x00000000ffff0000) >> 16;
+ for (i = 0; i < priv->child_count; i++) {
+ if (datamap & (1 << i))
+ writable[i] = 1;
+
+ if (writable[i] && !local->transaction.failed_subvols[i])
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
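
    The mask used above implies that afr_write_subvol_get() packs a 16-bit
    per-child data bitmap into bits 16..31 of the returned value. A small
    decoding sketch, assuming that layout (illustrative only):

        /* Illustrative: bit i of the datamap marks child i as data-writable. */
        static int
        afr_child_in_datamap(uint64_t write_subvol, int child)
        {
            uint16_t datamap = (write_subvol & 0x00000000ffff0000ULL) >> 16;

            return (datamap & (1 << child)) ? 1 : 0;
        }
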
-static void
-__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
+int
+afr_transaction_fop(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ unsigned char *failed_subvols = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ failed_subvols = local->transaction.failed_subvols;
+ call_count = priv->child_count -
+ AFR_COUNT(failed_subvols, priv->child_count);
+ /* Fail if pre-op did not succeed on quorum no. of bricks. */
+ if (!afr_changelog_has_quorum(local, this) || !call_count) {
+ local->op_ret = -1;
+ /* local->op_errno is already captured in changelog cbk. */
+ afr_transaction_resume(frame, this);
+ return 0;
+ }
+
+ /* Fail if at least one writeable brick isn't up.*/
+ if (local->transaction.type == AFR_DATA_TRANSACTION &&
+ !afr_is_write_subvol_valid(frame, this)) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ afr_transaction_resume(frame, this);
+ return 0;
+ }
- local = frame->local;
+ local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] && !failed_subvols[i]) {
+ local->transaction.wind(frame, this, i);
- if (!local->fd)
- return;
+ if (!--call_count)
+ break;
+ }
+ }
- fd_ctx = afr_fd_ctx_get (local->fd, this);
+ return 0;
+}
- if (!fd_ctx)
- goto out;
+int
+afr_transaction_done(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t unwind = _gf_false;
+ afr_lock_t *lock = NULL;
+ afr_local_t *lock_local = NULL;
+
+ priv = this->private;
+ local = frame->local;
- LOCK (&local->fd->lock);
+ if (priv->consistent_metadata) {
+ LOCK(&frame->lock);
{
- if (local->transaction.type == AFR_DATA_TRANSACTION)
- fd_ctx->pre_op_done[child_index]++;
+ unwind = (local->transaction.main_frame != NULL);
}
- UNLOCK (&local->fd->lock);
-out:
- return;
+ UNLOCK(&frame->lock);
+ if (unwind) /*It definitely did post-op*/
+ afr_zero_fill_stat(local);
+ }
+
+ if (local->transaction.do_eager_unlock) {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ lock->acquired = _gf_false;
+ lock->release = _gf_false;
+ list_splice_init(&lock->frozen, &lock->waiting);
+ if (list_empty(&lock->waiting))
+ goto unlock;
+ lock_local = list_entry(lock->waiting.next, afr_local_t,
+ transaction.wait_list);
+ list_del_init(&lock_local->transaction.wait_list);
+ list_add(&lock_local->transaction.owner_list, &lock->owners);
+ }
+ unlock:
+ UNLOCK(&local->inode->lock);
+ }
+ if (lock_local) {
+ afr_lock(lock_local->transaction.frame,
+ lock_local->transaction.frame->this);
+ }
+ local->transaction.unwind(frame, this);
+
+ GF_ASSERT(list_empty(&local->transaction.owner_list));
+ GF_ASSERT(list_empty(&local->transaction.wait_list));
+ AFR_STACK_DESTROY(frame);
+
+ return 0;
}
-
static void
-__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
+afr_lock_fail_shared(afr_local_t *local, struct list_head *list)
{
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_local_t *each = NULL;
+
+ while (!list_empty(list)) {
+ each = list_entry(list->next, afr_local_t, transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ each->op_ret = -1;
+ each->op_errno = local->op_errno;
+ afr_transaction_done(each->transaction.frame,
+ each->transaction.frame->this);
+ }
+}
- local = frame->local;
+static void
+afr_handle_lock_acquire_failure(afr_local_t *local)
+{
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
- if (!local->fd)
- return;
+ if (!local->transaction.eager_lock_on)
+ goto out;
- fd_ctx = afr_fd_ctx_get (local->fd, this);
+ lock = &local->inode_ctx->lock[local->transaction.type];
- if (!fd_ctx)
- goto out;
+ INIT_LIST_HEAD(&shared);
+ LOCK(&local->inode->lock);
+ {
+ lock->release = _gf_true;
+ list_splice_init(&lock->waiting, &shared);
+ }
+ UNLOCK(&local->inode->lock);
- LOCK (&local->fd->lock);
- {
- if (local->transaction.type == AFR_DATA_TRANSACTION)
- fd_ctx->pre_op_done[child_index]--;
- }
- UNLOCK (&local->fd->lock);
+ afr_lock_fail_shared(local, &shared);
+ local->transaction.do_eager_unlock = _gf_true;
out:
- return;
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ afr_unlock(local->transaction.frame, local->transaction.frame->this);
}
-
-static void
-__mark_down_children (int32_t *pending[], int child_count,
- unsigned char *child_up, afr_transaction_type type)
+call_frame_t *
+afr_transaction_detach_fop_frame(call_frame_t *frame)
{
- int i = 0;
- int j = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *fop_frame = NULL;
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
+ local = frame->local;
- if (!child_up[i])
- pending[i][j] = 0;
- }
-}
+ afr_handle_inconsistent_fop(frame, &local->op_ret, &local->op_errno);
+ LOCK(&frame->lock);
+ {
+ fop_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK(&frame->lock);
+ return fop_frame;
+}
static void
-__mark_all_success (int32_t *pending[], int child_count,
- afr_transaction_type type)
+afr_save_lk_owner(call_frame_t *frame)
{
- int i;
- int j;
+ afr_local_t *local = NULL;
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
- pending[i][j] = hton32 (-1);
- }
-}
+ local = frame->local;
+ local->saved_lk_owner = frame->root->lk_owner;
+}
-static int
-__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
+static void
+afr_restore_lk_owner(call_frame_t *frame)
{
- int ret = 0;
+ afr_local_t *local = NULL;
- switch (type) {
- case AFR_DATA_TRANSACTION:
- if (priv->data_change_log)
- ret = 1;
+ local = frame->local;
- break;
+ frame->root->lk_owner = local->saved_lk_owner;
+}
- case AFR_METADATA_TRANSACTION:
- if (priv->metadata_change_log)
- ret = 1;
+void
+__mark_all_success(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i;
- break;
+ local = frame->local;
+ priv = this->private;
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- if (priv->entry_change_log)
- ret = 1;
+ for (i = 0; i < priv->child_count; i++) {
+ local->transaction.failed_subvols[i] = 0;
+ }
+}
- break;
- }
+void
+afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_transaction_type type = -1;
+ dict_t *xdata = NULL;
+ int **matrix = NULL;
+ int idx = -1;
+ int i = 0;
+ int j = 0;
+
+ priv = this->private;
+ local = frame->local;
+ type = local->transaction.type;
+ idx = afr_index_for_transaction_type(type);
+ matrix = ALLOC_MATRIX(priv->child_count, int);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.changelog_xdata[i])
+ continue;
+ xdata = local->transaction.changelog_xdata[i];
+ afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
+ }
+
+ memset(local->transaction.pre_op_sources, 1, priv->child_count);
+
+ /*If lock or pre-op failed on a brick, it is not a source. */
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->transaction.pre_op_sources[i] = 0;
+ }
+
+ /* If brick is blamed by others, it is not a source. */
+ for (i = 0; i < priv->child_count; i++)
+ for (j = 0; j < priv->child_count; j++)
+ if (matrix[i][j] != 0)
+ local->transaction.pre_op_sources[j] = 0;
+}
- return ret;
+void
+afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int pre_op_sources_count = 0;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_compute_pre_op_sources(frame, this);
+ pre_op_sources_count = AFR_COUNT(local->transaction.pre_op_sources,
+ priv->child_count);
+
+ /* If arbiter is the only source, do not proceed. */
+ if (pre_op_sources_count < 2 &&
+ local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ for (i = 0; i < priv->child_count; i++)
+ local->transaction.failed_subvols[i] = 1;
+ }
+
+ afr_transaction_fop(frame, this);
+
+ return;
}
+int
+afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int ret = 0;
+ int failure_count = 0;
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ INIT_LIST_HEAD(&shared);
+ if (local->transaction.type == AFR_DATA_TRANSACTION &&
+ !local->transaction.inherited) {
+ ret = afr_write_subvol_set(frame, this);
+ if (ret) {
+ /*act as if operation failed on all subvols*/
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ for (i = 0; i < priv->child_count; i++)
+ local->transaction.failed_subvols[i] = 1;
+ }
+ }
+
+ if (local->pre_op_compat)
+ /* old mode, pre-op was done as afr_changelog_do()
+ just now, before OP */
+ afr_changelog_pre_op_update(frame, this);
+
+ if (!local->transaction.eager_lock_on || local->transaction.inherited)
+ goto fop;
+ failure_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (failure_count == priv->child_count) {
+ afr_handle_lock_acquire_failure(local);
+ return 0;
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ lock->acquired = _gf_true;
+ __afr_transaction_wake_shared(local, &shared);
+ }
+ UNLOCK(&local->inode->lock);
+ }
+
+fop:
+ /* Perform fops with the lk-owner from top xlator.
+     * E.g. the lk-owner of a posix-lk and a flush should be the same;
+     * flush can't clear the posix-lks without that lk-owner.
+ */
+ afr_save_lk_owner(frame);
+ frame->root->lk_owner = local->transaction.main_frame->root->lk_owner;
+
+ if (priv->arbiter_count == 1) {
+ afr_txn_arbitrate_fop(frame, this);
+ } else {
+ afr_transaction_fop(frame, this);
+ }
+
+ afr_lock_resume_shared(&shared);
+ return 0;
+}
-static int
-__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
+int
+afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
+ int i = 0;
+ int ret = 0;
- int op_ret = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], pending[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ /* 3 = data+metadata+entry */
- priv = this->private;
- local = frame->local;
+ if (ret)
+ break;
+ }
- if (__changelog_enabled (priv, local->transaction.type)) {
- switch (local->op) {
+ return ret;
+}
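
    Each per-child value set above is an array of AFR_NUM_CHANGE_LOGS 32-bit
    counters (data, metadata, entry) in network byte order. An illustrative
    decoder, assuming the index order used by afr_index_for_transaction_type()
    (0 = data, 1 = metadata, 2 = entry) and the ntoh32 macro from byte-order.h:

        /* Illustrative: prints one pending-changelog value as stored by
         * afr_set_pending_dict(). */
        static void
        afr_print_pending(const int *pending)
        {
            printf("data=%d metadata=%d entry=%d\n", (int)ntoh32(pending[0]),
                   (int)ntoh32(pending[1]), (int)ntoh32(pending[2]));
        }
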
- case GF_FOP_WRITE:
- case GF_FOP_FTRUNCATE:
- op_ret = 1;
- break;
+static void
+afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ unsigned int inmem_count = 0;
+ unsigned int onwire_count = 0;
+ gf_boolean_t release = _gf_false;
+
+ LOCK(&priv->lock);
+ {
+ /*Once we get notify lock release upcall notification,
+ if any of the fop state counters are non-zero, we will
+ not release the lock.
+ */
+ onwire_count = priv->ta_on_wire_txn_count;
+ inmem_count = priv->ta_in_mem_txn_count;
+ switch (fop_state) {
+ case TA_GET_INFO_FROM_TA_FILE:
+ onwire_count = --priv->ta_on_wire_txn_count;
+ break;
+ case TA_INFO_IN_MEMORY_SUCCESS:
+ case TA_INFO_IN_MEMORY_FAILED:
+ inmem_count = --priv->ta_in_mem_txn_count;
+ break;
+ case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+ GF_ASSERT(0);
+ break;
+ case TA_SUCCESS:
+ break;
+ }
+ release = priv->release_ta_notify_dom_lock;
+ }
+ UNLOCK(&priv->lock);
- case GF_FOP_FLUSH:
- op_ret = 0;
- break;
+ if (inmem_count != 0 || release == _gf_false || onwire_count != 0)
+ return;
- default:
- op_ret = 1;
- }
+ afr_ta_lock_release_synctask(this);
+}
+
+static void
+afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *entry = NULL;
+ int bad_child = AFR_CHILD_UNKNOWN;
+
+ struct list_head onwireq = {
+ 0,
+ };
+ INIT_LIST_HEAD(&onwireq);
+
+ LOCK(&priv->lock);
+ {
+ bad_child = priv->ta_bad_child_index;
+ if (bad_child == AFR_CHILD_UNKNOWN) {
+ /*The previous on-wire ta_post_op was a failure. Just dequeue
+ *one element to wind on-wire again. */
+ entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq);
+ list_del_init(&entry->ta_onwireq);
+ } else {
+ /* Prepare to process all fops based on bad_child_index. */
+ list_splice_init(&priv->ta_onwireq, &onwireq);
}
+ }
+ UNLOCK(&priv->lock);
- return op_ret;
+ if (entry) {
+ afr_ta_post_op_synctask(this, entry);
+ return;
+ } else {
+ while (!list_empty(&onwireq)) {
+ entry = list_entry(onwireq.next, afr_local_t, ta_onwireq);
+ list_del_init(&entry->ta_onwireq);
+ if (entry->ta_failed_subvol == bad_child) {
+ afr_post_op_handle_success(entry->transaction.frame, this);
+ } else {
+ afr_post_op_handle_failure(entry->transaction.frame, this, EIO);
+ }
+ }
+ }
}
+int
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+ int_lock = &local->internal_lock;
+
+ if (priv->thin_arbiter_count) {
+ /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */
+ afr_ta_dom_lock_check_and_release(local->fop_state, this);
+ }
+
+ /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */
+ if (!afr_changelog_has_quorum(local, this)) {
+ local->op_ret = -1;
+ /*local->op_errno is already captured in changelog cbk*/
+ }
+
+ if (local->transaction.resume_stub) {
+ call_resume(local->transaction.resume_stub);
+ local->transaction.resume_stub = NULL;
+ }
+
+ int_lock->lock_cbk = afr_transaction_done;
+ afr_unlock(frame, this);
+
+ return 0;
+}
-static int
-__changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
+static void
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int op_ret = 0;
- afr_transaction_type type = -1;
+ afr_local_t *local = frame->local;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- priv = this->private;
- local = frame->local;
- type = local->transaction.type;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB,
+ "Failing %s for gfid %s. Fop state is:%d", gf_fop_list[local->op],
+ uuid_utoa(local->inode->gfid), local->fop_state);
- if (__changelog_enabled (priv, type)) {
- switch (local->op) {
+ afr_changelog_post_op_done(frame, this);
+}
- case GF_FOP_WRITE:
- case GF_FOP_FTRUNCATE:
- op_ret = 1;
- break;
+unsigned char *
+afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock)
+{
+    /* Because the same set of subvols participates in all lockee
+     * entities. */
+ return int_lock->lockee[0].locked_nodes;
+}
- case GF_FOP_FLUSH:
- op_ret = 0;
- break;
+int
+afr_changelog_call_count(afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned char *failed_subvols,
+ unsigned int child_count)
+{
+ int i = 0;
+ int call_count = 0;
- default:
- op_ret = 1;
- }
+ for (i = 0; i < child_count; i++) {
+ if (pre_op_subvols[i] && !failed_subvols[i]) {
+ call_count++;
}
+ }
- return op_ret;
-}
+ if (type == AFR_ENTRY_RENAME_TRANSACTION)
+ call_count *= 2;
+ return call_count;
+}
-static int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending)
+gf_boolean_t
+afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this)
{
- int i = 0;
- int ret = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (priv->thin_arbiter_count) {
+ /* We need to perform post-op even if 1 data brick was down
+ * before the txn started.*/
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count))
+ return _gf_false;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ local->transaction.failed_subvols[i])
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_static_bin (xattr, priv->pending_key[i],
- pending[i], 3 * sizeof (int32_t));
- /* 3 = data+metadata+entry */
+void
+afr_handle_symmetric_errors(call_frame_t *frame, xlator_t *this)
+{
+ if (afr_is_symmetric_error(frame, this))
+ __mark_all_success(frame, this);
+}
- if (ret < 0)
- goto out;
+gf_boolean_t
+afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame)
+{
+ unsigned int quorum_count = 0;
+ afr_private_t *priv = NULL;
+ unsigned int up_children_count = 0;
+
+ priv = this->private;
+ up_children_count = AFR_COUNT(subvols, priv->child_count);
+
+ if (afr_lookup_has_quorum(frame, up_children_count))
+ return _gf_true;
+
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ /*
+ * Special case for auto-quorum with an even number of nodes.
+ *
+ * A replica set with even count N can only handle the same
+ * number of failures as odd N-1 before losing "vanilla"
+ * quorum, and the probability of more simultaneous failures is
+ * actually higher. For example, with a 1% chance of failure
+ * we'd have a 0.03% chance of two simultaneous failures with
+ * N=3 but a 0.06% chance with N=4. However, the special case
+ * is necessary for N=2 because there's no real quorum in that
+ * case (i.e. can't normally survive *any* failures). In that
+ * case, we treat the first node as a tie-breaker, allowing
+ * quorum to be retained in some cases while still honoring the
+ * all-important constraint that there can not simultaneously
+ * be two partitioned sets of nodes each believing they have
+ * quorum. Of two equally sized sets, the one without that
+ * first node will lose.
+ *
+ * It turns out that the special case is beneficial for higher
+ * values of N as well. Continuing the example above, the
+ * probability of losing quorum with N=4 and this type of
+ * quorum is (very) slightly lower than with N=3 and vanilla
+ * quorum. The difference becomes even more pronounced with
+ * higher N. Therefore, even though such replica counts are
+ * unlikely to be seen in practice, we might as well use the
+ * "special" quorum then as well.
+ */
+ if ((up_children_count * 2) == priv->child_count) {
+ return subvols[0];
}
+ }
-out:
- return ret;
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ quorum_count = priv->child_count / 2 + 1;
+ } else {
+ quorum_count = priv->quorum_count;
+ }
+
+ if (up_children_count >= quorum_count)
+ return _gf_true;
+
+ return _gf_false;
}
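
    As a worked example of the even-count tie-breaker above: with
    child_count = 4, AFR_QUORUM_AUTO and exactly two children up,
    (2 * 2) == 4, so quorum holds only if subvols[0] is one of them; with
    three or more up, the usual 4/2 + 1 = 3 rule applies. A standalone
    restatement of the same arithmetic (illustrative sketch, not the function
    actually used):

        /* Illustrative restatement of the auto-quorum rule implemented above. */
        static int
        afr_auto_quorum_example(const unsigned char *up, int child_count)
        {
            int i = 0, up_count = 0;

            for (i = 0; i < child_count; i++)
                up_count += up[i] ? 1 : 0;

            if (up_count * 2 == child_count) /* even split: first brick breaks the tie */
                return up[0];

            return up_count >= (child_count / 2 + 1);
        }
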
+static gf_boolean_t
+afr_has_fop_quorum(call_frame_t *frame)
+{
+ xlator_t *this = frame->this;
+ afr_local_t *local = frame->local;
+ unsigned char *locked_nodes = NULL;
-static int
-afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
- afr_transaction_type type)
-{
- int i = 0;
- int ret = 0;
- int *arr = NULL;
- int index = 0;
- size_t pending_xattr_size = 3 * sizeof (int32_t);
- /* 3 = data+metadata+entry */
+ locked_nodes = afr_locked_nodes_get(local->transaction.type,
+ &local->internal_lock);
+ return afr_has_quorum(locked_nodes, this, NULL);
+}
- index = afr_index_for_transaction_type (type);
+static gf_boolean_t
+afr_has_fop_cbk_quorum(call_frame_t *frame)
+{
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ unsigned char *success = alloca0(priv->child_count);
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i])
+ if (!local->transaction.failed_subvols[i])
+ success[i] = 1;
+ }
+
+ return afr_has_quorum(success, this, NULL);
+}
- for (i = 0; i < priv->child_count; i++) {
- arr = GF_CALLOC (1, pending_xattr_size,
- gf_afr_mt_char);
- if (!arr) {
- ret = -1;
- goto out;
- }
+gf_boolean_t
+afr_need_dirty_marking(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = NULL;
+ gf_boolean_t need_dirty = _gf_false;
- memcpy (arr, pending[i], pending_xattr_size);
+ local = frame->local;
- arr[index] = hton32 (ntoh32 (arr[index]) + 1);
+ if (!priv->quorum_count || !local->optimistic_change_log)
+ return _gf_false;
- ret = dict_set_bin (xattr, priv->pending_key[i],
- arr, pending_xattr_size);
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION)
+ return _gf_false;
- if (ret < 0)
- goto out;
- }
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) ==
+ priv->child_count)
+ return _gf_false;
-out:
- return ret;
-}
+ if (!afr_has_fop_cbk_quorum(frame))
+ need_dirty = _gf_true;
+ return need_dirty;
+}
-int
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
+void
+afr_handle_quorum(call_frame_t *frame, xlator_t *this)
{
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ const char *file = NULL;
+ uuid_t gfid = {0};
- switch (type) {
- case AFR_DATA_TRANSACTION:
- ret = priv->child_count;
- break;
+ local = frame->local;
+ priv = frame->this->private;
- case AFR_METADATA_TRANSACTION:
- ret = priv->child_count;
- break;
+ if (priv->quorum_count == 0)
+ return;
+
+ /* If the fop already failed return right away to preserve errno */
+ if (local->op_ret == -1)
+ return;
+ /*
+ * Network split may happen just after the fops are unwound, so check
+ * if the fop succeeded in a way it still follows quorum. If it doesn't,
+     * mark the fop as a failure and mark the changelogs so they reflect that
+     * failure.
+ *
+ * Scenario:
+ * There are 3 mounts on 3 machines(node1, node2, node3) all writing to
+ * single file. Network split happened in a way that node1 can't see
+ * node2, node3. Node2, node3 both of them can't see node1. Now at the
+ * time of sending write all the bricks are up. Just after write fop is
+ * wound on node1, network split happens. Node1 thinks write fop failed
+ * on node2, node3 so marks pending changelog for those 2 extended
+ * attributes on node1. Node2, node3 thinks writes failed on node1 so
+ * they mark pending changelog for node1. When the network is stable
+ * again the file already is in split-brain. These checks prevent
+ * marking pending changelog on other subvolumes if the fop doesn't
+ * succeed in a way it is still following quorum. So with this fix what
+ * is happening is, node1 will have all pending changelog(FOOL) because
+ * the write succeeded only on node1 but failed on node2, node3 so
+ * instead of marking pending changelogs on node2, node3 it just treats
+ * the fop as failure and goes into DIRTY state. Where as node2, node3
+ * say they are sources and have pending changelog to node1 so there is
+ * no split-brain with the fix. The problem is eliminated completely.
+ */
+
+ if (afr_has_fop_cbk_quorum(frame))
+ return;
+
+ if (afr_need_dirty_marking(frame, this))
+ goto set_response;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i])
+ afr_transaction_fop_failed(frame, frame->this, i);
+ }
+
+set_response:
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ if (local->op_errno == 0)
+ local->op_errno = afr_quorum_errno(priv);
+
+ if (local->fd) {
+ gf_uuid_copy(gfid, local->fd->inode->gfid);
+ file = uuid_utoa(gfid);
+ } else {
+ loc_path(&local->loc, local->loc.name);
+ file = local->loc.path;
+ }
+
+ gf_msg(frame->this->name, GF_LOG_WARNING, local->op_errno,
+ AFR_MSG_QUORUM_FAIL, "%s: Failing %s as quorum is not met", file,
+ gf_fop_list[local->op]);
+
+ switch (local->transaction.type) {
case AFR_ENTRY_TRANSACTION:
case AFR_ENTRY_RENAME_TRANSACTION:
- ret = priv->child_count;
- break;
- }
+ afr_pick_error_xdata(local, priv, local->parent, local->readable,
+ local->parent2, local->readable2);
+ break;
+ default:
+ afr_pick_error_xdata(local, priv, local->inode, local->readable,
+ NULL, NULL);
+ break;
+ }
+}
- return ret;
+int
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ loc->parent = inode_ref(priv->root_inode);
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+ loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) {
+        /* Except for afr_ta_id_file_check(), which is path based, all other
+         * gluster FOPs need the gfid. */
+ return -EINVAL;
+ }
+ gf_uuid_copy(loc->gfid, priv->ta_gfid);
+ loc->inode = inode_new(loc->parent->table);
+ if (!loc->inode) {
+ loc_wipe(loc);
+ return -ENOMEM;
+ }
+ return 0;
}
-/* {{{ pending */
+static int
+afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque)
+{
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *txn_frame = NULL;
+ afr_ta_fop_state_t fop_state;
+
+ local = (afr_local_t *)opaque;
+ fop_state = local->fop_state;
+ txn_frame = local->transaction.frame;
+ this = frame->this;
+
+ if (ret == 0) {
+ /*Mark pending xattrs on the up data brick.*/
+ afr_post_op_handle_success(txn_frame, this);
+ } else {
+ afr_post_op_handle_failure(txn_frame, this, -ret);
+ }
+
+ STACK_DESTROY(frame->root);
+ afr_ta_process_onwireq(fop_state, this);
+
+ return 0;
+}
-int32_t
-afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
+int **
+afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending,
+ dict_t *xattr, afr_local_t *local)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int child_index = 0;
- int call_count = -1;
+ int **changelog = NULL;
+ int idx = 0;
+ int ret = 0;
+ int i;
+
+ if (local->is_new_entry == _gf_true) {
+ changelog = afr_mark_pending_changelog(priv, pending, xattr,
+ local->cont.dir_fop.buf.ia_type);
+ } else {
+ idx = afr_index_for_transaction_type(local->transaction.type);
+ changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!changelog) {
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ changelog[i][idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict(priv, xattr, changelog);
+ if (ret < 0) {
+ afr_matrix_cleanup(changelog, priv->child_count);
+ return NULL;
+ }
+ }
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
+out:
+ return changelog;
+}
- child_index = (long) cookie;
+static void
+afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local,
+ gf_boolean_t *valid)
+{
+ if (priv->ta_event_gen > local->ta_event_gen) {
+ /* We can't trust the ta's response anymore.*/
+ afr_ta_locked_priv_invalidate(priv);
+ *valid = _gf_false;
+ return;
+ }
+ return;
+}
- if (op_ret == 1) {
+static int
+afr_ta_post_op_do(void *opaque)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t *this = NULL;
+ dict_t *xattr = NULL;
+ unsigned char *pending = NULL;
+ int **changelog = NULL;
+ int failed_subvol = -1;
+ int success_subvol = -1;
+ loc_t loc = {
+ 0,
+ };
+ int i = 0;
+ int ret = 0;
+ gf_boolean_t valid = _gf_true;
+
+ local = (afr_local_t *)opaque;
+ this = local->transaction.frame->this;
+ priv = this->private;
+
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate loc for thin-arbiter.");
+ goto out;
+ }
+
+ xattr = dict_new();
+ if (!xattr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pending = alloca0(priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i]) {
+ pending[i] = 1;
+ failed_subvol = i;
+ } else {
+ success_subvol = i;
}
-
- if (op_ret == 0) {
- __mark_pre_op_undone_on_fd (frame, this, child_index);
+ }
+
+ changelog = afr_set_changelog_xattr(priv, pending, xattr, local);
+
+ if (!changelog) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = afr_ta_post_op_lock(this, &loc);
+ if (ret)
+ goto out;
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s failed for gfid %s.",
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+ uuid_utoa(local->inode->gfid));
+ }
+ LOCK(&priv->lock);
+ {
+ if (ret == 0) {
+ priv->ta_bad_child_index = failed_subvol;
+ } else if (ret == -EINVAL) {
+ priv->ta_bad_child_index = success_subvol;
+ ret = -EIO; /* TA failed the fop. Return EIO to application. */
}
- LOCK (&frame->lock);
- {
- call_count = --local->call_count;
- }
- UNLOCK (&frame->lock);
+ afr_ta_locked_xattrop_validate(priv, local, &valid);
+ }
+ UNLOCK(&priv->lock);
+ if (valid == _gf_false) {
+ gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s for gfid %s invalidated due "
+ "to event-gen mismatch.",
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+ uuid_utoa(local->inode->gfid));
+ ret = -EIO;
+ }
+
+ afr_ta_post_op_unlock(this, &loc);
+out:
+ if (xattr)
+ dict_unref(xattr);
- if (call_count == 0) {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- }
- }
+ if (changelog)
+ afr_matrix_cleanup(changelog, priv->child_count);
- return 0;
-}
+ loc_wipe(&loc);
+ return ret;
+}
-void
-afr_update_read_child (call_frame_t *frame, xlator_t *this, inode_t *inode,
- afr_transaction_type type)
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local)
{
- int curr_read_child = -1;
- int new_read_child = -1;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int **pending = NULL;
- int idx = 0;
+ call_frame_t *ta_frame = NULL;
+ int ret = 0;
+
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ goto err;
+ }
+ ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done,
+ ta_frame, local);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to launch post-op on thin arbiter for gfid %s",
+ uuid_utoa(local->inode->gfid));
+ STACK_DESTROY(ta_frame->root);
+ goto err;
+ }
+
+ return ret;
+err:
+ afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM);
+ return ret;
+}
- idx = afr_index_for_transaction_type (type);
+static void
+afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local,
+ int *on_wire_count)
+{
+ LOCK(&priv->lock);
+ {
+ if (priv->release_ta_notify_dom_lock == _gf_true) {
+ /* Put the fop in waitq until notify dom lock is released.*/
+ local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL;
+ list_add_tail(&local->ta_waitq, &priv->ta_waitq);
+ } else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) {
+ /* Post-op on thin-arbiter to decide success/failure. */
+ local->fop_state = TA_GET_INFO_FROM_TA_FILE;
+ *on_wire_count = ++priv->ta_on_wire_txn_count;
+ if (*on_wire_count > 1) {
+ /*Avoid sending multiple on-wire post-ops on TA*/
+ list_add_tail(&local->ta_onwireq, &priv->ta_onwireq);
+ }
+ } else if (local->ta_failed_subvol == priv->ta_bad_child_index) {
+ /* Post-op on TA not needed as the fop failed on the in-memory bad
+ * brick. Just mark pending xattrs on the good data brick.*/
+ local->fop_state = TA_INFO_IN_MEMORY_SUCCESS;
+ priv->ta_in_mem_txn_count++;
+ } else {
+ /* Post-op on TA not needed as the fop succeeded only on the
+ * in-memory bad data brick and not the good one. Fail the fop.*/
+ local->fop_state = TA_INFO_IN_MEMORY_FAILED;
+ priv->ta_in_mem_txn_count++;
+ }
+ }
+ UNLOCK(&priv->lock);
+}
- priv = this->private;
- local = frame->local;
- curr_read_child = afr_read_child (this, inode);
- pending = local->pending;
+static void
+afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local)
+{
+ int i = 0;
- if (pending[curr_read_child][idx] != 0)
- return;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i]) {
+ local->ta_failed_subvol = i;
+ break;
+ }
+ }
+}
- /* need to set new read_child */
- for (new_read_child = 0; new_read_child < priv->child_count;
- new_read_child++) {
+static void
+afr_post_op_handle_success(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
- if (!priv->child_up[new_read_child])
- /* child is down */
- continue;
+ local = frame->local;
+ if (local->is_new_entry == _gf_true) {
+ afr_mark_new_entry_changelog(frame, this);
+ }
+ afr_changelog_post_op_do(frame, this);
- if (pending[new_read_child][idx] == 0)
- /* op just failed */
- continue;
+ return;
+}
- break;
- }
+static void
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno)
+{
+ afr_changelog_post_op_fail(frame, this, op_errno);
- if (new_read_child == priv->child_count)
- /* all children uneligible. leave as-is */
- return;
+ return;
+}
- afr_set_read_child (this, inode, new_read_child);
+static void
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int on_wire_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_ta_set_fop_state(priv, local, &on_wire_count);
+
+ switch (local->fop_state) {
+ case TA_GET_INFO_FROM_TA_FILE:
+ if (on_wire_count == 1)
+ afr_ta_post_op_synctask(this, local);
+ /*else, fop is queued in ta_onwireq.*/
+ break;
+ case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+ /*Post releasing the notify lock, we will act on this queue*/
+ break;
+ case TA_INFO_IN_MEMORY_SUCCESS:
+ afr_post_op_handle_success(frame, this);
+ break;
+ case TA_INFO_IN_MEMORY_FAILED:
+ afr_post_op_handle_failure(frame, this, EIO);
+ break;
+ default:
+ break;
+ }
+ return;
}
+static void
+afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+
+ afr_ta_fill_failed_subvol(priv, local);
+ gf_msg_debug(this->name, 0,
+ "Fop failed on data brick (%s) for gfid=%s. "
+ "ta info needed to decide fop result.",
+ priv->children[local->ta_failed_subvol]->name,
+ uuid_utoa(local->inode->gfid));
+ afr_ta_decide_post_op_state(frame, this);
+}
-int
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
+void
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = this->private;
- afr_internal_lock_t *int_lock = NULL;
- int ret = 0;
- int i = 0;
- int call_count = 0;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = NULL;
+ dict_t *xattr = NULL;
+ int i = 0;
+ int ret = 0;
+ int idx = 0;
+ int nothing_failed = 1;
+ gf_boolean_t need_undirty = _gf_false;
+
+ afr_handle_quorum(frame, this);
+ local = frame->local;
+ idx = afr_index_for_transaction_type(local->transaction.type);
+
+ xattr = dict_new();
+ if (!xattr) {
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
+ goto out;
+ }
+
+ nothing_failed = afr_txn_nothing_failed(frame, this);
+
+ if (afr_changelog_pre_op_uninherit(frame, this))
+ need_undirty = _gf_false;
+ else
+ need_undirty = _gf_true;
+
+ if (local->op_ret < 0 && !nothing_failed) {
+ if (afr_need_dirty_marking(frame, this)) {
+ local->dirty[idx] = hton32(1);
+ goto set_dirty;
+ }
- afr_local_t * local = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- dict_t **xattr = NULL;
- int piggyback = 0;
- int index = 0;
- int nothing_failed = 1;
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
+
+ if (nothing_failed && !need_undirty) {
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
+
+ if (local->transaction.in_flight_sb) {
+ afr_changelog_post_op_fail(frame, this,
+ local->transaction.in_flight_sb_errno);
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->pending[i][idx] = hton32(1);
+ }
+
+ ret = afr_set_pending_dict(priv, xattr, local->pending);
+ if (ret < 0) {
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
+ goto out;
+ }
+
+ if (need_undirty)
+ local->dirty[idx] = hton32(-1);
+ else
+ local->dirty[idx] = hton32(0);
+
+set_dirty:
+ ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
+ goto out;
+ }
+
+ afr_changelog_do(frame, this, xattr, afr_changelog_post_op_done,
+ AFR_TRANSACTION_POST_OP);
+out:
+ if (xattr)
+ dict_unref(xattr);
- local = frame->local;
- int_lock = &local->internal_lock;
+ return;
+}
- __mark_down_children (local->pending, priv->child_count,
- local->child_up, local->transaction.type);
+static int
+afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int failed_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (priv->thin_arbiter_count) {
+ failed_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (failed_count == 1) {
+ afr_handle_failure_using_thin_arbiter(frame, this);
+ return 0;
+ } else {
+ /* Txn either succeeded or failed on both data bricks. Let
+ * post_op_do handle it as the case might be. */
+ }
+ }
- if (local->fd)
- afr_update_read_child (frame, this, local->fd->inode,
- local->transaction.type);
+ afr_changelog_post_op_do(frame, this);
+ return 0;
+}
- xattr = alloca (priv->child_count * sizeof (*xattr));
- memset (xattr, 0, (priv->child_count * sizeof (*xattr)));
+gf_boolean_t
+afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
+
+ local = frame->local;
+ priv = this->private;
+ ctx = local->inode_ctx;
+
+ type = afr_index_for_transaction_type(local->transaction.type);
+ if (type != AFR_DATA_TRANSACTION)
+ return !local->transaction.dirtied;
+
+ if (local->transaction.no_uninherit)
+ return _gf_false;
+
+ /* This function must be idempotent. So check if we
+ were called before and return the same answer again.
+
+ It is important to keep this function idempotent for
+ the call in afr_changelog_post_op_safe() to not have
+ side effects on the call from afr_changelog_post_op_now()
+ */
+ if (local->transaction.uninherit_done)
+ return local->transaction.uninherit_value;
+
+ LOCK(&local->inode->lock);
+ {
for (i = 0; i < priv->child_count; i++) {
- xattr[i] = get_new_dict ();
- dict_ref (xattr[i]);
+ if (local->transaction.pre_op[i] != ctx->pre_op_done[type][i]) {
+ ret = !local->transaction.dirtied;
+ goto unlock;
+ }
}
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- call_count *= 2;
+ if (ctx->inherited[type]) {
+ ret = _gf_true;
+ ctx->inherited[type]--;
+ } else if (ctx->on_disk[type]) {
+ ret = _gf_false;
+ ctx->on_disk[type]--;
+ } else {
+ /* ASSERT */
+ ret = _gf_false;
}
- local->call_count = call_count;
+ if (!ctx->inherited[type] && !ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ ctx->pre_op_done[type][i] = 0;
+ }
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
- if (local->fd)
- fdctx = afr_fd_ctx_get (local->fd, this);
+ local->transaction.uninherit_done = _gf_true;
+ local->transaction.uninherit_value = ret;
- if (call_count == 0) {
- /* no child is up */
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
- }
+ return ret;
+}
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- return 0;
+gf_boolean_t
+afr_changelog_pre_op_inherit(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return _gf_false;
+
+ type = afr_index_for_transaction_type(local->transaction.type);
+
+ LOCK(&local->inode->lock);
+ {
+ if (!local->inode_ctx->on_disk[type]) {
+ /* nothing to inherit yet */
+ ret = _gf_false;
+ goto unlock;
}
- /* check if something has failed, to handle piggybacking */
- nothing_failed = 1;
- index = afr_index_for_transaction_type (local->transaction.type);
for (i = 0; i < priv->child_count; i++) {
- if (local->pending[i][index] == 0) {
- nothing_failed = 0;
- break;
- }
+ if (local->transaction.pre_op[i] !=
+ local->inode_ctx->pre_op_done[type][i]) {
+ /* either inherit exactly, or don't */
+ ret = _gf_false;
+ goto unlock;
+ }
}
- index = afr_index_for_transaction_type (local->transaction.type);
- if (local->optimistic_change_log &&
- local->transaction.type != AFR_DATA_TRANSACTION) {
- /* if nothing_failed, then local->pending[..] == {0 .. 0} */
- for (i = 0; i < priv->child_count; i++)
- local->pending[i][index]++;
- }
+ local->inode_ctx->inherited[type]++;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
+ ret = _gf_true;
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
-
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- {
- if (!fdctx) {
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- break;
- }
-
- LOCK (&local->fd->lock);
- {
- piggyback = 0;
- if (fdctx->pre_op_piggyback[i]) {
- fdctx->pre_op_piggyback[i]--;
- piggyback = 1;
- }
- }
- UNLOCK (&local->fd->lock);
-
- if (piggyback && !nothing_failed)
- ret = afr_set_piggyback_dict (priv, xattr[i],
- local->pending,
- local->transaction.type);
-
- if (nothing_failed && piggyback) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i]);
- } else {
- STACK_WIND_COOKIE (frame,
- afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- }
- break;
- case AFR_METADATA_TRANSACTION:
- {
- if (nothing_failed) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i]);
- break;
- }
-
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
+ local->transaction.inherited = _gf_true;
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- if (nothing_failed) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i]);
- } else {
- STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- call_count--;
- }
+ return ret;
+}
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
+gf_boolean_t
+afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
+ local = frame->local;
+ priv = this->private;
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+ return _gf_false;
- /* fall through */
+ if (local->transaction.inherited)
+ /* was already inherited in afr_changelog_pre_op */
+ return _gf_false;
- case AFR_ENTRY_TRANSACTION:
- {
- if (nothing_failed) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i]);
- break;
- }
-
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
- }
+ if (!local->transaction.dirtied)
+ return _gf_false;
- if (!--call_count)
- break;
- }
+ if (!afr_txn_nothing_failed(frame, this))
+ return _gf_false;
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
+ type = afr_index_for_transaction_type(local->transaction.type);
+
+ ret = _gf_false;
+
+ LOCK(&local->inode->lock);
+ {
+ if (!local->inode_ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ local->inode_ctx->pre_op_done[type][i] =
+ (!local->transaction.failed_subvols[i]);
+ } else {
+ for (i = 0; i < priv->child_count; i++)
+ if (local->inode_ctx->pre_op_done[type][i] !=
+ (!local->transaction.failed_subvols[i])) {
+ local->transaction.no_uninherit = 1;
+ goto unlock;
+ }
}
+ local->inode_ctx->on_disk[type]++;
- return 0;
-}
+ ret = _gf_true;
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
+ return ret;
+}
-int32_t
-afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
+int
+afr_changelog_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = this->private;
- loc_t * loc = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = -1;
- local = frame->local;
- loc = &local->loc;
+ local = frame->local;
+ child_index = (long)cookie;
- LOCK (&frame->lock);
- {
- if (op_ret == 1) {
- /* special op_ret for piggyback */
- }
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
- if (op_ret == 0) {
- __mark_pre_op_done_on_fd (frame, this, child_index);
- }
+ if (xattr)
+ local->transaction.changelog_xdata[child_index] = dict_ref(xattr);
- if (op_ret == -1) {
- local->child_up[child_index] = 0;
-
- if (op_errno == ENOTSUP) {
- gf_log (this->name, GF_LOG_ERROR,
- "xattrop not supported by %s",
- priv->children[child_index]->name);
- local->op_ret = -1;
-
- } else if (!child_went_down (op_ret, op_errno)) {
- gf_log (this->name, GF_LOG_ERROR,
- "xattrop failed on child %s: %s",
- priv->children[child_index]->name,
- strerror (op_errno));
- }
- local->op_errno = op_errno;
- }
-
- call_count = --local->call_count;
- }
- UNLOCK (&frame->lock);
+ call_count = afr_frame_return(frame);
- if (call_count == 0) {
- if ((local->op_ret == -1) &&
- (local->op_errno == ENOTSUP)) {
- local->transaction.resume (frame, this);
- } else {
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
+ if (call_count == 0) {
+ local->transaction.changelog_resume(frame, this);
+ }
- afr_pid_restore (frame);
+ return 0;
+}
- local->transaction.fop (frame, this);
+void
+afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op,
+ dict_t **xdata, dict_t **newloc_xdata)
+{
+ int i = 0;
+ int ret = 0;
+ char *key = NULL;
+ int keylen = 0;
+ const char *name = NULL;
+ dict_t *xdata1 = NULL;
+ dict_t *xdata2 = NULL;
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t need_entry_key_set = _gf_true;
+
+ local = frame->local;
+ this = THIS;
+ priv = this->private;
+
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION)
+ goto out;
+
+ if (!priv->esh_granular)
+ goto out;
+
+ xdata1 = dict_new();
+ if (!xdata1)
+ goto out;
+
+ name = local->loc.name;
+ if (local->op == GF_FOP_LINK)
+ name = local->newloc.name;
+
+ switch (op) {
+ case AFR_TRANSACTION_PRE_OP:
+ key = GF_XATTROP_ENTRY_IN_KEY;
+ break;
+ case AFR_TRANSACTION_POST_OP:
+ if (afr_txn_nothing_failed(frame, this)) {
+ key = GF_XATTROP_ENTRY_OUT_KEY;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i])
+ continue;
+ need_entry_key_set = _gf_false;
+ break;
}
+            /* If the transaction itself did not fail and there
+             * are no failed subvolumes, check whether the fop
+             * failed due to a symmetric error. If it did, do
+             * not set the ENTRY_OUT xattr, which would end up
+             * deleting a name index that was possibly created by
+             * an earlier entry txn that failed on some of the
+             * sub-volumes.
+             */
+ if (local->op_ret)
+ need_entry_key_set = _gf_false;
+ } else {
+ key = GF_XATTROP_ENTRY_IN_KEY;
+ }
+ break;
+ }
+
+ if (need_entry_key_set) {
+ keylen = strlen(key);
+ ret = dict_set_strn(xdata1, key, keylen, (char *)name);
+ if (ret)
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "%s/%s: Could not set %s key during xattrop",
+ uuid_utoa(local->loc.pargfid), local->loc.name, key);
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ xdata2 = dict_new();
+ if (!xdata2)
+ goto out;
+
+ ret = dict_set_strn(xdata2, key, keylen,
+ (char *)local->newloc.name);
+ if (ret)
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "%s/%s: Could not set %s key during "
+ "xattrop",
+ uuid_utoa(local->newloc.pargfid), local->newloc.name,
+ key);
}
+ }
- return 0;
+ *xdata = xdata1;
+ *newloc_xdata = xdata2;
+ xdata1 = xdata2 = NULL;
+out:
+ if (xdata1)
+ dict_unref(xdata1);
+ return;
}
-
int
-afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
+afr_changelog_prepare(xlator_t *this, call_frame_t *frame, int *call_count,
+ afr_changelog_resume_t changelog_resume,
+ afr_xattrop_type_t op, dict_t **xdata,
+ dict_t **newloc_xdata)
{
- afr_private_t * priv = this->private;
- int i = 0;
- int ret = 0;
- int call_count = 0;
- dict_t **xattr = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- afr_local_t *local = NULL;
- int piggyback = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- xattr = alloca (priv->child_count * sizeof (*xattr));
- memset (xattr, 0, (priv->child_count * sizeof (*xattr)));
+ *call_count = afr_changelog_call_count(
+ local->transaction.type, local->transaction.pre_op,
+ local->transaction.failed_subvols, priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- xattr[i] = get_new_dict ();
- dict_ref (xattr[i]);
- }
+ if (*call_count == 0) {
+ changelog_resume(frame, this);
+ return -1;
+ }
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
+ afr_changelog_populate_xdata(frame, op, xdata, newloc_xdata);
+ local->call_count = *call_count;
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- call_count *= 2;
- }
-
- if (call_count == 0) {
- /* no child is up */
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
- }
+ local->transaction.changelog_resume = changelog_resume;
+ return 0;
+}
- local->internal_lock.lock_cbk =
- local->transaction.done;
- afr_unlock (frame, this);
- return 0;
+int
+afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume, afr_xattrop_type_t op)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ dict_t *newloc_xdata = NULL;
+ int i = 0;
+ int call_count = 0;
+ int ret = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.changelog_xdata[i]) {
+ dict_unref(local->transaction.changelog_xdata[i]);
+ local->transaction.changelog_xdata[i] = NULL;
}
+ }
- local->call_count = call_count;
+ ret = afr_changelog_prepare(this, frame, &call_count, changelog_resume, op,
+ &xdata, &newloc_xdata);
- __mark_all_pending (local->pending, priv->child_count,
- local->transaction.type);
+ if (ret)
+ return 0;
- if (local->fd)
- fdctx = afr_fd_ctx_get (local->fd, this);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i] ||
+ local->transaction.failed_subvols[i])
+ continue;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
-
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- {
- if (!fdctx) {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- break;
- }
-
- LOCK (&local->fd->lock);
- {
- piggyback = 0;
- if (fdctx->pre_op_done[i]) {
- fdctx->pre_op_piggyback[i]++;
- piggyback = 1;
- fdctx->hit++;
- } else {
- fdctx->miss++;
- }
- }
- UNLOCK (&local->fd->lock);
-
- if (piggyback)
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i]);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
- case AFR_METADATA_TRANSACTION:
- {
- if (local->optimistic_change_log) {
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i]);
- break;
- }
-
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr[i]);
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ if (!local->fd) {
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->xattrop,
+ &local->loc, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ } else {
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fxattrop,
+ local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata);
}
break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- if (local->optimistic_change_log) {
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i]);
- } else {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
-
- call_count--;
- }
-
-
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
-
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
+ STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr, newloc_xdata);
+ call_count--;
/* fall through */
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->optimistic_change_log) {
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i]);
- break;
- }
-
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
+ case AFR_ENTRY_TRANSACTION:
+ if (local->fd)
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fxattrop,
+ local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ else
+ STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr, xdata);
break;
- }
-
- if (!--call_count)
- break;
}
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
- }
+ if (!--call_count)
+ break;
+ }
- return 0;
+ if (xdata)
+ dict_unref(xdata);
+ if (newloc_xdata)
+ dict_unref(newloc_xdata);
+ return 0;
}
+static void
+afr_init_optimistic_changelog_for_txn(xlator_t *this, afr_local_t *local)
+{
+ int locked_count = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ locked_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+ if (priv->optimistic_change_log && locked_count == priv->child_count)
+ local->optimistic_change_log = 1;
+
+ return;
+}
int
-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
+afr_changelog_pre_op(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ int i = 0;
+ int ret = 0;
+ int call_count = 0;
+ int op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ unsigned char *locked_nodes = NULL;
+ int idx = -1;
+ gf_boolean_t pre_nop = _gf_true;
+ dict_t *xdata_req = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ idx = afr_index_for_transaction_type(local->transaction.type);
+
+ locked_nodes = afr_locked_nodes_get(local->transaction.type, int_lock);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_nodes[i]) {
+ local->transaction.pre_op[i] = 1;
+ call_count++;
+ } else {
+ local->transaction.failed_subvols[i] = 1;
+ }
+ }
+
+ afr_init_optimistic_changelog_for_txn(this, local);
+
+ if (afr_changelog_pre_op_inherit(frame, this))
+ goto next;
+
+    /* This condition should not be met with the present code, as
+ * transaction.done will be called if locks are not acquired on even a
+ * single node.
+ */
+ if (call_count == 0) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+
+    /* Check if the fop can be performed on at least
+     * a quorum of nodes.
+     */
+ if (priv->quorum_count && !afr_has_fop_quorum(frame)) {
+ op_errno = int_lock->lock_op_errno;
+ if (op_errno == 0)
+ op_errno = afr_quorum_errno(priv);
+ goto err;
+ }
+
+ xdata_req = dict_new();
+ if (!xdata_req) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (call_count < priv->child_count)
+ pre_nop = _gf_false;
+
+    /* Set an all-zero pending changelog so that in the cbk, we can get the
+     * current on-disk values. In a replica 3 volume with arbiter enabled,
+     * these values are needed to arrive at a go/no-go decision for the fop
+     * phase, to avoid ending up in split-brain. */
+
+ ret = afr_set_pending_dict(priv, xdata_req, local->pending);
+ if (ret < 0) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (afr_needs_changelog_update(local)) {
+ local->dirty[idx] = hton32(1);
+
+ ret = dict_set_static_bin(xdata_req, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- local = frame->local;
- int_lock = &local->internal_lock;
+ pre_nop = _gf_false;
+ local->transaction.dirtied = 1;
+ }
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Blocking inodelks failed.");
- local->transaction.done (frame, this);
- } else {
+ if (pre_nop)
+ goto next;
- gf_log (this->name, GF_LOG_DEBUG,
- "Blocking inodelks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
+ if (!local->pre_op_compat) {
+ dict_copy(xdata_req, local->xdata_req);
+ goto next;
+ }
- return 0;
+ afr_changelog_do(frame, this, xdata_req, afr_transaction_perform_fop,
+ AFR_TRANSACTION_PRE_OP);
+
+ if (xdata_req)
+ dict_unref(xdata_req);
+
+ return 0;
+next:
+ afr_transaction_perform_fop(frame, this);
+
+ if (xdata_req)
+ dict_unref(xdata_req);
+
+ return 0;
+err:
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ afr_handle_lock_acquire_failure(local);
+
+ if (xdata_req)
+ dict_unref(xdata_req);
+
+ return 0;
}
+int
+afr_post_nonblocking_lock_cbk(call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ /* Initiate blocking locks if non-blocking has failed */
+ if (int_lock->lock_op_ret < 0) {
+ gf_msg_debug(this->name, 0,
+ "Non blocking locks failed. Proceeding to blocking");
+ int_lock->lock_cbk = afr_internal_lock_finish;
+ afr_blocking_lock(frame, this);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Non blocking locks done. Proceeding to FOP");
+
+ afr_internal_lock_finish(frame, this);
+ }
+
+ return 0;
+}
int
-afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
+afr_post_blocking_rename_cbk(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- /* Initiate blocking locks if non-blocking has failed */
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking inodelks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_post_blocking_inodelk_cbk;
- afr_blocking_lock (frame, this);
- } else {
+ if (int_lock->lock_op_ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INTERNAL_LKS_FAILED,
+ "Blocking entrylks failed.");
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking inodelks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
+ afr_transaction_done(frame, this);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Blocking entrylks done. Proceeding to FOP");
- return 0;
+ afr_internal_lock_finish(frame, this);
+ }
+ return 0;
}
-
int
-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
+afr_post_lower_unlock_cbk(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Blocking entrylks failed.");
- local->transaction.done (frame, this);
- } else {
+ GF_ASSERT(!int_lock->higher_locked);
- gf_log (this->name, GF_LOG_DEBUG,
- "Blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
+ int_lock->lock_cbk = afr_post_blocking_rename_cbk;
+ afr_blocking_lock(frame, this);
- return 0;
+ return 0;
}
+int
+afr_set_transaction_flock(xlator_t *this, afr_local_t *local,
+ afr_lockee_t *lockee)
+{
+ afr_private_t *priv = NULL;
+ struct gf_flock *flock = NULL;
+
+ priv = this->private;
+ flock = &lockee->flock;
+
+ if ((priv->arbiter_count || local->transaction.eager_lock_on ||
+ priv->full_lock) &&
+ local->transaction.type == AFR_DATA_TRANSACTION) {
+        /* Lock the entire file to avoid network split-brains. */
+ flock->l_len = 0;
+ flock->l_start = 0;
+ } else {
+ flock->l_len = local->transaction.len;
+ flock->l_start = local->transaction.start;
+ }
+ flock->l_type = F_WRLCK;
+
+ return 0;
+}
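
As a rough illustration of the full-file versus byte-range choice made above, here is a minimal standalone sketch using the standard POSIX struct flock in place of gf_flock; fill_write_lock and its parameters are illustrative names, not part of this patch.

#include <fcntl.h>
#include <string.h>

static void
fill_write_lock(struct flock *fl, int full_file, off_t start, off_t len)
{
    memset(fl, 0, sizeof(*fl));
    fl->l_type = F_WRLCK;    /* exclusive (write) lock */
    fl->l_whence = SEEK_SET;
    if (full_file) {
        /* l_start = 0 and l_len = 0 cover the whole file, so two writers
         * can never hold disjoint ranges and diverge. */
        fl->l_start = 0;
        fl->l_len = 0;
    } else {
        fl->l_start = start;
        fl->l_len = len;
    }
}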
int
-afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
+afr_lock(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- /* Initiate blocking locks if non-blocking has failed */
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_post_blocking_entrylk_cbk;
- afr_blocking_lock (frame, this);
- } else {
+ int_lock->lock_cbk = afr_post_nonblocking_lock_cbk;
+ int_lock->domain = this->name;
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ afr_set_transaction_flock(this, local, &int_lock->lockee[i]);
+ }
- return 0;
-}
+ break;
+ case AFR_ENTRY_TRANSACTION:
+ int_lock->lk_basename = local->transaction.basename;
+ if (local->transaction.parent_loc.path)
+ int_lock->lk_loc = &local->transaction.parent_loc;
+ else
+ GF_ASSERT(local->fd);
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ break;
+ }
+ afr_lock_nonblocking(frame, this);
-int
-afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this)
+ return 0;
+}
+
+static gf_boolean_t
+afr_locals_overlap(afr_local_t *local1, afr_local_t *local2)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
+
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
+
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
+
+ return ((end1 >= start2) && (end2 >= start1));
+}
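
For reference, a minimal standalone sketch of the same closed-interval overlap test, with the afr_local_t transaction fields replaced by plain start/len pairs; ranges_overlap is an illustrative name, and a length of 0 stands for "to end of file" as in the function above.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

static int
ranges_overlap(uint64_t start1, uint64_t len1, uint64_t start2, uint64_t len2)
{
    /* len == 0 means the range extends to the end of the file. */
    uint64_t end1 = len1 ? (start1 + len1 - 1) : ULLONG_MAX;
    uint64_t end2 = len2 ? (start2 + len2 - 1) : ULLONG_MAX;

    /* Two closed intervals overlap iff each starts before the other ends. */
    return (end1 >= start2) && (end2 >= start1);
}

int
main(void)
{
    printf("%d\n", ranges_overlap(0, 4096, 4095, 10)); /* 1: share byte 4095 */
    printf("%d\n", ranges_overlap(0, 4096, 4096, 10)); /* 0: adjacent only */
    printf("%d\n", ranges_overlap(8192, 0, 0, 10));    /* 0: [8192,EOF] vs [0,9] */
    return 0;
}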
- local = frame->local;
- int_lock = &local->internal_lock;
+gf_boolean_t
+afr_has_lock_conflict(afr_local_t *local, gf_boolean_t waitlist_check)
+{
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = NULL;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ /*
+     * Once the full-file lock is acquired in the eager-lock phase,
+     * overlapping writes do not compete for inode-locks; instead the lock
+     * is transferred to the next writes. Because of this, overlapping
+     * writes are not ordered, which can cause inconsistencies in
+     * replication.
+ * Example:
+ * Two overlapping writes w1, w2 are sent in parallel on same fd
+ * in two threads t1, t2.
+ * Both threads can execute afr_writev_wind in the following manner.
+ * t1 winds w1 on brick-0
+ * t2 winds w2 on brick-0
+ * t2 winds w2 on brick-1
+ * t1 winds w1 on brick-1
+ *
+ * This check makes sure the locks are not transferred for
+ * overlapping writes.
+ */
+ list_for_each_entry(each, &lock->owners, transaction.owner_list)
+ {
+ if (afr_locals_overlap(each, local)) {
+ return _gf_true;
+ }
+ }
+
+ if (!waitlist_check)
+ return _gf_false;
+ list_for_each_entry(each, &lock->waiting, transaction.wait_list)
+ {
+ if (afr_locals_overlap(each, local)) {
+ return _gf_true;
+ }
+ }
+ return _gf_false;
+}
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Blocking entrylks failed.");
- local->transaction.done (frame, this);
- } else {
+/* }}} */
+static void
+afr_copy_inodelk_vars(afr_internal_lock_t *dst, afr_internal_lock_t *src,
+ xlator_t *this, int lockee_num)
+{
+ afr_private_t *priv = this->private;
+ afr_lockee_t *sl = &src->lockee[lockee_num];
+ afr_lockee_t *dl = &dst->lockee[lockee_num];
+
+ dst->domain = src->domain;
+ dl->flock.l_len = sl->flock.l_len;
+ dl->flock.l_start = sl->flock.l_start;
+ dl->flock.l_type = sl->flock.l_type;
+ dl->locked_count = sl->locked_count;
+ memcpy(dl->locked_nodes, sl->locked_nodes,
+ priv->child_count * sizeof(*dl->locked_nodes));
+}
- gf_log (this->name, GF_LOG_DEBUG,
- "Blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
+void
+__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared)
+{
+ gf_boolean_t conflict = _gf_false;
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
+
+ while (!conflict) {
+ if (list_empty(&lock->waiting))
+ return;
+ each = list_entry(lock->waiting.next, afr_local_t,
+ transaction.wait_list);
+ if (afr_has_lock_conflict(each, _gf_false)) {
+ conflict = _gf_true;
}
- return 0;
+ if (conflict && !list_empty(&lock->owners))
+ return;
+ afr_copy_inodelk_vars(&each->internal_lock, &local->internal_lock,
+ each->transaction.frame->this, 0);
+ list_move_tail(&each->transaction.wait_list, shared);
+ list_add_tail(&each->transaction.owner_list, &lock->owners);
+ }
}
+static void
+afr_lock_resume_shared(struct list_head *list)
+{
+ afr_local_t *each = NULL;
+
+ while (!list_empty(list)) {
+ each = list_entry(list->next, afr_local_t, transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ afr_changelog_pre_op(each->transaction.frame,
+ each->transaction.frame->this);
+ }
+}
int
-afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this)
+afr_internal_lock_finish(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
+
+ local->internal_lock.lock_cbk = NULL;
+ if (!local->transaction.eager_lock_on) {
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_transaction_done(frame, this);
+ return 0;
+ }
+ afr_changelog_pre_op(frame, this);
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_handle_lock_acquire_failure(local);
+ } else {
+ lock->event_generation = local->event_generation;
+ afr_changelog_pre_op(frame, this);
+ }
+ }
- local = frame->local;
- int_lock = &local->internal_lock;
+ return 0;
+}
- GF_ASSERT (!int_lock->higher_locked);
+gf_boolean_t
+afr_are_conflicting_ops_waiting(afr_local_t *local, xlator_t *this)
+{
+ afr_lock_t *lock = NULL;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+    /* Let's say mount1 holds an eager-lock (full lock) and, after the
+     * eager-lock is taken, mount2 opens the same file. mount2 won't be able
+     * to perform any {meta,}data operations until mount1 releases the
+     * eager-lock. To avoid such a scenario, do not enable eager-lock for
+     * this transaction if open-fd-count is > 1 for metadata transactions or
+     * if num-inodelks > 1 for data transactions.
+     */
+
+ if (local->transaction.type == AFR_METADATA_TRANSACTION) {
+ if (local->inode_ctx->open_fd_count > 1) {
+ return _gf_true;
+ }
+ } else if (local->transaction.type == AFR_DATA_TRANSACTION) {
+ if (lock->num_inodelks > 1) {
+ return _gf_true;
+ }
+ }
- int_lock->lock_cbk = afr_post_blocking_rename_cbk;
- afr_blocking_lock (frame, this);
+ return _gf_false;
+}
- return 0;
+gf_boolean_t
+afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this,
+ int delay)
+{
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t res = _gf_false;
+
+ local = frame->local;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+ if (!afr_txn_nothing_failed(frame, this)) {
+ lock->release = _gf_true;
+ goto out;
+ }
+
+ if (afr_are_conflicting_ops_waiting(local, this)) {
+ lock->release = _gf_true;
+ goto out;
+ }
+
+ if (!list_empty(&lock->owners))
+ goto out;
+ else
+ GF_ASSERT(list_empty(&lock->waiting));
+
+ if (lock->release) {
+ goto out;
+ }
+
+ if (!delay) {
+ goto out;
+ }
+
+ if (local->transaction.disable_delayed_post_op) {
+ goto out;
+ }
+
+ if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) &&
+ (local->op != GF_FOP_FSYNC)) {
+        /* Only allow writes/fsyncs; shard issues [f]xattrops as part of
+         * writes, so those are fine too. */
+ goto out;
+ }
+
+ res = _gf_true;
+out:
+ return res;
}
+void
+afr_delayed_changelog_wake_up_cbk(void *data)
+{
+ afr_lock_t *lock = NULL;
+ afr_local_t *local = data;
+ afr_local_t *timer_local = NULL;
+ struct list_head shared;
+
+ INIT_LIST_HEAD(&shared);
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ if (list_empty(&lock->owners) && (local == timer_local)) {
+ GF_ASSERT(list_empty(&lock->waiting));
+ /*Last owner*/
+ lock->release = _gf_true;
+ lock->delay_timer = NULL;
+ }
+ }
+ UNLOCK(&local->inode->lock);
+ afr_changelog_post_op_now(local->transaction.frame,
+ local->transaction.frame->this);
+}
+/* SET operation */
int
-afr_set_transaction_flock (afr_local_t *local)
+afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local)
{
- afr_internal_lock_t *int_lock = NULL;
+ LOCK(&local->inode->lock);
+ {
+ local->inode_ctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&local->inode->lock);
- int_lock = &local->internal_lock;
+ return 0;
+}
- int_lock->lk_flock.l_len = local->transaction.len;
- int_lock->lk_flock.l_start = local->transaction.start;
- int_lock->lk_flock.l_type = F_WRLCK;
+/* TEST and CLEAR operation */
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode)
+{
+ afr_inode_ctx_t *ctx = NULL;
+ gf_boolean_t witness = _gf_false;
- return 0;
+ LOCK(&inode->lock);
+ {
+ (void)__afr_inode_ctx_get(this, inode, &ctx);
+
+ if (ctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ ctx->witnessed_unstable_write = _gf_false;
+ }
+ }
+ UNLOCK(&inode->lock);
+
+ return witness;
}
int
-afr_lock_rec (call_frame_t *frame, xlator_t *this)
+afr_changelog_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ priv = this->private;
+ local = frame->local;
- int_lock->transaction_lk_type = AFR_TRANSACTION_LK;
+ if (op_ret != 0) {
+        /* Failure of fsync() is as good as failure of the previous
+           write(). So treat it like one.
+ */
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, AFR_MSG_FSYNC_FAILED,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name, gf_fop_list[local->op]);
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- afr_set_transaction_flock (local);
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
- int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk;
+ call_count = afr_frame_return(frame);
- afr_nonblocking_inodelk (frame, this);
- break;
+ if (call_count == 0)
+ afr_changelog_post_op_now(frame, this);
- case AFR_ENTRY_RENAME_TRANSACTION:
+ return 0;
+}
- int_lock->lock_cbk = afr_post_blocking_rename_cbk;
- afr_blocking_lock (frame, this);
- break;
+int
+afr_changelog_fsync(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = -1;
- case AFR_ENTRY_TRANSACTION:
- int_lock->lk_basename = local->transaction.basename;
- if (&local->transaction.parent_loc)
- int_lock->lk_loc = &local->transaction.parent_loc;
- else
- GF_ASSERT (local->fd);
+ local = frame->local;
+ priv = this->private;
- int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
- afr_nonblocking_entrylk (frame, this);
- break;
- }
+ call_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now(frame, this);
return 0;
-}
+ }
+ local->call_count = call_count;
-int
-afr_lock (call_frame_t *frame, xlator_t *this)
-{
- afr_pid_save (frame);
+ xdata = dict_new();
+ if (xdata) {
+ ret = dict_set_int32_sizen(xdata, "batch-fsync", 1);
+ ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ }
- frame->root->pid = (long) frame->root;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
- afr_set_lk_owner (frame, this);
+ STACK_WIND_COOKIE(frame, afr_changelog_fsync_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fsync,
+ local->fd, 1, xdata);
+ if (!--call_count)
+ break;
+ }
- afr_set_lock_number (frame, this);
+ if (xdata)
+ dict_unref(xdata);
- return afr_lock_rec (frame, this);
+ return 0;
}
-
-/* }}} */
-
int
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- priv = this->private;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- if (__changelog_needed_pre_op (frame, this)) {
- afr_changelog_pre_op (frame, this);
- } else {
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
+
+ if (afr_changelog_pre_op_uninherit(frame, this) &&
+ afr_txn_nothing_failed(frame, this)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
+ */
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
+
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+       Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+       responsible operation than it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+       therefore be updated in such a way that it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+       This means we need to take matters into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write(this, local->inode)) {
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
+
+ /* Check whether users want durability and perform fsync/post-op
+ * accordingly.
+ */
+ if (priv->ensure_durability) {
+ /* Time to fsync() */
+ afr_changelog_fsync(frame, this);
+ } else {
+ afr_changelog_post_op_now(frame, this);
+ }
+
+ return 0;
+}
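
A minimal sketch of the durability point made in the comment above, using plain POSIX calls: a write on an fd opened with O_SYNC is stable as part of the write itself, while an ordinary write needs an explicit fsync() before any dirty marker may be trusted to be cleared. write_durably is an illustrative helper, not part of AFR.

#include <fcntl.h>
#include <unistd.h>

static int
write_durably(const char *path, const void *buf, size_t len, int use_osync)
{
    int flags = O_WRONLY | O_CREAT | (use_osync ? O_SYNC : 0);
    int fd = open(path, flags, 0644);
    ssize_t written;

    if (fd < 0)
        return -1;

    written = write(fd, buf, len);
    if (written == (ssize_t)len && !use_osync) {
        /* Unstable write witnessed: flush before trusting it (the analogue
         * of clearing the pending/dirty changelog only after fsync). */
        if (fsync(fd) != 0)
            written = -1;
    }

    close(fd);
    return (written == (ssize_t)len) ? 0 : -1;
}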
- afr_pid_restore (frame);
+void
+afr_changelog_post_op(call_frame_t *frame, xlator_t *this)
+{
+ struct timespec delta = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t post_op = _gf_true;
+ struct list_head shared;
+
+ priv = this->private;
+ delta.tv_sec = priv->post_op_delay_secs;
+ delta.tv_nsec = 0;
+
+ INIT_LIST_HEAD(&shared);
+ if (!local->transaction.eager_lock_on)
+ goto out;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ list_del_init(&local->transaction.owner_list);
+ list_add(&local->transaction.owner_list, &lock->post_op);
+ __afr_transaction_wake_shared(local, &shared);
+
+ if (!afr_is_delayed_changelog_post_op_needed(frame, this,
+ delta.tv_sec)) {
+ if (list_empty(&lock->owners))
+ lock->release = _gf_true;
+ goto unlock;
+ }
- local->transaction.fop (frame, this);
+ GF_ASSERT(lock->delay_timer == NULL);
+ lock->delay_timer = gf_timer_call_after(
+ this->ctx, delta, afr_delayed_changelog_wake_up_cbk, local);
+ if (!lock->delay_timer) {
+ lock->release = _gf_true;
+ } else {
+ post_op = _gf_false;
}
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
- return 0;
-}
+ if (!list_empty(&shared)) {
+ afr_lock_resume_shared(&shared);
+ }
+out:
+ if (post_op) {
+ if (!local->transaction.eager_lock_on || lock->release) {
+ afr_changelog_post_op_safe(frame, this);
+ } else {
+ afr_changelog_post_op_now(frame, this);
+ }
+ }
+}
int
-afr_transaction_resume (call_frame_t *frame, xlator_t *this)
+afr_transaction_resume(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
+ local = frame->local;
- if (__changelog_needed_post_op (frame, this)) {
- afr_changelog_post_op (frame, this);
- } else {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- }
- }
+ afr_restore_lk_owner(frame);
- return 0;
-}
+ afr_handle_symmetric_errors(frame, this);
+
+ if (!local->pre_op_compat)
+ /* new mode, pre-op was done along
+ with OP */
+ afr_changelog_pre_op_update(frame, this);
+ afr_changelog_post_op(frame, this);
+
+ return 0;
+}
/**
* afr_transaction_fop_failed - inform that an fop failed
*/
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index)
+afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this, int child_index)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+
+ local->transaction.failed_subvols[child_index] = 1;
+}
+
+static gf_boolean_t
+__need_previous_lock_unlocked(afr_local_t *local)
+{
+ afr_lock_t *lock = NULL;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (!lock->acquired)
+ return _gf_false;
+ if (lock->acquired && lock->event_generation != local->event_generation)
+ return _gf_true;
+ return _gf_false;
+}
- __mark_child_dead (local->pending, priv->child_count,
- child_index, local->transaction.type);
+void
+__afr_eager_lock_handle(afr_local_t *local, gf_boolean_t *take_lock,
+ gf_boolean_t *do_pre_op, afr_local_t **timer_local)
+{
+ afr_lock_t *lock = NULL;
+ afr_local_t *owner_local = NULL;
+ xlator_t *this = local->transaction.frame->this;
+
+ local->transaction.eager_lock_on = _gf_true;
+ afr_set_lk_owner(local->transaction.frame, this, local->inode);
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (__need_previous_lock_unlocked(local)) {
+ if (!list_empty(&lock->owners)) {
+ lock->release = _gf_true;
+ } else if (lock->delay_timer) {
+ lock->release = _gf_true;
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+                /* It will be put in the frozen list
+                 * in the code flow below. */
+ } else {
+ *timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ lock->delay_timer = NULL;
+ }
+ }
+ }
+
+ if (lock->release) {
+ list_add_tail(&local->transaction.wait_list, &lock->frozen);
+ *take_lock = _gf_false;
+ goto out;
+ }
+
+ if (lock->delay_timer) {
+ *take_lock = _gf_false;
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+ list_add_tail(&local->transaction.wait_list, &lock->frozen);
+ } else {
+ *timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars(&local->internal_lock,
+ &(*timer_local)->internal_lock, this, 0);
+ lock->delay_timer = NULL;
+ *do_pre_op = _gf_true;
+ list_add_tail(&local->transaction.owner_list, &lock->owners);
+ }
+ goto out;
+ }
+
+ if (!list_empty(&lock->owners)) {
+ if (!lock->acquired || afr_has_lock_conflict(local, _gf_true)) {
+ list_add_tail(&local->transaction.wait_list, &lock->waiting);
+ *take_lock = _gf_false;
+ goto out;
+ }
+ owner_local = list_entry(lock->owners.next, afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars(&local->internal_lock,
+ &owner_local->internal_lock, this, 0);
+ *take_lock = _gf_false;
+ *do_pre_op = _gf_true;
+ }
+
+ if (lock->acquired)
+ GF_ASSERT(!(*take_lock));
+ list_add_tail(&local->transaction.owner_list, &lock->owners);
+out:
+ return;
}
+void
+afr_transaction_start(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ gf_boolean_t take_lock = _gf_true;
+ gf_boolean_t do_pre_op = _gf_false;
+ afr_local_t *timer_local = NULL;
+
+ priv = this->private;
+
+ if (local->transaction.type != AFR_DATA_TRANSACTION &&
+ local->transaction.type != AFR_METADATA_TRANSACTION)
+ goto lock_phase;
+
+ if (!priv->eager_lock)
+ goto lock_phase;
+
+ LOCK(&local->inode->lock);
+ {
+ __afr_eager_lock_handle(local, &take_lock, &do_pre_op, &timer_local);
+ }
+ UNLOCK(&local->inode->lock);
+lock_phase:
+ if (!local->transaction.eager_lock_on) {
+ afr_set_lk_owner(local->transaction.frame, this,
+ local->transaction.frame->root);
+ }
+
+ if (take_lock) {
+ afr_lock(local->transaction.frame, this);
+ } else if (do_pre_op) {
+ afr_changelog_pre_op(local->transaction.frame, this);
+ }
+    /* Always call delayed_changelog_wake_up_cbk after calling the pre-op
+     * above so that any inheritance can happen. */
+ if (timer_local)
+ afr_delayed_changelog_wake_up_cbk(timer_local);
+}
int
-afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+afr_write_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ afr_local_t *local = frame->local;
+
+ if (err) {
+ AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
+ goto fail;
+ }
+
+ afr_transaction_start(local, this);
+ return 0;
+fail:
+ local->transaction.unwind(frame, this);
+ AFR_STACK_DESTROY(frame);
+ return 0;
+}
- local = frame->local;
- priv = this->private;
+int
+afr_transaction_lockee_init(call_frame_t *frame)
+{
+ afr_local_t *local = frame->local;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_private_t *priv = frame->this->private;
+ int ret = 0;
- afr_transaction_local_init (local, priv);
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ ret = afr_add_inode_lockee(local, priv->child_count);
+ break;
- local->transaction.resume = afr_transaction_resume;
- local->transaction.type = type;
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ ret = afr_add_entry_lockee(local, &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
+ if (local->op == GF_FOP_RENAME) {
+ ret = afr_add_entry_lockee(
+ local, &local->transaction.new_parent_loc,
+ local->transaction.new_basename, priv->child_count);
+ if (ret) {
+ goto out;
+ }
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- afr_internal_lock_finish (frame, this);
- } else {
- afr_lock (frame, this);
- }
+ if (local->newloc.inode &&
+ IA_ISDIR(local->newloc.inode->ia_type)) {
+ ret = afr_add_entry_lockee(local, &local->newloc, NULL,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
+ }
+ } else if (local->op == GF_FOP_RMDIR) {
+ ret = afr_add_entry_lockee(local, &local->loc, NULL,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
+ }
+
+ if (int_lock->lockee_count > 1) {
+ qsort(int_lock->lockee, int_lock->lockee_count,
+ sizeof(*int_lock->lockee), afr_entry_lockee_cmp);
+ }
+ break;
+ }
+out:
+ return ret;
+}
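
A minimal sketch of why the lockees are qsort()ed above: if every client acquires its entry locks in one canonical order, two transactions that need the same pair of locks can never wait on each other in opposite orders, so they cannot deadlock. The lockee struct, lockee_cmp and the gfid strings below are simplified stand-ins for afr_lockee_t and afr_entry_lockee_cmp.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct lockee {
    const char *parent_gfid; /* stand-in for the real lock key */
    const char *basename;
};

static int
lockee_cmp(const void *a, const void *b)
{
    const struct lockee *l1 = a, *l2 = b;
    int ret = strcmp(l1->parent_gfid, l2->parent_gfid);

    if (ret)
        return ret;
    return strcmp(l1->basename, l2->basename);
}

int
main(void)
{
    /* A rename needs entry locks on both the old and the new parent dir. */
    struct lockee lockees[] = {
        {"dddddddd-dstdir", "newname"},
        {"aaaaaaaa-srcdir", "oldname"},
    };
    size_t i;

    qsort(lockees, 2, sizeof(lockees[0]), lockee_cmp);

    /* Every client now acquires the locks in the same canonical order. */
    for (i = 0; i < 2; i++)
        printf("lock %s/%s\n", lockees[i].parent_gfid, lockees[i].basename);
    return 0;
}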
- return 0;
+int
+afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ int event_generation = 0;
+
+ local = frame->local;
+ priv = this->private;
+ local->transaction.frame = frame;
+
+ local->transaction.type = type;
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ ret = -afr_quorum_errno(priv);
+ goto out;
+ }
+
+ if (!afr_is_consistent_io_possible(local, priv, &ret)) {
+ ret = -ret; /*op_errno to ret conversion*/
+ goto out;
+ }
+
+ if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+ ret = -afr_quorum_errno(priv);
+ goto out;
+ }
+
+ ret = afr_transaction_local_init(local, this);
+ if (ret < 0)
+ goto out;
+
+ ret = afr_transaction_lockee_init(frame);
+ if (ret)
+ goto out;
+
+ if (type != AFR_METADATA_TRANSACTION) {
+ goto txn_start;
+ }
+
+ ret = afr_inode_get_readable(frame, local->inode, this, local->readable,
+ &event_generation, type);
+ if (ret < 0 ||
+ afr_is_inode_refresh_reqd(local->inode, this, priv->event_generation,
+ event_generation)) {
+ afr_inode_refresh(frame, this, local->inode, local->loc.gfid,
+ afr_write_txn_refresh_done);
+ ret = 0;
+ goto out;
+ }
+
+txn_start:
+ ret = 0;
+ afr_transaction_start(local, this);
+out:
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index 4b4428cc59e..beefa26f4a6 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -1,33 +1,75 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __TRANSACTION_H__
#define __TRANSACTION_H__
+#include "afr.h"
+
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
- int child_index);
+afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this,
+ int child_index);
+
+int32_t
+afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type);
int
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
+afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int32_t **pending);
-int32_t
-afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
+void
+afr_delayed_changelog_wake_up(xlator_t *this, fd_t *fd);
+
+void
+__mark_all_success(call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this);
+
+int
+afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type);
+
+int
+afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol);
+
+void
+afr_pending_read_increment(afr_private_t *priv, int child_index);
+void
+afr_pending_read_decrement(afr_private_t *priv, int child_index);
+
+call_frame_t *
+afr_transaction_detach_fop_frame(call_frame_t *frame);
+gf_boolean_t
+afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame);
+gf_boolean_t
+afr_needs_changelog_update(afr_local_t *local);
+void
+afr_zero_fill_stat(afr_local_t *local);
+
+void
+afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1,
+ unsigned char *readable1, inode_t *inode2,
+ unsigned char *readable2);
+int
+afr_transaction_resume(call_frame_t *frame, xlator_t *this);
+
+int
+afr_lock(call_frame_t *frame, xlator_t *this);
+
+void
+afr_delayed_changelog_wake_up_cbk(void *data);
+
+int
+afr_release_notify_lock_for_ta(void *opaque);
+
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque);
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index a662037e788..df7366f0a65 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#include <libgen.h>
@@ -24,863 +15,1330 @@
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
#include "afr-common.c"
+#include "afr-messages.h"
struct volume_options options[];
+static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = {
+ [AFR_FAV_CHILD_NONE] = "none",
+ [AFR_FAV_CHILD_BY_SIZE] = "size",
+ [AFR_FAV_CHILD_BY_CTIME] = "ctime",
+ [AFR_FAV_CHILD_BY_MTIME] = "mtime",
+ [AFR_FAV_CHILD_BY_MAJORITY] = "majority",
+ [AFR_FAV_CHILD_POLICY_MAX] = NULL,
+};
+
int32_t
-notify (xlator_t *this, int32_t event,
- void *data, ...)
+notify(xlator_t *this, int32_t event, void *data, ...)
{
- int ret = -1;
+ int ret = -1;
+ va_list ap;
+ void *data2 = NULL;
- ret = afr_notify (this, event, data);
+ va_start(ap, data);
+ data2 = va_arg(ap, dict_t *);
+ va_end(ap);
+ ret = afr_notify(this, event, data, data2);
- return ret;
+ return ret;
}
int32_t
-mem_acct_init (xlator_t *this)
+mem_acct_init(xlator_t *this)
{
- int ret = -1;
-
- if (!this)
- return ret;
+ int ret = -1;
- ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1);
+ if (!this)
+ return ret;
- if (ret != 0) {
- gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
+ ret = xlator_mem_acct_init(this, gf_afr_mt_end + 1);
+ if (ret != 0) {
return ret;
+ }
+
+ return ret;
}
int
-validate_options (xlator_t *this, char **op_errstr)
+xlator_subvolume_index(xlator_t *this, xlator_t *subvol)
{
- int ret = 0;
- volume_opt_list_t *vol_opt = NULL;
- volume_opt_list_t *tmp;
+ int index = -1;
+ int i = 0;
+ xlator_list_t *list = NULL;
- if (!this) {
- gf_log (this->name, GF_LOG_DEBUG, "'this' not a valid ptr");
- ret =-1;
- goto out;
- }
+ list = this->children;
- if (list_empty (&this->volume_options))
- goto out;
-
- vol_opt = list_entry (this->volume_options.next,
- volume_opt_list_t, list);
- list_for_each_entry_safe (vol_opt, tmp, &this->volume_options, list) {
- ret = validate_xlator_volume_options_attacherr (this,
- vol_opt->given_opt,
- op_errstr);
+ while (list) {
+ if (subvol == list->xlator ||
+ strcmp(subvol->name, list->xlator->name) == 0) {
+ index = i;
+ break;
}
+ list = list->next;
+ i++;
+ }
-out:
-
- return ret;
+ return index;
}
+static void
+fix_quorum_options(xlator_t *this, afr_private_t *priv, char *qtype,
+ dict_t *options)
+{
+ if (dict_get_sizen(options, "quorum-type") == NULL) {
+        /* If the user doesn't configure anything, enable auto-quorum if
+         * the replica has more than two subvolumes. */
+ if (priv->child_count > 2)
+ qtype = "auto";
+ }
+
+ if (priv->quorum_count && strcmp(qtype, "fixed")) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_OVERRIDE,
+ "quorum-type %s overriding quorum-count %u", qtype,
+ priv->quorum_count);
+ }
+
+ if (!strcmp(qtype, "none")) {
+ priv->quorum_count = 0;
+ } else if (!strcmp(qtype, "auto")) {
+ priv->quorum_count = AFR_QUORUM_AUTO;
+ }
+}
int
-reconfigure (xlator_t *this, dict_t *options)
+afr_set_favorite_child_policy(afr_private_t *priv, char *policy)
{
+ int index = -1;
- gf_boolean_t metadata_self_heal; /* on/off */
- gf_boolean_t entry_self_heal;
- gf_boolean_t data_self_heal;
- gf_boolean_t data_change_log; /* on/off */
- gf_boolean_t metadata_change_log; /* on/off */
- gf_boolean_t entry_change_log; /* on/off */
- gf_boolean_t strict_readdir;
-
- afr_private_t * priv = NULL;
- xlator_list_t * trav = NULL;
-
- char * read_subvol = NULL;
- char * self_heal = NULL;
- char * change_log = NULL;
- char * str_readdir = NULL;
- char * self_heal_algo = NULL;
-
- int32_t background_count = 0;
- int32_t window_size = 0;
-
- int read_ret = -1;
- int dict_ret = -1;
- int flag = 1;
- int ret = 0;
- int temp_ret = -1;
-
- priv = this->private;
-
- dict_ret = dict_get_int32 (options, "background-self-heal-count",
- &background_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring background self-heal count to %d",
- background_count);
-
- priv->background_self_heal_count = background_count;
- }
-
- dict_ret = dict_get_str (options, "metadata-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (self_heal, &metadata_self_heal);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Reconfiguration Invalid 'option metadata"
- "-self-heal %s'. Defaulting to old value.",
- self_heal);
- ret = -1;
- goto out;
- }
-
- priv->metadata_self_heal = metadata_self_heal;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option metadata"
- "-self-heal %s'.",
- self_heal);
- }
+ index = gf_get_index_by_elem(afr_favorite_child_policies, policy);
+ if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX)
+ return -1;
- dict_ret = dict_get_str (options, "data-self-heal", &self_heal);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (self_heal, &data_self_heal);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Reconfiguration Invalid 'option data"
- "-self-heal %s'. Defaulting to old value.",
- self_heal);
- ret = -1;
- goto out;
- }
-
- priv->data_self_heal = data_self_heal;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option data"
- "-self-heal %s'.", self_heal);
- }
+ priv->fav_child_policy = index;
- dict_ret = dict_get_str (options, "entry-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (self_heal, &entry_self_heal);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Reconfiguration Invalid 'option data"
- "-self-heal %s'. Defaulting to old value.",
- self_heal);
- ret = -1;
- goto out;
- }
-
- priv->entry_self_heal = entry_self_heal;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option entry"
- "-self-heal %s'.", self_heal);
- }
-
-
- dict_ret = dict_get_str (options, "strict-readdir",
- &str_readdir);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (str_readdir, &strict_readdir);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option strict-readdir %s'. "
- "Defaulting to old value.",
- str_readdir);
- ret = -1;
- goto out;
- }
-
- priv->strict_readdir = strict_readdir;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option strict"
- "-readdir %s'.", str_readdir);
- }
-
- dict_ret = dict_get_int32 (options, "data-self-heal-window-size",
- &window_size);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring, Setting data self-heal window size to %d",
- window_size);
-
- priv->data_self_heal_window_size = window_size;
- }
- else {
- priv->data_self_heal_window_size = 16;
- }
+ return 0;
+}
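
afr_set_favorite_child_policy() above maps the option string onto its position in the NULL-terminated policy table, which matches the enum ordering. A hedged sketch of that lookup; find_index is a local stand-in for gf_get_index_by_elem:

    #include <string.h>

    static const char *policies[] = {"none",  "size",     "ctime",
                                     "mtime", "majority", NULL};

    /* Return the position of 'value' in a NULL-terminated table, or -1. */
    static int
    find_index(const char **table, const char *value)
    {
        int i;

        for (i = 0; table[i]; i++) {
            if (strcmp(table[i], value) == 0)
                return i;
        }
        return -1;
    }
    /* e.g. find_index(policies, "mtime") == 3, the slot AFR uses for the
     * mtime-based favorite-child policy. */
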
+static void
+set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
+{
+ if (!algo) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
+ } else if (strcmp(algo, "full") == 0) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_FULL;
+ } else if (strcmp(algo, "diff") == 0) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DIFF;
+ } else {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
+ }
+}
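
set_data_self_heal_algorithm() treats both an unset option and an unrecognized string as the dynamic default. A tiny equivalent in plain C; the enum values mirror, but are not, the AFR_SELFHEAL_DATA_* constants:

    #include <string.h>

    enum heal_algo { HEAL_DYNAMIC, HEAL_FULL, HEAL_DIFF };

    static enum heal_algo
    pick_heal_algo(const char *algo)
    {
        if (algo && strcmp(algo, "full") == 0)
            return HEAL_FULL;
        if (algo && strcmp(algo, "diff") == 0)
            return HEAL_DIFF;
        return HEAL_DYNAMIC; /* NULL or anything else falls back to dynamic */
    }
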
- dict_ret = dict_get_str (options, "data-change-log", &change_log);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (change_log, &data_change_log);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Reconfiguration Invalid 'option data-"
- "change-log %s'. Defaulting to old value.",
- change_log);
- ret = -1;
- goto out;
- }
-
- priv->data_change_log = data_change_log;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option data-"
- "change-log %s'.", change_log);
- }
+void
+afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options)
+{
+ char *volfile_id_str = NULL;
+ uuid_t anon_inode_gfid = {0};
+
+ /*If volume id is not present don't enable anything*/
+ if (dict_get_str(options, "volume-id", &volfile_id_str))
+ return;
+ GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX);
+ /*anon_inode_name is not supposed to change once assigned*/
+ if (!priv->anon_inode_name[0]) {
+ snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s",
+ AFR_ANON_DIR_PREFIX, volfile_id_str);
+ gf_uuid_parse(volfile_id_str, anon_inode_gfid);
+ /*Flip a bit to make sure volfile-id and anon-gfid are not same*/
+ anon_inode_gfid[0] ^= 1;
+ uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str);
+ }
+}
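
afr_handle_anon_inode_options() derives the anonymous-inode directory name from the volume id and flips one bit of the parsed UUID so the anonymous gfid can never equal the volume id. A simplified sketch of that derivation; the prefix string and the raw 16-byte uuid buffer are illustrative, and the real code goes through gf_uuid_parse/uuid_utoa_r:

    #include <stdio.h>

    #define ANON_PREFIX ".glusterfs-anonymous-inode" /* assumed prefix value */

    static void
    derive_anon_identity(const char *volume_id, char *name, size_t name_len,
                         unsigned char uuid[16])
    {
        /* Directory name: "<prefix>-<volume-id>", assigned only once. */
        snprintf(name, name_len, "%s-%s", ANON_PREFIX, volume_id);

        /* Flip the lowest bit of the first byte so the derived gfid stays
         * deterministic but differs from the volume id itself. */
        uuid[0] ^= 1;
    }
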
- dict_ret = dict_get_str (options, "metadata-change-log",
- &change_log);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (change_log,
- &metadata_change_log);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option metadata-change-log %s'. "
- "Defaulting to metadata-change-log as 'off'.",
- change_log);
- ret = -1;
- goto out;
- }
-
- priv->metadata_change_log = metadata_change_log;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option metadata-"
- "change-log %s'.", change_log);
- }
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *read_subvol = NULL;
+ int read_subvol_index = -1;
+ int timeout_old = 0;
+ int ret = -1;
+ int index = -1;
+ char *qtype = NULL;
+ char *fav_child_policy = NULL;
+ char *data_self_heal = NULL;
+ char *data_self_heal_algorithm = NULL;
+ char *locking_scheme = NULL;
+ gf_boolean_t consistent_io = _gf_false;
+ gf_boolean_t choose_local_old = _gf_false;
+ gf_boolean_t enabled_old = _gf_false;
- dict_ret = dict_get_str (options, "entry-change-log",
- &change_log);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (change_log, &entry_change_log);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option entry-change-log %s'. "
- "Defaulting to entry-change-log as 'on'.",
- change_log);
- ret = -1;
- goto out;
- }
-
- priv->entry_change_log = entry_change_log;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option entry-"
- "change-log %s'.", change_log);
- }
+ priv = this->private;
- dict_ret = dict_get_str (options, "data-self-heal-algorithm",
- &self_heal_algo);
- if (dict_ret == 0) {
- /* Handling both strcmp cases - s1 > s2 and s1 < s2 */
-
- if (!strcmp (self_heal_algo, "full")) {
- priv->data_self_heal_algorithm = self_heal_algo;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option data-self"
- "heal-algorithm %s'.", self_heal_algo);
- goto next;
- }
-
- if (!strcmp (self_heal_algo, "diff")) {
- priv->data_self_heal_algorithm = self_heal_algo;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option data-self"
- "heal-algorithm %s'.", self_heal_algo);
- goto next;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid self-heal algorithm %s,"
- "defaulting back to old value",
- self_heal_algo);
- ret = -1;
- goto out;
- }
+ GF_OPTION_RECONF("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, options, bool, out);
- read_ret = dict_get_str (options, "read-subvolume", &read_subvol);
+ GF_OPTION_RECONF("background-self-heal-count",
+ priv->background_self_heal_count, options, uint32, out);
- if (read_ret < 0)
- goto next;// No need to traverse, hence set the next option
+ GF_OPTION_RECONF("heal-wait-queue-length", priv->heal_wait_qlen, options,
+ uint32, out);
- trav = this->children;
- flag = 0;
- while (trav) {
- if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Subvolume '%s' specified as read child.",
- trav->xlator->name);
+ GF_OPTION_RECONF("metadata-self-heal", priv->metadata_self_heal, options,
+ bool, out);
- flag = 1;
- break;
- }
+ GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
- trav = trav->next;
- }
+ GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool,
+ out);
- if (flag == 0 ) {
- gf_log (this->name, GF_LOG_ERROR,
- "Invalid 'option read-subvolume %s', no such subvolume"
- , read_subvol);
- ret = -1;
- goto out;
- }
+ GF_OPTION_RECONF("data-self-heal-window-size",
+ priv->data_self_heal_window_size, options, uint32, out);
-next:
-out:
- return ret;
+ GF_OPTION_RECONF("data-self-heal-algorithm", data_self_heal_algorithm,
+ options, str, out);
+ set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
-}
+ GF_OPTION_RECONF("halo-enabled", priv->halo_enabled, options, bool, out);
+ GF_OPTION_RECONF("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+ options, uint32, out);
-static const char *favorite_child_warning_str = "You have specified subvolume '%s' "
- "as the 'favorite child'. This means that if a discrepancy in the content "
- "or attributes (ownership, permission, etc.) of a file is detected among "
- "the subvolumes, the file on '%s' will be considered the definitive "
- "version and its contents will OVERWRITE the contents of the file on other "
- "subvolumes. All versions of the file except that on '%s' "
- "WILL BE LOST.";
+ GF_OPTION_RECONF("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
+ options, uint32, out);
-static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. "
- "This means correctness is NO LONGER GUARANTEED in all cases. If two or more "
- "applications write to the same region of a file, there is a possibility that "
- "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you "
- "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS "
- "RESPONSIBLE for inconsistent data. If you are in doubt, set it to a value "
- "greater than 0.";
+ GF_OPTION_RECONF("halo-max-latency", priv->halo_max_latency_msec, options,
+ uint32, out);
-int32_t
-init (xlator_t *this)
-{
- afr_private_t * priv = NULL;
- int child_count = 0;
- xlator_list_t * trav = NULL;
- int i = 0;
- int ret = -1;
- int op_errno = 0;
- char * read_subvol = NULL;
- char * fav_child = NULL;
- char * self_heal = NULL;
- char * algo = NULL;
- char * change_log = NULL;
- char * strict_readdir = NULL;
- char * inodelk_trace = NULL;
- char * entrylk_trace = NULL;
- char * def_val = NULL;
- int32_t background_count = 0;
- int32_t lock_server_count = 1;
- int32_t window_size = 0;
- int fav_ret = -1;
- int read_ret = -1;
- int dict_ret = -1;
-
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "replicate translator needs more than one "
- "subvolume defined.");
- return -1;
- }
+ GF_OPTION_RECONF("halo-max-replicas", priv->halo_max_replicas, options,
+ uint32, out);
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "Volume is dangling.");
- }
+ GF_OPTION_RECONF("halo-min-replicas", priv->halo_min_replicas, options,
+ uint32, out);
- ALLOC_OR_GOTO (this->private, afr_private_t, out);
+ GF_OPTION_RECONF("read-subvolume", read_subvol, options, xlator, out);
- priv = this->private;
+ choose_local_old = priv->choose_local;
+ GF_OPTION_RECONF("choose-local", priv->choose_local, options, bool, out);
- read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol);
+ if (choose_local_old != priv->choose_local) {
priv->read_child = -1;
-
- fav_ret = dict_get_str (this->options, "favorite-child", &fav_child);
- priv->favorite_child = -1;
-
- priv->background_self_heal_count = 16;
-
- dict_ret = dict_get_int32 (this->options, "background-self-heal-count",
- &background_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Setting background self-heal count to %d",
- background_count);
-
- priv->background_self_heal_count = background_count;
+ if (choose_local_old == _gf_false)
+ priv->did_discovery = _gf_false;
+ }
+
+ GF_OPTION_RECONF("read-hash-mode", priv->hash_mode, options, uint32, out);
+
+ if (read_subvol) {
+ index = xlator_subvolume_index(this, read_subvol);
+ if (index == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%s not a subvolume", read_subvol->name);
+ goto out;
}
-
- /* Default values */
-
- priv->data_self_heal = 1;
- priv->metadata_self_heal = 1;
- priv->entry_self_heal = 1;
-
- dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal);
- if (dict_ret == 0) {
- ret = gf_string2boolean (self_heal, &priv->data_self_heal);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option data-self-heal %s'. "
- "Defaulting to data-self-heal as 'on'",
- self_heal);
- priv->data_self_heal = 1;
- }
+ priv->read_child = index;
+ }
+
+ GF_OPTION_RECONF("read-subvolume-index", read_subvol_index, options, int32,
+ out);
+
+ if (read_subvol_index > -1) {
+ index = read_subvol_index;
+ if (index >= priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%d not a subvolume-index", index);
+ goto out;
}
+ priv->read_child = index;
+ }
+
+ GF_OPTION_RECONF("pre-op-compat", priv->pre_op_compat, options, bool, out);
+ GF_OPTION_RECONF("locking-scheme", locking_scheme, options, str, out);
+ priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
+ GF_OPTION_RECONF("full-lock", priv->full_lock, options, bool, out);
+ GF_OPTION_RECONF("granular-entry-heal", priv->esh_granular, options, bool,
+ out);
+
+ GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out);
+ GF_OPTION_RECONF("optimistic-change-log", priv->optimistic_change_log,
+ options, bool, out);
+ GF_OPTION_RECONF("quorum-type", qtype, options, str, out);
+ GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out);
+ fix_quorum_options(this, priv, qtype, options);
+ if (priv->quorum_count && !afr_has_quorum(priv->child_up, this, NULL))
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
+ "Client-quorum is not met");
+
+ GF_OPTION_RECONF("post-op-delay-secs", priv->post_op_delay_secs, options,
+ uint32, out);
+
+ GF_OPTION_RECONF(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options,
+ size_uint64, out);
+ /* Reset this so we re-discover in case the topology changed. */
+ GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options,
+ bool, out);
+
+ enabled_old = priv->shd.enabled;
+ GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out);
+
+ GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool,
+ out);
+
+ timeout_old = priv->shd.timeout;
+ GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out);
+
+ GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options,
+ bool, out);
+
+ GF_OPTION_RECONF("shd-max-threads", priv->shd.max_threads, options, uint32,
+ out);
+
+ GF_OPTION_RECONF("shd-wait-qlength", priv->shd.wait_qlength, options,
+ uint32, out);
+
+ GF_OPTION_RECONF("favorite-child-policy", fav_child_policy, options, str,
+ out);
+ if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
+ goto out;
+
+ priv->did_discovery = _gf_false;
+
+ GF_OPTION_RECONF("consistent-io", consistent_io, options, bool, out);
+ if (priv->quorum_count != 0)
+ consistent_io = _gf_false;
+ priv->consistent_io = consistent_io;
+
+ afr_handle_anon_inode_options(priv, options);
+
+ GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool,
+ out);
+ if (priv->shd.enabled) {
+ if ((priv->shd.enabled != enabled_old) ||
+ (timeout_old != priv->shd.timeout))
+ afr_selfheal_childup(this, priv);
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
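
One easily missed interaction in the new reconfigure(): the freshly read consistent-io value is thrown away whenever any client quorum is configured. A one-line helper capturing that rule; the names are illustrative only:

    /* consistent-io only survives when no client quorum is configured. */
    static int
    effective_consistent_io(int requested_consistent_io, unsigned int quorum_count)
    {
        return quorum_count ? 0 : requested_consistent_io;
    }
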
- if (xlator_get_volopt_info (&this->volume_options,
- "data-self-heal-algorithm", &def_val, NULL)) {
- gf_log (this->name, GF_LOG_ERROR, "Default value of "
- " data-self-heal-algorithm not found");
- ret = -1;
- goto out;
- } else {
- priv->data_self_heal_algorithm = def_val;
- }
- dict_ret = dict_get_str (this->options, "data-self-heal-algorithm",
- &algo);
- if (dict_ret == 0) {
- priv->data_self_heal_algorithm = gf_strdup (algo);
- }
+static int
+afr_pending_xattrs_init(afr_private_t *priv, xlator_t *this)
+{
+ int ret = -1;
+ int i = 0;
+ char *ptr = NULL;
+ char *ptr1 = NULL;
+ char *xattrs_list = NULL;
+ xlator_list_t *trav = NULL;
+ int child_count = -1;
+
+ trav = this->children;
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ /* priv->pending_key[THIN_ARBITER_BRICK_INDEX] is used as the
+ * name of the thin arbiter file for persistence across add/
+ * removal of DHT subvols.*/
+ child_count++;
+ }
+
+ GF_OPTION_INIT("afr-pending-xattr", xattrs_list, str, out);
+ priv->pending_key = GF_CALLOC(sizeof(*priv->pending_key), child_count,
+ gf_afr_mt_char);
+ if (!priv->pending_key) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (!xattrs_list) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_NO_CHANGELOG,
+ "Unable to fetch afr-pending-xattr option from volfile."
+ " Falling back to using client translator names. ");
- if (xlator_get_volopt_info (&this->volume_options,
- "data-self-heal-window-size",&def_val,
- NULL)) {
- gf_log (this->name, GF_LOG_ERROR, "Default value of "
- "data-self-heal-window-size not found");
- ret = -1;
+ while (i < child_count) {
+ ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
+ trav->xlator->name);
+ if (ret == -1) {
+ ret = -ENOMEM;
goto out;
- } else {
- if (gf_string2int32 (def_val,
- (int *)&priv->data_self_heal_window_size)) {
- gf_log (this->name, GF_LOG_ERROR, "Default value of "
- "data-self-heal-window-size corrupt");
- ret = -1;
- goto out;
- }
+ }
+ trav = trav->next;
+ i++;
+ }
+ ret = 0;
+ goto out;
+ }
+
+ ptr = ptr1 = gf_strdup(xattrs_list);
+ if (!ptr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0, ptr = strtok(ptr, ","); ptr; ptr = strtok(NULL, ",")) {
+ ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
+ ptr);
+ if (ret == -1) {
+ ret = -ENOMEM;
+ goto out;
}
+ i++;
+ }
+ ret = 0;
- dict_ret = dict_get_int32 (this->options, "data-self-heal-window-size",
- &window_size);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Setting data self-heal window size to %d",
- window_size);
+out:
+ GF_FREE(ptr1);
+ return ret;
+}
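
afr_pending_xattrs_init() either builds one "<prefix>.<client-name>" key per child or splits the comma-separated afr-pending-xattr list from the volfile. A standalone sketch of the splitting step using strtok_r/snprintf instead of the GlusterFS helpers; the "trusted.afr" prefix follows the AFR_XATTR_PREFIX convention but is hard-coded here as an assumption:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PREFIX "trusted.afr"

    /* Split "a,b,c" into keys[] = {"trusted.afr.a", ...}; returns the number
     * of keys built, or -1 on allocation failure. */
    static int
    build_pending_keys(const char *list, char **keys, int max_keys)
    {
        char *copy = strdup(list);
        char *save = NULL;
        char *tok = NULL;
        int n = 0;

        if (!copy)
            return -1;
        for (tok = strtok_r(copy, ",", &save); tok && n < max_keys;
             tok = strtok_r(NULL, ",", &save)) {
            size_t len = strlen(PREFIX) + 1 + strlen(tok) + 1;
            keys[n] = malloc(len);
            if (!keys[n])
                break;
            snprintf(keys[n], len, "%s.%s", PREFIX, tok);
            n++;
        }
        free(copy);
        return n;
    }
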
- priv->data_self_heal_window_size = window_size;
- }
+void
+afr_ta_init(afr_private_t *priv)
+{
+ priv->thin_arbiter_count = 1;
+ priv->child_count--;
+ priv->ta_child_up = 0;
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->ta_notify_dom_lock_offset = 0;
+ priv->ta_in_mem_txn_count = 0;
+ priv->ta_on_wire_txn_count = 0;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ INIT_LIST_HEAD(&priv->ta_waitq);
+ INIT_LIST_HEAD(&priv->ta_onwireq);
+ gf_uuid_clear(priv->ta_gfid);
+}
- dict_ret = dict_get_str (this->options, "metadata-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- ret = gf_string2boolean (self_heal, &priv->metadata_self_heal);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option metadata-self-heal %s'. "
- "Defaulting to metadata-self-heal as 'on'.",
- self_heal);
- priv->metadata_self_heal = 1;
- }
+int32_t
+init(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int child_count = 0;
+ xlator_list_t *trav = NULL;
+ int i = 0;
+ int ret = -1;
+ GF_UNUSED int op_errno = 0;
+ xlator_t *read_subvol = NULL;
+ int read_subvol_index = -1;
+ char *qtype = NULL;
+ char *fav_child_policy = NULL;
+ char *thin_arbiter = NULL;
+ char *data_self_heal = NULL;
+ char *locking_scheme = NULL;
+ char *data_self_heal_algorithm = NULL;
+
+ if (!this->children) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_CHILD_MISCONFIGURED,
+ "replicate translator needs more than one "
+ "subvolume defined.");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_VOL_MISCONFIGURED,
+ "Volume is dangling.");
+ }
+
+ this->private = GF_CALLOC(1, sizeof(afr_private_t),
+ gf_afr_mt_afr_private_t);
+ if (!this->private)
+ goto out;
+
+ priv = this->private;
+ INIT_LIST_HEAD(&priv->saved_locks);
+ INIT_LIST_HEAD(&priv->lk_healq);
+ LOCK_INIT(&priv->lock);
+
+ child_count = xlator_subvolume_count(this);
+
+ priv->child_count = child_count;
+
+ priv->read_child = -1;
+
+ GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);
+ GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);
+ if (thin_arbiter && strlen(thin_arbiter) > 0) {
+ afr_ta_init(priv);
+ }
+ INIT_LIST_HEAD(&priv->healing);
+ INIT_LIST_HEAD(&priv->heal_waiting);
+
+ priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;
+
+ GF_OPTION_INIT("afr-dirty-xattr", priv->afr_dirty, str, out);
+
+ GF_OPTION_INIT("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, bool, out);
+
+ GF_OPTION_INIT("read-subvolume", read_subvol, xlator, out);
+ if (read_subvol) {
+ priv->read_child = xlator_subvolume_index(this, read_subvol);
+ if (priv->read_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%s not a subvolume", read_subvol->name);
+ goto out;
}
-
- dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal);
- if (dict_ret == 0) {
- ret = gf_string2boolean (self_heal, &priv->entry_self_heal);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option entry-self-heal %s'. "
- "Defaulting to entry-self-heal as 'on'.",
- self_heal);
- priv->entry_self_heal = 1;
- }
+ }
+ GF_OPTION_INIT("read-subvolume-index", read_subvol_index, int32, out);
+ if (read_subvol_index > -1) {
+ if (read_subvol_index >= priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%d not a subvolume-index", read_subvol_index);
+ goto out;
}
+ priv->read_child = read_subvol_index;
+ }
+ GF_OPTION_INIT("choose-local", priv->choose_local, bool, out);
- /* Change log options */
-
- priv->data_change_log = 1;
- priv->metadata_change_log = 1;
- priv->entry_change_log = 1;
- priv->optimistic_change_log = 1;
-
- dict_ret = dict_get_str (this->options, "data-change-log",
- &change_log);
- if (dict_ret == 0) {
- ret = gf_string2boolean (change_log, &priv->data_change_log);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option data-change-log %s'. "
- "Defaulting to data-change-log as 'on'.",
- change_log);
- priv->data_change_log = 1;
- }
- }
+ priv->pending_reads = GF_CALLOC(sizeof(*priv->pending_reads),
+ priv->child_count, gf_afr_mt_atomic_t);
- dict_ret = dict_get_str (this->options, "metadata-change-log",
- &change_log);
- if (dict_ret == 0) {
- ret = gf_string2boolean (change_log,
- &priv->metadata_change_log);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option metadata-change-log %s'. "
- "Defaulting to metadata-change-log as 'off'.",
- change_log);
- priv->metadata_change_log = 0;
- }
- }
+ GF_OPTION_INIT("read-hash-mode", priv->hash_mode, uint32, out);
- dict_ret = dict_get_str (this->options, "entry-change-log",
- &change_log);
- if (dict_ret == 0) {
- ret = gf_string2boolean (change_log, &priv->entry_change_log);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option entry-change-log %s'. "
- "Defaulting to entry-change-log as 'on'.",
- change_log);
- priv->entry_change_log = 1;
- }
- }
+ priv->favorite_child = -1;
- dict_ret = dict_get_str (this->options, "optimistic-change-log",
- &change_log);
- if (dict_ret == 0) {
- ret = gf_string2boolean (change_log, &priv->optimistic_change_log);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option optimistic-change-log %s'. "
- "Defaulting to optimistic-change-log as 'on'.",
- change_log);
- priv->optimistic_change_log = 1;
- }
- }
+ GF_OPTION_INIT("favorite-child-policy", fav_child_policy, str, out);
+ if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
+ goto out;
- /* Locking options */
+ GF_OPTION_INIT("shd-max-threads", priv->shd.max_threads, uint32, out);
- priv->inodelk_trace = 0;
- priv->entrylk_trace = 0;
+ GF_OPTION_INIT("shd-wait-qlength", priv->shd.wait_qlength, uint32, out);
- dict_ret = dict_get_str (this->options, "inodelk-trace",
- &inodelk_trace);
- if (dict_ret == 0) {
- ret = gf_string2boolean (inodelk_trace, &priv->inodelk_trace);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option inodelk-trace %s' ",
- inodelk_trace);
+ GF_OPTION_INIT("background-self-heal-count",
+ priv->background_self_heal_count, uint32, out);
- priv->inodelk_trace = 0;
- }
- }
+ GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out);
+ GF_OPTION_INIT("data-self-heal", data_self_heal, str, out);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
- dict_ret = dict_get_str (this->options, "entrylk-trace",
- &entrylk_trace);
- if (dict_ret == 0) {
- ret = gf_string2boolean (entrylk_trace, &priv->entrylk_trace);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option entrylk-trace %s' ",
- inodelk_trace);
+ GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str,
+ out);
+ set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
- priv->entrylk_trace = 0;
- }
- }
+ GF_OPTION_INIT("data-self-heal-window-size",
+ priv->data_self_heal_window_size, uint32, out);
+ GF_OPTION_INIT("metadata-self-heal", priv->metadata_self_heal, bool, out);
- priv->data_lock_server_count = 1;
- priv->metadata_lock_server_count = 0;
- priv->entry_lock_server_count = 1;
+ GF_OPTION_INIT("entry-self-heal", priv->entry_self_heal, bool, out);
- dict_ret = dict_get_int32 (this->options, "data-lock-server-count",
- &lock_server_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Setting data lock server count to %d.",
- lock_server_count);
+ GF_OPTION_INIT("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+ uint32, out);
- if (lock_server_count == 0)
- gf_log (this->name, GF_LOG_WARNING, "%s",
- no_lock_servers_warning_str);
+ GF_OPTION_INIT("halo-max-latency", priv->halo_max_latency_msec, uint32,
+ out);
+ GF_OPTION_INIT("halo-max-replicas", priv->halo_max_replicas, uint32, out);
+ GF_OPTION_INIT("halo-min-replicas", priv->halo_min_replicas, uint32, out);
- priv->data_lock_server_count = lock_server_count;
- }
+ GF_OPTION_INIT("halo-enabled", priv->halo_enabled, bool, out);
+ GF_OPTION_INIT("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
+ uint32, out);
- dict_ret = dict_get_int32 (this->options,
- "metadata-lock-server-count",
- &lock_server_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Setting metadata lock server count to %d.",
- lock_server_count);
- priv->metadata_lock_server_count = lock_server_count;
- }
+ GF_OPTION_INIT("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out);
+ GF_OPTION_INIT("optimistic-change-log", priv->optimistic_change_log, bool,
+ out);
- dict_ret = dict_get_int32 (this->options, "entry-lock-server-count",
- &lock_server_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Setting entry lock server count to %d.",
- lock_server_count);
+ GF_OPTION_INIT("pre-op-compat", priv->pre_op_compat, bool, out);
+ GF_OPTION_INIT("locking-scheme", locking_scheme, str, out);
+ priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
+ GF_OPTION_INIT("full-lock", priv->full_lock, bool, out);
+ GF_OPTION_INIT("granular-entry-heal", priv->esh_granular, bool, out);
- priv->entry_lock_server_count = lock_server_count;
- }
+ GF_OPTION_INIT("eager-lock", priv->eager_lock, bool, out);
+ GF_OPTION_INIT("quorum-type", qtype, str, out);
+ GF_OPTION_INIT("quorum-count", priv->quorum_count, uint32, out);
+ GF_OPTION_INIT(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
+ out);
+ fix_quorum_options(this, priv, qtype, this->options);
- priv->strict_readdir = _gf_false;
-
- dict_ret = dict_get_str (this->options, "strict-readdir",
- &strict_readdir);
- if (dict_ret == 0) {
- ret = gf_string2boolean (strict_readdir, &priv->strict_readdir);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option strict-readdir %s'. "
- "Defaulting to strict-readdir as 'off'.",
- strict_readdir);
- }
- }
+ GF_OPTION_INIT("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
+ GF_OPTION_INIT("ensure-durability", priv->ensure_durability, bool, out);
- trav = this->children;
- while (trav) {
- if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Subvolume '%s' specified as read child.",
- trav->xlator->name);
-
- priv->read_child = child_count;
- }
-
- if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) {
- gf_log (this->name, GF_LOG_WARNING,
- favorite_child_warning_str, trav->xlator->name,
- trav->xlator->name, trav->xlator->name);
- priv->favorite_child = child_count;
- }
-
- child_count++;
- trav = trav->next;
- }
+ GF_OPTION_INIT("self-heal-daemon", priv->shd.enabled, bool, out);
- priv->wait_count = 1;
+ GF_OPTION_INIT("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
+ GF_OPTION_INIT("heal-timeout", priv->shd.timeout, int32, out);
- priv->child_count = child_count;
+ GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);
+ GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);
+ afr_handle_anon_inode_options(priv, this->options);
- LOCK_INIT (&priv->lock);
- LOCK_INIT (&priv->read_child_lock);
+ GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out);
+ if (priv->quorum_count != 0)
+ priv->consistent_io = _gf_false;
- priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
- gf_afr_mt_char);
- if (!priv->child_up) {
- ret = -ENOMEM;
- goto out;
- }
+ priv->wait_count = 1;
- for (i = 0; i < child_count; i++)
- priv->child_up[i] = -1; /* start with unknown state.
- this initialization needed
- for afr_notify() to work
- reliably
- */
+ priv->local = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char);
+ if (!priv->local) {
+ ret = -ENOMEM;
+ goto out;
+ }
- priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
- gf_afr_mt_xlator_t);
- if (!priv->children) {
- ret = -ENOMEM;
- goto out;
- }
+ priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
- priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key),
- child_count,
- gf_afr_mt_char);
- if (!priv->pending_key) {
- ret = -ENOMEM;
- goto out;
- }
+ priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
- trav = this->children;
- i = 0;
- while (i < child_count) {
- priv->children[i] = trav->xlator;
-
- ret = gf_asprintf (&priv->pending_key[i], "%s.%s",
- AFR_XATTR_PREFIX,
- trav->xlator->name);
- if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed to set pending key");
- ret = -ENOMEM;
- goto out;
- }
-
- trav = trav->next;
- i++;
- }
+ priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count,
+ gf_afr_mt_child_latency_t);
+ priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
- priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
- gf_afr_mt_int32_t);
- if (!priv->last_event) {
- ret = -ENOMEM;
- goto out;
+ if (!priv->child_up || !priv->child_latency || !priv->halo_child_up ||
+ !priv->anon_inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /*Initialize to -ve ping timeout so that they are not considered
+ * in child-up events until ping-event comes*/
+ for (i = 0; i < child_count; i++)
+ priv->child_latency[i] = -1;
+
+ priv->children = GF_CALLOC(sizeof(xlator_t *), child_count,
+ gf_afr_mt_xlator_t);
+ if (!priv->children) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = afr_pending_xattrs_init(priv, this);
+ if (ret)
+ goto out;
+
+ trav = this->children;
+ i = 0;
+ while (i < child_count) {
+ priv->children[i] = trav->xlator;
+ trav = trav->next;
+ i++;
+ }
+
+ ret = gf_asprintf(&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, this->name);
+ if (-1 == ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ priv->last_event = GF_CALLOC(child_count, sizeof(*priv->last_event),
+ gf_afr_mt_int32_t);
+ if (!priv->last_event) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this);
+ if (!this->itable) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (priv->shd.iamshd) {
+ ret = afr_selfheal_daemon_init(this);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
}
+ }
- LOCK_INIT (&priv->root_inode_lk);
- priv->first_lookup = 1;
- priv->root_inode = NULL;
+ /* keep more local here as we may need them for self-heal etc */
+ this->local_pool = mem_pool_new(afr_local_t, 512);
+ if (!this->local_pool) {
+ ret = -1;
+ goto out;
+ }
- pthread_mutex_init (&priv->mutex, NULL);
- INIT_LIST_HEAD (&priv->saved_fds);
+ priv->root_inode = NULL;
- ret = 0;
+ ret = 0;
out:
- return ret;
+ return ret;
+}
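
init() above allocates several per-child arrays and funnels every failure through the single out: label, translating allocation failures into -ENOMEM. A compressed sketch of that allocate-or-bail pattern, with the private structure reduced to two arrays for illustration:

    #include <errno.h>
    #include <stdlib.h>

    struct toy_priv {
        unsigned char *child_up;
        long *child_latency;
        int child_count;
    };

    static int
    toy_init(struct toy_priv *priv, int child_count)
    {
        int ret = -1;
        int i = 0;

        priv->child_count = child_count;
        priv->child_up = calloc(child_count, sizeof(*priv->child_up));
        priv->child_latency = malloc(child_count * sizeof(*priv->child_latency));
        if (!priv->child_up || !priv->child_latency) {
            ret = -ENOMEM;
            goto out;
        }
        /* Negative latency marks "no ping seen yet", as in the AFR code. */
        for (i = 0; i < child_count; i++)
            priv->child_latency[i] = -1;
        ret = 0;
    out:
        return ret;
    }
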
+void
+afr_destroy_healer_object(xlator_t *this, struct subvol_healer *healer)
+{
+ int ret = -1;
+
+ if (!healer)
+ return;
+
+ if (healer->running) {
+ /*
+ * If there are any resources to cleanup, We need
+ * to do that gracefully using pthread_cleanup_push
+ */
+ ret = gf_thread_cleanup_xint(healer->thread);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SELF_HEAL_FAILED,
+ "Failed to clean up healer threads.");
+ healer->thread = 0;
+ }
+ pthread_cond_destroy(&healer->cond);
+ pthread_mutex_destroy(&healer->mutex);
}
-
-int
-fini (xlator_t *this)
+void
+afr_selfheal_daemon_fini(xlator_t *this)
{
- return 0;
+ struct subvol_healer *healer = NULL;
+ afr_self_heald_t *shd = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+ if (!priv)
+ return;
+
+ shd = &priv->shd;
+ if (!shd->iamshd)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ afr_destroy_healer_object(this, healer);
+
+ healer = &shd->full_healers[i];
+ afr_destroy_healer_object(this, healer);
+
+ if (shd->statistics[i])
+ eh_destroy(shd->statistics[i]);
+ }
+ GF_FREE(shd->index_healers);
+ GF_FREE(shd->full_healers);
+ GF_FREE(shd->statistics);
+ if (shd->split_brain)
+ eh_destroy(shd->split_brain);
}
+void
+fini(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ afr_selfheal_daemon_fini(this);
+ GF_ASSERT(list_empty(&priv->saved_locks));
+
+ LOCK(&priv->lock);
+ if (priv->timer != NULL) {
+ gf_timer_call_cancel(this->ctx, priv->timer);
+ priv->timer = NULL;
+ }
+ UNLOCK(&priv->lock);
+
+ if (this->local_pool != NULL) {
+ mem_pool_destroy(this->local_pool);
+ this->local_pool = NULL;
+ }
+ this->private = NULL;
+ afr_priv_destroy(priv);
+ if (this->itable) {
+ inode_table_destroy(this->itable);
+ this->itable = NULL;
+ }
+
+ return;
+}
struct xlator_fops fops = {
- .lookup = afr_lookup,
- .open = afr_open,
- .lk = afr_lk,
- .flush = afr_flush,
- .statfs = afr_statfs,
- .fsync = afr_fsync,
- .fsyncdir = afr_fsyncdir,
- .xattrop = afr_xattrop,
- .fxattrop = afr_fxattrop,
- .inodelk = afr_inodelk,
- .finodelk = afr_finodelk,
- .entrylk = afr_entrylk,
- .fentrylk = afr_fentrylk,
-
- /* inode read */
- .access = afr_access,
- .stat = afr_stat,
- .fstat = afr_fstat,
- .readlink = afr_readlink,
- .getxattr = afr_getxattr,
- .readv = afr_readv,
-
- /* inode write */
- .writev = afr_writev,
- .truncate = afr_truncate,
- .ftruncate = afr_ftruncate,
- .setxattr = afr_setxattr,
- .setattr = afr_setattr,
- .fsetattr = afr_fsetattr,
- .removexattr = afr_removexattr,
-
- /* dir read */
- .opendir = afr_opendir,
- .readdir = afr_readdir,
- .readdirp = afr_readdirp,
-
- /* dir write */
- .create = afr_create,
- .mknod = afr_mknod,
- .mkdir = afr_mkdir,
- .unlink = afr_unlink,
- .rmdir = afr_rmdir,
- .link = afr_link,
- .symlink = afr_symlink,
- .rename = afr_rename,
+ .lookup = afr_lookup,
+ .lk = afr_lk,
+ .flush = afr_flush,
+ .statfs = afr_statfs,
+ .fsyncdir = afr_fsyncdir,
+ .inodelk = afr_inodelk,
+ .finodelk = afr_finodelk,
+ .entrylk = afr_entrylk,
+ .fentrylk = afr_fentrylk,
+ .ipc = afr_ipc,
+ .lease = afr_lease,
+
+ /* inode read */
+ .access = afr_access,
+ .stat = afr_stat,
+ .fstat = afr_fstat,
+ .readlink = afr_readlink,
+ .getxattr = afr_getxattr,
+ .fgetxattr = afr_fgetxattr,
+ .readv = afr_readv,
+ .seek = afr_seek,
+
+ /* inode write */
+ .writev = afr_writev,
+ .truncate = afr_truncate,
+ .ftruncate = afr_ftruncate,
+ .setxattr = afr_setxattr,
+ .fsetxattr = afr_fsetxattr,
+ .setattr = afr_setattr,
+ .fsetattr = afr_fsetattr,
+ .removexattr = afr_removexattr,
+ .fremovexattr = afr_fremovexattr,
+ .fallocate = afr_fallocate,
+ .discard = afr_discard,
+ .zerofill = afr_zerofill,
+ .xattrop = afr_xattrop,
+ .fxattrop = afr_fxattrop,
+ .fsync = afr_fsync,
+
+ /*inode open*/
+ .opendir = afr_opendir,
+ .open = afr_open,
+
+ /* dir read */
+ .readdir = afr_readdir,
+ .readdirp = afr_readdirp,
+
+ /* dir write */
+ .create = afr_create,
+ .mknod = afr_mknod,
+ .mkdir = afr_mkdir,
+ .unlink = afr_unlink,
+ .rmdir = afr_rmdir,
+ .link = afr_link,
+ .symlink = afr_symlink,
+ .rename = afr_rename,
};
-
struct xlator_dumpops dumpops = {
- .priv = afr_priv_dump,
+ .priv = afr_priv_dump,
};
-
struct xlator_cbks cbks = {
- .release = afr_release,
- .releasedir = afr_releasedir,
+ .release = afr_release,
+ .releasedir = afr_releasedir,
+ .forget = afr_forget,
};
-
struct volume_options options[] = {
- { .key = {"read-subvolume" },
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = {"favorite-child"},
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = {"background-self-heal-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
- },
- { .key = {"data-self-heal"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"data-self-heal-algorithm"},
- .type = GF_OPTION_TYPE_STR,
- .default_value = "",
- .description = "Select between \"full\", \"diff\". The "
- "\"full\" algorithm copies the entire file from "
- "source to sink. The \"diff\" algorithm copies to "
- "sink only those blocks whose checksums don't match "
- "with those of source.",
- .value = { "diff", "full" }
- },
- { .key = {"data-self-heal-window-size"},
- .type = GF_OPTION_TYPE_INT,
- .min = 1,
- .max = 1024,
- .default_value = "16",
- .description = "Maximum number blocks per file for which self-heal "
- "process would be applied simultaneously."
- },
- { .key = {"metadata-self-heal"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"entry-self-heal"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"data-change-log"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"metadata-change-log"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"entry-change-log"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"optimistic-change-log"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"data-lock-server-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
- },
- { .key = {"metadata-lock-server-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
- },
- { .key = {"entry-lock-server-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
- },
- { .key = {"strict-readdir"},
- .type = GF_OPTION_TYPE_BOOL,
- },
- { .key = {NULL} },
+ {.key = {"read-subvolume"},
+ .type = GF_OPTION_TYPE_XLATOR,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. Afr will prefer the one specified using "
+ "this option if it is not stale. Option value must be "
+ "one of the xlator names of the children. "
+ "Ex: <volname>-client-0 till "
+ "<volname>-client-<number-of-bricks - 1>"},
+ {.key = {"read-subvolume-index"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "-1",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. AFR will prefer the one specified using "
+ "this option if it is not stale. allowed options"
+ " include -1 till replica-count - 1"},
+ {.key = {"read-hash-mode"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 5,
+ .default_value = "1",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description =
+ "inode-read fops happen only on one of the bricks in "
+ "replicate. AFR will prefer the one computed using "
+ "the method specified using this option.\n"
+ "0 = first readable child of AFR, starting from 1st child.\n"
+ "1 = hash by GFID of file (all clients use "
+ "same subvolume).\n"
+ "2 = hash by GFID of file and client PID.\n"
+ "3 = brick having the least outstanding read requests.\n"
+ "4 = brick having the least network ping latency.\n"
+ "5 = Hybrid mode between 3 and 4, ie least value among "
+ "network-latency multiplied by outstanding-read-requests."},
+ {
+ .key = {"choose-local"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "true",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Choose a local subvolume (i.e. Brick) to read from"
+ " if read-subvolume is not explicitly set.",
+ },
+ {.key = {"background-self-heal-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 256,
+ .default_value = "8",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This specifies the number of per client self-heal "
+ "jobs that can perform parallel heals in the "
+ "background."},
+ {.key = {"halo-shd-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "99999",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Maximum latency for shd halo replication in msec."},
+ {.key = {"halo-enabled"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "False",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Enable Halo (geo) replication mode."},
+ {.key = {"halo-nfsd-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "5",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Maximum latency for nfsd halo replication in msec."},
+ {.key = {"halo-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = AFR_HALO_MAX_LATENCY,
+ .default_value = "5",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Maximum latency for halo replication in msec."},
+ {.key = {"halo-max-replicas"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "99999",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "The maximum number of halo replicas; replicas"
+ " beyond this value will be written asynchronously"
+                    " via the SHD."},
+ {.key = {"halo-min-replicas"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "2",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+     .description = "The minimum number of halo replicas, before adding "
+ "out of region replicas."},
+ {.key = {"heal-wait-queue-length"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
+ .default_value = "128",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .op_version = {GD_OP_VERSION_3_7_10},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This specifies the number of heals that can be queued"
+ " for the parallel background self heal jobs."},
+ {.key = {"data-self-heal"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false",
+ "disable", "open"},
+ .default_value = "off",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Using this option we can enable/disable data "
+ "self-heal on the file. \"open\" means data "
+ "self-heal action will only be triggered by file "
+ "open operations."},
+ {.key = {"data-self-heal-algorithm"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Select between \"full\", \"diff\". The "
+ "\"full\" algorithm copies the entire file from "
+ "source to sink. The \"diff\" algorithm copies to "
+ "sink only those blocks whose checksums don't match "
+ "with those of source. If no option is configured "
+ "the option is chosen dynamically as follows: "
+ "If the file does not exist on one of the sinks "
+ "or empty file exists or if the source file size is "
+ "about the same as page size the entire file will "
+ "be read and written i.e \"full\" algo, "
+ "otherwise \"diff\" algo is chosen.",
+ .value = {"diff", "full"}},
+ {.key = {"data-self-heal-window-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 1024,
+ .default_value = "1",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+     .description = "Maximum number of blocks per file for which self-heal "
+ "process would be applied simultaneously."},
+ {.key = {"metadata-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.validate_fn = validate_replica*/
+ .description = "Using this option we can enable/disable metadata "
+ "i.e. Permissions, ownerships, xattrs self-heal on "
+ "the file/directory."},
+ {.key = {"entry-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.validate_fn = validate_replica*/
+ .description = "Using this option we can enable/disable entry "
+ "self-heal on the directory."},
+ {.key = {"data-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"metadata-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"entry-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"optimistic-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Entry/Metadata fops will not perform "
+ "pre fop changelog operations in afr transaction "
+ "if this option is enabled."},
+ {.key = {"inodelk-trace"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enabling this option logs inode lock/unlocks"},
+ {.key = {"entrylk-trace"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enabling this option logs entry lock/unlocks"},
+ {.key = {"pre-op-compat"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Use separate pre-op xattrop() FOP rather than "
+ "overloading xdata of the OP"},
+ {.key = {"eager-lock"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description =
+ "Enable/Disable eager lock for replica volume. "
+ "Lock phase of a transaction has two sub-phases. "
+ "First is an attempt to acquire locks in parallel by "
+ "broadcasting non-blocking lock requests. If lock "
+ "acquisition fails on any server, then the held locks "
+ "are unlocked and we revert to a blocking locks mode "
+ "sequentially on one server after another. If this "
+ "option is enabled the initial broadcasting lock "
+ "request attempts to acquire a full lock on the entire file. "
+ "If this fails, we revert back to the sequential "
+ "\"regional\" blocking locks as before. In the case "
+ "where such an \"eager\" lock is granted in the "
+ "non-blocking phase, it gives rise to an opportunity "
+ "for optimization. i.e, if the next write transaction "
+ "on the same FD arrives before the unlock phase of "
+ "the first transaction, it \"takes over\" the full "
+ "file lock. Similarly if yet another data transaction "
+ "arrives before the unlock phase of the \"optimized\" "
+ "transaction, that in turn \"takes over\" the lock as "
+ "well. The actual unlock now happens at the end of "
+ "the last \"optimized\" transaction."
+
+ },
+ {.key = {"self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ /*.validate_fn = validate_replica_heal_enable_disable*/
+ .description = "This option applies to only self-heal-daemon. "
+ "Index directory crawl and automatic healing of files "
+ "will not be performed if this option is turned off."},
+ {.key = {"iam-self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the replicate "
+ "translator is running as part of self-heal-daemon "
+ "or not."},
+ {.key = {"iam-nfs-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the replicate "
+ "translator is running as part of an NFS daemon "
+ "or not."},
+ {
+ .key = {"quorum-type"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"none", "auto", "fixed"},
+ .default_value = "none",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.option = quorum-type*/
+ .description = "If value is \"fixed\" only allow writes if "
+ "quorum-count bricks are present. If value is "
+ "\"auto\" only allow writes if more than half of "
+ "bricks, or exactly half including the first, are "
+ "present.",
+ },
+ {
+ .key = {"quorum-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = INT_MAX,
+ .default_value = 0,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.option = quorum-count*/
+ /*.validate_fn = validate_quorum_count*/
+ .description = "If quorum-type is \"fixed\" only allow writes if "
+ "this many bricks are present. Other quorum types "
+ "will OVERWRITE this value.",
+ },
+ {
+ .key = {"quorum-reads"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option has been removed. Reads are not allowed "
+ "if quorum is not met.",
+ },
+ {
+ .key = {"node-uuid"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Local glusterd uuid string, used in starting "
+ "self-heal-daemon so that it can crawl only on "
+ "local index directories.",
+ },
+ {
+ .key = {"post-op-delay-secs"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = INT_MAX,
+ .default_value = "1",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Time interval induced artificially before "
+ "post-operation phase of the transaction to "
+ "enhance overlap of adjacent write operations.",
+ },
+ {
+ .key = {AFR_SH_READDIR_SIZE_KEY},
+ .type = GF_OPTION_TYPE_SIZET,
+ .description = "readdirp size for performing entry self-heal",
+ .min = 1024,
+ .max = 131072,
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .default_value = "1KB",
+ },
+ {
+ .key = {"ensure-durability"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .op_version = {3},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Afr performs fsyncs for transactions if this "
+ "option is on to make sure the changelogs/data is "
+ "written to the disk",
+ .default_value = "on",
+ },
+ {
+ .key = {"afr-dirty-xattr"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = AFR_DIRTY_DEFAULT,
+ },
+ {.key = {"afr-pending-xattr"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Comma separated list of xattrs that are used to "
+ "capture information on pending heals."},
+ {
+ .key = {"metadata-splitbrain-forced-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
+ {.key = {"heal-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 5,
+ .max = INT_MAX,
+ .default_value = "600",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "time interval for checking the need to self-heal "
+ "in self-heal-daemon"},
+ {
+ .key = {"consistent-metadata"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "If this option is enabled, readdirp will force "
+ "lookups on those entries read whose read child is "
+ "not the same as that of the parent. This will "
+ "guarantee that all read operations on a file serve "
+ "attributes from the same subvol as long as it holds "
+ " a good copy of the file/dir.",
+ },
+ {.key = {"arbiter-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .description = "subset of child_count. Has to be 0 or 1."},
+ {
+ .key = {"thin-arbiter"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {GD_OP_VERSION_4_1_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+        .description = "contains host:path of thin arbiter brick",
+ },
+ {.key = {"shd-max-threads"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 64,
+ .default_value = "1",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Maximum number of parallel heals SHD can do per "
+ "local brick. This can substantially lower heal times"
+ ", but can also crush your bricks if you don't have "
+ "the storage hardware to support this."},
+ {
+ .key = {"shd-wait-qlength"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 655536,
+ .default_value = "1024",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option can be used to control number of heals"
+ " that can wait in SHD per subvolume",
+ },
+ {
+ .key = {"locking-scheme"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"full", "granular"},
+ .default_value = "full",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "If this option is set to granular, self-heal will "
+ "stop being compatible with afr-v1, which helps afr "
+ "be more granular while self-healing",
+ },
+ {.key = {"full-lock"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "yes",
+ .op_version = {GD_OP_VERSION_3_13_2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "If this option is disabled, then the IOs will take "
+ "range locks same as versions till 3.13.1."},
+ {
+ .key = {"granular-entry-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_8_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "If this option is enabled, self-heal will resort to "
+ "granular way of recording changelogs and doing entry "
+ "self-heal.",
+ },
+ {
+ .key = {"favorite-child-policy"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"none", "size", "ctime", "mtime", "majority"},
+ .default_value = "none",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option can be used to automatically resolve "
+ "split-brains using various policies without user "
+ "intervention. \"size\" picks the file with the "
+ "biggest size as the source. \"ctime\" and \"mtime\" "
+ "pick the file with the latest ctime and mtime "
+ "respectively as the source. \"majority\" picks a file"
+ " with identical mtime and size in more than half the "
+ "number of bricks in the replica.",
+ },
+ {
+ .key = {"consistent-io"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+        .description = "If this option is enabled, I/O will fail even if "
+                       "one of the bricks in the replica is down",
+ },
+ {.key = {"use-compound-fops"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_8_4},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"use-anonymous-inode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_8_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "Setting this option heals directory renames efficiently"},
+
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "replicate",
+ .category = GF_MAINTAINED,
};
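
The option table above is terminated by the {.key = {NULL}} sentinel and
exported through the .options field of xlator_api_t. A minimal,
self-contained sketch of that sentinel pattern follows; this is not
GlusterFS code, and the demo_option struct and its fields are invented
stand-ins for the real volume_option_t:

#include <stdio.h>
#include <stddef.h>

/* Invented stand-in for volume_option_t; only the fields needed to show
   the NULL-key sentinel that terminates the table. */
struct demo_option {
    const char *key;
    const char *default_value;
    const char *description;
};

static const struct demo_option demo_options[] = {
    {"consistent-metadata", "no", "force lookups through readdirp"},
    {"shd-max-threads", "1", "parallel heals per local brick"},
    {"favorite-child-policy", "none", "automatic split-brain resolution"},
    {NULL, NULL, NULL}, /* sentinel, mirrors {.key = {NULL}} above */
};

int main(void)
{
    /* Walk the table until the sentinel entry, the same way an option
       table is scanned when defaults and validation are applied. */
    for (const struct demo_option *opt = demo_options; opt->key != NULL;
         opt++)
        printf("%-24s default=%-5s %s\n", opt->key, opt->default_value,
               opt->description);
    return 0;
}

Compiled with any C99 compiler, this prints one line per option and stops
at the sentinel entry.
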
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index f99ce471994..d62f9a9caf2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1,1042 +1,1423 @@
/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef __AFR_H__
#define __AFR_H__
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "call-stub.h"
-#include "compat-errno.h"
+#include <glusterfs/call-stub.h>
+#include <glusterfs/compat-errno.h>
#include "afr-mem-types.h"
#include "libxlator.h"
+#include <glusterfs/timer.h>
+#include <glusterfs/syncop.h>
-#define AFR_XATTR_PREFIX "trusted.afr"
+#include "afr-self-heald.h"
+#include "afr-messages.h"
+
+#define SHD_INODE_LRU_LIMIT 1
#define AFR_PATHINFO_HEADER "REPLICATE:"
+#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
+#define AFR_DIRTY (((afr_private_t *)(THIS->private))->afr_dirty)
+
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
+#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
+#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/
+
+#define ARBITER_BRICK_INDEX 2
+#define THIN_ARBITER_BRICK_INDEX 2
+#define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify"
+#define AFR_TA_DOM_MODIFY "afr.ta.dom-modify"
+
+#define AFR_LK_HEAL_DOM "afr.lock-heal.domain"
+
+#define AFR_HALO_MAX_LATENCY 99999
+#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode"
+
+#define PFLAG_PENDING (1 << 0)
+#define PFLAG_SBRAIN (1 << 1)
+
+typedef int (*afr_lock_cbk_t)(call_frame_t *frame, xlator_t *this);
+
+typedef int (*afr_read_txn_wind_t)(call_frame_t *frame, xlator_t *this,
+ int subvol);
+
+typedef int (*afr_inode_refresh_cbk_t)(call_frame_t *frame, xlator_t *this,
+ int err);
+
+typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);
+
+#define AFR_COUNT(array, max) \
+ ({ \
+ int __i; \
+ int __res = 0; \
+ for (__i = 0; __i < max; __i++) \
+ if (array[__i]) \
+ __res++; \
+ __res; \
+ })
+#define AFR_INTERSECT(dst, src1, src2, max) \
+ ({ \
+ int __i; \
+ for (__i = 0; __i < max; __i++) \
+ dst[__i] = src1[__i] && src2[__i]; \
+ })
+#define AFR_CMP(a1, a2, len) \
+ ({ \
+ int __cmp = 0; \
+ int __i; \
+ for (__i = 0; __i < len; __i++) \
+ if (a1[__i] != a2[__i]) { \
+ __cmp = 1; \
+ break; \
+ } \
+ __cmp; \
+ })
+#define AFR_IS_ARBITER_BRICK(priv, index) \
+ ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX))
+
+#define AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(ret, errnum) \
+ do { \
+ local->op_ret = ret; \
+ local->op_errno = errnum; \
+ if (local->op_errno == EIO) \
+ gf_msg(this->name, GF_LOG_ERROR, local->op_errno, \
+ AFR_MSG_SPLIT_BRAIN, \
+ "Failing %s on gfid %s: " \
+ "split-brain observed.", \
+ gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \
+ } while (0)
+
+#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \
+ do { \
+ afr_fd_ctx_t *__fd_ctx = NULL; \
+ __fd_ctx = afr_fd_ctx_get(__fd, __this); \
+ if (__fd_ctx && __fd_ctx->is_fd_bad) { \
+ __error = EBADF; \
+ goto __label; \
+ } \
+ } while (0)
+
+typedef enum {
+ AFR_READ_POLICY_FIRST_UP,
+ AFR_READ_POLICY_GFID_HASH,
+ AFR_READ_POLICY_GFID_PID_HASH,
+ AFR_READ_POLICY_LESS_LOAD,
+ AFR_READ_POLICY_LEAST_LATENCY,
+ AFR_READ_POLICY_LOAD_LATENCY_HYBRID,
+} afr_read_hash_mode_t;
-struct _pump_private;
+typedef enum {
+ AFR_FAV_CHILD_NONE,
+ AFR_FAV_CHILD_BY_SIZE,
+ AFR_FAV_CHILD_BY_CTIME,
+ AFR_FAV_CHILD_BY_MTIME,
+ AFR_FAV_CHILD_BY_MAJORITY,
+ AFR_FAV_CHILD_POLICY_MAX,
+} afr_favorite_child_policy;
-typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this,
- int child, int32_t op_error,
- int32_t op_errno);
+typedef enum {
+ AFR_SELFHEAL_DATA_FULL = 0,
+ AFR_SELFHEAL_DATA_DIFF,
+ AFR_SELFHEAL_DATA_DYNAMIC,
+} afr_data_self_heal_type_t;
-typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this,
- int child, int32_t op_error,
- int32_t op_errno);
-typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this);
+typedef enum {
+ AFR_CHILD_UNKNOWN = -1,
+ AFR_CHILD_ZERO,
+ AFR_CHILD_ONE,
+ AFR_CHILD_THIN_ARBITER,
+} afr_child_index;
-typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this);
-typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno);
+typedef enum {
+ TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall
+                                 notification and is waiting for its release.*/
+ TA_GET_INFO_FROM_TA_FILE, /*FOP needs post-op on ta file to get
+ *info about which brick is bad.*/
+ TA_INFO_IN_MEMORY_SUCCESS, /*Bad brick info is in memory and fop failed
+ *on BAD brick - Success*/
+ TA_INFO_IN_MEMORY_FAILED, /*Bad brick info is in memory and fop failed
+ *on GOOD brick - Failed*/
+ TA_SUCCESS, /*FOP succeeded on both data bricks.*/
+} afr_ta_fop_state_t;
+
+struct afr_nfsd {
+ uint32_t halo_max_latency_msec;
+ gf_boolean_t iamnfsd;
+};
+
+typedef struct _afr_lk_heal_info {
+ fd_t *fd;
+ int32_t cmd;
+ struct gf_flock flock;
+ dict_t *xdata_req;
+ unsigned char *locked_nodes;
+ struct list_head pos;
+ gf_lkowner_t lk_owner;
+ pid_t pid;
+ int32_t *child_up_event_gen;
+ int32_t *child_down_event_gen;
+} afr_lk_heal_info_t;
typedef struct _afr_private {
- gf_lock_t lock; /* to guard access to child_count, etc */
- unsigned int child_count; /* total number of children */
+ gf_lock_t lock; /* to guard access to child_count, etc */
+ unsigned int child_count; /* total number of children */
+ unsigned int arbiter_count; /*subset of child_count.
+ Has to be 0 or 1.*/
+
+ xlator_t **children;
+
+ inode_t *root_inode;
+
+ int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
+ /* For thin-arbiter. */
+ uuid_t ta_gfid;
+ unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
+ int ta_bad_child_index;
+ int ta_event_gen;
+ unsigned int ta_in_mem_txn_count;
+ unsigned int ta_on_wire_txn_count;
+ struct list_head ta_waitq;
+ struct list_head ta_onwireq;
+
+ unsigned char *anon_inode;
+ unsigned char *child_up;
+ unsigned char *halo_child_up;
+ int64_t *child_latency;
+ unsigned char *local;
+
+ char **pending_key;
+
+ afr_data_self_heal_type_t data_self_heal_algorithm;
+ unsigned int data_self_heal_window_size; /* max number of pipelined
+ read/writes */
+
+ struct list_head heal_waiting; /*queue for files that need heal*/
+ uint32_t heal_wait_qlen; /*configurable queue length for heal_waiting*/
+ int32_t heal_waiters; /* No. of elements currently in wait queue.*/
+
+ struct list_head healing; /* queue for files that are undergoing
+ background heal*/
+ uint32_t background_self_heal_count; /*configurable queue length for
+ healing queue*/
+ int32_t healers; /* No. of elements currently undergoing background
+ heal*/
+
+ gf_boolean_t release_ta_notify_dom_lock;
+
+ gf_boolean_t metadata_self_heal; /* on/off */
+ gf_boolean_t entry_self_heal; /* on/off */
+
+ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
+ int read_child; /* read-subvolume */
+ gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
+
+ gf_timer_t *timer; /* launched when parent up is received */
+
+ unsigned int wait_count; /* # of servers to wait for success */
+
+ unsigned char ta_child_up;
+ gf_boolean_t optimistic_change_log;
+ gf_boolean_t eager_lock;
+ gf_boolean_t pre_op_compat; /* on/off */
+ uint32_t post_op_delay_secs;
+ unsigned int quorum_count;
+
+ off_t ta_notify_dom_lock_offset;
+ afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic
+ resolution of split-brains.*/
+ afr_read_hash_mode_t hash_mode; /* for when read_child is not set */
+
+ int32_t *last_event;
+
+ /* @event_generation: Keeps count of number of events received which can
+ potentially impact consistency decisions. The events are CHILD_UP
+ and CHILD_DOWN, when we have to recalculate the freshness/staleness
+ of copies to detect if changes had happened while the other server
+ was down. CHILD_DOWN and CHILD_UP can also be received on network
+ disconnect/reconnects and not necessarily server going down/up.
+ Recalculating freshness/staleness on network events is equally
+ important as we might have had a network split brain.
+ */
+ uint32_t event_generation;
+ char vol_uuid[UUID_SIZE + 1];
+
+ gf_boolean_t choose_local;
+ gf_boolean_t did_discovery;
+ gf_boolean_t ensure_durability;
+ gf_boolean_t halo_enabled;
+ gf_boolean_t consistent_metadata;
+ gf_boolean_t need_heal;
+ gf_boolean_t granular_locks;
+ uint64_t sh_readdir_size;
+ char *sh_domain;
+ char *afr_dirty;
+
+ uint64_t spb_choice_timeout;
+
+ afr_self_heald_t shd;
+ struct afr_nfsd nfsd;
+
+ uint32_t halo_max_latency_msec;
+ uint32_t halo_max_replicas;
+ uint32_t halo_min_replicas;
+
+ gf_boolean_t full_lock;
+ gf_boolean_t esh_granular;
+ gf_boolean_t consistent_io;
+ gf_boolean_t data_self_heal; /* on/off */
+ gf_boolean_t use_anon_inode;
+
+ /*For lock healing.*/
+ struct list_head saved_locks;
+ struct list_head lk_healq;
+
+ /*For anon-inode handling */
+ char anon_inode_name[NAME_MAX + 1];
+ char anon_gfid_str[UUID_SIZE + 1];
+} afr_private_t;
- unsigned int read_child_rr; /* round-robin index of the read_child */
- gf_lock_t read_child_lock; /* lock to protect above */
+typedef enum {
+ AFR_DATA_TRANSACTION, /* truncate, write, ... */
+ AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
+ AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
+ AFR_ENTRY_RENAME_TRANSACTION, /* rename */
+} afr_transaction_type;
- xlator_t **children;
+/*
+ xattr format: trusted.afr.volume = [x y z]
+ x - data pending
+ y - metadata pending
+ z - entry pending
+*/
- gf_lock_t root_inode_lk;
- int first_lookup;
- inode_t *root_inode;
+static inline int
+afr_index_for_transaction_type(afr_transaction_type type)
+{
+ switch (type) {
+ case AFR_DATA_TRANSACTION:
+ return 0;
- unsigned char *child_up;
+ case AFR_METADATA_TRANSACTION:
+ return 1;
- char **pending_key;
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ return 2;
+ }
- gf_boolean_t data_self_heal; /* on/off */
- char * data_self_heal_algorithm; /* name of algorithm */
- unsigned int data_self_heal_window_size; /* max number of pipelined
- read/writes */
+ return -1; /* make gcc happy */
+}
- unsigned int background_self_heal_count;
- unsigned int background_self_heals_started;
- gf_boolean_t metadata_self_heal; /* on/off */
- gf_boolean_t entry_self_heal; /* on/off */
+static inline int
+afr_index_from_ia_type(ia_type_t type)
+{
+ switch (type) {
+ case IA_IFDIR:
+ return afr_index_for_transaction_type(AFR_ENTRY_TRANSACTION);
+ case IA_IFREG:
+ return afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+ default:
+ return -1;
+ }
+}
- gf_boolean_t data_change_log; /* on/off */
- gf_boolean_t metadata_change_log; /* on/off */
- gf_boolean_t entry_change_log; /* on/off */
+typedef struct {
+ struct gf_flock flock;
+ loc_t loc;
+ fd_t *fd;
+ char *basename;
+ unsigned char *locked_nodes;
+ int locked_count;
- int read_child; /* read-subvolume */
- int favorite_child; /* subvolume to be preferred in resolving
- split-brain cases */
+} afr_lockee_t;
- unsigned int data_lock_server_count;
- unsigned int metadata_lock_server_count;
- unsigned int entry_lock_server_count;
+int
+afr_entry_lockee_cmp(const void *l1, const void *l2);
- gf_boolean_t inodelk_trace;
- gf_boolean_t entrylk_trace;
+typedef struct {
+ loc_t *lk_loc;
- gf_boolean_t strict_readdir;
+ afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
- unsigned int wait_count; /* # of servers to wait for success */
+ const char *lk_basename;
+ const char *lower_basename;
+ const char *higher_basename;
- uint64_t up_count; /* number of CHILD_UPs we have seen */
- uint64_t down_count; /* number of CHILD_DOWNs we have seen */
+ unsigned char *lower_locked_nodes;
- struct _pump_private *pump_private; /* Set if we are loaded as pump */
- int use_afr_in_pump;
+ afr_lock_cbk_t lock_cbk;
- pthread_mutex_t mutex;
- struct list_head saved_fds; /* list of fds on which locks have succeeded */
- gf_boolean_t optimistic_change_log;
+ int lockee_count;
- char vol_uuid[UUID_SIZE + 1];
- int32_t *last_event;
-} afr_private_t;
+ int32_t lk_call_count;
+ int32_t lk_expected_count;
+ int32_t lk_attempted_count;
+
+ int32_t lock_op_ret;
+ int32_t lock_op_errno;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+ int32_t lock_count;
+ char lower_locked;
+ char higher_locked;
+} afr_internal_lock_t;
+
+struct afr_reply {
+ int valid;
+ int32_t op_ret;
+ dict_t *xattr; /*For xattrop*/
+ dict_t *xdata;
+ struct iatt poststat;
+ struct iatt postparent;
+ struct iatt prestat;
+ struct iatt preparent;
+ struct iatt preparent2;
+ struct iatt postparent2;
+ int32_t op_errno;
+ /* For rchecksum */
+ uint8_t checksum[SHA256_DIGEST_LENGTH];
+ gf_boolean_t buf_has_zeroes;
+ gf_boolean_t fips_mode_rchecksum;
+ /* For lookup */
+ int8_t need_heal;
+};
+
+typedef enum {
+ AFR_FD_NOT_OPENED,
+ AFR_FD_OPENED,
+ AFR_FD_OPENING
+} afr_fd_open_status_t;
typedef struct {
- /* External interface: These are variables (some optional) that
- are set by whoever has triggered self-heal */
+ afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
+ int flags;
+
+ /* the subvolume on which the latest sequence of readdirs (starting
+ at offset 0) has begun. Till the next readdir request with 0 offset
+ arrives, we continue to read off this subvol.
+ */
+ int readdir_subvol;
+ /* lock-healing related members. */
+ gf_boolean_t is_fd_bad;
+ afr_lk_heal_info_t *lk_heal_info;
- inode_t *inode;
- gf_boolean_t need_data_self_heal;
- gf_boolean_t need_metadata_self_heal;
- gf_boolean_t need_entry_self_heal;
- gf_boolean_t need_gfid_self_heal;
- gf_boolean_t need_missing_entry_self_heal;
+} afr_fd_ctx_t;
- gf_boolean_t forced_merge; /* Is this a self-heal triggered to
- forcibly merge the directories? */
+typedef enum {
+ AFR_FOP_LOCK_PARALLEL,
+ AFR_FOP_LOCK_SERIAL,
+ AFR_FOP_LOCK_QUORUM_FAILED,
+} afr_fop_lock_state_t;
+
+typedef struct _afr_inode_lock_t {
+ /* @num_inodelks:
+ Number of inodelks queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ int32_t num_inodelks;
+ unsigned int event_generation;
+ gf_timer_t *delay_timer;
+ struct list_head owners; /*Transactions that are performing fop*/
+ struct list_head post_op; /*Transactions that are done with the fop
+                               *so they cannot conflict with the fops*/
+    struct list_head waiting; /*Transactions that are waiting for
+ *conflicting transactions to complete*/
+ struct list_head frozen; /*Transactions that need to go as part of
+ * next batch of eager-lock*/
+ gf_boolean_t release;
+ gf_boolean_t acquired;
+} afr_lock_t;
+
+typedef struct _afr_inode_ctx {
+ uint64_t read_subvol;
+ uint64_t write_subvol;
+ int lock_count;
+ int spb_choice;
+ gf_timer_t *timer;
+ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+ int inherited[AFR_NUM_CHANGE_LOGS];
+ int on_disk[AFR_NUM_CHANGE_LOGS];
+ /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
+ afr_lock_t lock[2];
+
+ /* @open_fd_count:
+ Number of open FDs queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ uint32_t open_fd_count;
+ gf_boolean_t need_refresh;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+} afr_inode_ctx_t;
- gf_boolean_t healing_fd_opened; /* true if caller has already
- opened fd */
+typedef struct _afr_local {
+ glusterfs_fop_t op;
+ unsigned int call_count;
- gf_boolean_t data_lock_held; /* true if caller has already
- acquired 0-0 lock */
+ /* @event_generation: copy of priv->event_generation taken at the
+ time of starting the transaction. The copy is made so that we
+ have a stable value through the various phases of the transaction.
+ */
+ unsigned int event_generation;
- fd_t *healing_fd; /* set if callers has opened fd */
+ uint32_t open_fd_count;
+ int32_t num_inodelks;
- gf_boolean_t background; /* do self-heal in background
- if possible */
+ int32_t op_ret;
+ int32_t op_errno;
- ia_type_t type; /* st_mode of the entry we're doing
- self-heal on */
- uuid_t sh_gfid_req; /* gfid self-heal needs to be done
- with this gfid if it is not null */
+ int dirty[AFR_NUM_CHANGE_LOGS];
- /* Function to call to unwind. If self-heal is being done in the
- background, this function will be called as soon as possible. */
+ int32_t **pending;
- int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret,
- int32_t op_errno);
+ loc_t loc;
+ loc_t newloc;
- /* End of external interface members */
+ fd_t *fd;
+ afr_fd_ctx_t *fd_ctx;
+ /* @child_up: copy of priv->child_up taken at the time of transaction
+ start. The copy is taken so that we have a stable child_up array
+ through the phases of the transaction as priv->child_up[i] can keep
+ changing through time.
+ */
+ unsigned char *child_up;
- /* array of stat's, one for each child */
- struct iatt *buf;
- struct iatt *parentbufs;
- struct iatt parentbuf;
- struct iatt entrybuf;
+ /* @read_attempted:
+ array of flags representing subvolumes where read operations of
+ the read transaction have already been attempted. The array is
+ first pre-filled with down subvolumes, and as reads are performed
+ on other subvolumes, those are set as well. This way if the read
+ operation fails we do not retry on that subvolume again.
+ */
+ unsigned char *read_attempted;
- afr_expunge_done_cbk_t expunge_done;
- afr_impunge_done_cbk_t impunge_done;
- int32_t impunge_ret_child;
+ /* @readfn:
- /* array of xattr's, one for each child */
- dict_t **xattr;
+ pointer to function which will perform the read operation on a given
+ subvolume. Used in read transactions.
+ */
- /* array containing if the lookups succeeded in the order of response
- */
- int32_t *child_success;
- int success_count;
- /* array containing the fresh children found in the self-heal process */
- int32_t *fresh_children;
- /* array containing the fresh children found in the parent lookup */
- int32_t *fresh_parent_dirs;
- /* array of errno's, one for each child */
- int *child_errno;
- /*loc used for lookup*/
- loc_t lookup_loc;
- int32_t lookup_flags;
- afr_lookup_done_cbk_t lookup_done;
+ afr_read_txn_wind_t readfn;
- int32_t **pending_matrix;
- int32_t **delta_matrix;
+ /* @inode:
- int32_t op_ret;
- int32_t op_errno;
+      the inode on which the read txn is performed. Ref'ed and copied
+ from either fd->inode or loc.inode
+ */
- int *sources;
- int source;
- int active_source;
- int active_sinks;
- int *success;
- unsigned char *locked_nodes;
- int lock_count;
+ inode_t *inode;
- mode_t impunging_entry_mode;
- const char *linkname;
+ /* @parent[2]:
- int op_failed;
+ parent inode[s] on which directory transactions are performed.
+ */
- int file_has_holes;
- blksize_t block_size;
- off_t file_size;
- off_t offset;
- afr_post_remove_call_t post_remove_call;
+ inode_t *parent;
+ inode_t *parent2;
- loc_t parent_loc;
+ /* @readable:
- call_frame_t *orig_frame;
- gf_boolean_t unwound;
+ array of flags representing servers from which a read can be
+ performed. This is the output of afr_inode_refresh()
+ */
+ unsigned char *readable;
+ unsigned char *readable2; /*For rename transaction*/
- /* private data for the particular self-heal algorithm */
- void *private;
+ afr_inode_refresh_cbk_t refreshfn;
- int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this);
+ /* @refreshinode:
- int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);
- void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this);
+ Inode currently getting refreshed.
+ */
+ inode_t *refreshinode;
- call_frame_t *sh_frame;
-} afr_self_heal_t;
+ dict_t *xattr_req;
+ dict_t *dict;
-typedef enum {
- AFR_DATA_TRANSACTION, /* truncate, write, ... */
- AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
- AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
- AFR_ENTRY_RENAME_TRANSACTION, /* rename */
-} afr_transaction_type;
+ int read_subvol; /* Current read subvolume */
-typedef enum {
- AFR_TRANSACTION_LK,
- AFR_SELFHEAL_LK,
-} transaction_lk_type_t;
+ int optimistic_change_log;
-typedef enum {
- AFR_LOCK_OP,
- AFR_UNLOCK_OP,
-} afr_lock_op_type_t;
+ afr_internal_lock_t internal_lock;
-typedef enum {
- AFR_DATA_SELF_HEAL_LK,
- AFR_METADATA_SELF_HEAL_LK,
- AFR_ENTRY_SELF_HEAL_LK,
-}selfheal_lk_type_t;
+ /*To handle setattr/setxattr on yet to be linked inode from dht*/
+ uuid_t refreshgfid;
-typedef enum {
- AFR_INODELK_TRANSACTION,
- AFR_INODELK_NB_TRANSACTION,
- AFR_ENTRYLK_TRANSACTION,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_INODELK_SELFHEAL,
- AFR_INODELK_NB_SELFHEAL,
- AFR_ENTRYLK_SELFHEAL,
- AFR_ENTRYLK_NB_SELFHEAL,
-} afr_lock_call_type_t;
+ /* @refreshed:
-/*
- xattr format: trusted.afr.volume = [x y z]
- x - data pending
- y - metadata pending
- z - entry pending
-*/
+ the inode was "refreshed" (i.e, pending xattrs from all subvols
+ freshly inspected and inode ctx updated accordingly) as part of
+ this transaction already.
+ */
+ gf_boolean_t refreshed;
-static inline int
-afr_index_for_transaction_type (afr_transaction_type type)
-{
- switch (type) {
+ gf_boolean_t update_num_inodelks;
+ gf_boolean_t update_open_fd_count;
- case AFR_DATA_TRANSACTION:
- return 0;
+ /*
+ @pre_op_compat:
- case AFR_METADATA_TRANSACTION:
- return 1;
+      compatibility mode of pre-op. send separate pre-op and op
+      operations as part of the transaction, rather than combining them
+ */
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- return 2;
- }
+ gf_boolean_t pre_op_compat;
- return -1; /* make gcc happy */
-}
+ /* Is the current writev() going to perform a stable write?
+       i.e., does fd->flags or the @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
+    /* This write appended to the file. Not necessarily O_APPEND;
+       it just means the offset of the write was at the end of file.
+ */
+ gf_boolean_t append_write;
-typedef struct {
- loc_t *lk_loc;
- struct gf_flock lk_flock;
+ /*
+ This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
+ */
- const char *lk_basename;
- const char *lower_basename;
- const char *higher_basename;
- char lower_locked;
- char higher_locked;
+ struct {
+ struct {
+ struct statvfs buf;
+ unsigned char buf_set;
+ } statfs;
- unsigned char *locked_nodes;
- unsigned char *lower_locked_nodes;
- unsigned char *inode_locked_nodes;
- unsigned char *entry_locked_nodes;
+ struct {
+ fd_t *fd;
+ int32_t flags;
+ } open;
- selfheal_lk_type_t selfheal_lk_type;
- transaction_lk_type_t transaction_lk_type;
+ struct {
+ struct gf_flock user_flock;
+ struct gf_flock ret_flock;
+ unsigned char *locked_nodes;
+ int32_t cmd;
+ /*For lock healing only.*/
+ unsigned char *dom_locked_nodes;
+ int32_t *dom_lock_op_ret;
+ int32_t *dom_lock_op_errno;
+ struct gf_flock *getlk_rsp;
+ } lk;
+
+ /* inode read */
- int32_t lock_count;
- int32_t inodelk_lock_count;
- int32_t entrylk_lock_count;
+ struct {
+ int32_t mask;
+ int last_index; /* index of the child we tried previously */
+ } access;
- uint64_t lock_number;
- int32_t lk_call_count;
+ struct {
+ int last_index;
+ } stat;
- int32_t lock_op_ret;
- int32_t lock_op_errno;
+ struct {
+ int last_index;
+ } fstat;
- int (*lock_cbk) (call_frame_t *, xlator_t *);
+ struct {
+ size_t size;
+ int last_index;
+ } readlink;
-} afr_internal_lock_t;
+ struct {
+ char *name;
+ long xattr_len;
+ int last_index;
+ } getxattr;
-typedef struct _afr_locked_fd {
- fd_t *fd;
- struct list_head list;
-} afr_locked_fd_t;
+ struct {
+ size_t size;
+ off_t offset;
+ int last_index;
+ uint32_t flags;
+ } readv;
-typedef struct _afr_local {
- int uid;
- int gid;
- unsigned int call_count;
- unsigned int success_count;
- unsigned int enoent_count;
+ /* dir read */
+ struct {
+ uint32_t *checksum;
+ int success_count;
+ int32_t op_ret;
+ int32_t op_errno;
+ } opendir;
- unsigned int govinda_gOvinda;
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ size_t size;
+ off_t offset;
+ dict_t *dict;
+ int last_index;
+ gf_boolean_t failed;
+ } readdir;
+ /* inode write */
- unsigned int read_child_index;
- unsigned char read_child_returned;
- unsigned int first_up_child;
+ struct {
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } inode_wfop; // common structure for all inode-write-fops
- pid_t saved_pid;
+ struct {
+ struct iovec *vector;
+ struct iobref *iobref;
+ off_t offset;
+ int32_t op_ret;
+ int32_t count;
+ uint32_t flags;
+ } writev;
- int32_t op_ret;
- int32_t op_errno;
+ struct {
+ off_t offset;
+ } truncate;
- int32_t **pending;
+ struct {
+ off_t offset;
+ } ftruncate;
- loc_t loc;
- loc_t newloc;
+ struct {
+ struct iatt in_buf;
+ int32_t valid;
+ } setattr;
- fd_t *fd;
+ struct {
+ struct iatt in_buf;
+ int32_t valid;
+ } fsetattr;
- glusterfs_fop_t fop;
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } setxattr;
- unsigned char *child_up;
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } fsetxattr;
- int32_t *child_errno;
+ struct {
+ char *name;
+ } removexattr;
- dict_t *xattr_req;
+ struct {
+ dict_t *xattr;
+ gf_xattrop_flags_t optype;
+ } xattrop;
- int32_t inodelk_count;
- int32_t entrylk_count;
+ /* dir write */
- afr_internal_lock_t internal_lock;
+ struct {
+ inode_t *inode;
+ struct iatt buf;
+ struct iatt preparent;
+ struct iatt postparent;
+ struct iatt prenewparent;
+ struct iatt postnewparent;
+ } dir_fop; // common structure for all dir fops
- afr_locked_fd_t *locked_fd;
- int32_t source_child;
- int32_t lock_recovery_child;
+ struct {
+ fd_t *fd;
+ dict_t *params;
+ int32_t flags;
+ mode_t mode;
+ } create;
- dict_t *dict;
- int optimistic_change_log;
+ struct {
+ dict_t *params;
+ dev_t dev;
+ mode_t mode;
+ } mknod;
- int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this);
+ struct {
+ dict_t *params;
+ int32_t mode;
+ } mkdir;
- /*
- This struct contains the arguments for the "continuation"
- (scheme-like) of fops
- */
+ struct {
+ dict_t *params;
+ char *linkpath;
+ } symlink;
- int op;
struct {
- struct {
- unsigned char buf_set;
- struct statvfs buf;
- } statfs;
-
- struct {
- uuid_t gfid_req;
- inode_t *inode;
- struct iatt buf;
- struct iatt postparent;
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- dict_t **xattrs;
- dict_t *xattr;
- struct iatt *postparents;
- struct iatt *bufs;
- int32_t read_child;
- int32_t *child_success;//in the order of response
- int32_t *sources;
- } lookup;
-
- struct {
- int32_t flags;
- int32_t wbflags;
- } open;
-
- struct {
- int32_t cmd;
- struct gf_flock user_flock;
- struct gf_flock ret_flock;
- unsigned char *locked_nodes;
- } lk;
-
- /* inode read */
-
- struct {
- int32_t mask;
- int last_tried; /* index of the child we tried previously */
- } access;
-
- struct {
- int last_tried;
- ino_t ino;
- } stat;
-
- struct {
- int last_tried;
- ino_t ino;
- } fstat;
-
- struct {
- size_t size;
- int last_tried;
- ino_t ino;
- } readlink;
-
- struct {
- char *name;
- int last_tried;
- long pathinfo_len;
- } getxattr;
-
- struct {
- ino_t ino;
- size_t size;
- off_t offset;
- int last_tried;
- } readv;
-
- /* dir read */
-
- struct {
- int success_count;
- int32_t op_ret;
- int32_t op_errno;
-
- uint32_t *checksum;
- } opendir;
-
- struct {
- int32_t op_ret;
- int32_t op_errno;
- size_t size;
- off_t offset;
-
- gf_boolean_t failed;
- int last_tried;
- } readdir;
-
- struct {
- int32_t op_ret;
- int32_t op_errno;
-
- size_t size;
- off_t offset;
- int32_t flag;
-
- int last_tried;
- } getdents;
-
- /* inode write */
-
- struct {
- ino_t ino;
- struct iatt prebuf;
- struct iatt postbuf;
-
- int32_t op_ret;
-
- struct iovec *vector;
- struct iobref *iobref;
- int32_t count;
- off_t offset;
- } writev;
-
- struct {
- ino_t ino;
- struct iatt prebuf;
- struct iatt postbuf;
- } fsync;
-
- struct {
- ino_t ino;
- off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
- } truncate;
-
- struct {
- ino_t ino;
- off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
- } ftruncate;
-
- struct {
- ino_t ino;
- struct iatt in_buf;
- int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
- } setattr;
-
- struct {
- ino_t ino;
- struct iatt in_buf;
- int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
- } fsetattr;
-
- struct {
- dict_t *dict;
- int32_t flags;
- } setxattr;
-
- struct {
- char *name;
- } removexattr;
-
- /* dir write */
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- fd_t *fd;
- dict_t *params;
- int32_t flags;
- mode_t mode;
- inode_t *inode;
- struct iatt buf;
- struct iatt preparent;
- struct iatt postparent;
- struct iatt read_child_buf;
- } create;
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- dev_t dev;
- mode_t mode;
- dict_t *params;
- inode_t *inode;
- struct iatt buf;
- struct iatt preparent;
- struct iatt postparent;
- struct iatt read_child_buf;
- } mknod;
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- int32_t mode;
- dict_t *params;
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preparent;
- struct iatt postparent;
- } mkdir;
-
- struct {
- ino_t parent_ino;
- int32_t op_ret;
- int32_t op_errno;
- struct iatt preparent;
- struct iatt postparent;
- } unlink;
-
- struct {
- int flags;
- ino_t parent_ino;
- int32_t op_ret;
- int32_t op_errno;
- struct iatt preparent;
- struct iatt postparent;
- } rmdir;
-
- struct {
- ino_t oldparent_ino;
- ino_t newparent_ino;
- ino_t ino;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preoldparent;
- struct iatt prenewparent;
- struct iatt postoldparent;
- struct iatt postnewparent;
- } rename;
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preparent;
- struct iatt postparent;
- } link;
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- inode_t *inode;
- dict_t *params;
- struct iatt buf;
- struct iatt read_child_buf;
- char *linkpath;
- struct iatt preparent;
- struct iatt postparent;
- } symlink;
-
- struct {
- int32_t flags;
- dir_entry_t *entries;
- int32_t count;
- } setdents;
- } cont;
+ off_t offset;
+ size_t len;
+ int32_t mode;
+ } fallocate;
struct {
- off_t start, len;
+ off_t offset;
+ size_t len;
+ } discard;
- char *basename;
- char *new_basename;
+ struct {
+ off_t offset;
+ off_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
- loc_t parent_loc;
- loc_t new_parent_loc;
+ struct {
+ char *volume;
+ int32_t cmd;
+ int32_t in_cmd;
+ struct gf_flock in_flock;
+ struct gf_flock flock;
+ void *xdata;
+ } inodelk;
- afr_transaction_type type;
+ struct {
+ char *volume;
+ char *basename;
+ void *xdata;
+ entrylk_cmd in_cmd;
+ entrylk_cmd cmd;
+ entrylk_type type;
+ } entrylk;
- int success_count;
- int erase_pending;
- int failure_count;
+ struct {
+ off_t offset;
+ gf_seek_what_t what;
+ } seek;
- int last_tried;
- int32_t *child_errno;
+ struct {
+ struct gf_lease user_lease;
+ struct gf_lease ret_lease;
+ unsigned char *locked_nodes;
+ } lease;
- call_frame_t *main_frame;
+ struct {
+ int flags;
+ } rmdir;
- int (*fop) (call_frame_t *frame, xlator_t *this);
+ struct {
+ int32_t datasync;
+ } fsync;
- int (*done) (call_frame_t *frame, xlator_t *this);
+ struct {
+ uuid_t gfid_req;
+ gf_boolean_t needs_fresh_lookup;
+ } lookup;
- int (*resume) (call_frame_t *frame, xlator_t *this);
+ } cont;
- int (*unwind) (call_frame_t *frame, xlator_t *this);
+ struct {
+ char *basename;
+ char *new_basename;
- /* post-op hook */
- } transaction;
+ loc_t parent_loc;
+ loc_t new_parent_loc;
- afr_self_heal_t self_heal;
+ /* stub to resume on destruction
+ of the transaction frame */
+ call_stub_t *resume_stub;
- struct marker_str marker;
-} afr_local_t;
+ struct list_head owner_list;
+ struct list_head wait_list;
+ unsigned char *pre_op;
-typedef struct {
- unsigned int *pre_op_done;
- unsigned int *opened_on; /* which subvolumes the fd is open on */
- unsigned int *pre_op_piggyback;
+ /* Changelog xattr dict for [f]xattrop*/
+ dict_t **changelog_xdata;
+ unsigned char *pre_op_sources;
- int flags;
- int32_t wbflags;
- uint64_t up_count; /* number of CHILD_UPs this fd has seen */
- uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
+ /* @failed_subvols: subvolumes on which a pre-op or a
+ FOP failed. */
+ unsigned char *failed_subvols;
- int32_t last_tried;
+ call_frame_t *main_frame; /*Fop frame*/
+ call_frame_t *frame; /*Transaction frame*/
- int hit, miss;
- gf_boolean_t failed_over;
- struct list_head entries; /* needed for readdir failover */
+ int (*wind)(call_frame_t *frame, xlator_t *this, int subvol);
- unsigned char *locked_on; /* which subvolumes locks have been successful */
-} afr_fd_ctx_t;
+ int (*unwind)(call_frame_t *frame, xlator_t *this);
+ off_t start, len;
-/* try alloc and if it fails, goto label */
-#define ALLOC_OR_GOTO(var, type, label) do { \
- var = GF_CALLOC (sizeof (type), 1, \
- gf_afr_mt_##type); \
- if (!var) { \
- gf_log (this->name, GF_LOG_ERROR, \
- "out of memory :("); \
- op_errno = ENOMEM; \
- goto label; \
- } \
- } while (0);
+ afr_transaction_type type;
+ int32_t in_flight_sb_errno; /* This is where the cause of the
+ failure on the last good copy of
+ the file is stored.
+ */
-/* did a call fail due to a child failing? */
-#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
- ((op_errno == ENOTCONN) || \
- (op_errno == EBADFD)))
+    /* @changelog_resume: function to be called after changelogging
+ (either pre-op or post-op) is done
+ */
+ afr_changelog_resume_t changelog_resume;
-#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1)
+ gf_boolean_t eager_lock_on;
+ gf_boolean_t do_eager_unlock;
-/* have we tried all children? */
-#define all_tried(i, count) ((i) == (count) - 1)
+    /* @dirtied: flag which indicates whether we set the dirty flag
+       in the OP. Typically true when we are performing an operation
+ on more than one subvol and optimistic changelog is disabled
-int32_t
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid);
+ A 'true' value set in @dirtied flag means an 'undirtying'
+ has to be done in POST-OP phase.
+ */
+ gf_boolean_t dirtied;
+
+ /* @inherited: flag which indicates that the dirty flags
+ of the previous transaction were inherited
+ */
+ gf_boolean_t inherited;
+
+ /*
+ @no_uninherit: flag which indicates that a pre_op_uninherit()
+ must _not_ be attempted (and returned as failure) always. This
+ flag is set when a hard pre-op is performed, but not accounted
+      for in fd_ctx->on_disk[]. Such transactions are "isolated"
+ from the pre-op piggybacking entirely and therefore uninherit
+ must not be attempted.
+ */
+ gf_boolean_t no_uninherit;
+
+ gf_boolean_t in_flight_sb; /* Indicator for occurrence of
+ split-brain while in the middle of
+ a txn. */
+
+ /* @uninherit_done:
+ @uninherit_value:
+
+ The above pair variables make pre_op_uninherit() idempotent.
+ Both are FALSE initially. The first call to pre_op_uninherit
+ sets @uninherit_done to TRUE and the return value to
+ @uninherit_value. Further calls will check for @uninherit_done
+ to be TRUE and if so will simply return @uninherit_value.
+ */
+ gf_boolean_t uninherit_done;
+ gf_boolean_t uninherit_value;
+
+ gf_boolean_t disable_delayed_post_op;
+ } transaction;
+
+ syncbarrier_t barrier;
+
+ /* extra data for fops */
+ dict_t *xdata_req;
+ dict_t *xdata_rsp;
+
+ dict_t *xattr_rsp; /*for [f]xattrop*/
+
+ mode_t umask;
+ int xflag;
+ struct afr_reply *replies;
+
+ /* For client side background heals. */
+ struct list_head healer;
+ call_frame_t *heal_frame;
+
+ afr_inode_ctx_t *inode_ctx;
+
+ /*For thin-arbiter transactions.*/
+ int ta_failed_subvol;
+ int ta_event_gen;
+ struct list_head ta_waitq;
+ struct list_head ta_onwireq;
+ afr_ta_fop_state_t fop_state;
+ afr_fop_lock_state_t fop_lock_state;
+ gf_lkowner_t saved_lk_owner;
+ unsigned char read_txn_query_child;
+ unsigned char ta_child_up;
+ gf_boolean_t do_discovery;
+ gf_boolean_t need_full_crawl;
+ gf_boolean_t is_read_txn;
+ gf_boolean_t is_new_entry;
+} afr_local_t;
+
+typedef struct afr_spbc_timeout {
+ call_frame_t *frame;
+ loc_t *loc;
+ int spb_child_index;
+ gf_boolean_t d_spb;
+ gf_boolean_t m_spb;
+} afr_spbc_timeout_t;
+
+typedef struct afr_spb_status {
+ call_frame_t *frame;
+ loc_t *loc;
+} afr_spb_status_t;
+
+typedef struct afr_empty_brick_args {
+ call_frame_t *frame;
+ char *op_type;
+ loc_t loc;
+ int empty_index;
+} afr_empty_brick_args_t;
+
+typedef struct afr_read_subvol_args {
+ ia_type_t ia_type;
+ uuid_t gfid;
+} afr_read_subvol_args_t;
+
+typedef struct afr_granular_esh_args {
+ fd_t *heal_fd;
+ xlator_t *xl;
+ call_frame_t *frame;
+ gf_boolean_t mismatch; /* flag to represent occurrence of type/gfid
+ mismatch */
+} afr_granular_esh_args_t;
+
+int
+afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type);
+int
+afr_inode_read_subvol_get(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
+int
+__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
+
+int
+__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvol,
+ int event_generation);
+int
+afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int event_generation);
int
-pump_command_reply (call_frame_t *frame, xlator_t *this);
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
+
+int
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
+
+int
+afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
+ unsigned char *readable,
+ afr_read_subvol_args_t *args);
+
+int
+afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type);
+int
+afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p,
+ unsigned char *readables, int *event_p,
+ afr_transaction_type type, afr_read_subvol_args_t *args);
+
+#define afr_data_subvol_get(i, t, s, r, e, a) \
+ afr_read_subvol_get(i, t, s, r, e, AFR_DATA_TRANSACTION, a)
+
+#define afr_metadata_subvol_get(i, t, s, r, e, a) \
+ afr_read_subvol_get(i, t, s, r, e, AFR_METADATA_TRANSACTION, a)
+
+int
+afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, afr_inode_refresh_cbk_t cbk);
int32_t
-afr_notify (xlator_t *this, int32_t event,
- void *data, ...);
+afr_notify(xlator_t *this, int32_t event, void *data, void *data2);
int
-afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
+xattr_is_equal(dict_t *this, char *key1, data_t *value1, void *data);
int
-afr_save_locked_fd (xlator_t *this, fd_t *fd);
+afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
+ int child_count);
int
-afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
- unsigned char *locked_nodes);
+afr_add_inode_lockee(afr_local_t *local, int child_count);
void
-afr_set_lk_owner (call_frame_t *frame, xlator_t *this);
+afr_lockees_cleanup(afr_internal_lock_t *int_lock);
+
+int
+afr_attempt_lock_recovery(xlator_t *this, int32_t child_index);
int
-afr_set_lock_number (call_frame_t *frame, xlator_t *this);
+afr_mark_locked_nodes(xlator_t *this, fd_t *fd, unsigned char *locked_nodes);
+void
+afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner);
-loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2);
+int
+afr_set_lock_number(call_frame_t *frame, xlator_t *this);
int32_t
-afr_unlock (call_frame_t *frame, xlator_t *this);
+afr_unlock(call_frame_t *frame, xlator_t *this);
int
-afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this);
+afr_lock_nonblocking(call_frame_t *frame, xlator_t *this);
int
-afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this);
+afr_blocking_lock(call_frame_t *frame, xlator_t *this);
int
-afr_blocking_lock (call_frame_t *frame, xlator_t *this);
+afr_internal_lock_finish(call_frame_t *frame, xlator_t *this);
int
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
+__afr_fd_ctx_set(xlator_t *this, fd_t *fd);
+afr_fd_ctx_t *
+afr_fd_ctx_get(fd_t *fd, xlator_t *this);
-int pump_start (call_frame_t *frame, xlator_t *this);
+int
+afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno);
int
-afr_fd_ctx_set (xlator_t *this, fd_t *fd);
+afr_locked_nodes_count(unsigned char *locked_nodes, int child_count);
-uint64_t
-afr_read_child (xlator_t *this, inode_t *inode);
+int
+afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ gf_boolean_t *start_heal);
void
-afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child);
+afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv);
void
-afr_build_parent_loc (loc_t *parent, loc_t *child);
+afr_local_cleanup(afr_local_t *local, xlator_t *this);
int
-afr_up_children_count (int child_count, unsigned char *child_up);
+afr_frame_return(call_frame_t *frame);
-gf_boolean_t
-afr_is_fresh_lookup (loc_t *loc, xlator_t *this);
+int
+afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata);
void
-afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent);
+afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this);
int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
+afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
+
+#define AFR_STACK_UNWIND(fop, frame, op_ret, op_errno, params...) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ int32_t __op_ret = 0; \
+ int32_t __op_errno = 0; \
+ \
+ __op_ret = op_ret; \
+ __op_errno = op_errno; \
+ if (frame) { \
+ __local = frame->local; \
+ __this = frame->this; \
+ afr_handle_inconsistent_fop(frame, &__op_ret, &__op_errno); \
+ if (__local && __local->is_read_txn) \
+ afr_pending_read_decrement(__this->private, \
+ __local->read_subvol); \
+ if (__local && __local->xdata_req && \
+ afr_is_lock_mode_mandatory(__local->xdata_req)) \
+ afr_dom_lock_release(frame); \
+ frame->local = NULL; \
+ } \
+ \
+ STACK_UNWIND_STRICT(fop, frame, __op_ret, __op_errno, params); \
+ if (__local) { \
+ afr_local_cleanup(__local, __this); \
+ mem_put(__local); \
+ } \
+ } while (0)
+
+#define AFR_STACK_DESTROY(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ STACK_DESTROY(frame->root); \
+ if (__local) { \
+ afr_local_cleanup(__local, __this); \
+ mem_put(__local); \
+ } \
+ } while (0);
+
+#define AFR_FRAME_INIT(frame, op_errno) \
+ ({ \
+ frame->local = mem_get0(THIS->local_pool); \
+ if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \
+ afr_local_cleanup(frame->local, frame->this); \
+ mem_put(frame->local); \
+ frame->local = NULL; \
+ }; \
+ frame->local; \
+ })
+
+#define AFR_STACK_RESET(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ int __opr; \
+ STACK_RESET(frame->root); \
+ if (__local) { \
+ afr_local_cleanup(__local, __this); \
+ mem_put(__local); \
+ } \
+ AFR_FRAME_INIT(frame, __opr); \
+ } while (0)
+
+/* allocate and return a string that is the basename of argument */
+static inline char *
+AFR_BASENAME(const char *str)
+{
+ char *__tmp_str = NULL;
+ char *__basename_str = NULL;
+ __tmp_str = gf_strdup(str);
+ __basename_str = gf_strdup(basename(__tmp_str));
+ GF_FREE(__tmp_str);
+ return __basename_str;
+}
-ino64_t
-afr_itransform (ino64_t ino, int child_count, int child_index);
+call_frame_t *
+afr_copy_frame(call_frame_t *base);
int
-afr_deitransform (ino64_t ino, int child_count);
+afr_transaction_local_init(afr_local_t *local, xlator_t *this);
-void
-afr_local_cleanup (afr_local_t *local, xlator_t *this);
+int32_t
+afr_marker_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, afr_local_t *local, afr_private_t *priv);
int
-afr_frame_return (call_frame_t *frame);
+afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
-uint64_t
-afr_is_split_brain (xlator_t *this, inode_t *inode);
+int
+afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count);
+
+int
+afr_higher_errno(int32_t old_errno, int32_t new_errno);
+
+int
+afr_final_errno(afr_local_t *local, afr_private_t *priv);
+
+int
+afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req);
void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set);
+afr_fix_open(fd_t *fd, xlator_t *this);
+
+afr_fd_ctx_t *
+afr_fd_ctx_get(fd_t *fd, xlator_t *this);
+void
+afr_set_low_priority(call_frame_t *frame);
int
-afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, int32_t wbflags);
+afr_child_fd_ctx_set(xlator_t *this, fd_t *fd, int32_t child, int flags);
void
-afr_set_opendir_done (xlator_t *this, inode_t *inode);
+afr_matrix_cleanup(int32_t **pending, unsigned int m);
-uint64_t
-afr_is_opendir_done (xlator_t *this, inode_t *inode);
+int32_t **
+afr_matrix_create(unsigned int m, unsigned int n);
+
+int **
+afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending,
+ dict_t *xattr, ia_type_t iat);
void
-afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
+afr_filter_xattrs(dict_t *xattr);
-int
-afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
+/*
+ * Special value indicating we should use the "auto" quorum method instead of
+ * a fixed value (including zero to turn off quorum enforcement).
+ */
+#define AFR_QUORUM_AUTO INT_MAX
int
-afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd);
-
-#define AFR_STACK_UNWIND(fop, frame, params ...) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- if (frame) { \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- } \
- STACK_UNWIND_STRICT (fop, frame, params); \
- afr_local_cleanup (__local, __this); \
- GF_FREE (__local); \
- } while (0);
-
-#define AFR_STACK_DESTROY(frame) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- STACK_DESTROY (frame->root); \
- afr_local_cleanup (__local, __this); \
- GF_FREE (__local); \
- } while (0);
+afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local);
-/* allocate and return a string that is the basename of argument */
-static inline char *
-AFR_BASENAME (const char *str)
-{
- char *__tmp_str = NULL;
- char *__basename_str = NULL;
- __tmp_str = gf_strdup (str);
- __basename_str = gf_strdup (basename (__tmp_str));
- GF_FREE (__tmp_str);
- return __basename_str;
-}
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode);
-/* initialize local_t */
-static inline int
-AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
-{
- int child_up_count = 0;
+void
+afr_reply_wipe(struct afr_reply *reply);
- local->child_up = GF_CALLOC (sizeof (*local->child_up),
- priv->child_count,
- gf_afr_mt_char);
- if (!local->child_up) {
- return -ENOMEM;
- }
+void
+afr_replies_wipe(struct afr_reply *replies, int count);
- memcpy (local->child_up, priv->child_up,
- sizeof (*local->child_up) * priv->child_count);
+gf_boolean_t
+afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2);
- child_up_count = afr_up_children_count (priv->child_count, local->child_up);
+gf_boolean_t
+afr_is_xattr_ignorable(char *key);
- if (priv->optimistic_change_log && child_up_count == priv->child_count)
- local->optimistic_change_log = 1;
+int
+afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc);
- local->call_count = afr_up_children_count (priv->child_count, local->child_up);
- if (local->call_count == 0) {
- gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up");
- return -ENOTCONN;
- }
+int
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc);
- local->transaction.erase_pending = 1;
+int
+afr_get_split_brain_status(void *opaque);
- local->op_ret = -1;
- local->op_errno = EUCLEAN;
+int
+afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque);
- local->internal_lock.lock_op_ret = -1;
- local->internal_lock.lock_op_errno = EUCLEAN;
+int
+afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
+ int spb_choice);
+int
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol);
+int
+afr_get_child_index_from_name(xlator_t *this, char *name);
+int
+afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb);
+int
+afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode);
- return 0;
-}
+int
+afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque);
+gf_boolean_t
+afr_get_need_heal(xlator_t *this);
-/**
- * first_up_child - return the index of the first child that is up
- */
+void
+afr_set_need_heal(xlator_t *this, afr_local_t *local);
-static inline int
-afr_first_up_child (afr_private_t *priv)
-{
- xlator_t ** children = NULL;
- int ret = -1;
- int i = 0;
-
- LOCK (&priv->lock);
- {
- children = priv->children;
- for (i = 0; i < priv->child_count; i++) {
- if (priv->child_up[i]) {
- ret = i;
- break;
- }
- }
- }
- UNLOCK (&priv->lock);
-
- return ret;
-}
+int
+afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd);
+int
+afr_get_msg_id(char *op_type);
-static inline int
-afr_transaction_local_init (afr_local_t *local, afr_private_t *priv)
-{
- int i;
-
- local->first_up_child = afr_first_up_child (priv);
-
- local->child_errno = GF_CALLOC (sizeof (*local->child_errno),
- priv->child_count,
- gf_afr_mt_int32_t);
- if (!local->child_errno) {
- return -ENOMEM;
- }
-
- local->pending = GF_CALLOC (sizeof (*local->pending),
- priv->child_count,
- gf_afr_mt_int32_t);
-
- if (!local->pending) {
- return -ENOMEM;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]),
- 3, /* data + metadata + entry */
- gf_afr_mt_int32_t);
- if (!local->pending[i])
- return -ENOMEM;
- }
-
- local->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- local->internal_lock.entry_locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.entry_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- local->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- local->internal_lock.lower_locked_nodes
- = GF_CALLOC (sizeof (*local->internal_lock.lower_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- local->transaction.child_errno = GF_CALLOC (sizeof (*local->transaction.child_errno),
- priv->child_count,
- gf_afr_mt_int32_t);
-
- local->internal_lock.transaction_lk_type = AFR_TRANSACTION_LK;
-
- return 0;
-}
+int
+afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame,
+ inode_t *inode);
int32_t
-afr_marker_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv );
+afr_quorum_errno(afr_private_t *priv);
+
+gf_boolean_t
+afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
+ int32_t *op_errno);
+void
+afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret,
+ int32_t *op_errno);
void
-afr_get_fresh_children (int32_t *success_children, int32_t *sources,
- int32_t *fresh_children, unsigned int child_count);
+afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata);
void
-afr_fresh_children_add_child (int32_t *fresh_children, int32_t child,
- int32_t child_count);
+afr_process_post_writev(call_frame_t *frame, xlator_t *this);
+
+void
+afr_writev_unwind(call_frame_t *frame, xlator_t *this);
+
+void
+afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame);
+
void
-afr_reset_children (int32_t *fresh_children, int32_t child_count);
+afr_update_uninodelk(afr_local_t *local, afr_internal_lock_t *int_lock,
+ int32_t child_index);
+afr_fd_ctx_t *
+__afr_fd_ctx_get(fd_t *fd, xlator_t *this);
+
+gf_boolean_t
+afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1,
+ int event_gen2);
+
+int
+afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
+ char *buf, const char *default_str,
+ int32_t *serz_len, char delimiter);
gf_boolean_t
-afr_error_more_important (int32_t old_errno, int32_t new_errno);
+afr_is_symmetric_error(call_frame_t *frame, xlator_t *this);
+
+int
+__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx);
+
+uint64_t
+afr_write_subvol_get(call_frame_t *frame, xlator_t *this);
+
int
-afr_errno_count (int32_t *children, int *child_errno,
- unsigned int child_count, int32_t op_errno);
+afr_write_subvol_set(call_frame_t *frame, xlator_t *this);
+
int
-afr_get_children_count (int32_t *fresh_children, unsigned int child_count);
+afr_write_subvol_reset(call_frame_t *frame, xlator_t *this);
+
+int
+afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode);
+
+int
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop);
+
+int
+afr_ta_post_op_lock(xlator_t *this, loc_t *loc);
+
+int
+afr_ta_post_op_unlock(xlator_t *this, loc_t *loc);
+
gf_boolean_t
-afr_is_child_present (int32_t *success_children, int32_t child_count,
- int32_t child);
+afr_is_pending_set(xlator_t *this, dict_t *xdata, int type);
+
+int
+__afr_get_up_children_count(afr_private_t *priv);
+
+call_frame_t *
+afr_ta_frame_create(xlator_t *this);
+
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local);
+
void
-afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs,
- int32_t *success_children,
- unsigned int child_count);
+afr_ta_lock_release_synctask(xlator_t *this);
+
void
-afr_reset_xattr (dict_t **xattr, unsigned int child_count);
+afr_ta_locked_priv_invalidate(afr_private_t *priv);
+
gf_boolean_t
-afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
- unsigned int child_count, const char *path,
- const char *xlator_name);
-unsigned int
-afr_gfid_missing_count (const char *xlator_name, int32_t *children,
- struct iatt *bufs, unsigned int child_count,
- const char *path);
+afr_lookup_has_quorum(call_frame_t *frame,
+ const unsigned int up_children_count);
+
void
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path);
+afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this);
+
void
-afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count);
-afr_transaction_type
-afr_transaction_type_get (ia_type_t ia_type);
-int32_t
-afr_resultant_errno_get (int32_t *children,
- int *child_errno, unsigned int child_count);
-int32_t*
-afr_children_create (unsigned int child_count);
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv,
+ int child);
+
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv);
+
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata);
+
+void
+afr_dom_lock_release(call_frame_t *frame);
+
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies);
+
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid);
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c
deleted file mode 100644
index 0eb57ac2408..00000000000
--- a/xlators/cluster/afr/src/pump.c
+++ /dev/null
@@ -1,2536 +0,0 @@
-/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <unistd.h>
-#include <sys/time.h>
-#include <stdlib.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "afr-common.c"
-#include "defaults.c"
-
-uint64_t pump_pid = 0;
-static int
-pump_mark_start_pending (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- pump_priv->pump_start_pending = 1;
-
- return 0;
-}
-
-static int
-is_pump_start_pending (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- return (pump_priv->pump_start_pending);
-}
-
-static int
-pump_remove_start_pending (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- pump_priv->pump_start_pending = 0;
-
- return 0;
-}
-
-static pump_state_t
-pump_get_state ()
-{
- xlator_t *this = NULL;
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- pump_state_t ret;
-
- this = THIS;
- priv = this->private;
- pump_priv = priv->pump_private;
-
- LOCK (&pump_priv->pump_state_lock);
- {
- ret = pump_priv->pump_state;
- }
- UNLOCK (&pump_priv->pump_state_lock);
-
- return ret;
-}
-
-int
-pump_change_state (xlator_t *this, pump_state_t state)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- pump_state_t state_old;
- pump_state_t state_new;
-
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (pump_priv);
-
- LOCK (&pump_priv->pump_state_lock);
- {
- state_old = pump_priv->pump_state;
- state_new = state;
-
- pump_priv->pump_state = state;
-
- }
- UNLOCK (&pump_priv->pump_state_lock);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Pump changing state from %d to %d",
- state_old,
- state_new);
-
- return 0;
-}
-
-static int
-pump_set_resume_path (xlator_t *this, const char *path)
-{
- int ret = 0;
-
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (pump_priv);
-
- LOCK (&pump_priv->resume_path_lock);
- {
- pump_priv->resume_path = strdup (path);
- if (!pump_priv->resume_path)
- ret = -1;
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- return ret;
-}
-
-static int
-pump_save_path (xlator_t *this, const char *path)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
- pump_state_t state;
- dict_t *dict = NULL;
- loc_t loc = {0};
- int dict_ret = 0;
- int ret = -1;
-
- state = pump_get_state ();
- if (state == PUMP_STATE_RESUME)
- return 0;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (priv->root_inode);
-
- build_root_loc (priv->root_inode, &loc);
-
- dict = dict_new ();
- dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path);
-
- ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setxattr failed - could not save path=%s", path);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "setxattr succeeded - saved path=%s", path);
- gf_log (this->name, GF_LOG_DEBUG,
- "Saving path for status info");
- }
-
- dict_unref (dict);
-
- return 0;
-}
-
-static int
-pump_check_and_update_status (xlator_t *this)
-{
- pump_state_t state;
- int ret = -1;
-
- state = pump_get_state ();
-
- switch (state) {
-
- case PUMP_STATE_RESUME:
- case PUMP_STATE_RUNNING:
- {
- ret = 0;
- break;
- }
- case PUMP_STATE_PAUSE:
- {
- ret = -1;
- break;
- }
- case PUMP_STATE_ABORT:
- {
- pump_save_path (this, "/");
- ret = -1;
- break;
- }
- default:
- {
- gf_log (this->name, GF_LOG_DEBUG,
- "Unknown pump state");
- ret = -1;
- break;
- }
-
- }
-
- return ret;
-}
-
-static const char *
-pump_get_resume_path (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- const char *resume_path = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- resume_path = pump_priv->resume_path;
-
- return resume_path;
-}
-
-static int
-pump_update_resume_state (xlator_t *this, const char *path)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- pump_state_t state;
- const char *resume_path = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- state = pump_get_state ();
-
- if (state == PUMP_STATE_RESUME) {
- resume_path = pump_get_resume_path (this);
- if (strcmp (resume_path, "/") == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reached the resume path (/). Proceeding to change state"
- " to running");
- pump_change_state (this, PUMP_STATE_RUNNING);
- } else if (strcmp (resume_path, path) == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reached the resume path. Proceeding to change state"
- " to running");
- pump_change_state (this, PUMP_STATE_RUNNING);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not yet hit the resume path:res-path=%s,path=%s",
- resume_path, path);
- }
- }
-
- return 0;
-}
-
-static gf_boolean_t
-is_pump_traversal_allowed (xlator_t *this, const char *path)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- pump_state_t state;
- const char *resume_path = NULL;
- gf_boolean_t ret = _gf_true;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- state = pump_get_state ();
-
- if (state == PUMP_STATE_RESUME) {
- resume_path = pump_get_resume_path (this);
- if (strstr (resume_path, path)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "On the right path to resumption path");
- ret = _gf_true;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not the right path to resuming=> ignoring traverse");
- ret = _gf_false;
- }
- }
-
- return ret;
-}
-
-static int
-pump_save_file_stats (xlator_t *this, const char *path)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- LOCK (&pump_priv->resume_path_lock);
- {
- pump_priv->number_files_pumped++;
-
- strncpy (pump_priv->current_file, path,
- PATH_MAX);
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- return 0;
-}
-
-static void
-_generate_gfid_on_empty (uuid_t gfid)
-{
- if (uuid_is_null (gfid))
- uuid_generate (gfid);
-}
-
-static void
-_empty_gfid_on_set (uuid_t gfid, int lookup_status, struct iatt *iatt)
-{
- if (lookup_status || !uuid_compare (gfid, iatt->ia_gfid))
- uuid_clear (gfid);
-}
-
-static void
-_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent)
-{
- afr_update_loc_gfids (loc, iatt, parent);
- uuid_copy (loc->inode->gfid, iatt->ia_gfid);
- loc->ino = iatt->ia_ino;
- loc->inode->ino = iatt->ia_ino;
-}
-
-static void
-_wipe_loc (loc_t *loc)
-{
- loc_wipe (loc);
- uuid_clear (loc->gfid);
- uuid_clear (loc->pargfid);
- loc->ino = 0;
-}
-
-static int
-gf_pump_traverse_directory (loc_t *loc, uuid_t gfid)
-{
- xlator_t *this = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- off_t offset = 0;
- loc_t entry_loc = {0};
- gf_dirent_t *entry = NULL;
- gf_dirent_t *tmp = NULL;
- gf_dirent_t entries;
- struct iatt iatt = {0};
- struct iatt parent = {0};
- dict_t *xattr_rsp = NULL;
- int ret = 0;
- gf_boolean_t is_directory_empty = _gf_true;
- dict_t *xattr_req = NULL;
- gf_boolean_t free_entries = _gf_false;
-
- xattr_req = dict_new ();
- if (!xattr_req) {
- ret = -1;
- goto out;
- }
-
- INIT_LIST_HEAD (&entries.list);
- this = THIS;
- priv = this->private;
-
- GF_ASSERT (loc->inode);
-
- fd = fd_create (loc->inode, pump_pid);
- if (!fd) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to create fd for %s", loc->path);
- goto out;
- }
-
- ret = syncop_opendir (this, loc, fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "opendir failed on %s", loc->path);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "pump opendir on %s returned=%d",
- loc->path, ret);
-
- while (syncop_readdirp (this, fd, 131072, offset, &entries)) {
- free_entries = _gf_true;
-
- if (list_empty (&entries.list)) {
- gf_log (this->name, GF_LOG_TRACE,
- "no more entries in directory");
- goto out;
- }
-
- list_for_each_entry_safe (entry, tmp, &entries.list, list) {
- gf_log (this->name, GF_LOG_DEBUG,
- "found readdir entry=%s", entry->d_name);
-
- _wipe_loc (&entry_loc);
- ret = afr_build_child_loc (this, &entry_loc,
- loc, entry->d_name);
- if (ret)
- goto out;
- if (!IS_ENTRY_CWD (entry->d_name) &&
- !IS_ENTRY_PARENT (entry->d_name)) {
-
- is_directory_empty = _gf_false;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup %s => %"PRId64,
- entry_loc.path,
- iatt.ia_ino);
-
- _generate_gfid_on_empty (gfid);
- ret = dict_reset (xattr_req);
- if (ret)
- goto out;
- ret = afr_set_dict_gfid (xattr_req, gfid);
- if (ret)
- goto out;
- ret = syncop_lookup (this, &entry_loc, xattr_req,
- &iatt, &xattr_rsp, &parent);
-
- _empty_gfid_on_set (gfid, ret, &iatt);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: lookup failed",
- entry_loc.path);
- continue;
- }
- _fill_loc_info (&entry_loc, &iatt, &parent);
-
- pump_update_resume_state (this, entry_loc.path);
-
- pump_save_path (this, entry_loc.path);
- pump_save_file_stats (this, entry_loc.path);
-
- ret = pump_check_and_update_status (this);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Pump beginning to exit out");
- goto out;
- }
-
- if (IA_ISDIR (iatt.ia_type)) {
- if (is_pump_traversal_allowed (this, entry_loc.path)) {
- gf_log (this->name, GF_LOG_TRACE,
- "entering dir=%s",
- entry->d_name);
- gf_pump_traverse_directory (&entry_loc, gfid);
- }
- }
- }
- offset = entry->d_off;
- }
-
- gf_dirent_free (&entries);
- free_entries = _gf_false;
- gf_log (this->name, GF_LOG_TRACE,
- "offset incremented to %d",
- (int32_t ) offset);
-
- }
-
- if (is_directory_empty && IS_ROOT_PATH (loc->path)) {
- pump_change_state (this, PUMP_STATE_RUNNING);
- gf_log (this->name, GF_LOG_INFO, "Empty source brick. "
- "Nothing to be done.");
- }
-
-out:
- if (xattr_req)
- dict_unref (xattr_req);
- if (entry_loc.path)
- loc_wipe (&entry_loc);
- if (free_entries)
- gf_dirent_free (&entries);
- return 0;
-
-}
-
-void
-build_root_loc (inode_t *inode, loc_t *loc)
-{
- loc->path = "/";
- loc->name = "";
- loc->inode = inode;
- loc->ino = 1;
- loc->inode->ino = 1;
- memset (loc->inode->gfid, 0, 16);
- loc->inode->gfid[15] = 1;
-
-}
-
-static int
-pump_update_resume_path (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- const char *resume_path = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- resume_path = pump_get_resume_path (this);
-
- if (resume_path) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Found a path to resume from: %s",
- resume_path);
-
- }else {
- gf_log (this->name, GF_LOG_DEBUG,
- "Did not find a path=> setting to '/'");
- pump_set_resume_path (this, "/");
- }
-
- pump_change_state (this, PUMP_STATE_RESUME);
-
- return 0;
-}
-
-static int
-pump_complete_migration (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
- dict_t *dict = NULL;
- pump_state_t state;
- loc_t loc = {0};
- int dict_ret = 0;
- int ret = -1;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (priv->root_inode);
-
- build_root_loc (priv->root_inode, &loc);
-
- dict = dict_new ();
-
- state = pump_get_state ();
- if (state == PUMP_STATE_RUNNING) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Pump finished pumping");
-
- pump_priv->pump_finished = _gf_true;
-
- dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon");
-
- ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setxattr failed - while notifying source complete");
- }
- dict_ret = dict_set_str (dict, PUMP_SINK_COMPLETE, "jargon");
-
- ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setxattr failed - while notifying sink complete");
- }
-
- pump_save_path (this, "/");
- }
-
- return 0;
-}
-
-static int
-pump_set_root_gfid (dict_t *dict)
-{
- uuid_t gfid;
- int ret = 0;
-
- memset (gfid, 0, 16);
- gfid[15] = 1;
-
- ret = afr_set_dict_gfid (dict, gfid);
-
- return ret;
-}
-
-static int
-pump_lookup_sink (loc_t *loc)
-{
- xlator_t *this = NULL;
- struct iatt iatt, parent;
- dict_t *xattr_rsp;
- dict_t *xattr_req = NULL;
- int ret = 0;
-
- this = THIS;
-
- xattr_req = dict_new ();
-
- ret = pump_set_root_gfid (xattr_req);
- if (ret)
- goto out;
-
- ret = syncop_lookup (PUMP_SINK_CHILD (this), loc,
- xattr_req, &iatt, &xattr_rsp, &parent);
-
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Lookup on sink child failed");
- goto out;
- }
-
-out:
- if (xattr_req)
- dict_unref (xattr_req);
-
- return ret;
-}
-
-static int
-pump_task (void *data)
-{
- xlator_t *this = NULL;
- afr_private_t *priv = NULL;
-
-
- loc_t loc = {0};
- struct iatt iatt, parent;
- dict_t *xattr_rsp = NULL;
- dict_t *xattr_req = NULL;
- uuid_t gfid = {0};
-
- int ret = -1;
-
- this = THIS;
- priv = this->private;
-
- GF_ASSERT (priv->root_inode);
-
- build_root_loc (priv->root_inode, &loc);
- xattr_req = dict_new ();
- if (!xattr_req) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- pump_set_root_gfid (xattr_req);
- ret = syncop_lookup (this, &loc, xattr_req,
- &iatt, &xattr_rsp, &parent);
-
- gf_log (this->name, GF_LOG_TRACE,
- "lookup: ino=%"PRId64", path=%s",
- loc.ino,
- loc.path);
-
- ret = pump_check_and_update_status (this);
- if (ret < 0) {
- goto out;
- }
-
- pump_update_resume_path (this);
-
- pump_set_root_gfid (xattr_req);
- ret = pump_lookup_sink (&loc);
- if (ret) {
- pump_update_resume_path (this);
- goto out;
- }
-
- gf_pump_traverse_directory (&loc, gfid);
-
- pump_complete_migration (this);
-out:
- if (xattr_req)
- dict_unref (xattr_req);
-
- return 0;
-}
-
-
-static int
-pump_task_completion (int ret, void *data)
-{
- xlator_t *this = NULL;
- call_frame_t *frame = NULL;
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- this = THIS;
-
- frame = (call_frame_t *) data;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- inode_unref (priv->root_inode);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Pump xlator exiting");
- return 0;
-}
-
-int
-pump_start (call_frame_t *pump_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- int ret = -1;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- pump_frame->root->lk_owner = (uint64_t) (unsigned long)pump_frame->root;
- pump_pid = (uint64_t) (unsigned long)pump_frame->root;
-
- ret = synctask_new (pump_priv->env, pump_task,
- pump_task_completion,
- pump_frame);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "starting pump failed");
- pump_change_state (this, PUMP_STATE_ABORT);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "setting pump as started");
-
- priv->use_afr_in_pump = 1;
-out:
- return ret;
-}
-
-static int
-pump_start_synctask (xlator_t *this)
-{
- call_frame_t *frame = NULL;
- int ret = 0;
-
- frame = create_frame (this, this->ctx->pool);
- if (!frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- pump_change_state (this, PUMP_STATE_RUNNING);
-
- ret = pump_start (frame, this);
-
-out:
- return ret;
-}
-
-int32_t
-pump_cmd_start_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-
-{
- call_frame_t *prev = NULL;
- afr_local_t *local = NULL;
- int ret = 0;
-
- local = frame->local;
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not initiate destination "
- "brick connect");
- ret = op_ret;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Successfully initiated destination "
- "brick connect");
-
- pump_mark_start_pending (this);
-
- /* send the PARENT_UP as pump is ready now */
- prev = cookie;
- if (prev && prev->this)
- prev->this->notify (prev->this, GF_EVENT_PARENT_UP, this);
-
-out:
- local->op_ret = ret;
- pump_command_reply (frame, this);
-
- return 0;
-}
-
-static int
-pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t *dict = NULL;
- data_t *data = NULL;
- char *clnt_cmd = NULL;
- loc_t loc = {0};
-
- int ret = 0;
-
- priv = this->private;
- local = frame->local;
-
- GF_ASSERT (priv->root_inode);
-
- build_root_loc (priv->root_inode, &loc);
-
- data = data_ref (dict_get (local->dict, PUMP_CMD_START));
- if (!data) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not get destination brick value");
- goto out;
- }
-
- dict = dict_new ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- clnt_cmd = GF_CALLOC (1, data->len+1, gf_common_mt_char);
- if (!clnt_cmd) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto out;
- }
-
- memcpy (clnt_cmd, data->data, data->len);
- clnt_cmd[data->len] = '\0';
- gf_log (this->name, GF_LOG_DEBUG, "Got destination brick %s\n",
- clnt_cmd);
-
- ret = dict_set_dynstr (dict, CLIENT_CMD_CONNECT, clnt_cmd);
- if (ret < 0) {
- GF_FREE (clnt_cmd);
- gf_log (this->name, GF_LOG_ERROR,
- "Could not inititiate destination brick "
- "connect");
- goto out;
- }
-
- STACK_WIND (frame,
- pump_cmd_start_setxattr_cbk,
- PUMP_SINK_CHILD(this),
- PUMP_SINK_CHILD(this)->fops->setxattr,
- &loc,
- dict,
- 0);
-
- ret = 0;
-
- dict_unref (dict);
-out:
- if (data)
- data_unref (data);
- return ret;
-}
-
-static int
-is_pump_aborted (xlator_t *this)
-{
- pump_state_t state;
-
- state = pump_get_state ();
-
- return ((state == PUMP_STATE_ABORT));
-}
-
-int32_t
-pump_cmd_start_getxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *dict)
-{
- afr_local_t *local = NULL;
- char *path = NULL;
-
- pump_state_t state;
- int ret = 0;
- int need_unwind = 0;
- int dict_ret = -1;
-
- local = frame->local;
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "getxattr failed - changing pump "
- "state to RUNNING with '/'");
- path = "/";
- ret = op_ret;
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "getxattr succeeded");
-
- dict_ret = dict_get_str (dict, PUMP_PATH, &path);
- if (dict_ret < 0)
- path = "/";
- }
-
- state = pump_get_state ();
- if ((state == PUMP_STATE_RUNNING) ||
- (state == PUMP_STATE_RESUME)) {
- gf_log (this->name, GF_LOG_ERROR,
- "Pump is already started");
- ret = -1;
- goto out;
- }
-
- pump_set_resume_path (this, path);
-
- if (is_pump_aborted (this))
- /* We're re-starting pump afresh */
- ret = pump_initiate_sink_connect (frame, this);
- else {
- /* We're re-starting pump from a previous
- pause */
- gf_log (this->name, GF_LOG_DEBUG,
- "about to start synctask");
- ret = pump_start_synctask (this);
- need_unwind = 1;
- }
-
-out:
- if ((ret < 0) || (need_unwind == 1)) {
- local->op_ret = ret;
- pump_command_reply (frame, this);
- }
- return 0;
-}
-
-int
-pump_execute_status (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- uint64_t number_files = 0;
-
- char filename[PATH_MAX];
- char *dict_str = NULL;
-
- int32_t op_ret = 0;
- int32_t op_errno = 0;
-
- dict_t *dict = NULL;
- int ret = -1;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- LOCK (&pump_priv->resume_path_lock);
- {
- number_files = pump_priv->number_files_pumped;
- strncpy (filename, pump_priv->current_file, PATH_MAX);
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- dict_str = GF_CALLOC (1, PATH_MAX + 256, gf_afr_mt_char);
- if (!dict_str) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- if (pump_priv->pump_finished) {
- snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Migration complete ",
- number_files);
- } else {
- snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Current file= %s ",
- number_files, filename);
- }
-
- dict = dict_new ();
-
- ret = dict_set_dynptr (dict, PUMP_CMD_STATUS, dict_str, PATH_MAX + 256);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "dict_set_dynptr returned negative value");
- }
-
- op_ret = 0;
-
-out:
-
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict);
-
- dict_unref (dict);
-
- return 0;
-}
-
-int
-pump_execute_pause (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- pump_change_state (this, PUMP_STATE_PAUSE);
-
- local->op_ret = 0;
- pump_command_reply (frame, this);
-
- return 0;
-}
-
-int
-pump_execute_start (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = 0;
- loc_t loc = {0};
-
- priv = this->private;
- local = frame->local;
-
- if (!priv->root_inode) {
- gf_log (this->name, GF_LOG_ERROR,
- "Pump xlator cannot be started without an initial "
- "lookup");
- ret = -1;
- goto out;
- }
-
- GF_ASSERT (priv->root_inode);
-
- build_root_loc (priv->root_inode, &loc);
-
- STACK_WIND (frame,
- pump_cmd_start_getxattr_cbk,
- PUMP_SOURCE_CHILD(this),
- PUMP_SOURCE_CHILD(this)->fops->getxattr,
- &loc,
- PUMP_PATH);
-
- ret = 0;
-
-out:
- if (ret < 0) {
- local->op_ret = ret;
- pump_command_reply (frame, this);
- }
-
- return 0;
-}
-
-int
-pump_execute_abort (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
- local = frame->local;
-
- pump_change_state (this, PUMP_STATE_ABORT);
-
- LOCK (&pump_priv->resume_path_lock);
- {
- pump_priv->number_files_pumped = 0;
- pump_priv->current_file[0] = '\0';
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- local->op_ret = 0;
- pump_command_reply (frame, this);
-
- return 0;
-}
-
-gf_boolean_t
-pump_command_status (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, PUMP_CMD_STATUS, &cmd);
- if (dict_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not a pump status command");
- ret = _gf_false;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Hit a pump command - status");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-gf_boolean_t
-pump_command_pause (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, PUMP_CMD_PAUSE, &cmd);
- if (dict_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not a pump pause command");
- ret = _gf_false;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Hit a pump command - pause");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-gf_boolean_t
-pump_command_abort (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, PUMP_CMD_ABORT, &cmd);
- if (dict_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not a pump abort command");
- ret = _gf_false;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Hit a pump command - abort");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-gf_boolean_t
-pump_command_start (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, PUMP_CMD_START, &cmd);
- if (dict_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not a pump start command");
- ret = _gf_false;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Hit a pump command - start");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-struct _xattr_key {
- char *key;
- struct list_head list;
-};
-
-static void
-__gather_xattr_keys (dict_t *dict, char *key, data_t *value,
- void *data)
-{
- struct list_head * list = data;
- struct _xattr_key * xkey = NULL;
-
- if (!strncmp (key, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
-
- xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key);
- if (!xkey)
- return;
-
- xkey->key = key;
- INIT_LIST_HEAD (&xkey->list);
-
- list_add_tail (&xkey->list, list);
- }
-}
-
-static void
-__filter_xattrs (dict_t *dict)
-{
- struct list_head keys;
-
- struct _xattr_key *key;
- struct _xattr_key *tmp;
-
- INIT_LIST_HEAD (&keys);
-
- dict_foreach (dict, __gather_xattr_keys,
- (void *) &keys);
-
- list_for_each_entry_safe (key, tmp, &keys, list) {
- dict_del (dict, key->key);
-
- list_del_init (&key->list);
-
- GF_FREE (key);
- }
-}
-
-int32_t
-pump_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
-
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
-
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.getxattr.last_tried;
-
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.getxattr.last_tried;
-
- if (this_try == read_child) {
- goto retry;
- }
-
- unwind = 0;
- STACK_WIND_COOKIE (frame, pump_getxattr_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->getxattr,
- &local->loc,
- local->cont.getxattr.name);
- }
-
-out:
- if (unwind) {
- if (op_ret >= 0 && dict)
- __filter_xattrs (dict);
-
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict);
- }
-
- return 0;
-}
-
-int32_t
-pump_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
-{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t * local = NULL;
-
- int read_child = -1;
-
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- frame->local = local;
-
- if (name) {
- if (!strncmp (name, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
-
- op_errno = ENODATA;
- goto out;
- }
-
- if (!strcmp (name, PUMP_CMD_STATUS)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Hit pump command - status");
- pump_execute_status (frame, this);
- op_ret = 0;
- goto out;
- }
- }
-
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_getxattr_cbk,
- FIRST_CHILD (this),
- (FIRST_CHILD (this))->fops->getxattr,
- loc, name);
- return 0;
- }
-
- read_child = afr_read_child (this, loc->inode);
-
- if (read_child >= 0) {
- call_child = read_child;
-
- local->cont.getxattr.last_tried = -1;
- } else {
- call_child = afr_first_up_child (priv);
-
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_DEBUG,
- "no child is up");
- goto out;
- }
-
- local->cont.getxattr.last_tried = call_child;
- }
-
- loc_copy (&local->loc, loc);
- if (name)
- local->cont.getxattr.name = gf_strdup (name);
-
- STACK_WIND_COOKIE (frame, pump_getxattr_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->getxattr,
- loc, name);
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL);
- }
- return 0;
-}
-
-static int
-afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (setxattr, main_frame,
- local->op_ret, local->op_errno)
- }
- return 0;
-}
-
-static int
-afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
-}
-
-static int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc,
- local->cont.setxattr.dict,
- local->cont.setxattr.flags);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-static int
-afr_setxattr_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int32_t
-pump_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame,
- op_ret,
- op_errno);
- return 0;
-}
-
-int
-pump_command_reply (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- if (local->op_ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "Command failed");
- else
- gf_log (this->name, GF_LOG_INFO,
- "Command succeeded");
-
- dict_unref (local->dict);
-
- AFR_STACK_UNWIND (setxattr,
- frame,
- local->op_ret,
- local->op_errno);
-
- return 0;
-}
-
-int
-pump_parse_command (call_frame_t *frame, xlator_t *this,
- afr_local_t *local, dict_t *dict)
-{
-
- int ret = -1;
-
- if (pump_command_start (this, dict)) {
- frame->local = local;
- local->dict = dict_ref (dict);
- ret = pump_execute_start (frame, this);
-
- } else if (pump_command_pause (this, dict)) {
- frame->local = local;
- local->dict = dict_ref (dict);
- ret = pump_execute_pause (frame, this);
-
- } else if (pump_command_abort (this, dict)) {
- frame->local = local;
- local->dict = dict_ref (dict);
- ret = pump_execute_abort (frame, this);
- }
- return ret;
-}
-
-int
-pump_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
-
- int ret = -1;
-
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- ret = pump_parse_command (frame, this,
- local, dict);
- if (ret >= 0) {
- op_ret = 0;
- goto out;
- }
-
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_setxattr_cbk,
- FIRST_CHILD (this),
- (FIRST_CHILD (this))->fops->setxattr,
- loc, dict, flags);
- return 0;
- }
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- transaction_frame->local = local;
-
- local->op_ret = -1;
-
- local->cont.setxattr.dict = dict_ref (dict);
- local->cont.setxattr.flags = flags;
-
- local->transaction.fop = afr_setxattr_wind;
- local->transaction.done = afr_setxattr_done;
- local->transaction.unwind = afr_setxattr_unwind;
-
- loc_copy (&local->loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
-
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno);
- }
-
- return 0;
-}
-
-/* Defaults */
-static int32_t
-pump_lookup (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *xattr_req)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_lookup_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lookup,
- loc,
- xattr_req);
- return 0;
- }
-
- afr_lookup (frame, this, loc, xattr_req);
- return 0;
-}
-
-
-static int32_t
-pump_truncate (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- off_t offset)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- loc,
- offset);
- return 0;
- }
-
- afr_truncate (frame, this, loc, offset);
- return 0;
-}
-
-
-static int32_t
-pump_ftruncate (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- off_t offset)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_ftruncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate,
- fd,
- offset);
- return 0;
- }
-
- afr_ftruncate (frame, this, fd, offset);
- return 0;
-}
-
-
-
-
-int
-pump_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t rdev, dict_t *parms)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_mknod_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod,
- loc, mode, rdev, parms);
- return 0;
- }
- afr_mknod (frame, this, loc, mode, rdev, parms);
- return 0;
-
-}
-
-
-
-int
-pump_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dict_t *params)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_mkdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mkdir,
- loc, mode, params);
- return 0;
- }
- afr_mkdir (frame, this, loc, mode, params);
- return 0;
-
-}
-
-
-static int32_t
-pump_unlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_unlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink,
- loc);
- return 0;
- }
- afr_unlink (frame, this, loc);
- return 0;
-
-}
-
-
-static int
-pump_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags)
-{
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_rmdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rmdir,
- loc, flags);
- return 0;
- }
-
- afr_rmdir (frame, this, loc, flags);
- return 0;
-
-}
-
-
-
-int
-pump_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *loc, dict_t *params)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_symlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->symlink,
- linkpath, loc, params);
- return 0;
- }
- afr_symlink (frame, this, linkpath, loc, params);
- return 0;
-
-}
-
-
-static int32_t
-pump_rename (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_rename_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename,
- oldloc, newloc);
- return 0;
- }
- afr_rename (frame, this, oldloc, newloc);
- return 0;
-
-}
-
-
-static int32_t
-pump_link (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_link_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->link,
- oldloc, newloc);
- return 0;
- }
- afr_link (frame, this, oldloc, newloc);
- return 0;
-
-}
-
-
-static int32_t
-pump_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd, dict_t *params)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_create_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create,
- loc, flags, mode, fd, params);
- return 0;
- }
- afr_create (frame, this, loc, flags, mode, fd, params);
- return 0;
-
-}
-
-
-static int32_t
-pump_open (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags, fd_t *fd,
- int32_t wbflags)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_open_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open,
- loc, flags, fd, wbflags);
- return 0;
- }
- afr_open (frame, this, loc, flags, fd, wbflags);
- return 0;
-
-}
-
-
-static int32_t
-pump_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t off,
- struct iobref *iobref)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_writev_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev,
- fd,
- vector,
- count,
- off,
- iobref);
- return 0;
- }
- afr_writev (frame, this, fd, vector, count, off, iobref);
- return 0;
-
-}
-
-
-static int32_t
-pump_flush (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_flush_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd);
- return 0;
- }
- afr_flush (frame, this, fd);
- return 0;
-
-}
-
-
-static int32_t
-pump_fsync (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fsync_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsync,
- fd,
- flags);
- return 0;
- }
- afr_fsync (frame, this, fd, flags);
- return 0;
-
-}
-
-
-static int32_t
-pump_opendir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc, fd_t *fd)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_opendir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->opendir,
- loc, fd);
- return 0;
- }
- afr_opendir (frame, this, loc, fd);
- return 0;
-
-}
-
-
-static int32_t
-pump_fsyncdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fsyncdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsyncdir,
- fd,
- flags);
- return 0;
- }
- afr_fsyncdir (frame, this, fd, flags);
- return 0;
-
-}
-
-
-static int32_t
-pump_xattrop (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- gf_xattrop_flags_t flags,
- dict_t *dict)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_xattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->xattrop,
- loc,
- flags,
- dict);
- return 0;
- }
- afr_xattrop (frame, this, loc, flags, dict);
- return 0;
-
-}
-
-static int32_t
-pump_fxattrop (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- gf_xattrop_flags_t flags,
- dict_t *dict)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fxattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fxattrop,
- fd,
- flags,
- dict);
- return 0;
- }
- afr_fxattrop (frame, this, fd, flags, dict);
- return 0;
-
-}
-
-
-static int32_t
-pump_removexattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_removexattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr,
- loc,
- name);
- return 0;
- }
- afr_removexattr (frame, this, loc, name);
- return 0;
-
-}
-
-
-
-static int32_t
-pump_readdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t off)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_readdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdir,
- fd, size, off);
- return 0;
- }
- afr_readdir (frame, this, fd, size, off);
- return 0;
-
-}
-
-
-static int32_t
-pump_readdirp (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t off)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_readdirp_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdirp,
- fd, size, off);
- return 0;
- }
- afr_readdirp (frame, this, fd, size, off);
- return 0;
-
-}
-
-
-
-static int32_t
-pump_releasedir (xlator_t *this,
- fd_t *fd)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (priv->use_afr_in_pump)
- afr_releasedir (this, fd);
- return 0;
-
-}
-
-static int32_t
-pump_release (xlator_t *this,
- fd_t *fd)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (priv->use_afr_in_pump)
- afr_release (this, fd);
- return 0;
-
-}
-
-
-static int32_t
-pump_setattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- struct iatt *stbuf,
- int32_t valid)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_setattr_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->setattr,
- loc, stbuf, valid);
- return 0;
- }
- afr_setattr (frame, this, loc, stbuf, valid);
- return 0;
-
-}
-
-
-static int32_t
-pump_fsetattr (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iatt *stbuf,
- int32_t valid)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fsetattr_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetattr,
- fd, stbuf, valid);
- return 0;
- }
- afr_fsetattr (frame, this, fd, stbuf, valid);
- return 0;
-
-}
-
-
-/* End of defaults */
-
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1);
-
- if (ret != 0) {
- gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-
- return ret;
-}
-
-static int
-is_xlator_pump_sink (xlator_t *child)
-{
- return (child == PUMP_SINK_CHILD(THIS));
-}
-
-static int
-is_xlator_pump_source (xlator_t *child)
-{
- return (child == PUMP_SOURCE_CHILD(THIS));
-}
-
-int32_t
-notify (xlator_t *this, int32_t event,
- void *data, ...)
-{
- int ret = -1;
- xlator_t *child_xl = NULL;
-
- child_xl = (xlator_t *) data;
-
- ret = afr_notify (this, event, data);
-
- switch (event) {
- case GF_EVENT_CHILD_DOWN:
- if (is_xlator_pump_source (child_xl))
- pump_change_state (this, PUMP_STATE_ABORT);
- break;
-
- case GF_EVENT_CHILD_UP:
- if (is_xlator_pump_sink (child_xl))
- if (is_pump_start_pending (this)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "about to start synctask");
- ret = pump_start_synctask (this);
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "Could not start pump "
- "synctask");
- else
- pump_remove_start_pending (this);
- }
- }
-
- return ret;
-}
-
-int32_t
-init (xlator_t *this)
-{
- afr_private_t * priv = NULL;
- pump_private_t *pump_priv = NULL;
- int child_count = 0;
- xlator_list_t * trav = NULL;
- int i = 0;
- int ret = -1;
- int op_errno = 0;
-
- int source_child = 0;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "pump translator needs a source and sink"
- "subvolumes defined.");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "Volume is dangling.");
- }
-
- ALLOC_OR_GOTO (this->private, afr_private_t, out);
-
- priv = this->private;
-
- priv->read_child = source_child;
- priv->favorite_child = source_child;
- priv->background_self_heal_count = 0;
-
- priv->data_self_heal = 1;
- priv->metadata_self_heal = 1;
- priv->entry_self_heal = 1;
-
- priv->data_self_heal_algorithm = "";
-
- priv->data_self_heal_window_size = 16;
-
- priv->data_change_log = 1;
- priv->metadata_change_log = 1;
- priv->entry_change_log = 1;
- priv->use_afr_in_pump = 1;
-
- /* Locking options */
-
- /* Lock server count infact does not matter. Locks are held
- on all subvolumes, in this case being the source
- and the sink.
- */
-
- priv->data_lock_server_count = 2;
- priv->metadata_lock_server_count = 2;
- priv->entry_lock_server_count = 2;
-
- priv->strict_readdir = _gf_false;
-
- trav = this->children;
- while (trav) {
- child_count++;
- trav = trav->next;
- }
-
- priv->wait_count = 1;
-
- if (child_count != 2) {
- gf_log (this->name, GF_LOG_ERROR,
- "There should be exactly 2 children - one source "
- "and one sink");
- return -1;
- }
- priv->child_count = child_count;
-
- LOCK_INIT (&priv->lock);
- LOCK_INIT (&priv->read_child_lock);
-
- priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
- gf_afr_mt_char);
- if (!priv->child_up) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- op_errno = ENOMEM;
- goto out;
- }
-
- priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
- gf_afr_mt_xlator_t);
- if (!priv->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- op_errno = ENOMEM;
- goto out;
- }
-
- priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key),
- child_count,
- gf_afr_mt_char);
- if (!priv->pending_key) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- op_errno = ENOMEM;
- goto out;
- }
-
- trav = this->children;
- i = 0;
- while (i < child_count) {
- priv->children[i] = trav->xlator;
-
- ret = gf_asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
- trav->xlator->name);
- if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed to set pending key");
- op_errno = ENOMEM;
- goto out;
- }
-
- trav = trav->next;
- i++;
- }
-
- priv->first_lookup = 1;
- priv->root_inode = NULL;
-
- priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
- gf_afr_mt_int32_t);
- if (!priv->last_event) {
- ret = -ENOMEM;
- goto out;
- }
-
- pump_priv = GF_CALLOC (1, sizeof (*pump_priv),
- gf_afr_mt_pump_priv);
- if (!pump_priv) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto out;
- }
-
- LOCK_INIT (&pump_priv->resume_path_lock);
- LOCK_INIT (&pump_priv->pump_state_lock);
-
- pump_priv->resume_path = GF_CALLOC (1, PATH_MAX,
- gf_afr_mt_char);
- if (!pump_priv->resume_path) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- pump_priv->env = syncenv_new (0);
- if (!pump_priv->env) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not create new sync-environment");
- ret = -1;
- goto out;
- }
-
- priv->pump_private = pump_priv;
-
- pthread_mutex_init (&priv->mutex, NULL);
- INIT_LIST_HEAD (&priv->saved_fds);
-
- pump_change_state (this, PUMP_STATE_ABORT);
-
- ret = 0;
-out:
- return ret;
-}
-
-int
-fini (xlator_t *this)
-{
- return 0;
-}
-
-
-struct xlator_fops fops = {
- .lookup = pump_lookup,
- .open = pump_open,
- .flush = pump_flush,
- .fsync = pump_fsync,
- .fsyncdir = pump_fsyncdir,
- .xattrop = pump_xattrop,
- .fxattrop = pump_fxattrop,
- .getxattr = pump_getxattr,
-
- /* inode write */
- .writev = pump_writev,
- .truncate = pump_truncate,
- .ftruncate = pump_ftruncate,
- .setxattr = pump_setxattr,
- .setattr = pump_setattr,
- .fsetattr = pump_fsetattr,
- .removexattr = pump_removexattr,
-
- /* dir read */
- .opendir = pump_opendir,
- .readdir = pump_readdir,
- .readdirp = pump_readdirp,
-
- /* dir write */
- .create = pump_create,
- .mknod = pump_mknod,
- .mkdir = pump_mkdir,
- .unlink = pump_unlink,
- .rmdir = pump_rmdir,
- .link = pump_link,
- .symlink = pump_symlink,
- .rename = pump_rename,
-};
-
-struct xlator_dumpops dumpops = {
- .priv = afr_priv_dump,
-};
-
-
-struct xlator_cbks cbks = {
- .release = pump_release,
- .releasedir = pump_releasedir,
-};
-
-struct volume_options options[] = {
- { .key = {NULL} },
-};
diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h
deleted file mode 100644
index 19bb96f8c6a..00000000000
--- a/xlators/cluster/afr/src/pump.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __PUMP_H__
-#define __PUMP_H__
-
-#include "syncop.h"
-
-/* FIXME: Needs to be defined in a common file */
-#define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect"
-#define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect"
-
-#define PUMP_LK_OWNER 696969
-
-#define IS_ROOT_PATH(path) (!strcmp (path, "/"))
-#define IS_ENTRY_CWD(entry) (!strcmp (entry, "."))
-#define IS_ENTRY_PARENT(entry) (!strcmp (entry, ".."))
-
-#define PUMP_CMD_START "trusted.glusterfs.pump.start"
-#define PUMP_CMD_ABORT "trusted.glusterfs.pump.abort"
-#define PUMP_CMD_PAUSE "trusted.glusterfs.pump.pause"
-#define PUMP_CMD_STATUS "trusted.glusterfs.pump.status"
-
-#define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete"
-#define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete"
-
-#define PUMP_PATH "trusted.glusterfs.pump-path"
-
-#define PUMP_SOURCE_CHILD(xl) (xl->children->xlator)
-#define PUMP_SINK_CHILD(xl) (xl->children->next->xlator)
-
-typedef enum {
- PUMP_STATE_RUNNING, /* Pump is running and migrating files */
- PUMP_STATE_RESUME, /* Pump is resuming from a previous pause */
- PUMP_STATE_PAUSE, /* Pump is paused */
- PUMP_STATE_ABORT, /* Pump is aborted */
-} pump_state_t;
-
-typedef struct _pump_private {
- struct syncenv *env; /* The env pointer to the pump synctask */
- const char *resume_path; /* path to resume from the last pause */
- gf_lock_t resume_path_lock; /* Synchronize resume_path changes */
- gf_lock_t pump_state_lock; /* Synchronize pump_state changes */
- pump_state_t pump_state; /* State of pump */
- char current_file[PATH_MAX]; /* Current file being pumped */
- uint64_t number_files_pumped; /* Number of files pumped */
- gf_boolean_t pump_finished; /* Boolean to indicate pump termination */
- char pump_start_pending; /* Boolean to mark start pending until
- CHILD_UP */
-} pump_private_t;
-
-void
-build_root_loc (inode_t *inode, loc_t *loc);
-int pump_start (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_start (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_start (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_pause (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_pause (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_abort (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_abort (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_status (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_status (call_frame_t *frame, xlator_t *this);
-
-#endif /* __PUMP_H__ */
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index 8ebcab04451..56f1f2ad7c8 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -1,36 +1,48 @@
-
xlator_LTLIBRARIES = dht.la nufa.la switch.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+AM_CFLAGS = -Wall $(GF_CFLAGS)
-dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \
- dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \
- $(top_builddir)/xlators/lib/src/libxlator.c
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-dht_la_SOURCES = $(dht_common_source) dht.c
+dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \
+ dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \
+ dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \
+ dht-lock.c $(top_builddir)/xlators/lib/src/libxlator.c
+
+dht_la_SOURCES = $(dht_common_source) dht.c
nufa_la_SOURCES = $(dht_common_source) nufa.c
switch_la_SOURCES = $(dht_common_source) switch.c
-dht_la_LDFLAGS = -module -avoidversion
+dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-nufa_la_LDFLAGS = -module -avoidversion
+nufa_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-switch_la_LDFLAGS = -module -avoidversion
+switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = dht-common.h dht-common.c dht-mem-types.h $(top_builddir)/xlators/lib/src/libxlator.h
+noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \
+ dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \
- -I$(top_srcdir)/xlators/lib/src
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(top_srcdir)/xlators/lib/src \
+ -DDATADIR=\"$(localstatedir)\" \
+ -DLIBDIR=\"$(libdir)\"
-CLEANFILES =
+CLEANFILES =
uninstall-local:
rm -f $(DESTDIR)$(xlatordir)/distribute.so
install-data-hook:
ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so
+
+if UNITTEST
+CLEANFILES += *.gcda *.gcno *_xunit.xml
+noinst_PROGRAMS =
+TESTS =
+endif
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 111ba193318..8ba0cc4c732 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -1,5348 +1,11391 @@
/*
- Copyright (c) 2009-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
/* TODO: add NS locking */
-#include "glusterfs.h"
-#include "xlator.h"
#include "libxlator.h"
#include "dht-common.h"
-#include "defaults.h"
-#include "byte-order.h"
+#include "dht-lock.h"
+#include <glusterfs/byte-order.h>
+#include <glusterfs/quota-common-utils.h>
+#include <glusterfs/upcall-utils.h>
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include <glusterfs/common-utils.h>
#include <sys/time.h>
#include <libgen.h>
+#include <signal.h>
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries,
+ dict_t *xdata);
-void
-dht_aggregate (dict_t *this, char *key, data_t *value, void *data)
-{
- dict_t *dst = NULL;
- int64_t *ptr = 0, *size = NULL;
- int32_t ret = -1;
+static int
+dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
- dst = data;
+static int
+dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req);
- if (strcmp (key, GF_XATTR_QUOTA_SIZE_KEY) == 0) {
- ret = dict_get_bin (dst, key, (void **)&size);
- if (ret < 0) {
- size = GF_CALLOC (1, sizeof (int64_t),
- gf_common_mt_char);
- if (size == NULL) {
- gf_log ("dht", GF_LOG_WARNING,
- "memory allocation failed");
- return;
- }
- ret = dict_set_bin (dst, key, size, sizeof (int64_t));
- if (ret < 0) {
- gf_log ("dht", GF_LOG_WARNING,
- "dht aggregate dict set failed");
- GF_FREE (size);
- return;
- }
- }
+static int
+dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this);
- ptr = data_to_bin (value);
- if (ptr == NULL) {
- gf_log ("dht", GF_LOG_WARNING, "data to bin failed");
- return;
- }
+static int
+dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
- *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr));
- }
+static int
+dht_rmdir_unlock(call_frame_t *frame, xlator_t *this);
- return;
+static const char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL};
+
+/* Check the xdata to make sure EBADF has been set by client xlator */
+int32_t
+dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno)
+{
+ if (op_ret == -1 && (op_errno == EBADF || op_errno == EBADFD) &&
+ !(local->fd_checked)) {
+ return 1;
+ }
+ return 0;
}
+/* Sets the blocks and size values to fixed values. This is to be called
+ * only for dirs. The caller is responsible for checking the type
+ */
+int32_t
+dht_set_fixed_dir_stat(struct iatt *stat)
+{
+ if (stat) {
+ stat->ia_blocks = DHT_DIR_STAT_BLOCKS;
+ stat->ia_size = DHT_DIR_STAT_SIZE;
+ return 0;
+ }
+ return -1;
+}
-void
-dht_aggregate_xattr (dict_t *dst, dict_t *src)
+/* Return true if key exists in array
+ */
+static gf_boolean_t
+dht_match_xattr(const char *key)
{
- if ((dst == NULL) || (src == NULL)) {
- goto out;
+ char **xattrs_to_heal = get_xattrs_to_heal();
+
+ return gf_get_index_by_elem(xattrs_to_heal, (char *)key) >= 0;
+}
+
+static int
+dht_aggregate_quota_xattr(dict_t *dst, char *key, data_t *value)
+{
+ int ret = -1;
+ quota_meta_t *meta_dst = NULL;
+ quota_meta_t *meta_src = NULL;
+ int64_t *size = NULL;
+ int64_t dst_dir_count = 0;
+ int64_t src_dir_count = 0;
+
+ if (value == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL,
+ "data value is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_bin(dst, key, (void **)&meta_dst);
+ if (ret < 0) {
+ meta_dst = GF_CALLOC(1, sizeof(quota_meta_t), gf_common_quota_meta_t);
+ if (meta_dst == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, ENOMEM, DHT_MSG_NO_MEMORY,
+ "Memory allocation failed");
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_bin(dst, key, meta_dst, sizeof(quota_meta_t));
+ if (ret < 0) {
+ gf_msg("dht", GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED,
+ "dht aggregate dict set failed");
+ GF_FREE(meta_dst);
+ ret = -1;
+ goto out;
}
+ }
+
+ if (value->len > sizeof(int64_t)) {
+ meta_src = data_to_bin(value);
+
+ meta_dst->size = hton64(ntoh64(meta_dst->size) +
+ ntoh64(meta_src->size));
+ meta_dst->file_count = hton64(ntoh64(meta_dst->file_count) +
+ ntoh64(meta_src->file_count));
+
+ if (value->len > (2 * sizeof(int64_t))) {
+ dst_dir_count = ntoh64(meta_dst->dir_count);
+ src_dir_count = ntoh64(meta_src->dir_count);
- dict_foreach (src, dht_aggregate, dst);
+ if (src_dir_count > dst_dir_count)
+ meta_dst->dir_count = meta_src->dir_count;
+ } else {
+ meta_dst->dir_count = 0;
+ }
+ } else {
+ size = data_to_bin(value);
+ meta_dst->size = hton64(ntoh64(meta_dst->size) + ntoh64(*size));
+ }
+
+ ret = 0;
out:
- return;
+ return ret;
}
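
The hunk above aggregates quota sizes kept in network byte order: decode with ntoh64, add in host order, re-encode with hton64. A minimal standalone sketch of that pattern, not part of the patch, using glibc's htobe64/be64toh in place of the GlusterFS helpers (a glibc/Linux assumption):

#define _DEFAULT_SOURCE
#include <endian.h> /* htobe64/be64toh: glibc-specific, assumed here */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* two quota sizes as they would arrive on the wire (big endian) */
    uint64_t dst = htobe64(100);
    uint64_t src = htobe64(250);

    /* decode, add in host order, re-encode -- the same shape as
       meta_dst->size = hton64(ntoh64(meta_dst->size) + ntoh64(meta_src->size)) */
    dst = htobe64(be64toh(dst) + be64toh(src));

    printf("aggregated size = %llu\n", (unsigned long long)be64toh(dst));
    return 0;
}
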
-/* TODO:
- - use volumename in xattr instead of "dht"
- - use NS locks
- - handle all cases in self heal layout reconstruction
- - complete linkfile selfheal
-*/
-
+static int
+add_opt(char **optsp, const char *opt)
+{
+ char *newopts = NULL;
+ unsigned oldsize = 0;
+ unsigned newsize = 0;
+
+ if (*optsp == NULL)
+ newopts = gf_strdup(opt);
+ else {
+ oldsize = strlen(*optsp);
+ newsize = oldsize + 1 + strlen(opt) + 1;
+ newopts = GF_REALLOC(*optsp, newsize);
+ if (newopts)
+ sprintf(newopts + oldsize, ",%s", opt);
+ }
+ if (newopts == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to add choices in buffer in add_opt");
+ return -1;
+ }
+ *optsp = newopts;
+ return 0;
+}
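
add_opt() above builds a comma-separated choice list by reallocating the buffer and appending ",opt" each time. A standalone sketch of the same idea, not part of the patch, with plain libc calls standing in for gf_strdup/GF_REALLOC:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int append_opt(char **optsp, const char *opt)
{
    if (*optsp == NULL) {
        *optsp = strdup(opt);
        return *optsp ? 0 : -1;
    }
    size_t oldsize = strlen(*optsp);
    size_t newsize = oldsize + 1 + strlen(opt) + 1; /* ',' + opt + '\0' */
    char *newopts = realloc(*optsp, newsize);
    if (newopts == NULL)
        return -1;
    sprintf(newopts + oldsize, ",%s", opt);
    *optsp = newopts;
    return 0;
}

int main(void)
{
    char *choices = NULL;
    append_opt(&choices, "client-0");
    append_opt(&choices, "client-1");
    printf("Choices:%s\n", choices); /* prints Choices:client-0,client-1 */
    free(choices);
    return 0;
}
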
-int
-dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int op_ret, int op_errno)
+/* Return Choice list from Split brain status */
+static char *
+getChoices(const char *value)
{
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int ret = -1;
+ int i = 0;
+ char *ptr = NULL;
+ char *tok = NULL;
+ char *result = NULL;
+ char *newval = NULL;
+
+ ptr = strstr(value, "Choices:");
+ if (!ptr) {
+ result = ptr;
+ goto out;
+ }
+
+ newval = gf_strdup(ptr);
+ if (!newval) {
+ result = newval;
+ goto out;
+ }
+
+ tok = strtok(newval, ":");
+ if (!tok) {
+ result = tok;
+ goto out;
+ }
+
+ while (tok) {
+ i++;
+ if (i == 2)
+ break;
+ tok = strtok(NULL, ":");
+ }
+
+ result = gf_strdup(tok);
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+out:
+ if (newval)
+ GF_FREE(newval);
- local = frame->local;
- ret = op_ret;
+ return result;
+}
- FRAME_SU_UNDO (frame, dht_local_t);
+/* This function prepares a list of choices for the key
+   (replica.split-brain-status) in the case of a metadata split brain,
+   based solely on the key-value pair passed to it.
+   After preparing the list of choices, it updates the same key in the
+   dict with this value so that the change is reflected in the
+   replica.split-brain-status attr of the file.
- if (ret == 0) {
- layout = local->selfheal.layout;
- ret = dht_layout_set (this, local->inode, layout);
+*/
- if (local->ia_ino) {
- local->stbuf.ia_ino = local->ia_ino;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not find hashed subvolume for %s",
- local->loc.path);
- }
+static int
+dht_aggregate_split_brain_xattr(dict_t *dst, char *key, data_t *value)
+{
+ int ret = 0;
+ char *oldvalue = NULL;
+ char *old_choice = NULL;
+ char *new_choice = NULL;
+ char *full_choice = NULL;
+ char *status = NULL;
+
+ if (value == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL,
+ "GF_AFR_SBRAIN_STATUS value is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_str(dst, key, &oldvalue);
+ if (ret)
+ goto out;
+
+ /* skip code that is irrelevant if !oldvalue */
+ if (!oldvalue)
+ goto out;
+
+ if (strstr(oldvalue, "not")) {
+ gf_msg_debug("dht", 0, "Need to update split-brain status in dict");
+ ret = -1;
+ goto out;
+ }
+ if (strstr(oldvalue, "metadata-split-brain:yes") &&
+ (strstr(oldvalue, "data-split-brain:no"))) {
+ if (strstr(value->data, "not")) {
+ gf_msg_debug("dht", 0, "No need to update split-brain status");
+ ret = 0;
+ goto out;
+ }
+ if (strstr(value->data, "yes") &&
+ (strncmp(oldvalue, value->data, strlen(oldvalue)))) {
+ old_choice = getChoices(oldvalue);
+ if (!old_choice) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to get choices");
+ ret = -1;
+ goto out;
+ }
+
+ ret = add_opt(&full_choice, old_choice);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to add choices");
+ ret = -1;
+ goto out;
+ }
- if (local->loc.parent)
- local->postparent.ia_ino = local->loc.parent->ino;
+ new_choice = getChoices(value->data);
+ if (!new_choice) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to get choices");
+ ret = -1;
+ goto out;
+ }
+
+ ret = add_opt(&full_choice, new_choice);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to add choices ");
+ ret = -1;
+ goto out;
+ }
+ ret = gf_asprintf(&status,
+ "data-split-brain:%s "
+ "metadata-split-brain:%s Choices:%s",
+ "no", "yes", full_choice);
+
+ if (-1 == ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to prepare status ");
+ goto out;
+ }
+ ret = dict_set_dynstr(dst, key, status);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set full choice");
+ }
}
+ }
- WIPE (&local->postparent);
+out:
+ if (old_choice)
+ GF_FREE(old_choice);
+ if (new_choice)
+ GF_FREE(new_choice);
+ if (full_choice)
+ GF_FREE(full_choice);
+
+ return ret;
+}
- DHT_STACK_UNWIND (lookup, frame, ret, local->op_errno, local->inode,
- &local->stbuf, local->xattr, &local->postparent);
+static int
+dht_aggregate(dict_t *this, char *key, data_t *value, void *data)
+{
+ dict_t *dst = NULL;
+ int32_t ret = -1;
+ data_t *dict_data = NULL;
+
+ dst = data;
+
+ /* compare split brain xattr only */
+ if (strcmp(key, GF_AFR_SBRAIN_STATUS) == 0) {
+ ret = dht_aggregate_split_brain_xattr(dst, key, value);
+ if (!ret)
+ goto out;
+ } else if (strcmp(key, QUOTA_SIZE_KEY) == 0) {
+ ret = dht_aggregate_quota_xattr(dst, key, value);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0,
+ DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED,
+ "Failed to aggregate quota xattr");
+ }
+ goto out;
+ } else if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+ ret = gf_get_min_stime(THIS, dst, key, value);
+ goto out;
+ } else {
+ /* compare user xattrs only */
+ if (!strncmp(key, "user.", SLEN("user."))) {
+ ret = dict_lookup(dst, key, &dict_data);
+ if (!ret && dict_data && value) {
+ ret = is_data_equal(dict_data, value);
+ if (!ret)
+ gf_msg_debug("dht", 0, "xattr mismatch for %s", key);
+ }
+ }
+ }
+
+ ret = dict_set(dst, key, value);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s", key);
+ }
out:
- return ret;
+ return ret;
}
+static void
+dht_aggregate_xattr(dict_t *dst, dict_t *src)
+{
+ if ((dst == NULL) || (src == NULL)) {
+ goto out;
+ }
+
+ dict_foreach(src, dht_aggregate, dst);
+out:
+ return;
+}
+/* Save the hashed subvol on the inode ctx as the mds subvol.
+ */
int
-dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- int ret = -1;
- int is_dir = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
-
- conf = this->private;
- local = frame->local;
- prev = cookie;
+dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+ uint64_t ctx_int = 0;
+ gf_boolean_t ctx_free = _gf_false;
+
+ LOCK(&inode->lock);
+ {
+ ret = __inode_ctx_get(inode, this, &ctx_int);
+ if (ctx_int) {
+ ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int;
+ ctx->mds_subvol = mds_subvol;
+ } else {
+ ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
+ if (!ctx)
+ goto unlock;
+ ctx->mds_subvol = mds_subvol;
+ ctx_free = _gf_true;
+ ctx_int = (long)ctx;
+ ret = __inode_ctx_set(inode, this, &ctx_int);
+ }
+ }
+unlock:
+ UNLOCK(&inode->lock);
+ if (ret && ctx_free)
+ GF_FREE(ctx);
+ return ret;
+}
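
dht_inode_ctx_mdsvol_set() above stashes the mds subvol pointer in the inode context by casting it into a 64-bit slot under inode->lock. A simplified standalone sketch of that pointer-in-integer pattern, not part of the patch; the plain mutex and struct below are illustrative stand-ins for the inode ctx API:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct fake_inode {
    pthread_mutex_t lock;
    uint64_t ctx_slot; /* holds a pointer cast to an integer */
};

struct my_ctx {
    const char *mds_subvol_name;
};

static void ctx_set(struct fake_inode *ino, struct my_ctx *ctx)
{
    pthread_mutex_lock(&ino->lock);
    ino->ctx_slot = (uint64_t)(uintptr_t)ctx;
    pthread_mutex_unlock(&ino->lock);
}

static struct my_ctx *ctx_get(struct fake_inode *ino)
{
    struct my_ctx *ctx = NULL;
    pthread_mutex_lock(&ino->lock);
    ctx = (struct my_ctx *)(uintptr_t)ino->ctx_slot;
    pthread_mutex_unlock(&ino->lock);
    return ctx;
}

int main(void)
{
    struct fake_inode ino = {PTHREAD_MUTEX_INITIALIZER, 0};
    struct my_ctx ctx = {"subvol-3"};
    ctx_set(&ino, &ctx);
    printf("mds subvol = %s\n", ctx_get(&ino)->mds_subvol_name);
    return 0;
}
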
- layout = local->layout;
+/* Get the mds subvol from the inode ctx */
- if (!op_ret && uuid_is_null (local->gfid))
- memcpy (local->gfid, stbuf->ia_gfid, 16);
+int
+dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- /* Check if the gfid is different for file from other node */
- if (!op_ret && uuid_compare (local->gfid, stbuf->ia_gfid)) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: gfid different on %s",
- local->loc.path, prev->this->name);
- }
+ if (!mdsvol)
+ return ret;
- LOCK (&frame->lock);
- {
- /* TODO: assert equal mode on stbuf->st_mode and
- local->stbuf->st_mode
+ if (__is_root_gfid(inode->gfid)) {
+ (*mdsvol) = FIRST_CHILD(this);
+ return 0;
+ }
- else mkdir/chmod/chown and fix
- */
- ret = dht_layout_merge (this, layout, prev->this,
- op_ret, op_errno, xattr);
+ ret = dht_inode_ctx_get(inode, this, &ctx);
- if (op_ret == -1) {
- local->op_errno = ENOENT;
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup of %s on %s returned error (%s)",
- local->loc.path, prev->this->name,
- strerror (op_errno));
+ if (!ret && ctx) {
+ if (ctx->mds_subvol) {
+ *mdsvol = ctx->mds_subvol;
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+ }
- goto unlock;
- }
+ return ret;
+}
- is_dir = check_is_dir (inode, stbuf, xattr);
- if (!is_dir) {
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup of %s on %s returned non dir 0%o",
- local->loc.path, prev->this->name,
- stbuf->ia_type);
- local->need_selfheal = 1;
- goto unlock;
- }
+/* TODO:
+ - use volumename in xattr instead of "dht"
+ - use NS locks
+ - handle all cases in self heal layout reconstruction
+ - complete linkfile selfheal
+*/
- local->op_ret = 0;
- if (local->xattr == NULL) {
- local->xattr = dict_ref (xattr);
- } else {
- dht_aggregate_xattr (local->xattr, xattr);
- }
+static int
+dht_lookup_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
- if (local->inode == NULL)
- local->inode = inode_ref (inode);
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
+ local = frame->local;
+ conf = this->private;
+ ret = op_ret;
- if (!local->ia_ino &&
- (prev->this == dht_first_up_subvol (this))) {
- local->ia_ino = local->stbuf.ia_ino;
- }
+ FRAME_SU_UNDO(frame, dht_local_t);
- }
-unlock:
- UNLOCK (&frame->lock);
+ if (ret == 0) {
+ layout = local->selfheal.layout;
+ ret = dht_layout_set(this, local->inode, layout);
+ }
+ dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, &local->postparent,
+ 1);
+ }
- this_call_cnt = dht_frame_return (frame);
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ /* Delete mds xattr at the time of STACK UNWIND */
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
- if (is_last_call (this_call_cnt)) {
- if (local->need_selfheal) {
- local->need_selfheal = 0;
- dht_lookup_everywhere (frame, this, &local->loc);
- return 0;
- }
+ DHT_STACK_UNWIND(lookup, frame, ret, local->op_errno, local->inode,
+ &local->stbuf, local->xattr, &local->postparent);
- if (local->op_ret == 0) {
- ret = dht_layout_normalize (this, &local->loc, layout);
+out:
+ return ret;
+}
- if (ret != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "fixing assignment on %s",
- local->loc.path);
- goto selfheal;
- }
+static int
+dht_discover_complete(xlator_t *this, call_frame_t *discover_frame)
+{
+ dht_local_t *local = NULL;
+ dht_local_t *heal_local = NULL;
+ call_frame_t *main_frame = NULL;
+ call_frame_t *heal_frame = NULL;
+ int op_errno = 0;
+ int ret = -1;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ uint32_t vol_commit_hash = 0;
+ xlator_t *source = NULL;
+ int heal_path = 0;
+ int error_while_marking_mds = 0;
+ int i = 0;
+ loc_t loc = {0};
+ int8_t is_read_only = 0, layout_anomalies = 0;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+
+ local = discover_frame->local;
+ layout = local->layout;
+ conf = this->private;
+ gf_uuid_unparse(local->gfid, gfid_local);
+
+ LOCK(&discover_frame->lock);
+ {
+ main_frame = local->main_frame;
+ local->main_frame = NULL;
+ }
+ UNLOCK(&discover_frame->lock);
+
+ if (!main_frame)
+ return 0;
+
+    /* Update local->xattr with all extended attributes from the
+       subvol on which the internal mds xattr was found.
+     */
+ if (conf->subvolume_cnt == 1)
+ local->need_xattr_heal = 0;
+ if (local->need_xattr_heal && (local->mds_xattr)) {
+ dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr,
+ NULL, NULL);
+ dict_unref(local->mds_xattr);
+ local->mds_xattr = NULL;
+ }
+
+ ret = dict_get_int8(local->xattr_req, QUOTA_READ_ONLY_KEY, &is_read_only);
+ if (ret < 0)
+ gf_msg_debug(this->name, 0, "key = %s not present in dict",
+ QUOTA_READ_ONLY_KEY);
+
+ if (local->file_count && local->dir_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+ "path %s exists as a file on one subvolume "
+ "and directory on another. "
+ "Please fix it manually",
+ local->loc.path);
+ op_errno = EIO;
+ goto out;
+ }
+
+ if (local->cached_subvol) {
+ ret = dht_layout_preset(this, local->cached_subvol, local->inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to set layout for subvolume %s",
+ local->cached_subvol ? local->cached_subvol->name : "<nil>");
+ op_errno = EINVAL;
+ goto out;
+ }
+ } else {
+ ret = dht_layout_normalize(this, &local->loc, layout);
+ if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) {
+ /* either the layout is incorrect or the directory is
+ * not found even in one subvolume.
+ */
+ gf_msg_debug(this->name, 0,
+ "normalizing failed on %s "
+ "(overlaps/holes present: %s, "
+ "ENOENT errors: %d)",
+ local->loc.path, (ret < 0) ? "yes" : "no",
+ (ret > 0) ? ret : 0);
+ layout_anomalies = 1;
+ } else if (local->inode) {
+ dht_layout_set(this, local->inode, layout);
+ }
+ }
+
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32(local->xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
- dht_layout_set (this, local->inode, layout);
+ if (IA_ISDIR(local->stbuf.ia_type) && !is_read_only) {
+ for (i = 0; i < layout->cnt; i++) {
+ if (!source && !layout->list[i].err)
+ source = layout->list[i].xlator;
+ if (layout->list[i].err == ENOENT ||
+ layout->list[i].err == ESTALE) {
+ heal_path = 1;
+ }
+
+ if (source && heal_path)
+ break;
+ }
+ }
+
+ if (IA_ISDIR(local->stbuf.ia_type)) {
+ /* Call function to save hashed subvol on inode ctx if
+ internal mds xattr is not present and all subvols are up
+ */
+ if (!local->op_ret && !__is_root_gfid(local->stbuf.ia_gfid))
+ (void)dht_common_mark_mdsxattr(discover_frame,
+ &error_while_marking_mds, 1);
+
+ if (local->need_xattr_heal && !heal_path) {
+ local->need_xattr_heal = 0;
+ ret = dht_dir_xattr_heal(this, local, &op_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "xattr heal failed for "
+ "directory gfid is %s ",
+ gfid_local);
+ }
+ }
+ }
+
+ if (source && (heal_path || layout_anomalies || error_while_marking_mds)) {
+ gf_uuid_copy(loc.gfid, local->gfid);
+ if (gf_uuid_is_null(loc.gfid)) {
+ goto done;
+ }
+
+ if (local->inode)
+ loc.inode = inode_ref(local->inode);
+ else
+ goto done;
+
+ heal_frame = create_frame(this, this->ctx->pool);
+ if (heal_frame) {
+ heal_local = dht_local_init(heal_frame, &loc, NULL, 0);
+ if (!heal_local)
+ goto cleanup;
+
+ gf_uuid_copy(heal_local->gfid, local->gfid);
+ heal_frame->cookie = source;
+ heal_local->xattr = dict_ref(local->xattr);
+ heal_local->stbuf = local->stbuf;
+ heal_local->postparent = local->postparent;
+ heal_local->inode = inode_ref(loc.inode);
+ heal_local->main_frame = main_frame;
+ FRAME_SU_DO(heal_frame, dht_local_t);
+ ret = synctask_new(this->ctx->env, dht_heal_full_path,
+ dht_heal_full_path_done, heal_frame, heal_frame);
+ if (!ret) {
+ loc_wipe(&loc);
+ return 0;
+ }
+ /*
+ * Failed to spawn the synctask. Returning
+             * without doing the heal.
+ */
+ cleanup:
+ loc_wipe(&loc);
+ DHT_STACK_DESTROY(heal_frame);
+ }
+ }
+done:
+ dht_set_fixed_dir_stat(&local->postparent);
+ /* Delete mds xattr at the time of STACK UNWIND */
+ if (local->xattr)
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
- if (local->ia_ino) {
- local->stbuf.ia_ino = local->ia_ino;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not find hashed subvol for %s",
- local->loc.path);
- }
+ DHT_STACK_UNWIND(lookup, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ return 0;
- if (local->loc.parent)
- local->postparent.ia_ino =
- local->loc.parent->ino;
- }
+out:
+ DHT_STACK_UNWIND(lookup, main_frame, -1, op_errno, NULL, NULL, NULL, NULL);
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
+ return ret;
+}
+
+static int
+dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = cookie;
+ int ret = -1;
+ dht_conf_t *conf = 0;
+ dht_layout_t *layout = NULL;
+ int32_t mds_heal_fresh_lookup = 0;
+
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+
+ local = frame->local;
+ conf = this->private;
+ layout = local->selfheal.layout;
+ mds_heal_fresh_lookup = local->mds_heal_fresh_lookup;
+
+ if (op_ret) {
+ gf_msg_debug(this->name, op_ret,
+ "Failed to set %s on the MDS %s for path %s. ",
+ conf->mds_xattr_key, prev->name, local->loc.path);
+ } else {
+ /* Save mds subvol on inode ctx */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set mds subvol on inode ctx"
+ " %s for %s ",
+ prev->name, local->loc.path);
+ }
+ }
+ if (!local->mds_heal_fresh_lookup && layout) {
+ dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffffff,
+ layout);
+ }
+out:
+ if (mds_heal_fresh_lookup)
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
+
+static xlator_t *
+dht_inode_get_hashed_subvol(inode_t *inode, xlator_t *this, loc_t *loc)
+{
+ char *path = NULL;
+ loc_t populate_loc = {
+ 0,
+ };
+ char *name = NULL;
+ xlator_t *hash_subvol = NULL;
+
+ if (!inode)
+ return hash_subvol;
+
+ if (loc && loc->parent && loc->path) {
+ if (!loc->name) {
+ name = strrchr(loc->path, '/');
+ if (name) {
+ loc->name = name + 1;
+ } else {
+ goto out;
+ }
}
+ hash_subvol = dht_subvol_get_hashed(this, loc);
+ goto out;
+ }
- return 0;
+ if (!gf_uuid_is_null(inode->gfid)) {
+ populate_loc.inode = inode_ref(inode);
+ populate_loc.parent = inode_parent(populate_loc.inode, NULL, NULL);
+ inode_path(populate_loc.inode, NULL, &path);
-selfheal:
- FRAME_SU_DO (frame, dht_local_t);
- ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk,
- &local->loc, layout);
+ if (!path)
+ goto out;
+
+ populate_loc.path = path;
+ if (!populate_loc.name && populate_loc.path) {
+ name = strrchr(populate_loc.path, '/');
+ if (name) {
+ populate_loc.name = name + 1;
+
+ } else {
+ goto out;
+ }
+ }
+ hash_subvol = dht_subvol_get_hashed(this, &populate_loc);
+ }
out:
- return ret;
+ if (populate_loc.inode)
+ loc_wipe(&populate_loc);
+ return hash_subvol;
}
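
Both branches of dht_inode_get_hashed_subvol() above derive the entry name from the path with strrchr before asking dht_subvol_get_hashed() for the hashed subvolume. A tiny standalone sketch of that basename derivation, not part of the patch:

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *path = "/dir1/dir2/file.txt";
    const char *name = strrchr(path, '/');

    if (name)
        name = name + 1; /* skip the '/' itself */
    else
        name = path;     /* no slash: the whole path is the name */

    printf("name = %s\n", name); /* prints "name = file.txt" */
    return 0;
}
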
+/* Common function called by the revalidate and selfheal code paths to
+   populate the internal xattr if it is not present. The
+   mark_during_fresh_lookup value determines whether it was called from
+   revalidate_cbk (discover_complete) or from the selfheal code path
+   during a fresh lookup.
+   The call is wound serially in the fresh-lookup case and in parallel
+   for the other lookup code paths. The reason for winding serially is
+   that at the time of a fresh lookup the directory has not yet been
+   discovered, while at the time of a revalidate lookup it already has,
+   so the revalidate code path can race with the setxattr code path and
+   get into spurious heals because of an ongoing setxattr.
+   This can slow down revalidates if healing happens in the foreground.
+   However, if healing happens in the background, there is no direct
+   performance penalty.
+*/
int
-dht_lookup_root_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- int ret = -1;
- int is_dir = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
-
- conf = this->private;
- local = frame->local;
- prev = cookie;
-
- layout = local->layout;
-
- LOCK (&frame->lock);
- {
- ret = dht_layout_merge (this, layout, prev->this,
- op_ret, op_errno, xattr);
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_ERROR,
- "lookup of %s on %s returned error (%s)",
- local->loc.path, prev->this->name,
- strerror (op_errno));
- goto unlock;
+dht_common_mark_mdsxattr(call_frame_t *frame, int *errst,
+ int mark_during_fresh_lookup)
+{
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ xlator_t *hashed_subvol = NULL;
+ int ret = 0;
+ int i = 0;
+ dict_t *xattrs = NULL;
+ char gfid_local[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+ int32_t zero[1] = {0};
+ dht_conf_t *conf = 0;
+ dht_layout_t *layout = NULL;
+ dht_local_t *copy_local = NULL;
+ call_frame_t *xattr_frame = NULL;
+ gf_boolean_t vol_down = _gf_false;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ this = frame->this;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ local = frame->local;
+ conf = this->private;
+ layout = local->selfheal.layout;
+ local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
+
+ gf_uuid_unparse(local->gfid, gfid_local);
+
+    /* Treat the hashed subvol as the mds subvol and wind a setxattr
+       call on it to update the internal xattr.
+     */
+ if (!local->xattr || !dict_get(local->xattr, conf->mds_xattr_key)) {
+ /* It means no internal MDS xattr has been set yet
+ */
+        /* Check that all subvols are up when this function is
+           called from the fresh lookup code path.
+         */
+ if (mark_during_fresh_lookup) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ vol_down = _gf_true;
+ break;
}
+ }
+ if (vol_down) {
+ gf_msg_debug(this->name, 0,
+ "subvol %s is down. Unable to "
+ " save mds subvol on inode for "
+ " path %s gfid is %s ",
+ conf->subvolumes[i]->name, local->loc.path,
+ gfid_local);
+ goto out;
+ }
+ }
- is_dir = check_is_dir (inode, stbuf, xattr);
- if (!is_dir) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "lookup of %s on %s returned non dir 0%o",
- local->loc.path, prev->this->name,
- stbuf->ia_type);
- goto unlock;
- }
+ /* Calculate hashed subvol based on inode and parent node
+ */
+ hashed_subvol = dht_inode_get_hashed_subvol(local->inode, this,
+ &local->loc);
+ if (!hashed_subvol) {
+ gf_msg(this->name, GF_LOG_DEBUG, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                   "Failed to get hashed subvol for path %s, "
+ "gfid is %s ",
+ local->loc.path, gfid_local);
+ if (errst)
+ (*errst) = 1;
+ ret = -1;
+ goto out;
+ }
+ xattrs = dict_new();
+ if (!xattrs) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "dict_new failed");
+ ret = -1;
+ goto out;
+ }
+ /* Add internal MDS xattr on disk for hashed subvol
+ */
+ ret = dht_dict_set_array(xattrs, conf->mds_xattr_key, zero, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary"
+ " value:key = %s for "
+ "path %s",
+ conf->mds_xattr_key, local->loc.path);
+ ret = -1;
+ goto out;
+ }
+        /* Create a new frame to wind the call only when this function
+           is called from the revalidate_cbk code path; to wind the
+           call in parallel a new frame is needed.
+         */
+ if (mark_during_fresh_lookup) {
+ xattr_frame = create_frame(this, this->ctx->pool);
+ if (!xattr_frame) {
+ ret = -1;
+ goto out;
+ }
+ copy_local = dht_local_init(xattr_frame, &(local->loc), NULL, 0);
+ if (!copy_local) {
+ ret = -1;
+ DHT_STACK_DESTROY(xattr_frame);
+ goto out;
+ }
+ copy_local->stbuf = local->stbuf;
+ copy_local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
+ if (!copy_local->inode)
+ copy_local->inode = inode_ref(local->inode);
+ gf_uuid_copy(copy_local->loc.gfid, local->gfid);
+ FRAME_SU_DO(xattr_frame, dht_local_t);
+ STACK_WIND_COOKIE(xattr_frame, dht_common_mark_mdsxattr_cbk,
+ hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->setxattr, &local->loc,
+ xattrs, 0, NULL);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_common_mark_mdsxattr_cbk,
+ (void *)hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->setxattr, &local->loc,
+ xattrs, 0, NULL);
+ }
+ } else {
+ gf_msg_debug(this->name, 0,
+                     "internal xattr %s is present on subvol "
+ "on path %s gfid is %s ",
+ conf->mds_xattr_key, local->loc.path, gfid_local);
+ if (!mark_during_fresh_lookup)
+ dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf,
+ 0xffffffff, layout);
+ }
- local->op_ret = 0;
- if (local->xattr == NULL) {
- local->xattr = dict_ref (xattr);
- } else {
- dht_aggregate_xattr (local->xattr, xattr);
- }
+out:
+ if (xattrs)
+ dict_unref(xattrs);
+ return ret;
+}
- if (local->inode == NULL)
- local->inode = inode_ref (inode);
+/* Get the value of key from the dict as raw bytes and save it in the
+   array after converting from network byte order to host byte order.
+*/
+static int32_t
+dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size,
+ int *errst)
+{
+ void *ptr = NULL;
+ int32_t len = -1;
+ int32_t vindex = -1;
+ int32_t err = -1;
+ int ret = 0;
+
+ if (dict == NULL) {
+ (*errst) = -1;
+ return -EINVAL;
+ }
+ err = dict_get_ptr_and_len(dict, key, &ptr, &len);
+ if (err != 0) {
+ (*errst) = -1;
+ return err;
+ }
+
+ if (len != (size * sizeof(int32_t))) {
+ (*errst) = -1;
+ return -EINVAL;
+ }
+
+ for (vindex = 0; vindex < size; vindex++) {
+ value[vindex] = ntoh32(*((int32_t *)ptr + vindex));
+ if (value[vindex] < 0)
+ ret = -1;
+ }
+
+ return ret;
+}
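
dht_dict_get_array() above treats the dict value as a packed array of 32-bit big-endian integers. A standalone sketch of that decoding step, not part of the patch, using ntohl from <arpa/inet.h> in place of the GlusterFS ntoh32 helper:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* pretend this buffer came back from dict_get_ptr_and_len() */
    uint32_t wire[2] = {htonl(0), htonl(7)};
    unsigned char buf[sizeof(wire)];
    int32_t value[2];
    size_t i;

    memcpy(buf, wire, sizeof(wire));

    for (i = 0; i < 2; i++) {
        uint32_t v;
        memcpy(&v, buf + i * sizeof(v), sizeof(v)); /* avoid unaligned reads */
        value[i] = (int32_t)ntohl(v);
    }
    printf("value[0]=%d value[1]=%d\n", value[0], value[1]);
    return 0;
}
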
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
+static int
+dht_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ int is_dir = 0;
+ int32_t check_mds = 0;
+ int is_linkfile = 0;
+ int attempt_unwind = 0;
+ dht_conf_t *conf = 0;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char gfid_node[GF_UUID_BUF_SIZE] = {0};
+ int32_t mds_xattr_val[1] = {0};
+ int errst = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ layout = local->layout;
+
+ /* Check if the gfid is different for file from other node */
+ if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+ gf_uuid_unparse(stbuf->ia_gfid, gfid_node);
+ gf_uuid_unparse(local->gfid, gfid_local);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+               "%s: gfid different on %s, gfid local = %s, "
+ "gfid other = %s",
+ local->loc.path, prev->name, gfid_local, gfid_node);
+ }
+
+ LOCK(&frame->lock);
+ {
+ /* TODO: assert equal mode on stbuf->st_mode and
+ local->stbuf->st_mode
+
+ else mkdir/chmod/chown and fix
+ */
+
+ ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ "%s: failed to merge layouts for subvol %s", local->loc.path,
+ prev->name);
- if (prev->this == dht_first_up_subvol (this)) {
- local->ia_ino = local->stbuf.ia_ino;
- }
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ gf_msg_debug(this->name, op_errno,
+ "lookup of %s on %s returned error", local->loc.path,
+ prev->name);
+ goto unlock;
}
-unlock:
- UNLOCK (&frame->lock);
-
- this_call_cnt = dht_frame_return (frame);
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr,
+ conf->link_xattr_name);
+ is_dir = check_is_dir(inode, stbuf, xattr);
- if (is_last_call (this_call_cnt)) {
- if (local->op_ret == 0) {
- ret = dht_layout_normalize (this, &local->loc, layout);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_INFO,
- "fixing assignment on %s",
- local->loc.path);
- }
+ if (is_dir) {
+ local->dir_count++;
+ } else {
+ local->file_count++;
+
+ if (!is_linkfile && !local->cached_subvol) {
+ /* real file */
+ /* Ok, we somehow managed to find a file on
+ * more than one subvol. ignore this or we
+ * will end up overwriting information while a
+                 * will end up overwriting information while a
+                 * thread is potentially unwinding from
+ */
+ local->cached_subvol = prev;
+ attempt_unwind = 1;
+ } else {
+ goto unlock;
+ }
+ }
- ret = dht_layout_set (this, local->inode, layout);
- }
+ local->op_ret = 0;
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
+ if (local->xattr == NULL) {
+ local->xattr = dict_ref(xattr);
+ } else {
+ /* Don't aggregate for files. See BZ#1484709 */
+ if (is_dir)
+ dht_aggregate_xattr(local->xattr, xattr);
}
+ if (local->inode == NULL)
+ local->inode = inode_ref(inode);
+
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->postparent, postparent);
+
+ if (!dict_get(xattr, conf->mds_xattr_key)) {
+ goto unlock;
+ } else {
+ gf_msg_debug(this->name, 0,
+                     "internal xattr %s is present on subvol "
+ "on path %s gfid is %s ",
+ conf->mds_xattr_key, local->loc.path, gfid_local);
+ }
+ check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+ mds_xattr_val, 1, &errst);
+ /* save mds subvol on inode ctx */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set hashed subvol for %s vol is %s",
+ local->loc.path, prev->name);
+ }
+
+ if ((check_mds < 0) && !errst) {
+ local->mds_xattr = dict_ref(xattr);
+ gf_msg_debug(this->name, 0,
+                     "Value of %s is not zero on mds subvol "
+ "so xattr needs to be healed on non mds"
+ " path is %s and vol name is %s "
+ " gfid is %s",
+ conf->mds_xattr_key, local->loc.path, prev->name,
+ gfid_local);
+ local->need_xattr_heal = 1;
+ local->mds_subvol = prev;
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
out:
- return ret;
+    /* Make sure the thread executing dht_discover_complete is the one
+     * which calls STACK_DESTROY (frame). In the case of "attempt_unwind",
+     * this makes sure that the thread does not call dht_frame_return
+     * until the call to dht_discover_complete is done.
+ */
+ if (attempt_unwind) {
+ dht_discover_complete(this, frame);
+ }
+
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt) && !attempt_unwind) {
+ dht_discover_complete(this, frame);
+ }
+
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_DESTROY(frame);
+
+ return 0;
+}
+
+static int
+dht_set_file_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ int ret = -EINVAL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf) {
+ goto err;
+ }
+
+ if (!xattr_req) {
+ goto err;
+ }
+
+ /* Used to check whether this is a linkto file.
+ */
+ ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->link_xattr_name, loc->path);
+ goto err;
+ }
+
+ /* This is used to make sure we don't unlink linkto files
+ * which are the target of an ongoing file migration.
+ */
+ ret = dict_set_uint32(xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ GLUSTERFS_OPEN_FD_COUNT, loc->path);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ return ret;
}
+/* This is a gfid based nameless lookup. Without a name, the hashed subvol
+ * cannot be calculated so a lookup is sent to all subvols.
+ */
static int
-dht_do_fresh_lookup_on_root (xlator_t *this, call_frame_t *frame)
+dht_do_discover(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int call_cnt = 0;
- int i = 0;
- int op_errno = EINVAL;
+ int ret;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int call_cnt = 0;
+ int op_errno = EINVAL;
+ int i = 0;
+ call_frame_t *discover_frame = NULL;
+
+ conf = this->private;
+ local = frame->local;
+
+ /* As we do not know if this is a file or directory, request
+ * both file and directory xattrs
+ */
+ ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ goto err;
+ }
+
+ ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ goto err;
+ }
+
+ if (loc_is_root(loc)) {
+ /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash)
+ * set on the brick root.
+ */
+ ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame, unwind);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind);
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
- local = frame->local;
- conf = this->private;
- if (!conf)
- goto err;
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
- if (local->layout) {
- dht_layout_unref (this, local->layout);
- local->layout = NULL;
- }
+ if (!local->layout) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- if (local->xattr != NULL) {
- dict_unref (local->xattr);
- local->xattr = NULL;
- }
+ gf_uuid_copy(local->gfid, loc->gfid);
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set the dict entry for dht");
+ discover_frame = copy_frame(frame);
+ if (!discover_frame) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- call_cnt = local->call_cnt = conf->subvolume_cnt;
+ discover_frame->local = local;
+ frame->local = NULL;
+ local->main_frame = frame;
- local->layout = dht_layout_new (this,
- conf->subvolume_cnt);
- if (!local->layout) {
- local->op_errno = ENOMEM;
- goto err;
- }
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(discover_frame, dht_discover_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
+ }
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_root_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
- }
+ return 0;
- return 0;
err:
- DHT_STACK_UNWIND (lookup, frame, -1, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
-unwind:
- DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
-out:
- return -1;
+ return 0;
}
+/* Call a synctask to heal custom xattrs from the hashed subvol
+   to the non-hashed subvols.
+*/
int
-dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int is_dir = 0;
- int is_linkfile = 0;
- unsigned char root_gfid[16] = {0,};
-
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, err);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, err);
- GF_VALIDATE_OR_GOTO ("dht", cookie, err);
-
- local = frame->local;
- prev = cookie;
- conf = this->private;
- if (!conf)
- goto out;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
-
- if ((op_errno != ENOTCONN)
- && (op_errno != ENOENT)
- && (op_errno != ESTALE)) {
- gf_log (this->name, GF_LOG_INFO,
- "subvolume %s for %s returned -1 (%s)",
- prev->this->name, local->loc.path,
- strerror (op_errno));
- }
- if (op_errno == ESTALE) {
- /* propogate the ESTALE to parent.
- * setting local->layout_mismatch would send
- * ESTALE to parent. */
- local->layout_mismatch = 1;
- }
+dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno)
+{
+ dht_local_t *copy_local = NULL;
+ call_frame_t *copy = NULL;
+ int ret = -1;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+
+ if (gf_uuid_is_null(local->gfid)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "No gfid exists for path %s "
+ "so healing xattr is not possible",
+ local->loc.path);
+ *op_errno = EIO;
+ goto out;
+ }
+
+ gf_uuid_unparse(local->gfid, gfid_local);
+ copy = create_frame(this, this->ctx->pool);
+ if (copy) {
+ copy_local = dht_local_init(copy, &(local->loc), NULL, 0);
+ if (!copy_local) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "Memory allocation failed "
+ "for path %s gfid %s ",
+ local->loc.path, gfid_local);
+ *op_errno = ENOMEM;
+ DHT_STACK_DESTROY(copy);
+ } else {
+ copy_local->stbuf = local->stbuf;
+ gf_uuid_copy(copy_local->loc.gfid, local->gfid);
+ copy_local->mds_subvol = local->mds_subvol;
+ FRAME_SU_DO(copy, dht_local_t);
+ ret = synctask_new(this->ctx->env, dht_dir_heal_xattrs,
+ dht_dir_heal_xattrs_done, copy, copy);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "Synctask creation failed to heal xattr "
+ "for path %s gfid %s ",
+ local->loc.path, gfid_local);
+ *op_errno = ENOMEM;
+ DHT_STACK_DESTROY(copy);
+ }
+ }
+ }
+out:
+ return ret;
+}
- goto unlock;
- }
+static int
+dht_needs_selfheal(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int needs_selfheal = 0;
+ int ret = 0;
+
+ local = frame->local;
+ layout = local->layout;
+
+ if (local->need_attrheal || local->need_xattr_heal ||
+ local->need_selfheal) {
+ needs_selfheal = 1;
+ }
+
+ ret = dht_layout_normalize(this, &local->loc, layout);
+
+ if (ret != 0) {
+ gf_msg_debug(this->name, 0, "fixing assignment on %s", local->loc.path);
+ needs_selfheal = 1;
+ }
+ return needs_selfheal;
+}
- if (stbuf->ia_type != local->inode->ia_type) {
- gf_log (this->name, GF_LOG_INFO,
- "mismatching filetypes 0%o v/s 0%o for %s",
- (stbuf->ia_type), (local->inode->ia_type),
- local->loc.path);
+static int
+is_permission_different(ia_prot_t *prot1, ia_prot_t *prot2)
+{
+ if ((prot1->owner.read != prot2->owner.read) ||
+ (prot1->owner.write != prot2->owner.write) ||
+ (prot1->owner.exec != prot2->owner.exec) ||
+ (prot1->group.read != prot2->group.read) ||
+ (prot1->group.write != prot2->group.write) ||
+ (prot1->group.exec != prot2->group.exec) ||
+ (prot1->other.read != prot2->other.read) ||
+ (prot1->other.write != prot2->other.write) ||
+ (prot1->other.exec != prot2->other.exec) ||
+ (prot1->suid != prot2->suid) || (prot1->sgid != prot2->sgid) ||
+ (prot1->sticky != prot2->sticky)) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
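
is_permission_different() above compares the rwx bits plus suid/sgid/sticky field by field. For plain POSIX modes the same check can be expressed by masking a mode_t with 07777; a standalone sketch, not part of the patch:

#include <stdio.h>
#include <sys/stat.h>

/* equivalent comparison on the permission + suid/sgid/sticky bits only */
static int mode_permission_different(mode_t a, mode_t b)
{
    return (a & 07777) != (b & 07777);
}

int main(void)
{
    printf("%d\n", mode_permission_different(0644, 0644));  /* prints 0 */
    printf("%d\n", mode_permission_different(0644, 04755)); /* prints 1 */
    return 0;
}
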
- local->op_ret = -1;
- local->op_errno = EINVAL;
+int
+dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ int is_dir = 0;
+ int32_t check_mds = 0;
+ int errst = 0;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char gfid_node[GF_UUID_BUF_SIZE] = {0};
+ int32_t mds_xattr_val[1] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ layout = local->layout;
+ gf_msg_debug(this->name, op_errno,
+ "%s: lookup on %s returned with op_ret = %d, op_errno = %d",
+ local->loc.path, prev->name, op_ret, op_errno);
+
+    /* The first successful lookup */
+ if (!op_ret && gf_uuid_is_null(local->gfid)) {
+ memcpy(local->gfid, stbuf->ia_gfid, 16);
+ }
+ if (!gf_uuid_is_null(local->gfid)) {
+ gf_uuid_unparse(local->gfid, gfid_local);
+ }
+
+ /* Check if the gfid is different for file from other node */
+ if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+ gf_uuid_unparse(stbuf->ia_gfid, gfid_node);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid different on %s."
+ " gfid local = %s, gfid subvol = %s",
+ local->loc.path, prev->name, gfid_local, gfid_node);
+ }
+
+ LOCK(&frame->lock);
+ {
+ /* TODO: assert equal mode on stbuf->st_mode and
+ local->stbuf->st_mode
+ else mkdir/chmod/chown and fix
+ */
+ ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
- goto unlock;
- }
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
- layout = local->layout;
+ /* The GFID is missing on this subvol. Force a heal. */
+ if (op_errno == ENODATA) {
+ local->need_lookup_everywhere = 1;
+ }
+ goto unlock;
+ }
- is_dir = check_is_dir (inode, stbuf, xattr);
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_dir = check_is_dir(inode, stbuf, xattr);
+ if (!is_dir) {
+ gf_msg_debug(this->name, 0,
+                         "%s: lookup on %s returned non dir 0%o, "
+ "calling lookup_everywhere",
+ local->loc.path, prev->name, stbuf->ia_type);
- if (is_linkfile) {
- gf_log (this->name, GF_LOG_INFO,
- "linkfile found in revalidate for %s",
- local->loc.path);
- local->layout_mismatch = 1;
+ local->need_lookup_everywhere = 1;
+ goto unlock;
+ }
- goto unlock;
+ local->op_ret = 0;
+ if (local->xattr == NULL) {
+ local->xattr = dict_ref(xattr);
+ } else {
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
+
+ if (__is_root_gfid(stbuf->ia_gfid)) {
+ ret = dht_dir_has_layout(xattr, conf->xattr_name);
+ if (ret >= 0) {
+ if (is_greater_time(local->prebuf.ia_ctime,
+ local->prebuf.ia_ctime_nsec,
+ stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
+ /* Choose source */
+ local->prebuf.ia_gid = stbuf->ia_gid;
+ local->prebuf.ia_uid = stbuf->ia_uid;
+
+ local->prebuf.ia_ctime = stbuf->ia_ctime;
+ local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec;
+ local->prebuf.ia_prot = stbuf->ia_prot;
}
+ }
+ }
- if (is_dir) {
- ret = dht_layout_dir_mismatch (this, layout,
- prev->this, &local->loc,
- xattr);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_INFO,
- "mismatching layouts for %s",
- local->loc.path);
+ if (local->stbuf.ia_type != IA_INVAL) {
+ /* This is not the first subvol to respond
+ * Compare values to see if attrs need to be healed
+ */
+ if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
+ (local->stbuf.ia_uid != stbuf->ia_uid) ||
+ (is_permission_different(&local->stbuf.ia_prot,
+ &stbuf->ia_prot))) {
+ local->need_attrheal = 1;
+ }
+ }
- local->layout_mismatch = 1;
+ if (local->inode == NULL)
+ local->inode = inode_ref(inode);
- goto unlock;
- }
- }
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->postparent, postparent);
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
-
- local->op_ret = 0;
- local->stbuf.ia_ino = local->ia_ino;
+ if (!dict_get(xattr, conf->mds_xattr_key)) {
+ gf_msg_debug(this->name, 0,
+ "%s: mds xattr %s is not present "
+ "on %s(gfid = %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid_local);
+ goto unlock;
+ }
- if (local->loc.parent)
- local->postparent.ia_ino = local->loc.parent->ino;
+ /* Save the mds subvol info and stbuf. This is the value that will
+ * be used for healing
+ */
+ local->mds_subvol = prev;
+ local->mds_stbuf = *stbuf;
+
+ /* Save mds subvol on inode ctx */
+
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "%s: Failed to set mds (%s)", local->loc.path, prev->name);
+ }
+ check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+ mds_xattr_val, 1, &errst);
+ if ((check_mds < 0) && !errst) {
+ /* Check if xattrs need to be healed on the directories */
+ local->mds_xattr = dict_ref(xattr);
+ gf_msg_debug(this->name, 0,
+                         "%s: %s is not zero on %s. Xattrs need to be healed. "
+ "(gfid = %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid_local);
+ local->need_xattr_heal = 1;
+ }
+ }
- if (!local->xattr) {
- local->xattr = dict_ref (xattr);
- } else if (is_dir) {
- dht_aggregate_xattr (local->xattr, xattr);
- }
- }
unlock:
- UNLOCK (&frame->lock);
-out:
- this_call_cnt = dht_frame_return (frame);
-
- if (is_last_call (this_call_cnt)) {
- if (!IA_ISDIR (local->stbuf.ia_type)
- && (local->hashed_subvol != local->cached_subvol)
- && (local->stbuf.ia_nlink == 1)
- && (conf && conf->unhashed_sticky_bit)) {
- local->stbuf.ia_prot.sticky = 1;
- }
+ UNLOCK(&frame->lock);
- if (local->layout_mismatch) {
- local->op_ret = -1;
- local->op_errno = ESTALE;
-
- /* Because for 'root' inode, there is no FRESH lookup
- * sent from FUSE layer upon ESTALE, we need to handle
- * that one case here */
- root_gfid[15] = 1;
- if (!local->loc.parent &&
- !uuid_compare (local->loc.inode->gfid, root_gfid)) {
- dht_do_fresh_lookup_on_root (this, frame);
- return 0;
- }
- }
+ this_call_cnt = dht_frame_return(frame);
- WIPE (&local->postparent);
+ if (is_last_call(this_call_cnt)) {
+        /* If the mds subvol is not set correctly */
+ if (!__is_root_gfid(local->gfid) &&
+ (!dict_get(local->xattr, conf->mds_xattr_key))) {
+ local->need_selfheal = 1;
+ }
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
+ /* No need to call xattr heal code if volume count is 1
+ */
+ if (conf->subvolume_cnt == 1) {
+ local->need_xattr_heal = 0;
}
-err:
- return ret;
-}
+ if (local->need_selfheal || local->need_lookup_everywhere) {
+            /* Set the gfid-req so posix will set the GFID */
+ if (!gf_uuid_is_null(local->gfid)) {
+ /* Ok, this should _never_ happen */
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid, 16);
+ } else {
+ if (!gf_uuid_is_null(local->gfid_req))
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid_req, 16);
+ }
+ }
+ if (local->need_lookup_everywhere) {
+ local->need_lookup_everywhere = 0;
+ dht_lookup_everywhere(frame, this, &local->loc);
+ return 0;
+ }
-int
-dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- xlator_t *cached_subvol = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+ if (local->op_ret == 0) {
+ if (dht_needs_selfheal(frame, this)) {
+ goto selfheal;
+ }
- local = frame->local;
- cached_subvol = local->cached_subvol;
- conf = this->private;
+ dht_layout_set(this, local->inode, layout);
+ if (local->inode) {
+ dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+ }
- ret = dht_layout_preset (this, local->cached_subvol, inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set layout for subvolume %s",
- cached_subvol ? cached_subvol->name : "<nil>");
- local->op_ret = -1;
- local->op_errno = EINVAL;
- goto unwind;
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
}
- local->op_ret = 0;
- if ((local->stbuf.ia_nlink == 1)
- && (conf && conf->unhashed_sticky_bit)) {
- local->stbuf.ia_prot.sticky = 1;
- }
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ /* Delete mds xattr at the time of STACK UNWIND */
+ if (local->xattr)
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
- if (local->loc.parent)
- local->postparent.ia_ino = local->loc.parent->ino;
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ }
-unwind:
- WIPE (&local->postparent);
+ return 0;
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
+selfheal:
+ FRAME_SU_DO(frame, dht_local_t);
+ ret = dht_selfheal_directory(frame, dht_lookup_selfheal_cbk, &local->loc,
+ layout);
out:
- return ret;
+ return ret;
}
+static int
+dht_lookup_directory(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ int call_cnt = 0;
+ int i = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int ret = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, unwind);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, unwind);
+ GF_VALIDATE_OR_GOTO("dht", this->private, unwind);
+ GF_VALIDATE_OR_GOTO("dht", loc, unwind);
+
+ conf = this->private;
+ local = frame->local;
+
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ goto unwind;
+ }
+
+ if (local->xattr != NULL) {
+ dict_unref(local->xattr);
+ local->xattr = NULL;
+ }
+
+ if (!gf_uuid_is_null(local->gfid)) {
+ /* use this gfid in order to heal any missing ones */
+ ret = dict_set_gfuuid(local->xattr_req, "gfid-req", local->gfid, true);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: Failed to set dictionary value:"
+ " key = gfid-req",
+ local->loc.path);
+ }
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(
+ frame, dht_lookup_dir_cbk, conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc, local->xattr_req);
+ }
+ return 0;
+unwind:
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+out:
+ return 0;
+}
int
-dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
+dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent)
{
- int ret = 0;
- dht_local_t *local = NULL;
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
-
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int is_dir = 0;
+ int is_linkfile = 0;
+ int follow_link = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ uint32_t vol_commit_hash = 0;
+ xlator_t *subvol = NULL;
+ int32_t check_mds = 0;
+ int errst = 0, i = 0;
+ int32_t mds_xattr_val[1] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, err);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, err);
+ GF_VALIDATE_OR_GOTO("dht", cookie, err);
+ GF_VALIDATE_OR_GOTO("dht", this->private, err);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (!conf->vch_forced) {
+ /* Update the commithash value if available
+ */
+ ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
- local = frame->local;
- hashed_subvol = local->hashed_subvol;
- cached_subvol = local->cached_subvol;
+ gf_uuid_unparse(local->loc.gfid, gfid);
- if (local->file_count && local->dir_count) {
- gf_log (this->name, GF_LOG_ERROR,
- "path %s exists as a file on one subvolume "
- "and directory on another. "
- "Please fix it manually",
- local->loc.path);
- DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL,
- NULL);
- return 0;
- }
+ gf_msg_debug(this->name, op_errno,
+ "%s: revalidate lookup on %s returned op_ret %d",
+ local->loc.path, prev->name, op_ret);
- if (local->dir_count) {
- dht_lookup_directory (frame, this, &local->loc);
- return 0;
+ LOCK(&frame->lock);
+ {
+ if (gf_uuid_is_null(local->gfid)) {
+ memcpy(local->gfid, local->loc.gfid, 16);
}
- if (!cached_subvol) {
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
- NULL);
- return 0;
- }
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+
+ if ((op_errno != ENOTCONN) && (op_errno != ENOENT) &&
+ (op_errno != ESTALE)) {
+ gf_msg(this->name, GF_LOG_INFO, op_errno,
+ DHT_MSG_REVALIDATE_CBK_INFO,
+ "Revalidate: subvolume %s for %s "
+ "(gfid = %s) returned -1",
+ prev->name, local->loc.path, gfid);
+ }
+ if (op_errno == ESTALE) {
+ /* propagate the ESTALE to parent.
+ * setting local->return_estale would send
+ * ESTALE to parent. */
+ local->return_estale = 1;
+ }
+
+ /* if it is ENOENT, we may have to do a
+ * 'lookup_everywhere()' to make sure
+ * the file is not migrated */
+ if (op_errno == ENOENT) {
+ if (IA_ISREG(local->loc.inode->ia_type)) {
+ gf_msg_debug(this->name, 0,
+ "found ENOENT for %s. "
+ "Setting "
+ "need_lookup_everywhere"
+ " flag to 1",
+ local->loc.path);
+
+ local->need_lookup_everywhere = 1;
+ } else if (IA_ISDIR(local->loc.inode->ia_type)) {
+ layout = local->layout;
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == prev) {
+ layout->list[i].err = op_errno;
+ break;
+ }
+ }
- if (!hashed_subvol) {
- gf_log (this->name, GF_LOG_INFO,
- "cannot create linkfile file for %s on %s: "
- "hashed subvolume cannot be found.",
- local->loc.path, cached_subvol->name);
+ local->need_selfheal = 1;
+ }
+ }
- local->op_ret = 0;
- local->op_errno = 0;
+ /* The GFID is missing on this subvol. Lookup everywhere to force a
+ * gfid heal
+ */
+ if ((op_errno == ENODATA) &&
+ (IA_ISDIR(local->loc.inode->ia_type))) {
+ local->need_lookup_everywhere = 1;
+ }
- ret = dht_layout_preset (frame->this, cached_subvol,
- local->inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "failed to set layout for subvol %s",
- cached_subvol ? cached_subvol->name :
- "<nil>");
- local->op_ret = -1;
- local->op_errno = EINVAL;
- }
+ goto unlock;
+ }
- if (local->loc.parent)
- local->postparent.ia_ino =
- local->loc.parent->ino;
+ if ((!IA_ISINVAL(local->inode->ia_type)) &&
+ stbuf->ia_type != local->inode->ia_type) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+ "mismatching filetypes 0%o v/s 0%o for %s,"
+ " gfid = %s",
+ (stbuf->ia_type), (local->inode->ia_type), local->loc.path,
+ gfid);
- WIPE (&local->postparent);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
- DHT_STACK_UNWIND (lookup, frame, local->op_ret,
- local->op_errno, local->inode,
- &local->stbuf, local->xattr,
- &local->postparent);
- return 0;
+ goto unlock;
}
- gf_log (this->name, GF_LOG_DEBUG,
- "linking file %s existing on %s to %s (hash)",
- local->loc.path, cached_subvol->name,
- hashed_subvol->name);
+ layout = local->layout;
- ret = dht_linkfile_create (frame,
- dht_lookup_linkfile_create_cbk,
- cached_subvol, hashed_subvol, &local->loc);
+ is_dir = check_is_dir(inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr,
+ conf->link_xattr_name);
+ if (is_linkfile) {
+ follow_link = 1;
+ goto unlock;
+ }
+ if (is_dir) {
+ ret = dht_dir_has_layout(xattr, conf->xattr_name);
+ if (ret >= 0) {
+ if (is_greater_time(local->prebuf.ia_ctime,
+ local->prebuf.ia_ctime_nsec,
+ stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
+ /* Choose source */
+ local->prebuf.ia_gid = stbuf->ia_gid;
+ local->prebuf.ia_uid = stbuf->ia_uid;
+
+ local->prebuf.ia_ctime = stbuf->ia_ctime;
+ local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec;
+
+ if (__is_root_gfid(stbuf->ia_gfid))
+ local->prebuf.ia_prot = stbuf->ia_prot;
+ }
+ }
+
+ if (local->stbuf.ia_type != IA_INVAL) {
+ if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
+ (local->stbuf.ia_uid != stbuf->ia_uid) ||
+ is_permission_different(&local->stbuf.ia_prot,
+ &stbuf->ia_prot)) {
+ local->need_attrheal = 1;
+ }
+ }
+
+ if (!dict_get(xattr, conf->mds_xattr_key)) {
+ gf_msg_debug(this->name, 0,
+ "%s: internal xattr %s is not present"
+ " on subvol %s(gfid is %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid);
+ } else {
+ check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+ mds_xattr_val, 1, &errst);
+ local->mds_subvol = prev;
+ local->mds_stbuf.ia_gid = stbuf->ia_gid;
+ local->mds_stbuf.ia_uid = stbuf->ia_uid;
+ local->mds_stbuf.ia_prot = stbuf->ia_prot;
+
+ /* save mds subvol on inode ctx */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set MDS subvol for %s vol is %s",
+ local->loc.path, prev->name);
+ }
+ if ((check_mds < 0) && !errst) {
+ /* Check if xattrs need to be healed on the directory
+ */
+ local->mds_xattr = dict_ref(xattr);
+                    gf_msg_debug(this->name, 0,
+                                 "Value of %s is not zero on the "
+                                 "hashed subvol, so the xattr needs "
+                                 "to be healed on the non-hashed "
+                                 "subvols; path is %s, vol name is %s, "
+                                 "gfid is %s",
+                                 conf->mds_xattr_key, local->loc.path,
+                                 prev->name, gfid);
+ local->need_xattr_heal = 1;
+ }
+ }
+ ret = dht_layout_dir_mismatch(this, layout, prev, &local->loc,
+ xattr);
+ if (ret != 0) {
+ /* In memory layout does not match on-disk layout.
+ */
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_MISMATCH,
+ "Mismatching layouts for %s, gfid = %s", local->loc.path,
+ gfid);
- return ret;
-}
+ local->layout_mismatch = 1;
+ goto unlock;
+ }
+ }
-int
-dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- struct iatt *preparent, struct iatt *postparent)
-{
- int this_call_cnt = 0;
+ gf_uuid_copy(local->stbuf.ia_gfid, stbuf->ia_gfid);
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->postparent, postparent);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- dht_lookup_everywhere_done (frame, this);
- }
+ local->op_ret = 0;
- return 0;
-}
+ if (!local->xattr) {
+ local->xattr = dict_ref(xattr);
+ } else if (is_dir) {
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+ if (follow_link) {
+ /* Found a linkto file. Follow it to see if the target file exists
+ */
+ gf_uuid_copy(local->gfid, stbuf->ia_gfid);
-int
-dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- int is_linkfile = 0;
- int is_dir = 0;
- xlator_t *subvol = NULL;
- loc_t *loc = NULL;
- xlator_t *link_subvol = NULL;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
-
- conf = this->private;
-
- local = frame->local;
- loc = &local->loc;
-
- prev = cookie;
- subvol = prev->this;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno != ENOENT)
- local->op_errno = op_errno;
- goto unlock;
- }
+ subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
+ if (!subvol) {
+ op_errno = ESTALE;
+ local->op_ret = -1;
+ } else {
+ STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc,
+ local->xattr_req);
+ return 0;
+ }
+ }
- if (uuid_is_null (local->gfid))
- uuid_copy (local->gfid, buf->ia_gfid);
+ this_call_cnt = dht_frame_return(frame);
- if (uuid_compare (local->gfid, buf->ia_gfid)) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: gfid differs on subvolume %s",
- loc->path, prev->this->name);
+ if (is_last_call(this_call_cnt)) {
+ if (!IA_ISDIR(local->stbuf.ia_type) &&
+ (local->hashed_subvol != local->cached_subvol) &&
+ (local->stbuf.ia_nlink == 1) &&
+ (conf && conf->unhashed_sticky_bit)) {
+ local->stbuf.ia_prot.sticky = 1;
+ }
+ /* No need to call heal code if volume count is 1
+ */
+ if (conf->subvolume_cnt == 1)
+ local->need_xattr_heal = 0;
+
+ if (IA_ISDIR(local->stbuf.ia_type)) {
+ /* No mds xattr found. Trigger a heal to set it */
+ if (!__is_root_gfid(local->loc.inode->gfid) &&
+ (!dict_get(local->xattr, conf->mds_xattr_key)))
+ local->need_selfheal = 1;
+
+ if (dht_needs_selfheal(frame, this)) {
+ if (!__is_root_gfid(local->loc.inode->gfid)) {
+ if (local->mds_subvol) {
+ local->stbuf.ia_gid = local->mds_stbuf.ia_gid;
+ local->stbuf.ia_uid = local->mds_stbuf.ia_uid;
+ local->stbuf.ia_prot = local->mds_stbuf.ia_prot;
+ }
+ } else {
+ local->stbuf.ia_gid = local->prebuf.ia_gid;
+ local->stbuf.ia_uid = local->prebuf.ia_uid;
+ local->stbuf.ia_prot = local->prebuf.ia_prot;
}
- is_linkfile = check_is_linkfile (inode, buf, xattr);
- is_dir = check_is_dir (inode, buf, xattr);
-
- if (is_linkfile) {
- link_subvol = dht_linkfile_subvol (this, inode, buf,
- xattr);
- gf_log (this->name, GF_LOG_DEBUG,
- "found on %s linkfile %s (-> %s)",
- subvol->name, loc->path,
- link_subvol ? link_subvol->name : "''");
- goto unlock;
+ layout = local->layout;
+ dht_selfheal_directory(frame, dht_lookup_selfheal_cbk,
+ &local->loc, layout);
+ return 0;
+ }
+ }
+
+ if (local->layout_mismatch) {
+ /* Found layout mismatch in the directory, need to
+ fix this in the inode context */
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
+
+ if (local->need_lookup_everywhere) {
+ /* As the current layout gave ENOENT error, we would
+ need a new layout */
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
+
+ /* We know that current cached subvol is no longer
+ valid, get the new one */
+ local->cached_subvol = NULL;
+ if (local->xattr_req) {
+ if (!gf_uuid_is_null(local->gfid)) {
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid, 16);
}
+ }
- /* non linkfile GFID takes precedence */
- uuid_copy (local->gfid, buf->ia_gfid);
-
- if (is_dir) {
- local->dir_count++;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "found on %s directory %s",
- subvol->name, loc->path);
- } else {
- local->file_count++;
-
- if (!local->cached_subvol) {
- /* found one file */
- dht_iatt_merge (this, &local->stbuf, buf,
- subvol);
- local->xattr = dict_ref (xattr);
- local->cached_subvol = subvol;
- gf_log (this->name, GF_LOG_DEBUG,
- "found on %s file %s",
- subvol->name, loc->path);
-
- dht_iatt_merge (this, &local->postparent,
- postparent, subvol);
- } else {
- /* This is where we need 'rename' both entries logic */
- gf_log (this->name, GF_LOG_WARNING,
- "multiple subvolumes (%s and %s) have "
- "file %s (preferrably rename the file "
- "in the backend, and do a fresh lookup)",
- local->cached_subvol->name,
- subvol->name, local->loc.path);
- }
- }
+ dht_lookup_everywhere(frame, this, &local->loc);
+ return 0;
+ }
+ if (local->return_estale) {
+ local->op_ret = -1;
+ local->op_errno = ESTALE;
}
-unlock:
- UNLOCK (&frame->lock);
- if (is_linkfile) {
- gf_log (this->name, GF_LOG_INFO,
- "deleting stale linkfile %s on %s",
- loc->path, subvol->name);
- STACK_WIND (frame, dht_lookup_unlink_cbk,
- subvol, subvol->fops->unlink, loc);
- return 0;
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
}
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- dht_lookup_everywhere_done (frame, this);
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+
+        /* local->stbuf is updated only from subvols which have a layout.
+         * The reason is to avoid choosing the attr heal source from newly
+         * added bricks. If, for example, we have only one subvol and for
+         * some reason the layout is not present on it, then
+         * local->stbuf.ia_type will be IA_INVAL. This indicates that the
+         * subvols active in the cluster do not have layouts on disk.
+         * Unwind with ESTALE to trigger a fresh lookup */
+ if (is_dir && local->stbuf.ia_type == IA_INVAL) {
+ local->op_ret = -1;
+ local->op_errno = ESTALE;
}
+ /* Delete mds xattr at the time of STACK UNWIND */
+ if (local->xattr)
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ }
+
+err:
+ return ret;
+}
+
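/* When the last revalidate reply arrives, dht_revalidate_cbk above picks
 * exactly one follow-up action based on the flags accumulated while the
 * replies were merged. A condensed, illustrative classifier of that dispatch
 * order; the enum and helper are assumptions for this sketch only:
 */
enum dht_revalidate_action {
    DHT_REVAL_SELFHEAL_DIR,      /* dht_selfheal_directory()      */
    DHT_REVAL_FRESH_DIR_LOOKUP,  /* dht_lookup_directory()        */
    DHT_REVAL_LOOKUP_EVERYWHERE, /* dht_lookup_everywhere()       */
    DHT_REVAL_UNWIND             /* DHT_STACK_UNWIND(lookup, ...) */
};

static enum dht_revalidate_action
dht_revalidate_action_sketch(gf_boolean_t dir_needs_selfheal,
                             gf_boolean_t layout_mismatch,
                             gf_boolean_t need_lookup_everywhere)
{
    if (dir_needs_selfheal)
        return DHT_REVAL_SELFHEAL_DIR;
    if (layout_mismatch)
        return DHT_REVAL_FRESH_DIR_LOOKUP;
    if (need_lookup_everywhere)
        return DHT_REVAL_LOOKUP_EVERYWHERE;
    /* possibly with ESTALE when return_estale was set */
    return DHT_REVAL_UNWIND;
}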
+static int
+dht_lookup_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+ local = frame->local;
+ cached_subvol = local->cached_subvol;
+ conf = this->private;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ if (local->locked)
+ dht_unlock_namespace(frame, &local->lock[0]);
+
+ ret = dht_layout_preset(this, local->cached_subvol, local->loc.inode);
+ if (ret < 0) {
+ gf_msg_debug(this->name, EINVAL,
+ "Failed to set layout for subvolume %s, "
+ "(gfid = %s)",
+ cached_subvol ? cached_subvol->name : "<nil>", gfid);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ goto unwind;
+ }
+
+ local->op_ret = 0;
+ if ((local->stbuf.ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) {
+ local->stbuf.ia_prot.sticky = 1;
+ }
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+
+unwind:
+ gf_msg_debug(this->name, 0,
+ "creation of linkto on hashed subvol:%s, "
+ "returned with op_ret %d and op_errno %d: %s",
+ local->hashed_subvol->name, op_ret, op_errno,
+ uuid_utoa(local->loc.gfid));
+
+ if (local->linked == _gf_true)
+ dht_linkfile_attr_heal(frame, this);
+
+ dht_set_fixed_dir_stat(&local->postparent);
+
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
out:
- return ret;
+ return ret;
}
+static int
+dht_lookup_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ const char *path = NULL;
+
+ local = (dht_local_t *)frame->local;
+ path = local->loc.path;
+ FRAME_SU_UNDO(frame, dht_local_t);
-int
-dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc)
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+ "lookup_unlink returned with "
+ "op_ret -> %d and op-errno -> %d for %s",
+ op_ret, op_errno, ((path == NULL) ? "null" : path));
+
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_lookup_everywhere_done(frame, this);
+ }
+
+ return 0;
+}
+
+static int
+dht_lookup_unlink_of_false_linkto_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int i = 0;
- int call_cnt = 0;
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ const char *path = NULL;
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
- GF_VALIDATE_OR_GOTO ("dht", loc, out);
+ local = (dht_local_t *)frame->local;
+ path = local->loc.path;
- conf = this->private;
- local = frame->local;
+ FRAME_SU_UNDO(frame, dht_local_t);
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+ "lookup_unlink returned with "
+ "op_ret -> %d and op-errno -> %d for %s",
+ op_ret, op_errno, ((path == NULL) ? "null" : path));
- if (!local->inode)
- local->inode = inode_ref (loc->inode);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ if ((op_ret == 0) || ((op_errno != EBUSY) && (op_errno != ENOTCONN))) {
+ dht_lookup_everywhere_done(frame, this);
+ } else {
+            /* When dht_lookup_everywhere was performed, one cached
+             * and one hashed file were found, and the hashed file did
+             * not point to the cached node mentioned above. So it was
+             * considered stale and an unlink was attempted, but the
+             * unlink failed - possibly because rebalance is in
+             * progress. We now effectively have two data files: the
+             * one found during lookup_everywhere and the one whose
+             * unlink failed. At this point we cannot decide which one
+             * to choose, because the first cached file may be
+             * truncated after rebalance; if it were chosen as the
+             * cached node the application would fail. So return EIO. */
+
+ if (op_errno == EBUSY) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_UNLINK_FAILED,
+ "Could not unlink the linkto file as "
+ "either fd is open and/or linkto xattr "
+ "is set for %s",
+ ((path == NULL) ? "null" : path));
+ }
+ DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+ }
+ }
+
+ return 0;
+}
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_everywhere_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- loc, local->xattr_req);
- }
+static int
+dht_lookup_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ const char *path = NULL;
- return 0;
-out:
- DHT_STACK_UNWIND (lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL);
-err:
- return -1;
+    /* NOTE:
+     * If the stale-file unlink fails because there is an open fd on it or
+     * it is not a dht linkto file, posix_unlink returns EBUSY, which is
+     * overwritten here with ENOENT.
+     */
+
+ local = frame->local;
+
+ if (local) {
+ FRAME_SU_UNDO(frame, dht_local_t);
+ if (local->loc.path)
+ path = local->loc.path;
+ }
+
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+ "Returned with op_ret %d and "
+ "op_errno %d for %s",
+ op_ret, op_errno, ((path == NULL) ? "null" : path));
+
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL);
+
+ return 0;
}
+static int
+dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_t *dict)
+{
+ int ret = 0;
-int
-dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- call_frame_t *prev = NULL;
- dht_local_t *local = NULL;
- xlator_t *subvol = NULL;
- loc_t *loc = NULL;
- dht_conf_t *conf = NULL;
- int ret = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, unwind);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind);
- GF_VALIDATE_OR_GOTO ("dht", this->private, unwind);
- GF_VALIDATE_OR_GOTO ("dht", cookie, unwind);
-
- prev = cookie;
- subvol = prev->this;
- conf = this->private;
- local = frame->local;
- loc = &local->loc;
+ ret = dict_set_int32_sizen(dict, DHT_SKIP_NON_LINKTO_UNLINK, 1);
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "lookup of %s on %s (following linkfile) failed (%s)",
- local->loc.path, subvol->name, strerror (op_errno));
- goto err;
- }
+ if (ret)
+ return -1;
- if (check_is_dir (inode, stbuf, xattr)) {
- gf_log (this->name, GF_LOG_INFO,
- "lookup of %s on %s (following linkfile) reached dir",
- local->loc.path, subvol->name);
- goto err;
- }
+ ret = dict_set_int32_sizen(dict, DHT_SKIP_OPEN_FD_UNLINK, 1);
- if (check_is_linkfile (inode, stbuf, xattr)) {
- gf_log (this->name, GF_LOG_INFO,
- "lookup of %s on %s (following linkfile) reached link",
- local->loc.path, subvol->name);
- goto err;
- }
+ if (ret)
+ return -1;
- if (uuid_compare (local->gfid, stbuf->ia_gfid)) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: gfid different on data file on %s",
- local->loc.path, subvol->name);
- goto err;
- }
+ return 0;
+}
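/* The callers further down always use the helper above the same way: fill a
 * dict with the two skip keys, switch the frame to super-user and wind the
 * unlink with that dict, so posix deletes the file only if it is still a
 * linkto file with no open fds. A rough sketch of that calling sequence; the
 * wrapper name is an assumption and the callback type is the usual
 * fop_unlink_cbk_t:
 */
static int
dht_wind_guarded_unlink_sketch(call_frame_t *frame, xlator_t *subvol,
                               loc_t *loc, fop_unlink_cbk_t cbk)
{
    dict_t *dict_req = dict_new();

    if (!dict_req)
        return -1;

    if (dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_req)) {
        dict_unref(dict_req);
        return -1; /* caller decides whether to skip or fail with EIO */
    }

    FRAME_SU_DO(frame, dht_local_t);
    STACK_WIND(frame, cbk, subvol, subvol->fops->unlink, loc, 0, dict_req);

    dict_unref(dict_req);
    return 0;
}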
- if ((stbuf->ia_nlink == 1)
- && (conf && conf->unhashed_sticky_bit)) {
- stbuf->ia_prot.sticky = 1;
- }
- dht_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino);
- if (local->loc.parent)
- postparent->ia_ino = local->loc.parent->ino;
+static int32_t
+dht_linkfile_create_lookup_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int call_cnt = 0, ret = 0;
+ xlator_t *subvol = NULL;
+ uuid_t gfid = {
+ 0,
+ };
+ char gfid_str[GF_UUID_BUF_SIZE] = {0};
+
+ subvol = cookie;
+ local = frame->local;
+
+ if (subvol == local->hashed_subvol) {
+ if ((op_ret == 0) || (op_errno != ENOENT))
+ local->dont_create_linkto = _gf_true;
+ } else {
+ if (gf_uuid_is_null(local->gfid))
+ gf_uuid_copy(gfid, local->loc.gfid);
+ else
+ gf_uuid_copy(gfid, local->gfid);
+
+ if ((op_ret == 0) && gf_uuid_compare(gfid, buf->ia_gfid)) {
+ gf_uuid_unparse(gfid, gfid_str);
+ gf_msg_debug(this->name, 0,
+ "gfid (%s) different on cached subvol "
+ "(%s) and looked up inode (%s), not "
+ "creating linkto",
+ uuid_utoa(buf->ia_gfid), subvol->name, gfid_str);
+ local->dont_create_linkto = _gf_true;
+ } else if (op_ret == -1) {
+ local->dont_create_linkto = _gf_true;
+ }
+ }
+
+ call_cnt = dht_frame_return(frame);
+ if (is_last_call(call_cnt)) {
+ if (local->dont_create_linkto)
+ goto no_linkto;
+ else {
+ gf_msg_debug(this->name, 0,
+ "Creating linkto file on %s(hash) to "
+ "%s on %s (gfid = %s)",
+ local->hashed_subvol->name, local->loc.path,
+ local->cached_subvol->name, gfid_str);
+
+ ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk,
+ this, local->cached_subvol,
+ local->hashed_subvol, &local->loc);
+
+ if (ret < 0)
+ goto no_linkto;
+ }
+ }
+
+ return 0;
+
+no_linkto:
+ gf_msg_debug(this->name, 0,
+ "skipped linkto creation (path:%s) (gfid:%s) "
+ "(hashed-subvol:%s) (cached-subvol:%s)",
+ local->loc.path, gfid_str, local->hashed_subvol->name,
+ local->cached_subvol->name);
+
+ dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode,
+ &local->stbuf, &local->preparent,
+ &local->postparent, local->xattr);
+ return 0;
+}
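/* Condensed form of the per-reply decision made above before a linkto file
 * is created on the hashed subvolume: anything already present on the hashed
 * subvol, a failed lookup on the cached side, or a gfid mismatch all cancel
 * the creation. Illustrative helper only; it returns non-zero when this
 * reply still permits linkto creation:
 */
static int
dht_linkto_precheck_sketch(gf_boolean_t from_hashed, int32_t op_ret,
                           int32_t op_errno, gf_boolean_t gfid_matches)
{
    if (from_hashed)
        /* only a clean ENOENT on the hashed subvol allows creation */
        return (op_ret != 0) && (op_errno == ENOENT);

    /* cached side: the lookup must succeed and the gfids must agree */
    return (op_ret == 0) && gfid_matches;
}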
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "failed to set layout for subvolume %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- }
+static int32_t
+dht_call_lookup_linkfile_create(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ int i = 0;
+ xlator_t *subvol = NULL;
-unwind:
- WIPE (postparent);
+ local = frame->local;
+ if (gf_uuid_is_null(local->gfid))
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ else
+ gf_uuid_unparse(local->gfid, gfid);
- DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
- postparent);
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "protecting namespace failed, skipping linkto "
+ "creation (path:%s)(gfid:%s)(hashed-subvol:%s)"
+ "(cached-subvol:%s)",
+ local->loc.path, gfid, local->hashed_subvol->name,
+ local->cached_subvol->name);
+ goto err;
+ }
- return 0;
+ local->locked = _gf_true;
+
+ local->call_cnt = 2;
+
+ for (i = 0; i < 2; i++) {
+ subvol = (subvol == NULL) ? local->hashed_subvol : local->cached_subvol;
+
+ STACK_WIND_COOKIE(frame, dht_linkfile_create_lookup_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, NULL);
+ }
+
+ return 0;
err:
- dht_lookup_everywhere (frame, this, loc);
-out:
- return 0;
+ dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode,
+ &local->stbuf, &local->preparent,
+ &local->postparent, local->xattr);
+ return 0;
}
+/* Rebalance is performed from cached_node to hashed_node. Initial cached_node
+ * contains a non-linkto file. After migration it is converted to linkto and
+ * then unlinked. And at hashed_subvolume, first a linkto file is present,
+ * then after migration it is converted to a non-linkto file.
+ *
+ * Let's assume a file is present on the cached subvolume and a new brick is
+ * added, which becomes the new hashed subvolume. A fresh lookup on the newly
+ * added hashed subvolume will fail and dht_lookup_everywhere gets called. If
+ * rebalance is in progress just before the dht_lookup_everywhere request is
+ * sent, then:
+ *
+ * the cached subvolume may show: a non-linkto file, a linkto file, or no file
+ * the hashed subvolume may show: no file, a linkto file, or a non-linkto file
+ *
+ * So this boils down to 9 cases:
+ * at cached_subvol at hashed_subvol
+ * ---------------- -----------------
+ *
+ *a) No file No file
+ * [request reached after [Request reached before
+ * migration] Migration]
+ *
+ *b) No file Linkto File
+ *
+ *c) No file Non-Linkto File
+ *
+ *d) Linkto No-File
+ *
+ *e) Linkto Linkto
+ *
+ *f) Linkto Non-Linkto
+ *
+ *g) NonLinkto No-File
+ *
+ *h) NonLinkto Linkto
+ *
+ *i) NonLinkto NonLinkto
+ *
+ * dht_lookup_everywhere_done takes its decision based on which of the above
+ * cases applies.
+ */
-int
-dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc)
+static int
+dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this)
{
- int call_cnt = 0;
- int i = 0;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int ret = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, unwind);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind);
- GF_VALIDATE_OR_GOTO ("dht", this->private, unwind);
- GF_VALIDATE_OR_GOTO ("dht", loc, unwind);
+ int ret = 0;
+ dht_local_t *local = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_layout_t *layout = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ gf_boolean_t found_non_linkto_on_hashed = _gf_false;
+
+ local = frame->local;
+ hashed_subvol = local->hashed_subvol;
+ cached_subvol = local->cached_subvol;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ if (local->file_count && local->dir_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+               "path %s (gfid = %s) exists as a file on one "
+ "subvolume and directory on another. "
+ "Please fix it manually",
+ local->loc.path, gfid);
+ DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+ return 0;
+ }
+ if (local->op_ret && local->gfid_missing) {
+ if (gf_uuid_is_null(local->gfid_req)) {
+ DHT_STACK_UNWIND(lookup, frame, -1, ENODATA, NULL, NULL, NULL,
+ NULL);
+ return 0;
+ }
+ /* A hack */
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
+
+ if (local->dir_count) {
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
+
+ gf_msg_debug(this->name, 0,
+ "STATUS: hashed_subvol %s "
+ "cached_subvol %s",
+ (hashed_subvol == NULL) ? "null" : hashed_subvol->name,
+ (cached_subvol == NULL) ? "null" : cached_subvol->name);
+
+ if (!cached_subvol) {
+ if (local->skip_unlink.handle_valid_link && hashed_subvol) {
+            /* Purpose of "DHT_SKIP_NON_LINKTO_UNLINK":
+             * If this lookup is performed by rebalance, the rebalance
+             * process may have detected the file on the hashed subvol,
+             * but by the time it sends the lookup request to the
+             * cached node the file has been migrated, so the fully
+             * migrated file now sits on the original hashed node.
+             * With the current logic, because this process fails to
+             * find the cached node, it would unlink the file on the
+             * original hashed node.
+             *
+             * We avoid this by setting the key and checking in
+             * posix_unlink that the file is unlinked only if it is
+             * still a linkto file and not a migrated data file.
+             */
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(
+ local->xattr_req);
+
+ if (ret) {
+ /* If for some reason, setting key in the dict
+                /* If for some reason setting the key in the dict
+                 * fails, return ENOENT: as far as this process is
+                 * concerned it only detected a stale link file.
+                 *
+                 * The next lookup will delete it.
+                 *
+                 * Deleting the stale link file when setting the key
+                 * in the dict fails may cause data loss because of
+                 * the race mentioned above.
+                 */
- conf = this->private;
- local = frame->local;
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+ NULL);
+ } else {
+ local->skip_unlink.handle_valid_link = _gf_false;
+
+ gf_msg_debug(this->name, 0,
+                             "No cached file was found and the "
+                             "unlink on hashed was skipped earlier, "
+                             "so performing it now: %s",
+ local->loc.path);
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND(frame, dht_lookup_unlink_stale_linkto_cbk,
+ hashed_subvol, hashed_subvol->fops->unlink,
+ &local->loc, 0, local->xattr_req);
+ }
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
+ } else {
+ gf_msg_debug(this->name, 0,
+ "There was no cached file and "
+ "unlink on hashed is not skipped %s",
+ local->loc.path);
- local->layout = dht_layout_new (this, conf->subvolume_cnt);
- if (!local->layout) {
- goto unwind;
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL);
}
+ return 0;
+ }
- if (local->xattr != NULL) {
- dict_unref (local->xattr);
- local->xattr = NULL;
- }
+    /* At the time of dht_lookup, no file was found on the hashed subvol,
+     * which is why dht_lookup_everywhere was called. But by the time
+     * dht_lookup_everywhere reached the server, the file might already
+     * have been migrated, in which case we find a migrated file at the
+     * hashed node. In that case store the layout in the inode context and
+     * return successfully.
+     */
- if (!uuid_is_null (local->gfid))
- ret = dict_set_static_bin (local->xattr_req, "gfid-req",
- local->gfid, 16);
+ if (hashed_subvol || local->need_lookup_everywhere) {
+ if (local->need_lookup_everywhere) {
+ found_non_linkto_on_hashed = _gf_true;
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
+ } else if ((local->file_count == 1) &&
+ (hashed_subvol == cached_subvol)) {
+ gf_msg_debug(this->name, 0,
+ "found cached file on hashed subvolume "
+ "so store in context and return for %s",
+ local->loc.path);
+
+ found_non_linkto_on_hashed = _gf_true;
}
- return 0;
-unwind:
- DHT_STACK_UNWIND (lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
-out:
- return 0;
-}
+ if (found_non_linkto_on_hashed)
+ goto preset_layout;
+ }
+ if (hashed_subvol) {
+ if (local->skip_unlink.handle_valid_link == _gf_true) {
+ if (cached_subvol == local->skip_unlink.hash_links_to) {
+ if (gf_uuid_compare(local->skip_unlink.cached_gfid,
+ local->skip_unlink.hashed_gfid)) {
+ /*GFID different, return error*/
+ DHT_STACK_UNWIND(lookup, frame, -1, ESTALE, NULL, NULL,
+ NULL, NULL);
-int
-dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- char is_linkfile = 0;
- char is_dir = 0;
- xlator_t *subvol = NULL;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- loc_t *loc = NULL;
- call_frame_t *prev = NULL;
- int ret = 0;
- uint64_t tmp_layout = 0;
- dht_layout_t *parent_layout = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
-
- conf = this->private;
-
- prev = cookie;
- local = frame->local;
- loc = &local->loc;
+ return 0;
+ }
- /* This is required for handling stale linkfile deletion,
- * or any more call which happens from this 'loc'.
- */
- if (!op_ret && uuid_is_null (local->gfid))
- memcpy (local->gfid, stbuf->ia_gfid, 16);
-
- if (ENTRY_MISSING (op_ret, op_errno)) {
- gf_log (this->name, GF_LOG_TRACE, "Entry %s missing on subvol"
- " %s", loc->path, prev->this->name);
- if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_ON) {
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this, loc);
- return 0;
+ ret = dht_layout_preset(this, cached_subvol, local->loc.inode);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_INFO, 0,
+ DHT_MSG_LAYOUT_PRESET_FAILED,
+ "Could not set pre-set layout "
+ "for subvolume %s",
+ cached_subvol->name);
}
- if ((conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) &&
- (loc->parent)) {
- ret = inode_ctx_get (loc->parent, this, &tmp_layout);
- parent_layout = (dht_layout_t *)(long)tmp_layout;
- if (parent_layout->search_unhashed) {
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
+
+ local->op_ret = (ret == 0) ? ret : -1;
+ local->op_errno = (ret == 0) ? ret : EINVAL;
+
+ /* Presence of local->cached_subvol validates
+ * that lookup from cached node is successful
+ */
+
+ if (!local->op_ret && local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
}
- }
- if (op_ret == 0) {
- is_dir = check_is_dir (inode, stbuf, xattr);
- if (is_dir) {
- local->inode = inode_ref (inode);
- local->xattr = dict_ref (xattr);
+ gf_msg_debug(this->name, 0,
+ "Skipped unlinking linkto file "
+ "on the hashed subvolume. "
+ "Returning success as it is a "
+ "valid linkto file. Path:%s",
+ local->loc.path);
+
+ goto unwind_hashed_and_cached;
+ } else {
+ local->skip_unlink.handle_valid_link = _gf_false;
+
+ gf_msg_debug(this->name, 0,
+ "Linkto file found on hashed "
+ "subvol "
+ "and data file found on cached "
+ "subvolume. But linkto points to "
+ "different cached subvolume (%s) "
+ "path %s",
+ (local->skip_unlink.hash_links_to
+ ? local->skip_unlink.hash_links_to->name
+ : " <nil>"),
+ local->loc.path);
+
+ if (local->skip_unlink.opend_fd_count == 0) {
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(
+ local->xattr_req);
+
+ if (ret) {
+ DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL,
+ NULL, NULL);
+ } else {
+ local->call_cnt = 1;
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND(frame, dht_lookup_unlink_of_false_linkto_cbk,
+ hashed_subvol, hashed_subvol->fops->unlink,
+ &local->loc, 0, local->xattr_req);
+ }
+
+ return 0;
}
+ }
}
+ }
+
+preset_layout:
- if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) {
- dht_lookup_directory (frame, this, &local->loc);
+ if (found_non_linkto_on_hashed) {
+ if (local->need_lookup_everywhere) {
+ if (gf_uuid_compare(local->gfid, local->inode->gfid)) {
+ /* GFID different, return error */
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+ NULL);
return 0;
+ }
}
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG, "Lookup of %s for subvolume"
- " %s failed with error %s", loc->path, prev->this->name,
- strerror (op_errno));
- goto out;
+ local->op_ret = 0;
+ local->op_errno = 0;
+ layout = dht_layout_for_subvol(this, cached_subvol);
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "%s: no pre-set layout for subvolume %s,"
+ " gfid = %s",
+ local->loc.path,
+ (cached_subvol ? cached_subvol->name : "<nil>"), gfid);
}
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
-
- if (!is_linkfile) {
- /* non-directory and not a linkfile */
-
- dht_itransform (this, prev->this, stbuf->ia_ino,
- &stbuf->ia_ino);
- if (loc->parent)
- postparent->ia_ino = loc->parent->ino;
-
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "could not set pre-set layout for subvolume %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
- goto out;
+ ret = dht_layout_set(this, local->inode, layout);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "%s: failed to set layout for subvol %s, "
+ "gfid = %s",
+ local->loc.path,
+ (cached_subvol ? cached_subvol->name : "<nil>"), gfid);
}
- subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "linkfile not having link subvolume. path=%s",
- loc->path);
- dht_lookup_everywhere (frame, this, loc);
- return 0;
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
}
- STACK_WIND (frame, dht_lookup_linkfile_cbk,
- subvol, subvol->fops->lookup,
- &local->loc, local->xattr_req);
-
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
return 0;
+ }
-out:
- /*
- * FIXME: postparent->ia_size and postparent->st_blocks do not have
- * correct values. since, postparent corresponds to a directory these
- * two members should have values equal to sum of corresponding values
- * from each of the subvolume. See dht_iatt_merge for reference.
- */
+ if (!hashed_subvol) {
+ gf_msg_debug(this->name, 0,
+ "Cannot create linkfile for %s on %s: "
+ "hashed subvolume cannot be found, gfid = %s.",
+ local->loc.path, cached_subvol->name, gfid);
- WIPE (postparent);
+ local->op_ret = 0;
+ local->op_errno = 0;
- DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
- postparent);
-err:
- return 0;
+ ret = dht_layout_preset(frame->this, cached_subvol, local->inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+ "Failed to set layout for subvol %s"
+ ", gfid = %s",
+ cached_subvol ? cached_subvol->name : "<nil>", gfid);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ }
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ return 0;
+ }
+
+ if (frame->root->op != GF_FOP_RENAME) {
+ local->current = &local->lock[0];
+ ret = dht_protect_namespace(frame, &local->loc, hashed_subvol,
+ &local->current->ns,
+ dht_call_lookup_linkfile_create);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Creating linkto file on %s(hash) to %s on %s "
+ "(gfid = %s)",
+ hashed_subvol->name, local->loc.path, cached_subvol->name,
+ gfid);
+
+ ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk, this,
+ cached_subvol, hashed_subvol, &local->loc);
+ }
+
+ return ret;
+
+unwind_hashed_and_cached:
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ return 0;
}
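/* A simplified, illustrative summary of the outcomes that
 * dht_lookup_everywhere_done above can reach, keyed by what the per-subvolume
 * lookups reported. It omits the stale-linkto cleanup and gfid-missing
 * special cases handled in the real function; every name below is an
 * assumption used only for this sketch:
 */
enum dht_everywhere_outcome {
    DHT_EW_EIO_FILE_AND_DIR, /* same name is file and directory -> EIO    */
    DHT_EW_DIR_LOOKUP,       /* only directories seen -> lookup_directory */
    DHT_EW_ENOENT,           /* nothing usable found -> ENOENT            */
    DHT_EW_USE_CACHED,       /* data file found -> preset layout, unwind  */
    DHT_EW_CREATE_LINKTO     /* data file but no linkto on hashed subvol  */
};

static enum dht_everywhere_outcome
dht_everywhere_outcome_sketch(int file_count, int dir_count,
                              gf_boolean_t have_cached,
                              gf_boolean_t cached_is_on_hashed)
{
    if (file_count && dir_count)
        return DHT_EW_EIO_FILE_AND_DIR;
    if (dir_count)
        return DHT_EW_DIR_LOOKUP;
    if (!have_cached)
        return DHT_EW_ENOENT; /* possibly after unlinking a stale linkto */
    if (cached_is_on_hashed)
        return DHT_EW_USE_CACHED;
    return DHT_EW_CREATE_LINKTO;
}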
+static int
+dht_lookup_everywhere_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ int is_linkfile = 0;
+ int is_dir = 0;
+ loc_t *loc = NULL;
+ xlator_t *link_subvol = NULL;
+ int ret = -1;
+ int32_t fd_count = 0;
+ dht_conf_t *conf = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ dict_t *dict_req = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+ local = frame->local;
+ loc = &local->loc;
+ conf = this->private;
+
+ prev = cookie;
+
+ gf_msg_debug(this->name, 0,
+ "returned with op_ret %d and op_errno %d (%s) "
+ "from subvol %s",
+ op_ret, op_errno, loc->path, prev->name);
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno != ENOENT)
+ local->op_errno = op_errno;
+ if (op_errno == ENODATA)
+ local->gfid_missing = _gf_true;
+ goto unlock;
+ }
-int
-dht_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
-{
- xlator_t *subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int op_errno = -1;
- dht_layout_t *layout = NULL;
- int i = 0;
- int call_cnt = 0;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- conf = this->private;
- if (!conf)
- goto err;
+ if (gf_uuid_is_null(local->gfid))
+ gf_uuid_copy(local->gfid, buf->ia_gfid);
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- if (!dht_filter_loc_subvol_key (this, loc, &local->loc,
- &hashed_subvol)) {
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "copying location failed for path=%s",
- loc->path);
- goto err;
- }
+ gf_uuid_unparse(local->gfid, gfid);
+
+ if (gf_uuid_compare(local->gfid, buf->ia_gfid)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid differs on subvolume %s,"
+ " gfid local = %s, gfid node = %s",
+ loc->path, prev->name, gfid, uuid_utoa(buf->ia_gfid));
}
- if (xattr_req) {
- local->xattr_req = dict_ref (xattr_req);
- } else {
- local->xattr_req = dict_new ();
+ is_linkfile = check_is_linkfile(inode, buf, xattr,
+ conf->link_xattr_name);
+
+ if (is_linkfile) {
+ link_subvol = dht_linkfile_subvol(this, inode, buf, xattr);
+ gf_msg_debug(this->name, 0, "found on %s linkfile %s (-> %s)",
+ prev->name, loc->path,
+ link_subvol ? link_subvol->name : "''");
+ goto unlock;
}
- if (!hashed_subvol)
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- cached_subvol = dht_subvol_get_cached (this, loc->inode);
+ is_dir = check_is_dir(inode, buf, xattr);
- local->cached_subvol = cached_subvol;
- local->hashed_subvol = hashed_subvol;
+ /* non linkfile GFID takes precedence but don't overwrite
+ gfid if we have already found a cached file*/
+ if (!local->cached_subvol)
+ gf_uuid_copy(local->gfid, buf->ia_gfid);
- if (is_revalidate (loc)) {
- local->layout = layout = dht_layout_get (this, loc->inode);
+ if (is_dir) {
+ local->dir_count++;
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "revalidate without cache. path=%s",
- loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ gf_msg_debug(this->name, 0, "found on %s directory %s", prev->name,
+ loc->path);
+ } else {
+ local->file_count++;
- local->ia_ino = loc->inode->ino;
+ gf_msg_debug(this->name, 0, "found cached file on %s for %s",
+ prev->name, loc->path);
- if (layout->gen && (layout->gen < conf->gen)) {
- gf_log (this->name, GF_LOG_TRACE,
- "incomplete layout failure for path=%s",
- loc->path);
+ if (!local->cached_subvol) {
+ /* found one file */
+ dht_iatt_merge(this, &local->stbuf, buf);
- dht_layout_unref (this, local->layout);
- local->layout = NULL;
- local->cached_subvol = NULL;
- goto do_fresh_lookup;
- }
+ local->xattr = dict_ref(xattr);
+ local->cached_subvol = prev;
+
+ gf_msg_debug(this->name, 0,
+ "storing cached on %s file"
+ " %s",
+ prev->name, loc->path);
- local->inode = inode_ref (loc->inode);
+ dht_iatt_merge(this, &local->postparent, postparent);
- local->call_cnt = layout->cnt;
- call_cnt = local->call_cnt;
+ gf_uuid_copy(local->skip_unlink.cached_gfid, buf->ia_gfid);
+ } else {
+ /* This is where we need 'rename' both entries logic */
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_FILE_ON_MULT_SUBVOL,
+ "multiple subvolumes (%s and %s) have "
+ "file %s (preferably rename the file "
+                   "in the backend, and do a fresh lookup)",
+ local->cached_subvol->name, prev->name, local->loc.path);
+ }
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (is_linkfile) {
+ ret = dict_get_int32(xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count);
+
+        /* Any linkto file found on a non-hashed subvolume should
+         * be unlinked (performed in the "else if" block below).
+         *
+         * But a linkto file found on the hashed subvolume may be
+         * pointing to a valid cached node, so its unlink is skipped
+         * here and the checks are performed later inside
+         * dht_lookup_everywhere_done: if the linkto file turns out
+         * to be stale it is deleted there, otherwise the unlink is
+         * skipped.
+         */
- /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute,
- * revalidates directly go to the cached-subvolume.
+ if (local->hashed_subvol && local->hashed_subvol == prev) {
+ local->skip_unlink.handle_valid_link = _gf_true;
+ local->skip_unlink.opend_fd_count = fd_count;
+ local->skip_unlink.hash_links_to = link_subvol;
+ gf_uuid_copy(local->skip_unlink.hashed_gfid, buf->ia_gfid);
+
+ gf_msg_debug(this->name, 0,
+ "Found"
+ " one linkto file on hashed subvol %s "
+ "for %s: Skipping unlinking till "
+ "everywhere_done",
+ prev->name, loc->path);
+
+ } else if (!ret && (fd_count == 0)) {
+ dict_req = dict_new();
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_req);
+
+ if (ret) {
+                /* Skip the unlink on dict_set failure.
+                 * The file was found as a linkto file on a non-hashed
+                 * subvolume. In the current implementation, finding a
+                 * linkto file on a non-hashed subvolume does not
+                 * always imply that it is stale, so the file should be
+                 * deleted only when both the fd is closed and the
+                 * linkto xattr is set. If dict_set fails, skip the
+                 * unlink rather than risk deleting a valid file.
+                 * NOTE: dht_frame_return should still get called for
+                 * this block.
+                 */
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
-
- for (i = 0; i < layout->cnt; i++) {
- subvol = layout->list[i].xlator;
-
- STACK_WIND (frame, dht_revalidate_cbk,
- subvol, subvol->fops->lookup,
- &local->loc, local->xattr_req);
-
- if (!--call_cnt)
- break;
- }
- } else {
- do_fresh_lookup:
- /* TODO: remove the hard-coding */
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
-
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
-
- if (!hashed_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s, "
- "checking on all the subvols to see if "
- "it is a directory", loc->path);
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
-
- local->layout = dht_layout_new (this,
- conf->subvolume_cnt);
- if (!local->layout) {
- op_errno = ENOMEM;
- goto err;
- }
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
- }
- return 0;
- }
+ dict_unref(dict_req);
+
+ } else {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "attempting deletion of stale linkfile "
+ "%s on %s (hashed subvol is %s)",
+ loc->path, prev->name,
+ (local->hashed_subvol ? local->hashed_subvol->name
+ : "<null>"));
+                /*
+                 * These stale files may have been created by the root
+                 * user, hence the deletion works only when performed
+                 * as root.
+                 */
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND(frame, dht_lookup_unlink_cbk, prev,
+ prev->fops->unlink, loc, 0, dict_req);
+
+ dict_unref(dict_req);
- STACK_WIND (frame, dht_lookup_cbk,
- hashed_subvol, hashed_subvol->fops->lookup,
- loc, local->xattr_req);
+ return 0;
+ }
}
+ }
- return 0;
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_lookup_everywhere_done(frame, this);
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
+out:
+ return ret;
}
-
int
-dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
+dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int i = 0;
+ int call_cnt = 0;
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+ GF_VALIDATE_OR_GOTO("dht", loc, out);
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- local->op_ret = -1;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+ conf = this->private;
+ local = frame->local;
- dht_iatt_merge (this, &local->prebuf, prebuf, prev->this);
- dht_iatt_merge (this, &local->stbuf, postbuf, prev->this);
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
- if (local->inode) {
- local->stbuf.ia_ino = local->inode->ino;
- local->prebuf.ia_ino = local->inode->ino;
- }
+ if (!local->inode)
+ local->inode = inode_ref(loc->inode);
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
+ gf_msg_debug(this->name, 0, "winding lookup call to %d subvols", call_cnt);
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_lookup_everywhere_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, loc,
+ local->xattr_req);
+ }
+
+ return 0;
out:
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (truncate, frame, local->op_ret, local->op_errno,
- &local->prebuf, &local->stbuf);
+ DHT_STACK_UNWIND(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL);
err:
- return 0;
+ return -1;
}
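/* dht_lookup_everywhere above winds one lookup per subvolume and relies on
 * dht_frame_return()/is_last_call() so that only the final reply runs
 * dht_lookup_everywhere_done(). A self-contained sketch of that counting
 * pattern with plain pthreads; all names here are illustrative and unrelated
 * to the glusterfs call-frame machinery:
 */
#include <pthread.h>

typedef struct {
    pthread_mutex_t lock;
    int outstanding; /* initialised to the number of replies expected */
} reply_counter_t;

/* Decrement the outstanding-reply count and report how many are left;
 * a return value of 0 plays the role of is_last_call(). */
static int
reply_counter_return(reply_counter_t *rc)
{
    int left = 0;

    pthread_mutex_lock(&rc->lock);
    left = --rc->outstanding;
    pthread_mutex_unlock(&rc->lock);

    return left;
}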
-
-
int
-dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *stbuf)
+dht_lookup_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ xlator_t *prev = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ loc_t *loc = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, unwind);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, unwind);
+ GF_VALIDATE_OR_GOTO("dht", this->private, unwind);
+ GF_VALIDATE_OR_GOTO("dht", cookie, unwind);
+
+ prev = cookie;
+ subvol = prev;
+ conf = this->private;
+ local = frame->local;
+ loc = &local->loc;
+
+ gf_uuid_unparse(loc->gfid, gfid);
+
+ if (op_ret == -1) {
+ gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ "Lookup of %s on %s (following linkfile) failed "
+ ",gfid = %s",
+ local->loc.path, subvol->name, gfid);
+
+ /* If cached subvol returned ENOTCONN, do not do
+ lookup_everywhere. We need to make sure linkfile does not get
+ removed, which can take away the namespace, and subvol is
+ anyways down. */
+
+ local->cached_subvol = NULL;
+ if (op_errno != ENOTCONN)
+ goto err;
+ else
+ goto unwind;
+ }
+
+ if (check_is_dir(inode, stbuf, xattr)) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ "Lookup of %s on %s (following linkfile) reached dir,"
+ " gfid = %s",
+ local->loc.path, subvol->name, gfid);
+ goto err;
+ }
+
+ if (check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ "lookup of %s on %s (following linkfile) reached link,"
+ "gfid = %s",
+ local->loc.path, subvol->name, gfid);
+ goto err;
+ }
+
+ if (gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid different on data file on %s,"
+ " gfid local = %s, gfid node = %s ",
+ local->loc.path, subvol->name, gfid, uuid_utoa(stbuf->ia_gfid));
+ goto err;
+ }
+
+ if ((stbuf->ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) {
+ stbuf->ia_prot.sticky = 1;
+ }
+
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+ "Failed to set layout for subvolume %s,"
+ "gfid = %s",
+ prev->name, gfid);
+ op_ret = -1;
+ op_errno = EINVAL;
+ }
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+unwind:
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(postparent);
+ DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+ postparent);
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+ return 0;
+
+err:
+ dht_lookup_everywhere(frame, this, loc);
+out:
+ return 0;
+}
+
+/* Code to get the hashed subvol based on inode and loc.
+   First it checks if loc->parent and loc->path exist, then it gets the
+   hashed subvol based on loc.
+*/
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
+static gf_boolean_t
+dht_should_lookup_everywhere(xlator_t *this, dht_conf_t *conf, loc_t *loc)
+{
+ dht_layout_t *parent_layout = NULL;
+ int ret = 0;
+ gf_boolean_t lookup_everywhere = _gf_true;
+
+ /* lookup-optimize supersedes lookup-unhashed settings.
+ * If it is set, do not process search_unhashed
+     * If lookup-optimize is enabled, lookup everywhere if:
+ * - this is the rebalance daemon.
+ * - loc->parent is unavailable.
+ * - parent_layout is unavailable
+ * - parent_layout->commit_hash != conf->vol_commit_hash
+ */
+
+ if (conf->lookup_optimize) {
+ if (!conf->defrag && loc->parent) {
+ ret = dht_inode_ctx_layout_get(loc->parent, this, &parent_layout);
+ if (!ret && parent_layout &&
+ (parent_layout->commit_hash == conf->vol_commit_hash)) {
+ lookup_everywhere = _gf_false;
+ }
+ }
+ goto out;
+ } else {
+ if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
+ if (loc->parent) {
+ ret = dht_inode_ctx_layout_get(loc->parent, this,
+ &parent_layout);
+ if (ret || !parent_layout ||
+ (!parent_layout->search_unhashed)) {
+ lookup_everywhere = _gf_false;
+ }
+ } else {
+ lookup_everywhere = _gf_false;
+ }
- if (local->inode)
- local->stbuf.ia_ino = local->inode->ino;
- local->op_ret = 0;
+ goto out;
}
-unlock:
- UNLOCK (&frame->lock);
+ }
out:
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno,
- &local->stbuf);
-err:
- return 0;
+ return lookup_everywhere;
}
-
int
-dht_stat (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
-
+ char is_linkfile = 0;
+ char is_dir = 0;
+ xlator_t *subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+ xlator_t *prev = NULL;
+ int ret = 0;
+ uint32_t vol_commit_hash = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+ conf = this->private;
+
+ prev = cookie;
+ local = frame->local;
+ loc = &local->loc;
+
+ gf_msg_debug(this->name, op_errno,
+ "%s: fresh_lookup on %s returned with op_ret %d", loc->path,
+ prev->name, op_ret);
+
+ if (op_ret == -1) {
+ if (ENTRY_MISSING(op_ret, op_errno)) {
+ if (1 == conf->subvolume_cnt) {
+ /* No need to lookup again */
+ goto out;
+ }
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
+ gf_msg_debug(this->name, 0, "Entry %s missing on subvol %s",
+ loc->path, prev->name);
+ if (dht_should_lookup_everywhere(this, conf, loc)) {
+ local->op_errno = ENOENT;
+ dht_lookup_everywhere(frame, this, loc);
+ return 0;
+ }
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
+ } else {
+            /* posix returns ENODATA if the gfid is not set, but the client
+             * and server protocol layers do not send the stbuf in that
+             * case. We need to heal this, so check whether this is a
+             * directory on the other subvols.
+             */
+ if ((op_errno == ENOTCONN) || (op_errno == ENODATA)) {
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
+ }
+ gf_msg_debug(this->name, op_errno, "%s: Lookup on subvolume %s failed",
+ loc->path, prev->name);
+ goto out;
+ }
+
+ /* Lookup succeeded - op_ret = 0 */
+
+ /* This is required for handling stale linkfile deletion,
+ * or any more call which happens from this 'loc'.
+ */
+ if (gf_uuid_is_null(local->gfid)) {
+ /* This is set from the first successful response */
+ memcpy(local->gfid, stbuf->ia_gfid, 16);
+ }
+
+ if (!conf->vch_forced) {
+ /* Update the commit hash in conf if it is found */
+ ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
}
+ }
- local->layout = layout = dht_layout_get (this, loc->inode);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no layout for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ is_dir = check_is_dir(inode, stbuf, xattr);
+ if (is_dir) {
+ /* A directory is present on all subvols, send the lookup to
+ * all subvols now */
+ local->inode = inode_ref(inode);
+ local->xattr = dict_ref(xattr);
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
- local->inode = inode_ref (loc->inode);
- local->call_cnt = layout->cnt;
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
- for (i = 0; i < layout->cnt; i++) {
- subvol = layout->list[i].xlator;
+ if (!is_linkfile) {
+ /* Non-directory and not a linkto file: this is a data file.
+ * Update the layout to point to the cached subvol.
+ */
- STACK_WIND (frame, dht_attr_cbk,
- subvol, subvol->fops->stat,
- loc);
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+ "%s: could not set pre-set layout for subvolume %s",
+ loc->path, prev->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
}
+ goto out;
+ }
+ /* This is a linkto file. Get the value of the target subvol from the
+ * linkto xattr and lookup there to see if the file exists
+ */
+ subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
+ if (!subvol) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "%s: No link subvol for linkto", loc->path);
+ dht_lookup_everywhere(frame, this, loc);
return 0;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL);
+ gf_msg_debug(this->name, 0, "%s: Calling lookup on linkto target %s",
+ loc->path, subvol->name);
- return 0;
-}
+ STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, local->xattr_req);
+ return 0;
-int
-dht_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd)
+out:
+ /*
+ * FIXME: postparent->ia_size and postparent->ia_blocks do not have
+ * correct values. Since postparent corresponds to a directory, these
+ * two members should equal the sum of the corresponding values from
+ * each of the subvolumes. See dht_iatt_merge for reference.
+ */
+
+ if (!op_ret && local && local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(postparent);
+ DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+ postparent);
+err:
+ return 0;
+}
+
+/* For directories, check whether the acl xattrs have been requested (by the
+ * acl xlator); if not, request them. These xattrs are needed for dht
+ * directory self-heal to work properly.
+ */
+static void
+dht_check_and_set_acl_xattr_req(xlator_t *this, dict_t *xattr_req)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
+ int ret = 0;
+ GF_ASSERT(xattr_req);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ if (!dict_get(xattr_req, POSIX_ACL_ACCESS_XATTR)) {
+ ret = dict_set_int8(xattr_req, POSIX_ACL_ACCESS_XATTR, 0);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s",
+ POSIX_ACL_ACCESS_XATTR);
+ }
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ if (!dict_get(xattr_req, POSIX_ACL_DEFAULT_XATTR)) {
+ ret = dict_set_int8(xattr_req, POSIX_ACL_DEFAULT_XATTR, 0);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s",
+ POSIX_ACL_DEFAULT_XATTR);
+ }
- local->layout = layout = dht_layout_get (this, fd->inode);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no layout for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ return;
+}
- local->inode = inode_ref (fd->inode);
- local->call_cnt = layout->cnt;;
+/* for directories, we need the following info:
+ * the layout : trusted.glusterfs.dht
+ * the mds information : trusted.glusterfs.dht.mds
+ * the acl info: See above
+ */
+static int
+dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ int ret = -EINVAL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf) {
+ goto err;
+ }
+
+ if (!xattr_req) {
+ goto err;
+ }
+
+ /* Xattr to get the layout for a directory
+ */
+ ret = dict_set_uint32(xattr_req, conf->xattr_name, 4 * 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->xattr_name, loc->path);
+ goto err;
+ }
+
+ /* Non-fatal failure */
+ ret = dict_set_uint32(xattr_req, conf->mds_xattr_key, 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->mds_xattr_key, loc->path);
+ }
+
+ dht_check_and_set_acl_xattr_req(this, xattr_req);
+ ret = 0;
+err:
+ return ret;
+}
- for (i = 0; i < layout->cnt; i++) {
- subvol = layout->list[i].xlator;
- STACK_WIND (frame, dht_attr_cbk,
- subvol, subvol->fops->fstat,
- fd);
+/* If the hashed subvol is present, send the lookup to only that subvol first.
+ * If no hashed subvol, send a lookup to all subvols and proceed based on the
+ * responses.
+ */
+static int
+dht_do_fresh_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ xlator_t *hashed_subvol = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int call_cnt = 0;
+ int i = 0;
+
+ conf = this->private;
+ if (!conf) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = frame->local;
+ if (!local) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /* Since we don't know whether this is a file or a directory,
+ * request all xattrs */
+ ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ /* Fuse sets a random value in gfid-req. If the gfid is missing
+ * on one or more subvols, posix will set the gfid to this value,
+ * causing GFID mismatches for directories. Remove the value fuse
+ * has sent before sending the lookup.
+ */
+ ret = dict_get_gfuuid(local->xattr_req, "gfid-req", &local->gfid_req);
+ if (ret) {
+ gf_msg_debug(this->name, 0, "%s: No gfid-req available", loc->path);
+ } else {
+ dict_del(local->xattr_req, "gfid-req");
+ }
+ /* This should have been set in dht_lookup */
+ hashed_subvol = local->hashed_subvol;
+
+ if (!hashed_subvol) {
+ gf_msg_debug(this->name, 0,
+ "%s: no subvolume in layout for path, "
+ "checking on all the subvols to see if "
+ "it is a directory",
+ loc->path);
+
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ /* Allocate a layout. This will be populated and saved in
+ * the dht inode_ctx on successful lookup
+ */
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_errno = ENOMEM;
+ goto err;
}
+ gf_msg_debug(this->name, 0,
+ "%s: Found null hashed subvol. Calling lookup"
+ " on all nodes.",
+ loc->path);
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
+ }
return 0;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL);
+ /* if the hashed_subvol is non-null, send the lookup there first so
+ * as to see whether we have a file or a directory */
+ gf_msg_debug(this->name, 0, "%s: Calling fresh lookup on %s", loc->path,
+ hashed_subvol->name);
- return 0;
+ STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->lookup, loc, local->xattr_req);
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
}
-
-int
-dht_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset)
+static int
+dht_do_revalidate(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
+ xlator_t *subvol = NULL;
+ xlator_t *mds_subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int call_cnt = 0;
+ int gen = 0;
+
+ conf = this->private;
+ if (!conf) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = frame->local;
+ if (!local) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0,
+ "path = %s. No layout found in the inode ctx.", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /* Generation number has changed. This layout may be stale. */
+ if (layout->gen && (layout->gen < conf->gen)) {
+ gen = layout->gen;
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
+ local->cached_subvol = NULL;
+
+ gf_msg_debug(this->name, 0,
+ "path = %s. In memory layout may be stale."
+ "(layout->gen (%d) is less than "
+ "conf->gen (%d)). Calling fresh lookup.",
+ loc->path, gen, conf->gen);
+
+ dht_do_fresh_lookup(frame, this, loc);
+ return 0;
+ }
+
+ local->inode = inode_ref(loc->inode);
+
+ /* Since we don't know whether this has changed,
+ * request all xattrs */
+ ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ if (IA_ISDIR(local->inode->ia_type)) {
+ ret = dht_inode_ctx_mdsvol_get(local->inode, this, &mds_subvol);
+ if (ret || !mds_subvol) {
+ gf_msg_debug(this->name, 0, "path = %s. No mds subvol in inode ctx",
+ local->loc.path);
+ }
+ local->mds_subvol = mds_subvol;
+ local->call_cnt = conf->subvolume_cnt;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
+ /* local->call_cnt will change as responses are processed. Always use a
+ * local copy to loop through the STACK_WIND calls
+ */
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ call_cnt = local->call_cnt;
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_revalidate_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, loc,
+ local->xattr_req);
}
+ return 0;
+ }
- local->inode = inode_ref (loc->inode);
- local->call_cnt = 1;
+ /* If not a dir, this should be 1 */
+ local->call_cnt = layout->cnt;
+ call_cnt = local->call_cnt;
- STACK_WIND (frame, dht_truncate_cbk,
- subvol, subvol->fops->truncate,
- loc, offset);
+ for (i = 0; i < call_cnt; i++) {
+ subvol = layout->list[i].xlator;
- return 0;
+ gf_msg_debug(this->name, 0,
+ "path = %s. Calling "
+ "revalidate lookup on %s",
+ loc->path, subvol->name);
+ STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, local->xattr_req);
+ }
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL);
-
- return 0;
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
}
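
The staleness test near the top of dht_do_revalidate() is a plain generation-number comparison: conf->gen is a volume-wide counter that dht advances elsewhere (for example on subvolume events), so a cached layout stamped with an older generation may no longer describe the current set of subvolumes. A minimal sketch of the same idea, with illustrative names:

#include <stdbool.h>

/* Illustrative only: a cached layout is treated as possibly stale when
 * its generation stamp predates the current volume generation; a stamp
 * of 0 means the layout was never stamped and is not checked. */
static bool
toy_layout_is_stale(unsigned layout_gen, unsigned volume_gen)
{
    return layout_gen != 0 && layout_gen < volume_gen;
}
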
+/* Depending on the input, decide if this is a:
+ * fresh-lookup: loc->name is provided but no dht inode ctx
+ * revalidation: loc->name is provided, dht inode ctx is present
+ * discover: gfid based nameless lookup.
+ */
int
-dht_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset)
+dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ xlator_t *hashed_subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ loc_t new_loc = {
+ 0,
+ };
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ conf = this->private;
+ if (!conf)
+ goto err;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ ret = dht_filter_loc_subvol_key(this, loc, &new_loc, &hashed_subvol);
+ if (ret) {
+ loc_wipe(&local->loc);
+ ret = loc_dup(&new_loc, &local->loc);
+
+ /* we no longer need 'new_loc' entries */
+ loc_wipe(&new_loc);
+
+ /* check if loc_dup() is successful */
+ if (ret == -1) {
+ op_errno = errno;
+ gf_msg_debug(this->name, errno,
+ "copying location failed for path=%s", loc->path);
+ goto err;
+ }
+ }
+
+ if (xattr_req) {
+ local->xattr_req = dict_ref(xattr_req);
+ } else {
+ local->xattr_req = dict_new();
+ }
+
+ /* Nameless lookup */
+
+ /* This is usually sent by NFS. Lookups are done based on the gfid and
+ * no name information is available. Without the name, dht cannot calculate
+ * the hash and has to send a lookup to all subvols.
+ */
+ if (gf_uuid_is_null(loc->pargfid) && !gf_uuid_is_null(loc->gfid) &&
+ !__is_root_gfid(loc->inode->gfid)) {
+ local->cached_subvol = NULL;
+ dht_do_discover(frame, this, loc);
+ return 0;
+ }
+
+ if (loc_is_root(loc)) {
+ /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash)
+ * set on the brick root.
+ */
+ ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ if (!hashed_subvol)
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ local->hashed_subvol = hashed_subvol;
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ if (is_revalidate(loc)) {
+ /* The entry has been looked up before and has a dht inode_ctx
+ */
+ dht_do_revalidate(frame, this, loc);
+ return 0;
+ } else {
+ /* Entry has not been looked up before
+ */
+ dht_do_fresh_lookup(frame, this, loc);
+ return 0;
+ }
- local->inode = inode_ref (fd->inode);
- local->call_cnt = 1;
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
+}
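
The routing described in the comment above dht_lookup() can be condensed into a small decision helper. This is an illustrative sketch, not code from the patch; the boolean parameters stand in for the gfid/pargfid checks and the is_revalidate() test performed above.

#include <stdbool.h>

typedef enum {
    TOY_LOOKUP_DISCOVER,   /* nameless, gfid-only lookup (e.g. from NFS) */
    TOY_LOOKUP_REVALIDATE, /* inode already has a dht inode ctx */
    TOY_LOOKUP_FRESH,      /* entry has not been looked up before */
} toy_lookup_kind_t;

/* Mirrors the order of the checks in dht_lookup(). */
static toy_lookup_kind_t
toy_classify_lookup(bool pargfid_is_null, bool gfid_is_set, bool is_root,
                    bool has_dht_inode_ctx)
{
    if (pargfid_is_null && gfid_is_set && !is_root)
        return TOY_LOOKUP_DISCOVER;   /* dht_do_discover() */
    if (has_dht_inode_ctx)
        return TOY_LOOKUP_REVALIDATE; /* dht_do_revalidate() */
    return TOY_LOOKUP_FRESH;          /* dht_do_fresh_lookup() */
}
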
- STACK_WIND (frame, dht_truncate_cbk,
- subvol, subvol->fops->ftruncate,
- fd, offset);
+static int
+dht_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
- return 0;
+ local = frame->local;
+ prev = cookie;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL);
+ LOCK(&frame->lock);
+ {
+ if ((op_ret == -1) &&
+ !((op_errno == ENOENT) || (op_errno == ENOTCONN))) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno,
+ "Unlink link: subvolume %s returned -1", prev->name);
+ goto post_unlock;
+ }
- return 0;
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, xdata);
+
+ return 0;
}
-
-int
-dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent)
+static int
+dht_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *hashed_subvol = NULL;
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_ret = -1;
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+ local = frame->local;
+ prev = cookie;
- preparent->ia_ino = local->loc.parent->ino;
- postparent->ia_ino = local->loc.parent->ino;
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno != ENOENT) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ } else {
local->op_ret = 0;
-
- local->postparent = *postparent;
- local->preparent = *preparent;
-
- WIPE (&local->postparent);
- WIPE (&local->preparent);
+ }
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno,
+ "Unlink: subvolume %s returned -1", prev->name);
+ goto post_unlock;
}
-unlock:
- UNLOCK (&frame->lock);
- DHT_STACK_UNWIND (unlink, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent);
+ local->op_ret = 0;
- return 0;
-}
+ local->postparent = *postparent;
+ local->preparent = *preparent;
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->preparent, 0);
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!local->op_ret) {
+ hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+ if (hashed_subvol && hashed_subvol != local->cached_subvol) {
+ /*
+ * If hashed and cached are different, then we need
+ * to unlink the linkfile from the hashed subvol once
+ * the data file has been deleted successfully.
+ */
+ STACK_WIND_COOKIE(frame, dht_unlink_linkfile_cbk, hashed_subvol,
+ hashed_subvol, hashed_subvol->fops->unlink,
+ &local->loc, local->flags, xdata);
+ return 0;
+ }
+ }
+
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, xdata);
+
+ return 0;
+}
-int
-dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent)
+static int
+dht_common_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
+ DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
- xlator_t *cached_subvol = NULL;
+static int
+dht_fix_layout_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ if (op_ret == 0) {
+ /* update the layout in the inode ctx */
local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+ layout = local->selfheal.layout;
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
+ dht_layout_set(this, local->loc.inode, layout);
+ }
- if (op_ret == -1)
- goto err;
+ DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
- cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
- if (!cached_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s",
- local->loc.path);
- local->op_errno = EINVAL;
- goto err;
- }
+static int
+dht_err_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
- STACK_WIND (frame, dht_unlink_cbk,
- cached_subvol, cached_subvol->fops->unlink,
- &local->loc);
+ local = frame->local;
+ prev = cookie;
- return 0;
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
+ }
-err:
- DHT_STACK_UNWIND (unlink, frame, -1, local->op_errno,
- NULL, NULL);
- return 0;
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ if ((local->fop == GF_FOP_SETXATTR) ||
+ (local->fop == GF_FOP_FSETXATTR)) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
+ if ((local->fop == GF_FOP_REMOVEXATTR) ||
+ (local->fop == GF_FOP_FREMOVEXATTR)) {
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ }
+ }
+
+out:
+ return 0;
}
+/* Set the value[] of key into dict after converting it from
+ host byte order to network byte order.
+*/
+int32_t
+dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size)
+{
+ int ret = -1;
+ int32_t *ptr = NULL;
+ int32_t vindex;
+
+ if (value == NULL) {
+ return -EINVAL;
+ }
+
+ ptr = GF_MALLOC(sizeof(int32_t) * size, gf_common_mt_char);
+ if (ptr == NULL) {
+ return -ENOMEM;
+ }
+ for (vindex = 0; vindex < size; vindex++) {
+ ptr[vindex] = hton32(value[vindex]);
+ }
+ ret = dict_set_bin(dict, key, ptr, sizeof(int32_t) * size);
+ if (ret)
+ GF_FREE(ptr);
+ return ret;
+}
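
To make the byte-order conversion in dht_dict_set_array() concrete, here is a minimal standalone sketch of the same idea using the standard htonl() routine in place of gluster's hton32 wrapper and the dict API; the array contents are just example values.

#include <arpa/inet.h> /* htonl */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Convert an int32 array to network byte order, as dht_dict_set_array
 * does before storing it as a binary dict value. */
static int32_t *
to_network_order(const int32_t *value, size_t n)
{
    int32_t *out = malloc(n * sizeof(*out));
    size_t i;

    if (out == NULL)
        return NULL;
    for (i = 0; i < n; i++)
        out[i] = (int32_t)htonl((uint32_t)value[i]);
    return out;
}

int
main(void)
{
    int32_t addone[1] = {1}; /* same "add 1" payload later used for xattrop */
    int32_t *wire = to_network_order(addone, 1);

    if (wire == NULL)
        return 1;
    printf("host: %d, network: 0x%08x\n", addone[0], (uint32_t)wire[0]);
    free(wire);
    return 0;
}
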
-int
-dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, struct iatt *prebuf, struct iatt *postbuf)
+static int
+dht_common_mds_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ dht_local_t *local = NULL;
+ call_frame_t *prev = cookie;
+ local = frame->local;
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+ if (op_ret)
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
-
- if (local && (op_ret == 0)) {
- prebuf->ia_ino = local->ia_ino;
- postbuf->ia_ino = local->ia_ino;
- }
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, 0, op_errno, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno,
- prebuf, postbuf);
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, 0, op_errno, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
- return 0;
-}
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, 0, op_errno, NULL);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, 0, op_errno, NULL);
+ }
+out:
+ return 0;
+}
-int
-dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno)
+/* Winds an xattrop call to add 1 to the current mds internal xattr
+ value.
+*/
+static int
+dht_setxattr_non_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int ret = 0;
+ dict_t *xattrop = NULL;
+ int32_t addone[1] = {1};
+ call_frame_t *prev = NULL;
+ dht_conf_t *conf = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret && !local->op_ret) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
+ goto post_unlock;
+ }
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt)) {
+ if (!local->op_ret) {
+ xattrop = dict_new();
+ if (!xattrop) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "dictionary creation failed");
+ ret = -1;
+ goto out;
+ }
+ ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, addone, 1);
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "dictionary set array failed ");
+ ret = -1;
+ goto out;
+ }
+ if ((local->fop == GF_FOP_SETXATTR) ||
+ (local->fop == GF_FOP_REMOVEXATTR)) {
+ STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol,
+ local->mds_subvol->fops->xattrop, &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+ } else {
+ STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fxattrop, local->fd,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+ }
+ } else {
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+ }
+ }
+out:
+ if (ret) {
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
- local->op_ret = 0;
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
-unlock:
- UNLOCK (&frame->lock);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (setxattr, frame, local->op_ret, local->op_errno);
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL);
}
+ }
+just_return:
+ if (xattrop)
+ dict_unref(xattrop);
+ return 0;
+}
- return 0;
+static int
+dht_setxattr_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *mds_subvol = NULL;
+ int i = 0;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+ mds_subvol = local->mds_subvol;
+
+ if (op_ret == -1) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
+ goto out;
+ }
+
+ local->op_ret = 0;
+ local->call_cnt = conf->subvolume_cnt - 1;
+ local->xdata = dict_ref(xdata);
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (mds_subvol && (mds_subvol == conf->subvolumes[i]))
+ continue;
+ if (local->fop == GF_FOP_SETXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->setxattr, &local->loc,
+ local->xattr, local->flags, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->fsetxattr, local->fd,
+ local->xattr, local->flags, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->removexattr, &local->loc,
+ local->key, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->fremovexattr, local->fd,
+ local->key, local->xattr_req);
+ }
+ }
+
+ return 0;
+out:
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ }
+
+just_return:
+ return 0;
}
+static int
+dht_xattrop_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ local->op_ret = op_ret;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
+ goto out;
+ }
+
+ if (local->fop == GF_FOP_SETXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->setxattr, &local->loc, local->xattr,
+ local->flags, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fsetxattr, local->fd, local->xattr,
+ local->flags, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->removexattr, &local->loc,
+ local->key, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fremovexattr, local->fd, local->key,
+ local->xattr_req);
+ }
+
+ return 0;
+out:
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ }
+
+just_return:
+ return 0;
+}
-int
-dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno)
+static void
+fill_layout_info(dht_layout_t *layout, char *buf)
{
- DHT_STACK_UNWIND (access, frame, op_ret, op_errno);
- return 0;
+ int i = 0;
+ char tmp_buf[128] = {
+ 0,
+ };
+
+ for (i = 0; i < layout->cnt; i++) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "(%s %u %u)",
+ layout->list[i].xlator->name, layout->list[i].start,
+ layout->list[i].stop);
+ if (i)
+ strcat(buf, " ");
+ strcat(buf, tmp_buf);
+ }
}
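
For reference, fill_layout_info() renders the layout as a space-separated list of "(subvol start stop)" tuples. A hypothetical two-subvol layout splitting the 32-bit hash range would come out roughly as follows (the subvolume names and range boundaries are made up for illustration):

(testvol-client-0 0 2147483646) (testvol-client-1 2147483647 4294967295)
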
+static void
+dht_fill_pathinfo_xattr(xlator_t *this, dht_local_t *local, char *xattr_buf,
+ int32_t alloc_len, int flag, char *layout_buf)
+{
+ if (flag) {
+ if (local->xattr_val) {
+ snprintf(xattr_buf, alloc_len,
+ "((<" DHT_PATHINFO_HEADER "%s> %s) (%s-layout %s))",
+ this->name, local->xattr_val, this->name, layout_buf);
+ } else {
+ snprintf(xattr_buf, alloc_len, "(%s-layout %s)", this->name,
+ layout_buf);
+ }
+ } else if (local->xattr_val) {
+ snprintf(xattr_buf, alloc_len, "(<" DHT_PATHINFO_HEADER "%s> %s)",
+ this->name, local->xattr_val);
+ } else {
+ xattr_buf[0] = '\0';
+ }
+}
-int
-dht_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask)
+static int
+dht_vgetxattr_alloc_and_fill(dht_local_t *local, dict_t *xattr, xlator_t *this,
+ int op_errno)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ int ret = -1;
+ char *value = NULL;
+
+ ret = dict_get_str(xattr, local->xsel, &value);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED,
+ "Subvolume %s returned -1", this->name);
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto out;
+ }
+
+ local->alloc_len += strlen(value);
+
+ if (!local->xattr_val) {
+ local->alloc_len += (SLEN(DHT_PATHINFO_HEADER) + 10);
+ local->xattr_val = GF_MALLOC(local->alloc_len, gf_common_mt_char);
+ if (!local->xattr_val) {
+ ret = -1;
+ goto out;
+ }
+ local->xattr_val[0] = '\0';
+ }
+
+ int plen = strlen(local->xattr_val);
+ if (plen) {
+ /* extra byte(s) for \0 to be safe */
+ local->alloc_len += (plen + 2);
+ local->xattr_val = GF_REALLOC(local->xattr_val, local->alloc_len);
+ if (!local->xattr_val) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ (void)strcat(local->xattr_val, value);
+ (void)strcat(local->xattr_val, " ");
+ local->op_ret = 0;
+
+ ret = 0;
+out:
+ return ret;
+}
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
+static int
+dht_vgetxattr_fill_and_set(dht_local_t *local, dict_t **dict, xlator_t *this,
+ gf_boolean_t flag)
+{
+ int ret = -1;
+ char *xattr_buf = NULL;
+ char layout_buf[8192] = {
+ 0,
+ };
+
+ if (flag)
+ fill_layout_info(local->layout, layout_buf);
+
+ *dict = dict_new();
+ if (!*dict)
+ goto out;
+
+ local->xattr_val[strlen(local->xattr_val) - 1] = '\0';
+
+ /* we would need at most this many bytes to create the xattr string.
+ * The extra 40 bytes are an estimate of the additional space
+ * required for the translator name and the spaces, brackets etc.
+ * used when forming the pathinfo string.
+ *
+ * For node-uuid we don't have all the pretty formatting, but since
+ * this is a generic routine for pathinfo & node-uuid we avoid
+ * conditional space allocation and keep it generic.
+ */
+ local->alloc_len += (2 * strlen(this->name)) + strlen(layout_buf) + 40;
+ xattr_buf = GF_MALLOC(local->alloc_len, gf_common_mt_char);
+ if (!xattr_buf)
+ goto out;
+
+ if (XATTR_IS_PATHINFO(local->xsel)) {
+ (void)dht_fill_pathinfo_xattr(this, local, xattr_buf, local->alloc_len,
+ flag, layout_buf);
+ } else if ((XATTR_IS_NODE_UUID(local->xsel)) ||
+ (XATTR_IS_NODE_UUID_LIST(local->xsel))) {
+ (void)snprintf(xattr_buf, local->alloc_len, "%s", local->xattr_val);
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GET_XATTR_FAILED,
+ "Unknown local->xsel (%s)", local->xsel);
+ GF_FREE(xattr_buf);
+ goto out;
+ }
+
+ ret = dict_set_dynstr(*dict, local->xsel, xattr_buf);
+ if (ret)
+ GF_FREE(xattr_buf);
+ GF_FREE(local->xattr_val);
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+out:
+ return ret;
+}
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
+static int
+dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *prev = NULL;
+ int this_call_cnt = 0;
+ int ret = 0;
+ char *uuid_str = NULL;
+ char *uuid_list = NULL;
+ char *next_uuid_str = NULL;
+ char *saveptr = NULL;
+ uuid_t node_uuid = {
+ 0,
+ };
+ char *uuid_list_copy = NULL;
+ int count = 0;
+ int i = 0;
+ int index = 0;
+ int found = 0;
+ nodeuuid_info_t *tmp_ptr = NULL;
+
+ VALIDATE_OR_GOTO(frame, out);
+ VALIDATE_OR_GOTO(frame->local, out);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ VALIDATE_OR_GOTO(conf->defrag, out);
+
+ gf_msg_debug(this->name, 0, "subvol %s returned", prev->name);
+
+ LOCK(&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ if (op_errno == ENODATA)
+ gf_msg_debug(this->name, 0, "failed to get node-uuid");
+ else
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "failed to get node-uuid");
+ goto post_unlock;
+ }
+
+ ret = dict_get_str(xattr, local->xsel, &uuid_list);
+
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_GET_FAILED,
+ "Failed to get %s", local->xsel);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ goto unlock;
}
- local->call_cnt = 1;
+ /* As DHT does not know the details of its child xlators,
+ * we need to parse this twice: first to get the count, then
+ * to allocate memory and fill it in.
+ */
+ count = 0;
+ index = conf->local_subvols_cnt;
- STACK_WIND (frame, dht_access_cbk,
- subvol, subvol->fops->access,
- loc, mask);
+ uuid_list_copy = gf_strdup(uuid_list);
+ if (!uuid_list_copy)
+ goto unlock;
- return 0;
+ for (uuid_str = strtok_r(uuid_list, " ", &saveptr); uuid_str;
+ uuid_str = next_uuid_str) {
+ next_uuid_str = strtok_r(NULL, " ", &saveptr);
+ if (gf_uuid_parse(uuid_str, node_uuid)) {
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UUID_PARSE_ERROR,
+ "Failed to parse uuid for %s", prev->name);
+ goto post_unlock;
+ }
+
+ count++;
+ if (gf_uuid_compare(node_uuid, conf->defrag->node_uuid)) {
+ gf_msg_debug(this->name, 0,
+ "subvol %s does not"
+ "belong to this node",
+ prev->name);
+ } else {
+ /* handle multiple bricks of the same replica
+ * on the same node */
+ if (found)
+ continue;
+ conf->local_subvols[(conf->local_subvols_cnt)++] = prev;
+ found = 1;
+ gf_msg_debug(this->name, 0,
+ "subvol %s belongs to"
+ " this node",
+ prev->name);
+ }
+ }
+
+ if (!found) {
+ local->op_ret = 0;
+ goto unlock;
+ }
+
+ conf->local_nodeuuids[index].count = count;
+ conf->local_nodeuuids[index].elements = GF_CALLOC(
+ count, sizeof(nodeuuid_info_t), 1);
+
+ /* The node-uuids are guaranteed to be returned in the same
+ * order as the bricks
+ * A null node-uuid is returned for a brick that is down.
+ */
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (access, frame, -1, op_errno);
+ saveptr = NULL;
+ i = 0;
- return 0;
-}
+ for (uuid_str = strtok_r(uuid_list_copy, " ", &saveptr); uuid_str;
+ uuid_str = next_uuid_str) {
+ next_uuid_str = strtok_r(NULL, " ", &saveptr);
+ tmp_ptr = &(conf->local_nodeuuids[index].elements[i]);
+ gf_uuid_parse(uuid_str, tmp_ptr->uuid);
+ if (!gf_uuid_compare(tmp_ptr->uuid, conf->defrag->node_uuid)) {
+ tmp_ptr->info = REBAL_NODEUUID_MINE;
+ }
+ i++;
+ tmp_ptr = NULL;
+ }
+ }
-int
-dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, const char *path, struct iatt *sbuf)
-{
- dht_local_t *local = NULL;
+ local->op_ret = 0;
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!is_last_call(this_call_cnt))
+ goto out;
- local = frame->local;
- if (op_ret == -1)
- goto err;
+ if (local->op_ret == -1) {
+ goto unwind;
+ }
- if (local) {
- sbuf->ia_ino = local->ia_ino;
- } else {
- op_ret = -1;
- op_errno = EINVAL;
- }
+ DHT_STACK_UNWIND(getxattr, frame, 0, 0, xattr, xdata);
+ goto out;
-err:
- DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, sbuf);
+unwind:
- return 0;
-}
+ GF_FREE(conf->local_nodeuuids[index].elements);
+ conf->local_nodeuuids[index].elements = NULL;
+ DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, xdata);
+out:
+ GF_FREE(uuid_list_copy);
+ return 0;
+}
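
The two-pass parsing noted in the comment above (first pass to count the node-uuids, second pass over a copy to fill the allocated array) is a standard strtok_r pattern. A minimal standalone sketch, independent of the gluster types and assuming a space-separated list:

#include <stdlib.h>
#include <string.h>

/* Count space-separated tokens in one pass, then allocate and copy them
 * in a second pass. strtok_r modifies its input, so each pass works on
 * its own copy of the list. */
static char **
split_tokens(const char *list, size_t *count_out)
{
    char *copy1 = strdup(list);
    char *copy2 = strdup(list);
    char **out = NULL;
    char *saveptr = NULL;
    char *tok = NULL;
    size_t count = 0;
    size_t i = 0;

    *count_out = 0;
    if (copy1 == NULL || copy2 == NULL)
        goto done;

    for (tok = strtok_r(copy1, " ", &saveptr); tok;
         tok = strtok_r(NULL, " ", &saveptr))
        count++;

    out = calloc(count, sizeof(*out));
    if (out == NULL)
        goto done;

    saveptr = NULL;
    for (tok = strtok_r(copy2, " ", &saveptr); tok;
         tok = strtok_r(NULL, " ", &saveptr))
        out[i++] = strdup(tok);

    *count_out = count;
done:
    free(copy1);
    free(copy2);
    return out;
}
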
-int
-dht_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size)
+static int
+dht_vgetxattr_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ int ret = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ dict_t *dict = NULL;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
+ VALIDATE_OR_GOTO(frame, out);
+ VALIDATE_OR_GOTO(frame->local, out);
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
+ if (op_ret < 0) {
+ if (op_errno != ENOTCONN) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "getxattr err for dir");
+ goto post_unlock;
+ }
+
+ goto unlock;
}
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
+ ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno);
+ if (ret) {
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_DICT_SET_FAILED,
+ "alloc or fill failure");
+ goto post_unlock;
}
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!is_last_call(this_call_cnt))
+ goto out;
- local->ia_ino = loc->inode->ino;
+ /* -- last call: do patch ups -- */
- STACK_WIND (frame, dht_readlink_cbk,
- subvol, subvol->fops->readlink,
- loc, size);
+ if (local->op_ret == -1) {
+ goto unwind;
+ }
- return 0;
+ ret = dht_vgetxattr_fill_and_set(local, &dict, this, _gf_true);
+ if (ret)
+ goto unwind;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL);
+ DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata);
+ goto cleanup;
- return 0;
+unwind:
+ DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL);
+cleanup:
+ if (dict)
+ dict_unref(dict);
+out:
+ return 0;
}
-
-int
-dht_fix_layout_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+static int
+dht_vgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata)
{
- DHT_STACK_UNWIND (getxattr, frame, -1, ENODATA, NULL);
+ dht_local_t *local = NULL;
+ int ret = 0;
+ dict_t *dict = NULL;
+ xlator_t *prev = NULL;
+ gf_boolean_t flag = _gf_true;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED,
+ "vgetxattr: Subvolume %s returned -1", prev->name);
+ goto unwind;
+ }
+
+ ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "Allocation or fill failure");
+ goto unwind;
+ }
+
+ flag = (local->layout->cnt > 1) ? _gf_true : _gf_false;
+
+ ret = dht_vgetxattr_fill_and_set(local, &dict, this, flag);
+ if (ret)
+ goto unwind;
+
+ DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata);
+ goto cleanup;
- return 0;
+unwind:
+ DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL);
+cleanup:
+ if (dict)
+ dict_unref(dict);
+
+ return 0;
}
-static void
-fill_layout_info (dht_layout_t *layout, char *buf)
+static int
+dht_linkinfo_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr,
+ dict_t *xdata)
{
- int i = 0;
- char tmp_buf[128] = {0,};
+ int ret = 0;
+ char *value = NULL;
- for (i = 0; i < layout->cnt; i++) {
- snprintf (tmp_buf, 128, "(%s %u %u)",
- layout->list[i].xlator->name,
- layout->list[i].start,
- layout->list[i].stop);
- if (i)
- strcat (buf, " ");
- strcat (buf, tmp_buf);
+ if (op_ret != -1) {
+ ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value);
+ if (!ret) {
+ ret = dict_set_str(xattr, GF_XATTR_LINKINFO_KEY, value);
+ if (ret)
+ gf_msg_trace(this->name, 0, "failed to set linkinfo");
}
-}
+ }
-int
-dht_pathinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr)
-{
- dht_local_t *local = NULL;
- int ret = 0;
- int flag = 0;
- int this_call_cnt = 0;
- char *value_got = NULL;
- char layout_buf[8192] = {0,};
- char *xattr_buf = NULL;
- dict_t *dict = NULL;
- int32_t alloc_len = 0;
- int32_t plen = 0;
+ DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
- local = frame->local;
+ return 0;
+}
- if (op_ret != -1) {
- ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value_got);
- if (!ret) {
- alloc_len = strlen (value_got);
-
- /**
- * allocate the buffer:- we allocate 10 bytes extra in case we need to
- * append ' Link: ' in the buffer for another STACK_WIND
- */
- if (!local->pathinfo) {
- alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10);
- local->pathinfo = GF_CALLOC (alloc_len, sizeof (char), gf_common_mt_char);
- }
+static int
+dht_mds_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
- if (local->pathinfo) {
- plen = strlen (local->pathinfo);
- if (plen) {
- /* extra byte(s) for \0 to be safe */
- alloc_len += (plen + 2);
- local->pathinfo = GF_REALLOC (local->pathinfo,
- alloc_len);
- if (!local->pathinfo)
- goto out;
- }
-
- strcat (local->pathinfo, value_got);
- }
- }
- }
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(frame->local, err);
+ VALIDATE_OR_GOTO(this->private, err);
- out:
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- if (local->layout->cnt > 1) {
- /* Set it for directory */
- fill_layout_info (local->layout, layout_buf);
- flag = 1;
- }
+ conf = this->private;
+ local = frame->local;
- dict = dict_new ();
+ if (!xattr || (op_ret == -1)) {
+ local->op_ret = op_ret;
+ goto out;
+ }
+ dict_del(xattr, conf->xattr_name);
+ local->op_ret = 0;
- /* we would need max-to-max this many bytes to create pathinfo string */
- alloc_len += (2 * strlen (this->name)) + strlen (layout_buf) + 40;
- xattr_buf = GF_CALLOC (alloc_len, sizeof (char), gf_common_mt_char);
+ if (!local->xattr) {
+ local->xattr = dict_copy_with_ref(xattr, NULL);
+ }
- if (flag && local->pathinfo)
- snprintf (xattr_buf, alloc_len, "((<"DHT_PATHINFO_HEADER"%s> %s) (%s-layout %s))",
- this->name, local->pathinfo, this->name,
- layout_buf);
- else if (local->pathinfo)
- snprintf (xattr_buf, alloc_len, "(<"DHT_PATHINFO_HEADER"%s> %s)",
- this->name, local->pathinfo);
- else if (flag)
- snprintf (xattr_buf, alloc_len, "(%s-layout %s)",
- this->name, layout_buf);
+out:
+ DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr,
+ xdata);
+ return 0;
+err:
+ DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL);
+ return 0;
+}
- ret = dict_set_dynstr (dict, GF_XATTR_PATHINFO_KEY,
- xattr_buf);
+int
+dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
- if (local->pathinfo)
- GF_FREE (local->pathinfo);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(frame->local, err);
+ VALIDATE_OR_GOTO(this->private, err);
- DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict);
+ conf = this->private;
+ local = frame->local;
- if (dict)
- dict_unref (dict);
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto err;
+ return 0;
+ }
- return 0;
+ LOCK(&frame->lock);
+ {
+ if (!xattr || (op_ret == -1)) {
+ local->op_ret = op_ret;
+ goto unlock;
}
- if (local->pathinfo)
- strcat (local->pathinfo, " Link: ");
- if (local->hashed_subvol) {
- /* This will happen if there pending */
- STACK_WIND (frame, dht_pathinfo_getxattr_cbk, local->hashed_subvol,
- local->hashed_subvol->fops->getxattr,
- &local->loc, local->key);
+ dict_del(xattr, conf->xattr_name);
+ dict_del(xattr, conf->mds_xattr_key);
- return 0;
+ dict_del(xattr, conf->commithash_xattr_name);
+
+ if (frame->root->pid >= 0) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
}
- gf_log ("this->name", GF_LOG_ERROR, "Unable to find hashed_subvol for path"
- " %s", local->pathinfo);
+ local->op_ret = 0;
- DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, dict);
- return 0;
-}
+ if (!local->xattr) {
+ local->xattr = dict_copy_with_ref(xattr, NULL);
+ } else {
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
-int
-dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr)
-{
- int ret = 0;
- char *value = NULL;
-
- if (op_ret != -1) {
- ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value);
- if (!ret) {
- ret = dict_set_str (xattr, GF_XATTR_LINKINFO_KEY, value);
- if (!ret)
- gf_log (this->name, GF_LOG_TRACE,
- "failed to set linkinfo");
- }
+ if (!local->xdata) {
+ local->xdata = dict_ref(xdata);
+ } else if ((local->inode && IA_ISDIR(local->inode->ia_type)) ||
+ (local->fd && IA_ISDIR(local->fd->inode->ia_type))) {
+ dht_aggregate_xattr(local->xdata, xdata);
}
+ }
+unlock:
+ UNLOCK(&frame->lock);
- DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ /* If we have received a valid xattr from any one of the
+ * subvolumes, return it */
+ if (local->xattr) {
+ local->op_ret = 0;
+ }
- return 0;
+ DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr,
+ local->xdata);
+ }
+ return 0;
+err:
+ DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL);
+ return 0;
}
-int
-dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr)
+static int32_t
+dht_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict,
+ dict_t *xdata)
{
- int this_call_cnt = 0;
- dht_local_t *local = NULL;
+ DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (frame->local, out);
+static int
+dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- this_call_cnt = dht_frame_return (frame);
+ LOCK(&frame->lock);
+ {
+ if (local->op_errno == EOPNOTSUPP) {
+ /* Nothing to do here; we have already found a subvol
+ * which does not have the get_real_filename
+ * optimization, so the result is already decided.
+ */
+ goto unlock;
+ }
- if (!xattr || (op_ret == -1))
- goto out;
+ if (op_ret == -1) {
+ if (op_errno == EOPNOTSUPP) {
+ /* This subvol does not have the optimization.
+ * Better let the user know we don't support it.
+ * Remove previous results if any.
+ */
- if (dict_get (xattr, "trusted.glusterfs.dht")) {
- dict_del (xattr, "trusted.glusterfs.dht");
- }
- local->op_ret = 0;
+ if (local->xattr) {
+ dict_unref(local->xattr);
+ local->xattr = NULL;
+ }
- if (!local->xattr) {
- local->xattr = dict_copy_with_ref (xattr, NULL);
- } else {
- /* first aggregate everything into xattr and then copy into
- * local->xattr.
+ if (local->xattr_req) {
+ dict_unref(local->xattr_req);
+ local->xattr_req = NULL;
+ }
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_UPGRADE_BRICKS,
+ "At least "
+ "one of the bricks does not support "
+ "this operation. Please upgrade all "
+ "bricks.");
+ goto post_unlock;
+ }
+
+ if (op_errno == ENOATTR) {
+ /* Do nothing, our defaults are set to this.
*/
- dht_aggregate_xattr (xattr, local->xattr);
- local->xattr = dict_copy (xattr, local->xattr);
- }
-out:
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, local->xattr);
- }
- return 0;
+ goto unlock;
+ }
+
+ /* This is a placeholder for every other error case.
+ * It is not clear how ENOTCONN etc. should be
+ * interpreted; for now, ignore the down subvol and
+ * return a good result (if any) from another subvol.
+ */
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "Failed to get real filename.");
+ goto post_unlock;
+ }
+
+ /* This subvol has the required file.
+ * There could be other subvols which have returned
+ * success already, choosing to return the latest good
+ * result.
+ */
+ if (local->xattr)
+ dict_unref(local->xattr);
+ local->xattr = dict_ref(xattr);
+
+ if (local->xattr_req) {
+ dict_unref(local->xattr_req);
+ local->xattr_req = NULL;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->op_ret = op_ret;
+ local->op_errno = 0;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, 0, "Found a matching file.");
+ goto post_unlock;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ DHT_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ local->xattr, local->xattr_req);
+ }
+
+ return 0;
}
-int32_t
-dht_getxattr_unwind (call_frame_t *frame,
- int op_ret, int op_errno, dict_t *dict)
+static int
+dht_getxattr_get_real_filename(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata)
{
- DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict);
- return 0;
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
+ int cnt = 0;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+ layout = local->layout;
+
+ cnt = local->call_cnt = layout->cnt;
+
+ local->op_ret = -1;
+ local->op_errno = ENOATTR;
+
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_getxattr_get_real_filename_cbk, subvol,
+ subvol->fops->getxattr, loc, key, xdata);
+ }
+
+ return 0;
}
+static int
+dht_marker_populate_args(call_frame_t *frame, int type, int *gauge,
+ xlator_t **subvols)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
-int
-dht_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *key)
-{
- xlator_t *subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- xlator_t **sub_volumes = NULL;
- int op_errno = -1;
- int ret = 0;
- int flag = 0;
- int i = 0;
- int cnt = 0;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- conf = this->private;
- layout = dht_layout_get (this, loc->inode);
- if (!layout) {
- gf_log (this->name, GF_LOG_ERROR,
- "layout is NULL");
- op_errno = ENOENT;
- goto err;
- }
+ local = frame->local;
+ layout = local->layout;
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
+ for (i = 0; i < layout->cnt; i++)
+ subvols[i] = layout->list[i].xlator;
- goto err;
- }
+ return layout->cnt;
+}
- if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)) {
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- cached_subvol = dht_subvol_get_cached (this, loc->inode);
+static int
+dht_is_debug_xattr_key(const char **array, char *key)
+{
+ int i = 0;
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
+ for (i = 0; array[i]; i++) {
+ if (fnmatch(array[i], key, FNM_NOESCAPE) == 0)
+ return i;
+ }
- goto err;
- }
+ return -1;
+}
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
+/* Note we already have frame->local initialised here */
- goto err;
- }
- local->key = gf_strdup (key);
- if (!local->key) {
- op_errno = ENOMEM;
+static int
+dht_handle_debug_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENODATA;
+ char *value = NULL;
+ loc_t file_loc = {0};
+ const char *name = NULL;
- goto err;
- }
- local->layout = layout;
+ local = frame->local;
- local->call_cnt = 1;
- if (hashed_subvol != cached_subvol) {
- local->call_cnt = 2;
- local->hashed_subvol = hashed_subvol;
- }
+ if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) == -1) {
+ goto out;
+ }
- STACK_WIND (frame, dht_pathinfo_getxattr_cbk, cached_subvol,
- cached_subvol->fops->getxattr, loc, key);
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- return 0;
+ if (strncmp(key, DHT_DBG_HASHED_SUBVOL_KEY,
+ SLEN(DHT_DBG_HASHED_SUBVOL_KEY)) == 0) {
+ name = key + strlen(DHT_DBG_HASHED_SUBVOL_KEY);
+ if (strlen(name) == 0) {
+ op_errno = EINVAL;
+ goto out;
}
- if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) {
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- cached_subvol = dht_subvol_get_cached (this, loc->inode);
- if (hashed_subvol == cached_subvol) {
- op_errno = ENODATA;
- goto err;
- }
- if (hashed_subvol) {
- STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol,
- hashed_subvol->fops->getxattr, loc,
- GF_XATTR_PATHINFO_KEY);
- return 0;
- }
- op_errno = ENODATA;
- goto err;
+
+ ret = dht_build_child_loc(this, &file_loc, loc, (char *)name);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto out;
}
- if (key && (strcmp (key, GF_XATTR_FIX_LAYOUT_KEY) == 0)) {
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].start == layout->list[i].stop) {
- flag = 1;
- break;
- }
- }
- if ((layout->cnt < conf->subvolume_cnt) || flag) {
- gf_log (this->name, GF_LOG_INFO,
- "expanding layout of %s from %d to %d",
- loc->path, layout->cnt, conf->subvolume_cnt);
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
-
- goto err;
- }
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
+ local->hashed_subvol = dht_subvol_get_hashed(this, &file_loc);
+ if (local->hashed_subvol == NULL) {
+ op_errno = ENODATA;
+ goto out;
+ }
- goto err;
- }
- local->layout = layout;
- //layout = dht_layout_new (this, conf->subvolume_cnt);
+ value = gf_strdup(local->hashed_subvol->name);
+ if (!value) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- dht_selfheal_new_directory (frame, dht_fix_layout_cbk,
- layout);
- return 0;
- }
- op_errno = ENODATA;
- goto err;
+ ret = dict_set_dynstr(local->xattr, (char *)key, value);
+ if (ret < 0) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
}
+ ret = 0;
+ goto out;
+ }
- if (key && (!strcmp (GF_XATTR_MARKER_KEY, key))
- && (-1 == frame->root->pid)) {
+out:
+ loc_wipe(&file_loc);
+ DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL);
+ return 0;
+}
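
/* [Editor's illustrative sketch -- not part of this patch] The debug vxattr
 * handled above lets an admin ask which subvolume a given basename would hash
 * to, without creating the file: the basename is appended to the key and the
 * getxattr() is issued on the parent directory. The literal key prefix below
 * is an assumption based on the DHT_DBG_HASHED_SUBVOL_KEY macro; check the
 * headers for the exact string.
 */
#include <stdio.h>
#include <sys/xattr.h>

static void
print_hashed_subvol(const char *parent_dir, const char *basename)
{
    char key[512];
    char subvol[256] = {0};

    snprintf(key, sizeof(key), "dht.file.hashed-subvol.%s", basename);

    if (getxattr(parent_dir, key, subvol, sizeof(subvol) - 1) > 0)
        printf("%s would hash to subvol %s\n", basename, subvol);
}
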
- if (loc->inode-> ia_type == IA_IFDIR) {
- cnt = layout->cnt;
- } else {
- cnt = 1;
- }
- sub_volumes = alloca ( cnt * sizeof (xlator_t *));
- for (i = 0; i < cnt; i++)
- *(sub_volumes + i) = layout->list[i].xlator;
-
- if (cluster_getmarkerattr (frame, this, loc, key,
- local, dht_getxattr_unwind,
- sub_volumes, cnt,
- MARKER_UUID_TYPE, conf->vol_uuid)) {
- op_errno = EINVAL;
- goto err;
- }
+/* Virtual xattr that returns 1 if all subvols are up and 0 otherwise.
+ Geo-rep queries this virtual xattr after a fresh mount and only then
+ starts its I/O.
+*/
- return 0;
- }
+enum dht_vxattr_subvol {
+ DHT_VXATTR_SUBVOLS_UP = 1,
+ DHT_VXATTR_SUBVOLS_DOWN = 0,
+};
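
/* [Editor's illustrative sketch -- not part of this patch] How a consumer
 * such as geo-rep could poll the virtual xattr implemented in
 * dht_vgetxattr_subvol_status() below before starting I/O. The mount path and
 * key argument are placeholders, and the assumption that the value arrives as
 * a single byte (it is packed with dict_set_int8()) is illustrative only;
 * DHT_SUBVOL_STATUS_KEY in the headers is authoritative.
 */
#include <stdint.h>
#include <sys/xattr.h>
#include <unistd.h>

static void
wait_for_all_subvols(const char *mountpoint, const char *status_key)
{
    int8_t up = 0; /* DHT_VXATTR_SUBVOLS_DOWN */

    /* Retry until the xattr reports that every subvolume is up. */
    while (getxattr(mountpoint, status_key, &up, sizeof(up)) !=
               (ssize_t)sizeof(up) ||
           up != 1 /* DHT_VXATTR_SUBVOLS_UP */)
        sleep(1);
}
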
- if (key && *conf->vol_uuid) {
- if ((match_uuid_local (key, conf->vol_uuid) == 0) &&
- (-1 == frame->root->pid)) {
- if (loc->inode-> ia_type == IA_IFDIR) {
- cnt = layout->cnt;
- } else {
- cnt = 1;
- }
- sub_volumes = alloca ( cnt * sizeof (xlator_t *));
- for (i = 0; i < cnt; i++)
- sub_volumes[i] = layout->list[i].xlator;
-
- if (cluster_getmarkerattr (frame, this, loc, key,
- local, dht_getxattr_unwind,
- sub_volumes, cnt,
- MARKER_XTIME_TYPE,
- conf->vol_uuid)) {
- op_errno = EINVAL;
- goto err;
- }
+int
+dht_vgetxattr_subvol_status(call_frame_t *frame, xlator_t *this,
+ const char *key)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENODATA;
+ int value = DHT_VXATTR_SUBVOLS_UP;
+ int i = 0;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ local = frame->local;
+
+ if (!key) {
+ op_errno = EINVAL;
+ goto out;
+ }
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ value = DHT_VXATTR_SUBVOLS_DOWN;
+ gf_msg_debug(this->name, 0, "subvol %s is down ",
+ conf->subvolumes[i]->name);
+ break;
+ }
+ }
+ ret = dict_set_int8(local->xattr, (char *)key, value);
+ if (ret < 0) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
- return 0;
- }
+out:
+ DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL);
+ return 0;
+}
+
+#define DHT_IS_DIR(layout) (layout->cnt > 1)
+
+int
+dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *mds_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int op_errno = -1;
+ int i = 0;
+ int cnt = 0;
+ char *node_uuid_key = NULL;
+ int ret = -1;
+
+ GF_CHECK_XATTR_KEY_AND_GOTO(key, IO_THREADS_QUEUE_SIZE_KEY, op_errno, err);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_GETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL,
+ "Layout is NULL");
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ /* skip over code which is irrelevant without a valid key */
+ if (!key)
+ goto no_key;
+
+ local->key = gf_strdup(key);
+ if (!local->key) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ if (strncmp(key, DHT_SUBVOL_STATUS_KEY, SLEN(DHT_SUBVOL_STATUS_KEY)) == 0) {
+ dht_vgetxattr_subvol_status(frame, this, key);
+ return 0;
+ }
+
+ /* skip over code which is irrelevant if !DHT_IS_DIR(layout) */
+ if (!DHT_IS_DIR(layout))
+ goto no_dht_is_dir;
+
+ if ((strncmp(key, GF_XATTR_GET_REAL_FILENAME_KEY,
+ SLEN(GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) &&
+ DHT_IS_DIR(layout)) {
+ dht_getxattr_get_real_filename(frame, this, loc, key, xdata);
+ return 0;
+ }
+
+ if (!strcmp(key, GF_REBAL_FIND_LOCAL_SUBVOL)) {
+ ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_LIST_NODE_UUIDS_KEY);
+ if (ret == -1 || !node_uuid_key) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "Failed to copy node uuid key");
+ op_errno = ENOMEM;
+ goto err;
+ }
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key);
+ cnt = local->call_cnt = conf->subvolume_cnt;
+ for (i = 0; i < cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->getxattr, loc,
+ node_uuid_key, xdata);
+ }
+ if (node_uuid_key)
+ GF_FREE(node_uuid_key);
+ return 0;
+ }
+
+ if (!strcmp(key, GF_REBAL_OLD_FIND_LOCAL_SUBVOL)) {
+ ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_NODE_UUID_KEY);
+ if (ret == -1 || !node_uuid_key) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "Failed to copy node uuid key");
+ op_errno = ENOMEM;
+ goto err;
+ }
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key);
+ cnt = local->call_cnt = conf->subvolume_cnt;
+ for (i = 0; i < cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->getxattr, loc,
+ node_uuid_key, xdata);
+ }
+ if (node_uuid_key)
+ GF_FREE(node_uuid_key);
+ return 0;
+ }
+
+ /* For a file, use the cached subvolume (see the if block below).
+ * For a directory, wind to all subvolumes and exclude the subvolumes
+ * that return ENOTCONN (in the callback).
+ *
+ * NOTE: Don't trust the inode here, as it may not be valid
+ * (until inode_link() happens).
+ */
+
+ if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0) ||
+ (strcmp(key, GF_XATTR_LIST_NODE_UUIDS_KEY) == 0)) {
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key);
+ cnt = local->call_cnt = layout->cnt;
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_vgetxattr_dir_cbk, subvol,
+ subvol->fops->getxattr, loc, key, xdata);
}
+ return 0;
+ }
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
+no_dht_is_dir:
+ /* node-uuid or pathinfo for files */
+ if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0)) {
+ cached_subvol = local->cached_subvol;
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key);
+ local->call_cnt = 1;
+ STACK_WIND_COOKIE(frame, dht_vgetxattr_cbk, cached_subvol,
+ cached_subvol, cached_subvol->fops->getxattr, loc,
+ key, xdata);
- goto err;
+ return 0;
+ }
+
+ if (strcmp(key, GF_XATTR_LINKINFO_KEY) == 0) {
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (!hashed_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for %s", loc->path);
+ op_errno = EINVAL;
+ goto err;
}
- if (key) {
- local->key = gf_strdup (key);
- if (!local->key) {
- op_errno = ENOMEM;
+ cached_subvol = dht_subvol_get_cached(this, loc->inode);
+ if (!cached_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_CACHED_SUBVOL_GET_FAILED,
+ "Failed to get cached subvol for %s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
- goto err;
- }
+ if (hashed_subvol == cached_subvol) {
+ op_errno = ENODATA;
+ goto err;
}
- local->layout = layout;
- if (loc->inode-> ia_type == IA_IFDIR) {
- cnt = local->call_cnt = layout->cnt;
+ STACK_WIND(frame, dht_linkinfo_getxattr_cbk, hashed_subvol,
+ hashed_subvol->fops->getxattr, loc, GF_XATTR_PATHINFO_KEY,
+ xdata);
+ return 0;
+ }
+
+ if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) >= 0) {
+ dht_handle_debug_getxattr(frame, this, loc, key);
+ return 0;
+ }
+
+no_key:
+ if (cluster_handle_marker_getxattr(frame, loc, key, conf->vol_uuid,
+ dht_getxattr_unwind,
+ dht_marker_populate_args) == 0)
+ return 0;
+
+ if (DHT_IS_DIR(layout)) {
+ local->call_cnt = conf->subvolume_cnt;
+ cnt = conf->subvolume_cnt;
+ ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+ if (!mds_subvol) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Cannot determine MDS, fetching xattr %s randomly"
+ " from a subvol for path %s ",
+ key, loc->path);
} else {
- cnt = local->call_cnt = 1;
+ /* TODO: handle this properly. For now we choose availability
+ over consistency: if the mds_subvol is down, wind the
+ getxattr call to another subvol and return its xattr.
+ */
+ local->mds_subvol = mds_subvol;
+ for (i = 0; i < cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_INFO, 0,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS %s is down for path"
+ " path %s so fetching xattr "
+ "%s randomly from a subvol ",
+ local->mds_subvol->name, loc->path, key);
+ ret = 1;
+ }
+ }
+ }
}
- for (i = 0; i < cnt; i++) {
- subvol = layout->list[i].xlator;
- STACK_WIND (frame, dht_getxattr_cbk,
- subvol, subvol->fops->getxattr,
- loc, key);
+ if (!ret && key && local->mds_subvol && dht_match_xattr(key)) {
+ STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol,
+ local->mds_subvol->fops->getxattr, loc, key, xdata);
+
+ return 0;
}
- return 0;
+ } else {
+ cnt = local->call_cnt = 1;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, loc,
+ key, xdata);
+ }
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
+#undef DHT_IS_DIR
int
-dht_fsetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, dict_t *xattr, int flags)
+dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata)
{
- xlator_t *subvol = NULL;
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int op_errno = -1;
+ int i = 0;
+ int cnt = 0;
+ xlator_t *mds_subvol = NULL;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(fd->inode, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FGETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL,
+ "Layout is NULL");
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ if (key) {
+ local->key = gf_strdup(key);
+ if (!local->key) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ }
+
+ gf_uuid_unparse(fd->inode->gfid, gfid);
+
+ if ((fd->inode->ia_type == IA_IFDIR) && key &&
+ (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) !=
+ 0)) {
+ local->call_cnt = conf->subvolume_cnt;
+ cnt = conf->subvolume_cnt;
+ ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol);
+
+ if (!mds_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "cannot determine MDS, fetching xattr %s "
+ " randomly from a subvol for gfid %s ",
+ key, gfid);
+ } else {
+ /* TODO: handle this properly. For now we choose availability
+ over consistency: if the mds_subvol is down, wind the
+ getxattr call to another subvol and return its xattr.
+ */
+ local->mds_subvol = mds_subvol;
+ for (i = 0; i < cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS subvolume %s is down"
+ " for gfid %s so fetching xattr "
+ " %s randomly from a subvol ",
+ local->mds_subvol->name, gfid, key);
+ ret = 1;
+ }
+ }
+ }
}
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ if (!ret && key && local->mds_subvol && dht_match_xattr(key)) {
+ STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fgetxattr, fd, key, NULL);
- local->inode = inode_ref (fd->inode);
- local->call_cnt = 1;
+ return 0;
+ }
- STACK_WIND (frame, dht_err_cbk, subvol, subvol->fops->fsetxattr,
- fd, xattr, flags);
+ } else {
+ cnt = local->call_cnt = 1;
+ }
- return 0;
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, fd,
+ key, NULL);
+ }
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
+static int
+dht_setxattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
-int
-dht_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr, int flags)
-{
- xlator_t *subvol = NULL;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
- int op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- conf = this->private;
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ if (!frame || !frame->local)
+ goto err;
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ local = frame->local;
+ op_errno = local->op_errno;
- local->layout = layout = dht_layout_get (this, loc->inode);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no layout for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original response so that a higher DHT layer,
+ * if any, can handle this.
+ */
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ local->rebalance.xdata);
+ return 0;
+ }
- local->call_cnt = layout->cnt;
+ if (subvol == NULL)
+ goto err;
- for (i = 0; i < layout->cnt; i++) {
- STACK_WIND (frame, dht_err_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->setxattr,
- loc, xattr, flags);
- }
+ local->call_cnt = 2; /* This is the second attempt */
- return 0;
+ if (local->fop == GF_FOP_SETXATTR) {
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->setxattr, &local->loc,
+ local->rebalance.xattr, local->rebalance.flags,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->fsetxattr, local->fd,
+ local->rebalance.xattr, local->rebalance.flags,
+ local->xattr_req);
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (setxattr, frame, -1, op_errno);
+ return 0;
- return 0;
+err:
+ DHT_STACK_UNWIND(setxattr, frame, (local ? local->op_ret : -1), op_errno,
+ NULL);
+ return 0;
}
-
int
-dht_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno)
+dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+ int ret = -1;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ struct iatt *stbuf = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol1 = NULL, *subvol2 = NULL;
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
+ local = frame->local;
+ prev = cookie;
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (removexattr, frame, local->op_ret, local->op_errno);
- }
+ local->op_errno = op_errno;
+ if ((local->fop == GF_FOP_FSETXATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
return 0;
-}
+ }
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.",
+ prev->name);
+ goto out;
+ }
-int
-dht_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *key)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
+ if (local->call_cnt != 1)
+ goto out;
- int i;
+ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
+ if ((!op_ret) && !stbuf) {
+ goto out;
+ }
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ local->op_ret = op_ret;
+ local->rebalance.target_op_fn = dht_setxattr2;
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Phase 1 of migration */
+ if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ dht_setxattr2(this, subvol2, frame, 0);
+ return 0;
}
- local->layout = layout = dht_layout_get (this, loc->inode);
- if (!local->layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no layout for path=%s", loc->path);
- op_errno = EINVAL;
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+ } else {
+ DHT_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+ }
+
+ return 0;
+}
+
+/* Called by dict_foreach_fnmatch() for every key that matches user.*;
+ sets the boolean flag to true.
+*/
+static int
+dht_is_user_xattr(dict_t *this, char *key, data_t *value, void *data)
+{
+ gf_boolean_t *user_xattr_found = data;
+ *user_xattr_found = _gf_true;
+ return 0;
+}
+
+/* Common code to wind a (f)(set|remove)xattr call on a directory
+ */
+static int
+dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ fd_t *fd, dict_t *xattr, int flags,
+ dict_t *xdata, int *op_errno)
+
+{
+ dict_t *xattrop = NULL;
+ int32_t subone[1] = {-1};
+ gf_boolean_t uxattr_key_found = _gf_false;
+ xlator_t *mds_subvol = NULL;
+ xlator_t *travvol = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int i = 0;
+ int call_cnt = 0;
+ dht_local_t *local = NULL;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char **xattrs_to_heal;
+
+ conf = this->private;
+ local = frame->local;
+ call_cnt = conf->subvolume_cnt;
+ local->flags = flags;
+ xattrs_to_heal = get_xattrs_to_heal();
+
+ if (!gf_uuid_is_null(local->gfid)) {
+ gf_uuid_unparse(local->gfid, gfid_local);
+ }
+
+ if ((local->fop == GF_FOP_SETXATTR) || (local->fop == GF_FOP_FSETXATTR)) {
+ /* Check if any user xattr present in xattr
+ */
+ dict_foreach_fnmatch(xattr, "user*", dht_is_user_xattr,
+ &uxattr_key_found);
+
+ /* Check if any custom xattr key is present in the dict;
+ start from index 1 because the user xattrs were already
+ checked above.
+ */
+ for (i = 1; xattrs_to_heal[i]; i++)
+ if (dict_get(xattr, xattrs_to_heal[i]))
+ uxattr_key_found = _gf_true;
+ }
+
+ if ((local->fop == GF_FOP_REMOVEXATTR) ||
+ (local->fop == GF_FOP_FREMOVEXATTR)) {
+ /* Check if any custom key xattr present in local->key
+ */
+ for (i = 0; xattrs_to_heal[i]; i++)
+ if (strstr(local->key, xattrs_to_heal[i]))
+ uxattr_key_found = _gf_true;
+ }
+
+ /* If no custom xattr key is present, or the gfid is the root gfid,
+ or call_cnt is 1, then wind the (f)setxattr/(f)removexattr call on all subvols
+ */
+ if (!uxattr_key_found || __is_root_gfid(local->gfid) || call_cnt == 1) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ travvol = conf->subvolumes[i];
+ if ((local->fop == GF_FOP_SETXATTR) ||
+ (local->fop == GF_FOP_FSETXATTR)) {
+ if (fd) {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->fsetxattr, fd, xattr,
+ flags, xdata);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->setxattr, loc, xattr,
+ flags, xdata);
+ }
+ }
+
+ if ((local->fop == GF_FOP_REMOVEXATTR) ||
+ (local->fop == GF_FOP_FREMOVEXATTR)) {
+ if (fd) {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->fremovexattr, fd,
+ local->key, local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->removexattr, loc,
+ local->key, local->xattr_req);
+ }
+ }
+ }
+
+ return 0;
+ }
+
+ /* Fetch the MDS subvol stored in the inode ctx
+ */
+ if (fd) {
+ ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol);
+ } else {
+ ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+ }
+ if (ret || !mds_subvol) {
+ if (fd) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get mds subvol for fd %p"
+ "gfid is %s ",
+ fd, gfid_local);
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "%s: Failed to get mds subvol. (gfid is %s)", loc->path,
+ gfid_local);
+ }
+ (*op_errno) = ENOENT;
+ goto err;
+ }
+
+ local->mds_subvol = mds_subvol;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS subvol is down for path "
+ " %s gfid is %s Unable to set xattr ",
+ local->loc.path, gfid_local);
+ (*op_errno) = ENOTCONN;
goto err;
+ }
+ }
+ }
+
+ if (uxattr_key_found) {
+ xattrop = dict_new();
+ if (!xattrop) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "dictionary creation failed for path %s "
+ "(gfid %s)",
+ local->loc.path, gfid_local);
+ (*op_errno) = ENOMEM;
+ goto err;
+ }
+ local->xattr = dict_ref(xattr);
+ /* Add -1 to the current MDS xattr value; the value of the MDS
+ xattr represents the number of times an xattr modification
+ failed on the non-MDS subvols.
+ */
+ ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, subone, 1);
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "dictionary set array failed for path %s "
+ "for gfid is %s ",
+ local->loc.path, gfid_local);
+ if (xattrop)
+ dict_unref(xattrop);
+ (*op_errno) = ret;
+ goto err;
+ }
+ /* Wind an xattrop call to take the ref-counting approach:
+ decrement the MDS xattr by 1 before updating the xattr on the
+ MDS subvol, and increment it by 1 again once the xattr has been
+ updated on all non-MDS subvols.
+ */
+ if (fd) {
+ STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fxattrop, fd,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+ } else {
+ STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->xattrop, loc,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
}
+ if (xattrop)
+ dict_unref(xattrop);
+ }
- local->call_cnt = layout->cnt;
+ return 0;
+err:
+ return -1;
+}
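
/* [Editor's illustrative sketch -- not part of this patch] One way to read
 * the -1/+1 accounting done above: the MDS xattr acts as a per-directory
 * failure counter. It is decremented through xattrop before the xattr is
 * changed and incremented back only after every non-MDS subvol has been
 * updated, so a value that never returns to its baseline indicates that some
 * subvol may hold a stale xattr and needs a heal. A minimal model of that
 * invariant (names here are illustrative, not from the glusterfs sources):
 */
static int
mds_counter_after_update(int counter, int all_non_mds_updated)
{
    counter -= 1; /* xattrop with subone[] = {-1}, before the update */

    if (all_non_mds_updated)
        counter += 1; /* restored once every non-MDS subvol succeeded */

    return counter; /* differs from the baseline => a heal is required */
}
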
- for (i = 0; i < layout->cnt; i++) {
- STACK_WIND (frame, dht_removexattr_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->removexattr,
- loc, key);
+int
+dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
+ int flags, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(fd->inode, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FSETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+ local->call_cnt = call_cnt = layout->cnt;
+
+ if (IA_ISDIR(fd->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, xattr,
+ flags, xdata, &op_errno);
+ if (ret)
+ goto err;
+ } else {
+ local->call_cnt = 1;
+ local->rebalance.xattr = dict_ref(xattr);
+ local->rebalance.flags = flags;
+
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+ if (ret) {
+ gf_msg_debug(this->name, 0,
+ "Failed to set dictionary key %s for fd=%p",
+ DHT_IATT_IN_XDATA_KEY, fd);
}
- return 0;
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->fsetxattr, fd, xattr, flags,
+ local->xattr_req);
+ }
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (removexattr, frame, -1, op_errno);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
+static int
+dht_checking_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ int i = -1;
+ int ret = -1;
+ char *value = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *prev = NULL;
+ int this_call_cnt = 0;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (op_ret == -1)
+ goto out;
+
+ ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value);
+ if (ret)
+ goto out;
+
+ if (!strcmp(value, local->key)) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev)
+ conf->decommissioned_bricks[i] = prev;
+ }
+ }
-int
-dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd)
+out:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, ENOTSUP, NULL);
+ }
+ return 0;
+}
+
+static int
+dht_nuke_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, NULL);
+ return 0;
+}
+static int
+dht_nuke_dir(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *tmp)
+{
+ if (!IA_ISDIR(loc->inode->ia_type)) {
+ DHT_STACK_UNWIND(setxattr, frame, -1, ENOTSUP, NULL);
+ return 0;
+ }
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+ /* Setxattr didn't need the parent, but rmdir does. */
+ loc->parent = inode_parent(loc->inode, NULL, NULL);
+ if (!loc->parent) {
+ DHT_STACK_UNWIND(setxattr, frame, -1, ENOENT, NULL);
+ return 0;
+ }
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
- local->op_ret = 0;
+ if (!loc->name && loc->path) {
+ loc->name = strrchr(loc->path, '/');
+ if (loc->name) {
+ ++(loc->name);
}
-unlock:
- UNLOCK (&frame->lock);
+ }
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
- local->fd);
+ /*
+ * We do this instead of calling dht_rmdir_do directly for two reasons.
+ * The first is that we want to reuse all of the initialization that
+ * dht_rmdir does, so if it ever changes we'll just follow along. The
+ * second (i.e. why we don't use STACK_WIND_TAIL) is so that we don't
+ * obscure the fact that we came in via this path instead of a genuine
+ * rmdir. That makes debugging just a tiny bit easier.
+ */
+ STACK_WIND(frame, dht_nuke_dir_cbk, this, this->fops->rmdir, loc, 1, NULL);
- return 0;
+ return 0;
}
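
/* [Editor's illustrative sketch -- not part of this patch] dht_nuke_dir() is
 * reached through an ordinary setxattr on the directory with the
 * "glusterfs.dht.nuke" key (see the dict_get() in dht_setxattr() below). A
 * client-side trigger could therefore look like this; the value is not
 * inspected by the code shown here, only the key matters.
 */
#include <sys/xattr.h>

static int
nuke_directory(const char *dirpath)
{
    /* Roughly equivalent to: setfattr -n glusterfs.dht.nuke -v "1" <dirpath> */
    return setxattr(dirpath, "glusterfs.dht.nuke", "1", 1, 0);
}
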
-
int
-dht_open (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags, fd_t *fd, int wbflags)
+dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr,
+ int flags, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int ret = -1;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int op_errno = EINVAL;
+ int ret = -1;
+ data_t *tmp = NULL;
+ uint32_t dir_spread = 0;
+ char value[4096] = {
+ 0,
+ };
+ gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA;
+ int call_cnt = 0;
+ uint32_t new_hash = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, err);
+
+ methods = &(conf->methods);
+
+ /* Rebalance daemon is allowed to set internal keys */
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_SETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->call_cnt = call_cnt = layout->cnt;
+ tmp = dict_get(xattr, conf->mds_xattr_key);
+ if (tmp) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ tmp = dict_get(xattr, GF_XATTR_FILE_MIGRATE_KEY);
+ if (tmp) {
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ /* TODO: need to interpret the 'value' for more meaning
+ (ie, 'target' subvolume given there, etc) */
+ memcpy(value, tmp->data, tmp->len);
+ if (strcmp(value, "force") == 0)
+ forced_rebalance = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS;
+
+ if (conf->decommission_in_progress)
+ forced_rebalance = GF_DHT_MIGRATE_HARDLINK;
+
+ if (!loc->path) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!local->loc.name)
+ local->loc.name = strrchr(local->loc.path, '/') + 1;
+
+ if (!local->loc.parent)
+ local->loc.parent = inode_parent(local->loc.inode, NULL, NULL);
+
+ if ((!local->loc.name) || (!local->loc.parent)) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (gf_uuid_is_null(local->loc.pargfid))
+ gf_uuid_copy(local->loc.pargfid, local->loc.parent->gfid);
+
+ methods->migration_get_dst_subvol(this, local);
+
+ if (!local->rebalance.target_node) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for %s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->rebalance.from_subvol = local->cached_subvol;
+
+ if (local->rebalance.target_node == local->rebalance.from_subvol) {
+ op_errno = EEXIST;
+ goto err;
+ }
+ if (local->rebalance.target_node) {
+ local->flags = forced_rebalance;
+ frame->root->pid = GF_CLIENT_PID_DEFRAG;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ ret = dht_start_rebalance_task(this, frame);
+ if (!ret)
+ return 0;
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED,
+ "%s: failed to create a new rebalance synctask", loc->path);
}
+ op_errno = EINVAL;
+ goto err;
+ }
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
-
- goto err;
+ tmp = dict_get(xattr, "decommission-brick");
+ if (tmp) {
+ /* This operation should happen only on '/' */
+ if (!__is_root_gfid(loc->inode->gfid)) {
+ op_errno = ENOTSUP;
+ goto err;
}
- local->fd = fd_ref (fd);
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
+ memcpy(value, tmp->data, min(tmp->len, 4095));
+ local->key = gf_strdup(value);
+ local->call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* Get the pathinfo, and then compare */
+ STACK_WIND_COOKIE(frame, dht_checking_pathinfo_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->getxattr, loc,
+ GF_XATTR_PATHINFO_KEY, NULL);
+ }
+ return 0;
+ }
+ tmp = dict_get(xattr, GF_XATTR_FIX_LAYOUT_KEY);
+ if (tmp) {
+ ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash);
+ if (ret == 0) {
+ gf_msg_debug(this->name, 0,
+ "updating commit hash for %s from %u to %u",
+ uuid_utoa(loc->gfid), layout->commit_hash, new_hash);
+ layout->commit_hash = new_hash;
+
+ ret = dht_update_commit_hash_for_layout(frame);
+ if (ret) {
+ op_errno = ENOTCONN;
goto err;
+ }
+ return ret;
}
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_FIX_LAYOUT_INFO,
+ "fixing the layout of %s", loc->path);
+
+ ret = dht_fix_directory_layout(frame, dht_fix_layout_setxattr_cbk,
+ layout);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
+ }
+
+ tmp = dict_get(xattr, "distribute.directory-spread-count");
+ if (tmp) {
+ /* Setxattr value is packed as 'binary', not string */
+ memcpy(value, tmp->data, min(tmp->len, 4095));
+ ret = gf_string2uint32(value, &dir_spread);
+ if (!ret && ((dir_spread <= conf->subvolume_cnt) && (dir_spread > 0))) {
+ layout->spread_cnt = dir_spread;
+
+ ret = dht_fix_directory_layout(frame, dht_common_setxattr_cbk,
+ layout);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
+ }
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_OPERATION_NOT_SUP,
+ "wrong 'directory-spread-count' value (%s)", value);
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ tmp = dict_get(xattr, "glusterfs.dht.nuke");
+ if (tmp) {
+ return dht_nuke_dir(frame, this, loc, tmp);
+ }
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, xattr,
+ flags, xdata, &op_errno);
+ if (ret)
+ goto err;
+ } else {
+ local->rebalance.xattr = dict_ref(xattr);
+ local->rebalance.flags = flags;
local->call_cnt = 1;
- STACK_WIND (frame, dht_fd_cbk,
- subvol, subvol->fops->open,
- loc, flags, fd, wbflags);
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
- return 0;
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->setxattr, loc, xattr, flags,
+ local->xattr_req);
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
-int
-dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- struct iovec *vector, int count, struct iatt *stbuf,
- struct iobref *iobref)
+static int
+dht_removexattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = frame->local;
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ if (!frame || !frame->local)
+ goto err;
- if (op_ret != -1)
- stbuf->ia_ino = local->ia_ino;
-out:
- DHT_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf,
- iobref);
+ local = frame->local;
+ op_errno = local->op_errno;
+ local->call_cnt = 2; /* This is the second attempt */
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original response so that a higher DHT layer,
+ * if any, can handle this.
+ */
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ local->rebalance.xdata);
return 0;
-}
+ }
+
+ if (subvol == NULL)
+ goto err;
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->removexattr, &local->loc, local->key,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->fremovexattr, local->fd, local->key,
+ local->xattr_req);
+ }
+ return 0;
+
+err:
+ DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+ return 0;
+}
int
-dht_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off)
+dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ int ret = -1;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ struct iatt *stbuf = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol1 = NULL, *subvol2 = NULL;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ local = frame->local;
+ prev = cookie;
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ local->op_errno = op_errno;
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ if ((local->fop == GF_FOP_FREMOVEXATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- local->ia_ino = fd->inode->ino;
- STACK_WIND (frame, dht_readv_cbk,
- subvol, subvol->fops->readv,
- fd, size, off);
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
- return 0;
+ if (local->call_cnt != 1)
+ goto out;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL);
+ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
- return 0;
-}
+ if ((!op_ret) && !stbuf) {
+ goto out;
+ }
+ local->op_ret = 0;
-int
-dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- dht_local_t *local = NULL;
+ local->rebalance.target_op_fn = dht_removexattr2;
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
- if (op_ret == -1) {
- goto out;
- }
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
- local = frame->local;
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
+ /* Phase 1 of migration */
+ if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ dht_removexattr2(this, subvol2, frame, 0);
+ return 0;
}
- prebuf->ia_ino = local->ia_ino;
- postbuf->ia_ino = local->ia_ino;
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
out:
- DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf);
-
- return 0;
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+ } else {
+ DHT_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
+ }
+ return 0;
}
-
int
-dht_writev (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iovec *vector, int count, off_t off,
- struct iobref *iobref)
+dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int call_cnt = 0;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err);
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_REMOVEXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!local->layout) {
+ gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+
+ local->call_cnt = call_cnt = layout->cnt;
+ local->key = gf_strdup(key);
+
+ if (key && (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0)) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, NULL, 0,
+ local->xattr_req, &op_errno);
+ if (ret)
+ goto err;
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
+ } else {
+ local->call_cnt = 1;
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to "
+ "set dictionary key %s for %s",
+ DHT_IATT_IN_XDATA_KEY, loc->path);
}
- local = dht_local_init (frame);
- if (!local) {
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->removexattr, loc, key,
+ local->xattr_req);
+ }
- op_errno = ENOMEM;
- goto err;
- }
+ return 0;
- local->ia_ino = fd->inode->ino;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
- STACK_WIND (frame, dht_writev_cbk,
- subvol, subvol->fops->writev,
- fd, vector, count, off, iobref);
+ return 0;
+}
- return 0;
+int
+dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int call_cnt = 0;
+ dht_conf_t *conf = 0;
+ int ret = 0;
+
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err);
+
+ VALIDATE_OR_GOTO(frame, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FREMOVEXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for inode=%s",
+ uuid_utoa(fd->inode->gfid));
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!local->layout) {
+ gf_msg_debug(this->name, 0, "no layout for inode=%s",
+ uuid_utoa(fd->inode->gfid));
+ op_errno = EINVAL;
+ goto err;
+ }
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+
+ local->call_cnt = call_cnt = layout->cnt;
+ local->key = gf_strdup(key);
+
+ if (IA_ISDIR(fd->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, NULL, 0,
+ local->xattr_req, &op_errno);
+ if (ret)
+ goto err;
+
+ } else {
+ local->call_cnt = 1;
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to "
+ "set dictionary key %s for fd=%p",
+ DHT_IATT_IN_XDATA_KEY, fd);
+ }
+
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->fremovexattr, fd, key,
+ local->xattr_req);
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
int
-dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+dht_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, fd_t *fd, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ local = frame->local;
+ prev = cookie;
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
}
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, local->fd,
+ NULL);
+
+ return 0;
+}
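
/* [Editor's illustrative sketch -- not part of this patch] dht_fd_cbk() above
 * follows the usual DHT fan-out pattern: call_cnt is set to the number of
 * winds, every callback decrements it through dht_frame_return(), and only
 * the callback that observes the count reaching zero unwinds. A lock-free,
 * single-threaded model of that bookkeeping (names are illustrative):
 */
static int
fanout_one_reply(int *call_cnt, int op_ret, int *aggregate_ret)
{
    if (op_ret == 0)
        *aggregate_ret = 0; /* one successful subvol is enough for opendir */

    *call_cnt -= 1;          /* dht_frame_return() */

    return (*call_cnt == 0); /* is_last_call(): the caller should unwind now */
}
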
- goto err;
+/*
+ * dht_normalize_stats - scale a statvfs result to a common block and
+ * fragment size so that results from different subvols can be summed.
+ */
+static void
+dht_normalize_stats(struct statvfs *buf, unsigned long bsize,
+ unsigned long frsize)
+{
+ double factor = 0;
+
+ if (buf->f_bsize != bsize) {
+ buf->f_bsize = bsize;
+ }
+
+ if (buf->f_frsize != frsize) {
+ factor = ((double)buf->f_frsize) / frsize;
+ buf->f_frsize = frsize;
+ buf->f_blocks = (fsblkcnt_t)(factor * buf->f_blocks);
+ buf->f_bfree = (fsblkcnt_t)(factor * buf->f_bfree);
+ buf->f_bavail = (fsblkcnt_t)(factor * buf->f_bavail);
+ }
+}
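
/* [Editor's illustrative sketch -- not part of this patch] A worked example
 * of dht_normalize_stats(): if one subvol reports f_frsize = 1024 while the
 * aggregate uses 4096, the block counters of the smaller-fragment reply are
 * scaled by 1024/4096 = 0.25 so that the additions in dht_statfs_cbk() are
 * done in a single unit.
 */
#include <assert.h>
#include <sys/statvfs.h>

static void
normalize_stats_example(void)
{
    struct statvfs small = {0};

    small.f_bsize = 1024;
    small.f_frsize = 1024;
    small.f_blocks = 4000;
    small.f_bfree = 2000;
    small.f_bavail = 2000;

    dht_normalize_stats(&small, 4096, 4096);

    assert(small.f_frsize == 4096);
    assert(small.f_blocks == 1000); /* 4000 * (1024.0 / 4096) */
    assert(small.f_bfree == 500);
    assert(small.f_bavail == 500);
}
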
+
+static int
+dht_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
+{
+ gf_boolean_t event = _gf_false;
+ qdstatfs_action_t action = qdstatfs_action_OFF;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int bsize = 0;
+ int frsize = 0;
+ GF_UNUSED int ret = 0;
+ unsigned long new_usage = 0;
+ unsigned long cur_usage = 0;
+
+ local = frame->local;
+ GF_ASSERT(local);
+
+ if (xdata)
+ ret = dict_get_int8(xdata, "quota-deem-statfs", (int8_t *)&event);
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto unlock;
}
+ if (!statvfs) {
+ op_errno = EINVAL;
+ local->op_ret = -1;
+ goto unlock;
+ }
+ local->op_ret = 0;
- local->fd = fd_ref (fd);
- local->call_cnt = 1;
+ if (local->quota_deem_statfs) {
+ if (event == _gf_true) {
+ action = qdstatfs_action_COMPARE;
+ } else {
+ action = qdstatfs_action_NEGLECT;
+ }
+ } else {
+ if (event == _gf_true) {
+ action = qdstatfs_action_REPLACE;
+ local->quota_deem_statfs = _gf_true;
+ }
+ }
- STACK_WIND (frame, dht_err_cbk,
- subvol, subvol->fops->flush, fd);
+ if (local->quota_deem_statfs) {
+ switch (action) {
+ case qdstatfs_action_NEGLECT:
+ goto unlock;
- return 0;
+ case qdstatfs_action_REPLACE:
+ local->statvfs = *statvfs;
+ goto unlock;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (flush, frame, -1, op_errno);
+ case qdstatfs_action_COMPARE:
+ new_usage = statvfs->f_blocks - statvfs->f_bfree;
+ cur_usage = local->statvfs.f_blocks -
+ local->statvfs.f_bfree;
- return 0;
-}
+ /* Take the max of the usage from subvols */
+ if (new_usage >= cur_usage)
+ local->statvfs = *statvfs;
+ goto unlock;
+ default:
+ break;
+ }
+ }
+
+ if (local->statvfs.f_bsize != 0) {
+ bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
+ frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
+ dht_normalize_stats(&local->statvfs, bsize, frsize);
+ dht_normalize_stats(statvfs, bsize, frsize);
+ } else {
+ local->statvfs.f_bsize = statvfs->f_bsize;
+ local->statvfs.f_frsize = statvfs->f_frsize;
+ }
+
+ local->statvfs.f_blocks += statvfs->f_blocks;
+ local->statvfs.f_bfree += statvfs->f_bfree;
+ local->statvfs.f_bavail += statvfs->f_bavail;
+ local->statvfs.f_files += statvfs->f_files;
+ local->statvfs.f_ffree += statvfs->f_ffree;
+ local->statvfs.f_favail += statvfs->f_favail;
+ local->statvfs.f_fsid = statvfs->f_fsid;
+ local->statvfs.f_flag = statvfs->f_flag;
+ local->statvfs.f_namemax = statvfs->f_namemax;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
+ &local->statvfs, xdata);
+
+ return 0;
+}
int
-dht_fsync (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int datasync)
+dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+ inode_t *inode = NULL;
+ inode_table_t *itable = NULL;
+ static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+ loc_t newloc = {
+ 0,
+ };
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(this->private, err);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ conf = this->private;
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
+ local = dht_local_init(frame, NULL, NULL, GF_FOP_STATFS);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (loc->inode && !IA_ISDIR(loc->inode->ia_type)) {
+ itable = loc->inode->table;
+ if (!itable) {
+ op_errno = EINVAL;
+ goto err;
}
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
+ loc = &local->loc2;
- goto err;
+ inode = inode_find(itable, root_gfid);
+ if (!inode) {
+ op_errno = EINVAL;
+ goto err;
}
- local->call_cnt = 1;
- local->ia_ino = fd->inode->ino;
+ dht_build_root_loc(inode, &newloc);
+ loc = &newloc;
+ }
- STACK_WIND (frame, dht_fsync_cbk,
- subvol, subvol->fops->fsync,
- fd, datasync);
+ local->call_cnt = conf->subvolume_cnt;
- return 0;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND(frame, dht_statfs_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->statfs, loc, xdata);
+ }
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
-
int
-dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct gf_flock *flock)
+dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
- DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock);
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+ int ret = 0;
+ gf_boolean_t new_xdata = _gf_false;
+ xlator_t **subvolumes = NULL;
+ int call_count = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, fd, GF_FOP_OPENDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ local->first_up_subvol = dht_first_up_subvol(this);
+
+ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ new_xdata = _gf_true;
+ }
+
+ ret = dict_set_uint32(xdata, conf->link_xattr_name, 256);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value : key = %s",
+ conf->link_xattr_name);
+
+ /* dht_readdirp will wind to all subvols so open has to be sent to
+ * all subvols whether or not conf->local_subvols is set */
+
+ call_count = local->call_cnt = conf->subvolume_cnt;
+ subvolumes = conf->subvolumes;
+
+ /* With parallel-readdir, readdir-ahead is loaded below dht. In that
+ * case, enabling or disabling SKIP_DIRS has to be done in opendir so
+ * that the prefetching logic in readdir-ahead honors it. */
+ for (i = 0; i < call_count; i++) {
+ if (conf->readdir_optimize == _gf_true) {
+ if (subvolumes[i] != local->first_up_subvol) {
+ ret = dict_set_int32(xdata, GF_READDIR_SKIP_DIRS, 1);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary"
+ " value :key = %s, ret:%d",
+ GF_READDIR_SKIP_DIRS, ret);
+ }
+ }
+
+ STACK_WIND_COOKIE(frame, dht_fd_cbk, subvolumes[i], subvolumes[i],
+ subvolumes[i]->fops->opendir, loc, fd, xdata);
+ dict_del(xdata, GF_READDIR_SKIP_DIRS);
+ }
+
+ if (new_xdata)
+ dict_unref(xdata);
+
+ return 0;
- return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(opendir, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
}
+/* dht_readdirp_cbk creates a new dentry and dentry->inode is not assigned.
+ This function assigns an inode if all of the following conditions are
+ true:
-int
-dht_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int cmd, struct gf_flock *flock)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
+ * DHT has only one child. In this case the entire layout is present on
+ this single child and hence we can set the complete layout in the inode.
+ * The backend has a complete layout with no anomalies, from which the
+ layout can be constructed and set in the inode.
+*/
+static void
+dht_populate_inode_for_dentry(xlator_t *this, xlator_t *subvol,
+ gf_dirent_t *entry, gf_dirent_t *orig_entry)
+{
+ dht_layout_t *layout = NULL;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+
+ if (gf_uuid_is_null(orig_entry->d_stat.ia_gfid)) {
+ /* this skips the '..' entry for the root of the volume */
+ return;
+ }
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ gf_uuid_copy(loc.gfid, orig_entry->d_stat.ia_gfid);
+ loc.inode = inode_ref(orig_entry->inode);
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ if (is_revalidate(&loc)) {
+ goto out;
+ }
- STACK_WIND (frame, dht_lk_cbk,
- subvol, subvol->fops->lk,
- fd, cmd, flock);
+ layout = dht_layout_new(this, 1);
+ if (!layout)
+ goto out;
- return 0;
+ ret = dht_layout_merge(this, layout, subvol, 0, 0, orig_entry->dict);
+ if (!ret) {
+ ret = dht_layout_normalize(this, &loc, layout);
+ if (ret == 0) {
+ dht_layout_set(this, orig_entry->inode, layout);
+ entry->inode = inode_ref(orig_entry->inode);
+ layout = NULL;
+ }
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL);
+ if (layout)
+ dht_layout_unref(this, layout);
- return 0;
+out:
+ loc_wipe(&loc);
+ return;
}
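
As a rough illustration of the single-child case: the "complete layout" is simply one range covering the whole 32-bit hash space, so every name hashes to that child and the layout can be attached to the dentry's inode. The types below are simplified stand-ins for dht_layout_t, not the real structures.

    #include <stdint.h>

    /* Simplified stand-ins for dht_layout_t and its per-subvolume entry. */
    struct toy_layout_entry {
        uint32_t start; /* first hash value owned by this subvolume */
        uint32_t stop;  /* last hash value owned by this subvolume */
        int err;
    };

    struct toy_layout {
        int cnt;
        struct toy_layout_entry list[1];
    };

    /* A layout is "complete" when its ranges tile [0, UINT32_MAX] with no
     * holes or overlaps; with a single child that is simply the full range. */
    static void
    toy_layout_single_child(struct toy_layout *l)
    {
        l->cnt = 1;
        l->list[0].start = 0;
        l->list[0].stop = UINT32_MAX;
        l->list[0].err = 0;
    }
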
-/*
- * dht_normalize_stats -
+/* Posix returns op_errno = ENOENT to indicate that there are no more
+ * entries
*/
-static void
-dht_normalize_stats (struct statvfs *buf, unsigned long bsize,
- unsigned long frsize)
+static int
+dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
{
- double factor = 0;
-
- if (buf->f_bsize != bsize) {
- buf->f_bsize = bsize;
- }
+ dht_local_t *local = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *orig_entry = NULL;
+ gf_dirent_t *entry = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *next_subvol = NULL;
+ off_t next_offset = 0;
+ int count = 0;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ xlator_t *subvol = 0;
+ xlator_t *hashed_subvol = 0;
+ int ret = 0;
+ int readdir_optimize = 0;
+ inode_table_t *itable = NULL;
+ inode_t *inode = NULL;
+ gf_boolean_t skip_hashed_check = _gf_false;
+
+ INIT_LIST_HEAD(&entries.list);
+
+ prev = cookie;
+ local = frame->local;
+ GF_VALIDATE_OR_GOTO(this->name, local->fd, unwind);
+
+ itable = local->fd->inode->table;
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
+
+ methods = &(conf->methods);
+
+ if (op_ret <= 0) {
+ goto done;
+ }
+
+ /* Why aren't we skipping DHT entirely in case of a single subvol?
+ * Because if this was a larger volume earlier and all but one subvol
+ * was removed, there might be stale linkto files on the subvol.
+ */
+ if (conf->subvolume_cnt == 1) {
+ /* return all directory and file entries except
+ * linkto files for a single child DHT
+ */
+ skip_hashed_check = _gf_true;
+ }
- if (buf->f_frsize != frsize) {
- factor = ((double) buf->f_frsize) / frsize;
- buf->f_frsize = frsize;
- buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks);
- buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree);
- buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail);
+ if (!local->layout)
+ local->layout = dht_layout_get(this, local->fd->inode);
+ layout = local->layout;
+
+ /* This will skip the entries on the subvol without a layout,
+ * hence preventing the crash but rmdir might fail with
+ * "directory not empty" errors*/
+
+ if (layout == NULL)
+ goto done;
+
+ if (conf->readdir_optimize == _gf_true)
+ readdir_optimize = 1;
+
+ gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name);
+
+ list_for_each_entry(orig_entry, (&orig_entries->list), list)
+ {
+ next_offset = orig_entry->d_off;
+
+ gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name,
+ orig_entry->d_name, orig_entry->d_type);
+
+ if (IA_ISINVAL(orig_entry->d_stat.ia_type)) {
+ /* stat failed somewhere - display this entry but the data may
+ * be inaccurate.
+ */
+ gf_msg_debug(this->name, EINVAL, "Invalid stat for %s (gfid %s)",
+ orig_entry->d_name,
+ uuid_utoa(orig_entry->d_stat.ia_gfid));
+ }
+
+ if (check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict,
+ conf->link_xattr_name)) {
+ gf_msg_debug(this->name, 0, "%s: %s is a linkto file", prev->name,
+ orig_entry->d_name);
+ continue;
+ }
+
+ if (skip_hashed_check) {
+ goto list;
+ }
+
+ if (check_is_dir(NULL, (&orig_entry->d_stat), NULL)) {
+ /* Directory entry filtering:
+ * a) If rebalance is running, pick entries from first_up_subvol.
+ * b) If rebalance is not running and the hashed subvolume is NULL
+ * or down, pick entries from first_up_subvol. Otherwise the
+ * corresponding hashed subvolume will take care of the
+ * directory entry.
+ */
+ if (readdir_optimize) {
+ if (prev == local->first_up_subvol)
+ goto list;
+ else
+ continue;
+ }
+
+ hashed_subvol = methods->layout_search(this, layout,
+ orig_entry->d_name);
+
+ if (prev == hashed_subvol)
+ goto list;
+ if ((hashed_subvol && dht_subvol_status(conf, hashed_subvol)) ||
+ (prev != local->first_up_subvol))
+ continue;
+
+ goto list;
+ }
+
+ list:
+ entry = gf_dirent_for_name(orig_entry->d_name);
+ if (!entry) {
+ goto unwind;
+ }
+
+ /* Do this if conf->search_unhashed is set to "auto" */
+ if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
+ subvol = methods->layout_search(this, layout, orig_entry->d_name);
+ if (!subvol || (subvol != prev)) {
+ /* TODO: Count the number of entries which need
+ linkfile to prove its existence in fs */
+ layout->search_unhashed++;
+ }
+ }
+
+ entry->d_off = orig_entry->d_off;
+ entry->d_stat = orig_entry->d_stat;
+ entry->d_ino = orig_entry->d_ino;
+ entry->d_type = orig_entry->d_type;
+ entry->d_len = orig_entry->d_len;
+
+ if (orig_entry->dict)
+ entry->dict = dict_ref(orig_entry->dict);
+
+ /* Make sure we set the inode ctx with the right layout; this is
+ currently possible only for non-directories, so don't set entry
+ inodes for directories. */
+ if (IA_ISDIR(entry->d_stat.ia_type)) {
+ entry->d_stat.ia_blocks = DHT_DIR_STAT_BLOCKS;
+ entry->d_stat.ia_size = DHT_DIR_STAT_SIZE;
+ if (orig_entry->inode) {
+ dht_inode_ctx_time_update(orig_entry->inode, this,
+ &entry->d_stat, 1);
+
+ if (conf->subvolume_cnt == 1) {
+ dht_populate_inode_for_dentry(this, prev, entry,
+ orig_entry);
+ }
+ }
+ } else {
+ if (orig_entry->dict &&
+ dict_get(orig_entry->dict, conf->link_xattr_name)) {
+ /* Strip out the S and T flags set by rebalance*/
+ DHT_STRIP_PHASE1_FLAGS(&entry->d_stat);
+ }
+
+ if (orig_entry->inode) {
+ ret = dht_layout_preset(this, prev, orig_entry->inode);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to link the layout "
+ "in inode for %s",
+ orig_entry->d_name);
+
+ entry->inode = inode_ref(orig_entry->inode);
+ } else if (itable) {
+ /*
+ * orig_entry->inode might be NULL if an upper-layer
+ * xlator (below the client) set it to NULL, to
+ * force a lookup on the inode even if the inode
+ * is present in the inode table. In that case
+ * we just update the ctx to make sure we didn't
+ * miss anything.
+ */
+ inode = inode_find(itable, orig_entry->d_stat.ia_gfid);
+ if (inode) {
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to link the layout"
+ " in inode for %s",
+ orig_entry->d_name);
+ inode_unref(inode);
+ inode = NULL;
+ }
+ }
}
-}
-
-int
-dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct statvfs *statvfs)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- int bsize = 0;
- int frsize = 0;
-
-
- local = frame->local;
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto unlock;
- }
- local->op_ret = 0;
+ gf_msg_debug(this->name, 0, "%s: Adding entry = %s", prev->name,
+ entry->d_name);
- if (local->statvfs.f_bsize != 0) {
- bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
- frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
- dht_normalize_stats(&local->statvfs, bsize, frsize);
- dht_normalize_stats(statvfs, bsize, frsize);
- } else {
- local->statvfs.f_bsize = statvfs->f_bsize;
- local->statvfs.f_frsize = statvfs->f_frsize;
- }
+ list_add_tail(&entry->list, &entries.list);
+ count++;
+ }
- local->statvfs.f_blocks += statvfs->f_blocks;
- local->statvfs.f_bfree += statvfs->f_bfree;
- local->statvfs.f_bavail += statvfs->f_bavail;
- local->statvfs.f_files += statvfs->f_files;
- local->statvfs.f_ffree += statvfs->f_ffree;
- local->statvfs.f_favail += statvfs->f_favail;
- local->statvfs.f_fsid = statvfs->f_fsid;
- local->statvfs.f_flag = statvfs->f_flag;
- local->statvfs.f_namemax = statvfs->f_namemax;
+done:
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends an ENOENT on end-of-directory, but in
+ * distribute we are concerned not only with each posix's view of the
+ * directory but with the aggregated namespace's view of the directory.
+ * Possible values:
+ * op_ret == 0 and op_errno != 0
+ * if op_errno != ENOENT : Error.Unwind.
+ * if op_errno == ENOENT : There are no more entries on this subvol.
+ * Move to the next one.
+ * op_ret > 0 and count == 0 :
+ * The subvol returned entries to dht but all were stripped out.
+ * For example, if they were linkto files or dirs where
+ * hashed_subvol != prev. Try to get some entries by winding
+ * to the next subvol. This can be dangerous if parallel readdir
+ * is enabled as it grows the stack.
+ *
+ * op_ret > 0 and count > 0:
+ * We found some entries. Unwind even if the buffer is not full.
+ *
+ */
+
+ op_ret = count;
+ if (count == 0) {
+ /* non-zero next_offset means that
+ * EOF is not yet hit on the current subvol
+ */
+ if ((next_offset == 0) || (op_errno == ENOENT)) {
+ next_offset = 0;
+ next_subvol = dht_subvol_next(this, prev);
+ } else {
+ next_subvol = prev;
}
-unlock:
- UNLOCK (&frame->lock);
+ if (!next_subvol) {
+ goto unwind;
+ }
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno,
- &local->statvfs);
+ if (conf->readdir_optimize == _gf_true) {
+ if (next_subvol != local->first_up_subvol) {
+ ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value"
+ ":key = %s",
+ GF_READDIR_SKIP_DIRS);
+ } else {
+ dict_del(local->xattr, GF_READDIR_SKIP_DIRS);
+ }
+ }
+ STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol,
+ next_subvol->fops->readdirp, local->fd, local->size,
+ next_offset, local->xattr);
return 0;
-}
+ }
+unwind:
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends an ENOENT on end-of-directory, but in
+ * distribute we are concerned not only with each posix's view of the
+ * directory but with the aggregated namespace's view of the directory.
+ */
+ if (op_ret < 0)
+ op_ret = 0;
+
+ if (prev != dht_last_up_subvol(this))
+ op_errno = 0;
+
+ DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
+
+ gf_dirent_free(&entries);
+ return 0;
+}
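
The continuation logic documented in the done: comment above can be summarised as a small pure function. The enum and function names below are illustrative, not part of dht.

    #include <errno.h>

    enum readdir_next { RD_UNWIND, RD_SAME_SUBVOL, RD_NEXT_SUBVOL };

    static enum readdir_next
    readdirp_continuation(int count, int op_errno, long long next_offset,
                          int is_last_subvol)
    {
        if (count > 0)
            return RD_UNWIND;      /* got usable entries: hand them up */

        if (next_offset != 0 && op_errno != ENOENT)
            return RD_SAME_SUBVOL; /* subvol not exhausted: keep reading it */

        /* Current subvol exhausted: move on, or unwind EOF if it was last. */
        return is_last_subvol ? RD_UNWIND : RD_NEXT_SUBVOL;
    }
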
-int
-dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
+static int
+dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- int i = -1;
+ dht_local_t *local = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *orig_entry = NULL;
+ gf_dirent_t *entry = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *next_subvol = NULL;
+ off_t next_offset = 0;
+ int count = 0;
+ dht_layout_t *layout = 0;
+ xlator_t *subvol = 0;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ gf_boolean_t skip_hashed_check = _gf_false;
+ INIT_LIST_HEAD(&entries.list);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (this->private, err);
+ prev = cookie;
+ local = frame->local;
- conf = this->private;
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, done);
- local = dht_local_init (frame);
- local->call_cnt = conf->subvolume_cnt;
+ methods = &(conf->methods);
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_statfs_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->statfs, loc);
- }
+ if (op_ret <= 0)
+ goto done;
- return 0;
+ if (!local->layout)
+ local->layout = dht_layout_get(this, local->fd->inode);
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL);
+ layout = local->layout;
- return 0;
-}
+ gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name);
+ if (conf->subvolume_cnt == 1) {
+ /*return everything*/
+ skip_hashed_check = _gf_true;
+ count = op_ret;
+ goto done;
+ }
-int
-dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
-{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int op_errno = -1;
- int i = -1;
+ list_for_each_entry(orig_entry, (&orig_entries->list), list)
+ {
+ next_offset = orig_entry->d_off;
+ gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name,
+ orig_entry->d_name, orig_entry->d_type);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (this->private, err);
+ subvol = methods->layout_search(this, layout, orig_entry->d_name);
- conf = this->private;
+ if (!subvol || (subvol == prev)) {
+ entry = gf_dirent_for_name(orig_entry->d_name);
+ if (!entry) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "Memory allocation failed ");
+ goto unwind;
+ }
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
+ entry->d_off = orig_entry->d_off;
+ entry->d_ino = orig_entry->d_ino;
+ entry->d_type = orig_entry->d_type;
+ entry->d_len = orig_entry->d_len;
- goto err;
- }
+ gf_msg_debug(this->name, 0, "%s: Adding = entry %s", prev->name,
+ entry->d_name);
- local->fd = fd_ref (fd);
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
+ list_add_tail(&entry->list, &entries.list);
+ count++;
+ }
+ }
+done:
+ op_ret = count;
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends an ENOENT on end-of-directory, but in
+ * distribute we are concerned not only with each posix's view of the
+ * directory but with the aggregated namespace's view of the directory.
+ */
+ if (count == 0) {
+ if ((next_offset == 0) || (op_errno == ENOENT)) {
+ next_offset = 0;
+ next_subvol = dht_subvol_next(this, prev);
+ } else {
+ next_subvol = prev;
+ }
- goto err;
+ if (!next_subvol) {
+ goto unwind;
}
- local->call_cnt = conf->subvolume_cnt;
+ STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol,
+ next_subvol->fops->readdir, local->fd, local->size,
+ next_offset, NULL);
+ return 0;
+ }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_fd_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->opendir,
- loc, fd);
+unwind:
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends an ENOENT on end-of-directory, but in
+ * distribute we are concerned not only with each posix's view of the
+ * directory but with the aggregated namespace's view of the directory.
+ */
+
+ if (prev != dht_last_up_subvol(this))
+ op_errno = 0;
+
+ if (!skip_hashed_check) {
+ DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
+ gf_dirent_free(&entries);
+
+ } else {
+ DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL);
+ }
+ return 0;
+}
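
A plain readdir has no stat or xattr data to filter with, so duplicate suppression in the callback above relies purely on the parent's layout: a directory exists on every subvolume, and only the subvolume the name hashes to emits the entry. A minimal sketch of that rule, with illustrative names:

    /* 'hashed_subvol' is the subvolume the parent's layout maps the name to;
     * 'reporting_subvol' is the one whose readdir reply is being processed. */
    static int
    emit_dentry(const void *hashed_subvol, const void *reporting_subvol)
    {
        /* If the layout lookup failed, emit anyway so the entry is not lost. */
        if (!hashed_subvol)
            return 1;
        return hashed_subvol == reporting_subvol;
    }
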
+
+static int
+dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, int whichop, dict_t *dict)
+{
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ xlator_t *xvol = NULL;
+ int ret = 0;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, NULL, NULL, whichop);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->fd = fd_ref(fd);
+ local->size = size;
+ local->xattr_req = (dict) ? dict_ref(dict) : NULL;
+ local->first_up_subvol = dht_first_up_subvol(this);
+ local->op_ret = -1;
+
+ dht_deitransform(this, yoff, &xvol);
+
+ /* TODO: do proper readdir */
+ if (whichop == GF_FOP_READDIRP) {
+ if (dict)
+ local->xattr = dict_ref(dict);
+ else
+ local->xattr = dict_new();
+
+ if (local->xattr) {
+ ret = dict_set_uint32(local->xattr, conf->link_xattr_name, 256);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value"
+ " : key = %s",
+ conf->link_xattr_name);
+
+ if (conf->readdir_optimize == _gf_true) {
+ if (xvol != local->first_up_subvol) {
+ ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_DICT_SET_FAILED,
+ "Failed to set "
+ "dictionary value: "
+ "key = %s",
+ GF_READDIR_SKIP_DIRS);
+ } else {
+ dict_del(local->xattr, GF_READDIR_SKIP_DIRS);
+ }
+ }
+
+ if (conf->subvolume_cnt == 1) {
+ ret = dict_set_uint32(local->xattr, conf->xattr_name, 4 * 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary "
+ "value:key = %s ",
+ conf->xattr_name);
+ }
+ }
}
- return 0;
+ STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol,
+ xvol->fops->readdirp, fd, size, yoff, local->xattr);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol,
+ xvol->fops->readdir, fd, size, yoff, local->xattr);
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
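
dht_deitransform maps the caller's resume offset back to the subvolume the directory stream was positioned on, which is why dht_do_readdir decodes yoff before choosing where to wind. The encoding below only illustrates the general idea (interleaving a per-subvolume offset with the subvolume index); it is not claimed to be dht's actual scheme, and the names are hypothetical.

    #include <stdint.h>

    /* Pack a per-subvolume offset together with the subvolume index so a
     * later readdir can resume on the right subvolume (subvol_cnt > 0). */
    static uint64_t
    toy_itransform(uint64_t local_off, int subvol_idx, int subvol_cnt)
    {
        return local_off * (uint64_t)subvol_cnt + (uint64_t)subvol_idx;
    }

    static void
    toy_deitransform(uint64_t d_off, int subvol_cnt, uint64_t *local_off,
                     int *subvol_idx)
    {
        *subvol_idx = (int)(d_off % (uint64_t)subvol_cnt); /* which subvolume */
        *local_off = d_off / (uint64_t)subvol_cnt;         /* offset within it */
    }
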
+int
+dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, dict_t *xdata)
+{
+ int op = GF_FOP_READDIR;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ op = GF_FOP_READDIRP;
+ break;
+ }
+ }
+
+ if (conf->use_readdirp)
+ op = GF_FOP_READDIRP;
+
+out:
+ dht_do_readdir(frame, this, fd, size, yoff, op, 0);
+ return 0;
+}
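
The promotion rule in dht_readdir above (fall back to readdirp when any subvolume is down, or when use_readdirp is enabled) can be expressed as a small predicate. struct toy_conf below is a stand-in for the few dht_conf_t fields involved, not the real type.

    #include <stdbool.h>

    struct toy_conf {
        int subvolume_cnt;
        const bool *subvolume_status; /* one "is up" flag per subvolume */
        bool use_readdirp;
    };

    static bool
    want_readdirp(const struct toy_conf *conf)
    {
        int i;

        if (conf->use_readdirp)
            return true;
        for (i = 0; i < conf->subvolume_cnt; i++)
            if (!conf->subvolume_status[i]) /* any subvolume down */
                return true;
        return false;
    }
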
int
-dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, gf_dirent_t *orig_entries)
-{
- dht_local_t *local = NULL;
- gf_dirent_t entries;
- gf_dirent_t *orig_entry = NULL;
- gf_dirent_t *entry = NULL;
- call_frame_t *prev = NULL;
- xlator_t *next_subvol = NULL;
- off_t next_offset = 0;
- int count = 0;
- dht_layout_t *layout = 0;
- dht_conf_t *conf = NULL;
- xlator_t *subvol = 0;
-
- INIT_LIST_HEA