summaryrefslogtreecommitdiffstats
path: root/api
Commit message (Expand)AuthorAgeFilesLines
* libgfapi: fix use after freed of clnt when dispatching eventsKinglong Mee2018-09-261-2/+7
* api: old and new versioned symbols must appear in each section of gfapi.mapKaleb S. KEITHLEY2018-09-251-39/+46
* gfapi: Cleanup alias fileShyamsundarR2018-09-173-7/+1
* Land part 2 of clang-format changesGluster Ant2018-09-127-10783/+10766
* Land clang-format changesGluster Ant2018-09-125-894/+848
* core: python3Kaleb S. KEITHLEY2018-09-031-1/+1
* build: add --enable-asan configure optionsNiels de Vos2018-08-301-7/+9
* glfs-fops.c, glfs.c: strncpy() -> sprintf(), reduce strlen()'sYaniv Kaul2018-08-242-6/+6
* coverity: Multiple coverity fixes for issues with HIGH severityShyamsundarR2018-08-221-1/+3
* api/src/glfs.c: move to GF_MALLOC() instead of GF_CALLOC()Yaniv Kaul2018-08-221-1/+1
* gfapi: Use inode_forget in case of unlink/rename objectsKinglong Mee2018-08-213-3/+21
* gfapi : Handle the path == "" glfs_resolve_atJiffin Tony Thottan2018-08-131-7/+10
* coverity: Fix remaining SECURE_TEMP issues reportedShyamsundarR2018-08-031-5/+27
* libgfapi: fix memory leak on old volume filesZhang Huan2018-07-282-2/+5
* libgfapi: add missing unref of mgmt client in glfs_finiZhang Huan2018-07-281-0/+1
* build: rename event.h to gf-event.hNiels de Vos2018-07-273-3/+3
* All: run codespell on the code and fix issues.Yaniv Kaul2018-07-228-26/+27
* gfapi : Coverity FixesJiffin Tony Thottan2018-06-141-3/+2
* api: cleanup headersKaleb S. KEITHLEY2018-06-1111-261/+377
* rpc/clnt: Don't let consumers manage "connected" stateRaghavendra G2018-06-041-2/+0
* core/various: python3 compat, prepare for python2 -> python3Kaleb S. KEITHLEY2018-05-301-1/+1
* libgfapi: Fix lookup on rootKotresh HR2018-05-281-2/+2
* api: missing __THROW on pub function declsKaleb S. KEITHLEY2018-05-252-5/+5
* core: make glfs_iobuf_copy() consumable for general purpose.Susant Palai2018-05-241-47/+4
* gfapi: various broken symbol versionsKaleb S. KEITHLEY2018-05-106-80/+107
* gfapi: acutally avoid recall callback when closedv4.2devThomas Hindoe Paaboel Andersen2018-05-061-1/+2
* gfapi : RECALL_LEASE implementationSoumya Koduri2018-05-048-28/+532
* core/various: python3 compat, prepare for python2 -> python3Kaleb S. KEITHLEY2018-05-021-3/+4
* core/build/various: python3 compat, prepare for python2 -> python3Kaleb S. KEITHLEY2018-04-121-1/+1
* gfapi: fix a couple of minor issuesKaleb S. KEITHLEY2018-04-053-6/+2
* python: Remove all uses of find_library. Fixes #1450593Niklas Hambüchen2018-03-241-1/+1
* glusterd: TLS verification fails while using intermediate CAMohit Agrawal2018-03-191-0/+1
* Fetch backup volfile servers from glusterd2Prashanth Pai2018-02-162-52/+72
* gfapi: return pre/post attributes at callback for glfs apiKinglong Mee2018-02-124-96/+379
* gfapi: return pre/post attributes from glfs_ftruncateKinglong Mee2018-02-124-8/+37
* gfapi: return pre/post attributes from glfs_fsync/fdatasyncKinglong Mee2018-02-125-17/+72
* gfapi: return pre/post attributes from glfs_pread/pwriteKinglong Mee2018-02-124-26/+93
* libgfapi: Remove need_lookup from readdirpKinglong Mee2018-02-081-9/+2
* libgfapi: skip nameless lookup if stat is NULLKinglong Mee2018-02-081-2/+3
* gfapi : New APIs have been added to use lease feature in glusterSoumya Koduri2018-01-2610-29/+279
* build: use libtirpc by default, even if ipv6 is not the defaultKaleb S. KEITHLEY2018-01-261-1/+1
* libgfapi: Add new api for supporting mandatory-locksAnoop C S2018-01-225-32/+147
* gfapi : added glfs_setfsleaseid() for setting lease idSoumya Koduri2018-01-194-0/+31
* core: fix some of the dict_{get,set} with proper APIsAmar Tumballi2018-01-173-10/+10
* all: Simplify component message id's definitionXavier Hernandez2017-12-141-84/+60
* gfapi: fix issue when glfs_set_logging is called concurrentlyZhang Huan2017-12-061-5/+0
* libglusterfs: specify ctx in gf_log_set_loglevelZhang Huan2017-12-061-1/+1
* gfapi: avoid nameless lookup when inode_find successKinglong Mee2017-11-281-2/+22
* Coverity Issue: PW.INCLUDE_RECURSION in several filesGirjesh Rajoria2017-11-091-1/+0
* *.pc: Fix include path in CflagsAndrea Bolognani2017-11-081-2/+2
07b3d9b15fc5d6b3ed8e6'>xlators/cluster/dht/src/dht.sym8
-rw-r--r--xlators/cluster/dht/src/nufa.c24
-rw-r--r--xlators/cluster/dht/src/nufa.sym8
-rw-r--r--xlators/cluster/dht/src/switch.c32
-rw-r--r--xlators/cluster/dht/src/switch.sym8
-rw-r--r--xlators/cluster/dht/src/tier-common.c1199
-rw-r--r--xlators/cluster/dht/src/tier-common.h55
-rw-r--r--xlators/cluster/dht/src/tier.c3088
-rw-r--r--xlators/cluster/dht/src/tier.h110
-rw-r--r--xlators/cluster/dht/src/tier.sym9
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_mock.c6
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_unittest.c4
-rw-r--r--xlators/cluster/ec/src/ec-code.c4
-rw-r--r--xlators/cluster/ec/src/ec-code.h4
-rw-r--r--xlators/cluster/ec/src/ec-combine.c53
-rw-r--r--xlators/cluster/ec/src/ec-common.c318
-rw-r--r--xlators/cluster/ec/src/ec-common.h60
-rw-r--r--xlators/cluster/ec/src/ec-data.c12
-rw-r--r--xlators/cluster/ec/src/ec-data.h2
-rw-r--r--xlators/cluster/ec/src/ec-dir-read.c44
-rw-r--r--xlators/cluster/ec/src/ec-dir-write.c112
-rw-r--r--xlators/cluster/ec/src/ec-fops.h146
-rw-r--r--xlators/cluster/ec/src/ec-galois.c3
-rw-r--r--xlators/cluster/ec/src/ec-generic.c121
-rw-r--r--xlators/cluster/ec/src/ec-heal.c379
-rw-r--r--xlators/cluster/ec/src/ec-heald.c165
-rw-r--r--xlators/cluster/ec/src/ec-heald.h9
-rw-r--r--xlators/cluster/ec/src/ec-helpers.c12
-rw-r--r--xlators/cluster/ec/src/ec-inode-read.c126
-rw-r--r--xlators/cluster/ec/src/ec-inode-write.c231
-rw-r--r--xlators/cluster/ec/src/ec-locks.c112
-rw-r--r--xlators/cluster/ec/src/ec-mem-types.h3
-rw-r--r--xlators/cluster/ec/src/ec-messages.h5
-rw-r--r--xlators/cluster/ec/src/ec-method.c8
-rw-r--r--xlators/cluster/ec/src/ec-method.h6
-rw-r--r--xlators/cluster/ec/src/ec-types.h40
-rw-r--r--xlators/cluster/ec/src/ec.c290
-rw-r--r--xlators/cluster/ec/src/ec.h1
-rw-r--r--xlators/cluster/stripe/src/Makefile.am22
-rw-r--r--xlators/cluster/stripe/src/stripe-helpers.c658
-rw-r--r--xlators/cluster/stripe/src/stripe-mem-types.h29
-rw-r--r--xlators/cluster/stripe/src/stripe.c5612
-rw-r--r--xlators/cluster/stripe/src/stripe.h291
-rw-r--r--xlators/debug/delay-gen/src/delay-gen-mem-types.h2
-rw-r--r--xlators/debug/delay-gen/src/delay-gen-messages.h2
-rw-r--r--xlators/debug/delay-gen/src/delay-gen.c19
-rw-r--r--xlators/debug/delay-gen/src/delay-gen.h6
-rw-r--r--xlators/debug/error-gen/src/error-gen-mem-types.h2
-rw-r--r--xlators/debug/error-gen/src/error-gen.c86
-rw-r--r--xlators/debug/error-gen/src/error-gen.h1
-rw-r--r--xlators/debug/io-stats/src/io-stats-mem-types.h2
-rw-r--r--xlators/debug/io-stats/src/io-stats.c450
-rw-r--r--xlators/debug/sink/src/sink.c16
-rw-r--r--xlators/debug/trace/src/trace-mem-types.h2
-rw-r--r--xlators/debug/trace/src/trace.c42
-rw-r--r--xlators/debug/trace/src/trace.h18
-rw-r--r--xlators/encryption/Makefile.am3
-rw-r--r--xlators/encryption/crypt/Makefile.am3
-rw-r--r--xlators/encryption/crypt/src/Makefile.am26
-rw-r--r--xlators/encryption/crypt/src/atom.c861
-rw-r--r--xlators/encryption/crypt/src/crypt-common.h133
-rw-r--r--xlators/encryption/crypt/src/crypt-mem-types.h41
-rw-r--r--xlators/encryption/crypt/src/crypt.c3906
-rw-r--r--xlators/encryption/crypt/src/crypt.h931
-rw-r--r--xlators/encryption/crypt/src/data.c715
-rw-r--r--xlators/encryption/crypt/src/keys.c284
-rw-r--r--xlators/encryption/crypt/src/metadata.c575
-rw-r--r--xlators/encryption/crypt/src/metadata.h79
-rw-r--r--xlators/encryption/rot-13/Makefile.am3
-rw-r--r--xlators/features/Makefile.am13
-rw-r--r--xlators/features/arbiter/src/arbiter-mem-types.h2
-rw-r--r--xlators/features/arbiter/src/arbiter.c21
-rw-r--r--xlators/features/arbiter/src/arbiter.h4
-rw-r--r--xlators/features/barrier/src/barrier-mem-types.h2
-rw-r--r--xlators/features/barrier/src/barrier.c100
-rw-r--r--xlators/features/barrier/src/barrier.h9
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h53
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c12
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h20
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub.c98
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub.h2
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-ssm.h2
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c323
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.h36
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-common.h2
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c97
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h3
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h77
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.c487
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.h56
-rwxr-xr-x[-rw-r--r--]xlators/features/changelog/lib/examples/python/changes.py0
-rw-r--r--xlators/features/changelog/lib/examples/python/libgfchangelog.py4
-rw-r--r--xlators/features/changelog/lib/src/Makefile.am2
-rw-r--r--xlators/features/changelog/lib/src/changelog-lib-messages.h32
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-api.c12
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.c59
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.h7
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-journal-handler.c35
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-reborp.c38
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-rpc.h2
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog.c67
-rw-r--r--xlators/features/changelog/lib/src/gf-history-changelog.c83
-rw-r--r--xlators/features/changelog/src/changelog-barrier.c17
-rw-r--r--xlators/features/changelog/src/changelog-encoders.h4
-rw-r--r--xlators/features/changelog/src/changelog-ev-handle.c23
-rw-r--r--xlators/features/changelog/src/changelog-ev-handle.h8
-rw-r--r--xlators/features/changelog/src/changelog-helpers.c318
-rw-r--r--xlators/features/changelog/src/changelog-helpers.h65
-rw-r--r--xlators/features/changelog/src/changelog-mem-types.h2
-rw-r--r--xlators/features/changelog/src/changelog-messages.h125
-rw-r--r--xlators/features/changelog/src/changelog-misc.h4
-rw-r--r--xlators/features/changelog/src/changelog-rpc-common.c72
-rw-r--r--xlators/features/changelog/src/changelog-rpc-common.h4
-rw-r--r--xlators/features/changelog/src/changelog-rpc.c185
-rw-r--r--xlators/features/changelog/src/changelog-rpc.h2
-rw-r--r--xlators/features/changelog/src/changelog-rt.c6
-rw-r--r--xlators/features/changelog/src/changelog-rt.h4
-rw-r--r--xlators/features/changelog/src/changelog.c373
-rw-r--r--xlators/features/changetimerecorder/src/Makefile.am26
-rw-r--r--xlators/features/changetimerecorder/src/changetimerecorder.c2358
-rw-r--r--xlators/features/changetimerecorder/src/changetimerecorder.h21
-rw-r--r--xlators/features/changetimerecorder/src/ctr-helper.c293
-rw-r--r--xlators/features/changetimerecorder/src/ctr-helper.h854
-rw-r--r--xlators/features/changetimerecorder/src/ctr-messages.h61
-rw-r--r--xlators/features/changetimerecorder/src/ctr-xlator-ctx.c362
-rw-r--r--xlators/features/changetimerecorder/src/ctr-xlator-ctx.h68
-rw-r--r--xlators/features/changetimerecorder/src/ctr_mem_types.h22
-rw-r--r--xlators/features/cloudsync/src/Makefile.am4
-rw-r--r--xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.c8
-rw-r--r--xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.h2
-rw-r--r--xlators/features/cloudsync/src/cloudsync-common.c16
-rw-r--r--xlators/features/cloudsync/src/cloudsync-common.h43
-rwxr-xr-x[-rw-r--r--]xlators/features/cloudsync/src/cloudsync-fops-c.py40
-rwxr-xr-x[-rw-r--r--]xlators/features/cloudsync/src/cloudsync-fops-h.py0
-rw-r--r--xlators/features/cloudsync/src/cloudsync-mem-types.h3
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/Makefile.am6
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3-mem-types.h2
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c8
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.h8
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/Makefile.am (renamed from xlators/features/changetimerecorder/Makefile.am)0
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/Makefile.am12
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/archivestore.h203
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/cvlt-messages.h30
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym1
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt-mem-types.h19
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.c842
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.h84
-rw-r--r--xlators/features/cloudsync/src/cloudsync.c582
-rw-r--r--xlators/features/cloudsync/src/cloudsync.h32
-rw-r--r--xlators/features/compress/src/cdc-helper.c6
-rw-r--r--xlators/features/compress/src/cdc-mem-types.h2
-rw-r--r--xlators/features/compress/src/cdc.c25
-rw-r--r--xlators/features/compress/src/cdc.h2
-rw-r--r--xlators/features/gfid-access/src/gfid-access-mem-types.h2
-rw-r--r--xlators/features/gfid-access/src/gfid-access.c56
-rw-r--r--xlators/features/gfid-access/src/gfid-access.h10
-rw-r--r--xlators/features/glupy/Makefile.am3
-rw-r--r--xlators/features/glupy/doc/README.md44
-rw-r--r--xlators/features/glupy/doc/TESTING9
-rw-r--r--xlators/features/glupy/doc/test.vol10
-rw-r--r--xlators/features/glupy/examples/Makefile.am5
-rw-r--r--xlators/features/glupy/examples/debug-trace.py777
-rw-r--r--xlators/features/glupy/examples/helloworld.py21
-rw-r--r--xlators/features/glupy/examples/negative.py93
-rw-r--r--xlators/features/glupy/src/Makefile.am36
-rw-r--r--xlators/features/glupy/src/__init__.py.in4
-rw-r--r--xlators/features/glupy/src/glupy.c2446
-rw-r--r--xlators/features/glupy/src/glupy.h56
-rw-r--r--xlators/features/glupy/src/glupy.sym101
-rw-r--r--xlators/features/glupy/src/glupy/Makefile.am5
-rw-r--r--xlators/features/glupy/src/glupy/__init__.py852
-rw-r--r--xlators/features/glupy/src/setup.py.in24
-rw-r--r--xlators/features/index/src/index-mem-types.h12
-rw-r--r--xlators/features/index/src/index-messages.h2
-rw-r--r--xlators/features/index/src/index.c108
-rw-r--r--xlators/features/index/src/index.h12
-rw-r--r--xlators/features/leases/src/leases-internal.c73
-rw-r--r--xlators/features/leases/src/leases-mem-types.h5
-rw-r--r--xlators/features/leases/src/leases-messages.h2
-rw-r--r--xlators/features/leases/src/leases.c45
-rw-r--r--xlators/features/leases/src/leases.h64
-rw-r--r--xlators/features/locks/src/clear.c58
-rw-r--r--xlators/features/locks/src/clear.h8
-rw-r--r--xlators/features/locks/src/common.c580
-rw-r--r--xlators/features/locks/src/common.h87
-rw-r--r--xlators/features/locks/src/entrylk.c106
-rw-r--r--xlators/features/locks/src/inodelk.c249
-rw-r--r--xlators/features/locks/src/locks-mem-types.h3
-rw-r--r--xlators/features/locks/src/locks.h105
-rw-r--r--xlators/features/locks/src/pl-messages.h2
-rw-r--r--xlators/features/locks/src/posix.c1442
-rw-r--r--xlators/features/locks/src/reservelk.c78
-rw-r--r--xlators/features/locks/tests/unit-test.c12
-rw-r--r--xlators/features/marker/src/marker-common.c7
-rw-r--r--xlators/features/marker/src/marker-common.h4
-rw-r--r--xlators/features/marker/src/marker-mem-types.h3
-rw-r--r--xlators/features/marker/src/marker-quota-helper.c95
-rw-r--r--xlators/features/marker/src/marker-quota-helper.h12
-rw-r--r--xlators/features/marker/src/marker-quota.c175
-rw-r--r--xlators/features/marker/src/marker-quota.h19
-rw-r--r--xlators/features/marker/src/marker.c60
-rw-r--r--xlators/features/marker/src/marker.h8
-rw-r--r--xlators/features/metadisp/Makefile.am (renamed from xlators/storage/bd/Makefile.am)0
-rw-r--r--xlators/features/metadisp/src/Makefile.am38
-rw-r--r--xlators/features/metadisp/src/backend.c45
-rw-r--r--xlators/features/metadisp/src/fops-tmpl.c10
-rw-r--r--xlators/features/metadisp/src/gen-fops.py160
-rw-r--r--xlators/features/metadisp/src/metadisp-create.c101
-rw-r--r--xlators/features/metadisp/src/metadisp-fops.h51
-rw-r--r--xlators/features/metadisp/src/metadisp-fsync.c54
-rw-r--r--xlators/features/metadisp/src/metadisp-lookup.c90
-rw-r--r--xlators/features/metadisp/src/metadisp-open.c70
-rw-r--r--xlators/features/metadisp/src/metadisp-readdir.c65
-rw-r--r--xlators/features/metadisp/src/metadisp-setattr.c90
-rw-r--r--xlators/features/metadisp/src/metadisp-stat.c124
-rw-r--r--xlators/features/metadisp/src/metadisp-unlink.c160
-rw-r--r--xlators/features/metadisp/src/metadisp.c46
-rw-r--r--xlators/features/metadisp/src/metadisp.h45
-rw-r--r--xlators/features/namespace/src/namespace.c27
-rw-r--r--xlators/features/namespace/src/namespace.h4
-rw-r--r--xlators/features/quiesce/src/quiesce-mem-types.h2
-rw-r--r--xlators/features/quiesce/src/quiesce-messages.h2
-rw-r--r--xlators/features/quiesce/src/quiesce.c69
-rw-r--r--xlators/features/quiesce/src/quiesce.h4
-rw-r--r--xlators/features/quota/src/Makefile.am5
-rw-r--r--xlators/features/quota/src/quota-enforcer-client.c32
-rw-r--r--xlators/features/quota/src/quota-mem-types.h3
-rw-r--r--xlators/features/quota/src/quota-messages.h2
-rw-r--r--xlators/features/quota/src/quota.c292
-rw-r--r--xlators/features/quota/src/quota.h31
-rw-r--r--xlators/features/quota/src/quotad-aggregator.c117
-rw-r--r--xlators/features/quota/src/quotad-aggregator.h10
-rw-r--r--xlators/features/quota/src/quotad-helpers.c6
-rw-r--r--xlators/features/quota/src/quotad.c43
-rw-r--r--xlators/features/quota/src/quotad.sym7
-rw-r--r--xlators/features/read-only/src/read-only-common.c2
-rw-r--r--xlators/features/read-only/src/read-only-common.h4
-rw-r--r--xlators/features/read-only/src/read-only-mem-types.h2
-rw-r--r--xlators/features/read-only/src/read-only.c14
-rw-r--r--xlators/features/read-only/src/read-only.h15
-rw-r--r--xlators/features/read-only/src/worm-helper.c24
-rw-r--r--xlators/features/read-only/src/worm.c144
-rw-r--r--xlators/features/sdfs/src/sdfs-messages.h2
-rw-r--r--xlators/features/sdfs/src/sdfs.c29
-rw-r--r--xlators/features/sdfs/src/sdfs.h6
-rw-r--r--xlators/features/selinux/src/selinux-mem-types.h2
-rw-r--r--xlators/features/selinux/src/selinux-messages.h2
-rw-r--r--xlators/features/selinux/src/selinux.c25
-rw-r--r--xlators/features/selinux/src/selinux.h2
-rw-r--r--xlators/features/shard/src/shard-mem-types.h2
-rw-r--r--xlators/features/shard/src/shard-messages.h2
-rw-r--r--xlators/features/shard/src/shard.c949
-rw-r--r--xlators/features/shard/src/shard.h26
-rw-r--r--xlators/features/snapview-client/src/Makefile.am2
-rw-r--r--xlators/features/snapview-client/src/snapview-client-mem-types.h2
-rw-r--r--xlators/features/snapview-client/src/snapview-client-messages.h71
-rw-r--r--xlators/features/snapview-client/src/snapview-client.c761
-rw-r--r--xlators/features/snapview-client/src/snapview-client.h16
-rw-r--r--xlators/features/snapview-server/src/Makefile.am2
-rw-r--r--xlators/features/snapview-server/src/snapview-server-helpers.c88
-rw-r--r--xlators/features/snapview-server/src/snapview-server-mem-types.h2
-rw-r--r--xlators/features/snapview-server/src/snapview-server-messages.h54
-rw-r--r--xlators/features/snapview-server/src/snapview-server-mgmt.c125
-rw-r--r--xlators/features/snapview-server/src/snapview-server.c619
-rw-r--r--xlators/features/snapview-server/src/snapview-server.h36
-rw-r--r--xlators/features/thin-arbiter/src/Makefile.am2
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter-mem-types.h2
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter-messages.h2
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter.c23
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter.h12
-rw-r--r--xlators/features/trash/src/trash-mem-types.h2
-rw-r--r--xlators/features/trash/src/trash.c24
-rw-r--r--xlators/features/trash/src/trash.h10
-rw-r--r--xlators/features/upcall/src/upcall-cache-invalidation.h6
-rw-r--r--xlators/features/upcall/src/upcall-internal.c207
-rw-r--r--xlators/features/upcall/src/upcall-mem-types.h2
-rw-r--r--xlators/features/upcall/src/upcall-messages.h2
-rw-r--r--xlators/features/upcall/src/upcall.c142
-rw-r--r--xlators/features/upcall/src/upcall.h34
-rw-r--r--xlators/features/utime/src/utime-autogen-fops-tmpl.c10
-rw-r--r--xlators/features/utime/src/utime-autogen-fops-tmpl.h2
-rwxr-xr-x[-rw-r--r--]xlators/features/utime/src/utime-gen-fops-c.py28
-rwxr-xr-x[-rw-r--r--]xlators/features/utime/src/utime-gen-fops-h.py2
-rw-r--r--xlators/features/utime/src/utime-helpers.c11
-rw-r--r--xlators/features/utime/src/utime-helpers.h7
-rw-r--r--xlators/features/utime/src/utime-mem-types.h2
-rw-r--r--xlators/features/utime/src/utime-messages.h5
-rw-r--r--xlators/features/utime/src/utime.c186
-rw-r--r--xlators/features/utime/src/utime.h6
-rw-r--r--xlators/lib/src/libxlator.c15
-rw-r--r--xlators/lib/src/libxlator.h14
-rw-r--r--xlators/meta/src/active-link.c4
-rw-r--r--xlators/meta/src/cmdline-file.c8
-rw-r--r--xlators/meta/src/frames-file.c11
-rw-r--r--xlators/meta/src/graph-dir.c4
-rw-r--r--xlators/meta/src/graphs-dir.c4
-rw-r--r--xlators/meta/src/history-file.c8
-rw-r--r--xlators/meta/src/logfile-link.c4
-rw-r--r--xlators/meta/src/logging-dir.c4
-rw-r--r--xlators/meta/src/loglevel-file.c6
-rw-r--r--xlators/meta/src/mallinfo-file.c6
-rw-r--r--xlators/meta/src/measure-file.c6
-rw-r--r--xlators/meta/src/meminfo-file.c8
-rw-r--r--xlators/meta/src/meta-defaults.c23
-rw-r--r--xlators/meta/src/meta-helpers.c21
-rw-r--r--xlators/meta/src/meta-hooks.h2
-rw-r--r--xlators/meta/src/meta-mem-types.h2
-rw-r--r--xlators/meta/src/meta.c20
-rw-r--r--xlators/meta/src/meta.h2
-rw-r--r--xlators/meta/src/name-file.c8
-rw-r--r--xlators/meta/src/option-file.c4
-rw-r--r--xlators/meta/src/options-dir.c4
-rw-r--r--xlators/meta/src/private-file.c8
-rw-r--r--xlators/meta/src/process_uuid-file.c8
-rw-r--r--xlators/meta/src/profile-file.c8
-rw-r--r--xlators/meta/src/root-dir.c4
-rw-r--r--xlators/meta/src/subvolume-link.c4
-rw-r--r--xlators/meta/src/subvolumes-dir.c4
-rw-r--r--xlators/meta/src/top-link.c4
-rw-r--r--xlators/meta/src/type-file.c8
-rw-r--r--xlators/meta/src/version-file.c8
-rw-r--r--xlators/meta/src/view-dir.c4
-rw-r--r--xlators/meta/src/volfile-file.c6
-rw-r--r--xlators/meta/src/xlator-dir.c4
-rw-r--r--xlators/mgmt/glusterd/src/Makefile.am32
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitd-svc.c8
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitrot.c97
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-brick-ops.c1347
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-mgmt.c76
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-mgmt.h8
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-errno.h2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-ganesha.c927
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.c324
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.h2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.c7
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c20
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.h2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handler.c1348
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handshake.c385
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.c126
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.h4
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.c130
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.h3
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-log-ops.c24
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mem-types.h101
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-messages.h158
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c247
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.c898
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.h15
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mountbroker.c69
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mountbroker.h6
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-nfs-svc.c13
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-nfs-svc.h2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.c1427
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.h29
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-peer-utils.c346
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-peer-utils.h9
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.c40
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.h16
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c10
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-proc-mgmt.h2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quota.c104
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quotad-svc.c10
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rcu.h2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rebalance.c634
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-replace-brick.c66
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-reset-brick.c26
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rpc-ops.c190
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-scrub-svc.c12
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-server-quorum.c20
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.c153
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.h42
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc.c678
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc.h17
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.c275
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.h13
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc.c29
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc.h2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c410
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot-utils.h10
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot.c415
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-statedump.c10
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-statedump.h2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.c1795
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.h58
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-helper.c836
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-helper.h43
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c284
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-mgmt.h46
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.c199
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.h4
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-tier.c1382
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-tierd-svc-helper.c9
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-tierd-svc-helper.h37
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-tierd-svc.c503
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-tierd-svc.h41
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.c3587
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.h111
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.c2089
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.h34
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-ops.c952
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c1129
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.c455
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.h531
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.c1535
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.h157
-rw-r--r--xlators/mount/fuse/src/fuse-helpers.c113
-rw-r--r--xlators/mount/fuse/src/fuse-mem-types.h4
-rwxr-xr-xxlators/mount/fuse/utils/mount.glusterfs.in84
-rwxr-xr-xxlators/mount/fuse/utils/mount_glusterfs.in19
-rw-r--r--xlators/nfs/server/src/acl3.c46
-rw-r--r--xlators/nfs/server/src/acl3.h4
-rw-r--r--xlators/nfs/server/src/auth-cache.c15
-rw-r--r--xlators/nfs/server/src/auth-cache.h2
-rw-r--r--xlators/nfs/server/src/exports.c16
-rw-r--r--xlators/nfs/server/src/exports.h2
-rw-r--r--xlators/nfs/server/src/mount3-auth.c2
-rw-r--r--xlators/nfs/server/src/mount3.c94
-rw-r--r--xlators/nfs/server/src/mount3.h12
-rw-r--r--xlators/nfs/server/src/mount3udp_svc.c10
-rw-r--r--xlators/nfs/server/src/netgroups.c2
-rw-r--r--xlators/nfs/server/src/netgroups.h2
-rw-r--r--xlators/nfs/server/src/nfs-common.c17
-rw-r--r--xlators/nfs/server/src/nfs-common.h6
-rw-r--r--xlators/nfs/server/src/nfs-fops.c14
-rw-r--r--xlators/nfs/server/src/nfs-fops.h8
-rw-r--r--xlators/nfs/server/src/nfs-generics.c2
-rw-r--r--xlators/nfs/server/src/nfs-generics.h2
-rw-r--r--xlators/nfs/server/src/nfs-inodes.c2
-rw-r--r--xlators/nfs/server/src/nfs-inodes.h8
-rw-r--r--xlators/nfs/server/src/nfs-mem-types.h9
-rw-r--r--xlators/nfs/server/src/nfs-messages.h2
-rw-r--r--xlators/nfs/server/src/nfs.c71
-rw-r--r--xlators/nfs/server/src/nfs.h8
-rw-r--r--xlators/nfs/server/src/nfs3-fh.c16
-rw-r--r--xlators/nfs/server/src/nfs3-fh.h6
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.c18
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.h2
-rw-r--r--xlators/nfs/server/src/nfs3.c77
-rw-r--r--xlators/nfs/server/src/nfs3.h10
-rw-r--r--xlators/nfs/server/src/nfsserver.sym10
-rw-r--r--xlators/nfs/server/src/nlm4.c128
-rw-r--r--xlators/nfs/server/src/nlm4.h14
-rw-r--r--xlators/nfs/server/src/nlmcbk_svc.c7
-rw-r--r--xlators/performance/Makefile.am2
-rw-r--r--xlators/performance/decompounder/Makefile.am1
-rw-r--r--xlators/performance/decompounder/src/Makefile.am19
-rw-r--r--xlators/performance/decompounder/src/decompounder-mem-types.h17
-rw-r--r--xlators/performance/decompounder/src/decompounder-messages.h28
-rw-r--r--xlators/performance/decompounder/src/decompounder.c833
-rw-r--r--xlators/performance/decompounder/src/decompounder.h78
-rw-r--r--xlators/performance/io-cache/src/io-cache-messages.h41
-rw-r--r--xlators/performance/io-cache/src/io-cache.c288
-rw-r--r--xlators/performance/io-cache/src/io-cache.h45
-rw-r--r--xlators/performance/io-cache/src/ioc-inode.c14
-rw-r--r--xlators/performance/io-cache/src/ioc-mem-types.h2
-rw-r--r--xlators/performance/io-cache/src/page.c67
-rw-r--r--xlators/performance/io-threads/src/io-threads-messages.h16
-rw-r--r--xlators/performance/io-threads/src/io-threads.c181
-rw-r--r--xlators/performance/io-threads/src/io-threads.h21
-rw-r--r--xlators/performance/io-threads/src/iot-mem-types.h2
-rw-r--r--xlators/performance/md-cache/src/md-cache-mem-types.h2
-rw-r--r--xlators/performance/md-cache/src/md-cache-messages.h2
-rw-r--r--xlators/performance/md-cache/src/md-cache.c742
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-helper.c63
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-mem-types.h5
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-messages.h2
-rw-r--r--xlators/performance/nl-cache/src/nl-cache.c22
-rw-r--r--xlators/performance/nl-cache/src/nl-cache.h10
-rw-r--r--xlators/performance/open-behind/src/open-behind-mem-types.h2
-rw-r--r--xlators/performance/open-behind/src/open-behind-messages.h8
-rw-r--r--xlators/performance/open-behind/src/open-behind.c1376
-rw-r--r--xlators/performance/quick-read/src/quick-read-mem-types.h6
-rw-r--r--xlators/performance/quick-read/src/quick-read-messages.h2
-rw-r--r--xlators/performance/quick-read/src/quick-read.c60
-rw-r--r--xlators/performance/quick-read/src/quick-read.h22
-rw-r--r--xlators/performance/read-ahead/src/page.c22
-rw-r--r--xlators/performance/read-ahead/src/read-ahead-mem-types.h2
-rw-r--r--xlators/performance/read-ahead/src/read-ahead-messages.h2
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.c49
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.h10
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h2
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-messages.h2
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.c191
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.h2
-rw-r--r--xlators/performance/symlink-cache/Makefile.am3
-rw-r--r--xlators/performance/symlink-cache/src/Makefile.am16
-rw-r--r--xlators/performance/symlink-cache/src/symlink-cache-messages.h30
-rw-r--r--xlators/performance/symlink-cache/src/symlink-cache.c358
-rw-r--r--xlators/performance/write-behind/src/write-behind-mem-types.h2
-rw-r--r--xlators/performance/write-behind/src/write-behind-messages.h2
-rw-r--r--xlators/performance/write-behind/src/write-behind.c136
-rw-r--r--xlators/playground/rot-13/Makefile.am (renamed from xlators/cluster/stripe/Makefile.am)0
-rw-r--r--xlators/playground/rot-13/src/Makefile.am (renamed from xlators/encryption/rot-13/src/Makefile.am)0
-rw-r--r--xlators/playground/rot-13/src/rot-13.c (renamed from xlators/encryption/rot-13/src/rot-13.c)6
-rw-r--r--xlators/playground/rot-13/src/rot-13.h (renamed from xlators/encryption/rot-13/src/rot-13.h)0
-rw-r--r--xlators/playground/template/src/template.c2
-rw-r--r--xlators/playground/template/src/template.h14
-rw-r--r--xlators/protocol/auth/addr/src/addr.c18
-rw-r--r--xlators/protocol/client/src/client-callback.c91
-rw-r--r--xlators/protocol/client/src/client-common.c66
-rw-r--r--xlators/protocol/client/src/client-common.h10
-rw-r--r--xlators/protocol/client/src/client-handshake.c762
-rw-r--r--xlators/protocol/client/src/client-helpers.c2355
-rw-r--r--xlators/protocol/client/src/client-lk.c47
-rw-r--r--xlators/protocol/client/src/client-mem-types.h3
-rw-r--r--xlators/protocol/client/src/client-messages.h125
-rw-r--r--xlators/protocol/client/src/client-rpc-fops.c1137
-rw-r--r--xlators/protocol/client/src/client-rpc-fops_v2.c1276
-rw-r--r--xlators/protocol/client/src/client.c1358
-rw-r--r--xlators/protocol/client/src/client.h222
-rw-r--r--xlators/protocol/server/src/Makefile.am4
-rw-r--r--xlators/protocol/server/src/authenticate.h8
-rw-r--r--xlators/protocol/server/src/server-common.c53
-rw-r--r--xlators/protocol/server/src/server-common.h11
-rw-r--r--xlators/protocol/server/src/server-handshake.c159
-rw-r--r--xlators/protocol/server/src/server-helpers.c4706
-rw-r--r--xlators/protocol/server/src/server-helpers.h39
-rw-r--r--xlators/protocol/server/src/server-mem-types.h8
-rw-r--r--xlators/protocol/server/src/server-messages.h181
-rw-r--r--xlators/protocol/server/src/server-resolve.c40
-rw-r--r--xlators/protocol/server/src/server-rpc-fops.c401
-rw-r--r--xlators/protocol/server/src/server-rpc-fops_v2.c1647
-rw-r--r--xlators/protocol/server/src/server.c523
-rw-r--r--xlators/protocol/server/src/server.h97
-rw-r--r--xlators/storage/Makefile.am4
-rw-r--r--xlators/storage/bd/src/Makefile.am21
-rw-r--r--xlators/storage/bd/src/bd-aio.c518
-rw-r--r--xlators/storage/bd/src/bd-aio.h40
-rw-r--r--xlators/storage/bd/src/bd-helper.c1073
-rw-r--r--xlators/storage/bd/src/bd-mem-types.h26
-rw-r--r--xlators/storage/bd/src/bd.c2420
-rw-r--r--xlators/storage/bd/src/bd.h189
-rw-r--r--xlators/storage/posix/src/Makefile.am2
-rw-r--r--xlators/storage/posix/src/posix-aio.c20
-rw-r--r--xlators/storage/posix/src/posix-aio.h3
-rw-r--r--xlators/storage/posix/src/posix-common.c353
-rw-r--r--xlators/storage/posix/src/posix-entry-ops.c470
-rw-r--r--xlators/storage/posix/src/posix-gfid-path.c116
-rw-r--r--xlators/storage/posix/src/posix-gfid-path.h13
-rw-r--r--xlators/storage/posix/src/posix-handle.c209
-rw-r--r--xlators/storage/posix/src/posix-handle.h33
-rw-r--r--xlators/storage/posix/src/posix-helpers.c992
-rw-r--r--xlators/storage/posix/src/posix-inode-fd-ops.c1118
-rw-r--r--xlators/storage/posix/src/posix-inode-handle.h20
-rw-r--r--xlators/storage/posix/src/posix-mem-types.h7
-rw-r--r--xlators/storage/posix/src/posix-messages.h6
-rw-r--r--xlators/storage/posix/src/posix-metadata.c557
-rw-r--r--xlators/storage/posix/src/posix-metadata.h25
-rw-r--r--xlators/storage/posix/src/posix.c34
-rw-r--r--xlators/storage/posix/src/posix.h183
-rw-r--r--xlators/system/posix-acl/src/posix-acl-mem-types.h2
-rw-r--r--xlators/system/posix-acl/src/posix-acl-messages.h2
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.h6
-rw-r--r--xlators/system/posix-acl/src/posix-acl.c222
-rw-r--r--xlators/system/posix-acl/src/posix-acl.h7
-rw-r--r--xlators/xlator.sym10
601 files changed, 44814 insertions, 70603 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am
index 29549db724e..ef20cbb64fa 100644
--- a/xlators/Makefile.am
+++ b/xlators/Makefile.am
@@ -2,10 +2,10 @@ if BUILD_GNFS
GNFS_DIR = nfs
endif
-DIST_SUBDIRS = cluster storage protocol performance debug features encryption \
+DIST_SUBDIRS = cluster storage protocol performance debug features \
mount nfs mgmt system playground meta
-SUBDIRS = cluster storage protocol performance debug features encryption \
+SUBDIRS = cluster storage protocol performance debug features \
mount ${GNFS_DIR} mgmt system playground meta
EXTRA_DIST = xlator.sym
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
index 903fbb39f12..8e067d5ab58 100644
--- a/xlators/cluster/Makefile.am
+++ b/xlators/cluster/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = stripe afr dht ec
+SUBDIRS = afr dht ec
CLEANFILES =
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 7d352344e7a..032ab5c8001 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -15,22 +15,20 @@
#include <stdlib.h>
#include <signal.h>
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "statedump.h"
-#include "events.h"
-#include "upcall-utils.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/hashfn.h>
+#include <glusterfs/list.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/events.h>
+#include <glusterfs/upcall-utils.h>
#include "afr-inode-read.h"
#include "afr-inode-write.h"
@@ -47,6 +45,56 @@ afr_quorum_errno(afr_private_t *priv)
return ENOTCONN;
}
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid)
+{
+ if (!__is_root_gfid(pargfid)) {
+ return _gf_false;
+ }
+
+ if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) {
+ /*For backward compatibility /.landfill is private*/
+ return _gf_true;
+ }
+
+ if (pid == GF_CLIENT_PID_GSYNCD) {
+ /*geo-rep needs to create/sync private directory on slave because
+ * it appears in changelog*/
+ return _gf_false;
+ }
+
+ if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) {
+ if (strcmp(name, priv->anon_inode_name) == 0) {
+ /* anonymous-inode dir is private*/
+ return _gf_true;
+ }
+ } else {
+ if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) ==
+ 0) {
+ /* anonymous-inode dir prefix is private for geo-rep to work*/
+ return _gf_true;
+ }
+ }
+
+ return _gf_false;
+}
+
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies)
+{
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ replies[i] = 1;
+ } else {
+ replies[i] = 0;
+ }
+ }
+}
+
int
afr_fav_child_reset_sink_xattrs(void *opaque);
@@ -56,6 +104,581 @@ afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque);
static void
afr_discover_done(call_frame_t *frame, xlator_t *this);
+int
+afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ local->cont.lk.dom_lock_op_ret[i] = op_ret;
+ local->cont.lk.dom_lock_op_errno[i] = op_errno;
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to acquire %s on %s",
+ uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM,
+ priv->children[i]->name);
+ } else {
+ local->cont.lk.dom_locked_nodes[i] = 1;
+ }
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
+
+int
+afr_dom_lock_acquire(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ int i = 0;
+
+ priv = frame->this->private;
+ local = frame->local;
+ local->cont.lk.dom_locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.locked_nodes),
+ gf_afr_mt_char);
+ if (!local->cont.lk.dom_locked_nodes) {
+ return -ENOMEM;
+ }
+ local->cont.lk.dom_lock_op_ret = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_ret) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ local->cont.lk.dom_lock_op_errno = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_errno) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ flock.l_type = F_WRLCK;
+
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLK, &flock, NULL);
+
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL))
+ goto blocking_lock;
+
+ /*If any of the bricks returned EAGAIN, we still need blocking locks.*/
+ if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) !=
+ priv->child_count) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.dom_lock_op_ret[i] == -1 &&
+ local->cont.lk.dom_lock_op_errno[i] == EAGAIN)
+ goto blocking_lock;
+ }
+ }
+
+ return 0;
+
+blocking_lock:
+ afr_dom_lock_release(frame);
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLKW, &flock, NULL);
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) {
+ afr_dom_lock_release(frame);
+ return -afr_quorum_errno(priv);
+ }
+
+ return 0;
+}
+
+int
+afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to release %s on %s", local->loc.path,
+ AFR_LK_HEAL_DOM, priv->children[i]->name);
+ }
+ local->cont.lk.dom_locked_nodes[i] = 0;
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
+
+void
+afr_dom_lock_release(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = frame->this->private;
+ locked_on = local->cont.lk.dom_locked_nodes;
+ if (AFR_COUNT(locked_on, priv->child_count) == 0)
+ return;
+ flock.l_type = F_UNLCK;
+
+ AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk,
+ AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL);
+
+ return;
+}
+
+static void
+afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info)
+{
+ if (!info)
+ return;
+ if (info->xdata_req)
+ dict_unref(info->xdata_req);
+ if (info->fd)
+ fd_unref(info->fd);
+ GF_FREE(info->locked_nodes);
+ GF_FREE(info->child_up_event_gen);
+ GF_FREE(info->child_down_event_gen);
+ GF_FREE(info);
+}
+
+static int
+afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -ENOMEM;
+
+ info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t);
+ if (!info) {
+ goto cleanup;
+ }
+ INIT_LIST_HEAD(&info->pos);
+ info->fd = fd_ref(local->fd);
+ info->cmd = local->cont.lk.cmd;
+ info->pid = frame->root->pid;
+ info->flock = local->cont.lk.user_flock;
+ info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL);
+ if (!info->xdata_req) {
+ goto cleanup;
+ }
+ info->lk_owner = frame->root->lk_owner;
+ info->locked_nodes = GF_MALLOC(
+ sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char);
+ if (!info->locked_nodes) {
+ goto cleanup;
+ }
+ memcpy(info->locked_nodes, local->cont.lk.locked_nodes,
+ sizeof(*info->locked_nodes) * priv->child_count);
+ info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen),
+ priv->child_count, gf_afr_mt_int32_t);
+ if (!info->child_up_event_gen) {
+ goto cleanup;
+ }
+ info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!info->child_down_event_gen) {
+ goto cleanup;
+ }
+
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(local->fd, this);
+ if (fd_ctx)
+ fd_ctx->lk_heal_info = info;
+ }
+ UNLOCK(&local->fd->lock);
+ if (!fd_ctx) {
+ goto cleanup;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&info->pos, &priv->saved_locks);
+ }
+ UNLOCK(&priv->lock);
+
+ return 0;
+cleanup:
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to add lock to healq",
+ uuid_utoa(local->fd->inode->gfid));
+ if (info) {
+ afr_lk_heal_info_cleanup(info);
+ if (fd_ctx) {
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx->lk_heal_info = NULL;
+ }
+ UNLOCK(&local->fd->lock);
+ }
+ }
+ return ret;
+}
+
+static int
+afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ struct gf_flock flock = local->cont.lk.user_flock;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -EINVAL;
+
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx || !fd_ctx->lk_heal_info) {
+ goto out;
+ }
+
+ info = fd_ctx->lk_heal_info;
+ if ((info->flock.l_start != flock.l_start) ||
+ (info->flock.l_whence != flock.l_whence) ||
+ (info->flock.l_len != flock.l_len)) {
+ /*TODO: Compare lkowners too.*/
+ goto out;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ UNLOCK(&priv->lock);
+
+ afr_lk_heal_info_cleanup(info);
+ fd_ctx->lk_heal_info = NULL;
+ ret = 0;
+out:
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to remove lock from healq",
+ uuid_utoa(local->fd->inode->gfid));
+ return ret;
+}
+
+int
+afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed to heal lock on child %d for %s", i,
+ uuid_utoa(local->fd->inode->gfid));
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid));
+ } else {
+ local->cont.lk.getlk_rsp[i] = *lock;
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+static gf_boolean_t
+afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ afr_local_t *local = frame->local;
+ struct gf_flock flock = {
+ 0,
+ };
+ gf_boolean_t ret = _gf_true;
+ char *wind_on = alloca0(priv->child_count);
+ unsigned char *success_replies = alloca0(priv->child_count);
+ local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp),
+ priv->child_count, gf_afr_mt_gf_lock);
+
+ flock = info->flock;
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ wind_on[i] = 1;
+ }
+
+ AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock,
+ info->xdata_req);
+
+ afr_fill_success_replies(local, priv, success_replies);
+ if (AFR_COUNT(success_replies, priv->child_count) == 0) {
+ ret = _gf_false;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid || local->replies[i].op_ret != 0)
+ continue;
+ if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK)
+ continue;
+ /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/
+ if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner,
+ &info->lk_owner)) {
+ ret = _gf_false;
+ break;
+ }
+ }
+out:
+ afr_local_replies_wipe(local, priv);
+ GF_FREE(local->cont.lk.getlk_rsp);
+ local->cont.lk.getlk_rsp = NULL;
+ return ret;
+}
+
+static void
+afr_mark_fd_bad(fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ if (!fd)
+ return;
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(fd, this);
+ if (fd_ctx) {
+ fd_ctx->is_fd_bad = _gf_true;
+ fd_ctx->lk_heal_info = NULL;
+ }
+ }
+ UNLOCK(&fd->lock);
+}
+
+static void
+afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info)
+{
+ LOCK(&priv->lock);
+ {
+ list_del(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+}
+
+static void
+afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ int op_errno = 0;
+ int32_t *current_event_gen = NULL;
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ char *wind_on = alloca0(priv->child_count);
+ gf_boolean_t retry = _gf_true;
+
+ frame->root->pid = info->pid;
+ lk_owner_copy(&frame->root->lk_owner, &info->lk_owner);
+
+ op_errno = -afr_dom_lock_acquire(frame);
+ if ((op_errno != 0)) {
+ goto release;
+ }
+
+ if (!afr_does_lk_owner_match(frame, priv, info)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM,
+ "Ignoring lock heal for %s since lk-onwers mismatch. "
+ "Lock possibly pre-empted by another client.",
+ uuid_utoa(info->fd->inode->gfid));
+ goto release;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ continue;
+ wind_on[i] = 1;
+ }
+
+ current_event_gen = alloca(priv->child_count);
+ memcpy(current_event_gen, info->child_up_event_gen,
+ priv->child_count * sizeof *current_event_gen);
+ AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd,
+ &info->flock, info->xdata_req);
+
+ LOCK(&priv->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) {
+ continue;
+ }
+
+ if ((current_event_gen[i] == info->child_up_event_gen[i]) &&
+ (current_event_gen[i] > info->child_down_event_gen[i])) {
+ info->locked_nodes[i] = 1;
+ retry = _gf_false;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->saved_locks);
+ } else {
+ /*We received subsequent child up/down events while heal was in
+ * progress; don't mark child as healed. Attempt again on the
+ * new child up*/
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM,
+ "Event gen mismatch: skipped healing lock on child %d "
+ "for %s.",
+ i, uuid_utoa(info->fd->inode->gfid));
+ }
+ }
+ }
+ UNLOCK(&priv->lock);
+
+release:
+ afr_dom_lock_release(frame);
+ if (retry)
+ afr_add_lock_to_lkhealq(priv, info);
+ return;
+}
+
+static int
+afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+ STACK_DESTROY(frame->root);
+ return 0;
+}
+
+static int
+afr_lock_heal(void *opaque)
+{
+ call_frame_t *frame = (call_frame_t *)opaque;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+ struct list_head healq = {
+ 0,
+ };
+ int ret = 0;
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame) {
+ return ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&healq);
+ LOCK(&priv->lock);
+ {
+ list_splice_init(&priv->lk_healq, &healq);
+ }
+ UNLOCK(&priv->lock);
+
+ list_for_each_entry_safe(info, tmp, &healq, pos)
+ {
+ GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) <
+ priv->child_count));
+ ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd);
+ afr_lock_heal_do(iter_frame, priv, info);
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = ENOTCONN;
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN,
+ AFR_MSG_LK_HEAL_DOM,
+ "Aborting processing of lk_healq."
+ "Healing will be reattempted on next child up for locks "
+ "that are still in quorum.");
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&healq, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+ break;
+ }
+ }
+
+ AFR_STACK_DESTROY(iter_frame);
+ return ret;
+}
+
+static int
+__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+
+ if (priv->shd.iamshd)
+ return 0;
+
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_up_event_gen[child] = priv->event_generation;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ return -1;
+
+ ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame,
+ frame);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM,
+ "Failed to launch lock heal synctask");
+
+ return ret;
+}
+
+static int
+__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child)
+{
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+
+ if (priv->shd.iamshd)
+ return 0;
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_down_event_gen[child] = priv->event_generation;
+ if (info->locked_nodes[child] == 1)
+ info->locked_nodes[child] = 0;
+ if (!afr_has_quorum(info->locked_nodes, this, NULL)) {
+ /* Since the lock was lost on quorum no. of nodes, we should
+ * not attempt to heal it anymore. Some other client could have
+ * acquired the lock, modified data and released it and this
+ * client wouldn't know about it if we heal it.*/
+ afr_mark_fd_bad(info->fd, this);
+ list_del(&info->pos);
+ afr_lk_heal_info_cleanup(info);
+ /* We're not winding an unlock on the node where the lock is still
+ * present because when fencing logic switches over to the new
+ * client (since we marked the fd bad), it should preempt any
+ * existing lock. */
+ }
+ }
+ return 0;
+}
+
gf_boolean_t
afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
int32_t *op_errno)
@@ -70,6 +693,19 @@ afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
return _gf_true;
}
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata)
+{
+ int ret = 0;
+ uint32_t lk_mode = GF_LK_ADVISORY;
+
+ ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode);
+ if (!ret && lk_mode == GF_LK_MANDATORY)
+ return _gf_true;
+
+ return _gf_false;
+}
+
call_frame_t *
afr_copy_frame(call_frame_t *base)
{
@@ -98,21 +734,24 @@ afr_is_possibly_under_txn(afr_transaction_type type, afr_local_t *local,
int tmp = 0;
afr_private_t *priv = NULL;
GF_UNUSED char *key = NULL;
+ int keylen = 0;
priv = this->private;
- if (type == AFR_ENTRY_TRANSACTION)
+ if (type == AFR_ENTRY_TRANSACTION) {
key = GLUSTERFS_PARENT_ENTRYLK;
- else if (type == AFR_DATA_TRANSACTION)
+ keylen = SLEN(GLUSTERFS_PARENT_ENTRYLK);
+ } else if (type == AFR_DATA_TRANSACTION) {
/*FIXME: Use GLUSTERFS_INODELK_DOM_COUNT etc. once
* pl_inodelk_xattr_fill supports separate keys for different
* domains.*/
key = GLUSTERFS_INODELK_COUNT;
-
+ keylen = SLEN(GLUSTERFS_INODELK_COUNT);
+ }
for (i = 0; i < priv->child_count; i++) {
if (!local->replies[i].xdata)
continue;
- if (dict_get_int32(local->replies[i].xdata, key, &tmp) == 0)
+ if (dict_get_int32n(local->replies[i].xdata, key, keylen, &tmp) == 0)
if (tmp)
return _gf_true;
}
@@ -148,7 +787,7 @@ __afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
ret = __inode_ctx_get(inode, this, &ctx_int);
if (ret == 0) {
- *ctx = (afr_inode_ctx_t *)ctx_int;
+ *ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int;
return 0;
}
@@ -174,7 +813,7 @@ __afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
INIT_LIST_HEAD(&lock->owners);
}
- ctx_int = (uint64_t)ictx;
+ ctx_int = (uint64_t)(uintptr_t)ictx;
ret = __inode_ctx_set(inode, this, &ctx_int);
if (ret) {
goto out;
@@ -260,11 +899,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
count = gf_bits_count(tmp_map);
- if (count == 1)
- index = gf_bits_index(tmp_map);
-
for (i = 0; i < priv->child_count; i++) {
- mask = 0;
if (!local->transaction.failed_subvols[i])
continue;
@@ -278,25 +913,27 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
switch (txn_type) {
case AFR_METADATA_TRANSACTION:
if ((metadatamap_old != 0) && (metadatamap == 0) && (count == 1)) {
+ index = gf_bits_index(tmp_map);
local->transaction.in_flight_sb_errno = local->replies[index]
.op_errno;
local->transaction.in_flight_sb = _gf_true;
metadatamap |= (1 << index);
}
if (metadatamap_old != metadatamap) {
- event = 0;
+ __afr_inode_need_refresh_set(inode, this);
}
break;
case AFR_DATA_TRANSACTION:
if ((datamap_old != 0) && (datamap == 0) && (count == 1)) {
+ index = gf_bits_index(tmp_map);
local->transaction.in_flight_sb_errno = local->replies[index]
.op_errno;
local->transaction.in_flight_sb = _gf_true;
datamap |= (1 << index);
}
if (datamap_old != datamap)
- event = 0;
+ __afr_inode_need_refresh_set(inode, this);
break;
default:
@@ -460,34 +1097,6 @@ out:
}
int
-__afr_inode_event_gen_reset_small(inode_t *inode, xlator_t *this)
-{
- int ret = -1;
- uint16_t datamap = 0;
- uint16_t metadatamap = 0;
- uint32_t event = 0;
- uint64_t val = 0;
- afr_inode_ctx_t *ctx = NULL;
-
- ret = __afr_inode_ctx_get(this, inode, &ctx);
- if (ret)
- return ret;
-
- val = ctx->read_subvol;
-
- metadatamap = (val & 0x000000000000ffff) >> 0;
- datamap = (val & 0x00000000ffff0000) >> 16;
- event = 0;
-
- val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
- (((uint64_t)event) << 32);
-
- ctx->read_subvol = val;
-
- return ret;
-}
-
-int
__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
unsigned char *metadata, int *event_p)
{
@@ -558,22 +1167,6 @@ out:
}
int
-__afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- int ret = -1;
-
- priv = this->private;
-
- if (priv->child_count <= 16)
- ret = __afr_inode_event_gen_reset_small(inode, this);
- else
- ret = -1;
-
- return ret;
-}
-
-int
afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
unsigned char *metadata, int *event_p)
{
@@ -639,12 +1232,11 @@ afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
return 0;
}
-int
+static int
afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
int *spb_choice)
{
int ret = -1;
-
GF_VALIDATE_OR_GOTO(this->name, inode, out);
LOCK(&inode->lock);
@@ -656,6 +1248,40 @@ out:
return ret;
}
+/*
+ * frame is used to get the favourite policy. Since
+ * afr_inode_split_brain_choice_get was called with afr_open, it is possible to
+ * have a frame with out local->replies. So in that case, frame is passed as
+ * null, hence this function will handle the frame NULL case.
+ */
+int
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol)
+{
+ int ret = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO("afr", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out);
+
+ priv = this->private;
+
+ ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol);
+ if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) {
+ local = frame->local;
+ *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+ NULL);
+ if (*spb_subvol >= 0) {
+ ret = 0;
+ }
+ }
+
+out:
+ return ret;
+}
int
afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
unsigned char *metadata, int event)
@@ -722,30 +1348,22 @@ out:
return need_refresh;
}
-static int
-afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
+int
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
{
int ret = -1;
afr_inode_ctx_t *ctx = NULL;
- GF_VALIDATE_OR_GOTO(this->name, inode, out);
-
- LOCK(&inode->lock);
- {
- ret = __afr_inode_ctx_get(this, inode, &ctx);
- if (ret)
- goto unlock;
-
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret == 0) {
ctx->need_refresh = _gf_true;
}
-unlock:
- UNLOCK(&inode->lock);
-out:
+
return ret;
}
int
-afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
{
int ret = -1;
@@ -753,7 +1371,7 @@ afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
LOCK(&inode->lock);
{
- ret = __afr_inode_event_gen_reset(inode, this);
+ ret = __afr_inode_need_refresh_set(inode, this);
}
UNLOCK(&inode->lock);
out:
@@ -773,6 +1391,7 @@ afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode)
{
ret = __afr_inode_ctx_get(this, inode, &ctx);
if (ret < 0 || !ctx) {
+ UNLOCK(&inode->lock);
gf_msg(this->name, GF_LOG_WARNING, 0,
AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
"Failed to cancel split-brain choice timer.");
@@ -785,8 +1404,8 @@ afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode)
}
ret = 0;
}
-out:
UNLOCK(&inode->lock);
+out:
return ret;
}
@@ -818,7 +1437,6 @@ afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque)
gf_boolean_t timer_set = _gf_false;
gf_boolean_t timer_cancelled = _gf_false;
gf_boolean_t timer_reset = _gf_false;
- gf_boolean_t need_invalidate = _gf_true;
int old_spb_choice = -1;
frame = data->frame;
@@ -862,10 +1480,11 @@ afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque)
{
ret = __afr_inode_ctx_get(this, inode, &ctx);
if (ret) {
+ UNLOCK(&inode->lock);
gf_msg(this->name, GF_LOG_ERROR, 0,
AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
"Failed to get inode_ctx for %s", loc->name);
- goto unlock;
+ goto post_unlock;
}
old_spb_choice = ctx->spb_choice;
@@ -929,10 +1548,10 @@ afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque)
timer_set = _gf_true;
if (timer_reset && !ctx->timer)
timer_cancelled = _gf_true;
- need_invalidate = _gf_false;
}
unlock:
UNLOCK(&inode->lock);
+post_unlock:
if (!timer_set)
inode_unref(inode);
if (timer_cancelled)
@@ -942,8 +1561,7 @@ unlock:
* reads from an older cached value despite a change in spb_choice to
* a new value.
*/
- if (need_invalidate)
- inode_invalidate(inode);
+ inode_invalidate(inode);
out:
GF_FREE(data);
AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
@@ -988,7 +1606,7 @@ afr_accuse_smallfiles(xlator_t *this, struct afr_reply *replies,
for (i = 0; i < priv->child_count; i++) {
if (replies[i].valid && replies[i].xdata &&
- dict_get(replies[i].xdata, GLUSTERFS_BAD_INODE))
+ dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))
continue;
if (data_accused[i])
continue;
@@ -1037,7 +1655,7 @@ afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode,
if (replies) { /* Lookup */
if (!replies[i].valid || replies[i].op_ret == -1 ||
(replies[i].xdata &&
- dict_get(replies[i].xdata, GLUSTERFS_BAD_INODE))) {
+ dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))) {
data_readable[i] = 0;
metadata_readable[i] = 0;
continue;
@@ -1050,6 +1668,8 @@ afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode,
ia_type = inode->ia_type;
}
+ if (!xdata)
+ continue; /* mkdir_cbk sends NULL xdata_rsp. */
afr_accused_fill(this, xdata, data_accused,
(ia_type == IA_IFDIR) ? AFR_ENTRY_TRANSACTION
: AFR_DATA_TRANSACTION);
@@ -1153,18 +1773,12 @@ ret:
}
gf_boolean_t
-afr_selfheal_enabled(xlator_t *this)
+afr_selfheal_enabled(const xlator_t *this)
{
- afr_private_t *priv = NULL;
- gf_boolean_t data = _gf_false;
- int ret = 0;
+ const afr_private_t *priv = this->private;
- priv = this->private;
-
- ret = gf_string2boolean(priv->data_self_heal, &data);
- GF_ASSERT(!ret);
-
- return data || priv->metadata_self_heal || priv->entry_self_heal;
+ return priv->data_self_heal || priv->metadata_self_heal ||
+ priv->entry_self_heal;
}
int
@@ -1177,7 +1791,6 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
inode_t *inode = NULL;
int event_generation = 0;
int read_subvol = -1;
- int op_errno = ENOMEM;
int ret = 0;
local = frame->local;
@@ -1193,7 +1806,7 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
ret = afr_inode_get_readable(frame, inode, this, local->readable,
&event_generation, local->transaction.type);
- if (ret == -EIO || (local->is_read_txn && !event_generation)) {
+ if (ret == -EIO) {
/* No readable subvolume even after refresh ==> splitbrain.*/
if (!priv->fav_child_policy) {
err = EIO;
@@ -1206,18 +1819,12 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
goto refresh_done;
}
- heal_frame = copy_frame(frame);
+ heal_frame = afr_frame_create(this, NULL);
if (!heal_frame) {
err = EIO;
goto refresh_done;
}
- heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
- heal_local = AFR_FRAME_INIT(heal_frame, op_errno);
- if (!heal_local) {
- err = EIO;
- AFR_STACK_DESTROY(heal_frame);
- goto refresh_done;
- }
+ heal_local = heal_frame->local;
heal_local->xdata_req = dict_new();
if (!heal_local->xdata_req) {
err = EIO;
@@ -1238,18 +1845,6 @@ refresh_done:
return 0;
}
-static void
-afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
- unsigned char *replies)
-{
- int i = 0;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->replies[i].valid && local->replies[i].op_ret == 0)
- replies[i] = 1;
- }
-}
-
int
afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
{
@@ -1259,7 +1854,6 @@ afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
gf_boolean_t start_heal = _gf_false;
afr_local_t *heal_local = NULL;
unsigned char *success_replies = NULL;
- int op_errno = ENOMEM;
int ret = 0;
if (error != 0) {
@@ -1271,32 +1865,32 @@ afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
success_replies = alloca0(priv->child_count);
afr_fill_success_replies(local, priv, success_replies);
- if (!afr_has_quorum(success_replies, this)) {
- error = afr_final_errno(frame->local, this->private);
- if (!error)
- error = afr_quorum_errno(priv);
- goto refresh_done;
- }
-
if (priv->thin_arbiter_count && local->is_read_txn &&
AFR_COUNT(success_replies, priv->child_count) != priv->child_count) {
/* We need to query the good bricks and/or thin-arbiter.*/
+ if (success_replies[0]) {
+ local->read_txn_query_child = AFR_CHILD_ZERO;
+ } else if (success_replies[1]) {
+ local->read_txn_query_child = AFR_CHILD_ONE;
+ }
error = EINVAL;
goto refresh_done;
}
+ if (!afr_has_quorum(success_replies, this, frame)) {
+ error = afr_final_errno(frame->local, this->private);
+ if (!error)
+ error = afr_quorum_errno(priv);
+ goto refresh_done;
+ }
+
ret = afr_replies_interpret(frame, this, local->refreshinode, &start_heal);
if (ret && afr_selfheal_enabled(this) && start_heal) {
- heal_frame = copy_frame(frame);
+ heal_frame = afr_frame_create(this, NULL);
if (!heal_frame)
goto refresh_done;
- heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
- heal_local = AFR_FRAME_INIT(heal_frame, op_errno);
- if (!heal_local) {
- AFR_STACK_DESTROY(heal_frame);
- goto refresh_done;
- }
+ heal_local = heal_frame->local;
heal_local->refreshinode = inode_ref(local->refreshinode);
heal_local->heal_frame = heal_frame;
if (!afr_throttled_selfheal(heal_frame, this)) {
@@ -1333,17 +1927,22 @@ afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (xdata)
local->replies[call_child].xdata = dict_ref(xdata);
}
+
if (xdata) {
ret = dict_get_int8(xdata, "link-count", &need_heal);
- local->replies[call_child].need_heal = need_heal;
- } else {
- local->replies[call_child].need_heal = need_heal;
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to get link count");
+ }
}
+ local->replies[call_child].need_heal = need_heal;
call_count = afr_frame_return(frame);
if (call_count == 0) {
afr_set_need_heal(this, local);
ret = afr_inode_refresh_err(frame, this);
+ if (ret) {
+ gf_msg_debug(this->name, ret, "afr_inode_refresh_err failed");
+ }
afr_inode_refresh_done(frame, this, ret);
}
}
@@ -1452,12 +2051,12 @@ afr_inode_refresh_do(call_frame_t *frame, xlator_t *this)
return 0;
}
- ret = dict_set_str(xdata, "link-count", GF_XATTROP_INDEX_COUNT);
+ ret = dict_set_sizen_str_sizen(xdata, "link-count", GF_XATTROP_INDEX_COUNT);
if (ret) {
gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
}
- ret = dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT, this->name);
+ ret = dict_set_str_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT, this->name);
if (ret) {
gf_msg_debug(this->name, -ret,
"Unable to set inodelk-dom-count in dict ");
@@ -1555,7 +2154,7 @@ afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req)
"query flag");
}
- ret = dict_set_int32(xattr_req, "list-xattr", 1);
+ ret = dict_set_int32_sizen(xattr_req, "list-xattr", 1);
if (ret) {
gf_msg_debug(this->name, -ret, "Unable to set list-xattr in dict ");
}
@@ -1600,7 +2199,8 @@ afr_lookup_xattr_req_prepare(afr_local_t *local, xlator_t *this,
GLUSTERFS_PARENT_ENTRYLK);
}
- ret = dict_set_str(local->xattr_req, "link-count", GF_XATTROP_INDEX_COUNT);
+ ret = dict_set_sizen_str_sizen(local->xattr_req, "link-count",
+ GF_XATTROP_INDEX_COUNT);
if (ret) {
gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
}
@@ -1611,19 +2211,18 @@ out:
}
int
-afr_least_pending_reads_child(afr_private_t *priv)
+afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable)
{
int i = 0;
- int child = 0;
+ int child = -1;
int64_t read_iter = -1;
int64_t pending_read = -1;
- pending_read = GF_ATOMIC_GET(priv->pending_reads[0]);
- for (i = 1; i < priv->child_count; i++) {
- if (AFR_IS_ARBITER_BRICK(priv, i))
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i])
continue;
read_iter = GF_ATOMIC_GET(priv->pending_reads[i]);
- if (read_iter < pending_read) {
+ if (child == -1 || read_iter < pending_read) {
pending_read = read_iter;
child = i;
}
@@ -1632,8 +2231,54 @@ afr_least_pending_reads_child(afr_private_t *priv)
return child;
}
+static int32_t
+afr_least_latency_child(afr_private_t *priv, unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
+
+ if (child == -1 ||
+ priv->child_latency[i] < priv->child_latency[child]) {
+ child = i;
+ }
+ }
+ return child;
+}
+
+static int32_t
+afr_least_latency_times_pending_reads_child(afr_private_t *priv,
+ unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
+ int64_t pending_read = 0;
+ int64_t latency = -1;
+ int64_t least_latency = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
+
+ pending_read = GF_ATOMIC_GET(priv->pending_reads[i]);
+ latency = (pending_read + 1) * priv->child_latency[i];
+
+ if (child == -1 || latency < least_latency) {
+ least_latency = latency;
+ child = i;
+ }
+ }
+ return child;
+}
+
int
-afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
+afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv,
+ unsigned char *readable)
{
uuid_t gfid_copy = {
0,
@@ -1642,14 +2287,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
int child = -1;
switch (priv->hash_mode) {
- case 0:
+ case AFR_READ_POLICY_FIRST_UP:
break;
- case 1:
+ case AFR_READ_POLICY_GFID_HASH:
gf_uuid_copy(gfid_copy, args->gfid);
child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
priv->child_count;
break;
- case 2:
+ case AFR_READ_POLICY_GFID_PID_HASH:
if (args->ia_type != IA_IFDIR) {
/*
* Why getpid? Because it's one of the cheapest calls
@@ -1661,14 +2306,21 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
* need is a low probability that multiple clients
* won't converge on the same subvolume.
*/
+ gf_uuid_copy(gfid_copy, args->gfid);
pid = getpid();
- memcpy(gfid_copy, &pid, sizeof(pid));
+ *(pid_t *)gfid_copy ^= pid;
}
child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
priv->child_count;
break;
- case 3:
- child = afr_least_pending_reads_child(priv);
+ case AFR_READ_POLICY_LESS_LOAD:
+ child = afr_least_pending_reads_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LEAST_LATENCY:
+ child = afr_least_latency_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LOAD_LATENCY_HYBRID:
+ child = afr_least_latency_times_pending_reads_child(priv, readable);
break;
}
@@ -1701,7 +2353,7 @@ afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
}
/* second preference - use hashed mode */
- read_subvol = afr_hash_child(&local_args, priv);
+ read_subvol = afr_hash_child(&local_args, priv, readable);
if (read_subvol >= 0 && readable[read_subvol])
return read_subvol;
@@ -1796,11 +2448,9 @@ afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this)
afr_matrix_cleanup(local->pending, priv->child_count);
- GF_FREE(local->internal_lock.locked_nodes);
-
GF_FREE(local->internal_lock.lower_locked_nodes);
- afr_entry_lockee_cleanup(&local->internal_lock);
+ afr_lockees_cleanup(&local->internal_lock);
GF_FREE(local->transaction.pre_op);
@@ -2009,6 +2659,9 @@ afr_local_cleanup(afr_local_t *local, xlator_t *this)
{ /* lk */
GF_FREE(local->cont.lk.locked_nodes);
+ GF_FREE(local->cont.lk.dom_locked_nodes);
+ GF_FREE(local->cont.lk.dom_lock_op_ret);
+ GF_FREE(local->cont.lk.dom_lock_op_errno);
}
{ /* create */
@@ -2239,7 +2892,7 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int spb_choice = -1;
+ int spb_subvol = -1;
int child_count = -1;
if (*read_subvol != -1)
@@ -2249,13 +2902,15 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
local = frame->local;
child_count = priv->child_count;
- afr_inode_split_brain_choice_get(local->inode, this, &spb_choice);
- if ((spb_choice >= 0) &&
+ afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol);
+ if ((spb_subvol >= 0) &&
(AFR_COUNT(success_replies, child_count) == child_count)) {
- *read_subvol = spb_choice;
- } else if (!priv->quorum_count) {
+ *read_subvol = spb_subvol;
+ } else if (!priv->quorum_count ||
+ frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) {
*read_subvol = afr_first_up_child(frame, this);
- } else if (priv->quorum_count && afr_has_quorum(data_readable, this)) {
+ } else if (priv->quorum_count &&
+ afr_has_quorum(data_readable, this, NULL)) {
/* read_subvol is guaranteed to be valid if we hit this path. */
*read_subvol = afr_first_up_child(frame, this);
} else {
@@ -2270,7 +2925,7 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
local->loc.path);
}
if (*read_subvol >= 0)
- dict_del(local->replies[*read_subvol].xdata, GF_CONTENT_KEY);
+ dict_del_sizen(local->replies[*read_subvol].xdata, GF_CONTENT_KEY);
}
static void
@@ -2291,6 +2946,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
0,
};
gf_boolean_t locked_entry = _gf_false;
+ gf_boolean_t in_flight_create = _gf_false;
gf_boolean_t can_interpret = _gf_true;
inode_t *parent = NULL;
ia_type_t ia_type = IA_INVAL;
@@ -2334,17 +2990,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
if (!replies[i].valid)
continue;
- if (locked_entry && replies[i].op_ret == -1 &&
- replies[i].op_errno == ENOENT) {
- /* Second, check entry is still
- "underway" in creation */
- local->op_ret = -1;
- local->op_errno = ENOENT;
- goto error;
- }
-
- if (replies[i].op_ret == -1)
+ if (replies[i].op_ret == -1) {
+ if (locked_entry && replies[i].op_errno == ENOENT) {
+ in_flight_create = _gf_true;
+ }
continue;
+ }
if (read_subvol == -1 || !readable[read_subvol]) {
read_subvol = i;
@@ -2354,6 +3005,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
}
}
+ if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ goto error;
+ }
+
if (read_subvol == -1)
goto error;
/* We now have a read_subvol, which is readable[] (if there
@@ -2382,7 +3039,8 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
/* If we were called from glfsheal and there is still a gfid
* mismatch, succeed the lookup and let glfsheal print the
* response via gfid-heal-msg.*/
- if (!dict_get_str(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg))
+ if (!dict_get_str_sizen(local->xattr_req, "gfid-heal-msg",
+ &gfid_heal_msg))
goto cant_interpret;
/* LOG ERROR */
@@ -2397,7 +3055,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
read_subvol = -1;
memset(readable, 0, sizeof(*readable) * priv->child_count);
if (can_interpret) {
- if (!afr_has_quorum(success_replies, this))
+ if (!afr_has_quorum(success_replies, this, NULL))
goto cant_interpret;
/* It is safe to call afr_replies_interpret() because we have
a response from all the UP subvolumes and all of them resolved
@@ -2411,8 +3069,8 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
if (read_subvol == -1)
goto cant_interpret;
if (ret) {
- afr_inode_event_gen_reset(local->inode, this);
- dict_del(local->replies[read_subvol].xdata, GF_CONTENT_KEY);
+ afr_inode_need_refresh_set(local->inode, this);
+ dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY);
}
} else {
cant_interpret:
@@ -2436,10 +3094,10 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
goto error;
}
- ret = dict_get_str(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg);
+ ret = dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg);
if (!ret) {
- ret = dict_set_str(local->replies[read_subvol].xdata, "gfid-heal-msg",
- gfid_heal_msg);
+ ret = dict_set_str_sizen(local->replies[read_subvol].xdata,
+ "gfid-heal-msg", gfid_heal_msg);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
"Error setting gfid-heal-msg dict");
@@ -2464,7 +3122,7 @@ error:
* others in that they must be given higher priority while
* returning to the user.
*
- * The hierarchy is ENODATA > ENOENT > ESTALE > others
+ * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC others
*/
int
@@ -2476,6 +3134,8 @@ afr_higher_errno(int32_t old_errno, int32_t new_errno)
return ENOENT;
if (old_errno == ESTALE || new_errno == ESTALE)
return ESTALE;
+ if (old_errno == ENOSPC || new_errno == ENOSPC)
+ return ENOSPC;
return new_errno;
}
@@ -2517,7 +3177,7 @@ afr_local_discovery_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
priv = this->private;
child_index = (int32_t)(long)cookie;
- ret = dict_get_str(dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
+ ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
if (ret != 0) {
goto out;
}
@@ -2605,7 +3265,11 @@ afr_lookup_sh_metadata_wrap(void *opaque)
dict = dict_new();
if (!dict)
goto out;
- ret = dict_set_str(dict, "link-count", GF_XATTROP_INDEX_COUNT);
+ if (local->xattr_req) {
+ dict_copy(local->xattr_req, dict);
+ }
+
+ ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT);
if (ret) {
gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
}
@@ -2613,7 +3277,7 @@ afr_lookup_sh_metadata_wrap(void *opaque)
if (loc_is_nameless(&local->loc)) {
ret = afr_selfheal_unlocked_discover_on(frame, local->inode,
local->loc.gfid, local->replies,
- local->child_up);
+ local->child_up, dict);
} else {
inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
local->loc.name, local->replies,
@@ -2787,7 +3451,7 @@ afr_lookup_selfheal_wrap(void *opaque)
inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
local->loc.name, local->replies,
- local->child_up, NULL);
+ local->child_up, local->xattr_req);
if (inode)
inode_unref(inode);
@@ -2834,7 +3498,7 @@ afr_lookup_entry_heal(call_frame_t *frame, xlator_t *this)
continue;
if (replies[i].op_ret == 0) {
- if (uuid_is_null(gfid)) {
+ if (gf_uuid_is_null(gfid)) {
gf_uuid_copy(gfid, replies[i].poststat.ia_gfid);
}
success[i] = 1;
@@ -2874,7 +3538,7 @@ afr_lookup_entry_heal(call_frame_t *frame, xlator_t *this)
if (name_state_mismatch) {
if (!priv->quorum_count)
goto name_heal;
- if (!afr_has_quorum(success, this))
+ if (!afr_has_quorum(success, this, NULL))
goto name_heal;
if (op_errno)
goto name_heal;
@@ -2932,7 +3596,7 @@ afr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
* with ESTALE so that a fresh lookup will be sent by the top xlator.
* So remember it.
*/
- if (xdata && dict_get(xdata, "gfid-changed"))
+ if (xdata && dict_get_sizen(xdata, "gfid-changed"))
local->cont.lookup.needs_fresh_lookup = _gf_true;
if (xdata) {
@@ -2962,8 +3626,8 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int op_errno = 0;
int read_subvol = -1;
+ int ret = 0;
unsigned char *data_readable = NULL;
unsigned char *success_replies = NULL;
@@ -2976,17 +3640,19 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this)
if (AFR_COUNT(success_replies, priv->child_count) > 0)
local->op_ret = 0;
- op_errno = afr_final_errno(frame->local, this->private);
-
if (local->op_ret < 0) {
- AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return;
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(frame->local, this->private);
+ goto error;
}
- if (!afr_has_quorum(success_replies, this))
+ if (!afr_has_quorum(success_replies, this, frame))
goto unwind;
- afr_replies_interpret(frame, this, local->inode, NULL);
+ ret = afr_replies_interpret(frame, this, local->inode, NULL);
+ if (ret) {
+ afr_inode_need_refresh_set(local->inode, this);
+ }
read_subvol = afr_read_subvol_decide(local->inode, this, NULL,
data_readable);
@@ -2994,11 +3660,8 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this)
unwind:
afr_attempt_readsubvol_set(frame, this, success_replies, data_readable,
&read_subvol);
- if (read_subvol == -1) {
- AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL,
- NULL, NULL, NULL);
- return;
- }
+ if (read_subvol == -1)
+ goto error;
if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) {
local->op_ret = -1;
@@ -3013,6 +3676,11 @@ unwind:
local->inode, &local->replies[read_subvol].poststat,
local->replies[read_subvol].xdata,
&local->replies[read_subvol].postparent);
+ return;
+
+error:
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL,
+ NULL, NULL);
}
static int
@@ -3036,7 +3704,7 @@ afr_ta_id_file_check(void *opaque)
this = opaque;
priv = this->private;
- ret = afr_fill_ta_loc(this, &loc);
+ ret = afr_fill_ta_loc(this, &loc, _gf_false);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate thin-arbiter loc for: %s.", loc.name);
@@ -3209,8 +3877,6 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
}
if (__is_root_gfid(loc->inode->gfid)) {
- if (!this->itable)
- this->itable = loc->inode->table;
if (!priv->root_inode)
priv->root_inode = inode_ref(loc->inode);
@@ -3229,10 +3895,15 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
local->inode = inode_ref(loc->inode);
- if (xattr_req)
+ if (xattr_req) {
/* If xattr_req was null, afr_lookup_xattr_req_prepare() will
allocate one for us */
- local->xattr_req = dict_ref(xattr_req);
+ local->xattr_req = dict_copy_with_ref(xattr_req, NULL);
+ if (!local->xattr_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ }
if (gf_uuid_is_null(loc->inode->gfid)) {
afr_discover_do(frame, this, 0);
@@ -3242,11 +3913,7 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
afr_read_subvol_get(loc->inode, this, NULL, NULL, &event,
AFR_DATA_TRANSACTION, NULL);
- if (afr_is_inode_refresh_reqd(loc->inode, this, event,
- local->event_generation))
- afr_inode_refresh(frame, this, loc->inode, NULL, afr_discover_do);
- else
- afr_discover_do(frame, this, 0);
+ afr_discover_do(frame, this, 0);
return 0;
out:
@@ -3268,7 +3935,6 @@ afr_lookup_do(call_frame_t *frame, xlator_t *this, int err)
if (err < 0) {
local->op_errno = err;
- ret = -1;
goto out;
}
@@ -3279,7 +3945,6 @@ afr_lookup_do(call_frame_t *frame, xlator_t *this, int err)
&local->loc);
if (ret) {
local->op_errno = -ret;
- ret = -1;
goto out;
}
@@ -3344,16 +4009,15 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
if (loc_is_nameless(loc)) {
if (xattr_req)
- dict_del(xattr_req, "gfid-req");
+ dict_del_sizen(xattr_req, "gfid-req");
afr_discover(frame, this, loc, xattr_req);
return 0;
}
- if (__is_root_gfid(loc->parent->gfid)) {
- if (!strcmp(loc->name, GF_REPLICATE_TRASH_DIR)) {
- op_errno = EPERM;
- goto out;
- }
+ if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name,
+ frame->root->pid)) {
+ op_errno = EPERM;
+ goto out;
}
local = AFR_FRAME_INIT(frame, op_errno);
@@ -3382,18 +4046,14 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
ret = dict_get_gfuuid(local->xattr_req, "gfid-req",
&local->cont.lookup.gfid_req);
if (ret == 0) {
- dict_del(local->xattr_req, "gfid-req");
+ dict_del_sizen(local->xattr_req, "gfid-req");
}
}
afr_read_subvol_get(loc->parent, this, NULL, NULL, &event,
AFR_DATA_TRANSACTION, NULL);
- if (afr_is_inode_refresh_reqd(loc->inode, this, event,
- local->event_generation))
- afr_inode_refresh(frame, this, loc->parent, NULL, afr_lookup_do);
- else
- afr_lookup_do(frame, this, 0);
+ afr_lookup_do(frame, this, 0);
return 0;
out:
@@ -3403,8 +4063,18 @@ out:
}
void
-_afr_cleanup_fd_ctx(afr_fd_ctx_t *fd_ctx)
+_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx)
{
+ afr_private_t *priv = this->private;
+
+ if (fd_ctx->lk_heal_info) {
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info);
+ fd_ctx->lk_heal_info = NULL;
+ }
GF_FREE(fd_ctx->opened_on);
GF_FREE(fd_ctx);
return;
@@ -3424,7 +4094,7 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd)
fd_ctx = (afr_fd_ctx_t *)(long)ctx;
if (fd_ctx) {
- _afr_cleanup_fd_ctx(fd_ctx);
+ _afr_cleanup_fd_ctx(this, fd_ctx);
}
out:
@@ -3517,13 +4187,14 @@ __afr_fd_ctx_set(xlator_t *this, fd_t *fd)
}
fd_ctx->readdir_subvol = -1;
+ fd_ctx->lk_heal_info = NULL;
ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx);
if (ret)
gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd);
out:
if (ret && fd_ctx)
- _afr_cleanup_fd_ctx(fd_ctx);
+ _afr_cleanup_fd_ctx(this, fd_ctx);
return ret;
}
@@ -3547,11 +4218,10 @@ afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
} else {
local->op_errno = op_errno;
}
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
if (call_count == 0)
AFR_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno,
local->xdata_rsp);
@@ -3647,6 +4317,7 @@ afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
call_stub_t *stub = NULL;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -3687,11 +4358,10 @@ afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
} else {
local->op_errno = op_errno;
}
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
if (call_count == 0)
AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno,
local->xdata_rsp);
@@ -4008,7 +4678,7 @@ afr_fop_lock_done(call_frame_t *frame, xlator_t *this)
if (afr_is_conflicting_lock_present(local->op_ret, local->op_errno)) {
afr_unlock_locks_and_proceed(frame, this, lock_count);
- } else if (priv->quorum_count && !afr_has_quorum(success, this)) {
+ } else if (priv->quorum_count && !afr_has_quorum(success, this, NULL)) {
local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED;
local->op_ret = -1;
local->op_errno = afr_final_errno(local, priv);
@@ -4184,9 +4854,9 @@ out:
}
static int32_t
-afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
- loc_t *loc, fd_t *fd, int32_t cmd, struct gf_flock *flock,
- dict_t *xdata)
+afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
{
afr_local_t *local = NULL;
int32_t op_errno = ENOMEM;
@@ -4198,8 +4868,10 @@ afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
local->op = fop;
if (loc)
loc_copy(&local->loc, loc);
- if (fd)
+ if (fd && (flock->l_type != F_UNLCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local->fd = fd_ref(fd);
+ }
local->cont.inodelk.volume = gf_strdup(volume);
if (!local->cont.inodelk.volume) {
@@ -4228,8 +4900,8 @@ int32_t
afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- afr_handle_inodelk(frame, GF_FOP_INODELK, volume, loc, NULL, cmd, flock,
- xdata);
+ afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd,
+ flock, xdata);
return 0;
}
@@ -4237,15 +4909,16 @@ int32_t
afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- afr_handle_inodelk(frame, GF_FOP_FINODELK, volume, NULL, fd, cmd, flock,
- xdata);
+ afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd,
+ flock, xdata);
return 0;
}
static int
-afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
- loc_t *loc, fd_t *fd, const char *basename, entrylk_cmd cmd,
- entrylk_type type, dict_t *xdata)
+afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
{
afr_local_t *local = NULL;
int32_t op_errno = ENOMEM;
@@ -4257,8 +4930,10 @@ afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,
local->op = fop;
if (loc)
loc_copy(&local->loc, loc);
- if (fd)
+ if (fd && (cmd != ENTRYLK_UNLOCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local->fd = fd_ref(fd);
+ }
local->cont.entrylk.cmd = cmd;
local->cont.entrylk.in_cmd = cmd;
local->cont.entrylk.type = type;
@@ -4285,8 +4960,8 @@ afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
const char *basename, entrylk_cmd cmd, entrylk_type type,
dict_t *xdata)
{
- afr_handle_entrylk(frame, GF_FOP_ENTRYLK, volume, loc, NULL, basename, cmd,
- type, xdata);
+ afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename,
+ cmd, type, xdata);
return 0;
}
@@ -4295,8 +4970,8 @@ afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
const char *basename, entrylk_cmd cmd, entrylk_type type,
dict_t *xdata)
{
- afr_handle_entrylk(frame, GF_FOP_FENTRYLK, volume, NULL, fd, basename, cmd,
- type, xdata);
+ afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename,
+ cmd, type, xdata);
return 0;
}
@@ -4308,10 +4983,10 @@ afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int call_count = 0;
struct statvfs *buf = NULL;
+ local = frame->local;
+
LOCK(&frame->lock);
{
- local = frame->local;
-
if (op_ret != 0) {
local->op_errno = op_errno;
goto unlock;
@@ -4337,10 +5012,9 @@ afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
}
}
unlock:
+ call_count = --local->call_count;
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
if (call_count == 0)
AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
&local->cont.statfs.buf, local->xdata_rsp);
@@ -4415,9 +5089,10 @@ afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
call_count = afr_frame_return(frame);
- if (call_count == 0)
+ if (call_count == 0) {
AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
local->xdata_rsp);
+ }
return 0;
}
@@ -4499,7 +5174,7 @@ afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
local->cont.lk.cmd, &local->cont.lk.user_flock,
local->xdata_req);
} else if (priv->quorum_count &&
- !afr_has_quorum(local->cont.lk.locked_nodes, this)) {
+ !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
local->op_ret = -1;
local->op_errno = afr_final_errno(local, priv);
@@ -4516,11 +5191,133 @@ afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
}
int
+afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+ return 0;
+}
+
+int
+afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = -1;
+
+ local = frame->local;
+ child_index = (long)cookie;
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ local->cont.lk.ret_flock = *lock;
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int child_index = (long)cookie;
+
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "gfid=%s: unlock failed on subvolume %s "
+ "with lock owner %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
+ return 0;
+}
+int
+afr_lk_transaction(void *opaque)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ char *wind_on = NULL;
+ int op_errno = 0;
+ int i = 0;
+ int ret = 0;
+
+ frame = (call_frame_t *)opaque;
+ local = frame->local;
+ this = frame->this;
+ priv = this->private;
+ wind_on = alloca0(priv->child_count);
+
+ if (priv->arbiter_count || priv->child_count != 3) {
+ op_errno = ENOTSUP;
+ gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Lock healing supported only for replica 3 volumes.",
+ uuid_utoa(local->fd->inode->gfid));
+ goto err;
+ }
+
+ op_errno = -afr_dom_lock_acquire(frame); // Released during
+ // AFR_STACK_UNWIND
+ if (op_errno != 0) {
+ goto err;
+ }
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) {
+ op_errno = afr_final_errno(local, priv);
+ goto err;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i])
+ wind_on[i] = 1;
+ }
+ AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd,
+ local->cont.lk.cmd, &local->cont.lk.user_flock,
+ local->xdata_req);
+
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ goto unlock;
+ } else {
+ if (local->cont.lk.user_flock.l_type == F_UNLCK)
+ ret = afr_remove_lock_from_saved_locks(local, this);
+ else
+ ret = afr_add_lock_to_saved_locks(frame, this);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ goto unlock;
+ }
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, local->xdata_rsp);
+ }
+
+ return 0;
+
+unlock:
+ local->cont.lk.user_flock.l_type = F_UNLCK;
+ AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk,
+ local->fd, F_SETLK, &local->cont.lk.user_flock, NULL);
+err:
+ AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+ return -1;
+}
+
+int
afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
struct gf_flock *flock, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
+ int ret = 0;
int i = 0;
int32_t op_errno = ENOMEM;
@@ -4531,9 +5328,11 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
goto out;
local->op = GF_FOP_LK;
- if (!afr_lk_is_unlock(cmd, flock) &&
- !afr_is_consistent_io_possible(local, priv, &op_errno))
- goto out;
+ if (!afr_lk_is_unlock(cmd, flock)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+ }
local->cont.lk.locked_nodes = GF_CALLOC(
priv->child_count, sizeof(*local->cont.lk.locked_nodes),
@@ -4551,6 +5350,16 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
if (xdata)
local->xdata_req = dict_ref(xdata);
+ if (afr_is_lock_mode_mandatory(xdata)) {
+ ret = synctask_new(this->ctx->env, afr_lk_transaction,
+ afr_lk_transaction_cbk, frame, frame);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ return 0;
+ }
+
STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i],
priv->children[i]->fops->lk, fd, cmd, flock,
local->xdata_req);
@@ -4654,7 +5463,7 @@ afr_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
priv->children[child_index]->fops->lease, &local->loc,
&local->cont.lease.user_lease, xdata);
} else if (priv->quorum_count &&
- !afr_has_quorum(local->cont.lease.locked_nodes, this)) {
+ !afr_has_quorum(local->cont.lease.locked_nodes, this, NULL)) {
local->op_ret = -1;
local->op_errno = afr_final_errno(local, priv);
@@ -4842,7 +5651,7 @@ afr_forget(xlator_t *this, inode_t *inode)
if (!ctx_int)
return 0;
- ctx = (afr_inode_ctx_t *)ctx_int;
+ ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int;
afr_inode_ctx_destroy(ctx);
return 0;
}
@@ -4872,8 +5681,10 @@ afr_priv_dump(xlator_t *this)
GF_ATOMIC_GET(priv->pending_reads[i]));
sprintf(key, "child_latency[%d]", i);
gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]);
+ sprintf(key, "halo_child_up[%d]", i);
+ gf_proc_dump_write(key, "%d", priv->halo_child_up[i]);
}
- gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal);
+ gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal);
gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal);
gf_proc_dump_write("read_child", "%d", priv->read_child);
@@ -4884,6 +5695,7 @@ afr_priv_dump(xlator_t *this)
priv->background_self_heal_count);
gf_proc_dump_write("healers", "%d", priv->healers);
gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
+ gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode);
if (priv->quorum_count == AFR_QUORUM_AUTO) {
gf_proc_dump_write("quorum-type", "auto");
} else if (priv->quorum_count == 0) {
@@ -4892,7 +5704,14 @@ afr_priv_dump(xlator_t *this)
gf_proc_dump_write("quorum-type", "fixed");
gf_proc_dump_write("quorum-count", "%d", priv->quorum_count);
}
- gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this));
+ gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this, NULL));
+ if (priv->thin_arbiter_count) {
+ gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up);
+ gf_proc_dump_write("ta_bad_child_index", "%d",
+ priv->ta_bad_child_index);
+ gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64,
+ priv->ta_notify_dom_lock_offset);
+ }
return 0;
}
@@ -4904,14 +5723,19 @@ afr_priv_dump(xlator_t *this)
*/
static int
-find_child_index(xlator_t *this, xlator_t *child)
+afr_find_child_index(xlator_t *this, xlator_t *child)
{
afr_private_t *priv = NULL;
+ int child_count = -1;
int i = -1;
priv = this->private;
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ child_count++;
+ }
- for (i = 0; i < priv->child_count; i++) {
+ for (i = 0; i < child_count; i++) {
if ((xlator_t *)child == priv->children[i])
break;
}
@@ -4932,13 +5756,31 @@ __afr_get_up_children_count(afr_private_t *priv)
return up_children;
}
+static int
+__get_heard_from_all_status(xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ int i;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->last_event[i]) {
+ return 0;
+ }
+ }
+ if (priv->thin_arbiter_count && !priv->ta_child_up) {
+ return 0;
+ }
+ return 1;
+}
+
glusterfs_event_t
-__afr_transform_event_from_state(afr_private_t *priv)
+__afr_transform_event_from_state(xlator_t *this)
{
int i = 0;
int up_children = 0;
+ afr_private_t *priv = this->private;
- if (AFR_COUNT(priv->last_event, priv->child_count) == priv->child_count)
+ if (__get_heard_from_all_status(this))
/* have_heard_from_all. Let afr_notify() do the propagation. */
return GF_EVENT_MAXVAL;
@@ -4980,7 +5822,7 @@ afr_notify_cbk(void *data)
goto unlock;
}
priv->timer = NULL;
- event = __afr_transform_event_from_state(priv);
+ event = __afr_transform_event_from_state(this);
if (event != GF_EVENT_MAXVAL)
propagate = _gf_true;
}
@@ -5007,22 +5849,6 @@ __afr_launch_notify_timer(xlator_t *this, afr_private_t *priv)
}
}
-int
-__get_heard_from_all_status(xlator_t *this)
-{
- afr_private_t *priv = this->private;
- int heard_from_all = 1;
- int i = 0;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->last_event[i]) {
- heard_from_all = 0;
- break;
- }
- }
- return heard_from_all;
-}
-
static int
find_best_down_child(xlator_t *this)
{
@@ -5034,7 +5860,7 @@ find_best_down_child(xlator_t *this)
priv = this->private;
for (i = 0; i < priv->child_count; i++) {
- if (priv->child_up[i] && priv->child_latency[i] >= 0 &&
+ if (!priv->child_up[i] && priv->child_latency[i] >= 0 &&
priv->child_latency[i] < best_latency) {
best_child = i;
best_latency = priv->child_latency[i];
@@ -5042,8 +5868,7 @@ find_best_down_child(xlator_t *this)
}
if (best_child >= 0) {
gf_msg_debug(this->name, 0,
- "Found best down child (%d) "
- "@ %ld ms latency",
+ "Found best down child (%d) @ %" PRId64 " ms latency",
best_child, best_latency);
}
return best_child;
@@ -5068,8 +5893,7 @@ find_worst_up_child(xlator_t *this)
}
if (worst_child >= 0) {
gf_msg_debug(this->name, 0,
- "Found worst up child (%d)"
- " @ %ld ms latency",
+ "Found worst up child (%d) @ %" PRId64 " ms latency",
worst_child, worst_latency);
}
return worst_child;
@@ -5086,7 +5910,7 @@ __afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx,
priv = this->private;
priv->child_latency[idx] = child_latency_msec;
- gf_msg_debug(child_xlator->name, 0, "Client ping @ %ld ms",
+ gf_msg_debug(child_xlator->name, 0, "Client ping @ %" PRId64 " ms",
child_latency_msec);
if (priv->shd.iamshd)
return;
@@ -5102,21 +5926,29 @@ __afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx,
priv->halo_min_replicas);
} else {
gf_log(child_xlator->name, GF_LOG_INFO,
- "Child latency (%ld ms) "
- "exceeds halo threshold (%ld), "
+ "Child latency (%" PRId64
+ " ms) "
+ "exceeds halo threshold (%" PRId64
+ "), "
"marking child down.",
child_latency_msec, halo_max_latency_msec);
- *event = GF_EVENT_CHILD_DOWN;
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_DOWN;
+ }
}
} else if (child_latency_msec < halo_max_latency_msec &&
priv->child_up[idx] == 0) {
if (up_children < priv->halo_max_replicas) {
gf_log(child_xlator->name, GF_LOG_INFO,
- "Child latency (%ld ms) "
- "below halo threshold (%ld), "
+ "Child latency (%" PRId64
+ " ms) "
+ "below halo threshold (%" PRId64
+ "), "
"marking child up.",
child_latency_msec, halo_max_latency_msec);
- *event = GF_EVENT_CHILD_UP;
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_UP;
+ }
} else {
gf_log(child_xlator->name, GF_LOG_INFO,
"Not marking child %d up, "
@@ -5141,7 +5973,7 @@ afr_get_halo_latency(xlator_t *this)
} else {
halo_max_latency_msec = priv->halo_max_latency_msec;
}
- gf_msg_debug(this->name, 0, "Using halo latency %ld",
+ gf_msg_debug(this->name, 0, "Using halo latency %" PRId64,
halo_max_latency_msec);
return halo_max_latency_msec;
}
@@ -5178,9 +6010,15 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
* want to set the child_latency to MAX to indicate
* the child needs ping data to be available before doing child-up
*/
- if (child_latency_msec < 0 && priv->halo_enabled) {
+ if (!priv->halo_enabled)
+ goto out;
+
+ if (child_latency_msec < 0) {
/*set to INT64_MAX-1 so that it is found for best_down_child*/
- priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
+ priv->halo_child_up[idx] = 1;
+ if (priv->child_latency[idx] < 0) {
+ priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
+ }
}
/*
@@ -5196,7 +6034,8 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
priv->child_latency[worst_up_child] > halo_max_latency_msec) {
gf_msg_debug(this->name, 0,
"Marking child %d down, "
- "doesn't meet halo threshold (%ld), and > "
+ "doesn't meet halo threshold (%" PRId64
+ "), and > "
"halo_min_replicas (%d)",
worst_up_child, halo_max_latency_msec,
priv->halo_min_replicas);
@@ -5217,13 +6056,14 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
"up_children (%d) > halo_max_replicas (%d)",
worst_up_child, up_children, priv->halo_max_replicas);
}
-
+out:
if (up_children == 1) {
gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,
"Subvolume '%s' came back up; "
"going online.",
child_xlator->name);
- gf_event(EVENT_AFR_SUBVOL_UP, "subvol=%s", this->name);
+ gf_event(EVENT_AFR_SUBVOL_UP, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
} else {
*event = GF_EVENT_SOME_DESCENDENT_UP;
}
@@ -5267,6 +6107,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
*/
if (child_latency_msec < 0) {
priv->child_latency[idx] = child_latency_msec;
+ priv->halo_child_up[idx] = 0;
}
priv->child_up[idx] = 0;
@@ -5279,7 +6120,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
* as we want it to be up to date if we are going to
* begin using it synchronously.
*/
- if (up_children < priv->halo_min_replicas) {
+ if (priv->halo_enabled && up_children < priv->halo_min_replicas) {
best_down_child = find_best_down_child(this);
if (best_down_child >= 0) {
gf_msg_debug(this->name, 0,
@@ -5291,7 +6132,6 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
*up_child = best_down_child;
}
}
-
for (i = 0; i < priv->child_count; i++)
if (priv->child_up[i] == 0)
down_children++;
@@ -5300,13 +6140,116 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
"All subvolumes are down. Going "
"offline until at least one of them "
"comes back up.");
- gf_event(EVENT_AFR_SUBVOLS_DOWN, "subvol=%s", this->name);
+ gf_event(EVENT_AFR_SUBVOLS_DOWN, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
} else {
*event = GF_EVENT_SOME_DESCENDENT_DOWN;
}
priv->last_event[idx] = *event;
}
+void
+afr_ta_lock_release_synctask(xlator_t *this)
+{
+ call_frame_t *ta_frame = NULL;
+ int ret = 0;
+
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ return;
+ }
+
+ ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta,
+ afr_ta_lock_release_done, ta_frame, this);
+ if (ret) {
+ STACK_DESTROY(ta_frame->root);
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to release "
+ "AFR_TA_DOM_NOTIFY lock.");
+ }
+}
+
+static void
+afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)
+{
+ struct gf_upcall_inodelk_contention *lc = NULL;
+ unsigned int inmem_count = 0;
+ unsigned int onwire_count = 0;
+ afr_private_t *priv = this->private;
+
+ lc = upcall->data;
+
+ if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0)
+ return;
+
+ if (priv->shd.iamshd) {
+ /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */
+ return;
+ }
+ LOCK(&priv->lock);
+ {
+ if (priv->release_ta_notify_dom_lock == _gf_true) {
+ /* Ignore multiple release requests from shds.*/
+ UNLOCK(&priv->lock);
+ return;
+ }
+ priv->release_ta_notify_dom_lock = _gf_true;
+ inmem_count = priv->ta_in_mem_txn_count;
+ onwire_count = priv->ta_on_wire_txn_count;
+ }
+ UNLOCK(&priv->lock);
+ if (inmem_count || onwire_count)
+ /* lock release will happen in txn code path after
+ * in-memory or on-wire txns are over.*/
+ return;
+
+ afr_ta_lock_release_synctask(this);
+}
+
+static void
+afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall)
+{
+ struct gf_upcall_cache_invalidation *up_ci = NULL;
+ afr_private_t *priv = this->private;
+ inode_t *inode = NULL;
+ inode_table_t *itable = NULL;
+ int i = 0;
+
+ switch (upcall->event_type) {
+ case GF_UPCALL_INODELK_CONTENTION:
+ afr_handle_inodelk_contention(this, upcall);
+ break;
+ case GF_UPCALL_CACHE_INVALIDATION:
+ up_ci = (struct gf_upcall_cache_invalidation *)upcall->data;
+
+ /* Since md-cache will be aggressively filtering
+ * lookups, the stale read issue will be more
+ * pronounced. Hence when a pending xattr is set notify
+ * all the md-cache clients to invalidate the existing
+ * stat cache and send the lookup next time */
+ if (!up_ci->dict)
+ break;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!dict_get(up_ci->dict, priv->pending_key[i]))
+ continue;
+ up_ci->flags |= UP_INVAL_ATTR;
+ itable = ((xlator_t *)this->graph->top)->itable;
+ /*Internal processes may not have itable for
+ *top xlator*/
+ if (itable)
+ inode = inode_find(itable, upcall->gfid);
+ if (inode)
+ afr_inode_need_refresh_set(inode, this);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
int32_t
afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
{
@@ -5324,10 +6267,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
dict_t *output = NULL;
gf_boolean_t had_quorum = _gf_false;
gf_boolean_t has_quorum = _gf_false;
- struct gf_upcall *up_data = NULL;
- struct gf_upcall_cache_invalidation *up_ci = NULL;
- inode_table_t *itable = NULL;
- inode_t *inode = NULL;
int64_t halo_max_latency_msec = 0;
int64_t child_latency_msec = -1;
@@ -5355,21 +6294,22 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
* subsequent revalidate lookup happens on all the dht's subvolumes
* which triggers afr self-heals if any.
*/
- idx = find_child_index(this, child_xlator);
+ idx = afr_find_child_index(this, child_xlator);
if (idx < 0) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
"Received child_up from invalid subvolume");
goto out;
}
- had_quorum = priv->quorum_count && afr_has_quorum(priv->child_up, this);
- if (priv->halo_enabled) {
- halo_max_latency_msec = afr_get_halo_latency(this);
+ had_quorum = priv->quorum_count &&
+ afr_has_quorum(priv->child_up, this, NULL);
+ if (event == GF_EVENT_CHILD_PING) {
+ child_latency_msec = (int64_t)(uintptr_t)data2;
+ if (priv->halo_enabled) {
+ halo_max_latency_msec = afr_get_halo_latency(this);
- if (event == GF_EVENT_CHILD_PING) {
/* Calculates the child latency and sets event
*/
- child_latency_msec = (int64_t)(uintptr_t)data2;
LOCK(&priv->lock);
{
__afr_handle_ping_event(this, child_xlator, idx,
@@ -5377,6 +6317,12 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
child_latency_msec);
}
UNLOCK(&priv->lock);
+ } else {
+ LOCK(&priv->lock);
+ {
+ priv->child_latency[idx] = child_latency_msec;
+ }
+ UNLOCK(&priv->lock);
}
}
@@ -5404,6 +6350,10 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
goto out;
}
+ if (event == GF_EVENT_UPCALL) {
+ afr_handle_upcall_event(this, data);
+ }
+
LOCK(&priv->lock);
{
had_heard_from_all = __get_heard_from_all_status(this);
@@ -5413,15 +6363,30 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
propagate = 1;
break;
case GF_EVENT_CHILD_UP:
+ if (priv->thin_arbiter_count &&
+ (idx == AFR_CHILD_THIN_ARBITER)) {
+ priv->ta_child_up = 1;
+ priv->ta_event_gen++;
+ break;
+ }
__afr_handle_child_up_event(this, child_xlator, idx,
child_latency_msec, &event,
&call_psh, &up_child);
+ __afr_lock_heal_synctask(this, priv, idx);
break;
case GF_EVENT_CHILD_DOWN:
+ if (priv->thin_arbiter_count &&
+ (idx == AFR_CHILD_THIN_ARBITER)) {
+ priv->ta_child_up = 0;
+ priv->ta_event_gen++;
+ afr_ta_locked_priv_invalidate(priv);
+ break;
+ }
__afr_handle_child_down_event(this, child_xlator, idx,
child_latency_msec, &event,
&call_psh, &up_child);
+ __afr_mark_pending_lk_heal(this, priv, idx);
break;
case GF_EVENT_CHILD_CONNECTING:
@@ -5432,34 +6397,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
case GF_EVENT_SOME_DESCENDENT_DOWN:
priv->last_event[idx] = event;
break;
- case GF_EVENT_UPCALL:
- up_data = (struct gf_upcall *)data;
- if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
- break;
- up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
-
- /* Since md-cache will be aggressively filtering
- * lookups, the stale read issue will be more
- * pronounced. Hence when a pending xattr is set notify
- * all the md-cache clients to invalidate the existing
- * stat cache and send the lookup next time */
- if (!up_ci->dict)
- break;
- for (i = 0; i < priv->child_count; i++) {
- if (dict_get(up_ci->dict, priv->pending_key[i])) {
- up_ci->flags |= UP_INVAL_ATTR;
- itable = ((xlator_t *)this->graph->top)->itable;
- /*Internal processes may not have itable for top
- * xlator*/
- if (itable)
- inode = inode_find(itable, up_data->gfid);
- if (inode)
- afr_inode_need_refresh_set(inode, this);
-
- break;
- }
- }
- break;
default:
propagate = 1;
break;
@@ -5491,16 +6428,18 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
UNLOCK(&priv->lock);
if (priv->quorum_count) {
- has_quorum = afr_has_quorum(priv->child_up, this);
+ has_quorum = afr_has_quorum(priv->child_up, this, NULL);
if (!had_quorum && has_quorum) {
gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET,
"Client-quorum is met");
- gf_event(EVENT_AFR_QUORUM_MET, "subvol=%s", this->name);
+ gf_event(EVENT_AFR_QUORUM_MET, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
}
if (had_quorum && !has_quorum) {
gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
"Client-quorum is not met");
- gf_event(EVENT_AFR_QUORUM_FAIL, "subvol=%s", this->name);
+ gf_event(EVENT_AFR_QUORUM_FAIL, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
}
}
@@ -5519,10 +6458,8 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
* b) Already heard from everyone, but we now got a child-up
* event.
*/
- if (have_heard_from_all && priv->shd.iamshd) {
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i])
- afr_selfheal_childup(this, i);
+ if (have_heard_from_all) {
+ afr_selfheal_childup(this, priv);
}
}
out:
@@ -5543,7 +6480,7 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
- local->child_up = GF_CALLOC(priv->child_count, sizeof(*local->child_up),
+ local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up),
gf_afr_mt_char);
if (!local->child_up) {
if (op_errno)
@@ -5599,6 +6536,14 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
}
local->need_full_crawl = _gf_false;
+ if (priv->thin_arbiter_count) {
+ local->ta_child_up = priv->ta_child_up;
+ local->ta_failed_subvol = AFR_CHILD_UNKNOWN;
+ local->read_txn_query_child = AFR_CHILD_UNKNOWN;
+ local->ta_event_gen = priv->ta_event_gen;
+ local->fop_state = TA_SUCCESS;
+ }
+ local->is_new_entry = _gf_false;
INIT_LIST_HEAD(&local->healer);
return 0;
@@ -5611,11 +6556,6 @@ afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count)
{
int ret = -ENOMEM;
- lk->locked_nodes = GF_CALLOC(sizeof(*lk->locked_nodes), child_count,
- gf_afr_mt_char);
- if (NULL == lk->locked_nodes)
- goto out;
-
lk->lower_locked_nodes = GF_CALLOC(sizeof(*lk->lower_locked_nodes),
child_count, gf_afr_mt_char);
if (NULL == lk->lower_locked_nodes)
@@ -5673,6 +6613,10 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)
afr_private_t *priv = NULL;
priv = this->private;
+ INIT_LIST_HEAD(&local->transaction.wait_list);
+ INIT_LIST_HEAD(&local->transaction.owner_list);
+ INIT_LIST_HEAD(&local->ta_waitq);
+ INIT_LIST_HEAD(&local->ta_onwireq);
ret = afr_internal_lock_init(&local->internal_lock, priv->child_count);
if (ret < 0)
goto out;
@@ -5710,8 +6654,6 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)
goto out;
ret = 0;
- INIT_LIST_HEAD(&local->transaction.wait_list);
- INIT_LIST_HEAD(&local->transaction.owner_list);
out:
return ret;
}
@@ -5730,6 +6672,8 @@ afr_priv_destroy(afr_private_t *priv)
if (!priv)
goto out;
+
+ GF_FREE(priv->sh_domain);
GF_FREE(priv->last_event);
child_count = priv->child_count;
@@ -5745,7 +6689,9 @@ afr_priv_destroy(afr_private_t *priv)
GF_FREE(priv->local);
GF_FREE(priv->pending_key);
GF_FREE(priv->children);
+ GF_FREE(priv->anon_inode);
GF_FREE(priv->child_up);
+ GF_FREE(priv->halo_child_up);
GF_FREE(priv->child_latency);
LOCK_DESTROY(&priv->lock);
@@ -5796,283 +6742,248 @@ out:
return changelog;
}
-gf_boolean_t
-afr_decide_heal_info(afr_private_t *priv, unsigned char *sources, int source)
+static dict_t *
+afr_set_heal_info(char *status)
{
- int sources_count = 0;
+ dict_t *dict = NULL;
+ int ret = -1;
- if (source < 0)
+ dict = dict_new();
+ if (!dict) {
+ ret = -ENOMEM;
goto out;
+ }
- sources_count = AFR_COUNT(sources, priv->child_count);
- if (sources_count == priv->child_count)
- return _gf_false;
+ ret = dict_set_dynstr_sizen(dict, "heal-info", status);
+ if (ret)
+ gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set heal-info key to "
+ "%s",
+ status);
out:
- return _gf_true;
+ /* Any error other than EINVAL, dict_set_dynstr frees status */
+ if (ret == -ENOMEM || ret == -EINVAL) {
+ GF_FREE(status);
+ }
+
+ if (ret && dict) {
+ dict_unref(dict);
+ dict = NULL;
+ }
+ return dict;
}
-int
-afr_selfheal_locked_metadata_inspect(call_frame_t *frame, xlator_t *this,
- inode_t *inode, gf_boolean_t *msh,
- unsigned char *pending)
+static gf_boolean_t
+afr_is_dirty_count_non_unary_for_txn(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type)
{
- int ret = -1;
- unsigned char *locked_on = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- unsigned char *undid_pending = NULL;
- struct afr_reply *locked_replies = NULL;
-
afr_private_t *priv = this->private;
+ int *dirty = alloca0(priv->child_count * sizeof(int));
+ int i = 0;
- locked_on = alloca0(priv->child_count);
- sources = alloca0(priv->child_count);
- sinks = alloca0(priv->child_count);
- healed_sinks = alloca0(priv->child_count);
- undid_pending = alloca0(priv->child_count);
-
- locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
-
- ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
- locked_on);
- {
- if (ret == 0) {
- /* Not a single lock */
- ret = -afr_final_errno(frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN; /* all invalid responses */
- goto out;
- }
- ret = __afr_selfheal_metadata_prepare(
- frame, this, inode, locked_on, sources, sinks, healed_sinks,
- undid_pending, locked_replies, pending);
- *msh = afr_decide_heal_info(priv, sources, ret);
+ afr_selfheal_extract_xattr(this, replies, type, dirty, NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (dirty[i] > 1)
+ return _gf_true;
}
- afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
- locked_on);
-out:
- if (locked_replies)
- afr_replies_wipe(locked_replies, priv->child_count);
- return ret;
+
+ return _gf_false;
}
-int
-afr_selfheal_locked_data_inspect(call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_boolean_t *dsh, unsigned char *pflag)
+static gf_boolean_t
+afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies,
+ ia_type_t ia_type)
{
- int ret = -1;
- unsigned char *data_lock = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- unsigned char *undid_pending = NULL;
- afr_private_t *priv = NULL;
- struct afr_reply *locked_replies = NULL;
- inode_t *inode = fd->inode;
-
- priv = this->private;
- data_lock = alloca0(priv->child_count);
- sources = alloca0(priv->child_count);
- sinks = alloca0(priv->child_count);
- healed_sinks = alloca0(priv->child_count);
- undid_pending = alloca0(priv->child_count);
+ gf_boolean_t data_chk = _gf_false;
+ gf_boolean_t mdata_chk = _gf_false;
+ gf_boolean_t entry_chk = _gf_false;
- locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+ switch (ia_type) {
+ case IA_IFDIR:
+ mdata_chk = _gf_true;
+ entry_chk = _gf_true;
+ break;
+ case IA_IFREG:
+ mdata_chk = _gf_true;
+ data_chk = _gf_true;
+ break;
+ default:
+ /*IA_IFBLK, IA_IFCHR, IA_IFLNK, IA_IFIFO, IA_IFSOCK*/
+ mdata_chk = _gf_true;
+ break;
+ }
- ret = afr_selfheal_inodelk(frame, this, inode, this->name, 0, 0, data_lock);
- {
- if (ret == 0) {
- ret = -afr_final_errno(frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN; /* all invalid responses */
- goto out;
- }
- ret = __afr_selfheal_data_prepare(frame, this, inode, data_lock,
- sources, sinks, healed_sinks,
- undid_pending, locked_replies, pflag);
- *dsh = afr_decide_heal_info(priv, sources, ret);
+ if (data_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_DATA_TRANSACTION)) {
+ return _gf_true;
+ } else if (mdata_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_METADATA_TRANSACTION)) {
+ return _gf_true;
+ } else if (entry_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_ENTRY_TRANSACTION)) {
+ return _gf_true;
}
- afr_selfheal_uninodelk(frame, this, inode, this->name, 0, 0, data_lock);
-out:
- if (locked_replies)
- afr_replies_wipe(locked_replies, priv->child_count);
- return ret;
+
+ return _gf_false;
}
-int
-afr_selfheal_locked_entry_inspect(call_frame_t *frame, xlator_t *this,
- inode_t *inode, gf_boolean_t *esh,
- unsigned char *pflag)
+static int
+afr_update_heal_status(xlator_t *this, struct afr_reply *replies,
+ ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh,
+ gf_boolean_t *msh, unsigned char pending)
{
int ret = -1;
- int source = -1;
+ GF_UNUSED int ret1 = 0;
+ int i = 0;
+ int io_domain_lk_count = 0;
+ int shd_domain_lk_count = 0;
afr_private_t *priv = NULL;
- unsigned char *locked_on = NULL;
- unsigned char *data_lock = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- struct afr_reply *locked_replies = NULL;
- gf_boolean_t granular_locks = _gf_false;
+ char *key1 = NULL;
+ char *key2 = NULL;
priv = this->private;
- if (strcmp("granular", priv->locking_scheme) == 0)
- granular_locks = _gf_true;
- locked_on = alloca0(priv->child_count);
- data_lock = alloca0(priv->child_count);
- sources = alloca0(priv->child_count);
- sinks = alloca0(priv->child_count);
- healed_sinks = alloca0(priv->child_count);
-
- locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
- if (!granular_locks) {
- ret = afr_selfheal_tryentrylk(frame, this, inode, priv->sh_domain, NULL,
- locked_on);
- }
- {
- if (!granular_locks && ret == 0) {
- ret = -afr_final_errno(frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN; /* all invalid responses */
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if ((replies[i].valid != 1) || (replies[i].op_ret != 0))
+ continue;
+ if (!io_domain_lk_count) {
+ ret1 = dict_get_int32(replies[i].xdata, key1, &io_domain_lk_count);
}
+ if (!shd_domain_lk_count) {
+ ret1 = dict_get_int32(replies[i].xdata, key2, &shd_domain_lk_count);
+ }
+ }
- ret = afr_selfheal_entrylk(frame, this, inode, this->name, NULL,
- data_lock);
- {
- if (ret == 0) {
- ret = -afr_final_errno(frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN;
- /* all invalid responses */
- goto unlock;
- }
- ret = __afr_selfheal_entry_prepare(frame, this, inode, data_lock,
- sources, sinks, healed_sinks,
- locked_replies, &source, pflag);
- if ((ret == 0) && (*pflag & PFLAG_SBRAIN))
- ret = -EIO;
- *esh = afr_decide_heal_info(priv, sources, ret);
+ if (!pending) {
+ if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) ||
+ (!io_domain_lk_count)) {
+ /* Needs heal. */
+ ret = 0;
+ } else {
+ /* No heal needed. */
+ *dsh = *esh = *msh = 0;
+ }
+ } else {
+ if (shd_domain_lk_count) {
+ ret = -EAGAIN; /*For 'possibly-healing'. */
+ } else {
+ ret = 0; /*needs heal. Just set a non -ve value so that it is
+ assumed as the source index.*/
}
- afr_selfheal_unentrylk(frame, this, inode, this->name, NULL, data_lock,
- NULL);
}
-unlock:
- if (!granular_locks)
- afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL,
- locked_on, NULL);
-out:
- if (locked_replies)
- afr_replies_wipe(locked_replies, priv->child_count);
return ret;
}
+/*return EIO, EAGAIN or pending*/
int
-afr_selfheal_locked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
- inode_t **inode, gf_boolean_t *entry_selfheal,
- gf_boolean_t *data_selfheal,
- gf_boolean_t *metadata_selfheal,
- unsigned char *pending)
-
+afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **inode, gf_boolean_t *entry_selfheal,
+ gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal, unsigned char *pending)
{
int ret = -1;
- fd_t *fd = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
gf_boolean_t dsh = _gf_false;
gf_boolean_t msh = _gf_false;
gf_boolean_t esh = _gf_false;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *valid_on = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+ sources = alloca0(sizeof(*sources) * priv->child_count);
+ sinks = alloca0(sizeof(*sinks) * priv->child_count);
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ valid_on = alloca0(sizeof(*valid_on) * priv->child_count);
ret = afr_selfheal_unlocked_inspect(frame, this, gfid, inode, &dsh, &msh,
- &esh);
+ &esh, replies);
if (ret)
goto out;
-
- /* For every heal type hold locks and check if it indeed needs heal */
-
- /* Heal-info does an open() on the file being examined so that the
- * current eager-lock holding client, if present, at some point sees
- * open-fd count being > 1 and releases the eager-lock so that heal-info
- * doesn't remain blocked forever until IO completes.
- */
- if ((*inode)->ia_type == IA_IFREG) {
- ret = afr_selfheal_data_open(this, *inode, &fd);
- if (ret < 0) {
- gf_msg_debug(this->name, -ret, "%s: Failed to open",
- uuid_utoa((*inode)->gfid));
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ valid_on[i] = 1;
}
}
-
if (msh) {
- ret = afr_selfheal_locked_metadata_inspect(frame, this, *inode, &msh,
- pending);
- if (ret == -EIO)
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_METADATA_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
goto out;
}
-
if (dsh) {
- ret = afr_selfheal_locked_data_inspect(frame, this, fd, &dsh, pending);
- if (ret == -EIO || (ret == -EAGAIN))
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_DATA_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
goto out;
}
-
if (esh) {
- ret = afr_selfheal_locked_entry_inspect(frame, this, *inode, &esh,
- pending);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
}
+ ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh,
+ &msh, *pending);
out:
*data_selfheal = dsh;
*entry_selfheal = esh;
*metadata_selfheal = msh;
- if (fd)
- fd_unref(fd);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
return ret;
}
-dict_t *
-afr_set_heal_info(char *status)
-{
- dict_t *dict = NULL;
- int ret = -1;
-
- dict = dict_new();
- if (!dict) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = dict_set_str(dict, "heal-info", status);
- if (ret)
- gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
- "Failed to set heal-info key to "
- "%s",
- status);
-out:
- return dict;
-}
-
int
afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
gf_boolean_t data_selfheal = _gf_false;
gf_boolean_t metadata_selfheal = _gf_false;
gf_boolean_t entry_selfheal = _gf_false;
- unsigned char pending = _gf_false;
+ unsigned char pending = 0;
dict_t *dict = NULL;
int ret = -1;
- int op_errno = 0;
+ int op_errno = ENOMEM;
inode_t *inode = NULL;
char *substr = NULL;
char *status = NULL;
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *heal_local = NULL;
- ret = afr_selfheal_locked_inspect(frame, this, loc->gfid, &inode,
- &entry_selfheal, &data_selfheal,
- &metadata_selfheal, &pending);
+ /*Use frame with lk-owner set*/
+ heal_frame = afr_frame_create(frame->this, &op_errno);
+ if (!heal_frame) {
+ ret = -1;
+ goto out;
+ }
+ heal_local = heal_frame->local;
+ heal_frame->local = frame->local;
+
+ ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode,
+ &entry_selfheal, &data_selfheal,
+ &metadata_selfheal, &pending);
if (ret == -ENOMEM) {
- op_errno = -ret;
ret = -1;
goto out;
}
@@ -6085,26 +6996,50 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
if (ret == -EIO) {
ret = gf_asprintf(&status, "split-brain%s", substr ? substr : "");
- if (ret < 0)
+ if (ret < 0) {
goto out;
+ }
dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
} else if (ret == -EAGAIN) {
ret = gf_asprintf(&status, "possibly-healing%s", substr ? substr : "");
- if (ret < 0)
+ if (ret < 0) {
goto out;
+ }
dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
} else if (ret >= 0) {
/* value of ret = source index
* so ret >= 0 and at least one of the 3 booleans set to
* true means a source is identified; heal is required.
*/
if (!data_selfheal && !entry_selfheal && !metadata_selfheal) {
- dict = afr_set_heal_info("no-heal");
+ status = gf_strdup("no-heal");
+ if (!status) {
+ ret = -1;
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
} else {
ret = gf_asprintf(&status, "heal%s", substr ? substr : "");
- if (ret < 0)
+ if (ret < 0) {
goto out;
+ }
dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
}
} else if (ret < 0) {
/* Apart from above checked -ve ret values, there are
@@ -6116,14 +7051,25 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
*/
if (data_selfheal || entry_selfheal || metadata_selfheal) {
ret = gf_asprintf(&status, "heal%s", substr ? substr : "");
- if (ret < 0)
+ if (ret < 0) {
goto out;
+ }
dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
}
}
+
ret = 0;
+ op_errno = 0;
out:
+ if (heal_frame) {
+ heal_frame->local = heal_local;
+ AFR_STACK_DESTROY(heal_frame);
+ }
AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
if (dict)
dict_unref(dict);
@@ -6245,10 +7191,10 @@ afr_get_split_brain_status(void *opaque)
}
/* Calculation for string length :
- * (child_count X length of child-name) + SLEN (" Choices :")
+ * (child_count X length of child-name) + SLEN(" Choices :")
* child-name consists of :
* a) 251 = max characters for volname according to GD_VOLUME_NAME_MAX
- * b) strlen ("-client-00,") assuming 16 replicas
+ * b) strlen("-client-00,") assuming 16 replicas
*/
choices = alloca0(priv->child_count * (256 + SLEN("-client-00,")) +
SLEN(" Choices:"));
@@ -6257,8 +7203,8 @@ afr_get_split_brain_status(void *opaque)
if (ret) {
op_errno = -ret;
if (ret == -EAGAIN) {
- ret = dict_set_str(dict, GF_AFR_SBRAIN_STATUS,
- SBRAIN_HEAL_NO_GO_MSG);
+ ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS,
+ SBRAIN_HEAL_NO_GO_MSG);
if (ret) {
gf_msg(this->name, GF_LOG_WARNING, -ret,
AFR_MSG_DICT_SET_FAILED,
@@ -6287,16 +7233,15 @@ afr_get_split_brain_status(void *opaque)
op_errno = ENOMEM;
goto out;
}
- ret = dict_set_dynstr(dict, GF_AFR_SBRAIN_STATUS, status);
+ ret = dict_set_dynstr_sizen(dict, GF_AFR_SBRAIN_STATUS, status);
if (ret) {
op_errno = -ret;
ret = -1;
goto out;
}
} else {
- ret = dict_set_str(dict, GF_AFR_SBRAIN_STATUS,
- "The file is not under data or"
- " metadata split-brain");
+ ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS,
+ SFILE_NOT_UNDER_DATA);
if (ret) {
op_errno = -ret;
ret = -1;
@@ -6321,6 +7266,8 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
int op_errno = 0;
dict_t *dict = NULL;
afr_local_t *local = NULL;
+ afr_local_t *heal_local = NULL;
+ call_frame_t *heal_frame = NULL;
local = frame->local;
dict = dict_new();
@@ -6330,10 +7277,20 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
goto out;
}
- ret = afr_selfheal_do(frame, this, loc->gfid);
+ heal_frame = afr_frame_create(this, &op_errno);
+ if (!heal_frame) {
+ ret = -1;
+ goto out;
+ }
+ heal_local = heal_frame->local;
+ heal_frame->local = frame->local;
+ /*Initiate heal with heal_frame with lk-owner set so that inodelk/entrylk
+ * work correctly*/
+ ret = afr_selfheal_do(heal_frame, this, loc->gfid);
if (ret == 1 || ret == 2) {
- ret = dict_set_str(dict, "sh-fail-msg", "File not in split-brain");
+ ret = dict_set_sizen_str_sizen(dict, "sh-fail-msg",
+ SFILE_NOT_IN_SPLIT_BRAIN);
if (ret)
gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
"Failed to set sh-fail-msg in dict");
@@ -6351,6 +7308,10 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
}
out:
+ if (heal_frame) {
+ heal_frame->local = heal_local;
+ AFR_STACK_DESTROY(heal_frame);
+ }
if (local->op == GF_FOP_GETXATTR)
AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
else if (local->op == GF_FOP_SETXATTR)
@@ -6493,7 +7454,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0,
locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS)
+ if (ret < priv->child_count)
goto data_unlock;
ret = __afr_selfheal_data_prepare(
heal_frame, this, inode, locked_on, sources, sinks,
@@ -6510,7 +7471,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque)
ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name,
LLONG_MAX - 1, 0, locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS)
+ if (ret < priv->child_count)
goto mdata_unlock;
ret = __afr_selfheal_metadata_prepare(
heal_frame, this, inode, locked_on, sources, sinks,
@@ -6537,12 +7498,14 @@ afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
char *xattr = NULL;
int i = 0;
int len = 0;
+ int keylen = 0;
size_t str_len = 0;
int ret = -1;
priv = this->private;
local = frame->local;
+ keylen = strlen(local->cont.getxattr.name);
for (i = 0; i < priv->child_count; i++) {
if (!local->replies[i].valid || local->replies[i].op_ret) {
str_len = strlen(default_str);
@@ -6551,8 +7514,8 @@ afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
buf[len++] = delimiter;
buf[len] = '\0';
} else {
- ret = dict_get_str(local->replies[i].xattr,
- local->cont.getxattr.name, &xattr);
+ ret = dict_get_strn(local->replies[i].xattr,
+ local->cont.getxattr.name, keylen, &xattr);
if (ret) {
gf_msg("TEST", GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
"Failed to get the node_uuid of brick "
@@ -6700,9 +7663,6 @@ afr_ta_is_fop_called_from_synctask(xlator_t *this)
int
afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
{
- /*Note: At any given time, only one instance of this function must
- * be in progress.*/
-
int ret = 0;
uuid_t gfid = {
0,
@@ -6717,6 +7677,11 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
};
int32_t cmd = 0;
+ /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock
+ * has been released in afr_notify due to upcall notification from shd.
+ */
+ GF_ASSERT(priv->ta_notify_dom_lock_offset == 0);
+
if (!priv->shd.iamshd)
GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
flock1.l_type = F_WRLCK;
@@ -6728,14 +7693,10 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
flock1.l_len = 0;
} else {
cmd = F_SETLK;
- if (priv->ta_notify_dom_lock_offset) {
- flock1.l_start = priv->ta_notify_dom_lock_offset;
- } else {
- gf_uuid_generate(gfid);
- flock1.l_start = gfid_to_ino(gfid);
- if (flock1.l_start < 0)
- flock1.l_start = -flock1.l_start;
- }
+ gf_uuid_generate(gfid);
+ flock1.l_start = gfid_to_ino(gfid);
+ if (flock1.l_start < 0)
+ flock1.l_start = -flock1.l_start;
flock1.l_len = 1;
}
ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
@@ -6761,7 +7722,7 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Failed to get AFR_TA_DOM_MODIFY lock.");
+ "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name);
flock1.l_type = F_UNLCK;
ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL,
@@ -6826,3 +7787,92 @@ afr_ta_frame_create(xlator_t *this)
afr_set_lk_owner(frame, this, lk_owner);
return frame;
}
+
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local)
+{
+ int data_count = 0;
+
+ data_count = AFR_COUNT(local->child_up, priv->child_count);
+ if (data_count == 2) {
+ return _gf_true;
+ } else if (data_count == 1 && local->ta_child_up) {
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+static gf_boolean_t
+afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+
+ if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT)
+ return _gf_false;
+
+ local = frame->local;
+
+ if (local->op != GF_FOP_LOOKUP)
+ /* TODO:If the replica count is being increased on a plain distribute
+ * volume that was never mounted, we need to allow setxattr on '/' with
+ * GF_CLIENT_PID_NO_ROOT_SQUASH to accomodate for DHT layout setting */
+ return _gf_false;
+
+ if (local->inode == NULL)
+ return _gf_false;
+
+ if (!__is_root_gfid(local->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
+}
+
+gf_boolean_t
+afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count)
+{
+ if (frame && (up_children_count > 0) &&
+ afr_is_add_replica_mount_lookup_on_root(frame))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+void
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ unsigned char *success_replies = NULL;
+
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+
+ if (priv->quorum_count && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_errno = afr_final_errno(local, priv);
+ if (!local->op_errno)
+ local->op_errno = afr_quorum_errno(priv);
+ local->op_ret = -1;
+ }
+}
+
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child)
+{
+ int *pending = NULL;
+ int ret = 0;
+ int i = 0;
+
+ ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending);
+ if (ret == 0) {
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ /* Not doing a ntoh32(pending) as we just want to check
+ * if it is non-zero or not. */
+ if (pending[i]) {
+ return _gf_true;
+ }
+ }
+ }
+
+ return _gf_false;
+}
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 4c40e85f393..f8bf8340dab 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -10,24 +10,17 @@
#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
-#include "glusterfs.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "checksum.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/list.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
#include "afr.h"
#include "afr-transaction.h"
@@ -45,6 +38,10 @@ afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
fd_ctx = local->fd_ctx;
child_index = (long)cookie;
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
LOCK(&frame->lock);
{
if (op_ret == -1) {
@@ -56,19 +53,22 @@ afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (!local->xdata_rsp && xdata)
local->xdata_rsp = dict_ref(xdata);
}
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
- if (call_count == 0)
+ if (call_count == 0) {
+ afr_handle_replies_quorum(frame, this);
AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno,
local->fd, NULL);
+ }
+
return 0;
}
int
-afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -84,6 +84,12 @@ afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
goto out;
local->op = GF_FOP_OPENDIR;
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ op_errno = afr_quorum_errno(priv);
+ goto out;
+ }
+
if (!afr_is_consistent_io_possible(local, priv, &op_errno))
goto out;
@@ -158,8 +164,8 @@ afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol)
}
static void
-afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol,
- gf_dirent_t *entries, fd_t *fd)
+afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries,
+ int subvol, gf_dirent_t *entries, fd_t *fd)
{
int ret = -1;
gf_dirent_t *entry = NULL;
@@ -177,8 +183,8 @@ afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol,
list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list)
{
- if (__is_root_gfid(fd->inode->gfid) &&
- !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) {
+ if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name,
+ frame->root->pid)) {
continue;
}
@@ -222,8 +228,8 @@ afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
if (op_ret >= 0)
- afr_readdir_transform_entries(subvol_entries, (long)cookie, &entries,
- local->fd);
+ afr_readdir_transform_entries(frame, subvol_entries, (long)cookie,
+ &entries, local->fd);
AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata);
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index 5725b1c5cb3..b7cceb79158 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -10,24 +10,20 @@
#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
#include "afr.h"
#include "afr-transaction.h"
@@ -98,7 +94,9 @@ __afr_dir_write_finalize(call_frame_t *frame, xlator_t *this)
}
if (local->inode) {
- afr_replies_interpret(frame, this, local->inode, NULL);
+ if (local->op != GF_FOP_RENAME && local->op != GF_FOP_LINK)
+ afr_replies_interpret(frame, this, local->inode, NULL);
+
inode_read_subvol = afr_data_subvol_get(local->inode, this, NULL, NULL,
NULL, &args);
}
@@ -121,11 +119,11 @@ __afr_dir_write_finalize(call_frame_t *frame, xlator_t *this)
continue;
if (local->replies[i].op_ret < 0) {
if (local->inode)
- afr_inode_event_gen_reset(local->inode, this);
+ afr_inode_need_refresh_set(local->inode, this);
if (local->parent)
- afr_inode_event_gen_reset(local->parent, this);
+ afr_inode_need_refresh_set(local->parent, this);
if (local->parent2)
- afr_inode_event_gen_reset(local->parent2, this);
+ afr_inode_need_refresh_set(local->parent2, this);
continue;
}
@@ -231,9 +229,9 @@ __afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
__afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf,
preparent, postparent, preparent2, postparent2,
xdata);
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
if (call_count == 0) {
__afr_dir_write_finalize(frame, this);
@@ -347,6 +345,7 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this)
afr_private_t *priv = NULL;
int pre_op_count = 0;
int failed_count = 0;
+ unsigned char *success_replies = NULL;
local = frame->local;
priv = this->private;
@@ -362,9 +361,22 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this)
failed_count = AFR_COUNT(local->transaction.failed_subvols,
priv->child_count);
+ /* FOP succeeded on all bricks. */
if (pre_op_count == priv->child_count && !failed_count)
return;
+ /* FOP did not suceed on quorum no. of bricks. */
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+ if (!afr_has_quorum(success_replies, this, NULL))
+ return;
+
+ if (priv->thin_arbiter_count) {
+ /*Mark new entry using ta file*/
+ local->is_new_entry = _gf_true;
+ return;
+ }
+
afr_mark_new_entry_changelog(frame, this);
return;
@@ -423,15 +435,11 @@ int
afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
int op_errno = ENOMEM;
- priv = this->private;
-
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -473,16 +481,6 @@ afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME(loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee(&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename, priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -553,15 +551,11 @@ int
afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
dev_t dev, mode_t umask, dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
int op_errno = ENOMEM;
- priv = this->private;
-
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -596,16 +590,6 @@ afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME(loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee(&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename, priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -674,15 +658,11 @@ int
afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
mode_t umask, dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
int op_errno = ENOMEM;
- priv = this->private;
-
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -698,7 +678,7 @@ afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
local->cont.mkdir.mode = mode;
local->umask = umask;
- if (!xdata || !dict_get(xdata, "gfid-req")) {
+ if (!xdata || !dict_get_sizen(xdata, "gfid-req")) {
op_errno = EPERM;
gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno,
AFR_MSG_GFID_NULL,
@@ -724,16 +704,6 @@ afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME(loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee(&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename, priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -802,15 +772,11 @@ int
afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
int op_errno = ENOMEM;
- priv = this->private;
-
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -845,16 +811,6 @@ afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME(newloc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee(&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename, priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -924,15 +880,11 @@ int
afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
loc_t *loc, mode_t umask, dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
int op_errno = ENOMEM;
- priv = this->private;
-
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -966,16 +918,6 @@ afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME(loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee(&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename, priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -1048,15 +990,10 @@ int
afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
int op_errno = ENOMEM;
- int nlockee = 0;
-
- priv = this->private;
transaction_frame = copy_frame(frame);
if (!transaction_frame) {
@@ -1099,35 +1036,6 @@ afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME(oldloc->path);
local->transaction.new_basename = AFR_BASENAME(newloc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = nlockee = 0;
- ret = afr_init_entry_lockee(
- &int_lock->lockee[nlockee], local, &local->transaction.new_parent_loc,
- local->transaction.new_basename, priv->child_count);
- if (ret)
- goto out;
-
- nlockee++;
- ret = afr_init_entry_lockee(&int_lock->lockee[nlockee], local,
- &local->transaction.parent_loc,
- local->transaction.basename, priv->child_count);
- if (ret)
- goto out;
-
- nlockee++;
- if (local->newloc.inode && IA_ISDIR(local->newloc.inode->ia_type)) {
- ret = afr_init_entry_lockee(&int_lock->lockee[nlockee], local,
- &local->newloc, NULL, priv->child_count);
- if (ret)
- goto out;
-
- nlockee++;
- }
- qsort(int_lock->lockee, nlockee, sizeof(*int_lock->lockee),
- afr_entry_lockee_cmp);
- int_lock->lockee_count = nlockee;
-
ret = afr_transaction(transaction_frame, this,
AFR_ENTRY_RENAME_TRANSACTION);
if (ret < 0) {
@@ -1196,15 +1104,11 @@ int
afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
int op_errno = ENOMEM;
- priv = this->private;
-
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -1237,16 +1141,6 @@ afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME(loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee(&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename, priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
@@ -1313,15 +1207,10 @@ int
afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
int op_errno = ENOMEM;
- int nlockee = 0;
-
- priv = this->private;
transaction_frame = copy_frame(frame);
if (!transaction_frame)
@@ -1355,26 +1244,6 @@ afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME(loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = nlockee = 0;
- ret = afr_init_entry_lockee(&int_lock->lockee[nlockee], local,
- &local->transaction.parent_loc,
- local->transaction.basename, priv->child_count);
- if (ret)
- goto out;
-
- nlockee++;
- ret = afr_init_entry_lockee(&int_lock->lockee[nlockee], local, &local->loc,
- NULL, priv->child_count);
- if (ret)
- goto out;
-
- nlockee++;
- qsort(int_lock->lockee, nlockee, sizeof(*int_lock->lockee),
- afr_entry_lockee_cmp);
- int_lock->lockee_count = nlockee;
-
ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index 113e39acfe8..c5521704de2 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -15,20 +15,17 @@
#include <stdlib.h>
#include <signal.h>
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "list.h"
-#include "call-stub.h"
-#include "byte-order.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "quota-common-utils.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/quota-common-utils.h>
#include "afr-transaction.h"
#include "afr-messages.h"
@@ -77,7 +74,8 @@ afr_handle_quota_size(call_frame_t *frame, xlator_t *this)
continue;
if (!replies[i].xdata)
continue;
- ret = quota_dict_get_meta(replies[i].xdata, QUOTA_SIZE_KEY, &size);
+ ret = quota_dict_get_meta(replies[i].xdata, QUOTA_SIZE_KEY,
+ SLEN(QUOTA_SIZE_KEY), &size);
if (ret == -1)
continue;
if (read_subvol == -1)
@@ -304,6 +302,7 @@ afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
afr_local_t *local = NULL;
int op_errno = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -530,12 +529,16 @@ afr_fgetxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t callcnt = 0;
long int cky = 0;
int ret = 0;
+ int keylen = 0;
+ int children_keylen = 0;
priv = this->private;
children = priv->children;
local = frame->local;
cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ children_keylen = strlen(children[cky]->name);
LOCK(&frame->lock);
{
@@ -546,11 +549,12 @@ afr_fgetxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (!local->dict)
local->dict = dict_new();
if (local->dict) {
- ret = dict_get_str(dict, local->cont.getxattr.name, &tmp_report);
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen,
+ &tmp_report);
if (ret)
goto unlock;
- ret = dict_set_dynstr(local->dict, children[cky]->name,
- gf_strdup(tmp_report));
+ ret = dict_set_dynstrn(local->dict, children[cky]->name,
+ children_keylen, gf_strdup(tmp_report));
if (ret)
goto unlock;
}
@@ -574,8 +578,8 @@ unlock:
}
if (serz_len == -1)
snprintf(lk_summary, sizeof(lk_summary), "No locks cleared.");
- ret = dict_set_dynstr(xattr, local->cont.getxattr.name,
- gf_strdup(lk_summary));
+ ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen,
+ gf_strdup(lk_summary));
if (ret) {
op_ret = -1;
op_errno = ENOMEM;
@@ -612,6 +616,8 @@ afr_getxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t callcnt = 0;
long int cky = 0;
int ret = 0;
+ int keylen = 0;
+ int children_keylen = 0;
priv = this->private;
children = priv->children;
@@ -619,6 +625,9 @@ afr_getxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ children_keylen = strlen(children[cky]->name);
+
LOCK(&frame->lock);
{
callcnt = --local->call_count;
@@ -628,11 +637,12 @@ afr_getxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (!local->dict)
local->dict = dict_new();
if (local->dict) {
- ret = dict_get_str(dict, local->cont.getxattr.name, &tmp_report);
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen,
+ &tmp_report);
if (ret)
goto unlock;
- ret = dict_set_dynstr(local->dict, children[cky]->name,
- gf_strdup(tmp_report));
+ ret = dict_set_dynstrn(local->dict, children[cky]->name,
+ children_keylen, gf_strdup(tmp_report));
if (ret)
goto unlock;
}
@@ -656,8 +666,8 @@ unlock:
}
if (serz_len == -1)
snprintf(lk_summary, sizeof(lk_summary), "No locks cleared.");
- ret = dict_set_dynstr(xattr, local->cont.getxattr.name,
- gf_strdup(lk_summary));
+ ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen,
+ gf_strdup(lk_summary));
if (ret) {
op_ret = -1;
op_errno = ENOMEM;
@@ -801,15 +811,18 @@ unlock:
if (ret) {
local->op_ret = -1;
local->op_errno = ENOMEM;
+ GF_FREE(xattr_serz);
goto unwind;
}
- ret = dict_set_dynstr(local->dict, GF_XATTR_LIST_NODE_UUIDS_KEY,
- xattr_serz);
+ ret = dict_set_dynstr_sizen(local->dict, GF_XATTR_LIST_NODE_UUIDS_KEY,
+ xattr_serz);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
"Cannot set node_uuid key in dict");
local->op_ret = -1;
local->op_errno = ENOMEM;
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
} else {
local->op_ret = local->cont.getxattr.xattr_len - 1;
local->op_errno = 0;
@@ -933,24 +946,13 @@ unlock:
goto unwind;
}
- len = dict_serialized_length(local->dict);
- if (len <= 0) {
- goto unwind;
- }
-
- lockinfo_buf = GF_CALLOC(1, len, gf_common_mt_char);
- if (!lockinfo_buf) {
+ op_ret = dict_allocate_and_serialize(
+ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+ if (op_ret != 0) {
local->op_ret = -1;
- local->op_errno = ENOMEM;
goto unwind;
}
- op_ret = dict_serialize(local->dict, lockinfo_buf);
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = -op_ret;
- }
-
op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
(void *)lockinfo_buf, len);
if (op_ret < 0) {
@@ -1049,24 +1051,13 @@ unlock:
goto unwind;
}
- len = dict_serialized_length(local->dict);
- if (len <= 0) {
- goto unwind;
- }
-
- lockinfo_buf = GF_CALLOC(1, len, gf_common_mt_char);
- if (!lockinfo_buf) {
+ op_ret = dict_allocate_and_serialize(
+ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+ if (op_ret != 0) {
local->op_ret = -1;
- local->op_errno = ENOMEM;
goto unwind;
}
- op_ret = dict_serialize(local->dict, lockinfo_buf);
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = -op_ret;
- }
-
op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
(void *)lockinfo_buf, len);
if (op_ret < 0) {
@@ -1095,9 +1086,11 @@ afr_fgetxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int ret = 0;
char *xattr = NULL;
char *xattr_serz = NULL;
+ int keylen = 0;
char xattr_cky[1024] = {
0,
};
+ int xattr_cky_len = 0;
dict_t *nxattr = NULL;
long cky = 0;
int32_t padding = 0;
@@ -1110,7 +1103,9 @@ afr_fgetxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
cky = (long)cookie;
-
+ keylen = strlen(local->cont.getxattr.name);
+ xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
+ local->cont.getxattr.name, cky);
LOCK(&frame->lock);
{
callcnt = --local->call_count;
@@ -1126,31 +1121,30 @@ afr_fgetxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (!dict || (op_ret < 0))
goto unlock;
- if (!local->dict)
+ if (!local->dict) {
local->dict = dict_new();
-
- if (local->dict) {
- ret = dict_get_str(dict, local->cont.getxattr.name, &xattr);
- if (ret)
+ if (!local->dict)
goto unlock;
+ }
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr);
+ if (ret)
+ goto unlock;
- xattr = gf_strdup(xattr);
-
- (void)snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
- local->cont.getxattr.name, cky);
- ret = dict_set_dynstr(local->dict, xattr_cky, xattr);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
- "Cannot set xattr cookie key");
- goto unlock;
- }
+ xattr = gf_strdup(xattr);
- local->cont.getxattr.xattr_len += strlen(xattr) + 1;
+ ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr);
+ if (ret) {
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set xattr cookie key");
+ goto post_unlock;
}
+
+ local->cont.getxattr.xattr_len += strlen(xattr) + 1;
}
unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
if (!callcnt) {
if (!local->cont.getxattr.xattr_len)
goto unwind;
@@ -1177,6 +1171,7 @@ unlock:
ret = dict_serialize_value_with_delim(
local->dict, xattr_serz + xattr_serz_len, &tlen, ' ');
if (ret) {
+ GF_FREE(xattr_serz);
goto unwind;
}
@@ -1184,10 +1179,14 @@ unlock:
*(xattr_serz + padding + tlen) = ')';
*(xattr_serz + padding + tlen + 1) = '\0';
- ret = dict_set_dynstr(nxattr, local->cont.getxattr.name, xattr_serz);
- if (ret)
+ ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen,
+ xattr_serz);
+ if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
"Cannot set pathinfo key in dict");
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
+ }
unwind:
AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno,
@@ -1214,6 +1213,8 @@ afr_getxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
char xattr_cky[1024] = {
0,
};
+ int keylen = 0;
+ int xattr_cky_len = 0;
dict_t *nxattr = NULL;
long cky = 0;
int32_t padding = 0;
@@ -1226,7 +1227,9 @@ afr_getxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
cky = (long)cookie;
-
+ keylen = strlen(local->cont.getxattr.name);
+ xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
+ local->cont.getxattr.name, cky);
LOCK(&frame->lock);
{
callcnt = --local->call_count;
@@ -1242,32 +1245,30 @@ afr_getxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (!dict || (op_ret < 0))
goto unlock;
- if (!local->dict)
+ if (!local->dict) {
local->dict = dict_new();
-
- if (local->dict) {
- ret = dict_get_str(dict, local->cont.getxattr.name, &xattr);
- if (ret)
+ if (!local->dict)
goto unlock;
+ }
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr);
+ if (ret)
+ goto unlock;
- xattr = gf_strdup(xattr);
-
- (void)snprintf(xattr_cky, 1024, "%s-%ld", local->cont.getxattr.name,
- cky);
- ret = dict_set_dynstr(local->dict, xattr_cky, xattr);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
- "Cannot set xattr "
- "cookie key");
- goto unlock;
- }
+ xattr = gf_strdup(xattr);
- local->cont.getxattr.xattr_len += strlen(xattr) + 1;
+ ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr);
+ if (ret) {
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set xattr cookie key");
+ goto post_unlock;
}
+
+ local->cont.getxattr.xattr_len += strlen(xattr) + 1;
}
unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
if (!callcnt) {
if (!local->cont.getxattr.xattr_len)
goto unwind;
@@ -1294,6 +1295,7 @@ unlock:
ret = dict_serialize_value_with_delim(
local->dict, xattr_serz + xattr_serz_len, &tlen, ' ');
if (ret) {
+ GF_FREE(xattr_serz);
goto unwind;
}
@@ -1301,10 +1303,14 @@ unlock:
*(xattr_serz + padding + tlen) = ')';
*(xattr_serz + padding + tlen + 1) = '\0';
- ret = dict_set_dynstr(nxattr, local->cont.getxattr.name, xattr_serz);
- if (ret)
+ ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen,
+ xattr_serz);
+ if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
"Cannot set pathinfo key in dict");
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
+ }
unwind:
AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
@@ -1693,6 +1699,7 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
int32_t op_errno = 0;
fop_fgetxattr_cbk_t cbk = NULL;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -1786,6 +1793,7 @@ afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
afr_local_t *local = NULL;
int32_t op_errno = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -1861,6 +1869,7 @@ afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
afr_local_t *local = NULL;
int32_t op_errno = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h
index 1627ee2c426..8c982bc7e6f 100644
--- a/xlators/cluster/afr/src/afr-inode-read.h
+++ b/xlators/cluster/afr/src/afr-inode-read.h
@@ -38,5 +38,8 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
dict_t *xdata);
int
+afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata);
+int
afr_handle_quota_size(call_frame_t *frame, xlator_t *this);
#endif /* __INODE_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index df807342b99..1d6e4f3570a 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -8,27 +8,21 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
#include "protocol-common.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "afr-transaction.h"
#include "afr-self-heal.h"
#include "afr-messages.h"
@@ -180,11 +174,10 @@ __afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
{
__afr_inode_write_fill(frame, this, child_index, op_ret, op_errno,
prebuf, postbuf, xattr, xdata);
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
if (call_count == 0) {
__afr_inode_write_finalize(frame, this);
@@ -307,8 +300,8 @@ afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
local->update_open_fd_count = _gf_true;
}
- ret = dict_get_int32n(xdata, GLUSTERFS_INODELK_COUNT,
- SLEN(GLUSTERFS_INODELK_COUNT), &num_inodelks);
+ ret = dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT,
+ &num_inodelks);
if (ret < 0)
goto unlock;
if (num_inodelks > local->num_inodelks) {
@@ -498,6 +491,7 @@ afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
int op_errno = ENOMEM;
int ret = -1;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
local = AFR_FRAME_INIT(frame, op_errno);
if (!local)
goto out;
@@ -528,8 +522,8 @@ afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
goto out;
}
- if (dict_set_strn(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT,
- SLEN(GLUSTERFS_INODELK_DOM_COUNT), this->name)) {
+ if (dict_set_str_sizen(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT,
+ this->name)) {
op_errno = ENOMEM;
goto out;
}
@@ -737,6 +731,7 @@ afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -947,6 +942,7 @@ afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -1059,15 +1055,14 @@ afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie,
local->replies[i].op_ret = op_ret;
local->replies[i].op_errno = op_errno;
- ret = dict_get_str(local->xdata_req, "replicate-brick-op", &op_type);
+ ret = dict_get_str_sizen(local->xdata_req, "replicate-brick-op", &op_type);
if (ret)
goto out;
- gf_msg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO,
- op_ret ? op_errno : 0, afr_get_msg_id(op_type),
- "Set of pending xattr %s on"
- " %s.",
- op_ret ? "failed" : "succeeded", priv->children[i]->name);
+ gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO,
+ op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s",
+ priv->children[i]->name, "op_ret=%s",
+ op_ret ? "failed" : "succeeded", NULL);
out:
syncbarrier_wake(&local->barrier);
@@ -1105,10 +1100,10 @@ out:
return -ret;
}
-int
+static int
_afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc,
int empty_index, afr_transaction_type type,
- char *op_type)
+ char *op_type, const int op_type_len)
{
int count = 0;
int ret = -ENOMEM;
@@ -1139,7 +1134,8 @@ _afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc,
if (!local->xdata_req)
goto out;
- ret = dict_set_str(local->xdata_req, "replicate-brick-op", op_type);
+ ret = dict_set_nstrn(local->xdata_req, "replicate-brick-op",
+ SLEN("replicate-brick-op"), op_type, op_type_len);
if (ret)
goto out;
@@ -1160,9 +1156,8 @@ _afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc,
}
if (!count) {
- gf_msg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS,
- "Couldn't acquire lock on"
- " any child.");
+ gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS,
+ NULL);
ret = -EAGAIN;
goto unlock;
}
@@ -1211,26 +1206,41 @@ _afr_handle_empty_brick(void *opaque)
call_frame_t *frame = NULL;
xlator_t *this = NULL;
char *op_type = NULL;
+ int op_type_len = 0;
afr_empty_brick_args_t *data = NULL;
+ call_frame_t *op_frame = NULL;
data = opaque;
frame = data->frame;
empty_index = data->empty_index;
+ if (!data->op_type)
+ goto out;
+
+ op_frame = copy_frame(frame);
+ if (!op_frame) {
+ ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+
op_type = data->op_type;
- this = frame->this;
+ op_type_len = strlen(op_type);
+ this = op_frame->this;
priv = this->private;
- local = AFR_FRAME_INIT(frame, op_errno);
+ afr_set_lk_owner(op_frame, this, op_frame->root);
+ local = AFR_FRAME_INIT(op_frame, op_errno);
if (!local)
goto out;
loc_copy(&local->loc, &data->loc);
- gf_msg(this->name, GF_LOG_INFO, 0, 0, "New brick is : %s",
- priv->children[empty_index]->name);
+ gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s",
+ priv->children[empty_index]->name, NULL);
- ret = _afr_handle_empty_brick_type(this, frame, &local->loc, empty_index,
- AFR_METADATA_TRANSACTION, op_type);
+ ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
+ AFR_METADATA_TRANSACTION, op_type,
+ op_type_len);
if (ret) {
op_errno = -ret;
ret = -1;
@@ -1244,8 +1254,9 @@ _afr_handle_empty_brick(void *opaque)
local->xattr_req = NULL;
local->xdata_req = NULL;
- ret = _afr_handle_empty_brick_type(this, frame, &local->loc, empty_index,
- AFR_ENTRY_TRANSACTION, op_type);
+ ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
+ AFR_ENTRY_TRANSACTION, op_type,
+ op_type_len);
if (ret) {
op_errno = -ret;
ret = -1;
@@ -1253,6 +1264,9 @@ _afr_handle_empty_brick(void *opaque)
}
ret = 0;
out:
+ if (op_frame) {
+ AFR_STACK_DESTROY(op_frame);
+ }
AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
return 0;
}
@@ -1273,14 +1287,14 @@ afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc,
goto out;
}
- ret = dict_set_int32(local->xdata_req, "heal-op",
- GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK);
+ ret = dict_set_int32_sizen(local->xdata_req, "heal-op",
+ GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK);
if (ret) {
op_errno = -ret;
ret = -1;
goto out;
}
- ret = dict_set_str(local->xdata_req, "child-name", data);
+ ret = dict_set_str_sizen(local->xdata_req, "child-name", data);
if (ret) {
op_errno = -ret;
ret = -1;
@@ -1297,9 +1311,8 @@ afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc,
*/
ret = afr_inode_split_brain_choice_set(loc->inode, this, -1);
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
- "Failed to set"
- "split-brain choice to -1");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+ NULL);
afr_heal_splitbrain_file(frame, this, loc);
ret = 0;
out:
@@ -1322,8 +1335,8 @@ afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len)
spb_child_index = afr_get_child_index_from_name(this, spb_child_str);
if (spb_child_index < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
- "Invalid subvol: %s", spb_child_str);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "subvol=%s", spb_child_str, NULL);
}
return spb_child_index;
}
@@ -1345,11 +1358,9 @@ afr_can_set_split_brain_choice(void *opaque)
&data->m_spb);
if (ret)
- gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
- "Failed to determine if %s"
- " is in split-brain. "
- "Aborting split-brain-choice set.",
- uuid_utoa(loc->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, "gfid=%s",
+ uuid_utoa(loc->gfid), NULL);
return ret;
}
@@ -1357,7 +1368,8 @@ int
afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
dict_t *dict)
{
- void *value = NULL;
+ void *choice_value = NULL;
+ void *resolve_value = NULL;
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_spbc_timeout_t *data = NULL;
@@ -1368,6 +1380,14 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
priv = this->private;
+ ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &choice_value, &len);
+ ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &resolve_value,
+ &len);
+ if (!choice_value && !resolve_value) {
+ ret = -1;
+ goto out;
+ }
+
local = AFR_FRAME_INIT(frame, op_errno);
if (!local) {
ret = 1;
@@ -1376,9 +1396,9 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
local->op = GF_FOP_SETXATTR;
- ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &value, &len);
- if (value) {
- spb_child_index = afr_get_split_brain_child_index(this, value, len);
+ if (choice_value) {
+ spb_child_index = afr_get_split_brain_child_index(this, choice_value,
+ len);
if (spb_child_index < 0) {
/* Case where value was "none" */
if (spb_child_index == -2)
@@ -1402,12 +1422,8 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice,
afr_set_split_brain_choice, NULL, data);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
- "Failed to create"
- " synctask. Aborting split-brain choice set"
- " for %s",
- loc->name);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ "name=%s", loc->name, NULL);
ret = 1;
op_errno = ENOMEM;
goto out;
@@ -1416,9 +1432,9 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
goto out;
}
- ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &value, &len);
- if (value) {
- spb_child_index = afr_get_split_brain_child_index(this, value, len);
+ if (resolve_value) {
+ spb_child_index = afr_get_split_brain_child_index(this, resolve_value,
+ len);
if (spb_child_index < 0) {
ret = 1;
goto out;
@@ -1470,20 +1486,20 @@ afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc,
char *op_type = NULL;
afr_empty_brick_args_t *data = NULL;
- ret = dict_get_str(dict, GF_AFR_REPLACE_BRICK, &empty_brick);
+ ret = dict_get_str_sizen(dict, GF_AFR_REPLACE_BRICK, &empty_brick);
if (!ret)
op_type = GF_AFR_REPLACE_BRICK;
- ab_ret = dict_get_str(dict, GF_AFR_ADD_BRICK, &empty_brick);
+ ab_ret = dict_get_str_sizen(dict, GF_AFR_ADD_BRICK, &empty_brick);
if (!ab_ret)
op_type = GF_AFR_ADD_BRICK;
if (ret && ab_ret)
goto out;
- if (frame->root->pid != GF_CLIENT_PID_SELF_HEALD) {
- gf_msg(this->name, GF_LOG_ERROR, EPERM, afr_get_msg_id(op_type),
- "'%s' is an internal extended attribute.", op_type);
+ if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) {
+ gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR,
+ "op_type=%s", op_type, NULL);
ret = 1;
goto out;
}
@@ -1509,8 +1525,8 @@ afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc,
ret = synctask_new(this->ctx->env, _afr_handle_empty_brick,
_afr_handle_empty_brick_cbk, NULL, data);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, afr_get_msg_id(op_type),
- "Failed to create synctask.");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ NULL);
ret = 1;
op_errno = ENOMEM;
afr_brick_args_cleanup(data);
@@ -1668,6 +1684,7 @@ afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -1876,6 +1893,7 @@ afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -1976,6 +1994,7 @@ afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2085,6 +2104,7 @@ afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2191,6 +2211,7 @@ afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2222,7 +2243,7 @@ afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
local->transaction.main_frame = frame;
- local->transaction.start = local->cont.discard.offset;
+ local->transaction.start = local->cont.zerofill.offset;
local->transaction.len = len;
afr_fix_open(fd, this);
@@ -2390,6 +2411,7 @@ afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
int ret = -1;
int op_errno = ENOMEM;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2484,7 +2506,9 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
call_frame_t *transaction_frame = NULL;
int ret = -1;
int32_t op_errno = ENOMEM;
+ int8_t last_fsync = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
transaction_frame = copy_frame(frame);
if (!transaction_frame)
goto out;
@@ -2493,10 +2517,16 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
if (!local)
goto out;
- if (xdata)
+ if (xdata) {
local->xdata_req = dict_copy_with_ref(xdata, NULL);
- else
+ if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) {
+ if (last_fsync) {
+ local->transaction.disable_delayed_post_op = _gf_true;
+ }
+ }
+ } else {
local->xdata_req = dict_new();
+ }
if (!local->xdata_req)
goto out;
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index 95e52ff4a09..bc8eabe0f43 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -8,9 +8,9 @@
cases as published by the Free Software Foundation.
*/
-#include "dict.h"
-#include "byte-order.h"
-#include "common-utils.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
#include "afr.h"
#include "afr-transaction.h"
@@ -22,11 +22,40 @@
#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */
#define LOCKED_LOWER 0x2 /* for lower path */
+void
+afr_lockee_cleanup(afr_lockee_t *lockee)
+{
+ if (lockee->fd) {
+ fd_unref(lockee->fd);
+ lockee->fd = NULL;
+ } else {
+ loc_wipe(&lockee->loc);
+ }
+
+ GF_FREE(lockee->basename);
+ lockee->basename = NULL;
+ GF_FREE(lockee->locked_nodes);
+ lockee->locked_nodes = NULL;
+
+ return;
+}
+
+void
+afr_lockees_cleanup(afr_internal_lock_t *int_lock)
+{
+ int i = 0;
+
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ afr_lockee_cleanup(&int_lock->lockee[i]);
+ }
+
+ return;
+}
int
afr_entry_lockee_cmp(const void *l1, const void *l2)
{
- const afr_entry_lockee_t *r1 = l1;
- const afr_entry_lockee_t *r2 = l2;
+ const afr_lockee_t *r1 = l1;
+ const afr_lockee_t *r2 = l2;
int ret = 0;
uuid_t gfid1 = {0};
uuid_t gfid2 = {0};
@@ -81,31 +110,14 @@ internal_lock_count(call_frame_t *frame, xlator_t *this)
}
int
-afr_is_inodelk_transaction(afr_transaction_type type)
-{
- int ret = 0;
-
- switch (type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- ret = 1;
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- ret = 0;
- break;
- }
-
- return ret;
-}
-
-int
-afr_init_entry_lockee(afr_entry_lockee_t *lockee, afr_local_t *local,
- loc_t *loc, char *basename, int child_count)
+afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
+ int child_count)
{
- int ret = -1;
+ int ret = -ENOMEM;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count];
+ GF_ASSERT(int_lock->lockee_count < AFR_LOCKEE_COUNT_MAX);
loc_copy(&lockee->loc, loc);
lockee->basename = (basename) ? gf_strdup(basename) : NULL;
if (basename && !lockee->basename)
@@ -119,28 +131,45 @@ afr_init_entry_lockee(afr_entry_lockee_t *lockee, afr_local_t *local,
goto out;
ret = 0;
+ int_lock->lockee_count++;
out:
+ if (ret) {
+ afr_lockee_cleanup(lockee);
+ }
return ret;
}
-void
-afr_entry_lockee_cleanup(afr_internal_lock_t *int_lock)
+int
+afr_add_inode_lockee(afr_local_t *local, int child_count)
{
- int i = 0;
+ int ret = -ENOMEM;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count];
- for (i = 0; i < int_lock->lockee_count; i++) {
- loc_wipe(&int_lock->lockee[i].loc);
- if (int_lock->lockee[i].basename)
- GF_FREE(int_lock->lockee[i].basename);
- if (int_lock->lockee[i].locked_nodes)
- GF_FREE(int_lock->lockee[i].locked_nodes);
+ if (local->fd) {
+ lockee->fd = fd_ref(local->fd);
+ } else {
+ loc_copy(&lockee->loc, &local->loc);
}
- return;
+ lockee->locked_count = 0;
+ lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes),
+ gf_afr_mt_afr_node_character);
+
+ if (!lockee->locked_nodes)
+ goto out;
+
+ ret = 0;
+ int_lock->lockee_count++;
+out:
+ if (ret) {
+ afr_lockee_cleanup(lockee);
+ }
+ return ret;
}
static int
-initialize_entrylk_variables(call_frame_t *frame, xlator_t *this)
+initialize_internal_lock_variables(call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
@@ -152,9 +181,10 @@ initialize_entrylk_variables(call_frame_t *frame, xlator_t *this)
local = frame->local;
int_lock = &local->internal_lock;
- int_lock->entrylk_lock_count = 0;
+ int_lock->lock_count = 0;
int_lock->lock_op_ret = -1;
int_lock->lock_op_errno = 0;
+ int_lock->lk_attempted_count = 0;
for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) {
if (!int_lock->lockee[i].locked_nodes)
@@ -167,28 +197,6 @@ initialize_entrylk_variables(call_frame_t *frame, xlator_t *this)
return 0;
}
-static int
-initialize_inodelk_variables(call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_count = 0;
- int_lock->lk_attempted_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
-
- memset(int_lock->locked_nodes, 0,
- sizeof(*int_lock->locked_nodes) * priv->child_count);
-
- return 0;
-}
-
int
afr_lockee_locked_nodes_count(afr_internal_lock_t *int_lock)
{
@@ -216,19 +224,74 @@ afr_locked_nodes_count(unsigned char *locked_nodes, int child_count)
return call_count;
}
-/* FIXME: What if UNLOCK fails */
+static void
+afr_log_locks_failure(call_frame_t *frame, char *where, char *what,
+ int op_errno)
+{
+ xlator_t *this = frame->this;
+ gf_lkowner_t *lk_owner = &frame->root->lk_owner;
+ afr_local_t *local = frame->local;
+ const char *fop = NULL;
+ char *gfid = NULL;
+ const char *name = NULL;
+
+ fop = gf_fop_list[local->op];
+
+ switch (local->transaction.type) {
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ case AFR_ENTRY_TRANSACTION:
+ switch (local->op) {
+ case GF_FOP_LINK:
+ gfid = uuid_utoa(local->newloc.pargfid);
+ name = local->newloc.name;
+ break;
+ default:
+ gfid = uuid_utoa(local->loc.pargfid);
+ name = local->loc.name;
+ break;
+ }
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_INTERNAL_LKS_FAILED,
+ "Unable to do entry %s with lk-owner:%s on %s "
+ "while attempting %s on {pgfid:%s, name:%s}.",
+ what, lkowner_utoa(lk_owner), where, fop, gfid, name);
+ break;
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ gfid = uuid_utoa(local->inode->gfid);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_INTERNAL_LKS_FAILED,
+ "Unable to do inode %s with lk-owner:%s on %s "
+ "while attempting %s on gfid:%s.",
+ what, lkowner_utoa(lk_owner), where, fop, gfid);
+ break;
+ }
+}
+
static int32_t
afr_unlock_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
afr_internal_lock_t *int_lock = NULL;
+ int lockee_num = 0;
int call_count = 0;
+ int child_index = 0;
int ret = 0;
local = frame->local;
int_lock = &local->internal_lock;
+ priv = this->private;
+ lockee_num = (int)((long)cookie) / priv->child_count;
+ child_index = (int)((long)cookie) % priv->child_count;
+
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ afr_log_locks_failure(frame, priv->children[child_index]->name,
+ "unlock", op_errno);
+ }
+ int_lock->lockee[lockee_num].locked_nodes[child_index] &= LOCKED_NO;
if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1)
ret = afr_write_subvol_reset(frame, this);
@@ -239,7 +302,6 @@ afr_unlock_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
UNLOCK(&frame->lock);
if (call_count == 0) {
- gf_msg_trace(this->name, 0, "All internal locks unlocked");
int_lock->lock_cbk(frame, this);
}
@@ -247,143 +309,88 @@ afr_unlock_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
void
-afr_update_uninodelk(afr_local_t *local, afr_internal_lock_t *int_lock,
- int32_t child_index)
-{
- int_lock->locked_nodes[child_index] &= LOCKED_NO;
-}
-
-static int32_t
-afr_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- int32_t child_index = (long)cookie;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- priv = this->private;
-
- if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
- gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
- "path=%s gfid=%s: unlock failed on subvolume %s "
- "with lock owner %s",
- local->loc.path, loc_gfid_utoa(&(local->loc)),
- priv->children[child_index]->name,
- lkowner_utoa(&frame->root->lk_owner));
- }
-
- afr_update_uninodelk(local, int_lock, child_index);
-
- afr_unlock_common_cbk(frame, cookie, this, op_ret, op_errno, xdata);
-
- return 0;
-}
-
-static int
-afr_unlock_inodelk(call_frame_t *frame, xlator_t *this)
+afr_internal_lock_wind(call_frame_t *frame,
+ int32_t (*cbk)(call_frame_t *, void *, xlator_t *,
+ int32_t, int32_t, dict_t *),
+ void *cookie, int child, int lockee_num,
+ gf_boolean_t blocking, gf_boolean_t unlock)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ entrylk_cmd cmd = ENTRYLK_LOCK_NB;
+ int32_t cmd1 = F_SETLK;
struct gf_flock flock = {
0,
};
- int call_count = 0;
- int i = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- flock.l_start = int_lock->flock.l_start;
- flock.l_len = int_lock->flock.l_len;
- flock.l_type = F_UNLCK;
-
- call_count = afr_locked_nodes_count(int_lock->locked_nodes,
- priv->child_count);
-
- int_lock->lk_call_count = call_count;
-
- if (!call_count) {
- GF_ASSERT(!local->transaction.do_eager_unlock);
- gf_msg_trace(this->name, 0, "No internal locks unlocked");
-
- int_lock->lock_cbk(frame, this);
- goto out;
- }
- for (i = 0; i < priv->child_count; i++) {
- if ((int_lock->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
- continue;
-
- if (local->fd) {
- STACK_WIND_COOKIE(
- frame, afr_unlock_inodelk_cbk, (void *)(long)i,
- priv->children[i], priv->children[i]->fops->finodelk,
- int_lock->domain, local->fd, F_SETLK, &flock, NULL);
- } else {
- STACK_WIND_COOKIE(
- frame, afr_unlock_inodelk_cbk, (void *)(long)i,
- priv->children[i], priv->children[i]->fops->inodelk,
- int_lock->domain, &local->loc, F_SETLK, &flock, NULL);
- }
+ switch (local->transaction.type) {
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ if (unlock) {
+ cmd = ENTRYLK_UNLOCK;
+ } else if (blocking) { /*Doesn't make sense to have blocking
+ unlock*/
+ cmd = ENTRYLK_LOCK;
+ }
- if (!--call_count)
+ if (local->fd) {
+ STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->fentrylk,
+ int_lock->domain,
+ int_lock->lockee[lockee_num].fd,
+ int_lock->lockee[lockee_num].basename, cmd,
+ ENTRYLK_WRLCK, NULL);
+ } else {
+ STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->entrylk,
+ int_lock->domain,
+ &int_lock->lockee[lockee_num].loc,
+ int_lock->lockee[lockee_num].basename, cmd,
+ ENTRYLK_WRLCK, NULL);
+ }
break;
- }
-out:
- return 0;
-}
-
-static int32_t
-afr_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_internal_lock_t *int_lock = NULL;
- int32_t child_index = 0;
- int lockee_no = 0;
-
- priv = this->private;
- lockee_no = (int)((long)cookie) / priv->child_count;
- child_index = (int)((long)cookie) % priv->child_count;
- local = frame->local;
- int_lock = &local->internal_lock;
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ flock = int_lock->lockee[lockee_num].flock;
+ if (unlock) {
+ flock.l_type = F_UNLCK;
+ } else if (blocking) { /*Doesn't make sense to have blocking
+ unlock*/
+ cmd1 = F_SETLKW;
+ }
- if (op_ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_ENTRY_UNLOCK_FAIL,
- "%s: unlock failed on %s", local->loc.path,
- priv->children[child_index]->name);
+ if (local->fd) {
+ STACK_WIND_COOKIE(
+ frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->finodelk, int_lock->domain,
+ int_lock->lockee[lockee_num].fd, cmd1, &flock, NULL);
+ } else {
+ STACK_WIND_COOKIE(
+ frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->inodelk, int_lock->domain,
+ &int_lock->lockee[lockee_num].loc, cmd1, &flock, NULL);
+ }
+ break;
}
-
- int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO;
- afr_unlock_common_cbk(frame, cookie, this, op_ret, op_errno, NULL);
-
- return 0;
}
static int
-afr_unlock_entrylk(call_frame_t *frame, xlator_t *this)
+afr_unlock_now(call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
int call_count = 0;
- int index = 0;
- int lockee_no = 0;
- int copies = 0;
+ int child_index = 0;
+ int lockee_num = 0;
int i = -1;
local = frame->local;
int_lock = &local->internal_lock;
priv = this->private;
- copies = priv->child_count;
call_count = afr_lockee_locked_nodes_count(int_lock);
@@ -396,16 +403,13 @@ afr_unlock_entrylk(call_frame_t *frame, xlator_t *this)
}
for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
- lockee_no = i / copies;
- index = i % copies;
- if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) {
- STACK_WIND_COOKIE(
- frame, afr_unlock_entrylk_cbk, (void *)(long)i,
- priv->children[index], priv->children[index]->fops->entrylk,
- int_lock->domain, &int_lock->lockee[lockee_no].loc,
- int_lock->lockee[lockee_no].basename, ENTRYLK_UNLOCK,
- ENTRYLK_WRLCK, NULL);
-
+ lockee_num = i / priv->child_count;
+ child_index = i % priv->child_count;
+ if (int_lock->lockee[lockee_num].locked_nodes[child_index] &
+ LOCKED_YES) {
+ afr_internal_lock_wind(frame, afr_unlock_common_cbk,
+ (void *)(long)i, child_index, lockee_num,
+ _gf_false, _gf_true);
if (!--call_count)
break;
}
@@ -415,18 +419,6 @@ out:
return 0;
}
-int32_t
-afr_unlock_now(call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = frame->local;
-
- if (afr_is_inodelk_transaction(local->transaction.type))
- afr_unlock_inodelk(frame, this);
- else
- afr_unlock_entrylk(frame, this);
- return 0;
-}
-
static int32_t
afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
int32_t op_errno, dict_t *xdata)
@@ -436,14 +428,14 @@ afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
afr_private_t *priv = NULL;
int cky = (long)cookie;
int child_index = 0;
- int lockee_no = 0;
+ int lockee_num = 0;
priv = this->private;
local = frame->local;
int_lock = &local->internal_lock;
child_index = ((int)cky) % priv->child_count;
- lockee_no = ((int)cky) / priv->child_count;
+ lockee_num = ((int)cky) / priv->child_count;
LOCK(&frame->lock);
{
@@ -470,23 +462,16 @@ afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
afr_unlock_now(frame, this);
} else {
if (op_ret == 0) {
- if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
- local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- int_lock->lockee[lockee_no]
- .locked_nodes[child_index] |= LOCKED_YES;
- int_lock->lockee[lockee_no].locked_count++;
- int_lock->entrylk_lock_count++;
- } else {
- int_lock->locked_nodes[child_index] |= LOCKED_YES;
- int_lock->lock_count++;
-
- if (local->transaction.type == AFR_DATA_TRANSACTION) {
- LOCK(&local->inode->lock);
- {
- local->inode_ctx->lock_count++;
- }
- UNLOCK(&local->inode->lock);
+ int_lock->lockee[lockee_num]
+ .locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_num].locked_count++;
+ int_lock->lock_count++;
+ if (local->transaction.type == AFR_DATA_TRANSACTION) {
+ LOCK(&local->inode->lock);
+ {
+ local->inode_ctx->lock_count++;
}
+ UNLOCK(&local->inode->lock);
}
}
afr_lock_blocking(frame, this, cky + 1);
@@ -495,30 +480,6 @@ afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
return 0;
}
-static int32_t
-afr_blocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
- return 0;
-}
-
-static int32_t
-afr_blocking_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
- return 0;
-}
-
-static gf_boolean_t
-afr_is_entrylk(afr_transaction_type trans_type)
-{
- if (afr_is_inodelk_transaction(trans_type))
- return _gf_false;
- return _gf_true;
-}
-
static gf_boolean_t
_is_lock_wind_needed(afr_local_t *local, int child_index)
{
@@ -528,40 +489,12 @@ _is_lock_wind_needed(afr_local_t *local, int child_index)
return _gf_true;
}
-static void
-afr_log_entry_locks_failure(xlator_t *this, afr_local_t *local,
- afr_internal_lock_t *int_lock)
-{
- const char *fop = NULL;
- char *pargfid = NULL;
- const char *name = NULL;
-
- fop = gf_fop_list[local->op];
-
- switch (local->op) {
- case GF_FOP_LINK:
- pargfid = uuid_utoa(local->newloc.pargfid);
- name = local->newloc.name;
- break;
- default:
- pargfid = uuid_utoa(local->loc.pargfid);
- name = local->loc.name;
- break;
- }
-
- gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_BLOCKING_LKS_FAILED,
- "Unable to obtain sufficient blocking entry locks on at least "
- "one child while attempting %s on {pgfid:%s, name:%s}.",
- fop, pargfid, name);
-}
-
static gf_boolean_t
is_blocking_locks_count_sufficient(call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
afr_internal_lock_t *int_lock = NULL;
- gf_boolean_t is_entrylk = _gf_false;
int child = 0;
int nlockee = 0;
int lockee_count = 0;
@@ -571,42 +504,26 @@ is_blocking_locks_count_sufficient(call_frame_t *frame, xlator_t *this)
priv = this->private;
int_lock = &local->internal_lock;
lockee_count = int_lock->lockee_count;
- is_entrylk = afr_is_entrylk(local->transaction.type);
-
- if (!is_entrylk) {
- if (int_lock->lock_count == 0) {
- gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_BLOCKING_LKS_FAILED,
- "Unable to obtain "
- "blocking inode lock on even one child for "
- "gfid:%s.",
- uuid_utoa(local->inode->gfid));
- return _gf_false;
- } else {
- /*inodelk succeeded on at least one child. */
- return _gf_true;
- }
- } else {
- if (int_lock->entrylk_lock_count == 0) {
- afr_log_entry_locks_failure(this, local, int_lock);
- return _gf_false;
- }
- /* For FOPS that take multiple sets of locks (mkdir, rename),
- * there must be at least one brick on which the locks from
- * all lock sets were successful. */
- for (child = 0; child < priv->child_count; child++) {
- ret = _gf_true;
- for (nlockee = 0; nlockee < lockee_count; nlockee++) {
- if (!(int_lock->lockee[nlockee].locked_nodes[child] &
- LOCKED_YES))
- ret = _gf_false;
- }
- if (ret)
- return ret;
+ if (int_lock->lock_count == 0) {
+ afr_log_locks_failure(frame, "any subvolume", "lock",
+ int_lock->lock_op_errno);
+ return _gf_false;
+ }
+ /* For FOPS that take multiple sets of locks (mkdir, rename),
+ * there must be at least one brick on which the locks from
+ * all lock sets were successful. */
+ for (child = 0; child < priv->child_count; child++) {
+ ret = _gf_true;
+ for (nlockee = 0; nlockee < lockee_count; nlockee++) {
+ if (!(int_lock->lockee[nlockee].locked_nodes[child] & LOCKED_YES))
+ ret = _gf_false;
}
- if (!ret)
- afr_log_entry_locks_failure(this, local, int_lock);
+ if (ret)
+ return ret;
}
+ if (!ret)
+ afr_log_locks_failure(frame, "all", "lock", int_lock->lock_op_errno);
return ret;
}
@@ -617,27 +534,16 @@ afr_lock_blocking(call_frame_t *frame, xlator_t *this, int cookie)
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- struct gf_flock flock = {
- 0,
- };
uint64_t ctx = 0;
int ret = 0;
int child_index = 0;
- int lockee_no = 0;
- gf_boolean_t is_entrylk = _gf_false;
+ int lockee_num = 0;
local = frame->local;
int_lock = &local->internal_lock;
priv = this->private;
child_index = cookie % priv->child_count;
- lockee_no = cookie / priv->child_count;
- is_entrylk = afr_is_entrylk(local->transaction.type);
-
- if (!is_entrylk) {
- flock.l_start = int_lock->flock.l_start;
- flock.l_len = int_lock->flock.l_len;
- flock.l_type = int_lock->flock.l_type;
- }
+ lockee_num = cookie / priv->child_count;
if (local->fd) {
ret = fd_ctx_get(local->fd, this, &ctx);
@@ -681,52 +587,8 @@ afr_lock_blocking(call_frame_t *frame, xlator_t *this, int cookie)
return 0;
}
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
-
- if (local->fd) {
- STACK_WIND_COOKIE(
- frame, afr_blocking_inodelk_cbk, (void *)(long)child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->finodelk,
- int_lock->domain, local->fd, F_SETLKW, &flock, NULL);
-
- } else {
- STACK_WIND_COOKIE(
- frame, afr_blocking_inodelk_cbk, (void *)(long)child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->inodelk,
- int_lock->domain, &local->loc, F_SETLKW, &flock, NULL);
- }
-
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- /*Accounting for child_index increments on 'down'
- *and 'fd-less' children */
-
- if (local->fd) {
- STACK_WIND_COOKIE(frame, afr_blocking_entrylk_cbk,
- (void *)(long)cookie,
- priv->children[child_index],
- priv->children[child_index]->fops->fentrylk,
- int_lock->domain, local->fd,
- int_lock->lockee[lockee_no].basename,
- ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
- } else {
- STACK_WIND_COOKIE(
- frame, afr_blocking_entrylk_cbk, (void *)(long)cookie,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- int_lock->domain, &int_lock->lockee[lockee_no].loc,
- int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK,
- ENTRYLK_WRLCK, NULL);
- }
-
- break;
- }
+ afr_internal_lock_wind(frame, afr_lock_cbk, (void *)(long)cookie,
+ child_index, lockee_num, _gf_true, _gf_false);
return 0;
}
@@ -743,20 +605,10 @@ afr_blocking_lock(call_frame_t *frame, xlator_t *this)
local = frame->local;
int_lock = &local->internal_lock;
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- initialize_inodelk_variables(frame, this);
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- up_count = AFR_COUNT(local->child_up, priv->child_count);
- int_lock->lk_call_count = int_lock->lk_expected_count =
- (int_lock->lockee_count * up_count);
- initialize_entrylk_variables(frame, this);
- break;
- }
+ up_count = AFR_COUNT(local->child_up, priv->child_count);
+ int_lock->lk_call_count = int_lock->lk_expected_count =
+ (int_lock->lockee_count * up_count);
+ initialize_internal_lock_variables(frame, this);
afr_lock_blocking(frame, this, 0);
@@ -764,171 +616,20 @@ afr_blocking_lock(call_frame_t *frame, xlator_t *this)
}
static int32_t
-afr_nonblocking_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_nb_internal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
int call_count = 0;
- int child_index = (long)cookie;
- int copies = 0;
- int index = 0;
- int lockee_no = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- copies = priv->child_count;
- index = child_index % copies;
- lockee_no = child_index / copies;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK(&frame->lock);
- {
- if (op_ret < 0) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_msg(this->name, GF_LOG_ERROR, ENOSYS,
- AFR_MSG_LOCK_XLATOR_NOT_LOADED,
- "subvolume does not support "
- "locking. please load features/locks"
- " xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
-
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- } else if (op_ret == 0) {
- int_lock->lockee[lockee_no].locked_nodes[index] |= LOCKED_YES;
- int_lock->lockee[lockee_no].locked_count++;
- int_lock->entrylk_lock_count++;
- }
-
- call_count = --int_lock->lk_call_count;
- }
- UNLOCK(&frame->lock);
-
- if (call_count == 0) {
- gf_msg_trace(this->name, 0, "Last locking reply received");
- /* all locks successful. Proceed to call FOP */
- if (int_lock->entrylk_lock_count == int_lock->lk_expected_count) {
- gf_msg_trace(this->name, 0, "All servers locked. Calling the cbk");
- int_lock->lock_op_ret = 0;
- int_lock->lock_cbk(frame, this);
- }
- /* Not all locks were successful. Unlock and try locking
- again, this time with serially blocking locks */
- else {
- gf_msg_trace(this->name, 0,
- "%d servers locked. Trying again "
- "with blocking calls",
- int_lock->lock_count);
-
- afr_unlock_now(frame, this);
- }
- }
-
- return 0;
-}
-
-int
-afr_nonblocking_entrylk(call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ int child_index = 0;
+ int lockee_num = 0;
afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- int copies = 0;
- int index = 0;
- int lockee_no = 0;
- int32_t call_count = 0;
- int i = 0;
- local = frame->local;
- int_lock = &local->internal_lock;
priv = this->private;
- copies = priv->child_count;
- initialize_entrylk_variables(frame, this);
-
- if (local->fd) {
- fd_ctx = afr_fd_ctx_get(local->fd, this);
- if (!fd_ctx) {
- gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED,
- "unable to get fd ctx for fd=%p", local->fd);
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
- local->op_errno = EINVAL;
- int_lock->lock_op_errno = EINVAL;
-
- afr_unlock_now(frame, this);
- return -1;
- }
-
- call_count = int_lock->lockee_count * internal_lock_count(frame, this);
- int_lock->lk_call_count = call_count;
- int_lock->lk_expected_count = call_count;
-
- if (!call_count) {
- gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON,
- "fd not open on any subvolumes. aborting.");
- afr_unlock_now(frame, this);
- goto out;
- }
-
- /* Send non-blocking entrylk calls only on up children
- and where the fd has been opened */
- for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
- index = i % copies;
- lockee_no = i / copies;
- if (local->child_up[index]) {
- STACK_WIND_COOKIE(frame, afr_nonblocking_entrylk_cbk,
- (void *)(long)i, priv->children[index],
- priv->children[index]->fops->fentrylk,
- this->name, local->fd,
- int_lock->lockee[lockee_no].basename,
- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
- if (!--call_count)
- break;
- }
- }
- } else {
- call_count = int_lock->lockee_count * internal_lock_count(frame, this);
- int_lock->lk_call_count = call_count;
- int_lock->lk_expected_count = call_count;
-
- for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
- index = i % copies;
- lockee_no = i / copies;
- if (local->child_up[index]) {
- STACK_WIND_COOKIE(frame, afr_nonblocking_entrylk_cbk,
- (void *)(long)i, priv->children[index],
- priv->children[index]->fops->entrylk,
- this->name, &int_lock->lockee[lockee_no].loc,
- int_lock->lockee[lockee_no].basename,
- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
-
- if (!--call_count)
- break;
- }
- }
- }
-out:
- return 0;
-}
-
-int32_t
-afr_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- int call_count = 0;
- int child_index = (long)cookie;
+ child_index = ((long)cookie) % priv->child_count;
+ lockee_num = ((long)cookie) / priv->child_count;
local = frame->local;
int_lock = &local->internal_lock;
@@ -953,11 +654,14 @@ afr_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
" xlator on server");
local->op_ret = op_ret;
int_lock->lock_op_ret = op_ret;
+
int_lock->lock_op_errno = op_errno;
local->op_errno = op_errno;
}
- } else {
- int_lock->locked_nodes[child_index] |= LOCKED_YES;
+ } else if (op_ret == 0) {
+ int_lock->lockee[lockee_num]
+ .locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_num].locked_count++;
int_lock->lock_count++;
}
@@ -966,7 +670,7 @@ afr_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
UNLOCK(&frame->lock);
if (call_count == 0) {
- gf_msg_trace(this->name, 0, "Last inode locking reply received");
+ gf_msg_trace(this->name, 0, "Last locking reply received");
/* all locks successful. Proceed to call FOP */
if (int_lock->lock_count == int_lock->lk_expected_count) {
gf_msg_trace(this->name, 0, "All servers locked. Calling the cbk");
@@ -977,8 +681,8 @@ afr_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
again, this time with serially blocking locks */
else {
gf_msg_trace(this->name, 0,
- "%d servers locked. "
- "Trying again with blocking calls",
+ "%d servers locked. Trying again "
+ "with blocking calls",
int_lock->lock_count);
afr_unlock_now(frame, this);
@@ -989,12 +693,14 @@ afr_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
int
-afr_nonblocking_inodelk(call_frame_t *frame, xlator_t *this)
+afr_lock_nonblocking(call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
afr_fd_ctx_t *fd_ctx = NULL;
+ int child = 0;
+ int lockee_num = 0;
int32_t call_count = 0;
int i = 0;
int ret = 0;
@@ -1003,7 +709,7 @@ afr_nonblocking_inodelk(call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
priv = this->private;
- initialize_inodelk_variables(frame, this);
+ initialize_internal_lock_variables(frame, this);
if (local->fd) {
fd_ctx = afr_fd_ctx_get(local->fd, this);
@@ -1022,36 +728,29 @@ afr_nonblocking_inodelk(call_frame_t *frame, xlator_t *this)
}
}
- call_count = internal_lock_count(frame, this);
+ call_count = int_lock->lockee_count * internal_lock_count(frame, this);
int_lock->lk_call_count = call_count;
int_lock->lk_expected_count = call_count;
if (!call_count) {
- gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN,
- "All bricks are down, aborting.");
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON,
+ "fd not open on any subvolumes. aborting.");
afr_unlock_now(frame, this);
goto out;
}
- /* Send non-blocking inodelk calls only on up children
+ /* Send non-blocking lock calls only on up children
and where the fd has been opened */
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
-
- if (local->fd) {
- STACK_WIND_COOKIE(
- frame, afr_nonblocking_inodelk_cbk, (void *)(long)i,
- priv->children[i], priv->children[i]->fops->finodelk,
- int_lock->domain, local->fd, F_SETLK, &int_lock->flock, NULL);
- } else {
- STACK_WIND_COOKIE(
- frame, afr_nonblocking_inodelk_cbk, (void *)(long)i,
- priv->children[i], priv->children[i]->fops->inodelk,
- int_lock->domain, &local->loc, F_SETLK, &int_lock->flock, NULL);
+ for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
+ child = i % priv->child_count;
+ lockee_num = i / priv->child_count;
+ if (local->child_up[child]) {
+ afr_internal_lock_wind(frame, afr_nb_internal_lock_cbk,
+ (void *)(long)i, child, lockee_num,
+ _gf_false, _gf_false);
+ if (!--call_count)
+ break;
}
- if (!--call_count)
- break;
}
out:
return ret;
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index d62f7876bcd..816065fb57a 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -11,35 +11,19 @@
#ifndef __AFR_MEM_TYPES_H__
#define __AFR_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_afr_mem_types_ {
- gf_afr_mt_iovec = gf_common_mt_end + 1,
- gf_afr_mt_afr_fd_ctx_t,
+ gf_afr_mt_afr_fd_ctx_t = gf_common_mt_end + 1,
gf_afr_mt_afr_private_t,
gf_afr_mt_int32_t,
gf_afr_mt_char,
gf_afr_mt_xattr_key,
gf_afr_mt_dict_t,
gf_afr_mt_xlator_t,
- gf_afr_mt_iatt,
- gf_afr_mt_int,
gf_afr_mt_afr_node_character,
- gf_afr_mt_sh_diff_loop_state,
- gf_afr_mt_uint8_t,
- gf_afr_mt_loc_t,
- gf_afr_mt_entry_name,
- gf_afr_mt_pump_priv,
- gf_afr_mt_locked_fd,
gf_afr_mt_inode_ctx_t,
- gf_afr_fd_paused_call_t,
- gf_afr_mt_crawl_data_t,
- gf_afr_mt_brick_pos_t,
- gf_afr_mt_shd_bool_t,
- gf_afr_mt_shd_timer_t,
gf_afr_mt_shd_event_t,
- gf_afr_mt_time_t,
- gf_afr_mt_pos_data_t,
gf_afr_mt_reply_t,
gf_afr_mt_subvol_healer_t,
gf_afr_mt_spbc_timeout_t,
@@ -47,6 +31,8 @@ enum gf_afr_mem_types_ {
gf_afr_mt_empty_brick_t,
gf_afr_mt_child_latency_t,
gf_afr_mt_atomic_t,
+ gf_afr_mt_lk_heal_info_t,
+ gf_afr_mt_gf_lock,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
index 696336889d3..e73fd997765 100644
--- a/xlators/cluster/afr/src/afr-messages.h
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -11,7 +11,7 @@
#ifndef _AFR_MESSAGES_H_
#define _AFR_MESSAGES_H_
-#include "glfs-message-id.h"
+#include <glusterfs/glfs-message-id.h>
/* To add new message IDs, append new identifiers at the end of the list.
*
@@ -23,25 +23,145 @@
* glfs-message-id.h.
*/
-GLFS_MSGID(AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET,
- AFR_MSG_QUORUM_OVERRIDE, AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP,
- AFR_MSG_SUBVOLS_DOWN, AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN,
- AFR_MSG_OPEN_FAIL, AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS,
- AFR_MSG_GFID_NULL, AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED,
- AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS,
- AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED,
- AFR_MSG_BLOCKING_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO,
- AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED,
- AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR,
- AFR_MSG_SELF_HEAL_INFO, AFR_MSG_READ_SUBVOL_ERROR,
- AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON,
- AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD,
- AFR_MSG_INVALID_DATA, AFR_MSG_INVALID_ARG,
- AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED,
- AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED,
- AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS,
- AFR_MSG_NO_CHANGELOG, AFR_MSG_TIMER_CREATE_FAIL,
- AFR_MSG_SBRAIN_FAV_CHILD_POLICY, AFR_MSG_INODE_CTX_GET_FAILED,
- AFR_MSG_THIN_ARB);
+GLFS_MSGID(
+ AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE,
+ AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN,
+ AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL,
+ AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL,
+ AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS,
+ AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED,
+ AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED,
+ AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO,
+ AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA,
+ AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED,
+ AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED,
+ AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG,
+ AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB,
+ AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED,
+ AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, AFR_MSG_UNKNOWN_SET,
+ AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START,
+ AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT,
+ AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED,
+ AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG,
+ AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED,
+ AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH,
+ AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED,
+ AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA,
+ SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE,
+ SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED,
+ AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED,
+ AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED,
+ AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED,
+ AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED,
+ AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN,
+ AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN,
+ AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP,
+ AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+ AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR,
+ AFR_MSG_INTERNAL_ATTR);
+#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed"
+#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed"
+#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed"
+#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed"
+#define AFR_MSG_INVALID_ARG_STR "Invalid argument"
+#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on "
+#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file"
+#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict."
+#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed."
+#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR \
+ "Failed to populate loc for thin-arbiter"
+#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending"
+#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the god shd. skipping."
+#define AFR_MSG_UNKNOWN_SET_STR "Unknown set"
+#define AFR_MSG_NO_XL_ID_STR "xl does not have id"
+#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on"
+#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on"
+#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter."
+#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output"
+#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time"
+#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict"
+#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR \
+ "No majority to resolve gfid split brain"
+#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch dectected"
+#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal"
+#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch"
+#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR \
+ "Size policy is not applicable to directories."
+#define AFR_MSG_NO_CHILD_SELECTED_STR \
+ "No child selected by favorite-child policy"
+#define AFR_MSG_INVALID_CHILD_STR "Invalid child"
+#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR \
+ "selected as authentic to resolve conflicting data"
+#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick"
+#define SNO_DIFF_IN_MTIME_STR "No difference in mtime"
+#define SNO_BIGGER_FILE_STR "No bigger file"
+#define SALL_BRICKS_UP_TO_RESOLVE_STR \
+ "All the bricks should be up to resolve the gfid split brain"
+#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock"
+#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed"
+#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame"
+#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop"
+#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed"
+#define AFR_MSG_FSYNC_FAILED_STR "fsync failed"
+#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met"
+#define AFR_MSG_FOP_FAILED_STR "Failing Fop"
+#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume"
+#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling"
+#define AFR_MSG_CHILD_MISCONFIGURED_STR \
+ "replicate translator needs more than one subvolume defined"
+#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads"
+#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count"
+#define AFR_MSG_UNABLE_TO_FETCH_STR \
+ "Unable to fetch afr-pending-xattr option from volfile. Falling back to " \
+ "using client translator names"
+#define AFR_MSG_NULL_DEREF_STR "possible NULL deref"
+#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key"
+#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask"
+#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up"
+#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR \
+ "Failed to cancel split-brain choice"
+#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR \
+ "Cannot set replica. File is not in data/metadata split-brain"
+#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx"
+#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols"
+#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child"
+#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file"
+#define AFR_MSG_TIMER_CREATE_FAIL_STR \
+ "Cannot create timer for delayed initialization"
+#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online"
+#define AFR_MSG_ALL_SUBVOLS_DOWN_STR \
+ "All subvolumes are down. Going offline until atleast one of them is up"
+#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock"
+#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume"
+#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met"
+#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir"
+#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid"
+#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative mergeon the file"
+#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain"
+#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down"
+#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask"
+#define AFR_MSG_READ_SUBVOL_NOT_UP_STR \
+ "read subvolume in this generation is not up"
+#define AFR_MSG_INTERNAL_LKS_FAILED_STR \
+ "Unable to work with lk-owner while attempting fop"
+#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR \
+ "subvolume does not support locking. please load features/locks xlator " \
+ "on server."
+#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx"
+#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting."
+#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child."
+#define AFR_MSG_NEW_BRICK_STR "New brick"
+#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR \
+ "Failed to set split-brain choice to -1"
+#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR \
+ "Failed to determine split-brain. Aborting split-brain-choice set"
+#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume"
+#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr"
+#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute"
#endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index 1763dc0a4cc..64856042b65 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -8,32 +8,22 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "statedump.h"
-
-#include "afr-inode-read.h"
-#include "afr-inode-write.h"
-#include "afr-dir-read.h"
-#include "afr-dir-write.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+
#include "afr-transaction.h"
gf_boolean_t
@@ -73,6 +63,10 @@ afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
local = frame->local;
fd_ctx = local->fd_ctx;
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
LOCK(&frame->lock);
{
if (op_ret == -1) {
@@ -84,13 +78,16 @@ afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
if (!local->xdata_rsp && xdata)
local->xdata_rsp = dict_ref(xdata);
}
+ call_count = --local->call_count;
}
UNLOCK(&frame->lock);
- call_count = afr_frame_return(frame);
-
if (call_count == 0) {
- if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) {
+ afr_handle_replies_quorum(frame, this);
+ if (local->op_ret == -1) {
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
+ } else if (fd_ctx->flags & O_TRUNC) {
STACK_WIND(frame, afr_open_ftruncate_cbk, this,
this->fops->ftruncate, fd, 0, NULL);
} else {
@@ -140,7 +137,7 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int spb_choice = 0;
+ int spb_subvol = 0;
int event_generation = 0;
int ret = 0;
int32_t op_errno = 0;
@@ -161,6 +158,11 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
goto out;
}
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ op_errno = afr_quorum_errno(priv);
+ goto out;
+ }
+
if (!afr_is_consistent_io_possible(local, priv, &op_errno))
goto out;
@@ -177,9 +179,9 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
ret = afr_inode_get_readable(frame, local->inode, this, NULL,
&event_generation, AFR_DATA_TRANSACTION);
if ((ret < 0) &&
- (afr_inode_split_brain_choice_get(local->inode, this, &spb_choice) ==
- 0) &&
- spb_choice < 0) {
+ (afr_split_brain_read_subvol_get(local->inode, this, NULL,
+ &spb_subvol) == 0) &&
+ spb_subvol < 0) {
afr_inode_refresh(frame, this, local->inode, local->inode->gfid,
afr_open_continue);
} else {
@@ -213,11 +215,9 @@ afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
"successfully on subvolume %s",
local->loc.path, priv->children[child_index]->name);
} else {
- gf_msg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno,
- AFR_MSG_OPEN_FAIL,
- "Failed to open %s on "
- "subvolume %s",
- local->loc.path, priv->children[child_index]->name);
+ gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno,
+ AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s",
+ priv->children[child_index]->name, NULL);
}
fd_ctx = local->fd_ctx;
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 1df39c35fce..6fc2c75145c 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -30,27 +30,6 @@ afr_pending_read_decrement(afr_private_t *priv, int child_index)
GF_ATOMIC_DEC(priv->pending_reads[child_index]);
}
-static gf_boolean_t
-afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child)
-{
- int *pending = NULL;
- int ret = 0;
- int i = 0;
-
- ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending);
- if (ret == 0) {
- for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
- /* Not doing a ntoh32(pending) as we just want to check
- * if it is non-zero or not. */
- if (pending[i]) {
- return _gf_true;
- }
- }
- }
-
- return _gf_false;
-}
-
void
afr_read_txn_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
@@ -114,7 +93,7 @@ afr_ta_read_txn(void *opaque)
call_frame_t *frame = NULL;
xlator_t *this = NULL;
int read_subvol = -1;
- int up_child = AFR_CHILD_UNKNOWN;
+ int query_child = AFR_CHILD_UNKNOWN;
int possible_bad_child = AFR_CHILD_UNKNOWN;
int ret = 0;
int op_errno = ENOMEM;
@@ -134,18 +113,18 @@ afr_ta_read_txn(void *opaque)
this = frame->this;
local = frame->local;
priv = this->private;
+ query_child = local->read_txn_query_child;
- if (local->child_up[AFR_CHILD_ZERO]) {
- up_child = AFR_CHILD_ZERO;
+ if (query_child == AFR_CHILD_ZERO) {
possible_bad_child = AFR_CHILD_ONE;
- } else if (local->child_up[AFR_CHILD_ONE]) {
- up_child = AFR_CHILD_ONE;
+ } else if (query_child == AFR_CHILD_ONE) {
possible_bad_child = AFR_CHILD_ZERO;
+ } else {
+ /*read_txn_query_child is AFR_CHILD_UNKNOWN*/
+ goto out;
}
- GF_ASSERT(up_child != AFR_CHILD_UNKNOWN);
-
- /* Query the up_child to see if it blames the down one. */
+ /* Ask the query_child to see if it blames the possibly bad one. */
xdata_req = dict_new();
if (!xdata_req)
goto out;
@@ -159,30 +138,33 @@ afr_ta_read_txn(void *opaque)
goto out;
if (local->fd) {
- ret = syncop_fxattrop(priv->children[up_child], local->fd,
+ ret = syncop_fxattrop(priv->children[query_child], local->fd,
GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
NULL);
} else {
- ret = syncop_xattrop(priv->children[up_child], &local->loc,
+ ret = syncop_xattrop(priv->children[query_child], &local->loc,
GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
NULL);
}
if (ret || !xdata_rsp) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed xattrop for gfid %s on %s",
- uuid_utoa(local->inode->gfid), priv->children[up_child]->name);
+ uuid_utoa(local->inode->gfid),
+ priv->children[query_child]->name);
op_errno = -ret;
goto out;
}
if (afr_ta_dict_contains_pending_xattr(xdata_rsp, priv,
possible_bad_child)) {
- read_subvol = up_child;
+ read_subvol = query_child;
goto out;
}
dict_unref(xdata_rsp);
- /* Query thin-arbiter to see if it blames any data brick. */
- ret = afr_fill_ta_loc(this, &loc);
+ xdata_rsp = NULL;
+
+ /* It doesn't. So query thin-arbiter to see if it blames any data brick. */
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate thin-arbiter loc for: %s.", loc.name);
@@ -211,8 +193,8 @@ afr_ta_read_txn(void *opaque)
goto unlock;
}
- if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, up_child)) {
- read_subvol = up_child;
+ if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, query_child)) {
+ read_subvol = query_child;
} else {
gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
"Failing read for gfid %s since good brick %s is down",
@@ -261,6 +243,8 @@ afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this)
if (!ta_frame) {
local->op_ret = -1;
local->op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
goto out;
}
ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done,
@@ -288,7 +272,7 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
int read_subvol = -1;
inode_t *inode = NULL;
int ret = -1;
- int spb_choice = -1;
+ int spb_subvol = -1;
local = frame->local;
inode = local->inode;
@@ -319,9 +303,9 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
local->read_attempted[read_subvol] = 1;
readfn:
if (read_subvol == -1) {
- ret = afr_inode_split_brain_choice_get(inode, this, &spb_choice);
- if ((ret == 0) && spb_choice >= 0)
- read_subvol = spb_choice;
+ ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol);
+ if ((ret == 0) && spb_subvol >= 0)
+ read_subvol = spb_subvol;
}
if (read_subvol == -1) {
@@ -429,7 +413,7 @@ afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
local->is_read_txn = _gf_true;
local->transaction.type = type;
- if (priv->quorum_count && !afr_has_quorum(local->child_up, this)) {
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
local->op_ret = -1;
local->op_errno = afr_quorum_errno(priv);
goto read;
@@ -440,8 +424,19 @@ afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
goto read;
}
+ if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+ local->op_ret = -1;
+ local->op_errno = -afr_quorum_errno(priv);
+ goto read;
+ }
+
if (priv->thin_arbiter_count &&
AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ if (local->child_up[0]) {
+ local->read_txn_query_child = AFR_CHILD_ZERO;
+ } else if (local->child_up[1]) {
+ local->read_txn_query_child = AFR_CHILD_ONE;
+ }
afr_ta_read_txn_synctask(frame, this);
return 0;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index c0498b31578..a580a1584cc 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -10,10 +10,10 @@
#include "afr.h"
#include "afr-self-heal.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "protocol-common.h"
#include "afr-messages.h"
-#include "events.h"
+#include <glusterfs/events.h>
void
afr_heal_synctask(xlator_t *this, afr_local_t *local);
@@ -55,7 +55,8 @@ afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
for (i = 0; i < priv->child_count; i++) {
if (source == -1) {
/* case (a) above. */
- if (replies[i].valid && replies[i].op_ret == 0) {
+ if (replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
ia_type = replies[i].poststat.ia_type;
break;
}
@@ -63,7 +64,8 @@ afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
/* case (b) above. */
if (i == source)
continue;
- if (sources[i] && replies[i].valid && replies[i].op_ret == 0) {
+ if (sources[i] && replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
ia_type = replies[i].poststat.ia_type;
break;
}
@@ -77,6 +79,12 @@ heal:
for (i = 0; i < priv->child_count; i++) {
if (!replies[i].valid || replies[i].op_ret != 0)
continue;
+
+ if (gf_uuid_is_null(gfid) &&
+ !gf_uuid_is_null(replies[i].poststat.ia_gfid) &&
+ replies[i].poststat.ia_type == ia_type)
+ gfid = replies[i].poststat.ia_gfid;
+
if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) ||
replies[i].poststat.ia_type != ia_type)
continue;
@@ -132,7 +140,7 @@ heal:
}
}
out:
- if (gfid_idx && (*gfid_idx == -1) && (ret == 0)) {
+ if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) {
ret = -afr_final_errno(local, priv);
}
loc_wipe(&loc);
@@ -167,27 +175,23 @@ afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies,
{
int j = 0;
int i = 0;
- int src = -1;
- int votes[child_count];
+ int votes;
for (i = 0; i < child_count; i++) {
if (!replies[i].valid || replies[i].op_ret == -1)
continue;
- votes[i] = 1;
+ votes = 1;
for (j = i + 1; j < child_count; j++) {
if ((!gf_uuid_compare(replies[i].poststat.ia_gfid,
replies[j].poststat.ia_gfid)))
- votes[i]++;
- if (votes[i] > child_count / 2) {
- src = i;
- goto out;
- }
+ votes++;
+ if (votes > child_count / 2)
+ return i;
}
}
-out:
- return src;
+ return -1;
}
int
@@ -263,10 +267,8 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
"All the bricks should be up to resolve the gfid split "
"barin");
if (xdata) {
- ret = dict_set_str(xdata, "gfid-heal-msg",
- "All the "
- "bricks should be up to resolve the"
- " gfid split barin");
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SALL_BRICKS_UP_TO_RESOLVE);
if (ret)
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
"Error setting"
@@ -276,7 +278,7 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
}
if (xdata) {
- ret = dict_get_int32(xdata, "heal-op", &heal_op);
+ ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op);
if (ret)
goto fav_child;
} else {
@@ -289,10 +291,10 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
priv->child_count);
if (*src == -1) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
- "No bigger file");
+ SNO_BIGGER_FILE);
if (xdata) {
- ret = dict_set_str(xdata, "gfid-heal-msg",
- "No bigger file");
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SNO_BIGGER_FILE);
if (ret)
gf_msg(this->name, GF_LOG_ERROR, 0,
AFR_MSG_DICT_SET_FAILED,
@@ -307,10 +309,10 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
priv->child_count);
if (*src == -1) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
- "No difference in mtime");
+ SNO_DIFF_IN_MTIME);
if (xdata) {
- ret = dict_set_str(xdata, "gfid-heal-msg",
- "No difference in mtime");
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SNO_DIFF_IN_MTIME);
if (ret)
gf_msg(this->name, GF_LOG_ERROR, 0,
AFR_MSG_DICT_SET_FAILED,
@@ -321,7 +323,7 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
break;
case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
- ret = dict_get_str(xdata, "child-name", &src_brick);
+ ret = dict_get_str_sizen(xdata, "child-name", &src_brick);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
"Error getting the source "
@@ -332,12 +334,10 @@ afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
src_brick);
if (*src == -1) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
- "Error getting the source "
- "brick");
+ SERROR_GETTING_SRC_BRICK);
if (xdata) {
- ret = dict_set_str(xdata, "gfid-heal-msg",
- "Error getting the source "
- "brick");
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SERROR_GETTING_SRC_BRICK);
if (ret)
gf_msg(this->name, GF_LOG_ERROR, 0,
AFR_MSG_DICT_SET_FAILED,
@@ -391,11 +391,12 @@ out:
uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2),
priv->children[src_idx]->name);
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;type=gfid;file="
"<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;"
"child-%d=%s;gfid-%d=%s",
- this->name, uuid_utoa(pargfid), bname, child_idx,
- priv->children[child_idx]->name, child_idx,
+ this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid),
+ bname, child_idx, priv->children[child_idx]->name, child_idx,
uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx,
priv->children[src_idx]->name, src_idx,
uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2));
@@ -512,7 +513,8 @@ afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc,
&replies[source].poststat,
- (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME), NULL);
+ (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME),
+ NULL);
loc_wipe(&loc);
@@ -909,7 +911,7 @@ afr_dict_contains_heal_op(call_frame_t *frame)
local = frame->local;
xdata_req = local->xdata_req;
- ret = dict_get_int32(xdata_req, "heal-op", &heal_op);
+ ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
if (ret)
return _gf_false;
if (local->xdata_rsp == NULL) {
@@ -917,8 +919,8 @@ afr_dict_contains_heal_op(call_frame_t *frame)
if (!local->xdata_rsp)
return _gf_true;
}
- ret = dict_set_str(local->xdata_rsp, "sh-fail-msg",
- "File not in split-brain");
+ ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg",
+ SFILE_NOT_IN_SPLIT_BRAIN);
return _gf_true;
}
@@ -972,7 +974,8 @@ afr_mark_split_brain_source_sinks_by_heal_op(
xdata_rsp = local->xdata_rsp;
if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
- ret = dict_set_str(xdata_rsp, "sh-fail-msg", SBRAIN_HEAL_NO_GO_MSG);
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SBRAIN_HEAL_NO_GO_MSG);
ret = -1;
goto out;
}
@@ -983,16 +986,16 @@ afr_mark_split_brain_source_sinks_by_heal_op(
switch (heal_op) {
case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
if (type == AFR_METADATA_TRANSACTION) {
- ret = dict_set_str(xdata_rsp, "sh-fail-msg",
- "Use source-brick option to"
- " heal metadata split-brain");
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SUSE_SOURCE_BRICK_TO_HEAL);
if (!ret)
ret = -1;
goto out;
}
afr_mark_largest_file_as_source(this, sources, replies);
if (AFR_COUNT(sources, priv->child_count) != 1) {
- ret = dict_set_str(xdata_rsp, "sh-fail-msg", "No bigger file");
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SNO_BIGGER_FILE);
if (!ret)
ret = -1;
goto out;
@@ -1000,36 +1003,36 @@ afr_mark_split_brain_source_sinks_by_heal_op(
break;
case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
if (type == AFR_METADATA_TRANSACTION) {
- ret = dict_set_str(xdata_rsp, "sh-fail-msg",
- "Use source-brick option to"
- " heal metadata split-brain");
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SUSE_SOURCE_BRICK_TO_HEAL);
if (!ret)
ret = -1;
goto out;
}
afr_mark_latest_mtime_file_as_source(this, sources, replies);
if (AFR_COUNT(sources, priv->child_count) != 1) {
- ret = dict_set_str(xdata_rsp, "sh-fail-msg",
- "No difference in mtime");
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SNO_DIFF_IN_MTIME);
if (!ret)
ret = -1;
goto out;
}
break;
case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
- ret = dict_get_str(xdata_req, "child-name", &name);
+ ret = dict_get_str_sizen(xdata_req, "child-name", &name);
if (ret)
goto out;
source = afr_get_child_index_from_name(this, name);
if (source < 0) {
- ret = dict_set_str(xdata_rsp, "sh-fail-msg",
- "Invalid brick name");
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SINVALID_BRICK_NAME);
if (!ret)
ret = -1;
goto out;
}
if (locked_on[source] != 1) {
- ret = dict_set_str(xdata_rsp, "sh-fail-msg", "Brick is not up");
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SBRICK_IS_NOT_UP);
if (!ret)
ret = -1;
goto out;
@@ -1071,8 +1074,8 @@ afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies,
for (i = 0; i < priv->child_count; i++) {
if (replies[i].valid == 1) {
gf_msg_debug(this->name, 0,
- "Child:%s "
- "mtime_sec = %ld, size = %lu for gfid %s",
+ "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64
+ " for gfid %s",
priv->children[i]->name, replies[i].poststat.ia_mtime,
replies[i].poststat.ia_size, uuid_utoa(inode->gfid));
vote_count = 0;
@@ -1110,8 +1113,9 @@ afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
for (i = 0; i < priv->child_count; i++) {
if (replies[i].valid == 1) {
gf_msg_debug(this->name, 0,
- "Child:%s "
- "mtime = %ld, mtime_nsec = %d for gfid %s",
+ "Child:%s mtime = %" PRId64
+ ", mtime_nsec = %d for "
+ "gfid %s",
priv->children[i]->name, replies[i].poststat.ia_mtime,
replies[i].poststat.ia_mtime_nsec,
uuid_utoa(inode->gfid));
@@ -1147,8 +1151,9 @@ afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
for (i = 0; i < priv->child_count; i++) {
if (replies[i].valid == 1) {
gf_msg_debug(this->name, 0,
- "Child:%s "
- "ctime = %ld, ctime_nsec = %d for gfid %s",
+ "Child:%s ctime = %" PRId64
+ ", ctime_nsec = %d for "
+ "gfid %s",
priv->children[i]->name, replies[i].poststat.ia_ctime,
replies[i].poststat.ia_ctime_nsec,
uuid_utoa(inode->gfid));
@@ -1168,7 +1173,8 @@ afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
}
/*
- * afr_sh_fav_by_size: Choose favorite child by size.
+ * afr_sh_fav_by_size: Choose favorite child by size
+ * when not all files are of zero size.
*/
int
afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode)
@@ -1179,19 +1185,31 @@ afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode)
uint64_t cmp_sz = 0;
priv = this->private;
-
for (i = 0; i < priv->child_count; i++) {
- if (replies[i].valid == 1) {
- gf_msg_debug(this->name, 0,
- "Child:%s "
- "file size = %lu for gfid %s",
- priv->children[i]->name, replies[i].poststat.ia_size,
- uuid_utoa(inode->gfid));
- if (replies[i].poststat.ia_size > cmp_sz) {
- cmp_sz = replies[i].poststat.ia_size;
- fav_child = i;
- }
+ if (!replies[i].valid) {
+ continue;
+ }
+ gf_msg_debug(this->name, 0,
+ "Child:%s file size = %" PRIu64 " for gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_size,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_type == IA_IFDIR) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Cannot perform selfheal on %s. "
+ "Size policy is not applicable to directories.",
+ uuid_utoa(inode->gfid));
+ break;
}
+ if (replies[i].poststat.ia_size > cmp_sz) {
+ cmp_sz = replies[i].poststat.ia_size;
+ fav_child = i;
+ } else if (replies[i].poststat.ia_size == cmp_sz) {
+ fav_child = -1;
+ }
+ }
+ if (fav_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "No bigger file");
}
return fav_child;
}
@@ -1258,7 +1276,10 @@ afr_mark_split_brain_source_sinks_by_policy(
priv = this->private;
fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str);
- if (fav_child > priv->child_count - 1) {
+ if (fav_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "No child selected by favorite-child policy.");
+ } else if (fav_child > priv->child_count - 1) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
"Invalid child (%d) "
"selected by policy %s.",
@@ -1272,10 +1293,10 @@ afr_mark_split_brain_source_sinks_by_policy(
strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
- "Source %s "
- "selected as authentic to resolve conflicting "
- "data in file (gfid:%s) by %s (%lu bytes @ %s mtime, "
- "%s ctime).",
+ "Source %s selected as authentic to resolve conflicting data "
+ "in file (gfid:%s) by %s (%" PRIu64
+ " bytes @ %s mtime, %s "
+ "ctime).",
priv->children[fav_child]->name, uuid_utoa(inode->gfid),
policy_str, replies[fav_child].poststat.ia_size, mtime_str,
ctime_str);
@@ -1389,7 +1410,7 @@ afr_mark_split_brain_source_sinks(
if (source >= 0)
return source;
- ret = dict_get_int32(xdata_req, "heal-op", &heal_op);
+ ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
if (ret)
goto autoheal;
@@ -1405,7 +1426,7 @@ autoheal:
frame, this, inode, sources, sinks, healed_sinks, locked_on,
replies, type);
if (source != -1) {
- ret = dict_set_int32(xdata_req, "fav-child-policy", 1);
+ ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1);
if (ret)
return -1;
}
@@ -1436,7 +1457,7 @@ _afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
priv = this->private;
local = frame->local;
- if (!dict_get(local->xdata_req, "fav-child-policy"))
+ if (!dict_get_sizen(local->xdata_req, "fav-child-policy"))
return 0;
xdata = dict_new();
@@ -1554,7 +1575,6 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
char *accused = NULL; /* Accused others without any self-accusal */
char *pending = NULL; /* Have pending operations on others */
char *self_accused = NULL; /* Accused itself */
- int min_participants = -1;
priv = this->private;
@@ -1578,12 +1598,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
}
}
- if (type == AFR_DATA_TRANSACTION) {
- min_participants = priv->child_count;
- } else {
- min_participants = AFR_SH_MIN_PARTICIPANTS;
- }
- if (afr_success_count(replies, priv->child_count) < min_participants) {
+ if (afr_success_count(replies, priv->child_count) < priv->child_count) {
/* Treat this just like locks not being acquired */
return -ENOTCONN;
}
@@ -1644,7 +1659,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
}
}
- if (type == AFR_DATA_TRANSACTION)
+ if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
afr_selfheal_post_op_failure_accounting(priv, accused, sources,
locked_on);
@@ -1752,11 +1767,9 @@ afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (xdata) {
local->replies[i].xdata = dict_ref(xdata);
ret = dict_get_int8(xdata, "link-count", &need_heal);
- local->replies[i].need_heal = need_heal;
- } else {
- local->replies[i].need_heal = need_heal;
}
+ local->replies[i].need_heal = need_heal;
syncbarrier_wake(&local->barrier);
return 0;
@@ -1812,10 +1825,41 @@ afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
return inode;
}
+static int
+afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ char *key1 = NULL;
+ char *key2 = NULL;
+
+ priv = this->private;
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
+
+ ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1);
+ if (ret)
+ return ret;
+
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ ret = dict_set_uint32(dict, key1, 1);
+ if (ret)
+ return ret;
+
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
+ ret = dict_set_uint32(dict, key2, 1);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
int
afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
uuid_t gfid, struct afr_reply *replies,
- unsigned char *discover_on)
+ unsigned char *discover_on, dict_t *dict)
{
loc_t loc = {
0,
@@ -1830,12 +1874,19 @@ afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
xattr_req = dict_new();
if (!xattr_req)
return -ENOMEM;
+ if (dict)
+ dict_copy(dict, xattr_req);
if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
dict_unref(xattr_req);
return -ENOMEM;
}
+ if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) {
+ dict_unref(xattr_req);
+ return -1;
+ }
+
loc.inode = inode_ref(inode);
gf_uuid_copy(loc.gfid, gfid);
@@ -1854,12 +1905,16 @@ int
afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ dict_t *dict = NULL;
- priv = frame->this->private;
+ local = frame->local;
+
+ if (local->xattr_req)
+ dict = local->xattr_req;
return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
- priv->child_up);
+ local->child_up, dict);
}
unsigned int
@@ -2226,7 +2281,8 @@ int
afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
inode_t **link_inode, gf_boolean_t *data_selfheal,
gf_boolean_t *metadata_selfheal,
- gf_boolean_t *entry_selfheal)
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies_dst)
{
afr_private_t *priv = NULL;
inode_t *inode = NULL;
@@ -2285,11 +2341,13 @@ afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
priv->children[i]->name,
uuid_utoa(replies[i].poststat.ia_gfid));
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;"
"type=file;gfid=%s;"
"ia_type-%d=%s;ia_type-%d=%s",
- this->name, uuid_utoa(replies[i].poststat.ia_gfid),
- first_idx, gf_inode_type_to_str(first.ia_type), i,
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(replies[i].poststat.ia_gfid), first_idx,
+ gf_inode_type_to_str(first.ia_type), i,
gf_inode_type_to_str(replies[i].poststat.ia_type));
ret = -EIO;
goto out;
@@ -2360,6 +2418,8 @@ afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
ret = 0;
out:
+ if (replies && replies_dst)
+ afr_replies_copy(replies_dst, replies, priv->child_count);
if (inode)
inode_unref(inode);
if (replies)
@@ -2399,8 +2459,11 @@ afr_frame_create(xlator_t *this, int32_t *op_errno)
pid_t pid = GF_CLIENT_PID_SELF_HEALD;
frame = create_frame(this, this->ctx->pool);
- if (!frame)
+ if (!frame) {
+ if (op_errno)
+ *op_errno = ENOMEM;
return NULL;
+ }
local = AFR_FRAME_INIT(frame, (*op_errno));
if (!local) {
@@ -2471,17 +2534,12 @@ afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid)
gf_boolean_t metadata_selfheal = _gf_false;
gf_boolean_t entry_selfheal = _gf_false;
afr_private_t *priv = NULL;
- gf_boolean_t dataheal_enabled = _gf_false;
priv = this->private;
- ret = gf_string2boolean(priv->data_self_heal, &dataheal_enabled);
- if (ret)
- goto out;
-
ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode,
&data_selfheal, &metadata_selfheal,
- &entry_selfheal);
+ &entry_selfheal, NULL);
if (ret)
goto out;
@@ -2498,7 +2556,7 @@ afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid)
}
}
- if (data_selfheal && dataheal_enabled)
+ if (data_selfheal && priv->data_self_heal)
data_ret = afr_selfheal_data(frame, this, fd);
if (metadata_selfheal && priv->metadata_self_heal)
@@ -2692,3 +2750,185 @@ afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
out:
return source;
}
+
+static int
+afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->replies[i].poststat = *buf;
+ local->replies[i].preparent = *preparent;
+ local->replies[i].postparent = *postparent;
+ }
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ unsigned char *mkdir_on = alloca0(priv->child_count);
+ unsigned char *lookup_on = alloca0(priv->child_count);
+ loc_t loc = {0};
+ int32_t op_errno = 0;
+ int32_t child_op_errno = 0;
+ struct iatt iatt = {0};
+ dict_t *xdata = NULL;
+ uuid_t anon_inode_gfid = {0};
+ int mkdir_count = 0;
+ int i = 0;
+
+ /*Try to mkdir everywhere and return success if the dir exists on 'child'
+ */
+
+ if (!priv->use_anon_inode) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &op_errno);
+ if (op_errno) {
+ goto out;
+ }
+ local = frame->local;
+ if (!local->child_up[child]) {
+ /*Other bricks may need mkdir so don't error out yet*/
+ child_op_errno = ENOTCONN;
+ }
+ gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ if (priv->anon_inode[i]) {
+ mkdir_on[i] = 0;
+ } else {
+ mkdir_on[i] = 1;
+ mkdir_count++;
+ }
+ }
+
+ if (mkdir_count == 0) {
+ *linked_inode = inode_find(this->itable, anon_inode_gfid);
+ if (*linked_inode) {
+ op_errno = 0;
+ goto out;
+ }
+ }
+
+ loc.parent = inode_ref(this->itable->root);
+ loc.name = priv->anon_inode_name;
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true);
+ if (op_errno) {
+ goto out;
+ }
+
+ if (mkdir_count == 0) {
+ memcpy(lookup_on, local->child_up, priv->child_count);
+ goto lookup;
+ }
+
+ AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0,
+ xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!mkdir_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno == EEXIST) {
+ lookup_on[i] = 1;
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+
+ if (AFR_COUNT(lookup_on, priv->child_count) == 0) {
+ goto link;
+ }
+
+lookup:
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xdata);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!lookup_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ if (gf_uuid_compare(anon_inode_gfid,
+ local->replies[i].poststat.ia_gfid) == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else {
+ if (i == child)
+ child_op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA,
+ "%s has gfid: %s", priv->anon_inode_name,
+ uuid_utoa(local->replies[i].poststat.ia_gfid));
+ }
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+link:
+ if (!gf_uuid_is_null(iatt.ia_gfid)) {
+ *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt);
+ if (*linked_inode) {
+ op_errno = 0;
+ inode_lookup(*linked_inode);
+ } else {
+ op_errno = ENOMEM;
+ }
+ goto out;
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ /*child_op_errno takes precedence*/
+ if (child_op_errno == 0) {
+ child_op_errno = op_errno;
+ }
+
+ if (child_op_errno && *linked_inode) {
+ inode_unref(*linked_inode);
+ *linked_inode = NULL;
+ }
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return -child_op_errno;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 9b296b9ad23..37bcc2b3f9e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -10,15 +10,10 @@
#include "afr.h"
#include "afr-self-heal.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "protocol-common.h"
#include "afr-messages.h"
-#include "events.h"
-
-enum {
- AFR_SELFHEAL_DATA_FULL = 0,
- AFR_SELFHEAL_DATA_DIFF,
-};
+#include <glusterfs/events.h>
#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size))
static int
@@ -73,7 +68,7 @@ __afr_can_skip_data_block_heal(call_frame_t *frame, xlator_t *this, fd_t *fd,
xdata = dict_new();
if (!xdata)
goto out;
- if (dict_set_int32(xdata, "check-zero-filled", 1)) {
+ if (dict_set_int32_sizen(xdata, "check-zero-filled", 1)) {
dict_unref(xdata);
goto out;
}
@@ -134,7 +129,7 @@ __afr_is_sink_zero_filled(xlator_t *this, fd_t *fd, size_t size, off_t offset,
priv = this->private;
ret = syncop_readv(priv->children[sink], fd, size, offset, 0, &iovec,
- &count, &iobref, NULL, NULL);
+ &count, &iobref, NULL, NULL, NULL);
if (ret < 0)
goto out;
ret = iov_0filled(iovec, count);
@@ -164,7 +159,7 @@ __afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd,
priv = this->private;
ret = syncop_readv(priv->children[source], fd, size, offset, 0, &iovec,
- &count, &iobref, NULL, NULL);
+ &count, &iobref, NULL, NULL, NULL);
if (ret <= 0)
return ret;
@@ -212,7 +207,7 @@ __afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd,
}
ret = syncop_writev(priv->children[i], fd, iovec, count, offset, iobref,
- 0, NULL, NULL);
+ 0, NULL, NULL, NULL, NULL);
if (ret != iov_length(iovec, count)) {
/* write() failed on this sink. unset the corresponding
member in sinks[] (which is healed_sinks[] in the
@@ -230,24 +225,40 @@ __afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd,
return ret;
}
+static gf_boolean_t
+afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source,
+ unsigned char *healed_sinks)
+{
+ afr_private_t *priv = this->private;
+ int i = 0;
+
+ if (!locked_on[source])
+ return _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i] && locked_on[i])
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
static int
afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd,
int source, unsigned char *healed_sinks, off_t offset,
size_t size, int type, struct afr_reply *replies)
{
int ret = -1;
- int sink_count = 0;
afr_private_t *priv = NULL;
unsigned char *data_lock = NULL;
priv = this->private;
- sink_count = AFR_COUNT(healed_sinks, priv->child_count);
data_lock = alloca0(priv->child_count);
ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size,
data_lock);
{
- if (ret < sink_count) {
+ if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) {
ret = -ENOTCONN;
goto unlock;
}
@@ -301,7 +312,7 @@ afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks,
int type = AFR_SELFHEAL_DATA_FULL;
int i = 0;
- if (priv->data_self_heal_algorithm == NULL) {
+ if (priv->data_self_heal_algorithm == AFR_SELFHEAL_DATA_DYNAMIC) {
type = AFR_SELFHEAL_DATA_FULL;
for (i = 0; i < priv->child_count; i++) {
if (!healed_sinks[i] && i != source)
@@ -311,10 +322,8 @@ afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks,
break;
}
}
- } else if (strcmp(priv->data_self_heal_algorithm, "full") == 0) {
- type = AFR_SELFHEAL_DATA_FULL;
- } else if (strcmp(priv->data_self_heal_algorithm, "diff") == 0) {
- type = AFR_SELFHEAL_DATA_DIFF;
+ } else {
+ type = priv->data_self_heal_algorithm;
}
return type;
}
@@ -331,6 +340,9 @@ afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
call_frame_t *iter_frame = NULL;
unsigned char arbiter_sink_status = 0;
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing data selfheal on %s", uuid_utoa(fd->inode->gfid));
+
priv = this->private;
if (priv->arbiter_count) {
arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
@@ -382,17 +394,18 @@ __afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd,
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- unsigned char arbiter_sink_status = 0;
int i = 0;
local = frame->local;
priv = this->private;
- if (priv->arbiter_count) {
- arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
- healed_sinks[ARBITER_BRICK_INDEX] = 0;
- }
-
+ /* This will send truncate on the arbiter brick as well if it is marked as
+ * sink. If changelog is enabled on the volume it captures truncate as a
+ * data transactions on the arbiter brick. This will help geo-rep to
+ * properly sync the data from master to slave if arbiter is the ACTIVE
+ * brick during syncing and which had got some entries healed for data as
+ * part of self heal.
+ */
AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size,
NULL);
@@ -403,8 +416,6 @@ __afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd,
*/
healed_sinks[i] = 0;
- if (arbiter_sink_status)
- healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status;
return 0;
}
@@ -545,9 +556,11 @@ __afr_selfheal_data_finalize_source(
replies, AFR_DATA_TRANSACTION);
if (source < 0) {
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;type=data;"
"file=%s",
- this->name, uuid_utoa(inode->gfid));
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(inode->gfid));
return -EIO;
}
@@ -700,19 +713,18 @@ __afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd,
goto unlock;
}
- if (priv->arbiter_count &&
- AFR_COUNT(healed_sinks, priv->child_count) == 1 &&
- healed_sinks[ARBITER_BRICK_INDEX]) {
- is_arbiter_the_only_sink = _gf_true;
- goto restore_time;
- }
-
ret = __afr_selfheal_truncate_sinks(
frame, this, fd, healed_sinks,
locked_replies[source].poststat.ia_size);
if (ret < 0)
goto unlock;
+ if (priv->arbiter_count &&
+ AFR_COUNT(healed_sinks, priv->child_count) == 1 &&
+ healed_sinks[ARBITER_BRICK_INDEX]) {
+ is_arbiter_the_only_sink = _gf_true;
+ goto restore_time;
+ }
ret = 0;
}
unlock:
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index fb6952c10fd..64893f441e3 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -10,60 +10,176 @@
#include "afr.h"
#include "afr-self-heal.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "afr-transaction.h"
#include "afr-messages.h"
-#include "syncop-utils.h"
-#include "events.h"
+#include <glusterfs/syncop-utils.h>
+#include <glusterfs/events.h>
-static int
-afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
- inode_t *inode, int child, struct afr_reply *replies)
+int
+afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child,
+ struct afr_reply *replies,
+ gf_boolean_t *anon_inode)
{
afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
xlator_t *subvol = NULL;
int ret = 0;
+ int i = 0;
+ char g[64] = {0};
+ unsigned char *lookup_success = NULL;
+ call_frame_t *frame = NULL;
+ loc_t loc2 = {
+ 0,
+ };
loc_t loc = {
0,
};
- char g[64];
priv = this->private;
-
subvol = priv->children[child];
+ lookup_success = alloca0(priv->child_count);
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (replies[child].poststat.ia_type == IA_IFDIR) {
+ /* This directory may have sub-directory hierarchy which may need to
+ * be preserved for subsequent heals. So unconditionally move the
+ * directory to anonymous-inode directory*/
+ *anon_inode = _gf_true;
+ goto anon_inode;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid);
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ lookup_success[i] = 1;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ ret = -local->replies[i].op_errno;
+ }
+ }
+
+ if (priv->quorum_count) {
+ if (afr_has_quorum(lookup_success, this, NULL)) {
+ *anon_inode = _gf_true;
+ }
+ } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) {
+ *anon_inode = _gf_true;
+ } else if (ret) {
+ goto out;
+ }
+
+anon_inode:
+ if (!*anon_inode) {
+ ret = 0;
+ goto out;
+ }
loc.parent = inode_ref(dir);
gf_uuid_copy(loc.pargfid, dir->gfid);
loc.name = name;
- loc.inode = inode_ref(inode);
- if (replies[child].valid && replies[child].op_ret == 0) {
- switch (replies[child].poststat.ia_type) {
- case IA_IFDIR:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid),
- name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
- subvol->name);
- ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
- break;
- default:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
- name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
- subvol->name);
- ret = syncop_unlink(subvol, &loc, NULL, NULL);
- break;
- }
+ ret = afr_anon_inode_create(this, child, &loc2.parent);
+ if (ret < 0)
+ goto out;
+
+ loc2.name = g;
+ ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s failed",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s successful",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
}
+out:
loc_wipe(&loc);
+ loc_wipe(&loc2);
+ if (frame) {
+ AFR_STACK_DESTROY(frame);
+ }
return ret;
}
int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies)
+{
+ char g[64] = {0};
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ gf_boolean_t anon_inode = _gf_false;
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ if ((!replies[child].valid) || (replies[child].op_ret < 0)) {
+ /*Nothing to do*/
+ ret = 0;
+ goto out;
+ }
+
+ if (priv->use_anon_inode) {
+ ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child,
+ replies, &anon_inode);
+ if (ret < 0 || anon_inode)
+ goto out;
+ }
+
+ loc.parent = inode_ref(dir);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ loc.name = name;
+ switch (replies[child].poststat.ia_type) {
+ case IA_IFDIR:
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+ break;
+ default:
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
+ name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_unlink(subvol, &loc, NULL, NULL);
+ break;
+ }
+
+out:
+ loc_wipe(&loc);
+ return ret;
+}
+
+int
afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
unsigned char *sources, inode_t *dir,
const char *name, inode_t *inode,
@@ -76,6 +192,9 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
loc_t srcloc = {
0,
};
+ loc_t anonloc = {
+ 0,
+ };
xlator_t *this = frame->this;
afr_private_t *priv = NULL;
dict_t *xdata = NULL;
@@ -86,15 +205,18 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
0,
};
unsigned char *newentry = NULL;
+ char iatt_uuid_str[64] = {0};
+ char dir_uuid_str[64] = {0};
priv = this->private;
iatt = &replies[source].poststat;
+ uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str);
if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED,
"Invalid ia_type (%d) or gfid(%s). source brick=%d, "
"pargfid=%s, name=%s",
- iatt->ia_type, uuid_utoa(iatt->ia_gfid), source,
- uuid_utoa(dir->gfid), name);
+ iatt->ia_type, iatt_uuid_str, source,
+ uuid_utoa_r(dir->gfid, dir_uuid_str), name);
ret = -EINVAL;
goto out;
}
@@ -119,14 +241,24 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
srcloc.inode = inode_ref(inode);
gf_uuid_copy(srcloc.gfid, iatt->ia_gfid);
- if (iatt->ia_type != IA_IFDIR)
- ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
- if (iatt->ia_type == IA_IFDIR || ret == -ENOENT || ret == -ESTALE) {
+ ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
+ if (ret == -ENOENT || ret == -ESTALE) {
newentry[dst] = 1;
ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies,
sources, newentry);
if (ret)
goto out;
+ } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) {
+ // Try rename from hidden directory
+ ret = afr_anon_inode_create(this, dst, &anonloc.parent);
+ if (ret < 0)
+ goto out;
+ anonloc.inode = inode_ref(inode);
+ anonloc.name = iatt_uuid_str;
+ ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = -1; /*This sets 'mismatch' to true*/
+ goto out;
}
mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type);
@@ -149,7 +281,7 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
}
break;
default:
- ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+ ret = dict_set_int32_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
if (ret)
goto out;
ret = syncop_mknod(
@@ -165,6 +297,7 @@ out:
GF_FREE(linkname);
loc_wipe(&loc);
loc_wipe(&srcloc);
+ loc_wipe(&anonloc);
return ret;
}
@@ -246,6 +379,19 @@ afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this,
if (replies[i].op_ret != 0)
continue;
+ if (gf_uuid_is_null(replies[i].poststat.ia_gfid))
+ continue;
+
+ if (replies[i].poststat.ia_type == IA_INVAL)
+ continue;
+
+ if (ia_type == IA_INVAL || gf_uuid_is_null(gfid)) {
+ src_idx = i;
+ ia_type = replies[src_idx].poststat.ia_type;
+ gfid = &replies[src_idx].poststat.ia_gfid;
+ continue;
+ }
+
if (gf_uuid_compare(gfid, replies[i].poststat.ia_gfid) &&
(ia_type == replies[i].poststat.ia_type)) {
ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
@@ -269,11 +415,12 @@ afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this,
gf_inode_type_to_str(replies[src_idx].poststat.ia_type),
priv->children[src_idx]->name);
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;type=file;"
"file=<gfid:%s>/%s>;count=2;child-%d=%s;type-"
"%d=%s;child-%d=%s;type-%d=%s",
- this->name, uuid_utoa(pargfid), bname, i,
- priv->children[i]->name, i,
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(pargfid), bname, i, priv->children[i]->name, i,
gf_inode_type_to_str(replies[i].poststat.ia_type), src_idx,
priv->children[src_idx]->name, src_idx,
gf_inode_type_to_str(replies[src_idx].poststat.ia_type));
@@ -465,6 +612,7 @@ __afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources,
afr_private_t *priv = NULL;
int source = -1;
int sources_count = 0;
+ int i = 0;
priv = this->private;
@@ -478,6 +626,20 @@ __afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources,
}
source = afr_choose_source_by_policy(priv, sources, AFR_ENTRY_TRANSACTION);
+
+ /*If the selected source does not blame any other brick, then mark
+ * everything as sink to trigger conservative merge.
+ */
+ if (source != -1 && !AFR_COUNT(healed_sinks, priv->child_count)) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i]) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+ return -1;
+ }
+
return source;
}
@@ -548,10 +710,15 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
priv = this->private;
+ if (afr_is_private_directory(priv, fd->inode->gfid, name,
+ GF_CLIENT_PID_SELF_HEALD)) {
+ return 0;
+ }
+
xattr = dict_new();
if (!xattr)
return -ENOMEM;
- ret = dict_set_int32(xattr, GF_GFIDLESS_LOOKUP, 1);
+ ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
if (ret) {
dict_unref(xattr);
return -1;
@@ -568,7 +735,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes "
@@ -596,7 +763,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
replies);
if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) {
- ret = afr_shd_index_purge(subvol, parent_idx_inode, name,
+ ret = afr_shd_entry_purge(subvol, parent_idx_inode, name,
inode->ia_type);
/* Why is ret force-set to 0? We do not care about
* index purge failing for full heal as it is quite
@@ -726,10 +893,6 @@ afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd,
if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
continue;
- if (__is_root_gfid(fd->inode->gfid) &&
- !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR))
- continue;
-
ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name,
loc.inode, subvol,
local->need_full_crawl);
@@ -792,7 +955,7 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry,
/* The name indices under the pgfid index dir are guaranteed
* to be regular files. Hence the hardcoding.
*/
- afr_shd_index_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
ret = 0;
goto out;
}
@@ -831,6 +994,8 @@ afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
subvol = priv->children[subvol_idx];
args.frame = afr_copy_frame(frame);
+ if (!args.frame)
+ goto out;
args.xl = this;
/* args.heal_fd represents the fd associated with the original directory
* on which entry heal is being attempted.
@@ -849,9 +1014,10 @@ afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
* do not treat heal as failure.
*/
if (is_src)
- return -errno;
+ ret = -errno;
else
- return 0;
+ ret = 0;
+ goto out;
}
ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args,
@@ -861,7 +1027,9 @@ afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
if (args.mismatch == _gf_true)
ret = -1;
-
+out:
+ if (args.frame)
+ AFR_STACK_DESTROY(args.frame);
return ret;
}
@@ -957,7 +1125,7 @@ __afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
data_lock);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes could "
@@ -1013,6 +1181,8 @@ unlock:
goto postop_unlock;
}
+ afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
+ locked_replies);
ret = afr_selfheal_undo_pending(
frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
AFR_ENTRY_TRANSACTION, locked_replies, postop_lock);
@@ -1079,7 +1249,7 @@ afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
NULL, locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
gf_msg_debug(this->name, 0,
"%s: Skipping "
"entry self-heal as only %d sub-volumes could "
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index ea2a7bfd52f..03f43bad16e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -10,9 +10,9 @@
#include "afr.h"
#include "afr-self-heal.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "protocol-common.h"
-#include "events.h"
+#include <glusterfs/events.h>
#define AFR_HEAL_ATTR (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE)
@@ -190,6 +190,59 @@ out:
return ret;
}
+static int
+__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode,
+ struct afr_reply *replies,
+ unsigned char *sources)
+{
+ int ret = 0;
+ int i = 0;
+ int m_idx = 0;
+ afr_private_t *priv = NULL;
+ int raw[AFR_NUM_CHANGE_LOGS] = {0};
+ dict_t *xattr = NULL;
+
+ priv = this->private;
+ m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION);
+ raw[m_idx] = 1;
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i])
+ continue;
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO,
+ "Failed to set pending metadata xattr on child %d for %s", i,
+ uuid_utoa(inode->gfid));
+ goto out;
+ }
+ }
+
+ afr_replies_wipe(replies, priv->child_count);
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
+}
+
/*
* Look for mismatching uid/gid or mode or user xattrs even if
* AFR xattrs don't say so, and pick one arbitrarily as winner. */
@@ -210,6 +263,7 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
};
int source = -1;
int sources_count = 0;
+ int ret = 0;
priv = this->private;
@@ -242,9 +296,11 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
if (!priv->metadata_splitbrain_forced_heal) {
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;"
"type=metadata;file=%s",
- this->name, uuid_utoa(inode->gfid));
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(inode->gfid));
return -EIO;
}
@@ -298,7 +354,13 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
healed_sinks[i] = 1;
}
}
-
+ if ((sources_count == priv->child_count) && (source > -1) &&
+ (AFR_COUNT(healed_sinks, priv->child_count) != 0)) {
+ ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode,
+ replies, sources);
+ if (ret < 0)
+ return ret;
+ }
out:
afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
return source;
@@ -396,7 +458,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
data_lock);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
ret = -ENOTCONN;
goto unlock;
}
@@ -419,12 +481,8 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
if (ret)
goto unlock;
- /* Restore atime/mtime for files that don't need data heal as
- * restoring timestamps happens only as a part of data-heal.
- */
- if (!IA_ISREG(locked_replies[source].poststat.ia_type))
- afr_selfheal_restore_time(frame, this, inode, source, healed_sinks,
- locked_replies);
+ afr_selfheal_restore_time(frame, this, inode, source, healed_sinks,
+ locked_replies);
ret = afr_selfheal_undo_pending(
frame, this, inode, sources, sinks, healed_sinks, undid_pending,
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index aa20ad1b835..834aac86d48 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -8,7 +8,7 @@
cases as published by the Free Software Foundation.
*/
-#include "events.h"
+#include <glusterfs/events.h>
#include "afr.h"
#include "afr-self-heal.h"
#include "afr-messages.h"
@@ -98,21 +98,12 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
const char *bname, inode_t *inode,
struct afr_reply *replies)
{
- loc_t loc = {
- 0,
- };
int i = 0;
afr_private_t *priv = NULL;
- char g[64];
int ret = 0;
priv = this->private;
- loc.parent = inode_ref(parent);
- gf_uuid_copy(loc.pargfid, pargfid);
- loc.name = bname;
- loc.inode = inode_ref(inode);
-
for (i = 0; i < priv->child_count; i++) {
if (!replies[i].valid)
continue;
@@ -120,30 +111,10 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
if (replies[i].op_ret)
continue;
- switch (replies[i].poststat.ia_type) {
- case IA_IFDIR:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging dir %s/%s (%s) on %s", uuid_utoa(pargfid),
- bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g),
- priv->children[i]->name);
-
- ret |= syncop_rmdir(priv->children[i], &loc, 1, NULL, NULL);
- break;
- default:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging file %s/%s (%s) on %s", uuid_utoa(pargfid),
- bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g),
- priv->children[i]->name);
-
- ret |= syncop_unlink(priv->children[i], &loc, NULL, NULL);
- break;
- }
+ ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i,
+ replies);
}
- loc_wipe(&loc);
-
return ret;
}
@@ -222,13 +193,14 @@ afr_selfheal_name_type_mismatch_check(xlator_t *this, struct afr_reply *replies,
gf_inode_type_to_str(inode_type),
priv->children[type_idx]->name);
gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
"subvol=%s;type=file;"
"file=<gfid:%s>/%s;count=2;"
"child-%d=%s;type-%d=%s;child-%d=%s;"
"type-%d=%s",
- this->name, uuid_utoa(pargfid), bname, i,
- priv->children[i]->name, i,
- gf_inode_type_to_str(inode_type1), type_idx,
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(pargfid), bname, i, priv->children[i]->name,
+ i, gf_inode_type_to_str(inode_type1), type_idx,
priv->children[type_idx]->name, type_idx,
gf_inode_type_to_str(inode_type));
return -EIO;
@@ -277,9 +249,8 @@ afr_selfheal_name_gfid_mismatch_check(xlator_t *this, struct afr_reply *replies,
bname, gfid_idx_iter, i,
locked_on, gfid_idx, xdata);
if (!ret && *gfid_idx >= 0) {
- ret = dict_set_str(xdata, "gfid-heal-msg",
- "GFID split-brain "
- "resolved");
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ "GFID split-brain resolved");
if (ret)
gf_msg(this->name, GF_LOG_ERROR, 0,
AFR_MSG_DICT_SET_FAILED,
@@ -381,7 +352,7 @@ __afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode,
replies, gfid, locked_on, source, sources,
is_gfid_absent, &gfid_idx);
- if (ret)
+ if (ret || (gfid_idx < 0))
return ret;
ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname,
@@ -496,7 +467,7 @@ afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
if (!xattr)
return -ENOMEM;
- ret = dict_set_int32(xattr, GF_GFIDLESS_LOOKUP, 1);
+ ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
if (ret) {
dict_unref(xattr);
return -1;
@@ -514,7 +485,7 @@ afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname,
locked_on);
{
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ if (ret < priv->child_count) {
ret = -ENOTCONN;
goto unlock;
}
@@ -560,13 +531,15 @@ afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this,
struct afr_reply *replies = NULL;
inode_t *inode = NULL;
int first_idx = -1;
+ afr_local_t *local = NULL;
priv = this->private;
+ local = frame->local;
replies = alloca0(sizeof(*replies) * priv->child_count);
inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
- priv->child_up, NULL);
+ local->child_up, NULL);
if (!inode)
return -ENOMEM;
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 41adbe05dc8..48e6dbcfb18 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -11,8 +11,6 @@
#ifndef _AFR_SELFHEAL_H
#define _AFR_SELFHEAL_H
-#define AFR_SH_MIN_PARTICIPANTS 2
-
/* Perform fop on all UP subvolumes and wait for all callbacks to return */
#define AFR_ONALL(frame, rfn, fop, args...) \
@@ -20,9 +18,8 @@
afr_local_t *__local = frame->local; \
afr_private_t *__priv = frame->this->private; \
int __i = 0, __count = 0; \
- unsigned char *__child_up = NULL; \
+ unsigned char *__child_up = alloca(__priv->child_count); \
\
- __child_up = alloca0(__priv->child_count); \
memcpy(__child_up, __priv->child_up, \
sizeof(*__child_up) * __priv->child_count); \
__count = AFR_COUNT(__child_up, __priv->child_count); \
@@ -48,13 +45,16 @@
afr_local_t *__local = frame->local; \
afr_private_t *__priv = frame->this->private; \
int __i = 0; \
- int __count = AFR_COUNT(list, __priv->child_count); \
+ int __count = 0; \
+ unsigned char *__list = alloca(__priv->child_count); \
\
+ memcpy(__list, list, sizeof(*__list) * __priv->child_count); \
+ __count = AFR_COUNT(__list, __priv->child_count); \
__local->barrier.waitfor = __count; \
afr_local_replies_wipe(__local, __priv); \
\
for (__i = 0; __i < __priv->child_count; __i++) { \
- if (!list[__i]) \
+ if (!__list[__i]) \
continue; \
STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
__priv->children[__i], \
@@ -83,9 +83,9 @@
#define ALLOC_MATRIX(n, type) \
({ \
- type **__ptr = NULL; \
int __i; \
- __ptr = alloca0(n * sizeof(type *)); \
+ type **__ptr = alloca(n * sizeof(type *)); \
+ \
for (__i = 0; __i < n; __i++) \
__ptr[__i] = alloca0(n * sizeof(type)); \
__ptr; \
@@ -97,6 +97,27 @@
#define SBRAIN_HEAL_NO_GO_MSG \
"Failed to obtain replies from all bricks of " \
"the replica (are they up?). Cannot resolve split-brain."
+#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain"
+#define SNO_BIGGER_FILE "No bigger file"
+#define SNO_DIFF_IN_MTIME "No difference in mtime"
+#define SUSE_SOURCE_BRICK_TO_HEAL \
+ "Use source-brick option to heal metadata" \
+ " split-brain"
+#define SINVALID_BRICK_NAME "Invalid brick name"
+#define SBRICK_IS_NOT_UP "Brick is not up"
+#define SBRICK_NOT_CONNECTED "Brick is not connected"
+#define SLESS_THAN2_BRICKS_in_REP "< 2 bricks in replica are up"
+#define SBRICK_IS_REMOTE "Brick is remote"
+#define SSTARTED_SELF_HEAL "Started self-heal"
+#define SOP_NOT_SUPPORTED "Operation Not Supported"
+#define SFILE_NOT_UNDER_DATA \
+ "The file is not under data or metadata " \
+ "split-brain"
+#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain"
+#define SALL_BRICKS_UP_TO_RESOLVE \
+ "All the bricks should be up to resolve the" \
+ " gfid split brain"
+#define SERROR_GETTING_SRC_BRICK "Error getting the source brick"
int
afr_selfheal(xlator_t *this, uuid_t gfid);
@@ -166,7 +187,7 @@ afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
int
afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
uuid_t gfid, struct afr_reply *replies,
- unsigned char *discover_on);
+ unsigned char *discover_on, dict_t *dict);
inode_t *
afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
const char *name, struct afr_reply *replies,
@@ -305,7 +326,8 @@ int
afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
inode_t **link_inode, gf_boolean_t *data_selfheal,
gf_boolean_t *metadata_selfheal,
- gf_boolean_t *entry_selfheal);
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies);
int
afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid);
@@ -347,4 +369,9 @@ gf_boolean_t
afr_is_file_empty_on_all_children(afr_private_t *priv,
struct afr_reply *replies);
+int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies);
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode);
#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 5ad39a2e341..109fd4b7421 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -12,11 +12,10 @@
#include "afr-self-heal.h"
#include "afr-self-heald.h"
#include "protocol-common.h"
-#include "syncop-utils.h"
+#include <glusterfs/syncop-utils.h>
#include "afr-messages.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
-#define SHD_INODE_LRU_LIMIT 2048
#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
#define AFR_STATISTICS_HISTORY_SIZE 50
@@ -95,7 +94,7 @@ __afr_shd_healer_wait(struct subvol_healer *healer)
priv = healer->this->private;
disabled_loop:
- wait_till.tv_sec = time(NULL) + priv->shd.timeout;
+ wait_till.tv_sec = gf_time() + priv->shd.timeout;
while (!healer->rerun) {
ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till);
@@ -223,7 +222,7 @@ out:
}
int
-afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name,
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
ia_type_t type)
{
int ret = 0;
@@ -372,8 +371,9 @@ afr_shd_sweep_prepare(struct subvol_healer *healer)
event->split_brain_count = 0;
event->heal_failed_count = 0;
- time(&event->start_time);
+ event->start_time = gf_time();
event->end_time = 0;
+ _mask_cancellation();
}
void
@@ -386,8 +386,8 @@ afr_shd_sweep_done(struct subvol_healer *healer)
event = &healer->crawl_event;
shd = &(((afr_private_t *)healer->this->private)->shd);
- time(&event->end_time);
- history = memdup(event, sizeof(*event));
+ event->end_time = gf_time();
+ history = gf_memdup(event, sizeof(*event));
event->start_time = 0;
if (!history)
@@ -395,6 +395,7 @@ afr_shd_sweep_done(struct subvol_healer *healer)
if (eh_save_history(shd->statistics[healer->subvol], history) < 0)
GF_FREE(history);
+ _unmask_cancellation();
}
int
@@ -423,7 +424,7 @@ afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
ret = afr_shd_selfheal(healer, healer->subvol, gfid);
if (ret == -ENOENT || ret == -ESTALE)
- afr_shd_index_purge(subvol, parent->inode, entry->d_name, val);
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val);
if (ret == 2)
/* If bricks crashed in pre-op after creating indices/xattrop
@@ -465,7 +466,7 @@ afr_shd_index_sweep(struct subvol_healer *healer, char *vgfid)
}
xdata = dict_new();
- if (!xdata || dict_set_int32(xdata, "get-gfid-type", 1)) {
+ if (!xdata || dict_set_int32_sizen(xdata, "get-gfid-type", 1)) {
ret = -ENOMEM;
goto out;
}
@@ -523,6 +524,11 @@ afr_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
afr_private_t *priv = NULL;
priv = this->private;
+
+ if (this->cleanup_starting) {
+ return -ENOTCONN;
+ }
+
if (!priv->shd.enabled)
return -EBUSY;
@@ -591,7 +597,9 @@ _afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
{
afr_private_t *priv = NULL;
dict_t *xattr = NULL;
- int *raw = NULL;
+ int raw[AFR_NUM_CHANGE_LOGS] = {
+ 0,
+ };
int ret = -1;
int i = 0;
@@ -603,18 +611,11 @@ _afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
"Failed to create dict.");
goto out;
}
-
for (i = 0; i < priv->child_count; i++) {
- raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
- if (!raw)
- goto out;
-
- ret = dict_set_bin(xattr, priv->pending_key[i], raw,
- AFR_NUM_CHANGE_LOGS * sizeof(int));
- if (ret) {
- GF_FREE(raw);
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], &raw,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret)
goto out;
- }
}
ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
@@ -641,6 +642,7 @@ afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer,
if (afr_shd_fill_ta_loc(this, loc)) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate thin-arbiter loc for: %s.", loc->name);
+ ret = -1;
goto out;
}
@@ -799,6 +801,218 @@ afr_bricks_available_for_heal(afr_private_t *priv)
return _gf_true;
}
+static gf_boolean_t
+afr_shd_ta_needs_heal(xlator_t *this, struct subvol_healer *healer)
+{
+ dict_t *xdata = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = -1;
+ int i = 0;
+ gf_boolean_t need_heal = _gf_false;
+
+ priv = this->private;
+
+ ret = afr_shd_fill_ta_loc(this, &loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ healer->rerun = 1;
+ goto out;
+ }
+
+ if (_afr_shd_ta_get_xattrs(this, &loc, &xdata)) {
+ healer->rerun = 1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (afr_ta_dict_contains_pending_xattr(xdata, priv, i)) {
+ need_heal = _gf_true;
+ break;
+ }
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+
+ return need_heal;
+}
+
+static int
+afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ afr_private_t *priv = healer->this->private;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ loc_t loc = {0};
+ int count = 0;
+ int i = 0;
+ int op_errno = 0;
+ struct iatt *iatt = NULL;
+ gf_boolean_t multiple_links = _gf_false;
+ unsigned char *gfid_present = alloca0(priv->child_count);
+ unsigned char *entry_present = alloca0(priv->child_count);
+ char *type = "file";
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ gf_msg_debug(healer->this->name, 0,
+ "Not all bricks are up. Skipping "
+ "cleanup of %s on %s",
+ entry->d_name, subvol->name);
+ ret = 0;
+ goto out;
+ }
+
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = gf_uuid_parse(entry->d_name, loc.gfid);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ gfid_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ if (iatt->ia_type == IA_IFDIR) {
+ type = "dir";
+ }
+
+ if (i == healer->subvol) {
+ if (local->replies[i].poststat.ia_nlink > 1) {
+ multiple_links = _gf_true;
+ }
+ }
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ /*We don't have complete view. Skip the entry*/
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ /*Inode is deleted from subvol*/
+ if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) {
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type,
+ priv->anon_inode_name, entry->d_name, subvol->name);
+ ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name,
+ iatt->ia_type);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = 0;
+ } else if (count > 1) {
+ loc_wipe(&loc);
+ loc.parent = inode_ref(parent->inode);
+ loc.name = entry->d_name;
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup,
+ &loc, NULL);
+ count = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ entry_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ /*We don't have complete view. Skip the entry*/
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (gfid_present[i] && !entry_present[i]) {
+ /*Entry is not anonymous on at least one subvol*/
+ gf_msg_debug(healer->this->name, 0,
+ "Valid entry present on %s "
+ "Skipping cleanup of %s on %s",
+ priv->children[i]->name, entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging %s %s/%s on all subvols", type, priv->anon_inode_name,
+ entry->d_name);
+ ret = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent,
+ entry->d_name, iatt->ia_type);
+ if (op_errno != ENOENT && op_errno != ESTALE) {
+ ret |= -op_errno;
+ }
+ }
+ }
+
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return ret;
+}
+
+static void
+afr_cleanup_anon_inode_dir(struct subvol_healer *healer)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_private_t *priv = healer->this->private;
+ loc_t loc = {0};
+
+ ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode);
+ if (ret)
+ goto out;
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer,
+ afr_shd_anon_inode_cleaner, NULL,
+ priv->shd.max_threads, priv->shd.wait_qlength);
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return;
+}
+
void *
afr_shd_index_healer(void *data)
{
@@ -825,7 +1039,8 @@ afr_shd_index_healer(void *data)
priv->local[healer->subvol] = healer->local;
if (priv->thin_arbiter_count) {
- afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
+ if (afr_shd_ta_needs_heal(this, healer))
+ afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
}
do {
@@ -855,9 +1070,17 @@ afr_shd_index_healer(void *data)
sleep(1);
} while (ret > 0);
- if (pre_crawl_xdata && !healer->crawl_event.heal_failed_count) {
+ if (ret == 0) {
+ afr_cleanup_anon_inode_dir(healer);
+ }
+
+ if (ret == 0 && pre_crawl_xdata &&
+ !healer->crawl_event.heal_failed_count) {
afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
pre_crawl_xdata);
+ }
+
+ if (pre_crawl_xdata) {
dict_unref(pre_crawl_xdata);
pre_crawl_xdata = NULL;
}
@@ -975,7 +1198,9 @@ afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
{
int ret = 0;
uint64_t count = 0;
- char key[256] = {0};
+ char key[128] = {0};
+ int keylen = 0;
+ char suffix[64] = {0};
int xl_id = 0;
uint64_t healed_count = 0;
uint64_t split_brain_count = 0;
@@ -1010,8 +1235,8 @@ afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child);
ret = dict_get_uint64(output, key, &count);
- snprintf(key, sizeof(key), "statistics_healed_cnt-%d-%d-%" PRIu64, xl_id,
- child, count);
+ snprintf(suffix, sizeof(suffix), "%d-%d-%" PRIu64, xl_id, child, count);
+ snprintf(key, sizeof(key), "statistics_healed_cnt-%s", suffix);
ret = dict_set_uint64(output, key, healed_count);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
@@ -1019,8 +1244,7 @@ afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
goto out;
}
- snprintf(key, sizeof(key), "statistics_sb_cnt-%d-%d-%" PRIu64, xl_id, child,
- count);
+ snprintf(key, sizeof(key), "statistics_sb_cnt-%s", suffix);
ret = dict_set_uint64(output, key, split_brain_count);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
@@ -1028,17 +1252,15 @@ afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
goto out;
}
- snprintf(key, sizeof(key), "statistics_crawl_type-%d-%d-%" PRIu64, xl_id,
- child, count);
- ret = dict_set_str(output, key, crawl_type);
+ keylen = snprintf(key, sizeof(key), "statistics_crawl_type-%s", suffix);
+ ret = dict_set_strn(output, key, keylen, crawl_type);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
"Could not add statistics_crawl_type to output");
goto out;
}
- snprintf(key, sizeof(key), "statistics_heal_failed_cnt-%d-%d-%" PRIu64,
- xl_id, child, count);
+ snprintf(key, sizeof(key), "statistics_heal_failed_cnt-%s", suffix);
ret = dict_set_uint64(output, key, heal_failed_count);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
@@ -1046,9 +1268,8 @@ afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
goto out;
}
- snprintf(key, sizeof(key), "statistics_strt_time-%d-%d-%" PRIu64, xl_id,
- child, count);
- ret = dict_set_dynstr(output, key, start_time_str);
+ keylen = snprintf(key, sizeof(key), "statistics_strt_time-%s", suffix);
+ ret = dict_set_dynstrn(output, key, keylen, start_time_str);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
"Could not add statistics_crawl_start_time to output");
@@ -1062,11 +1283,10 @@ afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
else
progress = 0;
- snprintf(key, sizeof(key), "statistics_end_time-%d-%d-%" PRIu64, xl_id,
- child, count);
+ keylen = snprintf(key, sizeof(key), "statistics_end_time-%s", suffix);
if (!end_time_str)
end_time_str = gf_strdup("Could not determine the end time");
- ret = dict_set_dynstr(output, key, end_time_str);
+ ret = dict_set_dynstrn(output, key, keylen, end_time_str);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
"Could not add statistics_crawl_end_time to output");
@@ -1075,10 +1295,9 @@ afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
end_time_str = NULL;
}
- snprintf(key, sizeof(key), "statistics_inprogress-%d-%d-%" PRIu64, xl_id,
- child, count);
+ keylen = snprintf(key, sizeof(key), "statistics_inprogress-%s", suffix);
- ret = dict_set_int32(output, key, progress);
+ ret = dict_set_int32n(output, key, keylen, progress);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
"Could not add statistics_inprogress to output");
@@ -1104,7 +1323,9 @@ afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path,
{
int ret = -1;
uint64_t count = 0;
- char key[256] = {0};
+ char key[64] = {0};
+ int keylen = 0;
+ char xl_id_child_str[32] = {0};
int xl_id = 0;
ret = dict_get_int32(output, this->name, &xl_id);
@@ -1114,11 +1335,12 @@ afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path,
goto out;
}
- snprintf(key, sizeof(key), "%d-%d-count", xl_id, child);
+ snprintf(xl_id_child_str, sizeof(xl_id_child_str), "%d-%d", xl_id, child);
+ snprintf(key, sizeof(key), "%s-count", xl_id_child_str);
ret = dict_get_uint64(output, key, &count);
- snprintf(key, sizeof(key), "%d-%d-%" PRIu64, xl_id, child, count);
- ret = dict_set_dynstr(output, key, path);
+ keylen = snprintf(key, sizeof(key), "%s-%" PRIu64, xl_id_child_str, count);
+ ret = dict_set_dynstrn(output, key, keylen, path);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
@@ -1127,7 +1349,7 @@ afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path,
}
if (tv) {
- snprintf(key, sizeof(key), "%d-%d-%" PRIu64 "-time", xl_id, child,
+ snprintf(key, sizeof(key), "%s-%" PRIu64 "-time", xl_id_child_str,
count);
ret = dict_set_uint32(output, key, tv->tv_sec);
if (ret) {
@@ -1137,7 +1359,7 @@ afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path,
}
}
- snprintf(key, sizeof(key), "%d-%d-count", xl_id, child);
+ snprintf(key, sizeof(key), "%s-count", xl_id_child_str);
ret = dict_set_uint64(output, key, count + 1);
if (ret) {
@@ -1210,10 +1432,6 @@ afr_selfheal_daemon_init(xlator_t *this)
priv = this->private;
shd = &priv->shd;
- this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this);
- if (!this->itable)
- goto out;
-
shd->index_healers = GF_CALLOC(sizeof(*shd->index_healers),
priv->child_count,
gf_afr_mt_subvol_healer_t);
@@ -1264,12 +1482,18 @@ out:
return ret;
}
-int
-afr_selfheal_childup(xlator_t *this, int subvol)
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv)
{
- afr_shd_index_healer_spawn(this, subvol);
+ int subvol = 0;
- return 0;
+ if (!priv->shd.iamshd)
+ return;
+ for (subvol = 0; subvol < priv->child_count; subvol++)
+ if (priv->child_up[subvol])
+ afr_shd_index_healer_spawn(this, subvol);
+
+ return;
}
int
@@ -1319,41 +1543,78 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
struct subvol_healer *healer = NULL;
int i = 0;
char key[64];
+ int keylen = 0;
+ int this_name_len = 0;
int op_ret = 0;
uint64_t cnt = 0;
+#define AFR_SET_DICT_AND_LOG(name, output, key, keylen, dict_str, \
+ dict_str_len) \
+ { \
+ int ret; \
+ \
+ ret = dict_set_nstrn(output, key, keylen, dict_str, dict_str_len); \
+ if (ret) { \
+ gf_smsg(name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, \
+ "key=%s", key, "value=%s", dict_str, NULL); \
+ } \
+ }
+
priv = this->private;
shd = &priv->shd;
- ret = dict_get_int32(input, "xl-op", (int32_t *)&op);
- if (ret)
+ ret = dict_get_int32_sizen(input, "xl-op", (int32_t *)&op);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "key=xl-op", NULL);
goto out;
- ret = dict_get_int32(input, this->name, &xl_id);
- if (ret)
+ }
+ this_name_len = strlen(this->name);
+ ret = dict_get_int32n(input, this->name, this_name_len, &xl_id);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "key=%s", this->name, NULL);
goto out;
- ret = dict_set_int32(output, this->name, xl_id);
- if (ret)
+ }
+ ret = dict_set_int32n(output, this->name, this_name_len, xl_id);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "key=%s", this->name, NULL);
goto out;
+ }
switch (op) {
case GF_SHD_OP_HEAL_INDEX:
op_ret = 0;
for (i = 0; i < priv->child_count; i++) {
healer = &shd->index_healers[i];
- snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
if (!priv->child_up[i]) {
- ret = dict_set_str(output, key, "Brick is not connected");
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
op_ret = -1;
} else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
- ret = dict_set_str(output, key,
- "< 2 bricks in replica are up");
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SLESS_THAN2_BRICKS_in_REP,
+ SLEN(SLESS_THAN2_BRICKS_in_REP));
op_ret = -1;
} else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
- ret = dict_set_str(output, key, "Brick is remote");
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_IS_REMOTE,
+ SLEN(SBRICK_IS_REMOTE));
} else {
- ret = dict_set_str(output, key, "Started self-heal");
- afr_shd_index_healer_spawn(this, i);
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SSTARTED_SELF_HEAL,
+ SLEN(SSTARTED_SELF_HEAL));
+
+ ret = afr_shd_index_healer_spawn(this, i);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+ }
}
}
break;
@@ -1362,18 +1623,31 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
for (i = 0; i < priv->child_count; i++) {
healer = &shd->full_healers[i];
- snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
if (!priv->child_up[i]) {
- ret = dict_set_str(output, key, "Brick is not connected");
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
} else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
- ret = dict_set_str(output, key,
- "< 2 bricks in replica are up");
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SLESS_THAN2_BRICKS_in_REP,
+ SLEN(SLESS_THAN2_BRICKS_in_REP));
} else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
- ret = dict_set_str(output, key, "Brick is remote");
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_IS_REMOTE,
+ SLEN(SBRICK_IS_REMOTE));
} else {
- ret = dict_set_str(output, key, "Started self-heal");
- afr_shd_full_healer_spawn(this, i);
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SSTARTED_SELF_HEAL,
+ SLEN(SSTARTED_SELF_HEAL));
+
+ ret = afr_shd_full_healer_spawn(this, i);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+ }
op_ret = 0;
}
}
@@ -1381,25 +1655,25 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
case GF_SHD_OP_INDEX_SUMMARY:
/* this case has been handled in glfs-heal.c */
break;
- case GF_SHD_OP_HEALED_FILES:
- case GF_SHD_OP_HEAL_FAILED_FILES:
- for (i = 0; i < priv->child_count; i++) {
- snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
- ret = dict_set_str(output, key,
- "Operation Not "
- "Supported");
- }
- break;
case GF_SHD_OP_SPLIT_BRAIN_FILES:
eh_dump(shd->split_brain, output, afr_add_shd_event);
break;
case GF_SHD_OP_STATISTICS:
for (i = 0; i < priv->child_count; i++) {
eh_dump(shd->statistics[i], output, afr_add_crawl_event);
- afr_shd_dict_add_crawl_event(
+ ret = afr_shd_dict_add_crawl_event(
this, output, &shd->index_healers[i].crawl_event);
- afr_shd_dict_add_crawl_event(this, output,
- &shd->full_healers[i].crawl_event);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+ }
+
+ ret = afr_shd_dict_add_crawl_event(
+ this, output, &shd->full_healers[i].crawl_event);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+ }
}
break;
case GF_SHD_OP_STATISTICS_HEAL_COUNT:
@@ -1408,14 +1682,21 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
for (i = 0; i < priv->child_count; i++) {
if (!priv->child_up[i]) {
- snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
- ret = dict_set_str(output, key, "Brick is not connected");
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id,
+ i);
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
} else {
snprintf(key, sizeof(key), "%d-%d-hardlinks", xl_id, i);
ret = afr_shd_get_index_count(this, i, &cnt);
if (ret == 0) {
ret = dict_set_uint64(output, key, cnt);
}
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_DICT_SET_FAILED, NULL);
+ }
op_ret = 0;
}
}
@@ -1423,11 +1704,13 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
break;
default:
- gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG,
- "Unknown set op %d", op);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "op=%d",
+ op, NULL);
break;
}
out:
- dict_del(output, this->name);
+ dict_deln(output, this->name, this_name_len);
return op_ret;
+
+#undef AFR_SET_DICT_AND_LOG
}
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index 7de7c431460..18db728ea7b 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -14,12 +14,11 @@
#include <pthread.h>
typedef struct {
- int child;
char *path;
+ int child;
} shd_event_t;
typedef struct {
- int child;
uint64_t healed_count;
uint64_t split_brain_count;
uint64_t heal_failed_count;
@@ -31,38 +30,36 @@ typedef struct {
cralwer is in progress */
time_t end_time;
char *crawl_type;
+ int child;
} crawl_event_t;
struct subvol_healer {
xlator_t *this;
- int subvol;
- gf_boolean_t local;
- gf_boolean_t running;
- gf_boolean_t rerun;
crawl_event_t crawl_event;
pthread_mutex_t mutex;
pthread_cond_t cond;
pthread_t thread;
+ int subvol;
+ gf_boolean_t local;
+ gf_boolean_t running;
+ gf_boolean_t rerun;
};
typedef struct {
- gf_boolean_t iamshd;
- gf_boolean_t enabled;
- int timeout;
struct subvol_healer *index_healers;
struct subvol_healer *full_healers;
eh_t *split_brain;
eh_t **statistics;
+ int timeout;
uint32_t max_threads;
uint32_t wait_qlength;
uint32_t halo_max_latency_msec;
+ gf_boolean_t iamshd;
+ gf_boolean_t enabled;
} afr_self_heald_t;
int
-afr_selfheal_childup(xlator_t *this, int subvol);
-
-int
afr_selfheal_daemon_init(xlator_t *this);
int
@@ -73,6 +70,6 @@ afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid,
char **path_p);
int
-afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name,
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
ia_type_t type);
#endif /* !_AFR_SELF_HEALD_H */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index cf153dea9cc..a51f79b1f43 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -8,10 +8,10 @@
cases as published by the Free Software Foundation.
*/
-#include "dict.h"
-#include "byte-order.h"
-#include "common-utils.h"
-#include "timer.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/timer.h>
#include "afr.h"
#include "afr-transaction.h"
@@ -28,11 +28,17 @@ typedef enum {
static void
afr_lock_resume_shared(struct list_head *list);
+static void
+afr_post_op_handle_success(call_frame_t *frame, xlator_t *this);
+
+static void
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno);
+
void
__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared);
void
-afr_changelog_post_op(call_frame_t *frame, xlator_t *this);
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this);
int
afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this);
@@ -53,6 +59,97 @@ afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
afr_changelog_resume_t changelog_resume,
afr_xattrop_type_t op);
+static void
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this);
+
+static int
+afr_ta_post_op_do(void *opaque);
+
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local);
+
+static int
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this);
+
+static void
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno);
+
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv)
+{
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ priv->ta_notify_dom_lock_offset = 0;
+}
+
+static void
+afr_ta_process_waitq(xlator_t *this)
+{
+ afr_local_t *entry = NULL;
+ afr_private_t *priv = this->private;
+ struct list_head waitq = {
+ 0,
+ };
+
+ INIT_LIST_HEAD(&waitq);
+ LOCK(&priv->lock);
+ list_splice_init(&priv->ta_waitq, &waitq);
+ UNLOCK(&priv->lock);
+ list_for_each_entry(entry, &waitq, ta_waitq)
+ {
+ afr_ta_decide_post_op_state(entry->transaction.frame, this);
+ }
+}
+
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque)
+{
+ afr_ta_process_waitq(ta_frame->this);
+ STACK_DESTROY(ta_frame->root);
+ return 0;
+}
+
+int
+afr_release_notify_lock_for_ta(void *opaque)
+{
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ int ret = -1;
+
+ this = (xlator_t *)opaque;
+ priv = this->private;
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate loc for thin-arbiter.");
+ goto out;
+ }
+ flock.l_type = F_UNLCK;
+ flock.l_start = priv->ta_notify_dom_lock_offset;
+ flock.l_len = 1;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+ }
+
+ LOCK(&priv->lock);
+ {
+ afr_ta_locked_priv_invalidate(priv);
+ }
+ UNLOCK(&priv->lock);
+out:
+ loc_wipe(&loc);
+ return ret;
+}
+
void
afr_zero_fill_stat(afr_local_t *local)
{
@@ -158,7 +255,7 @@ afr_changelog_has_quorum(afr_local_t *local, xlator_t *this)
}
}
- if (afr_has_quorum(success_children, this)) {
+ if (afr_has_quorum(success_children, this, NULL)) {
return _gf_true;
}
@@ -282,6 +379,8 @@ afr_transaction_done(call_frame_t *frame, xlator_t *this)
}
local->transaction.unwind(frame, this);
+ GF_ASSERT(list_empty(&local->transaction.owner_list));
+ GF_ASSERT(list_empty(&local->transaction.wait_list));
AFR_STACK_DESTROY(frame);
return 0;
@@ -303,7 +402,7 @@ afr_lock_fail_shared(afr_local_t *local, struct list_head *list)
}
static void
-afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked)
+afr_handle_lock_acquire_failure(afr_local_t *local)
{
struct list_head shared;
afr_lock_t *lock = NULL;
@@ -324,13 +423,8 @@ afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked)
afr_lock_fail_shared(local, &shared);
local->transaction.do_eager_unlock = _gf_true;
out:
- if (locked) {
- local->internal_lock.lock_cbk = afr_transaction_done;
- afr_unlock(local->transaction.frame, local->transaction.frame->this);
- } else {
- afr_transaction_done(local->transaction.frame,
- local->transaction.frame->this);
- }
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ afr_unlock(local->transaction.frame, local->transaction.frame->this);
}
call_frame_t *
@@ -427,42 +521,6 @@ afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this)
local->transaction.pre_op_sources[j] = 0;
}
-gf_boolean_t
-afr_has_arbiter_fop_cbk_quorum(call_frame_t *frame)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- xlator_t *this = NULL;
- gf_boolean_t fop_failed = _gf_false;
- unsigned char *pre_op_sources = NULL;
- int i = 0;
-
- local = frame->local;
- this = frame->this;
- priv = this->private;
- pre_op_sources = local->transaction.pre_op_sources;
-
- /* If the fop failed on the brick, it is not a source. */
- for (i = 0; i < priv->child_count; i++)
- if (local->transaction.failed_subvols[i])
- pre_op_sources[i] = 0;
-
- switch (AFR_COUNT(pre_op_sources, priv->child_count)) {
- case 1:
- if (pre_op_sources[ARBITER_BRICK_INDEX])
- fop_failed = _gf_true;
- break;
- case 0:
- fop_failed = _gf_true;
- break;
- }
-
- if (fop_failed)
- return _gf_false;
-
- return _gf_true;
-}
-
void
afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this)
{
@@ -529,7 +587,7 @@ afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this)
failure_count = AFR_COUNT(local->transaction.failed_subvols,
priv->child_count);
if (failure_count == priv->child_count) {
- afr_handle_lock_acquire_failure(local, _gf_true);
+ afr_handle_lock_acquire_failure(local);
return 0;
} else {
lock = &local->inode_ctx->lock[local->transaction.type];
@@ -577,17 +635,105 @@ afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)
return ret;
}
-/* {{{ pending */
+static void
+afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ unsigned int inmem_count = 0;
+ unsigned int onwire_count = 0;
+ gf_boolean_t release = _gf_false;
+
+ LOCK(&priv->lock);
+ {
+ /*Once we get notify lock release upcall notification,
+ if any of the fop state counters are non-zero, we will
+ not release the lock.
+ */
+ onwire_count = priv->ta_on_wire_txn_count;
+ inmem_count = priv->ta_in_mem_txn_count;
+ switch (fop_state) {
+ case TA_GET_INFO_FROM_TA_FILE:
+ onwire_count = --priv->ta_on_wire_txn_count;
+ break;
+ case TA_INFO_IN_MEMORY_SUCCESS:
+ case TA_INFO_IN_MEMORY_FAILED:
+ inmem_count = --priv->ta_in_mem_txn_count;
+ break;
+ case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+ GF_ASSERT(0);
+ break;
+ case TA_SUCCESS:
+ break;
+ }
+ release = priv->release_ta_notify_dom_lock;
+ }
+ UNLOCK(&priv->lock);
+
+ if (inmem_count != 0 || release == _gf_false || onwire_count != 0)
+ return;
+
+ afr_ta_lock_release_synctask(this);
+}
+
+static void
+afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *entry = NULL;
+ int bad_child = AFR_CHILD_UNKNOWN;
+
+ struct list_head onwireq = {
+ 0,
+ };
+ INIT_LIST_HEAD(&onwireq);
+
+ LOCK(&priv->lock);
+ {
+ bad_child = priv->ta_bad_child_index;
+ if (bad_child == AFR_CHILD_UNKNOWN) {
+ /*The previous on-wire ta_post_op was a failure. Just dequeue
+ *one element to wind on-wire again. */
+ entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq);
+ list_del_init(&entry->ta_onwireq);
+ } else {
+ /* Prepare to process all fops based on bad_child_index. */
+ list_splice_init(&priv->ta_onwireq, &onwireq);
+ }
+ }
+ UNLOCK(&priv->lock);
+
+ if (entry) {
+ afr_ta_post_op_synctask(this, entry);
+ return;
+ } else {
+ while (!list_empty(&onwireq)) {
+ entry = list_entry(onwireq.next, afr_local_t, ta_onwireq);
+ list_del_init(&entry->ta_onwireq);
+ if (entry->ta_failed_subvol == bad_child) {
+ afr_post_op_handle_success(entry->transaction.frame, this);
+ } else {
+ afr_post_op_handle_failure(entry->transaction.frame, this, EIO);
+ }
+ }
+ }
+}
int
afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
+ priv = this->private;
int_lock = &local->internal_lock;
+ if (priv->thin_arbiter_count) {
+ /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */
+ afr_ta_dom_lock_check_and_release(local->fop_state, this);
+ }
+
/* Fail the FOP if post-op did not succeed on quorum no. of bricks. */
if (!afr_changelog_has_quorum(local, this)) {
local->op_ret = -1;
@@ -605,24 +751,26 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
return 0;
}
+static void
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno)
+{
+ afr_local_t *local = frame->local;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB,
+ "Failing %s for gfid %s. Fop state is:%d", gf_fop_list[local->op],
+ uuid_utoa(local->inode->gfid), local->fop_state);
+
+ afr_changelog_post_op_done(frame, this);
+}
+
unsigned char *
afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock)
{
- unsigned char *locked_nodes = NULL;
- switch (type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- locked_nodes = int_lock->locked_nodes;
- break;
-
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- /*Because same set of subvols participate in all lockee
- * entities*/
- locked_nodes = int_lock->lockee[0].locked_nodes;
- break;
- }
- return locked_nodes;
+ /*Because same set of subvols participate in all lockee
+ * entities*/
+ return int_lock->lockee[0].locked_nodes;
}
int
@@ -680,7 +828,7 @@ afr_handle_symmetric_errors(call_frame_t *frame, xlator_t *this)
}
gf_boolean_t
-afr_has_quorum(unsigned char *subvols, xlator_t *this)
+afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame)
{
unsigned int quorum_count = 0;
afr_private_t *priv = NULL;
@@ -689,6 +837,9 @@ afr_has_quorum(unsigned char *subvols, xlator_t *this)
priv = this->private;
up_children_count = AFR_COUNT(subvols, priv->child_count);
+ if (afr_lookup_has_quorum(frame, up_children_count))
+ return _gf_true;
+
if (priv->quorum_count == AFR_QUORUM_AUTO) {
/*
* Special case for auto-quorum with an even number of nodes.
@@ -743,7 +894,7 @@ afr_has_fop_quorum(call_frame_t *frame)
locked_nodes = afr_locked_nodes_get(local->transaction.type,
&local->internal_lock);
- return afr_has_quorum(locked_nodes, this);
+ return afr_has_quorum(locked_nodes, this, NULL);
}
static gf_boolean_t
@@ -761,7 +912,7 @@ afr_has_fop_cbk_quorum(call_frame_t *frame)
success[i] = 1;
}
- return afr_has_quorum(success, this);
+ return afr_has_quorum(success, this, NULL);
}
gf_boolean_t
@@ -784,12 +935,8 @@ afr_need_dirty_marking(call_frame_t *frame, xlator_t *this)
priv->child_count)
return _gf_false;
- if (priv->arbiter_count) {
- if (!afr_has_arbiter_fop_cbk_quorum(frame))
- need_dirty = _gf_true;
- } else if (!afr_has_fop_cbk_quorum(frame)) {
+ if (!afr_has_fop_cbk_quorum(frame))
need_dirty = _gf_true;
- }
return need_dirty;
}
@@ -839,12 +986,8 @@ afr_handle_quorum(call_frame_t *frame, xlator_t *this)
* no split-brain with the fix. The problem is eliminated completely.
*/
- if (priv->arbiter_count) {
- if (afr_has_arbiter_fop_cbk_quorum(frame))
- return;
- } else if (afr_has_fop_cbk_quorum(frame)) {
+ if (afr_has_fop_cbk_quorum(frame))
return;
- }
if (afr_need_dirty_marking(frame, this))
goto set_response;
@@ -886,7 +1029,7 @@ set_response:
}
int
-afr_fill_ta_loc(xlator_t *this, loc_t *loc)
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop)
{
afr_private_t *priv = NULL;
@@ -894,6 +1037,11 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc)
loc->parent = inode_ref(priv->root_inode);
gf_uuid_copy(loc->pargfid, loc->parent->gfid);
loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) {
+ /* Except afr_ta_id_file_check() which is path based, all other gluster
+ * FOPS need gfid.*/
+ return -EINVAL;
+ }
gf_uuid_copy(loc->gfid, priv->ta_gfid);
loc->inode = inode_new(loc->parent->table);
if (!loc->inode) {
@@ -903,35 +1051,104 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc)
return 0;
}
-int
-afr_changelog_thin_arbiter_post_op(xlator_t *this, afr_local_t *local)
+static int
+afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque)
{
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *txn_frame = NULL;
+ afr_ta_fop_state_t fop_state;
+
+ local = (afr_local_t *)opaque;
+ fop_state = local->fop_state;
+ txn_frame = local->transaction.frame;
+ this = frame->this;
+
+ if (ret == 0) {
+ /*Mark pending xattrs on the up data brick.*/
+ afr_post_op_handle_success(txn_frame, this);
+ } else {
+ afr_post_op_handle_failure(txn_frame, this, -ret);
+ }
+
+ STACK_DESTROY(frame->root);
+ afr_ta_process_onwireq(fop_state, this);
+
+ return 0;
+}
+
+int **
+afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending,
+ dict_t *xattr, afr_local_t *local)
+{
+ int **changelog = NULL;
+ int idx = 0;
int ret = 0;
+ int i;
+
+ if (local->is_new_entry == _gf_true) {
+ changelog = afr_mark_pending_changelog(priv, pending, xattr,
+ local->cont.dir_fop.buf.ia_type);
+ } else {
+ idx = afr_index_for_transaction_type(local->transaction.type);
+ changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!changelog) {
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ changelog[i][idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict(priv, xattr, changelog);
+ if (ret < 0) {
+ afr_matrix_cleanup(changelog, priv->child_count);
+ return NULL;
+ }
+ }
+
+out:
+ return changelog;
+}
+
+static void
+afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local,
+ gf_boolean_t *valid)
+{
+ if (priv->ta_event_gen > local->ta_event_gen) {
+ /* We can't trust the ta's response anymore.*/
+ afr_ta_locked_priv_invalidate(priv);
+ *valid = _gf_false;
+ return;
+ }
+ return;
+}
+
+static int
+afr_ta_post_op_do(void *opaque)
+{
+ afr_local_t *local = NULL;
afr_private_t *priv = NULL;
+ xlator_t *this = NULL;
dict_t *xattr = NULL;
- int failed_count = 0;
- struct gf_flock flock = {
- 0,
- };
+ unsigned char *pending = NULL;
+ int **changelog = NULL;
+ int failed_subvol = -1;
+ int success_subvol = -1;
loc_t loc = {
0,
};
int i = 0;
+ int ret = 0;
+ gf_boolean_t valid = _gf_true;
+ local = (afr_local_t *)opaque;
+ this = local->transaction.frame->this;
priv = this->private;
- if (!priv->thin_arbiter_count)
- return 0;
- failed_count = AFR_COUNT(local->transaction.failed_subvols,
- priv->child_count);
- if (!failed_count)
- return 0;
-
- GF_ASSERT(failed_count == 1);
- ret = afr_fill_ta_loc(this, &loc);
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ "Failed to populate loc for thin-arbiter.");
goto out;
}
@@ -940,51 +1157,217 @@ afr_changelog_thin_arbiter_post_op(xlator_t *this, afr_local_t *local)
ret = -ENOMEM;
goto out;
}
+
+ pending = alloca0(priv->child_count);
+
for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_static_bin(xattr, priv->pending_key[i],
- local->pending[i],
- AFR_NUM_CHANGE_LOGS * sizeof(int));
- if (ret)
- goto out;
+ if (local->transaction.failed_subvols[i]) {
+ pending[i] = 1;
+ failed_subvol = i;
+ } else {
+ success_subvol = i;
+ }
}
- flock.l_type = F_WRLCK;
- flock.l_start = 0;
- flock.l_len = 0;
+ changelog = afr_set_changelog_xattr(priv, pending, xattr, local);
- /*TODO: Convert to two domain locking. */
- ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
- AFR_TA_DOM_NOTIFY, &loc, F_SETLKW, &flock, NULL, NULL);
+ if (!changelog) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = afr_ta_post_op_lock(this, &loc);
if (ret)
goto out;
ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
-
- if (ret == -EINVAL) {
- gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_THIN_ARB,
- "Thin-arbiter has denied post-op on %s for gfid %s.",
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s failed for gfid %s.",
priv->pending_key[THIN_ARBITER_BRICK_INDEX],
uuid_utoa(local->inode->gfid));
+ }
+ LOCK(&priv->lock);
+ {
+ if (ret == 0) {
+ priv->ta_bad_child_index = failed_subvol;
+ } else if (ret == -EINVAL) {
+ priv->ta_bad_child_index = success_subvol;
+ ret = -EIO; /* TA failed the fop. Return EIO to application. */
+ }
- } else if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Post-op on thin-arbiter id file %s failed for gfid %s.",
+ afr_ta_locked_xattrop_validate(priv, local, &valid);
+ }
+ UNLOCK(&priv->lock);
+ if (valid == _gf_false) {
+ gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s for gfid %s invalidated due "
+ "to event-gen mismatch.",
priv->pending_key[THIN_ARBITER_BRICK_INDEX],
uuid_utoa(local->inode->gfid));
+ ret = -EIO;
}
- flock.l_type = F_UNLCK;
- syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY,
- &loc, F_SETLK, &flock, NULL, NULL);
+
+ afr_ta_post_op_unlock(this, &loc);
out:
if (xattr)
dict_unref(xattr);
+ if (changelog)
+ afr_matrix_cleanup(changelog, priv->child_count);
+
+ loc_wipe(&loc);
+
return ret;
}
-int
-afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local)
+{
+ call_frame_t *ta_frame = NULL;
+ int ret = 0;
+
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ goto err;
+ }
+ ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done,
+ ta_frame, local);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to launch post-op on thin arbiter for gfid %s",
+ uuid_utoa(local->inode->gfid));
+ STACK_DESTROY(ta_frame->root);
+ goto err;
+ }
+
+ return ret;
+err:
+ afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM);
+ return ret;
+}
+
+static void
+afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local,
+ int *on_wire_count)
+{
+ LOCK(&priv->lock);
+ {
+ if (priv->release_ta_notify_dom_lock == _gf_true) {
+ /* Put the fop in waitq until notify dom lock is released.*/
+ local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL;
+ list_add_tail(&local->ta_waitq, &priv->ta_waitq);
+ } else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) {
+ /* Post-op on thin-arbiter to decide success/failure. */
+ local->fop_state = TA_GET_INFO_FROM_TA_FILE;
+ *on_wire_count = ++priv->ta_on_wire_txn_count;
+ if (*on_wire_count > 1) {
+ /*Avoid sending multiple on-wire post-ops on TA*/
+ list_add_tail(&local->ta_onwireq, &priv->ta_onwireq);
+ }
+ } else if (local->ta_failed_subvol == priv->ta_bad_child_index) {
+ /* Post-op on TA not needed as the fop failed on the in-memory bad
+ * brick. Just mark pending xattrs on the good data brick.*/
+ local->fop_state = TA_INFO_IN_MEMORY_SUCCESS;
+ priv->ta_in_mem_txn_count++;
+ } else {
+ /* Post-op on TA not needed as the fop succeeded only on the
+ * in-memory bad data brick and not the good one. Fail the fop.*/
+ local->fop_state = TA_INFO_IN_MEMORY_FAILED;
+ priv->ta_in_mem_txn_count++;
+ }
+ }
+ UNLOCK(&priv->lock);
+}
+
+static void
+afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local)
+{
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i]) {
+ local->ta_failed_subvol = i;
+ break;
+ }
+ }
+}
+
+static void
+afr_post_op_handle_success(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ if (local->is_new_entry == _gf_true) {
+ afr_mark_new_entry_changelog(frame, this);
+ }
+ afr_changelog_post_op_do(frame, this);
+
+ return;
+}
+
+static void
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno)
+{
+ afr_changelog_post_op_fail(frame, this, op_errno);
+
+ return;
+}
+
+static void
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int on_wire_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_ta_set_fop_state(priv, local, &on_wire_count);
+
+ switch (local->fop_state) {
+ case TA_GET_INFO_FROM_TA_FILE:
+ if (on_wire_count == 1)
+ afr_ta_post_op_synctask(this, local);
+ /*else, fop is queued in ta_onwireq.*/
+ break;
+ case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+ /*Post releasing the notify lock, we will act on this queue*/
+ break;
+ case TA_INFO_IN_MEMORY_SUCCESS:
+ afr_post_op_handle_success(frame, this);
+ break;
+ case TA_INFO_IN_MEMORY_FAILED:
+ afr_post_op_handle_failure(frame, this, EIO);
+ break;
+ default:
+ break;
+ }
+ return;
+}
+
+static void
+afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+
+ afr_ta_fill_failed_subvol(priv, local);
+ gf_msg_debug(this->name, 0,
+ "Fop failed on data brick (%s) for gfid=%s. "
+ "ta info needed to decide fop result.",
+ priv->children[local->ta_failed_subvol]->name,
+ uuid_utoa(local->inode->gfid));
+ afr_ta_decide_post_op_state(frame, this);
+}
+
+void
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this)
{
afr_private_t *priv = this->private;
afr_local_t *local = NULL;
@@ -1001,9 +1384,7 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
xattr = dict_new();
if (!xattr) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done(frame, this);
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
goto out;
}
@@ -1030,9 +1411,8 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
}
if (local->transaction.in_flight_sb) {
- local->op_ret = -1;
- local->op_errno = local->transaction.in_flight_sb_errno;
- afr_changelog_post_op_done(frame, this);
+ afr_changelog_post_op_fail(frame, this,
+ local->transaction.in_flight_sb_errno);
goto out;
}
@@ -1043,17 +1423,7 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
ret = afr_set_pending_dict(priv, xattr, local->pending);
if (ret < 0) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done(frame, this);
- goto out;
- }
-
- ret = afr_changelog_thin_arbiter_post_op(this, local);
- if (ret < 0) {
- local->op_ret = -1;
- local->op_errno = -ret;
- afr_changelog_post_op_done(frame, this);
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
goto out;
}
@@ -1066,9 +1436,7 @@ set_dirty:
ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty,
sizeof(int) * AFR_NUM_CHANGE_LOGS);
if (ret) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done(frame, this);
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
goto out;
}
@@ -1078,6 +1446,32 @@ out:
if (xattr)
dict_unref(xattr);
+ return;
+}
+
+static int
+afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int failed_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (priv->thin_arbiter_count) {
+ failed_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (failed_count == 1) {
+ afr_handle_failure_using_thin_arbiter(frame, this);
+ return 0;
+ } else {
+ /* Txn either succeeded or failed on both data bricks. Let
+ * post_op_do handle it as the case might be. */
+ }
+ }
+
+ afr_changelog_post_op_do(frame, this);
return 0;
}
@@ -1281,6 +1675,7 @@ afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op,
int i = 0;
int ret = 0;
char *key = NULL;
+ int keylen = 0;
const char *name = NULL;
dict_t *xdata1 = NULL;
dict_t *xdata2 = NULL;
@@ -1338,7 +1733,8 @@ afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op,
}
if (need_entry_key_set) {
- ret = dict_set_str(xdata1, key, (char *)name);
+ keylen = strlen(key);
+ ret = dict_set_strn(xdata1, key, keylen, (char *)name);
if (ret)
gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
"%s/%s: Could not set %s key during xattrop",
@@ -1348,7 +1744,8 @@ afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op,
if (!xdata2)
goto out;
- ret = dict_set_str(xdata2, key, (char *)local->newloc.name);
+ ret = dict_set_strn(xdata2, key, keylen,
+ (char *)local->newloc.name);
if (ret)
gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
"%s/%s: Could not set %s key during "
@@ -1610,7 +2007,7 @@ err:
local->op_ret = -1;
local->op_errno = op_errno;
- afr_handle_lock_acquire_failure(local, _gf_true);
+ afr_handle_lock_acquire_failure(local);
if (xdata_req)
dict_unref(xdata_req);
@@ -1619,7 +2016,7 @@ err:
}
int
-afr_post_nonblocking_inodelk_cbk(call_frame_t *frame, xlator_t *this)
+afr_post_nonblocking_lock_cbk(call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
@@ -1630,36 +2027,12 @@ afr_post_nonblocking_inodelk_cbk(call_frame_t *frame, xlator_t *this)
/* Initiate blocking locks if non-blocking has failed */
if (int_lock->lock_op_ret < 0) {
gf_msg_debug(this->name, 0,
- "Non blocking inodelks failed. Proceeding to blocking");
+ "Non blocking locks failed. Proceeding to blocking");
int_lock->lock_cbk = afr_internal_lock_finish;
afr_blocking_lock(frame, this);
} else {
gf_msg_debug(this->name, 0,
- "Non blocking inodelks done. Proceeding to FOP");
- afr_internal_lock_finish(frame, this);
- }
-
- return 0;
-}
-
-int
-afr_post_nonblocking_entrylk_cbk(call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- /* Initiate blocking locks if non-blocking has failed */
- if (int_lock->lock_op_ret < 0) {
- gf_msg_debug(this->name, 0,
- "Non blocking entrylks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_internal_lock_finish;
- afr_blocking_lock(frame, this);
- } else {
- gf_msg_debug(this->name, 0,
- "Non blocking entrylks done. Proceeding to FOP");
+ "Non blocking locks done. Proceeding to FOP");
afr_internal_lock_finish(frame, this);
}
@@ -1677,7 +2050,7 @@ afr_post_blocking_rename_cbk(call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
if (int_lock->lock_op_ret < 0) {
- gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_BLOCKING_LKS_FAILED,
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INTERNAL_LKS_FAILED,
"Blocking entrylks failed.");
afr_transaction_done(frame, this);
@@ -1708,25 +2081,26 @@ afr_post_lower_unlock_cbk(call_frame_t *frame, xlator_t *this)
}
int
-afr_set_transaction_flock(xlator_t *this, afr_local_t *local)
+afr_set_transaction_flock(xlator_t *this, afr_local_t *local,
+ afr_lockee_t *lockee)
{
- afr_internal_lock_t *int_lock = NULL;
afr_private_t *priv = NULL;
+ struct gf_flock *flock = NULL;
- int_lock = &local->internal_lock;
priv = this->private;
+ flock = &lockee->flock;
if ((priv->arbiter_count || local->transaction.eager_lock_on ||
priv->full_lock) &&
local->transaction.type == AFR_DATA_TRANSACTION) {
/*Lock entire file to avoid network split brains.*/
- int_lock->flock.l_len = 0;
- int_lock->flock.l_start = 0;
+ flock->l_len = 0;
+ flock->l_start = 0;
} else {
- int_lock->flock.l_len = local->transaction.len;
- int_lock->flock.l_start = local->transaction.start;
+ flock->l_len = local->transaction.len;
+ flock->l_start = local->transaction.start;
}
- int_lock->flock.l_type = F_WRLCK;
+ flock->l_type = F_WRLCK;
return 0;
}
@@ -1736,26 +2110,21 @@ afr_lock(call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
+ int i = 0;
local = frame->local;
int_lock = &local->internal_lock;
+ int_lock->lock_cbk = afr_post_nonblocking_lock_cbk;
int_lock->domain = this->name;
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- afr_set_transaction_flock(this, local);
-
- int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk;
-
- afr_nonblocking_inodelk(frame, this);
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ afr_set_transaction_flock(this, local, &int_lock->lockee[i]);
+ }
- int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
- afr_nonblocking_entrylk(frame, this);
break;
case AFR_ENTRY_TRANSACTION:
@@ -1764,11 +2133,11 @@ afr_lock(call_frame_t *frame, xlator_t *this)
int_lock->lk_loc = &local->transaction.parent_loc;
else
GF_ASSERT(local->fd);
-
- int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
- afr_nonblocking_entrylk(frame, this);
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
break;
}
+ afr_lock_nonblocking(frame, this);
return 0;
}
@@ -1839,17 +2208,19 @@ afr_has_lock_conflict(afr_local_t *local, gf_boolean_t waitlist_check)
/* }}} */
static void
afr_copy_inodelk_vars(afr_internal_lock_t *dst, afr_internal_lock_t *src,
- xlator_t *this)
+ xlator_t *this, int lockee_num)
{
afr_private_t *priv = this->private;
+ afr_lockee_t *sl = &src->lockee[lockee_num];
+ afr_lockee_t *dl = &dst->lockee[lockee_num];
dst->domain = src->domain;
- dst->flock.l_len = src->flock.l_len;
- dst->flock.l_start = src->flock.l_start;
- dst->flock.l_type = src->flock.l_type;
- dst->lock_count = src->lock_count;
- memcpy(dst->locked_nodes, src->locked_nodes,
- priv->child_count * sizeof(*dst->locked_nodes));
+ dl->flock.l_len = sl->flock.l_len;
+ dl->flock.l_start = sl->flock.l_start;
+ dl->flock.l_type = sl->flock.l_type;
+ dl->locked_count = sl->locked_count;
+ memcpy(dl->locked_nodes, sl->locked_nodes,
+ priv->child_count * sizeof(*dl->locked_nodes));
}
void
@@ -1870,7 +2241,7 @@ __afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared)
if (conflict && !list_empty(&lock->owners))
return;
afr_copy_inodelk_vars(&each->internal_lock, &local->internal_lock,
- each->transaction.frame->this);
+ each->transaction.frame->this, 0);
list_move_tail(&each->transaction.wait_list, shared);
list_add_tail(&each->transaction.owner_list, &lock->owners);
}
@@ -1905,7 +2276,7 @@ afr_internal_lock_finish(call_frame_t *frame, xlator_t *this)
} else {
lock = &local->inode_ctx->lock[local->transaction.type];
if (local->internal_lock.lock_op_ret < 0) {
- afr_handle_lock_acquire_failure(local, _gf_false);
+ afr_handle_lock_acquire_failure(local);
} else {
lock->event_generation = local->event_generation;
afr_changelog_pre_op(frame, this);
@@ -1976,8 +2347,13 @@ afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this,
goto out;
}
- if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP)) {
- /*Only allow writes but shard does [f]xattrops on writes, so
+ if (local->transaction.disable_delayed_post_op) {
+ goto out;
+ }
+
+ if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) &&
+ (local->op != GF_FOP_FSYNC)) {
+ /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so
* they are fine too*/
goto out;
}
@@ -2104,8 +2480,10 @@ afr_changelog_fsync(call_frame_t *frame, xlator_t *this)
local->call_count = call_count;
xdata = dict_new();
- if (xdata)
- ret = dict_set_int32(xdata, "batch-fsync", 1);
+ if (xdata) {
+ ret = dict_set_int32_sizen(xdata, "batch-fsync", 1);
+ ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ }
for (i = 0; i < priv->child_count; i++) {
if (!local->transaction.pre_op[i])
@@ -2347,7 +2725,7 @@ __afr_eager_lock_handle(afr_local_t *local, gf_boolean_t *take_lock,
*timer_local = list_entry(lock->post_op.next, afr_local_t,
transaction.owner_list);
afr_copy_inodelk_vars(&local->internal_lock,
- &(*timer_local)->internal_lock, this);
+ &(*timer_local)->internal_lock, this, 0);
lock->delay_timer = NULL;
*do_pre_op = _gf_true;
list_add_tail(&local->transaction.owner_list, &lock->owners);
@@ -2364,7 +2742,7 @@ __afr_eager_lock_handle(afr_local_t *local, gf_boolean_t *take_lock,
owner_local = list_entry(lock->owners.next, afr_local_t,
transaction.owner_list);
afr_copy_inodelk_vars(&local->internal_lock,
- &owner_local->internal_lock, this);
+ &owner_local->internal_lock, this, 0);
*take_lock = _gf_false;
*do_pre_op = _gf_true;
}
@@ -2434,6 +2812,62 @@ fail:
}
int
+afr_transaction_lockee_init(call_frame_t *frame)
+{
+ afr_local_t *local = frame->local;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_private_t *priv = frame->this->private;
+ int ret = 0;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ ret = afr_add_inode_lockee(local, priv->child_count);
+ break;
+
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ ret = afr_add_entry_lockee(local, &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
+ if (local->op == GF_FOP_RENAME) {
+ ret = afr_add_entry_lockee(
+ local, &local->transaction.new_parent_loc,
+ local->transaction.new_basename, priv->child_count);
+ if (ret) {
+ goto out;
+ }
+
+ if (local->newloc.inode &&
+ IA_ISDIR(local->newloc.inode->ia_type)) {
+ ret = afr_add_entry_lockee(local, &local->newloc, NULL,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
+ }
+ } else if (local->op == GF_FOP_RMDIR) {
+ ret = afr_add_entry_lockee(local, &local->loc, NULL,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
+ }
+
+ if (int_lock->lockee_count > 1) {
+ qsort(int_lock->lockee, int_lock->lockee_count,
+ sizeof(*int_lock->lockee), afr_entry_lockee_cmp);
+ }
+ break;
+ }
+out:
+ return ret;
+}
+
+int
afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)
{
afr_local_t *local = NULL;
@@ -2447,7 +2881,7 @@ afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)
local->transaction.type = type;
- if (priv->quorum_count && !afr_has_quorum(local->child_up, this)) {
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
ret = -afr_quorum_errno(priv);
goto out;
}
@@ -2457,10 +2891,19 @@ afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)
goto out;
}
+ if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+ ret = -afr_quorum_errno(priv);
+ goto out;
+ }
+
ret = afr_transaction_local_init(local, this);
if (ret < 0)
goto out;
+ ret = afr_transaction_lockee_init(frame);
+ if (ret)
+ goto out;
+
if (type != AFR_METADATA_TRANSACTION) {
goto txn_start;
}
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index fff8c65e976..beefa26f4a6 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -48,7 +48,7 @@ afr_pending_read_decrement(afr_private_t *priv, int child_index);
call_frame_t *
afr_transaction_detach_fop_frame(call_frame_t *frame);
gf_boolean_t
-afr_has_quorum(unsigned char *subvols, xlator_t *this);
+afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame);
gf_boolean_t
afr_needs_changelog_update(afr_local_t *local);
void
@@ -67,4 +67,9 @@ afr_lock(call_frame_t *frame, xlator_t *this);
void
afr_delayed_changelog_wake_up_cbk(void *data);
+int
+afr_release_notify_lock_for_ta(void *opaque);
+
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque);
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 26950fd7927..df7366f0a65 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -87,7 +87,7 @@ static void
fix_quorum_options(xlator_t *this, afr_private_t *priv, char *qtype,
dict_t *options)
{
- if (dict_get(options, "quorum-type") == NULL) {
+ if (dict_get_sizen(options, "quorum-type") == NULL) {
/* If user doesn't configure anything enable auto-quorum if the
* replica has more than two subvolumes */
if (priv->child_count > 2)
@@ -120,23 +120,62 @@ afr_set_favorite_child_policy(afr_private_t *priv, char *policy)
return 0;
}
+
+static void
+set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
+{
+ if (!algo) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
+ } else if (strcmp(algo, "full") == 0) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_FULL;
+ } else if (strcmp(algo, "diff") == 0) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DIFF;
+ } else {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
+ }
+}
+
+void
+afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options)
+{
+ char *volfile_id_str = NULL;
+ uuid_t anon_inode_gfid = {0};
+
+ /*If volume id is not present don't enable anything*/
+ if (dict_get_str(options, "volume-id", &volfile_id_str))
+ return;
+ GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX);
+ /*anon_inode_name is not supposed to change once assigned*/
+ if (!priv->anon_inode_name[0]) {
+ snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s",
+ AFR_ANON_DIR_PREFIX, volfile_id_str);
+ gf_uuid_parse(volfile_id_str, anon_inode_gfid);
+ /*Flip a bit to make sure volfile-id and anon-gfid are not same*/
+ anon_inode_gfid[0] ^= 1;
+ uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str);
+ }
+}
+
int
reconfigure(xlator_t *this, dict_t *options)
{
afr_private_t *priv = NULL;
xlator_t *read_subvol = NULL;
int read_subvol_index = -1;
+ int timeout_old = 0;
int ret = -1;
int index = -1;
char *qtype = NULL;
char *fav_child_policy = NULL;
+ char *data_self_heal = NULL;
+ char *data_self_heal_algorithm = NULL;
+ char *locking_scheme = NULL;
gf_boolean_t consistent_io = _gf_false;
gf_boolean_t choose_local_old = _gf_false;
+ gf_boolean_t enabled_old = _gf_false;
priv = this->private;
- GF_OPTION_RECONF("afr-dirty-xattr", priv->afr_dirty, options, str, out);
-
GF_OPTION_RECONF("metadata-splitbrain-forced-heal",
priv->metadata_splitbrain_forced_heal, options, bool, out);
@@ -149,7 +188,9 @@ reconfigure(xlator_t *this, dict_t *options)
GF_OPTION_RECONF("metadata-self-heal", priv->metadata_self_heal, options,
bool, out);
- GF_OPTION_RECONF("data-self-heal", priv->data_self_heal, options, str, out);
+ GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool,
out);
@@ -157,8 +198,9 @@ reconfigure(xlator_t *this, dict_t *options)
GF_OPTION_RECONF("data-self-heal-window-size",
priv->data_self_heal_window_size, options, uint32, out);
- GF_OPTION_RECONF("data-self-heal-algorithm", priv->data_self_heal_algorithm,
+ GF_OPTION_RECONF("data-self-heal-algorithm", data_self_heal_algorithm,
options, str, out);
+ set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
GF_OPTION_RECONF("halo-enabled", priv->halo_enabled, options, bool, out);
@@ -214,16 +256,19 @@ reconfigure(xlator_t *this, dict_t *options)
}
GF_OPTION_RECONF("pre-op-compat", priv->pre_op_compat, options, bool, out);
- GF_OPTION_RECONF("locking-scheme", priv->locking_scheme, options, str, out);
+ GF_OPTION_RECONF("locking-scheme", locking_scheme, options, str, out);
+ priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
GF_OPTION_RECONF("full-lock", priv->full_lock, options, bool, out);
GF_OPTION_RECONF("granular-entry-heal", priv->esh_granular, options, bool,
out);
GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out);
+ GF_OPTION_RECONF("optimistic-change-log", priv->optimistic_change_log,
+ options, bool, out);
GF_OPTION_RECONF("quorum-type", qtype, options, str, out);
GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out);
fix_quorum_options(this, priv, qtype, options);
- if (priv->quorum_count && !afr_has_quorum(priv->child_up, this))
+ if (priv->quorum_count && !afr_has_quorum(priv->child_up, this, NULL))
gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
"Client-quorum is not met");
@@ -236,11 +281,13 @@ reconfigure(xlator_t *this, dict_t *options)
GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options,
bool, out);
+ enabled_old = priv->shd.enabled;
GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out);
GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool,
out);
+ timeout_old = priv->shd.timeout;
GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out);
GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options,
@@ -264,6 +311,16 @@ reconfigure(xlator_t *this, dict_t *options)
consistent_io = _gf_false;
priv->consistent_io = consistent_io;
+ afr_handle_anon_inode_options(priv, options);
+
+ GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool,
+ out);
+ if (priv->shd.enabled) {
+ if ((priv->shd.enabled != enabled_old) ||
+ (timeout_old != priv->shd.timeout))
+ afr_selfheal_childup(this, priv);
+ }
+
ret = 0;
out:
return ret;
@@ -336,6 +393,22 @@ out:
return ret;
}
+void
+afr_ta_init(afr_private_t *priv)
+{
+ priv->thin_arbiter_count = 1;
+ priv->child_count--;
+ priv->ta_child_up = 0;
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->ta_notify_dom_lock_offset = 0;
+ priv->ta_in_mem_txn_count = 0;
+ priv->ta_on_wire_txn_count = 0;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ INIT_LIST_HEAD(&priv->ta_waitq);
+ INIT_LIST_HEAD(&priv->ta_onwireq);
+ gf_uuid_clear(priv->ta_gfid);
+}
+
int32_t
init(xlator_t *this)
{
@@ -350,6 +423,9 @@ init(xlator_t *this)
char *qtype = NULL;
char *fav_child_policy = NULL;
char *thin_arbiter = NULL;
+ char *data_self_heal = NULL;
+ char *locking_scheme = NULL;
+ char *data_self_heal_algorithm = NULL;
if (!this->children) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_CHILD_MISCONFIGURED,
@@ -369,6 +445,8 @@ init(xlator_t *this)
goto out;
priv = this->private;
+ INIT_LIST_HEAD(&priv->saved_locks);
+ INIT_LIST_HEAD(&priv->lk_healq);
LOCK_INIT(&priv->lock);
child_count = xlator_subvolume_count(this);
@@ -380,11 +458,7 @@ init(xlator_t *this)
GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);
GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);
if (thin_arbiter && strlen(thin_arbiter) > 0) {
- priv->thin_arbiter_count = 1;
- priv->child_count--;
- priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
- priv->ta_notify_dom_lock_offset = 0;
- *priv->ta_gfid = 0;
+ afr_ta_init(priv);
}
INIT_LIST_HEAD(&priv->healing);
INIT_LIST_HEAD(&priv->heal_waiting);
@@ -436,10 +510,13 @@ init(xlator_t *this)
GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out);
- GF_OPTION_INIT("data-self-heal", priv->data_self_heal, str, out);
+ GF_OPTION_INIT("data-self-heal", data_self_heal, str, out);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
- GF_OPTION_INIT("data-self-heal-algorithm", priv->data_self_heal_algorithm,
- str, out);
+ GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str,
+ out);
+ set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
GF_OPTION_INIT("data-self-heal-window-size",
priv->data_self_heal_window_size, uint32, out);
@@ -467,7 +544,8 @@ init(xlator_t *this)
out);
GF_OPTION_INIT("pre-op-compat", priv->pre_op_compat, bool, out);
- GF_OPTION_INIT("locking-scheme", priv->locking_scheme, str, out);
+ GF_OPTION_INIT("locking-scheme", locking_scheme, str, out);
+ priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
GF_OPTION_INIT("full-lock", priv->full_lock, bool, out);
GF_OPTION_INIT("granular-entry-heal", priv->esh_granular, bool, out);
@@ -488,7 +566,9 @@ init(xlator_t *this)
GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);
GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);
+ afr_handle_anon_inode_options(priv, this->options);
+ GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out);
if (priv->quorum_count != 0)
priv->consistent_io = _gf_false;
@@ -500,13 +580,19 @@ init(xlator_t *this)
goto out;
}
+ priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
+
priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
gf_afr_mt_char);
priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count,
gf_afr_mt_child_latency_t);
+ priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
- if (!priv->child_up || !priv->child_latency) {
+ if (!priv->child_up || !priv->child_latency || !priv->halo_child_up ||
+ !priv->anon_inode) {
ret = -ENOMEM;
goto out;
}
@@ -547,12 +633,20 @@ init(xlator_t *this)
goto out;
}
- ret = afr_selfheal_daemon_init(this);
- if (ret) {
+ this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this);
+ if (!this->itable) {
ret = -ENOMEM;
goto out;
}
+ if (priv->shd.iamshd) {
+ ret = afr_selfheal_daemon_init(this);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
/* keep more local here as we may need them for self-heal etc */
this->local_pool = mem_pool_new(afr_local_t, 512);
if (!this->local_pool) {
@@ -566,24 +660,91 @@ init(xlator_t *this)
out:
return ret;
}
+void
+afr_destroy_healer_object(xlator_t *this, struct subvol_healer *healer)
+{
+ int ret = -1;
-int
+ if (!healer)
+ return;
+
+ if (healer->running) {
+ /*
+ * If there are any resources to cleanup, We need
+ * to do that gracefully using pthread_cleanup_push
+ */
+ ret = gf_thread_cleanup_xint(healer->thread);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SELF_HEAL_FAILED,
+ "Failed to clean up healer threads.");
+ healer->thread = 0;
+ }
+ pthread_cond_destroy(&healer->cond);
+ pthread_mutex_destroy(&healer->mutex);
+}
+
+void
+afr_selfheal_daemon_fini(xlator_t *this)
+{
+ struct subvol_healer *healer = NULL;
+ afr_self_heald_t *shd = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+ if (!priv)
+ return;
+
+ shd = &priv->shd;
+ if (!shd->iamshd)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ afr_destroy_healer_object(this, healer);
+
+ healer = &shd->full_healers[i];
+ afr_destroy_healer_object(this, healer);
+
+ if (shd->statistics[i])
+ eh_destroy(shd->statistics[i]);
+ }
+ GF_FREE(shd->index_healers);
+ GF_FREE(shd->full_healers);
+ GF_FREE(shd->statistics);
+ if (shd->split_brain)
+ eh_destroy(shd->split_brain);
+}
+void
fini(xlator_t *this)
{
afr_private_t *priv = NULL;
priv = this->private;
+
+ afr_selfheal_daemon_fini(this);
+ GF_ASSERT(list_empty(&priv->saved_locks));
+
LOCK(&priv->lock);
if (priv->timer != NULL) {
gf_timer_call_cancel(this->ctx, priv->timer);
priv->timer = NULL;
}
UNLOCK(&priv->lock);
+
+ if (this->local_pool != NULL) {
+ mem_pool_destroy(this->local_pool);
+ this->local_pool = NULL;
+ }
+
this->private = NULL;
afr_priv_destroy(priv);
- // if (this->itable);//I don't see any destroy func
+ if (this->itable) {
+ inode_table_destroy(this->itable);
+ this->itable = NULL;
+ }
- return 0;
+ return;
}
struct xlator_fops fops = {
@@ -607,6 +768,7 @@ struct xlator_fops fops = {
.getxattr = afr_getxattr,
.fgetxattr = afr_fgetxattr,
.readv = afr_readv,
+ .seek = afr_seek,
/* inode write */
.writev = afr_writev,
@@ -679,7 +841,7 @@ struct volume_options options[] = {
{.key = {"read-hash-mode"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
- .max = 3,
+ .max = 5,
.default_value = "1",
.op_version = {2},
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
@@ -692,7 +854,10 @@ struct volume_options options[] = {
"1 = hash by GFID of file (all clients use "
"same subvolume).\n"
"2 = hash by GFID of file and client PID.\n"
- "3 = brick having the least outstanding read requests."},
+ "3 = brick having the least outstanding read requests.\n"
+ "4 = brick having the least network ping latency.\n"
+ "5 = Hybrid mode between 3 and 4, ie least value among "
+ "network-latency multiplied by outstanding-read-requests."},
{
.key = {"choose-local"},
.type = GF_OPTION_TYPE_BOOL,
@@ -785,7 +950,7 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_STR,
.value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false",
"disable", "open"},
- .default_value = "on",
+ .default_value = "off",
.op_version = {1},
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
.tags = {"replicate"},
@@ -822,7 +987,7 @@ struct volume_options options[] = {
"process would be applied simultaneously."},
{.key = {"metadata-self-heal"},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
+ .default_value = "off",
.op_version = {1},
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
.tags = {"replicate"},
@@ -832,7 +997,7 @@ struct volume_options options[] = {
"the file/directory."},
{.key = {"entry-self-heal"},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
+ .default_value = "off",
.op_version = {1},
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
.tags = {"replicate"},
@@ -1152,5 +1317,28 @@ struct volume_options options[] = {
.tags = {"replicate"},
.description = "This option exists only for backward compatibility "
"and configuring it doesn't have any effect"},
+ {.key = {"use-anonymous-inode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_8_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "Setting this option heals directory renames efficiently"},
+
{.key = {NULL}},
};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "replicate",
+ .category = GF_MAINTAINED,
+};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 3d2c1950571..d62f9a9caf2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -11,17 +11,18 @@
#ifndef __AFR_H__
#define __AFR_H__
-#include "call-stub.h"
-#include "compat-errno.h"
+#include <glusterfs/call-stub.h>
+#include <glusterfs/compat-errno.h>
#include "afr-mem-types.h"
#include "libxlator.h"
-#include "timer.h"
-#include "syncop.h"
+#include <glusterfs/timer.h>
+#include <glusterfs/syncop.h>
#include "afr-self-heald.h"
#include "afr-messages.h"
+#define SHD_INODE_LRU_LIMIT 1
#define AFR_PATHINFO_HEADER "REPLICATE:"
#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
@@ -38,7 +39,10 @@
#define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify"
#define AFR_TA_DOM_MODIFY "afr.ta.dom-modify"
+#define AFR_LK_HEAL_DOM "afr.lock-heal.domain"
+
#define AFR_HALO_MAX_LATENCY 99999
+#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode"
#define PFLAG_PENDING (1 << 0)
#define PFLAG_SBRAIN (1 << 1)
@@ -94,6 +98,25 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);
gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \
} while (0)
+#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \
+ do { \
+ afr_fd_ctx_t *__fd_ctx = NULL; \
+ __fd_ctx = afr_fd_ctx_get(__fd, __this); \
+ if (__fd_ctx && __fd_ctx->is_fd_bad) { \
+ __error = EBADF; \
+ goto __label; \
+ } \
+ } while (0)
+
+typedef enum {
+ AFR_READ_POLICY_FIRST_UP,
+ AFR_READ_POLICY_GFID_HASH,
+ AFR_READ_POLICY_GFID_PID_HASH,
+ AFR_READ_POLICY_LESS_LOAD,
+ AFR_READ_POLICY_LEAST_LATENCY,
+ AFR_READ_POLICY_LOAD_LATENCY_HYBRID,
+} afr_read_hash_mode_t;
+
typedef enum {
AFR_FAV_CHILD_NONE,
AFR_FAV_CHILD_BY_SIZE,
@@ -104,16 +127,48 @@ typedef enum {
} afr_favorite_child_policy;
typedef enum {
+ AFR_SELFHEAL_DATA_FULL = 0,
+ AFR_SELFHEAL_DATA_DIFF,
+ AFR_SELFHEAL_DATA_DYNAMIC,
+} afr_data_self_heal_type_t;
+
+typedef enum {
AFR_CHILD_UNKNOWN = -1,
AFR_CHILD_ZERO,
AFR_CHILD_ONE,
+ AFR_CHILD_THIN_ARBITER,
} afr_child_index;
+typedef enum {
+ TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall
+ notification and waiting for its release.*/
+ TA_GET_INFO_FROM_TA_FILE, /*FOP needs post-op on ta file to get
+ *info about which brick is bad.*/
+ TA_INFO_IN_MEMORY_SUCCESS, /*Bad brick info is in memory and fop failed
+ *on BAD brick - Success*/
+ TA_INFO_IN_MEMORY_FAILED, /*Bad brick info is in memory and fop failed
+ *on GOOD brick - Failed*/
+ TA_SUCCESS, /*FOP succeeded on both data bricks.*/
+} afr_ta_fop_state_t;
+
struct afr_nfsd {
- gf_boolean_t iamnfsd;
uint32_t halo_max_latency_msec;
+ gf_boolean_t iamnfsd;
};
+typedef struct _afr_lk_heal_info {
+ fd_t *fd;
+ int32_t cmd;
+ struct gf_flock flock;
+ dict_t *xdata_req;
+ unsigned char *locked_nodes;
+ struct list_head pos;
+ gf_lkowner_t lk_owner;
+ pid_t pid;
+ int32_t *child_up_event_gen;
+ int32_t *child_down_event_gen;
+} afr_lk_heal_info_t;
+
typedef struct _afr_private {
gf_lock_t lock; /* to guard access to child_count, etc */
unsigned int child_count; /* total number of children */
@@ -124,20 +179,27 @@ typedef struct _afr_private {
inode_t *root_inode;
+ int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
/* For thin-arbiter. */
- unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
uuid_t ta_gfid;
+ unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
int ta_bad_child_index;
- off_t ta_notify_dom_lock_offset;
+ int ta_event_gen;
+ unsigned int ta_in_mem_txn_count;
+ unsigned int ta_on_wire_txn_count;
+ struct list_head ta_waitq;
+ struct list_head ta_onwireq;
+ unsigned char *anon_inode;
unsigned char *child_up;
+ unsigned char *halo_child_up;
int64_t *child_latency;
unsigned char *local;
char **pending_key;
- char *data_self_heal; /* on/off/open */
- char *data_self_heal_algorithm; /* name of algorithm */
+ afr_data_self_heal_type_t data_self_heal_algorithm;
unsigned int data_self_heal_window_size; /* max number of pipelined
read/writes */
@@ -152,30 +214,31 @@ typedef struct _afr_private {
int32_t healers; /* No. of elements currently undergoing background
heal*/
+ gf_boolean_t release_ta_notify_dom_lock;
+
gf_boolean_t metadata_self_heal; /* on/off */
gf_boolean_t entry_self_heal; /* on/off */
gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
int read_child; /* read-subvolume */
- unsigned int hash_mode; /* for when read_child is not set */
gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
- int favorite_child; /* subvolume to be preferred in resolving
- split-brain cases */
- afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic
- resolution of split-brains.*/
+ gf_timer_t *timer; /* launched when parent up is received */
unsigned int wait_count; /* # of servers to wait for success */
- gf_timer_t *timer; /* launched when parent up is received */
-
+ unsigned char ta_child_up;
gf_boolean_t optimistic_change_log;
gf_boolean_t eager_lock;
gf_boolean_t pre_op_compat; /* on/off */
uint32_t post_op_delay_secs;
unsigned int quorum_count;
- char vol_uuid[UUID_SIZE + 1];
+ off_t ta_notify_dom_lock_offset;
+ afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic
+ resolution of split-brains.*/
+ afr_read_hash_mode_t hash_mode; /* for when read_child is not set */
+
int32_t *last_event;
/* @event_generation: Keeps count of number of events received which can
@@ -188,33 +251,41 @@ typedef struct _afr_private {
important as we might have had a network split brain.
*/
uint32_t event_generation;
+ char vol_uuid[UUID_SIZE + 1];
gf_boolean_t choose_local;
gf_boolean_t did_discovery;
- uint64_t sh_readdir_size;
gf_boolean_t ensure_durability;
+ gf_boolean_t halo_enabled;
+ gf_boolean_t consistent_metadata;
+ gf_boolean_t need_heal;
+ gf_boolean_t granular_locks;
+ uint64_t sh_readdir_size;
char *sh_domain;
char *afr_dirty;
- gf_boolean_t halo_enabled;
- uint32_t halo_max_latency_msec;
- uint32_t halo_max_replicas;
- uint32_t halo_min_replicas;
+ uint64_t spb_choice_timeout;
afr_self_heald_t shd;
struct afr_nfsd nfsd;
- gf_boolean_t consistent_metadata;
- uint64_t spb_choice_timeout;
- gf_boolean_t need_heal;
+ uint32_t halo_max_latency_msec;
+ uint32_t halo_max_replicas;
+ uint32_t halo_min_replicas;
- /* pump dependencies */
- void *pump_private;
- gf_boolean_t use_afr_in_pump;
- char *locking_scheme;
gf_boolean_t full_lock;
gf_boolean_t esh_granular;
gf_boolean_t consistent_io;
+ gf_boolean_t data_self_heal; /* on/off */
+ gf_boolean_t use_anon_inode;
+
+ /*For lock healing.*/
+ struct list_head saved_locks;
+ struct list_head lk_healq;
+
+ /*For anon-inode handling */
+ char anon_inode_name[NAME_MAX + 1];
+ char anon_gfid_str[UUID_SIZE + 1];
} afr_private_t;
typedef enum {
@@ -263,12 +334,14 @@ afr_index_from_ia_type(ia_type_t type)
}
typedef struct {
+ struct gf_flock flock;
loc_t loc;
+ fd_t *fd;
char *basename;
unsigned char *locked_nodes;
int locked_count;
-} afr_entry_lockee_t;
+} afr_lockee_t;
int
afr_entry_lockee_cmp(const void *l1, const void *l2);
@@ -276,21 +349,17 @@ afr_entry_lockee_cmp(const void *l1, const void *l2);
typedef struct {
loc_t *lk_loc;
- int lockee_count;
- afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+ afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
- struct gf_flock flock;
const char *lk_basename;
const char *lower_basename;
const char *higher_basename;
- char lower_locked;
- char higher_locked;
- unsigned char *locked_nodes;
unsigned char *lower_locked_nodes;
- int32_t lock_count;
- int32_t entrylk_lock_count;
+ afr_lock_cbk_t lock_cbk;
+
+ int lockee_count;
int32_t lk_call_count;
int32_t lk_expected_count;
@@ -298,14 +367,15 @@ typedef struct {
int32_t lock_op_ret;
int32_t lock_op_errno;
- afr_lock_cbk_t lock_cbk;
char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+ int32_t lock_count;
+ char lower_locked;
+ char higher_locked;
} afr_internal_lock_t;
struct afr_reply {
int valid;
int32_t op_ret;
- int32_t op_errno;
dict_t *xattr; /*For xattrop*/
dict_t *xdata;
struct iatt poststat;
@@ -314,6 +384,7 @@ struct afr_reply {
struct iatt preparent;
struct iatt preparent2;
struct iatt postparent2;
+ int32_t op_errno;
/* For rchecksum */
uint8_t checksum[SHA256_DIGEST_LENGTH];
gf_boolean_t buf_has_zeroes;
@@ -337,6 +408,10 @@ typedef struct {
arrives, we continue to read off this subvol.
*/
int readdir_subvol;
+ /* lock-healing related members. */
+ gf_boolean_t is_fd_bad;
+ afr_lk_heal_info_t *lk_heal_info;
+
} afr_fd_ctx_t;
typedef enum {
@@ -353,8 +428,6 @@ typedef struct _afr_inode_lock_t {
*/
int32_t num_inodelks;
unsigned int event_generation;
- gf_boolean_t release;
- gf_boolean_t acquired;
gf_timer_t *delay_timer;
struct list_head owners; /*Transactions that are performing fop*/
struct list_head post_op; /*Transactions that are done with the fop
@@ -363,6 +436,8 @@ typedef struct _afr_inode_lock_t {
*conflicting transactions to complete*/
struct list_head frozen; /*Transactions that need to go as part of
* next batch of eager-lock*/
+ gf_boolean_t release;
+ gf_boolean_t acquired;
} afr_lock_t;
typedef struct _afr_inode_ctx {
@@ -371,15 +446,11 @@ typedef struct _afr_inode_ctx {
int lock_count;
int spb_choice;
gf_timer_t *timer;
- gf_boolean_t need_refresh;
unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
int inherited[AFR_NUM_CHANGE_LOGS];
int on_disk[AFR_NUM_CHANGE_LOGS];
-
- /* set if any write on this fd was a non stable write
- (i.e, without O_SYNC or O_DSYNC)
- */
- gf_boolean_t witnessed_unstable_write;
+ /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
+ afr_lock_t lock[2];
/* @open_fd_count:
Number of open FDs queried from the server, as queried through
@@ -387,8 +458,12 @@ typedef struct _afr_inode_ctx {
temporarily disabled.
*/
uint32_t open_fd_count;
- /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
- afr_lock_t lock[2];
+ gf_boolean_t need_refresh;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
} afr_inode_ctx_t;
typedef struct _afr_local {
@@ -402,19 +477,15 @@ typedef struct _afr_local {
unsigned int event_generation;
uint32_t open_fd_count;
- gf_boolean_t update_open_fd_count;
int32_t num_inodelks;
- gf_boolean_t update_num_inodelks;
-
- gf_lkowner_t saved_lk_owner;
int32_t op_ret;
int32_t op_errno;
- int32_t **pending;
-
int dirty[AFR_NUM_CHANGE_LOGS];
+ int32_t **pending;
+
loc_t loc;
loc_t newloc;
@@ -445,14 +516,6 @@ typedef struct _afr_local {
afr_read_txn_wind_t readfn;
- /* @refreshed:
-
- the inode was "refreshed" (i.e, pending xattrs from all subvols
- freshly inspected and inode ctx updated accordingly) as part of
- this transaction already.
- */
- gf_boolean_t refreshed;
-
/* @inode:
the inode on which the read txn is performed on. ref'ed and copied
@@ -477,8 +540,6 @@ typedef struct _afr_local {
unsigned char *readable;
unsigned char *readable2; /*For rename transaction*/
- int read_subvol; /* Current read subvolume */
-
afr_inode_refresh_cbk_t refreshfn;
/* @refreshinode:
@@ -487,9 +548,30 @@ typedef struct _afr_local {
*/
inode_t *refreshinode;
+ dict_t *xattr_req;
+
+ dict_t *dict;
+
+ int read_subvol; /* Current read subvolume */
+
+ int optimistic_change_log;
+
+ afr_internal_lock_t internal_lock;
+
/*To handle setattr/setxattr on yet to be linked inode from dht*/
uuid_t refreshgfid;
+ /* @refreshed:
+
+ the inode was "refreshed" (i.e, pending xattrs from all subvols
+ freshly inspected and inode ctx updated accordingly) as part of
+ this transaction already.
+ */
+ gf_boolean_t refreshed;
+
+ gf_boolean_t update_num_inodelks;
+ gf_boolean_t update_open_fd_count;
+
/*
@pre_op_compat:
@@ -499,14 +581,6 @@ typedef struct _afr_local {
gf_boolean_t pre_op_compat;
- dict_t *xattr_req;
-
- afr_internal_lock_t internal_lock;
-
- dict_t *dict;
-
- int optimistic_change_log;
-
/* Is the current writev() going to perform a stable write?
i.e, is fd->flags or @flags writev param have O_SYNC or
O_DSYNC?
@@ -525,25 +599,25 @@ typedef struct _afr_local {
struct {
struct {
- gf_boolean_t needs_fresh_lookup;
- uuid_t gfid_req;
- } lookup;
-
- struct {
- unsigned char buf_set;
struct statvfs buf;
+ unsigned char buf_set;
} statfs;
struct {
- int32_t flags;
fd_t *fd;
+ int32_t flags;
} open;
struct {
- int32_t cmd;
struct gf_flock user_flock;
struct gf_flock ret_flock;
unsigned char *locked_nodes;
+ int32_t cmd;
+ /*For lock healing only.*/
+ unsigned char *dom_locked_nodes;
+ int32_t *dom_lock_op_ret;
+ int32_t *dom_lock_op_errno;
+ struct gf_flock *getlk_rsp;
} lk;
/* inode read */
@@ -568,8 +642,8 @@ typedef struct _afr_local {
struct {
char *name;
- int last_index;
long xattr_len;
+ int last_index;
} getxattr;
struct {
@@ -582,11 +656,10 @@ typedef struct _afr_local {
/* dir read */
struct {
+ uint32_t *checksum;
int success_count;
int32_t op_ret;
int32_t op_errno;
-
- uint32_t *checksum;
} opendir;
struct {
@@ -595,8 +668,8 @@ typedef struct _afr_local {
size_t size;
off_t offset;
dict_t *dict;
- gf_boolean_t failed;
int last_index;
+ gf_boolean_t failed;
} readdir;
/* inode write */
@@ -606,12 +679,11 @@ typedef struct _afr_local {
} inode_wfop; // common structure for all inode-write-fops
struct {
- int32_t op_ret;
-
struct iovec *vector;
struct iobref *iobref;
- int32_t count;
off_t offset;
+ int32_t op_ret;
+ int32_t count;
uint32_t flags;
} writev;
@@ -671,29 +743,25 @@ typedef struct _afr_local {
} create;
struct {
+ dict_t *params;
dev_t dev;
mode_t mode;
- dict_t *params;
} mknod;
struct {
- int32_t mode;
dict_t *params;
+ int32_t mode;
} mkdir;
struct {
- int flags;
- } rmdir;
-
- struct {
dict_t *params;
char *linkpath;
} symlink;
struct {
- int32_t mode;
off_t offset;
size_t len;
+ int32_t mode;
} fallocate;
struct {
@@ -720,10 +788,10 @@ typedef struct _afr_local {
struct {
char *volume;
char *basename;
+ void *xdata;
entrylk_cmd in_cmd;
entrylk_cmd cmd;
entrylk_type type;
- void *xdata;
} entrylk;
struct {
@@ -732,31 +800,33 @@ typedef struct _afr_local {
} seek;
struct {
- int32_t datasync;
- } fsync;
-
- struct {
struct gf_lease user_lease;
struct gf_lease ret_lease;
unsigned char *locked_nodes;
} lease;
- } cont;
+ struct {
+ int flags;
+ } rmdir;
- struct {
- off_t start, len;
+ struct {
+ int32_t datasync;
+ } fsync;
- gf_boolean_t eager_lock_on;
- gf_boolean_t do_eager_unlock;
+ struct {
+ uuid_t gfid_req;
+ gf_boolean_t needs_fresh_lookup;
+ } lookup;
+
+ } cont;
+ struct {
char *basename;
char *new_basename;
loc_t parent_loc;
loc_t new_parent_loc;
- afr_transaction_type type;
-
/* stub to resume on destruction
of the transaction frame */
call_stub_t *resume_stub;
@@ -774,6 +844,30 @@ typedef struct _afr_local {
FOP failed. */
unsigned char *failed_subvols;
+ call_frame_t *main_frame; /*Fop frame*/
+ call_frame_t *frame; /*Transaction frame*/
+
+ int (*wind)(call_frame_t *frame, xlator_t *this, int subvol);
+
+ int (*unwind)(call_frame_t *frame, xlator_t *this);
+
+ off_t start, len;
+
+ afr_transaction_type type;
+
+ int32_t in_flight_sb_errno; /* This is where the cause of the
+ failure on the last good copy of
+ the file is stored.
+ */
+
+ /* @changelog_resume: function to be called after changlogging
+ (either pre-op or post-op) is done
+ */
+ afr_changelog_resume_t changelog_resume;
+
+ gf_boolean_t eager_lock_on;
+ gf_boolean_t do_eager_unlock;
+
/* @dirtied: flag which indicates whether we set dirty flag
in the OP. Typically true when we are performing operation
on more than one subvol and optimistic changelog is disabled
@@ -798,6 +892,10 @@ typedef struct _afr_local {
*/
gf_boolean_t no_uninherit;
+ gf_boolean_t in_flight_sb; /* Indicator for occurrence of
+ split-brain while in the middle of
+ a txn. */
+
/* @uninherit_done:
@uninherit_value:
@@ -810,27 +908,7 @@ typedef struct _afr_local {
gf_boolean_t uninherit_done;
gf_boolean_t uninherit_value;
- gf_boolean_t in_flight_sb; /* Indicator for occurrence of
- split-brain while in the middle of
- a txn. */
- int32_t in_flight_sb_errno; /* This is where the cause of the
- failure on the last good copy of
- the file is stored.
- */
-
- /* @changelog_resume: function to be called after changlogging
- (either pre-op or post-op) is done
- */
- afr_changelog_resume_t changelog_resume;
-
- call_frame_t *main_frame; /*Fop frame*/
- call_frame_t *frame; /*Transaction frame*/
-
- int (*wind)(call_frame_t *frame, xlator_t *this, int subvol);
-
- int (*unwind)(call_frame_t *frame, xlator_t *this);
-
- /* post-op hook */
+ gf_boolean_t disable_delayed_post_op;
} transaction;
syncbarrier_t barrier;
@@ -843,26 +921,36 @@ typedef struct _afr_local {
mode_t umask;
int xflag;
- gf_boolean_t do_discovery;
struct afr_reply *replies;
/* For client side background heals. */
struct list_head healer;
call_frame_t *heal_frame;
- gf_boolean_t need_full_crawl;
- afr_fop_lock_state_t fop_lock_state;
+ afr_inode_ctx_t *inode_ctx;
+ /*For thin-arbiter transactions.*/
+ int ta_failed_subvol;
+ int ta_event_gen;
+ struct list_head ta_waitq;
+ struct list_head ta_onwireq;
+ afr_ta_fop_state_t fop_state;
+ afr_fop_lock_state_t fop_lock_state;
+ gf_lkowner_t saved_lk_owner;
+ unsigned char read_txn_query_child;
+ unsigned char ta_child_up;
+ gf_boolean_t do_discovery;
+ gf_boolean_t need_full_crawl;
gf_boolean_t is_read_txn;
- afr_inode_ctx_t *inode_ctx;
+ gf_boolean_t is_new_entry;
} afr_local_t;
typedef struct afr_spbc_timeout {
call_frame_t *frame;
- gf_boolean_t d_spb;
- gf_boolean_t m_spb;
loc_t *loc;
int spb_child_index;
+ gf_boolean_t d_spb;
+ gf_boolean_t m_spb;
} afr_spbc_timeout_t;
typedef struct afr_spb_status {
@@ -872,9 +960,9 @@ typedef struct afr_spb_status {
typedef struct afr_empty_brick_args {
call_frame_t *frame;
+ char *op_type;
loc_t loc;
int empty_index;
- char *op_type;
} afr_empty_brick_args_t;
typedef struct afr_read_subvol_args {
@@ -916,7 +1004,10 @@ afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
int event_generation);
int
-afr_inode_event_gen_reset(inode_t *inode, xlator_t *this);
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
+
+int
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
int
afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
@@ -948,11 +1039,14 @@ int
xattr_is_equal(dict_t *this, char *key1, data_t *value1, void *data);
int
-afr_init_entry_lockee(afr_entry_lockee_t *lockee, afr_local_t *local,
- loc_t *loc, char *basename, int child_count);
+afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
+ int child_count);
+
+int
+afr_add_inode_lockee(afr_local_t *local, int child_count);
void
-afr_entry_lockee_cleanup(afr_internal_lock_t *int_lock);
+afr_lockees_cleanup(afr_internal_lock_t *int_lock);
int
afr_attempt_lock_recovery(xlator_t *this, int32_t child_index);
@@ -970,10 +1064,7 @@ int32_t
afr_unlock(call_frame_t *frame, xlator_t *this);
int
-afr_nonblocking_entrylk(call_frame_t *frame, xlator_t *this);
-
-int
-afr_nonblocking_inodelk(call_frame_t *frame, xlator_t *this);
+afr_lock_nonblocking(call_frame_t *frame, xlator_t *this);
int
afr_blocking_lock(call_frame_t *frame, xlator_t *this);
@@ -1032,6 +1123,9 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
if (__local && __local->is_read_txn) \
afr_pending_read_decrement(__this->private, \
__local->read_subvol); \
+ if (__local && __local->xdata_req && \
+ afr_is_lock_mode_mandatory(__local->xdata_req)) \
+ afr_dom_lock_release(frame); \
frame->local = NULL; \
} \
\
@@ -1059,8 +1153,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
#define AFR_FRAME_INIT(frame, op_errno) \
({ \
frame->local = mem_get0(THIS->local_pool); \
- if (afr_local_init(frame->local, THIS->private, &op_errno)) { \
- afr_local_cleanup(frame->local, THIS); \
+ if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \
+ afr_local_cleanup(frame->local, frame->this); \
mem_put(frame->local); \
frame->local = NULL; \
}; \
@@ -1184,8 +1278,8 @@ int
afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
int spb_choice);
int
-afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
- int *spb_choice);
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol);
int
afr_get_child_index_from_name(xlator_t *this, char *name);
@@ -1240,9 +1334,6 @@ afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame);
void
afr_update_uninodelk(afr_local_t *local, afr_internal_lock_t *int_lock,
int32_t child_index);
-int
-afr_is_inodelk_transaction(afr_transaction_type type);
-
afr_fd_ctx_t *
__afr_fd_ctx_get(fd_t *fd, xlator_t *this);
@@ -1273,7 +1364,7 @@ int
afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode);
int
-afr_fill_ta_loc(xlator_t *this, loc_t *loc);
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop);
int
afr_ta_post_op_lock(xlator_t *this, loc_t *loc);
@@ -1289,4 +1380,44 @@ __afr_get_up_children_count(afr_private_t *priv);
call_frame_t *
afr_ta_frame_create(xlator_t *this);
+
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local);
+
+void
+afr_ta_lock_release_synctask(xlator_t *this);
+
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv);
+
+gf_boolean_t
+afr_lookup_has_quorum(call_frame_t *frame,
+ const unsigned int up_children_count);
+
+void
+afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this);
+
+void
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv,
+ int child);
+
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv);
+
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata);
+
+void
+afr_dom_lock_release(call_frame_t *frame);
+
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies);
+
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid);
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index 7cb2961f30a..56f1f2ad7c8 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -1,7 +1,4 @@
xlator_LTLIBRARIES = dht.la nufa.la switch.la
-if BUILD_GFDB
- xlator_LTLIBRARIES += tier.la
-endif
AM_CFLAGS = -Wall $(GF_CFLAGS)
@@ -16,45 +13,28 @@ dht_la_SOURCES = $(dht_common_source) dht.c
nufa_la_SOURCES = $(dht_common_source) nufa.c
switch_la_SOURCES = $(dht_common_source) switch.c
-tier_la_SOURCES = $(dht_common_source) tier.c tier-common.c
-dht_la_LDFLAGS = -module \
- -export-symbols $(top_srcdir)/xlators/cluster/dht/src/dht.sym \
- $(GF_XLATOR_LDFLAGS)
+dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-nufa_la_LDFLAGS = -module \
- -export-symbols $(top_srcdir)/xlators/cluster/dht/src/nufa.sym \
- $(GF_XLATOR_LDFLAGS)
+nufa_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-switch_la_LDFLAGS = -module \
- -export-symbols $(top_srcdir)/xlators/cluster/dht/src/switch.sym \
- $(GF_XLATOR_LDFLAGS)
+switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-tier_la_LDFLAGS = -module \
- -export-symbols $(top_srcdir)/xlators/cluster/dht/src/tier.sym \
- $(LIB_DL) $(GF_XLATOR_LDFLAGS)
-tier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \
- dht-lock.h tier-common.h tier.h \
- $(top_builddir)/xlators/lib/src/libxlator.h
+ dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
- -I$(top_srcdir)/libglusterfs/src/gfdb \
-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
-I$(top_srcdir)/rpc/rpc-lib/src \
-I$(top_srcdir)/xlators/lib/src \
-DDATADIR=\"$(localstatedir)\" \
- -DLIBDIR=\"$(libdir)\" \
- -DLIBGFDB_VERSION=\"$(LIBGFDB_VERSION)\"
+ -DLIBDIR=\"$(libdir)\"
CLEANFILES =
-EXTRA_DIST = dht.sym nufa.sym switch.sym tier.sym
-
uninstall-local:
rm -f $(DESTDIR)$(xlatordir)/distribute.so
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index f43a10bec2f..8ba0cc4c732 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -10,48 +10,52 @@
/* TODO: add NS locking */
-#include "glusterfs.h"
-#include "xlator.h"
#include "libxlator.h"
#include "dht-common.h"
#include "dht-lock.h"
-#include "defaults.h"
-#include "byte-order.h"
-#include "quota-common-utils.h"
-#include "upcall-utils.h"
+#include <glusterfs/byte-order.h>
+#include <glusterfs/quota-common-utils.h>
+#include <glusterfs/upcall-utils.h>
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include <glusterfs/common-utils.h>
#include <sys/time.h>
#include <libgen.h>
#include <signal.h>
-int run_defrag = 0;
-
-int
-dht_link2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries,
+ dict_t *xdata);
-int
-dht_removexattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame,
- int ret);
+static int
+dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
-int
-dht_setxattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req);
-int
-dht_rmdir_readdirp_do(call_frame_t *readdirp_frame, xlator_t *this);
+static int
+dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this);
-int
-dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict,
- dict_t *xdata);
+static int
+dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
-int
-dht_set_file_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req);
+static int
+dht_rmdir_unlock(call_frame_t *frame, xlator_t *this);
-int
-dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req);
+static const char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL};
-int
-dht_do_fresh_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc);
+/* Check the xdata to make sure EBADF has been set by client xlator */
+int32_t
+dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno)
+{
+ if (op_ret == -1 && (op_errno == EBADF || op_errno == EBADFD) &&
+ !(local->fd_checked)) {
+ return 1;
+ }
+ return 0;
+}
/* Sets the blocks and size values to fixed values. This is to be called
* only for dirs. The caller is responsible for checking the type
@@ -67,67 +71,17 @@ dht_set_fixed_dir_stat(struct iatt *stat)
return -1;
}
-/* Set both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY
- * Use DHT_MODE_IN_XDATA_KEY if available. Else fall back to
- * DHT_IATT_IN_XDATA_KEY
- */
-int
-dht_request_iatt_in_xdata(xlator_t *this, dict_t *xattr_req)
-{
- int ret = -1;
-
- ret = dict_set_int8(xattr_req, DHT_MODE_IN_XDATA_KEY, 1);
- ret = dict_set_int8(xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
-
- /* At least one call succeeded */
- return ret;
-}
-
-/* Get both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY
- * Use DHT_MODE_IN_XDATA_KEY if available, else fall back to
- * DHT_IATT_IN_XDATA_KEY
- * This will return a dummy iatt with only the mode and type set
- */
-int
-dht_read_iatt_from_xdata(xlator_t *this, dict_t *xdata, struct iatt *stbuf)
-{
- int ret = -1;
- int32_t mode = 0;
-
- ret = dict_get_int32(xdata, DHT_MODE_IN_XDATA_KEY, &mode);
-
- if (ret) {
- ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
- } else {
- stbuf->ia_prot = ia_prot_from_st_mode(mode);
- stbuf->ia_type = ia_type_from_st_mode(mode);
- }
-
- return ret;
-}
-
-int
-dht_rmdir_unlock(call_frame_t *frame, xlator_t *this);
-
-char *xattrs_to_heal[] = {"user.",
- POSIX_ACL_ACCESS_XATTR,
- POSIX_ACL_DEFAULT_XATTR,
- QUOTA_LIMIT_KEY,
- QUOTA_LIMIT_OBJECTS_KEY,
- GF_SELINUX_XATTR_KEY,
- NULL};
-
-char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL};
-
/* Return true if key exists in array
*/
static gf_boolean_t
dht_match_xattr(const char *key)
{
+ char **xattrs_to_heal = get_xattrs_to_heal();
+
return gf_get_index_by_elem(xattrs_to_heal, (char *)key) >= 0;
}
-int
+static int
dht_aggregate_quota_xattr(dict_t *dst, char *key, data_t *value)
{
int ret = -1;
@@ -190,7 +144,7 @@ out:
return ret;
}
-int
+static int
add_opt(char **optsp, const char *opt)
{
char *newopts = NULL;
@@ -268,7 +222,7 @@ out:
*/
-int
+static int
dht_aggregate_split_brain_xattr(dict_t *dst, char *key, data_t *value)
{
int ret = 0;
@@ -367,7 +321,7 @@ out:
return ret;
}
-int
+static int
dht_aggregate(dict_t *this, char *key, data_t *value, void *data)
{
dict_t *dst = NULL;
@@ -414,7 +368,7 @@ out:
return ret;
}
-void
+static void
dht_aggregate_xattr(dict_t *dst, dict_t *src)
{
if ((dst == NULL) || (src == NULL)) {
@@ -440,7 +394,7 @@ dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol)
{
ret = __inode_ctx_get(inode, this, &ctx_int);
if (ctx_int) {
- ctx = (dht_inode_ctx_t *)ctx_int;
+ ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int;
ctx->mds_subvol = mds_subvol;
} else {
ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
@@ -496,7 +450,7 @@ dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol)
- complete linkfile selfheal
*/
-int
+static int
dht_lookup_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
@@ -538,7 +492,7 @@ out:
return ret;
}
-int
+static int
dht_discover_complete(xlator_t *this, call_frame_t *discover_frame)
{
dht_local_t *local = NULL;
@@ -659,13 +613,14 @@ dht_discover_complete(xlator_t *this, call_frame_t *discover_frame)
if (local->need_xattr_heal && !heal_path) {
local->need_xattr_heal = 0;
- ret = dht_dir_xattr_heal(this, local);
- if (ret)
- gf_msg(this->name, GF_LOG_ERROR, ret,
+ ret = dht_dir_xattr_heal(this, local, &op_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
DHT_MSG_DIR_XATTR_HEAL_FAILED,
"xattr heal failed for "
"directory gfid is %s ",
gfid_local);
+ }
}
}
@@ -726,7 +681,7 @@ out:
return ret;
}
-int
+static int
dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
@@ -735,6 +690,7 @@ dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int ret = -1;
dht_conf_t *conf = 0;
dht_layout_t *layout = NULL;
+ int32_t mds_heal_fresh_lookup = 0;
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
@@ -742,6 +698,7 @@ dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
conf = this->private;
layout = local->selfheal.layout;
+ mds_heal_fresh_lookup = local->mds_heal_fresh_lookup;
if (op_ret) {
gf_msg_debug(this->name, op_ret,
@@ -762,11 +719,63 @@ dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
layout);
}
out:
- if (local && local->mds_heal_fresh_lookup)
+ if (mds_heal_fresh_lookup)
DHT_STACK_DESTROY(frame);
return 0;
}
+static xlator_t *
+dht_inode_get_hashed_subvol(inode_t *inode, xlator_t *this, loc_t *loc)
+{
+ char *path = NULL;
+ loc_t populate_loc = {
+ 0,
+ };
+ char *name = NULL;
+ xlator_t *hash_subvol = NULL;
+
+ if (!inode)
+ return hash_subvol;
+
+ if (loc && loc->parent && loc->path) {
+ if (!loc->name) {
+ name = strrchr(loc->path, '/');
+ if (name) {
+ loc->name = name + 1;
+ } else {
+ goto out;
+ }
+ }
+ hash_subvol = dht_subvol_get_hashed(this, loc);
+ goto out;
+ }
+
+ if (!gf_uuid_is_null(inode->gfid)) {
+ populate_loc.inode = inode_ref(inode);
+ populate_loc.parent = inode_parent(populate_loc.inode, NULL, NULL);
+ inode_path(populate_loc.inode, NULL, &path);
+
+ if (!path)
+ goto out;
+
+ populate_loc.path = path;
+ if (!populate_loc.name && populate_loc.path) {
+ name = strrchr(populate_loc.path, '/');
+ if (name) {
+ populate_loc.name = name + 1;
+
+ } else {
+ goto out;
+ }
+ }
+ hash_subvol = dht_subvol_get_hashed(this, &populate_loc);
+ }
+out:
+ if (populate_loc.inode)
+ loc_wipe(&populate_loc);
+ return hash_subvol;
+}
+
/* Common function call by revalidate/selfheal code path to populate
internal xattr if it is not present, mark_during_fresh_lookup value
determines either function is call by revalidate_cbk(discover_complete)
@@ -801,9 +810,8 @@ dht_common_mark_mdsxattr(call_frame_t *frame, int *errst,
call_frame_t *xattr_frame = NULL;
gf_boolean_t vol_down = _gf_false;
- this = frame->this;
-
GF_VALIDATE_OR_GOTO("dht", frame, out);
+ this = frame->this;
GF_VALIDATE_OR_GOTO("dht", this, out);
GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
@@ -812,6 +820,7 @@ dht_common_mark_mdsxattr(call_frame_t *frame, int *errst,
conf = this->private;
layout = local->selfheal.layout;
local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
+
gf_uuid_unparse(local->gfid, gfid_local);
/* Code to update hashed subvol consider as a mds subvol
@@ -852,7 +861,8 @@ dht_common_mark_mdsxattr(call_frame_t *frame, int *errst,
"Failed to get hashed subvol for path %s"
"gfid is %s ",
local->loc.path, gfid_local);
- (*errst) = 1;
+ if (errst)
+ (*errst) = 1;
ret = -1;
goto out;
}
@@ -923,7 +933,44 @@ out:
return ret;
}
-int
+/* Get the value of key from dict in the bytewise and save in array after
+ convert from network byte order to host byte order
+*/
+static int32_t
+dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size,
+ int *errst)
+{
+ void *ptr = NULL;
+ int32_t len = -1;
+ int32_t vindex = -1;
+ int32_t err = -1;
+ int ret = 0;
+
+ if (dict == NULL) {
+ (*errst) = -1;
+ return -EINVAL;
+ }
+ err = dict_get_ptr_and_len(dict, key, &ptr, &len);
+ if (err != 0) {
+ (*errst) = -1;
+ return err;
+ }
+
+ if (len != (size * sizeof(int32_t))) {
+ (*errst) = -1;
+ return -EINVAL;
+ }
+
+ for (vindex = 0; vindex < size; vindex++) {
+ value[vindex] = ntoh32(*((int32_t *)ptr + vindex));
+ if (value[vindex] < 0)
+ ret = -1;
+ }
+
+ return ret;
+}
+
+static int
dht_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, inode_t *inode, struct iatt *stbuf,
dict_t *xattr, struct iatt *postparent)
@@ -1018,7 +1065,7 @@ dht_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
if (local->xattr == NULL) {
local->xattr = dict_ref(xattr);
} else {
- /* Don't aggregate for files. See BZ#1484113 */
+ /* Don't aggregate for files. See BZ#1484709 */
if (is_dir)
dht_aggregate_xattr(local->xattr, xattr);
}
@@ -1084,7 +1131,53 @@ out:
return 0;
}
-int
+static int
+dht_set_file_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ int ret = -EINVAL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf) {
+ goto err;
+ }
+
+ if (!xattr_req) {
+ goto err;
+ }
+
+ /* Used to check whether this is a linkto file.
+ */
+ ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->link_xattr_name, loc->path);
+ goto err;
+ }
+
+ /* This is used to make sure we don't unlink linkto files
+ * which are the target of an ongoing file migration.
+ */
+ ret = dict_set_uint32(xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ GLUSTERFS_OPEN_FD_COUNT, loc->path);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ return ret;
+}
+
+/* This is a gfid based nameless lookup. Without a name, the hashed subvol
+ * cannot be calculated so a lookup is sent to all subvols.
+ */
+static int
dht_do_discover(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
int ret;
@@ -1098,6 +1191,9 @@ dht_do_discover(call_frame_t *frame, xlator_t *this, loc_t *loc)
conf = this->private;
local = frame->local;
+ /* As we do not know if this is a file or directory, request
+ * both file and directory xattrs
+ */
ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
if (ret) {
goto err;
@@ -1109,6 +1205,9 @@ dht_do_discover(call_frame_t *frame, xlator_t *this, loc_t *loc)
}
if (loc_is_root(loc)) {
+ /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash)
+ * set on the brick root.
+ */
ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name,
sizeof(uint32_t));
}
@@ -1150,48 +1249,11 @@ err:
return 0;
}
-/* Get the value of key from dict in the bytewise and save in array after
- convert from network byte order to host byte order
-*/
-int32_t
-dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size,
- int *errst)
-{
- void *ptr = NULL;
- int32_t len = -1;
- int32_t vindex = -1;
- int32_t err = -1;
- int ret = 0;
-
- if (dict == NULL) {
- (*errst) = -1;
- return -EINVAL;
- }
- err = dict_get_ptr_and_len(dict, key, &ptr, &len);
- if (err != 0) {
- (*errst) = -1;
- return err;
- }
-
- if (len != (size * sizeof(int32_t))) {
- (*errst) = -1;
- return -EINVAL;
- }
-
- for (vindex = 0; vindex < size; vindex++) {
- value[vindex] = ntoh32(*((int32_t *)ptr + vindex));
- if (value[vindex] < 0)
- ret = -1;
- }
-
- return ret;
-}
-
/* Code to call syntask to heal custom xattr from hashed subvol
to non hashed subvol
*/
int
-dht_dir_xattr_heal(xlator_t *this, dht_local_t *local)
+dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno)
{
dht_local_t *copy_local = NULL;
call_frame_t *copy = NULL;
@@ -1203,6 +1265,7 @@ dht_dir_xattr_heal(xlator_t *this, dht_local_t *local)
"No gfid exists for path %s "
"so healing xattr is not possible",
local->loc.path);
+ *op_errno = EIO;
goto out;
}
@@ -1216,6 +1279,7 @@ dht_dir_xattr_heal(xlator_t *this, dht_local_t *local)
"Memory allocation failed "
"for path %s gfid %s ",
local->loc.path, gfid_local);
+ *op_errno = ENOMEM;
DHT_STACK_DESTROY(copy);
} else {
copy_local->stbuf = local->stbuf;
@@ -1230,6 +1294,7 @@ dht_dir_xattr_heal(xlator_t *this, dht_local_t *local)
"Synctask creation failed to heal xattr "
"for path %s gfid %s ",
local->loc.path, gfid_local);
+ *op_errno = ENOMEM;
DHT_STACK_DESTROY(copy);
}
}
@@ -1238,6 +1303,51 @@ out:
return ret;
}
+static int
+dht_needs_selfheal(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int needs_selfheal = 0;
+ int ret = 0;
+
+ local = frame->local;
+ layout = local->layout;
+
+ if (local->need_attrheal || local->need_xattr_heal ||
+ local->need_selfheal) {
+ needs_selfheal = 1;
+ }
+
+ ret = dht_layout_normalize(this, &local->loc, layout);
+
+ if (ret != 0) {
+ gf_msg_debug(this->name, 0, "fixing assignment on %s", local->loc.path);
+ needs_selfheal = 1;
+ }
+ return needs_selfheal;
+}
+
+static int
+is_permission_different(ia_prot_t *prot1, ia_prot_t *prot2)
+{
+ if ((prot1->owner.read != prot2->owner.read) ||
+ (prot1->owner.write != prot2->owner.write) ||
+ (prot1->owner.exec != prot2->owner.exec) ||
+ (prot1->group.read != prot2->group.read) ||
+ (prot1->group.write != prot2->group.write) ||
+ (prot1->group.exec != prot2->group.exec) ||
+ (prot1->other.read != prot2->other.read) ||
+ (prot1->other.write != prot2->other.write) ||
+ (prot1->other.exec != prot2->other.exec) ||
+ (prot1->suid != prot2->suid) || (prot1->sgid != prot2->sgid) ||
+ (prot1->sticky != prot2->sticky)) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
int
dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
@@ -1255,8 +1365,6 @@ dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
char gfid_local[GF_UUID_BUF_SIZE] = {0};
char gfid_node[GF_UUID_BUF_SIZE] = {0};
int32_t mds_xattr_val[1] = {0};
- call_frame_t *copy = NULL;
- dht_local_t *copy_local = NULL;
GF_VALIDATE_OR_GOTO("dht", frame, out);
GF_VALIDATE_OR_GOTO("dht", this, out);
@@ -1269,7 +1377,11 @@ dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
conf = this->private;
layout = local->layout;
+ gf_msg_debug(this->name, op_errno,
+ "%s: lookup on %s returned with op_ret = %d, op_errno = %d",
+ local->loc.path, prev->name, op_ret, op_errno);
+ /* The first successful lookup*/
if (!op_ret && gf_uuid_is_null(local->gfid)) {
memcpy(local->gfid, stbuf->ia_gfid, 16);
}
@@ -1291,29 +1403,28 @@ dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
{
/* TODO: assert equal mode on stbuf->st_mode and
local->stbuf->st_mode
-
else mkdir/chmod/chown and fix
*/
ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
if (op_ret == -1) {
local->op_errno = op_errno;
- gf_msg_debug(this->name, op_errno,
- "lookup of %s on %s returned error", local->loc.path,
- prev->name);
+ /* The GFID is missing on this subvol. Force a heal. */
+ if (op_errno == ENODATA) {
+ local->need_lookup_everywhere = 1;
+ }
goto unlock;
}
is_dir = check_is_dir(inode, stbuf, xattr);
if (!is_dir) {
gf_msg_debug(this->name, 0,
- "lookup of %s on %s returned non"
- "dir 0%o"
+ "%s: lookup on %s returned non dir 0%o"
"calling lookup_everywhere",
local->loc.path, prev->name, stbuf->ia_type);
- local->need_selfheal = 1;
+ local->need_lookup_everywhere = 1;
goto unlock;
}
@@ -1324,19 +1435,31 @@ dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
dht_aggregate_xattr(local->xattr, xattr);
}
- if (dict_get(xattr, conf->mds_xattr_key)) {
- local->mds_subvol = prev;
- local->mds_stbuf.ia_gid = stbuf->ia_gid;
- local->mds_stbuf.ia_uid = stbuf->ia_uid;
- local->mds_stbuf.ia_prot = stbuf->ia_prot;
+ if (__is_root_gfid(stbuf->ia_gfid)) {
+ ret = dht_dir_has_layout(xattr, conf->xattr_name);
+ if (ret >= 0) {
+ if (is_greater_time(local->prebuf.ia_ctime,
+ local->prebuf.ia_ctime_nsec,
+ stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
+ /* Choose source */
+ local->prebuf.ia_gid = stbuf->ia_gid;
+ local->prebuf.ia_uid = stbuf->ia_uid;
+
+ local->prebuf.ia_ctime = stbuf->ia_ctime;
+ local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec;
+ local->prebuf.ia_prot = stbuf->ia_prot;
+ }
+ }
}
if (local->stbuf.ia_type != IA_INVAL) {
- if (!__is_root_gfid(stbuf->ia_gfid) &&
- ((local->stbuf.ia_gid != stbuf->ia_gid) ||
- (local->stbuf.ia_uid != stbuf->ia_uid) ||
- (is_permission_different(&local->stbuf.ia_prot,
- &stbuf->ia_prot)))) {
+ /* This is not the first subvol to respond
+ * Compare values to see if attrs need to be healed
+ */
+ if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
+ (local->stbuf.ia_uid != stbuf->ia_uid) ||
+ (is_permission_different(&local->stbuf.ia_prot,
+ &stbuf->ia_prot))) {
local->need_attrheal = 1;
}
}
@@ -1349,125 +1472,99 @@ dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (!dict_get(xattr, conf->mds_xattr_key)) {
gf_msg_debug(this->name, 0,
- "Internal xattr %s is not present "
- " on path %s gfid is %s ",
- conf->mds_xattr_key, local->loc.path, gfid_local);
+ "%s: mds xattr %s is not present "
+ "on %s(gfid = %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid_local);
goto unlock;
- } else {
- /* Save mds subvol on inode ctx */
- ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- DHT_MSG_SET_INODE_CTX_FAILED,
- "Failed to set hashed subvol for %s vol is %s",
- local->loc.path, prev->name);
- }
+ }
+
+ /* Save the mds subvol info and stbuf. This is the value that will
+ * be used for healing
+ */
+ local->mds_subvol = prev;
+ local->mds_stbuf = *stbuf;
+
+ /* Save mds subvol on inode ctx */
+
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "%s: Failed to set mds (%s)", local->loc.path, prev->name);
}
check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
mds_xattr_val, 1, &errst);
if ((check_mds < 0) && !errst) {
+ /* Check if xattrs need to be healed on the directories */
local->mds_xattr = dict_ref(xattr);
gf_msg_debug(this->name, 0,
- "Value of %s is not zero on hashed subvol "
- "so xattr needs to be heal on non hashed"
- " path is %s and vol name is %s "
- " gfid is %s",
- conf->mds_xattr_key, local->loc.path, prev->name,
+ "%s: %s is not zero on %s. Xattrs need to be healed."
+ "(gfid = %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
gfid_local);
local->need_xattr_heal = 1;
- local->mds_subvol = prev;
}
}
+
unlock:
UNLOCK(&frame->lock);
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt)) {
+ /* If the mds subvol is not set correctly*/
+ if (!__is_root_gfid(local->gfid) &&
+ (!dict_get(local->xattr, conf->mds_xattr_key))) {
+ local->need_selfheal = 1;
+ }
+
/* No need to call xattr heal code if volume count is 1
*/
- if (conf->subvolume_cnt == 1)
+ if (conf->subvolume_cnt == 1) {
local->need_xattr_heal = 0;
+ }
- /* Code to update all extended attributed from hashed subvol
- to local->xattr
- */
- if (local->need_xattr_heal && (local->mds_xattr)) {
- dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr,
- NULL, NULL);
- dict_unref(local->mds_xattr);
- local->mds_xattr = NULL;
+ if (local->need_selfheal || local->need_lookup_everywhere) {
+ /* Set the gfid-req so posix will set the GFID*/
+ if (!gf_uuid_is_null(local->gfid)) {
+ /* Ok, this should _never_ happen */
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid, 16);
+ } else {
+ if (!gf_uuid_is_null(local->gfid_req))
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid_req, 16);
+ }
}
- if (local->need_selfheal) {
- local->need_selfheal = 0;
+ if (local->need_lookup_everywhere) {
+ local->need_lookup_everywhere = 0;
dht_lookup_everywhere(frame, this, &local->loc);
return 0;
}
if (local->op_ret == 0) {
- ret = dht_layout_normalize(this, &local->loc, layout);
-
- if (ret != 0) {
- gf_msg_debug(this->name, 0, "fixing assignment on %s",
- local->loc.path);
+ if (dht_needs_selfheal(frame, this)) {
goto selfheal;
}
dht_layout_set(this, local->inode, layout);
- if (!dict_get(local->xattr, conf->mds_xattr_key) ||
- local->need_xattr_heal)
- goto selfheal;
- }
-
- if (local->inode) {
- dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
- }
-
- if (local->loc.parent) {
- dht_inode_ctx_time_update(local->loc.parent, this,
- &local->postparent, 1);
- }
-
- if (local->need_attrheal) {
- local->need_attrheal = 0;
- if (!__is_root_gfid(inode->gfid)) {
- local->stbuf.ia_gid = local->mds_stbuf.ia_gid;
- local->stbuf.ia_uid = local->mds_stbuf.ia_uid;
- local->stbuf.ia_prot = local->mds_stbuf.ia_prot;
+ if (local->inode) {
+ dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
}
- copy = create_frame(this, this->ctx->pool);
- if (copy) {
- copy_local = dht_local_init(copy, &local->loc, NULL, 0);
- if (!copy_local) {
- DHT_STACK_DESTROY(copy);
- goto skip_attr_heal;
- }
- copy_local->stbuf = local->stbuf;
- gf_uuid_copy(copy_local->loc.gfid, local->stbuf.ia_gfid);
- copy_local->mds_stbuf = local->mds_stbuf;
- copy_local->mds_subvol = local->mds_subvol;
- copy->local = copy_local;
- FRAME_SU_DO(copy, dht_local_t);
- ret = synctask_new(this->ctx->env, dht_dir_attr_heal,
- dht_dir_attr_heal_done, copy, copy);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_DIR_ATTR_HEAL_FAILED,
- "Synctask creation failed to heal attr "
- "for path %s gfid %s ",
- local->loc.path, local->gfid);
- DHT_STACK_DESTROY(copy);
- }
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
}
}
- skip_attr_heal:
DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
dht_set_fixed_dir_stat(&local->postparent);
/* Delete mds xattr at the time of STACK UNWIND */
if (local->xattr)
GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
+
DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
local->inode, &local->stbuf, local->xattr,
&local->postparent);
@@ -1483,24 +1580,57 @@ out:
return ret;
}
-int
-is_permission_different(ia_prot_t *prot1, ia_prot_t *prot2)
+static int
+dht_lookup_directory(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
- if ((prot1->owner.read != prot2->owner.read) ||
- (prot1->owner.write != prot2->owner.write) ||
- (prot1->owner.exec != prot2->owner.exec) ||
- (prot1->group.read != prot2->group.read) ||
- (prot1->group.write != prot2->group.write) ||
- (prot1->group.exec != prot2->group.exec) ||
- (prot1->other.read != prot2->other.read) ||
- (prot1->other.write != prot2->other.write) ||
- (prot1->other.exec != prot2->other.exec) ||
- (prot1->suid != prot2->suid) || (prot1->sgid != prot2->sgid) ||
- (prot1->sticky != prot2->sticky)) {
- return 1;
- } else {
- return 0;
+ int call_cnt = 0;
+ int i = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int ret = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, unwind);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, unwind);
+ GF_VALIDATE_OR_GOTO("dht", this->private, unwind);
+ GF_VALIDATE_OR_GOTO("dht", loc, unwind);
+
+ conf = this->private;
+ local = frame->local;
+
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ goto unwind;
+ }
+
+ if (local->xattr != NULL) {
+ dict_unref(local->xattr);
+ local->xattr = NULL;
+ }
+
+ if (!gf_uuid_is_null(local->gfid)) {
+ /* use this gfid in order to heal any missing ones */
+ ret = dict_set_gfuuid(local->xattr_req, "gfid-req", local->gfid, true);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: Failed to set dictionary value:"
+ " key = gfid-req",
+ local->loc.path);
}
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(
+ frame, dht_lookup_dir_cbk, conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc, local->xattr_req);
+ }
+ return 0;
+unwind:
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+out:
+ return 0;
}
int
@@ -1517,27 +1647,26 @@ dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int is_dir = 0;
int is_linkfile = 0;
int follow_link = 0;
- call_frame_t *copy = NULL;
- dht_local_t *copy_local = NULL;
char gfid[GF_UUID_BUF_SIZE] = {0};
uint32_t vol_commit_hash = 0;
xlator_t *subvol = NULL;
int32_t check_mds = 0;
- int errst = 0;
+ int errst = 0, i = 0;
int32_t mds_xattr_val[1] = {0};
GF_VALIDATE_OR_GOTO("dht", frame, err);
GF_VALIDATE_OR_GOTO("dht", this, err);
GF_VALIDATE_OR_GOTO("dht", frame->local, err);
GF_VALIDATE_OR_GOTO("dht", cookie, err);
+ GF_VALIDATE_OR_GOTO("dht", this->private, err);
local = frame->local;
prev = cookie;
conf = this->private;
- if (!conf)
- goto out;
if (!conf->vch_forced) {
+ /* Update the commithash value if available
+ */
ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
&vol_commit_hash);
if (ret == 0) {
@@ -1547,17 +1676,16 @@ dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
gf_uuid_unparse(local->loc.gfid, gfid);
+ gf_msg_debug(this->name, op_errno,
+ "%s: revalidate lookup on %s returned op_ret %d",
+ local->loc.path, prev->name, op_ret);
+
LOCK(&frame->lock);
{
if (gf_uuid_is_null(local->gfid)) {
memcpy(local->gfid, local->loc.gfid, 16);
}
- gf_msg_debug(this->name, op_errno,
- "revalidate lookup of %s "
- "returned with op_ret %d",
- local->loc.path, op_ret);
-
if (op_ret == -1) {
local->op_errno = op_errno;
@@ -1589,8 +1717,27 @@ dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path);
local->need_lookup_everywhere = 1;
+ } else if (IA_ISDIR(local->loc.inode->ia_type)) {
+ layout = local->layout;
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == prev) {
+ layout->list[i].err = op_errno;
+ break;
+ }
+ }
+
+ local->need_selfheal = 1;
}
}
+
+ /* The GFID is missing on this subvol. Lookup everywhere to force a
+ * gfid heal
+ */
+ if ((op_errno == ENODATA) &&
+ (IA_ISDIR(local->loc.inode->ia_type))) {
+ local->need_lookup_everywhere = 1;
+ }
+
goto unlock;
}
@@ -1620,13 +1767,16 @@ dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (is_dir) {
ret = dht_dir_has_layout(xattr, conf->xattr_name);
if (ret >= 0) {
- if (is_greater_time(local->stbuf.ia_ctime,
- local->stbuf.ia_ctime_nsec, stbuf->ia_ctime,
- stbuf->ia_ctime_nsec)) {
+ if (is_greater_time(local->prebuf.ia_ctime,
+ local->prebuf.ia_ctime_nsec,
+ stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
/* Choose source */
local->prebuf.ia_gid = stbuf->ia_gid;
local->prebuf.ia_uid = stbuf->ia_uid;
+ local->prebuf.ia_ctime = stbuf->ia_ctime;
+ local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec;
+
if (__is_root_gfid(stbuf->ia_gfid))
local->prebuf.ia_prot = stbuf->ia_prot;
}
@@ -1637,15 +1787,16 @@ dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
(local->stbuf.ia_uid != stbuf->ia_uid) ||
is_permission_different(&local->stbuf.ia_prot,
&stbuf->ia_prot)) {
- local->need_selfheal = 1;
+ local->need_attrheal = 1;
}
}
if (!dict_get(xattr, conf->mds_xattr_key)) {
gf_msg_debug(this->name, 0,
- "internal xattr %s is not present"
- " on path %s gfid is %s ",
- conf->mds_xattr_key, local->loc.path, gfid);
+ "%s: internal xattr %s is not present"
+ " on subvol %s(gfid is %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid);
} else {
check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
mds_xattr_val, 1, &errst);
@@ -1663,6 +1814,8 @@ dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path, prev->name);
}
if ((check_mds < 0) && !errst) {
+ /* Check if xattrs need to be healed on the directory
+ */
local->mds_xattr = dict_ref(xattr);
gf_msg_debug(this->name, 0,
"Value of %s is not zero on "
@@ -1678,6 +1831,8 @@ dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
ret = dht_layout_dir_mismatch(this, layout, prev, &local->loc,
xattr);
if (ret != 0) {
+ /* In memory layout does not match on-disk layout.
+ */
gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_MISMATCH,
"Mismatching layouts for %s, gfid = %s", local->loc.path,
gfid);
@@ -1688,19 +1843,9 @@ dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
}
- /* Update stbuf from the servers where layout is present. This
- * is an indication that the server is not a newly added brick.
- * Merging stbuf from newly added brick may result in the added
- * brick being the source of heal for uid/gid */
- if (!is_dir ||
- (is_dir && dht_dir_has_layout(xattr, conf->xattr_name) >= 0) ||
- conf->subvolume_cnt == 1) {
- dht_iatt_merge(this, &local->stbuf, stbuf);
- dht_iatt_merge(this, &local->postparent, postparent);
- } else {
- /* copy the gfid anyway */
- gf_uuid_copy(local->stbuf.ia_gfid, stbuf->ia_gfid);
- }
+ gf_uuid_copy(local->stbuf.ia_gfid, stbuf->ia_gfid);
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->postparent, postparent);
local->op_ret = 0;
@@ -1714,6 +1859,8 @@ unlock:
UNLOCK(&frame->lock);
if (follow_link) {
+ /* Found a linkto file. Follow it to see if the target file exists
+ */
gf_uuid_copy(local->gfid, stbuf->ia_gfid);
subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
@@ -1728,7 +1875,6 @@ unlock:
}
}
-out:
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt)) {
@@ -1744,71 +1890,31 @@ out:
local->need_xattr_heal = 0;
if (IA_ISDIR(local->stbuf.ia_type)) {
- /* Code to update all extended attributed from hashed
- subvol to local->xattr and call heal code to heal
- custom xattr from hashed subvol to non-hashed subvol
- */
- if (local->need_xattr_heal && (local->mds_xattr)) {
- dht_dir_set_heal_xattr(this, local, local->xattr,
- local->mds_xattr, NULL, NULL);
- dict_unref(local->mds_xattr);
- local->mds_xattr = NULL;
- local->need_xattr_heal = 0;
- ret = dht_dir_xattr_heal(this, local);
- if (ret)
- gf_msg(this->name, GF_LOG_ERROR, ret,
- DHT_MSG_DIR_XATTR_HEAL_FAILED,
- "xattr heal failed for directory %s "
- " gfid %s ",
- local->loc.path, gfid);
- } else {
- /* Call function to save hashed subvol on inode
- ctx if internal mds xattr is not present and
- all subvols are up
- */
- if (inode && !__is_root_gfid(inode->gfid) && (!local->op_ret))
- (void)dht_common_mark_mdsxattr(frame, NULL, 1);
- }
- }
- if (local->need_selfheal) {
- local->need_selfheal = 0;
- if (!__is_root_gfid(inode->gfid)) {
- gf_uuid_copy(local->gfid, local->mds_stbuf.ia_gfid);
- local->stbuf.ia_gid = local->mds_stbuf.ia_gid;
- local->stbuf.ia_uid = local->mds_stbuf.ia_uid;
- local->stbuf.ia_prot = local->mds_stbuf.ia_prot;
- } else {
- gf_uuid_copy(local->gfid, local->stbuf.ia_gfid);
- local->stbuf.ia_gid = local->prebuf.ia_gid;
- local->stbuf.ia_uid = local->prebuf.ia_uid;
- local->stbuf.ia_prot = local->prebuf.ia_prot;
- }
-
- copy = create_frame(this, this->ctx->pool);
- if (copy) {
- copy_local = dht_local_init(copy, &local->loc, NULL, 0);
- if (!copy_local) {
- DHT_STACK_DESTROY(copy);
- goto cont;
- }
- copy_local->stbuf = local->stbuf;
- copy_local->mds_stbuf = local->mds_stbuf;
- copy_local->mds_subvol = local->mds_subvol;
- copy->local = copy_local;
- FRAME_SU_DO(copy, dht_local_t);
- ret = synctask_new(this->ctx->env, dht_dir_attr_heal,
- dht_dir_attr_heal_done, copy, copy);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_DIR_ATTR_HEAL_FAILED,
- "Synctask creation failed to heal attr "
- "for path %s gfid %s ",
- local->loc.path, local->gfid);
- DHT_STACK_DESTROY(copy);
+ /* No mds xattr found. Trigger a heal to set it */
+ if (!__is_root_gfid(local->loc.inode->gfid) &&
+ (!dict_get(local->xattr, conf->mds_xattr_key)))
+ local->need_selfheal = 1;
+
+ if (dht_needs_selfheal(frame, this)) {
+ if (!__is_root_gfid(local->loc.inode->gfid)) {
+ if (local->mds_subvol) {
+ local->stbuf.ia_gid = local->mds_stbuf.ia_gid;
+ local->stbuf.ia_uid = local->mds_stbuf.ia_uid;
+ local->stbuf.ia_prot = local->mds_stbuf.ia_prot;
+ }
+ } else {
+ local->stbuf.ia_gid = local->prebuf.ia_gid;
+ local->stbuf.ia_uid = local->prebuf.ia_uid;
+ local->stbuf.ia_prot = local->prebuf.ia_prot;
}
+
+ layout = local->layout;
+ dht_selfheal_directory(frame, dht_lookup_selfheal_cbk,
+ &local->loc, layout);
+ return 0;
}
}
- cont:
+
if (local->layout_mismatch) {
/* Found layout mismatch in the directory, need to
fix this in the inode context */
@@ -1824,9 +1930,16 @@ out:
dht_layout_unref(this, local->layout);
local->layout = NULL;
- /* We know that current cached subvol is no more
+ /* We know that current cached subvol is no longer
valid, get the new one */
local->cached_subvol = NULL;
+ if (local->xattr_req) {
+ if (!gf_uuid_is_null(local->gfid)) {
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid, 16);
+ }
+ }
+
dht_lookup_everywhere(frame, this, &local->loc);
return 0;
}
@@ -1867,12 +1980,11 @@ err:
return ret;
}
-int
-dht_lookup_linkfile_create_cbk(call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
+static int
+dht_lookup_linkfile_create_cbk(call_frame_t *frame, void *cooie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
dht_local_t *local = NULL;
xlator_t *cached_subvol = NULL;
@@ -1934,7 +2046,7 @@ out:
return ret;
}
-int
+static int
dht_lookup_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
@@ -1960,7 +2072,7 @@ dht_lookup_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-int
+static int
dht_lookup_unlink_of_false_linkto_cbk(call_frame_t *frame, void *cookie,
xlator_t *this, int op_ret, int op_errno,
struct iatt *preparent,
@@ -1982,7 +2094,7 @@ dht_lookup_unlink_of_false_linkto_cbk(call_frame_t *frame, void *cookie,
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt)) {
- if (op_ret == 0) {
+ if ((op_ret == 0) || ((op_errno != EBUSY) && (op_errno != ENOTCONN))) {
dht_lookup_everywhere_done(frame, this);
} else {
/*When dht_lookup_everywhere is performed, one cached
@@ -2012,7 +2124,7 @@ dht_lookup_unlink_of_false_linkto_cbk(call_frame_t *frame, void *cookie,
return 0;
}
-int
+static int
dht_lookup_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
xlator_t *this, int op_ret, int op_errno,
struct iatt *preparent,
@@ -2029,52 +2141,41 @@ dht_lookup_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
local = frame->local;
- if (local && local->loc.path)
- path = local->loc.path;
+ if (local) {
+ FRAME_SU_UNDO(frame, dht_local_t);
+ if (local->loc.path)
+ path = local->loc.path;
+ }
gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
"Returned with op_ret %d and "
"op_errno %d for %s",
op_ret, op_errno, ((path == NULL) ? "null" : path));
- FRAME_SU_UNDO(frame, dht_local_t);
DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL);
return 0;
}
-int
+static int
dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_t *dict)
{
int ret = 0;
- xlator_t *this = NULL;
- char *linktoskip_key = NULL;
- this = THIS;
- GF_VALIDATE_OR_GOTO("dht", this, err);
-
- if (dht_is_tier_xlator(this))
- linktoskip_key = TIER_SKIP_NON_LINKTO_UNLINK;
- else
- linktoskip_key = DHT_SKIP_NON_LINKTO_UNLINK;
-
- ret = dict_set_int32(dict, linktoskip_key, 1);
+ ret = dict_set_int32_sizen(dict, DHT_SKIP_NON_LINKTO_UNLINK, 1);
if (ret)
- goto err;
+ return -1;
- ret = dict_set_int32(dict, DHT_SKIP_OPEN_FD_UNLINK, 1);
+ ret = dict_set_int32_sizen(dict, DHT_SKIP_OPEN_FD_UNLINK, 1);
if (ret)
- goto err;
+ return -1;
return 0;
-
-err:
- return -1;
}
-int32_t
+static int32_t
dht_linkfile_create_lookup_cbk(call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
inode_t *inode, struct iatt *buf, dict_t *xdata,
@@ -2122,7 +2223,7 @@ dht_linkfile_create_lookup_cbk(call_frame_t *frame, void *cookie,
"Creating linkto file on %s(hash) to "
"%s on %s (gfid = %s)",
local->hashed_subvol->name, local->loc.path,
- local->cached_subvol->name, gfid);
+ local->cached_subvol->name, gfid_str);
ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk,
this, local->cached_subvol,
@@ -2148,7 +2249,7 @@ no_linkto:
return 0;
}
-int32_t
+static int32_t
dht_call_lookup_linkfile_create(call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret,
int32_t op_errno, dict_t *xdata)
@@ -2234,7 +2335,7 @@ err:
* dht_lookup_everywhere_done takes decision based on any of the above case
*/
-int
+static int
dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this)
{
int ret = 0;
@@ -2260,6 +2361,16 @@ dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this)
DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
return 0;
}
+ if (local->op_ret && local->gfid_missing) {
+ if (gf_uuid_is_null(local->gfid_req)) {
+ DHT_STACK_UNWIND(lookup, frame, -1, ENODATA, NULL, NULL, NULL,
+ NULL);
+ return 0;
+ }
+ /* A hack */
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
if (local->dir_count) {
dht_lookup_directory(frame, this, &local->loc);
@@ -2538,7 +2649,7 @@ unwind_hashed_and_cached:
return 0;
}
-int
+static int
dht_lookup_everywhere_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, inode_t *inode,
struct iatt *buf, dict_t *xattr,
@@ -2579,6 +2690,8 @@ dht_lookup_everywhere_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1) {
if (op_errno != ENOENT)
local->op_errno = op_errno;
+ if (op_errno == ENODATA)
+ local->gfid_missing = _gf_true;
goto unlock;
}
@@ -2872,113 +2985,54 @@ out:
return 0;
}
-int
-dht_lookup_directory(call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- int call_cnt = 0;
- int i = 0;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int ret = 0;
-
- GF_VALIDATE_OR_GOTO("dht", frame, out);
- GF_VALIDATE_OR_GOTO("dht", this, unwind);
- GF_VALIDATE_OR_GOTO("dht", frame->local, unwind);
- GF_VALIDATE_OR_GOTO("dht", this->private, unwind);
- GF_VALIDATE_OR_GOTO("dht", loc, unwind);
-
- conf = this->private;
- local = frame->local;
-
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
-
- local->layout = dht_layout_new(this, conf->subvolume_cnt);
- if (!local->layout) {
- goto unwind;
- }
-
- if (local->xattr != NULL) {
- dict_unref(local->xattr);
- local->xattr = NULL;
- }
-
- if (!gf_uuid_is_null(local->gfid)) {
- ret = dict_set_gfuuid(local->xattr_req, "gfid-req", local->gfid, true);
- if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value:"
- " key = gfid-req",
- local->loc.path);
- }
-
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND_COOKIE(
- frame, dht_lookup_dir_cbk, conf->subvolumes[i], conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup, &local->loc, local->xattr_req);
- }
- return 0;
-unwind:
- DHT_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
-out:
- return 0;
-}
-
/* Code to get hashed subvol based on inode and loc
First it check if loc->parent and loc->path exist then it get
hashed subvol based on loc.
*/
-xlator_t *
-dht_inode_get_hashed_subvol(inode_t *inode, xlator_t *this, loc_t *loc)
+static gf_boolean_t
+dht_should_lookup_everywhere(xlator_t *this, dht_conf_t *conf, loc_t *loc)
{
- char *path = NULL;
- loc_t populate_loc = {
- 0,
- };
- char *name = NULL;
- xlator_t *hash_subvol = NULL;
-
- if (!inode)
- return hash_subvol;
+ dht_layout_t *parent_layout = NULL;
+ int ret = 0;
+ gf_boolean_t lookup_everywhere = _gf_true;
+
+ /* lookup-optimize supersedes lookup-unhashed settings.
+ * If it is set, do not process search_unhashed
+ * If lookup-optimize if enabled, lookup everywhere if:
+ * - this is the rebalance daemon.
+ * - loc->parent is unavailable.
+ * - parent_layout is unavailable
+ * - parent_layout->commit_hash != conf->vol_commit_hash
+ */
- if (loc && loc->parent && loc->path) {
- if (!loc->name) {
- name = strrchr(loc->path, '/');
- if (name) {
- loc->name = name + 1;
- } else {
- goto out;
+ if (conf->lookup_optimize) {
+ if (!conf->defrag && loc->parent) {
+ ret = dht_inode_ctx_layout_get(loc->parent, this, &parent_layout);
+ if (!ret && parent_layout &&
+ (parent_layout->commit_hash == conf->vol_commit_hash)) {
+ lookup_everywhere = _gf_false;
}
}
- hash_subvol = dht_subvol_get_hashed(this, loc);
goto out;
- }
-
- if (!gf_uuid_is_null(inode->gfid)) {
- populate_loc.inode = inode_ref(inode);
- populate_loc.parent = inode_parent(populate_loc.inode, NULL, NULL);
- inode_path(populate_loc.inode, NULL, &path);
-
- if (!path)
- goto out;
-
- populate_loc.path = path;
- if (!populate_loc.name && populate_loc.path) {
- name = strrchr(populate_loc.path, '/');
- if (name) {
- populate_loc.name = name + 1;
-
+ } else {
+ if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
+ if (loc->parent) {
+ ret = dht_inode_ctx_layout_get(loc->parent, this,
+ &parent_layout);
+ if (ret || !parent_layout ||
+ (!parent_layout->search_unhashed)) {
+ lookup_everywhere = _gf_false;
+ }
} else {
- goto out;
+ lookup_everywhere = _gf_false;
}
+
+ goto out;
}
- hash_subvol = dht_subvol_get_hashed(this, &populate_loc);
}
out:
- if (populate_loc.inode)
- loc_wipe(&populate_loc);
- return hash_subvol;
+ return lookup_everywhere;
}
int
@@ -2994,7 +3048,6 @@ dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
loc_t *loc = NULL;
xlator_t *prev = NULL;
int ret = 0;
- dht_layout_t *parent_layout = NULL;
uint32_t vol_commit_hash = 0;
GF_VALIDATE_OR_GOTO("dht", frame, err);
@@ -3009,104 +3062,82 @@ dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
local = frame->local;
loc = &local->loc;
- /* This is required for handling stale linkfile deletion,
- * or any more call which happens from this 'loc'.
- */
- if (!op_ret && gf_uuid_is_null(local->gfid))
- memcpy(local->gfid, stbuf->ia_gfid, 16);
-
gf_msg_debug(this->name, op_errno,
- "fresh_lookup returned for %s with op_ret %d", loc->path,
- op_ret);
+ "%s: fresh_lookup on %s returned with op_ret %d", loc->path,
+ prev->name, op_ret);
- if (!conf->vch_forced) {
- ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
- &vol_commit_hash);
- if (ret == 0) {
- conf->vol_commit_hash = vol_commit_hash;
- }
- }
-
- if (ENTRY_MISSING(op_ret, op_errno)) {
- if (1 == conf->subvolume_cnt) {
- /* No need to lookup again */
- goto out;
- }
+ if (op_ret == -1) {
+ if (ENTRY_MISSING(op_ret, op_errno)) {
+ if (1 == conf->subvolume_cnt) {
+ /* No need to lookup again */
+ goto out;
+ }
- gf_msg_debug(this->name, 0, "Entry %s missing on subvol %s", loc->path,
- prev->name);
+ gf_msg_debug(this->name, 0, "Entry %s missing on subvol %s",
+ loc->path, prev->name);
- /* lookup-optimize supersedes lookup-unhashed settings,
- * - so if it is set, do not process search_unhashed
- * - except, in the case of rebalance daemon, we want to
- * force the lookup_everywhere behavior */
- if (!conf->defrag && conf->lookup_optimize && loc->parent) {
- ret = dht_inode_ctx_layout_get(loc->parent, this, &parent_layout);
- if (ret || !parent_layout ||
- (parent_layout->commit_hash != conf->vol_commit_hash)) {
- gf_msg_debug(this->name, 0,
- "hashes don't match (ret - %d,"
- " parent_layout - %p, parent_hash - %x,"
- " vol_hash - %x), do global lookup",
- ret, parent_layout,
- (parent_layout ? parent_layout->commit_hash : -1),
- conf->vol_commit_hash);
+ if (dht_should_lookup_everywhere(this, conf, loc)) {
local->op_errno = ENOENT;
dht_lookup_everywhere(frame, this, loc);
return 0;
}
+
} else {
- if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_ON) {
- local->op_errno = ENOENT;
- dht_lookup_everywhere(frame, this, loc);
+ /* posix returns ENODATA if the gfid is not set but the client and
+ * server protocol layers do not send the stbuf. We need to
+ * heal this so check if this is a directory on the other subvols.
+ */
+ if ((op_errno == ENOTCONN) || (op_errno == ENODATA)) {
+ dht_lookup_directory(frame, this, &local->loc);
return 0;
}
-
- if ((conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) &&
- (loc->parent)) {
- ret = dht_inode_ctx_layout_get(loc->parent, this,
- &parent_layout);
- if (ret || !parent_layout)
- goto out;
- if (parent_layout->search_unhashed) {
- local->op_errno = ENOENT;
- dht_lookup_everywhere(frame, this, loc);
- return 0;
- }
- }
}
+ gf_msg_debug(this->name, op_errno, "%s: Lookup on subvolume %s failed",
+ loc->path, prev->name);
+ goto out;
}
- if (op_ret == 0) {
- is_dir = check_is_dir(inode, stbuf, xattr);
- if (is_dir) {
- local->inode = inode_ref(inode);
- local->xattr = dict_ref(xattr);
+ /* Lookup succeeded - op_ret = 0 */
+
+ /* This is required for handling stale linkfile deletion,
+ * or any more call which happens from this 'loc'.
+ */
+ if (gf_uuid_is_null(local->gfid)) {
+ /*This is set from the first successful response*/
+ memcpy(local->gfid, stbuf->ia_gfid, 16);
+ }
+
+ if (!conf->vch_forced) {
+ /* Update the commit hash in conf if it is found */
+ ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
}
}
- if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) {
+ is_dir = check_is_dir(inode, stbuf, xattr);
+ if (is_dir) {
+ /* A directory is present on all subvols, send the lookup to
+ * all subvols now */
+ local->inode = inode_ref(inode);
+ local->xattr = dict_ref(xattr);
dht_lookup_directory(frame, this, &local->loc);
return 0;
}
- if (op_ret == -1) {
- gf_msg_debug(this->name, op_errno,
- "Lookup of %s for subvolume"
- " %s failed",
- loc->path, prev->name);
- goto out;
- }
-
is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
if (!is_linkfile) {
- /* non-directory and not a linkfile */
+ /* non-directory and not a linkto file. This is a data file
+ * Update the layout to point to the cached subvol
+ */
ret = dht_layout_preset(this, prev, inode);
if (ret < 0) {
gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
- "could not set pre-set layout for subvolume %s", prev->name);
+ "%s: could not set pre-set layout for subvolume %s",
+ loc->path, prev->name);
op_ret = -1;
op_errno = EINVAL;
goto out;
@@ -3114,22 +3145,19 @@ dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
goto out;
}
+ /* This is a linkto file. Get the value of the target subvol from the
+ * linkto xattr and lookup there to see if the file exists
+ */
subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
if (!subvol) {
gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
- "linkfile not having link "
- "subvol for %s",
- loc->path);
-
- gf_msg_debug(this->name, 0,
- "linkfile not having link subvolume. path=%s", loc->path);
+ "%s: No link subvol for linkto", loc->path);
dht_lookup_everywhere(frame, this, loc);
return 0;
}
- gf_msg_debug(this->name, 0,
- "Calling lookup on linkto target %s for path %s", subvol->name,
- loc->path);
+ gf_msg_debug(this->name, 0, "%s: Calling lookup on linkto target %s",
+ loc->path, subvol->name);
STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
subvol->fops->lookup, &local->loc, local->xattr_req);
@@ -3156,11 +3184,11 @@ err:
return 0;
}
-/* For directories, check if acl xattrs have been requested (by the acl xlator),
- * if not, request for them. These xattrs are needed for dht dir self-heal to
- * perform proper self-healing of dirs
+/* For directories, check if acl xattrs have been requested (by the acl
+ * xlator), if not, request for them. These xattrs are needed for dht dir
+ * self-heal to perform proper self-healing of dirs
*/
-void
+static void
dht_check_and_set_acl_xattr_req(xlator_t *this, dict_t *xattr_req)
{
int ret = 0;
@@ -3191,7 +3219,7 @@ dht_check_and_set_acl_xattr_req(xlator_t *this, dict_t *xattr_req)
* the mds information : trusted.glusterfs.dht.mds
* the acl info: See above
*/
-int
+static int
dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
{
int ret = -EINVAL;
@@ -3232,50 +3260,109 @@ err:
return ret;
}
-int
-dht_set_file_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
+/* If the hashed subvol is present, send the lookup to only that subvol first.
+ * If no hashed subvol, send a lookup to all subvols and proceed based on the
+ * responses.
+ */
+static int
+dht_do_fresh_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
- int ret = -EINVAL;
+ int ret = -1;
dht_conf_t *conf = NULL;
+ xlator_t *hashed_subvol = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int call_cnt = 0;
+ int i = 0;
conf = this->private;
if (!conf) {
+ op_errno = EINVAL;
goto err;
}
- if (!xattr_req) {
+ local = frame->local;
+ if (!local) {
+ op_errno = EINVAL;
goto err;
}
- /* Used to check whether this is a linkto file.
- */
- ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
- if (ret < 0) {
- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s for "
- "path %s",
- conf->link_xattr_name, loc->path);
+ /* Since we don't know whether this is a file or a directory,
+ * request all xattrs*/
+ ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
goto err;
}
- /* This is used to make sure we don't unlink linkto files
- * which are the target of an ongoing file migration.
- */
- ret = dict_set_uint32(xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4);
+ ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
if (ret) {
- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s for "
- "path %s",
- GLUSTERFS_OPEN_FD_COUNT, loc->path);
+ op_errno = -ret;
goto err;
}
- ret = 0;
+ /* Fuse sets a random value in gfid-req. If the gfid is missing
+ * on one or more subvols, posix will set the gfid to this value,
+ * causing GFID mismatches for directories. Remove the value fuse
+ * has sent before sending the lookup.
+ */
+ ret = dict_get_gfuuid(local->xattr_req, "gfid-req", &local->gfid_req);
+ if (ret) {
+ gf_msg_debug(this->name, 0, "%s: No gfid-req available", loc->path);
+ } else {
+ dict_del(local->xattr_req, "gfid-req");
+ }
+ /* This should have been set in dht_lookup */
+ hashed_subvol = local->hashed_subvol;
+
+ if (!hashed_subvol) {
+ gf_msg_debug(this->name, 0,
+ "%s: no subvolume in layout for path, "
+ "checking on all the subvols to see if "
+ "it is a directory",
+ loc->path);
+
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ /* Allocate a layout. This will be populated and saved in
+ * the dht inode_ctx on successful lookup
+ */
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ gf_msg_debug(this->name, 0,
+ "%s: Found null hashed subvol. Calling lookup"
+ " on all nodes.",
+ loc->path);
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
+ }
+ return 0;
+ }
+
+ /* if the hashed_subvol is non-null, send the lookup there first so
+ * as to see whether we have a file or a directory */
+ gf_msg_debug(this->name, 0, "%s: Calling fresh lookup on %s", loc->path,
+ hashed_subvol->name);
+
+ STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->lookup, loc, local->xattr_req);
+ return 0;
err:
- return ret;
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
}
-int
+static int
dht_do_revalidate(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
xlator_t *subvol = NULL;
@@ -3350,6 +3437,11 @@ dht_do_revalidate(call_frame_t *frame, xlator_t *this, loc_t *loc)
}
local->mds_subvol = mds_subvol;
local->call_cnt = conf->subvolume_cnt;
+
+ /* local->call_cnt will change as responses are processed. Always use a
+ * local copy to loop through the STACK_WIND calls
+ */
+
call_cnt = local->call_cnt;
for (i = 0; i < call_cnt; i++) {
@@ -3383,91 +3475,11 @@ err:
return 0;
}
-int
-dht_do_fresh_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- int ret = -1;
- dht_conf_t *conf = NULL;
- xlator_t *hashed_subvol = NULL;
- dht_local_t *local = NULL;
- int op_errno = -1;
- int call_cnt = 0;
- int i = 0;
-
- conf = this->private;
- if (!conf) {
- op_errno = EINVAL;
- goto err;
- }
-
- local = frame->local;
- if (!local) {
- op_errno = EINVAL;
- goto err;
- }
-
- /* Since we don't know whether this is a file or a directory,
- * request all xattrs*/
- ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
- if (ret) {
- op_errno = -ret;
- goto err;
- }
-
- ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
- if (ret) {
- op_errno = -ret;
- goto err;
- }
-
- /* This should have been set in dht_lookup */
- hashed_subvol = local->hashed_subvol;
-
- if (!hashed_subvol) {
- gf_msg_debug(this->name, 0,
- "%s: no subvolume in layout for path, "
- "checking on all the subvols to see if "
- "it is a directory",
- loc->path);
-
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
-
- local->layout = dht_layout_new(this, conf->subvolume_cnt);
- if (!local->layout) {
- op_errno = ENOMEM;
- goto err;
- }
-
- gf_msg_debug(this->name, 0,
- "%s: Found null hashed subvol. Calling lookup"
- " on all nodes.",
- loc->path);
-
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i],
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup, &local->loc,
- local->xattr_req);
- }
- return 0;
- }
-
- /* if we have the hashed_subvol, send the lookup there first so
- * as to see whether we have a file or a directory */
- gf_msg_debug(this->name, 0,
- "Calling fresh lookup for %s on"
- " %s",
- loc->path, hashed_subvol->name);
-
- STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol, hashed_subvol,
- hashed_subvol->fops->lookup, loc, local->xattr_req);
- return 0;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
-}
+/* Depending on the input, decide if this is a:
+ * fresh-lookup: loc->name is provided but no dht inode ctx
+ * revalidation: loc->name is provided, dht inode ctx is present
+ * discover: gfid based nameless lookup.
+ */
int
dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
@@ -3521,6 +3533,10 @@ dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
/* Nameless lookup */
+ /* This is usually sent by NFS. Lookups are done based on the gfid and
+ * no name information is available. Without the name, dht cannot calculate
+ * the hash and has to send a lookup to all subvols.
+ */
if (gf_uuid_is_null(loc->pargfid) && !gf_uuid_is_null(loc->gfid) &&
!__is_root_gfid(loc->inode->gfid)) {
local->cached_subvol = NULL;
@@ -3529,6 +3545,9 @@ dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
}
if (loc_is_root(loc)) {
+ /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash)
+ * set on the brick root.
+ */
ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name,
sizeof(uint32_t));
}
@@ -3537,12 +3556,14 @@ dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
hashed_subvol = dht_subvol_get_hashed(this, loc);
local->hashed_subvol = hashed_subvol;
- /* The entry has been looked up before and has an inode_ctx set
- */
if (is_revalidate(loc)) {
+ /* The entry has been looked up before and has a dht inode_ctx
+ */
dht_do_revalidate(frame, this, loc);
return 0;
} else {
+ /* Entry has not been looked up before
+ */
dht_do_fresh_lookup(frame, this, loc);
return 0;
}
@@ -3554,7 +3575,7 @@ err:
return 0;
}
-int
+static int
dht_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
@@ -3570,18 +3591,16 @@ dht_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if ((op_ret == -1) &&
!((op_errno == ENOENT) || (op_errno == ENOTCONN))) {
local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
gf_msg_debug(this->name, op_errno,
- "Unlink link: subvolume %s"
- " returned -1",
- prev->name);
- goto unlock;
+ "Unlink link: subvolume %s returned -1", prev->name);
+ goto post_unlock;
}
local->op_ret = 0;
}
-unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
dht_set_fixed_dir_stat(&local->preparent);
dht_set_fixed_dir_stat(&local->postparent);
DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
@@ -3590,7 +3609,7 @@ unlock:
return 0;
}
-int
+static int
dht_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, struct iatt *preparent, struct iatt *postparent,
dict_t *xdata)
@@ -3611,9 +3630,10 @@ dht_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
} else {
local->op_ret = 0;
}
+ UNLOCK(&frame->lock);
gf_msg_debug(this->name, op_errno,
"Unlink: subvolume %s returned -1", prev->name);
- goto unlock;
+ goto post_unlock;
}
local->op_ret = 0;
@@ -3628,9 +3648,8 @@ dht_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
&local->postparent, 1);
}
}
-unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
if (!local->op_ret) {
hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
if (hashed_subvol && hashed_subvol != local->cached_subvol) {
@@ -3681,7 +3700,7 @@ dht_fix_layout_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-int
+static int
dht_err_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, dict_t *xdata)
{
@@ -3696,22 +3715,24 @@ dht_err_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
{
if (op_ret == -1) {
local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
prev->name);
- goto unlock;
+ goto post_unlock;
}
local->op_ret = 0;
}
-unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt)) {
if ((local->fop == GF_FOP_SETXATTR) ||
(local->fop == GF_FOP_FSETXATTR)) {
DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
NULL);
+ /* 'local' itself may not be valid after this */
+ goto out;
}
if ((local->fop == GF_FOP_REMOVEXATTR) ||
(local->fop == GF_FOP_FREMOVEXATTR)) {
@@ -3720,6 +3741,7 @@ unlock:
}
}
+out:
return 0;
}
@@ -3750,7 +3772,7 @@ dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size)
return ret;
}
-int
+static int
dht_common_mds_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
@@ -3766,27 +3788,34 @@ dht_common_mds_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (local->fop == GF_FOP_SETXATTR) {
DHT_STACK_UNWIND(setxattr, frame, 0, op_errno, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto out;
}
if (local->fop == GF_FOP_FSETXATTR) {
DHT_STACK_UNWIND(fsetxattr, frame, 0, op_errno, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto out;
}
if (local->fop == GF_FOP_REMOVEXATTR) {
DHT_STACK_UNWIND(removexattr, frame, 0, op_errno, NULL);
+ /* 'local' itself may not be valid after this */
+ goto out;
}
if (local->fop == GF_FOP_FREMOVEXATTR) {
DHT_STACK_UNWIND(fremovexattr, frame, 0, op_errno, NULL);
}
+out:
return 0;
}
/* Code to wind a xattrop call to add 1 on current mds internal xattr
value
*/
-int
+static int
dht_setxattr_non_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
@@ -3807,11 +3836,14 @@ dht_setxattr_non_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret && !local->op_ret) {
local->op_ret = op_ret;
local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
prev->this->name);
+ goto post_unlock;
}
}
UNLOCK(&frame->lock);
+post_unlock:
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt)) {
@@ -3843,45 +3875,60 @@ dht_setxattr_non_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
} else {
if (local->fop == GF_FOP_SETXATTR) {
DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_FSETXATTR) {
DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_REMOVEXATTR) {
DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_FREMOVEXATTR) {
DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
}
}
out:
- if (xattrop)
- dict_unref(xattrop);
if (ret) {
if (local->fop == GF_FOP_SETXATTR) {
DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_FSETXATTR) {
DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_REMOVEXATTR) {
DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_FREMOVEXATTR) {
DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL);
}
}
+just_return:
+ if (xattrop)
+ dict_unref(xattrop);
return 0;
}
-int
+static int
dht_setxattr_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
@@ -3941,16 +3988,22 @@ out:
if (local->fop == GF_FOP_SETXATTR) {
DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_FSETXATTR) {
DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_REMOVEXATTR) {
DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_FREMOVEXATTR) {
@@ -3958,10 +4011,11 @@ out:
NULL);
}
+just_return:
return 0;
}
-int
+static int
dht_xattrop_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
{
@@ -4008,16 +4062,22 @@ out:
if (local->fop == GF_FOP_SETXATTR) {
DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_FSETXATTR) {
DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_REMOVEXATTR) {
DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
if (local->fop == GF_FOP_FREMOVEXATTR) {
@@ -4025,6 +4085,7 @@ out:
NULL);
}
+just_return:
return 0;
}
@@ -4067,7 +4128,7 @@ dht_fill_pathinfo_xattr(xlator_t *this, dht_local_t *local, char *xattr_buf,
}
}
-int
+static int
dht_vgetxattr_alloc_and_fill(dht_local_t *local, dict_t *xattr, xlator_t *this,
int op_errno)
{
@@ -4116,7 +4177,7 @@ out:
return ret;
}
-int
+static int
dht_vgetxattr_fill_and_set(dht_local_t *local, dict_t **dict, xlator_t *this,
gf_boolean_t flag)
{
@@ -4172,7 +4233,7 @@ out:
return ret;
}
-int
+static int
dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xattr,
dict_t *xdata)
@@ -4211,11 +4272,15 @@ dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
{
this_call_cnt = --local->call_cnt;
if (op_ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED,
- "getxattr err for dir");
local->op_ret = -1;
local->op_errno = op_errno;
- goto unlock;
+ UNLOCK(&frame->lock);
+ if (op_errno == ENODATA)
+ gf_msg_debug(this->name, 0, "failed to get node-uuid");
+ else
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "failed to get node-uuid");
+ goto post_unlock;
}
ret = dict_get_str(xattr, local->xsel, &uuid_list);
@@ -4236,18 +4301,19 @@ dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
index = conf->local_subvols_cnt;
uuid_list_copy = gf_strdup(uuid_list);
+ if (!uuid_list_copy)
+ goto unlock;
for (uuid_str = strtok_r(uuid_list, " ", &saveptr); uuid_str;
uuid_str = next_uuid_str) {
next_uuid_str = strtok_r(NULL, " ", &saveptr);
if (gf_uuid_parse(uuid_str, node_uuid)) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UUID_PARSE_ERROR,
- "Failed to parse uuid"
- " for %s",
- prev->name);
local->op_ret = -1;
local->op_errno = EINVAL;
- goto unlock;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UUID_PARSE_ERROR,
+ "Failed to parse uuid for %s", prev->name);
+ goto post_unlock;
}
count++;
@@ -4304,7 +4370,7 @@ dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local->op_ret = 0;
unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
if (!is_last_call(this_call_cnt))
goto out;
@@ -4326,7 +4392,7 @@ out:
return 0;
}
-int
+static int
dht_vgetxattr_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
@@ -4345,23 +4411,28 @@ dht_vgetxattr_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
this_call_cnt = --local->call_cnt;
if (op_ret < 0) {
if (op_errno != ENOTCONN) {
- gf_msg(this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_GET_XATTR_FAILED, "getxattr err for dir");
local->op_ret = -1;
local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "getxattr err for dir");
+ goto post_unlock;
}
goto unlock;
}
ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno);
- if (ret)
+ if (ret) {
+ UNLOCK(&frame->lock);
gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_DICT_SET_FAILED,
"alloc or fill failure");
+ goto post_unlock;
+ }
}
unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
if (!is_last_call(this_call_cnt))
goto out;
@@ -4387,7 +4458,7 @@ out:
return 0;
}
-int
+static int
dht_vgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, dict_t *xattr, dict_t *xdata)
{
@@ -4433,7 +4504,7 @@ cleanup:
return 0;
}
-int
+static int
dht_linkinfo_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xattr,
dict_t *xdata)
@@ -4455,16 +4526,16 @@ dht_linkinfo_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-int
+static int
dht_mds_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
dht_local_t *local = NULL;
dht_conf_t *conf = NULL;
- VALIDATE_OR_GOTO(frame, out);
- VALIDATE_OR_GOTO(frame->local, out);
- VALIDATE_OR_GOTO(this->private, out);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(frame->local, err);
+ VALIDATE_OR_GOTO(this->private, err);
conf = this->private;
local = frame->local;
@@ -4473,9 +4544,7 @@ dht_mds_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local->op_ret = op_ret;
goto out;
}
- if (dict_get(xattr, conf->xattr_name)) {
- dict_del(xattr, conf->xattr_name);
- }
+ dict_del(xattr, conf->xattr_name);
local->op_ret = 0;
if (!local->xattr) {
@@ -4486,6 +4555,9 @@ out:
DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr,
xdata);
return 0;
+err:
+ DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL);
+ return 0;
}
int
@@ -4495,14 +4567,22 @@ dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int this_call_cnt = 0;
dht_local_t *local = NULL;
dht_conf_t *conf = NULL;
+ int ret = 0;
- VALIDATE_OR_GOTO(frame, out);
- VALIDATE_OR_GOTO(frame->local, out);
- VALIDATE_OR_GOTO(this->private, out);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(frame->local, err);
+ VALIDATE_OR_GOTO(this->private, err);
conf = this->private;
local = frame->local;
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto err;
+ return 0;
+ }
+
LOCK(&frame->lock);
{
if (!xattr || (op_ret == -1)) {
@@ -4510,27 +4590,10 @@ dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
goto unlock;
}
- if (dict_get(xattr, conf->xattr_name)) {
- dict_del(xattr, conf->xattr_name);
- }
-
- if (dict_get(xattr, conf->mds_xattr_key)) {
- dict_del(xattr, conf->mds_xattr_key);
- }
-
- /* filter out following two xattrs that need not
- * be visible on the mount point for geo-rep -
- * trusted.tier.fix.layout.complete and
- * trusted.tier.tier-dht.commithash
- */
-
- if (dict_get(xattr, conf->commithash_xattr_name)) {
- dict_del(xattr, conf->commithash_xattr_name);
- }
+ dict_del(xattr, conf->xattr_name);
+ dict_del(xattr, conf->mds_xattr_key);
- if (frame->root->pid >= 0 && dht_is_tier_xlator(this)) {
- dict_del(xattr, GF_XATTR_TIER_LAYOUT_FIXED_KEY);
- }
+ dict_del(xattr, conf->commithash_xattr_name);
if (frame->root->pid >= 0) {
GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
@@ -4544,12 +4607,18 @@ dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
} else {
dht_aggregate_xattr(local->xattr, xattr);
}
+
+ if (!local->xdata) {
+ local->xdata = dict_ref(xdata);
+ } else if ((local->inode && IA_ISDIR(local->inode->ia_type)) ||
+ (local->fd && IA_ISDIR(local->fd->inode->ia_type))) {
+ dht_aggregate_xattr(local->xdata, xdata);
+ }
}
unlock:
UNLOCK(&frame->lock);
this_call_cnt = dht_frame_return(frame);
-out:
if (is_last_call(this_call_cnt)) {
/* If we have a valid xattr received from any one of the
* subvolume, let's return it */
@@ -4558,12 +4627,15 @@ out:
}
DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr,
- NULL);
+ local->xdata);
}
return 0;
+err:
+ DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL);
+ return 0;
}
-int32_t
+static int32_t
dht_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict,
dict_t *xdata)
{
@@ -4571,7 +4643,7 @@ dht_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict,
return 0;
}
-int
+static int
dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie,
xlator_t *this, int op_ret, int op_errno,
dict_t *xattr, dict_t *xdata)
@@ -4583,7 +4655,7 @@ dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie,
LOCK(&frame->lock);
{
- if (local->op_errno == ENODATA || local->op_errno == EOPNOTSUPP) {
+ if (local->op_errno == EOPNOTSUPP) {
/* Nothing to do here, we have already found
* a subvol which does not have the get_real_filename
* optimization. If condition is for simple logic.
@@ -4592,7 +4664,7 @@ dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie,
}
if (op_ret == -1) {
- if (op_errno == ENODATA || op_errno == EOPNOTSUPP) {
+ if (op_errno == EOPNOTSUPP) {
/* This subvol does not have the optimization.
* Better let the user know we don't support it.
* Remove previous results if any.
@@ -4610,16 +4682,17 @@ dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie,
local->op_ret = op_ret;
local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
gf_msg(this->name, GF_LOG_WARNING, op_errno,
DHT_MSG_UPGRADE_BRICKS,
"At least "
"one of the bricks does not support "
"this operation. Please upgrade all "
"bricks.");
- goto unlock;
+ goto post_unlock;
}
- if (op_errno == ENOENT) {
+ if (op_errno == ENOATTR) {
/* Do nothing, our defaults are set to this.
*/
goto unlock;
@@ -4631,9 +4704,10 @@ dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie,
* down subvol and return a good result(if any)
* from other subvol.
*/
+ UNLOCK(&frame->lock);
gf_msg(this->name, GF_LOG_WARNING, op_errno,
DHT_MSG_GET_XATTR_FAILED, "Failed to get real filename.");
- goto unlock;
+ goto post_unlock;
}
/* This subvol has the required file.
@@ -4654,13 +4728,13 @@ dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie,
local->op_ret = op_ret;
local->op_errno = 0;
- gf_msg_debug(this->name, 0,
- "Found a matching "
- "file.");
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, 0, "Found a matching file.");
+ goto post_unlock;
}
unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt)) {
DHT_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
@@ -4670,7 +4744,7 @@ unlock:
return 0;
}
-int
+static int
dht_getxattr_get_real_filename(call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *key, dict_t *xdata)
{
@@ -4686,7 +4760,7 @@ dht_getxattr_get_real_filename(call_frame_t *frame, xlator_t *this, loc_t *loc,
cnt = local->call_cnt = layout->cnt;
local->op_ret = -1;
- local->op_errno = ENOENT;
+ local->op_errno = ENOATTR;
for (i = 0; i < cnt; i++) {
subvol = layout->list[i].xlator;
@@ -4697,7 +4771,7 @@ dht_getxattr_get_real_filename(call_frame_t *frame, xlator_t *this, loc_t *loc,
return 0;
}
-int
+static int
dht_marker_populate_args(call_frame_t *frame, int type, int *gauge,
xlator_t **subvols)
{
@@ -4714,8 +4788,8 @@ dht_marker_populate_args(call_frame_t *frame, int type, int *gauge,
return layout->cnt;
}
-int
-dht_is_debug_xattr_key(char **array, char *key)
+static int
+dht_is_debug_xattr_key(const char **array, char *key)
{
int i = 0;
@@ -4729,7 +4803,7 @@ dht_is_debug_xattr_key(char **array, char *key)
/* Note we already have frame->local initialised here*/
-int
+static int
dht_handle_debug_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *key)
{
@@ -4741,10 +4815,6 @@ dht_handle_debug_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *name = NULL;
local = frame->local;
- if (!key) {
- op_errno = EINVAL;
- goto out;
- }
if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) == -1) {
goto out;
@@ -4798,6 +4868,60 @@ out:
return 0;
}
+/* Virtual Xattr which returns 1 if all subvols are up,
+ else returns 0. Geo-rep then uses this virtual xattr
+ after a fresh mount and starts the I/O.
+*/
+
+enum dht_vxattr_subvol {
+ DHT_VXATTR_SUBVOLS_UP = 1,
+ DHT_VXATTR_SUBVOLS_DOWN = 0,
+};
+
+int
+dht_vgetxattr_subvol_status(call_frame_t *frame, xlator_t *this,
+ const char *key)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENODATA;
+ int value = DHT_VXATTR_SUBVOLS_UP;
+ int i = 0;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ local = frame->local;
+
+ if (!key) {
+ op_errno = EINVAL;
+ goto out;
+ }
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ value = DHT_VXATTR_SUBVOLS_DOWN;
+ gf_msg_debug(this->name, 0, "subvol %s is down ",
+ conf->subvolumes[i]->name);
+ break;
+ }
+ }
+ ret = dict_set_int8(local->xattr, (char *)key, value);
+ if (ret < 0) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL);
+ return 0;
+}
+
int
dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
dict_t *xdata)
@@ -4855,6 +4979,11 @@ dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
goto err;
}
+ if (strncmp(key, DHT_SUBVOL_STATUS_KEY, SLEN(DHT_SUBVOL_STATUS_KEY)) == 0) {
+ dht_vgetxattr_subvol_status(frame, this, key);
+ return 0;
+ }
+
/* skip over code which is irrelevant if !DHT_IS_DIR(layout) */
if (!DHT_IS_DIR(layout))
goto no_dht_is_dir;
@@ -5087,8 +5216,7 @@ dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
}
}
- if (fd->inode)
- gf_uuid_unparse(fd->inode->gfid, gfid);
+ gf_uuid_unparse(fd->inode->gfid, gfid);
if ((fd->inode->ia_type == IA_IFDIR) && key &&
(strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) !=
@@ -5151,6 +5279,53 @@ err:
return 0;
}
+static int
+dht_setxattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ if (!frame || !frame->local)
+ goto err;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto err;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ if (local->fop == GF_FOP_SETXATTR) {
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->setxattr, &local->loc,
+ local->rebalance.xattr, local->rebalance.flags,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->fsetxattr, local->fd,
+ local->rebalance.xattr, local->rebalance.flags,
+ local->xattr_req);
+ }
+
+ return 0;
+
+err:
+ DHT_STACK_UNWIND(setxattr, frame, (local ? local->op_ret : -1), op_errno,
+ NULL);
+ return 0;
+}
+
int
dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
@@ -5167,8 +5342,8 @@ dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local->op_errno = op_errno;
- if ((local->fop == GF_FOP_FSETXATTR) && op_ret == -1 &&
- (op_errno == EBADF) && !(local->fd_checked)) {
+ if ((local->fop == GF_FOP_FSETXATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -5241,7 +5416,7 @@ dht_is_user_xattr(dict_t *this, char *key, data_t *value, void *data)
/* Common code to wind a (f)(set|remove)xattr call to set xattr on directory
*/
-int
+static int
dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
fd_t *fd, dict_t *xattr, int flags,
dict_t *xdata, int *op_errno)
@@ -5258,11 +5433,13 @@ dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
int call_cnt = 0;
dht_local_t *local = NULL;
char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char **xattrs_to_heal;
conf = this->private;
local = frame->local;
call_cnt = conf->subvolume_cnt;
local->flags = flags;
+ xattrs_to_heal = get_xattrs_to_heal();
if (!gf_uuid_is_null(local->gfid)) {
gf_uuid_unparse(local->gfid, gfid_local);
@@ -5345,9 +5522,8 @@ dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
} else {
gf_msg(this->name, GF_LOG_ERROR, 0,
DHT_MSG_HASHED_SUBVOL_GET_FAILED,
- "Failed to get mds subvol for path %s"
- "gfid is %s ",
- loc->path, gfid_local);
+ "%s: Failed to get mds subvol. (gfid is %s)", loc->path,
+ gfid_local);
}
(*op_errno) = ENOENT;
goto err;
@@ -5495,7 +5671,7 @@ err:
return 0;
}
-int
+static int
dht_checking_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xattr,
dict_t *xdata)
@@ -5534,54 +5710,7 @@ out:
return 0;
}
-int
-dht_setxattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
-{
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
-
- if (!frame || !frame->local)
- goto err;
-
- local = frame->local;
- op_errno = local->op_errno;
-
- if (we_are_not_migrating(ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
- local->rebalance.xdata);
- return 0;
- }
-
- if (subvol == NULL)
- goto err;
-
- local->call_cnt = 2; /* This is the second attempt */
-
- if (local->fop == GF_FOP_SETXATTR) {
- STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
- subvol->fops->setxattr, &local->loc,
- local->rebalance.xattr, local->rebalance.flags,
- local->xattr_req);
- } else {
- STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
- subvol->fops->fsetxattr, local->fd,
- local->rebalance.xattr, local->rebalance.flags,
- local->xattr_req);
- }
-
- return 0;
-
-err:
- DHT_STACK_UNWIND(setxattr, frame, (local ? local->op_ret : -1), op_errno,
- NULL);
- return 0;
-}
-
-int
+static int
dht_nuke_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
@@ -5590,7 +5719,7 @@ dht_nuke_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-int
+static int
dht_nuke_dir(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *tmp)
{
if (!IA_ISDIR(loc->inode->ia_type)) {
@@ -5743,22 +5872,7 @@ dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr,
if (local->rebalance.target_node) {
local->flags = forced_rebalance;
- /* Flag to suggest its a tiering migration
- * The reason for this dic key-value is that
- * promotions and demotions are multithreaded
- * so the original frame from gf_defrag_start()
- * is not carried. A new frame will be created when
- * we do syncop_setxattr(). This does not have the
- * frame->root->pid of the original frame. So we pass
- * this dic key-value when we do syncop_setxattr() to do
- * data migration and set the frame->root->pid to
- * GF_CLIENT_PID_TIER_DEFRAG in dht_setxattr() just before
- * calling dht_start_rebalance_task() */
- tmp = dict_get(xattr, TIERING_MIGRATION_KEY);
- if (tmp)
- frame->root->pid = GF_CLIENT_PID_TIER_DEFRAG;
- else
- frame->root->pid = GF_CLIENT_PID_DEFRAG;
+ frame->root->pid = GF_CLIENT_PID_DEFRAG;
ret = dht_start_rebalance_task(this, frame);
if (!ret)
@@ -5877,6 +5991,50 @@ err:
return 0;
}
+static int
+dht_removexattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ if (!frame || !frame->local)
+ goto err;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto err;
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->removexattr, &local->loc, local->key,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->fremovexattr, local->fd, local->key,
+ local->xattr_req);
+ }
+
+ return 0;
+
+err:
+ DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+ return 0;
+}
+
int
dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
@@ -5893,8 +6051,8 @@ dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local->op_errno = op_errno;
- if ((local->fop == GF_FOP_FREMOVEXATTR) && (op_ret == -1) &&
- (op_errno == EBADF) && !(local->fd_checked)) {
+ if ((local->fop == GF_FOP_FREMOVEXATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -5954,84 +6112,6 @@ out:
}
int
-dht_removexattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
-{
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
-
- if (!frame || !frame->local)
- goto err;
-
- local = frame->local;
- op_errno = local->op_errno;
-
- local->call_cnt = 2; /* This is the second attempt */
-
- if (we_are_not_migrating(ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
- local->rebalance.xdata);
- return 0;
- }
-
- if (subvol == NULL)
- goto err;
-
- if (local->fop == GF_FOP_REMOVEXATTR) {
- STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
- subvol->fops->removexattr, &local->loc, local->key,
- local->xattr_req);
- } else {
- STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
- subvol->fops->fremovexattr, local->fd, local->key,
- local->xattr_req);
- }
-
- return 0;
-
-err:
- DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
- return 0;
-}
-
-int
-dht_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- xlator_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK(&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
- prev->name);
- goto unlock;
- }
-
- local->op_ret = 0;
- }
-unlock:
- UNLOCK(&frame->lock);
-
- this_call_cnt = dht_frame_return(frame);
- if (is_last_call(this_call_cnt)) {
- DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
- NULL);
- }
-
- return 0;
-}
-
-int
dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *key, dict_t *xdata)
{
@@ -6208,16 +6288,16 @@ dht_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
{
if (op_ret == -1) {
local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
prev->name);
- goto unlock;
+ goto post_unlock;
}
local->op_ret = 0;
}
-unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt))
DHT_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, local->fd,
@@ -6229,7 +6309,7 @@ unlock:
/*
* dht_normalize_stats -
*/
-void
+static void
dht_normalize_stats(struct statvfs *buf, unsigned long bsize,
unsigned long frsize)
{
@@ -6248,7 +6328,7 @@ dht_normalize_stats(struct statvfs *buf, unsigned long bsize,
}
}
-int
+static int
dht_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, struct statvfs *statvfs, dict_t *xdata)
{
@@ -6358,9 +6438,7 @@ dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
int i = -1;
inode_t *inode = NULL;
inode_table_t *itable = NULL;
- uuid_t root_gfid = {
- 0,
- };
+ static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
loc_t newloc = {
0,
};
@@ -6386,7 +6464,6 @@ dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
}
loc = &local->loc2;
- root_gfid[15] = 1;
inode = inode_find(itable, root_gfid);
if (!inode) {
@@ -6495,15 +6572,16 @@ err:
}
/* dht_readdirp_cbk creates a new dentry and dentry->inode is not assigned.
- This functions assigns an inode if all of the following conditions are true:
+ This functions assigns an inode if all of the following conditions are
+ true:
- * DHT has only one child. In this case the entire layout is present on this
- single child and hence we can set complete layout in inode.
- * backend has complete layout and there are no anomalies in it and from this
- information layout can be constructed and set in inode.
+ * DHT has only one child. In this case the entire layout is present on
+ this single child and hence we can set complete layout in inode.
+ * backend has complete layout and there are no anomalies in it and from
+ this information layout can be constructed and set in inode.
*/
-void
+static void
dht_populate_inode_for_dentry(xlator_t *this, xlator_t *subvol,
gf_dirent_t *entry, gf_dirent_t *orig_entry)
{
@@ -6547,9 +6625,10 @@ out:
return;
}
-/* Posix returns op_errno = ENOENT to indicate that there are no more entries
+/* Posix returns op_errno = ENOENT to indicate that there are no more
+ * entries
*/
-int
+static int
dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
{
@@ -6576,7 +6655,9 @@ dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
prev = cookie;
local = frame->local;
- itable = local->fd ? local->fd->inode->table : NULL;
+ GF_VALIDATE_OR_GOTO(this->name, local->fd, unwind);
+
+ itable = local->fd->inode->table;
conf = this->private;
GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
@@ -6603,10 +6684,9 @@ dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
layout = local->layout;
- /* We have seen crashes in while running "rm -rf" on tier volumes
- when the layout was NULL on the hot tier. This will skip the
- entries on the subvol without a layout, hence preventing the crash
- but rmdir might fail with "directory not empty" errors*/
+ /* This will skip the entries on the subvol without a layout,
+ * hence preventing the crash but rmdir might fail with
+ * "directory not empty" errors*/
if (layout == NULL)
goto done;
@@ -6624,13 +6704,12 @@ dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
orig_entry->d_name, orig_entry->d_type);
if (IA_ISINVAL(orig_entry->d_stat.ia_type)) {
- /*stat failed somewhere- ignore this entry*/
- gf_msg_debug(this->name, EINVAL,
- "Invalid stat, ignoring entry "
- "%s gfid %s",
+ /*stat failed somewhere- display this entry but the data may
+ * be inaccurate.
+ */
+ gf_msg_debug(this->name, EINVAL, "Invalid stat for %s (gfid %s)",
orig_entry->d_name,
uuid_utoa(orig_entry->d_stat.ia_gfid));
- continue;
}
if (check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict,
@@ -6712,6 +6791,12 @@ dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
}
}
} else {
+ if (orig_entry->dict &&
+ dict_get(orig_entry->dict, conf->link_xattr_name)) {
+ /* Strip out the S and T flags set by rebalance*/
+ DHT_STRIP_PHASE1_FLAGS(&entry->d_stat);
+ }
+
if (orig_entry->inode) {
ret = dht_layout_preset(this, prev, orig_entry->inode);
if (ret)
@@ -6833,7 +6918,7 @@ unwind:
return 0;
}
-int
+static int
dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
{
@@ -6956,7 +7041,7 @@ unwind:
return 0;
}
-int
+static int
dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t yoff, int whichop, dict_t *dict)
{
@@ -7080,7 +7165,7 @@ dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
return 0;
}
-int
+static int
dht_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, dict_t *xdata)
{
@@ -7093,12 +7178,10 @@ dht_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
{
if (op_ret == -1)
local->op_errno = op_errno;
-
- if (op_ret == 0)
+ else if (op_ret == 0)
local->op_ret = 0;
}
UNLOCK(&frame->lock);
-
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt))
DHT_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno,
@@ -7212,7 +7295,7 @@ out:
return 0;
}
-int
+static int
dht_mknod_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, inode_t *inode,
struct iatt *stbuf, struct iatt *preparent,
@@ -7263,7 +7346,7 @@ err:
return 0;
}
-int
+static int
dht_mknod_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
xlator_t *subvol, loc_t *loc, dev_t rdev,
mode_t mode, mode_t umask, dict_t *params)
@@ -7309,7 +7392,7 @@ out:
return 0;
}
-int32_t
+static int32_t
dht_mknod_do(call_frame_t *frame)
{
dht_local_t *local = NULL;
@@ -7359,7 +7442,7 @@ err:
return 0;
}
-int32_t
+static int32_t
dht_mknod_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
@@ -7367,7 +7450,7 @@ dht_mknod_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-int32_t
+static int32_t
dht_mknod_finish(call_frame_t *frame, xlator_t *this, int op_ret,
int invoke_cbk)
{
@@ -7419,7 +7502,7 @@ done:
return 0;
}
-int32_t
+static int32_t
dht_mknod_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
@@ -7448,11 +7531,15 @@ dht_mknod_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
err:
- dht_mknod_finish(frame, this, -1, 0);
+ if (local)
+ dht_mknod_finish(frame, this, -1, 0);
+ else
+ DHT_STACK_UNWIND(mknod, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+ NULL);
return 0;
}
-int32_t
+static int32_t
dht_mknod_lock(call_frame_t *frame, xlator_t *subvol)
{
dht_local_t *local = NULL;
@@ -7497,7 +7584,7 @@ err:
return -1;
}
-int
+static int
dht_refresh_parent_layout_resume(call_frame_t *frame, xlator_t *this, int ret,
int invoke_cbk)
{
@@ -7527,7 +7614,7 @@ dht_refresh_parent_layout_resume(call_frame_t *frame, xlator_t *this, int ret,
return 0;
}
-int
+static int
dht_refresh_parent_layout_done(call_frame_t *frame)
{
dht_local_t *local = NULL;
@@ -7548,7 +7635,7 @@ resume:
return 0;
}
-int
+static int
dht_handle_parent_layout_change(xlator_t *this, call_stub_t *stub)
{
call_frame_t *refresh_frame = NULL, *frame = NULL;
@@ -7558,7 +7645,18 @@ dht_handle_parent_layout_change(xlator_t *this, call_stub_t *stub)
local = frame->local;
refresh_frame = copy_frame(frame);
+ if (!refresh_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "mem allocation failed for refresh_frame");
+ return -1;
+ }
+
refresh_local = dht_local_init(refresh_frame, NULL, NULL, stub->fop);
+ if (!refresh_local) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "mem allocation failed for refresh_local");
+ return -1;
+ }
refresh_local->loc.inode = inode_ref(local->loc.parent);
gf_uuid_copy(refresh_local->loc.gfid, local->loc.parent->gfid);
@@ -7572,7 +7670,7 @@ dht_handle_parent_layout_change(xlator_t *this, call_stub_t *stub)
return 0;
}
-int32_t
+static int32_t
dht_call_mkdir_stub(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
@@ -7595,7 +7693,7 @@ dht_call_mkdir_stub(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-int32_t
+static int32_t
dht_guard_parent_layout_and_namespace(xlator_t *subvol, call_stub_t *stub)
{
dht_local_t *local = NULL;
@@ -7906,7 +8004,58 @@ err:
return 0;
}
-int
+static int
+dht_remove_stale_linkto_cbk(int ret, call_frame_t *sync_frame, void *data)
+{
+ DHT_STACK_DESTROY(sync_frame);
+ return 0;
+}
+
+static int
+dht_remove_stale_linkto(void *data)
+{
+ call_frame_t *frame = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ dict_t *xdata_in = NULL;
+ int ret = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", data, out);
+
+ frame = data;
+ local = frame->local;
+ this = frame->this;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", local, out);
+ GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out);
+
+ xdata_in = dict_new();
+ if (!xdata_in)
+ goto out;
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(xdata_in);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+ "Failed to set keys for stale linkto"
+ "deletion on path %s",
+ local->loc.path);
+ goto out;
+ }
+
+ ret = syncop_unlink(local->link_subvol, &local->loc, xdata_in, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+ "Removal of linkto failed"
+ " on path %s at subvol %s",
+ local->loc.path, local->link_subvol->name);
+ }
+out:
+ if (xdata_in)
+ dict_unref(xdata_in);
+ return ret;
+}
+
+static int
dht_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, inode_t *inode, struct iatt *stbuf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
@@ -8022,7 +8171,7 @@ out:
return 0;
}
-int
+static int
dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -8078,7 +8227,7 @@ err:
return 0;
}
-int
+static int
dht_link_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
struct iatt *stbuf, struct iatt *preparent,
@@ -8181,6 +8330,11 @@ dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
xlator_t *prev = NULL;
int ret = -1;
dht_local_t *local = NULL;
+ gf_boolean_t parent_layout_changed = _gf_false;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
local = frame->local;
if (!local) {
@@ -8189,8 +8343,69 @@ dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
goto out;
}
- if (op_ret == -1)
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ parent_layout_changed = (xdata &&
+ dict_get(xdata, GF_PREOP_CHECK_FAILED))
+ ? _gf_true
+ : _gf_false;
+
+ if (parent_layout_changed) {
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ /* Returning failure as the layout could not be fixed even under
+ * the lock */
+ goto out;
+ }
+
+ gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "create (%s/%s) (path: %s): parent layout "
+ "changed. Attempting a layout refresh and then a "
+ "retry",
+ pgfid, local->loc.name, local->loc.path);
+
+ /*
+ dht_refresh_layout needs directory info in local->loc.Hence,
+ storing the parent_loc in local->loc and storing the create
+ context in local->loc2. We will restore this information in
+ dht_creation_do.
+ */
+
+ loc_wipe(&local->loc2);
+
+ ret = loc_copy(&local->loc2, &local->loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "loc_copy failed %s", local->loc.path);
+
+ goto out;
+ }
+
+ loc_wipe(&local->loc);
+
+ ret = dht_build_parent_loc(this, &local->loc, &local->loc2,
+ &op_errno);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED,
+ "parent loc build failed");
+ goto out;
+ }
+
+ subvol = dht_subvol_get_hashed(this, &local->loc2);
+
+ ret = dht_create_lock(frame, subvol);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "locking parent failed");
+ goto out;
+ }
+
+ return 0;
+ }
+
goto out;
+ }
prev = cookie;
@@ -8245,7 +8460,7 @@ out:
return 0;
}
-int
+static int
dht_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
inode_t *inode, struct iatt *stbuf,
@@ -8296,7 +8511,7 @@ err:
return 0;
}
-int
+static int
dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
xlator_t *subvol, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd,
@@ -8311,6 +8526,8 @@ dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
subvol->name);
+ dht_set_parent_layout_in_dict(loc, this, local);
+
STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
subvol->fops->create, loc, flags, mode, umask, fd,
params);
@@ -8319,10 +8536,6 @@ dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
avail_subvol = dht_free_disk_available_subvol(this, subvol, local);
if (avail_subvol != subvol) {
- local->params = dict_ref(params);
- local->flags = flags;
- local->mode = mode;
- local->umask = umask;
local->cached_subvol = avail_subvol;
local->hashed_subvol = subvol;
@@ -8338,6 +8551,8 @@ dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
subvol->name);
+ dht_set_parent_layout_in_dict(loc, this, local);
+
STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
subvol->fops->create, loc, flags, mode, umask, fd,
params);
@@ -8406,7 +8621,7 @@ out:
return ret;
}
-int32_t
+static int32_t
dht_create_do(call_frame_t *frame)
{
dht_local_t *local = NULL;
@@ -8456,7 +8671,7 @@ err:
return 0;
}
-int32_t
+static int32_t
dht_create_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
@@ -8464,7 +8679,7 @@ dht_create_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-int32_t
+static int32_t
dht_create_finish(call_frame_t *frame, xlator_t *this, int op_ret,
int invoke_cbk)
{
@@ -8516,7 +8731,7 @@ done:
return 0;
}
-int32_t
+static int32_t
dht_create_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
@@ -8545,7 +8760,11 @@ dht_create_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
err:
- dht_create_finish(frame, this, -1, 0);
+ if (local)
+ dht_create_finish(frame, this, -1, 0);
+ else
+ DHT_STACK_UNWIND(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+ NULL, NULL);
return 0;
}
@@ -8595,6 +8814,60 @@ err:
}
int
+dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local)
+{
+ dht_conf_t *conf = this->private;
+ dht_layout_t *parent_layout = NULL;
+ int *parent_disk_layout = NULL;
+ xlator_t *hashed_subvol = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ parent_layout = dht_layout_get(this, loc->parent);
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+
+ ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol,
+ &parent_disk_layout);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "extracting in-memory layout of parent failed. ",
+ gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ ret = dict_set_str_sizen(local->params, GF_PREOP_PARENT_KEY,
+ conf->xattr_name);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "setting %s key in params dictionary failed. ",
+ gf_fop_list[local->fop], pgfid, loc->name, loc->path,
+ GF_PREOP_PARENT_KEY);
+ goto err;
+ }
+
+ ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout,
+ 4 * 4);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "setting parent-layout in params dictionary failed. ",
+ gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+err:
+ dht_layout_unref(this, parent_layout);
+ return ret;
+}
+
+int
dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
{
@@ -8620,6 +8893,11 @@ dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
goto err;
}
+ local->params = dict_ref(params);
+ local->flags = flags;
+ local->mode = mode;
+ local->umask = umask;
+
if (dht_filter_loc_subvol_key(this, loc, &local->loc, &subvol)) {
gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
"creating %s on %s (got create on %s)", local->loc.path,
@@ -8635,10 +8913,6 @@ dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
if (hashed_subvol && (hashed_subvol != subvol)) {
/* Create the linkto file and then the data file */
- local->params = dict_ref(params);
- local->flags = flags;
- local->mode = mode;
- local->umask = umask;
local->cached_subvol = subvol;
local->hashed_subvol = hashed_subvol;
@@ -8651,6 +8925,9 @@ dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
* file as we expect a lookup everywhere if there are problems
* with the parent layout
*/
+
+ dht_set_parent_layout_in_dict(loc, this, local);
+
STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
subvol->fops->create, &local->loc, flags, mode, umask,
fd, params);
@@ -8702,11 +8979,6 @@ dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
goto err;
}
- local->params = dict_ref(params);
- local->flags = flags;
- local->mode = mode;
- local->umask = umask;
-
loc_wipe(&local->loc);
ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno);
@@ -8744,7 +9016,7 @@ err:
return 0;
}
-int
+static int
dht_mkdir_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
@@ -8778,7 +9050,7 @@ dht_mkdir_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-int
+static int
dht_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, inode_t *inode, struct iatt *stbuf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
@@ -8846,13 +9118,13 @@ unlock:
return 0;
}
-int
+static int
dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata);
-int
+static int
dht_mkdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
mode_t umask, dict_t *params)
{
@@ -8975,7 +9247,7 @@ err:
return 0;
}
-int
+static int
dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
struct iatt *stbuf, struct iatt *preparent,
@@ -8992,8 +9264,6 @@ dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
gf_boolean_t parent_layout_changed = _gf_false;
call_stub_t *stub = NULL;
- VALIDATE_OR_GOTO(this->private, err);
-
local = frame->local;
prev = cookie;
layout = local->layout;
@@ -9025,7 +9295,11 @@ dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
goto err;
}
- dht_handle_parent_layout_change(this, stub);
+ ret = dht_handle_parent_layout_change(this, stub);
+ if (ret) {
+ goto err;
+ }
+
stub = NULL;
return 0;
@@ -9104,7 +9378,7 @@ err:
return 0;
}
-int
+static int
dht_mkdir_guard_parent_layout_cbk(call_frame_t *frame, xlator_t *this,
loc_t *loc, mode_t mode, mode_t umask,
dict_t *params)
@@ -9255,7 +9529,7 @@ err:
return 0;
}
-int
+static int
dht_rmdir_selfheal_cbk(call_frame_t *heal_frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
@@ -9277,7 +9551,7 @@ dht_rmdir_selfheal_cbk(call_frame_t *heal_frame, void *cookie, xlator_t *this,
return 0;
}
-int
+static int
dht_rmdir_hashed_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
@@ -9380,7 +9654,63 @@ err:
return 0;
}
-int
+static int
+dht_rmdir_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
+
+static int
+dht_rmdir_unlock(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ int lock_count = 0;
+
+ local = frame->local;
+
+ /* Unlock entrylk */
+ dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns);
+
+ /* Unlock inodelk */
+ lock_count = dht_lock_count(local->lock[0].ns.parent_layout.locks,
+ local->lock[0].ns.parent_layout.lk_count);
+
+ if (lock_count == 0)
+ goto done;
+
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL)
+ goto done;
+
+ lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+ lock_frame->root->op);
+ if (lock_local == NULL)
+ goto done;
+
+ lock_local->lock[0].ns.parent_layout.locks = local->lock[0]
+ .ns.parent_layout.locks;
+ lock_local->lock[0]
+ .ns.parent_layout.lk_count = local->lock[0].ns.parent_layout.lk_count;
+
+ local->lock[0].ns.parent_layout.locks = NULL;
+ local->lock[0].ns.parent_layout.lk_count = 0;
+ dht_unlock_inodelk(lock_frame, lock_local->lock[0].ns.parent_layout.locks,
+ lock_local->lock[0].ns.parent_layout.lk_count,
+ dht_rmdir_unlock_cbk);
+ lock_frame = NULL;
+
+done:
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
+
+ return 0;
+}
+
+static int
dht_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, struct iatt *preparent, struct iatt *postparent,
dict_t *xdata)
@@ -9515,63 +9845,7 @@ err:
return 0;
}
-int
-dht_rmdir_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- DHT_STACK_DESTROY(frame);
- return 0;
-}
-
-int
-dht_rmdir_unlock(call_frame_t *frame, xlator_t *this)
-{
- dht_local_t *local = NULL, *lock_local = NULL;
- call_frame_t *lock_frame = NULL;
- int lock_count = 0;
-
- local = frame->local;
-
- /* Unlock entrylk */
- dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns);
-
- /* Unlock inodelk */
- lock_count = dht_lock_count(local->lock[0].ns.parent_layout.locks,
- local->lock[0].ns.parent_layout.lk_count);
-
- if (lock_count == 0)
- goto done;
-
- lock_frame = copy_frame(frame);
- if (lock_frame == NULL)
- goto done;
-
- lock_local = dht_local_init(lock_frame, &local->loc, NULL,
- lock_frame->root->op);
- if (lock_local == NULL)
- goto done;
-
- lock_local->lock[0].ns.parent_layout.locks = local->lock[0]
- .ns.parent_layout.locks;
- lock_local->lock[0]
- .ns.parent_layout.lk_count = local->lock[0].ns.parent_layout.lk_count;
-
- local->lock[0].ns.parent_layout.locks = NULL;
- local->lock[0].ns.parent_layout.lk_count = 0;
- dht_unlock_inodelk(lock_frame, lock_local->lock[0].ns.parent_layout.locks,
- lock_local->lock[0].ns.parent_layout.lk_count,
- dht_rmdir_unlock_cbk);
- lock_frame = NULL;
-
-done:
- if (lock_frame != NULL) {
- DHT_STACK_DESTROY(lock_frame);
- }
-
- return 0;
-}
-
-int
+static int
dht_rmdir_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
@@ -9580,8 +9854,6 @@ dht_rmdir_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int i = 0;
xlator_t *hashed_subvol;
- VALIDATE_OR_GOTO(this->private, err);
-
conf = this->private;
local = frame->local;
@@ -9614,7 +9886,7 @@ err:
return 0;
}
-int
+static int
dht_rmdir_do(call_frame_t *frame, xlator_t *this)
{
dht_local_t *local = NULL;
@@ -9623,13 +9895,13 @@ dht_rmdir_do(call_frame_t *frame, xlator_t *this)
xlator_t *hashed_subvol = NULL;
char gfid[GF_UUID_BUF_SIZE] = {0};
- VALIDATE_OR_GOTO(this->private, err);
-
- conf = this->private;
+ VALIDATE_OR_GOTO(frame->local, err);
local = frame->local;
+ VALIDATE_OR_GOTO(this->private, out);
+ conf = this->private;
if (local->op_ret == -1)
- goto err;
+ goto out;
local->call_cnt = conf->subvolume_cnt;
@@ -9661,21 +9933,80 @@ dht_rmdir_do(call_frame_t *frame, xlator_t *this)
if (ret < 0) {
local->op_ret = -1;
local->op_errno = errno ? errno : EINVAL;
- goto err;
+ goto out;
}
return 0;
-err:
+out:
dht_set_fixed_dir_stat(&local->preparent);
dht_set_fixed_dir_stat(&local->postparent);
DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
&local->preparent, &local->postparent, NULL);
return 0;
+err:
+ DHT_STACK_UNWIND(rmdir, frame, -1, EINVAL, NULL, NULL, NULL);
+ return 0;
}
-int
+static void
+dht_rmdir_readdirp_done(call_frame_t *readdirp_frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ dht_local_t *main_local = NULL;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+
+ local = readdirp_frame->local;
+ main_frame = local->main_frame;
+ main_local = main_frame->local;
+
+ /* At least one readdirp failed.
+ * This is a bit hit or miss - if readdirp failed on more than
+ * one subvol, we don't know which error is returned.
+ */
+ if (local->op_ret == -1) {
+ main_local->op_ret = local->op_ret;
+ main_local->op_errno = local->op_errno;
+ }
+
+ this_call_cnt = dht_frame_return(main_frame);
+
+ if (is_last_call(this_call_cnt))
+ dht_rmdir_do(main_frame, this);
+
+ DHT_STACK_DESTROY(readdirp_frame);
+}
+
+/* Keep sending readdirp on the subvol until it returns no more entries
+ * It is possible that not all entries will fit in a single readdirp in
+ * which case the rmdir will keep failing with ENOTEMPTY
+ */
+
+static int
+dht_rmdir_readdirp_do(call_frame_t *readdirp_frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+
+ local = readdirp_frame->local;
+
+ if (local->op_ret == -1) {
+ /* there is no point doing another readdirp on this
+ * subvol . */
+ dht_rmdir_readdirp_done(readdirp_frame, this);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk,
+ local->hashed_subvol, local->hashed_subvol,
+ local->hashed_subvol->fops->readdirp, local->fd, 4096, 0,
+ local->xattr);
+
+ return 0;
+}
+
+static int
dht_rmdir_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
@@ -9719,7 +10050,7 @@ dht_rmdir_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-int
+static int
dht_rmdir_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
struct iatt *stbuf, dict_t *xattr, struct iatt *parent)
@@ -9744,8 +10075,7 @@ dht_rmdir_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret != 0) {
gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_FILE_LOOKUP_FAILED,
- "lookup failed for %s on %s (type=0%o)", local->loc.path,
- src->name, stbuf->ia_type);
+ "lookup failed for %s on %s", local->loc.path, src->name);
goto err;
}
@@ -9775,7 +10105,7 @@ err:
return 0;
}
-int
+static int
dht_rmdir_cached_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
struct iatt *stbuf, dict_t *xattr,
@@ -9855,7 +10185,7 @@ err:
return 0;
}
-int
+static int
dht_rmdir_is_subvol_empty(call_frame_t *frame, xlator_t *this,
gf_dirent_t *entries, xlator_t *src)
{
@@ -9951,7 +10281,10 @@ dht_rmdir_is_subvol_empty(call_frame_t *frame, xlator_t *this,
lookup_local->loc.path, src->name, gfid);
subvol = dht_linkfile_subvol(this, NULL, &trav->d_stat, trav->dict);
- if (!subvol) {
+ if (!subvol || (subvol == src)) {
+ /* we need to delete the linkto file if it does not have a
+ * valid subvol or it points to itself.
+ */
gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_INVALID_LINKFILE,
"Linkfile does not have link subvolume. "
"path = %s, gfid = %s",
@@ -10013,36 +10346,7 @@ err:
* No more entries on this subvol. Proceed to the actual rmdir operation.
*/
-void
-dht_rmdir_readdirp_done(call_frame_t *readdirp_frame, xlator_t *this)
-{
- call_frame_t *main_frame = NULL;
- dht_local_t *main_local = NULL;
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
-
- local = readdirp_frame->local;
- main_frame = local->main_frame;
- main_local = main_frame->local;
-
- /* At least one readdirp failed.
- * This is a bit hit or miss - if readdirp failed on more than
- * one subvol, we don't know which error is returned.
- */
- if (local->op_ret == -1) {
- main_local->op_ret = local->op_ret;
- main_local->op_errno = local->op_errno;
- }
-
- this_call_cnt = dht_frame_return(main_frame);
-
- if (is_last_call(this_call_cnt))
- dht_rmdir_do(main_frame, this);
-
- DHT_STACK_DESTROY(readdirp_frame);
-}
-
-int
+static int
dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, gf_dirent_t *entries,
dict_t *xdata)
@@ -10051,12 +10355,18 @@ dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *prev = NULL;
xlator_t *src = NULL;
int ret = 0;
+ char *path = NULL;
local = frame->local;
prev = cookie;
src = prev;
if (op_ret > 2) {
+ /* dht_rmdir_is_subvol_empty() may free the frame,
+ * copy path for logging.
+ */
+ path = gf_strdup(local->loc.path);
+
ret = dht_rmdir_is_subvol_empty(frame, this, entries, src);
switch (ret) {
@@ -10067,56 +10377,26 @@ dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
prev->name, local->loc.path, op_ret);
local->op_ret = -1;
local->op_errno = ENOTEMPTY;
- goto done;
+ break;
default:
/* @ret number of linkfiles are getting unlinked */
gf_msg_trace(this->name, 0,
"readdir on %s for %s found %d "
"linkfiles",
- prev->name, local->loc.path, ret);
+ prev->name, path, ret);
break;
}
}
- if (ret) {
- return 0;
- }
-
-done:
/* readdirp failed or no linkto files were found on this subvol */
+ if (!ret)
+ dht_rmdir_readdirp_done(frame, this);
- dht_rmdir_readdirp_done(frame, this);
+ GF_FREE(path);
return 0;
}
-/* Keep sending readdirp on the subvol until it returns no more entries
- * It is possible that not all entries will fit in a single readdirp in which
- * case the rmdir will keep failing with ENOTEMPTY
- */
-
-int
-dht_rmdir_readdirp_do(call_frame_t *readdirp_frame, xlator_t *this)
-{
- dht_local_t *local = NULL;
-
- local = readdirp_frame->local;
-
- if (local->op_ret == -1) {
- /* there is no point doing another readdirp on this
- * subvol . */
- dht_rmdir_readdirp_done(readdirp_frame, this);
- return 0;
- }
-
- STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk,
- local->hashed_subvol, local->hashed_subvol,
- local->hashed_subvol->fops->readdirp, local->fd, 4096, 0,
- local->xattr);
-
- return 0;
-}
-
-int
+static int
dht_rmdir_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
{
@@ -10236,6 +10516,8 @@ dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
dht_conf_t *conf = NULL;
int op_errno = -1;
int i = -1;
+ int ret = -1;
+ dict_t *xattr_req = NULL;
VALIDATE_OR_GOTO(frame, err);
VALIDATE_OR_GOTO(this, err);
@@ -10267,14 +10549,37 @@ dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
if (flags) {
return dht_rmdir_do(frame, this);
}
+ if (xdata) {
+ xattr_req = dict_ref(xdata);
+ } else {
+ xattr_req = dict_new();
+ }
+ if (xattr_req) {
+ ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+ /* If parallel-readdir is enabled, this is required
+ * to handle stale linkto files in the directory
+ * being deleted. If this fails, log an error but
+ * do not prevent the operation.
+ */
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s",
+ loc->path, conf->link_xattr_name);
+ }
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s",
+ loc->path, conf->link_xattr_name);
+ }
for (i = 0; i < conf->subvolume_cnt; i++) {
STACK_WIND_COOKIE(frame, dht_rmdir_opendir_cbk, conf->subvolumes[i],
conf->subvolumes[i],
conf->subvolumes[i]->fops->opendir, loc, local->fd,
- NULL);
+ xattr_req);
}
+ if (xattr_req) {
+ dict_unref(xattr_req);
+ }
return 0;
err:
@@ -10284,7 +10589,7 @@ err:
return 0;
}
-int
+static int
dht_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -10344,7 +10649,7 @@ err:
return 0;
}
-int
+static int
dht_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -10391,7 +10696,7 @@ err:
return 0;
}
-int32_t
+static int32_t
dht_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
int32_t op_errno, dict_t *xdata)
{
@@ -10508,23 +10813,17 @@ dht_notify(xlator_t *this, int event, void *data, ...)
int had_heard_from_all = 0;
int have_heard_from_all = 0;
- struct timeval time = {
- 0,
- };
gf_defrag_info_t *defrag = NULL;
dict_t *dict = NULL;
gf_defrag_type cmd = 0;
dict_t *output = NULL;
va_list ap;
- dht_methods_t *methods = NULL;
struct gf_upcall *up_data = NULL;
struct gf_upcall_cache_invalidation *up_ci = NULL;
conf = this->private;
GF_VALIDATE_OR_GOTO(this->name, conf, out);
- methods = &(conf->methods);
-
/* had all subvolumes reported status once till now? */
had_heard_from_all = 1;
for (i = 0; i < conf->subvolume_cnt; i++) {
@@ -10554,12 +10853,11 @@ dht_notify(xlator_t *this, int event, void *data, ...)
break;
}
- gettimeofday(&time, NULL);
LOCK(&conf->subvolume_lock);
{
conf->subvolume_status[cnt] = 1;
conf->last_event[cnt] = event;
- conf->subvol_up_time[cnt] = time.tv_sec;
+ conf->subvol_up_time[cnt] = gf_time();
}
UNLOCK(&conf->subvolume_lock);
@@ -10667,21 +10965,13 @@ dht_notify(xlator_t *this, int event, void *data, ...)
if (defrag->is_exiting)
goto unlock;
if ((cmd == GF_DEFRAG_CMD_STATUS) ||
- (cmd == GF_DEFRAG_CMD_STATUS_TIER) ||
(cmd == GF_DEFRAG_CMD_DETACH_STATUS))
gf_defrag_status_get(conf, output);
- else if (cmd == GF_DEFRAG_CMD_START_DETACH_TIER)
- gf_defrag_start_detach_tier(defrag);
else if (cmd == GF_DEFRAG_CMD_DETACH_START)
defrag->cmd = GF_DEFRAG_CMD_DETACH_START;
else if (cmd == GF_DEFRAG_CMD_STOP ||
- cmd == GF_DEFRAG_CMD_STOP_DETACH_TIER ||
cmd == GF_DEFRAG_CMD_DETACH_STOP)
gf_defrag_stop(conf, GF_DEFRAG_STATUS_STOPPED, output);
- else if (cmd == GF_DEFRAG_CMD_PAUSE_TIER)
- ret = gf_defrag_pause_tier(this, defrag);
- else if (cmd == GF_DEFRAG_CMD_RESUME_TIER)
- ret = gf_defrag_resume_tier(this, defrag);
}
unlock:
UNLOCK(&defrag->lock);
@@ -10729,6 +11019,7 @@ dht_notify(xlator_t *this, int event, void *data, ...)
}
if (!had_heard_from_all && have_heard_from_all) {
+ static int run_defrag = 0;
/* This is the first event which completes aggregation
of events from all subvolumes. If at least one subvol
had come up, propagate CHILD_UP, but only this time
@@ -10755,15 +11046,13 @@ dht_notify(xlator_t *this, int event, void *data, ...)
* thread has already started.
*/
if (conf->defrag && !run_defrag) {
- if (methods->migration_needed(this)) {
- run_defrag = 1;
- ret = gf_thread_create(&conf->defrag->th, NULL, gf_defrag_start,
- this, "dhtdg");
- if (ret) {
- GF_FREE(conf->defrag);
- conf->defrag = NULL;
- kill(getpid(), SIGTERM);
- }
+ run_defrag = 1;
+ ret = gf_thread_create(&conf->defrag->th, NULL, gf_defrag_start,
+ this, "dhtdg");
+ if (ret) {
+ GF_FREE(conf->defrag);
+ conf->defrag = NULL;
+ kill(getpid(), SIGTERM);
}
}
}
@@ -10832,8 +11121,8 @@ dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc,
len += ret;
/* Calculation of total length of the string required to calloc
- * output_string. Log includes subvolume-name, start-range, end-range and
- * err value.
+ * output_string. Log includes subvolume-name, start-range, end-range
+ * and err value.
*
* This log will help to debug cases where:
* a) Different processes set different layout of a directory.
@@ -10844,8 +11133,7 @@ dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc,
for (i = 0; i < layout->cnt; i++) {
ret = snprintf(string, sizeof(string),
"[Subvol_name: %s, Err: %d , Start: "
- "%" PRIu32 " , Stop: %" PRIu32 " , Hash: %" PRIu32
- " ], ",
+ "0x%x, Stop: 0x%x, Hash: 0x%x], ",
layout->list[i].xlator->name, layout->list[i].err,
layout->list[i].start, layout->list[i].stop,
layout->list[i].commit_hash);
@@ -10874,8 +11162,7 @@ dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc,
for (i = 0; i < layout->cnt; i++) {
ret = snprintf(output_string + off, len - off,
"[Subvol_name: %s, Err: %d , Start: "
- "%" PRIu32 " , Stop: %" PRIu32 " , Hash: %" PRIu32
- " ], ",
+ "0x%x, Stop: 0x%x, Hash: 0x%x], ",
layout->list[i].xlator->name, layout->list[i].err,
layout->list[i].start, layout->list[i].stop,
layout->list[i].commit_hash);
@@ -10910,28 +11197,6 @@ out:
return ret;
}
-int32_t
-dht_migration_needed(xlator_t *this)
-{
- gf_defrag_info_t *defrag = NULL;
- dht_conf_t *conf = NULL;
- int ret = 0;
-
- conf = this->private;
-
- GF_VALIDATE_OR_GOTO("dht", conf, out);
- GF_VALIDATE_OR_GOTO("dht", conf->defrag, out);
-
- defrag = conf->defrag;
-
- if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
- (defrag->cmd != GF_DEFRAG_CMD_START_DETACH_TIER))
- ret = 1;
-
-out:
- return ret;
-}
-
/*
This function should not be called more then once during a FOP
handling path. It is valid only for for ops on files
@@ -10966,67 +11231,161 @@ dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf,
return 0;
}
-gf_boolean_t
-dht_is_tier_xlator(xlator_t *this)
-{
- if (strcmp(this->type, "cluster/tier") == 0)
- return _gf_true;
- return _gf_false;
-}
-
int32_t
dht_release(xlator_t *this, fd_t *fd)
{
return dht_fd_ctx_destroy(this, fd);
}
-int
-dht_remove_stale_linkto(void *data)
+static int
+dht_pt_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- call_frame_t *frame = NULL;
dht_local_t *local = NULL;
- xlator_t *this = NULL;
- dict_t *xdata_in = NULL;
+
+ local = frame->local;
+
+ if (!op_ret) {
+ dht_layout_set(this, inode, local->layout);
+ }
+
+ DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, stbuf, preparent,
+ postparent, NULL);
+
+ return 0;
+}
+
+int32_t
+dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
+{
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ bool free_xdata = false;
int ret = 0;
+ int op_errno = 0;
+ int32_t *disk_layout_p = NULL;
- GF_VALIDATE_OR_GOTO("dht", data, out);
+ conf = this->private;
- frame = data;
- local = frame->local;
- this = frame->this;
- GF_VALIDATE_OR_GOTO("dht", this, out);
- GF_VALIDATE_OR_GOTO("dht", local, out);
- GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out);
+ local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- xdata_in = dict_new();
- if (!xdata_in)
- goto out;
+ layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!layout)
+ goto wind;
- ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(xdata_in);
- if (ret) {
- gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
- "Failed to set keys for stale linkto"
- "deletion on path %s",
- local->loc.path);
- goto out;
+ local->layout = layout;
+
+ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata)
+ goto wind;
+ free_xdata = true;
}
- ret = syncop_unlink(local->link_subvol, &local->loc, xdata_in, NULL);
+ /*Set the xlator or the following will crash*/
+ layout->list[0].xlator = conf->subvolumes[0];
+
+ dht_selfheal_layout_new_directory(frame, loc, layout);
+
+ dht_disk_layout_extract(this, layout, 0, &disk_layout_p);
+
+ ret = dict_set_bin(xdata, conf->xattr_name, disk_layout_p, 4 * 4);
if (ret) {
- gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
- "Removal of linkto failed"
- " on path %s at subvol %s",
- local->loc.path, local->link_subvol->name);
+ gf_msg("dht", GF_LOG_DEBUG, EINVAL, DHT_MSG_DICT_SET_FAILED,
+ "dht layout dict set failed");
}
-out:
- if (xdata_in)
- dict_unref(xdata_in);
- return ret;
+wind:
+ STACK_WIND(frame, dht_pt_mkdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+ if (free_xdata)
+ dict_unref(xdata);
+ return 0;
+
+err:
+ op_errno = local ? local->op_errno : op_errno;
+ DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
+
+static int
+dht_pt_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ dict_del(xattr, conf->xattr_name);
+ dict_del(xattr, conf->mds_xattr_key);
+ dict_del(xattr, conf->commithash_xattr_name);
+
+ if (frame->root->pid >= 0) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+ }
+
+ DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
+ return 0;
}
int
-dht_remove_stale_linkto_cbk(int ret, call_frame_t *sync_frame, void *data)
+dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata)
{
- DHT_STACK_DESTROY(sync_frame);
+ STACK_WIND(frame, dht_pt_getxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr, loc, key, xdata);
return 0;
}
+
+static int
+dht_pt_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ dict_del(xattr, conf->xattr_name);
+
+ if (frame->root->pid >= 0) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+ }
+
+ DHT_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata);
+ return 0;
+}
+
+int
+dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata)
+{
+ STACK_WIND(frame, dht_pt_fgetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr, fd, key, xdata);
+ return 0;
+}
+
+/* The job of this function is to check if all the xlators have updated
+ * error in the layout. */
+int
+dht_dir_layout_error_check(xlator_t *this, inode_t *inode)
+{
+ dht_layout_t *layout = NULL;
+ int i = 0;
+
+ layout = dht_layout_get(this, inode);
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err == 0) {
+ return 0;
+ }
+ }
+
+ /* Returning the first xlator error as all xlators have errors */
+ return layout->list[0].err;
+}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index bd30fd84b88..fe0dc3db34a 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -9,24 +9,21 @@
*/
#include <regex.h>
-#include <signal.h>
-#include <fnmatch.h>
#include "dht-mem-types.h"
#include "dht-messages.h"
-#include "call-stub.h"
+#include <glusterfs/call-stub.h>
#include "libxlator.h"
-#include "syncop.h"
-#include "refcount.h"
-#include "timer.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/refcount.h>
+#include <glusterfs/timer.h>
#include "protocol-common.h"
-#include "glusterfs-acl.h"
+#include <glusterfs/glusterfs-acl.h>
#ifndef _DHT_H
#define _DHT_H
#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout"
-#define GF_XATTR_TIER_LAYOUT_FIXED_KEY "trusted.tier.fix.layout.complete"
#define GF_XATTR_FILE_MIGRATE_KEY "trusted.distribute.migrate-data"
#define DHT_MDS_STR "mds"
#define GF_DHT_LOOKUP_UNHASHED_OFF 0
@@ -38,22 +35,21 @@
#define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal"
/* Namespace synchronization */
#define DHT_ENTRY_SYNC_DOMAIN "dht.entry.sync"
-#define TIERING_MIGRATION_KEY "tiering.migration"
#define DHT_LAYOUT_HASH_INVALID 1
#define MAX_REBAL_THREADS sysconf(_SC_NPROCESSORS_ONLN)
#define DHT_DIR_STAT_BLOCKS 8
#define DHT_DIR_STAT_SIZE 4096
+/* Virtual xattr for subvols status */
+
+#define DHT_SUBVOL_STATUS_KEY "dht.subvol.status"
+
/* Virtual xattrs for debugging */
#define DHT_DBG_HASHED_SUBVOL_PATTERN "dht.file.hashed-subvol.*"
#define DHT_DBG_HASHED_SUBVOL_KEY "dht.file.hashed-subvol."
-/* Array to hold custom xattr keys
- */
-extern char *xattrs_to_heal[];
-
/* Rebalance nodeuuid flags */
#define REBAL_NODEUUID_MINE 0x01
@@ -152,8 +148,8 @@ struct dht_rebalance_ {
dict_t *xdata;
dict_t *xattr;
dict_t *dict;
- int32_t set;
struct gf_flock flock;
+ int32_t set;
int lock_cmd;
};
@@ -176,24 +172,24 @@ typedef enum {
} dht_reaction_type_t;
struct dht_skip_linkto_unlink {
- gf_boolean_t handle_valid_link;
- int opend_fd_count;
xlator_t *hash_links_to;
uuid_t cached_gfid;
uuid_t hashed_gfid;
+ int opend_fd_count;
+ gf_boolean_t handle_valid_link;
};
typedef struct {
xlator_t *xl;
loc_t loc; /* contains/points to inode to lock on. */
- short type; /* read/write lock. */
char *domain; /* Only locks within a single domain
* contend with each other
*/
char *basename; /* Required for entrylk */
- gf_lkowner_t lk_owner;
gf_boolean_t locked;
dht_reaction_type_t do_on_failure;
+ short type; /* read/write lock. */
+ gf_lkowner_t lk_owner;
} dht_lock_t;
/* The lock structure represents inodelk. */
@@ -244,23 +240,10 @@ typedef gf_boolean_t (*dht_need_heal_t)(call_frame_t *frame,
dht_layout_t **inmem,
dht_layout_t **ondisk);
-typedef struct {
- uint64_t blocks_used;
- uint64_t pblocks_used;
- uint64_t files_used;
- uint64_t pfiles_used;
- uint64_t unhashed_blocks_used;
- uint64_t unhashed_pblocks_used;
- uint64_t unhashed_files_used;
- uint64_t unhashed_pfiles_used;
- uint64_t unhashed_fsid;
- uint64_t hashed_fsid;
-} tier_statvfs_t;
-
struct dht_local {
- int call_cnt;
loc_t loc;
loc_t loc2;
+ int call_cnt;
int op_ret;
int op_errno;
int layout_mismatch;
@@ -274,7 +257,6 @@ struct dht_local {
struct iatt preparent;
struct iatt postparent;
struct statvfs statvfs;
- tier_statvfs_t tier_statvfs;
fd_t *fd;
inode_t *inode;
dict_t *params;
@@ -290,9 +272,6 @@ struct dht_local {
xlator_t *cached_subvol;
xlator_t *hashed_subvol;
xlator_t *mds_subvol; /* This is use for dir only */
- char need_selfheal;
- char need_xattr_heal;
- char need_attrheal;
int file_count;
int dir_count;
call_frame_t *main_frame;
@@ -310,12 +289,12 @@ struct dht_local {
uint32_t overlaps_cnt;
uint32_t down;
uint32_t misc;
- uint32_t missing_cnt;
dht_selfheal_dir_cbk_t dir_cbk;
dht_selfheal_layout_t healer;
dht_need_heal_t should_heal;
- gf_boolean_t force_mkdir;
dht_layout_t *layout, *refreshed_layout;
+ uint32_t missing_cnt;
+ gf_boolean_t force_mkdir;
} selfheal;
dht_refresh_layout_unlock refresh_layout_unlock;
@@ -323,6 +302,13 @@ struct dht_local {
uint32_t uid;
uint32_t gid;
+ pid_t pid;
+
+ glusterfs_fop_t fop;
+
+ /* need for file-info */
+ char *xattr_val;
+ char *key;
/* needed by nufa */
int32_t flags;
@@ -330,44 +316,25 @@ struct dht_local {
dev_t rdev;
mode_t umask;
- /* need for file-info */
- char *xattr_val;
- char *key;
-
/* which xattr request? */
char xsel[256];
int32_t alloc_len;
/* gfid related */
uuid_t gfid;
+ uuid_t gfid_req;
- /* flag used to make sure we need to return estale in
- {lookup,revalidate}_cbk */
- char return_estale;
- char need_lookup_everywhere;
-
- glusterfs_fop_t fop;
-
- gf_boolean_t linked;
xlator_t *link_subvol;
struct dht_rebalance_ rebalance;
xlator_t *first_up_subvol;
- gf_boolean_t quota_deem_statfs;
-
- gf_boolean_t added_link;
- gf_boolean_t is_linkfile;
-
struct dht_skip_linkto_unlink skip_unlink;
dht_dir_transaction_t lock[2], *current;
/* inodelks during filerename for backward compatibility */
dht_lock_t **rename_inodelk_backward_compatible;
- int rename_inodelk_bc_count;
-
- short lock_type;
call_stub_t *stub;
int32_t parent_disk_layout[4];
@@ -375,15 +342,30 @@ struct dht_local {
/* rename rollback */
int *ret_cache;
- /* fd open check */
- gf_boolean_t fd_checked;
+ loc_t loc2_copy;
+
+ int rename_inodelk_bc_count;
/* This is use only for directory operation */
int32_t valid;
- gf_boolean_t heal_layout;
int32_t mds_heal_fresh_lookup;
- loc_t loc2_copy;
+ short lock_type;
+ char need_selfheal;
+ char need_xattr_heal;
+ char need_attrheal;
+ /* flag used to make sure we need to return estale in
+ {lookup,revalidate}_cbk */
+ char return_estale;
+ char need_lookup_everywhere;
+ /* fd open check */
+ gf_boolean_t fd_checked;
+ gf_boolean_t linked;
+ gf_boolean_t added_link;
+ gf_boolean_t is_linkfile;
+ gf_boolean_t quota_deem_statfs;
+ gf_boolean_t heal_layout;
gf_boolean_t locked;
gf_boolean_t dont_create_linkto;
+ gf_boolean_t gfid_missing;
};
typedef struct dht_local dht_local_t;
@@ -407,14 +389,7 @@ enum gf_defrag_type {
GF_DEFRAG_CMD_STATUS = 1 + 2,
GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3,
GF_DEFRAG_CMD_START_FORCE = 1 + 4,
- GF_DEFRAG_CMD_START_TIER = 1 + 5,
- GF_DEFRAG_CMD_STATUS_TIER = 1 + 6,
- GF_DEFRAG_CMD_START_DETACH_TIER = 1 + 7,
- GF_DEFRAG_CMD_STOP_DETACH_TIER = 1 + 8,
- GF_DEFRAG_CMD_PAUSE_TIER = 1 + 9,
- GF_DEFRAG_CMD_RESUME_TIER = 1 + 10,
GF_DEFRAG_CMD_DETACH_STATUS = 1 + 11,
- GF_DEFRAG_CMD_STOP_TIER = 1 + 12,
GF_DEFRAG_CMD_DETACH_START = 1 + 13,
GF_DEFRAG_CMD_DETACH_COMMIT = 1 + 14,
GF_DEFRAG_CMD_DETACH_COMMIT_FORCE = 1 + 15,
@@ -465,75 +440,6 @@ struct dht_container {
int local_subvol_index;
};
-typedef enum tier_mode_ {
- TIER_MODE_NONE = 0,
- TIER_MODE_TEST,
- TIER_MODE_WM
-} tier_mode_t;
-
-typedef enum tier_pause_state_ {
- TIER_RUNNING = 0,
- TIER_REQUEST_PAUSE,
- TIER_PAUSED
-} tier_pause_state_t;
-
-/* This Structure is only used in tiering fixlayout */
-typedef struct gf_tier_fix_layout_arg {
- xlator_t *this;
- dict_t *fix_layout;
- pthread_t thread_id;
-} gf_tier_fix_layout_arg_t;
-
-typedef struct gf_tier_conf {
- int is_tier;
- int watermark_hi;
- int watermark_low;
- int watermark_last;
- unsigned long block_size;
- fsblkcnt_t blocks_total;
- fsblkcnt_t blocks_used;
- int percent_full;
- uint64_t max_migrate_bytes;
- int max_migrate_files;
- int query_limit;
- tier_mode_t mode;
- /* These flags are only used for tier-compact */
- gf_boolean_t compact_active;
- /* These 3 flags are set to true when the client changes the */
- /* compaction mode on the command line. */
- /* When they are set, the daemon will trigger compaction as */
- /* soon as possible to activate or deactivate compaction. */
- /* If in the middle of a compaction, then the switches take */
- /* effect on the next compaction, not the current one. */
- /* If the user switches it off, we want to avoid needless */
- /* compactions. */
- /* If the user switches it on, they want to compact as soon */
- /* as possible. */
- gf_boolean_t compact_mode_switched;
- gf_boolean_t compact_mode_switched_hot;
- gf_boolean_t compact_mode_switched_cold;
- int tier_max_promote_size;
- int tier_promote_frequency;
- int tier_demote_frequency;
- int tier_compact_hot_frequency;
- int tier_compact_cold_frequency;
- uint64_t st_last_promoted_size;
- uint64_t st_last_demoted_size;
- tier_pause_state_t pause_state;
- struct synctask *pause_synctask;
- gf_timer_t *pause_timer;
- pthread_mutex_t pause_mutex;
- int promote_in_progress;
- int demote_in_progress;
- /* This Structure is only used in tiering fixlayout */
- gf_tier_fix_layout_arg_t tier_fix_layout_arg;
- /* Indicates the index of the first queryfile picked
- * in the last cycle of promote or demote */
- int32_t last_promote_qfile_index;
- int32_t last_demote_qfile_index;
- char volname[GD_VOLUME_NAME_MAX + 1];
-} gf_tier_conf_t;
-
typedef struct nodeuuid_info {
char info; /* Set to 1 is this is my node's uuid*/
uuid_t uuid; /* Store the nodeuuid as well for debugging*/
@@ -553,26 +459,18 @@ struct gf_defrag_info_ {
uint64_t num_dirs_processed;
uint64_t size_processed;
gf_lock_t lock;
- int cmd;
pthread_t th;
- gf_defrag_status_t defrag_status;
struct rpc_clnt *rpc;
uint32_t connected;
uint32_t is_exiting;
pid_t pid;
+ int cmd;
inode_t *root_inode;
uuid_t node_uuid;
- struct timeval start_time;
- gf_boolean_t stats;
+ time_t start_time;
uint32_t new_commit_hash;
+ gf_defrag_status_t defrag_status;
gf_defrag_pattern_list_t *defrag_pattern;
- gf_tier_conf_t tier_conf;
-
- /*Data Tiering params for scanner*/
- uint64_t total_files_promoted;
- uint64_t total_files_demoted;
- int write_freq_threshold;
- int read_freq_threshold;
pthread_cond_t parallel_migration_cond;
pthread_mutex_t dfq_mutex;
@@ -587,18 +485,20 @@ struct gf_defrag_info_ {
/*Throttle params*/
/*stands for reconfigured thread count*/
int32_t recon_thread_count;
- /*stands for current running thread count*/
- int32_t current_thread_count;
pthread_cond_t df_wakeup_thread;
- /* lock migration flag */
- gf_boolean_t lock_migration_enabled;
-
/* backpointer to make it easier to write functions for rebalance */
xlator_t *this;
pthread_cond_t fc_wakeup_cond;
pthread_mutex_t fc_mutex;
+
+ /*stands for current running thread count*/
+ int32_t current_thread_count;
+
+ gf_boolean_t stats;
+ /* lock migration flag */
+ gf_boolean_t lock_migration_enabled;
};
typedef struct gf_defrag_info_ gf_defrag_info_t;
@@ -606,7 +506,6 @@ typedef struct gf_defrag_info_ gf_defrag_info_t;
struct dht_methods_s {
int32_t (*migration_get_dst_subvol)(xlator_t *this, dht_local_t *local);
int32_t (*migration_other)(xlator_t *this, gf_defrag_info_t *defrag);
- int32_t (*migration_needed)(xlator_t *this);
xlator_t *(*layout_search)(xlator_t *this, dht_layout_t *layout,
const char *name);
};
@@ -614,36 +513,26 @@ struct dht_methods_s {
typedef struct dht_methods_s dht_methods_t;
struct dht_conf {
- gf_lock_t subvolume_lock;
- int subvolume_cnt;
xlator_t **subvolumes;
char *subvolume_status;
int *last_event;
dht_layout_t **file_layouts;
dht_layout_t **dir_layouts;
unsigned int search_unhashed;
- gf_boolean_t lookup_optimize;
int gen;
dht_du_t *du_stats;
double min_free_disk;
double min_free_inodes;
- char disk_unit;
+ int subvolume_cnt;
int32_t refresh_interval;
- gf_boolean_t unhashed_sticky_bit;
- struct timeval last_stat_fetch;
+ gf_lock_t subvolume_lock;
+ time_t last_stat_fetch;
gf_lock_t layout_lock;
dict_t *leaf_to_subvol;
void *private; /* Can be used by wrapper xlators over
dht */
- gf_boolean_t use_readdirp;
- char vol_uuid[UUID_SIZE + 1];
- gf_boolean_t assert_no_child_down;
time_t *subvol_up_time;
- /* This is the count used as the distribute layout for a directory */
- /* Will be a global flag to control the layout spread count */
- uint32_t dir_spread_cnt;
-
/* to keep track of nodes which are decommissioned */
xlator_t **decommissioned_bricks;
int decommission_in_progress;
@@ -652,15 +541,9 @@ struct dht_conf {
/* defrag related */
gf_defrag_info_t *defrag;
- /* Request to filter directory entries in readdir request */
-
- gf_boolean_t readdir_optimize;
-
/* Support regex-based name reinterpretation. */
regex_t rsync_regex;
- gf_boolean_t rsync_regex_valid;
regex_t extra_regex;
- gf_boolean_t extra_regex_valid;
/* Support variable xattr names. */
char *xattr_name;
@@ -669,11 +552,6 @@ struct dht_conf {
char *commithash_xattr_name;
char *wild_xattr_name;
- /* Support size-weighted rebalancing (heterogeneous bricks). */
- gf_boolean_t do_weighting;
- gf_boolean_t randomize_by_gfid;
- int dthrottle;
-
dht_methods_t methods;
struct mem_pool *lock_pool;
@@ -683,24 +561,55 @@ struct dht_conf {
subvol_nodeuuids_info_t *local_nodeuuids;
int32_t local_subvols_cnt;
+ int dthrottle;
+
+ /* Hard link handle requirement for migration triggered from client*/
+ synclock_t link_lock;
+
+ /* lock migration */
+ gf_lock_t lock;
+
+ /* This is the count used as the distribute layout for a directory */
+ /* Will be a global flag to control the layout spread count */
+ uint32_t dir_spread_cnt;
+
/*
* "Commit hash" for this volume topology. Changed whenever bricks
* are added or removed.
*/
uint32_t vol_commit_hash;
- gf_boolean_t vch_forced;
- /* lock migration */
+ char vol_uuid[UUID_SIZE + 1];
+
+ char disk_unit;
gf_boolean_t lock_migration_enabled;
- gf_lock_t lock;
- /* Hard link handle requirement for migration triggered from client*/
- synclock_t link_lock;
+ gf_boolean_t vch_forced;
gf_boolean_t use_fallocate;
gf_boolean_t force_migration;
+
+ gf_boolean_t lookup_optimize;
+
+ gf_boolean_t unhashed_sticky_bit;
+
+ gf_boolean_t assert_no_child_down;
+
+ gf_boolean_t use_readdirp;
+
+ /* Request to filter directory entries in readdir request */
+ gf_boolean_t readdir_optimize;
+
+ gf_boolean_t rsync_regex_valid;
+
+ gf_boolean_t extra_regex_valid;
+
+ /* Support size-weighted rebalancing (heterogeneous bricks). */
+ gf_boolean_t do_weighting;
+
+ gf_boolean_t randomize_by_gfid;
};
typedef struct dht_conf dht_conf_t;
@@ -739,6 +648,8 @@ struct dir_dfmeta {
struct list_head **head;
struct list_head **iterator;
int *fetch_entries;
+ /* fds corresponding to local subvols only */
+ fd_t **lfd;
};
typedef struct dht_migrate_info {
@@ -814,22 +725,18 @@ typedef struct dht_fd_ctx {
dht_local_wipe(__xl, __local); \
} while (0)
-#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, inode, post) \
+#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, post) \
do { \
- LOCK(&inode->lock); \
- { \
- if (ctx_sec == new_sec) \
- new_nsec = max(new_nsec, ctx_nsec); \
- else if (ctx_sec > new_sec) { \
- new_sec = ctx_sec; \
- new_nsec = ctx_nsec; \
- } \
- if (post) { \
- ctx_sec = new_sec; \
- ctx_nsec = new_nsec; \
- } \
+ if (ctx_sec == new_sec) \
+ new_nsec = max(new_nsec, ctx_nsec); \
+ else if (ctx_sec > new_sec) { \
+ new_sec = ctx_sec; \
+ new_nsec = ctx_nsec; \
+ } \
+ if (post) { \
+ ctx_sec = new_sec; \
+ ctx_nsec = new_nsec; \
} \
- UNLOCK(&inode->lock); \
} while (0)
#define is_greater_time(a, an, b, bn) \
@@ -866,7 +773,7 @@ int32_t
dht_migration_needed(xlator_t *this);
int
dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout);
-int
+void
dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
uint32_t *holes_p, uint32_t *overlaps_p,
uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
@@ -874,7 +781,6 @@ dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
int
dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
loc_t *loc, dict_t *xattr);
-
xlator_t *
dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *buf,
dict_t *xattr);
@@ -892,9 +798,6 @@ int
dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos,
int32_t **disk_layout_p);
int
-dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos,
- void *disk_layout_raw, int disk_layout_len);
-int
dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout,
xlator_t *subvol, int32_t **disk_layout_p);
@@ -930,25 +833,17 @@ dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
xlator_t *this, xlator_t *tovol, xlator_t *fromvol,
loc_t *loc);
int
-dht_lookup_directory(call_frame_t *frame, xlator_t *this, loc_t *loc);
-int
dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc);
int
dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
loc_t *loc, dht_layout_t *layout);
-
-int
-dht_selfheal_directory_for_nameless_lookup(call_frame_t *frame,
- dht_selfheal_dir_cbk_t cbk,
- loc_t *loc, dht_layout_t *layout);
-
int
dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
dht_layout_t *layout);
int
dht_selfheal_restore(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
loc_t *loc, dht_layout_t *layout);
-int
+void
dht_layout_sort_volname(dht_layout_t *layout);
int
@@ -965,14 +860,14 @@ dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx);
int
dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode);
int
-dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol);
-int
dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout);
;
void
dht_layout_unref(xlator_t *this, dht_layout_t *layout);
dht_layout_t *
dht_layout_ref(xlator_t *this, dht_layout_t *layout);
+int
+dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol);
xlator_t *
dht_first_up_subvol(xlator_t *this);
xlator_t *
@@ -1227,25 +1122,19 @@ dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
int
-gf_defrag_status_get(dht_conf_t *conf, dict_t *dict);
-
-void
-gf_defrag_set_pause_state(gf_tier_conf_t *tier_conf, tier_pause_state_t state);
-
-tier_pause_state_t
-gf_defrag_get_pause_state(gf_tier_conf_t *tier_conf);
+dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
int
-gf_defrag_pause_tier(xlator_t *this, gf_defrag_info_t *defrag);
-
-tier_pause_state_t
-gf_defrag_check_pause_tier(gf_tier_conf_t *defrag);
+dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata);
int
-gf_defrag_resume_tier(xlator_t *this, gf_defrag_info_t *defrag);
-
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
int
-gf_defrag_start_detach_tier(gf_defrag_info_t *defrag);
+gf_defrag_status_get(dht_conf_t *conf, dict_t *dict);
int
gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output);
@@ -1278,10 +1167,6 @@ int
dht_dir_attr_heal(void *data);
int
dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data);
-int
-dht_dir_has_layout(dict_t *xattr, char *name);
-gf_boolean_t
-dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator);
xlator_t *
dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
xlator_t *ignore, dht_layout_t *layout,
@@ -1290,15 +1175,18 @@ xlator_t *
dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol,
dht_layout_t *layout);
int
+dht_dir_has_layout(dict_t *xattr, char *name);
+int
dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this);
-void
-dht_layout_dump(dht_layout_t *layout, const char *prefix);
int32_t
dht_priv_dump(xlator_t *this);
int32_t
dht_inodectx_dump(xlator_t *this, inode_t *inode);
+gf_boolean_t
+dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator);
+
int
dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode,
xlator_t **src_subvol, xlator_t **dst_subvol);
@@ -1312,11 +1200,6 @@ dht_subvol_status(dht_conf_t *conf, xlator_t *subvol);
void
dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc,
dht_layout_t *layout);
-int
-dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this);
-
-int
-dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_t *dict);
int
dht_layout_sort(dht_layout_t *layout);
@@ -1333,9 +1216,6 @@ dht_layout_missing_dirs(dht_layout_t *layout);
int
dht_refresh_layout(call_frame_t *frame);
-gf_boolean_t
-dht_is_tier_xlator(xlator_t *this);
-
int
dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child,
int32_t *op_errno);
@@ -1366,22 +1246,6 @@ dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock,
int
dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret);
-void
-dht_normalize_stats(struct statvfs *buf, unsigned long bsize,
- unsigned long frsize);
-
-int
-add_opt(char **optsp, const char *opt);
-
-int
-dht_aggregate_split_brain_xattr(dict_t *dst, char *key, data_t *value);
-
-int
-dht_remove_stale_linkto(void *data);
-
-int
-dht_remove_stale_linkto_cbk(int ret, call_frame_t *sync_frame, void *data);
-
int
dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *subvol);
@@ -1453,9 +1317,6 @@ dht_dir_heal_xattrs(void *data);
int
dht_dir_heal_xattrs_done(int ret, call_frame_t *sync_frame, void *data);
-void
-dht_aggregate_xattr(dict_t *dst, dict_t *src);
-
int32_t
dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size);
@@ -1467,25 +1328,12 @@ dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst,
dict_t *src, int *uret, int *uflag);
int
-dht_dir_xattr_heal(xlator_t *this, dht_local_t *local);
-
-int32_t
-dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size,
- int *errst);
-
-xlator_t *
-dht_inode_get_hashed_subvol(inode_t *inode, xlator_t *this, loc_t *loc);
+dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno);
int
dht_common_mark_mdsxattr(call_frame_t *frame, int *errst, int flag);
int
-dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata);
-
-int
-dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol);
-int
dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol);
int
@@ -1494,12 +1342,43 @@ dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
/* Abstract out the DHT-IATT-IN-DICT */
+void
+dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc,
+ dht_layout_t *new_layout);
+
int
-dht_request_iatt_in_xdata(xlator_t *this, dict_t *xattr_req);
+dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata);
int
-dht_read_iatt_from_xdata(xlator_t *this, dict_t *xdata, struct iatt *stbuf);
+dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata);
+
+int32_t
+dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata);
int
-is_permission_different(ia_prot_t *prot1, ia_prot_t *prot2);
+dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
+
+int32_t
+dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno);
+
+int
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int32_t
+dht_create_lock(call_frame_t *frame, xlator_t *subvol);
+
+int
+dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local);
+
+int
+dht_dir_layout_error_check(xlator_t *this, inode_t *inode);
+
+int
+dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol);
#endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 13eaabae1c1..c0588828fdb 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -10,14 +10,10 @@
/* TODO: add NS locking */
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "dht-messages.h"
-#include "defaults.h"
#include <sys/time.h>
-#include "events.h"
+#include <glusterfs/events.h>
int
dht_du_info_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
@@ -155,22 +151,18 @@ dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
dht_conf_t *conf = NULL;
call_frame_t *statfs_frame = NULL;
dht_local_t *statfs_local = NULL;
- struct timeval tv = {
- 0,
- };
loc_t tmp_loc = {
0,
};
+ time_t now;
conf = this->private;
-
- gettimeofday(&tv, NULL);
-
+ now = gf_time();
/* make it root gfid, should be enough to get the proper
info back */
tmp_loc.gfid[15] = 1;
- if (tv.tv_sec > (conf->refresh_interval + conf->last_stat_fetch.tv_sec)) {
+ if (now > (conf->refresh_interval + conf->last_stat_fetch)) {
statfs_frame = copy_frame(frame);
if (!statfs_frame) {
goto err;
@@ -202,7 +194,7 @@ dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
statfs_local->params);
}
- conf->last_stat_fetch.tv_sec = tv.tv_sec;
+ conf->last_stat_fetch = now;
}
return 0;
err:
@@ -260,7 +252,7 @@ dht_is_subvol_filled(xlator_t *this, xlator_t *subvol)
"full (%.2f %%), consider adding more bricks",
subvol->name, usage);
- strncpy(vol_name, this->name, sizeof(vol_name));
+ (void)snprintf(vol_name, sizeof(vol_name), "%s", this->name);
vol_name[(strlen(this->name) - 4)] = '\0';
gf_event(EVENT_DHT_DISK_USAGE, "volume=%s;subvol=%s;usage=%.2f %%",
@@ -276,7 +268,7 @@ dht_is_subvol_filled(xlator_t *this, xlator_t *subvol)
"(%.2f %%), consider adding more bricks",
subvol->name, usage);
- strncpy(vol_name, this->name, sizeof(vol_name));
+ (void)snprintf(vol_name, sizeof(vol_name), "%s", this->name);
vol_name[(strlen(this->name) - 4)] = '\0';
gf_event(EVENT_DHT_INODES_USAGE,
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
index 2f15c0370cc..acda67c312a 100644
--- a/xlators/cluster/dht/src/dht-hashfn.c
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -8,13 +8,12 @@
cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "hashfn.h"
+#include <glusterfs/hashfn.h>
-int
-dht_hash_compute_internal(int type, const char *name, uint32_t *hash_p)
+static int
+dht_hash_compute_internal(int type, const char *name, const int len,
+ uint32_t *hash_p)
{
int ret = 0;
uint32_t hash = 0;
@@ -22,7 +21,7 @@ dht_hash_compute_internal(int type, const char *name, uint32_t *hash_p)
switch (type) {
case DHT_HASH_TYPE_DM:
case DHT_HASH_TYPE_DM_USER:
- hash = gf_dm_hashfn(name, strlen(name));
+ hash = gf_dm_hashfn(name, len);
break;
default:
ret = -1;
@@ -36,7 +35,12 @@ dht_hash_compute_internal(int type, const char *name, uint32_t *hash_p)
return ret;
}
-static gf_boolean_t
+/* The function returns:
+ * 0 : in case no munge took place
+ * >0 : the length (inc. terminating NULL!) of the newly modified string,
+ * if it was munged.
+ */
+static int
dht_munge_name(const char *original, char *modified, size_t len, regex_t *re)
{
regmatch_t matches[2] = {
@@ -54,14 +58,14 @@ dht_munge_name(const char *original, char *modified, size_t len, regex_t *re)
if (new_len < len) {
memcpy(modified, original + matches[1].rm_so, new_len);
modified[new_len] = '\0';
- return _gf_true;
+ return new_len + 1; /* +1 for the terminating NULL */
}
}
}
/* This is guaranteed safe because of how the dest was allocated. */
strcpy(modified, original);
- return _gf_false;
+ return 0;
}
int
@@ -70,36 +74,37 @@ dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p)
char *rsync_friendly_name = NULL;
dht_conf_t *priv = NULL;
size_t len = 0;
- gf_boolean_t munged = _gf_false;
+ int munged = 0;
priv = this->private;
+ if (name == NULL)
+ return -1;
+
+ len = strlen(name) + 1;
+ rsync_friendly_name = alloca(len);
+
LOCK(&priv->lock);
{
if (priv->extra_regex_valid) {
- len = strlen(name) + 1;
- rsync_friendly_name = alloca(len);
munged = dht_munge_name(name, rsync_friendly_name, len,
&priv->extra_regex);
}
if (!munged && priv->rsync_regex_valid) {
- len = strlen(name) + 1;
- rsync_friendly_name = alloca(len);
gf_msg_trace(this->name, 0, "trying regex for %s", name);
munged = dht_munge_name(name, rsync_friendly_name, len,
&priv->rsync_regex);
- if (munged) {
- gf_msg_debug(this->name, 0, "munged down to %s",
- rsync_friendly_name);
- }
}
}
UNLOCK(&priv->lock);
-
- if (!munged) {
+ if (munged) {
+ gf_msg_debug(this->name, 0, "munged down to %s", rsync_friendly_name);
+ len = munged;
+ } else {
rsync_friendly_name = (char *)name;
}
- return dht_hash_compute_internal(type, rsync_friendly_name, hash_p);
+ return dht_hash_compute_internal(type, rsync_friendly_name, len - 1,
+ hash_p);
}
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index 6d6ec24729d..3f2fe43d5f3 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -8,10 +8,9 @@
cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
#include "dht-lock.h"
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
static void
dht_free_fd_ctx(dht_fd_ctx_t *fd_ctx)
@@ -34,7 +33,7 @@ dht_fd_ctx_destroy(xlator_t *this, fd_t *fd)
goto out;
}
- fd_ctx = (dht_fd_ctx_t *)value;
+ fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value;
if (fd_ctx) {
GF_REF_PUT(fd_ctx);
}
@@ -58,15 +57,15 @@ __dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst)
goto out;
}
- fd_ctx->opened_on_dst = (uint64_t)dst;
+ fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst;
GF_REF_INIT(fd_ctx, dht_free_fd_ctx);
- value = (uint64_t)fd_ctx;
+ value = (uint64_t)(uintptr_t)fd_ctx;
ret = __fd_ctx_set(fd, this, value);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FD_CTX_SET_FAILED,
- "Failed to set fd ctx in fd=0x%p", fd);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FD_CTX_SET_FAILED,
+ "fd=0x%p", fd, NULL);
GF_REF_PUT(fd_ctx);
}
out:
@@ -87,19 +86,20 @@ dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst)
{
ret = __fd_ctx_get(fd, this, &value);
if (ret && value) {
- fd_ctx = (dht_fd_ctx_t *)value;
- if (fd_ctx->opened_on_dst == (uint64_t)dst) {
+ fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value;
+ if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) {
/* This could happen due to racing
* check_progress tasks*/
goto unlock;
} else {
/* This would be a big problem*/
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_VALUE,
- "Different dst found in the fd ctx");
-
/* Overwrite and hope for the best*/
- fd_ctx->opened_on_dst = (uint64_t)dst;
- goto unlock;
+ fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst;
+ UNLOCK(&fd->lock);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_VALUE,
+ NULL);
+
+ goto out;
}
}
ret = __dht_fd_ctx_set(this, fd, dst);
@@ -124,13 +124,13 @@ dht_fd_ctx_get(xlator_t *this, fd_t *fd)
{
ret = __fd_ctx_get(fd, this, &tmp_val);
if ((ret < 0) || (tmp_val == 0)) {
- UNLOCK(&fd->lock);
- goto out;
+ goto unlock;
}
- fd_ctx = (dht_fd_ctx_t *)tmp_val;
+ fd_ctx = (dht_fd_ctx_t *)(uintptr_t)tmp_val;
GF_REF_GET(fd_ctx);
}
+unlock:
UNLOCK(&fd->lock);
out:
@@ -146,7 +146,7 @@ dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst)
fd_ctx = dht_fd_ctx_get(this, fd);
if (fd_ctx) {
- if (fd_ctx->opened_on_dst == (uint64_t)dst) {
+ if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) {
opened = _gf_true;
}
GF_REF_PUT(fd_ctx);
@@ -182,7 +182,7 @@ dht_inode_ctx_set_mig_info(xlator_t *this, inode_t *inode, xlator_t *src_subvol,
miginfo->dst_subvol = dst_subvol;
GF_REF_INIT(miginfo, dht_free_mig_info);
- value = (uint64_t)miginfo;
+ value = (uint64_t)(uintptr_t)miginfo;
ret = inode_ctx_set1(inode, this, &value);
if (ret < 0) {
@@ -209,7 +209,7 @@ dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode,
goto out;
}
- miginfo = (dht_migrate_info_t *)tmp_miginfo;
+ miginfo = (dht_migrate_info_t *)(uintptr_t)tmp_miginfo;
GF_REF_GET(miginfo);
}
UNLOCK(&inode->lock);
@@ -365,10 +365,27 @@ dht_check_and_open_fd_on_subvol_complete(int ret, call_frame_t *frame,
break;
+ case GF_FOP_FXATTROP:
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol,
+ subvol->fops->fxattrop, local->fd,
+ local->rebalance.flags, local->rebalance.xattr,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FGETXATTR:
+ STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr,
+ local->fd, local->key, NULL);
+ break;
+
+ case GF_FOP_FINODELK:
+ STACK_WIND(frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk,
+ local->key, local->fd, local->rebalance.lock_cmd,
+ &local->rebalance.flock, local->xattr_req);
+ break;
default:
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP,
- "Unknown FOP on fd (%p) on file %s @ %s", fd,
- uuid_utoa(fd->inode->gfid), subvol->name);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p",
+ fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s",
+ subvol->name, NULL);
break;
}
@@ -428,10 +445,22 @@ handle_err:
DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
break;
+ case GF_FOP_FXATTROP:
+ DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+ break;
+
+ case GF_FOP_FGETXATTR:
+ DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
+ break;
+
+ case GF_FOP_FINODELK:
+ DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL);
+ break;
+
default:
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP,
- "Unknown FOP on fd (%p) on file %s @ %s", fd,
- uuid_utoa(fd->inode->gfid), subvol->name);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p",
+ fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s",
+ subvol->name, NULL);
break;
}
@@ -484,10 +513,9 @@ dht_check_and_open_fd_on_subvol_task(void *data)
fd, NULL, NULL);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_OPEN_FD_ON_DST_FAILED,
- "Failed to open the fd"
- " (%p, flags=0%o) on file %s @ %s",
- fd, fd->flags, uuid_utoa(fd->inode->gfid), subvol->name);
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_OPEN_FD_ON_DST_FAILED,
+ "fd=%p", fd, "flags=0%o", fd->flags, "gfid=%s",
+ uuid_utoa(fd->inode->gfid), "name=%s", subvol->name, NULL);
/* This can happen if the cached subvol was updated in the
* inode_ctx and the fd was opened on the new cached suvol
* after this fop was wound on the old cached subvol.
@@ -533,10 +561,8 @@ dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame)
dht_check_and_open_fd_on_subvol_complete, frame, frame);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, 0,
- "Failed to create synctask"
- " to check and open fd=%p",
- local->fd);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SYNCTASK_CREATE_FAILED,
+ "to-check-and-open fd=%p", local->fd, NULL);
}
return ret;
@@ -645,9 +671,7 @@ dht_get_subvol_from_id(xlator_t *this, int client_id)
ret = gf_asprintf(&sid, "%d", client_id);
if (ret == -1) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_ASPRINTF_FAILED,
- "asprintf failed while "
- "fetching subvol from the id");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_ASPRINTF_FAILED, NULL);
goto out;
}
@@ -938,7 +962,7 @@ dht_subvol_get_cached(xlator_t *this, inode_t *inode)
dht_layout_t *layout = NULL;
xlator_t *subvol = NULL;
- GF_VALIDATE_OR_GOTO(this->name, this, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
GF_VALIDATE_OR_GOTO(this->name, inode, out);
layout = dht_layout_get(this, inode);
@@ -1162,6 +1186,10 @@ dht_init_subvolumes(xlator_t *this, dht_conf_t *conf)
return -1;
}
conf->subvolume_cnt = cnt;
+ /* Doesn't make sense to do any dht layer tasks
+ if the subvol count is 1. Set it as pass_through */
+ if (cnt == 1)
+ this->pass_through = _gf_true;
conf->local_subvols_cnt = 0;
@@ -1256,6 +1284,7 @@ dht_migration_complete_check_task(void *data)
fd_t *tmp = NULL;
uint64_t tmp_miginfo = 0;
dht_migrate_info_t *miginfo = NULL;
+ gf_boolean_t skip_open = _gf_false;
int open_failed = 0;
this = THIS;
@@ -1302,11 +1331,11 @@ dht_migration_complete_check_task(void *data)
* migrated by two different layers. Raise
* a warning here.
*/
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO,
- "%s: Found miginfo in the inode ctx",
- tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid));
+ gf_smsg(
+ this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL);
- miginfo = (void *)tmp_miginfo;
+ miginfo = (void *)(uintptr_t)tmp_miginfo;
GF_REF_PUT(miginfo);
}
ret = 1;
@@ -1325,10 +1354,9 @@ dht_migration_complete_check_task(void *data)
ret = syncop_lookup(this, &tmp_loc, &stbuf, 0, 0, 0);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED,
- "%s: failed to lookup the file on %s",
- tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
- this->name);
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED,
+ "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "name=%s", this->name, NULL);
local->op_errno = -ret;
ret = -1;
goto out;
@@ -1336,18 +1364,15 @@ dht_migration_complete_check_task(void *data)
dst_node = dht_subvol_get_cached(this, tmp_loc.inode);
if (linkto_target && dst_node != linkto_target) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_LINKFILE,
- "linkto target (%s) is "
- "different from cached-subvol (%s). Treating %s as "
- "destination subvol",
- linkto_target->name, dst_node->name, dst_node->name);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_LINKFILE,
+ "linkto_target_name=%s", linkto_target->name, "dst_name=%s",
+ dst_node->name, NULL);
}
if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH,
- "%s: gfid different on the target file on %s",
- tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
- dst_node->name);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "dst_name=%s", dst_node->name, NULL);
ret = -1;
local->op_errno = EIO;
goto out;
@@ -1367,7 +1392,7 @@ dht_migration_complete_check_task(void *data)
done on all the fd of inode */
ret = inode_ctx_reset1(inode, this, &tmp_miginfo);
if (tmp_miginfo) {
- miginfo = (void *)tmp_miginfo;
+ miginfo = (void *)(uintptr_t)tmp_miginfo;
GF_REF_PUT(miginfo);
goto out;
}
@@ -1394,24 +1419,34 @@ dht_migration_complete_check_task(void *data)
* the loop will cause the destruction of the fd. So we need to
* iterate the list safely because iter_fd cannot be trusted.
*/
- list_for_each_entry_safe(iter_fd, tmp, &inode->fd_list, inode_list)
- {
- if (fd_is_anonymous(iter_fd))
- continue;
-
- if (dht_fd_open_on_dst(this, iter_fd, dst_node))
- continue;
-
+ iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list);
+ while (&iter_fd->inode_list != (&inode->fd_list)) {
+ if (fd_is_anonymous(iter_fd) ||
+ (dht_fd_open_on_dst(this, iter_fd, dst_node))) {
+ if (!tmp) {
+ iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd),
+ inode_list);
+ continue;
+ }
+ skip_open = _gf_true;
+ }
/* We need to release the inode->lock before calling
* syncop_open() to avoid possible deadlocks. However this
* can cause the iter_fd to be released by other threads.
* To avoid this, we take a reference before releasing the
* lock.
*/
- __fd_ref(iter_fd);
+ fd_ref(iter_fd);
UNLOCK(&inode->lock);
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
+ }
+ if (skip_open)
+ goto next;
+
/* flags for open are stripped down to allow following the
* new location of the file, otherwise we can get EEXIST or
* truncate the file again as rebalance is moving the data */
@@ -1419,12 +1454,10 @@ dht_migration_complete_check_task(void *data)
(iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
iter_fd, NULL, NULL);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_OPEN_FD_ON_DST_FAILED,
- "failed"
- " to open the fd"
- " (%p, flags=0%o) on file %s @ %s",
- iter_fd, iter_fd->flags, path, dst_node->name);
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_OPEN_FD_ON_DST_FAILED, "id=%p", iter_fd,
+ "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s",
+ dst_node->name, NULL);
open_failed = 1;
local->op_errno = -ret;
@@ -1433,9 +1466,11 @@ dht_migration_complete_check_task(void *data)
dht_fd_ctx_set(this, iter_fd, dst_node);
}
- fd_unref(iter_fd);
-
+ next:
LOCK(&inode->lock);
+ skip_open = _gf_false;
+ tmp = iter_fd;
+ iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list);
}
SYNCTASK_SETID(frame->root->uid, frame->root->gid);
@@ -1448,6 +1483,10 @@ dht_migration_complete_check_task(void *data)
unlock:
UNLOCK(&inode->lock);
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
+ }
out:
if (dict) {
@@ -1529,6 +1568,7 @@ dht_rebalance_inprogress_task(void *data)
int open_failed = 0;
uint64_t tmp_miginfo = 0;
dht_migrate_info_t *miginfo = NULL;
+ gf_boolean_t skip_open = _gf_false;
this = THIS;
frame = data;
@@ -1571,10 +1611,10 @@ dht_rebalance_inprogress_task(void *data)
* migrated by two different layers. Raise
* a warning here.
*/
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO,
- "%s: Found miginfo in the inode ctx",
- tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid));
- miginfo = (void *)tmp_miginfo;
+ gf_smsg(
+ this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL);
+ miginfo = (void *)(uintptr_t)tmp_miginfo;
GF_REF_PUT(miginfo);
}
ret = 1;
@@ -1582,17 +1622,16 @@ dht_rebalance_inprogress_task(void *data)
}
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_GET_XATTR_FAILED,
- "%s: failed to get the 'linkto' xattr", local->loc.path);
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_GET_XATTR_FAILED,
+ "path=%s", local->loc.path, NULL);
ret = -1;
goto out;
}
dst_node = dht_linkfile_subvol(this, NULL, NULL, dict);
if (!dst_node) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_NOT_FOUND,
- "%s: failed to get the 'linkto' xattr from dict",
- local->loc.path);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GET_XATTR_FAILED,
+ "path=%s", local->loc.path, NULL);
ret = -1;
goto out;
}
@@ -1609,20 +1648,17 @@ dht_rebalance_inprogress_task(void *data)
/* lookup on dst */
ret = syncop_lookup(dst_node, &tmp_loc, &stbuf, NULL, NULL, NULL);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_FILE_LOOKUP_ON_DST_FAILED,
- "%s: failed to lookup the file on %s",
- tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
- dst_node->name);
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED,
+ "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "name=%s", dst_node->name, NULL);
ret = -1;
goto out;
}
if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH,
- "%s: gfid different on the target file on %s",
- tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
- dst_node->name);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "name=%s", dst_node->name, NULL);
ret = -1;
goto out;
}
@@ -1649,24 +1685,40 @@ dht_rebalance_inprogress_task(void *data)
* the loop will cause the destruction of the fd. So we need to
* iterate the list safely because iter_fd cannot be trusted.
*/
- list_for_each_entry_safe(iter_fd, tmp, &inode->fd_list, inode_list)
- {
- if (fd_is_anonymous(iter_fd))
- continue;
-
- if (dht_fd_open_on_dst(this, iter_fd, dst_node))
- continue;
-
+ iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list);
+ while (&iter_fd->inode_list != (&inode->fd_list)) {
/* We need to release the inode->lock before calling
* syncop_open() to avoid possible deadlocks. However this
* can cause the iter_fd to be released by other threads.
* To avoid this, we take a reference before releasing the
* lock.
*/
- __fd_ref(iter_fd);
+ if (fd_is_anonymous(iter_fd) ||
+ (dht_fd_open_on_dst(this, iter_fd, dst_node))) {
+ if (!tmp) {
+ iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd),
+ inode_list);
+ continue;
+ }
+ skip_open = _gf_true;
+ }
+
+ /* Yes, this is ugly but there isn't a cleaner way to do this
+ * the fd_ref is an atomic increment so not too bad. We want to
+ * reduce the number of inode locks and unlocks.
+ */
+
+ fd_ref(iter_fd);
UNLOCK(&inode->lock);
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
+ }
+ if (skip_open)
+ goto next;
+
/* flags for open are stripped down to allow following the
* new location of the file, otherwise we can get EEXIST or
* truncate the file again as rebalance is moving the data */
@@ -1674,11 +1726,10 @@ dht_rebalance_inprogress_task(void *data)
(iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
iter_fd, NULL, NULL);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_OPEN_FD_ON_DST_FAILED,
- "failed to send open "
- "the fd (%p, flags=0%o) on file %s @ %s",
- iter_fd, iter_fd->flags, path, dst_node->name);
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_OPEN_FD_ON_DST_FAILED, "fd=%p", iter_fd,
+ "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s",
+ dst_node->name, NULL);
ret = -1;
open_failed = 1;
} else {
@@ -1687,9 +1738,11 @@ dht_rebalance_inprogress_task(void *data)
dht_fd_ctx_set(this, iter_fd, dst_node);
}
- fd_unref(iter_fd);
-
+ next:
LOCK(&inode->lock);
+ skip_open = _gf_false;
+ tmp = iter_fd;
+ iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list);
}
SYNCTASK_SETID(frame->root->uid, frame->root->gid);
@@ -1697,6 +1750,10 @@ dht_rebalance_inprogress_task(void *data)
unlock:
UNLOCK(&inode->lock);
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
+ }
if (open_failed) {
ret = -1;
goto out;
@@ -1704,9 +1761,8 @@ unlock:
ret = dht_inode_ctx_set_mig_info(this, inode, src_node, dst_node);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
- "%s: failed to set inode-ctx target file at %s", local->loc.path,
- dst_node->name);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "path=%s", local->loc.path, "name=%s", dst_node->name, NULL);
goto out;
}
@@ -1799,12 +1855,16 @@ dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat,
time = &ctx->time;
- DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, stat->ia_mtime,
- stat->ia_mtime_nsec, inode, post);
- DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, stat->ia_ctime,
- stat->ia_ctime_nsec, inode, post);
- DHT_UPDATE_TIME(time->atime, time->atime_nsec, stat->ia_atime,
- stat->ia_atime_nsec, inode, post);
+ LOCK(&inode->lock);
+ {
+ DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, stat->ia_mtime,
+ stat->ia_mtime_nsec, post);
+ DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, stat->ia_ctime,
+ stat->ia_ctime_nsec, post);
+ DHT_UPDATE_TIME(time->atime, time->atime_nsec, stat->ia_atime,
+ stat->ia_atime_nsec, post);
+ }
+ UNLOCK(&inode->lock);
ret = dht_inode_ctx_set(inode, this, ctx);
out:
@@ -1826,7 +1886,7 @@ dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx)
return ret;
if (ctx)
- *ctx = (dht_inode_ctx_t *)ctx_int;
+ *ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int;
out:
return ret;
}
@@ -1873,9 +1933,7 @@ dht_heal_path(xlator_t *this, char *path, inode_table_t *itable)
};
char *bname = NULL;
char *save_ptr = NULL;
- uuid_t gfid = {
- 0,
- };
+ static uuid_t gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
char *tmp_path = NULL;
tmp_path = gf_strdup(path);
@@ -1883,9 +1941,6 @@ dht_heal_path(xlator_t *this, char *path, inode_table_t *itable)
goto out;
}
- memset(gfid, 0, 16);
- gfid[15] = 1;
-
gf_uuid_copy(loc.pargfid, gfid);
loc.parent = inode_ref(itable->root);
@@ -1914,10 +1969,10 @@ dht_heal_path(xlator_t *this, char *path, inode_table_t *itable)
*/
linked_inode = loc.inode;
bname = strtok_r(NULL, "/", &save_ptr);
- inode_unref(loc.parent);
if (!bname) {
goto out;
}
+ inode_unref(loc.parent);
loc.parent = loc.inode;
gf_uuid_copy(loc.pargfid, loc.inode->gfid);
loc.inode = NULL;
@@ -1929,10 +1984,9 @@ dht_heal_path(xlator_t *this, char *path, inode_table_t *itable)
ret = syncop_lookup(this, &loc, &iatt, NULL, NULL, NULL);
if (ret) {
- gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_SELFHEAL_FAILED,
- "Healing of path %s failed on subvolume %s for "
- "directory %s",
- path, this->name, bname);
+ gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_SELFHEAL_FAILED,
+ "path=%s", path, "subvolume=%s", this->name, "bname=%s",
+ bname, NULL);
goto out;
}
@@ -1990,10 +2044,8 @@ dht_heal_full_path(void *data)
ret = syncop_getxattr(source, &loc, &dict, GET_ANCESTRY_PATH_KEY, NULL,
NULL);
if (ret) {
- gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_SELFHEAL_FAILED,
- "Failed to get path from subvol %s. Aborting "
- "directory healing.",
- source->name);
+ gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_HEAL_ABORT,
+ "subvol=%s", source->name, NULL);
goto out;
}
@@ -2031,6 +2083,7 @@ dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data)
dht_local_t *local = NULL;
xlator_t *this = NULL;
int ret = -1;
+ int op_errno = 0;
local = heal_frame->local;
main_frame = local->main_frame;
@@ -2040,10 +2093,12 @@ dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data)
dht_set_fixed_dir_stat(&local->postparent);
if (local->need_xattr_heal) {
local->need_xattr_heal = 0;
- ret = dht_dir_xattr_heal(this, local);
- if (ret)
- gf_msg(this->name, GF_LOG_ERROR, ret, DHT_MSG_DIR_XATTR_HEAL_FAILED,
- "xattr heal failed for directory %s ", local->loc.path);
+ ret = dht_dir_xattr_heal(this, local, &op_errno);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path,
+ NULL);
+ }
}
DHT_STACK_UNWIND(lookup, main_frame, 0, 0, local->inode, &local->stbuf,
@@ -2068,7 +2123,7 @@ __dht_lock_subvol_set(inode_t *inode, xlator_t *this, xlator_t *lock_subvol)
return -1;
}
- ctx = (dht_inode_ctx_t *)value;
+ ctx = (dht_inode_ctx_t *)(uintptr_t)value;
ctx->lock_subvol = lock_subvol;
out:
return ret;
@@ -2123,23 +2178,22 @@ dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock,
LOCK(&inode->lock);
ret = __inode_ctx_get0(inode, this, &value);
if (!ret && value) {
- ctx = (dht_inode_ctx_t *)value;
+ ctx = (dht_inode_ctx_t *)(uintptr_t)value;
subvol = ctx->lock_subvol;
}
if (!subvol && lock->l_type != F_UNLCK && cached_subvol) {
ret = __dht_lock_subvol_set(inode, this, cached_subvol);
if (ret) {
gf_uuid_unparse(inode->gfid, gfid);
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SET_INODE_CTX_FAILED,
- "Failed to set lock_subvol in "
- "inode ctx for gfid %s",
- gfid);
- goto unlock;
+ UNLOCK(&inode->lock);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "lock_subvol gfid=%s", gfid, NULL);
+ goto post_unlock;
}
subvol = cached_subvol;
}
-unlock:
UNLOCK(&inode->lock);
+post_unlock:
if (!subvol && inode && lock->l_type != F_UNLCK) {
inode_unref(inode);
}
@@ -2163,8 +2217,8 @@ dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret)
inode = local->loc.inode ? local->loc.inode : local->fd->inode;
}
if (!inode) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LOCK_INODE_UNREF_FAILED,
- "Found a NULL inode. Failed to unref the inode");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LOCK_INODE_UNREF_FAILED,
+ NULL);
goto out;
}
@@ -2190,11 +2244,8 @@ dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret)
inode_unref(inode);
} else {
gf_uuid_unparse(inode->gfid, gfid);
- gf_msg(this->name, GF_LOG_WARNING, 0,
- DHT_MSG_LOCK_INODE_UNREF_FAILED,
- "Unlock request failed for gfid %s."
- "Failed to unref the inode",
- gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LOCK_INODE_UNREF_FAILED, "gfid=%s", gfid, NULL);
goto out;
}
default:
@@ -2216,12 +2267,11 @@ dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst,
int luret = -1;
int luflag = -1;
int i = 0;
+ char **xattrs_to_heal;
if (!src || !dst) {
- gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED,
- "src or dst is NULL. Failed to set "
- " dictionary value for path %s",
- local->loc.path);
+ gf_smsg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DST_NULL_SET_FAILED,
+ "path=%s", local->loc.path, NULL);
return;
}
/* Check if any user xattr present in src dict and set
@@ -2232,17 +2282,18 @@ dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst,
and set it to dst dict, here index start from 1 because
user xattr already checked in previous statement
*/
+
+ xattrs_to_heal = get_xattrs_to_heal();
+
for (i = 1; xattrs_to_heal[i]; i++) {
keyval = dict_get(src, xattrs_to_heal[i]);
if (keyval) {
luflag = 1;
ret = dict_set(dst, xattrs_to_heal[i], keyval);
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s for "
- "path %s",
- xattrs_to_heal[i], local->loc.path);
+ gf_smsg(this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_DICT_SET_FAILED, "key=%s", xattrs_to_heal[i],
+ "path=%s", local->loc.path, NULL);
keyval = NULL;
}
}
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
index f2be5120e37..dbb8070b0da 100644
--- a/xlators/cluster/dht/src/dht-inode-read.c
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -10,25 +10,25 @@
#include "dht-common.h"
-int
+static int
dht_access2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
-int
+static int
dht_readv2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
-int
+static int
dht_attr2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
-int
+static int
dht_open2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
-int
+static int
dht_flush2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
-int
+static int
dht_lk2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
-int
+static int
dht_fsync2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
-int
+static int
dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame,
int ret);
-int
+static int
dht_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, fd_t *fd, dict_t *xdata)
{
@@ -67,7 +67,7 @@ out:
return 0;
}
-int
+static int
dht_open2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -162,8 +162,8 @@ dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
local = frame->local;
prev = cookie;
- if ((local->fop == GF_FOP_FSTAT) && (op_ret == -1) && (op_errno == EBADF) &&
- !(local->fd_checked)) {
+ if ((local->fop == GF_FOP_FSTAT) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -216,7 +216,7 @@ err:
return 0;
}
-int
+static int
dht_attr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -258,19 +258,13 @@ out:
return 0;
}
-int
+static int
dht_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, struct iatt *stbuf, dict_t *xdata)
{
dht_local_t *local = NULL;
int this_call_cnt = 0;
xlator_t *prev = NULL;
-
- GF_VALIDATE_OR_GOTO("dht", frame, err);
- GF_VALIDATE_OR_GOTO("dht", this, out);
- GF_VALIDATE_OR_GOTO("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO("dht", cookie, out);
-
local = frame->local;
prev = cookie;
@@ -278,25 +272,25 @@ dht_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
{
if (op_ret == -1) {
local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
prev->name);
- goto unlock;
+ goto post_unlock;
}
dht_iatt_merge(this, &local->stbuf, stbuf);
local->op_ret = 0;
}
-unlock:
UNLOCK(&frame->lock);
-out:
+post_unlock:
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt)) {
DHT_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno,
&local->stbuf, xdata);
}
-err:
+
return 0;
}
@@ -437,7 +431,7 @@ dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
if (local->call_cnt != 1)
goto out;
- if (op_ret == -1 && (op_errno == EBADF) && !(local->fd_checked)) {
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -479,7 +473,7 @@ out:
return 0;
}
-int
+static int
dht_readv2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -564,7 +558,7 @@ err:
return 0;
}
-int
+static int
dht_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, dict_t *xdata)
{
@@ -612,7 +606,7 @@ out:
return 0;
}
-int
+static int
dht_access2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -709,7 +703,7 @@ dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
if (local->call_cnt != 1)
goto out;
- if (op_ret == -1 && (op_errno == EBADF) && !(local->fd_checked)) {
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -741,7 +735,7 @@ out:
return 0;
}
-int
+static int
dht_flush2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -826,7 +820,7 @@ dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
local->op_errno = op_errno;
- if (op_ret == -1 && (op_errno == EBADF) && !(local->fd_checked)) {
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -887,7 +881,7 @@ out:
return 0;
}
-int
+static int
dht_fsync2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -965,7 +959,7 @@ err:
/* TODO: for 'lk()' call, we need some other special error, may be ESTALE to
indicate that lock migration happened on the fd, so we can consider it as
phase 2 of migration */
-int
+static int
dht_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, struct gf_flock *flock, dict_t *xdata)
{
@@ -1012,7 +1006,7 @@ out:
return 0;
}
-int
+static int
dht_lk2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -1093,7 +1087,7 @@ err:
return 0;
}
-int
+static int
dht_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, struct gf_lease *lease, dict_t *xdata)
{
@@ -1135,7 +1129,7 @@ err:
}
/* Symlinks are currently not migrated, so no need for any check here */
-int
+static int
dht_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int op_errno, const char *path, struct iatt *stbuf,
dict_t *xdata)
@@ -1198,6 +1192,29 @@ err:
return 0;
}
+/* Get both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY
+ * Use DHT_MODE_IN_XDATA_KEY if available, else fall back to
+ * DHT_IATT_IN_XDATA_KEY
+ * This will return a dummy iatt with only the mode and type set
+ */
+static int
+dht_read_iatt_from_xdata(dict_t *xdata, struct iatt *stbuf)
+{
+ int ret = -1;
+ int32_t mode = 0;
+
+ ret = dict_get_int32(xdata, DHT_MODE_IN_XDATA_KEY, &mode);
+
+ if (ret) {
+ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+ } else {
+ stbuf->ia_prot = ia_prot_from_st_mode(mode);
+ stbuf->ia_type = ia_type_from_st_mode(mode);
+ }
+
+ return ret;
+}
+
int
dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
@@ -1229,7 +1246,14 @@ dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (local->call_cnt != 1)
goto out;
- ret = dht_read_iatt_from_xdata(this, xdata, &stbuf);
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ ret = dht_read_iatt_from_xdata(xdata, &stbuf);
if ((!op_ret) && (ret)) {
/* This is a potential problem and can cause corruption
@@ -1281,7 +1305,7 @@ out:
return 0;
}
-int
+static int
dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame,
int ret)
{
@@ -1340,7 +1364,7 @@ out:
return 0;
}
-int
+static int
dht_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
@@ -1348,6 +1372,22 @@ dht_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
+/* Set both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY
+ * Use DHT_MODE_IN_XDATA_KEY if available. Else fall back to
+ * DHT_IATT_IN_XDATA_KEY
+ */
+static int
+dht_request_iatt_in_xdata(dict_t *xattr_req)
+{
+ int ret = -1;
+
+ ret = dict_set_int8(xattr_req, DHT_MODE_IN_XDATA_KEY, 1);
+ ret = dict_set_int8(xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+
+ /* At least one call succeeded */
+ return ret;
+}
+
int
dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
@@ -1390,7 +1430,7 @@ dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
local->rebalance.xattr = dict_ref(dict);
local->rebalance.flags = flags;
- ret = dht_request_iatt_in_xdata(this, local->xattr_req);
+ ret = dht_request_iatt_in_xdata(local->xattr_req);
if (ret) {
gf_msg_debug(this->name, 0,
@@ -1412,7 +1452,7 @@ err:
return 0;
}
-int
+static int
dht_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
@@ -1460,7 +1500,7 @@ dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
local->rebalance.xattr = dict_ref(dict);
local->rebalance.flags = flags;
- ret = dht_request_iatt_in_xdata(this, local->xattr_req);
+ ret = dht_request_iatt_in_xdata(local->xattr_req);
if (ret) {
gf_msg_debug(this->name, 0, "Failed to set dictionary key %s fd=%p",
@@ -1485,7 +1525,7 @@ err:
* below fops, hence not implementing 'migration' related checks
*/
-int
+static int
dht_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -1541,8 +1581,26 @@ dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ dht_local_t *local = NULL;
+ int ret = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+
+ local = frame->local;
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+out:
dht_lk_inode_unref(frame, op_ret);
DHT_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata);
+
return 0;
}
@@ -1580,6 +1638,13 @@ dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
if (ret)
goto err;
*/
+ local->rebalance.flock = *lock;
+ local->rebalance.lock_cmd = cmd;
+ local->key = gf_strdup(volume);
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
STACK_WIND(frame, dht_finodelk_cbk, lock_subvol,
lock_subvol->fops->finodelk, volume, fd, cmd, lock, xdata);
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
index d0d12fd7658..2f23ce90fbd 100644
--- a/xlators/cluster/dht/src/dht-inode-write.c
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -10,17 +10,17 @@
#include "dht-common.h"
-int
+static int
dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
-int
+static int
dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
-int
+static int
dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
-int
+static int
dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
-int
+static int
dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
-int
+static int
dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
int
@@ -49,7 +49,7 @@ dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
* We only check once as this could be a valid bad fd error.
*/
- if (op_ret == -1 && (op_errno == EBADF) && !(local->fd_checked)) {
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -93,30 +93,28 @@ dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
/* Check if the rebalance phase1 is true */
if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
- if (!dht_is_tier_xlator(this)) {
+ if (!local->xattr_req) {
+ local->xattr_req = dict_new();
if (!local->xattr_req) {
- local->xattr_req = dict_new();
- if (!local->xattr_req) {
- gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM,
- "insufficient memory");
- local->op_errno = ENOMEM;
- local->op_ret = -1;
- goto out;
- }
- }
-
- ret = dict_set_uint32(local->xattr_req,
- GF_PROTECT_FROM_EXTERNAL_WRITES, 1);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_DICT_SET_FAILED, 0,
- "Failed to set key %s in dictionary",
- GF_PROTECT_FROM_EXTERNAL_WRITES);
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM,
+ "insufficient memory");
local->op_errno = ENOMEM;
local->op_ret = -1;
goto out;
}
}
+ ret = dict_set_uint32(local->xattr_req, GF_PROTECT_FROM_EXTERNAL_WRITES,
+ 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_DICT_SET_FAILED, 0,
+ "Failed to set key %s in dictionary",
+ GF_PROTECT_FROM_EXTERNAL_WRITES);
+ local->op_errno = ENOMEM;
+ local->op_ret = -1;
+ goto out;
+ }
+
dht_iatt_merge(this, &local->stbuf, postbuf);
dht_iatt_merge(this, &local->prebuf, prebuf);
@@ -142,7 +140,7 @@ out:
return 0;
}
-int
+static int
dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -262,8 +260,8 @@ dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
* We only check once as this could actually be a valid error.
*/
- if ((local->fop == GF_FOP_FTRUNCATE) && (op_ret == -1) &&
- ((op_errno == EBADF) || (op_errno == EINVAL)) && !(local->fd_checked)) {
+ if ((local->fop == GF_FOP_FTRUNCATE) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -336,7 +334,7 @@ err:
return 0;
}
-int
+static int
dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -489,7 +487,7 @@ dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
* We only check once as this could actually be a valid error.
*/
- if ((op_ret == -1) && (op_errno == EBADF) && !(local->fd_checked)) {
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -555,7 +553,7 @@ err:
return 0;
}
-int
+static int
dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -666,7 +664,7 @@ dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
* and a lookup updated the cached subvol in the inode ctx.
* We only check once as this could actually be a valid error.
*/
- if ((op_ret == -1) && (op_errno == EBADF) && !(local->fd_checked)) {
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -731,7 +729,7 @@ err:
return 0;
}
-int
+static int
dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -838,7 +836,7 @@ dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
* and a lookup updated the cached subvol in the inode ctx.
* We only check once as this could actually be a valid error.
*/
- if ((op_ret == -1) && (op_errno == EBADF) && !(local->fd_checked)) {
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -902,7 +900,7 @@ err:
return 0;
}
-int
+static int
dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -1005,8 +1003,8 @@ dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local->op_errno = op_errno;
- if ((local->fop == GF_FOP_FSETATTR) && (op_ret == -1) &&
- (op_errno == EBADF) && !(local->fd_checked)) {
+ if ((local->fop == GF_FOP_FSETATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
ret = dht_check_and_open_fd_on_subvol(this, frame);
if (ret)
goto out;
@@ -1049,7 +1047,7 @@ out:
return 0;
}
-int
+static int
dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
dht_local_t *local = NULL;
@@ -1113,9 +1111,10 @@ dht_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
{
if (op_ret == -1) {
local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
prev->name);
- goto unlock;
+ goto post_unlock;
}
dht_iatt_merge(this, &local->prebuf, statpre);
@@ -1124,9 +1123,8 @@ dht_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
local->op_ret = 0;
local->op_errno = 0;
}
-unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt)) {
if (local->op_ret == 0)
@@ -1151,24 +1149,22 @@ dht_non_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
prev = cookie;
+ if (op_ret == -1) {
+ gf_msg(this->name, op_errno, 0, 0, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
+ }
+
LOCK(&frame->lock);
{
- if (op_ret == -1) {
- gf_msg(this->name, op_errno, 0, 0, "subvolume %s returned -1",
- prev->name);
-
- goto unlock;
- }
-
dht_iatt_merge(this, &local->prebuf, statpre);
dht_iatt_merge(this, &local->stbuf, statpost);
local->op_ret = 0;
local->op_errno = 0;
}
-unlock:
UNLOCK(&frame->lock);
-
+post_unlock:
this_call_cnt = dht_frame_return(frame);
if (is_last_call(this_call_cnt)) {
dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf);
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 43746bc63b9..fda904c92c9 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -8,11 +8,8 @@
cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "byte-order.h"
-#include "dht-messages.h"
+#include <glusterfs/byte-order.h>
#include "unittest/unittest.h"
#define layout_base_size (sizeof(dht_layout_t))
@@ -134,9 +131,8 @@ dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name)
ret = dht_hash_compute(this, layout->type, name, &hash);
if (ret != 0) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMPUTE_HASH_FAILED,
- "hash computation failed for type=%d name=%s", layout->type,
- name);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMPUTE_HASH_FAILED,
+ "type=%d", layout->type, "name=%s", name, NULL);
goto out;
}
@@ -148,8 +144,8 @@ dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name)
}
if (!subvol) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
- "no subvolume for hash (value) = %u", hash);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "hash-value=0x%x", hash, NULL);
}
out:
@@ -258,7 +254,7 @@ dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout,
return dht_disk_layout_extract(this, layout, i, disk_layout_p);
}
-int
+static int
dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos,
void *disk_layout_raw, int disk_layout_len)
{
@@ -269,8 +265,8 @@ dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos,
int disk_layout[4];
if (!disk_layout_raw) {
- gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
- "error no layout on disk for merge");
+ gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ NULL);
return -1;
}
@@ -287,10 +283,8 @@ dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos,
case DHT_HASH_TYPE_DM:
break;
default:
- gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_DISK_LAYOUT,
- "Invalid disk layout: "
- "Catastrophic error layout with unknown type found %d",
- disk_layout[1]);
+ gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_DISK_LAYOUT,
+ "layout=%d", disk_layout[1], NULL);
return -1;
}
@@ -302,9 +296,10 @@ dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos,
layout->list[pos].start = start_off;
layout->list[pos].stop = stop_off;
- gf_msg_trace(
- this->name, 0, "merged to layout: %u - %u (type %d, hash %d) from %s",
- start_off, stop_off, commit_hash, type, layout->list[pos].xlator->name);
+ gf_msg_trace(this->name, 0,
+ "merged to layout: 0x%x - 0x%x (hash 0x%x, type %d) from %s",
+ start_off, stop_off, commit_hash, type,
+ layout->list[pos].xlator->name);
return 0;
}
@@ -357,8 +352,8 @@ dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
ret = dht_disk_layout_merge(this, layout, i, disk_layout_raw,
disk_layout_len);
if (ret != 0) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
- "layout merge from subvolume %s failed", subvol->name);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ "subvolume=%s", subvol->name, NULL);
goto out;
}
@@ -417,8 +412,7 @@ dht_layout_range_swap(dht_layout_t *layout, int i, int j)
layout->list[j].start = start_swap;
layout->list[j].stop = stop_swap;
}
-
-int64_t
+static int64_t
dht_layout_entry_cmp_volname(dht_layout_t *layout, int i, int j)
{
return (strcmp(layout->list[i].xlator->name, layout->list[j].xlator->name));
@@ -441,7 +435,7 @@ dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator)
return _gf_false;
}
-int64_t
+static int64_t
dht_layout_entry_cmp(dht_layout_t *layout, int i, int j)
{
int64_t diff = 0;
@@ -477,7 +471,7 @@ dht_layout_sort(dht_layout_t *layout)
return 0;
}
-int
+void
dht_layout_sort_volname(dht_layout_t *layout)
{
int i = 0;
@@ -493,11 +487,9 @@ dht_layout_sort_volname(dht_layout_t *layout)
dht_layout_entry_swap(layout, i, j);
}
}
-
- return 0;
}
-int
+void
dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
uint32_t *holes_p, uint32_t *overlaps_p,
uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
@@ -510,7 +502,6 @@ dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
uint32_t hole_cnt = 0;
uint32_t overlap_cnt = 0;
int i = 0;
- int ret = 0;
uint32_t prev_stop = 0;
uint32_t last_stop = 0;
char is_virgin = 1;
@@ -593,8 +584,6 @@ dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
if (no_space_p)
*no_space_p = no_space;
-
- return ret;
}
int
@@ -630,21 +619,15 @@ dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout)
ret = dht_layout_sort(layout);
if (ret == -1) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SORT_FAILED,
- "sort failed?! how the ....");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SORT_FAILED,
+ NULL);
goto out;
}
gf_uuid_unparse(loc->gfid, gfid);
- ret = dht_layout_anomalies(this, loc, layout, &holes, &overlaps, &missing,
- &down, &misc, NULL);
- if (ret == -1) {
- gf_msg(this->name, GF_LOG_WARNING, 0,
- DHT_MSG_FIND_LAYOUT_ANOMALIES_ERROR,
- "Error finding anomalies in %s, gfid = %s", loc->path, gfid);
- goto out;
- }
+ dht_layout_anomalies(this, loc, layout, &holes, &overlaps, &missing, &down,
+ &misc, NULL);
if (holes || overlaps) {
if (missing == layout->cnt) {
@@ -653,10 +636,9 @@ dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout)
" gfid = %s",
loc->path, gfid);
} else {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_ANOMALIES_INFO,
- "Found anomalies in %s (gfid = %s). "
- "Holes=%d overlaps=%d",
- loc->path, gfid, holes, overlaps);
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_ANOMALIES_INFO,
+ "path=%s", loc->path, "gfid=%s", gfid, "holes=%d", holes,
+ "overlaps=%d", overlaps, NULL);
}
ret = -1;
}
@@ -723,12 +705,11 @@ dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
if (!xattr) {
if (err == 0) {
if (loc) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_DICT_GET_FAILED,
- "%s: xattr dictionary is NULL", loc->path);
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL,
+ "path=%s", loc->path, NULL);
} else {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_DICT_GET_FAILED,
- "path not found: "
- "xattr dictionary is NULL");
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL,
+ "path not found", NULL);
}
ret = -1;
}
@@ -740,13 +721,13 @@ dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
if (dict_ret < 0) {
if (err == 0 && layout->list[pos].stop) {
if (loc) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
- "%s: Disk layout missing, gfid = %s", loc->path, gfid);
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
} else {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
- "path not found: "
- "Disk layout missing, gfid = %s",
- gfid);
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
+ "path not found"
+ "gfid=%s",
+ gfid, NULL);
}
ret = -1;
}
@@ -762,13 +743,13 @@ dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
if ((layout->list[pos].start != start_off) ||
(layout->list[pos].stop != stop_off) ||
(layout->list[pos].commit_hash != commit_hash)) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_INFO,
- "subvol: %s; inode layout - %" PRIu32 " - %" PRIu32 " - %" PRIu32
- "; "
- "disk layout - %" PRIu32 " - %" PRIu32 " - %" PRIu32,
- layout->list[pos].xlator->name, layout->list[pos].start,
- layout->list[pos].stop, layout->list[pos].commit_hash, start_off,
- stop_off, commit_hash);
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_INFO, "subvol=%s",
+ layout->list[pos].xlator->name, "inode-layout:start=0x%x",
+ layout->list[pos].start, "inode-layout:stop=0x%x",
+ layout->list[pos].stop, "layout-commit-hash=0x%x; ",
+ layout->list[pos].commit_hash, "disk-layout:start-off=0x%x",
+ start_off, "disk-layout:top-off=0x%x", stop_off,
+ "commit-hash=0x%x", commit_hash, NULL);
ret = 1;
} else {
ret = 0;
@@ -790,9 +771,8 @@ dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode)
layout = dht_layout_for_subvol(this, subvol);
if (!layout) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_NO_LAYOUT_INFO,
- "no pre-set layout for subvolume %s",
- subvol ? subvol->name : "<nil>");
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_NO_LAYOUT_INFO,
+ "subvolume=%s", subvol ? subvol->name : "<nil>", NULL);
ret = -1;
goto out;
}
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
index eb1695f7e05..89ec6cca56e 100644
--- a/xlators/cluster/dht/src/dht-linkfile.c
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -8,13 +8,10 @@
cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
-#include "compat.h"
+#include <glusterfs/compat.h>
#include "dht-common.h"
-#include "dht-messages.h"
-int
+static int
dht_linkfile_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
struct iatt *stbuf, dict_t *xattr,
@@ -37,17 +34,16 @@ dht_linkfile_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
if (!is_linkfile)
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR,
- "got non-linkfile %s:%s, gfid = %s", prev->name, local->loc.path,
- gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR,
+ "name=%s", prev->name, "path=%s", local->loc.path, "gfid=%s",
+ gfid, NULL);
out:
local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode,
stbuf, postparent, postparent, xattr);
return 0;
}
-#define is_equal(a, b) ((a) == (b))
-int
+static int
dht_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
struct iatt *stbuf, struct iatt *preparent,
@@ -76,9 +72,8 @@ dht_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value. key : %s",
- conf->link_xattr_name);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "mame=%s", conf->link_xattr_name, NULL);
goto out;
}
@@ -128,27 +123,23 @@ dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
ret = dict_set_gfuuid(dict, "gfid-req", local->gfid, true);
if (ret)
- gf_msg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value: "
- "key = gfid-req, gfid = %s ",
- loc->path, gfid);
+ gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
} else {
gf_uuid_unparse(loc->gfid, gfid);
}
ret = dict_set_str(dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
if (ret)
- gf_msg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value: key = %s,"
- " gfid = %s",
- loc->path, GLUSTERFS_INTERNAL_FOP_KEY, gfid);
+ gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY,
+ "gfid=%s", gfid, NULL);
ret = dict_set_str(dict, conf->link_xattr_name, tovol->name);
if (ret < 0) {
- gf_msg(frame->this->name, GF_LOG_INFO, 0, DHT_MSG_CREATE_LINK_FAILED,
- "%s: failed to initialize linkfile data, gfid = %s", loc->path,
- gfid);
+ gf_smsg(frame->this->name, GF_LOG_INFO, 0, DHT_MSG_CREATE_LINK_FAILED,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
goto out;
}
@@ -189,10 +180,9 @@ dht_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1) {
gf_uuid_unparse(local->loc.gfid, gfid);
- gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_UNLINK_FAILED,
- "Unlinking linkfile %s (gfid = %s)on "
- "subvolume %s failed ",
- local->loc.path, gfid, subvol->name);
+ gf_smsg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_UNLINK_FAILED,
+ "path=%s", local->loc.path, "gfid=%s", gfid, "subvolume=%s",
+ subvol->name, NULL);
}
DHT_STACK_DESTROY(frame);
@@ -260,7 +250,7 @@ out:
return subvol;
}
-int
+static int
dht_linkfile_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *statpre,
struct iatt *statpost, dict_t *xdata)
@@ -272,10 +262,9 @@ dht_linkfile_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
loc = &local->loc;
if (op_ret)
- gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_SETATTR_FAILED,
- "Failed to set attr uid/gid on %s"
- " :<gfid:%s> ",
- (loc->path ? loc->path : "NULL"), uuid_utoa(local->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_SETATTR_FAILED,
+ "path=%s", (loc->path ? loc->path : "NULL"), "gfid=%s",
+ uuid_utoa(local->gfid), NULL);
DHT_STACK_DESTROY(frame);
diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c
index f9bac4f97c8..638821ccee5 100644
--- a/xlators/cluster/dht/src/dht-lock.c
+++ b/xlators/cluster/dht/src/dht-lock.c
@@ -44,7 +44,8 @@ dht_log_lk_array(char *name, gf_loglevel_t log_level, dht_lock_t **lk_array,
if (!lk_buf)
goto out;
- gf_msg(name, log_level, 0, DHT_MSG_LK_ARRAY_INFO, "%d. %s", i, lk_buf);
+ gf_smsg(name, log_level, 0, DHT_MSG_LK_ARRAY_INFO, "index=%d", i,
+ "lk_buf=%s", lk_buf, NULL);
GF_FREE(lk_buf);
}
@@ -313,11 +314,9 @@ dht_unlock_entrylk_done(call_frame_t *frame, void *cookie, xlator_t *this,
gfid);
if (op_ret < 0) {
- gf_msg(this->name, GF_LOG_WARNING, op_errno,
- DHT_MSG_PARENT_LAYOUT_CHANGED,
- "unlock failed on gfid: %s, stale lock might be left "
- "in DHT_LAYOUT_HEAL_DOMAIN",
- gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_UNLOCK_GFID_FAILED, "gfid=%s", gfid,
+ "DHT_LAYOUT_HEAL_DOMAIN", NULL);
}
DHT_STACK_DESTROY(frame);
@@ -339,9 +338,10 @@ dht_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
uuid_utoa_r(local->lock[0].ns.directory_ns.locks[lk_index]->loc.gfid, gfid);
if (op_ret < 0) {
- gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED,
- "unlocking failed on %s:%s",
- local->lock[0].ns.directory_ns.locks[lk_index]->xl->name, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED,
+ "name=%s",
+ local->lock[0].ns.directory_ns.locks[lk_index]->xl->name,
+ "gfid=%s", gfid, NULL);
} else {
local->lock[0].ns.directory_ns.locks[lk_index]->locked = 0;
}
@@ -375,9 +375,9 @@ dht_unlock_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
lock_frame = dht_lock_frame(frame);
if (lock_frame == NULL) {
- gf_msg(frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
- "cannot allocate a frame, not unlocking following "
- "entrylks:");
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+ NULL);
dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
goto done;
@@ -385,9 +385,9 @@ dht_unlock_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk);
if (ret < 0) {
- gf_msg(frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
- "storing locks in local failed, not unlocking "
- "following entrylks:");
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+ NULL);
dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
@@ -446,21 +446,17 @@ dht_unlock_entrylk_wrapper(call_frame_t *frame, dht_elock_wrap_t *entrylk)
lock_frame = copy_frame(frame);
if (lock_frame == NULL) {
- gf_msg(frame->this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_PARENT_LAYOUT_CHANGED,
- "mkdir (%s/%s) (path: %s): "
- "copy frame failed",
- pgfid, local->loc.name, local->loc.path);
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
goto done;
}
lock_local = dht_local_init(lock_frame, NULL, NULL, 0);
if (lock_local == NULL) {
- gf_msg(frame->this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_PARENT_LAYOUT_CHANGED,
- "mkdir (%s/%s) (path: %s): "
- "local creation failed",
- pgfid, local->loc.name, local->loc.path);
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_CREATE_FAILED, "local", "pgfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
goto done;
}
@@ -700,9 +696,10 @@ dht_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
uuid_utoa_r(local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid,
gfid);
- gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED,
- "unlocking failed on %s:%s",
- local->lock[0].layout.my_layout.locks[lk_index]->xl->name, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED,
+ "name=%s",
+ local->lock[0].layout.my_layout.locks[lk_index]->xl->name,
+ "gfid=%s", gfid, NULL);
} else {
local->lock[0].layout.my_layout.locks[lk_index]->locked = 0;
}
@@ -727,11 +724,9 @@ dht_unlock_inodelk_done(call_frame_t *frame, void *cookie, xlator_t *this,
gfid);
if (op_ret < 0) {
- gf_msg(this->name, GF_LOG_WARNING, op_errno,
- DHT_MSG_PARENT_LAYOUT_CHANGED,
- "unlock failed on gfid: %s, stale lock might be left "
- "in DHT_LAYOUT_HEAL_DOMAIN",
- gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_UNLOCK_GFID_FAILED, "DHT_LAYOUT_HEAL_DOMAIN gfid=%s",
+ gfid, NULL);
}
DHT_STACK_DESTROY(frame);
@@ -762,9 +757,9 @@ dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
lock_frame = dht_lock_frame(frame);
if (lock_frame == NULL) {
- gf_msg(frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
- "cannot allocate a frame, not unlocking following "
- "locks:");
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+ NULL);
dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
goto done;
@@ -772,9 +767,9 @@ dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
if (ret < 0) {
- gf_msg(frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
- "storing locks in local failed, not unlocking "
- "following locks:");
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+ NULL);
dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
@@ -834,21 +829,17 @@ dht_unlock_inodelk_wrapper(call_frame_t *frame, dht_ilock_wrap_t *inodelk)
lock_frame = copy_frame(frame);
if (lock_frame == NULL) {
- gf_msg(frame->this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_PARENT_LAYOUT_CHANGED,
- "mkdir (%s/%s) (path: %s): "
- "copy frame failed",
- pgfid, local->loc.name, local->loc.path);
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
goto done;
}
lock_local = dht_local_init(lock_frame, NULL, NULL, 0);
if (lock_local == NULL) {
- gf_msg(frame->this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_PARENT_LAYOUT_CHANGED,
- "mkdir (%s/%s) (path: %s): "
- "local creation failed",
- pgfid, local->loc.name, local->loc.path);
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_CREATE_FAILED, "local", "gfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
goto done;
}
@@ -1039,13 +1030,12 @@ dht_blocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
gfid);
local->lock[0].layout.my_layout.op_ret = -1;
local->lock[0].layout.my_layout.op_errno = op_errno;
- gf_msg(this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_INODELK_FAILED,
- "inodelk failed on subvol %s. gfid:%s",
- local->lock[0]
- .layout.my_layout.locks[lk_index]
- ->xl->name,
- gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_INODELK_FAILED, "subvol=%s",
+ local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->xl->name,
+ "gfid=%s", gfid, NULL);
goto cleanup;
}
break;
@@ -1060,13 +1050,12 @@ dht_blocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
gfid);
local->lock[0].layout.my_layout.op_ret = -1;
local->lock[0].layout.my_layout.op_errno = op_errno;
- gf_msg(this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_INODELK_FAILED,
- "inodelk failed on subvol %s. gfid:%s",
- local->lock[0]
- .layout.my_layout.locks[lk_index]
- ->xl->name,
- gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_INODELK_FAILED, "subvol=%s",
+ local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->xl->name,
+ "gfid=%s", gfid, NULL);
goto cleanup;
}
break;
@@ -1077,11 +1066,11 @@ dht_blocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
gfid);
local->lock[0].layout.my_layout.op_ret = -1;
local->lock[0].layout.my_layout.op_errno = op_errno;
- gf_msg(
+ gf_smsg(
this->name, GF_LOG_ERROR, op_errno, DHT_MSG_INODELK_FAILED,
- "inodelk failed on subvol %s, gfid:%s",
+ "subvol=%s",
local->lock[0].layout.my_layout.locks[lk_index]->xl->name,
- gfid);
+ "gfid=%s", gfid, NULL);
goto cleanup;
}
}
@@ -1153,19 +1142,16 @@ dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
lock_frame = dht_lock_frame(frame);
if (lock_frame == NULL) {
gf_uuid_unparse(tmp_local->loc.gfid, gfid);
- gf_msg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCK_FRAME_FAILED,
- "memory allocation failed for lock_frame. gfid:%s"
- " path:%s",
- gfid, tmp_local->loc.path);
+ gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCK_FRAME_FAILED,
+ "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL);
goto out;
}
ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
if (ret < 0) {
gf_uuid_unparse(tmp_local->loc.gfid, gfid);
- gf_msg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCAL_LOCK_INIT_FAILED,
- "dht_local_lock_init failed, gfid: %s path:%s", gfid,
- tmp_local->loc.path);
+ gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCAL_LOCK_INIT_FAILED,
+ "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL);
goto out;
}
@@ -1246,11 +1232,10 @@ dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie,
if (ret < 0) {
local->op_ret = -1;
local->op_errno = EIO;
- gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
- DHT_MSG_ENTRYLK_ERROR,
- "%s (%s/%s): "
- "dht_blocking_entrylk failed after taking inodelk",
- gf_fop_list[local->fop], pgfid, entrylk->locks[0]->basename);
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_ENTRYLK_FAILED_AFT_INODELK, "fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "basename=%s",
+ entrylk->locks[0]->basename, NULL);
goto err;
}
@@ -1310,10 +1295,9 @@ dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
ret = dht_build_parent_loc(this, &parent, loc, &op_errno);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_LOC_FAILED,
- "gfid:%s (name:%s) (path: %s): "
- "parent loc build failed",
- loc->gfid, loc->name, loc->path);
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_LOC_FAILED,
+ "gfid=%s", loc->gfid, "name=%s", loc->name, "path=%s",
+ loc->path, NULL);
goto out;
}
gf_uuid_unparse(parent.gfid, pgfid);
@@ -1322,10 +1306,10 @@ dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
inodelk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
if (inodelk->locks == NULL) {
local->op_errno = ENOMEM;
- gf_msg(this->name, GF_LOG_WARNING, local->op_errno, DHT_MSG_NO_MEMORY,
- "%s (%s/%s) (path: %s): "
- "calloc failure",
- gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_CALLOC_FAILED, "fop=%s", gf_fop_list[local->fop],
+ "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path,
+ NULL);
goto out;
}
@@ -1334,10 +1318,10 @@ dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
FAIL_ON_ANY_ERROR);
if (inodelk->locks[0] == NULL) {
local->op_errno = ENOMEM;
- gf_msg(this->name, GF_LOG_WARNING, local->op_errno, DHT_MSG_NO_MEMORY,
- "%s (%s/%s) (path: %s): "
- "inodelk: lock allocation failed",
- gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_LOCK_ALLOC_FAILED, "inodelk-fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+ loc->name, "path=%s", loc->path, NULL);
goto err;
}
inodelk->lk_count = count;
@@ -1346,10 +1330,10 @@ dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
entrylk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
if (entrylk->locks == NULL) {
local->op_errno = ENOMEM;
- gf_msg(this->name, GF_LOG_WARNING, local->op_errno, DHT_MSG_NO_MEMORY,
- "%s (%s/%s) (path: %s): "
- "entrylk: calloc failure",
- gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_CALLOC_FAILED, "entrylk-fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+ loc->name, "path=%s", loc->path, NULL);
goto err;
}
@@ -1359,10 +1343,10 @@ dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
FAIL_ON_ANY_ERROR);
if (entrylk->locks[0] == NULL) {
local->op_errno = ENOMEM;
- gf_msg(this->name, GF_LOG_WARNING, local->op_errno, DHT_MSG_NO_MEMORY,
- "%s (%s/%s) (path: %s): "
- "entrylk: lock allocation failed",
- gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_LOCK_ALLOC_FAILED, "entrylk-fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+ loc->name, "path=%s", loc->path, NULL);
goto err;
}
@@ -1376,11 +1360,11 @@ dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
dht_blocking_entrylk_after_inodelk);
if (ret < 0) {
local->op_errno = EIO;
- gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
- DHT_MSG_INODELK_ERROR,
- "%s (%s/%s) (path: %s): "
- "dht_blocking_inodelk failed",
- gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_BLOCK_INODELK_FAILED, "fop=%s", gf_fop_list[local->fop],
+ "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path,
+ NULL);
+
goto err;
}
diff --git a/xlators/cluster/dht/src/dht-lock.h b/xlators/cluster/dht/src/dht-lock.h
index 802970adb3b..6485c03fb6e 100644
--- a/xlators/cluster/dht/src/dht-lock.h
+++ b/xlators/cluster/dht/src/dht-lock.h
@@ -11,7 +11,6 @@
#ifndef _DHT_LOCK_H
#define _DHT_LOCK_H
-#include "xlator.h"
#include "dht-common.h"
void
diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h
index 5b728f86c95..e3c4471334a 100644
--- a/xlators/cluster/dht/src/dht-mem-types.h
+++ b/xlators/cluster/dht/src/dht-mem-types.h
@@ -11,7 +11,7 @@
#ifndef __DHT_MEM_TYPES_H__
#define __DHT_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_dht_mem_types_ {
gf_dht_mt_dht_du_t = gf_common_mt_end + 1,
@@ -20,23 +20,17 @@ enum gf_dht_mem_types_ {
gf_dht_mt_int32_t,
gf_dht_mt_xlator_t,
gf_dht_mt_dht_layout_t,
- gf_switch_mt_dht_conf_t,
- gf_switch_mt_dht_du_t,
gf_switch_mt_switch_sched_array,
gf_switch_mt_switch_struct,
gf_dht_mt_subvol_time,
gf_dht_mt_loc_t,
gf_defrag_info_mt,
gf_dht_mt_inode_ctx_t,
- gf_dht_mt_ctx_stat_time_t,
gf_dht_mt_dirent_t,
gf_dht_mt_container_t,
gf_dht_mt_octx_t,
gf_dht_mt_miginfo_t,
- gf_tier_mt_bricklist_t,
- gf_tier_mt_ipc_ctr_params_t,
gf_dht_mt_fd_ctx_t,
- gf_tier_mt_qfile_array_t,
gf_dht_ret_cache_t,
gf_dht_nodeuuids_t,
gf_dht_mt_end
diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h
index 005ab57b505..601f8dad78b 100644
--- a/xlators/cluster/dht/src/dht-messages.h
+++ b/xlators/cluster/dht/src/dht-messages.h
@@ -10,7 +10,7 @@
#ifndef _DHT_MESSAGES_H_
#define _DHT_MESSAGES_H_
-#include "glfs-message-id.h"
+#include <glusterfs/glfs-message-id.h>
/* To add new message IDs, append new identifiers at the end of the list.
*
@@ -38,12 +38,11 @@ GLFS_MSGID(
DHT_MSG_REBALANCE_STATUS, DHT_MSG_REBALANCE_STOPPED, DHT_MSG_RENAME_FAILED,
DHT_MSG_SETATTR_FAILED, DHT_MSG_SUBVOL_INSUFF_INODES,
DHT_MSG_SUBVOL_INSUFF_SPACE, DHT_MSG_UNLINK_FAILED,
- DHT_MSG_LAYOUT_SET_FAILED, DHT_MSG_LOG_FIXED_LAYOUT, DHT_MSG_LOG_TIER_ERROR,
- DHT_MSG_LOG_TIER_STATUS, DHT_MSG_GET_XATTR_FAILED,
- DHT_MSG_FILE_LOOKUP_FAILED, DHT_MSG_OPEN_FD_FAILED,
- DHT_MSG_SET_INODE_CTX_FAILED, DHT_MSG_UNLOCKING_FAILED,
- DHT_MSG_DISK_LAYOUT_NULL, DHT_MSG_SUBVOL_INFO, DHT_MSG_CHUNK_SIZE_INFO,
- DHT_MSG_LAYOUT_FORM_FAILED, DHT_MSG_SUBVOL_ERROR,
+ DHT_MSG_LAYOUT_SET_FAILED, DHT_MSG_LOG_FIXED_LAYOUT,
+ DHT_MSG_GET_XATTR_FAILED, DHT_MSG_FILE_LOOKUP_FAILED,
+ DHT_MSG_OPEN_FD_FAILED, DHT_MSG_SET_INODE_CTX_FAILED,
+ DHT_MSG_UNLOCKING_FAILED, DHT_MSG_DISK_LAYOUT_NULL, DHT_MSG_SUBVOL_INFO,
+ DHT_MSG_CHUNK_SIZE_INFO, DHT_MSG_LAYOUT_FORM_FAILED, DHT_MSG_SUBVOL_ERROR,
DHT_MSG_LAYOUT_SORT_FAILED, DHT_MSG_REGEX_INFO, DHT_MSG_FOPEN_FAILED,
DHT_MSG_SET_HOSTNAME_FAILED, DHT_MSG_BRICK_ERROR, DHT_MSG_SYNCOP_FAILED,
DHT_MSG_MIGRATE_INFO, DHT_MSG_SOCKET_ERROR, DHT_MSG_CREATE_FD_FAILED,
@@ -69,8 +68,7 @@ GLFS_MSGID(
DHT_MSG_INIT_LOCAL_SUBVOL_FAILED, DHT_MSG_SYS_CALL_GET_TIME_FAILED,
DHT_MSG_NO_DISK_USAGE_STATUS, DHT_MSG_SUBVOL_DOWN_ERROR,
DHT_MSG_REBAL_THROTTLE_INFO, DHT_MSG_COMMIT_HASH_INFO,
- DHT_MSG_REBAL_STRUCT_SET, DHT_MSG_HAS_MIGINFO, DHT_MSG_LOG_IPC_TIER_ERROR,
- DHT_MSG_TIER_PAUSED, DHT_MSG_TIER_RESUME, DHT_MSG_SETTLE_HASH_FAILED,
+ DHT_MSG_REBAL_STRUCT_SET, DHT_MSG_HAS_MIGINFO, DHT_MSG_SETTLE_HASH_FAILED,
DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, DHT_MSG_FD_CTX_SET_FAILED,
DHT_MSG_STALE_LOOKUP, DHT_MSG_PARENT_LAYOUT_CHANGED,
DHT_MSG_LOCK_MIGRATION_FAILED, DHT_MSG_LOCK_INODE_UNREF_FAILED,
@@ -79,6 +77,310 @@ GLFS_MSGID(
DHT_MSG_ENTRYLK_ERROR, DHT_MSG_INODELK_ERROR, DHT_MSG_LOC_FAILED,
DHT_MSG_UNKNOWN_FOP, DHT_MSG_MIGRATE_FILE_SKIPPED,
DHT_MSG_DIR_XATTR_HEAL_FAILED, DHT_MSG_HASHED_SUBVOL_DOWN,
- DHT_MSG_NON_HASHED_SUBVOL_DOWN);
+ DHT_MSG_NON_HASHED_SUBVOL_DOWN, DHT_MSG_SYNCTASK_CREATE_FAILED,
+ DHT_MSG_DIR_HEAL_ABORT, DHT_MSG_MIGRATE_SKIP, DHT_MSG_FD_CREATE_FAILED,
+ DHT_MSG_DICT_NEW_FAILED, DHT_MSG_FAILED_TO_OPEN, DHT_MSG_CREATE_FAILED,
+ DHT_MSG_FILE_NOT_EXIST, DHT_MSG_CHOWN_FAILED, DHT_MSG_FALLOCATE_FAILED,
+ DHT_MSG_FTRUNCATE_FAILED, DHT_MSG_STATFS_FAILED, DHT_MSG_WRITE_CROSS,
+ DHT_MSG_NEW_TARGET_FOUND, DHT_MSG_INSUFF_MEMORY, DHT_MSG_SET_XATTR_FAILED,
+ DHT_MSG_SET_MODE_FAILED, DHT_MSG_FILE_EXISTS_IN_DEST,
+ DHT_MSG_SYMLINK_FAILED, DHT_MSG_LINKFILE_DEL_FAILED, DHT_MSG_MKNOD_FAILED,
+ DHT_MSG_MIGRATE_CLEANUP_FAILED, DHT_MSG_LOCK_MIGRATE,
+ DHT_MSG_PARENT_BUILD_FAILED, DHT_MSG_HASHED_SUBVOL_NOT_FOUND,
+ DHT_MSG_ACQUIRE_ENTRYLK_FAILED, DHT_MSG_CREATE_DST_FAILED,
+ DHT_MSG_MIGRATION_EXIT, DHT_MSG_CHANGED_DST, DHT_MSG_TRACE_FAILED,
+ DHT_MSG_WRITE_LOCK_FAILED, DHT_MSG_GETACTIVELK_FAILED, DHT_MSG_STAT_FAILED,
+ DHT_MSG_UNLINK_PERFORM_FAILED, DHT_MSG_CLANUP_SOURCE_FILE_FAILED,
+ DHT_MSG_UNLOCK_FILE_FAILED, DHT_MSG_REMOVE_XATTR_FAILED,
+ DHT_MSG_DATA_MIGRATE_ABORT, DHT_MSG_DEFRAG_NULL, DHT_MSG_PARENT_NULL,
+ DHT_MSG_GFID_NOT_PRESENT, DHT_MSG_CHILD_LOC_FAILED,
+ DHT_MSG_SET_LOOKUP_FAILED, DHT_MSG_DIR_REMOVED, DHT_MSG_FIX_NOT_COMP,
+ DHT_MSG_SUBVOL_DETER_FAILED, DHT_MSG_LOCAL_SUBVOL, DHT_MSG_NODE_UUID,
+ DHT_MSG_SIZE_FILE, DHT_MSG_GET_DATA_SIZE_FAILED,
+ DHT_MSG_PTHREAD_JOIN_FAILED, DHT_MSG_COUNTER_THREAD_CREATE_FAILED,
+ DHT_MSG_MIGRATION_INIT_QUEUE_FAILED, DHT_MSG_PAUSED_TIMEOUT, DHT_MSG_WOKE,
+ DHT_MSG_ABORT_REBALANCE, DHT_MSG_CREATE_TASK_REBAL_FAILED,
+ DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL, DHT_MSG_ADD_CHOICES_ERROR,
+ DHT_MSG_GET_CHOICES_ERROR, DHT_MSG_PREPARE_STATUS_ERROR,
+ DHT_MSG_SET_CHOICE_FAILED, DHT_MSG_SET_HASHED_SUBVOL_FAILED,
+ DHT_MSG_XATTR_HEAL_NOT_POSS, DHT_MSG_LINKTO_FILE_FAILED,
+ DHT_MSG_STALE_LINKFILE_DELETE, DHT_MSG_NO_SUBVOL_FOR_LINKTO,
+ DHT_MSG_SUBVOL_RETURNED, DHT_MSG_UNKNOWN_LOCAL_XSEL, DHT_MSG_GET_XATTR_ERR,
+ DHT_MSG_ALLOC_OR_FILL_FAILED, DHT_MSG_GET_REAL_NAME_FAILED,
+ DHT_MSG_COPY_UUID_FAILED, DHT_MSG_MDS_DETER_FAILED,
+ DHT_MSG_CREATE_REBAL_FAILED, DHT_MSG_LINK_LAYOUT_FAILED,
+ DHT_MSG_NO_SUBVOL_IN_LAYOUT, DHT_MSG_MEM_ALLOC_FAILED,
+ DHT_MSG_SET_IN_PARAMS_DICT_FAILED, DHT_MSG_LOC_COPY_FAILED,
+ DHT_MSG_PARENT_LOC_FAILED, DHT_MSG_CREATE_LOCK_FAILED,
+ DHT_MSG_PREV_ATTEMPT_FAILED, DHT_MSG_REFRESH_ATTEMPT,
+ DHT_MSG_ACQUIRE_LOCK_FAILED, DHT_MSG_CREATE_STUB_FAILED,
+ DHT_MSG_WIND_LOCK_REQ_FAILED, DHT_MSG_REFRESH_FAILED,
+ DHT_MSG_CACHED_SUBVOL_ERROR, DHT_MSG_NO_LINK_SUBVOL, DHT_MSG_SET_KEY_FAILED,
+ DHT_MSG_REMOVE_LINKTO_FAILED, DHT_MSG_LAYOUT_DICT_SET_FAILED,
+ DHT_MSG_XATTR_DICT_NULL, DHT_MSG_DUMMY_ALLOC_FAILED, DHT_MSG_DICT_IS_NULL,
+ DHT_MSG_LINK_INODE_FAILED, DHT_MSG_SELFHEAL_FAILED, DHT_MSG_NO_MDS_SUBVOL,
+ DHT_MSG_LIST_XATTRS_FAILED, DHT_MSG_RESET_INTER_XATTR_FAILED,
+ DHT_MSG_MDS_DOWN_UNABLE_TO_SET, DHT_MSG_WIND_UNLOCK_FAILED,
+ DHT_MSG_COMMIT_HASH_FAILED, DHT_MSG_UNLOCK_GFID_FAILED,
+ DHT_MSG_UNLOCK_FOLLOW_ENTRYLK, DHT_MSG_COPY_FRAME_FAILED,
+ DHT_MSG_UNLOCK_FOLLOW_LOCKS, DHT_MSG_ENTRYLK_FAILED_AFT_INODELK,
+ DHT_MSG_CALLOC_FAILED, DHT_MSG_LOCK_ALLOC_FAILED,
+ DHT_MSG_BLOCK_INODELK_FAILED,
+ DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+ DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+ DHT_MSG_DST_NULL_SET_FAILED);
+
+#define DHT_MSG_FD_CTX_SET_FAILED_STR "Failed to set fd ctx"
+#define DHT_MSG_INVALID_VALUE_STR "Different dst found in the fd ctx"
+#define DHT_MSG_UNKNOWN_FOP_STR "Unknown FOP on file"
+#define DHT_MSG_OPEN_FD_ON_DST_FAILED_STR "Failed to open the fd on file"
+#define DHT_MSG_SYNCTASK_CREATE_FAILED_STR "Failed to create synctask"
+#define DHT_MSG_ASPRINTF_FAILED_STR \
+ "asprintf failed while fetching subvol from the id"
+#define DHT_MSG_HAS_MIGINFO_STR "Found miginfo in the inode ctx"
+#define DHT_MSG_FILE_LOOKUP_FAILED_STR "failed to lookup the file"
+#define DHT_MSG_INVALID_LINKFILE_STR \
+ "linkto target is different from cached-subvol. treating as destination " \
+ "subvol"
+#define DHT_MSG_GFID_MISMATCH_STR "gfid different on the target file"
+#define DHT_MSG_GET_XATTR_FAILED_STR "failed to get 'linkto' xattr"
+#define DHT_MSG_SET_INODE_CTX_FAILED_STR "failed to set inode-ctx target file"
+#define DHT_MSG_DIR_SELFHEAL_FAILED_STR "Healing of path failed"
+#define DHT_MSG_DIR_HEAL_ABORT_STR \
+ "Failed to get path from subvol. Aborting directory healing"
+#define DHT_MSG_DIR_XATTR_HEAL_FAILED_STR "xattr heal failed for directory"
+#define DHT_MSG_LOCK_INODE_UNREF_FAILED_STR \
+ "Found a NULL inode. Failed to unref the inode"
+#define DHT_MSG_DICT_SET_FAILED_STR "Failed to set dictionary value"
+#define DHT_MSG_NOT_LINK_FILE_ERROR_STR "got non-linkfile"
+#define DHT_MSG_CREATE_LINK_FAILED_STR "failed to initialize linkfile data"
+#define DHT_MSG_UNLINK_FAILED_STR "Unlinking linkfile on subvolume failed"
+#define DHT_MSG_MIGRATE_FILE_FAILED_STR "Migrate file failed"
+#define DHT_MSG_NO_MEMORY_STR "could not allocate memory for dict"
+#define DHT_MSG_SUBVOL_ERROR_STR "Failed to get linkto subvol"
+#define DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED_STR "link failed on subvol"
+#define DHT_MSG_MIGRATE_FILE_SKIPPED_STR "Migration skipped"
+#define DHT_MSG_FD_CREATE_FAILED_STR "fd create failed"
+#define DHT_MSG_DICT_NEW_FAILED_STR "dict_new failed"
+#define DHT_MSG_FAILED_TO_OPEN_STR "failed to open"
+#define DHT_MSG_CREATE_FAILED_STR "failed to create"
+#define DHT_MSG_FILE_NOT_EXIST_STR "file does not exist"
+#define DHT_MSG_CHOWN_FAILED_STR "chown failed"
+#define DHT_MSG_FALLOCATE_FAILED_STR "fallocate failed"
+#define DHT_MSG_FTRUNCATE_FAILED_STR "ftruncate failed"
+#define DHT_MSG_STATFS_FAILED_STR "failed to get statfs"
+#define DHT_MSG_WRITE_CROSS_STR \
+ "write will cross min-fre-disk for file on subvol. looking for new subvol"
+#define DHT_MSG_SUBVOL_INSUFF_SPACE_STR \
+ "Could not find any subvol with space accommodating the file. Cosider " \
+ "adding bricks"
+#define DHT_MSG_NEW_TARGET_FOUND_STR "New target found for file"
+#define DHT_MSG_INSUFF_MEMORY_STR "insufficient memory"
+#define DHT_MSG_SET_XATTR_FAILED_STR "failed to set xattr"
+#define DHT_MSG_SET_MODE_FAILED_STR "failed to set mode"
+#define DHT_MSG_FILE_EXISTS_IN_DEST_STR "file exists in destination"
+#define DHT_MSG_LINKFILE_DEL_FAILED_STR "failed to delete the linkfile"
+#define DHT_MSG_SYMLINK_FAILED_STR "symlink failed"
+#define DHT_MSG_MKNOD_FAILED_STR "mknod failed"
+#define DHT_MSG_SETATTR_FAILED_STR "failed to perform setattr"
+#define DHT_MSG_MIGRATE_CLEANUP_FAILED_STR \
+ "Migrate file cleanup failed: failed to fstat file"
+#define DHT_MSG_LOCK_MIGRATE_STR "locks will be migrated for file"
+#define DHT_MSG_PARENT_BUILD_FAILED_STR \
+ "failed to build parent loc, which is needed to acquire entrylk to " \
+ "synchronize with renames on this path. Skipping migration"
+#define DHT_MSG_HASHED_SUBVOL_NOT_FOUND_STR \
+ "cannot find hashed subvol which is needed to synchronize with renames " \
+ "on this path. Skipping migration"
+#define DHT_MSG_ACQUIRE_ENTRYLK_FAILED_STR "failed to acquire entrylk on subvol"
+#define DHT_MSG_CREATE_DST_FAILED_STR "create dst failed for file"
+#define DHT_MSG_MIGRATION_EXIT_STR "Exiting migration"
+#define DHT_MSG_CHANGED_DST_STR "destination changed fo file"
+#define DHT_MSG_TRACE_FAILED_STR "Trace failed"
+#define DHT_MSG_WRITE_LOCK_FAILED_STR "write lock failed"
+#define DHT_MSG_GETACTIVELK_FAILED_STR "getactivelk failed for file"
+#define DHT_MSG_STAT_FAILED_STR "failed to do a stat"
+#define DHT_MSG_UNLINK_PERFORM_FAILED_STR "failed to perform unlink"
+#define DHT_MSG_MIGRATE_FILE_COMPLETE_STR "completed migration"
+#define DHT_MSG_CLANUP_SOURCE_FILE_FAILED_STR "failed to cleanup source file"
+#define DHT_MSG_UNLOCK_FILE_FAILED_STR "failed to unlock file"
+#define DHT_MSG_REMOVE_XATTR_FAILED_STR "remove xattr failed"
+#define DHT_MSG_SOCKET_ERROR_STR "Failed to unlink listener socket"
+#define DHT_MSG_HASHED_SUBVOL_GET_FAILED_STR "Failed to get hashed subvolume"
+#define DHT_MSG_CACHED_SUBVOL_GET_FAILED_STR "Failed to get cached subvolume"
+#define DHT_MSG_MIGRATE_DATA_FAILED_STR "migrate-data failed"
+#define DHT_MSG_DEFRAG_NULL_STR "defrag is NULL"
+#define DHT_MSG_DATA_MIGRATE_ABORT_STR \
+ "Readdirp failed. Aborting data migration for dict"
+#define DHT_MSG_LAYOUT_FIX_FAILED_STR "fix layout failed"
+#define DHT_MSG_PARENT_NULL_STR "parent is NULL"
+#define DHT_MSG_GFID_NOT_PRESENT_STR "gfid not present"
+#define DHT_MSG_CHILD_LOC_FAILED_STR "Child loc build failed"
+#define DHT_MSG_SET_LOOKUP_FAILED_STR "Failed to set lookup"
+#define DHT_MSG_DIR_LOOKUP_FAILED_STR "lookup failed"
+#define DHT_MSG_DIR_REMOVED_STR "Dir renamed or removed. Skipping"
+#define DHT_MSG_READDIR_ERROR_STR "readdir failed, Aborting fix-layout"
+#define DHT_MSG_SETTLE_HASH_FAILED_STR "Settle hash failed"
+#define DHT_MSG_DEFRAG_PROCESS_DIR_FAILED_STR "gf_defrag_process_dir failed"
+#define DHT_MSG_FIX_NOT_COMP_STR \
+ "Unable to retrieve fixlayout xattr. Assume background fix layout not " \
+ "complete"
+#define DHT_MSG_SUBVOL_DETER_FAILED_STR \
+ "local subvolume determination failed with error"
+#define DHT_MSG_LOCAL_SUBVOL_STR "local subvol"
+#define DHT_MSG_NODE_UUID_STR "node uuid"
+#define DHT_MSG_SIZE_FILE_STR "Total size files"
+#define DHT_MSG_GET_DATA_SIZE_FAILED_STR \
+ "Failed to get the total data size. Unable to estimate time to complete " \
+ "rebalance"
+#define DHT_MSG_PTHREAD_JOIN_FAILED_STR \
+ "file_counter_thread: pthread_join failed"
+#define DHT_MSG_COUNTER_THREAD_CREATE_FAILED_STR \
+ "Failed to create the file counter thread"
+#define DHT_MSG_MIGRATION_INIT_QUEUE_FAILED_STR \
+ "Failed to initialise migration queue"
+#define DHT_MSG_REBALANCE_STOPPED_STR "Received stop command on rebalance"
+#define DHT_MSG_PAUSED_TIMEOUT_STR "Request pause timer timeout"
+#define DHT_MSG_WOKE_STR "woken"
+#define DHT_MSG_ABORT_REBALANCE_STR "Aborting rebalance"
+#define DHT_MSG_REBALANCE_START_FAILED_STR \
+ "Failed to start rebalance: look up on / failed"
+#define DHT_MSG_CREATE_TASK_REBAL_FAILED_STR \
+ "Could not create task for rebalance"
+#define DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL_STR \
+ "Rebalance estimates will not be available"
+#define DHT_MSG_REBALANCE_STATUS_STR "Rebalance status"
+#define DHT_MSG_DATA_NULL_STR "data value is NULL"
+#define DHT_MSG_ADD_CHOICES_ERROR_STR "Error to add choices in buffer"
+#define DHT_MSG_GET_CHOICES_ERROR_STR "Error to get choices"
+#define DHT_MSG_PREPARE_STATUS_ERROR_STR "Error to prepare status"
+#define DHT_MSG_SET_CHOICE_FAILED_STR "Failed to set full choice"
+#define DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED_STR \
+ "Failed to aggregate quota xattr"
+#define DHT_MSG_FILE_TYPE_MISMATCH_STR \
+ "path exists as a file on one subvolume and directory on another. Please " \
+ "fix it manually"
+#define DHT_MSG_LAYOUT_SET_FAILED_STR "failed to set layout for subvolume"
+#define DHT_MSG_LAYOUT_MERGE_FAILED_STR "failed to merge layouts for subvolume"
+#define DHT_MSG_SET_HASHED_SUBVOL_FAILED_STR "Failed to set hashed subvolume"
+#define DHT_MSG_XATTR_HEAL_NOT_POSS_STR \
+ "No gfid exists for path. so healing xattr is not possible"
+#define DHT_MSG_REVALIDATE_CBK_INFO_STR "Revalidate: subvolume returned -1"
+#define DHT_MSG_LAYOUT_MISMATCH_STR "Mismatching layouts"
+#define DHT_MSG_UNLINK_LOOKUP_INFO_STR "lookup_unlink retuened"
+#define DHT_MSG_LINKTO_FILE_FAILED_STR \
+ "Could not unlink the linkto file as either fd is open and/or linkto " \
+ "xattr is set"
+#define DHT_MSG_LAYOUT_PRESET_FAILED_STR \
+ "Could not set pre-set layout for subvolume"
+#define DHT_MSG_FILE_ON_MULT_SUBVOL_STR \
+ "multiple subvolumes have file (preferably rename the file in the " \
+ "backend, and do a fresh lookup"
+#define DHT_MSG_STALE_LINKFILE_DELETE_STR \
+ "attempting deletion of stale linkfile"
+#define DHT_MSG_LINK_FILE_LOOKUP_INFO_STR "Lookup on following linkfile"
+#define DHT_MSG_NO_SUBVOL_FOR_LINKTO_STR "No link subvolume for linkto"
+#define DHT_MSG_SUBVOL_RETURNED_STR "Subvolume returned -1"
+#define DHT_MSG_UNKNOWN_LOCAL_XSEL_STR "Unknown local->xsel"
+#define DHT_MSG_DICT_GET_FAILED_STR "Failed to get"
+#define DHT_MSG_UUID_PARSE_ERROR_STR "Failed to parse uuid"
+#define DHT_MSG_GET_XATTR_ERR_STR "getxattr err for dir"
+#define DHT_MSG_ALLOC_OR_FILL_FAILED_STR "alloc or fill failed"
+#define DHT_MSG_UPGRADE_BRICKS_STR \
+ "At least one of the bricks does not support this operation. Please " \
+ "upgrade all bricks"
+#define DHT_MSG_GET_REAL_NAME_FAILED_STR "Failed to get real filename"
+#define DHT_MSG_LAYOUT_NULL_STR "Layout is NULL"
+#define DHT_MSG_COPY_UUID_FAILED_STR "Failed to copy node uuid key"
+#define DHT_MSG_MDS_DETER_FAILED_STR \
+ "Cannot determine MDS, fetching xattr randomly from a subvol"
+#define DHT_MSG_HASHED_SUBVOL_DOWN_STR \
+ "MDS is down for path, so fetching xattr randomly from subvol"
+#define DHT_MSG_CREATE_REBAL_FAILED_STR \
+ "failed to create a new rebalance synctask"
+#define DHT_MSG_FIX_LAYOUT_INFO_STR "fixing the layout"
+#define DHT_MSG_OPERATION_NOT_SUP_STR "wrong directory-spread-count value"
+#define DHT_MSG_LINK_LAYOUT_FAILED_STR "failed to link the layout in inode"
+#define DHT_MSG_NO_SUBVOL_IN_LAYOUT_STR "no subvolume in layout for path"
+#define DHT_MSG_INODE_LK_ERROR_STR "mknod lock failed for file"
+#define DHT_MSG_MEM_ALLOC_FAILED_STR "mem allocation failed"
+#define DHT_MSG_PARENT_LAYOUT_CHANGED_STR \
+ "extracting in-memory layout of parent failed"
+#define DHT_MSG_SET_IN_PARAMS_DICT_FAILED_STR \
+ "setting in params dictionary failed"
+#define DHT_MSG_LOC_COPY_FAILED_STR "loc_copy failed"
+#define DHT_MSG_LOC_FAILED_STR "parent loc build failed"
+#define DHT_MSG_PARENT_LOC_FAILED_STR "locking parent failed"
+#define DHT_MSG_CREATE_LOCK_FAILED_STR "Create lock failed"
+#define DHT_MSG_PREV_ATTEMPT_FAILED_STR \
+ "mkdir loop detected. parent layout didn't change even though previous " \
+ "attempt of mkdir failed because of in-memory layout not matching with " \
+ "that on disk."
+#define DHT_MSG_REFRESH_ATTEMPT_STR \
+ "mkdir parent layout changed. Attempting a refresh and then a retry"
+#define DHT_MSG_ACQUIRE_LOCK_FAILED_STR \
+ "Acquiring lock on parent to guard against layout-change failed"
+#define DHT_MSG_CREATE_STUB_FAILED_STR "creating stub failed"
+#define DHT_MSG_WIND_LOCK_REQ_FAILED_STR \
+ "cannot wind lock request to guard parent layout"
+#define DHT_MSG_REFRESH_FAILED_STR "refreshing parent layout failed."
+#define DHT_MSG_CACHED_SUBVOL_ERROR_STR "On cached subvol"
+#define DHT_MSG_NO_LINK_SUBVOL_STR "Linkfile does not have link subvolume"
+#define DHT_MSG_SET_KEY_FAILED_STR "failed to set key"
+#define DHT_MSG_CHILD_DOWN_STR "Received CHILD_DOWN. Exiting"
+#define DHT_MSG_LOG_FIXED_LAYOUT_STR "log layout fixed"
+#define DHT_MSG_REBAL_STRUCT_SET_STR "local->rebalance already set"
+#define DHT_MSG_REMOVE_LINKTO_FAILED_STR "Removal of linkto failed at subvol"
+#define DHT_MSG_LAYOUT_DICT_SET_FAILED_STR "dht layout dict set failed"
+#define DHT_MSG_SUBVOL_INFO_STR "creating subvolume"
+#define DHT_MSG_COMPUTE_HASH_FAILED_STR "hash computation failed"
+#define DHT_MSG_INVALID_DISK_LAYOUT_STR \
+ "Invalid disk layout: Catastrophic error layout with unknown type found"
+#define DHT_MSG_LAYOUT_SORT_FAILED_STR "layout sort failed"
+#define DHT_MSG_ANOMALIES_INFO_STR "Found anomalies"
+#define DHT_MSG_XATTR_DICT_NULL_STR "xattr dictionary is NULL"
+#define DHT_MSG_DISK_LAYOUT_MISSING_STR "Disk layout missing"
+#define DHT_MSG_LAYOUT_INFO_STR "layout info"
+#define DHT_MSG_SUBVOL_NO_LAYOUT_INFO_STR "no pre-set layout for subvol"
+#define DHT_MSG_SELFHEAL_XATTR_FAILED_STR "layout setxattr failed"
+#define DHT_MSG_DIR_SELFHEAL_XATTR_FAILED_STR "Directory self heal xattr failed"
+#define DHT_MSG_DUMMY_ALLOC_FAILED_STR "failed to allocate dummy layout"
+#define DHT_MSG_DICT_IS_NULL_STR \
+ "dict is NULL, need to make sure gfids are same"
+#define DHT_MSG_ENTRYLK_ERROR_STR "acquiring entrylk after inodelk failed"
+#define DHT_MSG_NO_DISK_USAGE_STATUS_STR "no du stats"
+#define DHT_MSG_LINK_INODE_FAILED_STR "linking inode failed"
+#define DHT_MSG_SELFHEAL_FAILED_STR "Directory selfheal failed"
+#define DHT_MSG_NO_MDS_SUBVOL_STR "No mds subvol"
+#define DHT_MSG_LIST_XATTRS_FAILED_STR "failed to list xattrs"
+#define DHT_MSG_RESET_INTER_XATTR_FAILED_STR "Failed to reset internal xattr"
+#define DHT_MSG_MDS_DOWN_UNABLE_TO_SET_STR \
+ "mds subvol is down, unable to set xattr"
+#define DHT_MSG_DIR_ATTR_HEAL_FAILED_STR \
+ "Directory attr heal failed. Failed to set uid/gid"
+#define DHT_MSG_WIND_UNLOCK_FAILED_STR \
+ "Winding unlock failed: stale locks left on brick"
+#define DHT_MSG_COMMIT_HASH_FAILED_STR "Directory commit hash updaten failed"
+#define DHT_MSG_LK_ARRAY_INFO_STR "lk info"
+#define DHT_MSG_UNLOCK_GFID_FAILED_STR \
+ "unlock failed on gfid: stale lock might be left"
+#define DHT_MSG_UNLOCKING_FAILED_STR "unlocking failed"
+#define DHT_MSG_UNLOCK_FOLLOW_ENTRYLK_STR "not unlocking following entrylks"
+#define DHT_MSG_COPY_FRAME_FAILED_STR "copy frame failed"
+#define DHT_MSG_UNLOCK_FOLLOW_LOCKS_STR "not unlocking following locks"
+#define DHT_MSG_INODELK_FAILED_STR "inodelk failed on subvol"
+#define DHT_MSG_LOCK_FRAME_FAILED_STR "memory allocation failed for lock_frame"
+#define DHT_MSG_LOCAL_LOCK_INIT_FAILED_STR "dht_local_lock_init failed"
+#define DHT_MSG_ENTRYLK_FAILED_AFT_INODELK_STR \
+ "dht_blocking_entrylk failed after taking inodelk"
+#define DHT_MSG_BLOCK_INODELK_FAILED_STR "dht_blocking_inodelk failed"
+#define DHT_MSG_CALLOC_FAILED_STR "calloc failed"
+#define DHT_MSG_LOCK_ALLOC_FAILED_STR "lock allocation failed"
+#define DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS_STR \
+ "cannot allocate a frame, not unlocking following entrylks"
+#define DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK_STR \
+ "storing locks in local failed, not unlocking following entrylks"
+#define DHT_MSG_DST_NULL_SET_FAILED_STR \
+ "src or dst is NULL, Failed to set dictionary value"
#endif /* _DHT_MESSAGES_H_ */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index a00200f0ca3..8ba8082bd86 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -8,18 +8,16 @@
cases as published by the Free Software Foundation.
*/
-#include "tier.h"
#include "dht-common.h"
-#include "xlator.h"
-#include "syscall.h"
-#include <signal.h>
+#include <glusterfs/syscall.h>
#include <fnmatch.h>
#include <signal.h>
-#include "events.h"
+#include <glusterfs/events.h>
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
#define GF_DISK_SECTOR_SIZE 512
-#define DHT_REBALANCE_PID 4242 /* Change it if required */
-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
+#define DHT_REBALANCE_PID 4242 /* Change it if required */
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
#define MAX_MIGRATE_QUEUE_COUNT 500
#define MIN_MIGRATE_QUEUE_COUNT 200
#define MAX_REBAL_TYPE_SIZE 16
@@ -47,7 +45,10 @@ gf_defrag_free_dir_dfmeta(struct dir_dfmeta *meta, int local_subvols_cnt)
if (meta) {
for (i = 0; i < local_subvols_cnt; i++) {
- gf_dirent_free(&meta->equeue[i]);
+ if (meta->equeue)
+ gf_dirent_free(&meta->equeue[i]);
+ if (meta->lfd && meta->lfd[i])
+ fd_unref(meta->lfd[i]);
}
GF_FREE(meta->equeue);
@@ -55,6 +56,7 @@ gf_defrag_free_dir_dfmeta(struct dir_dfmeta *meta, int local_subvols_cnt)
GF_FREE(meta->iterator);
GF_FREE(meta->offset_var);
GF_FREE(meta->fetch_entries);
+ GF_FREE(meta->lfd);
GF_FREE(meta);
}
}
@@ -86,26 +88,6 @@ dht_set_global_defrag_error(gf_defrag_info_t *defrag, int ret)
return;
}
-static gf_boolean_t
-dht_is_tier_command(int cmd)
-{
- gf_boolean_t is_tier = _gf_false;
-
- switch (cmd) {
- case GF_DEFRAG_CMD_START_TIER:
- case GF_DEFRAG_CMD_STATUS_TIER:
- case GF_DEFRAG_CMD_START_DETACH_TIER:
- case GF_DEFRAG_CMD_STOP_DETACH_TIER:
- case GF_DEFRAG_CMD_PAUSE_TIER:
- case GF_DEFRAG_CMD_RESUME_TIER:
- is_tier = _gf_true;
- break;
- default:
- break;
- }
- return is_tier;
-}
-
static int
dht_send_rebalance_event(xlator_t *this, int cmd, gf_defrag_status_t status)
{
@@ -114,8 +96,6 @@ dht_send_rebalance_event(xlator_t *this, int cmd, gf_defrag_status_t status)
char *tmpstr = NULL;
char *ptr = NULL;
char *suffix = "-dht";
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
int len = 0;
eventtypes_t event = EVENT_LAST;
@@ -134,21 +114,14 @@ dht_send_rebalance_event(xlator_t *this, int cmd, gf_defrag_status_t status)
break;
}
- if (dht_is_tier_command(cmd)) {
- /* We should have the tier volume name*/
- conf = this->private;
- defrag = conf->defrag;
- volname = defrag->tier_conf.volname;
- } else {
- /* DHT volume */
- len = strlen(this->name) - strlen(suffix);
- tmpstr = gf_strdup(this->name);
- if (tmpstr) {
- ptr = tmpstr + len;
- if (!strcmp(ptr, suffix)) {
- tmpstr[len] = '\0';
- volname = tmpstr;
- }
+ /* DHT volume */
+ len = strlen(this->name) - strlen(suffix);
+ tmpstr = gf_strdup(this->name);
+ if (tmpstr) {
+ ptr = tmpstr + len;
+ if (!strcmp(ptr, suffix)) {
+ tmpstr[len] = '\0';
+ volname = tmpstr;
}
}
@@ -174,75 +147,6 @@ dht_strip_out_acls(dict_t *dict)
}
}
-static int
-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
- int32_t size, off_t offset, struct iobref *iobref,
- int *fop_errno)
-{
- int i = 0;
- int ret = -1;
- int start_idx = 0;
- int tmp_offset = 0;
- int write_needed = 0;
- int buf_len = 0;
- int size_pending = 0;
- char *buf = NULL;
-
- /* loop through each vector */
- for (i = 0; i < count; i++) {
- buf = vec[i].iov_base;
- buf_len = vec[i].iov_len;
-
- for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
- start_idx += GF_DISK_SECTOR_SIZE) {
- if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
- write_needed = 1;
- continue;
- }
-
- if (write_needed) {
- ret = syncop_write(
- to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
- (offset + tmp_offset), iobref, 0, NULL, NULL);
- /* 'path' will be logged in calling function */
- if (ret < 0) {
- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
- strerror(-ret));
- *fop_errno = -ret;
- ret = -1;
- goto out;
- }
-
- write_needed = 0;
- }
- tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
- }
-
- if ((start_idx < buf_len) || write_needed) {
- /* This means, last chunk is not yet written.. write it */
- ret = syncop_write(to, fd, (buf + tmp_offset),
- (buf_len - tmp_offset), (offset + tmp_offset),
- iobref, 0, NULL, NULL);
- if (ret < 0) {
- /* 'path' will be logged in calling function */
- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
- strerror(-ret));
- *fop_errno = -ret;
- ret = -1;
- goto out;
- }
- }
-
- size_pending = (size - buf_len);
- if (!size_pending)
- break;
- }
-
- ret = size;
-out:
- return ret;
-}
-
/*
return values:
-1 : failure
@@ -650,7 +554,7 @@ out:
static int
__dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from,
loc_t *loc, struct iatt *stbuf, fd_t **dst_fd,
- int *fop_errno)
+ int *fop_errno, int file_has_holes)
{
int ret = -1;
int ret2 = -1;
@@ -705,26 +609,23 @@ __dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from,
goto out;
}
- if (!!dht_is_tier_xlator(this)) {
- xdata = dict_new();
- if (!xdata) {
- *fop_errno = ENOMEM;
- ret = -1;
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_MIGRATE_FILE_FAILED, "%s: dict_new failed)",
- loc->path);
- goto out;
- }
+ xdata = dict_new();
+ if (!xdata) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: dict_new failed)", loc->path);
+ goto out;
+ }
- ret = dict_set_int32(xdata, GF_CLEAN_WRITE_PROTECTION, 1);
- if (ret) {
- *fop_errno = ENOMEM;
- ret = -1;
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
- "%s: failed to set dictionary value: key = %s ", loc->path,
- GF_CLEAN_WRITE_PROTECTION);
- goto out;
- }
+ ret = dict_set_int32_sizen(xdata, GF_CLEAN_WRITE_PROTECTION, 1);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: failed to set dictionary value: key = %s ", loc->path,
+ GF_CLEAN_WRITE_PROTECTION);
+ goto out;
}
ret = syncop_lookup(to, loc, &new_stbuf, NULL, xdata, NULL);
@@ -819,7 +720,7 @@ __dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from,
/* No need to bother about 0 byte size files */
if (stbuf->ia_size > 0) {
- if (conf->use_fallocate) {
+ if (conf->use_fallocate && !file_has_holes) {
ret = syncop_fallocate(to, fd, 0, 0, stbuf->ia_size, NULL, NULL);
if (ret < 0) {
if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -ENOSYS) {
@@ -835,7 +736,7 @@ __dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from,
/* fallocate does not release the space
* in some cases
*/
- ret2 = syncop_ftruncate(to, fd, 0, NULL, NULL);
+ ret2 = syncop_ftruncate(to, fd, 0, NULL, NULL, NULL, NULL);
if (ret2 < 0) {
gf_msg(this->name, GF_LOG_WARNING, -ret2,
DHT_MSG_MIGRATE_FILE_FAILED,
@@ -846,10 +747,9 @@ __dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from,
goto out;
}
}
- }
-
- if (!conf->use_fallocate) {
- ret = syncop_ftruncate(to, fd, stbuf->ia_size, NULL, NULL);
+ } else {
+ ret = syncop_ftruncate(to, fd, stbuf->ia_size, NULL, NULL, NULL,
+ NULL);
if (ret < 0) {
*fop_errno = -ret;
gf_msg(this->name, GF_LOG_WARNING, -ret,
@@ -875,7 +775,7 @@ out:
dict_unref(dict);
if (xdata)
- dict_unref(dict);
+ dict_unref(xdata);
return ret;
}
@@ -940,8 +840,8 @@ __dht_check_free_space(xlator_t *this, xlator_t *to, xlator_t *from, loc_t *loc,
}
gf_msg_debug(this->name, 0,
- "min_free_disk - %f , block available - "
- "%lu , block size - %lu ",
+ "min_free_disk - %f , block available - %" PRId64
+ ", block size - %lu",
conf->min_free_disk, dst_statfs.f_bavail, dst_statfs.f_bsize);
dst_statfs_blocks = dst_statfs.f_bavail *
@@ -971,43 +871,40 @@ __dht_check_free_space(xlator_t *this, xlator_t *to, xlator_t *from, loc_t *loc,
prevent any files being migrated to newly added bricks if they are
smaller then the free space available on the existing bricks.
*/
- if (stbuf) {
- if (!conf->use_fallocate) {
- file_blocks = stbuf->ia_size + GF_DISK_SECTOR_SIZE - 1;
- file_blocks /= GF_DISK_SECTOR_SIZE;
+ if (!conf->use_fallocate) {
+ file_blocks = stbuf->ia_size + GF_DISK_SECTOR_SIZE - 1;
+ file_blocks /= GF_DISK_SECTOR_SIZE;
- if (file_blocks >= dst_statfs_blocks) {
- dst_statfs_blocks = 0;
- } else {
- dst_statfs_blocks -= file_blocks;
- }
+ if (file_blocks >= dst_statfs_blocks) {
+ dst_statfs_blocks = 0;
+ } else {
+ dst_statfs_blocks -= file_blocks;
}
+ }
- src_post_availspacepercent = ((src_statfs_blocks + file_blocks) * 100) /
- src_total_blocks;
+ src_post_availspacepercent = ((src_statfs_blocks + file_blocks) * 100) /
+ src_total_blocks;
- dst_post_availspacepercent = (dst_statfs_blocks * 100) /
- dst_total_blocks;
+ dst_post_availspacepercent = (dst_statfs_blocks * 100) / dst_total_blocks;
- if (dst_post_availspacepercent < src_post_availspacepercent) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
- "data movement of file "
- "{blocks:%" PRIu64
- " name:(%s)} would result in "
- "dst node (%s:%" PRIu64
- ") having lower disk "
- "space than the source node (%s:%" PRIu64
- ")"
- ".Skipping file.",
- stbuf->ia_blocks, loc->path, to->name, dst_statfs_blocks,
- from->name, src_statfs_blocks);
-
- /* this is not a 'failure', but we don't want to
- consider this as 'success' too :-/ */
- *fop_errno = ENOSPC;
- ret = 1;
- goto out;
- }
+ if (dst_post_availspacepercent < src_post_availspacepercent) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "data movement of file "
+ "{blocks:%" PRIu64
+ " name:(%s)} would result in "
+ "dst node (%s:%" PRIu64
+ ") having lower disk "
+ "space than the source node (%s:%" PRIu64
+ ")"
+ ".Skipping file.",
+ stbuf->ia_blocks, loc->path, to->name, dst_statfs_blocks,
+ from->name, src_statfs_blocks);
+
+ /* this is not a 'failure', but we don't want to
+ consider this as 'success' too :-/ */
+ *fop_errno = ENOSPC;
+ ret = 1;
+ goto out;
}
check_avail_space:
@@ -1017,7 +914,7 @@ check_avail_space:
gf_msg_debug(this->name, 0,
"file : %s, post_availspacepercent"
- " : %lf f_bavail : %lu min-free-disk: %lf",
+ " : %lf f_bavail : %" PRIu64 " min-free-disk: %lf",
loc->path, dst_post_availspacepercent, dst_statfs.f_bavail,
conf->min_free_disk);
@@ -1038,9 +935,8 @@ check_avail_space:
if (conf->disk_unit != 'p') {
if ((dst_statfs_blocks * GF_DISK_SECTOR_SIZE) < conf->min_free_disk) {
gf_msg_debug(this->name, 0,
- "file : %s, destination "
- "frsize: %lu f_bavail : %lu "
- "min-free-disk: %lf",
+ "file : %s, destination frsize: %lu "
+ "f_bavail : %" PRIu64 " min-free-disk: %lf",
loc->path, dst_statfs.f_frsize, dst_statfs.f_bavail,
conf->min_free_disk);
@@ -1102,32 +998,103 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
int ret = 0;
int count = 0;
off_t offset = 0;
+ off_t data_offset = 0;
+ off_t hole_offset = 0;
struct iovec *vector = NULL;
struct iobref *iobref = NULL;
uint64_t total = 0;
size_t read_size = 0;
+ size_t data_block_size = 0;
dict_t *xdata = NULL;
dht_conf_t *conf = NULL;
conf = this->private;
+
/* if file size is '0', no need to enter this loop */
while (total < ia_size) {
- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
- ? DHT_REBALANCE_BLKSIZE
- : (ia_size - total));
+ /* This is a regular file - read it sequentially */
+ if (!hole_exists) {
+ read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
+ ? DHT_REBALANCE_BLKSIZE
+ : (ia_size - total));
+ } else {
+ /* This is a sparse file - read only the data segments in the file
+ */
+
+ /* If the previous data block is fully copied, find the next data
+ * segment
+ * starting at the offset of the last read and written byte, */
+ if (data_block_size <= 0) {
+ ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
+ &data_offset);
+ if (ret) {
+ if (ret == -ENXIO)
+ ret = 0; /* No more data segments */
+ else
+ *fop_errno = -ret; /* Error occurred */
+
+ break;
+ }
+
+ /* If the position of the current data segment is greater than
+ * the position of the next hole, find the next hole in order to
+ * calculate the length of the new data segment */
+ if (data_offset > hole_offset) {
+ /* Starting at the offset of the last data segment, find the
+ * next hole */
+ ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
+ NULL, &hole_offset);
+ if (ret) {
+ /* If an error occurred here it's a real error because
+ * if the seek for a data segment was successful then
+ * necessarily another hole must exist (EOF is a hole)
+ */
+ *fop_errno = -ret;
+ break;
+ }
+
+ /* Calculate the total size of the current data block */
+ data_block_size = hole_offset - data_offset;
+ }
+ } else {
+ /* There is still data in the current segment, move the
+ * data_offset to the position of the last written byte */
+ data_offset = offset;
+ }
+
+ /* Calculate how much data needs to be read and written. If the data
+ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
+ * write DHT_REBALANCE_BLKSIZE data length and the rest in the
+ * next iteration(s) */
+ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
+ ? DHT_REBALANCE_BLKSIZE
+ : data_block_size);
+
+ /* Calculate the remaining size of the data block - maybe there's no
+ * need to seek for data in the next iteration */
+ data_block_size -= read_size;
+
+ /* Set offset to the offset of the data segment so read and write
+ * will have the correct position */
+ offset = data_offset;
+ }
ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
- &iobref, NULL, NULL);
+ &iobref, NULL, NULL, NULL);
+
if (!ret || (ret < 0)) {
- *fop_errno = -ret;
+ if (!ret) {
+ /* File was probably truncated*/
+ ret = -1;
+ *fop_errno = ENOSPC;
+ } else {
+ *fop_errno = -ret;
+ }
break;
}
- if (hole_exists) {
- ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
- iobref, fop_errno);
- } else {
- if (!conf->force_migration && !dht_is_tier_xlator(this)) {
+ if (!conf->force_migration) {
+ if (!xdata) {
xdata = dict_new();
if (!xdata) {
gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
@@ -1147,7 +1114,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
* https://github.com/gluster/glusterfs/issues/308
* for more details.
*/
- ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
+ ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
if (ret) {
gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
"failed to set dict");
@@ -1156,22 +1123,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
break;
}
}
-
- ret = syncop_writev(to, dst, vector, count, offset, iobref, 0,
- xdata, NULL);
- if (ret < 0) {
- *fop_errno = -ret;
- }
- }
-
- if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
- (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
- gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
- "Migrate file paused");
- ret = -1;
}
+ ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
+ NULL, xdata, NULL);
if (ret < 0) {
+ *fop_errno = -ret;
break;
}
@@ -1565,6 +1522,7 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
xlator_t *old_target = NULL;
xlator_t *hashed_subvol = NULL;
fd_t *linkto_fd = NULL;
+ dict_t *xdata = NULL;
if (from == to) {
gf_msg_debug(this->name, 0,
@@ -1575,20 +1533,6 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
- /* If defrag is NULL, it should be assumed that migration is triggered
- * from client */
- defrag = conf->defrag;
-
- /* migration of files from clients is restricted to non-tiered clients
- * for now */
- if (!defrag && dht_is_tier_xlator(this)) {
- ret = ENOTSUP;
- goto out;
- }
-
- if (defrag && defrag->tier_conf.is_tier)
- log_level = GF_LOG_TRACE;
-
gf_log(this->name, log_level, "%s: attempting to move from %s to %s",
loc->path, from->name, to->name);
@@ -1631,6 +1575,10 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
loc->path);
}
+ /* The file is locked to prevent a rename during a migration. Renames
+ * and migrations on the file at the same time can lead to data loss.
+ */
+
ret = dht_build_parent_loc(this, &parent_loc, loc, fop_errno);
if (ret < 0) {
ret = -1;
@@ -1731,9 +1679,13 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+ /* Try to preserve 'holes' while migrating data */
+ if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE))
+ file_has_holes = 1;
+
/* create the destination, with required modes/xattr */
ret = __dht_rebalance_create_dst_file(this, to, from, loc, &stbuf, &dst_fd,
- fop_errno);
+ fop_errno, file_has_holes);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, 0, 0,
"Create dst failed"
@@ -1757,7 +1709,7 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
- ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL);
+ ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL);
if (ret) {
gf_log(this->name, GF_LOG_WARNING,
"%s: failed to perform truncate on %s (%s)", loc->path,
@@ -1777,8 +1729,8 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
* destination. We need to do update this only post migration
* as in case of failure the linkto needs to point to the source
* subvol */
- ret = __dht_rebalance_create_dst_file(this, to, from, loc, &stbuf,
- &dst_fd, fop_errno);
+ ret = __dht_rebalance_create_dst_file(
+ this, to, from, loc, &stbuf, &dst_fd, fop_errno, file_has_holes);
if (ret) {
gf_log(this->name, GF_LOG_ERROR,
"Create dst failed"
@@ -1865,9 +1817,6 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
ret = 0;
goto out;
}
- /* Try to preserve 'holes' while migrating data */
- if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE))
- file_has_holes = 1;
ret = __dht_rebalance_migrate_data(this, defrag, from, to, src_fd, dst_fd,
stbuf.ia_size, file_has_holes,
@@ -1882,7 +1831,15 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
/* TODO: Sync the locks */
- ret = syncop_fsync(to, dst_fd, 0, NULL, NULL);
+ xdata = dict_new();
+ if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s: failed to set last-fsync flag on "
+ "%s (%s)",
+ loc->path, to->name, strerror(ENOMEM));
+ }
+
+ ret = syncop_fsync(to, dst_fd, 0, NULL, NULL, xdata, NULL);
if (ret) {
gf_log(this->name, GF_LOG_WARNING, "%s: failed to fsync on %s (%s)",
loc->path, to->name, strerror(-ret));
@@ -2138,17 +2095,6 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
}
- /* store size of previous migrated file */
- if (defrag && defrag->tier_conf.is_tier) {
- if (from != TIER_HASHED_SUBVOL) {
- defrag->tier_conf.st_last_promoted_size = stbuf.ia_size;
- } else {
- /* Don't delete the linkto file on the hashed subvol */
- delete_src_linkto = _gf_false;
- defrag->tier_conf.st_last_demoted_size = stbuf.ia_size;
- }
- }
-
/* The src file is being unlinked after this so we don't need
to clean it up */
clean_src = _gf_false;
@@ -2169,7 +2115,7 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
/* Free up the data blocks on the source node, as the whole
file is migrated */
- ret = syncop_ftruncate(from, src_fd, 0, NULL, NULL);
+ ret = syncop_ftruncate(from, src_fd, 0, NULL, NULL, NULL, NULL);
if (ret) {
gf_log(this->name, GF_LOG_WARNING,
"%s: failed to perform truncate on %s (%s)", loc->path,
@@ -2291,7 +2237,7 @@ out:
/* reset the destination back to 0 */
if (clean_dst) {
- lk_ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL);
+ lk_ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL);
if (lk_ret) {
gf_msg(this->name, GF_LOG_ERROR, -lk_ret,
DHT_MSG_MIGRATE_FILE_FAILED,
@@ -2336,14 +2282,12 @@ out:
}
}
- if (!dht_is_tier_xlator(this)) {
- lk_ret = syncop_removexattr(to, loc, GF_PROTECT_FROM_EXTERNAL_WRITES,
- NULL, NULL);
- if (lk_ret && (lk_ret != -ENODATA) && (lk_ret != -ENOATTR)) {
- gf_msg(this->name, GF_LOG_WARNING, -lk_ret, 0,
- "%s: removexattr failed key %s", loc->path,
- GF_PROTECT_FROM_EXTERNAL_WRITES);
- }
+ lk_ret = syncop_removexattr(to, loc, GF_PROTECT_FROM_EXTERNAL_WRITES, NULL,
+ NULL);
+ if (lk_ret && (lk_ret != -ENODATA) && (lk_ret != -ENOATTR)) {
+ gf_msg(this->name, GF_LOG_WARNING, -lk_ret, 0,
+ "%s: removexattr failed key %s", loc->path,
+ GF_PROTECT_FROM_EXTERNAL_WRITES);
}
if (dict)
@@ -2356,11 +2300,15 @@ out:
if (dst_fd)
syncop_close(dst_fd);
+
if (src_fd)
syncop_close(src_fd);
if (linkto_fd)
syncop_close(linkto_fd);
+ if (xdata)
+ dict_unref(xdata);
+
loc_wipe(&tmp_loc);
loc_wipe(&parent_loc);
@@ -2455,15 +2403,12 @@ void
dht_build_root_inode(xlator_t *this, inode_t **inode)
{
inode_table_t *itable = NULL;
- uuid_t root_gfid = {
- 0,
- };
+ static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
itable = inode_table_new(0, this);
if (!itable)
return;
- root_gfid[15] = 1;
*inode = inode_find(itable, root_gfid);
}
@@ -2593,10 +2538,10 @@ out:
* all hardlinks.
*/
-int
+gf_boolean_t
gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid)
{
- int ret = 0;
+ gf_boolean_t ret = _gf_false;
int i = local_subvol_index;
char *str = NULL;
uint32_t hashval = 0;
@@ -2618,12 +2563,11 @@ gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid)
}
str = uuid_utoa_r(gfid, buf);
- ret = dht_hash_compute(this, 0, str, &hashval);
- if (ret == 0) {
+ if (dht_hash_compute(this, 0, str, &hashval) == 0) {
index = (hashval % entry->count);
if (entry->elements[index].info == REBAL_NODEUUID_MINE) {
/* Index matches this node's nodeuuid.*/
- ret = 1;
+ ret = _gf_true;
goto out;
}
@@ -2636,12 +2580,12 @@ gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid)
/* None of the bricks in the subvol are up.
* CHILD_DOWN will kill the process soon */
- return 0;
+ return _gf_false;
}
if (entry->elements[index].info == REBAL_NODEUUID_MINE) {
/* Index matches this node's nodeuuid.*/
- ret = 1;
+ ret = _gf_true;
goto out;
}
}
@@ -2690,6 +2634,7 @@ gf_defrag_migrate_single_file(void *opaque)
struct iatt *iatt_ptr = NULL;
gf_boolean_t update_skippedcount = _gf_true;
int i = 0;
+ gf_boolean_t should_i_migrate = 0;
rebal_entry = (struct dht_container *)opaque;
if (!rebal_entry) {
@@ -2744,17 +2689,29 @@ gf_defrag_migrate_single_file(void *opaque)
goto out;
}
- if (!gf_defrag_should_i_migrate(this, rebal_entry->local_subvol_index,
- entry->d_stat.ia_gfid)) {
- gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path);
- goto out;
- }
+ should_i_migrate = gf_defrag_should_i_migrate(
+ this, rebal_entry->local_subvol_index, entry->d_stat.ia_gfid);
gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid);
gf_uuid_copy(entry_loc.pargfid, loc->gfid);
ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+
+ if (!should_i_migrate) {
+ /* this node isn't supposed to migrate the file. suppressing any
+ * potential error from lookup as this file is under migration by
+ * another node */
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "Ignoring lookup failure: node isn't migrating %s",
+ entry_loc.path);
+ ret = 0;
+ }
+ gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path);
+ goto out;
+ }
+
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
"Migrate file failed: %s lookup failed", entry_loc.path);
@@ -2917,8 +2874,7 @@ gf_defrag_migrate_single_file(void *opaque)
if (defrag->stats == _gf_true) {
gettimeofday(&end, NULL);
- elapsed = (end.tv_sec - start.tv_sec) * 1e6 +
- (end.tv_usec - start.tv_usec);
+ elapsed = gf_tvdiff(&start, &end);
gf_log(this->name, GF_LOG_INFO,
"Migration of "
"file:%s size:%" PRIu64
@@ -3097,9 +3053,9 @@ int static gf_defrag_get_entry(xlator_t *this, int i,
dht_conf_t *conf, gf_defrag_info_t *defrag,
fd_t *fd, dict_t *migrate_data,
struct dir_dfmeta *dir_dfmeta, dict_t *xattr_req,
- int *should_commit_hash, int *perrno)
+ int *perrno)
{
- int ret = -1;
+ int ret = 0;
char is_linkfile = 0;
gf_dirent_t *df_entry = NULL;
struct dht_container *tmp_container = NULL;
@@ -3115,6 +3071,13 @@ int static gf_defrag_get_entry(xlator_t *this, int i,
}
if (dir_dfmeta->fetch_entries[i] == 1) {
+ if (!fd) {
+ dir_dfmeta->fetch_entries[i] = 0;
+ dir_dfmeta->offset_var[i].readdir_done = 1;
+ ret = 0;
+ goto out;
+ }
+
ret = syncop_readdirp(conf->local_subvols[i], fd, 131072,
dir_dfmeta->offset_var[i].offset,
&(dir_dfmeta->equeue[i]), xattr_req, NULL);
@@ -3274,7 +3237,6 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *migrate_data, int *perrno)
{
int ret = -1;
- fd_t *fd = NULL;
dht_conf_t *conf = NULL;
gf_dirent_t entries;
dict_t *xattr_req = NULL;
@@ -3295,7 +3257,7 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
int dfc_index = 0;
int throttle_up = 0;
struct dir_dfmeta *dir_dfmeta = NULL;
- int should_commit_hash = 1;
+ xlator_t *old_THIS = NULL;
gf_log(this->name, GF_LOG_INFO, "migrate data called on %s", loc->path);
gettimeofday(&dir_start, NULL);
@@ -3308,28 +3270,53 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
goto out;
}
- fd = fd_create(loc->inode, defrag->pid);
- if (!fd) {
- gf_log(this->name, GF_LOG_ERROR, "Failed to create fd");
+ old_THIS = THIS;
+ THIS = this;
+
+ dir_dfmeta = GF_CALLOC(1, sizeof(*dir_dfmeta), gf_common_mt_pointer);
+ if (!dir_dfmeta) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta is NULL");
ret = -1;
goto out;
}
- ret = syncop_opendir(this, loc, fd, NULL, NULL);
- if (ret) {
- gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_DATA_FAILED,
- "Migrate data failed: Failed to open dir %s", loc->path);
- *perrno = -ret;
+ dir_dfmeta->lfd = GF_CALLOC(local_subvols_cnt, sizeof(fd_t *),
+ gf_common_mt_pointer);
+ if (!dir_dfmeta->lfd) {
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY,
+ "for dir_dfmeta", NULL);
ret = -1;
+ *perrno = ENOMEM;
goto out;
}
- fd_bind(fd);
- dir_dfmeta = GF_CALLOC(1, sizeof(*dir_dfmeta), gf_common_mt_pointer);
- if (!dir_dfmeta) {
- gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta is NULL");
- ret = -1;
- goto out;
+ for (i = 0; i < local_subvols_cnt; i++) {
+ dir_dfmeta->lfd[i] = fd_create(loc->inode, defrag->pid);
+ if (!dir_dfmeta->lfd[i]) {
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_FD_CREATE_FAILED,
+ NULL);
+ *perrno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_opendir(conf->local_subvols[i], loc, dir_dfmeta->lfd[i],
+ NULL, NULL);
+ if (ret) {
+ fd_unref(dir_dfmeta->lfd[i]);
+ dir_dfmeta->lfd[i] = NULL;
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FAILED_TO_OPEN,
+ "dir: %s", loc->path, "subvol: %s",
+ conf->local_subvols[i]->name, NULL);
+
+ if (conf->decommission_in_progress) {
+ *perrno = -ret;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ fd_bind(dir_dfmeta->lfd[i]);
+ }
}
dir_dfmeta->head = GF_CALLOC(local_subvols_cnt, sizeof(*(dir_dfmeta->head)),
@@ -3364,6 +3351,7 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
ret = -1;
goto out;
}
+
ret = gf_defrag_ctx_subvols_init(dir_dfmeta->offset_var, this);
if (ret) {
gf_log(this->name, GF_LOG_ERROR,
@@ -3376,7 +3364,8 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dir_dfmeta->fetch_entries = GF_CALLOC(local_subvols_cnt, sizeof(int),
gf_common_mt_int);
if (!dir_dfmeta->fetch_entries) {
- gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->fetch_entries is NULL");
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY,
+ "for dir_dfmeta->fetch_entries", NULL);
ret = -1;
goto out;
}
@@ -3446,8 +3435,13 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
ldfq_count <= MAX_MIGRATE_QUEUE_COUNT &&
!dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) {
ret = gf_defrag_get_entry(this, dfc_index, &container, loc, conf,
- defrag, fd, migrate_data, dir_dfmeta,
- xattr_req, &should_commit_hash, perrno);
+ defrag, dir_dfmeta->lfd[dfc_index],
+ migrate_data, dir_dfmeta, xattr_req,
+ perrno);
+
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
+ goto out;
+ }
if (ret) {
gf_log(this->name, GF_LOG_WARNING,
@@ -3487,27 +3481,19 @@ gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
}
gettimeofday(&end, NULL);
- elapsed = (end.tv_sec - dir_start.tv_sec) * 1e6 +
- (end.tv_usec - dir_start.tv_usec);
+ elapsed = gf_tvdiff(&dir_start, &end);
gf_log(this->name, GF_LOG_INFO,
"Migration operation on dir %s took "
"%.2f secs",
loc->path, elapsed / 1e6);
ret = 0;
out:
-
+ THIS = old_THIS;
gf_defrag_free_dir_dfmeta(dir_dfmeta, local_subvols_cnt);
if (xattr_req)
dict_unref(xattr_req);
- if (fd)
- fd_unref(fd);
-
- if (ret == 0 && should_commit_hash == 0) {
- ret = 2;
- }
-
/* It does not matter if it errored out - this number is
* used to calculate rebalance estimated time to complete.
* No locking required as dirs are processed by a single thread.
@@ -3515,6 +3501,7 @@ out:
defrag->num_dirs_processed++;
return ret;
}
+
int
gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *fix_layout)
@@ -3529,7 +3516,6 @@ gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
* rebalance is complete.
*/
if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX ||
- defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER ||
defrag->cmd == GF_DEFRAG_CMD_DETACH_START) {
return 0;
}
@@ -3575,114 +3561,6 @@ gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
return 0;
}
-/* Function for doing a named lookup on file inodes during an attach tier
- * So that a hardlink lookup heal i.e gfid to parent gfid lookup heal
- * happens on pre-existing data. This is required so that the ctr database has
- * hardlinks of all the exisitng file in the volume. CTR xlator on the
- * brick/server side does db update/insert of the hardlink on a namelookup.
- * Currently the namedlookup is done synchronous to the fixlayout that is
- * triggered by attach tier. This is not performant, adding more time to
- * fixlayout. The performant approach is record the hardlinks on a compressed
- * datastore and then do the namelookup asynchronously later, giving the ctr db
- * eventual consistency
- * */
-int
-gf_fix_layout_tier_attach_lookup(xlator_t *this, loc_t *parent_loc,
- gf_dirent_t *file_dentry)
-{
- int ret = -1;
- dict_t *lookup_xdata = NULL;
- dht_conf_t *conf = NULL;
- loc_t file_loc = {
- 0,
- };
- struct iatt iatt = {
- 0,
- };
-
- GF_VALIDATE_OR_GOTO("tier", this, out);
-
- GF_VALIDATE_OR_GOTO(this->name, parent_loc, out);
-
- GF_VALIDATE_OR_GOTO(this->name, file_dentry, out);
-
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
-
- if (!parent_loc->inode) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "%s/%s parent is NULL", parent_loc->path, file_dentry->d_name);
- goto out;
- }
-
- conf = this->private;
-
- loc_wipe(&file_loc);
-
- if (gf_uuid_is_null(file_dentry->d_stat.ia_gfid)) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "%s/%s gfid not present", parent_loc->path, file_dentry->d_name);
- goto out;
- }
-
- gf_uuid_copy(file_loc.gfid, file_dentry->d_stat.ia_gfid);
-
- if (gf_uuid_is_null(parent_loc->gfid)) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "%s/%s"
- " gfid not present",
- parent_loc->path, file_dentry->d_name);
- goto out;
- }
-
- gf_uuid_copy(file_loc.pargfid, parent_loc->gfid);
-
- ret = dht_build_child_loc(this, &file_loc, parent_loc, file_dentry->d_name);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Child loc build failed");
- ret = -1;
- goto out;
- }
-
- lookup_xdata = dict_new();
- if (!lookup_xdata) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed creating lookup dict for %s", file_dentry->d_name);
- goto out;
- }
-
- ret = dict_set_int32(lookup_xdata, CTR_ATTACH_TIER_LOOKUP, 1);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to set lookup flag");
- goto out;
- }
-
- gf_uuid_copy(file_loc.parent->gfid, parent_loc->gfid);
-
- /* Sending lookup to cold tier only */
- ret = syncop_lookup(conf->subvolumes[0], &file_loc, &iatt, NULL,
- lookup_xdata, NULL);
- if (ret) {
- /* If the file does not exist on the cold tier than it must */
- /* have been discovered on the hot tier. This is not an error. */
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "%s lookup to cold tier on attach heal failed", file_loc.path);
- goto out;
- }
-
- ret = 0;
-
-out:
-
- loc_wipe(&file_loc);
-
- if (lookup_xdata)
- dict_unref(lookup_xdata);
-
- return ret;
-}
-
int
gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *fix_layout, dict_t *migrate_data)
@@ -3702,7 +3580,6 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
};
inode_t *linked_inode = NULL, *inode = NULL;
dht_conf_t *conf = NULL;
- int should_commit_hash = 1;
int perrno = 0;
conf = this->private;
@@ -3805,16 +3682,6 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
continue;
if (!IA_ISDIR(entry->d_stat.ia_type)) {
- /* If its a fix layout during the attach
- * tier operation do lookups on files
- * on cold subvolume so that there is a
- * CTR DB Lookup Heal triggered on existing
- * data.
- * */
- if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
- gf_fix_layout_tier_attach_lookup(this, loc, entry);
- }
-
continue;
}
loc_wipe(&entry_loc);
@@ -3831,8 +3698,6 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
goto out;
} else {
- should_commit_hash = 0;
-
continue;
}
}
@@ -3895,7 +3760,6 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
ret = -1;
goto out;
} else {
- should_commit_hash = 0;
continue;
}
}
@@ -3908,7 +3772,12 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
ret = gf_defrag_fix_layout(this, defrag, &entry_loc, fix_layout,
migrate_data);
- if (ret && ret != 2) {
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED ||
+ defrag->defrag_status == GF_DEFRAG_STATUS_FAILED) {
+ goto out;
+ }
+
+ if (ret) {
gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
"Fix layout failed for %s", entry_loc.path);
@@ -3931,7 +3800,25 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
INIT_LIST_HEAD(&entries.list);
}
+ /* A directory layout is fixed only after its subdirs are healed to
+ * any newly added bricks. If the layout is fixed before subdirs are
+ * healed, the newly added brick will get a non-null layout.
+ * Any subdirs which hash to that layout will no longer show up
+ * in a directory listing until they are healed.
+ */
+
ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+ /* In case of a race where the directory is deleted just before
+ * layout setxattr, the errors are updated in the layout structure.
+ * We can use this information to make a decision whether the directory
+ * is deleted entirely.
+ */
+ if (ret == 0) {
+ ret = dht_dir_layout_error_check(this, loc->inode);
+ ret = -ret;
+ }
+
if (ret) {
if (-ret == ENOENT || -ret == ESTALE) {
gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
@@ -3942,6 +3829,7 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
defrag->total_failures++;
}
ret = 0;
+ goto out;
} else {
gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
"Setxattr failed for %s", loc->path);
@@ -3956,11 +3844,10 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
}
}
- if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
- (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
+ if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);
- if (ret && (ret != 2)) {
+ if (ret) {
if (perrno == ENOENT || perrno == ESTALE) {
ret = 0;
goto out;
@@ -3976,18 +3863,13 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (conf->decommission_in_progress) {
goto out;
}
-
- should_commit_hash = 0;
}
- } else if (ret == 2) {
- should_commit_hash = 0;
}
}
gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);
- if (should_commit_hash &&
- gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
+ if (gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
defrag->total_failures++;
gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
@@ -4011,245 +3893,34 @@ out:
if (fd)
fd_unref(fd);
- if (ret == 0 && should_commit_hash == 0) {
- ret = 2;
- }
-
- return ret;
-}
-
-/******************************************************************************
- * Tier background Fix layout functions
- ******************************************************************************/
-/* This is the background tier fixlayout thread */
-void *
-gf_tier_do_fix_layout(void *args)
-{
- gf_tier_fix_layout_arg_t *tier_fix_layout_arg = args;
- int ret = -1;
- xlator_t *this = NULL;
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
- dict_t *dict = NULL;
- loc_t loc = {
- 0,
- };
- struct iatt iatt = {
- 0,
- };
- struct iatt parent = {
- 0,
- };
-
- GF_VALIDATE_OR_GOTO("tier", tier_fix_layout_arg, out);
- GF_VALIDATE_OR_GOTO("tier", tier_fix_layout_arg->this, out);
- this = tier_fix_layout_arg->this;
-
- conf = this->private;
- GF_VALIDATE_OR_GOTO(this->name, conf, out);
-
- defrag = conf->defrag;
- GF_VALIDATE_OR_GOTO(this->name, defrag, out);
- GF_VALIDATE_OR_GOTO(this->name, defrag->root_inode, out);
-
- GF_VALIDATE_OR_GOTO(this->name, tier_fix_layout_arg->fix_layout, out);
-
- /* Get Root loc_t */
- dht_build_root_loc(defrag->root_inode, &loc);
- ret = syncop_lookup(this, &loc, &iatt, &parent, NULL, NULL);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_START_FAILED,
- "Lookup on root failed.");
- ret = -1;
- goto out;
- }
-
- /* Start the crawl */
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Tiering Fixlayout started");
-
- ret = gf_defrag_fix_layout(this, defrag, &loc,
- tier_fix_layout_arg->fix_layout, NULL);
- if (ret && ret != 2) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_FAILED,
- "Tiering fixlayout failed.");
- ret = -1;
- goto out;
- }
-
- if (ret != 2 &&
- gf_defrag_settle_hash(this, defrag, &loc,
- tier_fix_layout_arg->fix_layout) != 0) {
- defrag->total_failures++;
- ret = -1;
- goto out;
- }
-
- dict = dict_new();
- if (!dict) {
- ret = -1;
- goto out;
- }
-
- ret = dict_set_str(dict, GF_XATTR_TIER_LAYOUT_FIXED_KEY, "yes");
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_FAILED,
- "Failed to set dictionary value: key = %s",
- GF_XATTR_TIER_LAYOUT_FIXED_KEY);
- ret = -1;
- goto out;
- }
-
- /* Marking the completion of tiering fix layout via a xattr on root */
- ret = syncop_setxattr(this, &loc, dict, 0, NULL, NULL);
- if (ret) {
- gf_log(this->name, GF_LOG_ERROR,
- "Failed to set tiering fix "
- "layout completed xattr on %s",
- loc.path);
- ret = -1;
- goto out;
- }
-
- ret = 0;
-out:
- if (ret && defrag)
- defrag->total_failures++;
-
- if (dict)
- dict_unref(dict);
-
- return NULL;
-}
-
-int
-gf_tier_start_fix_layout(xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag,
- dict_t *fix_layout)
-{
- int ret = -1;
- dict_t *tier_dict = NULL;
- gf_tier_fix_layout_arg_t *tier_fix_layout_arg = NULL;
-
- tier_dict = dict_new();
- if (!tier_dict) {
- gf_log("tier", GF_LOG_ERROR,
- "Tier fix layout failed :"
- "Creation of tier_dict failed");
- ret = -1;
- goto out;
- }
-
- /* Check if layout is fixed already */
- ret = syncop_getxattr(this, loc, &tier_dict, GF_XATTR_TIER_LAYOUT_FIXED_KEY,
- NULL, NULL);
- if (ret != 0) {
- tier_fix_layout_arg = &defrag->tier_conf.tier_fix_layout_arg;
-
- /*Fill crawl arguments */
- tier_fix_layout_arg->this = this;
- tier_fix_layout_arg->fix_layout = fix_layout;
-
- /* Spawn the fix layout thread so that its done in the
- * background */
- ret = gf_thread_create(&tier_fix_layout_arg->thread_id, NULL,
- gf_tier_do_fix_layout, tier_fix_layout_arg,
- "tierfixl");
- if (ret) {
- gf_log("tier", GF_LOG_ERROR,
- "Thread creation failed. "
- "Background fix layout for tiering will not "
- "work.");
- defrag->total_failures++;
- goto out;
- }
- }
- ret = 0;
-out:
- if (tier_dict)
- dict_unref(tier_dict);
-
return ret;
}
-void
-gf_tier_clear_fix_layout(xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
-{
- int ret = -1;
- dict_t *dict = NULL;
-
- GF_VALIDATE_OR_GOTO("tier", this, out);
- GF_VALIDATE_OR_GOTO(this->name, loc, out);
- GF_VALIDATE_OR_GOTO(this->name, defrag, out);
-
- /* Check if background fixlayout is completed. This is not
- * multi-process safe i.e there is a possibility that by the time
- * we move to remove the xattr there it might have been cleared by some
- * other detach process from other node. We ignore the error if such
- * a thing happens */
- ret = syncop_getxattr(this, loc, &dict, GF_XATTR_TIER_LAYOUT_FIXED_KEY,
- NULL, NULL);
- if (ret) {
- /* Background fixlayout not complete - nothing to clear*/
- gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_LOG_TIER_STATUS,
- "Unable to retrieve fixlayout xattr."
- "Assume background fix layout not complete");
- goto out;
- }
-
- ret = syncop_removexattr(this, loc, GF_XATTR_TIER_LAYOUT_FIXED_KEY, NULL,
- NULL);
- if (ret) {
- gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_LOG_TIER_STATUS,
- "Failed removing tier fix layout "
- "xattr from %s",
- loc->path);
- goto out;
- }
- ret = 0;
-out:
- if (dict)
- dict_unref(dict);
-}
-
-void
-gf_tier_wait_fix_lookup(gf_defrag_info_t *defrag)
-{
- if (defrag->tier_conf.tier_fix_layout_arg.thread_id) {
- pthread_join(defrag->tier_conf.tier_fix_layout_arg.thread_id, NULL);
- }
-}
-/******************Tier background Fix layout functions END********************/
-
int
dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
loc_t *loc)
{
dict_t *dict = NULL;
- gf_defrag_info_t *defrag = NULL;
uuid_t *uuid_ptr = NULL;
int ret = -1;
int i = 0;
int j = 0;
- defrag = conf->defrag;
-
- if (defrag->cmd != GF_DEFRAG_CMD_START_TIER) {
- /* Find local subvolumes */
- ret = syncop_getxattr(this, loc, &dict, GF_REBAL_FIND_LOCAL_SUBVOL,
- NULL, NULL);
- if (ret && (ret != -ENODATA)) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, 0,
- "local "
- "subvolume determination failed with error: %d",
- -ret);
- ret = -1;
- goto out;
- }
-
- if (!ret)
- goto out;
+ /* Find local subvolumes */
+ ret = syncop_getxattr(this, loc, &dict, GF_REBAL_FIND_LOCAL_SUBVOL, NULL,
+ NULL);
+ if (ret && (ret != -ENODATA)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, 0,
+ "local "
+ "subvolume determination failed with error: %d",
+ -ret);
+ ret = -1;
+ goto out;
}
+ if (!ret)
+ goto out;
+
ret = syncop_getxattr(this, loc, &dict, GF_REBAL_OLD_FIND_LOCAL_SUBVOL,
NULL, NULL);
if (ret) {
@@ -4293,9 +3964,6 @@ gf_defrag_subvol_file_size(xlator_t *this, loc_t *root_loc)
0,
};
- if (!this)
- return 0;
-
ret = syncop_statfs(this, root_loc, &buf, NULL, NULL);
if (ret) {
/* Aargh! */
@@ -4343,9 +4011,6 @@ dht_file_counter_thread(void *args)
struct timespec time_to_wait = {
0,
};
- struct timeval now = {
- 0,
- };
uint64_t tmp_size = 0;
if (!args)
@@ -4355,9 +4020,8 @@ dht_file_counter_thread(void *args)
dht_build_root_loc(defrag->root_inode, &root_loc);
while (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) {
- gettimeofday(&now, NULL);
- time_to_wait.tv_sec = now.tv_sec + 600;
- time_to_wait.tv_nsec = 0;
+ timespec_now(&time_to_wait);
+ time_to_wait.tv_sec += 600;
pthread_mutex_lock(&defrag->fc_mutex);
pthread_cond_timedwait(&defrag->fc_wakeup_cond, &defrag->fc_mutex,
@@ -4430,7 +4094,7 @@ gf_defrag_estimates_init(xlator_t *this, loc_t *loc, pthread_t *filecnt_thread)
goto out;
}
- ret = gf_thread_create(filecnt_thread, NULL, &dht_file_counter_thread,
+ ret = gf_thread_create(filecnt_thread, NULL, dht_file_counter_thread,
(void *)defrag, "dhtfcnt");
if (ret) {
@@ -4454,9 +4118,6 @@ gf_defrag_parallel_migration_init(xlator_t *this, gf_defrag_info_t *defrag,
int thread_spawn_count = 0;
int index = 0;
pthread_t *tid = NULL;
- char thread_name[GF_THREAD_NAMEMAX] = {
- 0,
- };
if (!defrag)
goto out;
@@ -4490,10 +4151,8 @@ gf_defrag_parallel_migration_init(xlator_t *this, gf_defrag_info_t *defrag,
/*Spawn Threads Here*/
while (index < thread_spawn_count) {
- snprintf(thread_name, sizeof(thread_name), "dhtmig%d",
- ((index + 1) & 0x3ff));
- ret = gf_thread_create(&(tid[index]), NULL, &gf_defrag_task,
- (void *)defrag, thread_name);
+ ret = gf_thread_create(&(tid[index]), NULL, gf_defrag_task,
+ (void *)defrag, "dhtmig%d", (index + 1) & 0x3ff);
if (ret != 0) {
gf_msg("DHT", GF_LOG_ERROR, ret, 0, "Thread[%d] creation failed. ",
index);
@@ -4566,7 +4225,6 @@ gf_defrag_start_crawl(void *data)
dict_t *migrate_data = NULL;
dict_t *status = NULL;
glusterfs_ctx_t *ctx = NULL;
- dht_methods_t *methods = NULL;
call_frame_t *statfs_frame = NULL;
xlator_t *old_THIS = NULL;
int ret = -1;
@@ -4582,7 +4240,6 @@ gf_defrag_start_crawl(void *data)
int thread_index = 0;
pthread_t *tid = NULL;
pthread_t filecnt_thread;
- gf_boolean_t is_tier_detach = _gf_false;
gf_boolean_t fc_thread_started = _gf_false;
this = data;
@@ -4601,7 +4258,8 @@ gf_defrag_start_crawl(void *data)
if (!defrag)
goto exit;
- gettimeofday(&defrag->start_time, NULL);
+ defrag->start_time = gf_time();
+
dht_build_root_inode(this, &defrag->root_inode);
if (!defrag->root_inode)
goto out;
@@ -4735,43 +4393,17 @@ gf_defrag_start_crawl(void *data)
}
}
- if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
- /* Fix layout for attach tier */
- ret = gf_tier_start_fix_layout(this, &loc, defrag, fix_layout);
- if (ret) {
- goto out;
- }
-
- methods = &(conf->methods);
-
- /* Calling tier_start of tier.c */
- methods->migration_other(this, defrag);
- if (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER ||
- defrag->cmd == GF_DEFRAG_CMD_DETACH_START) {
- ret = dict_set_str(migrate_data, GF_XATTR_FILE_MIGRATE_KEY,
- "force");
- if (ret)
- goto out;
- }
- } else {
- ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout,
- migrate_data);
- if (ret && ret != 2) {
- defrag->total_failures++;
- ret = -1;
- goto out;
- }
-
- if (ret != 2 &&
- gf_defrag_settle_hash(this, defrag, &loc, fix_layout) != 0) {
- defrag->total_failures++;
- ret = -1;
- goto out;
- }
+ ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
- if (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER ||
- defrag->cmd == GF_DEFRAG_CMD_DETACH_START)
- is_tier_detach = _gf_true;
+ if (gf_defrag_settle_hash(this, defrag, &loc, fix_layout) != 0) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
}
gf_log("DHT", GF_LOG_INFO, "crawling file-system completed");
@@ -4785,19 +4417,6 @@ out:
defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
}
- if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
- /* Wait for the tier fixlayout to
- * complete if its was started.*/
- gf_tier_wait_fix_lookup(defrag);
- }
-
- if (is_tier_detach && ret == 0) {
- /* If it was a detach remove the tier fix-layout
- * xattr on root. Ignoring the failure, as nothing has to be
- * done, logging is done in gf_tier_clear_fix_layout */
- gf_tier_clear_fix_layout(this, &loc, defrag);
- }
-
gf_defrag_parallel_migration_cleanup(defrag, tid, thread_index);
if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) &&
@@ -4811,9 +4430,9 @@ out:
dht_send_rebalance_event(this, defrag->cmd, defrag->defrag_status);
+ status = dict_new();
LOCK(&defrag->lock);
{
- status = dict_new();
gf_defrag_status_get(conf, status);
if (ctx && ctx->notify)
ctx->notify(GF_EN_DEFRAG_STATUS, status);
@@ -4896,9 +4515,6 @@ gf_defrag_get_estimates_based_on_size(dht_conf_t *conf)
uint64_t total_processed = 0;
uint64_t tmp_count = 0;
uint64_t time_to_complete = 0;
- struct timeval now = {
- 0,
- };
double elapsed = 0;
defrag = conf->defrag;
@@ -4906,8 +4522,7 @@ gf_defrag_get_estimates_based_on_size(dht_conf_t *conf)
if (!g_totalsize)
goto out;
- gettimeofday(&now, NULL);
- elapsed = now.tv_sec - defrag->start_time.tv_sec;
+ elapsed = gf_time() - defrag->start_time;
/* Don't calculate the estimates for the first 10 minutes.
* It is unlikely to be accurate and estimates are not required
@@ -4957,13 +4572,8 @@ gf_defrag_status_get(dht_conf_t *conf, dict_t *dict)
uint64_t lookup = 0;
uint64_t failures = 0;
uint64_t skipped = 0;
- uint64_t promoted = 0;
- uint64_t demoted = 0;
char *status = "";
double elapsed = 0;
- struct timeval end = {
- 0,
- };
uint64_t time_to_complete = 0;
uint64_t time_left = 0;
gf_defrag_info_t *defrag = conf->defrag;
@@ -4980,17 +4590,12 @@ gf_defrag_status_get(dht_conf_t *conf, dict_t *dict)
lookup = defrag->num_files_lookedup;
failures = defrag->total_failures;
skipped = defrag->skipped;
- promoted = defrag->total_files_promoted;
- demoted = defrag->total_files_demoted;
-
- gettimeofday(&end, NULL);
- elapsed = end.tv_sec - defrag->start_time.tv_sec;
+ elapsed = gf_time() - defrag->start_time;
/* The rebalance is still in progress */
- if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
- (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED)) {
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) {
time_to_complete = gf_defrag_get_estimates_based_on_size(conf);
if (time_to_complete && (time_to_complete > elapsed))
@@ -5005,14 +4610,6 @@ gf_defrag_status_get(dht_conf_t *conf, dict_t *dict)
if (!dict)
goto log;
- ret = dict_set_uint64(dict, "promoted", promoted);
- if (ret)
- gf_log(THIS->name, GF_LOG_WARNING, "failed to set promoted count");
-
- ret = dict_set_uint64(dict, "demoted", demoted);
- if (ret)
- gf_log(THIS->name, GF_LOG_WARNING, "failed to set demoted count");
-
ret = dict_set_uint64(dict, "files", files);
if (ret)
gf_log(THIS->name, GF_LOG_WARNING, "failed to set file count");
@@ -5078,159 +4675,6 @@ out:
return 0;
}
-void
-gf_defrag_set_pause_state(gf_tier_conf_t *tier_conf, tier_pause_state_t state)
-{
- pthread_mutex_lock(&tier_conf->pause_mutex);
- tier_conf->pause_state = state;
- pthread_mutex_unlock(&tier_conf->pause_mutex);
-}
-
-tier_pause_state_t
-gf_defrag_get_pause_state(gf_tier_conf_t *tier_conf)
-{
- int state;
-
- pthread_mutex_lock(&tier_conf->pause_mutex);
- state = tier_conf->pause_state;
- pthread_mutex_unlock(&tier_conf->pause_mutex);
-
- return state;
-}
-
-tier_pause_state_t
-gf_defrag_check_pause_tier(gf_tier_conf_t *tier_conf)
-{
- int woke = 0;
- int state = -1;
-
- pthread_mutex_lock(&tier_conf->pause_mutex);
-
- if (tier_conf->pause_state == TIER_RUNNING)
- goto out;
-
- if (tier_conf->pause_state == TIER_PAUSED)
- goto out;
-
- if (tier_conf->promote_in_progress || tier_conf->demote_in_progress)
- goto out;
-
- tier_conf->pause_state = TIER_PAUSED;
-
- if (tier_conf->pause_synctask) {
- synctask_wake(tier_conf->pause_synctask);
- tier_conf->pause_synctask = 0;
- woke = 1;
- }
-
- gf_msg("tier", GF_LOG_DEBUG, 0, DHT_MSG_TIER_PAUSED, "woken %d", woke);
-
- gf_event(EVENT_TIER_PAUSE, "vol=%s", tier_conf->volname);
-out:
- state = tier_conf->pause_state;
-
- pthread_mutex_unlock(&tier_conf->pause_mutex);
-
- return state;
-}
-
-void
-gf_defrag_pause_tier_timeout(void *data)
-{
- xlator_t *this = NULL;
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
-
- this = (xlator_t *)data;
- GF_VALIDATE_OR_GOTO("tier", this, out);
-
- conf = this->private;
- GF_VALIDATE_OR_GOTO(this->name, conf, out);
-
- defrag = conf->defrag;
- GF_VALIDATE_OR_GOTO(this->name, defrag, out);
-
- gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_TIER_PAUSED,
- "Request pause timer timeout");
-
- gf_defrag_check_pause_tier(&defrag->tier_conf);
-
-out:
- return;
-}
-
-int
-gf_defrag_pause_tier(xlator_t *this, gf_defrag_info_t *defrag)
-{
- int ret = 0;
- struct timespec delta = {
- 0,
- };
- int delay = 2;
-
- if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED)
- goto out;
-
- /*
- * Set flag requesting to pause tiering. Wait 'delay' seconds for
- * tiering to actually stop as indicated by the pause state
- * before returning success or failure.
- */
- gf_defrag_set_pause_state(&defrag->tier_conf, TIER_REQUEST_PAUSE);
-
- /*
- * If migration is not underway, can pause immediately.
- */
- gf_defrag_check_pause_tier(&defrag->tier_conf);
- if (gf_defrag_get_pause_state(&defrag->tier_conf) == TIER_PAUSED)
- goto out;
-
- gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_TIER_PAUSED,
- "Request pause tier");
-
- defrag->tier_conf.pause_synctask = synctask_get();
- delta.tv_sec = delay;
- delta.tv_nsec = 0;
- defrag->tier_conf.pause_timer = gf_timer_call_after(
- this->ctx, delta, gf_defrag_pause_tier_timeout, this);
-
- synctask_yield(defrag->tier_conf.pause_synctask);
-
- if (gf_defrag_get_pause_state(&defrag->tier_conf) == TIER_PAUSED)
- goto out;
-
- gf_defrag_set_pause_state(&defrag->tier_conf, TIER_RUNNING);
-
- ret = -1;
-out:
-
- gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_TIER_PAUSED,
- "Pause tiering ret=%d", ret);
-
- return ret;
-}
-
-int
-gf_defrag_resume_tier(xlator_t *this, gf_defrag_info_t *defrag)
-{
- gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_TIER_RESUME,
- "Pause end. Resume tiering");
-
- gf_defrag_set_pause_state(&defrag->tier_conf, TIER_RUNNING);
-
- gf_event(EVENT_TIER_RESUME, "vol=%s", defrag->tier_conf.volname);
-
- return 0;
-}
-
-int
-gf_defrag_start_detach_tier(gf_defrag_info_t *defrag)
-{
- defrag->cmd = GF_DEFRAG_CMD_START_DETACH_TIER;
-
- return 0;
-}
-
int
gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output)
{
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index 45808a2bfa6..d9dbf50492f 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -11,11 +11,9 @@
/* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should
* delete the newpath if it gets EEXISTS from link() call.
*/
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
#include "dht-lock.h"
-#include "defaults.h"
+#include <glusterfs/defaults.h>
int
dht_rename_unlock(call_frame_t *frame, xlator_t *this);
@@ -472,13 +470,14 @@ err:
* dirs are the same.
*
*/
-static void
+static int
dht_order_rename_lock(call_frame_t *frame, loc_t **loc, xlator_t **subvol)
{
int ret = 0;
+ int op_ret = 0;
dht_local_t *local = NULL;
- char src[GF_UUID_BNAME_BUF_SIZE] = {0};
- char dst[GF_UUID_BNAME_BUF_SIZE] = {0};
+ char *src = NULL;
+ char *dst = NULL;
local = frame->local;
@@ -491,17 +490,38 @@ dht_order_rename_lock(call_frame_t *frame, loc_t **loc, xlator_t **subvol)
if (ret == 0) {
/* hashed subvols are the same for src and dst */
/* Entrylks need to be ordered*/
- if (local->loc.pargfid)
+
+ src = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc.name) + 1);
+ if (!src) {
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "Insufficient memory for src");
+ op_ret = -1;
+ goto out;
+ }
+
+ if (!gf_uuid_is_null(local->loc.pargfid))
uuid_utoa_r(local->loc.pargfid, src);
else if (local->loc.parent)
uuid_utoa_r(local->loc.parent->gfid, src);
+ else
+ src[0] = '\0';
strcat(src, local->loc.name);
- if (local->loc2.pargfid)
+ dst = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc2.name) + 1);
+ if (!dst) {
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "Insufficient memory for dst");
+ op_ret = -1;
+ goto out;
+ }
+
+ if (!gf_uuid_is_null(local->loc2.pargfid))
uuid_utoa_r(local->loc2.pargfid, dst);
else if (local->loc2.parent)
uuid_utoa_r(local->loc2.parent->gfid, dst);
+ else
+ dst[0] = '\0';
strcat(dst, local->loc2.name);
ret = strcmp(src, dst);
@@ -520,7 +540,10 @@ dht_order_rename_lock(call_frame_t *frame, loc_t **loc, xlator_t **subvol)
*subvol = local->dst_hashed;
}
- return;
+ op_ret = 0;
+
+out:
+ return op_ret;
}
int
@@ -561,7 +584,11 @@ dht_rename_dir(call_frame_t *frame, xlator_t *this)
* deadlocks when rename (src, dst) and rename (dst, src) is done from
* two different clients
*/
- dht_order_rename_lock(frame, &loc, &subvol);
+ ret = dht_order_rename_lock(frame, &loc, &subvol);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
/* Rename must take locks on src to avoid lookup selfheal from
* recreating src on those subvols where the rename was successful.
@@ -578,7 +605,6 @@ dht_rename_dir(call_frame_t *frame, xlator_t *this)
return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
DHT_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
NULL);
return 0;
@@ -985,9 +1011,11 @@ dht_rename_links_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
{
xlator_t *prev = NULL;
dht_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
prev = cookie;
local = frame->local;
+ main_frame = local->main_frame;
/* TODO: Handle this case in lookup-optimize */
if (op_ret == -1) {
@@ -1000,7 +1028,8 @@ dht_rename_links_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
dht_linkfile_attr_heal(frame, this);
}
- dht_rename_unlink(frame, this);
+ dht_rename_unlink(main_frame, this);
+ DHT_STACK_DESTROY(frame);
return 0;
}
@@ -1016,7 +1045,8 @@ dht_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *src_cached = NULL;
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
- loc_t link_loc = {0};
+ call_frame_t *link_frame = NULL;
+ dht_local_t *link_local = NULL;
local = frame->local;
prev = cookie;
@@ -1086,18 +1116,36 @@ dht_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
/* Create the linkto file for the dst file */
if ((src_cached == dst_cached) && (dst_hashed != dst_cached)) {
- loc_copy(&link_loc, &local->loc2);
- if (link_loc.inode)
- inode_unref(link_loc.inode);
- link_loc.inode = inode_ref(local->loc.inode);
- gf_uuid_copy(local->gfid, local->loc.inode->gfid);
- gf_uuid_copy(link_loc.gfid, local->loc.inode->gfid);
-
- dht_linkfile_create(frame, dht_rename_links_create_cbk, this,
- src_cached, dst_hashed, &link_loc);
+ link_frame = copy_frame(frame);
+ if (!link_frame) {
+ goto unlink;
+ }
+
+ /* fop value sent as maxvalue because it is not used
+ * anywhere in this case */
+ link_local = dht_local_init(link_frame, &local->loc2, NULL,
+ GF_FOP_MAXVALUE);
+ if (!link_local) {
+ goto unlink;
+ }
+
+ if (link_local->loc.inode)
+ inode_unref(link_local->loc.inode);
+ link_local->loc.inode = inode_ref(local->loc.inode);
+ link_local->main_frame = frame;
+ link_local->stbuf = local->stbuf;
+ gf_uuid_copy(link_local->gfid, local->loc.inode->gfid);
+
+ dht_linkfile_create(link_frame, dht_rename_links_create_cbk, this,
+ src_cached, dst_hashed, &link_local->loc);
return 0;
}
+unlink:
+
+ if (link_frame) {
+ DHT_STACK_DESTROY(link_frame);
+ }
dht_rename_unlink(frame, this);
return 0;
@@ -1605,7 +1653,11 @@ dht_rename_file_protect_namespace(call_frame_t *frame, void *cookie,
* deadlocks when rename (src, dst) and rename (dst, src) is done from
* two different clients
*/
- dht_order_rename_lock(frame, &loc, &subvol);
+ ret = dht_order_rename_lock(frame, &loc, &subvol);
+ if (ret) {
+ local->op_errno = ENOMEM;
+ goto err;
+ }
ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
dht_rename_file_lock1_cbk);
@@ -1828,33 +1880,14 @@ dht_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
int op_errno = -1;
int ret = -1;
dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
char gfid[GF_UUID_BUF_SIZE] = {0};
char newgfid[GF_UUID_BUF_SIZE] = {0};
- gf_boolean_t free_xdata = _gf_false;
VALIDATE_OR_GOTO(frame, err);
VALIDATE_OR_GOTO(this, err);
VALIDATE_OR_GOTO(oldloc, err);
VALIDATE_OR_GOTO(newloc, err);
- conf = this->private;
-
- if (conf->subvolume_cnt == 1) {
- if (!IA_ISDIR(oldloc->inode->ia_type)) {
- if (!xdata) {
- free_xdata = _gf_true;
- }
- DHT_CHANGELOG_TRACK_AS_RENAME(xdata, oldloc, newloc);
- }
- default_rename(frame, this, oldloc, newloc, xdata);
- if (free_xdata && xdata) {
- dict_unref(xdata);
- xdata = NULL;
- }
- return 0;
- }
-
gf_uuid_unparse(oldloc->inode->gfid, gfid);
src_hashed = dht_subvol_get_hashed(this, oldloc);
@@ -1941,3 +1974,24 @@ err:
return 0;
}
+
+int
+dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ gf_boolean_t free_xdata = _gf_false;
+
+ /* Just a pass through */
+ if (!IA_ISDIR(oldloc->inode->ia_type)) {
+ if (!xdata) {
+ free_xdata = _gf_true;
+ }
+ DHT_CHANGELOG_TRACK_AS_RENAME(xdata, oldloc, newloc);
+ }
+ default_rename(frame, this, oldloc, newloc, xdata);
+ if (free_xdata && xdata) {
+ dict_unref(xdata);
+ xdata = NULL;
+ }
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index e17f96698bd..3e24065227c 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -8,12 +8,7 @@
cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
-#include "dht-common.h"
-#include "dht-messages.h"
#include "dht-lock.h"
-#include "glusterfs-acl.h"
#define DHT_SET_LAYOUT_RANGE(layout, i, srt, chunk, path) \
do { \
@@ -22,7 +17,7 @@
layout->list[i].commit_hash = layout->commit_hash; \
\
gf_msg_trace(this->name, 0, \
- "gave fix: %u - %u, with commit-hash %u" \
+ "gave fix: 0x%x - 0x%x, with commit-hash 0x%x" \
" on %s for %s", \
layout->list[i].start, layout->list[i].stop, \
layout->list[i].commit_hash, \
@@ -38,7 +33,7 @@
} \
} while (0)
-int
+static int
dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout,
gf_boolean_t newdir, dht_selfheal_layout_t healer,
dht_need_heal_t should_heal);
@@ -149,8 +144,8 @@ dht_refresh_layout_done(call_frame_t *frame)
ret = dht_layout_sort(refreshed);
if (ret == -1) {
- gf_msg(frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SORT_FAILED,
- "sorting the layout failed");
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SORT_FAILED, NULL);
goto err;
}
@@ -206,10 +201,9 @@ dht_refresh_layout_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1) {
gf_uuid_unparse(local->loc.gfid, gfid);
local->op_errno = op_errno;
- gf_msg(this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_FILE_LOOKUP_FAILED,
- "lookup of %s on %s returned error, gfid: %s",
- local->loc.path, prev->name, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_FILE_LOOKUP_FAILED, "path=%s", local->loc.path,
+ "name=%s", prev->name, "gfid=%s", gfid, NULL);
goto unlock;
}
@@ -270,9 +264,8 @@ dht_refresh_layout(call_frame_t *frame)
conf->subvolume_cnt);
if (!local->selfheal.refreshed_layout) {
gf_uuid_unparse(local->loc.gfid, gfid);
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
- "mem allocation for layout failed, path:%s gfid:%s",
- local->loc.path, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+ "path=%s", local->loc.path, "gfid=%s", gfid, NULL);
goto out;
}
@@ -284,9 +277,8 @@ dht_refresh_layout(call_frame_t *frame)
gf_uuid_unparse(local->loc.gfid, gfid);
local->xattr_req = dict_new();
if (local->xattr_req == NULL) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
- "dict mem allocation failed, path:%s gfid:%s",
- local->loc.path, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "path=%s", local->loc.path, "gfid=%s", gfid, NULL);
goto out;
}
}
@@ -294,9 +286,9 @@ dht_refresh_layout(call_frame_t *frame)
if (dict_get(local->xattr_req, conf->xattr_name) == 0) {
ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value:key = %s",
- local->loc.path, conf->xattr_name);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", local->loc.path, "key=%s", conf->xattr_name,
+ NULL);
}
for (i = 0; i < call_cnt; i++) {
@@ -349,7 +341,7 @@ dht_should_heal_layout(call_frame_t *frame, dht_layout_t **heal,
{
gf_boolean_t fixit = _gf_true;
dht_local_t *local = NULL;
- int ret = -1, heal_missing_dirs = 0;
+ int heal_missing_dirs = 0;
local = frame->local;
@@ -357,14 +349,11 @@ dht_should_heal_layout(call_frame_t *frame, dht_layout_t **heal,
(*ondisk == NULL))
goto out;
- ret = dht_layout_anomalies(
+ dht_layout_anomalies(
frame->this, &local->loc, *ondisk, &local->selfheal.hole_cnt,
&local->selfheal.overlaps_cnt, &local->selfheal.missing_cnt,
&local->selfheal.down, &local->selfheal.misc, NULL);
- if (ret < 0)
- goto out;
-
/* Directories might've been created as part of this self-heal. We've to
* sync non-layout xattrs and set range 0-0 on new directories
*/
@@ -485,7 +474,6 @@ dht_should_fix_layout(call_frame_t *frame, dht_layout_t **inmem,
dht_local_t *local = NULL;
int layout_span = 0;
int decommissioned_bricks = 0;
- int ret = 0;
dht_conf_t *conf = NULL;
dht_distribution_type_t inmem_dist_type = 0;
dht_distribution_type_t ondisk_dist_type = 0;
@@ -498,14 +486,10 @@ dht_should_fix_layout(call_frame_t *frame, dht_layout_t **inmem,
(*ondisk == NULL))
goto out;
- ret = dht_layout_anomalies(
- frame->this, &local->loc, *ondisk, &local->selfheal.hole_cnt,
- &local->selfheal.overlaps_cnt, NULL, &local->selfheal.down,
- &local->selfheal.misc, NULL);
- if (ret < 0) {
- fixit = _gf_false;
- goto out;
- }
+ dht_layout_anomalies(frame->this, &local->loc, *ondisk,
+ &local->selfheal.hole_cnt,
+ &local->selfheal.overlaps_cnt, NULL,
+ &local->selfheal.down, &local->selfheal.misc, NULL);
if (local->selfheal.down || local->selfheal.misc) {
fixit = _gf_false;
@@ -537,7 +521,7 @@ out:
return fixit;
}
-int
+static int
dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout,
gf_boolean_t newdir, dht_selfheal_layout_t healer,
dht_need_heal_t should_heal)
@@ -569,10 +553,8 @@ dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout,
lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char);
if (lk_array == NULL) {
gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
- gf_msg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
- "mem allocation failed for "
- "lk_array, gfid:%s path: %s",
- gfid, local->loc.path);
+ gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+ "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL);
goto err;
}
@@ -582,10 +564,9 @@ dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout,
DHT_LAYOUT_HEAL_DOMAIN, NULL, FAIL_ON_ANY_ERROR);
if (lk_array[i] == NULL) {
gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
- gf_msg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
- "mem allocation "
- "failed for lk_array, gfid:%s path:%s",
- gfid, local->loc.path);
+ gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM,
+ DHT_MSG_MEM_ALLOC_FAILED, "lk_array-gfid=%s", gfid,
+ "path=%s", local->loc.path, NULL);
goto err;
}
}
@@ -594,10 +575,8 @@ dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout,
lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char);
if (lk_array == NULL) {
gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
- gf_msg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
- "mem allocation failed for "
- "lk_array, gfid:%s path:%s",
- gfid, local->loc.path);
+ gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+ "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL);
goto err;
}
@@ -606,10 +585,8 @@ dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout,
NULL, FAIL_ON_ANY_ERROR);
if (lk_array[0] == NULL) {
gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
- gf_msg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
- "mem allocation failed for "
- "lk_array, gfid:%s path:%s",
- gfid, local->loc.path);
+ gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+ "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL);
goto err;
}
}
@@ -635,7 +612,7 @@ err:
return -1;
}
-int
+static int
dht_selfheal_dir_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
@@ -657,10 +634,9 @@ dht_selfheal_dir_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
err = 0;
} else {
gf_uuid_unparse(local->loc.gfid, gfid);
- gf_msg(this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
- "layout setxattr failed on %s, path:%s gfid:%s", subvol->name,
- local->loc.path, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "name=%s", subvol->name,
+ "path=%s", local->loc.path, "gfid=%s", gfid, NULL);
err = op_errno;
}
@@ -707,7 +683,7 @@ dht_set_user_xattr(dict_t *dict, char *k, data_t *v, void *data)
return ret;
}
-int
+static int
dht_selfheal_dir_xattr_persubvol(call_frame_t *frame, loc_t *loc,
dht_layout_t *layout, int i,
xlator_t *req_subvol)
@@ -749,19 +725,17 @@ dht_selfheal_dir_xattr_persubvol(call_frame_t *frame, loc_t *loc,
ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
if (ret < 0) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value: key = %s,"
- " gfid = %s",
- loc->path, GLUSTERFS_INTERNAL_FOP_KEY, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY,
+ "gfid=%s", gfid, NULL);
goto err;
}
ret = dict_set_int8(xdata, DHT_IATT_IN_XDATA_KEY, 1);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value: key = %s,"
- " gfid = %s",
- loc->path, DHT_IATT_IN_XDATA_KEY, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", DHT_IATT_IN_XDATA_KEY,
+ "gfid=%s", gfid, NULL);
goto err;
}
@@ -769,27 +743,27 @@ dht_selfheal_dir_xattr_persubvol(call_frame_t *frame, loc_t *loc,
ret = dht_disk_layout_extract(this, layout, i, &disk_layout);
if (ret == -1) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
- "Directory self heal xattr failed:"
- " %s: (subvol %s) Failed to extract disk layout,"
- " gfid = %s",
- loc->path, subvol->name, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+ "extract-disk-layout-failed, path=%s", loc->path, "subvol=%s",
+ subvol->name, "gfid=%s", gfid, NULL);
goto err;
}
ret = dict_set_bin(xattr, conf->xattr_name, disk_layout, 4 * 4);
if (ret == -1) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
- "Directory self heal xattr failed:"
- "%s: (subvol %s) Failed to set xattr dictionary,"
- " gfid = %s",
- loc->path, subvol->name, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "path=%s", loc->path,
+ "subvol=%s", subvol->name,
+ "set-xattr-dictionary-failed"
+ "gfid=%s",
+ gfid, NULL);
goto err;
}
disk_layout = NULL;
gf_msg_trace(this->name, 0,
- "setting hash range %u - %u (type %d) on subvolume %s"
+ "setting hash range 0x%x - 0x%x (type %d) on subvolume %s"
" for %s",
layout->list[i].start, layout->list[i].stop, layout->type,
subvol->name, loc->path);
@@ -799,20 +773,17 @@ dht_selfheal_dir_xattr_persubvol(call_frame_t *frame, loc_t *loc,
if (data) {
ret = dict_add(xattr, QUOTA_LIMIT_KEY, data);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value:"
- " key = %s",
- loc->path, QUOTA_LIMIT_KEY);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", QUOTA_LIMIT_KEY, NULL);
}
}
data = dict_get(local->xattr, QUOTA_LIMIT_OBJECTS_KEY);
if (data) {
ret = dict_add(xattr, QUOTA_LIMIT_OBJECTS_KEY, data);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value:"
- " key = %s",
- loc->path, QUOTA_LIMIT_OBJECTS_KEY);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", QUOTA_LIMIT_OBJECTS_KEY,
+ NULL);
}
}
}
@@ -841,7 +812,7 @@ err:
return 0;
}
-int
+static int
dht_fix_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
{
dht_local_t *local = NULL;
@@ -890,7 +861,7 @@ out:
return 0;
}
-int
+static int
dht_selfheal_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
{
dht_local_t *local = NULL;
@@ -950,9 +921,8 @@ dht_selfheal_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
dummy = dht_layout_new(this, 1);
if (!dummy) {
gf_uuid_unparse(loc->gfid, gfid);
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
- "failed to allocate dummy layout, path:%s gfid:%s", loc->path,
- gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DUMMY_ALLOC_FAILED,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
goto out;
}
for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) {
@@ -968,38 +938,6 @@ out:
return 0;
}
-gf_boolean_t
-dht_is_subvol_part_of_layout(dht_layout_t *layout, xlator_t *xlator)
-{
- int i = 0;
- gf_boolean_t ret = _gf_false;
-
- for (i = 0; i < layout->cnt; i++) {
- if (!strcmp(layout->list[i].xlator->name, xlator->name)) {
- ret = _gf_true;
- break;
- }
- }
-
- return ret;
-}
-
-int
-dht_layout_index_from_conf(dht_layout_t *layout, xlator_t *xlator)
-{
- int i = -1;
- int j = 0;
-
- for (j = 0; j < layout->cnt; j++) {
- if (!strcmp(layout->list[j].xlator->name, xlator->name)) {
- i = j;
- break;
- }
- }
-
- return i;
-}
-
int
dht_selfheal_dir_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *statpre,
@@ -1041,18 +979,27 @@ dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
int missing_attr = 0;
int i = 0, ret = -1;
dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
xlator_t *this = NULL;
int cnt = 0;
local = frame->local;
this = frame->this;
+ conf = this->private;
+
+ /* We need to heal the attrs if:
+ * 1. Any directories were missing - the newly created dirs will need
+ * to have the correct attrs set
+ * 2. An existing dir does not have the correct permissions -they may
+ * have been changed when a brick was down.
+ */
for (i = 0; i < layout->cnt; i++) {
if (layout->list[i].err == -1)
missing_attr++;
}
- if (missing_attr == 0) {
+ if ((missing_attr == 0) && (local->need_attrheal == 0)) {
if (!local->heal_layout) {
gf_msg_trace(this->name, 0, "Skip heal layout for %s gfid = %s ",
loc->path, uuid_utoa(loc->gfid));
@@ -1070,25 +1017,18 @@ dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
return 0;
}
- local->call_cnt = missing_attr;
- cnt = layout->cnt;
+ cnt = local->call_cnt = conf->subvolume_cnt;
for (i = 0; i < cnt; i++) {
- if (layout->list[i].err == -1) {
- gf_msg_trace(this->name, 0, "%s: setattr on subvol %s, gfid = %s",
- loc->path, layout->list[i].xlator->name,
- uuid_utoa(loc->gfid));
-
- STACK_WIND(
- frame, dht_selfheal_dir_setattr_cbk, layout->list[i].xlator,
- layout->list[i].xlator->fops->setattr, loc, stbuf, valid, NULL);
- }
+ STACK_WIND(frame, dht_selfheal_dir_setattr_cbk, layout->list[i].xlator,
+ layout->list[i].xlator->fops->setattr, loc, stbuf, valid,
+ NULL);
}
return 0;
}
-int
+static int
dht_selfheal_dir_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
struct iatt *stbuf, struct iatt *preparent,
@@ -1118,11 +1058,10 @@ dht_selfheal_dir_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret) {
gf_uuid_unparse(local->loc.gfid, gfid);
- gf_msg(this->name,
- ((op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_WARNING), op_errno,
- DHT_MSG_DIR_SELFHEAL_FAILED,
- "Directory selfheal failed: path = %s, gfid = %s",
- local->loc.path, gfid);
+ gf_smsg(this->name,
+ ((op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_WARNING),
+ op_errno, DHT_MSG_DIR_SELFHEAL_FAILED, "path=%s",
+ local->loc.path, "gfid=%s", gfid, NULL);
goto out;
}
dht_iatt_merge(this, &local->preparent, preparent);
@@ -1141,89 +1080,7 @@ out:
return 0;
}
-void
-dht_selfheal_dir_mkdir_setacl(dict_t *xattr, dict_t *dict)
-{
- data_t *acl_default = NULL;
- data_t *acl_access = NULL;
- xlator_t *this = NULL;
- int ret = -1;
-
- GF_ASSERT(xattr);
- GF_ASSERT(dict);
-
- this = THIS;
- GF_ASSERT(this);
-
- acl_default = dict_get(xattr, POSIX_ACL_DEFAULT_XATTR);
-
- if (!acl_default) {
- gf_msg_debug(this->name, 0, "ACL_DEFAULT xattr not present");
- goto cont;
- }
- ret = dict_set(dict, POSIX_ACL_DEFAULT_XATTR, acl_default);
- if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value.key = %s",
- POSIX_ACL_DEFAULT_XATTR);
-cont:
- acl_access = dict_get(xattr, POSIX_ACL_ACCESS_XATTR);
- if (!acl_access) {
- gf_msg_debug(this->name, 0, "ACL_ACCESS xattr not present");
- goto out;
- }
- ret = dict_set(dict, POSIX_ACL_ACCESS_XATTR, acl_access);
- if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value.key = %s",
- POSIX_ACL_ACCESS_XATTR);
-
-out:
- return;
-}
-
-void
-dht_selfheal_dir_mkdir_setquota(dict_t *src, dict_t *dst)
-{
- data_t *quota_limit_key = NULL;
- data_t *quota_limit_obj_key = NULL;
- xlator_t *this = NULL;
- int ret = -1;
-
- GF_ASSERT(src);
- GF_ASSERT(dst);
-
- this = THIS;
- GF_ASSERT(this);
-
- quota_limit_key = dict_get(src, QUOTA_LIMIT_KEY);
- if (!quota_limit_key) {
- gf_msg_debug(this->name, 0, "QUOTA_LIMIT_KEY xattr not present");
- goto cont;
- }
- ret = dict_set(dst, QUOTA_LIMIT_KEY, quota_limit_key);
- if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value.key = %s", QUOTA_LIMIT_KEY);
-
-cont:
- quota_limit_obj_key = dict_get(src, QUOTA_LIMIT_OBJECTS_KEY);
- if (!quota_limit_obj_key) {
- gf_msg_debug(this->name, 0,
- "QUOTA_LIMIT_OBJECTS_KEY xattr not present");
- goto out;
- }
- ret = dict_set(dst, QUOTA_LIMIT_OBJECTS_KEY, quota_limit_obj_key);
- if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value.key = %s",
- QUOTA_LIMIT_OBJECTS_KEY);
-
-out:
- return;
-}
-
-int
+static int
dht_selfheal_dir_mkdir_lookup_done(call_frame_t *frame, xlator_t *this)
{
dht_local_t *local = NULL;
@@ -1247,10 +1104,8 @@ dht_selfheal_dir_mkdir_lookup_done(call_frame_t *frame, xlator_t *this)
ret = dict_set_gfuuid(dict, "gfid-req", local->gfid, true);
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value:"
- " key = gfid-req",
- loc->path);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=gfid-req", NULL);
} else if (local->params) {
/* Send the dictionary from higher layers directly */
@@ -1262,18 +1117,15 @@ dht_selfheal_dir_mkdir_lookup_done(call_frame_t *frame, xlator_t *this)
dht_dir_set_heal_xattr(this, local, dict, local->xattr, NULL, NULL);
if (!dict) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "dict is NULL, need to make sure gfids are same");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_IS_NULL, NULL);
dict = dict_new();
if (!dict)
return -1;
}
ret = dict_set_flag(dict, GF_INTERNAL_CTX_KEY, GF_DHT_HEAL_DIR);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value for"
- " key = %s at path: %s",
- GF_INTERNAL_CTX_KEY, loc->path);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "key=%s",
+ GF_INTERNAL_CTX_KEY, "path=%s", loc->path, NULL);
/* We can still continue. As heal can still happen
* unless quota limits have reached for the dir.
*/
@@ -1305,7 +1157,7 @@ err:
return 0;
}
-int
+static int
dht_selfheal_dir_mkdir_lookup_cbk(call_frame_t *frame, void *cookie,
xlator_t *this, int op_ret, int op_errno,
inode_t *inode, struct iatt *stbuf,
@@ -1316,45 +1168,54 @@ dht_selfheal_dir_mkdir_lookup_cbk(call_frame_t *frame, void *cookie,
int this_call_cnt = 0;
int missing_dirs = 0;
dht_layout_t *layout = NULL;
- dht_conf_t *conf = 0;
+ xlator_t *prev = 0;
loc_t *loc = NULL;
- int check_mds = 0;
- int errst = 0;
- int32_t mds_xattr_val[1] = {0};
char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ int index = -1;
VALIDATE_OR_GOTO(this->private, err);
local = frame->local;
layout = local->layout;
loc = &local->loc;
- conf = this->private;
+ prev = cookie;
- if (local->gfid)
+ if (!gf_uuid_is_null(local->gfid))
gf_uuid_unparse(local->gfid, gfid_local);
- this_call_cnt = dht_frame_return(frame);
-
LOCK(&frame->lock);
{
+ index = dht_layout_index_for_subvol(layout, prev);
if ((op_ret < 0) && (op_errno == ENOENT || op_errno == ESTALE)) {
local->selfheal.hole_cnt = !local->selfheal.hole_cnt
? 1
: local->selfheal.hole_cnt + 1;
+ /* the status might have changed. Update the layout with the
+ * new status
+ */
+ if (index >= 0) {
+ layout->list[index].err = op_errno;
+ }
}
if (!op_ret) {
dht_iatt_merge(this, &local->stbuf, stbuf);
- }
- check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
- mds_xattr_val, 1, &errst);
- if (dict_get(xattr, conf->mds_xattr_key) && check_mds && !errst) {
- dict_unref(local->xattr);
- local->xattr = dict_ref(xattr);
+ if (prev == local->mds_subvol) {
+ dict_unref(local->xattr);
+ local->xattr = dict_ref(xattr);
+ }
+ /* the status might have changed. Update the layout with the
+ * new status
+ */
+ if (index >= 0) {
+ layout->list[index].err = -1;
+ }
}
}
UNLOCK(&frame->lock);
+ this_call_cnt = dht_frame_return(frame);
+
if (is_last_call(this_call_cnt)) {
if (local->selfheal.hole_cnt == layout->cnt) {
gf_msg_debug(this->name, op_errno,
@@ -1390,7 +1251,7 @@ err:
return 0;
}
-int
+static int
dht_selfheal_dir_mkdir_lock_cbk(call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret,
int32_t op_errno, dict_t *xdata)
@@ -1410,19 +1271,14 @@ dht_selfheal_dir_mkdir_lock_cbk(call_frame_t *frame, void *cookie,
local->call_cnt = conf->subvolume_cnt;
if (op_ret < 0) {
- /* We get this error when the directory entry was not created
- * on a newky attached tier subvol. Hence proceed and do mkdir
- * on the tier subvol.
- */
if (op_errno == EINVAL) {
local->call_cnt = 1;
dht_selfheal_dir_mkdir_lookup_done(frame, this);
return 0;
}
- gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_ENTRYLK_ERROR,
- "acquiring entrylk after inodelk failed for %s",
- local->loc.path);
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_ENTRYLK_ERROR,
+ "path=%s", local->loc.path, NULL);
local->op_errno = op_errno;
goto err;
@@ -1436,10 +1292,8 @@ dht_selfheal_dir_mkdir_lock_cbk(call_frame_t *frame, void *cookie,
ret = dict_set_int32(local->xattr_req, "list-xattr", 1);
if (ret)
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary key list-xattr value "
- " for path %s ",
- local->loc.path);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "path=%s",
+ local->loc.path, NULL);
for (i = 0; i < conf->subvolume_cnt; i++) {
if (mds_subvol && conf->subvolumes[i] == mds_subvol) {
@@ -1462,18 +1316,21 @@ err:
return 0;
}
-int
+static int
dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout,
int force)
{
int missing_dirs = 0;
int i = 0;
+ int op_errno = 0;
int ret = -1;
dht_local_t *local = NULL;
xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
local = frame->local;
this = frame->this;
+ conf = this->private;
local->selfheal.force_mkdir = force;
local->selfheal.hole_cnt = 0;
@@ -1484,16 +1341,18 @@ dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout,
}
if (missing_dirs == 0) {
+ /* We don't need to create any directories. Proceed to heal the
+ * attrs and xattrs
+ */
if (!__is_root_gfid(local->stbuf.ia_gfid)) {
if (local->need_xattr_heal) {
local->need_xattr_heal = 0;
- ret = dht_dir_xattr_heal(this, local);
- if (ret)
- gf_msg(this->name, GF_LOG_ERROR, ret,
- DHT_MSG_DIR_XATTR_HEAL_FAILED,
- "xattr heal failed for "
- "directory %s gfid %s ",
- local->loc.path, local->gfid);
+ ret = dht_dir_xattr_heal(this, local, &op_errno);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s",
+ local->loc.path, "gfid=%s", local->gfid, NULL);
+ }
} else {
if (!gf_uuid_is_null(local->gfid))
gf_uuid_copy(loc->gfid, local->gfid);
@@ -1502,28 +1361,53 @@ dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout,
if (!ret)
return 0;
- gf_msg(this->name, GF_LOG_INFO, 0,
- DHT_MSG_DIR_XATTR_HEAL_FAILED,
- "Failed to set mds xattr "
- "for directory %s gfid %s ",
- local->loc.path, local->gfid);
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SET_XATTR_FAILED,
+ "path=%s", local->loc.path, "gfid=%s", local->gfid,
+ NULL);
}
}
dht_selfheal_dir_setattr(frame, loc, &local->stbuf, 0xffffffff, layout);
return 0;
}
- if (local->hashed_subvol == NULL)
- local->hashed_subvol = dht_subvol_get_hashed(this, loc);
+ /* MDS xattr is populated only while DHT is having more than one
+ subvol.In case of graph switch while adding more dht subvols need to
+ consider hash subvol as a MDS to avoid MDS check failure at the time
+ of running fop on directory
+ */
+ if (!dict_get(local->xattr, conf->mds_xattr_key) &&
+ (conf->subvolume_cnt > 1)) {
+ if (local->hashed_subvol == NULL) {
+ local->hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (local->hashed_subvol == NULL) {
+ local->op_errno = EINVAL;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s",
+ loc->pargfid, "name=%s", loc->name, "path=%s",
+ loc->path, NULL);
+ goto err;
+ }
+ }
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this,
+ local->hashed_subvol);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set hashed subvol for %s on inode vol is %s",
+ local->loc.path,
+ local->hashed_subvol ? local->hashed_subvol->name : "NULL");
+ goto err;
+ }
+ }
if (local->hashed_subvol == NULL) {
- local->op_errno = EINVAL;
- gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
- DHT_MSG_HASHED_SUBVOL_GET_FAILED,
- "(%s/%s) (path: %s): "
- "hashed subvolume not found",
- loc->pargfid, loc->name, loc->path);
- goto err;
+ local->hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (local->hashed_subvol == NULL) {
+ local->op_errno = EINVAL;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s", loc->pargfid,
+ "name=%s", loc->name, "path=%s", loc->path, NULL);
+ goto err;
+ }
}
local->current = &local->lock[0];
@@ -1539,7 +1423,7 @@ err:
return -1;
}
-int
+static int
dht_selfheal_layout_alloc_start(xlator_t *this, loc_t *loc,
dht_layout_t *layout)
{
@@ -1635,7 +1519,7 @@ dht_get_layout_count(xlator_t *this, dht_layout_t *layout, int new_layout)
/* if layout->spread_cnt is set, check if it is <= available
* subvolumes (down brick and decommissioned bricks are considered
- * un-availbale). Else return count (available up bricks) */
+ * un-available). Else return count (available up bricks) */
count = ((layout->spread_cnt && (layout->spread_cnt <= count))
? layout->spread_cnt
: ((count) ? count : 1));
@@ -1648,8 +1532,6 @@ dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc,
dht_layout_t *new_layout);
void
-dht_layout_entry_swap(dht_layout_t *layout, int i, int j);
-void
dht_layout_range_swap(dht_layout_t *layout, int i, int j);
/*
@@ -1658,7 +1540,7 @@ dht_layout_range_swap(dht_layout_t *layout, int i, int j);
*/
#define OV_ENTRY(x, y) table[x * new->cnt + y]
-void
+static void
dht_selfheal_layout_maximize_overlap(call_frame_t *frame, loc_t *loc,
dht_layout_t *new, dht_layout_t *old)
{
@@ -1735,7 +1617,7 @@ dht_selfheal_layout_maximize_overlap(call_frame_t *frame, loc_t *loc,
}
}
-dht_layout_t *
+static dht_layout_t *
dht_fix_layout_of_directory(call_frame_t *frame, loc_t *loc,
dht_layout_t *layout)
{
@@ -1745,7 +1627,6 @@ dht_fix_layout_of_directory(call_frame_t *frame, loc_t *loc,
dht_conf_t *priv = NULL;
dht_local_t *local = NULL;
uint32_t subvol_down = 0;
- int ret = 0;
gf_boolean_t maximize_overlap = _gf_true;
char gfid[GF_UUID_BUF_SIZE] = {0};
@@ -1761,22 +1642,20 @@ dht_fix_layout_of_directory(call_frame_t *frame, loc_t *loc,
new_layout = dht_layout_new(this, priv->subvolume_cnt);
if (!new_layout) {
gf_uuid_unparse(loc->gfid, gfid);
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
- "mem allocation failed for new_layout, path:%s gfid:%s",
- loc->path, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+ "new_layout, path=%s", loc->path, "gfid=%s", gfid, NULL);
goto done;
}
/* If a subvolume is down, do not re-write the layout. */
- ret = dht_layout_anomalies(this, loc, layout, NULL, NULL, NULL,
- &subvol_down, NULL, NULL);
+ dht_layout_anomalies(this, loc, layout, NULL, NULL, NULL, &subvol_down,
+ NULL, NULL);
- if (subvol_down || (ret == -1)) {
+ if (subvol_down) {
gf_uuid_unparse(loc->gfid, gfid);
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_FIX_FAILED,
- "Layout fix failed: %u subvolume(s) are down"
- ". Skipping fix layout. path:%s gfid:%s",
- subvol_down, loc->path, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+ "subvol-down=%u", subvol_down, "Skipping-fix-layout", "path=%s",
+ loc->path, "gfid=%s", gfid, NULL);
GF_FREE(new_layout);
return NULL;
}
@@ -1794,10 +1673,10 @@ dht_fix_layout_of_directory(call_frame_t *frame, loc_t *loc,
if (priv->du_stats) {
for (i = 0; i < priv->subvolume_cnt; ++i) {
- gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_SUBVOL_INFO,
- "subvolume %d (%s): %u chunks, path:%s", i,
- priv->subvolumes[i]->name, priv->du_stats[i].chunks,
- loc->path);
+ gf_smsg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_SUBVOL_INFO,
+ "index=%d", i, "name=%s", priv->subvolumes[i]->name,
+ "chunks=%u", priv->du_stats[i].chunks, "path=%s", loc->path,
+ NULL);
/* Maximize overlap if the bricks are all the same
* size.
@@ -1809,8 +1688,8 @@ dht_fix_layout_of_directory(call_frame_t *frame, loc_t *loc,
}
}
} else {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_DISK_USAGE_STATUS,
- "no du stats ?!?");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_DISK_USAGE_STATUS,
+ NULL);
}
/* First give it a layout as though it is a new directory. This
@@ -1841,7 +1720,7 @@ done:
* Having to call this 2x for each entry in the layout is pretty horrible, but
* that's what all of this layout-sorting nonsense gets us.
*/
-uint32_t
+static uint32_t
dht_get_chunks_from_xl(xlator_t *parent, xlator_t *child)
{
dht_conf_t *priv = parent->private;
@@ -1908,8 +1787,9 @@ dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc,
if (weight_by_size && total_size) {
/* We know total_size is not zero. */
chunk = ((double)0xffffffff) / ((double)total_size);
- gf_msg_debug(this->name, 0, "chunk size = 0xffffffff / %lu = %f",
- total_size, chunk);
+ gf_msg_debug(this->name, 0,
+ "chunk size = 0xffffffff / %" PRIu64 " = %f", total_size,
+ chunk);
} else {
weight_by_size = _gf_false;
chunk = ((unsigned long)0xffffffff) / bricks_to_use;
@@ -1958,7 +1838,7 @@ done:
return;
}
-int
+static int
dht_selfheal_dir_getafix(call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
{
dht_local_t *local = NULL;
@@ -2017,9 +1897,8 @@ dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
linked_inode = inode_link(loc->inode, loc->parent, loc->name,
&local->stbuf);
if (!linked_inode) {
- gf_msg(frame->this->name, GF_LOG_WARNING, 0,
- DHT_MSG_DIR_SELFHEAL_FAILED,
- "linking inode failed (%s/%s) => %s", pgfid, loc->name, gfid);
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_LINK_INODE_FAILED,
+ "pgfid=%s", pgfid, "name=%s", loc->name, "gfid=%s", gfid, NULL);
ret = -1;
goto out;
}
@@ -2077,10 +1956,10 @@ dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
loc_t *loc, dht_layout_t *layout)
{
dht_local_t *local = NULL;
+ xlator_t *this = NULL;
uint32_t down = 0;
uint32_t misc = 0;
int ret = 0;
- xlator_t *this = NULL;
char pgfid[GF_UUID_BUF_SIZE] = {0};
char gfid[GF_UUID_BUF_SIZE] = {0};
inode_t *linked_inode = NULL, *inode = NULL;
@@ -2091,6 +1970,20 @@ dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
local->selfheal.dir_cbk = dir_cbk;
local->selfheal.layout = dht_layout_ref(this, layout);
+ if (local->need_attrheal) {
+ if (__is_root_gfid(local->stbuf.ia_gfid)) {
+ local->stbuf.ia_gid = local->prebuf.ia_gid;
+ local->stbuf.ia_uid = local->prebuf.ia_uid;
+
+ local->stbuf.ia_ctime = local->prebuf.ia_ctime;
+ local->stbuf.ia_ctime_nsec = local->prebuf.ia_ctime_nsec;
+ local->stbuf.ia_prot = local->prebuf.ia_prot;
+
+ } else if (!IA_ISINVAL(local->mds_stbuf.ia_type)) {
+ local->stbuf = local->mds_stbuf;
+ }
+ }
+
if (!__is_root_gfid(local->stbuf.ia_gfid)) {
gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
gf_uuid_unparse(loc->parent->gfid, pgfid);
@@ -2098,9 +1991,9 @@ dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
linked_inode = inode_link(loc->inode, loc->parent, loc->name,
&local->stbuf);
if (!linked_inode) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_FAILED,
- "linking inode failed (%s/%s) => %s", pgfid, loc->name,
- gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LINK_INODE_FAILED,
+ "pgfid=%s", pgfid, "name=%s", loc->name, "gfid=%s", gfid,
+ NULL);
ret = 0;
goto sorry_no_fix;
}
@@ -2110,6 +2003,13 @@ dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
inode_unref(inode);
}
+ if (local->need_xattr_heal && (local->mds_xattr)) {
+ dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr,
+ NULL, NULL);
+ dict_unref(local->mds_xattr);
+ local->mds_xattr = NULL;
+ }
+
dht_layout_anomalies(this, loc, layout, &local->selfheal.hole_cnt,
&local->selfheal.overlaps_cnt,
&local->selfheal.missing_cnt, &local->selfheal.down,
@@ -2119,19 +2019,17 @@ dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
misc = local->selfheal.misc;
if (down) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_FAILED,
- "Directory selfheal failed: %d subvolumes down."
- "Not fixing. path = %s, gfid = %s",
- down, loc->path, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SELFHEAL_FAILED,
+ "path=%s", loc->path, "subvol-down=%d", down, "Not-fixing",
+ "gfid=%s", gfid, NULL);
ret = 0;
goto sorry_no_fix;
}
if (misc) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_FAILED,
- "Directory selfheal failed : %d subvolumes "
- "have unrecoverable errors. path = %s, gfid = %s",
- misc, loc->path, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SELFHEAL_FAILED,
+ "path=%s", loc->path, "misc=%d", misc, "unrecoverable-errors",
+ "gfid=%s", gfid, NULL);
ret = 0;
goto sorry_no_fix;
@@ -2217,29 +2115,28 @@ dht_dir_heal_xattrs(void *data)
gf_uuid_unparse(local->loc.gfid, gfid);
if (!mds_subvol) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_XATTR_HEAL_FAILED,
- "No mds subvol for %s gfid = %s", local->loc.path, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_MDS_SUBVOL, "path=%s",
+ local->loc.path, "gfid=%s", gfid, NULL);
goto out;
}
if ((local->loc.inode && gf_uuid_is_null(local->loc.inode->gfid)) ||
gf_uuid_is_null(local->loc.gfid)) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_XATTR_HEAL_FAILED,
- "No gfid present so skip heal for path %s gfid = %s",
- local->loc.path, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_NOT_PRESENT,
+ "skip-heal path=%s", local->loc.path, "gfid=%s", gfid, NULL);
goto out;
}
internal_xattr = dict_new();
if (!internal_xattr) {
- gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0,
- "dictionary creation failed");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED,
+ "dictionary", NULL);
goto out;
}
xdata = dict_new();
if (!xdata) {
- gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0,
- "dictionary creation failed");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED,
+ "dictionary", NULL);
goto out;
}
@@ -2247,18 +2144,17 @@ dht_dir_heal_xattrs(void *data)
user_xattr = dict_new();
if (!user_xattr) {
- gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0,
- "dictionary creation failed");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED,
+ "dictionary", NULL);
goto out;
}
ret = syncop_listxattr(local->mds_subvol, &local->loc, &mds_xattr, NULL,
NULL);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_XATTR_HEAL_FAILED,
- "failed to list xattrs for "
- "%s: on %s ",
- local->loc.path, local->mds_subvol->name);
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LIST_XATTRS_FAILED,
+ "path=%s", local->loc.path, "name=%s", local->mds_subvol->name,
+ NULL);
}
if (!mds_xattr)
@@ -2273,10 +2169,9 @@ dht_dir_heal_xattrs(void *data)
dict_get(user_xattr, QUOTA_LIMIT_OBJECTS_KEY)) {
ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value: key = %s,"
- " path = %s",
- GLUSTERFS_INTERNAL_FOP_KEY, local->loc.path);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "key=%s", GLUSTERFS_INTERNAL_FOP_KEY, "path=%s",
+ local->loc.path, NULL);
goto out;
}
}
@@ -2288,16 +2183,25 @@ dht_dir_heal_xattrs(void *data)
if (subvol == mds_subvol)
continue;
if (uret || uflag) {
+ /* Custom xattr heal is required - let posix handle it */
+ ret = dict_set_int8(xdata, "sync_backend_xattrs", _gf_true);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", local->loc.path, "key=%s",
+ "sync_backend_xattrs", NULL);
+ goto out;
+ }
+
ret = syncop_setxattr(subvol, &local->loc, user_xattr, 0, xdata,
NULL);
if (ret) {
xattr_hashed = 1;
- gf_msg(this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_DIR_XATTR_HEAL_FAILED,
- "Directory xattr heal failed. Failed to set"
- "user xattr on path %s on "
- "subvol %s, gfid = %s ",
- local->loc.path, subvol->name, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "set-user-xattr-failed path=%s", local->loc.path,
+ "subvol=%s", subvol->name, "gfid=%s", gfid, NULL);
+ } else {
+ dict_del(xdata, "sync_backend_xattrs");
}
}
}
@@ -2306,21 +2210,17 @@ dht_dir_heal_xattrs(void *data)
ret = dht_dict_set_array(internal_xattr, conf->mds_xattr_key, allzero,
1);
if (ret) {
- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s for "
- "path %s",
- conf->mds_xattr_key, local->loc.path);
+ gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "key=%s", conf->mds_xattr_key, "path=%s", local->loc.path,
+ NULL);
goto out;
}
ret = syncop_setxattr(mds_subvol, &local->loc, internal_xattr, 0, NULL,
NULL);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_DIR_XATTR_HEAL_FAILED,
- "Failed to reset internal xattr "
- "on path %s on subvol %s"
- "gfid = %s ",
- local->loc.path, mds_subvol->name, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path,
+ "subvol=%s", mds_subvol->name, "gfid=%s", gfid, NULL);
}
}
@@ -2361,18 +2261,18 @@ dht_dir_attr_heal(void *data)
frame = data;
local = frame->local;
- mds_subvol = local->mds_subvol;
this = frame->this;
GF_VALIDATE_OR_GOTO("dht", this, out);
GF_VALIDATE_OR_GOTO("dht", local, out);
conf = this->private;
GF_VALIDATE_OR_GOTO("dht", conf, out);
+ mds_subvol = local->mds_subvol;
call_cnt = conf->subvolume_cnt;
if (!__is_root_gfid(local->stbuf.ia_gfid) && (!mds_subvol)) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_ATTR_HEAL_FAILED,
- "No mds subvol for %s gfid = %s", local->loc.path, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_MDS_SUBVOL, "path=%s",
+ local->loc.path, "gfid=%s", gfid, NULL);
goto out;
}
@@ -2380,11 +2280,9 @@ dht_dir_attr_heal(void *data)
for (i = 0; i < conf->subvolume_cnt; i++) {
if (conf->subvolumes[i] == mds_subvol) {
if (!conf->subvolume_status[i]) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- DHT_MSG_HASHED_SUBVOL_DOWN,
- "mds subvol is down for path "
- " %s gfid is %s Unable to set xattr ",
- local->loc.path, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_MDS_DOWN_UNABLE_TO_SET, "path=%s",
+ local->loc.path, "gfid=%s", gfid, NULL);
goto out;
}
}
@@ -2410,10 +2308,9 @@ dht_dir_attr_heal(void *data)
if (ret) {
gf_uuid_unparse(local->loc.gfid, gfid);
- gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_ATTR_HEAL_FAILED,
- "Directory attr heal failed. Failed to set"
- " uid/gid on path %s on subvol %s, gfid = %s ",
- local->loc.path, subvol->name, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_ATTR_HEAL_FAILED, "path=%s", local->loc.path,
+ "subvol=%s", subvol->name, "gfid=%s", gfid, NULL);
}
}
out:
@@ -2428,7 +2325,7 @@ dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data)
}
/* EXIT: dht_update_commit_hash_for_layout */
-int
+static int
dht_update_commit_hash_for_layout_done(call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret,
int32_t op_errno, dict_t *xdata)
@@ -2448,7 +2345,7 @@ dht_update_commit_hash_for_layout_done(call_frame_t *frame, void *cookie,
return 0;
}
-int
+static int
dht_update_commit_hash_for_layout_unlock(call_frame_t *frame, xlator_t *this)
{
dht_local_t *local = NULL;
@@ -2466,11 +2363,8 @@ dht_update_commit_hash_for_layout_unlock(call_frame_t *frame, xlator_t *this)
local->op_ret = -1;
}
- gf_msg(this->name, GF_LOG_WARNING, errno,
- DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
- "Winding unlock failed: stale locks left on brick"
- " %s",
- local->loc.path);
+ gf_smsg(this->name, GF_LOG_WARNING, errno, DHT_MSG_WIND_UNLOCK_FAILED,
+ "path=%s", local->loc.path, NULL);
dht_update_commit_hash_for_layout_done(frame, NULL, this, 0, 0, NULL);
}
@@ -2478,7 +2372,7 @@ dht_update_commit_hash_for_layout_unlock(call_frame_t *frame, xlator_t *this)
return 0;
}
-int
+static int
dht_update_commit_hash_for_layout_cbk(call_frame_t *frame, void *cookie,
xlator_t *this, int op_ret, int op_errno,
dict_t *xdata)
@@ -2505,7 +2399,7 @@ dht_update_commit_hash_for_layout_cbk(call_frame_t *frame, void *cookie,
return 0;
}
-int
+static int
dht_update_commit_hash_for_layout_resume(call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret,
int32_t op_errno, dict_t *xdata)
@@ -2533,11 +2427,8 @@ dht_update_commit_hash_for_layout_resume(call_frame_t *frame, void *cookie,
if (!xattr) {
local->op_errno = errno;
- gf_msg(this->name, GF_LOG_WARNING, errno,
- DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
- "Directory commit hash update failed:"
- " %s: Allocation failed",
- local->loc.path);
+ gf_smsg(this->name, GF_LOG_WARNING, errno, DHT_MSG_COMMIT_HASH_FAILED,
+ "allocation-failed path=%s", local->loc.path, NULL);
goto err;
}
@@ -2548,11 +2439,10 @@ dht_update_commit_hash_for_layout_resume(call_frame_t *frame, void *cookie,
if (ret < 0) {
local->op_errno = ENOENT;
- gf_msg(this->name, GF_LOG_WARNING, 0,
- DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
- "Directory commit hash update failed:"
- " %s: (subvol %s) Failed to find disk layout",
- local->loc.path, conf->local_subvols[i]->name);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMMIT_HASH_FAILED,
+ "path=%s", local->loc.path, "subvol=%s",
+ conf->local_subvols[i]->name, "find-disk-layout-failed",
+ NULL);
goto err;
}
@@ -2566,12 +2456,10 @@ dht_update_commit_hash_for_layout_resume(call_frame_t *frame, void *cookie,
if (ret == -1) {
local->op_errno = errno;
- gf_msg(this->name, GF_LOG_WARNING, errno,
- DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
- "Directory commit hash update failed:"
- " %s: (subvol %s) Failed to extract disk"
- " layout",
- local->loc.path, conf->local_subvols[i]->name);
+ gf_smsg(this->name, GF_LOG_WARNING, errno,
+ DHT_MSG_COMMIT_HASH_FAILED, "path=%s", local->loc.path,
+ "subvol=%s", conf->local_subvols[i]->name,
+ "extract-disk-layout-failed", NULL);
goto err;
}
@@ -2580,11 +2468,9 @@ dht_update_commit_hash_for_layout_resume(call_frame_t *frame, void *cookie,
if (!xattr[i]) {
local->op_errno = errno;
- gf_msg(this->name, GF_LOG_WARNING, errno,
- DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
- "Directory commit hash update failed:"
- " %s: Allocation failed",
- local->loc.path);
+ gf_smsg(this->name, GF_LOG_WARNING, errno,
+ DHT_MSG_COMMIT_HASH_FAILED, "path=%s Allocation-failed",
+ local->loc.path, NULL);
goto err;
}
@@ -2593,12 +2479,10 @@ dht_update_commit_hash_for_layout_resume(call_frame_t *frame, void *cookie,
if (ret != 0) {
local->op_errno = ENOMEM;
- gf_msg(this->name, GF_LOG_WARNING, 0,
- DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
- "Directory self heal xattr failed:"
- "%s: (subvol %s) Failed to set xattr"
- " dictionary,",
- local->loc.path, conf->local_subvols[i]->name);
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "path=%s",
+ local->loc.path, "subvol=%s", conf->local_subvols[i]->name,
+ "set-xattr-failed", NULL);
goto err;
}
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 55e1fcf953c..bb72b0ffbb5 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -9,7 +9,7 @@
*/
/* TODO: add NS locking */
-#include "statedump.h"
+#include <glusterfs/statedump.h>
#include "dht-common.h"
#include "dht-messages.h"
@@ -17,35 +17,14 @@
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
-#define GF_DECIDE_DEFRAG_THROTTLE_COUNT(throttle_count, conf) \
- { \
- pthread_mutex_lock(&conf->defrag->dfq_mutex); \
- \
- if (!strcasecmp(conf->dthrottle, "lazy")) \
- conf->defrag->recon_thread_count = 1; \
- \
- throttle_count = MAX((sysconf(_SC_NPROCESSORS_ONLN) - 4), 4); \
- \
- if (!strcasecmp(conf->dthrottle, "normal")) \
- conf->defrag->recon_thread_count = (throttle_count / 2); \
- \
- if (!strcasecmp(conf->dthrottle, "aggressive")) \
- conf->defrag->recon_thread_count = throttle_count; \
- \
- pthread_mutex_unlock(&conf->defrag->dfq_mutex); \
- }
-
/* TODO:
- use volumename in xattr instead of "dht"
- use NS locks
- handle all cases in self heal layout reconstruction
- complete linkfile selfheal
*/
-struct volume_options options[];
-
-extern dht_methods_t dht_methods;
-void
+static void
dht_layout_dump(dht_layout_t *layout, const char *prefix)
{
char key[GF_DUMP_MAX_BUF_LEN];
@@ -53,8 +32,6 @@ dht_layout_dump(dht_layout_t *layout, const char *prefix)
if (!layout)
goto out;
- if (!prefix)
- goto out;
gf_proc_dump_build_key(key, prefix, "cnt");
gf_proc_dump_write(key, "%d", layout->cnt);
@@ -74,9 +51,9 @@ dht_layout_dump(dht_layout_t *layout, const char *prefix)
gf_proc_dump_build_key(key, prefix, "list[%d].err", i);
gf_proc_dump_write(key, "%d", layout->list[i].err);
gf_proc_dump_build_key(key, prefix, "list[%d].start", i);
- gf_proc_dump_write(key, "%u", layout->list[i].start);
+ gf_proc_dump_write(key, "0x%x", layout->list[i].start);
gf_proc_dump_build_key(key, prefix, "list[%d].stop", i);
- gf_proc_dump_write(key, "%u", layout->list[i].stop);
+ gf_proc_dump_write(key, "0x%x", layout->list[i].stop);
if (layout->list[i].xlator) {
gf_proc_dump_build_key(key, prefix, "list[%d].xlator.type", i);
gf_proc_dump_write(key, "%s", layout->list[i].xlator->type);
@@ -153,7 +130,7 @@ dht_priv_dump(xlator_t *this)
gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_percent);
snprintf(key, sizeof(key), "du_stats[%d].avail_space", i);
- gf_proc_dump_write(key, "%lu", conf->du_stats[i].avail_space);
+ gf_proc_dump_write(key, "%" PRIu64, conf->du_stats[i].avail_space);
snprintf(key, sizeof(key), "du_stats[%d].avail_inodes", i);
gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_inodes);
@@ -163,9 +140,9 @@ dht_priv_dump(xlator_t *this)
}
}
- if (conf->last_stat_fetch.tv_sec)
+ if (conf->last_stat_fetch)
gf_proc_dump_write("last_stat_fetch", "%s",
- ctime(&conf->last_stat_fetch.tv_sec));
+ ctime(&conf->last_stat_fetch));
UNLOCK(&conf->subvolume_lock);
@@ -265,7 +242,7 @@ out:
return ret;
}
-int
+static int
dht_parse_decommissioned_bricks(xlator_t *this, dht_conf_t *conf,
const char *bricks)
{
@@ -279,6 +256,10 @@ dht_parse_decommissioned_bricks(xlator_t *this, dht_conf_t *conf,
goto out;
dup_brick = gf_strdup(bricks);
+ if (dup_brick == NULL) {
+ goto out;
+ }
+
node = strtok_r(dup_brick, ",", &tmpstr);
while (node) {
for (i = 0; i < conf->subvolume_cnt; i++) {
@@ -307,14 +288,10 @@ out:
return ret;
}
-int
+static void
dht_decommissioned_remove(xlator_t *this, dht_conf_t *conf)
{
int i = 0;
- int ret = -1;
-
- if (!conf)
- goto out;
for (i = 0; i < conf->subvolume_cnt; i++) {
if (conf->decommissioned_bricks[i]) {
@@ -322,13 +299,9 @@ dht_decommissioned_remove(xlator_t *this, dht_conf_t *conf)
conf->decommission_subvols_cnt--;
}
}
-
- ret = 0;
-out:
-
- return ret;
}
-void
+
+static void
dht_init_regex(xlator_t *this, dict_t *odict, char *name, regex_t *re,
gf_boolean_t *re_valid, dht_conf_t *conf)
{
@@ -385,7 +358,7 @@ out:
return ret;
}
-int
+static int
dht_configure_throttle(xlator_t *this, dht_conf_t *conf, char *temp_str)
{
int rebal_thread_count = 0;
@@ -402,18 +375,20 @@ dht_configure_throttle(xlator_t *this, dht_conf_t *conf, char *temp_str)
} else if ((gf_string2int(temp_str, &rebal_thread_count) == 0)) {
if ((rebal_thread_count > 0) &&
(rebal_thread_count <= MAX_REBAL_THREADS)) {
+ conf->defrag->recon_thread_count = rebal_thread_count;
+ pthread_mutex_unlock(&conf->defrag->dfq_mutex);
gf_msg(this->name, GF_LOG_INFO, 0, 0,
"rebal thread count configured to %d",
rebal_thread_count);
- conf->defrag->recon_thread_count = rebal_thread_count;
+ goto out;
} else {
+ pthread_mutex_unlock(&conf->defrag->dfq_mutex);
gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
"Invalid option: Reconfigure: "
"rebal-throttle should be "
"within range of 0 and maximum number of"
" cores available");
ret = -1;
- pthread_mutex_unlock(&conf->defrag->dfq_mutex);
goto out;
}
} else {
@@ -522,9 +497,7 @@ dht_reconfigure(xlator_t *this, dict_t *options)
if (ret == -1)
goto out;
} else {
- ret = dht_decommissioned_remove(this, conf);
- if (ret == -1)
- goto out;
+ dht_decommissioned_remove(this, conf);
}
dht_init_regex(this, options, "rsync-hash-regex", &conf->rsync_regex,
@@ -564,6 +537,8 @@ gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag,
pattern_str = strtok_r(data, ",", &tmp_str);
while (pattern_str) {
dup_str = gf_strdup(pattern_str);
+ if (!dup_str)
+ goto out;
pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1);
if (!pattern_list) {
goto out;
@@ -610,7 +585,7 @@ out:
return ret;
}
-int
+static int
dht_init_methods(xlator_t *this)
{
int ret = -1;
@@ -623,7 +598,6 @@ dht_init_methods(xlator_t *this)
methods = &(conf->methods);
methods->migration_get_dst_subvol = dht_migration_get_dst_subvol;
- methods->migration_needed = dht_migration_needed;
methods->migration_other = NULL;
methods->layout_search = dht_layout_search;
@@ -907,7 +881,7 @@ err:
return -1;
}
-struct volume_options options[] = {
+struct volume_options dht_options[] = {
{
.key = {"lookup-unhashed"},
.value = {"auto", "yes", "no", "enable", "disable", "1", "0", "on",
@@ -1072,84 +1046,6 @@ struct volume_options options[] = {
/* NUFA option */
{.key = {"local-volume-name"}, .type = GF_OPTION_TYPE_XLATOR},
- /* tier options */
- {
- .key = {"tier-pause"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
-
- {
- .key = {"tier-promote-frequency"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "120",
- },
-
- {
- .key = {"tier-demote-frequency"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "3600",
- },
-
- {
- .key = {"write-freq-threshold"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "0",
- },
-
- {
- .key = {"read-freq-threshold"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "0",
- },
- {
- .key = {"watermark-hi"},
- .type = GF_OPTION_TYPE_PERCENT,
- .default_value = "90",
- },
- {
- .key = {"watermark-low"},
- .type = GF_OPTION_TYPE_PERCENT,
- .default_value = "75",
- },
- {
- .key = {"tier-mode"},
- .type = GF_OPTION_TYPE_STR,
- .default_value = "test",
- },
- {
- .key = {"tier-compact"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
- {.key = {"tier-hot-compact-frequency"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "604800",
- .description = "Frequency to compact DBs on hot tier in system"},
- {.key = {"tier-cold-compact-frequency"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "604800",
- .description = "Frequency to compact DBs on cold tier in system"},
- {
- .key = {"tier-max-mb"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "4000",
- },
- {
- .key = {"tier-max-promote-file-size"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "0",
- },
- {
- .key = {"tier-max-files"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "10000",
- },
- {
- .key = {"tier-query-limit"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "100",
- },
/* switch option */
{.key = {"pattern.switch.case"}, .type = GF_OPTION_TYPE_ANY},
@@ -1201,3 +1097,8 @@ struct volume_options options[] = {
{.key = {NULL}},
};
+
+#define NUM_DHT_OPTIONS (sizeof(dht_options) / sizeof(dht_options[0]))
+
+extern struct volume_options options[NUM_DHT_OPTIONS]
+ __attribute__((alias("dht_options")));
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
index 677905f236e..53de8292704 100644
--- a/xlators/cluster/dht/src/dht.c
+++ b/xlators/cluster/dht/src/dht.c
@@ -8,13 +8,34 @@
cases as published by the Free Software Foundation.
*/
-#include "statedump.h"
#include "dht-common.h"
-class_methods_t class_methods = {.init = dht_init,
- .fini = dht_fini,
- .reconfigure = dht_reconfigure,
- .notify = dht_notify};
+struct xlator_fops dht_pt_fops = {
+ /* we need to keep mkdir to make sure we
+ have layout on new directory */
+ .mkdir = dht_pt_mkdir,
+ .getxattr = dht_pt_getxattr,
+ .fgetxattr = dht_pt_fgetxattr,
+
+ /* required to trace fop properly in changelog */
+ .rename = dht_pt_rename,
+
+ /* FIXME: commenting the '.lookup()' below made some of
+ the failing tests to pass. I would remove the below
+ line, but keeping it here as a reminder for people
+ to check for issues if they find concerns with DHT
+ pass-through logic */
+ /*
+ .lookup = dht_lookup,
+ .readdir = dht_readdir,
+ .readdirp = dht_readdirp,
+ */
+ /* Keeping above as commented, mainly to support the
+ usecase of a gluster volume getting to 1x(anytype),
+ due to remove-brick (shrinking) exercise. In that case,
+ we would need above fops to be available, so we can
+ handle the case of dangling linkto files (if any) */
+};
struct xlator_fops fops = {
.ipc = dht_ipc,
@@ -74,6 +95,29 @@ struct xlator_dumpops dumpops = {
.inodectx = dht_inodectx_dump,
};
-struct xlator_cbks cbks = {.release = dht_release,
- // .releasedir = dht_releasedir,
- .forget = dht_forget};
+struct xlator_cbks cbks = {
+ .release = dht_release,
+ // .releasedir = dht_releasedir,
+ .forget = dht_forget,
+};
+
+extern int32_t
+mem_acct_init(xlator_t *this);
+
+extern struct volume_options dht_options[];
+
+xlator_api_t xlator_api = {
+ .init = dht_init,
+ .fini = dht_fini,
+ .notify = dht_notify,
+ .reconfigure = dht_reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = dht_options,
+ .identifier = "distribute",
+ .pass_through_fops = &dht_pt_fops,
+ .category = GF_MAINTAINED,
+};
diff --git a/xlators/cluster/dht/src/dht.sym b/xlators/cluster/dht/src/dht.sym
deleted file mode 100644
index 780b5fc0387..00000000000
--- a/xlators/cluster/dht/src/dht.sym
+++ /dev/null
@@ -1,8 +0,0 @@
-fops
-cbks
-class_methods
-dht_methods
-options
-mem_acct_init
-reconfigure
-dumpops
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index b8077f972d1..3648a564840 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -12,7 +12,7 @@
/* TODO: all 'TODO's in dht.c holds good */
-extern struct volume_options options[];
+extern struct volume_options dht_options[];
int
nufa_local_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
@@ -595,15 +595,9 @@ nufa_init(xlator_t *this)
dht_methods_t dht_methods = {
.migration_get_dst_subvol = dht_migration_get_dst_subvol,
- .migration_needed = dht_migration_needed,
.layout_search = dht_layout_search,
};
-class_methods_t class_methods = {.init = nufa_init,
- .fini = dht_fini,
- .reconfigure = dht_reconfigure,
- .notify = dht_notify};
-
struct xlator_fops fops = {
.lookup = nufa_lookup,
.create = nufa_create,
@@ -645,3 +639,19 @@ struct xlator_fops fops = {
};
struct xlator_cbks cbks = {.forget = dht_forget};
+extern int32_t
+mem_acct_init(xlator_t *this);
+
+xlator_api_t xlator_api = {
+ .init = nufa_init,
+ .fini = dht_fini,
+ .notify = dht_notify,
+ .reconfigure = dht_reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = dht_options,
+ .identifier = "nufa",
+ .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/cluster/dht/src/nufa.sym b/xlators/cluster/dht/src/nufa.sym
deleted file mode 100644
index 780b5fc0387..00000000000
--- a/xlators/cluster/dht/src/nufa.sym
+++ /dev/null
@@ -1,8 +0,0 @@
-fops
-cbks
-class_methods
-dht_methods
-options
-mem_acct_init
-reconfigure
-dumpops
diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c
index ca9bfce5a8e..207d109a025 100644
--- a/xlators/cluster/dht/src/switch.c
+++ b/xlators/cluster/dht/src/switch.c
@@ -16,7 +16,7 @@
#include <fnmatch.h>
#include <string.h>
-extern struct volume_options options[];
+extern struct volume_options dht_options[];
struct switch_sched_array {
xlator_t *xl;
@@ -610,9 +610,15 @@ set_switch_pattern(xlator_t *this, dht_conf_t *conf, const char *pattern_str)
/* Get the pattern for considering switch case.
"option block-size *avi:10MB" etc */
option_string = gf_strdup(pattern_str);
+ if (option_string == NULL) {
+ goto err;
+ }
switch_str = strtok_r(option_string, ";", &tmp_str);
while (switch_str) {
dup_str = gf_strdup(switch_str);
+ if (dup_str == NULL) {
+ goto err;
+ }
switch_opt = GF_CALLOC(1, sizeof(struct switch_struct),
gf_switch_mt_switch_struct);
if (!switch_opt) {
@@ -647,6 +653,9 @@ set_switch_pattern(xlator_t *this, dht_conf_t *conf, const char *pattern_str)
if (childs) {
dup_childs = gf_strdup(childs);
+ if (dup_childs == NULL) {
+ goto err;
+ }
child = strtok_r(dup_childs, ",", &tmp);
while (child) {
if (gf_switch_valid_child(this, child)) {
@@ -823,11 +832,6 @@ err:
return -1;
}
-class_methods_t class_methods = {.init = switch_init,
- .fini = switch_fini,
- .reconfigure = dht_reconfigure,
- .notify = dht_notify};
-
struct xlator_fops fops = {
.lookup = switch_lookup,
.create = switch_create,
@@ -869,3 +873,19 @@ struct xlator_fops fops = {
};
struct xlator_cbks cbks = {.forget = dht_forget};
+extern int32_t
+mem_acct_init(xlator_t *this);
+
+xlator_api_t xlator_api = {
+ .init = switch_init,
+ .fini = switch_fini,
+ .notify = dht_notify,
+ .reconfigure = dht_reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = dht_options,
+ .identifier = "switch",
+ .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/cluster/dht/src/switch.sym b/xlators/cluster/dht/src/switch.sym
deleted file mode 100644
index 780b5fc0387..00000000000
--- a/xlators/cluster/dht/src/switch.sym
+++ /dev/null
@@ -1,8 +0,0 @@
-fops
-cbks
-class_methods
-dht_methods
-options
-mem_acct_init
-reconfigure
-dumpops
diff --git a/xlators/cluster/dht/src/tier-common.c b/xlators/cluster/dht/src/tier-common.c
deleted file mode 100644
index b86ed673042..00000000000
--- a/xlators/cluster/dht/src/tier-common.c
+++ /dev/null
@@ -1,1199 +0,0 @@
-/*
- Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "libxlator.h"
-#include "dht-common.h"
-#include "defaults.h"
-#include "tier-common.h"
-#include "tier.h"
-
-int
-dht_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
-
-int
-tier_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- loc_t *oldloc = NULL;
- loc_t *newloc = NULL;
-
- local = frame->local;
-
- oldloc = &local->loc;
- newloc = &local->loc2;
-
- if (op_ret == -1) {
- /* No continuation on DHT inode missing errors, as we should
- * then have a good stbuf that states P2 happened. We would
- * get inode missing if, the file completed migrated between
- * the lookup and the link call */
- goto out;
- }
-
- if (local->call_cnt != 1) {
- goto out;
- }
-
- local->call_cnt = 2;
-
- /* Do this on the hot tier now */
-
- STACK_WIND(frame, tier_link_cbk, local->cached_subvol,
- local->cached_subvol->fops->link, oldloc, newloc, xdata);
-
- return 0;
-
-out:
- DHT_STRIP_PHASE1_FLAGS(stbuf);
-
- DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
- postparent, NULL);
-
- return 0;
-}
-
-int
-tier_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
- dict_t *xdata)
-{
- xlator_t *cached_subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- int op_errno = -1;
- int ret = -1;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(oldloc, err);
- VALIDATE_OR_GOTO(newloc, err);
-
- conf = this->private;
-
- local = dht_local_init(frame, oldloc, NULL, GF_FOP_LINK);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->call_cnt = 1;
-
- cached_subvol = local->cached_subvol;
-
- if (!cached_subvol) {
- gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
- oldloc->path);
- op_errno = ENOENT;
- goto err;
- }
-
- hashed_subvol = TIER_HASHED_SUBVOL;
-
- ret = loc_copy(&local->loc2, newloc);
- if (ret == -1) {
- op_errno = ENOMEM;
- goto err;
- }
-
- if (hashed_subvol == cached_subvol) {
- STACK_WIND(frame, dht_link_cbk, cached_subvol,
- cached_subvol->fops->link, oldloc, newloc, xdata);
- return 0;
- }
-
- /* Create hardlinks to both the data file on the hot tier
- and the linkto file on the cold tier */
-
- gf_uuid_copy(local->gfid, oldloc->inode->gfid);
-
- STACK_WIND(frame, tier_link_cbk, hashed_subvol, hashed_subvol->fops->link,
- oldloc, newloc, xdata);
-
- return 0;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
-}
-
-int
-tier_create_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- dht_local_t *local = NULL;
-
- local = frame->local;
-
- if (local->params) {
- dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
- }
-
- DHT_STACK_UNWIND(create, frame, -1, local->op_errno, NULL, NULL, NULL, NULL,
- NULL, NULL);
-
- return 0;
-}
-
-int
-tier_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
-{
- xlator_t *prev = NULL;
- int ret = -1;
- dht_local_t *local = NULL;
- xlator_t *hashed_subvol = NULL;
- dht_conf_t *conf = NULL;
-
- local = frame->local;
- conf = this->private;
-
- hashed_subvol = TIER_HASHED_SUBVOL;
-
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
-
- if (op_ret == -1) {
- if (local->linked == _gf_true && local->xattr_req) {
- local->op_errno = op_errno;
- local->op_ret = op_ret;
- ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(
- local->xattr_req);
- if (ret) {
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value to "
- "unlink of migrating file");
- goto out;
- }
-
- STACK_WIND(frame, tier_create_unlink_stale_linkto_cbk,
- hashed_subvol, hashed_subvol->fops->unlink, &local->loc,
- 0, local->xattr_req);
- return 0;
- }
- goto out;
- }
-
- prev = cookie;
-
- if (local->loc.parent) {
- dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
-
- dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
- }
-
- ret = dht_layout_preset(this, prev, inode);
- if (ret != 0) {
- gf_msg_debug(this->name, 0, "could not set preset layout for subvol %s",
- prev->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
-
- local->op_errno = op_errno;
-
- if (local->linked == _gf_true) {
- local->stbuf = *stbuf;
- dht_linkfile_attr_heal(frame, this);
- }
-out:
- if (local) {
- if (local->xattr_req) {
- dict_del(local->xattr_req, TIER_LINKFILE_GFID);
- }
- }
-
- DHT_STRIP_PHASE1_FLAGS(stbuf);
-
- DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
- preparent, postparent, xdata);
-
- return 0;
-}
-
-int
-tier_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, inode_t *inode,
- struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- xlator_t *cached_subvol = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- unsigned char *gfid = NULL;
-
- local = frame->local;
- if (!local) {
- op_errno = EINVAL;
- goto err;
- }
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto err;
- }
-
- conf = this->private;
- if (!conf) {
- local->op_errno = EINVAL;
- op_errno = EINVAL;
- goto err;
- }
-
- cached_subvol = TIER_UNHASHED_SUBVOL;
-
- if (local->params) {
- dict_del(local->params, conf->link_xattr_name);
- dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
- }
-
- /*
- * We will delete the linkfile if data file creation fails.
- * When deleting this stale linkfile, there is a possibility
- * for a race between this linkfile deletion and a stale
- * linkfile deletion triggered by another lookup from different
- * client.
- *
- * For eg:
- *
- * Client 1 Client 2
- *
- * 1 linkfile created for foo
- *
- * 2 data file creation failed
- *
- * 3 creating a file with same name
- *
- * 4 lookup before creation deleted
- * the linkfile created by client1
- * considering as a stale linkfile.
- *
- * 5 New linkfile created for foo
- * with different gfid.
- *
- * 6 Trigger linkfile deletion as
- * data file creation failed.
- *
- * 7 Linkfile deleted which is
- * created by client2.
- *
- * 8 Data file created.
- *
- * With this race, we will end up having a file in a non-hashed subvol
- * without a linkfile in hashed subvol.
- *
- * To avoid this, we store the gfid of linkfile created by client, So
- * If we delete the linkfile , we validate gfid of existing file with
- * stored value from posix layer.
- *
- * Storing this value in local->xattr_req as local->params was also used
- * to create the data file. During the linkfile deletion we will use
- * local->xattr_req dictionary.
- */
- if (!local->xattr_req) {
- local->xattr_req = dict_new();
- if (!local->xattr_req) {
- local->op_errno = ENOMEM;
- op_errno = ENOMEM;
- goto err;
- }
- }
-
- gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char);
- if (!gfid) {
- local->op_errno = ENOMEM;
- op_errno = ENOMEM;
- goto err;
- }
-
- gf_uuid_copy(gfid, stbuf->ia_gfid);
- ret = dict_set_dynptr(local->xattr_req, TIER_LINKFILE_GFID, gfid,
- sizeof(uuid_t));
- if (ret) {
- GF_FREE(gfid);
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value"
- " : key = %s",
- TIER_LINKFILE_GFID);
- }
-
- STACK_WIND_COOKIE(frame, tier_create_cbk, cached_subvol, cached_subvol,
- cached_subvol->fops->create, &local->loc, local->flags,
- local->mode, local->umask, local->fd, local->params);
-
- return 0;
-err:
- DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
- NULL);
- return 0;
-}
-
-gf_boolean_t
-tier_is_hot_tier_decommissioned(xlator_t *this)
-{
- dht_conf_t *conf = NULL;
- xlator_t *hot_tier = NULL;
- int i = 0;
-
- conf = this->private;
- hot_tier = conf->subvolumes[1];
-
- if (conf->decommission_subvols_cnt) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->decommissioned_bricks[i] &&
- conf->decommissioned_bricks[i] == hot_tier)
- return _gf_true;
- }
- }
-
- return _gf_false;
-}
-
-int
-tier_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
-{
- int op_errno = -1;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- xlator_t *hot_subvol = NULL;
- xlator_t *cold_subvol = NULL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
-
- conf = this->private;
-
- dht_get_du_info(frame, this, loc);
-
- local = dht_local_init(frame, loc, fd, GF_FOP_CREATE);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- cold_subvol = TIER_HASHED_SUBVOL;
- hot_subvol = TIER_UNHASHED_SUBVOL;
-
- if (conf->subvolumes[0] != cold_subvol) {
- hot_subvol = conf->subvolumes[0];
- }
- /*
- * if hot tier full, write to cold.
- * Also if hot tier is full, create in cold
- */
- if (dht_is_subvol_filled(this, hot_subvol) ||
- tier_is_hot_tier_decommissioned(this)) {
- gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
- cold_subvol->name);
-
- STACK_WIND_COOKIE(frame, tier_create_cbk, cold_subvol, cold_subvol,
- cold_subvol->fops->create, loc, flags, mode, umask,
- fd, params);
- } else {
- local->params = dict_ref(params);
- local->flags = flags;
- local->mode = mode;
- local->umask = umask;
- local->cached_subvol = hot_subvol;
- local->hashed_subvol = cold_subvol;
-
- gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)", loc->path,
- hot_subvol->name, cold_subvol->name);
-
- dht_linkfile_create(frame, tier_create_linkfile_create_cbk, this,
- hot_subvol, cold_subvol, loc);
-
- goto out;
- }
-out:
- return 0;
-
-err:
-
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
- NULL);
-
- return 0;
-}
-
-int
-tier_unlink_nonhashed_linkfile_cbk(call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- xlator_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK(&frame->lock);
- {
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- local->op_errno = op_errno;
- local->op_ret = op_ret;
- gf_msg_debug(this->name, op_errno,
- "Unlink link: subvolume %s"
- " returned -1",
- prev->name);
- goto unlock;
- }
-
- local->op_ret = 0;
- }
-unlock:
- UNLOCK(&frame->lock);
-
- if (local->op_ret == -1)
- goto err;
- DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent, NULL);
-
- return 0;
-
-err:
- DHT_STACK_UNWIND(unlink, frame, -1, local->op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int
-tier_unlink_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *preparent, dict_t *xdata,
- struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- xlator_t *prev = NULL;
- dht_conf_t *conf = NULL;
- xlator_t *hot_subvol = NULL;
-
- local = frame->local;
- prev = cookie;
- conf = this->private;
- hot_subvol = TIER_UNHASHED_SUBVOL;
-
- if (!op_ret) {
- /*
- * linkfile present on hot tier. unlinking the linkfile
- */
- STACK_WIND_COOKIE(frame, tier_unlink_nonhashed_linkfile_cbk, hot_subvol,
- hot_subvol, hot_subvol->fops->unlink, &local->loc,
- local->flags, NULL);
- return 0;
- }
-
- LOCK(&frame->lock);
- {
- if (op_errno == ENOENT) {
- local->op_ret = 0;
- local->op_errno = op_errno;
- } else {
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- }
- gf_msg_debug(this->name, op_errno, "Lookup : subvolume %s returned -1",
- prev->name);
- }
-
- UNLOCK(&frame->lock);
-
- DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent, xdata);
-
- return 0;
-}
-
-int
-tier_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- xlator_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK(&frame->lock);
- {
- /* Ignore EINVAL for tier to ignore error when the file
- does not exist on the other tier */
- if ((op_ret == -1) && !((op_errno == ENOENT) || (op_errno == EINVAL))) {
- local->op_errno = op_errno;
- local->op_ret = op_ret;
- gf_msg_debug(this->name, op_errno,
- "Unlink link: subvolume %s"
- " returned -1",
- prev->name);
- goto unlock;
- }
-
- local->op_ret = 0;
- }
-unlock:
- UNLOCK(&frame->lock);
-
- if (local->op_ret == -1)
- goto err;
-
- DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent, xdata);
-
- return 0;
-
-err:
- DHT_STACK_UNWIND(unlink, frame, -1, local->op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-tier_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- dht_local_t *local = NULL;
- xlator_t *prev = NULL;
- struct iatt *stbuf = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- xlator_t *hot_tier = NULL;
- xlator_t *cold_tier = NULL;
-
- local = frame->local;
- prev = cookie;
- conf = this->private;
-
- cold_tier = TIER_HASHED_SUBVOL;
- hot_tier = TIER_UNHASHED_SUBVOL;
-
- LOCK(&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno == ENOENT) {
- local->op_ret = 0;
- } else {
- local->op_ret = -1;
- local->op_errno = op_errno;
- }
- gf_msg_debug(this->name, op_errno,
- "Unlink: subvolume %s returned -1"
- " with errno = %d",
- prev->name, op_errno);
- goto unlock;
- }
-
- local->op_ret = 0;
-
- local->postparent = *postparent;
- local->preparent = *preparent;
-
- if (local->loc.parent) {
- dht_inode_ctx_time_update(local->loc.parent, this,
- &local->preparent, 0);
- dht_inode_ctx_time_update(local->loc.parent, this,
- &local->postparent, 1);
- }
- }
-unlock:
- UNLOCK(&frame->lock);
-
- if (local->op_ret)
- goto out;
-
- if (cold_tier != local->cached_subvol) {
- /*
- * File is present in hot tier, so there will be
- * a link file on cold tier, deleting the linkfile
- * from cold tier
- */
- STACK_WIND_COOKIE(frame, tier_unlink_linkfile_cbk, cold_tier, cold_tier,
- cold_tier->fops->unlink, &local->loc, local->flags,
- xdata);
- return 0;
- }
-
- ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
- if (!ret && stbuf &&
- ((IS_DHT_MIGRATION_PHASE2(stbuf)) || IS_DHT_MIGRATION_PHASE1(stbuf))) {
- /*
- * File is migrating from cold to hot tier.
- * Delete the destination linkfile.
- */
- STACK_WIND_COOKIE(frame, tier_unlink_lookup_cbk, hot_tier, hot_tier,
- hot_tier->fops->lookup, &local->loc, NULL);
- return 0;
- }
-
-out:
- DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent, xdata);
-
- return 0;
-}
-
-int
-tier_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
- dict_t *xdata)
-{
- xlator_t *cached_subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- int ret = -1;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
-
- conf = this->private;
-
- local = dht_local_init(frame, loc, NULL, GF_FOP_UNLINK);
- if (!local) {
- op_errno = ENOMEM;
-
- goto err;
- }
-
- hashed_subvol = TIER_HASHED_SUBVOL;
-
- cached_subvol = local->cached_subvol;
- if (!cached_subvol) {
- gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
- loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local->flags = xflag;
- if (IA_ISREG(loc->inode->ia_type) && (hashed_subvol == cached_subvol)) {
- /*
- * File resides in cold tier. We need to stat
- * the file to see if it is being promoted.
- * If yes we need to delete the destination
- * file as well.
- *
- * Currently we are doing this check only for
- * regular files.
- */
- xdata = xdata ? dict_ref(xdata) : dict_new();
- if (xdata) {
- ret = dict_set_int8(xdata, DHT_IATT_IN_XDATA_KEY, 1);
- if (ret) {
- gf_msg_debug(this->name, 0, "Failed to set dictionary key %s",
- DHT_IATT_IN_XDATA_KEY);
- }
- }
- }
-
- /*
- * File is on hot tier, delete the data file first, then
- * linkfile from cold.
- */
- STACK_WIND_COOKIE(frame, tier_unlink_cbk, cached_subvol, cached_subvol,
- cached_subvol->fops->unlink, loc, xflag, xdata);
- if (xdata)
- dict_unref(xdata);
- return 0;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
-
- return 0;
-}
-
-int
-tier_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
-{
- gf_dirent_t entries;
- gf_dirent_t *orig_entry = NULL;
- gf_dirent_t *entry = NULL;
- int count = 0;
-
- INIT_LIST_HEAD(&entries.list);
-
- if (op_ret < 0)
- goto unwind;
-
- list_for_each_entry(orig_entry, (&orig_entries->list), list)
- {
- entry = gf_dirent_for_name(orig_entry->d_name);
- if (!entry) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
- "Memory allocation failed ");
- goto unwind;
- }
-
- entry->d_off = orig_entry->d_off;
- entry->d_ino = orig_entry->d_ino;
- entry->d_type = orig_entry->d_type;
- entry->d_len = orig_entry->d_len;
-
- list_add_tail(&entry->list, &entries.list);
- count++;
- }
- op_ret = count;
-
-unwind:
- if (op_ret < 0)
- op_ret = 0;
-
- DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
-
- gf_dirent_free(&entries);
-
- return 0;
-}
-
-int
-tier_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- gf_dirent_t entries;
- gf_dirent_t *orig_entry = NULL;
- gf_dirent_t *entry = NULL;
- xlator_t *prev = NULL;
- xlator_t *next_subvol = NULL;
- off_t next_offset = 0;
- int count = 0;
- dht_conf_t *conf = NULL;
- int ret = 0;
- inode_table_t *itable = NULL;
- inode_t *inode = NULL;
-
- INIT_LIST_HEAD(&entries.list);
- prev = cookie;
- local = frame->local;
- itable = local->fd ? local->fd->inode->table : NULL;
-
- conf = this->private;
- GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
-
- if (op_ret < 0)
- goto done;
-
- list_for_each_entry(orig_entry, (&orig_entries->list), list)
- {
- next_offset = orig_entry->d_off;
-
- if (IA_ISINVAL(orig_entry->d_stat.ia_type)) {
- /*stat failed somewhere- ignore this entry*/
- continue;
- }
-
- entry = gf_dirent_for_name(orig_entry->d_name);
- if (!entry) {
- goto unwind;
- }
-
- entry->d_off = orig_entry->d_off;
- entry->d_stat = orig_entry->d_stat;
- entry->d_ino = orig_entry->d_ino;
- entry->d_type = orig_entry->d_type;
- entry->d_len = orig_entry->d_len;
-
- if (orig_entry->dict)
- entry->dict = dict_ref(orig_entry->dict);
-
- if (check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict,
- conf->link_xattr_name)) {
- goto entries;
-
- } else if (IA_ISDIR(entry->d_stat.ia_type)) {
- if (orig_entry->inode) {
- dht_inode_ctx_time_update(orig_entry->inode, this,
- &entry->d_stat, 1);
- }
- } else {
- if (orig_entry->inode) {
- ret = dht_layout_preset(this, prev, orig_entry->inode);
- if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0,
- DHT_MSG_LAYOUT_SET_FAILED,
- "failed to link the layout "
- "in inode");
-
- entry->inode = inode_ref(orig_entry->inode);
- } else if (itable) {
- /*
- * orig_entry->inode might be null if any upper
- * layer xlators below client set to null, to
- * force a lookup on the inode even if the inode
- * is present in the inode table. In that case
- * we just update the ctx to make sure we didn't
- * missed anything.
- */
- inode = inode_find(itable, orig_entry->d_stat.ia_gfid);
- if (inode) {
- ret = dht_layout_preset(this, TIER_HASHED_SUBVOL, inode);
- if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0,
- DHT_MSG_LAYOUT_SET_FAILED,
- "failed to link the layout"
- " in inode");
- inode_unref(inode);
- inode = NULL;
- }
- }
- }
-
- entries:
- list_add_tail(&entry->list, &entries.list);
- count++;
- }
- op_ret = count;
-
-done:
- if (count == 0) {
- /* non-zero next_offset means that
- EOF is not yet hit on the current subvol
- */
- if (next_offset != 0) {
- next_subvol = prev;
- } else {
- goto unwind;
- }
-
- STACK_WIND_COOKIE(frame, tier_readdirp_cbk, next_subvol, next_subvol,
- next_subvol->fops->readdirp, local->fd, local->size,
- next_offset, local->xattr);
- return 0;
- }
-
-unwind:
- if (op_ret < 0)
- op_ret = 0;
-
- DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
-
- gf_dirent_free(&entries);
-
- return 0;
-}
-
-int
-tier_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff, int whichop, dict_t *dict)
-{
- dht_local_t *local = NULL;
- int op_errno = -1;
- xlator_t *hashed_subvol = NULL;
- int ret = 0;
- dht_conf_t *conf = NULL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(this->private, err);
-
- conf = this->private;
-
- local = dht_local_init(frame, NULL, NULL, whichop);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- local->fd = fd_ref(fd);
- local->size = size;
- local->xattr_req = (dict) ? dict_ref(dict) : NULL;
-
- hashed_subvol = TIER_HASHED_SUBVOL;
-
- /* TODO: do proper readdir */
- if (whichop == GF_FOP_READDIRP) {
- if (dict)
- local->xattr = dict_ref(dict);
- else
- local->xattr = dict_new();
-
- if (local->xattr) {
- ret = dict_set_uint32(local->xattr, conf->link_xattr_name, 256);
- if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value"
- " : key = %s",
- conf->link_xattr_name);
- }
-
- STACK_WIND_COOKIE(frame, tier_readdirp_cbk, hashed_subvol,
- hashed_subvol, hashed_subvol->fops->readdirp, fd,
- size, yoff, local->xattr);
-
- } else {
- STACK_WIND_COOKIE(frame, tier_readdir_cbk, hashed_subvol, hashed_subvol,
- hashed_subvol->fops->readdir, fd, size, yoff,
- local->xattr);
- }
-
- return 0;
-
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int
-tier_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff, dict_t *xdata)
-{
- int op = GF_FOP_READDIR;
- dht_conf_t *conf = NULL;
- int i = 0;
-
- conf = this->private;
- if (!conf)
- goto out;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!conf->subvolume_status[i]) {
- op = GF_FOP_READDIRP;
- break;
- }
- }
-
- if (conf->use_readdirp)
- op = GF_FOP_READDIRP;
-
-out:
- tier_do_readdir(frame, this, fd, size, yoff, op, 0);
- return 0;
-}
-
-int
-tier_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff, dict_t *dict)
-{
- tier_do_readdir(frame, this, fd, size, yoff, GF_FOP_READDIRP, dict);
- return 0;
-}
-
-int
-tier_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, struct statvfs *statvfs, dict_t *xdata)
-{
- gf_boolean_t event = _gf_false;
- qdstatfs_action_t action = qdstatfs_action_OFF;
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- int bsize = 0;
- int frsize = 0;
- GF_UNUSED int ret = 0;
- unsigned long new_usage = 0;
- unsigned long cur_usage = 0;
- xlator_t *prev = NULL;
- dht_conf_t *conf = NULL;
- tier_statvfs_t *tier_stat = NULL;
-
- prev = cookie;
- local = frame->local;
- GF_ASSERT(local);
-
- conf = this->private;
-
- if (xdata)
- ret = dict_get_int8(xdata, "quota-deem-statfs", (int8_t *)&event);
-
- tier_stat = &local->tier_statvfs;
-
- LOCK(&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto unlock;
- }
- if (!statvfs) {
- op_errno = EINVAL;
- local->op_ret = -1;
- goto unlock;
- }
- local->op_ret = 0;
-
- if (local->quota_deem_statfs) {
- if (event == _gf_true) {
- action = qdstatfs_action_COMPARE;
- } else {
- action = qdstatfs_action_NEGLECT;
- }
- } else {
- if (event == _gf_true) {
- action = qdstatfs_action_REPLACE;
- local->quota_deem_statfs = _gf_true;
- }
- }
-
- if (local->quota_deem_statfs) {
- switch (action) {
- case qdstatfs_action_NEGLECT:
- goto unlock;
-
- case qdstatfs_action_REPLACE:
- local->statvfs = *statvfs;
- goto unlock;
-
- case qdstatfs_action_COMPARE:
- new_usage = statvfs->f_blocks - statvfs->f_bfree;
- cur_usage = local->statvfs.f_blocks -
- local->statvfs.f_bfree;
-
- /* Take the max of the usage from subvols */
- if (new_usage >= cur_usage)
- local->statvfs = *statvfs;
- goto unlock;
-
- default:
- break;
- }
- }
-
- if (local->statvfs.f_bsize != 0) {
- bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
- frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
- dht_normalize_stats(&local->statvfs, bsize, frsize);
- dht_normalize_stats(statvfs, bsize, frsize);
- } else {
- local->statvfs.f_bsize = statvfs->f_bsize;
- local->statvfs.f_frsize = statvfs->f_frsize;
- }
-
- if (prev == TIER_HASHED_SUBVOL) {
- local->statvfs.f_blocks = statvfs->f_blocks;
- local->statvfs.f_files = statvfs->f_files;
- local->statvfs.f_fsid = statvfs->f_fsid;
- local->statvfs.f_flag = statvfs->f_flag;
- local->statvfs.f_namemax = statvfs->f_namemax;
- tier_stat->blocks_used = (statvfs->f_blocks - statvfs->f_bfree);
- tier_stat->pblocks_used = (statvfs->f_blocks - statvfs->f_bavail);
- tier_stat->files_used = (statvfs->f_files - statvfs->f_ffree);
- tier_stat->pfiles_used = (statvfs->f_files - statvfs->f_favail);
- tier_stat->hashed_fsid = statvfs->f_fsid;
- } else {
- tier_stat->unhashed_fsid = statvfs->f_fsid;
- tier_stat->unhashed_blocks_used = (statvfs->f_blocks -
- statvfs->f_bfree);
- tier_stat->unhashed_pblocks_used = (statvfs->f_blocks -
- statvfs->f_bavail);
- tier_stat->unhashed_files_used = (statvfs->f_files -
- statvfs->f_ffree);
- tier_stat->unhashed_pfiles_used = (statvfs->f_files -
- statvfs->f_favail);
- }
- }
-unlock:
- UNLOCK(&frame->lock);
-
- this_call_cnt = dht_frame_return(frame);
- if (is_last_call(this_call_cnt)) {
- if (tier_stat->unhashed_fsid != tier_stat->hashed_fsid) {
- tier_stat->blocks_used += tier_stat->unhashed_blocks_used;
- tier_stat->pblocks_used += tier_stat->unhashed_pblocks_used;
- tier_stat->files_used += tier_stat->unhashed_files_used;
- tier_stat->pfiles_used += tier_stat->unhashed_pfiles_used;
- }
- local->statvfs.f_bfree = local->statvfs.f_blocks -
- tier_stat->blocks_used;
- local->statvfs.f_bavail = local->statvfs.f_blocks -
- tier_stat->pblocks_used;
- local->statvfs.f_ffree = local->statvfs.f_files - tier_stat->files_used;
- local->statvfs.f_favail = local->statvfs.f_files -
- tier_stat->pfiles_used;
- DHT_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
- &local->statvfs, xdata);
- }
-
- return 0;
-}
-
-int
-tier_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- int i = -1;
- inode_t *inode = NULL;
- inode_table_t *itable = NULL;
- uuid_t root_gfid = {
- 0,
- };
- loc_t newloc = {
- 0,
- };
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(this->private, err);
-
- conf = this->private;
-
- local = dht_local_init(frame, NULL, NULL, GF_FOP_STATFS);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- if (loc->inode && !IA_ISDIR(loc->inode->ia_type)) {
- itable = loc->inode->table;
- if (!itable) {
- op_errno = EINVAL;
- goto err;
- }
-
- loc = &local->loc2;
- root_gfid[15] = 1;
-
- inode = inode_find(itable, root_gfid);
- if (!inode) {
- op_errno = EINVAL;
- goto err;
- }
-
- dht_build_root_loc(inode, &newloc);
- loc = &newloc;
- }
-
- local->call_cnt = conf->subvolume_cnt;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND_COOKIE(frame, tier_statfs_cbk, conf->subvolumes[i],
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->statfs, loc, xdata);
- }
-
- return 0;
-
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
diff --git a/xlators/cluster/dht/src/tier-common.h b/xlators/cluster/dht/src/tier-common.h
deleted file mode 100644
index b1ebaa8004d..00000000000
--- a/xlators/cluster/dht/src/tier-common.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _TIER_COMMON_H_
-#define _TIER_COMMON_H_
-/* Function definitions */
-int
-tier_create_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata);
-
-int
-tier_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
-
-int
-tier_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, inode_t *inode,
- struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata);
-
-int
-tier_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- mode_t mode, mode_t umask, fd_t *fd, dict_t *params);
-
-int32_t
-tier_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
- dict_t *xdata);
-
-int32_t
-tier_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t off, dict_t *dict);
-
-int
-tier_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff, dict_t *xdata);
-
-int
-tier_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
- dict_t *xdata);
-
-int
-tier_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
-
-#endif
diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
deleted file mode 100644
index f7fd6ef22e2..00000000000
--- a/xlators/cluster/dht/src/tier.c
+++ /dev/null
@@ -1,3088 +0,0 @@
-/*
- Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#include <dlfcn.h>
-
-#include "dht-common.h"
-#include "tier.h"
-#include "tier-common.h"
-#include "syscall.h"
-#include "events.h"
-#include "tier-ctr-interface.h"
-
-/*Hard coded DB info*/
-static gfdb_db_type_t dht_tier_db_type = GFDB_SQLITE3;
-/*Hard coded DB info*/
-
-/*Mutex for updating the data movement stats*/
-static pthread_mutex_t dm_stat_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-/* Stores the path location of promotion query files */
-static char *promotion_qfile;
-/* Stores the path location of demotion query files */
-static char *demotion_qfile;
-
-static void *libhandle;
-static gfdb_methods_t gfdb_methods;
-
-#define DB_QUERY_RECORD_SIZE 4096
-
-/*
- * Closes all the fds and frees the qfile_array
- * */
-static void
-qfile_array_free(tier_qfile_array_t *qfile_array)
-{
- ssize_t i = 0;
-
- if (qfile_array) {
- if (qfile_array->fd_array) {
- for (i = 0; i < qfile_array->array_size; i++) {
- if (qfile_array->fd_array[i] != -1) {
- sys_close(qfile_array->fd_array[i]);
- }
- }
- }
- GF_FREE(qfile_array->fd_array);
- }
- GF_FREE(qfile_array);
-}
-
-/* Create a new query file list with given size */
-static tier_qfile_array_t *
-qfile_array_new(ssize_t array_size)
-{
- int ret = -1;
- tier_qfile_array_t *qfile_array = NULL;
- ssize_t i = 0;
-
- GF_VALIDATE_OR_GOTO("tier", (array_size > 0), out);
-
- qfile_array = GF_CALLOC(1, sizeof(tier_qfile_array_t),
- gf_tier_mt_qfile_array_t);
- if (!qfile_array) {
- gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to allocate memory for tier_qfile_array_t");
- goto out;
- }
-
- qfile_array->fd_array = GF_MALLOC(array_size * sizeof(int),
- gf_dht_mt_int32_t);
- if (!qfile_array->fd_array) {
- gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to allocate memory for "
- "tier_qfile_array_t->fd_array");
- goto out;
- }
-
- /* Init all the fds to -1 */
- for (i = 0; i < array_size; i++) {
- qfile_array->fd_array[i] = -1;
- }
-
- qfile_array->array_size = array_size;
- qfile_array->next_index = 0;
-
- /* Set exhausted count to list size as the list is empty */
- qfile_array->exhausted_count = qfile_array->array_size;
-
- ret = 0;
-out:
- if (ret) {
- qfile_array_free(qfile_array);
- qfile_array = NULL;
- }
- return qfile_array;
-}
-
-/* Checks if the query file list is empty or totally exhausted. */
-static gf_boolean_t
-is_qfile_array_empty(tier_qfile_array_t *qfile_array)
-{
- return (qfile_array->exhausted_count == qfile_array->array_size)
- ? _gf_true
- : _gf_false;
-}
-
-/* Shifts the next_fd pointer to the next available fd in the list */
-static void
-shift_next_index(tier_qfile_array_t *qfile_array)
-{
- int qfile_fd = 0;
- int spin_count = 0;
-
- if (is_qfile_array_empty(qfile_array)) {
- return;
- }
-
- do {
- /* change next_index in a rotional manner */
- (qfile_array->next_index == (qfile_array->array_size - 1))
- ? qfile_array->next_index = 0
- : qfile_array->next_index++;
-
- qfile_fd = (qfile_array->fd_array[qfile_array->next_index]);
-
- spin_count++;
-
- } while ((qfile_fd == -1) && (spin_count < qfile_array->array_size));
-}
-
-/*
- * This is a non-thread safe function to read query records
- * from a list of query files in a Round-Robin manner.
- * As in when the query files get exhuasted they are closed.
- * Returns:
- * 0 if all the query records in all the query files of the list are
- * exhausted.
- * > 0 if a query record is successfully read. Indicates the size of the query
- * record read.
- * < 0 if there was failure
- * */
-static int
-read_query_record_list(tier_qfile_array_t *qfile_array,
- gfdb_query_record_t **query_record)
-{
- int ret = -1;
- int qfile_fd = 0;
-
- GF_VALIDATE_OR_GOTO("tier", qfile_array, out);
- GF_VALIDATE_OR_GOTO("tier", qfile_array->fd_array, out);
-
- do {
- if (is_qfile_array_empty(qfile_array)) {
- ret = 0;
- break;
- }
-
- qfile_fd = qfile_array->fd_array[qfile_array->next_index];
- ret = gfdb_methods.gfdb_read_query_record(qfile_fd, query_record);
- if (ret <= 0) {
- /*The qfile_fd has reached EOF or
- * there was an error.
- * 1. Close the exhausted fd
- * 2. increment the exhausted count
- * 3. shift next_qfile to next qfile
- **/
- sys_close(qfile_fd);
- qfile_array->fd_array[qfile_array->next_index] = -1;
- qfile_array->exhausted_count++;
- /* shift next_qfile to next qfile */
- shift_next_index(qfile_array);
- continue;
- } else {
- /* shift next_qfile to next qfile */
- shift_next_index(qfile_array);
- break;
- }
- } while (1);
-out:
- return ret;
-}
-
-/* Check and update the watermark every WM_INTERVAL seconds */
-#define WM_INTERVAL 5
-#define WM_INTERVAL_EMERG 1
-
-static int
-tier_check_same_node(xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
-{
- int ret = -1;
- dict_t *dict = NULL;
- char *uuid_str = NULL;
- uuid_t node_uuid = {
- 0,
- };
-
- GF_VALIDATE_OR_GOTO("tier", this, out);
- GF_VALIDATE_OR_GOTO(this->name, loc, out);
- GF_VALIDATE_OR_GOTO(this->name, defrag, out);
-
- if (syncop_getxattr(this, loc, &dict, GF_XATTR_NODE_UUID_KEY, NULL, NULL)) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Unable to get NODE_UUID_KEY %s %s\n", loc->name, loc->path);
- goto out;
- }
-
- if (dict_get_str(dict, GF_XATTR_NODE_UUID_KEY, &uuid_str) < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to get node-uuids for %s", loc->path);
- goto out;
- }
-
- if (gf_uuid_parse(uuid_str, node_uuid)) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "uuid_parse failed for %s", loc->path);
- goto out;
- }
-
- if (gf_uuid_compare(node_uuid, defrag->node_uuid)) {
- gf_msg_debug(this->name, 0, "%s does not belong to this node",
- loc->path);
- ret = 1;
- goto out;
- }
-
- ret = 0;
-out:
- if (dict)
- dict_unref(dict);
-
- return ret;
-}
-
-int
-tier_get_fs_stat(xlator_t *this, loc_t *root_loc)
-{
- int ret = 0;
- gf_defrag_info_t *defrag = NULL;
- dht_conf_t *conf = NULL;
- dict_t *xdata = NULL;
- struct statvfs statfs = {
- 0,
- };
- gf_tier_conf_t *tier_conf = NULL;
-
- conf = this->private;
- if (!conf) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
- "conf is NULL");
- ret = -1;
- goto exit;
- }
-
- defrag = conf->defrag;
- if (!defrag) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
- "defrag is NULL");
- ret = -1;
- goto exit;
- }
-
- tier_conf = &defrag->tier_conf;
-
- xdata = dict_new();
- if (!xdata) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
- "failed to allocate dictionary");
- ret = -1;
- goto exit;
- }
-
- ret = dict_set_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict");
- ret = -1;
- goto exit;
- }
-
- /* Find how much free space is on the hot subvolume.
- * Then see if that value */
- /* is less than or greater than user defined watermarks.
- * Stash results in */
- /* the tier_conf data structure. */
-
- ret = syncop_statfs(conf->subvolumes[1], root_loc, &statfs, xdata, NULL);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_STATUS,
- "Unable to obtain statfs.");
- goto exit;
- }
-
- pthread_mutex_lock(&dm_stat_mutex);
-
- tier_conf->block_size = statfs.f_bsize;
- tier_conf->blocks_total = statfs.f_blocks;
- tier_conf->blocks_used = statfs.f_blocks - statfs.f_bfree;
-
- tier_conf->percent_full = GF_PERCENTAGE(tier_conf->blocks_used,
- statfs.f_blocks);
- pthread_mutex_unlock(&dm_stat_mutex);
-
-exit:
- if (xdata)
- dict_unref(xdata);
- return ret;
-}
-
-static void
-tier_send_watermark_event(const char *volname, tier_watermark_op_t old_wm,
- tier_watermark_op_t new_wm)
-{
- if (old_wm == TIER_WM_LOW || old_wm == TIER_WM_NONE) {
- if (new_wm == TIER_WM_MID) {
- gf_event(EVENT_TIER_WATERMARK_RAISED_TO_MID, "vol=%s", volname);
- } else if (new_wm == TIER_WM_HI) {
- gf_event(EVENT_TIER_WATERMARK_HI, "vol=%s", volname);
- }
- } else if (old_wm == TIER_WM_MID) {
- if (new_wm == TIER_WM_LOW) {
- gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_LOW, "vol=%s", volname);
- } else if (new_wm == TIER_WM_HI) {
- gf_event(EVENT_TIER_WATERMARK_HI, "vol=%s", volname);
- }
- } else if (old_wm == TIER_WM_HI) {
- if (new_wm == TIER_WM_MID) {
- gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_MID, "vol=%s", volname);
- } else if (new_wm == TIER_WM_LOW) {
- gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_LOW, "vol=%s", volname);
- }
- }
-}
-
-int
-tier_check_watermark(xlator_t *this)
-{
- int ret = -1;
- gf_defrag_info_t *defrag = NULL;
- dht_conf_t *conf = NULL;
- gf_tier_conf_t *tier_conf = NULL;
- tier_watermark_op_t wm = TIER_WM_NONE;
-
- conf = this->private;
- if (!conf)
- goto exit;
-
- defrag = conf->defrag;
- if (!defrag)
- goto exit;
-
- tier_conf = &defrag->tier_conf;
-
- if (tier_conf->percent_full < tier_conf->watermark_low) {
- wm = TIER_WM_LOW;
-
- } else if (tier_conf->percent_full < tier_conf->watermark_hi) {
- wm = TIER_WM_MID;
-
- } else {
- wm = TIER_WM_HI;
- }
-
- if (wm != tier_conf->watermark_last) {
- tier_send_watermark_event(tier_conf->volname, tier_conf->watermark_last,
- wm);
-
- tier_conf->watermark_last = wm;
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Tier watermark now %d", wm);
- }
-
- ret = 0;
-
-exit:
- return ret;
-}
-
-static gf_boolean_t
-is_hot_tier_full(gf_tier_conf_t *tier_conf)
-{
- if (tier_conf && (tier_conf->mode == TIER_MODE_WM) &&
- (tier_conf->watermark_last == TIER_WM_HI))
- return _gf_true;
-
- return _gf_false;
-}
-
-int
-tier_do_migration(xlator_t *this, int promote)
-{
- gf_defrag_info_t *defrag = NULL;
- dht_conf_t *conf = NULL;
- long rand = 0;
- int migrate = 0;
- gf_tier_conf_t *tier_conf = NULL;
-
- conf = this->private;
- if (!conf)
- goto exit;
-
- defrag = conf->defrag;
- if (!defrag)
- goto exit;
-
- if (tier_check_watermark(this) != 0) {
- gf_msg(this->name, GF_LOG_CRITICAL, errno, DHT_MSG_LOG_TIER_ERROR,
- "Failed to get watermark");
- goto exit;
- }
-
- tier_conf = &defrag->tier_conf;
-
- switch (tier_conf->watermark_last) {
- case TIER_WM_LOW:
- migrate = promote ? 1 : 0;
- break;
- case TIER_WM_HI:
- migrate = promote ? 0 : 1;
- break;
- case TIER_WM_MID:
- /* coverity[DC.WEAK_CRYPTO] */
- rand = random() % 100;
- if (promote) {
- migrate = (rand > tier_conf->percent_full);
- } else {
- migrate = (rand <= tier_conf->percent_full);
- }
- break;
- }
-
-exit:
- return migrate;
-}
-
-int
-tier_migrate(xlator_t *this, int is_promotion, dict_t *migrate_data, loc_t *loc,
- gf_tier_conf_t *tier_conf)
-{
- int ret = -1;
-
- pthread_mutex_lock(&tier_conf->pause_mutex);
- if (is_promotion)
- tier_conf->promote_in_progress = 1;
- else
- tier_conf->demote_in_progress = 1;
- pthread_mutex_unlock(&tier_conf->pause_mutex);
-
- /* Data migration */
- ret = syncop_setxattr(this, loc, migrate_data, 0, NULL, NULL);
-
- pthread_mutex_lock(&tier_conf->pause_mutex);
- if (is_promotion)
- tier_conf->promote_in_progress = 0;
- else
- tier_conf->demote_in_progress = 0;
- pthread_mutex_unlock(&tier_conf->pause_mutex);
-
- return ret;
-}
-
-/* returns _gf_true: if file can be promoted
- * returns _gf_false: if file cannot be promoted
- */
-static gf_boolean_t
-tier_can_promote_file(xlator_t *this, char const *file_name,
- struct iatt *current, gf_defrag_info_t *defrag)
-{
- gf_boolean_t ret = _gf_false;
- fsblkcnt_t estimated_usage = 0;
-
- if (defrag->tier_conf.tier_max_promote_size &&
- (current->ia_size > defrag->tier_conf.tier_max_promote_size)) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "File %s (gfid:%s) with size (%lu) exceeds maxsize "
- "(%d) for promotion. File will not be promoted.",
- file_name, uuid_utoa(current->ia_gfid), current->ia_size,
- defrag->tier_conf.tier_max_promote_size);
- goto err;
- }
-
- /* bypass further validations for TEST mode */
- if (defrag->tier_conf.mode != TIER_MODE_WM) {
- ret = _gf_true;
- goto err;
- }
-
- /* convert the file size to blocks as per the block size of the
- * destination tier
- * NOTE: add (block_size - 1) to get the correct block size when
- * there is a remainder after a modulo
- */
- estimated_usage = ((current->ia_size + defrag->tier_conf.block_size - 1) /
- defrag->tier_conf.block_size) +
- defrag->tier_conf.blocks_used;
-
- /* test if the estimated block usage goes above HI watermark */
- if (GF_PERCENTAGE(estimated_usage, defrag->tier_conf.blocks_total) >=
- defrag->tier_conf.watermark_hi) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Estimated block count consumption on "
- "hot tier (%lu) exceeds hi watermark (%d%%). "
- "File will not be promoted.",
- estimated_usage, defrag->tier_conf.watermark_hi);
- goto err;
- }
- ret = _gf_true;
-err:
- return ret;
-}
-
-static int
-tier_set_migrate_data(dict_t *migrate_data)
-{
- int failed = 1;
-
- failed = dict_set_str(migrate_data, GF_XATTR_FILE_MIGRATE_KEY, "force");
- if (failed) {
- goto bail_out;
- }
-
- /* Flag to suggest the xattr call is from migrator */
- failed = dict_set_str(migrate_data, "from.migrator", "yes");
- if (failed) {
- goto bail_out;
- }
-
- /* Flag to suggest its a tiering migration
- * The reason for this dic key-value is that
- * promotions and demotions are multithreaded
- * so the original frame from gf_defrag_start()
- * is not carried. A new frame will be created when
- * we do syncop_setxattr(). This does not have the
- * frame->root->pid of the original frame. So we pass
- * this dic key-value when we do syncop_setxattr() to do
- * data migration and set the frame->root->pid to
- * GF_CLIENT_PID_TIER_DEFRAG in dht_setxattr() just before
- * calling dht_start_rebalance_task() */
- failed = dict_set_str(migrate_data, TIERING_MIGRATION_KEY, "yes");
- if (failed) {
- goto bail_out;
- }
-
- failed = 0;
-
-bail_out:
- return failed;
-}
-
-static char *
-tier_get_parent_path(xlator_t *this, loc_t *p_loc, struct iatt *par_stbuf,
- int *per_link_status)
-{
- int ret = -1;
- char *parent_path = NULL;
- dict_t *xdata_request = NULL;
- dict_t *xdata_response = NULL;
-
- xdata_request = dict_new();
- if (!xdata_request) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to create xdata_request dict");
- goto err;
- }
- ret = dict_set_int32(xdata_request, GET_ANCESTRY_PATH_KEY, 42);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to set value to dict : key %s \n",
- GET_ANCESTRY_PATH_KEY);
- goto err;
- }
-
- ret = syncop_lookup(this, p_loc, par_stbuf, NULL, xdata_request,
- &xdata_response);
- /* When the parent gfid is a stale entry, the lookup
- * will fail and stop the demotion process.
- * The parent gfid can be stale when a huge folder is
- * deleted while the files within it are being migrated
- */
- if (ret == -ESTALE) {
- gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_STALE_LOOKUP,
- "Stale entry in parent lookup for %s", uuid_utoa(p_loc->gfid));
- *per_link_status = 1;
- goto err;
- } else if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR,
- "Error in parent lookup for %s", uuid_utoa(p_loc->gfid));
- *per_link_status = -1;
- goto err;
- }
- ret = dict_get_str(xdata_response, GET_ANCESTRY_PATH_KEY, &parent_path);
- if (ret || !parent_path) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to get parent path for %s", uuid_utoa(p_loc->gfid));
- *per_link_status = -1;
- goto err;
- }
-
-err:
- if (xdata_request) {
- dict_unref(xdata_request);
- }
-
- if (xdata_response) {
- dict_unref(xdata_response);
- xdata_response = NULL;
- }
-
- return parent_path;
-}
-
-static int
-tier_get_file_name_and_path(xlator_t *this, uuid_t gfid,
- gfdb_link_info_t *link_info,
- char const *parent_path, loc_t *loc,
- int *per_link_status)
-{
- int ret = -1;
-
- loc->name = gf_strdup(link_info->file_name);
- if (!loc->name) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Memory "
- "allocation failed for %s",
- uuid_utoa(gfid));
- *per_link_status = -1;
- goto err;
- }
- ret = gf_asprintf((char **)&(loc->path), "%s/%s", parent_path, loc->name);
- if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to "
- "construct file path for %s %s\n",
- parent_path, loc->name);
- *per_link_status = -1;
- goto err;
- }
-
- ret = 0;
-
-err:
- return ret;
-}
-
-static int
-tier_lookup_file(xlator_t *this, loc_t *p_loc, loc_t *loc, struct iatt *current,
- int *per_link_status)
-{
- int ret = -1;
-
- ret = syncop_lookup(this, loc, current, NULL, NULL, NULL);
-
- /* The file may be deleted even when the parent
- * is available and the lookup will
- * return a stale entry which would stop the
- * migration. so if its a stale entry, then skip
- * the file and keep migrating.
- */
- if (ret == -ESTALE) {
- gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_STALE_LOOKUP,
- "Stale lookup for %s", uuid_utoa(p_loc->gfid));
- *per_link_status = 1;
- goto err;
- } else if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR,
- "Failed to "
- "lookup file %s\n",
- loc->name);
- *per_link_status = -1;
- goto err;
- }
- ret = 0;
-
-err:
- return ret;
-}
-
-static gf_boolean_t
-tier_is_file_already_at_destination(xlator_t *src_subvol,
- query_cbk_args_t *query_cbk_args,
- dht_conf_t *conf, int *per_link_status)
-{
- gf_boolean_t at_destination = _gf_true;
-
- if (src_subvol == NULL) {
- *per_link_status = 1;
- goto err;
- }
- if (query_cbk_args->is_promotion && src_subvol == conf->subvolumes[1]) {
- *per_link_status = 1;
- goto err;
- }
-
- if (!query_cbk_args->is_promotion && src_subvol == conf->subvolumes[0]) {
- *per_link_status = 1;
- goto err;
- }
- at_destination = _gf_false;
-
-err:
- return at_destination;
-}
-
-static void
-tier_update_migration_counters(query_cbk_args_t *query_cbk_args,
- gf_defrag_info_t *defrag,
- uint64_t *total_migrated_bytes, int *total_files)
-{
- if (query_cbk_args->is_promotion) {
- defrag->total_files_promoted++;
- *total_migrated_bytes += defrag->tier_conf.st_last_promoted_size;
- pthread_mutex_lock(&dm_stat_mutex);
- defrag->tier_conf.blocks_used += defrag->tier_conf
- .st_last_promoted_size;
- pthread_mutex_unlock(&dm_stat_mutex);
- } else {
- defrag->total_files_demoted++;
- *total_migrated_bytes += defrag->tier_conf.st_last_demoted_size;
- pthread_mutex_lock(&dm_stat_mutex);
- defrag->tier_conf.blocks_used -= defrag->tier_conf.st_last_demoted_size;
- pthread_mutex_unlock(&dm_stat_mutex);
- }
- if (defrag->tier_conf.blocks_total) {
- pthread_mutex_lock(&dm_stat_mutex);
- defrag->tier_conf.percent_full = GF_PERCENTAGE(
- defrag->tier_conf.blocks_used, defrag->tier_conf.blocks_total);
- pthread_mutex_unlock(&dm_stat_mutex);
- }
-
- (*total_files)++;
-}
-
-static int
-tier_migrate_link(xlator_t *this, dht_conf_t *conf, uuid_t gfid,
- gfdb_link_info_t *link_info, gf_defrag_info_t *defrag,
- query_cbk_args_t *query_cbk_args, dict_t *migrate_data,
- int *per_link_status, int *total_files,
- uint64_t *total_migrated_bytes)
-{
- int ret = -1;
- struct iatt current = {
- 0,
- };
- struct iatt par_stbuf = {
- 0,
- };
- loc_t p_loc = {
- 0,
- };
- loc_t loc = {
- 0,
- };
- xlator_t *src_subvol = NULL;
- inode_t *linked_inode = NULL;
- char *parent_path = NULL;
-
- /* Lookup for parent and get the path of parent */
- gf_uuid_copy(p_loc.gfid, link_info->pargfid);
- p_loc.inode = inode_new(defrag->root_inode->table);
- if (!p_loc.inode) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to create reference to inode"
- " for %s",
- uuid_utoa(p_loc.gfid));
-
- *per_link_status = -1;
- goto err;
- }
-
- parent_path = tier_get_parent_path(this, &p_loc, &par_stbuf,
- per_link_status);
- if (!parent_path) {
- goto err;
- }
-
- linked_inode = inode_link(p_loc.inode, NULL, NULL, &par_stbuf);
- inode_unref(p_loc.inode);
- p_loc.inode = linked_inode;
-
- /* Preparing File Inode */
- gf_uuid_copy(loc.gfid, gfid);
- loc.inode = inode_new(defrag->root_inode->table);
- gf_uuid_copy(loc.pargfid, link_info->pargfid);
- loc.parent = inode_ref(p_loc.inode);
-
- /* Get filename and Construct file path */
- if (tier_get_file_name_and_path(this, gfid, link_info, parent_path, &loc,
- per_link_status) != 0) {
- goto err;
- }
- gf_uuid_copy(loc.parent->gfid, link_info->pargfid);
-
- /* lookup file inode */
- if (tier_lookup_file(this, &p_loc, &loc, &current, per_link_status) != 0) {
- goto err;
- }
-
- if (query_cbk_args->is_promotion) {
- if (!tier_can_promote_file(this, link_info->file_name, &current,
- defrag)) {
- *per_link_status = 1;
- goto err;
- }
- }
-
- linked_inode = inode_link(loc.inode, NULL, NULL, &current);
- inode_unref(loc.inode);
- loc.inode = linked_inode;
-
- /*
- * Do not promote/demote if file already is where it
- * should be. It means another brick moved the file
- * so is not an error. So we set per_link_status = 1
- * so that we ignore counting this.
- */
- src_subvol = dht_subvol_get_cached(this, loc.inode);
-
- if (tier_is_file_already_at_destination(src_subvol, query_cbk_args, conf,
- per_link_status)) {
- goto err;
- }
-
- gf_msg_debug(this->name, 0, "Tier %s: src_subvol %s file %s",
- (query_cbk_args->is_promotion ? "promote" : "demote"),
- src_subvol->name, loc.path);
-
- ret = tier_check_same_node(this, &loc, defrag);
- if (ret != 0) {
- if (ret < 0) {
- *per_link_status = -1;
- goto err;
- }
- ret = 0;
- /* By setting per_link_status to 1 we are
- * ignoring this status and will not be counting
- * this file for migration */
- *per_link_status = 1;
- goto err;
- }
-
- gf_uuid_copy(loc.gfid, loc.inode->gfid);
-
- if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Tiering paused. "
- "Exiting tier_migrate_link");
- goto err;
- }
-
- ret = tier_migrate(this, query_cbk_args->is_promotion, migrate_data, &loc,
- &defrag->tier_conf);
-
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR,
- "Failed to "
- "migrate %s ",
- loc.path);
- *per_link_status = -1;
- goto err;
- }
-
- tier_update_migration_counters(query_cbk_args, defrag, total_migrated_bytes,
- total_files);
-
- ret = 0;
-
-err:
- GF_FREE((char *)loc.name);
- loc.name = NULL;
- loc_wipe(&loc);
- loc_wipe(&p_loc);
-
- if ((*total_files >= defrag->tier_conf.max_migrate_files) ||
- (*total_migrated_bytes > defrag->tier_conf.max_migrate_bytes)) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Reached cycle migration limit."
- "migrated bytes %" PRId64 " files %d",
- *total_migrated_bytes, *total_files);
- ret = -1;
- }
-
- return ret;
-}
-
-static int
-tier_migrate_using_query_file(void *_args)
-{
- int ret = -1;
- query_cbk_args_t *query_cbk_args = (query_cbk_args_t *)_args;
- xlator_t *this = NULL;
- gf_defrag_info_t *defrag = NULL;
- gfdb_query_record_t *query_record = NULL;
- gfdb_link_info_t *link_info = NULL;
- dict_t *migrate_data = NULL;
- /*
- * per_file_status and per_link_status
- * 0 : success
- * -1 : failure
- * 1 : ignore the status and don't count for migration
- * */
- int per_file_status = 0;
- int per_link_status = 0;
- int total_status = 0;
- dht_conf_t *conf = NULL;
- uint64_t total_migrated_bytes = 0;
- int total_files = 0;
- loc_t root_loc = {0};
- gfdb_time_t start_time = {0};
- gfdb_time_t current_time = {0};
- int total_time = 0;
- int max_time = 0;
- gf_boolean_t emergency_demote_mode = _gf_false;
-
- GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out);
- GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
- this = query_cbk_args->this;
- GF_VALIDATE_OR_GOTO(this->name, query_cbk_args->defrag, out);
- GF_VALIDATE_OR_GOTO(this->name, query_cbk_args->qfile_array, out);
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
-
- conf = this->private;
-
- defrag = query_cbk_args->defrag;
- migrate_data = dict_new();
- if (!migrate_data)
- goto out;
-
- emergency_demote_mode = (!query_cbk_args->is_promotion &&
- is_hot_tier_full(&defrag->tier_conf));
-
- if (tier_set_migrate_data(migrate_data) != 0) {
- goto out;
- }
-
- dht_build_root_loc(defrag->root_inode, &root_loc);
-
- ret = gettimeofday(&start_time, NULL);
- if (query_cbk_args->is_promotion) {
- max_time = defrag->tier_conf.tier_promote_frequency;
- } else {
- max_time = defrag->tier_conf.tier_demote_frequency;
- }
-
- /* Per file */
- while ((ret = read_query_record_list(query_cbk_args->qfile_array,
- &query_record)) != 0) {
- if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to fetch query record "
- "from query file");
- goto out;
- }
-
- if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
- ret = -1;
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Exiting tier migration as"
- "defrag status is not started");
- goto out;
- }
-
- ret = gettimeofday(&current_time, NULL);
- if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Could not get current time.");
- goto out;
- }
-
- total_time = current_time.tv_sec - start_time.tv_sec;
- if (total_time > max_time) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Max cycle time reached. Exiting migration.");
- goto out;
- }
-
- per_file_status = 0;
- per_link_status = 0;
-
- if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Tiering paused. "
- "Exiting tier_migrate_using_query_file");
- break;
- }
-
- if (defrag->tier_conf.mode == TIER_MODE_WM) {
- ret = tier_get_fs_stat(this, &root_loc);
- if (ret != 0) {
- gfdb_methods.gfdb_query_record_free(query_record);
- query_record = NULL;
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
- "tier_get_fs_stat() FAILED ... "
- "skipping file migrations until next cycle");
- break;
- }
-
- if (!tier_do_migration(this, query_cbk_args->is_promotion)) {
- gfdb_methods.gfdb_query_record_free(query_record);
- query_record = NULL;
-
- /* We have crossed the high watermark. Stop processing
- * files if this is a promotion cycle so demotion gets
- * a chance to start if not already running*/
-
- if (query_cbk_args->is_promotion &&
- is_hot_tier_full(&defrag->tier_conf)) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "High watermark crossed during "
- "promotion. Exiting "
- "tier_migrate_using_query_file");
- break;
- }
- continue;
- }
- }
-
- per_link_status = 0;
-
- /* For now we only support single link migration. And we will
- * ignore other hard links in the link info list of query record
- * TODO: Multiple hard links migration */
- if (!list_empty(&query_record->link_list)) {
- link_info = list_first_entry(&query_record->link_list,
- gfdb_link_info_t, list);
- }
- if (link_info != NULL) {
- if (tier_migrate_link(this, conf, query_record->gfid, link_info,
- defrag, query_cbk_args, migrate_data,
- &per_link_status, &total_files,
- &total_migrated_bytes) != 0) {
- gf_msg(
- this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "%s failed for %s(gfid:%s)",
- (query_cbk_args->is_promotion ? "Promotion" : "Demotion"),
- link_info->file_name, uuid_utoa(query_record->gfid));
- }
- }
- per_file_status = per_link_status;
-
- if (per_file_status < 0) { /* Failure */
- pthread_mutex_lock(&dm_stat_mutex);
- defrag->total_failures++;
- pthread_mutex_unlock(&dm_stat_mutex);
- } else if (per_file_status == 0) { /* Success */
- pthread_mutex_lock(&dm_stat_mutex);
- defrag->total_files++;
- pthread_mutex_unlock(&dm_stat_mutex);
- } else if (per_file_status == 1) { /* Ignore */
- per_file_status = 0;
- /* Since this attempt was ignored we
- * decrement the lookup count*/
- pthread_mutex_lock(&dm_stat_mutex);
- defrag->num_files_lookedup--;
- pthread_mutex_unlock(&dm_stat_mutex);
- }
- total_status = total_status + per_file_status;
- per_link_status = 0;
- per_file_status = 0;
-
- gfdb_methods.gfdb_query_record_free(query_record);
- query_record = NULL;
-
- /* If we are demoting and the entry watermark was HI, then
- * we are done with emergency demotions if the current
- * watermark has fallen below hi-watermark level
- */
- if (emergency_demote_mode) {
- if (tier_check_watermark(this) == 0) {
- if (!is_hot_tier_full(&defrag->tier_conf)) {
- break;
- }
- }
- }
- }
-
-out:
- if (migrate_data)
- dict_unref(migrate_data);
-
- gfdb_methods.gfdb_query_record_free(query_record);
- query_record = NULL;
-
- return total_status;
-}
-
-/* This is the call back function per record/file from data base */
-static int
-tier_gf_query_callback(gfdb_query_record_t *gfdb_query_record, void *_args)
-{
- int ret = -1;
- query_cbk_args_t *query_cbk_args = _args;
-
- GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out);
- GF_VALIDATE_OR_GOTO("tier", query_cbk_args->defrag, out);
- GF_VALIDATE_OR_GOTO("tier", (query_cbk_args->query_fd > 0), out);
-
- ret = gfdb_methods.gfdb_write_query_record(query_cbk_args->query_fd,
- gfdb_query_record);
- if (ret) {
- gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed writing query record to query file");
- goto out;
- }
-
- pthread_mutex_lock(&dm_stat_mutex);
- query_cbk_args->defrag->num_files_lookedup++;
- pthread_mutex_unlock(&dm_stat_mutex);
-
- ret = 0;
-out:
- return ret;
-}
-
-/* Create query file in tier process */
-static int
-tier_process_self_query(tier_brick_list_t *local_brick, void *args)
-{
- int ret = -1;
- char *db_path = NULL;
- query_cbk_args_t *query_cbk_args = NULL;
- xlator_t *this = NULL;
- gfdb_conn_node_t *conn_node = NULL;
- dict_t *params_dict = NULL;
- dict_t *ctr_ipc_dict = NULL;
- gfdb_brick_info_t *gfdb_brick_info = args;
-
- /*Init of all the essentials*/
- GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out);
- query_cbk_args = gfdb_brick_info->_query_cbk_args;
-
- GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
- this = query_cbk_args->this;
-
- GF_VALIDATE_OR_GOTO(this->name, gfdb_brick_info->_query_cbk_args, out);
-
- GF_VALIDATE_OR_GOTO(this->name, local_brick, out);
-
- GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out);
-
- GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out);
-
- db_path = local_brick->brick_db_path;
-
- /*Preparing DB parameters before init_db i.e getting db connection*/
- params_dict = dict_new();
- if (!params_dict) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "DB Params cannot initialized");
- goto out;
- }
- SET_DB_PARAM_TO_DICT(this->name, params_dict,
- (char *)gfdb_methods.get_db_path_key(), db_path, ret,
- out);
-
- /*Get the db connection*/
- conn_node = gfdb_methods.init_db((void *)params_dict, dht_tier_db_type);
- if (!conn_node) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "FATAL: Failed initializing db operations");
- goto out;
- }
-
- /* Query for eligible files from db */
- query_cbk_args->query_fd = open(local_brick->qfile_path,
- O_WRONLY | O_CREAT | O_APPEND,
- S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
- if (query_cbk_args->query_fd < 0) {
- gf_msg(this->name, GF_LOG_ERROR, errno, DHT_MSG_LOG_TIER_ERROR,
- "Failed to open query file %s", local_brick->qfile_path);
- goto out;
- }
- if (!gfdb_brick_info->_gfdb_promote) {
- if (query_cbk_args->defrag->tier_conf.watermark_last == TIER_WM_HI) {
- /* emergency demotion mode */
- ret = gfdb_methods.find_all(
- conn_node, tier_gf_query_callback, (void *)query_cbk_args,
- query_cbk_args->defrag->tier_conf.query_limit);
- } else {
- if (query_cbk_args->defrag->write_freq_threshold == 0 &&
- query_cbk_args->defrag->read_freq_threshold == 0) {
- ret = gfdb_methods.find_unchanged_for_time(
- conn_node, tier_gf_query_callback, (void *)query_cbk_args,
- gfdb_brick_info->time_stamp);
- } else {
- ret = gfdb_methods.find_unchanged_for_time_freq(
- conn_node, tier_gf_query_callback, (void *)query_cbk_args,
- gfdb_brick_info->time_stamp,
- query_cbk_args->defrag->write_freq_threshold,
- query_cbk_args->defrag->read_freq_threshold, _gf_false);
- }
- }
- } else {
- if (query_cbk_args->defrag->write_freq_threshold == 0 &&
- query_cbk_args->defrag->read_freq_threshold == 0) {
- ret = gfdb_methods.find_recently_changed_files(
- conn_node, tier_gf_query_callback, (void *)query_cbk_args,
- gfdb_brick_info->time_stamp);
- } else {
- ret = gfdb_methods.find_recently_changed_files_freq(
- conn_node, tier_gf_query_callback, (void *)query_cbk_args,
- gfdb_brick_info->time_stamp,
- query_cbk_args->defrag->write_freq_threshold,
- query_cbk_args->defrag->read_freq_threshold, _gf_false);
- }
- }
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "FATAL: query from db failed");
- goto out;
- }
-
- /*Clear the heat on the DB entries*/
- /*Preparing ctr_ipc_dict*/
- ctr_ipc_dict = dict_new();
- if (!ctr_ipc_dict) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "ctr_ipc_dict cannot initialized");
- goto out;
- }
-
- SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_dict, GFDB_IPC_CTR_KEY,
- GFDB_IPC_CTR_CLEAR_OPS, ret, out);
-
- ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict,
- NULL);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed clearing the heat "
- "on db %s error %d",
- local_brick->brick_db_path, ret);
- goto out;
- }
-
- ret = 0;
-out:
- if (params_dict) {
- dict_unref(params_dict);
- params_dict = NULL;
- }
-
- if (ctr_ipc_dict) {
- dict_unref(ctr_ipc_dict);
- ctr_ipc_dict = NULL;
- }
-
- if (query_cbk_args && query_cbk_args->query_fd >= 0) {
- sys_close(query_cbk_args->query_fd);
- query_cbk_args->query_fd = -1;
- }
- gfdb_methods.fini_db(conn_node);
-
- return ret;
-}
-
-/*Ask CTR to create the query file*/
-static int
-tier_process_ctr_query(tier_brick_list_t *local_brick, void *args)
-{
- int ret = -1;
- query_cbk_args_t *query_cbk_args = NULL;
- xlator_t *this = NULL;
- dict_t *ctr_ipc_in_dict = NULL;
- dict_t *ctr_ipc_out_dict = NULL;
- gfdb_brick_info_t *gfdb_brick_info = args;
- gfdb_ipc_ctr_params_t *ipc_ctr_params = NULL;
- int count = 0;
-
- /*Init of all the essentials*/
- GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out);
- query_cbk_args = gfdb_brick_info->_query_cbk_args;
-
- GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
- this = query_cbk_args->this;
-
- GF_VALIDATE_OR_GOTO(this->name, gfdb_brick_info->_query_cbk_args, out);
-
- GF_VALIDATE_OR_GOTO(this->name, local_brick, out);
-
- GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out);
-
- GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out);
-
- /*Preparing ctr_ipc_in_dict*/
- ctr_ipc_in_dict = dict_new();
- if (!ctr_ipc_in_dict) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "ctr_ipc_in_dict cannot initialized");
- goto out;
- }
-
- ipc_ctr_params = GF_CALLOC(1, sizeof(gfdb_ipc_ctr_params_t),
- gf_tier_mt_ipc_ctr_params_t);
- if (!ipc_ctr_params) {
- goto out;
- }
-
- /* set all the query params*/
- ipc_ctr_params->is_promote = gfdb_brick_info->_gfdb_promote;
-
- ipc_ctr_params->write_freq_threshold = query_cbk_args->defrag
- ->write_freq_threshold;
-
- ipc_ctr_params->read_freq_threshold = query_cbk_args->defrag
- ->read_freq_threshold;
-
- ipc_ctr_params->query_limit = query_cbk_args->defrag->tier_conf.query_limit;
-
- ipc_ctr_params->emergency_demote = (!gfdb_brick_info->_gfdb_promote &&
- query_cbk_args->defrag->tier_conf
- .watermark_last == TIER_WM_HI);
-
- memcpy(&ipc_ctr_params->time_stamp, gfdb_brick_info->time_stamp,
- sizeof(gfdb_time_t));
-
- SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, GFDB_IPC_CTR_KEY,
- GFDB_IPC_CTR_QUERY_OPS, ret, out);
-
- SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict,
- GFDB_IPC_CTR_GET_QFILE_PATH, local_brick->qfile_path,
- ret, out);
-
- ret = dict_set_bin(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_QUERY_PARAMS,
- ipc_ctr_params, sizeof(*ipc_ctr_params));
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
- "Failed setting %s to params dictionary",
- GFDB_IPC_CTR_GET_QUERY_PARAMS);
- GF_FREE(ipc_ctr_params);
- goto out;
- }
- ipc_ctr_params = NULL;
-
- ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_in_dict,
- &ctr_ipc_out_dict);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_IPC_TIER_ERROR,
- "Failed query on %s ret %d", local_brick->brick_db_path, ret);
- goto out;
- }
-
- ret = dict_get_int32(ctr_ipc_out_dict, GFDB_IPC_CTR_RET_QUERY_COUNT,
- &count);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed getting count "
- "of records on %s",
- local_brick->brick_db_path);
- goto out;
- }
-
- if (count < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed query on %s", local_brick->brick_db_path);
- ret = -1;
- goto out;
- }
-
- pthread_mutex_lock(&dm_stat_mutex);
- query_cbk_args->defrag->num_files_lookedup = count;
- pthread_mutex_unlock(&dm_stat_mutex);
-
- ret = 0;
-out:
-
- if (ctr_ipc_in_dict) {
- dict_unref(ctr_ipc_in_dict);
- ctr_ipc_in_dict = NULL;
- }
-
- if (ctr_ipc_out_dict) {
- dict_unref(ctr_ipc_out_dict);
- ctr_ipc_out_dict = NULL;
- }
-
- GF_FREE(ipc_ctr_params);
-
- return ret;
-}
-
-/* This is the call back function for each brick from hot/cold bricklist
- * It picks up each bricks db and queries for eligible files for migration.
- * The list of eligible files are populated in appropriate query files*/
-static int
-tier_process_brick(tier_brick_list_t *local_brick, void *args)
-{
- int ret = -1;
- dict_t *ctr_ipc_in_dict = NULL;
- dict_t *ctr_ipc_out_dict = NULL;
- char *strval = NULL;
-
- GF_VALIDATE_OR_GOTO("tier", local_brick, out);
-
- GF_VALIDATE_OR_GOTO("tier", local_brick->xlator, out);
-
- if (dht_tier_db_type == GFDB_SQLITE3) {
- /*Preparing ctr_ipc_in_dict*/
- ctr_ipc_in_dict = dict_new();
- if (!ctr_ipc_in_dict) {
- gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "ctr_ipc_in_dict cannot initialized");
- goto out;
- }
-
- ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_KEY,
- GFDB_IPC_CTR_GET_DB_PARAM_OPS);
- if (ret) {
- gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
- "Failed to set %s "
- "to params dictionary",
- GFDB_IPC_CTR_KEY);
- goto out;
- }
-
- ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_DB_PARAM_OPS, "");
- if (ret) {
- gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
- "Failed to set %s "
- "to params dictionary",
- GFDB_IPC_CTR_GET_DB_PARAM_OPS);
- goto out;
- }
-
- ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_DB_KEY,
- "journal_mode");
- if (ret) {
- gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
- "Failed to set %s "
- "to params dictionary",
- GFDB_IPC_CTR_GET_DB_KEY);
- goto out;
- }
-
- ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR,
- ctr_ipc_in_dict, &ctr_ipc_out_dict);
- if (ret || ctr_ipc_out_dict == NULL) {
- gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to get "
- "journal_mode of sql db %s",
- local_brick->brick_db_path);
- goto out;
- }
-
- ret = dict_get_str(ctr_ipc_out_dict, "journal_mode", &strval);
- if (ret) {
- gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_GET_PARAM_FAILED,
- "Failed to get %s "
- "from params dictionary"
- "journal_mode",
- strval);
- goto out;
- }
-
- if (strval && (strncmp(strval, "wal", SLEN("wal")) == 0)) {
- ret = tier_process_self_query(local_brick, args);
- if (ret) {
- goto out;
- }
- } else {
- ret = tier_process_ctr_query(local_brick, args);
- if (ret) {
- goto out;
- }
- }
- ret = 0;
-
- } else {
- ret = tier_process_self_query(local_brick, args);
- if (ret) {
- goto out;
- }
- }
-
- ret = 0;
-out:
- if (ctr_ipc_in_dict)
- dict_unref(ctr_ipc_in_dict);
-
- if (ctr_ipc_out_dict)
- dict_unref(ctr_ipc_out_dict);
-
- return ret;
-}
-
-static int
-tier_build_migration_qfile(migration_args_t *args,
- query_cbk_args_t *query_cbk_args,
- gf_boolean_t is_promotion)
-{
- gfdb_time_t current_time;
- gfdb_brick_info_t gfdb_brick_info;
- gfdb_time_t time_in_past;
- int ret = -1;
- tier_brick_list_t *local_brick = NULL;
- int i = 0;
- time_in_past.tv_sec = args->freq_time;
- time_in_past.tv_usec = 0;
-
- ret = gettimeofday(&current_time, NULL);
- if (ret == -1) {
- gf_msg(args->this->name, GF_LOG_ERROR, errno,
- DHT_MSG_SYS_CALL_GET_TIME_FAILED, "Failed to get current time");
- goto out;
- }
- time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec;
-
- /* The migration daemon may run a varying numberof usec after the */
- /* sleep call triggers. A file may be registered in CTR some number */
- /* of usec X after the daemon started and missed in the subsequent */
- /* cycle if the daemon starts Y usec after the period in seconds */
- /* where Y>X. Normalize away this problem by always setting usec */
- /* to 0. */
- time_in_past.tv_usec = 0;
-
- gfdb_brick_info.time_stamp = &time_in_past;
- gfdb_brick_info._gfdb_promote = is_promotion;
- gfdb_brick_info._query_cbk_args = query_cbk_args;
-
- list_for_each_entry(local_brick, args->brick_list, list)
- {
- /* Construct query file path for this brick
- * i.e
- * /var/run/gluster/xlator_name/
- * {promote/demote}-brickname-indexinbricklist
- * So that no two query files will have same path even
- * bricks have the same name
- * */
- snprintf(local_brick->qfile_path, PATH_MAX, "%s-%s-%d",
- GET_QFILE_PATH(gfdb_brick_info._gfdb_promote),
- local_brick->brick_name, i);
-
- /* Delete any old query files for this brick */
- sys_unlink(local_brick->qfile_path);
-
- ret = tier_process_brick(local_brick, &gfdb_brick_info);
- if (ret) {
- gf_msg(args->this->name, GF_LOG_ERROR, 0,
- DHT_MSG_BRICK_QUERY_FAILED, "Brick %s query failed\n",
- local_brick->brick_db_path);
- }
- i++;
- }
- ret = 0;
-out:
- return ret;
-}
-
-static int
-tier_migrate_files_using_qfile(migration_args_t *comp,
- query_cbk_args_t *query_cbk_args)
-{
- int ret = -1;
- tier_brick_list_t *local_brick = NULL;
- tier_brick_list_t *temp = NULL;
- gfdb_time_t current_time = {
- 0,
- };
- ssize_t qfile_array_size = 0;
- int count = 0;
- int temp_fd = 0;
- gf_tier_conf_t *tier_conf = NULL;
-
- tier_conf = &(query_cbk_args->defrag->tier_conf);
-
- /* Time for error query files */
- gettimeofday(&current_time, NULL);
-
- /* Build the qfile list */
- list_for_each_entry_safe(local_brick, temp, comp->brick_list, list)
- {
- qfile_array_size++;
- }
- query_cbk_args->qfile_array = qfile_array_new(qfile_array_size);
- if (!query_cbk_args->qfile_array) {
- gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to create new "
- "qfile_array");
- goto out;
- }
-
- /*Open all qfiles*/
- count = 0;
- query_cbk_args->qfile_array->exhausted_count = 0;
- list_for_each_entry_safe(local_brick, temp, comp->brick_list, list)
- {
- temp_fd = query_cbk_args->qfile_array->fd_array[count];
- temp_fd = open(local_brick->qfile_path, O_RDONLY,
- S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
- if (temp_fd < 0) {
- gf_msg("tier", GF_LOG_ERROR, errno, DHT_MSG_LOG_TIER_ERROR,
- "Failed to open "
- "%s to the query file",
- local_brick->qfile_path);
- query_cbk_args->qfile_array->exhausted_count++;
- }
- query_cbk_args->qfile_array->fd_array[count] = temp_fd;
- count++;
- }
-
- /* Moving the query file index to the next, so that we won't the same
- * query file every cycle as the first one */
- query_cbk_args->qfile_array
- ->next_index = (query_cbk_args->is_promotion)
- ? tier_conf->last_promote_qfile_index
- : tier_conf->last_demote_qfile_index;
- shift_next_index(query_cbk_args->qfile_array);
- if (query_cbk_args->is_promotion) {
- tier_conf->last_promote_qfile_index = query_cbk_args->qfile_array
- ->next_index;
- } else {
- tier_conf->last_demote_qfile_index = query_cbk_args->qfile_array
- ->next_index;
- }
-
- /* Migrate files using query file list */
- ret = tier_migrate_using_query_file((void *)query_cbk_args);
-out:
- qfile_array_free(query_cbk_args->qfile_array);
-
- /* If there is an error rename all the query files to .err files
- * with a timestamp for better debugging */
- if (ret) {
- struct tm tm = {
- 0,
- };
- char time_str[128] = {
- 0,
- };
- char query_file_path_err[PATH_MAX] = {
- 0,
- };
- int32_t len = 0;
-
- /* Time format for error query files */
- gmtime_r(&current_time.tv_sec, &tm);
- strftime(time_str, sizeof(time_str), "%F-%T", &tm);
-
- list_for_each_entry_safe(local_brick, temp, comp->brick_list, list)
- {
- /* rename error qfile*/
- len = snprintf(query_file_path_err, sizeof(query_file_path_err),
- "%s-%s.err", local_brick->qfile_path, time_str);
- if ((len >= 0) && (len < sizeof(query_file_path_err))) {
- if (sys_rename(local_brick->qfile_path, query_file_path_err) ==
- -1)
- gf_msg_debug("tier", 0,
- "rename "
- "failed");
- }
- }
- }
-
- query_cbk_args->qfile_array = NULL;
-
- return ret;
-}
-
-int
-tier_demote(migration_args_t *demotion_args)
-{
- query_cbk_args_t query_cbk_args;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO("tier", demotion_args, out);
- GF_VALIDATE_OR_GOTO("tier", demotion_args->this, out);
- GF_VALIDATE_OR_GOTO(demotion_args->this->name, demotion_args->brick_list,
- out);
- GF_VALIDATE_OR_GOTO(demotion_args->this->name, demotion_args->defrag, out);
-
- THIS = demotion_args->this;
-
- query_cbk_args.this = demotion_args->this;
- query_cbk_args.defrag = demotion_args->defrag;
- query_cbk_args.is_promotion = 0;
-
- /*Build the query file using bricklist*/
- ret = tier_build_migration_qfile(demotion_args, &query_cbk_args, _gf_false);
- if (ret)
- goto out;
-
- /* Migrate files using the query file */
- ret = tier_migrate_files_using_qfile(demotion_args, &query_cbk_args);
- if (ret)
- goto out;
-
-out:
- demotion_args->return_value = ret;
- return ret;
-}
-
-int
-tier_promote(migration_args_t *promotion_args)
-{
- int ret = -1;
- query_cbk_args_t query_cbk_args;
-
- GF_VALIDATE_OR_GOTO("tier", promotion_args->this, out);
- GF_VALIDATE_OR_GOTO(promotion_args->this->name, promotion_args->brick_list,
- out);
- GF_VALIDATE_OR_GOTO(promotion_args->this->name, promotion_args->defrag,
- out);
-
- THIS = promotion_args->this;
-
- query_cbk_args.this = promotion_args->this;
- query_cbk_args.defrag = promotion_args->defrag;
- query_cbk_args.is_promotion = 1;
-
- /*Build the query file using bricklist*/
- ret = tier_build_migration_qfile(promotion_args, &query_cbk_args, _gf_true);
- if (ret)
- goto out;
-
- /* Migrate files using the query file */
- ret = tier_migrate_files_using_qfile(promotion_args, &query_cbk_args);
- if (ret)
- goto out;
-
-out:
- promotion_args->return_value = ret;
- return ret;
-}
-
-/*
- * Command the CTR on a brick to compact the local database using an IPC
- */
-static int
-tier_process_self_compact(tier_brick_list_t *local_brick, void *args)
-{
- int ret = -1;
- char *db_path = NULL;
- query_cbk_args_t *query_cbk_args = NULL;
- xlator_t *this = NULL;
- gfdb_conn_node_t *conn_node = NULL;
- dict_t *params_dict = NULL;
- dict_t *ctr_ipc_dict = NULL;
- gfdb_brick_info_t *gfdb_brick_info = args;
-
- /*Init of all the essentials*/
- GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out);
- query_cbk_args = gfdb_brick_info->_query_cbk_args;
-
- GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
- this = query_cbk_args->this;
-
- GF_VALIDATE_OR_GOTO(this->name, gfdb_brick_info->_query_cbk_args, out);
-
- GF_VALIDATE_OR_GOTO(this->name, local_brick, out);
-
- GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out);
-
- GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out);
-
- db_path = local_brick->brick_db_path;
-
- /*Preparing DB parameters before init_db i.e getting db connection*/
- params_dict = dict_new();
- if (!params_dict) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "DB Params cannot initialized");
- goto out;
- }
- SET_DB_PARAM_TO_DICT(this->name, params_dict,
- (char *)gfdb_methods.get_db_path_key(), db_path, ret,
- out);
-
- /*Get the db connection*/
- conn_node = gfdb_methods.init_db((void *)params_dict, dht_tier_db_type);
- if (!conn_node) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "FATAL: Failed initializing db operations");
- goto out;
- }
-
- ret = 0;
-
- /*Preparing ctr_ipc_dict*/
- ctr_ipc_dict = dict_new();
- if (!ctr_ipc_dict) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "ctr_ipc_dict cannot initialized");
- goto out;
- }
-
- ret = dict_set_int32(ctr_ipc_dict, "compact_active",
- query_cbk_args->defrag->tier_conf.compact_active);
-
- if (ret) {
- gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
- "Failed to set %s "
- "to params dictionary",
- "compact_active");
- goto out;
- }
-
- ret = dict_set_int32(
- ctr_ipc_dict, "compact_mode_switched",
- query_cbk_args->defrag->tier_conf.compact_mode_switched);
-
- if (ret) {
- gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
- "Failed to set %s "
- "to params dictionary",
- "compact_mode_switched");
- goto out;
- }
-
- SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_dict, GFDB_IPC_CTR_KEY,
- GFDB_IPC_CTR_SET_COMPACT_PRAGMA, ret, out);
-
- gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
- "Starting Compaction IPC");
-
- ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict,
- NULL);
-
- gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
- "Ending Compaction IPC");
-
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed compaction "
- "on db %s error %d",
- local_brick->brick_db_path, ret);
- goto out;
- }
-
- gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
- "SUCCESS: %s Compaction", local_brick->brick_name);
-
- ret = 0;
-out:
- if (params_dict) {
- dict_unref(params_dict);
- params_dict = NULL;
- }
-
- if (ctr_ipc_dict) {
- dict_unref(ctr_ipc_dict);
- ctr_ipc_dict = NULL;
- }
-
- gfdb_methods.fini_db(conn_node);
-
- return ret;
-}
-
-/*
- * This is the call back function for each brick from hot/cold bricklist.
- * It determines the database type on each brick and calls the corresponding
- * function to prepare the compaction IPC.
- */
-static int
-tier_compact_db_brick(tier_brick_list_t *local_brick, void *args)
-{
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO("tier", local_brick, out);
-
- GF_VALIDATE_OR_GOTO("tier", local_brick->xlator, out);
-
- ret = tier_process_self_compact(local_brick, args);
- if (ret) {
- gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Brick %s did not compact", local_brick->brick_name);
- goto out;
- }
-
- ret = 0;
-
-out:
-
- return ret;
-}
-
-static int
-tier_send_compact(migration_args_t *args, query_cbk_args_t *query_cbk_args)
-{
- gfdb_time_t current_time;
- gfdb_brick_info_t gfdb_brick_info;
- gfdb_time_t time_in_past;
- int ret = -1;
- tier_brick_list_t *local_brick = NULL;
-
- time_in_past.tv_sec = args->freq_time;
- time_in_past.tv_usec = 0;
-
- ret = gettimeofday(&current_time, NULL);
- if (ret == -1) {
- gf_msg(args->this->name, GF_LOG_ERROR, errno,
- DHT_MSG_SYS_CALL_GET_TIME_FAILED, "Failed to get current time");
- goto out;
- }
- time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec;
-
- /* The migration daemon may run a varying numberof usec after the sleep
- call triggers. A file may be registered in CTR some number of usec X
- after the daemon started and missed in the subsequent cycle if the
- daemon starts Y usec after the period in seconds where Y>X. Normalize
- away this problem by always setting usec to 0. */
- time_in_past.tv_usec = 0;
-
- gfdb_brick_info.time_stamp = &time_in_past;
-
- /* This is meant to say we are always compacting at this point */
- /* We simply borrow the promotion flag to do this */
- gfdb_brick_info._gfdb_promote = 1;
-
- gfdb_brick_info._query_cbk_args = query_cbk_args;
-
- list_for_each_entry(local_brick, args->brick_list, list)
- {
- gf_msg(args->this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
- "Start compaction for %s", local_brick->brick_name);
-
- ret = tier_compact_db_brick(local_brick, &gfdb_brick_info);
- if (ret) {
- gf_msg(args->this->name, GF_LOG_ERROR, 0,
- DHT_MSG_BRICK_QUERY_FAILED, "Brick %s compaction failed\n",
- local_brick->brick_db_path);
- }
-
- gf_msg(args->this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
- "End compaction for %s", local_brick->brick_name);
- }
- ret = 0;
-out:
- return ret;
-}
-
-static int
-tier_compact(void *args)
-{
- int ret = -1;
- query_cbk_args_t query_cbk_args;
- migration_args_t *compaction_args = args;
-
- GF_VALIDATE_OR_GOTO("tier", compaction_args->this, out);
- GF_VALIDATE_OR_GOTO(compaction_args->this->name,
- compaction_args->brick_list, out);
- GF_VALIDATE_OR_GOTO(compaction_args->this->name, compaction_args->defrag,
- out);
-
- THIS = compaction_args->this;
-
- query_cbk_args.this = compaction_args->this;
- query_cbk_args.defrag = compaction_args->defrag;
- query_cbk_args.is_compaction = 1;
-
- /* Send the compaction pragma out to all the bricks on the bricklist. */
- /* tier_get_bricklist ensures all bricks on the list are local to */
- /* this node. */
- ret = tier_send_compact(compaction_args, &query_cbk_args);
- if (ret)
- goto out;
-
- ret = 0;
-out:
- compaction_args->return_value = ret;
- return ret;
-}
-
-static int
-tier_get_bricklist(xlator_t *xl, struct list_head *local_bricklist_head)
-{
- xlator_list_t *child = NULL;
- char *rv = NULL;
- char *rh = NULL;
- char *brickname = NULL;
- char db_name[PATH_MAX] = "";
- int ret = 0;
- tier_brick_list_t *local_brick = NULL;
- int32_t len = 0;
-
- GF_VALIDATE_OR_GOTO("tier", xl, out);
- GF_VALIDATE_OR_GOTO("tier", local_bricklist_head, out);
-
- /*
- * This function obtains remote subvolumes and filters out only
- * those running on the same node as the tier daemon.
- */
- if (strcmp(xl->type, "protocol/client") == 0) {
- ret = dict_get_str(xl->options, "remote-host", &rh);
- if (ret < 0)
- goto out;
-
- if (gf_is_local_addr(rh)) {
- local_brick = GF_CALLOC(1, sizeof(tier_brick_list_t),
- gf_tier_mt_bricklist_t);
- if (!local_brick) {
- goto out;
- }
-
- ret = dict_get_str(xl->options, "remote-subvolume", &rv);
- if (ret < 0)
- goto out;
-
- brickname = strrchr(rv, '/') + 1;
- snprintf(db_name, sizeof(db_name), "%s.db", brickname);
-
- local_brick->brick_db_path = GF_MALLOC(PATH_MAX, gf_common_mt_char);
- if (!local_brick->brick_db_path) {
- gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
- "Failed to allocate memory for"
- " bricklist.");
- ret = -1;
- goto out;
- }
-
- len = snprintf(local_brick->brick_db_path, PATH_MAX, "%s/%s/%s", rv,
- GF_HIDDEN_PATH, db_name);
- if ((len < 0) || (len >= PATH_MAX)) {
- gf_msg("tier", GF_LOG_ERROR, EINVAL, DHT_MSG_LOG_TIER_STATUS,
- "DB path too long");
- ret = -1;
- goto out;
- }
-
- local_brick->xlator = xl;
-
- snprintf(local_brick->brick_name, NAME_MAX, "%s", brickname);
-
- list_add_tail(&(local_brick->list), local_bricklist_head);
-
- ret = 0;
- goto out;
- }
- }
-
- for (child = xl->children; child; child = child->next) {
- ret = tier_get_bricklist(child->xlator, local_bricklist_head);
- if (ret) {
- goto out;
- }
- }
-
- ret = 0;
-out:
-
- if (ret) {
- if (local_brick) {
- GF_FREE(local_brick->brick_db_path);
- }
- GF_FREE(local_brick);
- }
-
- return ret;
-}
-
-int
-tier_get_freq_demote(gf_tier_conf_t *tier_conf)
-{
- if ((tier_conf->mode == TIER_MODE_WM) &&
- (tier_conf->watermark_last == TIER_WM_HI))
- return DEFAULT_DEMOTE_DEGRADED;
- else
- return tier_conf->tier_demote_frequency;
-}
-
-int
-tier_get_freq_promote(gf_tier_conf_t *tier_conf)
-{
- return tier_conf->tier_promote_frequency;
-}
-
-int
-tier_get_freq_compact_hot(gf_tier_conf_t *tier_conf)
-{
- return tier_conf->tier_compact_hot_frequency;
-}
-
-int
-tier_get_freq_compact_cold(gf_tier_conf_t *tier_conf)
-{
- return tier_conf->tier_compact_cold_frequency;
-}
-
-static int
-tier_check_demote(gfdb_time_t current_time, int freq)
-{
- return ((current_time.tv_sec % freq) == 0) ? _gf_true : _gf_false;
-}
-
-static gf_boolean_t
-tier_check_promote(gf_tier_conf_t *tier_conf, gfdb_time_t current_time,
- int freq)
-{
- if ((tier_conf->mode == TIER_MODE_WM) &&
- (tier_conf->watermark_last == TIER_WM_HI))
- return _gf_false;
-
- else
- return ((current_time.tv_sec % freq) == 0) ? _gf_true : _gf_false;
-}
-
-static gf_boolean_t
-tier_check_compact(gf_tier_conf_t *tier_conf, gfdb_time_t current_time,
- int freq_compact)
-{
- if (!(tier_conf->compact_active || tier_conf->compact_mode_switched))
- return _gf_false;
-
- return ((current_time.tv_sec % freq_compact) == 0) ? _gf_true : _gf_false;
-}
-
-void
-clear_bricklist(struct list_head *brick_list)
-{
- tier_brick_list_t *local_brick = NULL;
- tier_brick_list_t *temp = NULL;
-
- if (list_empty(brick_list)) {
- return;
- }
-
- list_for_each_entry_safe(local_brick, temp, brick_list, list)
- {
- list_del(&local_brick->list);
- GF_FREE(local_brick->brick_db_path);
- GF_FREE(local_brick);
- }
-}
-
-static void
-set_brick_list_qpath(struct list_head *brick_list, gf_boolean_t is_cold)
-{
- tier_brick_list_t *local_brick = NULL;
- int i = 0;
-
- GF_VALIDATE_OR_GOTO("tier", brick_list, out);
-
- list_for_each_entry(local_brick, brick_list, list)
- {
- /* Construct query file path for this brick
- * i.e
- * /var/run/gluster/xlator_name/
- * {promote/demote}-brickname-indexinbricklist
- * So that no two query files will have same path even
- * bricks have the same name
- * */
- snprintf(local_brick->qfile_path, PATH_MAX, "%s-%s-%d",
- GET_QFILE_PATH(is_cold), local_brick->brick_name, i);
- i++;
- }
-out:
- return;
-}
-
-static int
-tier_prepare_compact(migration_args_t *args, gfdb_time_t current_time)
-{
- xlator_t *this = NULL;
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
- gf_tier_conf_t *tier_conf = NULL;
- gf_boolean_t is_hot_tier = args->is_hot_tier;
- int freq = 0;
- int ret = -1;
- const char *tier_type = is_hot_tier ? "hot" : "cold";
-
- this = args->this;
-
- conf = this->private;
-
- defrag = conf->defrag;
-
- tier_conf = &defrag->tier_conf;
-
- freq = is_hot_tier ? tier_get_freq_compact_hot(tier_conf)
- : tier_get_freq_compact_cold(tier_conf);
-
- defrag->tier_conf.compact_mode_switched =
- is_hot_tier ? defrag->tier_conf.compact_mode_switched_hot
- : defrag->tier_conf.compact_mode_switched_cold;
-
- gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
- "Compact mode %i", defrag->tier_conf.compact_mode_switched);
-
- if (tier_check_compact(tier_conf, current_time, freq)) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Start compaction on %s tier", tier_type);
-
- args->freq_time = freq;
- ret = tier_compact(args);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Compaction failed on "
- "%s tier",
- tier_type);
- goto out;
- }
-
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "End compaction on %s tier", tier_type);
-
- if (is_hot_tier) {
- defrag->tier_conf.compact_mode_switched_hot = _gf_false;
- } else {
- defrag->tier_conf.compact_mode_switched_cold = _gf_false;
- }
- }
-
-out:
- return ret;
-}
-
-static int
-tier_get_wm_interval(tier_mode_t mode, tier_watermark_op_t wm)
-{
- if (mode == TIER_MODE_WM && wm == TIER_WM_HI)
- return WM_INTERVAL_EMERG;
-
- return WM_INTERVAL;
-}
-
-/*
- * Main tiering loop. This is called from the promotion and the
- * demotion threads spawned in tier_start().
- *
- * Every second, wake from sleep to perform tasks.
- * 1. Check trigger to migrate data.
- * 2. Check for state changes (pause, unpause, stop).
- */
-static void *
-tier_run(void *in_args)
-{
- dht_conf_t *conf = NULL;
- gfdb_time_t current_time = {0};
- int freq = 0;
- int ret = 0;
- xlator_t *any = NULL;
- xlator_t *xlator = NULL;
- gf_tier_conf_t *tier_conf = NULL;
- loc_t root_loc = {0};
- int check_watermark = 0;
- gf_defrag_info_t *defrag = NULL;
- xlator_t *this = NULL;
- migration_args_t *args = in_args;
- GF_VALIDATE_OR_GOTO("tier", args, out);
- GF_VALIDATE_OR_GOTO("tier", args->brick_list, out);
-
- this = args->this;
- GF_VALIDATE_OR_GOTO("tier", this, out);
-
- conf = this->private;
- GF_VALIDATE_OR_GOTO("tier", conf, out);
-
- defrag = conf->defrag;
- GF_VALIDATE_OR_GOTO("tier", defrag, out);
-
- if (list_empty(args->brick_list)) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_ERROR,
- "Brick list for tier is empty. Exiting.");
- goto out;
- }
-
- defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;
- tier_conf = &defrag->tier_conf;
-
- dht_build_root_loc(defrag->root_inode, &root_loc);
-
- while (1) {
- /*
- * Check if a graph switch occurred. If so, stop migration
- * thread. It will need to be restarted manually.
- */
- any = THIS->ctx->active->first;
- xlator = xlator_search_by_name(any, this->name);
-
- if (xlator != this) {
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Detected graph switch. Exiting migration "
- "daemon.");
- goto out;
- }
-
- gf_defrag_check_pause_tier(tier_conf);
-
- sleep(1);
-
- if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
- ret = 1;
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "defrag->defrag_status != "
- "GF_DEFRAG_STATUS_STARTED");
- goto out;
- }
-
- if (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER ||
- defrag->cmd == GF_DEFRAG_CMD_DETACH_START) {
- ret = 0;
- defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
- gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_LOG_TIER_ERROR,
- "defrag->defrag_cmd == "
- "GF_DEFRAG_CMD_START_DETACH_TIER");
- goto out;
- }
-
- if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)
- continue;
-
- /* To have proper synchronization amongst all
- * brick holding nodes, so that promotion and demotions
- * start atomically w.r.t promotion/demotion frequency
- * period, all nodes should have their system time
- * in-sync with each other either manually set or
- * using a NTP server*/
- ret = gettimeofday(&current_time, NULL);
- if (ret == -1) {
- gf_msg(this->name, GF_LOG_ERROR, errno,
- DHT_MSG_SYS_CALL_GET_TIME_FAILED,
- "Failed to get current time");
- goto out;
- }
-
- check_watermark++;
-
- /* emergency demotion requires frequent watermark monitoring */
- if (check_watermark >=
- tier_get_wm_interval(tier_conf->mode, tier_conf->watermark_last)) {
- check_watermark = 0;
- if (tier_conf->mode == TIER_MODE_WM) {
- ret = tier_get_fs_stat(this, &root_loc);
- if (ret != 0) {
- continue;
- }
- ret = tier_check_watermark(this);
- if (ret != 0) {
- gf_msg(this->name, GF_LOG_CRITICAL, errno,
- DHT_MSG_LOG_TIER_ERROR, "Failed to get watermark");
- continue;
- }
- }
- }
-
- if (args->is_promotion) {
- freq = tier_get_freq_promote(tier_conf);
-
- if (tier_check_promote(tier_conf, current_time, freq)) {
- args->freq_time = freq;
- ret = tier_promote(args);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Promotion failed");
- }
- }
- } else if (args->is_compaction) {
- tier_prepare_compact(args, current_time);
- } else {
- freq = tier_get_freq_demote(tier_conf);
-
- if (tier_check_demote(current_time, freq)) {
- args->freq_time = freq;
- ret = tier_demote(args);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Demotion failed");
- }
- }
- }
-
- /* Check the statfs immediately after the processing threads
- return */
- check_watermark = WM_INTERVAL;
- }
-
- ret = 0;
-out:
-
- args->return_value = ret;
-
- return NULL;
-}
-
-int
-tier_start(xlator_t *this, gf_defrag_info_t *defrag)
-{
- pthread_t promote_thread;
- pthread_t demote_thread;
- pthread_t hot_compact_thread;
- pthread_t cold_compact_thread;
- int ret = -1;
- struct list_head bricklist_hot = {0};
- struct list_head bricklist_cold = {0};
- migration_args_t promotion_args = {0};
- migration_args_t demotion_args = {0};
- migration_args_t hot_compaction_args = {0};
- migration_args_t cold_compaction_args = {0};
- dht_conf_t *conf = NULL;
-
- INIT_LIST_HEAD((&bricklist_hot));
- INIT_LIST_HEAD((&bricklist_cold));
-
- conf = this->private;
-
- tier_get_bricklist(conf->subvolumes[1], &bricklist_hot);
- set_brick_list_qpath(&bricklist_hot, _gf_false);
-
- demotion_args.this = this;
- demotion_args.brick_list = &bricklist_hot;
- demotion_args.defrag = defrag;
- demotion_args.is_promotion = _gf_false;
- demotion_args.is_compaction = _gf_false;
-
- ret = gf_thread_create(&demote_thread, NULL, &tier_run, &demotion_args,
- "tierdem");
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to start demotion thread.");
- defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
- goto cleanup;
- }
-
- tier_get_bricklist(conf->subvolumes[0], &bricklist_cold);
- set_brick_list_qpath(&bricklist_cold, _gf_true);
-
- promotion_args.this = this;
- promotion_args.brick_list = &bricklist_cold;
- promotion_args.defrag = defrag;
- promotion_args.is_promotion = _gf_true;
-
- ret = gf_thread_create(&promote_thread, NULL, &tier_run, &promotion_args,
- "tierpro");
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to start promotion thread.");
- defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
- goto waitforspawned;
- }
-
- hot_compaction_args.this = this;
- hot_compaction_args.brick_list = &bricklist_hot;
- hot_compaction_args.defrag = defrag;
- hot_compaction_args.is_promotion = _gf_false;
- hot_compaction_args.is_compaction = _gf_true;
- hot_compaction_args.is_hot_tier = _gf_true;
-
- ret = gf_thread_create(&hot_compact_thread, NULL, &tier_run,
- &hot_compaction_args, "tierhcom");
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to start compaction thread.");
- defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
- goto waitforspawnedpromote;
- }
-
- cold_compaction_args.this = this;
- cold_compaction_args.brick_list = &bricklist_cold;
- cold_compaction_args.defrag = defrag;
- cold_compaction_args.is_promotion = _gf_false;
- cold_compaction_args.is_compaction = _gf_true;
- cold_compaction_args.is_hot_tier = _gf_false;
-
- ret = gf_thread_create(&cold_compact_thread, NULL, &tier_run,
- &cold_compaction_args, "tierccom");
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to start compaction thread.");
- defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
- goto waitforspawnedhotcompact;
- }
- pthread_join(cold_compact_thread, NULL);
-
-waitforspawnedhotcompact:
- pthread_join(hot_compact_thread, NULL);
-
-waitforspawnedpromote:
- pthread_join(promote_thread, NULL);
-
-waitforspawned:
- pthread_join(demote_thread, NULL);
-
-cleanup:
- clear_bricklist(&bricklist_cold);
- clear_bricklist(&bricklist_hot);
- return ret;
-}
-
-int32_t
-tier_migration_needed(xlator_t *this)
-{
- gf_defrag_info_t *defrag = NULL;
- dht_conf_t *conf = NULL;
- int ret = 0;
-
- conf = this->private;
-
- GF_VALIDATE_OR_GOTO(this->name, conf, out);
- GF_VALIDATE_OR_GOTO(this->name, conf->defrag, out);
-
- defrag = conf->defrag;
-
- if ((defrag->cmd == GF_DEFRAG_CMD_START_TIER) ||
- (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER))
- ret = 1;
-out:
- return ret;
-}
-
-int32_t
-tier_migration_get_dst(xlator_t *this, dht_local_t *local)
-{
- dht_conf_t *conf = NULL;
- int32_t ret = -1;
- gf_defrag_info_t *defrag = NULL;
-
- GF_VALIDATE_OR_GOTO("tier", this, out);
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
-
- conf = this->private;
-
- defrag = conf->defrag;
-
- if (defrag && defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER) {
- local->rebalance.target_node = conf->subvolumes[0];
-
- } else if (conf->subvolumes[0] == local->cached_subvol)
- local->rebalance.target_node = conf->subvolumes[1];
- else
- local->rebalance.target_node = conf->subvolumes[0];
-
- if (local->rebalance.target_node)
- ret = 0;
-
-out:
- return ret;
-}
-
-xlator_t *
-tier_search(xlator_t *this, dht_layout_t *layout, const char *name)
-{
- xlator_t *subvol = NULL;
- dht_conf_t *conf = NULL;
-
- GF_VALIDATE_OR_GOTO("tier", this, out);
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
-
- conf = this->private;
-
- subvol = TIER_HASHED_SUBVOL;
-
-out:
- return subvol;
-}
-
-static int
-tier_load_externals(xlator_t *this)
-{
- int ret = -1;
- char *libpathfull = (LIBDIR "/libgfdb.so.0");
- get_gfdb_methods_t get_gfdb_methods;
-
- GF_VALIDATE_OR_GOTO("this", this, out);
-
- libhandle = dlopen(libpathfull, RTLD_NOW);
- if (!libhandle) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Error loading libgfdb.so %s\n", dlerror());
- ret = -1;
- goto out;
- }
-
- get_gfdb_methods = dlsym(libhandle, "get_gfdb_methods");
- if (!get_gfdb_methods) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Error loading get_gfdb_methods()");
- ret = -1;
- goto out;
- }
-
- get_gfdb_methods(&gfdb_methods);
-
- ret = 0;
-
-out:
- if (ret && libhandle)
- dlclose(libhandle);
-
- return ret;
-}
-
-static tier_mode_t
-tier_validate_mode(char *mode)
-{
- int ret = -1;
-
- if (strcmp(mode, "test") == 0) {
- ret = TIER_MODE_TEST;
- } else {
- ret = TIER_MODE_WM;
- }
-
- return ret;
-}
-
-static gf_boolean_t
-tier_validate_compact_mode(char *mode)
-{
- gf_boolean_t ret = _gf_false;
-
- gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "tier_validate_compact_mode: mode = %s", mode);
-
- if (!strcmp(mode, "on")) {
- ret = _gf_true;
- } else {
- ret = _gf_false;
- }
-
- gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
- "tier_validate_compact_mode: ret = %i", ret);
-
- return ret;
-}
-
-int
-tier_init_methods(xlator_t *this)
-{
- int ret = -1;
- dht_conf_t *conf = NULL;
- dht_methods_t *methods = NULL;
-
- GF_VALIDATE_OR_GOTO("tier", this, err);
-
- conf = this->private;
-
- methods = &(conf->methods);
-
- methods->migration_get_dst_subvol = tier_migration_get_dst;
- methods->migration_other = tier_start;
- methods->migration_needed = tier_migration_needed;
- methods->layout_search = tier_search;
-
- ret = 0;
-err:
- return ret;
-}
-
-static void
-tier_save_vol_name(xlator_t *this)
-{
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
- char *suffix = NULL;
- int name_len = 0;
-
- conf = this->private;
- defrag = conf->defrag;
-
- suffix = strstr(this->name, "-tier-dht");
-
- if (suffix)
- name_len = suffix - this->name;
- else
- name_len = strlen(this->name);
-
- if (name_len > GD_VOLUME_NAME_MAX)
- name_len = GD_VOLUME_NAME_MAX;
-
- strncpy(defrag->tier_conf.volname, this->name, name_len);
- defrag->tier_conf.volname[name_len] = 0;
-}
-
-int
-tier_init(xlator_t *this)
-{
- int ret = -1;
- int freq = 0;
- int maxsize = 0;
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
- char *voldir = NULL;
- char *mode = NULL;
- char *paused = NULL;
- tier_mode_t tier_mode = DEFAULT_TIER_MODE;
- gf_boolean_t compact_mode = _gf_false;
-
- ret = dht_init(this);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "tier_init failed");
- goto out;
- }
-
- conf = this->private;
-
- ret = tier_init_methods(this);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "tier_init_methods failed");
- goto out;
- }
-
- if (conf->subvolume_cnt != 2) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Invalid number of subvolumes %d", conf->subvolume_cnt);
- goto out;
- }
-
- /* if instatiated from client side initialization is complete. */
- if (!conf->defrag) {
- ret = 0;
- goto out;
- }
-
- /* if instatiated from server side, load db libraries */
- ret = tier_load_externals(this);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Could not load externals. Aborting");
- goto out;
- }
-
- defrag = conf->defrag;
-
- defrag->tier_conf.last_demote_qfile_index = 0;
- defrag->tier_conf.last_promote_qfile_index = 0;
-
- defrag->tier_conf.is_tier = 1;
- defrag->this = this;
-
- ret = dict_get_int32(this->options, "tier-max-promote-file-size", &maxsize);
- if (ret) {
- maxsize = 0;
- }
-
- defrag->tier_conf.tier_max_promote_size = maxsize;
-
- ret = dict_get_int32(this->options, "tier-promote-frequency", &freq);
- if (ret) {
- freq = DEFAULT_PROMOTE_FREQ_SEC;
- }
-
- defrag->tier_conf.tier_promote_frequency = freq;
-
- ret = dict_get_int32(this->options, "tier-demote-frequency", &freq);
- if (ret) {
- freq = DEFAULT_DEMOTE_FREQ_SEC;
- }
-
- defrag->tier_conf.tier_demote_frequency = freq;
-
- ret = dict_get_int32(this->options, "tier-hot-compact-frequency", &freq);
- if (ret) {
- freq = DEFAULT_HOT_COMPACT_FREQ_SEC;
- }
-
- defrag->tier_conf.tier_compact_hot_frequency = freq;
-
- ret = dict_get_int32(this->options, "tier-cold-compact-frequency", &freq);
- if (ret) {
- freq = DEFAULT_COLD_COMPACT_FREQ_SEC;
- }
-
- defrag->tier_conf.tier_compact_cold_frequency = freq;
-
- ret = dict_get_int32(this->options, "watermark-hi", &freq);
- if (ret) {
- freq = DEFAULT_WM_HI;
- }
-
- defrag->tier_conf.watermark_hi = freq;
-
- ret = dict_get_int32(this->options, "watermark-low", &freq);
- if (ret) {
- freq = DEFAULT_WM_LOW;
- }
-
- defrag->tier_conf.watermark_low = freq;
-
- ret = dict_get_int32(this->options, "write-freq-threshold", &freq);
- if (ret) {
- freq = DEFAULT_WRITE_FREQ_SEC;
- }
-
- defrag->write_freq_threshold = freq;
-
- ret = dict_get_int32(this->options, "read-freq-threshold", &freq);
- if (ret) {
- freq = DEFAULT_READ_FREQ_SEC;
- }
-
- defrag->read_freq_threshold = freq;
-
- ret = dict_get_int32(this->options, "tier-max-mb", &freq);
- if (ret) {
- freq = DEFAULT_TIER_MAX_MIGRATE_MB;
- }
-
- defrag->tier_conf.max_migrate_bytes = (uint64_t)freq * 1024 * 1024;
-
- ret = dict_get_int32(this->options, "tier-max-files", &freq);
- if (ret) {
- freq = DEFAULT_TIER_MAX_MIGRATE_FILES;
- }
-
- defrag->tier_conf.max_migrate_files = freq;
-
- ret = dict_get_int32(this->options, "tier-query-limit",
- &(defrag->tier_conf.query_limit));
- if (ret) {
- defrag->tier_conf.query_limit = DEFAULT_TIER_QUERY_LIMIT;
- }
-
- ret = dict_get_str(this->options, "tier-compact", &mode);
-
- if (ret) {
- defrag->tier_conf.compact_active = DEFAULT_COMP_MODE;
- } else {
- compact_mode = tier_validate_compact_mode(mode);
- /* If compaction is now active, we need to inform the bricks on
- the hot and cold tier of this. See dht-common.h for more. */
- defrag->tier_conf.compact_active = compact_mode;
- if (compact_mode) {
- defrag->tier_conf.compact_mode_switched_hot = _gf_true;
- defrag->tier_conf.compact_mode_switched_cold = _gf_true;
- }
- }
-
- ret = dict_get_str(this->options, "tier-mode", &mode);
- if (ret) {
- defrag->tier_conf.mode = DEFAULT_TIER_MODE;
- } else {
- tier_mode = tier_validate_mode(mode);
- defrag->tier_conf.mode = tier_mode;
- }
-
- pthread_mutex_init(&defrag->tier_conf.pause_mutex, 0);
-
- gf_defrag_set_pause_state(&defrag->tier_conf, TIER_RUNNING);
-
- ret = dict_get_str(this->options, "tier-pause", &paused);
-
- if (paused && strcmp(paused, "on") == 0)
- gf_defrag_set_pause_state(&defrag->tier_conf, TIER_REQUEST_PAUSE);
-
- ret = gf_asprintf(&voldir, "%s/%s", DEFAULT_VAR_RUN_DIRECTORY, this->name);
- if (ret < 0)
- goto out;
-
- ret = mkdir_p(voldir, 0777, _gf_true);
- if (ret == -1 && errno != EEXIST) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "tier_init failed");
-
- GF_FREE(voldir);
- goto out;
- }
-
- GF_FREE(voldir);
-
- ret = gf_asprintf(&promotion_qfile, "%s/%s/promote",
- DEFAULT_VAR_RUN_DIRECTORY, this->name);
- if (ret < 0)
- goto out;
-
- ret = gf_asprintf(&demotion_qfile, "%s/%s/demote",
- DEFAULT_VAR_RUN_DIRECTORY, this->name);
- if (ret < 0) {
- GF_FREE(promotion_qfile);
- goto out;
- }
-
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "Promote/demote frequency %d/%d "
- "Write/Read freq thresholds %d/%d",
- defrag->tier_conf.tier_promote_frequency,
- defrag->tier_conf.tier_demote_frequency,
- defrag->write_freq_threshold, defrag->read_freq_threshold);
-
- tier_save_vol_name(this);
-
- ret = 0;
-
-out:
-
- return ret;
-}
-
-int
-tier_cli_pause_done(int op_ret, call_frame_t *sync_frame, void *data)
-{
- gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
- "Migrate file paused with op_ret %d", op_ret);
-
- return op_ret;
-}
-
-int
-tier_cli_pause(void *data)
-{
- gf_defrag_info_t *defrag = NULL;
- xlator_t *this = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
-
- this = data;
-
- conf = this->private;
- GF_VALIDATE_OR_GOTO(this->name, conf, exit);
-
- defrag = conf->defrag;
- GF_VALIDATE_OR_GOTO(this->name, defrag, exit);
-
- gf_defrag_pause_tier(this, defrag);
-
- ret = 0;
-exit:
- return ret;
-}
-
-int
-tier_reconfigure(xlator_t *this, dict_t *options)
-{
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
- char *mode = NULL;
- int migrate_mb = 0;
- gf_boolean_t req_pause = _gf_false;
- int ret = 0;
- call_frame_t *frame = NULL;
- gf_boolean_t last_compact_setting = _gf_false;
-
- conf = this->private;
-
- if (conf->defrag) {
- defrag = conf->defrag;
- GF_OPTION_RECONF("tier-max-promote-file-size",
- defrag->tier_conf.tier_max_promote_size, options,
- int32, out);
-
- GF_OPTION_RECONF("tier-promote-frequency",
- defrag->tier_conf.tier_promote_frequency, options,
- int32, out);
-
- GF_OPTION_RECONF("tier-demote-frequency",
- defrag->tier_conf.tier_demote_frequency, options,
- int32, out);
-
- GF_OPTION_RECONF("write-freq-threshold", defrag->write_freq_threshold,
- options, int32, out);
-
- GF_OPTION_RECONF("read-freq-threshold", defrag->read_freq_threshold,
- options, int32, out);
-
- GF_OPTION_RECONF("watermark-hi", defrag->tier_conf.watermark_hi,
- options, int32, out);
-
- GF_OPTION_RECONF("watermark-low", defrag->tier_conf.watermark_low,
- options, int32, out);
-
- last_compact_setting = defrag->tier_conf.compact_active;
-
- GF_OPTION_RECONF("tier-compact", defrag->tier_conf.compact_active,
- options, bool, out);
-
- if (last_compact_setting != defrag->tier_conf.compact_active) {
- defrag->tier_conf.compact_mode_switched_hot = _gf_true;
- defrag->tier_conf.compact_mode_switched_cold = _gf_true;
- gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "compact mode switched");
- }
-
- GF_OPTION_RECONF("tier-hot-compact-frequency",
- defrag->tier_conf.tier_compact_hot_frequency, options,
- int32, out);
-
- GF_OPTION_RECONF("tier-cold-compact-frequency",
- defrag->tier_conf.tier_compact_cold_frequency, options,
- int32, out);
-
- GF_OPTION_RECONF("tier-mode", mode, options, str, out);
- defrag->tier_conf.mode = tier_validate_mode(mode);
-
- GF_OPTION_RECONF("tier-max-mb", migrate_mb, options, int32, out);
- defrag->tier_conf.max_migrate_bytes = (uint64_t)migrate_mb * 1024 *
- 1024;
-
- GF_OPTION_RECONF("tier-max-files", defrag->tier_conf.max_migrate_files,
- options, int32, out);
-
- GF_OPTION_RECONF("tier-query-limit", defrag->tier_conf.query_limit,
- options, int32, out);
-
- GF_OPTION_RECONF("tier-pause", req_pause, options, bool, out);
-
- if (req_pause == _gf_true) {
- frame = create_frame(this, this->ctx->pool);
- if (!frame)
- goto out;
-
- frame->root->pid = GF_CLIENT_PID_DEFRAG;
-
- ret = synctask_new(this->ctx->env, tier_cli_pause,
- tier_cli_pause_done, frame, this);
-
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "pause tier failed on reconfigure");
- }
- } else {
- ret = gf_defrag_resume_tier(this, defrag);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "resume tier failed on reconfigure");
- }
- }
- }
-
-out:
- return dht_reconfigure(this, options);
-}
-
-void
-tier_fini(xlator_t *this)
-{
- if (libhandle)
- dlclose(libhandle);
-
- GF_FREE(demotion_qfile);
- GF_FREE(promotion_qfile);
-
- dht_fini(this);
-}
-
-class_methods_t class_methods = {.init = tier_init,
- .fini = tier_fini,
- .reconfigure = tier_reconfigure,
- .notify = dht_notify};
-
-struct xlator_fops fops = {
-
- .lookup = dht_lookup,
- .create = tier_create,
- .mknod = dht_mknod,
-
- .open = dht_open,
- .statfs = tier_statfs,
- .opendir = dht_opendir,
- .readdir = tier_readdir,
- .readdirp = tier_readdirp,
- .fsyncdir = dht_fsyncdir,
- .symlink = dht_symlink,
- .unlink = tier_unlink,
- .link = tier_link,
- .mkdir = dht_mkdir,
- .rmdir = dht_rmdir,
- .rename = dht_rename,
- .entrylk = dht_entrylk,
- .fentrylk = dht_fentrylk,
-
- /* Inode read operations */
- .stat = dht_stat,
- .fstat = dht_fstat,
- .access = dht_access,
- .readlink = dht_readlink,
- .getxattr = dht_getxattr,
- .fgetxattr = dht_fgetxattr,
- .readv = dht_readv,
- .flush = dht_flush,
- .fsync = dht_fsync,
- .inodelk = dht_inodelk,
- .finodelk = dht_finodelk,
- .lk = dht_lk,
-
- /* Inode write operations */
- .fremovexattr = dht_fremovexattr,
- .removexattr = dht_removexattr,
- .setxattr = dht_setxattr,
- .fsetxattr = dht_fsetxattr,
- .truncate = dht_truncate,
- .ftruncate = dht_ftruncate,
- .writev = dht_writev,
- .xattrop = dht_xattrop,
- .fxattrop = dht_fxattrop,
- .setattr = dht_setattr,
- .fsetattr = dht_fsetattr,
- .fallocate = dht_fallocate,
- .discard = dht_discard,
- .zerofill = dht_zerofill,
-};
-
-struct xlator_cbks cbks = {.release = dht_release, .forget = dht_forget};
diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h
deleted file mode 100644
index f0ffdfcd769..00000000000
--- a/xlators/cluster/dht/src/tier.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _TIER_H_
-#define _TIER_H_
-
-/******************************************************************************/
-/* This is from dht-rebalancer.c as we don't have dht-rebalancer.h */
-#include "dht-common.h"
-#include "xlator.h"
-#include <signal.h>
-#include <fnmatch.h>
-#include <signal.h>
-
-/*
- * Size of timer wheel. We would not promote or demote less
- * frequently than this number.
- */
-#define TIMER_SECS 3600
-
-#include "gfdb_data_store.h"
-#include <ctype.h>
-#include <sys/stat.h>
-
-#define PROMOTION_QFILE "promotequeryfile"
-#define DEMOTION_QFILE "demotequeryfile"
-
-#define TIER_HASHED_SUBVOL conf->subvolumes[0]
-#define TIER_UNHASHED_SUBVOL conf->subvolumes[1]
-
-#define GET_QFILE_PATH(is_promotion) \
- (is_promotion) ? promotion_qfile : demotion_qfile
-
-typedef struct tier_qfile_array {
- int *fd_array;
- ssize_t array_size;
- ssize_t next_index;
- /* Indicate the number of exhuasted FDs*/
- ssize_t exhausted_count;
-} tier_qfile_array_t;
-
-typedef struct _query_cbk_args {
- xlator_t *this;
- gf_defrag_info_t *defrag;
- /* This is write */
- int query_fd;
- int is_promotion;
- int is_compaction;
- /* This is for read */
- tier_qfile_array_t *qfile_array;
-} query_cbk_args_t;
-
-int
-gf_run_tier(xlator_t *this, gf_defrag_info_t *defrag);
-
-typedef struct gfdb_brick_info {
- gfdb_time_t *time_stamp;
- gf_boolean_t _gfdb_promote;
- query_cbk_args_t *_query_cbk_args;
-} gfdb_brick_info_t;
-
-typedef struct brick_list {
- xlator_t *xlator;
- char *brick_db_path;
- char brick_name[NAME_MAX];
- char qfile_path[PATH_MAX];
- struct list_head list;
-} tier_brick_list_t;
-
-typedef struct _dm_thread_args {
- xlator_t *this;
- gf_defrag_info_t *defrag;
- struct list_head *brick_list;
- int freq_time;
- int return_value;
- int is_promotion;
- int is_compaction;
- gf_boolean_t is_hot_tier;
-} migration_args_t;
-
-typedef enum tier_watermark_op_ {
- TIER_WM_NONE = 0,
- TIER_WM_LOW,
- TIER_WM_HI,
- TIER_WM_MID
-} tier_watermark_op_t;
-
-#define DEFAULT_PROMOTE_FREQ_SEC 120
-#define DEFAULT_DEMOTE_FREQ_SEC 120
-#define DEFAULT_HOT_COMPACT_FREQ_SEC 604800
-#define DEFAULT_COLD_COMPACT_FREQ_SEC 604800
-#define DEFAULT_DEMOTE_DEGRADED 1
-#define DEFAULT_WRITE_FREQ_SEC 0
-#define DEFAULT_READ_FREQ_SEC 0
-#define DEFAULT_WM_LOW 75
-#define DEFAULT_WM_HI 90
-#define DEFAULT_TIER_MODE TIER_MODE_TEST
-#define DEFAULT_COMP_MODE _gf_true
-#define DEFAULT_TIER_MAX_MIGRATE_MB 1000
-#define DEFAULT_TIER_MAX_MIGRATE_FILES 5000
-#define DEFAULT_TIER_QUERY_LIMIT 100
-
-#endif
diff --git a/xlators/cluster/dht/src/tier.sym b/xlators/cluster/dht/src/tier.sym
deleted file mode 100644
index 60205d145b6..00000000000
--- a/xlators/cluster/dht/src/tier.sym
+++ /dev/null
@@ -1,9 +0,0 @@
-fops
-cbks
-class_methods
-dht_methods
-tier_methods
-options
-mem_acct_init
-reconfigure
-dumpops
diff --git a/xlators/cluster/dht/src/unittest/dht_layout_mock.c b/xlators/cluster/dht/src/unittest/dht_layout_mock.c
index 49bf18b9fe6..771452963d1 100644
--- a/xlators/cluster/dht/src/unittest/dht_layout_mock.c
+++ b/xlators/cluster/dht/src/unittest/dht_layout_mock.c
@@ -7,10 +7,10 @@
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
#include "dht-common.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
int
dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p)
diff --git a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
index 72890070835..c94a1d0a2e1 100644
--- a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
+++ b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
@@ -9,8 +9,8 @@
*/
#include "dht-common.h"
-#include "logging.h"
-#include "xlator.h"
+#include <glusterfs/logging.h>
+#include <glusterfs/xlator.h>
#include <inttypes.h>
#include <stdarg.h>
diff --git a/xlators/cluster/ec/src/ec-code.c b/xlators/cluster/ec/src/ec-code.c
index 70878d794ca..03162ae05a9 100644
--- a/xlators/cluster/ec/src/ec-code.c
+++ b/xlators/cluster/ec/src/ec-code.c
@@ -14,7 +14,7 @@
#include <sys/stat.h>
#include <ctype.h>
-#include "syscall.h"
+#include <glusterfs/syscall.h>
#include "ec-mem-types.h"
#include "ec-code.h"
@@ -944,7 +944,7 @@ ec_code_cpu_check(uint32_t idx, char *list, uint32_t count)
{
ec_code_gen_t *gen;
char **ptr;
- char *table[count];
+ char *table[count + 1];
uint32_t i;
for (i = 0; i < count; i++) {
diff --git a/xlators/cluster/ec/src/ec-code.h b/xlators/cluster/ec/src/ec-code.h
index 355209c3944..75fb35d93e3 100644
--- a/xlators/cluster/ec/src/ec-code.h
+++ b/xlators/cluster/ec/src/ec-code.h
@@ -11,8 +11,8 @@
#ifndef __EC_CODE_H__
#define __EC_CODE_H__
-#include "xlator.h"
-#include "list.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/list.h>
#include "ec-types.h"
#include "ec-galois.h"
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c
index 551adfac043..703a30e2485 100644
--- a/xlators/cluster/ec/src/ec-combine.c
+++ b/xlators/cluster/ec/src/ec-combine.c
@@ -11,14 +11,14 @@
#include <fnmatch.h>
#include "libxlator.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "ec-types.h"
#include "ec-helpers.h"
#include "ec-common.h"
#include "ec-combine.h"
#include "ec-messages.h"
-#include "quota-common-utils.h"
+#include <glusterfs/quota-common-utils.h>
#define EC_QUOTA_PREFIX "trusted.glusterfs.quota."
@@ -174,15 +174,19 @@ ec_iatt_combine(ec_fop_data_t *fop, struct iatt *dst, struct iatt *src,
}
if (failed) {
gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_IATT_COMBINE_FAIL,
- "Failed to combine iatt (inode: %lu-%lu, links: %u-%u, "
- "uid: %u-%u, gid: %u-%u, rdev: %lu-%lu, size: %lu-%lu, "
- "mode: %o-%o)",
+ "Failed to combine iatt (inode: %" PRIu64 "-%" PRIu64
+ ", "
+ "links: %u-%u, uid: %u-%u, gid: %u-%u, "
+ "rdev: %" PRIu64 "-%" PRIu64 ", size: %" PRIu64 "-%" PRIu64
+ ", "
+ "mode: %o-%o), %s",
dst[i].ia_ino, src[i].ia_ino, dst[i].ia_nlink,
src[i].ia_nlink, dst[i].ia_uid, src[i].ia_uid, dst[i].ia_gid,
src[i].ia_gid, dst[i].ia_rdev, src[i].ia_rdev,
dst[i].ia_size, src[i].ia_size,
st_mode_from_ia(dst[i].ia_prot, dst[i].ia_type),
- st_mode_from_ia(src[i].ia_prot, dst[i].ia_type));
+ st_mode_from_ia(src[i].ia_prot, dst[i].ia_type),
+ ec_msg_str(fop));
return 0;
}
@@ -339,9 +343,8 @@ out:
}
static int32_t
-ec_dict_data_concat(const char *fmt, ec_cbk_data_t *cbk, int32_t which,
- char *key, char *new_key, const char *def,
- gf_boolean_t global, ...)
+ec_dict_data_concat(ec_cbk_data_t *cbk, int32_t which, char *key, char *new_key,
+ const char *def, gf_boolean_t global, const char *fmt, ...)
{
ec_t *ec = cbk->fop->xl->private;
data_t *data[ec->nodes];
@@ -353,7 +356,7 @@ ec_dict_data_concat(const char *fmt, ec_cbk_data_t *cbk, int32_t which,
ec_dict_list(data, cbk, which, key, global);
- va_start(args, global);
+ va_start(args, fmt);
err = ec_concat_prepare(cbk->fop->xl, &pre, &sep, &post, fmt, args);
va_end(args);
@@ -482,22 +485,12 @@ ec_dict_data_merge(ec_cbk_data_t *cbk, int32_t which, char *key)
tmp = NULL;
- len = dict_serialized_length(lockinfo);
- if (len < 0) {
- err = len;
-
- goto out;
- }
- ptr = GF_MALLOC(len, gf_common_mt_char);
- if (ptr == NULL) {
- err = -ENOMEM;
-
- goto out;
- }
- err = dict_serialize(lockinfo, ptr);
+ err = dict_allocate_and_serialize(lockinfo, (char **)&ptr,
+ (unsigned int *)&len);
if (err != 0) {
goto out;
}
+
dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
err = dict_set_dynptr(dict, key, ptr, len);
if (err != 0) {
@@ -684,7 +677,7 @@ ec_dict_data_quota(ec_cbk_data_t *cbk, int32_t which, char *key)
*/
for (i = 0; i < ec->nodes; i++) {
if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA) ||
- (quota_data_to_meta(data[i], QUOTA_SIZE_KEY, &size) < 0)) {
+ (quota_data_to_meta(data[i], &size) < 0)) {
continue;
}
@@ -736,14 +729,14 @@ ec_dict_data_combine(dict_t *dict, char *key, data_t *value, void *arg)
if ((strcmp(key, GF_XATTR_PATHINFO_KEY) == 0) ||
(strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0)) {
- return ec_dict_data_concat("(<EC:%s> { })", data->cbk, data->which, key,
- NULL, NULL, _gf_false,
+ return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL,
+ _gf_false, _gf_false, "(<EC:%s> { })",
data->cbk->fop->xl->name);
}
if (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) {
- return ec_dict_data_concat("{\n}", data->cbk, data->which, key, NULL,
- NULL, _gf_false);
+ return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL,
+ _gf_false, "{\n}");
}
if (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) == 0) {
@@ -773,9 +766,9 @@ ec_dict_data_combine(dict_t *dict, char *key, data_t *value, void *arg)
if (XATTR_IS_NODE_UUID(key)) {
if (data->cbk->fop->int32) {
/* List of node uuid is requested */
- return ec_dict_data_concat("{ }", data->cbk, data->which, key,
+ return ec_dict_data_concat(data->cbk, data->which, key,
GF_XATTR_LIST_NODE_UUIDS_KEY, UUID0_STR,
- _gf_true);
+ _gf_true, "{ }");
} else {
return ec_dict_data_uuid(data->cbk, data->which, key);
}
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 0eee0a3363f..b955efd8c2d 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -8,8 +8,8 @@
cases as published by the Free Software Foundation.
*/
-#include "byte-order.h"
-#include "hashfn.h"
+#include <glusterfs/byte-order.h>
+#include <glusterfs/hashfn.h>
#include "ec-mem-types.h"
#include "ec-types.h"
@@ -44,16 +44,16 @@ ec_update_fd_status(fd_t *fd, xlator_t *xl, int idx, int32_t ret_status)
UNLOCK(&fd->lock);
}
-static int
-ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t *need_open)
+static uintptr_t
+ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t mask)
{
int i = 0;
int count = 0;
ec_t *ec = NULL;
ec_fd_t *fd_ctx = NULL;
+ uintptr_t need_open = 0;
ec = this->private;
- *need_open = 0;
fd_ctx = ec_fd_get(fd, this);
if (!fd_ctx)
@@ -63,9 +63,9 @@ ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t *need_open)
{
for (i = 0; i < ec->nodes; i++) {
if ((fd_ctx->fd_status[i] == EC_FD_NOT_OPENED) &&
- (ec->xl_up & (1 << i))) {
+ ((ec->xl_up & (1 << i)) != 0) && ((mask & (1 << i)) != 0)) {
fd_ctx->fd_status[i] = EC_FD_OPENING;
- *need_open |= (1 << i);
+ need_open |= (1 << i);
count++;
}
}
@@ -76,10 +76,11 @@ ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t *need_open)
* then ignore fixing the fd as it has been
* requested from heal operation.
*/
- if (count >= ec->fragments)
- count = 0;
+ if (count >= ec->fragments) {
+ need_open = 0;
+ }
- return count;
+ return need_open;
}
static gf_boolean_t
@@ -96,11 +97,11 @@ ec_is_fd_fixable(fd_t *fd)
}
static void
-ec_fix_open(ec_fop_data_t *fop)
+ec_fix_open(ec_fop_data_t *fop, uintptr_t mask)
{
- int call_count = 0;
uintptr_t need_open = 0;
int ret = 0;
+ int32_t flags = 0;
loc_t loc = {
0,
};
@@ -109,9 +110,10 @@ ec_fix_open(ec_fop_data_t *fop)
goto out;
/* Evaluate how many remote fd's to be opened */
- call_count = ec_fd_ctx_need_open(fop->fd, fop->xl, &need_open);
- if (!call_count)
+ need_open = ec_fd_ctx_need_open(fop->fd, fop->xl, mask);
+ if (need_open == 0) {
goto out;
+ }
loc.inode = inode_ref(fop->fd->inode);
gf_uuid_copy(loc.gfid, fop->fd->inode->gfid);
@@ -120,34 +122,38 @@ ec_fix_open(ec_fop_data_t *fop)
goto out;
}
+ flags = fop->fd->flags & (~(O_TRUNC | O_APPEND | O_CREAT | O_EXCL));
if (IA_IFDIR == fop->fd->inode->ia_type) {
- ec_opendir(fop->frame, fop->xl, need_open, EC_MINIMUM_ONE, NULL, NULL,
+ ec_opendir(fop->frame, fop->xl, need_open,
+ EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL,
&fop->loc[0], fop->fd, NULL);
} else {
- ec_open(fop->frame, fop->xl, need_open, EC_MINIMUM_ONE, NULL, NULL,
- &loc, fop->fd->flags, fop->fd, NULL);
+ ec_open(fop->frame, fop->xl, need_open,
+ EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL, &loc,
+ flags, fop->fd, NULL);
}
out:
loc_wipe(&loc);
}
-off_t
-ec_range_end_get(off_t fl_start, size_t fl_size)
+static off_t
+ec_range_end_get(off_t fl_start, uint64_t fl_size)
{
- off_t fl_end = 0;
- switch (fl_size) {
- case 0:
- return fl_start;
- case LLONG_MAX: /*Infinity*/
- return LLONG_MAX;
- default:
- fl_end = fl_start + fl_size - 1;
- if (fl_end < 0) /*over-flow*/
- return LLONG_MAX;
- else
- return fl_end;
+ if (fl_size > 0) {
+ if (fl_size >= EC_RANGE_FULL) {
+ /* Infinity */
+ fl_start = LLONG_MAX;
+ } else {
+ fl_start += fl_size - 1;
+ if (fl_start < 0) {
+ /* Overflow */
+ fl_start = LLONG_MAX;
+ }
+ }
}
+
+ return fl_start;
}
static gf_boolean_t
@@ -224,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
int32_t
ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
- uintptr_t bad, dict_t *xdata)
+ uintptr_t bad, uint32_t pending, dict_t *xdata)
{
if (op_ret < 0) {
gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
@@ -310,16 +316,19 @@ ec_check_status(ec_fop_data_t *fop)
}
}
- gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
- "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
- "remaining=%s, good=%s, bad=%s)",
- gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
- ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
- ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
- ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
- ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
- ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
- ec->nodes));
+ gf_msg(
+ fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
+ "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
+ "remaining=%s, good=%s, bad=%s,"
+ "(Least significant bit represents first client/brick of subvol), %s)",
+ gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
+ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+ ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+ ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
+ ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
+ ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
+ ec->nodes),
+ ec_msg_str(fop));
if (fop->use_fd) {
if (fop->fd != NULL) {
ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
@@ -493,12 +502,16 @@ ec_resume(ec_fop_data_t *fop, int32_t error)
}
void
-ec_resume_parent(ec_fop_data_t *fop, int32_t error)
+ec_resume_parent(ec_fop_data_t *fop)
{
ec_fop_data_t *parent;
+ int32_t error = 0;
parent = fop->parent;
if (parent != NULL) {
+ if ((fop->fop_flags & EC_FOP_NO_PROPAGATE_ERROR) == 0) {
+ error = fop->error;
+ }
ec_trace("RESUME_PARENT", fop, "error=%u", error);
fop->parent = NULL;
ec_resume(parent, error);
@@ -591,6 +604,8 @@ ec_internal_op(ec_fop_data_t *fop)
return _gf_true;
if (fop->id == GF_FOP_FXATTROP)
return _gf_true;
+ if (fop->id == GF_FOP_OPEN)
+ return _gf_true;
return _gf_false;
}
@@ -601,10 +616,10 @@ ec_msg_str(ec_fop_data_t *fop)
loc_t *loc2 = NULL;
char gfid1[64] = {0};
char gfid2[64] = {0};
+ ec_fop_data_t *parent = fop->parent;
if (fop->errstr)
return fop->errstr;
-
if (!fop->use_fd) {
loc1 = &fop->loc[0];
loc2 = &fop->loc[1];
@@ -612,24 +627,46 @@ ec_msg_str(ec_fop_data_t *fop)
if (fop->id == GF_FOP_RENAME) {
gf_asprintf(&fop->errstr,
"FOP : '%s' failed on '%s' and '%s' with gfids "
- "%s and %s respectively",
+ "%s and %s respectively. Parent FOP: %s",
ec_fop_name(fop->id), loc1->path, loc2->path,
uuid_utoa_r(loc1->gfid, gfid1),
- uuid_utoa_r(loc2->gfid, gfid2));
+ uuid_utoa_r(loc2->gfid, gfid2),
+ parent ? ec_fop_name(parent->id) : "No Parent");
} else {
- gf_asprintf(&fop->errstr, "FOP : '%s' failed on '%s' with gfid %s",
- ec_fop_name(fop->id), loc1->path,
- uuid_utoa_r(loc1->gfid, gfid1));
+ gf_asprintf(
+ &fop->errstr,
+ "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s",
+ ec_fop_name(fop->id), loc1->path,
+ uuid_utoa_r(loc1->gfid, gfid1),
+ parent ? ec_fop_name(parent->id) : "No Parent");
}
} else {
- gf_asprintf(&fop->errstr, "FOP : '%s' failed on gfid %s",
- ec_fop_name(fop->id),
- uuid_utoa_r(fop->fd->inode->gfid, gfid1));
+ gf_asprintf(
+ &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s",
+ ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1),
+ parent ? ec_fop_name(parent->id) : "No Parent");
}
return fop->errstr;
}
-int32_t
+static void
+ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need,
+ int32_t loglevel)
+{
+ ec_t *ec = fop->xl->private;
+ char str1[32], str2[32], str3[32];
+
+ gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT,
+ "Insufficient available children for this request: "
+ "Have : %d, Need : %u : Child UP : %s "
+ "Mask: %s, Healing : %s : %s ",
+ have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+ ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+ ec_bin(str3, sizeof(str3), fop->healing, ec->nodes),
+ ec_msg_str(fop));
+}
+
+static int32_t
ec_child_select(ec_fop_data_t *fop)
{
ec_t *ec = fop->xl->private;
@@ -643,12 +680,15 @@ ec_child_select(ec_fop_data_t *fop)
* unlock should go on all subvols where lock is performed*/
if (fop->parent && !ec_internal_op(fop)) {
fop->mask &= (fop->parent->mask & ~fop->parent->healing);
+ if (ec_is_data_fop(fop->id)) {
+ fop->healing |= fop->parent->healing;
+ }
}
if ((fop->mask & ~ec->xl_up) != 0) {
gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_EXEC_UNAVAIL,
"Executing operation with "
- "some subvolumes unavailable. (%lX). %s ",
+ "some subvolumes unavailable. (%" PRIXPTR "). %s ",
fop->mask & ~ec->xl_up, ec_msg_str(fop));
fop->mask &= ec->xl_up;
}
@@ -683,15 +723,18 @@ ec_child_select(ec_fop_data_t *fop)
ec_trace("SELECT", fop, "");
if ((num < fop->minimum) && (num < ec->fragments)) {
- gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT,
- "Insufficient available children "
- "for this request (have %d, need "
- "%d). %s",
- num, fop->minimum, ec_msg_str(fop));
+ ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR);
return 0;
}
- ec_sleep(fop);
+ if (!fop->parent && fop->lock_count &&
+ (fop->locks[0].update[EC_DATA_TXN] ||
+ fop->locks[0].update[EC_METADATA_TXN])) {
+ if (ec->quorum_count && (num < ec->quorum_count)) {
+ ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR);
+ return 0;
+ }
+ }
return 1;
}
@@ -771,6 +814,8 @@ ec_dispatch_one(ec_fop_data_t *fop)
ec_dispatch_start(fop);
if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
fop->expected = 1;
fop->first = ec_select_first_by_read_policy(fop->xl->private, fop);
@@ -805,6 +850,8 @@ ec_dispatch_inc(ec_fop_data_t *fop)
ec_dispatch_start(fop);
if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
fop->expected = gf_bits_count(fop->remaining);
fop->first = 0;
@@ -818,6 +865,8 @@ ec_dispatch_all(ec_fop_data_t *fop)
ec_dispatch_start(fop);
if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
fop->expected = gf_bits_count(fop->remaining);
fop->first = 0;
@@ -836,6 +885,8 @@ ec_dispatch_min(ec_fop_data_t *fop)
ec_dispatch_start(fop);
if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
fop->expected = count = ec->fragments;
fop->first = ec_select_first_by_read_policy(fop->xl->private, fop);
idx = fop->first - 1;
@@ -850,6 +901,23 @@ ec_dispatch_min(ec_fop_data_t *fop)
}
}
+void
+ec_succeed_all(ec_fop_data_t *fop)
+{
+ ec_dispatch_start(fop);
+
+ if (ec_child_select(fop)) {
+ fop->expected = gf_bits_count(fop->remaining);
+ fop->first = 0;
+
+ /* Simulate a successful execution on all bricks */
+ ec_trace("SUCCEED", fop, "");
+
+ fop->good = fop->remaining;
+ fop->remaining = 0;
+ }
+}
+
ec_lock_t *
ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc)
{
@@ -870,7 +938,7 @@ ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc)
lock = mem_get0(ec->lock_pool);
if (lock != NULL) {
- lock->good_mask = -1ULL;
+ lock->good_mask = UINTPTR_MAX;
INIT_LIST_HEAD(&lock->owners);
INIT_LIST_HEAD(&lock->waiting);
INIT_LIST_HEAD(&lock->frozen);
@@ -903,9 +971,9 @@ ec_lock_compare(ec_lock_t *lock1, ec_lock_t *lock2)
return gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid);
}
-void
+static void
ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, loc_t *base,
- off_t fl_start, size_t fl_size)
+ off_t fl_start, uint64_t fl_size)
{
ec_lock_link_t *link;
@@ -945,9 +1013,9 @@ ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, loc_t *base,
lock->refs_pending++;
}
-void
+static void
ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
- loc_t *base, off_t fl_start, size_t fl_size)
+ loc_t *base, off_t fl_start, uint64_t fl_size)
{
ec_lock_t *lock = NULL;
ec_inode_t *ctx;
@@ -1019,7 +1087,7 @@ unlock:
void
ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
- off_t fl_start, size_t fl_size)
+ off_t fl_start, uint64_t fl_size)
{
ec_lock_prepare_inode_internal(fop, loc, flags, NULL, fl_start, fl_size);
}
@@ -1048,14 +1116,14 @@ ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
base = NULL;
}
- ec_lock_prepare_inode_internal(fop, &tmp, flags, base, 0, LLONG_MAX);
+ ec_lock_prepare_inode_internal(fop, &tmp, flags, base, 0, EC_RANGE_FULL);
loc_wipe(&tmp);
}
void
ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, off_t fl_start,
- size_t fl_size)
+ uint64_t fl_size)
{
loc_t loc;
int32_t err;
@@ -1371,27 +1439,28 @@ ec_get_size_version(ec_lock_link_t *link)
!ec_is_data_fop(fop->id))
link->optimistic_changelog = _gf_true;
+ memset(&loc, 0, sizeof(loc));
+
+ LOCK(&lock->loc.inode->lock);
+
set_dirty = ec_set_dirty_flag(link, ctx, dirty);
/* If ec metadata has already been retrieved, do not try again. */
- if (ctx->have_info && (!set_dirty)) {
+ if (ctx->have_info) {
if (ec_is_data_fop(fop->id)) {
fop->healing |= lock->healing;
}
- return;
+ if (!set_dirty)
+ goto unlock;
}
/* Determine if there's something we need to retrieve for the current
* operation. */
if (!set_dirty && !lock->query && (lock->loc.inode->ia_type != IA_IFREG) &&
(lock->loc.inode->ia_type != IA_INVAL)) {
- return;
+ goto unlock;
}
- memset(&loc, 0, sizeof(loc));
-
- LOCK(&lock->loc.inode->lock);
-
changed_flags = ec_set_xattrop_flags_and_params(lock, link, dirty);
if (link->waiting_flags) {
/* This fop needs to wait until all its flags are cleared which
@@ -1402,6 +1471,7 @@ ec_get_size_version(ec_lock_link_t *link)
GF_ASSERT(!changed_flags);
}
+unlock:
UNLOCK(&lock->loc.inode->lock);
if (!changed_flags)
@@ -1813,6 +1883,10 @@ ec_lock_acquired(ec_lock_link_t *link)
LOCK(&lock->loc.inode->lock);
lock->acquired = _gf_true;
+ if (lock->contention) {
+ lock->release = _gf_true;
+ lock->contention = _gf_false;
+ }
ec_lock_update_fd(lock, fop);
ec_lock_wake_shared(lock, &list);
@@ -1823,7 +1897,8 @@ ec_lock_acquired(ec_lock_link_t *link)
if (fop->use_fd &&
(link->update[EC_DATA_TXN] || link->update[EC_METADATA_TXN])) {
- ec_fix_open(fop);
+ /* Try to reopen closed fd's only if lock has succeeded. */
+ ec_fix_open(fop, lock->mask);
}
ec_lock_resume_shared(&list);
@@ -1837,15 +1912,20 @@ ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
ec_lock_link_t *link = NULL;
ec_lock_t *lock = NULL;
+ link = fop->data;
+ lock = link->lock;
if (op_ret >= 0) {
- link = fop->data;
- lock = link->lock;
lock->mask = lock->good_mask = fop->good;
lock->healing = 0;
ec_lock_acquired(link);
ec_lock(fop->parent);
} else {
+ LOCK(&lock->loc.inode->lock);
+ {
+ lock->contention = _gf_false;
+ }
+ UNLOCK(&lock->loc.inode->lock);
gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_PREOP_LOCK_FAILED,
"Failed to complete preop lock");
}
@@ -2176,7 +2256,7 @@ ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
if (op_ret < 0) {
gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_UNLOCK_FAILED,
- "entry/inode unlocking failed (%s)", ec_fop_name(link->fop->id));
+ "entry/inode unlocking failed :(%s)", ec_msg_str(link->fop));
} else {
ec_trace("UNLOCKED", link->fop, "lock=%p", link->lock);
}
@@ -2213,6 +2293,23 @@ ec_unlock_lock(ec_lock_link_t *link)
}
}
+void
+ec_inode_bad_inc(inode_t *inode, xlator_t *xl)
+{
+ ec_inode_t *ctx = NULL;
+
+ LOCK(&inode->lock);
+ {
+ ctx = __ec_inode_get(inode, xl);
+ if (ctx == NULL) {
+ goto unlock;
+ }
+ ctx->bad_version++;
+ }
+unlock:
+ UNLOCK(&inode->lock);
+}
+
int32_t
ec_update_size_version_done(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xattr,
@@ -2228,6 +2325,12 @@ ec_update_size_version_done(call_frame_t *frame, void *cookie, xlator_t *this,
ctx = lock->ctx;
if (op_ret < 0) {
+ if (link->lock->fd == NULL) {
+ ec_inode_bad_inc(link->lock->loc.inode, this);
+ } else {
+ ec_inode_bad_inc(link->lock->fd->inode, this);
+ }
+
gf_msg(fop->xl->name, fop_log_level(fop->id, op_errno), op_errno,
EC_MSG_SIZE_VERS_UPDATE_FAIL,
"Failed to update version and size. %s", ec_msg_str(fop));
@@ -2370,37 +2473,47 @@ ec_update_info(ec_lock_link_t *link)
uint64_t dirty[2] = {0, 0};
uint64_t size;
ec_t *ec = NULL;
+ uintptr_t mask;
lock = link->lock;
ctx = lock->ctx;
ec = link->fop->xl->private;
/* pre_version[*] will be 0 if have_version is false */
- version[0] = ctx->post_version[0] - ctx->pre_version[0];
- version[1] = ctx->post_version[1] - ctx->pre_version[1];
+ version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] -
+ ctx->pre_version[EC_DATA_TXN];
+ version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] -
+ ctx->pre_version[EC_METADATA_TXN];
size = ctx->post_size - ctx->pre_size;
/* If we set the dirty flag for update fop, we have to unset it.
* If fop has failed on some bricks, leave the dirty as marked. */
+
if (lock->unlock_now) {
+ if (version[EC_DATA_TXN]) {
+ /*A data fop will have difference in post and pre version
+ *and for data fop we send writes on healing bricks also */
+ mask = lock->good_mask | lock->healing;
+ } else {
+ mask = lock->good_mask;
+ }
/* Ensure that nodes are up while doing final
* metadata update.*/
- if (!(ec->node_mask & ~lock->good_mask) &&
- !(ec->node_mask & ~ec->xl_up)) {
- if (ctx->dirty[0] != 0) {
- dirty[0] = -1;
+ if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) {
+ if (ctx->dirty[EC_DATA_TXN] != 0) {
+ dirty[EC_DATA_TXN] = -1;
}
- if (ctx->dirty[1] != 0) {
- dirty[1] = -1;
+ if (ctx->dirty[EC_METADATA_TXN] != 0) {
+ dirty[EC_METADATA_TXN] = -1;
}
/*If everything is fine and we already
*have version xattr set on entry, there
*is no need to update version again*/
- if (ctx->pre_version[0]) {
- version[0] = 0;
+ if (ctx->pre_version[EC_DATA_TXN]) {
+ version[EC_DATA_TXN] = 0;
}
- if (ctx->pre_version[1]) {
- version[1] = 0;
+ if (ctx->pre_version[EC_METADATA_TXN]) {
+ version[EC_METADATA_TXN] = 0;
}
} else {
link->optimistic_changelog = _gf_false;
@@ -2409,8 +2522,8 @@ ec_update_info(ec_lock_link_t *link)
memset(ctx->dirty, 0, sizeof(ctx->dirty));
}
- if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) ||
- (dirty[1] != 0)) {
+ if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) ||
+ (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) {
ec_update_size_version(link, version, size, dirty);
return _gf_true;
}
@@ -2452,13 +2565,20 @@ ec_lock_release(ec_t *ec, inode_t *inode)
goto done;
}
lock = ctx->inode_lock;
- if ((lock == NULL) || !lock->acquired || lock->release) {
+ if ((lock == NULL) || lock->release) {
goto done;
}
gf_msg_debug(ec->xl->name, 0, "Releasing inode %p due to lock contention",
inode);
+ if (!lock->acquired) {
+ /* This happens if some bricks already got the lock while inodelk is in
+ * progress. Set release to true after lock is acquired*/
+ lock->contention = _gf_true;
+ goto done;
+ }
+
/* The lock is not marked to be released, so the frozen list should be
* empty. */
GF_ASSERT(list_empty(&lock->frozen));
@@ -2910,3 +3030,13 @@ ec_manager(ec_fop_data_t *fop, int32_t error)
__ec_manager(fop, error);
}
+
+gf_boolean_t
+__ec_is_last_fop(ec_t *ec)
+{
+ if ((list_empty(&ec->pending_fops)) &&
+ (GF_ATOMIC_GET(ec->async_fop_count) == 0)) {
+ return _gf_true;
+ }
+ return _gf_false;
+}
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index bea0c045a47..51493612ac6 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -11,8 +11,7 @@
#ifndef __EC_COMMON_H__
#define __EC_COMMON_H__
-#include "xlator.h"
-
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
#include "ec-data.h"
typedef enum { EC_DATA_TXN, EC_METADATA_TXN } ec_txn_t;
@@ -26,6 +25,30 @@ typedef enum { EC_DATA_TXN, EC_METADATA_TXN } ec_txn_t;
#define EC_FLAG_LOCK_SHARED 0x0001
+#define QUORUM_CBK(fn, fop, frame, cookie, this, op_ret, op_errno, params...) \
+ do { \
+ ec_t *__ec = fop->xl->private; \
+ int32_t __op_ret = 0; \
+ int32_t __op_errno = 0; \
+ int32_t __success_count = gf_bits_count(fop->good); \
+ \
+ __op_ret = op_ret; \
+ __op_errno = op_errno; \
+ if (!fop->parent && frame && \
+ (GF_CLIENT_PID_SELF_HEALD != frame->root->pid) && \
+ __ec->quorum_count && (__success_count < __ec->quorum_count) && \
+ op_ret >= 0) { \
+ __op_ret = -1; \
+ __op_errno = EIO; \
+ gf_msg(__ec->xl->name, GF_LOG_ERROR, 0, \
+ EC_MSG_CHILDS_INSUFFICIENT, \
+ "Insufficient available children for this request " \
+ "(have %d, need %d). %s", \
+ __success_count, __ec->quorum_count, ec_msg_str(fop)); \
+ } \
+ fn(frame, cookie, this, __op_ret, __op_errno, params); \
+ } while (0)
+
enum _ec_xattrop_flags {
EC_FLAG_XATTROP,
EC_FLAG_DATA_DIRTY,
@@ -54,9 +77,12 @@ enum _ec_xattrop_flags {
#define EC_SELFHEAL_BIT 62
-#define EC_MINIMUM_ONE -1
-#define EC_MINIMUM_MIN -2
-#define EC_MINIMUM_ALL -3
+#define EC_MINIMUM_ONE (1 << 6)
+#define EC_MINIMUM_MIN (2 << 6)
+#define EC_MINIMUM_ALL (3 << 6)
+#define EC_FOP_NO_PROPAGATE_ERROR (1 << 8)
+#define EC_FOP_MINIMUM(_flags) ((_flags)&255)
+#define EC_FOP_FLAGS(_flags) ((_flags) & ~255)
#define EC_UPDATE_DATA 1
#define EC_UPDATE_META 2
@@ -95,6 +121,9 @@ enum _ec_xattrop_flags {
#define EC_STATE_HEAL_POST_INODELK_UNLOCK 217
#define EC_STATE_HEAL_DISPATCH 218
+/* Value to cover the full range of a file */
+#define EC_RANGE_FULL ((uint64_t)LLONG_MAX + 1)
+
gf_boolean_t
ec_dispatch_one_retry(ec_fop_data_t *fop, ec_cbk_data_t **cbk);
void
@@ -120,13 +149,13 @@ ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro);
void
ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
- off_t fl_start, size_t fl_size);
+ off_t fl_start, uint64_t fl_size);
void
ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
uint32_t flags);
void
ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, off_t fl_start,
- size_t fl_size);
+ uint64_t fl_size);
void
ec_lock(ec_fop_data_t *fop);
void
@@ -160,11 +189,14 @@ void
ec_dispatch_one(ec_fop_data_t *fop);
void
+ec_succeed_all(ec_fop_data_t *fop);
+
+void
ec_sleep(ec_fop_data_t *fop);
void
ec_resume(ec_fop_data_t *fop, int32_t error);
void
-ec_resume_parent(ec_fop_data_t *fop, int32_t error);
+ec_resume_parent(ec_fop_data_t *fop);
void
ec_manager(ec_fop_data_t *fop, int32_t error);
@@ -187,4 +219,16 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
void
ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index,
int32_t ret_status);
+gf_boolean_t
+ec_is_entry_healing(ec_fop_data_t *fop);
+void
+ec_set_entry_healing(ec_fop_data_t *fop);
+void
+ec_reset_entry_healing(ec_fop_data_t *fop);
+char *
+ec_msg_str(ec_fop_data_t *fop);
+gf_boolean_t
+__ec_is_last_fop(ec_t *ec);
+void
+ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop);
#endif /* __EC_COMMON_H__ */
diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c
index fae8843a679..06388833546 100644
--- a/xlators/cluster/ec/src/ec-data.c
+++ b/xlators/cluster/ec/src/ec-data.c
@@ -8,7 +8,6 @@
cases as published by the Free Software Foundation.
*/
-#include "ec-mem-types.h"
#include "ec-helpers.h"
#include "ec-common.h"
#include "ec-data.h"
@@ -98,7 +97,7 @@ ec_cbk_data_destroy(ec_cbk_data_t *cbk)
ec_fop_data_t *
ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id,
- uint32_t flags, uintptr_t target, int32_t minimum,
+ uint32_t flags, uintptr_t target, uint32_t fop_flags,
ec_wind_f wind, ec_handler_f handler, ec_cbk_t cbks,
void *data)
{
@@ -151,7 +150,8 @@ ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id,
fop->refs = 1;
fop->flags = flags;
- fop->minimum = minimum;
+ fop->minimum = EC_FOP_MINIMUM(fop_flags);
+ fop->fop_flags = EC_FOP_FLAGS(fop_flags);
fop->mask = target;
fop->wind = wind;
@@ -201,11 +201,13 @@ ec_handle_last_pending_fop_completion(ec_fop_data_t *fop, gf_boolean_t *notify)
{
ec_t *ec = fop->xl->private;
+ *notify = _gf_false;
+
if (!list_empty(&fop->pending_list)) {
LOCK(&ec->lock);
{
list_del_init(&fop->pending_list);
- *notify = list_empty(&ec->pending_fops);
+ *notify = __ec_is_last_fop(ec);
}
UNLOCK(&ec->lock);
}
@@ -271,7 +273,7 @@ ec_fop_data_release(ec_fop_data_t *fop)
loc_wipe(&fop->loc[1]);
GF_FREE(fop->errstr);
- ec_resume_parent(fop, fop->error);
+ ec_resume_parent(fop);
ec_fop_cleanup(fop);
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index 112536d554c..c8a74ffe1ed 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -18,7 +18,7 @@ ec_cbk_data_allocate(call_frame_t *frame, xlator_t *this, ec_fop_data_t *fop,
int32_t id, int32_t idx, int32_t op_ret, int32_t op_errno);
ec_fop_data_t *
ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id,
- uint32_t flags, uintptr_t target, int32_t minimum,
+ uint32_t flags, uintptr_t target, uint32_t fop_flags,
ec_wind_f wind, ec_handler_f handler, ec_cbk_t cbks,
void *data);
void
diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c
index 8db92b9d92d..f71dcfac293 100644
--- a/xlators/cluster/ec/src/ec-dir-read.c
+++ b/xlators/cluster/ec/src/ec-dir-read.c
@@ -8,15 +8,11 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-
#include "ec.h"
#include "ec-messages.h"
#include "ec-helpers.h"
#include "ec-common.h"
#include "ec-combine.h"
-#include "ec-method.h"
#include "ec-fops.h"
/****************************************************************
@@ -127,13 +123,15 @@ ec_manager_opendir(ec_fop_data_t *fop, int32_t state)
return EC_STATE_REPORT;
}
- err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]);
- if (err != 0) {
- UNLOCK(&fop->fd->lock);
+ if (!ctx->loc.inode) {
+ err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]);
+ if (err != 0) {
+ UNLOCK(&fop->fd->lock);
- fop->error = -err;
+ fop->error = -err;
- return EC_STATE_REPORT;
+ return EC_STATE_REPORT;
+ }
}
UNLOCK(&fop->fd->lock);
@@ -142,7 +140,7 @@ ec_manager_opendir(ec_fop_data_t *fop, int32_t state)
case EC_STATE_LOCK:
ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
- LLONG_MAX);
+ EC_RANGE_FULL);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -219,7 +217,7 @@ ec_manager_opendir(ec_fop_data_t *fop, int32_t state)
void
ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_opendir_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_opendir_cbk_t func, void *data, loc_t *loc,
fd_t *fd, dict_t *xdata)
{
ec_cbk_t callback = {.opendir = func};
@@ -233,7 +231,7 @@ ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_OPENDIR, EC_FLAG_LOCK_SHARED,
- target, minimum, ec_wind_opendir,
+ target, fop_flags, ec_wind_opendir,
ec_manager_opendir, callback, data);
if (fop == NULL) {
goto out;
@@ -388,9 +386,16 @@ ec_manager_readdir(ec_fop_data_t *fop, int32_t state)
/* Return error if opendir has not been successfully called on
* any subvolume. */
ctx = ec_fd_get(fop->fd, fop->xl);
- if ((ctx == NULL) || (ctx->open == 0)) {
- fop->error = EINVAL;
+ if (ctx == NULL) {
+ fop->error = ENOMEM;
+ } else if (ctx->open == 0) {
+ fop->error = EBADFD;
+ }
+ if (fop->error) {
+ gf_msg(fop->xl->name, GF_LOG_ERROR, fop->error,
+ EC_MSG_INVALID_REQUEST, "EC is not winding readdir: %s",
+ ec_msg_str(fop));
return EC_STATE_REPORT;
}
@@ -427,7 +432,8 @@ ec_manager_readdir(ec_fop_data_t *fop, int32_t state)
}
fop->mask &= 1ULL << idx;
} else {
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, LLONG_MAX);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
ec_lock(fop);
}
@@ -514,7 +520,7 @@ ec_manager_readdir(ec_fop_data_t *fop, int32_t state)
void
ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_readdir_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_readdir_cbk_t func, void *data, fd_t *fd,
size_t size, off_t offset, dict_t *xdata)
{
ec_cbk_t callback = {.readdir = func};
@@ -528,7 +534,7 @@ ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_READDIR, EC_FLAG_LOCK_SHARED,
- target, minimum, ec_wind_readdir,
+ target, fop_flags, ec_wind_readdir,
ec_manager_readdir, callback, data);
if (fop == NULL) {
goto out;
@@ -584,7 +590,7 @@ ec_wind_readdirp(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
void
ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_readdirp_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_readdirp_cbk_t func, void *data, fd_t *fd,
size_t size, off_t offset, dict_t *xdata)
{
ec_cbk_t callback = {.readdirp = func};
@@ -598,7 +604,7 @@ ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(
- frame, this, GF_FOP_READDIRP, EC_FLAG_LOCK_SHARED, target, minimum,
+ frame, this, GF_FOP_READDIRP, EC_FLAG_LOCK_SHARED, target, fop_flags,
ec_wind_readdirp, ec_manager_readdir, callback, data);
if (fop == NULL) {
goto out;
diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c
index f5c38e80dd7..53d27d895c3 100644
--- a/xlators/cluster/ec/src/ec-dir-write.c
+++ b/xlators/cluster/ec/src/ec-dir-write.c
@@ -8,9 +8,6 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-
#include "ec.h"
#include "ec-messages.h"
#include "ec-helpers.h"
@@ -218,10 +215,10 @@ ec_manager_create(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.create != NULL) {
- fop->cbks.create(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, fop->fd, fop->loc[0].inode,
- &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.create, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, fop->fd,
+ fop->loc[0].inode, &cbk->iatt[0], &cbk->iatt[1],
+ &cbk->iatt[2], cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -262,7 +259,7 @@ ec_manager_create(ec_fop_data_t *fop, int32_t state)
void
ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_create_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_create_cbk_t func, void *data, loc_t *loc,
int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
ec_cbk_t callback = {.create = func};
@@ -275,7 +272,7 @@ ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_CREATE, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_CREATE, 0, target, fop_flags,
ec_wind_create, ec_manager_create, callback,
data);
if (fop == NULL) {
@@ -390,9 +387,10 @@ ec_manager_link(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.link != NULL) {
- fop->cbks.link(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, fop->loc[0].inode, &cbk->iatt[0],
- &cbk->iatt[1], &cbk->iatt[2], cbk->xdata);
+ QUORUM_CBK(fop->cbks.link, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+ &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+ cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -432,9 +430,9 @@ ec_manager_link(ec_fop_data_t *fop, int32_t state)
}
void
-ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_link_cbk_t func, void *data, loc_t *oldloc, loc_t *newloc,
- dict_t *xdata)
+ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_link_cbk_t func, void *data, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
{
ec_cbk_t callback = {.link = func};
ec_fop_data_t *fop = NULL;
@@ -446,7 +444,7 @@ ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_LINK, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_LINK, 0, target, fop_flags,
ec_wind_link, ec_manager_link, callback, data);
if (fop == NULL) {
goto out;
@@ -569,9 +567,10 @@ ec_manager_mkdir(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.mkdir != NULL) {
- fop->cbks.mkdir(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, fop->loc[0].inode, &cbk->iatt[0],
- &cbk->iatt[1], &cbk->iatt[2], cbk->xdata);
+ QUORUM_CBK(fop->cbks.mkdir, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+ &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+ cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -613,9 +612,9 @@ ec_manager_mkdir(ec_fop_data_t *fop, int32_t state)
}
void
-ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_mkdir_cbk_t func, void *data, loc_t *loc, mode_t mode,
- mode_t umask, dict_t *xdata)
+ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_mkdir_cbk_t func, void *data, loc_t *loc,
+ mode_t mode, mode_t umask, dict_t *xdata)
{
ec_cbk_t callback = {.mkdir = func};
ec_fop_data_t *fop = NULL;
@@ -627,7 +626,7 @@ ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_MKDIR, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_MKDIR, 0, target, fop_flags,
ec_wind_mkdir, ec_manager_mkdir, callback, data);
if (fop == NULL) {
goto out;
@@ -773,9 +772,10 @@ ec_manager_mknod(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.mknod != NULL) {
- fop->cbks.mknod(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, fop->loc[0].inode, &cbk->iatt[0],
- &cbk->iatt[1], &cbk->iatt[2], cbk->xdata);
+ QUORUM_CBK(fop->cbks.mknod, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+ &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+ cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -815,9 +815,9 @@ ec_manager_mknod(ec_fop_data_t *fop, int32_t state)
}
void
-ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_mknod_cbk_t func, void *data, loc_t *loc, mode_t mode, dev_t rdev,
- mode_t umask, dict_t *xdata)
+ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_mknod_cbk_t func, void *data, loc_t *loc,
+ mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata)
{
ec_cbk_t callback = {.mknod = func};
ec_fop_data_t *fop = NULL;
@@ -829,7 +829,7 @@ ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_MKNOD, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_MKNOD, 0, target, fop_flags,
ec_wind_mknod, ec_manager_mknod, callback, data);
if (fop == NULL) {
goto out;
@@ -931,10 +931,10 @@ ec_manager_rename(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.rename != NULL) {
- fop->cbks.rename(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1],
- &cbk->iatt[2], &cbk->iatt[3], &cbk->iatt[4],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.rename, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], &cbk->iatt[2], &cbk->iatt[3],
+ &cbk->iatt[4], cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -975,7 +975,7 @@ ec_manager_rename(ec_fop_data_t *fop, int32_t state)
void
ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_rename_cbk_t func, void *data, loc_t *oldloc,
+ uint32_t fop_flags, fop_rename_cbk_t func, void *data, loc_t *oldloc,
loc_t *newloc, dict_t *xdata)
{
ec_cbk_t callback = {.rename = func};
@@ -988,7 +988,7 @@ ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_RENAME, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_RENAME, 0, target, fop_flags,
ec_wind_rename, ec_manager_rename, callback,
data);
if (fop == NULL) {
@@ -1083,9 +1083,9 @@ ec_manager_rmdir(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.rmdir != NULL) {
- fop->cbks.rmdir(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.rmdir, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -1125,9 +1125,9 @@ ec_manager_rmdir(ec_fop_data_t *fop, int32_t state)
}
void
-ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_rmdir_cbk_t func, void *data, loc_t *loc, int xflags,
- dict_t *xdata)
+ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_rmdir_cbk_t func, void *data, loc_t *loc,
+ int xflags, dict_t *xdata)
{
ec_cbk_t callback = {.rmdir = func};
ec_fop_data_t *fop = NULL;
@@ -1139,7 +1139,7 @@ ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_RMDIR, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_RMDIR, 0, target, fop_flags,
ec_wind_rmdir, ec_manager_rmdir, callback, data);
if (fop == NULL) {
goto out;
@@ -1237,10 +1237,10 @@ ec_manager_symlink(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.symlink != NULL) {
- fop->cbks.symlink(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, fop->loc[0].inode,
- &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.symlink, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+ &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+ cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -1281,7 +1281,7 @@ ec_manager_symlink(ec_fop_data_t *fop, int32_t state)
void
ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_symlink_cbk_t func, void *data,
+ uint32_t fop_flags, fop_symlink_cbk_t func, void *data,
const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata)
{
ec_cbk_t callback = {.symlink = func};
@@ -1294,9 +1294,9 @@ ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_SYMLINK, 0, target, minimum,
- ec_wind_symlink, ec_manager_symlink, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_SYMLINK, 0, target,
+ fop_flags, ec_wind_symlink, ec_manager_symlink,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -1392,9 +1392,9 @@ ec_manager_unlink(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.unlink != NULL) {
- fop->cbks.unlink(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.unlink, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -1435,7 +1435,7 @@ ec_manager_unlink(ec_fop_data_t *fop, int32_t state)
void
ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_unlink_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_unlink_cbk_t func, void *data, loc_t *loc,
int xflags, dict_t *xdata)
{
ec_cbk_t callback = {.unlink = func};
@@ -1448,7 +1448,7 @@ ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_UNLINK, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_UNLINK, 0, target, fop_flags,
ec_wind_unlink, ec_manager_unlink, callback,
data);
if (fop == NULL) {
diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h
index 2858d829c73..07edf8a7fec 100644
--- a/xlators/cluster/ec/src/ec-fops.h
+++ b/xlators/cluster/ec/src/ec-fops.h
@@ -11,240 +11,244 @@
#ifndef __EC_FOPS_H__
#define __EC_FOPS_H__
-#include "xlator.h"
+#include <glusterfs/xlator.h>
#include "ec-types.h"
#include "ec-common.h"
void
ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_access_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_access_cbk_t func, void *data, loc_t *loc,
int32_t mask, dict_t *xdata);
void
ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_create_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_create_cbk_t func, void *data, loc_t *loc,
int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
void
ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_entrylk_cbk_t func, void *data,
+ uint32_t fop_flags, fop_entrylk_cbk_t func, void *data,
const char *volume, loc_t *loc, const char *basename,
entrylk_cmd cmd, entrylk_type type, dict_t *xdata);
void
ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fentrylk_cbk_t func, void *data,
+ uint32_t fop_flags, fop_fentrylk_cbk_t func, void *data,
const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd,
entrylk_type type, dict_t *xdata);
void
-ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_flush_cbk_t func, void *data, fd_t *fd, dict_t *xdata);
+ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd,
+ dict_t *xdata);
void
-ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_fsync_cbk_t func, void *data, fd_t *fd, int32_t datasync,
- dict_t *xdata);
+ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fsync_cbk_t func, void *data, fd_t *fd,
+ int32_t datasync, dict_t *xdata);
void
ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fsyncdir_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fsyncdir_cbk_t func, void *data, fd_t *fd,
int32_t datasync, dict_t *xdata);
void
ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_getxattr_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_getxattr_cbk_t func, void *data, loc_t *loc,
const char *name, dict_t *xdata);
void
ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fgetxattr_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fgetxattr_cbk_t func, void *data, fd_t *fd,
const char *name, dict_t *xdata);
void
-ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_heal_cbk_t func, void *data, loc_t *loc, int32_t partial,
- dict_t *xdata);
+ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_heal_cbk_t func, void *data, loc_t *loc,
+ int32_t partial, dict_t *xdata);
void
-ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_fheal_cbk_t func, void *data, fd_t *fd, int32_t partial,
- dict_t *xdata);
+ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd,
+ int32_t partial, dict_t *xdata);
void
ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
- uintptr_t target, int32_t minimum, fop_inodelk_cbk_t func,
+ uintptr_t target, uint32_t fop_flags, fop_inodelk_cbk_t func,
void *data, const char *volume, loc_t *loc, int32_t cmd,
struct gf_flock *flock, dict_t *xdata);
void
ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
- uintptr_t target, int32_t minimum, fop_finodelk_cbk_t func,
+ uintptr_t target, uint32_t fop_flags, fop_finodelk_cbk_t func,
void *data, const char *volume, fd_t *fd, int32_t cmd,
struct gf_flock *flock, dict_t *xdata);
void
-ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_link_cbk_t func, void *data, loc_t *oldloc, loc_t *newloc,
- dict_t *xdata);
+ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_link_cbk_t func, void *data, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata);
void
-ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
+ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags,
fop_lk_cbk_t func, void *data, fd_t *fd, int32_t cmd,
struct gf_flock *flock, dict_t *xdata);
void
ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_lookup_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_lookup_cbk_t func, void *data, loc_t *loc,
dict_t *xdata);
void
-ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_mkdir_cbk_t func, void *data, loc_t *loc, mode_t mode,
- mode_t umask, dict_t *xdata);
+ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_mkdir_cbk_t func, void *data, loc_t *loc,
+ mode_t mode, mode_t umask, dict_t *xdata);
void
-ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_mknod_cbk_t func, void *data, loc_t *loc, mode_t mode, dev_t rdev,
- mode_t umask, dict_t *xdata);
+ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_mknod_cbk_t func, void *data, loc_t *loc,
+ mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata);
void
-ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_open_cbk_t func, void *data, loc_t *loc, int32_t flags, fd_t *fd,
- dict_t *xdata);
+ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_open_cbk_t func, void *data, loc_t *loc,
+ int32_t flags, fd_t *fd, dict_t *xdata);
void
ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_opendir_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_opendir_cbk_t func, void *data, loc_t *loc,
fd_t *fd, dict_t *xdata);
void
ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_readdir_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_readdir_cbk_t func, void *data, fd_t *fd,
size_t size, off_t offset, dict_t *xdata);
void
ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_readdirp_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_readdirp_cbk_t func, void *data, fd_t *fd,
size_t size, off_t offset, dict_t *xdata);
void
ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_readlink_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_readlink_cbk_t func, void *data, loc_t *loc,
size_t size, dict_t *xdata);
void
-ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_readv_cbk_t func, void *data, fd_t *fd, size_t size, off_t offset,
- uint32_t flags, dict_t *xdata);
+ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_readv_cbk_t func, void *data, fd_t *fd,
+ size_t size, off_t offset, uint32_t flags, dict_t *xdata);
void
ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_removexattr_cbk_t func, void *data,
+ uint32_t fop_flags, fop_removexattr_cbk_t func, void *data,
loc_t *loc, const char *name, dict_t *xdata);
void
ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fremovexattr_cbk_t func, void *data,
+ uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data,
fd_t *fd, const char *name, dict_t *xdata);
void
ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_rename_cbk_t func, void *data, loc_t *oldloc,
+ uint32_t fop_flags, fop_rename_cbk_t func, void *data, loc_t *oldloc,
loc_t *newloc, dict_t *xdata);
void
-ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_rmdir_cbk_t func, void *data, loc_t *loc, int xflags,
- dict_t *xdata);
+ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_rmdir_cbk_t func, void *data, loc_t *loc,
+ int xflags, dict_t *xdata);
void
ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_setattr_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
void
ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fsetattr_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
void
ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_setxattr_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_setxattr_cbk_t func, void *data, loc_t *loc,
dict_t *dict, int32_t flags, dict_t *xdata);
void
ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fsetxattr_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd,
dict_t *dict, int32_t flags, dict_t *xdata);
void
-ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_stat_cbk_t func, void *data, loc_t *loc, dict_t *xdata);
+ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_stat_cbk_t func, void *data, loc_t *loc,
+ dict_t *xdata);
void
-ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_fstat_cbk_t func, void *data, fd_t *fd, dict_t *xdata);
+ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fstat_cbk_t func, void *data, fd_t *fd,
+ dict_t *xdata);
void
ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_statfs_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_statfs_cbk_t func, void *data, loc_t *loc,
dict_t *xdata);
void
ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_symlink_cbk_t func, void *data,
+ uint32_t fop_flags, fop_symlink_cbk_t func, void *data,
const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata);
void
ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fallocate_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd,
int32_t mode, off_t offset, size_t len, dict_t *xdata);
void
ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_discard_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd,
off_t offset, size_t len, dict_t *xdata);
void
ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_truncate_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc,
off_t offset, dict_t *xdata);
void
ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_ftruncate_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd,
off_t offset, dict_t *xdata);
void
ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_unlink_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_unlink_cbk_t func, void *data, loc_t *loc,
int xflags, dict_t *xdata);
void
ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_writev_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
struct iobref *iobref, dict_t *xdata);
void
ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_xattrop_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_xattrop_cbk_t func, void *data, loc_t *loc,
gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
void
ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fxattrop_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fxattrop_cbk_t func, void *data, fd_t *fd,
gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
void
-ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_seek_cbk_t func, void *data, fd_t *fd, off_t offset,
- gf_seek_what_t what, dict_t *xdata);
+ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_seek_cbk_t func, void *data, fd_t *fd,
+ off_t offset, gf_seek_what_t what, dict_t *xdata);
void
-ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_ipc_cbk_t func, void *data, int32_t op, dict_t *xdata);
+ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_ipc_cbk_t func, void *data, int32_t op,
+ dict_t *xdata);
#endif /* __EC_FOPS_H__ */
diff --git a/xlators/cluster/ec/src/ec-galois.c b/xlators/cluster/ec/src/ec-galois.c
index 8cb4dc2e4e3..6e4990c71f5 100644
--- a/xlators/cluster/ec/src/ec-galois.c
+++ b/xlators/cluster/ec/src/ec-galois.c
@@ -10,9 +10,6 @@
#include <string.h>
-#include "mem-pool.h"
-#include "list.h"
-
#include "ec-mem-types.h"
#include "ec-gf8.h"
#include "ec-helpers.h"
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index de01ece3b3d..884deb93669 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -8,16 +8,13 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "ec.h"
#include "ec-messages.h"
#include "ec-helpers.h"
#include "ec-common.h"
#include "ec-combine.h"
-#include "ec-method.h"
#include "ec-fops.h"
/* FOP: flush */
@@ -83,7 +80,7 @@ ec_manager_flush(ec_fop_data_t *fop, int32_t state)
switch (state) {
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, 0, 0, LLONG_MAX);
+ ec_lock_prepare_fd(fop, fop->fd, 0, 0, EC_RANGE_FULL);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -150,9 +147,41 @@ ec_manager_flush(ec_fop_data_t *fop, int32_t state)
}
}
+static int32_t
+ec_validate_fd(fd_t *fd, xlator_t *xl)
+{
+ uint64_t iversion = 0;
+ uint64_t fversion = 0;
+ ec_inode_t *inode_ctx = NULL;
+ ec_fd_t *fd_ctx = NULL;
+
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __ec_fd_get(fd, xl);
+ if (fd_ctx) {
+ fversion = fd_ctx->bad_version;
+ }
+ }
+ UNLOCK(&fd->lock);
+
+ LOCK(&fd->inode->lock);
+ {
+ inode_ctx = __ec_inode_get(fd->inode, xl);
+ if (inode_ctx) {
+ iversion = inode_ctx->bad_version;
+ }
+ }
+ UNLOCK(&fd->inode->lock);
+ if (fversion < iversion) {
+ return EBADF;
+ }
+ return 0;
+}
+
void
-ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_flush_cbk_t func, void *data, fd_t *fd, dict_t *xdata)
+ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd,
+ dict_t *xdata)
{
ec_cbk_t callback = {.flush = func};
ec_fop_data_t *fop = NULL;
@@ -164,7 +193,17 @@ ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_FLUSH, 0, target, minimum,
+ if (fd) {
+ error = ec_validate_fd(fd, this);
+ if (error) {
+ gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD,
+ "Failing %s on %s", gf_fop_list[GF_FOP_FLUSH],
+ fd->inode ? uuid_utoa(fd->inode->gfid) : "");
+ goto out;
+ }
+ }
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FLUSH, 0, target, fop_flags,
ec_wind_flush, ec_manager_flush, callback, data);
if (fop == NULL) {
goto out;
@@ -289,7 +328,7 @@ ec_manager_fsync(ec_fop_data_t *fop, int32_t state)
switch (state) {
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, LLONG_MAX);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, EC_RANGE_FULL);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -366,9 +405,9 @@ ec_manager_fsync(ec_fop_data_t *fop, int32_t state)
}
void
-ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_fsync_cbk_t func, void *data, fd_t *fd, int32_t datasync,
- dict_t *xdata)
+ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fsync_cbk_t func, void *data, fd_t *fd,
+ int32_t datasync, dict_t *xdata)
{
ec_cbk_t callback = {.fsync = func};
ec_fop_data_t *fop = NULL;
@@ -380,7 +419,17 @@ ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNC, 0, target, minimum,
+ if (fd) {
+ error = ec_validate_fd(fd, this);
+ if (error) {
+ gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD,
+ "Failing %s on %s", gf_fop_list[GF_FOP_FSYNC],
+ fd->inode ? uuid_utoa(fd->inode->gfid) : "");
+ goto out;
+ }
+ }
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNC, 0, target, fop_flags,
ec_wind_fsync, ec_manager_fsync, callback, data);
if (fop == NULL) {
goto out;
@@ -484,7 +533,7 @@ ec_manager_fsyncdir(ec_fop_data_t *fop, int32_t state)
switch (state) {
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, 0, 0, LLONG_MAX);
+ ec_lock_prepare_fd(fop, fop->fd, 0, 0, EC_RANGE_FULL);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -553,7 +602,7 @@ ec_manager_fsyncdir(ec_fop_data_t *fop, int32_t state)
void
ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fsyncdir_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fsyncdir_cbk_t func, void *data, fd_t *fd,
int32_t datasync, dict_t *xdata)
{
ec_cbk_t callback = {.fsyncdir = func};
@@ -566,9 +615,9 @@ ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNCDIR, 0, target, minimum,
- ec_wind_fsyncdir, ec_manager_fsyncdir, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNCDIR, 0, target,
+ fop_flags, ec_wind_fsyncdir, ec_manager_fsyncdir,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -848,7 +897,7 @@ ec_manager_lookup(ec_fop_data_t *fop, int32_t state)
void
ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_lookup_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_lookup_cbk_t func, void *data, loc_t *loc,
dict_t *xdata)
{
ec_cbk_t callback = {.lookup = func};
@@ -862,7 +911,7 @@ ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_LOOKUP, EC_FLAG_LOCK_SHARED,
- target, minimum, ec_wind_lookup,
+ target, fop_flags, ec_wind_lookup,
ec_manager_lookup, callback, data);
if (fop == NULL) {
goto out;
@@ -1033,7 +1082,7 @@ ec_manager_statfs(ec_fop_data_t *fop, int32_t state)
void
ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_statfs_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_statfs_cbk_t func, void *data, loc_t *loc,
dict_t *xdata)
{
ec_cbk_t callback = {.statfs = func};
@@ -1047,7 +1096,7 @@ ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_STATFS, EC_FLAG_LOCK_SHARED,
- target, minimum, ec_wind_statfs,
+ target, fop_flags, ec_wind_statfs,
ec_manager_statfs, callback, data);
if (fop == NULL) {
goto out;
@@ -1182,9 +1231,10 @@ ec_manager_xattrop(ec_fop_data_t *fop, int32_t state)
case EC_STATE_LOCK:
if (fop->fd == NULL) {
ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META, 0,
- LLONG_MAX);
+ EC_RANGE_FULL);
} else {
- ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META, 0, LLONG_MAX);
+ ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META, 0,
+ EC_RANGE_FULL);
}
ec_lock(fop);
@@ -1269,7 +1319,7 @@ ec_manager_xattrop(ec_fop_data_t *fop, int32_t state)
void
ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_xattrop_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_xattrop_cbk_t func, void *data, loc_t *loc,
gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
ec_cbk_t callback = {.xattrop = func};
@@ -1282,9 +1332,9 @@ ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_XATTROP, 0, target, minimum,
- ec_wind_xattrop, ec_manager_xattrop, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_XATTROP, 0, target,
+ fop_flags, ec_wind_xattrop, ec_manager_xattrop,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -1342,7 +1392,7 @@ ec_wind_fxattrop(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
void
ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fxattrop_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fxattrop_cbk_t func, void *data, fd_t *fd,
gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
ec_cbk_t callback = {.fxattrop = func};
@@ -1355,9 +1405,9 @@ ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_FXATTROP, 0, target, minimum,
- ec_wind_fxattrop, ec_manager_xattrop, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FXATTROP, 0, target,
+ fop_flags, ec_wind_fxattrop, ec_manager_xattrop,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -1506,8 +1556,9 @@ ec_manager_ipc(ec_fop_data_t *fop, int32_t state)
}
void
-ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_ipc_cbk_t func, void *data, int32_t op, dict_t *xdata)
+ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_ipc_cbk_t func, void *data, int32_t op,
+ dict_t *xdata)
{
ec_cbk_t callback = {.ipc = func};
ec_fop_data_t *fop = NULL;
@@ -1519,7 +1570,7 @@ ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_IPC, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_IPC, 0, target, fop_flags,
ec_wind_ipc, ec_manager_ipc, callback, data);
if (fop == NULL) {
goto out;
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 229c0683d91..7d991f04aac 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -8,16 +8,14 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-#include "compat-errno.h"
-#include "byte-order.h"
-#include "syncop.h"
-#include "syncop-utils.h"
-#include "cluster-syncop.h"
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
+#include <glusterfs/cluster-syncop.h>
#include "ec.h"
-#include "ec-mem-types.h"
#include "ec-types.h"
#include "ec-messages.h"
#include "ec-helpers.h"
@@ -72,6 +70,7 @@ struct ec_name_data {
char *name;
inode_t *parent;
default_args_cbk_t *replies;
+ uint32_t heal_pending;
};
static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
@@ -103,6 +102,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
}
/* FOP: heal */
+void
+ec_set_entry_healing(ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ loc_t *loc = NULL;
+
+ if (!fop)
+ return;
+
+ loc = &fop->loc[0];
+ LOCK(&loc->inode->lock);
+ {
+ ctx = __ec_inode_get(loc->inode, fop->xl);
+ if (ctx) {
+ ctx->heal_count += 1;
+ }
+ }
+ UNLOCK(&loc->inode->lock);
+}
+
+void
+ec_reset_entry_healing(ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ loc_t *loc = NULL;
+ int32_t heal_count = 0;
+ if (!fop)
+ return;
+
+ loc = &fop->loc[0];
+ LOCK(&loc->inode->lock);
+ {
+ ctx = __ec_inode_get(loc->inode, fop->xl);
+ if (ctx) {
+ ctx->heal_count += -1;
+ heal_count = ctx->heal_count;
+ }
+ }
+ UNLOCK(&loc->inode->lock);
+ GF_ASSERT(heal_count >= 0);
+}
+
uintptr_t
ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
{
@@ -325,16 +366,16 @@ ec_heal_data_block(ec_heal_t *heal)
/* FOP: fheal */
void
-ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_fheal_cbk_t func, void *data, fd_t *fd, int32_t partial,
- dict_t *xdata)
+ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd,
+ int32_t partial, dict_t *xdata)
{
ec_fd_t *ctx = ec_fd_get(fd, this);
if (ctx != NULL) {
- gf_msg_trace("ec", 0, "FHEAL ctx: flags=%X, open=%lX", ctx->flags,
+ gf_msg_trace("ec", 0, "FHEAL ctx: flags=%X, open=%" PRIXPTR, ctx->flags,
ctx->open);
- ec_heal(frame, this, target, minimum, func, data, &ctx->loc, partial,
+ ec_heal(frame, this, target, fop_flags, func, data, &ctx->loc, partial,
xdata);
}
}
@@ -954,6 +995,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
ret = -ENOTCONN;
goto out;
}
+
out:
if (xattr)
dict_unref(xattr);
@@ -977,6 +1019,7 @@ ec_delete_stale_name(dict_t *gfid_db, char *key, data_t *d, void *data)
int estale_count = 0;
int i = 0;
call_frame_t *frame = name_data->frame;
+ uuid_t gfid;
ec = name_data->frame->this->private;
EC_REPLIES_ALLOC(replies, ec->nodes);
@@ -985,12 +1028,16 @@ ec_delete_stale_name(dict_t *gfid_db, char *key, data_t *d, void *data)
goto out;
}
+ loc.parent = inode_ref(name_data->parent);
loc.inode = inode_new(name_data->parent->table);
if (!loc.inode) {
ret = -ENOMEM;
goto out;
}
- gf_uuid_parse(key, loc.gfid);
+
+ gf_uuid_parse(key, gfid);
+ gf_uuid_copy(loc.pargfid, name_data->parent->gfid);
+ loc.name = name_data->name;
output = alloca0(ec->nodes);
ret = cluster_lookup(ec->xl_list, name_data->participants, ec->nodes,
replies, output, name_data->frame, ec->xl, &loc, NULL);
@@ -1003,6 +1050,11 @@ ec_delete_stale_name(dict_t *gfid_db, char *key, data_t *d, void *data)
estale_count++;
else
name_data->participants[i] = 0;
+ } else if (gf_uuid_compare(gfid, replies[i].stat.ia_gfid)) {
+ estale_count++;
+ gf_msg_debug(ec->xl->name, 0, "%s/%s: different gfid as %s",
+ uuid_utoa(name_data->parent->gfid), name_data->name,
+ key);
}
}
@@ -1122,6 +1174,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
dict_t *xdata = NULL;
char *linkname = NULL;
ec_config_t config;
+
/* There should be just one gfid key */
EC_REPLIES_ALLOC(replies, ec->nodes);
if (gfid_db->count != 1) {
@@ -1366,6 +1419,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
participants);
+ if (ret >= 0) {
+ /* If ec_create_name() succeeded we return 1 to indicate that a new
+ * file has been created and it will need to be healed. */
+ ret = 1;
+ }
out:
cluster_replies_wipe(replies, ec->nodes);
loc_wipe(&loc);
@@ -1443,18 +1501,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
name_on);
- if (ret < 0)
+ if (ret < 0) {
memset(name_on, 0, ec->nodes);
+ } else {
+ name_data->heal_pending += ret;
+ }
for (i = 0; i < ec->nodes; i++)
if (name_data->participants[i] && !name_on[i])
name_data->failed_on[i] = 1;
+
return 0;
}
int
ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
- unsigned char *participants)
+ unsigned char *participants, uint32_t *pending)
{
int i = 0;
int j = 0;
@@ -1467,7 +1529,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
name_data.frame = frame;
name_data.participants = participants;
name_data.failed_on = alloca0(ec->nodes);
- ;
+ name_data.heal_pending = 0;
for (i = 0; i < ec->nodes; i++) {
if (!participants[i])
@@ -1486,6 +1548,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
break;
}
}
+ *pending += name_data.heal_pending;
+
loc_wipe(&loc);
return ret;
}
@@ -1493,7 +1557,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
int
__ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
unsigned char *heal_on, unsigned char *sources,
- unsigned char *healed_sinks)
+ unsigned char *healed_sinks, uint32_t *pending)
{
unsigned char *locked_on = NULL;
unsigned char *output = NULL;
@@ -1538,7 +1602,7 @@ unlock:
if (sources[i] || healed_sinks[i])
participants[i] = 1;
}
- ret = ec_heal_names(frame, ec, inode, participants);
+ ret = ec_heal_names(frame, ec, inode, participants, pending);
if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
goto out;
@@ -1559,7 +1623,8 @@ out:
int
ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
- unsigned char *sources, unsigned char *healed_sinks)
+ unsigned char *sources, unsigned char *healed_sinks,
+ uint32_t *pending)
{
unsigned char *locked_on = NULL;
unsigned char *up_subvols = NULL;
@@ -1590,7 +1655,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
goto unlock;
}
ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
- healed_sinks);
+ healed_sinks, pending);
}
unlock:
cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
@@ -1909,16 +1974,16 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
case EC_STATE_REPORT:
if (fop->cbks.heal) {
- fop->cbks.heal(fop->req_frame, fop, fop->xl, 0, 0,
+ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, 0, 0,
(heal->good | heal->bad), heal->good, heal->bad,
- NULL);
+ 0, NULL);
}
return EC_STATE_END;
case -EC_STATE_REPORT:
if (fop->cbks.heal) {
- fop->cbks.heal(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
- 0, 0, NULL);
+ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
+ fop->error, 0, 0, 0, 0, NULL);
}
return EC_STATE_END;
@@ -1933,7 +1998,7 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
/*Takes lock */
void
ec_heal_block(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_heal_cbk_t func, ec_heal_t *heal)
+ uint32_t fop_flags, fop_heal_cbk_t func, ec_heal_t *heal)
{
ec_cbk_t callback = {.heal = func};
ec_fop_data_t *fop = NULL;
@@ -1944,7 +2009,7 @@ ec_heal_block(call_frame_t *frame, xlator_t *this, uintptr_t target,
VALIDATE_OR_GOTO(this, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags,
NULL, ec_manager_heal_block, callback, heal);
if (fop == NULL)
goto out;
@@ -1955,19 +2020,21 @@ out:
if (fop != NULL) {
ec_manager(fop, error);
} else {
- func(frame, NULL, this, -1, error, 0, 0, 0, NULL);
+ func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
}
}
int32_t
ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, uintptr_t mask,
- uintptr_t good, uintptr_t bad, dict_t *xdata)
+ uintptr_t good, uintptr_t bad, uint32_t pending,
+ dict_t *xdata)
{
- ec_fop_data_t *fop = cookie;
- ec_heal_t *heal = fop->data;
+ ec_heal_t *heal = cookie;
- fop->heal = NULL;
+ if (heal->fop) {
+ heal->fop->heal = NULL;
+ }
heal->fop = NULL;
heal->error = op_ret < 0 ? op_errno : 0;
syncbarrier_wake(heal->data);
@@ -2259,9 +2326,10 @@ ec_restore_time_and_adjust_versions(call_frame_t *frame, ec_t *ec, fd_t *fd,
loc.inode = inode_ref(fd->inode);
gf_uuid_copy(loc.gfid, fd->inode->gfid);
- ret = cluster_setattr(ec->xl_list, healed_sinks, ec->nodes, replies,
- output, frame, ec->xl, &loc, &source_buf,
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME, NULL);
+ ret = cluster_setattr(
+ ec->xl_list, healed_sinks, ec->nodes, replies, output, frame,
+ ec->xl, &loc, &source_buf,
+ GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME, NULL);
EC_INTERSECT(healed_sinks, healed_sinks, output, ec->nodes);
if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
ret = -ENOTCONN;
@@ -2429,6 +2497,58 @@ out:
return ret;
}
+int
+ec_heal_purge_stale_index(call_frame_t *frame, ec_t *ec, inode_t *inode)
+{
+ int i = 0;
+ int ret = 0;
+ dict_t **xattr = NULL;
+ loc_t loc = {0};
+ uint64_t dirty_xattr[EC_VERSION_SIZE] = {0};
+ unsigned char *on = NULL;
+ default_args_cbk_t *replies = NULL;
+ dict_t *dict = NULL;
+
+ /* Allocate the required memory */
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ on = alloca0(ec->nodes);
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer);
+ if (!xattr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dict = dict_new();
+ if (!dict) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < ec->nodes; i++) {
+ xattr[i] = dict;
+ on[i] = 1;
+ }
+ ret = dict_set_static_bin(dict, EC_XATTR_DIRTY, dirty_xattr,
+ (sizeof(*dirty_xattr) * EC_VERSION_SIZE));
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame,
+ ec_wind_xattrop_parallel, &loc, GF_XATTROP_ADD_ARRAY64,
+ xattr, NULL);
+out:
+ if (dict) {
+ dict_unref(dict);
+ }
+ if (xattr) {
+ GF_FREE(xattr);
+ }
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
+}
+
void
ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
{
@@ -2446,6 +2566,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
intptr_t mbad = 0;
intptr_t good = 0;
intptr_t bad = 0;
+ uint32_t pending = 0;
ec_fop_data_t *fop = data;
gf_boolean_t blocking = _gf_false;
ec_heal_need_t need_heal = EC_HEAL_NONEED;
@@ -2481,10 +2602,10 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
if (loc->name && strlen(loc->name)) {
ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
participants);
- if (ret == 0) {
+ if (ret >= 0) {
gf_msg_debug(this->name, 0,
"%s: name heal "
- "successful on %lX",
+ "successful on %" PRIXPTR,
loc->path,
ec_char_array_to_mask(participants, ec->nodes));
} else {
@@ -2499,32 +2620,34 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
/* Mount triggers heal only when it detects that it must need heal, shd
* triggers heals periodically which need not be thorough*/
- ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false,
- !ec->shd.iamshd, &need_heal);
-
- if (need_heal == EC_HEAL_NONEED) {
- gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL,
- "Heal is not required for : %s ", uuid_utoa(loc->gfid));
- goto out;
+ if (ec->shd.iamshd && (ret <= 0)) {
+ ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
+ &need_heal);
+
+ if (need_heal == EC_HEAL_PURGE_INDEX) {
+ gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL,
+ "Index entry needs to be purged for: %s ",
+ uuid_utoa(loc->gfid));
+ /* We need to send zero-xattrop so that stale index entry could be
+ * removed. We need not take lock on this entry to do so as
+ * xattrop on a brick is atomic. */
+ ec_heal_purge_stale_index(frame, ec, loc->inode);
+ goto out;
+ } else if (need_heal == EC_HEAL_NONEED) {
+ gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL,
+ "Heal is not required for : %s ", uuid_utoa(loc->gfid));
+ goto out;
+ }
}
- msources = alloca0(ec->nodes);
- mhealed_sinks = alloca0(ec->nodes);
- ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
- if (ret == 0) {
- mgood = ec_char_array_to_mask(msources, ec->nodes);
- mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
- } else {
- op_ret = -1;
- op_errno = -ret;
- }
sources = alloca0(ec->nodes);
healed_sinks = alloca0(ec->nodes);
if (IA_ISREG(loc->inode->ia_type)) {
ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
healed_sinks);
} else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
- ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
+ ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
+ &pending);
} else {
ret = 0;
memcpy(sources, participants, ec->nodes);
@@ -2538,15 +2661,27 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
op_ret = -1;
op_errno = -ret;
}
+ msources = alloca0(ec->nodes);
+ mhealed_sinks = alloca0(ec->nodes);
+ ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
+ if (ret == 0) {
+ mgood = ec_char_array_to_mask(msources, ec->nodes);
+ mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
+ } else {
+ op_ret = -1;
+ op_errno = -ret;
+ }
out:
+ ec_reset_entry_healing(fop);
if (fop->cbks.heal) {
- fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
+ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, op_ret, op_errno,
ec_char_array_to_mask(participants, ec->nodes),
- mgood & good, mbad & bad, NULL);
+ mgood & good, mbad & bad, pending, NULL);
}
if (frame)
STACK_DESTROY(frame->root);
+
return;
}
@@ -2593,8 +2728,8 @@ void
ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
{
if (fop->cbks.heal) {
- fop->cbks.heal(fop->req_frame, NULL, ec->xl, -1, fop->error, 0, 0, 0,
- NULL);
+ fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
+ 0, 0, NULL);
}
ec_fop_data_release(fop);
}
@@ -2603,13 +2738,31 @@ void
ec_launch_heal(ec_t *ec, ec_fop_data_t *fop)
{
int ret = 0;
+ call_frame_t *frame = NULL;
+
+ frame = create_frame(ec->xl, ec->xl->ctx->pool);
+ if (!frame) {
+ ret = -1;
+ goto out;
+ }
+
+ ec_owner_set(frame, frame->root);
+ /*Do heal as root*/
+ frame->root->uid = 0;
+ frame->root->gid = 0;
+ /*Mark the fops as internal*/
+ frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
ret = synctask_new(ec->xl->ctx->env, ec_synctask_heal_wrap, ec_heal_done,
- NULL, fop);
+ frame, fop);
+out:
if (ret < 0) {
ec_fop_set_error(fop, ENOMEM);
ec_heal_fail(ec, fop);
}
+
+ if (frame)
+ STACK_DESTROY(frame->root);
}
void
@@ -2650,11 +2803,33 @@ ec_handle_healers_done(ec_fop_data_t *fop)
ec_launch_heal(ec, heal_fop);
}
+gf_boolean_t
+ec_is_entry_healing(ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ int32_t heal_count = 0;
+ loc_t *loc = NULL;
+
+ loc = &fop->loc[0];
+
+ LOCK(&loc->inode->lock);
+ {
+ ctx = __ec_inode_get(loc->inode, fop->xl);
+ if (ctx) {
+ heal_count = ctx->heal_count;
+ }
+ }
+ UNLOCK(&loc->inode->lock);
+ GF_ASSERT(heal_count >= 0);
+ return heal_count;
+}
+
void
ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
{
gf_boolean_t can_heal = _gf_true;
ec_t *ec = this->private;
+ ec_fop_data_t *fop_rel = NULL;
if (fop->req_frame == NULL) {
LOCK(&ec->lock);
@@ -2662,8 +2837,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
if ((ec->background_heals > 0) &&
(ec->heal_wait_qlen + ec->background_heals) >
(ec->heal_waiters + ec->healers)) {
- list_add_tail(&fop->healer, &ec->heal_waiting);
- ec->heal_waiters++;
+ if (!ec_is_entry_healing(fop)) {
+ list_add_tail(&fop->healer, &ec->heal_waiting);
+ ec->heal_waiters++;
+ ec_set_entry_healing(fop);
+ } else {
+ fop_rel = fop;
+ }
fop = __ec_dequeue_heals(ec);
} else {
can_heal = _gf_false;
@@ -2673,8 +2853,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
}
if (can_heal) {
- if (fop)
+ if (fop) {
+ if (fop->req_frame != NULL) {
+ ec_set_entry_healing(fop);
+ }
ec_launch_heal(ec, fop);
+ }
} else {
gf_msg_debug(this->name, 0,
"Max number of heals are "
@@ -2682,12 +2866,15 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
ec_fop_set_error(fop, EBUSY);
ec_heal_fail(ec, fop);
}
+ if (fop_rel) {
+ ec_heal_done(0, NULL, fop_rel);
+ }
}
void
-ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_heal_cbk_t func, void *data, loc_t *loc, int32_t partial,
- dict_t *xdata)
+ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_heal_cbk_t func, void *data, loc_t *loc,
+ int32_t partial, dict_t *xdata)
{
ec_cbk_t callback = {.heal = func};
ec_fop_data_t *fop = NULL;
@@ -2703,7 +2890,7 @@ ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
if (frame && frame->local)
goto fail;
- fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags,
NULL, NULL, callback, data);
err = ENOMEM;
@@ -2729,15 +2916,27 @@ fail:
if (fop)
ec_fop_data_release(fop);
if (func)
- func(frame, NULL, this, -1, err, 0, 0, 0, NULL);
+ func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
}
int
ec_replace_heal_done(int ret, call_frame_t *heal, void *opaque)
{
ec_t *ec = opaque;
+ gf_boolean_t last_fop = _gf_false;
+ if (GF_ATOMIC_DEC(ec->async_fop_count) == 0) {
+ LOCK(&ec->lock);
+ {
+ last_fop = __ec_is_last_fop(ec);
+ }
+ UNLOCK(&ec->lock);
+ }
gf_msg_debug(ec->xl->name, 0, "getxattr on bricks is done ret %d", ret);
+
+ if (last_fop)
+ ec_pending_fops_completed(ec);
+
return 0;
}
@@ -2777,6 +2976,10 @@ ec_replace_brick_heal_wrap(void *opaque)
itable = ec->xl->itable;
else
goto out;
+
+ if (xlator_is_cleanup_starting(ec->xl))
+ goto out;
+
ret = ec_replace_heal(ec, itable->root);
out:
return ret;
@@ -2787,14 +2990,15 @@ ec_launch_replace_heal(ec_t *ec)
{
int ret = -1;
- if (!ec)
- return ret;
ret = synctask_new(ec->xl->ctx->env, ec_replace_brick_heal_wrap,
ec_replace_heal_done, NULL, ec);
+
if (ret < 0) {
gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d",
ret);
+ ec_replace_heal_done(-1, NULL, ec);
}
+
return ret;
}
@@ -2826,7 +3030,7 @@ out:
static int32_t
_need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources,
gf_boolean_t self_locked, int32_t lock_count,
- ec_heal_need_t *need_heal)
+ ec_heal_need_t *need_heal, uint64_t *versions)
{
int i = 0;
int source_count = 0;
@@ -2836,11 +3040,18 @@ _need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources,
*need_heal = EC_HEAL_NONEED;
if (self_locked || lock_count == 0) {
for (i = 0; i < ec->nodes; i++) {
- if (dirty[i]) {
+ if (dirty[i] || (versions[i] != versions[0])) {
*need_heal = EC_HEAL_MUST;
goto out;
}
}
+ /* If lock count is 0, all dirty flags are 0 and all the
+ * versions are macthing then why are we here. It looks
+ * like something went wrong while removing the index entries
+ * after completing a successful heal or fop. In this case
+ * we need to remove this index entry to avoid triggering heal
+ * in a loop and causing lookups again and again*/
+ *need_heal = EC_HEAL_PURGE_INDEX;
} else {
for (i = 0; i < ec->nodes; i++) {
/* Since each lock can only increment the dirty
@@ -2852,6 +3063,9 @@ _need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources,
*need_heal = EC_HEAL_MUST;
goto out;
}
+ if (dirty[i] != dirty[0] || (versions[i] != versions[0])) {
+ *need_heal = EC_HEAL_MAYBE;
+ }
}
}
} else {
@@ -2872,7 +3086,6 @@ ec_need_metadata_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
unsigned char *healed_sinks = NULL;
uint64_t *meta_versions = NULL;
int ret = 0;
- int i = 0;
sources = alloca0(ec->nodes);
healed_sinks = alloca0(ec->nodes);
@@ -2885,15 +3098,7 @@ ec_need_metadata_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
}
ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
- need_heal);
- if (ret == ec->nodes && *need_heal == EC_HEAL_NONEED) {
- for (i = 1; i < ec->nodes; i++) {
- if (meta_versions[i] != meta_versions[0]) {
- *need_heal = EC_HEAL_MUST;
- goto out;
- }
- }
- }
+ need_heal, meta_versions);
out:
return ret;
}
@@ -2929,7 +3134,7 @@ ec_need_data_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
}
ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
- need_heal);
+ need_heal, data_versions);
out:
return ret;
}
@@ -2957,7 +3162,7 @@ ec_need_entry_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
}
ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
- need_heal);
+ need_heal, data_versions);
out:
return ret;
}
@@ -3055,10 +3260,6 @@ ec_heal_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode,
need_heal:
ret = ec_need_heal(ec, inode, replies, lock_count, self_locked, thorough,
need_heal);
-
- if (!self_locked && *need_heal == EC_HEAL_MUST) {
- *need_heal = EC_HEAL_MAYBE;
- }
out:
cluster_replies_wipe(replies, ec->nodes);
loc_wipe(&loc);
@@ -3144,7 +3345,7 @@ ec_get_heal_info(xlator_t *this, loc_t *entry_loc, dict_t **dict_rsp)
ret = ec_heal_inspect(frame, ec, loc.inode, up_subvols, _gf_false,
_gf_false, &need_heal);
- if (ret == ec->nodes && need_heal == EC_HEAL_NONEED) {
+ if (ret == ec->nodes && need_heal != EC_HEAL_MAYBE) {
goto set_heal;
}
need_heal = EC_HEAL_NONEED;
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index 130790c66ac..5c1586bc9c5 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -8,15 +8,14 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-#include "compat-errno.h"
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat-errno.h>
#include "ec.h"
#include "ec-messages.h"
#include "ec-heald.h"
#include "ec-mem-types.h"
-#include "syncop.h"
-#include "syncop-utils.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
#include "protocol-common.h"
#define NTH_INDEX_HEALER(this, n) \
@@ -63,7 +62,7 @@ __ec_shd_healer_wait(struct subvol_healer *healer)
ec = healer->this->private;
disabled_loop:
- wait_till.tv_sec = time(NULL) + 60;
+ wait_till.tv_sec = gf_time() + ec->shd.timeout;
while (!healer->rerun) {
ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till);
@@ -71,6 +70,11 @@ disabled_loop:
break;
}
+ if (ec->shutdown) {
+ healer->running = _gf_false;
+ return -1;
+ }
+
ret = healer->rerun;
healer->rerun = 0;
@@ -152,19 +156,78 @@ ec_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name)
return ret;
}
+static gf_boolean_t
+ec_is_heal_completed(char *status)
+{
+ char *bad_pos = NULL;
+ char *zero_pos = NULL;
+
+ if (!status) {
+ return _gf_false;
+ }
+
+ /*Logic:
+ * Status will be of the form Good: <binary>, Bad: <binary>
+ * If heal completes, if we do strchr for '0' it should be present after
+ * 'Bad:' i.e. strRchr for ':'
+ * */
+
+ zero_pos = strchr(status, '0');
+ bad_pos = strrchr(status, ':');
+ if (!zero_pos || !bad_pos) {
+ /*malformed status*/
+ return _gf_false;
+ }
+
+ if (zero_pos > bad_pos) {
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
int
ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
gf_boolean_t full)
{
+ dict_t *xdata = NULL;
+ dict_t *dict = NULL;
+ uint32_t count;
int32_t ret;
+ char *heal_status = NULL;
+ ec_t *ec = healer->this->private;
+
+ GF_ATOMIC_INC(ec->stats.shd.attempted);
+ ret = syncop_getxattr(healer->this, loc, &dict, EC_XATTR_HEAL, NULL,
+ &xdata);
+ if (ret == 0) {
+ if (dict && (dict_get_str(dict, EC_XATTR_HEAL, &heal_status) == 0)) {
+ if (ec_is_heal_completed(heal_status)) {
+ GF_ATOMIC_INC(ec->stats.shd.completed);
+ }
+ }
+ }
- ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
- if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
+ if (!full && (loc->inode->ia_type == IA_IFDIR)) {
/* If we have just healed a directory, it's possible that
- * other index entries have appeared to be healed. We put a
- * mark so that we can check it later and restart a scan
- * without delay. */
- healer->rerun = _gf_true;
+ * other index entries have appeared to be healed. */
+ if ((xdata != NULL) &&
+ (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
+ (count > 0)) {
+ /* Force a rerun of the index healer. */
+ gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
+ count);
+
+ healer->rerun = _gf_true;
+ }
+ }
+
+ if (xdata != NULL) {
+ dict_unref(xdata);
+ }
+
+ if (dict) {
+ dict_unref(dict);
}
return ret;
@@ -241,9 +304,11 @@ ec_shd_index_sweep(struct subvol_healer *healer)
goto out;
}
+ _mask_cancellation();
ret = syncop_mt_dir_scan(NULL, subvol, &loc, GF_CLIENT_PID_SELF_HEALD,
healer, ec_shd_index_heal, xdata,
ec->shd.max_threads, ec->shd.wait_qlength);
+ _unmask_cancellation();
out:
if (xdata)
dict_unref(xdata);
@@ -263,6 +328,11 @@ ec_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
int ret = 0;
ec = this->private;
+
+ if (this->cleanup_starting) {
+ return -ENOTCONN;
+ }
+
if (ec->xl_up_count <= ec->fragments) {
return -ENOTCONN;
}
@@ -305,11 +375,15 @@ ec_shd_full_sweep(struct subvol_healer *healer, inode_t *inode)
{
ec_t *ec = NULL;
loc_t loc = {0};
+ int ret = -1;
ec = healer->this->private;
loc.inode = inode;
- return syncop_ftw(ec->xl_list[healer->subvol], &loc,
- GF_CLIENT_PID_SELF_HEALD, healer, ec_shd_full_heal);
+ _mask_cancellation();
+ ret = syncop_ftw(ec->xl_list[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer, ec_shd_full_heal);
+ _unmask_cancellation();
+ return ret;
}
void *
@@ -317,13 +391,16 @@ ec_shd_index_healer(void *data)
{
struct subvol_healer *healer = NULL;
xlator_t *this = NULL;
+ int run = 0;
healer = data;
THIS = this = healer->this;
ec_t *ec = this->private;
for (;;) {
- ec_shd_healer_wait(healer);
+ run = ec_shd_healer_wait(healer);
+ if (run == -1)
+ break;
if (ec->xl_up_count > ec->fragments) {
gf_msg_debug(this->name, 0, "starting index sweep on subvol %s",
@@ -352,16 +429,12 @@ ec_shd_full_healer(void *data)
rootloc.inode = this->itable->root;
for (;;) {
- pthread_mutex_lock(&healer->mutex);
- {
- run = __ec_shd_healer_wait(healer);
- if (!run)
- healer->running = _gf_false;
- }
- pthread_mutex_unlock(&healer->mutex);
-
- if (!run)
+ run = ec_shd_healer_wait(healer);
+ if (run < 0) {
break;
+ } else if (run == 0) {
+ continue;
+ }
if (ec->xl_up_count > ec->fragments) {
gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_FULL_SWEEP_START,
@@ -429,6 +502,9 @@ unlock:
int
ec_shd_full_healer_spawn(xlator_t *this, int subvol)
{
+ if (xlator_is_cleanup_starting(this))
+ return -1;
+
return ec_shd_healer_spawn(this, NTH_FULL_HEALER(this, subvol),
ec_shd_full_healer);
}
@@ -436,6 +512,9 @@ ec_shd_full_healer_spawn(xlator_t *this, int subvol)
int
ec_shd_index_healer_spawn(xlator_t *this, int subvol)
{
+ if (xlator_is_cleanup_starting(this))
+ return -1;
+
return ec_shd_healer_spawn(this, NTH_INDEX_HEALER(this, subvol),
ec_shd_index_healer);
}
@@ -562,3 +641,41 @@ out:
dict_del(output, this->name);
return ret;
}
+
+void
+ec_destroy_healer_object(xlator_t *this, struct subvol_healer *healer)
+{
+ if (!healer)
+ return;
+
+ pthread_cond_destroy(&healer->cond);
+ pthread_mutex_destroy(&healer->mutex);
+}
+
+void
+ec_selfheal_daemon_fini(xlator_t *this)
+{
+ struct subvol_healer *healer = NULL;
+ ec_self_heald_t *shd = NULL;
+ ec_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+ if (!priv)
+ return;
+
+ shd = &priv->shd;
+ if (!shd->iamshd)
+ return;
+
+ for (i = 0; i < priv->nodes; i++) {
+ healer = &shd->index_healers[i];
+ ec_destroy_healer_object(this, healer);
+
+ healer = &shd->full_healers[i];
+ ec_destroy_healer_object(this, healer);
+ }
+
+ GF_FREE(shd->index_healers);
+ GF_FREE(shd->full_healers);
+}
diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h
index 4d141d767e5..6c7da4edc10 100644
--- a/xlators/cluster/ec/src/ec-heald.h
+++ b/xlators/cluster/ec/src/ec-heald.h
@@ -11,9 +11,9 @@
#ifndef __EC_HEALD_H__
#define __EC_HEALD_H__
-#include "xlator.h"
-
-#include "ec-types.h"
+#include "ec-types.h" // for ec_t
+#include "glusterfs/dict.h" // for dict_t
+#include "glusterfs/globals.h" // for xlator_t
int
ec_xl_op(xlator_t *this, dict_t *input, dict_t *output);
@@ -24,4 +24,7 @@ ec_selfheal_daemon_init(xlator_t *this);
void
ec_shd_index_healer_wake(ec_t *ec);
+void
+ec_selfheal_daemon_fini(xlator_t *this);
+
#endif /* __EC_HEALD_H__ */
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
index dec39b9d2aa..48f54475e01 100644
--- a/xlators/cluster/ec/src/ec-helpers.c
+++ b/xlators/cluster/ec/src/ec-helpers.c
@@ -10,7 +10,7 @@
#include <libgen.h>
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "ec.h"
#include "ec-mem-types.h"
@@ -476,7 +476,7 @@ out:
int32_t
ec_loc_setup_path(xlator_t *xl, loc_t *loc)
{
- uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+ static uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
char *name;
int32_t ret = -EINVAL;
@@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl)
memset(ctx, 0, sizeof(*ctx));
INIT_LIST_HEAD(&ctx->heal);
INIT_LIST_HEAD(&ctx->stripe_cache.lru);
+ ctx->heal_count = 0;
value = (uint64_t)(uintptr_t)ctx;
if (__inode_ctx_set(inode, xl, &value) != 0) {
GF_FREE(ctx);
@@ -752,6 +753,7 @@ __ec_fd_get(fd_t *fd, xlator_t *xl)
{
int i = 0;
ec_fd_t *ctx = NULL;
+ ec_inode_t *ictx = NULL;
uint64_t value = 0;
ec_t *ec = xl->private;
@@ -774,6 +776,12 @@ __ec_fd_get(fd_t *fd, xlator_t *xl)
GF_FREE(ctx);
return NULL;
}
+ /* Only refering bad-version so no need for lock
+ * */
+ ictx = __ec_inode_get(fd->inode, xl);
+ if (ictx) {
+ ctx->bad_version = ictx->bad_version;
+ }
}
} else {
ctx = (ec_fd_t *)(uintptr_t)value;
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
index 7a8b174bbed..dad5f4d7018 100644
--- a/xlators/cluster/ec/src/ec-inode-read.c
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -8,9 +8,6 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-
#include "ec.h"
#include "ec-messages.h"
#include "ec-helpers.h"
@@ -74,7 +71,7 @@ ec_manager_access(ec_fop_data_t *fop, int32_t state)
case EC_STATE_INIT:
case EC_STATE_LOCK:
ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
- LLONG_MAX);
+ EC_RANGE_FULL);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -135,7 +132,7 @@ ec_manager_access(ec_fop_data_t *fop, int32_t state)
void
ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_access_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_access_cbk_t func, void *data, loc_t *loc,
int32_t mask, dict_t *xdata)
{
ec_cbk_t callback = {.access = func};
@@ -149,7 +146,7 @@ ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_ACCESS, EC_FLAG_LOCK_SHARED,
- target, minimum, ec_wind_access,
+ target, fop_flags, ec_wind_access,
ec_manager_access, callback, data);
if (fop == NULL) {
goto out;
@@ -301,10 +298,10 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
SLEN(GF_XATTR_CLRLK_CMD)) != 0)) {
if (fop->fd == NULL) {
ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
- LLONG_MAX);
+ EC_RANGE_FULL);
} else {
ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
- LLONG_MAX);
+ EC_RANGE_FULL);
}
ec_lock(fop);
}
@@ -393,15 +390,34 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
int32_t
ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
int32_t op_ret, int32_t op_errno, uintptr_t mask,
- uintptr_t good, uintptr_t bad, dict_t *xdata)
+ uintptr_t good, uintptr_t bad, uint32_t pending,
+ dict_t *xdata)
{
- ec_fop_data_t *fop = cookie;
- fop_getxattr_cbk_t func = fop->data;
+ fop_getxattr_cbk_t func = cookie;
ec_t *ec = xl->private;
dict_t *dict = NULL;
char *str;
char bin1[65], bin2[65];
+ /* We try to return the 'pending' information in xdata, but if this cannot
+ * be set, we will ignore it silently. We prefer to report the success or
+ * failure of the heal itself. */
+ if (xdata == NULL) {
+ xdata = dict_new();
+ } else {
+ dict_ref(xdata);
+ }
+ if (xdata != NULL) {
+ if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
+ /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
+ * enforces to check the result in this case. However we don't
+ * really care if it succeeded or not. We'll just do the same.
+ *
+ * This empty 'if' avoids the warning, and it will be removed by
+ * the optimizer. */
+ }
+ }
+
if (op_ret >= 0) {
dict = dict_new();
if (dict == NULL) {
@@ -435,18 +451,21 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
}
out:
- func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
+ func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
if (dict != NULL) {
dict_unref(dict);
}
+ if (xdata != NULL) {
+ dict_unref(xdata);
+ }
return 0;
}
void
ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_getxattr_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_getxattr_cbk_t func, void *data, loc_t *loc,
const char *name, dict_t *xdata)
{
ec_cbk_t callback = {.getxattr = func};
@@ -468,7 +487,7 @@ ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
}
fop = ec_fop_data_allocate(
- frame, this, GF_FOP_GETXATTR, EC_FLAG_LOCK_SHARED, target, minimum,
+ frame, this, GF_FOP_GETXATTR, EC_FLAG_LOCK_SHARED, target, fop_flags,
ec_wind_getxattr, ec_manager_getxattr, callback, data);
if (fop == NULL) {
goto out;
@@ -588,7 +607,7 @@ ec_wind_fgetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
void
ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fgetxattr_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fgetxattr_cbk_t func, void *data, fd_t *fd,
const char *name, dict_t *xdata)
{
ec_cbk_t callback = {.fgetxattr = func};
@@ -602,7 +621,7 @@ ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(
- frame, this, GF_FOP_FGETXATTR, EC_FLAG_LOCK_SHARED, target, minimum,
+ frame, this, GF_FOP_FGETXATTR, EC_FLAG_LOCK_SHARED, target, fop_flags,
ec_wind_fgetxattr, ec_manager_getxattr, callback, data);
if (fop == NULL) {
goto out;
@@ -774,13 +793,15 @@ ec_manager_open(ec_fop_data_t *fop, int32_t state)
return EC_STATE_REPORT;
}
- err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]);
- if (err != 0) {
- UNLOCK(&fop->fd->lock);
+ if (!ctx->loc.inode) {
+ err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]);
+ if (err != 0) {
+ UNLOCK(&fop->fd->lock);
- fop->error = -err;
+ fop->error = -err;
- return EC_STATE_REPORT;
+ return EC_STATE_REPORT;
+ }
}
ctx->flags = fop->int32;
@@ -869,9 +890,9 @@ ec_manager_open(ec_fop_data_t *fop, int32_t state)
}
void
-ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_open_cbk_t func, void *data, loc_t *loc, int32_t flags, fd_t *fd,
- dict_t *xdata)
+ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_open_cbk_t func, void *data, loc_t *loc,
+ int32_t flags, fd_t *fd, dict_t *xdata)
{
ec_cbk_t callback = {.open = func};
ec_fop_data_t *fop = NULL;
@@ -884,7 +905,7 @@ ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_OPEN, EC_FLAG_LOCK_SHARED,
- target, minimum, ec_wind_open, ec_manager_open,
+ target, fop_flags, ec_wind_open, ec_manager_open,
callback, data);
if (fop == NULL) {
goto out;
@@ -1008,7 +1029,7 @@ ec_manager_readlink(ec_fop_data_t *fop, int32_t state)
case EC_STATE_INIT:
case EC_STATE_LOCK:
ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
- LLONG_MAX);
+ EC_RANGE_FULL);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -1071,7 +1092,7 @@ ec_manager_readlink(ec_fop_data_t *fop, int32_t state)
void
ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_readlink_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_readlink_cbk_t func, void *data, loc_t *loc,
size_t size, dict_t *xdata)
{
ec_cbk_t callback = {.readlink = func};
@@ -1085,7 +1106,7 @@ ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(
- frame, this, GF_FOP_READLINK, EC_FLAG_LOCK_SHARED, target, minimum,
+ frame, this, GF_FOP_READLINK, EC_FLAG_LOCK_SHARED, target, fop_flags,
ec_wind_readlink, ec_manager_readlink, callback, data);
if (fop == NULL) {
goto out;
@@ -1131,7 +1152,7 @@ ec_readv_rebuild(ec_t *ec, ec_fop_data_t *fop, ec_cbk_data_t *cbk)
ec_cbk_data_t *ans = NULL;
struct iobref *iobref = NULL;
void *ptr;
- size_t fsize = 0, size = 0, max = 0;
+ uint64_t fsize = 0, size = 0, max = 0;
int32_t pos, err = -ENOMEM;
if (cbk->op_ret < 0) {
@@ -1331,6 +1352,7 @@ int32_t
ec_manager_readv(ec_fop_data_t *fop, int32_t state)
{
ec_cbk_data_t *cbk;
+ ec_t *ec = fop->xl->private;
switch (state) {
case EC_STATE_INIT:
@@ -1350,6 +1372,9 @@ ec_manager_readv(ec_fop_data_t *fop, int32_t state)
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
+ if (ec->read_mask) {
+ fop->mask &= ec->read_mask;
+ }
ec_dispatch_min(fop);
return EC_STATE_PREPARE_ANSWER;
@@ -1417,9 +1442,9 @@ ec_manager_readv(ec_fop_data_t *fop, int32_t state)
}
void
-ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_readv_cbk_t func, void *data, fd_t *fd, size_t size, off_t offset,
- uint32_t flags, dict_t *xdata)
+ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_readv_cbk_t func, void *data, fd_t *fd,
+ size_t size, off_t offset, uint32_t flags, dict_t *xdata)
{
ec_cbk_t callback = {.readv = func};
ec_fop_data_t *fop = NULL;
@@ -1432,8 +1457,8 @@ ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_READ, EC_FLAG_LOCK_SHARED,
- target, minimum, ec_wind_readv, ec_manager_readv,
- callback, data);
+ target, fop_flags, ec_wind_readv,
+ ec_manager_readv, callback, data);
if (fop == NULL) {
goto out;
}
@@ -1536,7 +1561,7 @@ int32_t
ec_manager_seek(ec_fop_data_t *fop, int32_t state)
{
ec_cbk_data_t *cbk;
- size_t size;
+ uint64_t size;
switch (state) {
case EC_STATE_INIT:
@@ -1548,7 +1573,7 @@ ec_manager_seek(ec_fop_data_t *fop, int32_t state)
case EC_STATE_LOCK:
ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset,
- LLONG_MAX);
+ EC_RANGE_FULL);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -1637,9 +1662,9 @@ ec_manager_seek(ec_fop_data_t *fop, int32_t state)
}
void
-ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_seek_cbk_t func, void *data, fd_t *fd, off_t offset,
- gf_seek_what_t what, dict_t *xdata)
+ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_seek_cbk_t func, void *data, fd_t *fd,
+ off_t offset, gf_seek_what_t what, dict_t *xdata)
{
ec_cbk_t callback = {.seek = func};
ec_fop_data_t *fop = NULL;
@@ -1652,7 +1677,7 @@ ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_SEEK, EC_FLAG_LOCK_SHARED,
- target, minimum, ec_wind_seek, ec_manager_seek,
+ target, fop_flags, ec_wind_seek, ec_manager_seek,
callback, data);
if (fop == NULL) {
goto out;
@@ -1764,9 +1789,10 @@ ec_manager_stat(ec_fop_data_t *fop, int32_t state)
case EC_STATE_LOCK:
if (fop->fd == NULL) {
ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
- LLONG_MAX);
+ EC_RANGE_FULL);
} else {
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, LLONG_MAX);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
}
ec_lock(fop);
@@ -1854,8 +1880,9 @@ ec_manager_stat(ec_fop_data_t *fop, int32_t state)
}
void
-ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_stat_cbk_t func, void *data, loc_t *loc, dict_t *xdata)
+ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_stat_cbk_t func, void *data, loc_t *loc,
+ dict_t *xdata)
{
ec_cbk_t callback = {.stat = func};
ec_fop_data_t *fop = NULL;
@@ -1868,7 +1895,7 @@ ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_STAT, EC_FLAG_LOCK_SHARED,
- target, minimum, ec_wind_stat, ec_manager_stat,
+ target, fop_flags, ec_wind_stat, ec_manager_stat,
callback, data);
if (fop == NULL) {
goto out;
@@ -1964,8 +1991,9 @@ ec_wind_fstat(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
}
void
-ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
- fop_fstat_cbk_t func, void *data, fd_t *fd, dict_t *xdata)
+ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fstat_cbk_t func, void *data, fd_t *fd,
+ dict_t *xdata)
{
ec_cbk_t callback = {.fstat = func};
ec_fop_data_t *fop = NULL;
@@ -1978,8 +2006,8 @@ ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_FSTAT, EC_FLAG_LOCK_SHARED,
- target, minimum, ec_wind_fstat, ec_manager_stat,
- callback, data);
+ target, fop_flags, ec_wind_fstat,
+ ec_manager_stat, callback, data);
if (fop == NULL) {
goto out;
}
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
index ffdac632683..9b5fe2a7fdc 100644
--- a/xlators/cluster/ec/src/ec-inode-write.c
+++ b/xlators/cluster/ec/src/ec-inode-write.c
@@ -8,10 +8,6 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-
-#include "ec.h"
#include "ec-messages.h"
#include "ec-helpers.h"
#include "ec-common.h"
@@ -68,8 +64,8 @@ out:
return 0;
}
-int32_t
-ec_update_write(ec_fop_data_t *fop, uintptr_t mask, off_t offset, size_t size)
+static int32_t
+ec_update_write(ec_fop_data_t *fop, uintptr_t mask, off_t offset, uint64_t size)
{
struct iobref *iobref = NULL;
struct iobuf *iobuf = NULL;
@@ -89,6 +85,8 @@ ec_update_write(ec_fop_data_t *fop, uintptr_t mask, off_t offset, size_t size)
goto out;
}
+ if (fop->locks[0].lock)
+ ec_lock_update_good(fop->locks[0].lock, fop);
vector.iov_base = iobuf->ptr;
vector.iov_len = size;
memset(vector.iov_base, 0, vector.iov_len);
@@ -183,26 +181,26 @@ ec_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
switch (fop->id) {
case GF_FOP_SETXATTR:
if (fop->cbks.setxattr) {
- fop->cbks.setxattr(frame, cookie, this, op_ret, op_errno,
- xdata);
+ QUORUM_CBK(fop->cbks.setxattr, fop, frame, cookie, this, op_ret,
+ op_errno, xdata);
}
break;
case GF_FOP_REMOVEXATTR:
if (fop->cbks.removexattr) {
- fop->cbks.removexattr(frame, cookie, this, op_ret, op_errno,
- xdata);
+ QUORUM_CBK(fop->cbks.removexattr, fop, frame, cookie, this,
+ op_ret, op_errno, xdata);
}
break;
case GF_FOP_FSETXATTR:
if (fop->cbks.fsetxattr) {
- fop->cbks.fsetxattr(frame, cookie, this, op_ret, op_errno,
- xdata);
+ QUORUM_CBK(fop->cbks.fsetxattr, fop, frame, cookie, this,
+ op_ret, op_errno, xdata);
}
break;
case GF_FOP_FREMOVEXATTR:
if (fop->cbks.fremovexattr) {
- fop->cbks.fremovexattr(frame, cookie, this, op_ret, op_errno,
- xdata);
+ QUORUM_CBK(fop->cbks.fremovexattr, fop, frame, cookie, this,
+ op_ret, op_errno, xdata);
}
break;
}
@@ -219,10 +217,10 @@ ec_manager_xattr(ec_fop_data_t *fop, int32_t state)
if (fop->fd == NULL) {
ec_lock_prepare_inode(fop, &fop->loc[0],
EC_UPDATE_META | EC_QUERY_INFO, 0,
- LLONG_MAX);
+ EC_RANGE_FULL);
} else {
ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO,
- 0, LLONG_MAX);
+ 0, EC_RANGE_FULL);
}
ec_lock(fop);
@@ -281,7 +279,7 @@ ec_manager_xattr(ec_fop_data_t *fop, int32_t state)
void
ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_removexattr_cbk_t func, void *data,
+ uint32_t fop_flags, fop_removexattr_cbk_t func, void *data,
loc_t *loc, const char *name, dict_t *xdata)
{
ec_cbk_t callback = {.removexattr = func};
@@ -295,7 +293,7 @@ ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_REMOVEXATTR, 0, target,
- minimum, ec_wind_removexattr, ec_manager_xattr,
+ fop_flags, ec_wind_removexattr, ec_manager_xattr,
callback, data);
if (fop == NULL) {
goto out;
@@ -361,7 +359,7 @@ ec_wind_fremovexattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
void
ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fremovexattr_cbk_t func, void *data,
+ uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data,
fd_t *fd, const char *name, dict_t *xdata)
{
ec_cbk_t callback = {.fremovexattr = func};
@@ -375,8 +373,8 @@ ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_FREMOVEXATTR, 0, target,
- minimum, ec_wind_fremovexattr, ec_manager_xattr,
- callback, data);
+ fop_flags, ec_wind_fremovexattr,
+ ec_manager_xattr, callback, data);
if (fop == NULL) {
goto out;
}
@@ -455,10 +453,10 @@ ec_manager_setattr(ec_fop_data_t *fop, int32_t state)
if (fop->fd == NULL) {
ec_lock_prepare_inode(fop, &fop->loc[0],
EC_UPDATE_META | EC_QUERY_INFO, 0,
- LLONG_MAX);
+ EC_RANGE_FULL);
} else {
ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO,
- 0, LLONG_MAX);
+ 0, EC_RANGE_FULL);
}
ec_lock(fop);
@@ -492,16 +490,15 @@ ec_manager_setattr(ec_fop_data_t *fop, int32_t state)
if (fop->id == GF_FOP_SETATTR) {
if (fop->cbks.setattr != NULL) {
- fop->cbks.setattr(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, &cbk->iatt[0],
- &cbk->iatt[1], cbk->xdata);
+ QUORUM_CBK(fop->cbks.setattr, fop, fop->req_frame, fop,
+ fop->xl, cbk->op_ret, cbk->op_errno,
+ &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
}
} else {
if (fop->cbks.fsetattr != NULL) {
- fop->cbks.fsetattr(fop->req_frame, fop, fop->xl,
- cbk->op_ret, cbk->op_errno,
- &cbk->iatt[0], &cbk->iatt[1],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.fsetattr, fop, fop->req_frame, fop,
+ fop->xl, cbk->op_ret, cbk->op_errno,
+ &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
}
}
@@ -550,7 +547,7 @@ ec_manager_setattr(ec_fop_data_t *fop, int32_t state)
void
ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_setattr_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
ec_cbk_t callback = {.setattr = func};
@@ -563,9 +560,9 @@ ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_SETATTR, 0, target, minimum,
- ec_wind_setattr, ec_manager_setattr, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_SETATTR, 0, target,
+ fop_flags, ec_wind_setattr, ec_manager_setattr,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -627,7 +624,7 @@ ec_wind_fsetattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
void
ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fsetattr_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
ec_cbk_t callback = {.fsetattr = func};
@@ -640,9 +637,9 @@ ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETATTR, 0, target, minimum,
- ec_wind_fsetattr, ec_manager_setattr, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETATTR, 0, target,
+ fop_flags, ec_wind_fsetattr, ec_manager_setattr,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -707,7 +704,7 @@ ec_wind_setxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
void
ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_setxattr_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_setxattr_cbk_t func, void *data, loc_t *loc,
dict_t *dict, int32_t flags, dict_t *xdata)
{
ec_cbk_t callback = {.setxattr = func};
@@ -720,9 +717,9 @@ ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_SETXATTR, 0, target, minimum,
- ec_wind_setxattr, ec_manager_xattr, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_SETXATTR, 0, target,
+ fop_flags, ec_wind_setxattr, ec_manager_xattr,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -825,7 +822,7 @@ ec_wind_fsetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
void
ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fsetxattr_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd,
dict_t *dict, int32_t flags, dict_t *xdata)
{
ec_cbk_t callback = {.fsetxattr = func};
@@ -839,7 +836,7 @@ ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETXATTR, 0, target,
- minimum, ec_wind_fsetxattr, ec_manager_xattr,
+ fop_flags, ec_wind_fsetxattr, ec_manager_xattr,
callback, data);
if (fop == NULL) {
goto out;
@@ -992,9 +989,9 @@ ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.fallocate != NULL) {
- fop->cbks.fallocate(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.fallocate, fop, fop->req_frame, fop,
+ fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -1035,7 +1032,7 @@ ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
void
ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fallocate_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd,
int32_t mode, off_t offset, size_t len, dict_t *xdata)
{
ec_cbk_t callback = {.fallocate = func};
@@ -1049,8 +1046,8 @@ ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_FALLOCATE, 0, target,
- minimum, ec_wind_fallocate, ec_manager_fallocate,
- callback, data);
+ fop_flags, ec_wind_fallocate,
+ ec_manager_fallocate, callback, data);
if (fop == NULL) {
goto out;
}
@@ -1101,8 +1098,8 @@ ec_update_discard_write(ec_fop_data_t *fop, uintptr_t mask)
ec_t *ec = fop->xl->private;
off_t off_head = 0;
off_t off_tail = 0;
- size_t size_head = 0;
- size_t size_tail = 0;
+ uint64_t size_head = 0;
+ uint64_t size_tail = 0;
int error = 0;
off_head = fop->offset * ec->fragments - fop->int32;
@@ -1172,7 +1169,7 @@ ec_manager_discard(ec_fop_data_t *fop, int32_t state)
{
ec_cbk_data_t *cbk = NULL;
off_t fl_start = 0;
- size_t fl_size = 0;
+ uint64_t fl_size = 0;
switch (state) {
case EC_STATE_INIT:
@@ -1209,8 +1206,8 @@ ec_manager_discard(ec_fop_data_t *fop, int32_t state)
ec_dispatch_all(fop);
return EC_STATE_DELAYED_START;
} else {
- /*Assume discard to have succeeded on mask*/
- fop->good = fop->mask;
+ /* Assume discard to have succeeded on all bricks */
+ ec_succeed_all(fop);
}
/* Fall through */
@@ -1245,9 +1242,9 @@ ec_manager_discard(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.discard != NULL) {
- fop->cbks.discard(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.discard, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -1289,7 +1286,7 @@ ec_manager_discard(ec_fop_data_t *fop, int32_t state)
void
ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_discard_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd,
off_t offset, size_t len, dict_t *xdata)
{
ec_cbk_t callback = {.discard = func};
@@ -1302,9 +1299,9 @@ ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_DISCARD, 0, target, minimum,
- ec_wind_discard, ec_manager_discard, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_DISCARD, 0, target,
+ fop_flags, ec_wind_discard, ec_manager_discard,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -1341,7 +1338,7 @@ int32_t
ec_update_truncate_write(ec_fop_data_t *fop, uintptr_t mask)
{
ec_t *ec = fop->xl->private;
- size_t size = fop->offset * ec->fragments - fop->user_size;
+ uint64_t size = fop->offset * ec->fragments - fop->user_size;
return ec_update_write(fop, mask, fop->user_size, size);
}
@@ -1405,6 +1402,7 @@ int32_t
ec_manager_truncate(ec_fop_data_t *fop, int32_t state)
{
ec_cbk_data_t *cbk;
+ off_t offset_down;
switch (state) {
case EC_STATE_INIT:
@@ -1416,16 +1414,19 @@ ec_manager_truncate(ec_fop_data_t *fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
+ offset_down = fop->user_size;
+ ec_adjust_offset_down(fop->xl->private, &offset_down, _gf_true);
+
if (fop->id == GF_FOP_TRUNCATE) {
ec_lock_prepare_inode(
fop, &fop->loc[0],
EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
- fop->offset, LLONG_MAX);
+ offset_down, EC_RANGE_FULL);
} else {
ec_lock_prepare_fd(
fop, fop->fd,
EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
- fop->offset, LLONG_MAX);
+ offset_down, EC_RANGE_FULL);
}
ec_lock(fop);
@@ -1471,17 +1472,15 @@ ec_manager_truncate(ec_fop_data_t *fop, int32_t state)
if (fop->id == GF_FOP_TRUNCATE) {
if (fop->cbks.truncate != NULL) {
- fop->cbks.truncate(fop->req_frame, fop, fop->xl,
- cbk->op_ret, cbk->op_errno,
- &cbk->iatt[0], &cbk->iatt[1],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.truncate, fop, fop->req_frame, fop,
+ fop->xl, cbk->op_ret, cbk->op_errno,
+ &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
}
} else {
if (fop->cbks.ftruncate != NULL) {
- fop->cbks.ftruncate(fop->req_frame, fop, fop->xl,
- cbk->op_ret, cbk->op_errno,
- &cbk->iatt[0], &cbk->iatt[1],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.ftruncate, fop, fop->req_frame, fop,
+ fop->xl, cbk->op_ret, cbk->op_errno,
+ &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
}
}
@@ -1530,7 +1529,7 @@ ec_manager_truncate(ec_fop_data_t *fop, int32_t state)
void
ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_truncate_cbk_t func, void *data, loc_t *loc,
+ uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc,
off_t offset, dict_t *xdata)
{
ec_cbk_t callback = {.truncate = func};
@@ -1543,9 +1542,9 @@ ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_TRUNCATE, 0, target, minimum,
- ec_wind_truncate, ec_manager_truncate, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_TRUNCATE, 0, target,
+ fop_flags, ec_wind_truncate, ec_manager_truncate,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -1604,7 +1603,7 @@ ec_wind_ftruncate(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
void
ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_ftruncate_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd,
off_t offset, dict_t *xdata)
{
ec_cbk_t callback = {.ftruncate = func};
@@ -1618,8 +1617,8 @@ ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
fop = ec_fop_data_allocate(frame, this, GF_FOP_FTRUNCATE, 0, target,
- minimum, ec_wind_ftruncate, ec_manager_truncate,
- callback, data);
+ fop_flags, ec_wind_ftruncate,
+ ec_manager_truncate, callback, data);
if (fop == NULL) {
goto out;
}
@@ -1739,7 +1738,7 @@ ec_writev_merge_tail(call_frame_t *frame, void *cookie, xlator_t *this,
{
ec_t *ec = this->private;
ec_fop_data_t *fop = frame->local;
- size_t size, base, tmp;
+ uint64_t size, base, tmp;
if (op_ret >= 0) {
tmp = 0;
@@ -1772,7 +1771,7 @@ ec_writev_merge_head(call_frame_t *frame, void *cookie, xlator_t *this,
{
ec_t *ec = this->private;
ec_fop_data_t *fop = frame->local;
- size_t size, base;
+ uint64_t size, base;
if (op_ret >= 0) {
size = fop->head;
@@ -1884,7 +1883,7 @@ out:
static void
ec_merge_stripe_head_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
{
- size_t head, size;
+ uint32_t head, size;
head = fop->head;
memcpy(fop->vector[0].iov_base, stripe->data, head);
@@ -1900,7 +1899,7 @@ ec_merge_stripe_head_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
static void
ec_merge_stripe_tail_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
{
- size_t head, tail;
+ uint32_t head, tail;
off_t offset;
offset = fop->user_size + fop->head;
@@ -1973,6 +1972,23 @@ ec_get_and_merge_stripe(ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which)
return found;
}
+static uintptr_t
+ec_get_lock_good_mask(inode_t *inode, xlator_t *xl)
+{
+ ec_lock_t *lock = NULL;
+ ec_inode_t *ictx = NULL;
+ LOCK(&inode->lock);
+ {
+ ictx = __ec_inode_get(inode, xl);
+ if (ictx)
+ lock = ictx->inode_lock;
+ }
+ UNLOCK(&inode->lock);
+ if (lock)
+ return lock->good_mask;
+ return 0;
+}
+
void
ec_writev_start(ec_fop_data_t *fop)
{
@@ -2009,20 +2025,29 @@ ec_writev_start(ec_fop_data_t *fop)
if (err != 0) {
goto failed_fd;
}
+ tail = fop->size - fop->user_size - fop->head;
if (fop->head > 0) {
- found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
- if (!found_stripe) {
- if (ec_make_internal_fop_xdata(&xdata)) {
- err = -ENOMEM;
- goto failed_xdata;
+ if (current > fop->offset) {
+ found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
+ if (!found_stripe) {
+ if (ec_make_internal_fop_xdata(&xdata)) {
+ err = -ENOMEM;
+ goto failed_xdata;
+ }
+ ec_readv(fop->frame, fop->xl,
+ ec_get_lock_good_mask(fop->fd->inode, fop->xl),
+ EC_MINIMUM_MIN, ec_writev_merge_head, NULL, fd,
+ ec->stripe_size, fop->offset, 0, xdata);
+ }
+ } else {
+ memset(fop->vector[0].iov_base, 0, fop->head);
+ memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
+ if (ec->stripe_cache && (fop->size <= ec->stripe_size)) {
+ ec_add_stripe_in_cache(ec, fop);
}
- ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
- ec_writev_merge_head, NULL, fd, ec->stripe_size,
- fop->offset, 0, xdata);
}
}
- tail = fop->size - fop->user_size - fop->head;
if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
/* Current locking scheme will make sure the 'current' below will
* never decrease while the fop is in progress, so the checks will
@@ -2035,8 +2060,10 @@ ec_writev_start(ec_fop_data_t *fop)
err = -ENOMEM;
goto failed_xdata;
}
- ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
- ec_writev_merge_tail, NULL, fd, ec->stripe_size,
+ ec_readv(fop->frame, fop->xl,
+ ec_get_lock_good_mask(fop->fd->inode, fop->xl),
+ EC_MINIMUM_MIN, ec_writev_merge_tail, NULL, fd,
+ ec->stripe_size,
fop->offset + fop->size - ec->stripe_size, 0, xdata);
}
} else {
@@ -2117,7 +2144,7 @@ ec_manager_writev(ec_fop_data_t *fop, int32_t state)
ec_fd_t *ctx = NULL;
ec_t *ec = fop->xl->private;
off_t fl_start = 0;
- size_t fl_size = LLONG_MAX;
+ uint64_t fl_size = LONG_MAX;
switch (state) {
case EC_STATE_INIT:
@@ -2163,7 +2190,7 @@ ec_manager_writev(ec_fop_data_t *fop, int32_t state)
cbk = ec_fop_prepare_answer(fop, _gf_false);
if (cbk != NULL) {
ec_t *ec = fop->xl->private;
- size_t size;
+ uint64_t size;
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
@@ -2211,9 +2238,9 @@ ec_manager_writev(ec_fop_data_t *fop, int32_t state)
GF_ASSERT(cbk != NULL);
if (fop->cbks.writev != NULL) {
- fop->cbks.writev(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1],
- cbk->xdata);
+ QUORUM_CBK(fop->cbks.writev, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], cbk->xdata);
}
return EC_STATE_LOCK_REUSE;
@@ -2262,7 +2289,7 @@ ec_manager_writev(ec_fop_data_t *fop, int32_t state)
void
ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_writev_cbk_t func, void *data, fd_t *fd,
+ uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
struct iobref *iobref, dict_t *xdata)
{
@@ -2276,7 +2303,7 @@ ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_WRITE, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_WRITE, 0, target, fop_flags,
ec_wind_writev, ec_manager_writev, callback,
data);
if (fop == NULL) {
diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
index 47a069b1775..601960d6154 100644
--- a/xlators/cluster/ec/src/ec-locks.c
+++ b/xlators/cluster/ec/src/ec-locks.c
@@ -8,13 +8,9 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-
#include "ec-helpers.h"
#include "ec-common.h"
#include "ec-combine.h"
-#include "ec-method.h"
#include "ec-fops.h"
#include "ec-messages.h"
@@ -28,9 +24,36 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
ec_t *ec = fop->xl->private;
ec_cbk_data_t *ans = NULL;
ec_cbk_data_t *cbk = NULL;
- uintptr_t locked = 0, notlocked = 0;
+ uintptr_t locked = 0;
+ int32_t good = 0;
+ int32_t eagain = 0;
+ int32_t estale = 0;
int32_t error = -1;
+ /* There are some errors that we'll handle in an special way while trying
+ * to acquire a lock.
+ *
+ * EAGAIN: If it's found during a parallel non-blocking lock request, we
+ * consider that there's contention on the inode, so we consider
+ * the acquisition a failure and try again with a sequential
+ * blocking lock request. This will ensure that we get a lock on
+ * as many bricks as possible (ignoring EAGAIN here would cause
+ * unnecessary triggers of self-healing).
+ *
+ * If it's found during a sequential blocking lock request, it's
+ * considered an error. Lock will only succeed if there are
+ * enough other bricks locked.
+ *
+ * ESTALE: This can appear during parallel or sequential lock request if
+ * the inode has just been unlinked. We consider this error is
+ * not recoverable, but we also don't consider it as fatal. So,
+ * if it happens during parallel lock, we won't attempt a
+ * sequential one unless there are EAGAIN errors on other
+ * bricks (and are enough to form a quorum), but if we reach
+ * quorum counting the ESTALE bricks, we consider the whole
+ * result of the operation is ESTALE instead of EIO.
+ */
+
list_for_each_entry(ans, &fop->cbk_list, list)
{
if (ans->op_ret >= 0) {
@@ -38,24 +61,23 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
error = EIO;
}
locked |= ans->mask;
+ good = ans->count;
cbk = ans;
- } else {
- if (ans->op_errno == EAGAIN) {
- switch (fop->uint32) {
- case EC_LOCK_MODE_NONE:
- case EC_LOCK_MODE_ALL:
- /* Goal is to treat non-blocking lock as failure
- * even if there is a single EAGAIN*/
- notlocked |= ans->mask;
- break;
- }
- }
+ } else if (ans->op_errno == ESTALE) {
+ estale += ans->count;
+ } else if ((ans->op_errno == EAGAIN) &&
+ (fop->uint32 != EC_LOCK_MODE_INC)) {
+ eagain += ans->count;
}
}
if (error == -1) {
- if (gf_bits_count(locked | notlocked) >= ec->fragments) {
- if (notlocked == 0) {
+ /* If we have enough quorum with succeeded and EAGAIN answers, we
+ * ignore for now any ESTALE answer. If there are EAGAIN answers,
+ * we retry with a sequential blocking lock request if needed.
+ * Otherwise we succeed. */
+ if ((good + eagain) >= ec->fragments) {
+ if (eagain == 0) {
if (fop->answer == NULL) {
fop->answer = cbk;
}
@@ -68,21 +90,28 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
case EC_LOCK_MODE_NONE:
error = EAGAIN;
break;
-
case EC_LOCK_MODE_ALL:
fop->uint32 = EC_LOCK_MODE_INC;
break;
-
default:
+ /* This shouldn't happen because eagain cannot be > 0
+ * when fop->uint32 is EC_LOCK_MODE_INC. */
error = EIO;
break;
}
}
} else {
- if (fop->answer && fop->answer->op_ret < 0)
+ /* We have been unable to find enough candidates that will be able
+ * to take the lock. If we have quorum on some answer, we return
+ * it. Otherwise we check if ESTALE answers allow us to reach
+ * quorum. If so, we return ESTALE. */
+ if (fop->answer && fop->answer->op_ret < 0) {
error = fop->answer->op_errno;
- else
+ } else if ((good + eagain + estale) >= ec->fragments) {
+ error = ESTALE;
+ } else {
error = EIO;
+ }
}
}
@@ -275,7 +304,7 @@ ec_manager_entrylk(ec_fop_data_t *fop, int32_t state)
void
ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_entrylk_cbk_t func, void *data,
+ uint32_t fop_flags, fop_entrylk_cbk_t func, void *data,
const char *volume, loc_t *loc, const char *basename,
entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
@@ -285,13 +314,12 @@ ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
gf_msg_trace("ec", 0, "EC(ENTRYLK) %p", frame);
- VALIDATE_OR_GOTO(this, out);
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_ENTRYLK, 0, target, minimum,
- ec_wind_entrylk, ec_manager_entrylk, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_ENTRYLK, 0, target,
+ fop_flags, ec_wind_entrylk, ec_manager_entrylk,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -404,7 +432,7 @@ ec_wind_fentrylk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
void
ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
- int32_t minimum, fop_fentrylk_cbk_t func, void *data,
+ uint32_t fop_flags, fop_fentrylk_cbk_t func, void *data,
const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd,
entrylk_type type, dict_t *xdata)
{
@@ -414,13 +442,12 @@ ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
gf_msg_trace("ec", 0, "EC(FENTRYLK) %p", frame);
- VALIDATE_OR_GOTO(this, out);
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_FENTRYLK, 0, target, minimum,
- ec_wind_fentrylk, ec_manager_entrylk, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FENTRYLK, 0, target,
+ fop_flags, ec_wind_fentrylk, ec_manager_entrylk,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -652,7 +679,7 @@ ec_manager_inodelk(ec_fop_data_t *fop, int32_t state)
void
ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
- uintptr_t target, int32_t minimum, fop_inodelk_cbk_t func,
+ uintptr_t target, uint32_t fop_flags, fop_inodelk_cbk_t func,
void *data, const char *volume, loc_t *loc, int32_t cmd,
struct gf_flock *flock, dict_t *xdata)
{
@@ -666,9 +693,9 @@ ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_INODELK, 0, target, minimum,
- ec_wind_inodelk, ec_manager_inodelk, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_INODELK, 0, target,
+ fop_flags, ec_wind_inodelk, ec_manager_inodelk,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -784,7 +811,7 @@ ec_wind_finodelk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
void
ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
- uintptr_t target, int32_t minimum, fop_finodelk_cbk_t func,
+ uintptr_t target, uint32_t fop_flags, fop_finodelk_cbk_t func,
void *data, const char *volume, fd_t *fd, int32_t cmd,
struct gf_flock *flock, dict_t *xdata)
{
@@ -798,9 +825,9 @@ ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_FINODELK, 0, target, minimum,
- ec_wind_finodelk, ec_manager_inodelk, callback,
- data);
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FINODELK, 0, target,
+ fop_flags, ec_wind_finodelk, ec_manager_inodelk,
+ callback, data);
if (fop == NULL) {
goto out;
}
@@ -1034,7 +1061,7 @@ ec_manager_lk(ec_fop_data_t *fop, int32_t state)
}
void
-ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
+ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags,
fop_lk_cbk_t func, void *data, fd_t *fd, int32_t cmd,
struct gf_flock *flock, dict_t *xdata)
{
@@ -1044,11 +1071,10 @@ ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, int32_t minimum,
gf_msg_trace("ec", 0, "EC(LK) %p", frame);
- VALIDATE_OR_GOTO(this, out);
GF_VALIDATE_OR_GOTO(this->name, frame, out);
GF_VALIDATE_OR_GOTO(this->name, this->private, out);
- fop = ec_fop_data_allocate(frame, this, GF_FOP_LK, 0, target, minimum,
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_LK, 0, target, fop_flags,
ec_wind_lk, ec_manager_lk, callback, data);
if (fop == NULL) {
goto out;
diff --git a/xlators/cluster/ec/src/ec-mem-types.h b/xlators/cluster/ec/src/ec-mem-types.h
index fc33d09ea33..3252c4c1c58 100644
--- a/xlators/cluster/ec/src/ec-mem-types.h
+++ b/xlators/cluster/ec/src/ec-mem-types.h
@@ -11,14 +11,13 @@
#ifndef __EC_MEM_TYPES_H__
#define __EC_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_ec_mem_types_ {
ec_mt_ec_t = gf_common_mt_end + 1,
ec_mt_xlator_t,
ec_mt_ec_inode_t,
ec_mt_ec_fd_t,
- ec_mt_ec_heal_t,
ec_mt_subvol_healer_t,
ec_mt_ec_gf_t,
ec_mt_ec_code_t,
diff --git a/xlators/cluster/ec/src/ec-messages.h b/xlators/cluster/ec/src/ec-messages.h
index 5f673d69aa4..72e98f11286 100644
--- a/xlators/cluster/ec/src/ec-messages.h
+++ b/xlators/cluster/ec/src/ec-messages.h
@@ -11,7 +11,7 @@
#ifndef _EC_MESSAGES_H_
#define _EC_MESSAGES_H_
-#include "glfs-message-id.h"
+#include <glusterfs/glfs-message-id.h>
/* To add new message IDs, append new identifiers at the end of the list.
*
@@ -55,6 +55,7 @@ GLFS_MSGID(EC, EC_MSG_INVALID_CONFIG, EC_MSG_HEAL_FAIL,
EC_MSG_CONFIG_XATTR_INVALID, EC_MSG_EXTENSION, EC_MSG_EXTENSION_NONE,
EC_MSG_EXTENSION_UNKNOWN, EC_MSG_EXTENSION_UNSUPPORTED,
EC_MSG_EXTENSION_FAILED, EC_MSG_NO_GF, EC_MSG_MATRIX_FAILED,
- EC_MSG_DYN_CREATE_FAILED, EC_MSG_DYN_CODEGEN_FAILED);
+ EC_MSG_DYN_CREATE_FAILED, EC_MSG_DYN_CODEGEN_FAILED,
+ EC_MSG_THREAD_CLEANUP_FAILED, EC_MSG_FD_BAD);
#endif /* !_EC_MESSAGES_H_ */
diff --git a/xlators/cluster/ec/src/ec-method.c b/xlators/cluster/ec/src/ec-method.c
index 3aff6b096bd..55faed0b193 100644
--- a/xlators/cluster/ec/src/ec-method.c
+++ b/xlators/cluster/ec/src/ec-method.c
@@ -391,10 +391,10 @@ ec_method_update(xlator_t *xl, ec_matrix_list_t *list, const char *gen)
}
void
-ec_method_encode(ec_matrix_list_t *list, size_t size, void *in, void **out)
+ec_method_encode(ec_matrix_list_t *list, uint64_t size, void *in, void **out)
{
ec_matrix_t *matrix;
- size_t pos;
+ uint64_t pos;
uint32_t i;
matrix = list->encode;
@@ -408,11 +408,11 @@ ec_method_encode(ec_matrix_list_t *list, size_t size, void *in, void **out)
}
int32_t
-ec_method_decode(ec_matrix_list_t *list, size_t size, uintptr_t mask,
+ec_method_decode(ec_matrix_list_t *list, uint64_t size, uintptr_t mask,
uint32_t *rows, void **in, void *out)
{
ec_matrix_t *matrix;
- size_t pos;
+ uint64_t pos;
uint32_t i;
matrix = ec_method_matrix_get(list, mask, rows);
diff --git a/xlators/cluster/ec/src/ec-method.h b/xlators/cluster/ec/src/ec-method.h
index 3d6393bed06..f91233b2f88 100644
--- a/xlators/cluster/ec/src/ec-method.h
+++ b/xlators/cluster/ec/src/ec-method.h
@@ -11,8 +11,6 @@
#ifndef __EC_METHOD_H__
#define __EC_METHOD_H__
-#include "xlator.h"
-
#include "ec-types.h"
#include "ec-galois.h"
@@ -41,10 +39,10 @@ int32_t
ec_method_update(xlator_t *xl, ec_matrix_list_t *list, const char *gen);
void
-ec_method_encode(ec_matrix_list_t *list, size_t size, void *in, void **out);
+ec_method_encode(ec_matrix_list_t *list, uint64_t size, void *in, void **out);
int32_t
-ec_method_decode(ec_matrix_list_t *list, size_t size, uintptr_t mask,
+ec_method_decode(ec_matrix_list_t *list, uint64_t size, uintptr_t mask,
uint32_t *rows, void **in, void *out);
#endif /* __EC_METHOD_H__ */
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 80d9c0d4014..de9b89bb2c9 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -11,10 +11,10 @@
#ifndef __EC_TYPES_H__
#define __EC_TYPES_H__
-#include "xlator.h"
-#include "timer.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/timer.h>
#include "libxlator.h"
-#include "atomic.h"
+#include <glusterfs/atomic.h>
#define EC_GF_MAX_REGS 16
@@ -130,7 +130,12 @@ typedef void (*ec_resume_f)(ec_fop_data_t *, int32_t);
enum _ec_read_policy { EC_ROUND_ROBIN, EC_GFID_HASH, EC_READ_POLICY_MAX };
-enum _ec_heal_need { EC_HEAL_NONEED, EC_HEAL_MAYBE, EC_HEAL_MUST };
+enum _ec_heal_need {
+ EC_HEAL_NONEED,
+ EC_HEAL_MAYBE,
+ EC_HEAL_MUST,
+ EC_HEAL_PURGE_INDEX
+};
enum _ec_stripe_part { EC_STRIPE_HEAD, EC_STRIPE_TAIL };
@@ -150,6 +155,7 @@ struct _ec_fd {
loc_t loc;
uintptr_t open;
int32_t flags;
+ uint64_t bad_version;
ec_fd_status_t fd_status[0];
};
@@ -171,6 +177,7 @@ struct _ec_inode {
gf_boolean_t have_config;
gf_boolean_t have_version;
gf_boolean_t have_size;
+ int32_t heal_count;
ec_config_t config;
uint64_t pre_version[2];
uint64_t post_version[2];
@@ -179,14 +186,15 @@ struct _ec_inode {
uint64_t dirty[2];
struct list_head heal;
ec_stripe_list_t stripe_cache;
+ uint64_t bad_version;
};
typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
int32_t, uintptr_t, uintptr_t, uintptr_t,
- dict_t *);
+ uint32_t, dict_t *);
typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
int32_t, uintptr_t, uintptr_t, uintptr_t,
- dict_t *);
+ uint32_t, dict_t *);
union _ec_cbk {
fop_access_cbk_t access;
@@ -264,6 +272,7 @@ struct _ec_lock {
uint32_t refs_pending; /* Refs assigned to fops being prepared */
uint32_t waiting_flags; /*Track xattrop/dirty marking*/
gf_boolean_t acquired;
+ gf_boolean_t contention;
gf_boolean_t unlock_now;
gf_boolean_t release;
gf_boolean_t query;
@@ -307,9 +316,9 @@ struct _ec_fop_data {
int32_t id; /* ID of the file operation */
int32_t refs;
int32_t state;
- int32_t minimum; /* Minimum number of successful
- operation required to conclude a
- fop as successful */
+ uint32_t minimum; /* Minimum number of successful
+ operation required to conclude a
+ fop as successful */
int32_t expected;
int32_t winds;
int32_t jobs;
@@ -324,11 +333,12 @@ struct _ec_fop_data {
ec_cbk_data_t *answer; /* accepted answer */
int32_t lock_count;
int32_t locked;
+ gf_lock_t lock;
ec_lock_link_t locks[2];
int32_t first_lock;
- gf_lock_t lock;
- uint32_t flags;
+ uint32_t fop_flags; /* Flags passed by the caller. */
+ uint32_t flags; /* Internal flags. */
uint32_t first;
uintptr_t mask;
uintptr_t healing; /*Dispatch is done but call is successful only
@@ -616,6 +626,11 @@ struct _ec_statistics {
requests. (Basically memory allocation
errors). */
} stripe_cache;
+ struct {
+ gf_atomic_t attempted; /*Number of heals attempted on
+ files/directories*/
+ gf_atomic_t completed; /*Number of heals complted on files/directories*/
+ } shd;
};
struct _ec {
@@ -641,6 +656,8 @@ struct _ec {
uintptr_t xl_notify; /* Bit flag representing
notification for bricks. */
uintptr_t node_mask;
+ uintptr_t read_mask; /*Stores user defined read-mask*/
+ gf_atomic_t async_fop_count; /* Number of on going asynchronous fops. */
xlator_t **xl_list;
gf_lock_t lock;
gf_timer_t *timer;
@@ -650,6 +667,7 @@ struct _ec {
gf_boolean_t optimistic_changelog;
gf_boolean_t parallel_writes;
uint32_t stripe_cache;
+ uint32_t quorum_count;
uint32_t background_heals;
uint32_t heal_wait_qlen;
uint32_t self_heal_window_size; /* max size of read/writes */
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 4d64330a598..7344be4968d 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -8,10 +8,10 @@
cases as published by the Free Software Foundation.
*/
-#include "defaults.h"
-#include "statedump.h"
-#include "compat-errno.h"
-#include "upcall-utils.h"
+#include <glusterfs/defaults.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/upcall-utils.h>
#include "ec.h"
#include "ec-messages.h"
@@ -23,7 +23,7 @@
#include "ec-method.h"
#include "ec-code.h"
#include "ec-heald.h"
-#include "events.h"
+#include <glusterfs/events.h>
static char *ec_read_policies[EC_READ_POLICY_MAX + 1] = {
[EC_ROUND_ROBIN] = "round-robin",
@@ -74,7 +74,7 @@ ec_parse_options(xlator_t *this)
gf_msg_debug("ec", 0,
"Initialized with: nodes=%u, fragments=%u, "
- "stripe_size=%u, node_mask=%lX",
+ "stripe_size=%u, node_mask=%" PRIxFAST32,
ec->nodes, ec->fragments, ec->stripe_size, ec->node_mask);
error = 0;
@@ -285,6 +285,7 @@ reconfigure(xlator_t *this, dict_t *options)
GF_OPTION_RECONF("parallel-writes", ec->parallel_writes, options, bool,
failed);
GF_OPTION_RECONF("stripe-cache", ec->stripe_cache, options, uint32, failed);
+ GF_OPTION_RECONF("quorum-count", ec->quorum_count, options, uint32, failed);
ret = 0;
if (ec_assign_read_policy(ec, read_policy)) {
ret = -1;
@@ -324,13 +325,18 @@ ec_get_event_from_state(ec_t *ec)
void
ec_up(xlator_t *this, ec_t *ec)
{
+ char str1[32], str2[32];
+
if (ec->timer != NULL) {
gf_timer_call_cancel(this->ctx, ec->timer);
ec->timer = NULL;
}
ec->up = 1;
- gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP");
+ gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP,
+ "Going UP : Child UP = %s Child Notify = %s",
+ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+ ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes));
gf_event(EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
}
@@ -338,13 +344,18 @@ ec_up(xlator_t *this, ec_t *ec)
void
ec_down(xlator_t *this, ec_t *ec)
{
+ char str1[32], str2[32];
+
if (ec->timer != NULL) {
gf_timer_call_cancel(this->ctx, ec->timer);
ec->timer = NULL;
}
ec->up = 0;
- gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN");
+ gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN,
+ "Going DOWN : Child UP = %s Child Notify = %s",
+ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+ ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes));
gf_event(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
}
@@ -355,6 +366,7 @@ ec_notify_cbk(void *data)
ec_t *ec = data;
glusterfs_event_t event = GF_EVENT_MAXVAL;
gf_boolean_t propagate = _gf_false;
+ gf_boolean_t launch_heal = _gf_false;
LOCK(&ec->lock);
{
@@ -384,6 +396,11 @@ ec_notify_cbk(void *data)
* still bricks DOWN, they will be healed when they
* come up. */
ec_up(ec->xl, ec);
+
+ if (ec->shd.iamshd && !ec->shutdown) {
+ launch_heal = _gf_true;
+ GF_ATOMIC_INC(ec->async_fop_count);
+ }
}
propagate = _gf_true;
@@ -391,13 +408,12 @@ ec_notify_cbk(void *data)
unlock:
UNLOCK(&ec->lock);
+ if (launch_heal) {
+ /* We have just brought the volume UP, so we trigger
+ * a self-heal check on the root directory. */
+ ec_launch_replace_heal(ec);
+ }
if (propagate) {
- if ((event == GF_EVENT_CHILD_UP) && ec->shd.iamshd) {
- /* We have just brought the volume UP, so we trigger
- * a self-heal check on the root directory. */
- ec_launch_replace_heal(ec);
- }
-
default_notify(ec->xl, event, NULL);
}
}
@@ -425,10 +441,55 @@ ec_disable_delays(ec_t *ec)
{
ec->shutdown = _gf_true;
- return list_empty(&ec->pending_fops);
+ return __ec_is_last_fop(ec);
}
void
+ec_cleanup_healer_object(ec_t *ec)
+{
+ struct subvol_healer *healer = NULL;
+ ec_self_heald_t *shd = NULL;
+ void *res = NULL;
+ int i = 0;
+ gf_boolean_t is_join = _gf_false;
+
+ shd = &ec->shd;
+ if (!shd->iamshd)
+ return;
+
+ for (i = 0; i < ec->nodes; i++) {
+ healer = &shd->index_healers[i];
+ pthread_mutex_lock(&healer->mutex);
+ {
+ healer->rerun = 1;
+ if (healer->running) {
+ pthread_cond_signal(&healer->cond);
+ is_join = _gf_true;
+ }
+ }
+ pthread_mutex_unlock(&healer->mutex);
+ if (is_join) {
+ pthread_join(healer->thread, &res);
+ is_join = _gf_false;
+ }
+
+ healer = &shd->full_healers[i];
+ pthread_mutex_lock(&healer->mutex);
+ {
+ healer->rerun = 1;
+ if (healer->running) {
+ pthread_cond_signal(&healer->cond);
+ is_join = _gf_true;
+ }
+ }
+ pthread_mutex_unlock(&healer->mutex);
+ if (is_join) {
+ pthread_join(healer->thread, &res);
+ is_join = _gf_false;
+ }
+ }
+}
+void
ec_pending_fops_completed(ec_t *ec)
{
if (ec->shutdown) {
@@ -441,6 +502,9 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
{
uintptr_t current_state = 0;
+ if (xlator_is_cleanup_starting(ec->xl))
+ return _gf_false;
+
if ((ec->xl_notify & index_mask) == 0) {
ec->xl_notify |= index_mask;
ec->xl_notify_count++;
@@ -462,6 +526,7 @@ ec_upcall(ec_t *ec, struct gf_upcall *upcall)
struct gf_upcall_cache_invalidation *ci = NULL;
struct gf_upcall_inodelk_contention *lc = NULL;
inode_t *inode;
+ inode_table_t *table;
switch (upcall->event_type) {
case GF_UPCALL_CACHE_INVALIDATION:
@@ -475,8 +540,18 @@ ec_upcall(ec_t *ec, struct gf_upcall *upcall)
/* The lock is not owned by EC, ignore it. */
return _gf_true;
}
- inode = inode_find(((xlator_t *)ec->xl->graph->top)->itable,
- upcall->gfid);
+ table = ((xlator_t *)ec->xl->graph->top)->itable;
+ if (table == NULL) {
+ /* Self-heal daemon doesn't have an inode table on the top
+ * xlator because it doesn't need it. In this case we should
+ * use the inode table managed by EC itself where all inodes
+ * being healed should be present. However self-heal doesn't
+ * use eager-locking and inodelk's are already released as
+ * soon as possible. In this case we can safely ignore these
+ * notifications. */
+ return _gf_false;
+ }
+ inode = inode_find(table, upcall->gfid);
/* If inode is not found, it means that it's already released,
* so we can ignore it. Probably it has been released and
* destroyed while the contention notification was being sent.
@@ -544,6 +619,7 @@ ec_notify(xlator_t *this, int32_t event, void *data, void *data2)
/* If there aren't pending fops running after we have waken up
* them, we immediately propagate the notification. */
propagate = ec_disable_delays(ec);
+ ec_cleanup_healer_object(ec);
goto unlock;
}
@@ -554,7 +630,10 @@ ec_notify(xlator_t *this, int32_t event, void *data, void *data2)
if (event == GF_EVENT_CHILD_UP) {
/* We need to trigger a selfheal if a brick changes
* to UP state. */
- needs_shd_check = ec_set_up_state(ec, mask, mask);
+ if (ec_set_up_state(ec, mask, mask) && ec->shd.iamshd &&
+ !ec->shutdown) {
+ needs_shd_check = _gf_true;
+ }
} else if (event == GF_EVENT_CHILD_DOWN) {
ec_set_up_state(ec, mask, 0);
}
@@ -584,17 +663,21 @@ ec_notify(xlator_t *this, int32_t event, void *data, void *data2)
}
} else {
propagate = _gf_false;
+ needs_shd_check = _gf_false;
+ }
+
+ if (needs_shd_check) {
+ GF_ATOMIC_INC(ec->async_fop_count);
}
}
unlock:
UNLOCK(&ec->lock);
done:
+ if (needs_shd_check) {
+ ec_launch_replace_heal(ec);
+ }
if (propagate) {
- if (needs_shd_check && ec->shd.iamshd) {
- ec_launch_replace_heal(ec);
- }
-
error = default_notify(this, event, data);
}
@@ -627,6 +710,69 @@ ec_statistics_init(ec_t *ec)
GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0);
GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0);
GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0);
+ GF_ATOMIC_INIT(ec->stats.shd.attempted, 0);
+ GF_ATOMIC_INIT(ec->stats.shd.completed, 0);
+}
+
+static int
+ec_assign_read_mask(ec_t *ec, char *read_mask_str)
+{
+ char *mask = NULL;
+ char *maskptr = NULL;
+ char *saveptr = NULL;
+ char *id_str = NULL;
+ int id = 0;
+ int ret = 0;
+ uintptr_t read_mask = 0;
+
+ if (!read_mask_str) {
+ ec->read_mask = 0;
+ ret = 0;
+ goto out;
+ }
+
+ mask = gf_strdup(read_mask_str);
+ if (!mask) {
+ ret = -1;
+ goto out;
+ }
+ maskptr = mask;
+
+ for (;;) {
+ id_str = strtok_r(maskptr, ":", &saveptr);
+ if (id_str == NULL)
+ break;
+ if (gf_string2int(id_str, &id)) {
+ gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+ "In read-mask \"%s\" id %s is not a valid integer",
+ read_mask_str, id_str);
+ ret = -1;
+ goto out;
+ }
+
+ if ((id < 0) || (id >= ec->nodes)) {
+ gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+ "In read-mask \"%s\" id %d is not in range [0 - %d]",
+ read_mask_str, id, ec->nodes - 1);
+ ret = -1;
+ goto out;
+ }
+ read_mask |= (1UL << id);
+ maskptr = NULL;
+ }
+
+ if (gf_bits_count(read_mask) < ec->fragments) {
+ gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+ "read-mask \"%s\" should contain at least %d ids", read_mask_str,
+ ec->fragments);
+ ret = -1;
+ goto out;
+ }
+ ec->read_mask = read_mask;
+ ret = 0;
+out:
+ GF_FREE(mask);
+ return ret;
}
int32_t
@@ -636,6 +782,7 @@ init(xlator_t *this)
char *read_policy = NULL;
char *extensions = NULL;
int32_t err;
+ char *read_mask_str = NULL;
if (this->parents == NULL) {
gf_msg(this->name, GF_LOG_WARNING, 0, EC_MSG_NO_PARENTS,
@@ -656,6 +803,7 @@ init(xlator_t *this)
ec->xl = this;
LOCK_INIT(&ec->lock);
+ GF_ATOMIC_INIT(ec->async_fop_count, 0);
INIT_LIST_HEAD(&ec->pending_fops);
INIT_LIST_HEAD(&ec->heal_waiting);
INIT_LIST_HEAD(&ec->healing);
@@ -714,12 +862,18 @@ init(xlator_t *this)
if (ec_assign_read_policy(ec, read_policy))
goto failed;
+ GF_OPTION_INIT("heal-timeout", ec->shd.timeout, int32, failed);
GF_OPTION_INIT("shd-max-threads", ec->shd.max_threads, uint32, failed);
GF_OPTION_INIT("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
GF_OPTION_INIT("optimistic-change-log", ec->optimistic_changelog, bool,
failed);
GF_OPTION_INIT("parallel-writes", ec->parallel_writes, bool, failed);
GF_OPTION_INIT("stripe-cache", ec->stripe_cache, uint32, failed);
+ GF_OPTION_INIT("quorum-count", ec->quorum_count, uint32, failed);
+ GF_OPTION_INIT("ec-read-mask", read_mask_str, str, failed);
+
+ if (ec_assign_read_mask(ec, read_mask_str))
+ goto failed;
this->itable = inode_table_new(EC_SHD_INODE_LRU_LIMIT, this);
if (!this->itable)
@@ -759,6 +913,7 @@ failed:
void
fini(xlator_t *this)
{
+ ec_selfheal_daemon_fini(this);
__ec_destroy_private(this);
}
@@ -797,11 +952,12 @@ ec_gf_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
loc_t *loc, const char *basename, entrylk_cmd cmd,
entrylk_type type, dict_t *xdata)
{
- int32_t minimum = EC_MINIMUM_ALL;
+ uint32_t fop_flags = EC_MINIMUM_ALL;
+
if (cmd == ENTRYLK_UNLOCK)
- minimum = EC_MINIMUM_ONE;
- ec_entrylk(frame, this, -1, minimum, default_entrylk_cbk, NULL, volume, loc,
- basename, cmd, type, xdata);
+ fop_flags = EC_MINIMUM_ONE;
+ ec_entrylk(frame, this, -1, fop_flags, default_entrylk_cbk, NULL, volume,
+ loc, basename, cmd, type, xdata);
return 0;
}
@@ -811,10 +967,11 @@ ec_gf_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
fd_t *fd, const char *basename, entrylk_cmd cmd,
entrylk_type type, dict_t *xdata)
{
- int32_t minimum = EC_MINIMUM_ALL;
+ uint32_t fop_flags = EC_MINIMUM_ALL;
+
if (cmd == ENTRYLK_UNLOCK)
- minimum = EC_MINIMUM_ONE;
- ec_fentrylk(frame, this, -1, minimum, default_fentrylk_cbk, NULL, volume,
+ fop_flags = EC_MINIMUM_ONE;
+ ec_fentrylk(frame, this, -1, fop_flags, default_fentrylk_cbk, NULL, volume,
fd, basename, cmd, type, xdata);
return 0;
@@ -905,7 +1062,7 @@ ec_gf_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
{
int error = 0;
ec_t *ec = this->private;
- int32_t minimum = EC_MINIMUM_ONE;
+ int32_t fop_flags = EC_MINIMUM_ONE;
if (name && strcmp(name, EC_XATTR_HEAL) != 0) {
EC_INTERNAL_XATTR_OR_GOTO(name, NULL, error, out);
@@ -920,11 +1077,11 @@ ec_gf_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
if (name && ((fnmatch(GF_XATTR_STIME_PATTERN, name, 0) == 0) ||
XATTR_IS_NODE_UUID(name) || XATTR_IS_NODE_UUID_LIST(name))) {
- minimum = EC_MINIMUM_ALL;
+ fop_flags = EC_MINIMUM_ALL;
}
- ec_getxattr(frame, this, -1, minimum, default_getxattr_cbk, NULL, loc, name,
- xdata);
+ ec_getxattr(frame, this, -1, fop_flags, default_getxattr_cbk, NULL, loc,
+ name, xdata);
return 0;
out:
@@ -954,11 +1111,12 @@ int32_t
ec_gf_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- int32_t minimum = EC_MINIMUM_ALL;
+ int32_t fop_flags = EC_MINIMUM_ALL;
+
if (flock->l_type == F_UNLCK)
- minimum = EC_MINIMUM_ONE;
+ fop_flags = EC_MINIMUM_ONE;
- ec_inodelk(frame, this, &frame->root->lk_owner, -1, minimum,
+ ec_inodelk(frame, this, &frame->root->lk_owner, -1, fop_flags,
default_inodelk_cbk, NULL, volume, loc, cmd, flock, xdata);
return 0;
@@ -968,10 +1126,11 @@ int32_t
ec_gf_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- int32_t minimum = EC_MINIMUM_ALL;
+ int32_t fop_flags = EC_MINIMUM_ALL;
+
if (flock->l_type == F_UNLCK)
- minimum = EC_MINIMUM_ONE;
- ec_finodelk(frame, this, &frame->root->lk_owner, -1, minimum,
+ fop_flags = EC_MINIMUM_ONE;
+ ec_finodelk(frame, this, &frame->root->lk_owner, -1, fop_flags,
default_finodelk_cbk, NULL, volume, fd, cmd, flock, xdata);
return 0;
@@ -991,10 +1150,11 @@ int32_t
ec_gf_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
struct gf_flock *flock, dict_t *xdata)
{
- int32_t minimum = EC_MINIMUM_ALL;
+ int32_t fop_flags = EC_MINIMUM_ALL;
+
if (flock->l_type == F_UNLCK)
- minimum = EC_MINIMUM_ONE;
- ec_lk(frame, this, -1, minimum, default_lk_cbk, NULL, fd, cmd, flock,
+ fop_flags = EC_MINIMUM_ONE;
+ ec_lk(frame, this, -1, fop_flags, default_lk_cbk, NULL, fd, cmd, flock,
xdata);
return 0;
@@ -1389,6 +1549,10 @@ ec_dump_private(xlator_t *this)
gf_proc_dump_write("childs_up", "%u", ec->xl_up_count);
gf_proc_dump_write("childs_up_mask", "%s",
ec_bin(tmp, sizeof(tmp), ec->xl_up, ec->nodes));
+ if (ec->read_mask) {
+ gf_proc_dump_write("read-mask", "%s",
+ ec_bin(tmp, sizeof(tmp), ec->read_mask, ec->nodes));
+ }
gf_proc_dump_write("background-heals", "%d", ec->background_heals);
gf_proc_dump_write("heal-wait-qlength", "%d", ec->heal_wait_qlen);
gf_proc_dump_write("self-heal-window-size", "%" PRIu32,
@@ -1397,6 +1561,7 @@ ec_dump_private(xlator_t *this)
gf_proc_dump_write("heal-waiters", "%d", ec->heal_waiters);
gf_proc_dump_write("read-policy", "%s", ec_read_policies[ec->read_policy]);
gf_proc_dump_write("parallel-writes", "%d", ec->parallel_writes);
+ gf_proc_dump_write("quorum-count", "%u", ec->quorum_count);
snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s.stats.stripe_cache",
this->type, this->name);
@@ -1416,6 +1581,10 @@ ec_dump_private(xlator_t *this)
GF_ATOMIC_GET(ec->stats.stripe_cache.allocs));
gf_proc_dump_write("errors", "%" GF_PRI_ATOMIC,
GF_ATOMIC_GET(ec->stats.stripe_cache.errors));
+ gf_proc_dump_write("heals-attempted", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.shd.attempted));
+ gf_proc_dump_write("heals-completed", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.shd.completed));
return 0;
}
@@ -1666,4 +1835,39 @@ struct volume_options options[] = {
"specially for sequential writes. However, this will also"
"lead to extra memory consumption, maximum "
"(cache size * stripe size) Bytes per open file."},
- {.key = {NULL}}};
+ {
+ .key = {"quorum-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "0",
+ .description =
+ "This option can be used to define how many successes on"
+ "the bricks constitute a success to the application. This"
+ " count should be in the range"
+ "[disperse-data-count, disperse-count] (inclusive)",
+ },
+ {
+ .key = {"ec-read-mask"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = NULL,
+ .description = "This option can be used to choose which bricks can be"
+ " used for reading data/metadata of a file/directory",
+ },
+ {
+ .key = {NULL},
+ },
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1},
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "disperse",
+ .category = GF_MAINTAINED,
+};
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
index 1b210d9adc1..6f6de6d5981 100644
--- a/xlators/cluster/ec/src/ec.h
+++ b/xlators/cluster/ec/src/ec.h
@@ -18,6 +18,7 @@
#define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
#define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
#define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
#define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
#define EC_STRIPE_CACHE_MAX_SIZE 10
#define EC_VERSION_SIZE 2
diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am
deleted file mode 100644
index 2b594567db1..00000000000
--- a/xlators/cluster/stripe/src/Makefile.am
+++ /dev/null
@@ -1,22 +0,0 @@
-xlator_LTLIBRARIES = stripe.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-
-stripe_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
-
-
-stripe_la_SOURCES = stripe.c stripe-helpers.c \
- $(top_builddir)/xlators/lib/src/libxlator.c
-
-stripe_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = stripe.h stripe-mem-types.h \
- $(top_builddir)/xlators/lib/src/libxlator.h
-
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
- -I$(top_srcdir)/xlators/lib/src \
- -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
-
-AM_CFLAGS = -Wall $(GF_CFLAGS)
-
-CLEANFILES =
-
diff --git a/xlators/cluster/stripe/src/stripe-helpers.c b/xlators/cluster/stripe/src/stripe-helpers.c
deleted file mode 100644
index 7301494415d..00000000000
--- a/xlators/cluster/stripe/src/stripe-helpers.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#include <fnmatch.h>
-
-#include "stripe.h"
-#include "byte-order.h"
-#include "mem-types.h"
-#include "logging.h"
-
-void
-stripe_local_wipe(stripe_local_t *local)
-{
- if (!local)
- goto out;
-
- loc_wipe(&local->loc);
- loc_wipe(&local->loc2);
-
- if (local->fd)
- fd_unref(local->fd);
-
- if (local->inode)
- inode_unref(local->inode);
-
- if (local->xattr)
- dict_unref(local->xattr);
-
- if (local->xdata)
- dict_unref(local->xdata);
-
-out:
- return;
-}
-
-int
-stripe_aggregate(dict_t *this, char *key, data_t *value, void *data)
-{
- dict_t *dst = NULL;
- int64_t *ptr = 0, *size = NULL;
- int32_t ret = -1;
-
- dst = data;
-
- if (strcmp(key, QUOTA_SIZE_KEY) == 0) {
- ret = dict_get_bin(dst, key, (void **)&size);
- if (ret < 0) {
- size = GF_CALLOC(1, sizeof(int64_t), gf_common_mt_char);
- if (size == NULL) {
- gf_log("stripe", GF_LOG_WARNING, "memory allocation failed");
- goto out;
- }
- ret = dict_set_bin(dst, key, size, sizeof(int64_t));
- if (ret < 0) {
- gf_log("stripe", GF_LOG_WARNING,
- "stripe aggregate dict set failed");
- GF_FREE(size);
- goto out;
- }
- }
-
- ptr = data_to_bin(value);
- if (ptr == NULL) {
- gf_log("stripe", GF_LOG_WARNING, "data to bin failed");
- goto out;
- }
-
- *size = hton64(ntoh64(*size) + ntoh64(*ptr));
- } else if (strcmp(key, GF_CONTENT_KEY)) {
- /* No need to aggregate 'CONTENT' data */
- ret = dict_set(dst, key, value);
- if (ret)
- gf_log("stripe", GF_LOG_WARNING, "xattr dict set failed");
- }
-
-out:
- return 0;
-}
-
-void
-stripe_aggregate_xattr(dict_t *dst, dict_t *src)
-{
- if ((dst == NULL) || (src == NULL)) {
- goto out;
- }
-
- dict_foreach(src, stripe_aggregate, dst);
-out:
- return;
-}
-
-int32_t
-stripe_xattr_aggregate(char *buffer, stripe_local_t *local, int32_t *total)
-{
- int32_t i = 0;
- int32_t ret = -1;
- int32_t len = 0;
- char *sbuf = NULL;
- stripe_xattr_sort_t *xattr = NULL;
-
- if (!buffer || !local || !local->xattr_list)
- goto out;
-
- sbuf = buffer;
-
- for (i = 0; i < local->nallocs; i++) {
- xattr = local->xattr_list + i;
- len = xattr->xattr_len - 1; /* length includes \0 */
-
- if (len && xattr && xattr->xattr_value) {
- memcpy(buffer, xattr->xattr_value, len);
- buffer += len;
- *buffer++ = ' ';
- }
- }
-
- *--buffer = '\0';
- if (total)
- *total = buffer - sbuf;
- ret = 0;
-
-out:
- return ret;
-}
-
-int32_t
-stripe_free_xattr_str(stripe_local_t *local)
-{
- int32_t i = 0;
- int32_t ret = -1;
- stripe_xattr_sort_t *xattr = NULL;
-
- if (!local || !local->xattr_list)
- goto out;
-
- for (i = 0; i < local->nallocs; i++) {
- xattr = local->xattr_list + i;
-
- if (xattr && xattr->xattr_value)
- GF_FREE(xattr->xattr_value);
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-int32_t
-stripe_fill_lockinfo_xattr(xlator_t *this, stripe_local_t *local,
- void **xattr_serz)
-{
- int32_t ret = -1, i = 0, len = 0;
- dict_t *tmp1 = NULL, *tmp2 = NULL;
- char *buf = NULL;
- stripe_xattr_sort_t *xattr = NULL;
-
- if (xattr_serz == NULL) {
- goto out;
- }
-
- tmp2 = dict_new();
-
- if (tmp2 == NULL) {
- goto out;
- }
-
- for (i = 0; i < local->nallocs; i++) {
- xattr = local->xattr_list + i;
- len = xattr->xattr_len;
-
- if (len && xattr && xattr->xattr_value) {
- ret = dict_reset(tmp2);
- if (ret < 0) {
- gf_log(this->name, GF_LOG_DEBUG, "dict_reset failed (%s)",
- strerror(-ret));
- }
-
- ret = dict_unserialize(xattr->xattr_value, xattr->xattr_len, &tmp2);
- if (ret < 0) {
- gf_log(this->name, GF_LOG_WARNING,
- "dict_unserialize failed (%s)", strerror(-ret));
- ret = -1;
- goto out;
- }
-
- tmp1 = dict_copy(tmp2, tmp1);
- if (tmp1 == NULL) {
- gf_log(this->name, GF_LOG_WARNING, "dict_copy failed (%s)",
- strerror(-ret));
- ret = -1;
- goto out;
- }
- }
- }
-
- len = dict_serialized_length(tmp1);
- if (len > 0) {
- buf = GF_CALLOC(1, len, gf_common_mt_dict_t);
- if (buf == NULL) {
- ret = -1;
- goto out;
- }
-
- ret = dict_serialize(tmp1, buf);
- if (ret < 0) {
- gf_log(this->name, GF_LOG_WARNING, "dict_serialize failed (%s)",
- strerror(-ret));
- GF_FREE(buf);
- ret = -1;
- goto out;
- }
-
- *xattr_serz = buf;
- }
-
- ret = 0;
-out:
- if (tmp1 != NULL) {
- dict_unref(tmp1);
- }
-
- if (tmp2 != NULL) {
- dict_unref(tmp2);
- }
-
- return ret;
-}
-
-int32_t
-stripe_fill_pathinfo_xattr(xlator_t *this, stripe_local_t *local,
- char **xattr_serz)
-{
- int ret = -1;
- int32_t padding = 0;
- int32_t tlen = 0;
- int len = 0;
- char stripe_size_str[20] = {
- 0,
- };
- char *pathinfo_serz = NULL;
-
- if (!local) {
- gf_log(this->name, GF_LOG_ERROR, "Possible NULL deref");
- goto out;
- }
-
- len = snprintf(stripe_size_str, sizeof(stripe_size_str), "%" PRId64,
- (long long)(local->fctx) ? local->fctx->stripe_size : 0);
- if (len < 0 || len >= sizeof(stripe_size_str))
- goto out;
- /* extra bytes for decorations (brackets and <>'s) */
- padding = strlen(this->name) + SLEN(STRIPE_PATHINFO_HEADER) + len + 7;
- local->xattr_total_len += (padding + 2);
-
- pathinfo_serz = GF_MALLOC(local->xattr_total_len, gf_common_mt_char);
- if (!pathinfo_serz)
- goto out;
-
- /* xlator info */
- (void)sprintf(pathinfo_serz, "(<" STRIPE_PATHINFO_HEADER "%s:[%s]> ",
- this->name, stripe_size_str);
-
- ret = stripe_xattr_aggregate(pathinfo_serz + padding, local, &tlen);
- if (ret) {
- gf_log(this->name, GF_LOG_ERROR, "Cannot aggregate pathinfo list");
- GF_FREE(pathinfo_serz);
- goto out;
- }
-
- *(pathinfo_serz + padding + tlen) = ')';
- *(pathinfo_serz + padding + tlen + 1) = '\0';
-
- *xattr_serz = pathinfo_serz;
-
- ret = 0;
-out:
- return ret;
-}
-
-/**
- * stripe_get_matching_bs - Get the matching block size for the given path.
- */
-int32_t
-stripe_get_matching_bs(const char *path, stripe_private_t *priv)
-{
- struct stripe_options *trav = NULL;
- uint64_t block_size = 0;
-
- GF_VALIDATE_OR_GOTO("stripe", priv, out);
- GF_VALIDATE_OR_GOTO("stripe", path, out);
-
- LOCK(&priv->lock);
- {
- block_size = priv->block_size;
- trav = priv->pattern;
- while (trav) {
- if (!fnmatch(trav->path_pattern, path, FNM_NOESCAPE)) {
- block_size = trav->block_size;
- break;
- }
- trav = trav->next;
- }
- }
- UNLOCK(&priv->lock);
-
-out:
- return block_size;
-}
-
-int32_t
-stripe_ctx_handle(xlator_t *this, call_frame_t *prev, stripe_local_t *local,
- dict_t *dict)
-{
- char key[256] = {
- 0,
- };
- data_t *data = NULL;
- int32_t index = 0;
- stripe_private_t *priv = NULL;
-
- priv = this->private;
-
- if (!local->fctx) {
- local->fctx = GF_CALLOC(1, sizeof(stripe_fd_ctx_t),
- gf_stripe_mt_stripe_fd_ctx_t);
- if (!local->fctx) {
- local->op_errno = ENOMEM;
- local->op_ret = -1;
- goto out;
- }
-
- local->fctx->static_array = 0;
- }
- /* Stripe block size */
- sprintf(key, "trusted.%s.stripe-size", this->name);
- data = dict_get(dict, key);
- if (!data) {
- local->xattr_self_heal_needed = 1;
- gf_log(this->name, GF_LOG_ERROR, "Failed to get stripe-size");
- goto out;
- } else {
- if (!local->fctx->stripe_size) {
- local->fctx->stripe_size = data_to_int64(data);
- }
-
- if (local->fctx->stripe_size != data_to_int64(data)) {
- gf_log(this->name, GF_LOG_WARNING,
- "stripe-size mismatch in blocks");
- local->xattr_self_heal_needed = 1;
- }
- }
-
- /* Stripe count */
- sprintf(key, "trusted.%s.stripe-count", this->name);
- data = dict_get(dict, key);
-
- if (!data) {
- local->xattr_self_heal_needed = 1;
- gf_log(this->name, GF_LOG_ERROR, "Failed to get stripe-count");
- goto out;
- }
- if (!local->fctx->xl_array) {
- local->fctx->stripe_count = data_to_int32(data);
- if (!local->fctx->stripe_count) {
- gf_log(this->name, GF_LOG_ERROR, "error with stripe-count xattr");
- local->op_ret = -1;
- local->op_errno = EIO;
- goto out;
- }
-
- local->fctx->xl_array = GF_CALLOC(local->fctx->stripe_count,
- sizeof(xlator_t *),
- gf_stripe_mt_xlator_t);
-
- if (!local->fctx->xl_array) {
- local->op_errno = ENOMEM;
- local->op_ret = -1;
- goto out;
- }
- }
- if (local->fctx->stripe_count != data_to_int32(data)) {
- gf_log(this->name, GF_LOG_ERROR,
- "error with stripe-count xattr (%d != %d)",
- local->fctx->stripe_count, data_to_int32(data));
- local->op_ret = -1;
- local->op_errno = EIO;
- goto out;
- }
-
- /* index */
- sprintf(key, "trusted.%s.stripe-index", this->name);
- data = dict_get(dict, key);
- if (!data) {
- local->xattr_self_heal_needed = 1;
- gf_log(this->name, GF_LOG_ERROR, "Failed to get stripe-index");
- goto out;
- }
- index = data_to_int32(data);
- if (index > priv->child_count) {
- gf_log(this->name, GF_LOG_ERROR, "error with stripe-index xattr (%d)",
- index);
- local->op_ret = -1;
- local->op_errno = EIO;
- goto out;
- }
- if (local->fctx->xl_array) {
- if (!local->fctx->xl_array[index])
- local->fctx->xl_array[index] = prev->this;
- }
-
- sprintf(key, "trusted.%s.stripe-coalesce", this->name);
- data = dict_get(dict, key);
- if (!data) {
- /*
- * The file was probably created prior to coalesce support.
- * Assume non-coalesce mode for this file to maintain backwards
- * compatibility.
- */
- gf_log(this->name, GF_LOG_DEBUG,
- "missing stripe-coalesce "
- "attr, assume non-coalesce mode");
- local->fctx->stripe_coalesce = 0;
- } else {
- local->fctx->stripe_coalesce = data_to_int32(data);
- }
-
-out:
- return 0;
-}
-
-int32_t
-stripe_xattr_request_build(xlator_t *this, dict_t *dict, uint64_t stripe_size,
- uint32_t stripe_count, uint32_t stripe_index,
- uint32_t stripe_coalesce)
-{
- char key[256] = {
- 0,
- };
- int32_t ret = -1;
-
- sprintf(key, "trusted.%s.stripe-size", this->name);
- ret = dict_set_int64(dict, key, stripe_size);
- if (ret) {
- gf_log(this->name, GF_LOG_WARNING, "failed to set %s in xattr_req dict",
- key);
- goto out;
- }
-
- sprintf(key, "trusted.%s.stripe-count", this->name);
- ret = dict_set_int32(dict, key, stripe_count);
- if (ret) {
- gf_log(this->name, GF_LOG_WARNING, "failed to set %s in xattr_req dict",
- key);
- goto out;
- }
-
- sprintf(key, "trusted.%s.stripe-index", this->name);
- ret = dict_set_int32(dict, key, stripe_index);
- if (ret) {
- gf_log(this->name, GF_LOG_WARNING, "failed to set %s in xattr_req dict",
- key);
- goto out;
- }
-
- sprintf(key, "trusted.%s.stripe-coalesce", this->name);
- ret = dict_set_int32(dict, key, stripe_coalesce);
- if (ret) {
- gf_log(this->name, GF_LOG_WARNING, "failed to set %s in xattr_req_dict",
- key);
- goto out;
- }
-out:
- return ret;
-}
-
-static int
-set_default_block_size(stripe_private_t *priv, char *num)
-{
- int ret = -1;
- GF_VALIDATE_OR_GOTO("stripe", THIS, out);
- GF_VALIDATE_OR_GOTO(THIS->name, priv, out);
- GF_VALIDATE_OR_GOTO(THIS->name, num, out);
-
- if (gf_string2bytesize_uint64(num, &priv->block_size) != 0) {
- gf_log(THIS->name, GF_LOG_ERROR, "invalid number format \"%s\"", num);
- goto out;
- }
-
- ret = 0;
-
-out:
- return ret;
-}
-
-int
-set_stripe_block_size(xlator_t *this, stripe_private_t *priv, char *data)
-{
- int ret = -1;
- char *tmp_str = NULL;
- char *tmp_str1 = NULL;
- char *dup_str = NULL;
- char *stripe_str = NULL;
- char *pattern = NULL;
- char *num = NULL;
- struct stripe_options *temp_stripeopt = NULL;
- struct stripe_options *stripe_opt = NULL;
-
- if (!this || !priv || !data)
- goto out;
-
- /* Get the pattern for striping.
- "option block-size *avi:10MB" etc */
- stripe_str = strtok_r(data, ",", &tmp_str);
- while (stripe_str) {
- dup_str = gf_strdup(stripe_str);
- stripe_opt = GF_CALLOC(1, sizeof(struct stripe_options),
- gf_stripe_mt_stripe_options);
- if (!stripe_opt) {
- goto out;
- }
-
- pattern = strtok_r(dup_str, ":", &tmp_str1);
- num = strtok_r(NULL, ":", &tmp_str1);
- if (!num) {
- num = pattern;
- pattern = "*";
- ret = set_default_block_size(priv, num);
- if (ret)
- goto out;
- }
- if (gf_string2bytesize_uint64(num, &stripe_opt->block_size) != 0) {
- gf_log(this->name, GF_LOG_ERROR, "invalid number format \"%s\"",
- num);
- goto out;
- }
-
- if (stripe_opt->block_size < STRIPE_MIN_BLOCK_SIZE) {
- gf_log(this->name, GF_LOG_ERROR,
- "Invalid Block-size: "
- "%s. Should be at least %llu bytes",
- num, STRIPE_MIN_BLOCK_SIZE);
- goto out;
- }
- if (stripe_opt->block_size % 512) {
- gf_log(this->name, GF_LOG_ERROR,
- "Block-size: %s should"
- " be a multiple of 512 bytes",
- num);
- goto out;
- }
-
- memcpy(stripe_opt->path_pattern, pattern, strlen(pattern));
-
- gf_log(this->name, GF_LOG_DEBUG,
- "block-size : pattern %s : size %" PRId64,
- stripe_opt->path_pattern, stripe_opt->block_size);
-
- if (priv->pattern)
- temp_stripeopt = NULL;
- else
- temp_stripeopt = priv->pattern;
-
- stripe_opt->next = temp_stripeopt;
-
- priv->pattern = stripe_opt;
- stripe_opt = NULL;
-
- GF_FREE(dup_str);
- dup_str = NULL;
-
- stripe_str = strtok_r(NULL, ",", &tmp_str);
- }
-
- ret = 0;
-out:
-
- GF_FREE(dup_str);
-
- GF_FREE(stripe_opt);
-
- return ret;
-}
-
-int32_t
-stripe_iatt_merge(struct iatt *from, struct iatt *to)
-{
- if (to->ia_size < from->ia_size)
- to->ia_size = from->ia_size;
- if (to->ia_mtime < from->ia_mtime)
- to->ia_mtime = from->ia_mtime;
- if (to->ia_ctime < from->ia_ctime)
- to->ia_ctime = from->ia_ctime;
- if (to->ia_atime < from->ia_atime)
- to->ia_atime = from->ia_atime;
- return 0;
-}
-
-off_t
-coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count)
-{
- size_t line_size = 0;
- uint64_t stripe_num = 0;
- off_t coalesced_offset = 0;
-
- line_size = stripe_size * stripe_count;
- stripe_num = offset / line_size;
-
- coalesced_offset = (stripe_num * stripe_size) + (offset % stripe_size);
-
- return coalesced_offset;
-}
-
-off_t
-uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count,
- int stripe_index)
-{
- uint64_t nr_full_stripe_chunks = 0, mod = 0;
-
- if (!size)
- return size;
-
- /*
- * Estimate the number of fully written stripes from the
- * local file size. Each stripe_size chunk corresponds to
- * a stripe.
- */
- nr_full_stripe_chunks = (size / stripe_size) * stripe_count;
- mod = size % stripe_size;
-
- if (!mod) {
- /*
- * There is no remainder, thus we could have overestimated
- * the size of the file in terms of chunks. Trim the number
- * of chunks by the following stripe members and leave it
- * up to those nodes to respond with a larger size (if
- * necessary).
- */
- nr_full_stripe_chunks -= stripe_count - (stripe_index + 1);
- size = nr_full_stripe_chunks * stripe_size;
- } else {
- /*
- * There is a remainder and thus we own the last chunk of the
- * file. Add the preceding stripe members of the final stripe
- * along with the remainder to calculate the exact size.
- */
- nr_full_stripe_chunks += stripe_index;
- size = nr_full_stripe_chunks * stripe_size + mod;
- }
-
- return size;
-}
diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h
deleted file mode 100644
index dcbef31212b..00000000000
--- a/xlators/cluster/stripe/src/stripe-mem-types.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef __STRIPE_MEM_TYPES_H__
-#define __STRIPE_MEM_TYPES_H__
-
-#include "mem-types.h"
-
-enum gf_stripe_mem_types_ {
- gf_stripe_mt_iovec = gf_common_mt_end + 1,
- gf_stripe_mt_stripe_replies,
- gf_stripe_mt_stripe_fd_ctx_t,
- gf_stripe_mt_char,
- gf_stripe_mt_int8_t,
- gf_stripe_mt_int32_t,
- gf_stripe_mt_xlator_t,
- gf_stripe_mt_stripe_private_t,
- gf_stripe_mt_stripe_options,
- gf_stripe_mt_xattr_sort_t,
- gf_stripe_mt_end
-};
-#endif
diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c
deleted file mode 100644
index 6c8568b5608..00000000000
--- a/xlators/cluster/stripe/src/stripe.c
+++ /dev/null
@@ -1,5612 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-/**
- * xlators/cluster/stripe:
- * Stripe translator, stripes the data across its child nodes,
- * as per the options given in the volfile. The striping works
- * fairly simple. It writes files at different offset as per
- * calculation. So, 'ls -l' output at the real posix level will
- * show file size bigger than the actual size. But when one does
- * 'df' or 'du <file>', real size of the file on the server is shown.
- *
- * WARNING:
- * Stripe translator can't regenerate data if a child node gets disconnected.
- * So, no 'self-heal' for stripe. Hence the advice, use stripe only when its
- * very much necessary, or else, use it in combination with AFR, to have a
- * backup copy.
- */
-#include <fnmatch.h>
-#include "stripe.h"
-#include "libxlator.h"
-#include "byte-order.h"
-#include "statedump.h"
-
-struct volume_options options[];
-
-int32_t
-stripe_sh_chown_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preop,
- struct iatt *postop, dict_t *xdata)
-{
- int callcnt = -1;
- stripe_local_t *local = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- STRIPE_STACK_DESTROY(frame);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_sh_make_entry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!frame || !frame->local || !cookie || !this) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- STACK_WIND(frame, stripe_sh_chown_cbk, prev->this,
- prev->this->fops->setattr, &local->loc, &local->stbuf,
- (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL);
-
-out:
- return 0;
-}
-
-int32_t
-stripe_entry_self_heal(call_frame_t *frame, xlator_t *this,
- stripe_local_t *local)
-{
- xlator_list_t *trav = NULL;
- call_frame_t *rframe = NULL;
- stripe_local_t *rlocal = NULL;
- stripe_private_t *priv = NULL;
- dict_t *xdata = NULL;
- int ret = 0;
-
- if (!local || !this || !frame) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- if (!(IA_ISREG(local->stbuf.ia_type) || IA_ISDIR(local->stbuf.ia_type)))
- return 0;
-
- priv = this->private;
- trav = this->children;
- rframe = copy_frame(frame);
- if (!rframe) {
- goto out;
- }
- rlocal = mem_get0(this->local_pool);
- if (!rlocal) {
- goto out;
- }
- rframe->local = rlocal;
- rlocal->call_count = priv->child_count;
- loc_copy(&rlocal->loc, &local->loc);
- memcpy(&rlocal->stbuf, &local->stbuf, sizeof(struct iatt));
-
- xdata = dict_new();
- if (!xdata)
- goto out;
-
- ret = dict_set_gfuuid(xdata, "gfid-req", local->stbuf.ia_gfid, true);
- if (ret)
- gf_log(this->name, GF_LOG_WARNING, "%s: failed to set gfid-req",
- local->loc.path);
-
- while (trav) {
- if (IA_ISREG(local->stbuf.ia_type)) {
- STACK_WIND(
- rframe, stripe_sh_make_entry_cbk, trav->xlator,
- trav->xlator->fops->mknod, &local->loc,
- st_mode_from_ia(local->stbuf.ia_prot, local->stbuf.ia_type), 0,
- 0, xdata);
- }
- if (IA_ISDIR(local->stbuf.ia_type)) {
- STACK_WIND(
- rframe, stripe_sh_make_entry_cbk, trav->xlator,
- trav->xlator->fops->mkdir, &local->loc,
- st_mode_from_ia(local->stbuf.ia_prot, local->stbuf.ia_type), 0,
- xdata);
- }
- trav = trav->next;
- }
-
- if (xdata)
- dict_unref(xdata);
- return 0;
-
-out:
- if (rframe)
- STRIPE_STACK_DESTROY(rframe);
-
- return 0;
-}
-
-int32_t
-stripe_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xdata, struct iatt *postparent)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int ret = 0;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- if ((op_errno != ENOENT) && (op_errno != ESTALE))
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- if (local->op_errno != ESTALE)
- local->op_errno = op_errno;
- if (((op_errno != ENOENT) && (op_errno != ENOTCONN) &&
- (op_errno != ESTALE)) ||
- (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- if (op_errno == ENOENT)
- local->entry_self_heal_needed = 1;
- }
-
- if (op_ret >= 0) {
- local->op_ret = 0;
- if (IA_ISREG(buf->ia_type)) {
- ret = stripe_ctx_handle(this, prev, local, xdata);
- if (ret)
- gf_log(this->name, GF_LOG_ERROR,
- "Error getting fctx info from"
- " dict");
- }
-
- if (FIRST_CHILD(this) == prev->this) {
- local->stbuf = *buf;
- local->postparent = *postparent;
- local->inode = inode_ref(inode);
- if (xdata)
- local->xdata = dict_ref(xdata);
- if (local->xattr) {
- stripe_aggregate_xattr(local->xdata, local->xattr);
- dict_unref(local->xattr);
- local->xattr = NULL;
- }
- }
-
- if (!local->xdata && !local->xattr) {
- local->xattr = dict_ref(xdata);
- } else if (local->xdata) {
- stripe_aggregate_xattr(local->xdata, xdata);
- } else if (local->xattr) {
- stripe_aggregate_xattr(local->xattr, xdata);
- }
-
- local->stbuf_blocks += buf->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- correct_file_size(buf, local->fctx, prev);
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
-
- if (gf_uuid_is_null(local->ia_gfid))
- gf_uuid_copy(local->ia_gfid, buf->ia_gfid);
-
- /* Make sure the gfid on all the nodes are same */
- if (gf_uuid_compare(local->ia_gfid, buf->ia_gfid)) {
- gf_log(this->name, GF_LOG_WARNING,
- "%s: gfid different on subvolume %s", local->loc.path,
- prev->this->name);
- }
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->op_ret == 0 && local->entry_self_heal_needed &&
- !gf_uuid_is_null(local->loc.inode->gfid))
- stripe_entry_self_heal(frame, this, local);
-
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->stbuf.ia_blocks = local->stbuf_blocks;
- local->stbuf.ia_size = local->stbuf_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- inode_ctx_put(local->inode, this, (uint64_t)(long)local->fctx);
- }
-
- STRIPE_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xdata,
- &local->postparent);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
- int64_t filesize = 0;
- int ret = 0;
- uint64_t tmpctx = 0;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- loc_copy(&local->loc, loc);
-
- inode_ctx_get(local->inode, this, &tmpctx);
- if (tmpctx)
- local->fctx = (stripe_fd_ctx_t *)(long)tmpctx;
-
- /* quick-read friendly changes */
- if (xdata && dict_get(xdata, GF_CONTENT_KEY)) {
- ret = dict_get_int64(xdata, GF_CONTENT_KEY, &filesize);
- if (!ret && (filesize > priv->block_size))
- dict_del(xdata, GF_CONTENT_KEY);
- }
-
- /* get stripe-size xattr on lookup. This would be required for
- * open/read/write/pathinfo calls. Hence we send down the request
- * even when type == IA_INVAL */
-
- /*
- * We aren't guaranteed to have xdata here. We need the format info for
- * the file, so allocate xdata if necessary.
- */
- if (!xdata)
- xdata = dict_new();
- else
- xdata = dict_ref(xdata);
-
- if (xdata &&
- (IA_ISREG(loc->inode->ia_type) || (loc->inode->ia_type == IA_INVAL))) {
- ret = stripe_xattr_request_build(this, xdata, 8, 4, 4, 0);
- if (ret)
- gf_log(this->name, GF_LOG_ERROR,
- "Failed to build"
- " xattr request for %s",
- loc->path);
- }
-
- /* Every time in stripe lookup, all child nodes
- should be looked up */
- local->call_count = priv->child_count;
- while (trav) {
- STACK_WIND(frame, stripe_lookup_cbk, trav->xlator,
- trav->xlator->fops->lookup, loc, xdata);
- trav = trav->next;
- }
-
- dict_unref(xdata);
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
-
- if (FIRST_CHILD(this) == prev->this) {
- local->stbuf = *buf;
- }
-
- local->stbuf_blocks += buf->ia_blocks;
-
- correct_file_size(buf, local->fctx, prev);
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
-
- STRIPE_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno,
- &local->stbuf, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- if (IA_ISREG(loc->inode->ia_type)) {
- inode_ctx_get(loc->inode, this, (uint64_t *)&fctx);
- if (!fctx)
- goto err;
- local->fctx = fctx;
- }
-
- while (trav) {
- STACK_WIND(frame, stripe_stat_cbk, trav->xlator,
- trav->xlator->fops->stat, loc, NULL);
- trav = trav->next;
- }
-
- return 0;
-
-err:
- STRIPE_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct statvfs *stbuf,
- dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- int32_t callcnt = 0;
-
- if (!this || !frame || !frame->local) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret && (op_errno != ENOTCONN)) {
- local->op_errno = op_errno;
- }
- if (op_ret == 0) {
- struct statvfs *dict_buf = &local->statvfs_buf;
- dict_buf->f_bsize = stbuf->f_bsize;
- dict_buf->f_frsize = stbuf->f_frsize;
- dict_buf->f_blocks += stbuf->f_blocks;
- dict_buf->f_bfree += stbuf->f_bfree;
- dict_buf->f_bavail += stbuf->f_bavail;
- dict_buf->f_files += stbuf->f_files;
- dict_buf->f_ffree += stbuf->f_ffree;
- dict_buf->f_favail += stbuf->f_favail;
- dict_buf->f_fsid = stbuf->f_fsid;
- dict_buf->f_flag = stbuf->f_flag;
- dict_buf->f_namemax = stbuf->f_namemax;
- local->op_ret = 0;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- STRIPE_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
- &local->statvfs_buf, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
-
- trav = this->children;
- priv = this->private;
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- frame->local = local;
-
- local->call_count = priv->child_count;
- while (trav) {
- STACK_WIND(frame, stripe_statfs_cbk, trav->xlator,
- trav->xlator->fops->statfs, loc, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
- if (FIRST_CHILD(this) == prev->this) {
- local->pre_buf = *prebuf;
- local->post_buf = *postbuf;
- }
-
- local->prebuf_blocks += prebuf->ia_blocks;
- local->postbuf_blocks += postbuf->ia_blocks;
-
- correct_file_size(prebuf, local->fctx, prev);
- correct_file_size(postbuf, local->fctx, prev);
-
- if (local->prebuf_size < prebuf->ia_size)
- local->prebuf_size = prebuf->ia_size;
-
- if (local->postbuf_size < postbuf->ia_size)
- local->postbuf_size = postbuf->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->pre_buf.ia_blocks = local->prebuf_blocks;
- local->pre_buf.ia_size = local->prebuf_size;
- local->post_buf.ia_blocks = local->postbuf_blocks;
- local->post_buf.ia_size = local->postbuf_size;
- }
-
- STRIPE_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno,
- &local->pre_buf, &local->post_buf, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
- dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = EINVAL;
- int i, eof_idx;
- off_t dest_offset, tmp_offset;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- inode_ctx_get(loc->inode, this, (uint64_t *)&fctx);
- if (!fctx) {
- gf_log(this->name, GF_LOG_ERROR, "no stripe context");
- op_errno = EINVAL;
- goto err;
- }
-
- local->fctx = fctx;
- eof_idx = (offset / fctx->stripe_size) % fctx->stripe_count;
-
- for (i = 0; i < fctx->stripe_count; i++) {
- if (!fctx->xl_array[i]) {
- gf_log(this->name, GF_LOG_ERROR, "no xlator at index %d", i);
- op_errno = EINVAL;
- goto err;
- }
-
- if (fctx->stripe_coalesce) {
- /*
- * The node that owns EOF is truncated to the exact
- * coalesced offset. Nodes prior to this index should
- * be rounded up to the size of the complete stripe,
- * while nodes after this index should be rounded down
- * to the size of the previous stripe.
- */
- if (i < eof_idx)
- tmp_offset = roof(offset,
- fctx->stripe_size * fctx->stripe_count);
- else if (i > eof_idx)
- tmp_offset = floor(offset,
- fctx->stripe_size * fctx->stripe_count);
- else
- tmp_offset = offset;
-
- dest_offset = coalesced_offset(tmp_offset, fctx->stripe_size,
- fctx->stripe_count);
- } else {
- dest_offset = offset;
- }
-
- STACK_WIND(frame, stripe_truncate_cbk, fctx->xl_array[i],
- fctx->xl_array[i]->fops->truncate, loc, dest_offset, NULL);
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preop,
- struct iatt *postop, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
-
- if (FIRST_CHILD(this) == prev->this) {
- local->pre_buf = *preop;
- local->post_buf = *postop;
- }
-
- local->prebuf_blocks += preop->ia_blocks;
- local->postbuf_blocks += postop->ia_blocks;
-
- correct_file_size(preop, local->fctx, prev);
- correct_file_size(postop, local->fctx, prev);
-
- if (local->prebuf_size < preop->ia_size)
- local->prebuf_size = preop->ia_size;
- if (local->postbuf_size < postop->ia_size)
- local->postbuf_size = postop->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->pre_buf.ia_blocks = local->prebuf_blocks;
- local->pre_buf.ia_size = local->prebuf_size;
- local->post_buf.ia_blocks = local->postbuf_blocks;
- local->post_buf.ia_size = local->postbuf_size;
- }
-
- STRIPE_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
- &local->pre_buf, &local->post_buf, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- if (!IA_ISDIR(loc->inode->ia_type) && !IA_ISREG(loc->inode->ia_type)) {
- local->call_count = 1;
- STACK_WIND(frame, stripe_setattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, NULL);
- return 0;
- }
-
- if (IA_ISREG(loc->inode->ia_type)) {
- inode_ctx_get(loc->inode, this, (uint64_t *)&fctx);
- if (!fctx)
- goto err;
- local->fctx = fctx;
- }
-
- local->call_count = priv->child_count;
- while (trav) {
- STACK_WIND(frame, stripe_setattr_cbk, trav->xlator,
- trav->xlator->fops->setattr, loc, stbuf, valid, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND(frame, stripe_setattr_cbk, trav->xlator,
- trav->xlator->fops->fsetattr, fd, stbuf, valid, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_stack_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent,
- dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
-
- local->stbuf.ia_blocks += buf->ia_blocks;
- local->preparent.ia_blocks += preoldparent->ia_blocks;
- local->postparent.ia_blocks += postoldparent->ia_blocks;
- local->pre_buf.ia_blocks += prenewparent->ia_blocks;
- local->post_buf.ia_blocks += postnewparent->ia_blocks;
-
- correct_file_size(buf, local->fctx, prev);
-
- if (local->stbuf.ia_size < buf->ia_size)
- local->stbuf.ia_size = buf->ia_size;
-
- if (local->preparent.ia_size < preoldparent->ia_size)
- local->preparent.ia_size = preoldparent->ia_size;
-
- if (local->postparent.ia_size < postoldparent->ia_size)
- local->postparent.ia_size = postoldparent->ia_size;
-
- if (local->pre_buf.ia_size < prenewparent->ia_size)
- local->pre_buf.ia_size = prenewparent->ia_size;
-
- if (local->post_buf.ia_size < postnewparent->ia_size)
- local->post_buf.ia_size = postnewparent->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- STRIPE_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preparent,
- &local->postparent, &local->pre_buf,
- &local->post_buf, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_first_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent,
- dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- op_errno = EINVAL;
- goto unwind;
- }
-
- if (op_ret == -1) {
- goto unwind;
- }
-
- local = frame->local;
- trav = this->children;
-
- local->stbuf = *buf;
- local->preparent = *preoldparent;
- local->postparent = *postoldparent;
- local->pre_buf = *prenewparent;
- local->post_buf = *postnewparent;
-
- local->op_ret = 0;
- local->call_count--;
-
- trav = trav->next; /* Skip first child */
- while (trav) {
- STACK_WIND(frame, stripe_stack_rename_cbk, trav->xlator,
- trav->xlator->fops->rename, &local->loc, &local->loc2, NULL);
- trav = trav->next;
- }
- return 0;
-
-unwind:
- STRIPE_STACK_UNWIND(rename, frame, -1, op_errno, buf, preoldparent,
- postoldparent, prenewparent, postnewparent, NULL);
- return 0;
-}
-
-int32_t
-stripe_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
- dict_t *xdata)
-{
- stripe_private_t *priv = NULL;
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(oldloc, err);
- VALIDATE_OR_GOTO(oldloc->path, err);
- VALIDATE_OR_GOTO(oldloc->inode, err);
- VALIDATE_OR_GOTO(newloc, err);
-
- priv = this->private;
- trav = this->children;
-
- /* If any one node is down, don't allow rename */
- if (priv->nodes_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- frame->local = local;
-
- local->op_ret = -1;
- loc_copy(&local->loc, oldloc);
- loc_copy(&local->loc2, newloc);
-
- local->call_count = priv->child_count;
-
- if (IA_ISREG(oldloc->inode->ia_type)) {
- inode_ctx_get(oldloc->inode, this, (uint64_t *)&fctx);
- if (!fctx)
- goto err;
- local->fctx = fctx;
- }
-
- STACK_WIND(frame, stripe_first_rename_cbk, trav->xlator,
- trav->xlator->fops->rename, oldloc, newloc, NULL);
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL, NULL);
- return 0;
-}
-int32_t
-stripe_first_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned %s", prev->this->name,
- strerror(op_errno));
- goto out;
- }
- local->op_ret = 0;
- local->preparent = *preparent;
- local->postparent = *postparent;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- STRIPE_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent, xdata);
- return 0;
-out:
- STRIPE_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-stripe_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned %s", prev->this->name,
- strerror(op_errno));
- local->op_errno = op_errno;
- if (op_errno != ENOENT) {
- local->failed = 1;
- local->op_ret = op_ret;
- }
- }
- }
- UNLOCK(&frame->lock);
-
- if (callcnt == 1) {
- if (local->failed) {
- op_errno = local->op_errno;
- goto out;
- }
- STACK_WIND(frame, stripe_first_unlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag,
- local->xdata);
- }
- return 0;
-out:
- STRIPE_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-stripe_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
- dict_t *xdata)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Don't unlink a file if a node is down */
- if (priv->nodes_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- loc_copy(&local->loc, loc);
- local->xflag = xflag;
-
- if (xdata)
- local->xdata = dict_ref(xdata);
-
- frame->local = local;
- local->call_count = priv->child_count;
- trav = trav->next; /* Skip the first child */
-
- while (trav) {
- STACK_WIND(frame, stripe_unlink_cbk, trav->xlator,
- trav->xlator->fops->unlink, loc, xflag, xdata);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_first_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- op_errno = EINVAL;
- goto err;
- }
-
- if (op_ret == -1) {
- goto err;
- }
-
- local = frame->local;
- local->op_ret = 0;
-
- local->call_count--; /* First child successful */
-
- local->preparent = *preparent;
- local->postparent = *postparent;
- local->preparent_size = preparent->ia_size;
- local->postparent_size = postparent->ia_size;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- STRIPE_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent, xdata);
- return 0;
-err:
- STRIPE_STACK_UNWIND(rmdir, frame, op_ret, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned %s", prev->this->name,
- strerror(op_errno));
- if (op_errno != ENOENT)
- local->failed = 1;
- }
- }
- UNLOCK(&frame->lock);
-
- if (callcnt == 1) {
- if (local->failed)
- goto out;
- STACK_WIND(frame, stripe_first_rmdir_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rmdir, &local->loc, local->flags,
- NULL);
- }
- return 0;
-out:
- STRIPE_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
- dict_t *xdata)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* don't delete a directory if any of the subvolume is down */
- if (priv->nodes_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- loc_copy(&local->loc, loc);
- local->flags = flags;
- local->call_count = priv->child_count;
- trav = trav->next; /* skip the first child */
-
- while (trav) {
- STACK_WIND(frame, stripe_rmdir_cbk, trav->xlator,
- trav->xlator->fops->rmdir, loc, flags, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_mknod_ifreg_fail_unlink_cbk(call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- STRIPE_STACK_UNWIND(mknod, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, &local->preparent,
- &local->postparent, NULL);
- }
-out:
- return 0;
-}
-
-/**
- */
-int32_t
-stripe_mknod_ifreg_setxattr_cbk(call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- priv = this->private;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_ret = -1;
- local->op_errno = op_errno;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->op_ret == -1) {
- local->call_count = priv->child_count;
- while (trav) {
- STACK_WIND(frame, stripe_mknod_ifreg_fail_unlink_cbk,
- trav->xlator, trav->xlator->fops->unlink,
- &local->loc, 0, NULL);
- trav = trav->next;
- }
- return 0;
- }
-
- STRIPE_STACK_UNWIND(mknod, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, &local->preparent,
- &local->postparent, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_mknod_ifreg_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- call_frame_t *prev = NULL;
- xlator_list_t *trav = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- priv = this->private;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- local->op_errno = op_errno;
- }
- if (op_ret >= 0) {
- local->op_ret = op_ret;
-
- /* Can be used as a mechanism to understand if mknod
- was successful in at least one place */
- if (gf_uuid_is_null(local->ia_gfid))
- gf_uuid_copy(local->ia_gfid, buf->ia_gfid);
-
- if (stripe_ctx_handle(this, prev, local, xdata))
- gf_log(this->name, GF_LOG_ERROR,
- "Error getting fctx info from dict");
-
- local->stbuf_blocks += buf->ia_blocks;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- correct_file_size(buf, local->fctx, prev);
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->preparent_size < preparent->ia_size)
- local->preparent_size = preparent->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if ((local->op_ret == -1) && !gf_uuid_is_null(local->ia_gfid)) {
- /* ia_gfid set means, at least on one node 'mknod'
- is successful */
- local->call_count = priv->child_count;
- trav = this->children;
- while (trav) {
- STACK_WIND(frame, stripe_mknod_ifreg_fail_unlink_cbk,
- trav->xlator, trav->xlator->fops->unlink,
- &local->loc, 0, NULL);
- trav = trav->next;
- }
- return 0;
- }
-
- if (local->op_ret != -1) {
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- inode_ctx_put(local->inode, this, (uint64_t)(long)local->fctx);
- }
- STRIPE_STACK_UNWIND(mknod, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, &local->preparent,
- &local->postparent, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_mknod_first_ifreg_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- call_frame_t *prev = NULL;
- xlator_list_t *trav = NULL;
- int i = 1;
- dict_t *dict = NULL;
- int ret = 0;
- int need_unref = 0;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- priv = this->private;
- local = frame->local;
- trav = this->children;
-
- local->call_count--;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->failed = 1;
- local->op_errno = op_errno;
- goto out;
- }
-
- local->op_ret = op_ret;
-
- local->stbuf = *buf;
- local->preparent = *preparent;
- local->postparent = *postparent;
-
- if (gf_uuid_is_null(local->ia_gfid))
- gf_uuid_copy(local->ia_gfid, buf->ia_gfid);
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
-
- trav = trav->next;
- while (trav) {
- if (priv->xattr_supported) {
- dict = dict_new();
- if (!dict) {
- gf_log(this->name, GF_LOG_ERROR, "failed to allocate dict %s",
- local->loc.path);
- }
- need_unref = 1;
-
- dict_copy(local->xattr, dict);
-
- ret = stripe_xattr_request_build(this, dict, local->stripe_size,
- priv->child_count, i,
- priv->coalesce);
- if (ret)
- gf_log(this->name, GF_LOG_ERROR,
- "Failed to build xattr request");
-
- } else {
- dict = local->xattr;
- }
-
- STACK_WIND(frame, stripe_mknod_ifreg_cbk, trav->xlator,
- trav->xlator->fops->mknod, &local->loc, local->mode,
- local->rdev, 0, dict);
- trav = trav->next;
- i++;
-
- if (dict && need_unref)
- dict_unref(dict);
- }
-
- return 0;
-
-out:
-
- STRIPE_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL, NULL,
- NULL);
- return 0;
-}
-
-int32_t
-stripe_single_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STRIPE_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent,
- postparent, xdata);
- return 0;
-}
-
-int
-stripe_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t rdev, mode_t umask, dict_t *xdata)
-{
- stripe_private_t *priv = NULL;
- stripe_local_t *local = NULL;
- int32_t op_errno = EINVAL;
- int32_t i = 0;
- dict_t *dict = NULL;
- int ret = 0;
- int need_unref = 0;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- if (S_ISREG(mode)) {
- /* NOTE: on older kernels (older than 2.6.9),
- creat() fops is sent as mknod() + open(). Hence handling
- S_IFREG files is necessary */
- if (priv->nodes_down) {
- gf_log(this->name, GF_LOG_WARNING, "Some node down, returning EIO");
- op_errno = EIO;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->stripe_size = stripe_get_matching_bs(loc->path, priv);
- frame->local = local;
- local->inode = inode_ref(loc->inode);
- loc_copy(&local->loc, loc);
- local->xattr = dict_copy_with_ref(xdata, NULL);
- local->mode = mode;
- local->umask = umask;
- local->rdev = rdev;
-
- /* Every time in stripe lookup, all child nodes should
- be looked up */
- local->call_count = priv->child_count;
-
- if (priv->xattr_supported) {
- dict = dict_new();
- if (!dict) {
- gf_log(this->name, GF_LOG_ERROR, "failed to allocate dict %s",
- loc->path);
- }
- need_unref = 1;
-
- dict_copy(xdata, dict);
-
- ret = stripe_xattr_request_build(this, dict, local->stripe_size,
- priv->child_count, i,
- priv->coalesce);
- if (ret)
- gf_log(this->name, GF_LOG_ERROR,
- "failed to build xattr request");
- } else {
- dict = xdata;
- }
-
- STACK_WIND(frame, stripe_mknod_first_ifreg_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask,
- dict);
-
- if (dict && need_unref)
- dict_unref(dict);
- return 0;
- }
-
- STACK_WIND(frame, stripe_single_mknod_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL);
- return 0;
-}
-
-int32_t
-stripe_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
-
- if (op_ret >= 0) {
- local->op_ret = 0;
-
- local->stbuf_blocks += buf->ia_blocks;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->preparent_size < preparent->ia_size)
- local->preparent_size = preparent->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed != -1) {
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
- STRIPE_STACK_UNWIND(mkdir, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, &local->preparent,
- &local->postparent, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_first_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
- xlator_list_t *trav = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
- trav = this->children;
-
- local->call_count--; /* first child is successful */
- trav = trav->next; /* skip first child */
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_errno = op_errno;
- goto out;
- }
-
- local->op_ret = 0;
-
- local->inode = inode_ref(inode);
- local->stbuf = *buf;
- local->postparent = *postparent;
- local->preparent = *preparent;
-
- local->stbuf_blocks += buf->ia_blocks;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- local->stbuf_size = buf->ia_size;
- local->preparent_size = preparent->ia_size;
- local->postparent_size = postparent->ia_size;
-
- while (trav) {
- STACK_WIND(frame, stripe_mkdir_cbk, trav->xlator,
- trav->xlator->fops->mkdir, &local->loc, local->mode,
- local->umask, local->xdata);
- trav = trav->next;
- }
- return 0;
-out:
- STRIPE_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL);
-
- return 0;
-}
-
-int
-stripe_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- mode_t umask, dict_t *xdata)
-{
- stripe_private_t *priv = NULL;
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- local->call_count = priv->child_count;
- if (xdata)
- local->xdata = dict_ref(xdata);
- local->mode = mode;
- local->umask = umask;
- loc_copy(&local->loc, loc);
- frame->local = local;
-
- /* Every time in stripe lookup, all child nodes should be looked up */
- STACK_WIND(frame, stripe_first_mkdir_cbk, trav->xlator,
- trav->xlator->fops->mkdir, loc, mode, umask, xdata);
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL);
- return 0;
-}
-
-int32_t
-stripe_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
- stripe_fd_ctx_t *fctx = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
-
- if (op_ret >= 0) {
- local->op_ret = 0;
-
- if (IA_ISREG(inode->ia_type)) {
- inode_ctx_get(inode, this, (uint64_t *)&fctx);
- if (!fctx) {
- gf_log(this->name, GF_LOG_ERROR,
- "failed to get stripe context");
- op_ret = -1;
- op_errno = EINVAL;
- }
- }
-
- if (FIRST_CHILD(this) == prev->this) {
- local->inode = inode_ref(inode);
- local->stbuf = *buf;
- local->postparent = *postparent;
- local->preparent = *preparent;
- }
- local->stbuf_blocks += buf->ia_blocks;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- correct_file_size(buf, fctx, prev);
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->preparent_size < preparent->ia_size)
- local->preparent_size = preparent->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
- STRIPE_STACK_UNWIND(link, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, &local->preparent,
- &local->postparent, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
- dict_t *xdata)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(oldloc, err);
- VALIDATE_OR_GOTO(oldloc->path, err);
- VALIDATE_OR_GOTO(oldloc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* If any one node is down, don't allow link operation */
- if (priv->nodes_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- /* Every time in stripe lookup, all child
- nodes should be looked up */
- while (trav) {
- STACK_WIND(frame, stripe_link_cbk, trav->xlator,
- trav->xlator->fops->link, oldloc, newloc, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL);
- return 0;
-}
-
-int32_t
-stripe_create_fail_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- STRIPE_STACK_UNWIND(create, frame, local->op_ret, local->op_errno,
- local->fd, local->inode, &local->stbuf,
- &local->preparent, &local->postparent, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- call_frame_t *prev = NULL;
- xlator_list_t *trav = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- priv = this->private;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->failed = 1;
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0) {
- if (IA_ISREG(buf->ia_type)) {
- if (stripe_ctx_handle(this, prev, local, xdata))
- gf_log(this->name, GF_LOG_ERROR,
- "Error getting fctx info from "
- "dict");
- }
-
- local->op_ret = op_ret;
-
- local->stbuf_blocks += buf->ia_blocks;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- correct_file_size(buf, local->fctx, prev);
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->preparent_size < preparent->ia_size)
- local->preparent_size = preparent->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret == -1) {
- local->call_count = priv->child_count;
- trav = this->children;
- while (trav) {
- STACK_WIND(frame, stripe_create_fail_unlink_cbk, trav->xlator,
- trav->xlator->fops->unlink, &local->loc, 0, NULL);
- trav = trav->next;
- }
-
- return 0;
- }
-
- if (local->op_ret >= 0) {
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
-
- stripe_copy_xl_array(local->fctx->xl_array, priv->xl_array,
- local->fctx->stripe_count);
- inode_ctx_put(local->inode, this, (uint64_t)local->fctx);
- }
-
- /* Create itself has failed.. so return
- without setxattring */
- STRIPE_STACK_UNWIND(create, frame, local->op_ret, local->op_errno,
- local->fd, local->inode, &local->stbuf,
- &local->preparent, &local->postparent, NULL);
- }
-
-out:
- return 0;
-}
-
-int32_t
-stripe_first_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- call_frame_t *prev = NULL;
- xlator_list_t *trav = NULL;
- int i = 1;
- dict_t *dict = NULL;
- loc_t *loc = NULL;
- int32_t need_unref = 0;
- int32_t ret = -1;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- priv = this->private;
- local = frame->local;
- trav = this->children;
- loc = &local->loc;
-
- --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->failed = 1;
- local->op_errno = op_errno;
- }
-
- local->op_ret = 0;
- /* Get the mapping in inode private */
- /* Get the stat buf right */
- local->stbuf = *buf;
- local->preparent = *preparent;
- local->postparent = *postparent;
-
- local->stbuf_blocks += buf->ia_blocks;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->preparent_size < preparent->ia_size)
- local->preparent_size = preparent->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
-
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret == -1) {
- local->call_count = 1;
- STACK_WIND(frame, stripe_create_fail_unlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink, &local->loc, 0, NULL);
- return 0;
- }
-
- if (local->op_ret >= 0) {
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
-
- /* Send a setxattr request to nodes where the
- files are created */
- trav = trav->next;
- while (trav) {
- if (priv->xattr_supported) {
- dict = dict_new();
- if (!dict) {
- gf_log(this->name, GF_LOG_ERROR, "failed to allocate dict %s",
- loc->path);
- }
- need_unref = 1;
-
- dict_copy(local->xattr, dict);
-
- ret = stripe_xattr_request_build(this, dict, local->stripe_size,
- priv->child_count, i,
- priv->coalesce);
- if (ret)
- gf_log(this->name, GF_LOG_ERROR,
- "failed to build xattr request");
- } else {
- dict = local->xattr;
- }
-
- STACK_WIND(frame, stripe_create_cbk, trav->xlator,
- trav->xlator->fops->create, &local->loc, local->flags,
- local->mode, local->umask, local->fd, dict);
- trav = trav->next;
- if (need_unref && dict)
- dict_unref(dict);
- i++;
- }
-
-out:
- return 0;
-}
-
-/**
- * stripe_create - If a block-size is specified for the 'name', create the
- * file in all the child nodes. If not, create it in only first child.
- *
- * @name- complete path of the file to be created.
- */
-int32_t
-stripe_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
-{
- stripe_private_t *priv = NULL;
- stripe_local_t *local = NULL;
- int32_t op_errno = EINVAL;
- int ret = 0;
- int need_unref = 0;
- int i = 0;
- dict_t *dict = NULL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
-
- /* files created in O_APPEND mode does not allow lseek() on fd */
- flags &= ~O_APPEND;
-
- if (priv->first_child_down || priv->nodes_down) {
- gf_log(this->name, GF_LOG_DEBUG, "First node down, returning EIO");
- op_errno = EIO;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->stripe_size = stripe_get_matching_bs(loc->path, priv);
- frame->local = local;
- local->inode = inode_ref(loc->inode);
- loc_copy(&local->loc, loc);
- local->fd = fd_ref(fd);
- local->flags = flags;
- local->mode = mode;
- local->umask = umask;
- if (xdata)
- local->xattr = dict_ref(xdata);
-
- local->call_count = priv->child_count;
- /* Send a setxattr request to nodes where the
- files are created */
-
- if (priv->xattr_supported) {
- dict = dict_new();
- if (!dict) {
- gf_log(this->name, GF_LOG_ERROR, "failed to allocate dict %s",
- loc->path);
- }
- need_unref = 1;
-
- dict_copy(xdata, dict);
-
- ret = stripe_xattr_request_build(this, dict, local->stripe_size,
- priv->child_count, i, priv->coalesce);
- if (ret)
- gf_log(this->name, GF_LOG_ERROR, "failed to build xattr request");
- } else {
- dict = xdata;
- }
-
- STACK_WIND(frame, stripe_first_create_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
- dict);
-
- if (need_unref && dict)
- dict_unref(dict);
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL, xdata);
- return 0;
-}
-
-int32_t
-stripe_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0)
- local->op_ret = op_ret;
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- STRIPE_STACK_UNWIND(open, frame, local->op_ret, local->op_errno,
- local->fd, xdata);
- }
-out:
- return 0;
-}
-
-/**
- * stripe_open -
- */
-int32_t
-stripe_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- /* files opened in O_APPEND mode does not allow lseek() on fd */
- flags &= ~O_APPEND;
-
- local->fd = fd_ref(fd);
- frame->local = local;
- loc_copy(&local->loc, loc);
-
- /* Striped files */
- local->flags = flags;
- local->call_count = priv->child_count;
- local->stripe_size = stripe_get_matching_bs(loc->path, priv);
-
- while (trav) {
- STACK_WIND(frame, stripe_open_cbk, trav->xlator,
- trav->xlator->fops->open, &local->loc, local->flags,
- local->fd, xdata);
- trav = trav->next;
- }
- return 0;
-err:
- STRIPE_STACK_UNWIND(open, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_ret = -1;
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0)
- local->op_ret = op_ret;
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- STRIPE_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno,
- local->fd, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
- dict_t *xdata)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- frame->local = local;
- local->call_count = priv->child_count;
- local->fd = fd_ref(fd);
-
- while (trav) {
- STACK_WIND(frame, stripe_opendir_cbk, trav->xlator,
- trav->xlator->fops->opendir, loc, fd, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(opendir, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
- if (op_ret >= 0) {
- if (FIRST_CHILD(this) == prev->this) {
- /* First successful call, copy the *lock */
- local->op_ret = op_ret;
- local->lock = *lock;
- }
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
- STRIPE_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
- &local->lock, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
- struct gf_flock *lock, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- trav = this->children;
- priv = this->private;
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND(frame, stripe_lk_cbk, trav->xlator, trav->xlator->fops->lk,
- fd, cmd, lock, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned %s", prev->this->name,
- strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
- if (op_ret >= 0)
- local->op_ret = op_ret;
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- STRIPE_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND(frame, stripe_flush_cbk, trav->xlator,
- trav->xlator->fops->flush, fd, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
- return 0;
-}
-
-int32_t
-stripe_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned %s", prev->this->name,
- strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- if (FIRST_CHILD(this) == prev->this) {
- local->pre_buf = *prebuf;
- local->post_buf = *postbuf;
- }
- local->prebuf_blocks += prebuf->ia_blocks;
- local->postbuf_blocks += postbuf->ia_blocks;
-
- correct_file_size(prebuf, local->fctx, prev);
- correct_file_size(postbuf, local->fctx, prev);
-
- if (local->prebuf_size < prebuf->ia_size)
- local->prebuf_size = prebuf->ia_size;
-
- if (local->postbuf_size < postbuf->ia_size)
- local->postbuf_size = postbuf->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->pre_buf.ia_blocks = local->prebuf_blocks;
- local->pre_buf.ia_size = local->prebuf_size;
- local->post_buf.ia_blocks = local->postbuf_blocks;
- local->post_buf.ia_size = local->postbuf_size;
- }
-
- STRIPE_STACK_UNWIND(fsync, frame, local->op_ret, local->op_errno,
- &local->pre_buf, &local->post_buf, NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
- dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- frame->local = local;
-
- inode_ctx_get(fd->inode, this, (uint64_t *)&fctx);
- if (!fctx) {
- op_errno = EINVAL;
- goto err;
- }
- local->fctx = fctx;
- local->op_ret = -1;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND(frame, stripe_fsync_cbk, trav->xlator,
- trav->xlator->fops->fsync, fd, flags, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
-
- if (FIRST_CHILD(this) == prev->this)
- local->stbuf = *buf;
-
- local->stbuf_blocks += buf->ia_blocks;
-
- correct_file_size(buf, local->fctx, prev);
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
-
- STRIPE_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno,
- &local->stbuf, NULL);
- }
-
-out:
- return 0;
-}
-
-int32_t
-stripe_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- if (IA_ISREG(fd->inode->ia_type)) {
- inode_ctx_get(fd->inode, this, (uint64_t *)&fctx);
- if (!fctx)
- goto err;
- local->fctx = fctx;
- }
-
- while (trav) {
- STACK_WIND(frame, stripe_fstat_cbk, trav->xlator,
- trav->xlator->fops->fstat, fd, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int i, eof_idx;
- off_t dest_offset, tmp_offset;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- priv = this->private;
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- inode_ctx_get(fd->inode, this, (uint64_t *)&fctx);
- if (!fctx) {
- gf_log(this->name, GF_LOG_ERROR, "no stripe context");
- op_errno = EINVAL;
- goto err;
- }
- if (!fctx->stripe_count) {
- gf_log(this->name, GF_LOG_ERROR, "no stripe count");
- op_errno = EINVAL;
- goto err;
- }
-
- local->fctx = fctx;
- eof_idx = (offset / fctx->stripe_size) % fctx->stripe_count;
-
- for (i = 0; i < fctx->stripe_count; i++) {
- if (!fctx->xl_array[i]) {
- gf_log(this->name, GF_LOG_ERROR,
- "no xlator at index "
- "%d",
- i);
- op_errno = EINVAL;
- goto err;
- }
-
- if (fctx->stripe_coalesce) {
- if (i < eof_idx)
- tmp_offset = roof(offset,
- fctx->stripe_size * fctx->stripe_count);
- else if (i > eof_idx)
- tmp_offset = floor(offset,
- fctx->stripe_size * fctx->stripe_count);
- else
- tmp_offset = offset;
-
- dest_offset = coalesced_offset(tmp_offset, fctx->stripe_size,
- fctx->stripe_count);
- } else {
- dest_offset = offset;
- }
-
- STACK_WIND(frame, stripe_truncate_cbk, fctx->xl_array[i],
- fctx->xl_array[i]->fops->ftruncate, fd, dest_offset, NULL);
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG, "%s returned %s", prev->this->name,
- strerror(op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) || (prev->this == FIRST_CHILD(this)))
- local->failed = 1;
- }
- if (op_ret >= 0)
- local->op_ret = op_ret;
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- STRIPE_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno,
- NULL);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
- dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND(frame, stripe_fsyncdir_cbk, trav->xlator,
- trav->xlator->fops->fsyncdir, fd, flags, NULL);
- trav = trav->next;
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL);
- return 0;
-}
-
-int32_t
-stripe_readv_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *xdata)
-{
- int32_t i = 0;
- int32_t callcnt = 0;
- int32_t count = 0;
- stripe_local_t *local = NULL;
- struct iovec *vec = NULL;
- struct iatt tmp_stbuf = {
- 0,
- };
- struct iobref *tmp_iobref = NULL;
- struct iobuf *iobuf = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- local = frame->local;
- prev = cookie;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret != -1) {
- correct_file_size(buf, local->fctx, prev);
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- op_ret = 0;
-
- /* Keep extra space for filling in '\0's */
- vec = GF_CALLOC((local->count * 2), sizeof(struct iovec),
- gf_stripe_mt_iovec);
- if (!vec) {
- op_ret = -1;
- goto done;
- }
-
- for (i = 0; i < local->wind_count; i++) {
- if (local->replies[i].op_ret) {
- memcpy((vec + count), local->replies[i].vector,
- (local->replies[i].count * sizeof(struct iovec)));
- count += local->replies[i].count;
- op_ret += local->replies[i].op_ret;
- }
- if ((local->replies[i].op_ret < local->replies[i].requested_size) &&
- (local->stbuf_size > (local->offset + op_ret))) {
- /* Fill in 0s here */
- vec[count].iov_len = (local->replies[i].requested_size -
- local->replies[i].op_ret);
- iobuf = iobuf_get2(this->ctx->iobuf_pool, vec[count].iov_len);
- if (!iobuf) {
- gf_log(this->name, GF_LOG_ERROR, "Out of memory.");
- op_ret = -1;
- op_errno = ENOMEM;
- goto done;
- }
- memset(iobuf->ptr, 0, vec[count].iov_len);
- vec[count].iov_base = iobuf->ptr;
-
- iobref_add(local->iobref, iobuf);
- iobuf_unref(iobuf);
-
- op_ret += vec[count].iov_len;
- count++;
- }
- GF_FREE(local->replies[i].vector);
- }
-
- /* ENOENT signals EOF to the NFS-server */
- if (op_ret != -1 && op_ret < local->readv_size &&
- (local->offset + op_ret == buf->ia_size))
- op_errno = ENOENT;
-
- /* FIXME: notice that st_ino, and st_dev (gen) will be
- * different than what inode will have. Make sure this doesn't
- * cause any bugs at higher levels */
- memcpy(&tmp_stbuf, &local->replies[0].stbuf, sizeof(struct iatt));
- tmp_stbuf.ia_size = local->stbuf_size;
-
- done:
- GF_FREE(local->replies);
- tmp_iobref = local->iobref;
- STRIPE_STACK_UNWIND(readv, frame, op_ret, op_errno, vec, count,
- &tmp_stbuf, tmp_iobref, NULL);
-
- iobref_unref(tmp_iobref);
- GF_FREE(vec);
- }
-out:
- return 0;
-}
-
-/**
- * stripe_readv_cbk - get all the striped reads, and order it properly, send it
- * to above layer after putting it in a single vector.
- */
-int32_t
-stripe_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iovec *vector,
- int32_t count, struct iatt *stbuf, struct iobref *iobref,
- dict_t *xdata)
-{
- int32_t index = 0;
- int32_t callcnt = 0;
- int32_t final_count = 0;
- int32_t need_to_check_proper_size = 0;
- call_frame_t *mframe = NULL;
- stripe_local_t *mlocal = NULL;
- stripe_local_t *local = NULL;
- struct iovec *final_vec = NULL;
- struct iatt tmp_stbuf = {
- 0,
- };
- struct iatt *tmp_stbuf_p = NULL; // need it for a warning
- struct iobref *tmp_iobref = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto end;
- }
-
- local = frame->local;
- index = local->node_index;
- prev = cookie;
- mframe = local->orig_frame;
- if (!mframe)
- goto out;
-
- mlocal = mframe->local;
- if (!mlocal)
- goto out;
-
- fctx = mlocal->fctx;
-
- LOCK(&mframe->lock);
- {
- mlocal->replies[index].op_ret = op_ret;
- mlocal->replies[index].op_errno = op_errno;
- mlocal->replies[index].requested_size = local->readv_size;
- if (op_ret >= 0) {
- mlocal->replies[index].stbuf = *stbuf;
- mlocal->replies[index].count = count;
- mlocal->replies[index].vector = iov_dup(vector, count);
-
- correct_file_size(stbuf, fctx, prev);
-
- if (local->stbuf_size < stbuf->ia_size)
- local->stbuf_size = stbuf->ia_size;
- local->stbuf_blocks += stbuf->ia_blocks;
-
- if (!mlocal->iobref)
- mlocal->iobref = iobref_new();
- iobref_merge(mlocal->iobref, iobref);
- }
- callcnt = ++mlocal->call_count;
- }
- UNLOCK(&mframe->lock);
-
- if (callcnt == mlocal->wind_count) {
- op_ret = 0;
-
- for (index = 0; index < mlocal->wind_count; index++) {
- /* check whether each stripe returned
- * 'expected' number of bytes */
- if (mlocal->replies[index].op_ret == -1) {
- op_ret = -1;
- op_errno = mlocal->replies[index].op_errno;
- break;
- }
- /* TODO: handle the 'holes' within the read range
- properly */
- if (mlocal->replies[index].op_ret <
- mlocal->replies[index].requested_size) {
- need_to_check_proper_size = 1;
- }
-
- op_ret += mlocal->replies[index].op_ret;
- mlocal->count += mlocal->replies[index].count;
- }
- if (op_ret == -1)
- goto done;
- if (need_to_check_proper_size)
- goto check_size;
-
- final_vec = GF_CALLOC(mlocal->count, sizeof(struct iovec),
- gf_stripe_mt_iovec);
-
- if (!final_vec) {
- op_ret = -1;
- goto done;
- }
-
- for (index = 0; index < mlocal->wind_count; index++) {
- memcpy((final_vec + final_count), mlocal->replies[index].vector,
- (mlocal->replies[index].count * sizeof(struct iovec)));
- final_count += mlocal->replies[index].count;
- GF_FREE(mlocal->replies[index].vector);
- }
-
- /* FIXME: notice that st_ino, and st_dev (gen) will be
- * different than what inode will have. Make sure this doesn't
- * cause any bugs at higher levels */
- memcpy(&tmp_stbuf, &mlocal->replies[0].stbuf, sizeof(struct iatt));
- tmp_stbuf.ia_size = local->stbuf_size;
- tmp_stbuf.ia_blocks = local->stbuf_blocks;
-
- done:
- /* */
- GF_FREE(mlocal->replies);
- tmp_iobref = mlocal->iobref;
- /* work around for nfs truncated read. Bug 3774 */
- tmp_stbuf_p = &tmp_stbuf;
- WIPE(tmp_stbuf_p);
- STRIPE_STACK_UNWIND(readv, mframe, op_ret, op_errno, final_vec,
- final_count, &tmp_stbuf, tmp_iobref, NULL);
-
- iobref_unref(tmp_iobref);
- GF_FREE(final_vec);
- }
-
- goto out;
-
-check_size:
- mlocal->call_count = fctx->stripe_count;
-
- for (index = 0; index < fctx->stripe_count; index++) {
- STACK_WIND(mframe, stripe_readv_fstat_cbk, (fctx->xl_array[index]),
- (fctx->xl_array[index])->fops->fstat, mlocal->fd, NULL);
- }
-
-out:
- STRIPE_STACK_DESTROY(frame);
-end:
- return 0;
-}
-
-int32_t
-stripe_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, uint32_t flags, dict_t *xdata)
-{
- int32_t op_errno = EINVAL;
- int32_t idx = 0;
- int32_t index = 0;
- int32_t num_stripe = 0;
- int32_t off_index = 0;
- size_t frame_size = 0;
- off_t rounded_end = 0;
- uint64_t tmp_fctx = 0;
- uint64_t stripe_size = 0;
- off_t rounded_start = 0;
- off_t frame_offset = offset;
- off_t dest_offset = 0;
- stripe_local_t *local = NULL;
- call_frame_t *rframe = NULL;
- stripe_local_t *rlocal = NULL;
- stripe_fd_ctx_t *fctx = NULL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- inode_ctx_get(fd->inode, this, &tmp_fctx);
- if (!tmp_fctx) {
- op_errno = EBADFD;
- goto err;
- }
- fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
- stripe_size = fctx->stripe_size;
-
- STRIPE_VALIDATE_FCTX(fctx, err);
-
- if (!stripe_size) {
- gf_log(this->name, GF_LOG_DEBUG, "Wrong stripe size for the file");
- goto err;
- }
- /* The file is stripe across the child nodes. Send the read request
- * to the child nodes appropriately after checking which region of
- * the file is in which child node. Always '0-<stripe_size>' part of
- * the file resides in the first child.
- */
- rounded_start = floor(offset, stripe_size);
- rounded_end = roof(offset + size, stripe_size);
- num_stripe = (rounded_end - rounded_start) / stripe_size;
-
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- frame->local = local;
-
- /* This is where all the vectors should be copied. */
- local->replies = GF_CALLOC(num_stripe, sizeof(struct stripe_replies),
- gf_stripe_mt_stripe_replies);
- if (!local->replies) {
- op_errno = ENOMEM;
- goto err;
- }
-
- off_index = (offset / stripe_size) % fctx->stripe_count;
- local->wind_count = num_stripe;
- local->readv_size = size;
- local->offset = offset;
- local->fd = fd_ref(fd);
- local->fctx = fctx;
-
- for (index = off_index; index < (num_stripe + off_index); index++) {
- rframe = copy_frame(frame);
- rlocal = mem_get0(this->local_pool);
- if (!rlocal) {
- op_errno = ENOMEM;
- goto err;
- }
-
- frame_size = min(roof(frame_offset + 1, stripe_size), (offset + size)) -
- frame_offset;
-
- rlocal->node_index = index - off_index;
- rlocal->orig_frame = frame;
- rlocal->readv_size = frame_size;
- rframe->local = rlocal;
- idx = (index % fctx->stripe_count);
-
- if (fctx->stripe_coalesce)
- dest_offset = coalesced_offset(frame_offset, stripe_size,
- fctx->stripe_count);
- else
- dest_offset = frame_offset;
-
- STACK_WIND(rframe, stripe_readv_cbk, fctx->xl_array[idx],
- fctx->xl_array[idx]->fops->readv, fd, frame_size,
- dest_offset, flags, xdata);
-
- frame_offset += frame_size;
- }
-
- return 0;
-err:
- if (rframe)
- STRIPE_STACK_DESTROY(rframe);
-
- STRIPE_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- stripe_local_t *mlocal = NULL;
- call_frame_t *prev = NULL;
- call_frame_t *mframe = NULL;
- struct stripe_replies *reply = NULL;
- int32_t i = 0;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
- mframe = local->orig_frame;
- mlocal = mframe->local;
-
- LOCK(&frame->lock);
- {
- callcnt = ++mlocal->call_count;
-
- mlocal->replies[local->node_index].op_ret = op_ret;
- mlocal->replies[local->node_index].op_errno = op_errno;
-
- if (op_ret >= 0) {
- mlocal->post_buf = *postbuf;
- mlocal->pre_buf = *prebuf;
-
- mlocal->prebuf_blocks += prebuf->ia_blocks;
- mlocal->postbuf_blocks += postbuf->ia_blocks;
-
- correct_file_size(prebuf, mlocal->fctx, prev);
- correct_file_size(postbuf, mlocal->fctx, prev);
-
- if (mlocal->prebuf_size < prebuf->ia_size)
- mlocal->prebuf_size = prebuf->ia_size;
- if (mlocal->postbuf_size < postbuf->ia_size)
- mlocal->postbuf_size = postbuf->ia_size;
- }
- }
- UNLOCK(&frame->lock);
-
- if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
- mlocal->pre_buf.ia_size = mlocal->prebuf_size;
- mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
- mlocal->post_buf.ia_size = mlocal->postbuf_size;
- mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
-
- /*
- * Only return the number of consecutively written bytes up until
- * the first error. Only return an error if it occurs first.
- *
- * When a short write occurs, the application should retry at the
- * appropriate offset, at which point we'll potentially pass back
- * the error.
- */
- for (i = 0, reply = mlocal->replies; i < mlocal->wind_count;
- i++, reply++) {
- if (reply->op_ret == -1) {
- gf_log(this->name, GF_LOG_DEBUG,
- "reply %d "
- "returned error %s",
- i, strerror(reply->op_errno));
- if (!mlocal->op_ret) {
- mlocal->op_ret = -1;
- mlocal->op_errno = reply->op_errno;
- }
- break;
- }
-
- mlocal->op_ret += reply->op_ret;
-
- if (reply->op_ret < reply->requested_size)
- break;
- }
-
- GF_FREE(mlocal->replies);
-
- STRIPE_STACK_UNWIND(writev, mframe, mlocal->op_ret, mlocal->op_errno,
- &mlocal->pre_buf, &mlocal->post_buf, NULL);
- }
-out:
- if (frame)
- STRIPE_STACK_DESTROY(frame);
- return 0;
-}
-
-int32_t
-stripe_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
- struct iobref *iobref, dict_t *xdata)
-{
- struct iovec *tmp_vec = NULL;
- stripe_local_t *local = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = 1;
- int32_t idx = 0;
- int32_t total_size = 0;
- int32_t offset_offset = 0;
- int32_t remaining_size = 0;
- int32_t tmp_count = count;
- off_t fill_size = 0;
- uint64_t stripe_size = 0;
- uint64_t tmp_fctx = 0;
- off_t dest_offset = 0;
- off_t rounded_start = 0;
- off_t rounded_end = 0;
- int32_t total_chunks = 0;
- call_frame_t *wframe = NULL;
- stripe_local_t *wlocal = NULL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- inode_ctx_get(fd->inode, this, &tmp_fctx);
- if (!tmp_fctx) {
- op_errno = EINVAL;
- goto err;
- }
- fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
- stripe_size = fctx->stripe_size;
-
- STRIPE_VALIDATE_FCTX(fctx, err);
-
- /* File has to be stripped across the child nodes */
- total_size = iov_length(vector, count);
- remaining_size = total_size;
-
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- frame->local = local;
- local->stripe_size = stripe_size;
- local->fctx = fctx;
-
- if (!stripe_size) {
- gf_log(this->name, GF_LOG_DEBUG, "Wrong stripe size for the file");
- op_errno = EINVAL;
- goto err;
- }
-
- rounded_start = floor(offset, stripe_size);
- rounded_end = roof(offset + total_size, stripe_size);
- total_chunks = (rounded_end - rounded_start) / stripe_size;
- local->replies = GF_CALLOC(total_chunks, sizeof(struct stripe_replies),
- gf_stripe_mt_stripe_replies);
- if (!local->replies) {
- op_errno = ENOMEM;
- goto err;
- }
-
- total_chunks = 0;
- while (1) {
- wframe = copy_frame(frame);
- wlocal = mem_get0(this->local_pool);
- if (!wlocal) {
- op_errno = ENOMEM;
- goto err;
- }
- wlocal->orig_frame = frame;
- wframe->local = wlocal;
-
- /* Send striped chunk of the vector to child
- nodes appropriately. */
- idx = (((offset + offset_offset) / local->stripe_size) %
- fctx->stripe_count);
-
- fill_size = (local->stripe_size -
- ((offset + offset_offset) % local->stripe_size));
- if (fill_size > remaining_size)
- fill_size = remaining_size;
-
- remaining_size -= fill_size;
-
- tmp_count = iov_subset(vector, count, offset_offset,
- offset_offset + fill_size, NULL);
- tmp_vec = GF_CALLOC(tmp_count, sizeof(struct iovec),
- gf_stripe_mt_iovec);
- if (!tmp_vec) {
- op_errno = ENOMEM;
- goto err;
- }
- tmp_count = iov_subset(vector, count, offset_offset,
- offset_offset + fill_size, tmp_vec);
-
- local->wind_count++;
- if (remaining_size == 0)
- local->unwind = 1;
-
- /*
- * Store off the request index (with respect to the chunk of the
- * initial offset) and the size of the request. This is required
- * in the callback to calculate an appropriate return value in
- * the event of a write failure in one or more requests.
- */
- wlocal->node_index = total_chunks;
- local->replies[total_chunks].requested_size = fill_size;
-
- dest_offset = offset + offset_offset;
- if (fctx->stripe_coalesce)
- dest_offset = coalesced_offset(dest_offset, local->stripe_size,
- fctx->stripe_count);
-
- STACK_WIND(wframe, stripe_writev_cbk, fctx->xl_array[idx],
- fctx->xl_array[idx]->fops->writev, fd, tmp_vec, tmp_count,
- dest_offset, flags, iobref, xdata);
-
- GF_FREE(tmp_vec);
- offset_offset += fill_size;
- total_chunks++;
- if (remaining_size == 0)
- break;
- }
-
- return 0;
-err:
- if (wframe)
- STRIPE_STACK_DESTROY(wframe);
-
- STRIPE_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- stripe_local_t *mlocal = NULL;
- call_frame_t *prev = NULL;
- call_frame_t *mframe = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
- mframe = local->orig_frame;
- mlocal = mframe->local;
-
- LOCK(&frame->lock);
- {
- callcnt = ++mlocal->call_count;
-
- if (op_ret == 0) {
- mlocal->post_buf = *postbuf;
- mlocal->pre_buf = *prebuf;
-
- mlocal->prebuf_blocks += prebuf->ia_blocks;
- mlocal->postbuf_blocks += postbuf->ia_blocks;
-
- correct_file_size(prebuf, mlocal->fctx, prev);
- correct_file_size(postbuf, mlocal->fctx, prev);
-
- if (mlocal->prebuf_size < prebuf->ia_size)
- mlocal->prebuf_size = prebuf->ia_size;
- if (mlocal->postbuf_size < postbuf->ia_size)
- mlocal->postbuf_size = postbuf->ia_size;
- }
-
- /* return the first failure */
- if (mlocal->op_ret == 0) {
- mlocal->op_ret = op_ret;
- mlocal->op_errno = op_errno;
- }
- }
- UNLOCK(&frame->lock);
-
- if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
- mlocal->pre_buf.ia_size = mlocal->prebuf_size;
- mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
- mlocal->post_buf.ia_size = mlocal->postbuf_size;
- mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
-
- STRIPE_STACK_UNWIND(fallocate, mframe, mlocal->op_ret, mlocal->op_errno,
- &mlocal->pre_buf, &mlocal->post_buf, NULL);
- }
-out:
- if (frame)
- STRIPE_STACK_DESTROY(frame);
- return 0;
-}
-
-int32_t
-stripe_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
- off_t offset, size_t len, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = 1;
- int32_t idx = 0;
- int32_t offset_offset = 0;
- int32_t remaining_size = 0;
- off_t fill_size = 0;
- uint64_t stripe_size = 0;
- uint64_t tmp_fctx = 0;
- off_t dest_offset = 0;
- call_frame_t *fframe = NULL;
- stripe_local_t *flocal = NULL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- inode_ctx_get(fd->inode, this, &tmp_fctx);
- if (!tmp_fctx) {
- op_errno = EINVAL;
- goto err;
- }
- fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
- stripe_size = fctx->stripe_size;
-
- STRIPE_VALIDATE_FCTX(fctx, err);
-
- remaining_size = len;
-
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- frame->local = local;
- local->stripe_size = stripe_size;
- local->fctx = fctx;
-
- if (!stripe_size) {
- gf_log(this->name, GF_LOG_DEBUG, "Wrong stripe size for the file");
- op_errno = EINVAL;
- goto err;
- }
-
- while (1) {
- fframe = copy_frame(frame);
- flocal = mem_get0(this->local_pool);
- if (!flocal) {
- op_errno = ENOMEM;
- goto err;
- }
- flocal->orig_frame = frame;
- fframe->local = flocal;
-
- /* send fallocate request to the associated child node */
- idx = (((offset + offset_offset) / local->stripe_size) %
- fctx->stripe_count);
-
- fill_size = (local->stripe_size -
- ((offset + offset_offset) % local->stripe_size));
- if (fill_size > remaining_size)
- fill_size = remaining_size;
-
- remaining_size -= fill_size;
-
- local->wind_count++;
- if (remaining_size == 0)
- local->unwind = 1;
-
- dest_offset = offset + offset_offset;
- if (fctx->stripe_coalesce)
- dest_offset = coalesced_offset(dest_offset, local->stripe_size,
- fctx->stripe_count);
-
- /*
- * TODO: Create a separate handler for coalesce mode that sends a
- * single fallocate per-child (since the ranges are linear).
- */
- STACK_WIND(fframe, stripe_fallocate_cbk, fctx->xl_array[idx],
- fctx->xl_array[idx]->fops->fallocate, fd, mode, dest_offset,
- fill_size, xdata);
-
- offset_offset += fill_size;
- if (remaining_size == 0)
- break;
- }
-
- return 0;
-err:
- if (fframe)
- STRIPE_STACK_DESTROY(fframe);
-
- STRIPE_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- stripe_local_t *mlocal = NULL;
- call_frame_t *prev = NULL;
- call_frame_t *mframe = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
- mframe = local->orig_frame;
- mlocal = mframe->local;
-
- LOCK(&frame->lock);
- {
- callcnt = ++mlocal->call_count;
-
- if (op_ret == 0) {
- mlocal->post_buf = *postbuf;
- mlocal->pre_buf = *prebuf;
-
- mlocal->prebuf_blocks += prebuf->ia_blocks;
- mlocal->postbuf_blocks += postbuf->ia_blocks;
-
- correct_file_size(prebuf, mlocal->fctx, prev);
- correct_file_size(postbuf, mlocal->fctx, prev);
-
- if (mlocal->prebuf_size < prebuf->ia_size)
- mlocal->prebuf_size = prebuf->ia_size;
- if (mlocal->postbuf_size < postbuf->ia_size)
- mlocal->postbuf_size = postbuf->ia_size;
- }
-
- /* return the first failure */
- if (mlocal->op_ret == 0) {
- mlocal->op_ret = op_ret;
- mlocal->op_errno = op_errno;
- }
- }
- UNLOCK(&frame->lock);
-
- if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
- mlocal->pre_buf.ia_size = mlocal->prebuf_size;
- mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
- mlocal->post_buf.ia_size = mlocal->postbuf_size;
- mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
-
- STRIPE_STACK_UNWIND(discard, mframe, mlocal->op_ret, mlocal->op_errno,
- &mlocal->pre_buf, &mlocal->post_buf, NULL);
- }
-out:
- if (frame)
- STRIPE_STACK_DESTROY(frame);
-
- return 0;
-}
-
-int32_t
-stripe_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- size_t len, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = 1;
- int32_t idx = 0;
- int32_t offset_offset = 0;
- int32_t remaining_size = 0;
- off_t fill_size = 0;
- uint64_t stripe_size = 0;
- uint64_t tmp_fctx = 0;
- off_t dest_offset = 0;
- call_frame_t *fframe = NULL;
- stripe_local_t *flocal = NULL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- inode_ctx_get(fd->inode, this, &tmp_fctx);
- if (!tmp_fctx) {
- op_errno = EINVAL;
- goto err;
- }
- fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
- stripe_size = fctx->stripe_size;
-
- STRIPE_VALIDATE_FCTX(fctx, err);
-
- remaining_size = len;
-
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- frame->local = local;
- local->stripe_size = stripe_size;
- local->fctx = fctx;
-
- if (!stripe_size) {
- gf_log(this->name, GF_LOG_DEBUG, "Wrong stripe size for the file");
- op_errno = EINVAL;
- goto err;
- }
-
- while (1) {
- fframe = copy_frame(frame);
- flocal = mem_get0(this->local_pool);
- if (!flocal) {
- op_errno = ENOMEM;
- goto err;
- }
- flocal->orig_frame = frame;
- fframe->local = flocal;
-
- /* send discard request to the associated child node */
- idx = (((offset + offset_offset) / local->stripe_size) %
- fctx->stripe_count);
-
- fill_size = (local->stripe_size -
- ((offset + offset_offset) % local->stripe_size));
- if (fill_size > remaining_size)
- fill_size = remaining_size;
-
- remaining_size -= fill_size;
-
- local->wind_count++;
- if (remaining_size == 0)
- local->unwind = 1;
-
- dest_offset = offset + offset_offset;
- if (fctx->stripe_coalesce)
- dest_offset = coalesced_offset(dest_offset, local->stripe_size,
- fctx->stripe_count);
-
- /*
- * TODO: Create a separate handler for coalesce mode that sends a
- * single discard per-child (since the ranges are linear).
- */
- STACK_WIND(fframe, stripe_discard_cbk, fctx->xl_array[idx],
- fctx->xl_array[idx]->fops->discard, fd, dest_offset,
- fill_size, xdata);
-
- offset_offset += fill_size;
- if (remaining_size == 0)
- break;
- }
-
- return 0;
-err:
- if (fframe)
- STRIPE_STACK_DESTROY(fframe);
-
- STRIPE_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- stripe_local_t *mlocal = NULL;
- call_frame_t *prev = NULL;
- call_frame_t *mframe = NULL;
-
- GF_ASSERT(frame);
-
- if (!this || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
- mframe = local->orig_frame;
- mlocal = mframe->local;
-
- LOCK(&frame->lock);
- {
- callcnt = ++mlocal->call_count;
-
- if (op_ret == 0) {
- mlocal->post_buf = *postbuf;
- mlocal->pre_buf = *prebuf;
-
- mlocal->prebuf_blocks += prebuf->ia_blocks;
- mlocal->postbuf_blocks += postbuf->ia_blocks;
-
- correct_file_size(prebuf, mlocal->fctx, prev);
- correct_file_size(postbuf, mlocal->fctx, prev);
-
- if (mlocal->prebuf_size < prebuf->ia_size)
- mlocal->prebuf_size = prebuf->ia_size;
- if (mlocal->postbuf_size < postbuf->ia_size)
- mlocal->postbuf_size = postbuf->ia_size;
- }
-
- /* return the first failure */
- if (mlocal->op_ret == 0) {
- mlocal->op_ret = op_ret;
- mlocal->op_errno = op_errno;
- }
- }
- UNLOCK(&frame->lock);
-
- if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
- mlocal->pre_buf.ia_size = mlocal->prebuf_size;
- mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
- mlocal->post_buf.ia_size = mlocal->postbuf_size;
- mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
-
- STRIPE_STACK_UNWIND(zerofill, mframe, mlocal->op_ret, mlocal->op_errno,
- &mlocal->pre_buf, &mlocal->post_buf, NULL);
- }
-out:
- STRIPE_STACK_DESTROY(frame);
- return 0;
-}
-
-int32_t
-stripe_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- off_t len, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = 1;
- int32_t idx = 0;
- int32_t offset_offset = 0;
- int32_t remaining_size = 0;
- off_t fill_size = 0;
- uint64_t stripe_size = 0;
- uint64_t tmp_fctx = 0;
- off_t dest_offset = 0;
- call_frame_t *fframe = NULL;
- stripe_local_t *flocal = NULL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
-
- inode_ctx_get(fd->inode, this, &tmp_fctx);
- if (!tmp_fctx) {
- op_errno = EINVAL;
- goto err;
- }
- fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
- stripe_size = fctx->stripe_size;
-
- STRIPE_VALIDATE_FCTX(fctx, err);
-
- remaining_size = len;
-
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- frame->local = local;
- local->stripe_size = stripe_size;
- local->fctx = fctx;
-
- if (!stripe_size) {
- gf_log(this->name, GF_LOG_DEBUG, "Wrong stripe size for the file");
- op_errno = EINVAL;
- goto err;
- }
-
- while (1) {
- fframe = copy_frame(frame);
- flocal = mem_get0(this->local_pool);
- if (!flocal) {
- op_errno = ENOMEM;
- goto err;
- }
- flocal->orig_frame = frame;
- fframe->local = flocal;
-
- idx = (((offset + offset_offset) / local->stripe_size) %
- fctx->stripe_count);
-
- fill_size = (local->stripe_size -
- ((offset + offset_offset) % local->stripe_size));
- if (fill_size > remaining_size)
- fill_size = remaining_size;
-
- remaining_size -= fill_size;
-
- local->wind_count++;
- if (remaining_size == 0)
- local->unwind = 1;
-
- dest_offset = offset + offset_offset;
- if (fctx->stripe_coalesce)
- dest_offset = coalesced_offset(dest_offset, local->stripe_size,
- fctx->stripe_count);
-
- STACK_WIND(fframe, stripe_zerofill_cbk, fctx->xl_array[idx],
- fctx->xl_array[idx]->fops->zerofill, fd, dest_offset,
- fill_size, xdata);
- offset_offset += fill_size;
- if (remaining_size == 0)
- break;
- }
-
- return 0;
-err:
- if (fframe)
- STRIPE_STACK_DESTROY(fframe);
-
- STRIPE_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- gf_seek_what_t what, dict_t *xdata)
-{
- /* TBD */
- gf_log(this->name, GF_LOG_INFO, "seek called on %s.",
- uuid_utoa(fd->inode->gfid));
- STRIPE_STACK_UNWIND(seek, frame, -1, ENOTSUP, 0, NULL);
- return 0;
-}
-
-int32_t
-stripe_release(xlator_t *this, fd_t *fd)
-{
- return 0;
-}
-
-int
-stripe_forget(xlator_t *this, inode_t *inode)
-{
- uint64_t tmp_fctx = 0;
- stripe_fd_ctx_t *fctx = NULL;
-
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(inode, err);
-
- (void)inode_ctx_del(inode, this, &tmp_fctx);
- if (!tmp_fctx) {
- goto err;
- }
-
- fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
-
- if (!fctx->static_array)
- GF_FREE(fctx->xl_array);
-
- GF_FREE(fctx);
-err:
- return 0;
-}
-
-int32_t
-notify(xlator_t *this, int32_t event, void *data, ...)
-{
- stripe_private_t *priv = NULL;
- int down_client = 0;
- int i = 0;
- gf_boolean_t heard_from_all_children = _gf_false;
-
- if (!this)
- return 0;
-
- priv = this->private;
- if (!priv)
- return 0;
-
- switch (event) {
- case GF_EVENT_CHILD_UP: {
- /* get an index number to set */
- for (i = 0; i < priv->child_count; i++) {
- if (data == priv->xl_array[i])
- break;
- }
-
- if (priv->child_count == i) {
- gf_log(this->name, GF_LOG_ERROR,
- "got GF_EVENT_CHILD_UP bad subvolume %s",
- data ? ((xlator_t *)data)->name : NULL);
- break;
- }
-
- LOCK(&priv->lock);
- {
- if (data == FIRST_CHILD(this))
- priv->first_child_down = 0;
- priv->last_event[i] = event;
- }
- UNLOCK(&priv->lock);
- } break;
- case GF_EVENT_CHILD_CONNECTING: {
- // 'CONNECTING' doesn't ensure its CHILD_UP, so do nothing
- goto out;
- }
- case GF_EVENT_CHILD_DOWN: {
- /* get an index number to set */
- for (i = 0; i < priv->child_count; i++) {
- if (data == priv->xl_array[i])
- break;
- }
-
- if (priv->child_count == i) {
- gf_log(this->name, GF_LOG_ERROR,
- "got GF_EVENT_CHILD_DOWN bad subvolume %s",
- data ? ((xlator_t *)data)->name : NULL);
- break;
- }
-
- LOCK(&priv->lock);
- {
- if (data == FIRST_CHILD(this))
- priv->first_child_down = 1;
- priv->last_event[i] = event;
- }
- UNLOCK(&priv->lock);
- } break;
-
- default: {
- /* */
- default_notify(this, event, data);
- goto out;
- } break;
- }
-
- // Consider child as down if it's last_event is not CHILD_UP
- for (i = 0, down_client = 0; i < priv->child_count; i++)
- if (priv->last_event[i] != GF_EVENT_CHILD_UP)
- down_client++;
-
- LOCK(&priv->lock);
- {
- priv->nodes_down = down_client;
- }
- UNLOCK(&priv->lock);
-
- heard_from_all_children = _gf_true;
- for (i = 0; i < priv->child_count; i++)
- if (!priv->last_event[i])
- heard_from_all_children = _gf_false;
-
- if (heard_from_all_children)
- default_notify(this, event, data);
-out:
- return 0;
-}
-
-int
-stripe_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
-{
- int ret = -1;
- int call_cnt = 0;
- stripe_local_t *local = NULL;
-
- if (!frame || !frame->local || !this) {
- gf_log("", GF_LOG_ERROR, "Possible NULL deref");
- return ret;
- }
-
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- call_cnt = --local->wind_count;
-
- /**
- * We overwrite ->op_* values here for subsequent failure
- * conditions, hence we propagate the last errno down the
- * stack.
- */
- if (op_ret < 0) {
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- goto unlock;
- }
- }
-
-unlock:
- UNLOCK(&frame->lock);
-
- if (!call_cnt) {
- STRIPE_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
- xdata);
- }
-
- return 0;
-}
-
-#ifdef HAVE_BD_XLATOR
-int
-stripe_is_bd(dict_t *this, char *key, data_t *value, void *data)
-{
- gf_boolean_t *is_bd = data;
-
- if (data == NULL)
- return 0;
-
- if (XATTR_IS_BD(key))
- *is_bd = _gf_true;
-
- return 0;
-}
-
-static gf_boolean_t
-stripe_setxattr_is_bd(dict_t *dict)
-{
- gf_boolean_t is_bd = _gf_false;
-
- if (dict == NULL)
- goto out;
-
- dict_foreach(dict, stripe_is_bd, &is_bd);
-out:
- return is_bd;
-}
-#else
-#define stripe_setxattr_is_bd(dict) _gf_false
-#endif
-
-int
-stripe_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
- int flags, dict_t *xdata)
-{
- int32_t op_errno = EINVAL;
- xlator_list_t *trav = NULL;
- stripe_private_t *priv = NULL;
- stripe_local_t *local = NULL;
- int i = 0;
- gf_boolean_t is_bd = _gf_false;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- GF_IF_INTERNAL_XATTR_GOTO("trusted.*stripe*", dict, op_errno, err);
-
- priv = this->private;
- trav = this->children;
-
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- frame->local = local;
- local->wind_count = priv->child_count;
- local->op_ret = local->op_errno = 0;
-
- is_bd = stripe_setxattr_is_bd(dict);
-
- /**
- * Set xattrs for directories on all subvolumes. Additionally
- * this power is only given to a special client. Bd xlator
- * also needs xattrs for regular files (ie LVs)
- */
- if (((frame->root->pid == GF_CLIENT_PID_GSYNCD) &&
- IA_ISDIR(loc->inode->ia_type)) ||
- is_bd) {
- for (i = 0; i < priv->child_count; i++, trav = trav->next) {
- STACK_WIND(frame, stripe_setxattr_cbk, trav->xlator,
- trav->xlator->fops->setxattr, loc, dict, flags, xdata);
- }
- } else {
- local->wind_count = 1;
- STACK_WIND(frame, stripe_setxattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
- }
-
- return 0;
-err:
- STRIPE_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
- return 0;
-}
-
-int
-stripe_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
-{
- STRIPE_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-int
-stripe_is_special_key(dict_t *this, char *key, data_t *value, void *data)
-{
- gf_boolean_t *is_special = NULL;
-
- if (data == NULL) {
- goto out;
- }
-
- is_special = data;
-
- if (XATTR_IS_LOCKINFO(key) || XATTR_IS_BD(key))
- *is_special = _gf_true;
-
-out:
- return 0;
-}
-
-int32_t
-stripe_fsetxattr_everyone_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- int call_count = 0;
- stripe_local_t *local = NULL;
-
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- call_count = --local->wind_count;
-
- if (op_ret < 0) {
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- }
- }
- UNLOCK(&frame->lock);
-
- if (call_count == 0) {
- STRIPE_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
- NULL);
- }
- return 0;
-}
-
-int
-stripe_fsetxattr_to_everyone(call_frame_t *frame, xlator_t *this, fd_t *fd,
- dict_t *dict, int flags, dict_t *xdata)
-{
- xlator_list_t *trav = NULL;
- stripe_private_t *priv = NULL;
- int ret = -1;
- stripe_local_t *local = NULL;
-
- priv = this->private;
-
- local = mem_get0(this->local_pool);
- if (local == NULL) {
- goto out;
- }
-
- frame->local = local;
-
- local->wind_count = priv->child_count;
-
- trav = this->children;
-
- while (trav) {
- STACK_WIND(frame, stripe_fsetxattr_everyone_cbk, trav->xlator,
- trav->xlator->fops->fsetxattr, fd, dict, flags, xdata);
- trav = trav->next;
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-static gf_boolean_t
-stripe_fsetxattr_is_special(dict_t *dict)
-{
- gf_boolean_t is_spl = _gf_false;
-
- if (dict == NULL) {
- goto out;
- }
-
- dict_foreach(dict, stripe_is_special_key, &is_spl);
-
-out:
- return is_spl;
-}
-
-int
-stripe_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
- int flags, dict_t *xdata)
-{
- int32_t op_ret = -1, ret = -1, op_errno = EINVAL;
- gf_boolean_t is_spl = _gf_false;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
-
- GF_IF_INTERNAL_XATTR_GOTO("trusted.*stripe*", dict, op_errno, err);
-
- is_spl = stripe_fsetxattr_is_special(dict);
- if (is_spl) {
- ret = stripe_fsetxattr_to_everyone(frame, this, fd, dict, flags, xdata);
- if (ret < 0) {
- op_errno = ENOMEM;
- goto err;
- }
-
- goto out;
- }
-
- STACK_WIND(frame, stripe_fsetxattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
-out:
- return 0;
-err:
- STRIPE_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL);
- return 0;
-}
-
-int
-stripe_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STRIPE_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-int
-stripe_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name, dict_t *xdata)
-{
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(this, err);
-
- GF_IF_NATIVE_XATTR_GOTO("trusted.*stripe*", name, op_errno, err);
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(loc, err);
-
- STACK_WIND(frame, stripe_removexattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
- return 0;
-err:
- STRIPE_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
- return 0;
-}
-
-int
-stripe_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STRIPE_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-int
-stripe_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
- const char *name, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
-
- GF_IF_NATIVE_XATTR_GOTO("trusted.*stripe*", name, op_errno, err);
-
- STACK_WIND(frame, stripe_fremovexattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
- return 0;
-err:
- STRIPE_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-int32_t
-stripe_readdirp_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *stbuf, dict_t *xattr,
- struct iatt *parent)
-{
- stripe_local_t *local = NULL;
- call_frame_t *main_frame = NULL;
- stripe_local_t *main_local = NULL;
- gf_dirent_t *entry = NULL;
- call_frame_t *prev = NULL;
- int done = 0;
-
- local = frame->local;
- prev = cookie;
-
- entry = local->dirent;
-
- main_frame = local->orig_frame;
- main_local = main_frame->local;
- LOCK(&frame->lock);
- {
- local->call_count--;
- if (!local->call_count)
- done = 1;
- if (op_ret == -1) {
- local->op_errno = op_errno;
- local->op_ret = op_ret;
- goto unlock;
- }
-
- if (stripe_ctx_handle(this, prev, local, xattr))
- gf_log(this->name, GF_LOG_ERROR,
- "Error getting fctx info from dict.");
-
- correct_file_size(stbuf, local->fctx, prev);
-
- stripe_iatt_merge(stbuf, &entry->d_stat);
- local->stbuf_blocks += stbuf->ia_blocks;
- }
-unlock:
- UNLOCK(&frame->lock);
-
- if (done) {
- inode_ctx_put(entry->inode, this, (uint64_t)(long)local->fctx);
-
- done = 0;
- LOCK(&main_frame->lock);
- {
- main_local->wind_count--;
- if (!main_local->wind_count)
- done = 1;
- if (local->op_ret == -1) {
- main_local->op_errno = local->op_errno;
- main_local->op_ret = local->op_ret;
- }
- entry->d_stat.ia_blocks = local->stbuf_blocks;
- }
- UNLOCK(&main_frame->lock);
- if (done) {
- main_frame->local = NULL;
- STRIPE_STACK_UNWIND(readdir, main_frame, main_local->op_ret,
- main_local->op_errno, &main_local->entries,
- NULL);
- gf_dirent_free(&main_local->entries);
- stripe_local_wipe(main_local);
- mem_put(main_local);
- }
- frame->local = NULL;
- stripe_local_wipe(local);
- mem_put(local);
- STRIPE_STACK_DESTROY(frame);
- }
-
- return 0;
-}
-
-int32_t
-stripe_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries,
- dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
- gf_dirent_t *local_entry = NULL;
- gf_dirent_t *tmp_entry = NULL;
- xlator_list_t *trav = NULL;
- loc_t loc = {
- 0,
- };
- int32_t count = 0;
- stripe_private_t *priv = NULL;
- int32_t subvols = 0;
- dict_t *xattrs = NULL;
- call_frame_t *local_frame = NULL;
- stripe_local_t *local_ent = NULL;
-
- if (!this || !frame->local || !cookie) {
- gf_log("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
- prev = cookie;
- local = frame->local;
- trav = this->children;
- priv = this->private;
-
- subvols = priv->child_count;
-
- LOCK(&frame->lock);
- {
- local->op_errno = op_errno;
- local->op_ret = op_ret;
-
- if (op_ret != -1) {
- list_splice_init(&orig_entries->list, &local->entries.list);
- local->wind_count = op_ret;
- }
- }
- UNLOCK(&frame->lock);
-
- if (op_ret == -1) {
- gf_log(this->name, GF_LOG_WARNING, "%s returned error %s",
- prev->this->name, strerror(op_errno));
- goto out;
- }
-
- xattrs = dict_new();
- if (xattrs)
- (void)stripe_xattr_request_build(this, xattrs, 0, 0, 0, 0);
- count = op_ret;
- list_for_each_entry_safe(local_entry, tmp_entry, (&local->entries.list),
- list)
- {
- if (!local_entry)
- break;
- if (!IA_ISREG(local_entry->d_stat.ia_type) || !local_entry->inode) {
- LOCK(&frame->lock);
- {
- local->wind_count--;
- count = local->wind_count;
- }
- UNLOCK(&frame->lock);
- continue;
- }
-
- local_frame = copy_frame(frame);
-
- if (!local_frame) {
- op_errno = ENOMEM;
- op_ret = -1;
- goto out;
- }
-
- local_ent = mem_get0(this->local_pool);
- if (!local_ent) {
- op_errno = ENOMEM;
- op_ret = -1;
- goto out;
- }
-
- loc.inode = inode_ref(local_entry->inode);
-
- gf_uuid_copy(loc.gfid, local_entry->d_stat.ia_gfid);
-
- local_ent->orig_frame = frame;
-
- local_ent->call_count = subvols;
-
- local_ent->dirent = local_entry;
-
- local_frame->local = local_ent;
-
- trav = this->children;
- while (trav) {
- STACK_WIND(local_frame, stripe_readdirp_lookup_cbk, trav->xlator,
- trav->xlator->fops->lookup, &loc, xattrs);
- trav = trav->next;
- }
- loc_wipe(&loc);
- }
-out:
- if (!count) {
- /* all entries are directories */
- frame->local = NULL;
- STRIPE_STACK_UNWIND(readdir, frame, (local ? local->op_ret : -1),
- (local ? local->op_errno : EINVAL),
- (local ? &local->entries : NULL), NULL);
- gf_dirent_free(&local->entries);
- stripe_local_wipe(local);
- mem_put(local);
- }
- if (xattrs)
- dict_unref(xattrs);
- return 0;
-}
-int32_t
-stripe_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t off, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int op_errno = -1;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(fd, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- frame->local = local;
-
- local->fd = fd_ref(fd);
-
- local->wind_count = 0;
-
- local->count = 0;
- local->op_ret = -1;
- INIT_LIST_HEAD(&local->entries);
-
- if (!trav)
- goto err;
-
- STACK_WIND(frame, stripe_readdirp_cbk, trav->xlator,
- trav->xlator->fops->readdirp, fd, size, off, xdata);
- return 0;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- STRIPE_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-mem_acct_init(xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- goto out;
-
- ret = xlator_mem_acct_init(this, gf_stripe_mt_end + 1);
-
- if (ret != 0) {
- gf_log(this->name, GF_LOG_ERROR,
- "Memory accounting init"
- "failed");
- goto out;
- }
-
-out:
- return ret;
-}
-
-static int
-clear_pattern_list(stripe_private_t *priv)
-{
- struct stripe_options *prev = NULL;
- struct stripe_options *trav = NULL;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO("stripe", priv, out);
-
- trav = priv->pattern;
- priv->pattern = NULL;
- while (trav) {
- prev = trav;
- trav = trav->next;
- GF_FREE(prev);
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-int
-reconfigure(xlator_t *this, dict_t *options)
-{
- stripe_private_t *priv = NULL;
- data_t *data = NULL;
- int ret = -1;
- volume_option_t *opt = NULL;
-
- GF_ASSERT(this);
- GF_ASSERT(this->private);
-
- priv = this->private;
-
- ret = 0;
- LOCK(&priv->lock);
- {
- ret = clear_pattern_list(priv);
- if (ret)
- goto unlock;
-
- data = dict_get(options, "block-size");
- if (data) {
- ret = set_stripe_block_size(this, priv, data->data);
- if (ret)
- goto unlock;
- } else {
- opt = xlator_volume_option_get(this, "block-size");
- if (!opt) {
- gf_log(this->name, GF_LOG_WARNING,
- "option 'block-size' not found");
- ret = -1;
- goto unlock;
- }
-
- if (gf_string2bytesize_uint64(opt->default_value,
- &priv->block_size)) {
- gf_log(this->name, GF_LOG_ERROR,
- "Unable to set default block-size ");
- ret = -1;
- goto unlock;
- }
- }
-
- GF_OPTION_RECONF("coalesce", priv->coalesce, options, bool, unlock);
- }
-unlock:
- UNLOCK(&priv->lock);
- if (ret)
- goto out;
-
- ret = 0;
-out:
- return ret;
-}
-
-/**
- * init - This function is called when xlator-graph gets initialized.
- * The option given in volfiles are parsed here.
- * @this -
- */
-int32_t
-init(xlator_t *this)
-{
- stripe_private_t *priv = NULL;
- volume_option_t *opt = NULL;
- xlator_list_t *trav = NULL;
- data_t *data = NULL;
- int32_t count = 0;
- int ret = -1;
-
- if (!this)
- goto out;
-
- trav = this->children;
- while (trav) {
- count++;
- trav = trav->next;
- }
-
- if (!count) {
- gf_log(this->name, GF_LOG_ERROR,
- "stripe configured without \"subvolumes\" option. "
- "exiting");
- goto out;
- }
-
- if (!this->parents) {
- gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
- }
-
- if (count == 1) {
- gf_log(this->name, GF_LOG_ERROR,
- "stripe configured with only one \"subvolumes\" option."
- " please check the volume. exiting");
- goto out;
- }
-
- priv = GF_CALLOC(1, sizeof(stripe_private_t),
- gf_stripe_mt_stripe_private_t);
-
- if (!priv)
- goto out;
- priv->xl_array = GF_CALLOC(count, sizeof(xlator_t *),
- gf_stripe_mt_xlator_t);
- if (!priv->xl_array)
- goto out;
-
- priv->last_event = GF_CALLOC(count, sizeof(int), gf_stripe_mt_int32_t);
- if (!priv->last_event)
- goto out;
-
- priv->child_count = count;
- LOCK_INIT(&priv->lock);
-
- trav = this->children;
- count = 0;
- while (trav) {
- priv->xl_array[count++] = trav->xlator;
- trav = trav->next;
- }
-
- if (count > 256) {
- gf_log(this->name, GF_LOG_ERROR,
- "maximum number of stripe subvolumes supported "
- "is 256");
- goto out;
- }
-
- ret = 0;
- LOCK(&priv->lock);
- {
- opt = xlator_volume_option_get(this, "block-size");
- if (!opt) {
- gf_log(this->name, GF_LOG_WARNING, "option 'block-size' not found");
- ret = -1;
- goto unlock;
- }
- if (gf_string2bytesize_uint64(opt->default_value, &priv->block_size)) {
- gf_log(this->name, GF_LOG_ERROR,
- "Unable to set default block-size ");
- ret = -1;
- goto unlock;
- }
- /* option stripe-pattern *avi:1GB,*pdf:16K */
- data = dict_get(this->options, "block-size");
- if (data) {
- ret = set_stripe_block_size(this, priv, data->data);
- if (ret)
- goto unlock;
- }
- }
-unlock:
- UNLOCK(&priv->lock);
- if (ret)
- goto out;
-
- GF_OPTION_INIT("use-xattr", priv->xattr_supported, bool, out);
- /* notify related */
- priv->nodes_down = priv->child_count;
-
- GF_OPTION_INIT("coalesce", priv->coalesce, bool, out);
-
- this->local_pool = mem_pool_new(stripe_local_t, 128);
- if (!this->local_pool) {
- ret = -1;
- gf_log(this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto out;
- }
-
- this->private = priv;
-
- ret = 0;
-out:
- if (ret) {
- if (priv) {
- GF_FREE(priv->xl_array);
- GF_FREE(priv);
- }
- }
- return ret;
-}
-
-/**
- * fini - Free all the private variables
- * @this -
- */
-void
-fini(xlator_t *this)
-{
- stripe_private_t *priv = NULL;
- struct stripe_options *prev = NULL;
- struct stripe_options *trav = NULL;
-
- if (!this)
- goto out;
-
- priv = this->private;
- if (priv) {
- this->private = NULL;
- GF_FREE(priv->xl_array);
-
- trav = priv->pattern;
- while (trav) {
- prev = trav;
- trav = trav->next;
- GF_FREE(prev);
- }
- GF_FREE(priv->last_event);
- LOCK_DESTROY(&priv->lock);
- GF_FREE(priv);
- }
-
-out:
- return;
-}
-
-int32_t
-stripe_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno,
- dict_t *dict, dict_t *xdata)
-
-{
- STRIPE_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
-}
-
-int
-stripe_internal_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr,
- dict_t *xdata)
-{
- char size_key[256] = {
- 0,
- };
- char index_key[256] = {
- 0,
- };
- char count_key[256] = {
- 0,
- };
- char coalesce_key[256] = {
- 0,
- };
-
- VALIDATE_OR_GOTO(frame, out);
- VALIDATE_OR_GOTO(frame->local, out);
-
- if (!xattr || (op_ret == -1))
- goto out;
-
- sprintf(size_key, "trusted.%s.stripe-size", this->name);
- sprintf(count_key, "trusted.%s.stripe-count", this->name);
- sprintf(index_key, "trusted.%s.stripe-index", this->name);
- sprintf(coalesce_key, "trusted.%s.stripe-coalesce", this->name);
-
- dict_del(xattr, size_key);
- dict_del(xattr, count_key);
- dict_del(xattr, index_key);
- dict_del(xattr, coalesce_key);
-
-out:
- STRIPE_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
-
- return 0;
-}
-
-int
-stripe_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
-{
- int call_cnt = 0;
- stripe_local_t *local = NULL;
-
- VALIDATE_OR_GOTO(frame, out);
- VALIDATE_OR_GOTO(frame->local, out);
-
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- call_cnt = --local->wind_count;
- }
- UNLOCK(&frame->lock);
-
- if (!xattr || (op_ret < 0))
- goto out;
-
- local->op_ret = 0;
-
- if (!local->xattr) {
- local->xattr = dict_ref(xattr);
- } else {
- stripe_aggregate_xattr(local->xattr, xattr);
- }
-
-out:
- if (!call_cnt) {
- STRIPE_STACK_UNWIND(getxattr, frame, (local ? local->op_ret : -1),
- op_errno, (local ? local->xattr : NULL), xdata);
- }
-
- return 0;
-}
-
-int32_t
-stripe_vgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict,
- dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- int32_t callcnt = 0;
- int32_t ret = -1;
- long cky = 0;
- void *xattr_val = NULL;
- void *xattr_serz = NULL;
- stripe_xattr_sort_t *xattr = NULL;
- dict_t *stripe_xattr = NULL;
-
- if (!frame || !frame->local || !this) {
- gf_log("", GF_LOG_ERROR, "Possible NULL deref");
- return ret;
- }
-
- local = frame->local;
- cky = (long)cookie;
-
- if (local->xsel[0] == '\0') {
- gf_log(this->name, GF_LOG_ERROR, "Empty xattr in cbk");
- return ret;
- }
-
- LOCK(&frame->lock);
- {
- callcnt = --local->wind_count;
-
- if (!dict || (op_ret < 0))
- goto out;
-
- if (!local->xattr_list)
- local->xattr_list = (stripe_xattr_sort_t *)GF_CALLOC(
- local->nallocs, sizeof(stripe_xattr_sort_t),
- gf_stripe_mt_xattr_sort_t);
-
- if (local->xattr_list) {
- xattr = local->xattr_list + (int32_t)cky;
-
- ret = dict_get_ptr_and_len(dict, local->xsel, &xattr_val,
- &xattr->xattr_len);
- if (xattr->xattr_len == 0)
- goto out;
-
- xattr->pos = cky;
- xattr->xattr_value = gf_memdup(xattr_val, xattr->xattr_len);
-
- if (xattr->xattr_value != NULL)
- local->xattr_total_len += xattr->xattr_len + 1;
- }
- }
-out:
- UNLOCK(&frame->lock);
-
- if (!callcnt) {
- if (!local->xattr_total_len)
- goto unwind;
-
- stripe_xattr = dict_new();
- if (!stripe_xattr)
- goto unwind;
-
- /* select filler based on ->xsel */
- if (XATTR_IS_PATHINFO(local->xsel))
- ret = stripe_fill_pathinfo_xattr(this, local, (char **)&xattr_serz);
- else if (XATTR_IS_LOCKINFO(local->xsel)) {
- ret = stripe_fill_lockinfo_xattr(this, local, &xattr_serz);
- } else {
- gf_log(this->name, GF_LOG_WARNING,
- "Unknown xattr in xattr request");
- goto unwind;
- }
-
- if (!ret) {
- ret = dict_set_dynptr(stripe_xattr, local->xsel, xattr_serz,
- local->xattr_total_len);
- if (ret)
- gf_log(this->name, GF_LOG_ERROR, "Can't set %s key in dict",
- local->xsel);
- }
-
- unwind:
- /*
- * Among other things, STRIPE_STACK_UNWIND will free "local"
- * for us. That means we can't dereference it afterward.
- * Fortunately, the actual result is in stripe_xattr now, so we
- * can simply clean up before unwinding.
- */
- ret = stripe_free_xattr_str(local);
- GF_FREE(local->xattr_list);
- local->xattr_list = NULL;
-
- STRIPE_STACK_UNWIND(getxattr, frame, op_ret, op_errno, stripe_xattr,
- NULL);
-
- if (stripe_xattr)
- dict_unref(stripe_xattr);
- }
-
- return ret;
-}
-
-int
-stripe_marker_populate_args(call_frame_t *frame, int type, int *gauge,
- xlator_t **subvols)
-{
- xlator_t *this = frame->this;
- stripe_private_t *priv = this->private;
- stripe_local_t *local = frame->local;
- int count = 0;
-
- count = priv->child_count;
- if (MARKER_XTIME_TYPE == type) {
- if (!IA_FILE_OR_DIR(local->loc.inode->ia_type))
- count = 1;
- }
- memcpy(subvols, priv->xl_array, sizeof(*subvols) * count);
-
- return count;
-}
-
-int32_t
-stripe_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
- int i = 0;
- int ret = 0;
-
- VALIDATE_OR_GOTO(frame, err);
- VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO(loc, err);
- VALIDATE_OR_GOTO(loc->path, err);
- VALIDATE_OR_GOTO(loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- loc_copy(&local->loc, loc);
-
- if (name && strncmp(name, QUOTA_SIZE_KEY, SLEN(QUOTA_SIZE_KEY)) == 0) {
- local->wind_count = priv->child_count;
-
- for (i = 0, trav = this->children; i < priv->child_count;
- i++, trav = trav->next) {
- STACK_WIND(frame, stripe_getxattr_cbk, trav->xlator,
- trav->xlator->fops->getxattr, loc, name, xdata);
- }
-
- return 0;
- }
-
- if (name && (XATTR_IS_PATHINFO(name))) {
- if (IA_ISREG(loc->inode->ia_type)) {
- ret = inode_ctx_get(loc->inode, this, (uint64_t *)&local->fctx);
- if (ret)
- gf_log(this->name, GF_LOG_ERROR,
- "stripe size unavailable from fctx"
- " relying on pathinfo could lead to"
- " wrong results");
- }
-
- local->nallocs = local->wind_count = priv->child_count;
- (void)strncpy(local->xsel, name, strlen(name));
-
- /**
- * for xattrs that need info from all children, fill ->xsel
- * as above and call the filler function in cbk based on
- * it
- */
- for (i = 0, trav = this->children; i < priv->child_count;
- i++, trav = trav->next) {
- STACK_WIND_COOKIE(frame, stripe_vgetxattr_cbk, (void *)(long)i,
- trav->xlator, trav->xlator->fops->getxattr, loc,
- name, xdata);
- }
-
- return 0;
- }
-
- if (cluster_handle_marker_getxattr(frame, loc, name, priv->vol_uuid,
- stripe_getxattr_unwind,
- stripe_marker_populate_args) == 0)
- return 0;
-
- STACK_WIND(frame, stripe_internal_getxattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
-
- return 0;
-
-err:
- STRIPE_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-static gf_boolean_t
-stripe_is_special_xattr(const char *name)
-{
- gf_boolean_t is_spl = _gf_false;
-
- if (!name) {
- goto out;
- }
-
- if (!strncmp(name, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) ||
- XATTR_IS_PATHINFO(name))
- is_spl = _gf_true;
-out:
- return is_spl;
-}
-
-int32_t
-stripe_fgetxattr_from_everyone(call_frame_t *frame, xlator_t *this, fd_t *fd,
- const char *name, dict_t *xdata)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t ret = -1, op_errno = 0;
- int i = 0;
- xlator_list_t *trav = NULL;
-
- priv = this->private;
-
- local = mem_get0(this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- local->op_ret = -1;
- frame->local = local;
-
- strncpy(local->xsel, name, strlen(name));
- local->nallocs = local->wind_count = priv->child_count;
-
- for (i = 0, trav = this->children; i < priv->child_count;
- i++, trav = trav->next) {
- STACK_WIND_COOKIE(frame, stripe_vgetxattr_cbk, (void *)(long)i,
- trav->xlator, trav->xlator->fops->fgetxattr, fd, name,
- xdata);
- }
-
- return 0;
-
-err:
- STACK_UNWIND_STRICT(fgetxattr, frame, -1, op_errno, NULL, NULL);
- return ret;
-}
-
-int32_t
-stripe_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
- const char *name, dict_t *xdata)
-{
- if (stripe_is_special_xattr(name)) {
- stripe_fgetxattr_from_everyone(frame, this, fd, name, xdata);
- goto out;
- }
-
- STACK_WIND(frame, stripe_internal_getxattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
-
-out:
- return 0;
-}
-
-int32_t
-stripe_priv_dump(xlator_t *this)
-{
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
- stripe_private_t *priv = NULL;
- int ret = -1;
- struct stripe_options *options = NULL;
-
- GF_VALIDATE_OR_GOTO("stripe", this, out);
-
- priv = this->private;
- if (!priv)
- goto out;
-
- ret = TRY_LOCK(&priv->lock);
- if (ret != 0)
- goto out;
-
- gf_proc_dump_add_section("xlator.cluster.stripe.%s.priv", this->name);
- gf_proc_dump_write("child_count", "%d", priv->child_count);
-
- for (i = 0; i < priv->child_count; i++) {
- sprintf(key, "subvolumes[%d]", i);
- gf_proc_dump_write(key, "%s.%s", priv->xl_array[i]->type,
- priv->xl_array[i]->name);
- }
-
- options = priv->pattern;
- while (options != NULL) {
- gf_proc_dump_write("path_pattern", "%s", priv->pattern->path_pattern);
- gf_proc_dump_write("options_block_size", "%" PRIu64,
- options->block_size);
-
- options = options->next;
- }
-
- gf_proc_dump_write("block_size", "%" PRIu64, priv->block_size);
- gf_proc_dump_write("nodes-down", "%d", priv->nodes_down);
- gf_proc_dump_write("first-child_down", "%d", priv->first_child_down);
- gf_proc_dump_write("xattr_supported", "%d", priv->xattr_supported);
-
- UNLOCK(&priv->lock);
-
-out:
- return ret;
-}
-
-struct xlator_fops fops = {
- .stat = stripe_stat,
- .unlink = stripe_unlink,
- .rename = stripe_rename,
- .link = stripe_link,
- .truncate = stripe_truncate,
- .create = stripe_create,
- .open = stripe_open,
- .readv = stripe_readv,
- .writev = stripe_writev,
- .statfs = stripe_statfs,
- .flush = stripe_flush,
- .fsync = stripe_fsync,
- .ftruncate = stripe_ftruncate,
- .fstat = stripe_fstat,
- .mkdir = stripe_mkdir,
- .rmdir = stripe_rmdir,
- .lk = stripe_lk,
- .opendir = stripe_opendir,
- .fsyncdir = stripe_fsyncdir,
- .setattr = stripe_setattr,
- .fsetattr = stripe_fsetattr,
- .lookup = stripe_lookup,
- .mknod = stripe_mknod,
- .setxattr = stripe_setxattr,
- .fsetxattr = stripe_fsetxattr,
- .getxattr = stripe_getxattr,
- .fgetxattr = stripe_fgetxattr,
- .removexattr = stripe_removexattr,
- .fremovexattr = stripe_fremovexattr,
- .readdirp = stripe_readdirp,
- .fallocate = stripe_fallocate,
- .discard = stripe_discard,
- .zerofill = stripe_zerofill,
- .seek = stripe_seek,
-};
-
-struct xlator_cbks cbks = {
- .release = stripe_release,
- .forget = stripe_forget,
-};
-
-struct xlator_dumpops dumpops = {
- .priv = stripe_priv_dump,
-};
-
-struct volume_options options[] = {
- {
- .key = {"block-size"},
- .type = GF_OPTION_TYPE_SIZE_LIST,
- .default_value = "128KB",
- .min = STRIPE_MIN_BLOCK_SIZE,
- .description = "Size of the stripe unit that would be read "
- "from or written to the striped servers.",
- .op_version = {1},
- .tags = {"stripe"},
- .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
- },
- {
- .key = {"use-xattr"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "true",
- .description = "handle the stripe without the xattr",
- .tags = {"stripe", "dev-only"},
- .flags = OPT_FLAG_CLIENT_OPT,
- },
- {
- .key = {"coalesce"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "true",
- .description = "Enable/Disable coalesce mode to flatten striped "
- "files as stored on the server (i.e., eliminate holes "
- "caused by the traditional format).",
- .op_version = {1},
- .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
- .tags = {"stripe"},
- },
- {.key = {NULL}},
-};
diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h
deleted file mode 100644
index 103c96491ff..00000000000
--- a/xlators/cluster/stripe/src/stripe.h
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _STRIPE_H_
-#define _STRIPE_H_
-
-#include "xlator.h"
-#include "logging.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat.h"
-#include "compat-errno.h"
-#include "stripe-mem-types.h"
-#include "libxlator.h"
-#include <fnmatch.h>
-#include <signal.h>
-
-#define STRIPE_PATHINFO_HEADER "STRIPE:"
-#define STRIPE_MIN_BLOCK_SIZE (16 * GF_UNIT_KB)
-
-#define STRIPE_STACK_UNWIND(fop, frame, params...) \
- do { \
- stripe_local_t *__local = NULL; \
- if (frame) { \
- __local = frame->local; \
- frame->local = NULL; \
- } \
- STACK_UNWIND_STRICT(fop, frame, params); \
- if (__local) { \
- stripe_local_wipe(__local); \
- mem_put(__local); \
- } \
- } while (0)
-
-#define STRIPE_STACK_DESTROY(frame) \
- do { \
- stripe_local_t *__local = NULL; \
- __local = frame->local; \
- frame->local = NULL; \
- STACK_DESTROY(frame->root); \
- if (__local) { \
- stripe_local_wipe(__local); \
- mem_put(__local); \
- } \
- } while (0)
-
-#define STRIPE_VALIDATE_FCTX(fctx, label) \
- do { \
- int idx = 0; \
- if (!fctx) { \
- op_errno = EINVAL; \
- goto label; \
- } \
- for (idx = 0; idx < fctx->stripe_count; idx++) { \
- if (!fctx->xl_array[idx]) { \
- gf_log(this->name, GF_LOG_ERROR, "fctx->xl_array[%d] is NULL", \
- idx); \
- op_errno = ESTALE; \
- goto label; \
- } \
- } \
- } while (0)
-
-typedef struct stripe_xattr_sort {
- int pos;
- int xattr_len;
- char *xattr_value;
-} stripe_xattr_sort_t;
-
-/**
- * struct stripe_options : This keeps the pattern and the block-size
- * information, which is used for striping on a file.
- */
-struct stripe_options {
- struct stripe_options *next;
- char path_pattern[256];
- uint64_t block_size;
-};
-
-/**
- * Private structure for stripe translator
- */
-struct stripe_private {
- struct stripe_options *pattern;
- xlator_t **xl_array;
- uint64_t block_size;
- gf_lock_t lock;
- uint8_t nodes_down;
- int8_t first_child_down;
- int *last_event;
- int8_t child_count;
- gf_boolean_t xattr_supported; /* default yes */
- gf_boolean_t coalesce;
- char vol_uuid[UUID_SIZE + 1];
-};
-
-/**
- * Used to keep info about the replies received from readv/writev calls
- */
-struct stripe_replies {
- struct iovec *vector;
- int32_t count; // count of vector
- int32_t op_ret; // op_ret of readv
- int32_t op_errno;
- int32_t requested_size;
- struct iatt stbuf; /* 'stbuf' is also a part of reply */
-};
-
-typedef struct _stripe_fd_ctx {
- off_t stripe_size;
- int stripe_count;
- int stripe_coalesce;
- int static_array;
- xlator_t **xl_array;
-} stripe_fd_ctx_t;
-
-/**
- * Local structure to be passed with all the frames in case of STACK_WIND
- */
-struct stripe_local; /* this itself is used inside the structure; */
-
-struct stripe_local {
- struct stripe_local *next;
- call_frame_t *orig_frame;
-
- stripe_fd_ctx_t *fctx;
-
- /* Used by _cbk functions */
- struct iatt stbuf;
- struct iatt pre_buf;
- struct iatt post_buf;
- struct iatt preparent;
- struct iatt postparent;
-
- off_t stbuf_size;
- off_t prebuf_size;
- off_t postbuf_size;
- off_t preparent_size;
- off_t postparent_size;
-
- blkcnt_t stbuf_blocks;
- blkcnt_t prebuf_blocks;
- blkcnt_t postbuf_blocks;
- blkcnt_t preparent_blocks;
- blkcnt_t postparent_blocks;
-
- struct stripe_replies *replies;
- struct statvfs statvfs_buf;
- dir_entry_t *entry;
-
- int8_t revalidate;
- int8_t failed;
- int8_t unwind;
-
- size_t readv_size;
- int32_t entry_count;
- int32_t node_index;
- int32_t call_count;
- int32_t wind_count; /* used instead of child_cound
- in case of read and write */
- int32_t op_ret;
- int32_t op_errno;
- int32_t count;
- int32_t flags;
- char *name;
- inode_t *inode;
-
- loc_t loc;
- loc_t loc2;
-
- mode_t mode;
- dev_t rdev;
- /* For File I/O fops */
- dict_t *xdata;
-
- stripe_xattr_sort_t *xattr_list;
- int32_t xattr_total_len;
- int32_t nallocs;
- char xsel[256];
-
- /* General usage */
- off_t offset;
- off_t stripe_size;
-
- int xattr_self_heal_needed;
- int entry_self_heal_needed;
-
- int8_t *list;
- struct gf_flock lock;
- fd_t *fd;
- void *value;
- struct iobref *iobref;
- gf_dirent_t entries;
- gf_dirent_t *dirent;
- dict_t *xattr;
- uuid_t ia_gfid;
-
- int xflag;
- mode_t umask;
-};
-
-typedef struct stripe_local stripe_local_t;
-typedef struct stripe_private stripe_private_t;
-
-/*
- * Determine the stripe index of a particular frame based on the translator.
- */
-static inline int32_t
-stripe_get_frame_index(stripe_fd_ctx_t *fctx, call_frame_t *prev)
-{
- int32_t i, idx = -1;
-
- for (i = 0; i < fctx->stripe_count; i++) {
- if (fctx->xl_array[i] == prev->this) {
- idx = i;
- break;
- }
- }
-
- return idx;
-}
-
-static inline void
-stripe_copy_xl_array(xlator_t **dst, xlator_t **src, int count)
-{
- int i;
-
- for (i = 0; i < count; i++)
- dst[i] = src[i];
-}
-
-void
-stripe_local_wipe(stripe_local_t *local);
-int32_t
-stripe_ctx_handle(xlator_t *this, call_frame_t *prev, stripe_local_t *local,
- dict_t *dict);
-void
-stripe_aggregate_xattr(dict_t *dst, dict_t *src);
-int32_t
-stripe_xattr_request_build(xlator_t *this, dict_t *dict, uint64_t stripe_size,
- uint32_t stripe_count, uint32_t stripe_index,
- uint32_t stripe_coalesce);
-int32_t
-stripe_get_matching_bs(const char *path, stripe_private_t *priv);
-int
-set_stripe_block_size(xlator_t *this, stripe_private_t *priv, char *data);
-int32_t
-stripe_iatt_merge(struct iatt *from, struct iatt *to);
-int32_t
-stripe_fill_pathinfo_xattr(xlator_t *this, stripe_local_t *local,
- char **xattr_serz);
-int32_t
-stripe_free_xattr_str(stripe_local_t *local);
-int32_t
-stripe_xattr_aggregate(char *buffer, stripe_local_t *local, int32_t *total);
-off_t
-coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count);
-off_t
-uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count,
- int stripe_index);
-int32_t
-stripe_fill_lockinfo_xattr(xlator_t *this, stripe_local_t *local,
- void **xattr_serz);
-
-/*
- * Adjust the size attribute for files if coalesce is enabled.
- */
-static inline void
-correct_file_size(struct iatt *buf, stripe_fd_ctx_t *fctx, call_frame_t *prev)
-{
- int index;
-
- if (!IA_ISREG(buf->ia_type))
- return;
-
- if (!fctx || !fctx->stripe_coalesce)
- return;
-
- index = stripe_get_frame_index(fctx, prev);
- buf->ia_size = uncoalesced_size(buf->ia_size, fctx->stripe_size,
- fctx->stripe_count, index);
-}
-
-#endif /* _STRIPE_H_ */
diff --git a/xlators/debug/delay-gen/src/delay-gen-mem-types.h b/xlators/debug/delay-gen/src/delay-gen-mem-types.h
index 63a15a70da3..c89a9217193 100644
--- a/xlators/debug/delay-gen/src/delay-gen-mem-types.h
+++ b/xlators/debug/delay-gen/src/delay-gen-mem-types.h
@@ -11,7 +11,7 @@
#ifndef __DELAY_GEN_MEM_TYPES_H__
#define __DELAY_GEN_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_delay_gen_mem_types_ {
gf_delay_gen_mt_dg_t = gf_common_mt_end + 1,
diff --git a/xlators/debug/delay-gen/src/delay-gen-messages.h b/xlators/debug/delay-gen/src/delay-gen-messages.h
index a9046ca14bf..bc98cec2885 100644
--- a/xlators/debug/delay-gen/src/delay-gen-messages.h
+++ b/xlators/debug/delay-gen/src/delay-gen-messages.h
@@ -11,7 +11,7 @@
#ifndef __DELAY_GEN_MESSAGES_H__
#define __DELAY_GEN_MESSAGES_H__
-#include "glfs-message-id.h"
+#include <glusterfs/glfs-message-id.h>
/* To add new message IDs, append new identifiers at the end of the list.
*
diff --git a/xlators/debug/delay-gen/src/delay-gen.c b/xlators/debug/delay-gen/src/delay-gen.c
index a2d02527f23..4698f1fd785 100644
--- a/xlators/debug/delay-gen/src/delay-gen.c
+++ b/xlators/debug/delay-gen/src/delay-gen.c
@@ -27,7 +27,7 @@ delay_gen(xlator_t *this, int fop)
return 0;
if ((rand() % DELAY_GRANULARITY) < dg->delay_ppm)
- usleep(dg->delay_duration);
+ gf_nanosleep(dg->delay_duration * GF_US_IN_NS);
return 0;
}
@@ -679,4 +679,19 @@ struct volume_options options[] = {
.default_value = "",
},
- {.key = {NULL}}};
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {GD_OP_VERSION_3_12_0},
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "delay-gen",
+ .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/debug/delay-gen/src/delay-gen.h b/xlators/debug/delay-gen/src/delay-gen.h
index 5e4d179f0b4..afa95e5eb2d 100644
--- a/xlators/debug/delay-gen/src/delay-gen.h
+++ b/xlators/debug/delay-gen/src/delay-gen.h
@@ -13,9 +13,9 @@
#include "delay-gen-mem-types.h"
#include "delay-gen-messages.h"
-#include "glusterfs.h"
-#include "xlator.h"
-#include "defaults.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
typedef struct {
int enable[GF_FOP_MAXVALUE];
diff --git a/xlators/debug/error-gen/src/error-gen-mem-types.h b/xlators/debug/error-gen/src/error-gen-mem-types.h
index 2facd6b27cb..b9b713af8fc 100644
--- a/xlators/debug/error-gen/src/error-gen-mem-types.h
+++ b/xlators/debug/error-gen/src/error-gen-mem-types.h
@@ -11,7 +11,7 @@
#ifndef __ERROR_GEN_MEM_TYPES_H__
#define __ERROR_GEN_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_error_gen_mem_types_ {
gf_error_gen_mt_eg_t = gf_common_mt_end + 1,
diff --git a/xlators/debug/error-gen/src/error-gen.c b/xlators/debug/error-gen/src/error-gen.c
index c6595b4c0e4..d45655ef4c3 100644
--- a/xlators/debug/error-gen/src/error-gen.c
+++ b/xlators/debug/error-gen/src/error-gen.c
@@ -7,10 +7,10 @@
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
+#include <glusterfs/xlator.h>
#include "error-gen.h"
-#include "statedump.h"
-#include "defaults.h"
+#include <glusterfs/statedump.h>
+#include <glusterfs/defaults.h>
/*
* The user can specify an error probability as a float percentage, but we
@@ -31,9 +31,9 @@
sys_error_t error_no_list[] = {
[GF_FOP_LOOKUP] = {.error_no_count = 4,
.error_no = {ENOENT, ENOTDIR, ENAMETOOLONG, EAGAIN}},
- [GF_FOP_STAT] = {.error_no_count = 7,
- .error_no = {EACCES, EBADF, EFAULT, ENAMETOOLONG, ENOENT,
- ENOMEM, ENOTDIR}},
+ [GF_FOP_STAT] = {.error_no_count = 6,
+ .error_no = {EACCES, EFAULT, ENAMETOOLONG, ENOENT, ENOMEM,
+ ENOTDIR}},
[GF_FOP_READLINK] = {.error_no_count = 8,
.error_no = {EACCES, EFAULT, EINVAL, EIO, ENAMETOOLONG,
ENOENT, ENOMEM, ENOTDIR}},
@@ -79,21 +79,20 @@ sys_error_t error_no_list[] = {
[GF_FOP_WRITE] = {.error_no_count = 7,
.error_no = {EINVAL, EBADF, EFAULT, EISDIR, ENAMETOOLONG,
ENOSPC, GF_ERROR_SHORT_WRITE}},
- [GF_FOP_STATFS] = {.error_no_count = 10,
- .error_no = {EACCES, EBADF, EFAULT, EINTR, EIO,
- ENAMETOOLONG, ENOENT, ENOMEM, ENOSYS,
- ENOTDIR}},
+ [GF_FOP_STATFS] = {.error_no_count = 9,
+ .error_no = {EACCES, EFAULT, EINTR, EIO, ENAMETOOLONG,
+ ENOENT, ENOMEM, ENOSYS, ENOTDIR}},
[GF_FOP_FLUSH] = {.error_no_count = 5,
.error_no = {EACCES, EFAULT, ENAMETOOLONG, ENOSYS,
ENOENT}},
[GF_FOP_FSYNC] = {.error_no_count = 4,
.error_no = {EBADF, EIO, EROFS, EINVAL}},
- [GF_FOP_SETXATTR] = {.error_no_count = 4,
- .error_no = {EACCES, EBADF, EINTR, ENAMETOOLONG}},
- [GF_FOP_GETXATTR] = {.error_no_count = 4,
- .error_no = {EACCES, EBADF, ENAMETOOLONG, EINTR}},
- [GF_FOP_REMOVEXATTR] = {.error_no_count = 4,
- .error_no = {EACCES, EBADF, ENAMETOOLONG, EINTR}},
+ [GF_FOP_SETXATTR] = {.error_no_count = 3,
+ .error_no = {EACCES, EINTR, ENAMETOOLONG}},
+ [GF_FOP_GETXATTR] = {.error_no_count = 3,
+ .error_no = {EACCES, ENAMETOOLONG, EINTR}},
+ [GF_FOP_REMOVEXATTR] = {.error_no_count = 3,
+ .error_no = {EACCES, ENAMETOOLONG, EINTR}},
[GF_FOP_FSETXATTR] = {.error_no_count = 4,
.error_no = {EACCES, EBADF, EINTR, ENAMETOOLONG}},
[GF_FOP_FGETXATTR] = {.error_no_count = 4,
@@ -125,26 +124,25 @@ sys_error_t error_no_list[] = {
ENOENT}},
[GF_FOP_FXATTROP] = {.error_no_count = 4,
.error_no = {EBADF, EIO, EROFS, EINVAL}},
- [GF_FOP_INODELK] = {.error_no_count = 4,
- .error_no = {EACCES, EBADF, EINTR, ENAMETOOLONG}},
+ [GF_FOP_INODELK] = {.error_no_count = 3,
+ .error_no = {EACCES, EINTR, ENAMETOOLONG}},
[GF_FOP_FINODELK] = {.error_no_count = 4,
.error_no = {EACCES, EBADF, EINTR, ENAMETOOLONG}},
- [GF_FOP_ENTRYLK] = {.error_no_count = 4,
- .error_no = {EACCES, EBADF, ENAMETOOLONG, EINTR}},
+ [GF_FOP_ENTRYLK] = {.error_no_count = 3,
+ .error_no = {EACCES, ENAMETOOLONG, EINTR}},
[GF_FOP_FENTRYLK] = {.error_no_count = 10,
.error_no = {EACCES, EEXIST, EFAULT, EISDIR, EMFILE,
ENAMETOOLONG, ENFILE, ENODEV, ENOENT,
ENOMEM}},
- [GF_FOP_SETATTR] = {.error_no_count = 11,
+ [GF_FOP_SETATTR] = {.error_no_count = 10,
.error_no = {EACCES, EFAULT, EIO, ENAMETOOLONG, ENOENT,
- ENOMEM, ENOTDIR, EPERM, EROFS, EBADF,
- EIO}},
+ ENOMEM, ENOTDIR, EPERM, EROFS, EIO}},
[GF_FOP_FSETATTR] = {.error_no_count = 11,
.error_no = {EACCES, EFAULT, EIO, ENAMETOOLONG, ENOENT,
ENOMEM, ENOTDIR, EPERM, EROFS, EBADF,
EIO}},
- [GF_FOP_GETSPEC] = {.error_no_count = 4,
- .error_no = {EACCES, EBADF, ENAMETOOLONG, EINTR}}};
+ [GF_FOP_GETSPEC] = {.error_no_count = 3,
+ .error_no = {EACCES, ENAMETOOLONG, EINTR}}};
int
generate_rand_no(int op_no)
@@ -761,6 +759,7 @@ error_gen_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
*/
shortvec = iov_dup(vector, 1);
shortvec->iov_len /= 2;
+ count = 1;
goto wind;
} else if (op_errno) {
GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
@@ -1396,7 +1395,7 @@ error_gen_priv_dump(xlator_t *this)
gf_proc_dump_write("op_count", "%d", conf->op_count);
gf_proc_dump_write("failure_iter_no", "%d", conf->failure_iter_no);
- gf_proc_dump_write("error_no", "%s", conf->error_no);
+ gf_proc_dump_write("error_no", "%d", conf->error_no_int);
gf_proc_dump_write("random_failure", "%d", conf->random_failure);
UNLOCK(&conf->lock);
@@ -1430,6 +1429,7 @@ reconfigure(xlator_t *this, dict_t *options)
eg_t *pvt = NULL;
int32_t ret = 0;
char *error_enable_fops = NULL;
+ char *error_no = NULL;
double failure_percent_dbl = 0.0;
if (!this || !this->private)
@@ -1439,10 +1439,10 @@ reconfigure(xlator_t *this, dict_t *options)
ret = -1;
- GF_OPTION_RECONF("error-no", pvt->error_no, options, str, out);
+ GF_OPTION_RECONF("error-no", error_no, options, str, out);
- if (pvt->error_no)
- pvt->error_no_int = conv_errno_to_int(&pvt->error_no);
+ if (error_no)
+ pvt->error_no_int = conv_errno_to_int(&error_no);
GF_OPTION_RECONF("failure", failure_percent_dbl, options, percent, out);
@@ -1466,6 +1466,7 @@ init(xlator_t *this)
eg_t *pvt = NULL;
int32_t ret = 0;
char *error_enable_fops = NULL;
+ char *error_no = NULL;
double failure_percent_dbl = 0.0;
if (!this->children || this->children->next) {
@@ -1490,10 +1491,10 @@ init(xlator_t *this)
ret = -1;
- GF_OPTION_INIT("error-no", pvt->error_no, str, out);
+ GF_OPTION_INIT("error-no", error_no, str, out);
- if (pvt->error_no)
- pvt->error_no_int = conv_errno_to_int(&pvt->error_no);
+ if (error_no)
+ pvt->error_no_int = conv_errno_to_int(&error_no);
GF_OPTION_INIT("failure", failure_percent_dbl, percent, out);
@@ -1506,8 +1507,8 @@ init(xlator_t *this)
this->private = pvt;
- /* Give some seed value here */
- srand(time(NULL));
+ /* Give some seed value here. */
+ srand(gf_time());
ret = 0;
out:
@@ -1644,4 +1645,19 @@ struct volume_options options[] = {
.flags = OPT_FLAG_SETTABLE,
},
- {.key = {NULL}}};
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1},
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "error-gen",
+ .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/debug/error-gen/src/error-gen.h b/xlators/debug/error-gen/src/error-gen.h
index ffa09252d0f..2478cd5b21c 100644
--- a/xlators/debug/error-gen/src/error-gen.h
+++ b/xlators/debug/error-gen/src/error-gen.h
@@ -36,7 +36,6 @@ typedef struct {
* It's just not worth blowing up the diff by changing it.
*/
int failure_iter_no;
- char *error_no;
int error_no_int;
gf_boolean_t random_failure;
gf_lock_t lock;
diff --git a/xlators/debug/io-stats/src/io-stats-mem-types.h b/xlators/debug/io-stats/src/io-stats-mem-types.h
index bc25fd2ca4e..51d38d8b97c 100644
--- a/xlators/debug/io-stats/src/io-stats-mem-types.h
+++ b/xlators/debug/io-stats/src/io-stats-mem-types.h
@@ -11,7 +11,7 @@
#ifndef __IO_STATS_MEM_TYPES_H__
#define __IO_STATS_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
extern const char *__progname;
diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
index 3ea0ccdf3d6..aa00c446e5a 100644
--- a/xlators/debug/io-stats/src/io-stats.c
+++ b/xlators/debug/io-stats/src/io-stats.c
@@ -7,8 +7,8 @@
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "syscall.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/syscall.h>
/**
* xlators/debug/io_stats :
@@ -28,18 +28,18 @@
#include <fnmatch.h>
#include <errno.h>
-#include "glusterfs.h"
-#include "xlator.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
#include "io-stats-mem-types.h"
#include <stdarg.h>
-#include "defaults.h"
-#include "logging.h"
-#include "cli1-xdr.h"
-#include "statedump.h"
-#include "syncop.h"
+#include <glusterfs/defaults.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/syncop.h>
#include <pwd.h>
#include <grp.h>
-#include "upcall-utils.h"
+#include <glusterfs/upcall-utils.h>
+#include <glusterfs/async.h>
#define MAX_LIST_MEMBERS 100
#define DEFAULT_PWD_BUF_SZ 16384
@@ -66,6 +66,17 @@ typedef enum {
IOS_STATS_THRU_MAX,
} ios_stats_thru_t;
+/* This is same as gf1_cli_info_op */
+/* had to be defined here again, so we have modularity between
+ xdr, xlator, and library functions */
+typedef enum ios_info_op {
+ GF_IOS_INFO_NONE = 0,
+ GF_IOS_INFO_ALL = 1,
+ GF_IOS_INFO_INCREMENTAL = 2,
+ GF_IOS_INFO_CUMULATIVE = 3,
+ GF_IOS_INFO_CLEAR = 4,
+} ios_info_op_t;
+
struct ios_stat_lat {
struct timeval time;
double throughput;
@@ -124,7 +135,7 @@ struct ios_global_stats {
gf_atomic_t block_count_read[IOS_BLOCK_COUNT_SIZE];
gf_atomic_t fop_hits[GF_FOP_MAXVALUE];
gf_atomic_t upcall_hits[GF_UPCALL_FLAGS_MAXVALUE];
- struct timeval started_at;
+ time_t started_at;
struct ios_lat latency[GF_FOP_MAXVALUE];
uint64_t nr_opens;
uint64_t max_nr_opens;
@@ -173,7 +184,6 @@ struct ios_conf {
*/
char *unique_id;
ios_dump_type_t dump_format;
- char *dump_format_str;
};
struct ios_fd {
@@ -275,21 +285,24 @@ is_fop_latency_started(call_frame_t *frame)
struct timespec *begin, *end; \
double throughput; \
int flag = 0; \
+ struct timeval tv = { \
+ 0, \
+ }; \
\
begin = &frame->begin; \
end = &frame->end; \
\
- elapsed = ((end->tv_sec - begin->tv_sec) * 1e9 + \
- (end->tv_nsec - begin->tv_nsec)) / \
- 1000; \
+ elapsed = gf_tsdiff(begin, end) / 1000.0; \
throughput = op_ret / elapsed; \
\
conf = this->private; \
+ gettimeofday(&tv, NULL); \
LOCK(&iosstat->lock); \
{ \
if (iosstat->thru_counters[type].throughput <= throughput) { \
iosstat->thru_counters[type].throughput = throughput; \
- gettimeofday(&iosstat->thru_counters[type].time, NULL); \
+ memcpy(&iosstat->thru_counters[type].time, &tv, \
+ sizeof(struct timeval)); \
flag = 1; \
} \
} \
@@ -663,10 +676,7 @@ ios_dump_throughput_stats(struct ios_stat_head *list_head, xlator_t *this,
FILE *logfp, ios_stats_thru_t type)
{
struct ios_stat_list *entry = NULL;
- struct timeval time = {
- 0,
- };
- char timestr[256] = {
+ char timestr[GF_TIMESTR_SIZE] = {
0,
};
@@ -674,12 +684,9 @@ ios_dump_throughput_stats(struct ios_stat_head *list_head, xlator_t *this,
{
list_for_each_entry(entry, &list_head->iosstats->list, list)
{
- gf_time_fmt(timestr, sizeof timestr,
- entry->iosstat->thru_counters[type].time.tv_sec,
- gf_timefmt_FT);
- snprintf(timestr + strlen(timestr),
- sizeof timestr - strlen(timestr), ".%" GF_PRI_SUSECONDS,
- time.tv_usec);
+ gf_time_fmt_tv(timestr, sizeof timestr,
+ &entry->iosstat->thru_counters[type].time,
+ gf_timefmt_FT);
ios_log(this, logfp, "%s \t %-10.2f \t %s", timestr, entry->value,
entry->iosstat->filename);
@@ -758,9 +765,8 @@ err:
int
io_stats_dump_global_to_json_logfp(xlator_t *this,
- struct ios_global_stats *stats,
- struct timeval *now, int interval,
- FILE *logfp)
+ struct ios_global_stats *stats, time_t now,
+ int interval, FILE *logfp)
{
int i = 0;
int j = 0;
@@ -786,10 +792,7 @@ io_stats_dump_global_to_json_logfp(xlator_t *this,
};
dict_t *xattr = NULL;
- interval_sec = ((now->tv_sec * 1000000.0 + now->tv_usec) -
- (stats->started_at.tv_sec * 1000000.0 +
- stats->started_at.tv_usec)) /
- 1000000.0;
+ interval_sec = (double)(now - stats->started_at);
conf = this->private;
@@ -819,19 +822,18 @@ io_stats_dump_global_to_json_logfp(xlator_t *this,
}
if (interval == -1) {
- ios_log(this, logfp, "\"%s.%s.read_%d%s\": \"%" GF_PRI_ATOMIC "\",",
+ ios_log(this, logfp, "\"%s.%s.read_%d%s\": %" GF_PRI_ATOMIC ",",
key_prefix, str_prefix, rw_size, rw_unit,
GF_ATOMIC_GET(stats->block_count_read[i]));
- ios_log(this, logfp,
- "\"%s.%s.write_%d%s\": \"%" GF_PRI_ATOMIC "\",", key_prefix,
- str_prefix, rw_size, rw_unit,
+ ios_log(this, logfp, "\"%s.%s.write_%d%s\": %" GF_PRI_ATOMIC ",",
+ key_prefix, str_prefix, rw_size, rw_unit,
GF_ATOMIC_GET(stats->block_count_write[i]));
} else {
- ios_log(this, logfp, "\"%s.%s.read_%d%s_per_sec\": \"%0.2lf\",",
+ ios_log(this, logfp, "\"%s.%s.read_%d%s_per_sec\": %0.2lf,",
key_prefix, str_prefix, rw_size, rw_unit,
(double)(GF_ATOMIC_GET(stats->block_count_read[i]) /
interval_sec));
- ios_log(this, logfp, "\"%s.%s.write_%d%s_per_sec\": \"%0.2lf\",",
+ ios_log(this, logfp, "\"%s.%s.write_%d%s_per_sec\": %0.2lf,",
key_prefix, str_prefix, rw_size, rw_unit,
(double)(GF_ATOMIC_GET(stats->block_count_write[i]) /
interval_sec));
@@ -839,9 +841,9 @@ io_stats_dump_global_to_json_logfp(xlator_t *this,
}
if (interval == -1) {
- ios_log(this, logfp, "\"%s.%s.fds.open_count\": \"%" PRId64 "\",",
+ ios_log(this, logfp, "\"%s.%s.fds.open_count\": %" PRId64 ",",
key_prefix, str_prefix, conf->cumulative.nr_opens);
- ios_log(this, logfp, "\"%s.%s.fds.max_open_count\": \"%" PRId64 "\",",
+ ios_log(this, logfp, "\"%s.%s.fds.max_open_count\": %" PRId64 ",",
key_prefix, str_prefix, conf->cumulative.max_nr_opens);
}
@@ -863,20 +865,19 @@ io_stats_dump_global_to_json_logfp(xlator_t *this,
}
}
if (interval == -1) {
- ios_log(this, logfp,
- "\"%s.%s.fop.%s.count\": \"%" GF_PRI_ATOMIC "\",",
+ ios_log(this, logfp, "\"%s.%s.fop.%s.count\": %" GF_PRI_ATOMIC ",",
key_prefix, str_prefix, lc_fop_name, fop_hits);
} else {
- ios_log(this, logfp, "\"%s.%s.fop.%s.per_sec\": \"%0.2lf\",",
+ ios_log(this, logfp, "\"%s.%s.fop.%s.per_sec\": %0.2lf,",
key_prefix, str_prefix, lc_fop_name,
(double)(fop_hits / interval_sec));
}
- ios_log(this, logfp, "\"%s.%s.fop.%s.latency_ave_usec\": \"%0.2lf\",",
+ ios_log(this, logfp, "\"%s.%s.fop.%s.latency_ave_usec\": %0.2lf,",
key_prefix, str_prefix, lc_fop_name, fop_lat_ave);
- ios_log(this, logfp, "\"%s.%s.fop.%s.latency_min_usec\": \"%0.2lf\",",
+ ios_log(this, logfp, "\"%s.%s.fop.%s.latency_min_usec\": %0.2lf,",
key_prefix, str_prefix, lc_fop_name, fop_lat_min);
- ios_log(this, logfp, "\"%s.%s.fop.%s.latency_max_usec\": \"%0.2lf\",",
+ ios_log(this, logfp, "\"%s.%s.fop.%s.latency_max_usec\": %0.2lf,",
key_prefix, str_prefix, lc_fop_name, fop_lat_max);
fop_ave_usec_sum += fop_lat_ave;
@@ -891,18 +892,17 @@ io_stats_dump_global_to_json_logfp(xlator_t *this,
*/
ios_log(this, logfp,
"\"%s.%s.fop.weighted_latency_ave_usec_nozerofill\": "
- "\"%0.4lf\",",
+ "%0.4lf,",
key_prefix, str_prefix, weighted_fop_ave_usec);
}
- ios_log(this, logfp, "\"%s.%s.fop.weighted_latency_ave_usec\": \"%0.4lf\",",
+ ios_log(this, logfp, "\"%s.%s.fop.weighted_latency_ave_usec\": %0.4lf,",
key_prefix, str_prefix, weighted_fop_ave_usec);
- ios_log(this, logfp, "\"%s.%s.fop.weighted_fop_count\": \"%ld\",",
- key_prefix, str_prefix, total_fop_hits);
+ ios_log(this, logfp, "\"%s.%s.fop.weighted_fop_count\": %ld,", key_prefix,
+ str_prefix, total_fop_hits);
fop_ave_usec = fop_ave_usec_sum / GF_FOP_MAXVALUE;
- ios_log(this, logfp,
- "\"%s.%s.fop.unweighted_latency_ave_usec\":\"%0.4lf\",", key_prefix,
- str_prefix, fop_ave_usec);
+ ios_log(this, logfp, "\"%s.%s.fop.unweighted_latency_ave_usec\":%0.4lf,",
+ key_prefix, str_prefix, fop_ave_usec);
for (i = 0; i < GF_UPCALL_FLAGS_MAXVALUE; i++) {
lc_fop_name = strdupa(gf_upcall_list[i]);
@@ -911,11 +911,10 @@ io_stats_dump_global_to_json_logfp(xlator_t *this,
}
fop_hits = GF_ATOMIC_GET(stats->upcall_hits[i]);
if (interval == -1) {
- ios_log(this, logfp,
- "\"%s.%s.fop.%s.count\": \"%" GF_PRI_ATOMIC "\",",
+ ios_log(this, logfp, "\"%s.%s.fop.%s.count\": %" GF_PRI_ATOMIC ",",
key_prefix, str_prefix, lc_fop_name, fop_hits);
} else {
- ios_log(this, logfp, "\"%s.%s.fop.%s.per_sec\": \"%0.2lf\",",
+ ios_log(this, logfp, "\"%s.%s.fop.%s.per_sec\": %0.2lf,",
key_prefix, str_prefix, lc_fop_name,
(double)(fop_hits / interval_sec));
}
@@ -932,7 +931,7 @@ io_stats_dump_global_to_json_logfp(xlator_t *this,
dict_foreach_inline(xattr, curr)
{
- ios_log(this, logfp, "\"%s.%s.%s.queue_size\": \"%d\",", key_prefix,
+ ios_log(this, logfp, "\"%s.%s.%s.queue_size\": %d,", key_prefix,
str_prefix, curr->key, data_to_int32(curr->value));
}
@@ -945,23 +944,23 @@ io_stats_dump_global_to_json_logfp(xlator_t *this,
}
if (interval == -1) {
- ios_log(this, logfp, "\"%s.%s.uptime\": \"%" PRId64 "\",", key_prefix,
- str_prefix, (uint64_t)(now->tv_sec - stats->started_at.tv_sec));
+ ios_log(this, logfp, "\"%s.%s.uptime\": %" PRIu64 ",", key_prefix,
+ str_prefix, (uint64_t)(now - stats->started_at));
ios_log(this, logfp,
- "\"%s.%s.bytes_read\": \""
- "%" GF_PRI_ATOMIC "\",",
+ "\"%s.%s.bytes_read\": "
+ "%" GF_PRI_ATOMIC ",",
key_prefix, str_prefix, GF_ATOMIC_GET(stats->data_read));
ios_log(this, logfp,
- "\"%s.%s.bytes_written\": \""
- "%" GF_PRI_ATOMIC "\"",
+ "\"%s.%s.bytes_written\": "
+ "%" GF_PRI_ATOMIC "",
key_prefix, str_prefix, GF_ATOMIC_GET(stats->data_written));
} else {
- ios_log(this, logfp, "\"%s.%s.sample_interval_sec\": \"%0.2lf\",",
+ ios_log(this, logfp, "\"%s.%s.sample_interval_sec\": %0.2lf,",
key_prefix, str_prefix, interval_sec);
- ios_log(this, logfp, "\"%s.%s.bytes_read_per_sec\": \"%0.2lf\",",
+ ios_log(this, logfp, "\"%s.%s.bytes_read_per_sec\": %0.2lf,",
key_prefix, str_prefix,
(double)(GF_ATOMIC_GET(stats->data_read) / interval_sec));
- ios_log(this, logfp, "\"%s.%s.bytes_written_per_sec\": \"%0.2lf\"",
+ ios_log(this, logfp, "\"%s.%s.bytes_written_per_sec\": %0.2lf",
key_prefix, str_prefix,
(double)(GF_ATOMIC_GET(stats->data_written) / interval_sec));
}
@@ -1198,14 +1197,14 @@ out:
int
io_stats_dump_global_to_logfp(xlator_t *this, struct ios_global_stats *stats,
- struct timeval *now, int interval, FILE *logfp)
+ time_t now, int interval, FILE *logfp)
{
int i = 0;
int per_line = 0;
int index = 0;
struct ios_stat_head *list_head = NULL;
struct ios_conf *conf = NULL;
- char timestr[256] = {
+ char timestr[GF_TIMESTR_SIZE] = {
0,
};
char str_header[128] = {0};
@@ -1221,8 +1220,8 @@ io_stats_dump_global_to_logfp(xlator_t *this, struct ios_global_stats *stats,
ios_log(this, logfp, "\n=== Cumulative stats ===");
else
ios_log(this, logfp, "\n=== Interval %d stats ===", interval);
- ios_log(this, logfp, " Duration : %" PRId64 " secs",
- (uint64_t)(now->tv_sec - stats->started_at.tv_sec));
+ ios_log(this, logfp, " Duration : %" PRIu64 " secs",
+ (uint64_t)(now - stats->started_at));
ios_log(this, logfp, " BytesRead : %" GF_PRI_ATOMIC,
GF_ATOMIC_GET(stats->data_read));
ios_log(this, logfp, " BytesWritten : %" GF_PRI_ATOMIC "\n",
@@ -1314,11 +1313,8 @@ io_stats_dump_global_to_logfp(xlator_t *this, struct ios_global_stats *stats,
if (interval == -1) {
LOCK(&conf->lock);
{
- gf_time_fmt(timestr, sizeof timestr,
- conf->cumulative.max_openfd_time.tv_sec, gf_timefmt_FT);
- snprintf(timestr + strlen(timestr),
- sizeof timestr - strlen(timestr), ".%" GF_PRI_SUSECONDS,
- conf->cumulative.max_openfd_time.tv_usec);
+ gf_time_fmt_tv(timestr, sizeof timestr,
+ &conf->cumulative.max_openfd_time, gf_timefmt_FT);
ios_log(this, logfp,
"Current open fd's: %" PRId64 " Max open fd's: %" PRId64
" time %s",
@@ -1370,10 +1366,10 @@ io_stats_dump_global_to_logfp(xlator_t *this, struct ios_global_stats *stats,
int
io_stats_dump_global_to_dict(xlator_t *this, struct ios_global_stats *stats,
- struct timeval *now, int interval, dict_t *dict)
+ time_t now, int interval, dict_t *dict)
{
int ret = 0;
- char key[256] = {0};
+ char key[64] = {0};
uint64_t sec = 0;
int i = 0;
uint64_t count = 0;
@@ -1396,7 +1392,7 @@ io_stats_dump_global_to_dict(xlator_t *this, struct ios_global_stats *stats,
interval);
snprintf(key, sizeof(key), "%d-duration", interval);
- sec = (uint64_t)(now->tv_sec - stats->started_at.tv_sec);
+ sec = now - stats->started_at;
ret = dict_set_uint64(dict, key, sec);
if (ret) {
gf_log(this->name, GF_LOG_ERROR,
@@ -1519,9 +1515,8 @@ out:
}
int
-io_stats_dump_global(xlator_t *this, struct ios_global_stats *stats,
- struct timeval *now, int interval,
- struct ios_dump_args *args)
+io_stats_dump_global(xlator_t *this, struct ios_global_stats *stats, time_t now,
+ int interval, struct ios_dump_args *args)
{
int ret = -1;
@@ -1579,24 +1574,24 @@ ios_dump_args_init(struct ios_dump_args *args, ios_dump_type_t type,
}
static void
-ios_global_stats_clear(struct ios_global_stats *stats, struct timeval *now)
+ios_global_stats_clear(struct ios_global_stats *stats, time_t now)
{
GF_ASSERT(stats);
GF_ASSERT(now);
memset(stats, 0, sizeof(*stats));
- stats->started_at = *now;
+ stats->started_at = now;
}
int
-io_stats_dump(xlator_t *this, struct ios_dump_args *args, gf1_cli_info_op op,
+io_stats_dump(xlator_t *this, struct ios_dump_args *args, ios_info_op_t op,
gf_boolean_t is_peek)
{
struct ios_conf *conf = NULL;
struct ios_global_stats cumulative = {};
struct ios_global_stats incremental = {};
int increment = 0;
- struct timeval now;
+ time_t now = 0;
GF_ASSERT(this);
GF_ASSERT(args);
@@ -1604,31 +1599,31 @@ io_stats_dump(xlator_t *this, struct ios_dump_args *args, gf1_cli_info_op op,
GF_ASSERT(args->type < IOS_DUMP_TYPE_MAX);
conf = this->private;
+ now = gf_time();
- gettimeofday(&now, NULL);
LOCK(&conf->lock);
{
- if (op == GF_CLI_INFO_ALL || op == GF_CLI_INFO_CUMULATIVE)
+ if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_CUMULATIVE)
cumulative = conf->cumulative;
- if (op == GF_CLI_INFO_ALL || op == GF_CLI_INFO_INCREMENTAL) {
+ if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_INCREMENTAL) {
incremental = conf->incremental;
increment = conf->increment;
if (!is_peek) {
increment = conf->increment++;
- ios_global_stats_clear(&conf->incremental, &now);
+ ios_global_stats_clear(&conf->incremental, now);
}
}
}
UNLOCK(&conf->lock);
- if (op == GF_CLI_INFO_ALL || op == GF_CLI_INFO_CUMULATIVE)
- io_stats_dump_global(this, &cumulative, &now, -1, args);
+ if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_CUMULATIVE)
+ io_stats_dump_global(this, &cumulative, now, -1, args);
- if (op == GF_CLI_INFO_ALL || op == GF_CLI_INFO_INCREMENTAL)
- io_stats_dump_global(this, &incremental, &now, increment, args);
+ if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_INCREMENTAL)
+ io_stats_dump_global(this, &incremental, now, increment, args);
return 0;
}
@@ -1638,9 +1633,8 @@ io_stats_dump_fd(xlator_t *this, struct ios_fd *iosfd)
{
struct ios_conf *conf = NULL;
struct timeval now;
- uint64_t sec = 0;
- uint64_t usec = 0;
int i = 0;
+ double usecs = 0;
uint64_t data_read = 0;
uint64_t data_written = 0;
uint64_t block_count_read = 0;
@@ -1655,23 +1649,15 @@ io_stats_dump_fd(xlator_t *this, struct ios_fd *iosfd)
return 0;
gettimeofday(&now, NULL);
-
- if (iosfd->opened_at.tv_usec > now.tv_usec) {
- now.tv_usec += 1000000;
- now.tv_usec--;
- }
-
- sec = now.tv_sec - iosfd->opened_at.tv_sec;
- usec = now.tv_usec - iosfd->opened_at.tv_usec;
+ usecs = gf_tvdiff(&iosfd->opened_at, &now);
gf_log(this->name, GF_LOG_INFO, "--- fd stats ---");
if (iosfd->filename)
gf_log(this->name, GF_LOG_INFO, " Filename : %s", iosfd->filename);
- if (sec)
- gf_log(this->name, GF_LOG_INFO,
- " Lifetime : %" PRId64 "secs, %" PRId64 "usecs", sec, usec);
+ if (usecs)
+ gf_log(this->name, GF_LOG_INFO, " Lifetime : %lf secs", usecs);
data_read = GF_ATOMIC_GET(iosfd->data_read);
if (data_read)
@@ -1774,9 +1760,7 @@ update_ios_latency(struct ios_conf *conf, call_frame_t *frame,
begin = &frame->begin;
end = &frame->end;
- elapsed = ((end->tv_sec - begin->tv_sec) * 1e9 +
- (end->tv_nsec - begin->tv_nsec)) /
- 1000;
+ elapsed = gf_tsdiff(begin, end) / 1000.0;
update_ios_latency_stats(&conf->cumulative, elapsed, op);
update_ios_latency_stats(&conf->incremental, elapsed, op);
@@ -1791,12 +1775,13 @@ io_stats_dump_stats_to_dict(xlator_t *this, dict_t *resp,
{
struct ios_conf *conf = NULL;
int cnt = 0;
- char key[256];
+ char key[32];
+ int keylen;
struct ios_stat_head *list_head = NULL;
struct ios_stat_list *entry = NULL;
int ret = -1;
ios_stats_thru_t index = IOS_STATS_THRU_MAX;
- char timestr[256] = {
+ char timestr[GF_TIMESTR_SIZE] = {
0,
};
char *dict_timestr = NULL;
@@ -1815,14 +1800,9 @@ io_stats_dump_stats_to_dict(xlator_t *this, dict_t *resp,
ret = dict_set_uint64(resp, "max-open",
conf->cumulative.max_nr_opens);
- gf_time_fmt(timestr, sizeof timestr,
- conf->cumulative.max_openfd_time.tv_sec,
- gf_timefmt_FT);
- if (conf->cumulative.max_openfd_time.tv_sec)
- snprintf(timestr + strlen(timestr),
- sizeof timestr - strlen(timestr),
- ".%" GF_PRI_SUSECONDS,
- conf->cumulative.max_openfd_time.tv_usec);
+ gf_time_fmt_tv(timestr, sizeof timestr,
+ &conf->cumulative.max_openfd_time,
+ gf_timefmt_FT);
dict_timestr = gf_strdup(timestr);
if (!dict_timestr)
@@ -1862,7 +1842,7 @@ io_stats_dump_stats_to_dict(xlator_t *this, dict_t *resp,
default:
goto out;
}
- ret = dict_set_int32(resp, "top-op", flags);
+ ret = dict_set_int32_sizen(resp, "top-op", flags);
if (!list_cnt)
goto out;
LOCK(&list_head->lock);
@@ -1870,24 +1850,24 @@ io_stats_dump_stats_to_dict(xlator_t *this, dict_t *resp,
list_for_each_entry(entry, &list_head->iosstats->list, list)
{
cnt++;
- snprintf(key, 256, "%s-%d", "filename", cnt);
- ret = dict_set_str(resp, key, entry->iosstat->filename);
+ keylen = snprintf(key, sizeof(key), "filename-%d", cnt);
+ ret = dict_set_strn(resp, key, keylen, entry->iosstat->filename);
if (ret)
goto unlock_list_head;
- snprintf(key, 256, "%s-%d", "value", cnt);
+ snprintf(key, sizeof(key), "value-%d", cnt);
ret = dict_set_uint64(resp, key, entry->value);
if (ret)
goto unlock_list_head;
if (index != IOS_STATS_THRU_MAX) {
- snprintf(key, 256, "%s-%d", "time-sec", cnt);
- ret = dict_set_int32(
- resp, key,
+ keylen = snprintf(key, sizeof(key), "time-sec-%d", cnt);
+ ret = dict_set_int32n(
+ resp, key, keylen,
entry->iosstat->thru_counters[index].time.tv_sec);
if (ret)
goto unlock_list_head;
- snprintf(key, 256, "%s-%d", "time-usec", cnt);
- ret = dict_set_int32(
- resp, key,
+ keylen = snprintf(key, sizeof(key), "time-usec-%d", cnt);
+ ret = dict_set_int32n(
+ resp, key, keylen,
entry->iosstat->thru_counters[index].time.tv_usec);
if (ret)
goto unlock_list_head;
@@ -1902,7 +1882,7 @@ unlock_list_head:
* failed. */
if (ret)
goto out;
- ret = dict_set_int32(resp, "members", cnt);
+ ret = dict_set_int32_sizen(resp, "members", cnt);
out:
return ret;
}
@@ -2118,6 +2098,19 @@ io_stats_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
int
+io_stats_copy_file_range_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *stbuf, struct iatt *prebuf_dst,
+ struct iatt *postbuf_dst, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, COPY_FILE_RANGE);
+
+ STACK_UNWIND_STRICT(copy_file_range, frame, op_ret, op_errno, stbuf,
+ prebuf_dst, postbuf_dst, xdata);
+ return 0;
+}
+
+int
io_stats_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
dict_t *xdata)
@@ -2872,6 +2865,19 @@ io_stats_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
}
int
+io_stats_copy_file_range(call_frame_t *frame, xlator_t *this, fd_t *fd_in,
+ off_t off_in, fd_t *fd_out, off_t off_out, size_t len,
+ uint32_t flags, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
+
+ STACK_WIND(frame, io_stats_copy_file_range_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->copy_file_range, fd_in, off_in, fd_out,
+ off_out, len, flags, xdata);
+ return 0;
+}
+
+int
io_stats_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
START_FOP_LATENCY(frame);
@@ -3004,7 +3010,7 @@ conditional_dump(dict_t *dict, char *key, data_t *value, void *data)
} else {
(void)ios_dump_args_init(&args, IOS_DUMP_TYPE_FILE, logfp);
}
- io_stats_dump(this, &args, GF_CLI_INFO_ALL, _gf_false);
+ io_stats_dump(this, &args, GF_IOS_INFO_ALL, _gf_false);
fclose(logfp);
return 0;
}
@@ -3107,7 +3113,7 @@ _ios_dump_thread(xlator_t *this)
stats_logfp = fopen(stats_filename, "w+");
if (stats_logfp) {
(void)ios_dump_args_init(&args, conf->dump_format, stats_logfp);
- io_stats_dump(this, &args, GF_CLI_INFO_ALL, _gf_false);
+ io_stats_dump(this, &args, GF_IOS_INFO_ALL, _gf_false);
fclose(stats_logfp);
log_stats_fopen_failure = _gf_true;
} else if (log_stats_fopen_failure) {
@@ -3445,12 +3451,13 @@ io_stats_release(xlator_t *this, fd_t *fd)
BUMP_FOP(RELEASE);
conf = this->private;
-
- LOCK(&conf->lock);
- {
- conf->cumulative.nr_opens--;
+ if (conf) {
+ LOCK(&conf->lock);
+ {
+ conf->cumulative.nr_opens--;
+ }
+ UNLOCK(&conf->lock);
}
- UNLOCK(&conf->lock);
ios_fd_ctx_get(fd, this, &iosfd);
if (iosfd) {
@@ -3567,26 +3574,21 @@ ios_destroy_top_stats(struct ios_conf *conf)
return;
}
-static int
+static void
io_stats_clear(struct ios_conf *conf)
{
- struct timeval now;
- int ret = -1;
+ time_t now = 0;
GF_ASSERT(conf);
+ now = gf_time();
- if (!gettimeofday(&now, NULL)) {
- LOCK(&conf->lock);
- {
- ios_global_stats_clear(&conf->cumulative, &now);
- ios_global_stats_clear(&conf->incremental, &now);
- conf->increment = 0;
- }
- UNLOCK(&conf->lock);
- ret = 0;
+ LOCK(&conf->lock);
+ {
+ ios_global_stats_clear(&conf->cumulative, now);
+ ios_global_stats_clear(&conf->incremental, now);
+ conf->increment = 0;
}
-
- return ret;
+ UNLOCK(&conf->lock);
}
int32_t
@@ -3652,18 +3654,51 @@ io_priv(xlator_t *this)
}
static void
-ios_set_log_format_code(struct ios_conf *conf)
+ios_set_log_format_code(struct ios_conf *conf, char *dump_format_str)
{
- if (strcmp(conf->dump_format_str, "json") == 0)
+ if (strcmp(dump_format_str, "json") == 0)
conf->dump_format = IOS_DUMP_TYPE_JSON_FILE;
- else if (strcmp(conf->dump_format_str, "text") == 0)
+ else if (strcmp(dump_format_str, "text") == 0)
conf->dump_format = IOS_DUMP_TYPE_FILE;
- else if (strcmp(conf->dump_format_str, "dict") == 0)
+ else if (strcmp(dump_format_str, "dict") == 0)
conf->dump_format = IOS_DUMP_TYPE_DICT;
- else if (strcmp(conf->dump_format_str, "samples") == 0)
+ else if (strcmp(dump_format_str, "samples") == 0)
conf->dump_format = IOS_DUMP_TYPE_SAMPLES;
}
+void
+xlator_set_loglevel(xlator_t *this, int log_level)
+{
+ glusterfs_ctx_t *ctx = NULL;
+ glusterfs_graph_t *active = NULL;
+ xlator_t *top = NULL;
+ xlator_t *trav = this;
+
+ ctx = this->ctx;
+ GF_ASSERT(ctx);
+ active = ctx->active;
+ top = active->first;
+
+ if (log_level == -1)
+ return;
+
+ if (ctx->cmd_args.brick_mux) {
+ /* Set log-level for all brick xlators */
+ top->loglevel = log_level;
+
+ /* Set log-level for parent xlator */
+ if (this->parents)
+ this->parents->xlator->loglevel = log_level;
+
+ while (trav) {
+ trav->loglevel = log_level;
+ trav = trav->next;
+ }
+ } else {
+ gf_log_set_loglevel(this->ctx, log_level);
+ }
+}
+
int
reconfigure(xlator_t *this, dict_t *options)
{
@@ -3672,6 +3707,7 @@ reconfigure(xlator_t *this, dict_t *options)
char *sys_log_str = NULL;
char *log_format_str = NULL;
char *logger_str = NULL;
+ char *dump_format_str = NULL;
int sys_log_level = -1;
char *log_str = NULL;
int log_level = -1;
@@ -3680,6 +3716,7 @@ reconfigure(xlator_t *this, dict_t *options)
uint32_t log_buf_size = 0;
uint32_t log_flush_timeout = 0;
int32_t old_dump_interval;
+ int32_t threads;
if (!this || !this->private)
goto out;
@@ -3716,9 +3753,8 @@ reconfigure(xlator_t *this, dict_t *options)
GF_OPTION_RECONF("ios-sample-interval", conf->ios_sample_interval, options,
int32, out);
- GF_OPTION_RECONF("ios-dump-format", conf->dump_format_str, options, str,
- out);
- ios_set_log_format_code(conf);
+ GF_OPTION_RECONF("ios-dump-format", dump_format_str, options, str, out);
+ ios_set_log_format_code(conf, dump_format_str);
GF_OPTION_RECONF("ios-sample-buf-size", conf->ios_sample_buf_size, options,
int32, out);
GF_OPTION_RECONF("sys-log-level", sys_log_str, options, str, out);
@@ -3730,7 +3766,8 @@ reconfigure(xlator_t *this, dict_t *options)
GF_OPTION_RECONF("log-level", log_str, options, str, out);
if (log_str) {
log_level = glusterd_check_log_level(log_str);
- gf_log_set_loglevel(this->ctx, log_level);
+ /* Set loglevel for all children and server xlators */
+ xlator_set_loglevel(this, log_level);
}
GF_OPTION_RECONF("logger", logger_str, options, str, out);
@@ -3752,6 +3789,9 @@ reconfigure(xlator_t *this, dict_t *options)
out);
gf_log_set_log_flush_timeout(log_flush_timeout);
+ GF_OPTION_RECONF("threads", threads, options, int32, out);
+ gf_async_adjust_threads(threads);
+
ret = 0;
out:
gf_log(this ? this->name : "io-stats", GF_LOG_DEBUG,
@@ -3789,7 +3829,7 @@ ios_conf_destroy(struct ios_conf *conf)
_ios_destroy_dump_thread(conf);
ios_destroy_sample_buf(conf->ios_sample_buf);
LOCK_DESTROY(&conf->lock);
- GF_FREE(conf->dnscache);
+ gf_dnscache_deinit(conf->dnscache);
GF_FREE(conf);
}
@@ -3812,16 +3852,18 @@ ios_init_stats(struct ios_global_stats *stats)
for (i = 0; i < GF_UPCALL_FLAGS_MAXVALUE; i++)
GF_ATOMIC_INIT(stats->upcall_hits[i], 0);
- gettimeofday(&stats->started_at, NULL);
+ stats->started_at = gf_time();
}
int
init(xlator_t *this)
{
struct ios_conf *conf = NULL;
+ char *volume_id = NULL;
char *sys_log_str = NULL;
char *logger_str = NULL;
char *log_format_str = NULL;
+ char *dump_format_str = NULL;
int logger = -1;
int log_format = -1;
int sys_log_level = -1;
@@ -3830,6 +3872,7 @@ init(xlator_t *this)
int ret = -1;
uint32_t log_buf_size = 0;
uint32_t log_flush_timeout = 0;
+ int32_t threads;
if (!this)
return -1;
@@ -3857,6 +3900,11 @@ init(xlator_t *this)
conf->unique_id = this->name;
}
+ ret = dict_get_strn(this->options, "volume-id", SLEN("volume-id"),
+ &volume_id);
+ if (!ret) {
+ strncpy(this->graph->volume_id, volume_id, GF_UUID_BUF_SIZE);
+ }
/*
* Init it just after calloc, so that we are sure the lock is inited
* in case of error paths.
@@ -3882,8 +3930,8 @@ init(xlator_t *this)
GF_OPTION_INIT("ios-sample-interval", conf->ios_sample_interval, int32,
out);
- GF_OPTION_INIT("ios-dump-format", conf->dump_format_str, str, out);
- ios_set_log_format_code(conf);
+ GF_OPTION_INIT("ios-dump-format", dump_format_str, str, out);
+ ios_set_log_format_code(conf, dump_format_str);
GF_OPTION_INIT("ios-sample-buf-size", conf->ios_sample_buf_size, int32,
out);
@@ -3897,6 +3945,10 @@ init(xlator_t *this)
GF_OPTION_INIT("ios-dnscache-ttl-sec", conf->ios_dnscache_ttl_sec, int32,
out);
conf->dnscache = gf_dnscache_init(conf->ios_dnscache_ttl_sec);
+ if (!conf->dnscache) {
+ ret = -1;
+ goto out;
+ }
GF_OPTION_INIT("sys-log-level", sys_log_str, str, out);
if (sys_log_str) {
@@ -3929,6 +3981,9 @@ init(xlator_t *this)
GF_OPTION_INIT("log-flush-timeout", log_flush_timeout, time, out);
gf_log_set_log_flush_timeout(log_flush_timeout);
+ GF_OPTION_INIT("threads", threads, int32, out);
+ gf_async_adjust_threads(threads);
+
this->private = conf;
if (conf->ios_dump_interval > 0) {
conf->dump_thread_running = _gf_true;
@@ -4033,8 +4088,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
}
} else {
ret = dict_get_int32(dict, "info-op", &op);
- if (ret || op < GF_CLI_INFO_ALL || GF_CLI_INFO_CLEAR < op)
- op = GF_CLI_INFO_ALL;
+ if (ret || op < GF_IOS_INFO_ALL || GF_IOS_INFO_CLEAR < op)
+ op = GF_IOS_INFO_ALL;
ret = dict_set_int32(output, "info-op", op);
if (ret) {
@@ -4043,13 +4098,10 @@ notify(xlator_t *this, int32_t event, void *data, ...)
goto out;
}
- if (GF_CLI_INFO_CLEAR == op) {
- ret = io_stats_clear(this->private);
- if (ret)
- gf_log(this->name, GF_LOG_ERROR,
- "Failed to clear info stats");
+ if (GF_IOS_INFO_CLEAR == op) {
+ io_stats_clear(this->private);
- ret = dict_set_int32(output, "stats-cleared", ret ? 0 : 1);
+ ret = dict_set_int32(output, "stats-cleared", 1);
if (ret)
gf_log(this->name, GF_LOG_ERROR,
"Failed to set stats-cleared"
@@ -4157,6 +4209,7 @@ struct xlator_fops fops = {
.getactivelk = io_stats_getactivelk,
.setactivelk = io_stats_setactivelk,
.compound = io_stats_compound,
+ .copy_file_range = io_stats_copy_file_range,
};
struct xlator_cbks cbks = {
@@ -4371,6 +4424,57 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_STR,
.default_value = "/no/such/path",
.description = "Unique ID for our files."},
+ {.key = {"global-threading"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {GD_OP_VERSION_6_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {"io-stats", "threading"},
+ .description = "This option enables the global threading support for "
+ "bricks. If enabled, it's recommended to also enable "
+ "'performance.iot-pass-through'"},
+ {.key = {"threads"}, .type = GF_OPTION_TYPE_INT},
+ {.key = {"brick-threads"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "16",
+ .min = 0,
+ .max = GF_ASYNC_MAX_THREADS,
+ .op_version = {GD_OP_VERSION_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats", "threading"},
+ .description = "When global threading is used, this value determines the "
+ "maximum amount of threads that can be created on bricks"},
+ {.key = {"client-threads"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "16",
+ .min = 0,
+ .max = GF_ASYNC_MAX_THREADS,
+ .op_version = {GD_OP_VERSION_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+ .tags = {"io-stats", "threading"},
+ .description = "When global threading is used, this value determines the "
+ "maximum amount of threads that can be created on clients"},
+ {.key = {"volume-id"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {GD_OP_VERSION_7_1},
+ .tags = {"global", "volume-id"},
+ .description =
+ "This option points to the 'unique' UUID particular to this "
+ "volume, which would be set in 'graph->volume_id'"},
{.key = {NULL}},
+};
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "io-stats",
+ .category = GF_MAINTAINED,
};
diff --git a/xlators/debug/sink/src/sink.c b/xlators/debug/sink/src/sink.c
index fbbdd3a4847..9822bbb732e 100644
--- a/xlators/debug/sink/src/sink.c
+++ b/xlators/debug/sink/src/sink.c
@@ -8,8 +8,8 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
int32_t
init(xlator_t *this)
@@ -80,3 +80,15 @@ struct xlator_cbks cbks = {};
struct volume_options options[] = {
{.key = {NULL}},
};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .op_version = {GD_OP_VERSION_3_12_0},
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "sink",
+ .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/debug/trace/src/trace-mem-types.h b/xlators/debug/trace/src/trace-mem-types.h
index cf05a77b9f1..18a7e0414a6 100644
--- a/xlators/debug/trace/src/trace-mem-types.h
+++ b/xlators/debug/trace/src/trace-mem-types.h
@@ -11,7 +11,7 @@
#ifndef __TRACE_MEM_TYPES_H__
#define __TRACE_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_trace_mem_types_ {
gf_trace_mt_trace_conf_t = gf_common_mt_end + 1,
diff --git a/xlators/debug/trace/src/trace.c b/xlators/debug/trace/src/trace.c
index a02288ceb41..6ed0ca00342 100644
--- a/xlators/debug/trace/src/trace.c
+++ b/xlators/debug/trace/src/trace.c
@@ -22,13 +22,13 @@
static void
trace_stat_to_str(struct iatt *buf, char *str, size_t len)
{
- char atime_buf[200] = {
+ char atime_buf[GF_TIMESTR_SIZE] = {
0,
};
- char mtime_buf[200] = {
+ char mtime_buf[GF_TIMESTR_SIZE] = {
0,
};
- char ctime_buf[200] = {
+ char ctime_buf[GF_TIMESTR_SIZE] = {
0,
};
@@ -64,7 +64,7 @@ trace_stat_to_str(struct iatt *buf, char *str, size_t len)
int
dump_history_trace(circular_buffer_t *cb, void *data)
{
- char timestr[256] = {
+ char timestr[GF_TIMESTR_SIZE] = {
0,
};
@@ -72,9 +72,7 @@ dump_history_trace(circular_buffer_t *cb, void *data)
gettimeofday () fails, it's safe to check tm and then dump the time
at which the entry was added to the buffer */
- gf_time_fmt(timestr, sizeof timestr, cb->tv.tv_sec, gf_timefmt_Ymd_T);
- snprintf(timestr + strlen(timestr), 256 - strlen(timestr),
- ".%" GF_PRI_SUSECONDS, cb->tv.tv_usec);
+ gf_time_fmt_tv(timestr, sizeof timestr, &cb->tv, gf_timefmt_Ymd_T);
gf_proc_dump_write("TIME", "%s", timestr);
gf_proc_dump_write("FOP", "%s\n", (char *)cb->data);
@@ -2209,10 +2207,10 @@ int
trace_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- char actime_str[256] = {
+ char actime_str[GF_TIMESTR_SIZE] = {
0,
};
- char modtime_str[256] = {
+ char modtime_str[GF_TIMESTR_SIZE] = {
0,
};
trace_conf_t *conf = NULL;
@@ -2278,10 +2276,10 @@ int
trace_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- char actime_str[256] = {
+ char actime_str[GF_TIMESTR_SIZE] = {
0,
};
- char modtime_str[256] = {
+ char modtime_str[GF_TIMESTR_SIZE] = {
0,
};
trace_conf_t *conf = NULL;
@@ -3311,7 +3309,7 @@ init(xlator_t *this)
char *forced_loglevel = NULL;
eh_t *history = NULL;
int ret = -1;
- size_t history_size = TRACE_DEFAULT_HISTORY_SIZE;
+ uint64_t history_size = TRACE_DEFAULT_HISTORY_SIZE;
trace_conf_t *conf = NULL;
if (!this)
@@ -3364,10 +3362,10 @@ init(xlator_t *this)
if (excludes)
process_call_list(excludes, 0);
- GF_OPTION_INIT("history-size", conf->history_size, size, out);
+ GF_OPTION_INIT("history-size", history_size, size, out);
+ conf->history_size = history_size;
- gf_log(this->name, GF_LOG_INFO, "history size %" GF_PRI_SIZET,
- history_size);
+ gf_log(this->name, GF_LOG_INFO, "history size %" PRIu64, history_size);
GF_OPTION_INIT("log-file", conf->log_file, bool, out);
@@ -3520,3 +3518,17 @@ struct volume_options options[] = {
};
struct xlator_dumpops dumpops = {.history = trace_dump_history};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1},
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "trace",
+ .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/debug/trace/src/trace.h b/xlators/debug/trace/src/trace.h
index 815647c05be..b16304799da 100644
--- a/xlators/debug/trace/src/trace.h
+++ b/xlators/debug/trace/src/trace.h
@@ -10,14 +10,14 @@
#include <time.h>
#include <errno.h>
-#include "glusterfs.h"
-#include "xlator.h"
-#include "common-utils.h"
-#include "event-history.h"
-#include "logging.h"
-#include "circ-buff.h"
-#include "statedump.h"
-#include "options.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/event-history.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/circ-buff.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/options.h>
#define TRACE_DEFAULT_HISTORY_SIZE 1024
@@ -34,7 +34,7 @@ trace_fop_name_t trace_fop_names[GF_FOP_MAXVALUE];
typedef struct {
gf_boolean_t log_file;
gf_boolean_t log_history;
- size_t history_size;
+ uint64_t history_size;
int trace_log_level;
} trace_conf_t;
diff --git a/xlators/encryption/Makefile.am b/xlators/encryption/Makefile.am
deleted file mode 100644
index 36efc6698bd..00000000000
--- a/xlators/encryption/Makefile.am
+++ /dev/null
@@ -1,3 +0,0 @@
-SUBDIRS = rot-13 crypt
-
-CLEANFILES =
diff --git a/xlators/encryption/crypt/Makefile.am b/xlators/encryption/crypt/Makefile.am
deleted file mode 100644
index d471a3f9243..00000000000
--- a/xlators/encryption/crypt/Makefile.am
+++ /dev/null
@@ -1,3 +0,0 @@
-SUBDIRS = src
-
-CLEANFILES =
diff --git a/xlators/encryption/crypt/src/Makefile.am b/xlators/encryption/crypt/src/Makefile.am
deleted file mode 100644
index 05fd3d5096b..00000000000
--- a/xlators/encryption/crypt/src/Makefile.am
+++ /dev/null
@@ -1,26 +0,0 @@
-if ENABLE_CRYPT_XLATOR
-
-xlator_LTLIBRARIES = crypt.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/encryption
-
-crypt_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
-
-crypt_la_SOURCES = keys.c data.c metadata.c atom.c crypt.c
-crypt_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
- -lssl -lcrypto
-
-noinst_HEADERS = crypt-common.h crypt-mem-types.h crypt.h metadata.h
-
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
- -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
-
-AM_CFLAGS = -Wall $(GF_CFLAGS)
-
-CLEANFILES =
-
-else
-
-noinst_DIST = keys.c data.c metadata.c atom.c crypt.c
-noinst_HEADERS = crypt-common.h crypt-mem-types.h crypt.h metadata.h
-
-endif
diff --git a/xlators/encryption/crypt/src/atom.c b/xlators/encryption/crypt/src/atom.c
deleted file mode 100644
index 8e9c4940abd..00000000000
--- a/xlators/encryption/crypt/src/atom.c
+++ /dev/null
@@ -1,861 +0,0 @@
-/*
- Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#include "defaults.h"
-#include "crypt-common.h"
-#include "crypt.h"
-
-/*
- * Glossary
- *
- *
- * cblock (or cipher block). A logical unit in a file.
- * cblock size is defined as the number of bits
- * in an input (or output) block of the block
- * cipher (*). Cipher block size is a property of
- * cipher algorithm. E.g. cblock size is 64 bits
- * for DES, 128 bits for AES, etc.
- *
- * atomic cipher A cipher algorithm, which requires some chunks of
- * algorithm text to be padded at left and(or) right sides before
- * cipher transaform.
- *
- *
- * block (atom) Minimal chunk of file's data, which doesn't require
- * padding. We'll consider logical units in a file of
- * block size (atom size).
- *
- * cipher algorithm Atomic cipher algorithm, which requires the last
- * with EOF issue incomplete cblock in a file to be padded with some
- * data (usually zeros).
- *
- *
- * operation, which reading/writing from offset, which is not aligned to
- * forms a gap at to atom size
- * the beginning
- *
- *
- * operation, which reading/writing count bytes starting from offset off,
- * forms a gap at so that off+count is not aligned to atom_size
- * the end
- *
- * head block the first atom affected by an operation, which forms
- * a gap at the beginning, or(and) at the end.
- * Сomment. Head block has at least one gap (either at
- * the beginning, or at the end)
- *
- *
- * tail block the last atom different from head, affected by an
- * operation, which forms a gap at the end.
- * Сomment: Tail block has exactly one gap (at the end).
- *
- *
- * partial block head or tail block
- *
- *
- * full block block without gaps.
- *
- *
- * (*) Recommendation for Block Cipher Modes of Operation
- * Methods and Techniques
- * NIST Special Publication 800-38A Edition 2001
- */
-
-/*
- * atom->offset_at()
- */
-static off_t
-offset_at_head(struct avec_config *conf)
-{
- return conf->aligned_offset;
-}
-
-static off_t
-offset_at_hole_head(call_frame_t *frame, struct object_cipher_info *object)
-{
- return offset_at_head(get_hole_conf(frame));
-}
-
-static off_t
-offset_at_data_head(call_frame_t *frame, struct object_cipher_info *object)
-{
- return offset_at_head(get_data_conf(frame));
-}
-
-static off_t
-offset_at_tail(struct avec_config *conf, struct object_cipher_info *object)
-{
- return conf->aligned_offset +
- (conf->off_in_head ? get_atom_size(object) : 0) +
- (conf->nr_full_blocks << get_atom_bits(object));
-}
-
-static off_t
-offset_at_hole_tail(call_frame_t *frame, struct object_cipher_info *object)
-{
- return offset_at_tail(get_hole_conf(frame), object);
-}
-
-static off_t
-offset_at_data_tail(call_frame_t *frame, struct object_cipher_info *object)
-{
- return offset_at_tail(get_data_conf(frame), object);
-}
-
-static off_t
-offset_at_full(struct avec_config *conf, struct object_cipher_info *object)
-{
- return conf->aligned_offset +
- (conf->off_in_head ? get_atom_size(object) : 0);
-}
-
-static off_t
-offset_at_data_full(call_frame_t *frame, struct object_cipher_info *object)
-{
- return offset_at_full(get_data_conf(frame), object);
-}
-
-static off_t
-offset_at_hole_full(call_frame_t *frame, struct object_cipher_info *object)
-{
- return offset_at_full(get_hole_conf(frame), object);
-}
-
-/*
- * atom->io_size_nopad()
- */
-
-static uint32_t
-io_size_nopad_head(struct avec_config *conf, struct object_cipher_info *object)
-{
- uint32_t gap_at_beg;
- uint32_t gap_at_end;
-
- check_head_block(conf);
-
- gap_at_beg = conf->off_in_head;
-
- if (has_tail_block(conf) || has_full_blocks(conf) || conf->off_in_tail == 0)
- gap_at_end = 0;
- else
- gap_at_end = get_atom_size(object) - conf->off_in_tail;
-
- return get_atom_size(object) - (gap_at_beg + gap_at_end);
-}
-
-static uint32_t
-io_size_nopad_tail(struct avec_config *conf, struct object_cipher_info *object)
-{
- check_tail_block(conf);
- return conf->off_in_tail;
-}
-
-static uint32_t
-io_size_nopad_full(struct avec_config *conf, struct object_cipher_info *object)
-{
- check_full_block(conf);
- return get_atom_size(object);
-}
-
-static uint32_t
-io_size_nopad_data_head(call_frame_t *frame, struct object_cipher_info *object)
-{
- return io_size_nopad_head(get_data_conf(frame), object);
-}
-
-static uint32_t
-io_size_nopad_hole_head(call_frame_t *frame, struct object_cipher_info *object)
-{
- return io_size_nopad_head(get_hole_conf(frame), object);
-}
-
-static uint32_t
-io_size_nopad_data_tail(call_frame_t *frame, struct object_cipher_info *object)
-{
- return io_size_nopad_tail(get_data_conf(frame), object);
-}
-
-static uint32_t
-io_size_nopad_hole_tail(call_frame_t *frame, struct object_cipher_info *object)
-{
- return io_size_nopad_tail(get_hole_conf(frame), object);
-}
-
-static uint32_t
-io_size_nopad_data_full(call_frame_t *frame, struct object_cipher_info *object)
-{
- return io_size_nopad_full(get_data_conf(frame), object);
-}
-
-static uint32_t
-io_size_nopad_hole_full(call_frame_t *frame, struct object_cipher_info *object)
-{
- return io_size_nopad_full(get_hole_conf(frame), object);
-}
-
-static uint32_t
-offset_in_head(struct avec_config *conf)
-{
- check_cursor_head(conf);
-
- return conf->off_in_head;
-}
-
-static uint32_t
-offset_in_tail(call_frame_t *frame, struct object_cipher_info *object)
-{
- return 0;
-}
-
-static uint32_t
-offset_in_full(struct avec_config *conf, struct object_cipher_info *object)
-{
- check_cursor_full(conf);
-
- if (has_head_block(conf))
- return (conf->cursor - 1) << get_atom_bits(object);
- else
- return conf->cursor << get_atom_bits(object);
-}
-
-static uint32_t
-offset_in_data_head(call_frame_t *frame, struct object_cipher_info *object)
-{
- return offset_in_head(get_data_conf(frame));
-}
-
-static uint32_t
-offset_in_hole_head(call_frame_t *frame, struct object_cipher_info *object)
-{
- return offset_in_head(get_hole_conf(frame));
-}
-
-static uint32_t
-offset_in_data_full(call_frame_t *frame, struct object_cipher_info *object)
-{
- return offset_in_full(get_data_conf(frame), object);
-}
-
-static uint32_t
-offset_in_hole_full(call_frame_t *frame, struct object_cipher_info *object)
-{
- return offset_in_full(get_hole_conf(frame), object);
-}
-
-/*
- * atom->rmw()
- */
-/*
- * Pre-conditions:
- * @vec contains plain text of the latest
- * version.
- *
- * Uptodate gaps of the @partial block with
- * this plain text, encrypt the whole block
- * and write the result to disk.
- */
-static int32_t
-rmw_partial_block(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iovec *vec,
- int32_t count, struct iatt *stbuf, struct iobref *iobref,
- struct rmw_atom *atom)
-{
- size_t was_read = 0;
- uint64_t file_size;
- crypt_local_t *local = frame->local;
- struct object_cipher_info *object = &local->info->cinfo;
-
- struct iovec *partial = atom->get_iovec(frame, 0);
- struct avec_config *conf = atom->get_config(frame);
- end_writeback_handler_t end_writeback_partial_block;
-#if DEBUG_CRYPT
- gf_boolean_t check_last_cblock = _gf_false;
-#endif
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (op_ret < 0)
- goto exit;
-
- file_size = local->cur_file_size;
- was_read = op_ret;
-
- if (atom->locality == HEAD_ATOM && conf->off_in_head) {
- /*
- * head atom with a non-uptodate gap
- * at the beginning
- *
- * fill the gap with plain text of the
- * latest version. Convert a part of hole
- * (if any) to zeros.
- */
- int32_t i;
- int32_t copied = 0;
- int32_t to_gap; /* amount of data needed to uptodate
- the gap at the beginning */
-#if 0
- int32_t hole = 0; /* The part of the hole which
- * got in the head block */
-#endif /* 0 */
- to_gap = conf->off_in_head;
-
- if (was_read < to_gap) {
- if (file_size > offset_at_head(conf) + was_read) {
- /*
- * It is impossible to uptodate
- * head block: too few bytes have
- * been read from disk, so that
- * partial write is impossible.
- *
- * It could happen because of many
- * reasons: IO errors, (meta)data
- * corruption in the local file system,
- * etc.
- */
- gf_log(this->name, GF_LOG_WARNING,
- "Can not uptodate a gap at the beginning");
- local->op_ret = -1;
- local->op_errno = EIO;
- goto exit;
- }
-#if 0
- hole = to_gap - was_read;
-#endif /* 0 */
- to_gap = was_read;
- }
- /*
- * uptodate the gap at the beginning
- */
- for (i = 0; i < count && copied < to_gap; i++) {
- int32_t to_copy;
-
- to_copy = vec[i].iov_len;
- if (to_copy > to_gap - copied)
- to_copy = to_gap - copied;
-
- memcpy(partial->iov_base, vec[i].iov_base, to_copy);
- copied += to_copy;
- }
-#if 0
- /*
- * If possible, convert part of the
- * hole, which got in the head block
- */
- ret = TRY_LOCK(&local->hole_lock);
- if (!ret) {
- if (local->hole_handled)
- /*
- * already converted by
- * crypt_writev_cbk()
- */
- UNLOCK(&local->hole_lock);
- else {
- /*
- * convert the part of the hole
- * which got in the head block
- * to zeros.
- *
- * Update the orig_offset to make
- * sure writev_cbk() won't care
- * about this part of the hole.
- *
- */
- memset(partial->iov_base + to_gap, 0, hole);
-
- conf->orig_offset -= hole;
- conf->orig_size += hole;
- UNLOCK(&local->hole_lock);
- }
- }
- else /*
- * conversion is being performed
- * by crypt_writev_cbk()
- */
- ;
-#endif /* 0 */
- }
- if (atom->locality == TAIL_ATOM ||
- (!has_tail_block(conf) && conf->off_in_tail)) {
- /*
- * tail atom, or head atom with a non-uptodate
- * gap at the end.
- *
- * fill the gap at the end of the block
- * with plain text of the latest version.
- * Pad the result, (if needed)
- */
- int32_t i;
- int32_t to_gap;
- int copied;
- off_t off_in_tail;
- int32_t to_copy;
-
- off_in_tail = conf->off_in_tail;
- to_gap = conf->gap_in_tail;
-
- if (to_gap && was_read < off_in_tail + to_gap) {
- /*
- * It is impossible to uptodate
- * the gap at the end: too few bytes
- * have been read from disk, so that
- * partial write is impossible.
- *
- * It could happen because of many
- * reasons: IO errors, (meta)data
- * corruption in the local file system,
- * etc.
- */
- gf_log(this->name, GF_LOG_WARNING,
- "Can not uptodate a gap at the end");
- local->op_ret = -1;
- local->op_errno = EIO;
- goto exit;
- }
- /*
- * uptodate the gap at the end
- */
- copied = 0;
- to_copy = to_gap;
- for (i = count - 1; i >= 0 && to_copy > 0; i--) {
- uint32_t from_vec, off_in_vec;
-
- off_in_vec = 0;
- from_vec = vec[i].iov_len;
- if (from_vec > to_copy) {
- off_in_vec = from_vec - to_copy;
- from_vec = to_copy;
- }
- memcpy(partial->iov_base + off_in_tail + to_gap - copied - from_vec,
- vec[i].iov_base + off_in_vec, from_vec);
-
- gf_log(
- this->name, GF_LOG_DEBUG,
- "uptodate %d bytes at tail. Offset at target(source): %d(%d)",
- (int)from_vec, (int)off_in_tail + to_gap - copied - from_vec,
- (int)off_in_vec);
-
- copied += from_vec;
- to_copy -= from_vec;
- }
- partial->iov_len = off_in_tail + to_gap;
-
- if (object_alg_should_pad(object)) {
- int32_t resid = 0;
- resid = partial->iov_len & (object_alg_blksize(object) - 1);
- if (resid) {
- /*
- * append a new EOF padding
- */
- local->eof_padding_size = object_alg_blksize(object) - resid;
-
- gf_log(this->name, GF_LOG_DEBUG, "set padding size %d",
- local->eof_padding_size);
-
- memset(partial->iov_base + partial->iov_len, 1,
- local->eof_padding_size);
- partial->iov_len += local->eof_padding_size;
-#if DEBUG_CRYPT
- gf_log(this->name, GF_LOG_DEBUG,
- "pad cblock with %d zeros:", local->eof_padding_size);
- dump_cblock(this, (unsigned char *)partial->iov_base +
- partial->iov_len -
- object_alg_blksize(object));
- check_last_cblock = _gf_true;
-#endif
- }
- }
- }
- /*
- * encrypt the whole block
- */
- encrypt_aligned_iov(object, partial, 1, atom->offset_at(frame, object));
-#if DEBUG_CRYPT
- if (check_last_cblock == _gf_true) {
- gf_log(this->name, GF_LOG_DEBUG, "encrypt last cblock with offset %llu",
- (unsigned long long)atom->offset_at(frame, object));
- dump_cblock(this, (unsigned char *)partial->iov_base +
- partial->iov_len - object_alg_blksize(object));
- }
-#endif
- set_local_io_params_writev(frame, object, atom,
- atom->offset_at(frame, object),
- iov_length(partial, 1));
- /*
- * write the whole block to disk
- */
- end_writeback_partial_block = dispatch_end_writeback(local->fop);
- conf->cursor++;
- STACK_WIND(frame, end_writeback_partial_block, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev, local->fd, partial, 1,
- atom->offset_at(frame, object), local->flags, local->iobref_data,
- local->xdata);
-
- gf_log("crypt", GF_LOG_DEBUG,
- "submit partial block: %d bytes from %d offset",
- (int)iov_length(partial, 1), (int)atom->offset_at(frame, object));
-exit:
- return 0;
-}
-
-/*
- * Perform a (read-)modify-write sequence.
- * This should be performed only after approval
- * of upper server-side manager, i.e. the caller
- * needs to make sure this is his turn to rmw.
- */
-void
-submit_partial(call_frame_t *frame, xlator_t *this, fd_t *fd,
- atom_locality_type ltype)
-{
- int32_t ret;
- dict_t *dict;
- struct rmw_atom *atom;
- crypt_local_t *local = frame->local;
- struct object_cipher_info *object = &local->info->cinfo;
-
- atom = atom_by_types(local->active_setup, ltype);
- /*
- * To perform the "read" component of the read-modify-write
- * sequence the crypt translator does stack_wind to itself.
- *
- * Pass current file size to crypt_readv()
- */
- dict = dict_new();
- if (!dict) {
- /*
- * FIXME: Handle the error
- */
- gf_log("crypt", GF_LOG_WARNING, "Can not alloc dict");
- return;
- }
- ret = dict_set(dict, FSIZE_XATTR_PREFIX,
- data_from_uint64(local->cur_file_size));
- if (ret) {
- /*
- * FIXME: Handle the error
- */
- dict_unref(dict);
- gf_log("crypt", GF_LOG_WARNING, "Can not set dict");
- goto exit;
- }
- STACK_WIND(frame, atom->rmw, this, this->fops->readv, /* crypt_readv */
- fd, atom->count_to_uptodate(frame, object), /* count */
- atom->offset_at(frame, object), /* offset to read from */
- 0, dict);
-exit:
- dict_unref(dict);
-}
-
-/*
- * submit blocks of FULL_ATOM type
- */
-void
-submit_full(call_frame_t *frame, xlator_t *this)
-{
- crypt_local_t *local = frame->local;
- struct object_cipher_info *object = &local->info->cinfo;
- struct rmw_atom *atom = atom_by_types(local->active_setup, FULL_ATOM);
- uint32_t count; /* total number of full blocks to submit */
- uint32_t granularity; /* number of blocks to submit in one iteration */
-
- uint64_t off_in_file; /* start offset in the file, bytes */
- uint32_t off_in_atom; /* start offset in the atom, blocks */
- uint32_t blocks_written = 0; /* blocks written for this submit */
-
- struct avec_config *conf = atom->get_config(frame);
- end_writeback_handler_t end_writeback_full_block;
- /*
- * Write full blocks by groups of granularity size.
- */
- end_writeback_full_block = dispatch_end_writeback(local->fop);
-
- if (is_ordered_mode(frame)) {
- uint32_t skip = has_head_block(conf) ? 1 : 0;
- count = 1;
- granularity = 1;
- /*
- * calculate start offset using cursor value;
- * here we should take into account head block,
- * which corresponds to cursor value 0.
- */
- off_in_file = atom->offset_at(frame, object) +
- ((conf->cursor - skip) << get_atom_bits(object));
- off_in_atom = conf->cursor - skip;
- } else {
- /*
- * in parallel mode
- */
- count = conf->nr_full_blocks;
- granularity = MAX_IOVEC;
- off_in_file = atom->offset_at(frame, object);
- off_in_atom = 0;
- }
- while (count) {
- uint32_t blocks_to_write = count;
-
- if (blocks_to_write > granularity)
- blocks_to_write = granularity;
- if (conf->type == HOLE_ATOM)
- /*
- * reset iovec before encryption
- */
- memset(atom->get_iovec(frame, 0)->iov_base, 0,
- get_atom_size(object));
- /*
- * encrypt the group
- */
- encrypt_aligned_iov(
- object, atom->get_iovec(frame, off_in_atom + blocks_written),
- blocks_to_write,
- off_in_file + (blocks_written << get_atom_bits(object)));
-
- set_local_io_params_writev(
- frame, object, atom,
- off_in_file + (blocks_written << get_atom_bits(object)),
- blocks_to_write << get_atom_bits(object));
-
- conf->cursor += blocks_to_write;
-
- STACK_WIND(frame, end_writeback_full_block, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev, local->fd,
- atom->get_iovec(frame, off_in_atom + blocks_written),
- blocks_to_write,
- off_in_file + (blocks_written << get_atom_bits(object)),
- local->flags,
- local->iobref_data ? local->iobref_data : local->iobref,
- local->xdata);
-
- gf_log("crypt", GF_LOG_DEBUG, "submit %d full blocks from %d offset",
- blocks_to_write,
- (int)(off_in_file + (blocks_written << get_atom_bits(object))));
-
- count -= blocks_to_write;
- blocks_written += blocks_to_write;
- }
- return;
-}
-
-static int32_t
-rmw_data_head(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iovec *vec, int32_t count,
- struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
-{
- return rmw_partial_block(frame, cookie, this, op_ret, op_errno, vec, count,
- stbuf, iobref,
- atom_by_types(DATA_ATOM, HEAD_ATOM));
-}
-
-static int32_t
-rmw_data_tail(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iovec *vec, int32_t count,
- struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
-{
- return rmw_partial_block(frame, cookie, this, op_ret, op_errno, vec, count,
- stbuf, iobref,
- atom_by_types(DATA_ATOM, TAIL_ATOM));
-}
-
-static int32_t
-rmw_hole_head(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iovec *vec, int32_t count,
- struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
-{
- return rmw_partial_block(frame, cookie, this, op_ret, op_errno, vec, count,
- stbuf, iobref,
- atom_by_types(HOLE_ATOM, HEAD_ATOM));
-}
-
-static int32_t
-rmw_hole_tail(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iovec *vec, int32_t count,
- struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
-{
- return rmw_partial_block(frame, cookie, this, op_ret, op_errno, vec, count,
- stbuf, iobref,
- atom_by_types(HOLE_ATOM, TAIL_ATOM));
-}
-
-/*
- * atom->count_to_uptodate()
- */
-static uint32_t
-count_to_uptodate_head(struct avec_config *conf,
- struct object_cipher_info *object)
-{
- if (conf->acount == 1 && conf->off_in_tail)
- return get_atom_size(object);
- else
- /* there is no need to read the whole head block */
- return conf->off_in_head;
-}
-
-static uint32_t
-count_to_uptodate_tail(struct avec_config *conf,
- struct object_cipher_info *object)
-{
- /* we need to read the whole tail block */
- return get_atom_size(object);
-}
-
-static uint32_t
-count_to_uptodate_data_head(call_frame_t *frame,
- struct object_cipher_info *object)
-{
- return count_to_uptodate_head(get_data_conf(frame), object);
-}
-
-static uint32_t
-count_to_uptodate_data_tail(call_frame_t *frame,
- struct object_cipher_info *object)
-{
- return count_to_uptodate_tail(get_data_conf(frame), object);
-}
-
-static uint32_t
-count_to_uptodate_hole_head(call_frame_t *frame,
- struct object_cipher_info *object)
-{
- return count_to_uptodate_head(get_hole_conf(frame), object);
-}
-
-static uint32_t
-count_to_uptodate_hole_tail(call_frame_t *frame,
- struct object_cipher_info *object)
-{
- return count_to_uptodate_tail(get_hole_conf(frame), object);
-}
-
-/* atom->get_config() */
-
-static struct avec_config *
-get_config_data(call_frame_t *frame)
-{
- return &((crypt_local_t *)frame->local)->data_conf;
-}
-
-static struct avec_config *
-get_config_hole(call_frame_t *frame)
-{
- return &((crypt_local_t *)frame->local)->hole_conf;
-}
-
-/*
- * atom->get_iovec()
- */
-static struct iovec *
-get_iovec_hole_head(call_frame_t *frame, uint32_t count)
-{
- struct avec_config *conf = get_hole_conf(frame);
-
- return conf->avec;
-}
-
-static struct iovec *
-get_iovec_hole_full(call_frame_t *frame, uint32_t count)
-{
- struct avec_config *conf = get_hole_conf(frame);
-
- return conf->avec + (conf->off_in_head ? 1 : 0);
-}
-
-static struct iovec *
-get_iovec_hole_tail(call_frame_t *frame, uint32_t count)
-{
- struct avec_config *conf = get_hole_conf(frame);
-
- return conf->avec + (conf->blocks_in_pool - 1);
-}
-
-static struct iovec *
-get_iovec_data_head(call_frame_t *frame, uint32_t count)
-{
- struct avec_config *conf = get_data_conf(frame);
-
- return conf->avec;
-}
-
-static struct iovec *
-get_iovec_data_full(call_frame_t *frame, uint32_t count)
-{
- struct avec_config *conf = get_data_conf(frame);
-
- return conf->avec + (conf->off_in_head ? 1 : 0) + count;
-}
-
-static struct iovec *
-get_iovec_data_tail(call_frame_t *frame, uint32_t count)
-{
- struct avec_config *conf = get_data_conf(frame);
-
- return conf->avec + (conf->off_in_head ? 1 : 0) + conf->nr_full_blocks;
-}
-
-static struct rmw_atom atoms[LAST_DATA_TYPE][LAST_LOCALITY_TYPE] = {
- [DATA_ATOM][HEAD_ATOM] = {.locality = HEAD_ATOM,
- .rmw = rmw_data_head,
- .offset_at = offset_at_data_head,
- .offset_in = offset_in_data_head,
- .get_iovec = get_iovec_data_head,
- .io_size_nopad = io_size_nopad_data_head,
- .count_to_uptodate = count_to_uptodate_data_head,
- .get_config = get_config_data},
- [DATA_ATOM][TAIL_ATOM] = {.locality = TAIL_ATOM,
- .rmw = rmw_data_tail,
- .offset_at = offset_at_data_tail,
- .offset_in = offset_in_tail,
- .get_iovec = get_iovec_data_tail,
- .io_size_nopad = io_size_nopad_data_tail,
- .count_to_uptodate = count_to_uptodate_data_tail,
- .get_config = get_config_data},
- [DATA_ATOM][FULL_ATOM] = {.locality = FULL_ATOM,
- .offset_at = offset_at_data_full,
- .offset_in = offset_in_data_full,
- .get_iovec = get_iovec_data_full,
- .io_size_nopad = io_size_nopad_data_full,
- .get_config = get_config_data},
- [HOLE_ATOM][HEAD_ATOM] = {.locality = HEAD_ATOM,
- .rmw = rmw_hole_head,
- .offset_at = offset_at_hole_head,
- .offset_in = offset_in_hole_head,
- .get_iovec = get_iovec_hole_head,
- .io_size_nopad = io_size_nopad_hole_head,
- .count_to_uptodate = count_to_uptodate_hole_head,
- .get_config = get_config_hole},
- [HOLE_ATOM][TAIL_ATOM] = {.locality = TAIL_ATOM,
- .rmw = rmw_hole_tail,
- .offset_at = offset_at_hole_tail,
- .offset_in = offset_in_tail,
- .get_iovec = get_iovec_hole_tail,
- .io_size_nopad = io_size_nopad_hole_tail,
- .count_to_uptodate = count_to_uptodate_hole_tail,
- .get_config = get_config_hole},
- [HOLE_ATOM][FULL_ATOM] = {.locality = FULL_ATOM,
- .offset_at = offset_at_hole_full,
- .offset_in = offset_in_hole_full,
- .get_iovec = get_iovec_hole_full,
- .io_size_nopad = io_size_nopad_hole_full,
- .get_config = get_config_hole}};
-
-struct rmw_atom *
-atom_by_types(atom_data_type data, atom_locality_type locality)
-{
- return &atoms[data][locality];
-}
-
-/*
- Local variables:
- c-indentation-style: "K&R"
- mode-name: "LC"
- c-basic-offset: 8
- tab-width: 8
- fill-column: 80
- scroll-step: 1
- End:
-*/
diff --git a/xlators/encryption/crypt/src/crypt-common.h b/xlators/encryption/crypt/src/crypt-common.h
deleted file mode 100644
index 123d5c2a631..00000000000
--- a/xlators/encryption/crypt/src/crypt-common.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef __CRYPT_COMMON_H__
-#define __CRYPT_COMMON_H__
-
-#define INVAL_SUBVERSION_NUMBER (0xff)
-#define CRYPT_INVAL_OP (GF_FOP_NULL)
-
-#define CRYPTO_FORMAT_PREFIX "trusted.glusterfs.crypt.att.cfmt"
-#define FSIZE_XATTR_PREFIX "trusted.glusterfs.crypt.att.size"
-#define SUBREQ_PREFIX "trusted.glusterfs.crypt.msg.sreq"
-#define FSIZE_MSG_PREFIX "trusted.glusterfs.crypt.msg.size"
-#define DE_MSG_PREFIX "trusted.glusterfs.crypt.msg.dent"
-#define REQUEST_ID_PREFIX "trusted.glusterfs.crypt.msg.rqid"
-#define MSGFLAGS_PREFIX "trusted.glusterfs.crypt.msg.xfgs"
-
-/* messages for crypt_open() */
-#define MSGFLAGS_REQUEST_MTD_RLOCK 1 /* take read lock and don't unlock */
-#define MSGFLAGS_REQUEST_MTD_WLOCK 2 /* take write lock and don't unlock */
-
-#define AES_BLOCK_BITS (4) /* AES_BLOCK_SIZE == 1 << AES_BLOCK_BITS */
-
-#define noop \
- do { \
- ; \
- } while (0)
-#define cassert(cond) \
- ({ \
- switch (-1) { \
- case (cond): \
- case 0: \
- break; \
- } \
- })
-#define __round_mask(x, y) ((__typeof__(x))((y)-1))
-#define round_up(x, y) ((((x)-1) | __round_mask(x, y)) + 1)
-
-/*
- * Format of file's metadata
- */
-struct crypt_format {
- uint8_t loader_id; /* version of metadata loader */
- uint8_t versioned[0]; /* file's metadata of specific version */
-} __attribute__((packed));
-
-typedef enum { AES_CIPHER_ALG, LAST_CIPHER_ALG } cipher_alg_t;
-
-typedef enum { XTS_CIPHER_MODE, LAST_CIPHER_MODE } cipher_mode_t;
-
-typedef enum { MTD_LOADER_V1, LAST_MTD_LOADER } mtd_loader_id;
-
-static inline void
-msgflags_set_mtd_rlock(uint32_t *flags)
-{
- *flags |= MSGFLAGS_REQUEST_MTD_RLOCK;
-}
-
-static inline void
-msgflags_set_mtd_wlock(uint32_t *flags)
-{
- *flags |= MSGFLAGS_REQUEST_MTD_WLOCK;
-}
-
-static inline gf_boolean_t
-msgflags_check_mtd_rlock(uint32_t *flags)
-{
- return *flags & MSGFLAGS_REQUEST_MTD_RLOCK;
-}
-
-static inline gf_boolean_t
-msgflags_check_mtd_wlock(uint32_t *flags)
-{
- return *flags & MSGFLAGS_REQUEST_MTD_WLOCK;
-}
-
-static inline gf_boolean_t
-msgflags_check_mtd_lock(uint32_t *flags)
-{
- return msgflags_check_mtd_rlock(flags) || msgflags_check_mtd_wlock(flags);
-}
-
-/*
- * returns number of logical blocks occupied
- * (maybe partially) by @count bytes
- * at offset @start.
- */
-static inline off_t
-logical_blocks_occupied(uint64_t start, off_t count, int blkbits)
-{
- return ((start + count - 1) >> blkbits) - (start >> blkbits) + 1;
-}
-
-/*
- * are two bytes (represented by offsets @off1
- * and @off2 respectively) in the same logical
- * block.
- */
-static inline int
-in_same_lblock(uint64_t off1, uint64_t off2, int blkbits)
-{
- return off1 >> blkbits == off2 >> blkbits;
-}
-
-static inline void
-dump_cblock(xlator_t *this, unsigned char *buf)
-{
- gf_log(this->name, GF_LOG_DEBUG,
- "dump cblock: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x",
- (buf)[0], (buf)[1], (buf)[2], (buf)[3], (buf)[4], (buf)[5], (buf)[6],
- (buf)[7], (buf)[8], (buf)[9], (buf)[10], (buf)[11], (buf)[12],
- (buf)[13], (buf)[14], (buf)[15]);
-}
-
-#endif /* __CRYPT_COMMON_H__ */
-
-/*
- Local variables:
- c-indentation-style: "K&R"
- mode-name: "LC"
- c-basic-offset: 8
- tab-width: 8
- fill-column: 80
- scroll-step: 1
- End:
-*/
diff --git a/xlators/encryption/crypt/src/crypt-mem-types.h b/xlators/encryption/crypt/src/crypt-mem-types.h
deleted file mode 100644
index 7e9fb90ed43..00000000000
--- a/xlators/encryption/crypt/src/crypt-mem-types.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef __CRYPT_MEM_TYPES_H__
-#define __CRYPT_MEM_TYPES_H__
-
-#include "mem-types.h"
-
-enum gf_crypt_mem_types_ {
- gf_crypt_mt_priv = gf_common_mt_end + 1,
- gf_crypt_mt_inode,
- gf_crypt_mt_data,
- gf_crypt_mt_mtd,
- gf_crypt_mt_loc,
- gf_crypt_mt_iatt,
- gf_crypt_mt_key,
- gf_crypt_mt_iovec,
- gf_crypt_mt_char,
- gf_crypt_mt_local,
- gf_crypt_mt_end,
-};
-
-#endif /* __CRYPT_MEM_TYPES_H__ */
-
-/*
- Local variables:
- c-indentation-style: "K&R"
- mode-name: "LC"
- c-basic-offset: 8
- tab-width: 8
- fill-column: 80
- scroll-step: 1
- End:
-*/
diff --git a/xlators/encryption/crypt/src/crypt.c b/xlators/encryption/crypt/src/crypt.c
deleted file mode 100644
index 02c4028c087..00000000000
--- a/xlators/encryption/crypt/src/crypt.c
+++ /dev/null
@@ -1,3906 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-#include <ctype.h>
-#include <sys/uio.h>
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-#include "defaults.h"
-
-#include "crypt-common.h"
-#include "crypt.h"
-
-static void
-init_inode_info_head(struct crypt_inode_info *info, fd_t *fd);
-static int32_t
-init_inode_info_tail(struct crypt_inode_info *info,
- struct master_cipher_info *master);
-static int32_t
-prepare_for_submit_hole(call_frame_t *frame, xlator_t *this, uint64_t from,
- off_t size);
-static int32_t
-load_file_size(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata);
-static void
-do_ordered_submit(call_frame_t *frame, xlator_t *this, atom_data_type dtype);
-static void
-do_parallel_submit(call_frame_t *frame, xlator_t *this, atom_data_type dtype);
-static void
-put_one_call_open(call_frame_t *frame);
-static void
-put_one_call_readv(call_frame_t *frame, xlator_t *this);
-static void
-put_one_call_writev(call_frame_t *frame, xlator_t *this);
-static void
-put_one_call_ftruncate(call_frame_t *frame, xlator_t *this);
-static void
-free_avec(struct iovec *avec, char **pool, int blocks_in_pool);
-static void
-free_avec_data(crypt_local_t *local);
-static void
-free_avec_hole(crypt_local_t *local);
-
-static crypt_local_t *
-crypt_alloc_local(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop)
-{
- crypt_local_t *local = NULL;
-
- local = GF_CALLOC(1, sizeof(crypt_local_t), gf_crypt_mt_local);
- if (!local) {
- gf_log(this->name, GF_LOG_ERROR, "out of memory");
- return NULL;
- }
- local->fop = fop;
- LOCK_INIT(&local->hole_lock);
- LOCK_INIT(&local->call_lock);
- LOCK_INIT(&local->rw_count_lock);
-
- frame->local = local;
- return local;
-}
-
-struct crypt_inode_info *
-get_crypt_inode_info(inode_t *inode, xlator_t *this)
-{
- int ret;
- uint64_t value = 0;
- struct crypt_inode_info *info;
-
- ret = inode_ctx_get(inode, this, &value);
- if (ret == -1) {
- gf_log(this->name, GF_LOG_WARNING, "Can not get inode info");
- return NULL;
- }
- info = (struct crypt_inode_info *)(long)value;
- if (info == NULL) {
- gf_log(this->name, GF_LOG_WARNING, "Can not obtain inode info");
- return NULL;
- }
- return info;
-}
-
-static struct crypt_inode_info *
-local_get_inode_info(crypt_local_t *local, xlator_t *this)
-{
- if (local->info)
- return local->info;
- local->info = get_crypt_inode_info(local->fd->inode, this);
- return local->info;
-}
-
-static struct crypt_inode_info *
-alloc_inode_info(crypt_local_t *local, loc_t *loc)
-{
- struct crypt_inode_info *info;
-
- info = GF_CALLOC(1, sizeof(struct crypt_inode_info), gf_crypt_mt_inode);
- if (!info) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- gf_log("crypt", GF_LOG_WARNING, "Can not allocate inode info");
- return NULL;
- }
-#if DEBUG_CRYPT
- info->loc = GF_CALLOC(1, sizeof(loc_t), gf_crypt_mt_loc);
- if (!info->loc) {
- gf_log("crypt", GF_LOG_WARNING, "Can not allocate loc");
- GF_FREE(info);
- return NULL;
- }
- if (loc_copy(info->loc, loc)) {
- GF_FREE(info->loc);
- GF_FREE(info);
- return NULL;
- }
-#endif /* DEBUG_CRYPT */
-
- local->info = info;
- return info;
-}
-
-static void
-free_inode_info(struct crypt_inode_info *info)
-{
-#if DEBUG_CRYPT
- loc_wipe(info->loc);
- GF_FREE(info->loc);
-#endif
- memset(info, 0, sizeof(*info));
- GF_FREE(info);
-}
-
-int
-crypt_forget(xlator_t *this, inode_t *inode)
-{
- uint64_t ctx_addr = 0;
- if (!inode_ctx_del(inode, this, &ctx_addr))
- free_inode_info((struct crypt_inode_info *)(long)ctx_addr);
- return 0;
-}
-
-#if DEBUG_CRYPT
-static void
-check_read(call_frame_t *frame, xlator_t *this, int32_t read, struct iovec *vec,
- int32_t count, struct iatt *stbuf)
-{
- crypt_local_t *local = frame->local;
- struct object_cipher_info *object = get_object_cinfo(local->info);
- struct avec_config *conf = &local->data_conf;
- uint32_t resid = stbuf->ia_size & (object_alg_blksize(object) - 1);
-
- if (read <= 0)
- return;
- if (read != iov_length(vec, count))
- gf_log("crypt", GF_LOG_DEBUG,
- "op_ret differs from amount of read bytes");
-
- if (object_alg_should_pad(object) &&
- (read & (object_alg_blksize(object) - 1)))
- gf_log("crypt", GF_LOG_DEBUG,
- "bad amount of read bytes (!= 0 mod(cblock size))");
-
- if (conf->aligned_offset + read >
- stbuf->ia_size + (resid ? object_alg_blksize(object) - resid : 0))
- gf_log("crypt", GF_LOG_DEBUG, "bad amount of read bytes (too large))");
-}
-
-#define PT_BYTES_TO_DUMP (32)
-static void
-dump_plain_text(crypt_local_t *local, struct iovec *avec)
-{
- int32_t to_dump;
- char str[PT_BYTES_TO_DUMP + 1];
-
- if (!avec)
- return;
- to_dump = avec->iov_len;
- if (to_dump > PT_BYTES_TO_DUMP)
- to_dump = PT_BYTES_TO_DUMP;
- memcpy(str, avec->iov_base, to_dump);
- memset(str + to_dump, '0', 1);
- gf_log("crypt", GF_LOG_DEBUG, "Read file: %s", str);
-}
-
-static int32_t
-data_conf_invariant(struct avec_config *conf)
-{
- return conf->acount == !!has_head_block(conf) + !!has_tail_block(conf) +
- conf->nr_full_blocks;
-}
-
-static int32_t
-hole_conf_invariant(struct avec_config *conf)
-{
- return conf->blocks_in_pool == !!has_head_block(conf) +
- !!has_tail_block(conf) +
- !!has_full_blocks(conf);
-}
-
-static void
-crypt_check_conf(struct avec_config *conf)
-{
- int32_t ret = 0;
- const char *msg;
-
- switch (conf->type) {
- case DATA_ATOM:
- msg = "data";
- ret = data_conf_invariant(conf);
- break;
- case HOLE_ATOM:
- msg = "hole";
- ret = hole_conf_invariant(conf);
- break;
- default:
- msg = "unknown";
- }
- if (!ret)
- gf_log("crypt", GF_LOG_DEBUG, "bad %s conf", msg);
-}
-
-static void
-check_buf(call_frame_t *frame, xlator_t *this, struct iatt *buf)
-{
- crypt_local_t *local = frame->local;
- struct object_cipher_info *object = &local->info->cinfo;
- uint64_t local_file_size;
-
- switch (local->fop) {
- case GF_FOP_FTRUNCATE:
- return;
- case GF_FOP_WRITE:
- local_file_size = local->new_file_size;
- break;
- case GF_FOP_READ:
- if (parent_is_crypt_xlator(frame, this))
- return;
- local_file_size = local->cur_file_size;
- break;
- default:
- gf_log("crypt", GF_LOG_DEBUG, "bad file operation");
- return;
- }
- if (buf->ia_size != round_up(local_file_size, object_alg_blksize(object)))
- gf_log("crypt", GF_LOG_DEBUG,
- "bad ia_size in buf (%llu), should be %llu",
- (unsigned long long)buf->ia_size,
- (unsigned long long)round_up(local_file_size,
- object_alg_blksize(object)));
-}
-
-#else
-#define check_read(frame, this, op_ret, vec, count, stbuf) noop
-#define dump_plain_text(local, avec) noop
-#define crypt_check_conf(conf) noop
-#define check_buf(frame, this, buf) noop
-#endif /* DEBUG_CRYPT */
-
-/*
- * Pre-conditions:
- * @vec represents a ciphertext of expanded size and
- * aligned offset.
- *
- * Compound a temporal vector @avec with block-aligned
- * components, decrypt and fix it up to represent a chunk
- * of data corresponding to the original size and offset.
- * Pass the result to the next translator.
- */
-int32_t
-crypt_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iovec *vec,
- int32_t count, struct iatt *stbuf, struct iobref *iobref,
- dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- struct avec_config *conf = &local->data_conf;
- struct object_cipher_info *object = &local->info->cinfo;
-
- struct iovec *avec;
- uint32_t i;
- uint32_t to_vec;
- uint32_t to_user;
-
- check_buf(frame, this, stbuf);
- check_read(frame, this, op_ret, vec, count, stbuf);
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- local->iobref = iobref_ref(iobref);
-
- local->buf = *stbuf;
- local->buf.ia_size = local->cur_file_size;
-
- if (op_ret <= 0 || count == 0 || vec[0].iov_len == 0)
- goto put_one_call;
-
- if (conf->orig_offset >= local->cur_file_size) {
- local->op_ret = 0;
- goto put_one_call;
- }
- /*
- * correct config params with real file size
- * and actual amount of bytes read
- */
- set_config_offsets(frame, this, conf->orig_offset, op_ret, DATA_ATOM, 0);
-
- if (conf->orig_offset + conf->orig_size > local->cur_file_size)
- conf->orig_size = local->cur_file_size - conf->orig_offset;
- /*
- * calculate amount of data to be returned
- * to user.
- */
- to_user = op_ret;
- if (conf->aligned_offset + to_user <= conf->orig_offset) {
- gf_log(this->name, GF_LOG_WARNING, "Incomplete read");
- local->op_ret = -1;
- local->op_errno = EIO;
- goto put_one_call;
- }
- to_user -= (conf->aligned_offset - conf->orig_offset);
-
- if (to_user > conf->orig_size)
- to_user = conf->orig_size;
- local->rw_count = to_user;
-
- op_errno = set_config_avec_data(this, local, conf, object, vec, count);
- if (op_errno) {
- local->op_ret = -1;
- local->op_errno = op_errno;
- goto put_one_call;
- }
- avec = conf->avec;
-#if DEBUG_CRYPT
- if (conf->off_in_tail != 0 &&
- conf->off_in_tail < object_alg_blksize(object) &&
- object_alg_should_pad(object))
- gf_log(this->name, GF_LOG_DEBUG, "Bad offset in tail %d",
- conf->off_in_tail);
- if (iov_length(vec, count) != 0 &&
- in_same_lblock(conf->orig_offset + iov_length(vec, count) - 1,
- local->cur_file_size - 1, object_alg_blkbits(object))) {
- gf_log(this->name, GF_LOG_DEBUG, "Compound last cblock");
- dump_cblock(this, (unsigned char *)(avec[conf->acount - 1].iov_base) +
- avec[conf->acount - 1].iov_len -
- object_alg_blksize(object));
- dump_cblock(this, (unsigned char *)(vec[count - 1].iov_base) +
- vec[count - 1].iov_len -
- object_alg_blksize(object));
- }
-#endif
- decrypt_aligned_iov(object, avec, conf->acount, conf->aligned_offset);
- /*
- * pass proper plain data to user
- */
- avec[0].iov_base += (conf->aligned_offset - conf->orig_offset);
- avec[0].iov_len -= (conf->aligned_offset - conf->orig_offset);
-
- to_vec = to_user;
- for (i = 0; i < conf->acount; i++) {
- if (avec[i].iov_len > to_vec)
- avec[i].iov_len = to_vec;
- to_vec -= avec[i].iov_len;
- }
-put_one_call:
- put_one_call_readv(frame, this);
- return 0;
-}
-
-static int32_t
-do_readv(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *dict, dict_t *xdata)
-{
- data_t *data;
- crypt_local_t *local = frame->local;
-
- if (op_ret < 0)
- goto error;
- /*
- * extract regular file size
- */
- data = dict_get(dict, FSIZE_XATTR_PREFIX);
- if (!data) {
- gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
- op_errno = EIO;
- goto error;
- }
- local->cur_file_size = data_to_uint64(data);
-
- get_one_call(frame);
- STACK_WIND(frame, crypt_readv_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv, local->fd,
- /*
- * FIXME: read amount can be reduced
- */
- local->data_conf.expanded_size, local->data_conf.aligned_offset,
- local->flags, local->xdata);
- return 0;
-error:
- local->op_ret = -1;
- local->op_errno = op_errno;
-
- get_one_call(frame);
- put_one_call_readv(frame, this);
- return 0;
-}
-
-static int32_t
-crypt_readv_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- if (op_ret < 0)
- goto error;
- /*
- * An access has been granted,
- * retrieve file size
- */
- STACK_WIND(frame, do_readv, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fgetxattr, local->fd,
- FSIZE_XATTR_PREFIX, NULL);
- return 0;
-error:
- fd_unref(local->fd);
- if (local->xdata)
- dict_unref(local->xdata);
- CRYPT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
- return 0;
-}
-
-static int32_t
-readv_trivial_completion(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (op_ret < 0) {
- gf_log(this->name, GF_LOG_WARNING, "stat failed (%d)", op_errno);
- goto error;
- }
- local->buf = *buf;
- STACK_WIND(frame, load_file_size, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr, local->loc,
- FSIZE_XATTR_PREFIX, NULL);
- return 0;
-error:
- CRYPT_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, 0, NULL, NULL,
- NULL);
- return 0;
-}
-
-int32_t
-crypt_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, uint32_t flags, dict_t *xdata)
-{
- int32_t ret;
- crypt_local_t *local;
- struct crypt_inode_info *info;
- struct gf_flock lock = {
- 0,
- };
-
-#if DEBUG_CRYPT
- gf_log("crypt", GF_LOG_DEBUG, "reading %d bytes from offset %llu",
- (int)size, (long long)offset);
- if (parent_is_crypt_xlator(frame, this))
- gf_log("crypt", GF_LOG_DEBUG, "parent is crypt");
-#endif
- local = crypt_alloc_local(frame, this, GF_FOP_READ);
- if (!local) {
- ret = ENOMEM;
- goto error;
- }
- if (size == 0)
- goto trivial;
-
- local->fd = fd_ref(fd);
- local->flags = flags;
-
- info = local_get_inode_info(local, this);
- if (info == NULL) {
- ret = EINVAL;
- fd_unref(fd);
- goto error;
- }
- if (!object_alg_atomic(&info->cinfo)) {
- ret = EINVAL;
- fd_unref(fd);
- goto error;
- }
- set_config_offsets(frame, this, offset, size, DATA_ATOM, 0);
- if (parent_is_crypt_xlator(frame, this)) {
- data_t *data;
- /*
- * We are called by crypt_writev (or cypt_ftruncate)
- * to perform the "read" component of the read-modify-write
- * (or read-prune-write) sequence for some atom;
- *
- * don't ask for access:
- * it has already been acquired
- *
- * Retrieve current file size
- */
- if (!xdata) {
- gf_log("crypt", GF_LOG_WARNING,
- "Regular file size hasn't been passed");
- ret = EIO;
- goto error;
- }
- data = dict_get(xdata, FSIZE_XATTR_PREFIX);
- if (!data) {
- gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
- ret = EIO;
- goto error;
- }
- local->old_file_size = local->cur_file_size = data_to_uint64(data);
-
- get_one_call(frame);
- STACK_WIND(frame, crypt_readv_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv, local->fd,
- /*
- * FIXME: read amount can be reduced
- */
- local->data_conf.expanded_size,
- local->data_conf.aligned_offset, flags, NULL);
- return 0;
- }
- if (xdata)
- local->xdata = dict_ref(xdata);
-
- lock.l_len = 0;
- lock.l_start = 0;
- lock.l_type = F_RDLCK;
- lock.l_whence = SEEK_SET;
-
- STACK_WIND(frame, crypt_readv_finodelk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, fd, F_SETLKW,
- &lock, NULL);
- return 0;
-trivial:
- STACK_WIND(frame, readv_trivial_completion, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat, fd, NULL);
- return 0;
-error:
- CRYPT_STACK_UNWIND(readv, frame, -1, ret, NULL, 0, NULL, NULL, NULL);
- return 0;
-}
-
-void
-set_local_io_params_writev(call_frame_t *frame,
- struct object_cipher_info *object,
- struct rmw_atom *atom, off_t io_offset,
- uint32_t io_size)
-{
- crypt_local_t *local = frame->local;
-
- local->io_offset = io_offset;
- local->io_size = io_size;
-
- local->io_offset_nopad = atom->offset_at(frame, object) +
- atom->offset_in(frame, object);
-
- gf_log("crypt", GF_LOG_DEBUG, "set nopad offset to %llu",
- (unsigned long long)local->io_offset_nopad);
-
- local->io_size_nopad = atom->io_size_nopad(frame, object);
-
- gf_log("crypt", GF_LOG_DEBUG, "set nopad size to %llu",
- (unsigned long long)local->io_size_nopad);
-
- local->update_disk_file_size = 0;
- /*
- * NOTE: eof_padding_size is 0 for all full atoms;
- * For head and tail atoms it will be set up at rmw_partial block()
- */
- local->new_file_size = local->cur_file_size;
-
- if (local->io_offset_nopad + local->io_size_nopad > local->cur_file_size) {
- local->new_file_size = local->io_offset_nopad + local->io_size_nopad;
-
- gf_log("crypt", GF_LOG_DEBUG, "set new file size to %llu",
- (unsigned long long)local->new_file_size);
-
- local->update_disk_file_size = 1;
- }
-}
-
-void
-set_local_io_params_ftruncate(call_frame_t *frame,
- struct object_cipher_info *object)
-{
- uint32_t resid;
- crypt_local_t *local = frame->local;
- struct avec_config *conf = &local->data_conf;
-
- resid = conf->orig_offset & (object_alg_blksize(object) - 1);
- if (resid) {
- local->eof_padding_size = object_alg_blksize(object) - resid;
- local->new_file_size = conf->aligned_offset;
- local->update_disk_file_size = 0;
- /*
- * file size will be updated
- * in the ->writev() stack,
- * when submitting file tail
- */
- } else {
- local->eof_padding_size = 0;
- local->new_file_size = conf->orig_offset;
- local->update_disk_file_size = 1;
- /*
- * file size will be updated
- * in this ->ftruncate stack
- */
- }
-}
-
-static void
-submit_head(call_frame_t *frame, xlator_t *this)
-{
- crypt_local_t *local = frame->local;
- submit_partial(frame, this, local->fd, HEAD_ATOM);
-}
-
-static void
-submit_tail(call_frame_t *frame, xlator_t *this)
-{
- crypt_local_t *local = frame->local;
- submit_partial(frame, this, local->fd, TAIL_ATOM);
-}
-
-static void
-submit_hole(call_frame_t *frame, xlator_t *this)
-{
- /*
- * hole conversion always means
- * appended write and goes in ordered fashion
- */
- do_ordered_submit(frame, this, HOLE_ATOM);
-}
-
-static void
-submit_data(call_frame_t *frame, xlator_t *this)
-{
- if (is_ordered_mode(frame)) {
- do_ordered_submit(frame, this, DATA_ATOM);
- return;
- }
- gf_log("crypt", GF_LOG_WARNING, "Bad submit mode");
- get_nr_calls(frame, nr_calls_data(frame));
- do_parallel_submit(frame, this, DATA_ATOM);
- return;
-}
-
-/*
- * heplers called by writev_cbk, fruncate_cbk in ordered mode
- */
-
-static int32_t
-should_submit_hole(crypt_local_t *local)
-{
- struct avec_config *conf = &local->hole_conf;
-
- return conf->avec != NULL;
-}
-
-static int32_t
-should_resume_submit_hole(crypt_local_t *local)
-{
- struct avec_config *conf = &local->hole_conf;
-
- if (local->fop == GF_FOP_WRITE && has_tail_block(conf))
- /*
- * Don't submit a part of hole, which
- * fits into a data block:
- * this part of hole will be converted
- * as a gap filled by zeros in data head
- * block.
- */
- return conf->cursor < conf->acount - 1;
- else
- return conf->cursor < conf->acount;
-}
-
-static int32_t
-should_resume_submit_data(call_frame_t *frame)
-{
- crypt_local_t *local = frame->local;
- struct avec_config *conf = &local->data_conf;
-
- if (is_ordered_mode(frame))
- return conf->cursor < conf->acount;
- /*
- * parallel writes
- */
- return 0;
-}
-
-static int32_t
-should_submit_data_after_hole(crypt_local_t *local)
-{
- return local->data_conf.avec != NULL;
-}
-
-static void
-update_local_file_params(call_frame_t *frame, xlator_t *this,
- struct iatt *prebuf, struct iatt *postbuf)
-{
- crypt_local_t *local = frame->local;
-
- check_buf(frame, this, postbuf);
-
- local->prebuf = *prebuf;
- local->postbuf = *postbuf;
-
- local->prebuf.ia_size = local->cur_file_size;
- local->postbuf.ia_size = local->new_file_size;
-
- local->cur_file_size = local->new_file_size;
-}
-
-static int32_t
-end_writeback_writev(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (op_ret <= 0) {
- gf_log(this->name, GF_LOG_WARNING, "writev iteration failed");
- goto put_one_call;
- }
- /*
- * op_ret includes paddings (atom's head, atom's tail and EOF)
- */
- if (op_ret < local->io_size) {
- gf_log(this->name, GF_LOG_WARNING, "Incomplete writev iteration");
- goto put_one_call;
- }
- op_ret -= local->eof_padding_size;
- local->op_ret = op_ret;
-
- update_local_file_params(frame, this, prebuf, postbuf);
-
- if (data_write_in_progress(local)) {
- LOCK(&local->rw_count_lock);
- local->rw_count += op_ret;
- UNLOCK(&local->rw_count_lock);
-
- if (should_resume_submit_data(frame))
- submit_data(frame, this);
- } else {
- /*
- * hole conversion is going on;
- * don't take into account written zeros
- */
- if (should_resume_submit_hole(local))
- submit_hole(frame, this);
-
- else if (should_submit_data_after_hole(local))
- submit_data(frame, this);
- }
-put_one_call:
- put_one_call_writev(frame, this);
- return 0;
-}
-
-#define crypt_writev_cbk end_writeback_writev
-
-#define HOLE_WRITE_CHUNK_BITS 12
-#define HOLE_WRITE_CHUNK_SIZE (1 << HOLE_WRITE_CHUNK_BITS)
-
-/*
- * Convert hole of size @size at offset @off to
- * zeros and prepare respective iovecs for submit.
- * The hole lock should be held.
- *
- * Pre-conditions:
- * @local->file_size is set and valid.
- */
-int32_t
-prepare_for_submit_hole(call_frame_t *frame, xlator_t *this, uint64_t off,
- off_t size)
-{
- int32_t ret;
- crypt_local_t *local = frame->local;
- struct object_cipher_info *object = &local->info->cinfo;
-
- set_config_offsets(frame, this, off, size, HOLE_ATOM, 1);
-
- ret = set_config_avec_hole(this, local, &local->hole_conf, object,
- local->fop);
- crypt_check_conf(&local->hole_conf);
-
- return ret;
-}
-
-/*
- * prepare for submit @count bytes at offset @from
- */
-int32_t
-prepare_for_submit_data(call_frame_t *frame, xlator_t *this, off_t from,
- int32_t size, struct iovec *vec, int32_t vec_count,
- int32_t setup_gap)
-{
- uint32_t ret;
- crypt_local_t *local = frame->local;
- struct object_cipher_info *object = &local->info->cinfo;
-
- set_config_offsets(frame, this, from, size, DATA_ATOM, setup_gap);
-
- ret = set_config_avec_data(this, local, &local->data_conf, object, vec,
- vec_count);
- crypt_check_conf(&local->data_conf);
-
- return ret;
-}
-
-static void
-free_avec(struct iovec *avec, char **pool, int blocks_in_pool)
-{
- if (!avec)
- return;
- GF_FREE(pool);
- GF_FREE(avec);
-}
-
-static void
-free_avec_data(crypt_local_t *local)
-{
- return free_avec(local->data_conf.avec, local->data_conf.pool,
- local->data_conf.blocks_in_pool);
-}
-
-static void
-free_avec_hole(crypt_local_t *local)
-{
- return free_avec(local->hole_conf.avec, local->hole_conf.pool,
- local->hole_conf.blocks_in_pool);
-}
-
-static void
-do_parallel_submit(call_frame_t *frame, xlator_t *this, atom_data_type dtype)
-{
- crypt_local_t *local = frame->local;
- struct avec_config *conf;
-
- local->active_setup = dtype;
- conf = conf_by_type(frame, dtype);
-
- if (has_head_block(conf))
- submit_head(frame, this);
-
- if (has_full_blocks(conf))
- submit_full(frame, this);
-
- if (has_tail_block(conf))
- submit_tail(frame, this);
- return;
-}
-
-static void
-do_ordered_submit(call_frame_t *frame, xlator_t *this, atom_data_type dtype)
-{
- crypt_local_t *local = frame->local;
- struct avec_config *conf;
-
- local->active_setup = dtype;
- conf = conf_by_type(frame, dtype);
-
- if (should_submit_head_block(conf)) {
- get_one_call_nolock(frame);
- submit_head(frame, this);
- } else if (should_submit_full_block(conf)) {
- get_one_call_nolock(frame);
- submit_full(frame, this);
- } else if (should_submit_tail_block(conf)) {
- get_one_call_nolock(frame);
- submit_tail(frame, this);
- } else
- gf_log("crypt", GF_LOG_DEBUG,
- "nothing has been submitted in ordered mode");
- return;
-}
-
-static int32_t
-do_writev(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *dict, dict_t *xdata)
-{
- data_t *data;
- crypt_local_t *local = frame->local;
- struct object_cipher_info *object = &local->info->cinfo;
- /*
- * extract regular file size
- */
- data = dict_get(dict, FSIZE_XATTR_PREFIX);
- if (!data) {
- gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
- op_ret = -1;
- op_errno = EIO;
- goto error;
- }
- local->old_file_size = local->cur_file_size = data_to_uint64(data);
-
- set_gap_at_end(frame, object, &local->data_conf, DATA_ATOM);
-
- if (local->cur_file_size < local->data_conf.orig_offset) {
- /*
- * Set up hole config
- */
- op_errno = prepare_for_submit_hole(
- frame, this, local->cur_file_size,
- local->data_conf.orig_offset - local->cur_file_size);
- if (op_errno) {
- local->op_ret = -1;
- local->op_errno = op_errno;
- goto error;
- }
- }
- if (should_submit_hole(local))
- submit_hole(frame, this);
- else
- submit_data(frame, this);
- return 0;
-error:
- get_one_call_nolock(frame);
- put_one_call_writev(frame, this);
- return 0;
-}
-
-static int32_t
-crypt_writev_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (op_ret < 0)
- goto error;
- /*
- * An access has been granted,
- * retrieve file size first
- */
- STACK_WIND(frame, do_writev, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fgetxattr, local->fd,
- FSIZE_XATTR_PREFIX, NULL);
- return 0;
-error:
- get_one_call_nolock(frame);
- put_one_call_writev(frame, this);
- return 0;
-}
-
-static int32_t
-writev_trivial_completion(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *dict)
-{
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- local->prebuf = *buf;
- local->postbuf = *buf;
-
- local->prebuf.ia_size = local->cur_file_size;
- local->postbuf.ia_size = local->cur_file_size;
-
- get_one_call(frame);
- put_one_call_writev(frame, this);
- return 0;
-}
-
-int
-crypt_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vec,
- int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
- dict_t *xdata)
-{
- int32_t ret;
- crypt_local_t *local;
- struct crypt_inode_info *info;
- struct gf_flock lock = {
- 0,
- };
-#if DEBUG_CRYPT
- gf_log("crypt", GF_LOG_DEBUG, "writing %d bytes from offset %llu",
- (int)iov_length(vec, count), (long long)offset);
-#endif
- local = crypt_alloc_local(frame, this, GF_FOP_WRITE);
- if (!local) {
- ret = ENOMEM;
- goto error;
- }
- local->fd = fd_ref(fd);
-
- if (iobref)
- local->iobref = iobref_ref(iobref);
- /*
- * to update real file size on the server
- */
- local->xattr = dict_new();
- if (!local->xattr) {
- ret = ENOMEM;
- goto error;
- }
- local->flags = flags;
-
- info = local_get_inode_info(local, this);
- if (info == NULL) {
- ret = EINVAL;
- goto error;
- }
- if (!object_alg_atomic(&info->cinfo)) {
- ret = EINVAL;
- goto error;
- }
- if (iov_length(vec, count) == 0)
- goto trivial;
-
- ret = prepare_for_submit_data(frame, this, offset,
- iov_length(vec, count),
- vec, count, 0 /* don't setup gup
- in tail: we don't
- know file size yet */);
- if (ret) {
- ret = ENOMEM;
- goto error;
- }
-
- if (parent_is_crypt_xlator(frame, this)) {
- data_t *data;
- /*
- * we are called by shinking crypt_ftruncate(),
- * which doesn't perform hole conversion;
- *
- * don't ask for access:
- * it has already been acquired
- */
-
- /*
- * extract file size
- */
- if (!xdata) {
- gf_log("crypt", GF_LOG_WARNING,
- "Regular file size hasn't been passed");
- ret = EIO;
- goto error;
- }
- data = dict_get(xdata, FSIZE_XATTR_PREFIX);
- if (!data) {
- gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
- ret = EIO;
- goto error;
- }
- local->old_file_size = local->cur_file_size = data_to_uint64(data);
-
- submit_data(frame, this);
- return 0;
- }
- if (xdata)
- local->xdata = dict_ref(xdata);
- /*
- * lock the file and retrieve its size
- */
- lock.l_len = 0;
- lock.l_start = 0;
- lock.l_type = F_WRLCK;
- lock.l_whence = SEEK_SET;
-
- STACK_WIND(frame, crypt_writev_finodelk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, fd, F_SETLKW,
- &lock, NULL);
- return 0;
-trivial:
- STACK_WIND(frame, writev_trivial_completion, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat, fd, NULL);
- return 0;
-error:
- if (local && local->fd)
- fd_unref(fd);
- if (local && local->iobref)
- iobref_unref(iobref);
- if (local && local->xdata)
- dict_unref(xdata);
- if (local && local->xattr)
- dict_unref(local->xattr);
- if (local && local->info)
- free_inode_info(local->info);
-
- CRYPT_STACK_UNWIND(writev, frame, -1, ret, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-prepare_for_prune(call_frame_t *frame, xlator_t *this, uint64_t offset)
-{
- set_config_offsets(frame, this,
- offset,
- 0, /* count */
- DATA_ATOM,
- 0 /* since we prune, there is no
- gap in tail to uptodate */);
- return 0;
-}
-
-/*
- * Finish the read-prune-modify sequence
- *
- * Can be invoked as
- * 1) ->ftruncate_cbk() for cblock-aligned, or trivial prune
- * 2) ->writev_cbk() for non-cblock-aligned prune
- */
-
-static int32_t
-prune_complete(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- update_local_file_params(frame, this, prebuf, postbuf);
-
- put_one_call_ftruncate(frame, this);
- return 0;
-}
-
-/*
- * This is called as ->ftruncate_cbk()
- *
- * Perform the "write" component of the
- * read-prune-write sequence.
- *
- * submuit the rest of the file
- */
-static int32_t
-prune_submit_file_tail(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- struct avec_config *conf = &local->data_conf;
- dict_t *dict;
-
- if (op_ret < 0)
- goto put_one_call;
-
- if (local->xdata) {
- dict_unref(local->xdata);
- local->xdata = NULL;
- }
- if (xdata)
- local->xdata = dict_ref(xdata);
-
- dict = dict_new();
- if (!dict) {
- op_errno = ENOMEM;
- goto error;
- }
-
- update_local_file_params(frame, this, prebuf, postbuf);
- local->new_file_size = conf->orig_offset;
-
- /*
- * The rest of the file is a partial block and, hence,
- * should be written via RMW sequence, so the crypt xlator
- * does STACK_WIND to itself.
- *
- * Pass current file size to crypt_writev()
- */
- op_errno = dict_set(dict, FSIZE_XATTR_PREFIX,
- data_from_uint64(local->cur_file_size));
- if (op_errno) {
- gf_log("crypt", GF_LOG_WARNING, "can not set key to update file size");
- dict_unref(dict);
- goto error;
- }
- gf_log("crypt", GF_LOG_DEBUG,
- "passing current file size (%llu) to crypt_writev",
- (unsigned long long)local->cur_file_size);
- /*
- * Padding will be filled with
- * zeros by rmw_partial_block()
- */
- STACK_WIND(frame, prune_complete, this,
- this->fops->writev, /* crypt_writev */
- local->fd, &local->vec, 1,
- conf->aligned_offset, /* offset to write from */
- 0, local->iobref, dict);
-
- dict_unref(dict);
- return 0;
-error:
- local->op_ret = -1;
- local->op_errno = op_errno;
-put_one_call:
- put_one_call_ftruncate(frame, this);
- return 0;
-}
-
-/*
- * This is called as a callback of ->writev() invoked in behalf
- * of ftruncate(): it can be
- * 1) ordered writes issued by hole conversion in the case of
- * expanded truncate, or
- * 2) an rmw partial data block issued by non-cblock-aligned
- * prune.
- */
-int32_t
-end_writeback_ftruncate(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- /*
- * if nothing has been written,
- * then it must be an error
- */
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (op_ret < 0)
- goto put_one_call;
-
- update_local_file_params(frame, this, prebuf, postbuf);
-
- if (data_write_in_progress(local))
- /* case (2) */
- goto put_one_call;
- /* case (1) */
- if (should_resume_submit_hole(local))
- submit_hole(frame, this);
- /*
- * case of hole, when we shouldn't resume
- */
-put_one_call:
- put_one_call_ftruncate(frame, this);
- return 0;
-}
-
-/*
- * Perform prune and write components of the
- * read-prune-write sequence.
- *
- * Called as ->readv_cbk()
- *
- * Pre-conditions:
- * @vec contains the latest atom of the file
- * (plain text)
- */
-static int32_t
-prune_write(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iovec *vec, int32_t count,
- struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
-{
- int32_t i;
- size_t to_copy;
- size_t copied = 0;
- crypt_local_t *local = frame->local;
- struct avec_config *conf = &local->data_conf;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- if (op_ret == -1)
- goto put_one_call;
-
- /*
- * At first, uptodate head block
- */
- if (iov_length(vec, count) < conf->off_in_head) {
- gf_log(this->name, GF_LOG_WARNING,
- "Failed to uptodate head block for prune");
- local->op_ret = -1;
- local->op_errno = EIO;
- goto put_one_call;
- }
- local->vec.iov_len = conf->off_in_head;
- local->vec.iov_base = GF_CALLOC(1, local->vec.iov_len, gf_crypt_mt_data);
-
- if (local->vec.iov_base == NULL) {
- gf_log(this->name, GF_LOG_WARNING,
- "Failed to calloc head block for prune");
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto put_one_call;
- }
- for (i = 0; i < count; i++) {
- to_copy = vec[i].iov_len;
- if (to_copy > local->vec.iov_len - copied)
- to_copy = local->vec.iov_len - copied;
-
- memcpy((char *)local->vec.iov_base + copied, vec[i].iov_base, to_copy);
- copied += to_copy;
- if (copied == local->vec.iov_len)
- break;
- }
- /*
- * perform prune with aligned offset
- * (i.e. at this step we prune a bit
- * more then it is needed
- */
- STACK_WIND(frame, prune_submit_file_tail, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate, local->fd,
- conf->aligned_offset, local->xdata);
- return 0;
-put_one_call:
- put_one_call_ftruncate(frame, this);
- return 0;
-}
-
-/*
- * Perform a read-prune-write sequence
- */
-int32_t
-read_prune_write(call_frame_t *frame, xlator_t *this)
-{
- int32_t ret = 0;
- dict_t *dict = NULL;
- crypt_local_t *local = frame->local;
- struct avec_config *conf = &local->data_conf;
- struct object_cipher_info *object = &local->info->cinfo;
-
- set_local_io_params_ftruncate(frame, object);
- get_one_call_nolock(frame);
-
- if ((conf->orig_offset & (object_alg_blksize(object) - 1)) == 0) {
- /*
- * cblock-aligned prune:
- * we don't need read and write components,
- * just cut file body
- */
- gf_log("crypt", GF_LOG_DEBUG, "prune without RMW (at offset %llu",
- (unsigned long long)conf->orig_offset);
-
- STACK_WIND(frame, prune_complete, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate, local->fd,
- conf->orig_offset, local->xdata);
- return 0;
- }
- gf_log("crypt", GF_LOG_DEBUG, "prune with RMW (at offset %llu",
- (unsigned long long)conf->orig_offset);
- /*
- * We are about to perform the "read" component of the
- * read-prune-write sequence. It means that we need to
- * read encrypted data from disk and decrypt it.
- * So, the crypt translator does STACK_WIND to itself.
- *
- * Pass current file size to crypt_readv()
-
- */
- dict = dict_new();
- if (!dict) {
- gf_log("crypt", GF_LOG_WARNING, "Can not alloc dict");
- ret = ENOMEM;
- goto exit;
- }
- ret = dict_set(dict, FSIZE_XATTR_PREFIX,
- data_from_uint64(local->cur_file_size));
- if (ret) {
- gf_log("crypt", GF_LOG_WARNING, "Can not set dict");
- goto exit;
- }
- STACK_WIND(frame, prune_write, this, this->fops->readv, /* crypt_readv */
- local->fd, get_atom_size(object), /* bytes to read */
- conf->aligned_offset, /* offset to read from */
- 0, dict);
-exit:
- if (dict)
- dict_unref(dict);
- return ret;
-}
-
-/*
- * File prune is more complicated than expand.
- * First we need to read the latest atom to not lose info
- * needed for proper update. Also we need to make sure that
- * every component of read-prune-write sequence leaves data
- * consistent
- *
- * Non-cblock aligned prune is performed as read-prune-write
- * sequence:
- *
- * 1) read the latest atom;
- * 2) perform cblock-aligned prune
- * 3) issue a write request for the end-of-file
- */
-int32_t
-prune_file(call_frame_t *frame, xlator_t *this, uint64_t offset)
-{
- int32_t ret;
-
- ret = prepare_for_prune(frame, this, offset);
- if (ret)
- return ret;
- return read_prune_write(frame, this);
-}
-
-int32_t
-expand_file(call_frame_t *frame, xlator_t *this, uint64_t offset)
-{
- int32_t ret;
- crypt_local_t *local = frame->local;
-
- ret = prepare_for_submit_hole(frame, this, local->old_file_size,
- offset - local->old_file_size);
- if (ret)
- return ret;
- submit_hole(frame, this);
- return 0;
-}
-
-static int32_t
-ftruncate_trivial_completion(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *dict)
-{
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- local->prebuf = *buf;
- local->postbuf = *buf;
-
- local->prebuf.ia_size = local->cur_file_size;
- local->postbuf.ia_size = local->cur_file_size;
-
- get_one_call(frame);
- put_one_call_ftruncate(frame, this);
- return 0;
-}
-
-static int32_t
-do_ftruncate(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *dict, dict_t *xdata)
-{
- data_t *data;
- crypt_local_t *local = frame->local;
-
- if (op_ret)
- goto error;
- /*
- * extract regular file size
- */
- data = dict_get(dict, FSIZE_XATTR_PREFIX);
- if (!data) {
- gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
- op_errno = EIO;
- goto error;
- }
- local->old_file_size = local->cur_file_size = data_to_uint64(data);
-
- if (local->data_conf.orig_offset == local->cur_file_size) {
-#if DEBUG_CRYPT
- gf_log("crypt", GF_LOG_DEBUG,
- "trivial ftruncate (current file size %llu)",
- (unsigned long long)local->cur_file_size);
-#endif
- goto trivial;
- } else if (local->data_conf.orig_offset < local->cur_file_size) {
-#if DEBUG_CRYPT
- gf_log("crypt", GF_LOG_DEBUG, "prune from %llu to %llu",
- (unsigned long long)local->cur_file_size,
- (unsigned long long)local->data_conf.orig_offset);
-#endif
- op_errno = prune_file(frame, this, local->data_conf.orig_offset);
- } else {
-#if DEBUG_CRYPT
- gf_log("crypt", GF_LOG_DEBUG, "expand from %llu to %llu",
- (unsigned long long)local->cur_file_size,
- (unsigned long long)local->data_conf.orig_offset);
-#endif
- op_errno = expand_file(frame, this, local->data_conf.orig_offset);
- }
- if (op_errno)
- goto error;
- return 0;
-trivial:
- STACK_WIND(frame, ftruncate_trivial_completion, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat, local->fd, NULL);
- return 0;
-error:
- /*
- * finish with ftruncate
- */
- local->op_ret = -1;
- local->op_errno = op_errno;
-
- get_one_call_nolock(frame);
- put_one_call_ftruncate(frame, this);
- return 0;
-}
-
-static int32_t
-crypt_ftruncate_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (op_ret < 0)
- goto error;
- /*
- * An access has been granted,
- * retrieve file size first
- */
- STACK_WIND(frame, do_ftruncate, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fgetxattr, local->fd,
- FSIZE_XATTR_PREFIX, NULL);
- return 0;
-error:
- get_one_call_nolock(frame);
- put_one_call_ftruncate(frame, this);
- return 0;
-}
-
-/*
- * ftruncate is performed in 2 steps:
- * . receive file size;
- * . expand or prune file.
- */
-static int32_t
-crypt_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- dict_t *xdata)
-{
- int32_t ret;
- crypt_local_t *local;
- struct crypt_inode_info *info;
- struct gf_flock lock = {
- 0,
- };
-
- local = crypt_alloc_local(frame, this, GF_FOP_FTRUNCATE);
- if (!local) {
- ret = ENOMEM;
- goto error;
- }
- local->xattr = dict_new();
- if (!local->xattr) {
- ret = ENOMEM;
- goto error;
- }
- local->fd = fd_ref(fd);
- info = local_get_inode_info(local, this);
- if (info == NULL) {
- ret = EINVAL;
- goto error;
- }
- if (!object_alg_atomic(&info->cinfo)) {
- ret = EINVAL;
- goto error;
- }
- local->data_conf.orig_offset = offset;
- if (xdata)
- local->xdata = dict_ref(xdata);
-
- lock.l_len = 0;
- lock.l_start = 0;
- lock.l_type = F_WRLCK;
- lock.l_whence = SEEK_SET;
-
- STACK_WIND(frame, crypt_ftruncate_finodelk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, fd, F_SETLKW,
- &lock, NULL);
- return 0;
-error:
- if (local && local->fd)
- fd_unref(fd);
- if (local && local->xdata)
- dict_unref(xdata);
- if (local && local->xattr)
- dict_unref(local->xattr);
- if (local && local->info)
- free_inode_info(local->info);
-
- CRYPT_STACK_UNWIND(ftruncate, frame, -1, ret, NULL, NULL, NULL);
- return 0;
-}
-
-/* ->flush_cbk() */
-int32_t
-truncate_end(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- CRYPT_STACK_UNWIND(truncate, frame, op_ret, op_errno, &local->prebuf,
- &local->postbuf, local->xdata);
- return 0;
-}
-
-/* ftruncate_cbk() */
-int32_t
-truncate_flush(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- fd_t *fd = local->fd;
- local->prebuf = *prebuf;
- local->postbuf = *postbuf;
-
- STACK_WIND(frame, truncate_end, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush, fd, NULL);
- fd_unref(fd);
- return 0;
-}
-
-/*
- * is called as ->open_cbk()
- */
-static int32_t
-truncate_begin(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- if (op_ret < 0) {
- fd_unref(fd);
- CRYPT_STACK_UNWIND(truncate, frame, op_ret, op_errno, NULL, NULL, NULL);
- return 0;
- } else {
- fd_bind(fd);
- }
- /*
- * crypt_truncate() is implemented via crypt_ftruncate(),
- * so the crypt xlator does STACK_WIND to itself here
- */
- STACK_WIND(frame, truncate_flush, this,
- this->fops->ftruncate, /* crypt_ftruncate */
- fd, local->offset, NULL);
- return 0;
-}
-
-/*
- * crypt_truncate() is implemented via crypt_ftruncate() as a
- * sequence crypt_open() - crypt_ftruncate() - truncate_flush()
- */
-int32_t
-crypt_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
- dict_t *xdata)
-{
- fd_t *fd;
- crypt_local_t *local;
-
-#if DEBUG_CRYPT
- gf_log(this->name, GF_LOG_DEBUG, "truncate file %s at offset %llu",
- loc->path, (unsigned long long)offset);
-#endif
- local = crypt_alloc_local(frame, this, GF_FOP_TRUNCATE);
- if (!local)
- goto error;
-
- fd = fd_create(loc->inode, frame->root->pid);
- if (!fd) {
- gf_log(this->name, GF_LOG_ERROR, "Can not create fd");
- goto error;
- }
- local->fd = fd;
- local->offset = offset;
- local->xdata = xdata;
- STACK_WIND(frame, truncate_begin, this, this->fops->open, /* crypt_open() */
- loc, O_RDWR, fd, NULL);
- return 0;
-error:
- CRYPT_STACK_UNWIND(truncate, frame, -1, EINVAL, NULL, NULL, NULL);
- return 0;
-}
-
-end_writeback_handler_t
-dispatch_end_writeback(glusterfs_fop_t fop)
-{
- switch (fop) {
- case GF_FOP_WRITE:
- return end_writeback_writev;
- case GF_FOP_FTRUNCATE:
- return end_writeback_ftruncate;
- default:
- gf_log("crypt", GF_LOG_WARNING, "Bad wb operation %d", fop);
- return NULL;
- }
-}
-
-/*
- * true, if the caller needs metadata string
- */
-static int32_t
-is_custom_mtd(dict_t *xdata)
-{
- data_t *data;
- uint32_t flags;
-
- if (!xdata)
- return 0;
-
- data = dict_get(xdata, MSGFLAGS_PREFIX);
- if (!data)
- return 0;
- if (data->len != sizeof(uint32_t)) {
- gf_log("crypt", GF_LOG_WARNING, "Bad msgflags size (%d)", data->len);
- return -1;
- }
- flags = *((uint32_t *)data->data);
- return msgflags_check_mtd_lock(&flags);
-}
-
-static int32_t
-crypt_open_done(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- if (op_ret < 0)
- gf_log(this->name, GF_LOG_WARNING, "mtd unlock failed (%d)", op_errno);
- put_one_call_open(frame);
- return 0;
-}
-
-static void
-crypt_open_tail(call_frame_t *frame, xlator_t *this)
-{
- struct gf_flock lock = {
- 0,
- };
- crypt_local_t *local = frame->local;
-
- lock.l_type = F_UNLCK;
- lock.l_whence = SEEK_SET;
- lock.l_start = 0;
- lock.l_len = 0;
- lock.l_pid = 0;
-
- STACK_WIND(frame, crypt_open_done, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, local->fd,
- F_SETLKW, &lock, NULL);
-}
-
-/*
- * load private inode info at open time
- * called as ->fgetxattr_cbk()
- */
-static int
-load_mtd_open(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *dict, dict_t *xdata)
-{
- int32_t ret;
- gf_boolean_t upload_info;
- data_t *mtd;
- uint64_t value = 0;
- struct crypt_inode_info *info;
- crypt_local_t *local = frame->local;
- crypt_private_t *priv = this->private;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (local->fd->inode->ia_type == IA_IFLNK)
- goto exit;
- if (op_ret < 0)
- goto exit;
- /*
- * first, check for cached info
- */
- ret = inode_ctx_get(local->fd->inode, this, &value);
- if (ret != -1) {
- info = (struct crypt_inode_info *)(long)value;
- if (info == NULL) {
- gf_log(this->name, GF_LOG_WARNING,
- "Inode info expected, but not found");
- local->op_ret = -1;
- local->op_errno = EIO;
- goto exit;
- }
- /*
- * info has been found in the cache
- */
- upload_info = _gf_false;
- } else {
- /*
- * info hasn't been found in the cache.
- */
- info = alloc_inode_info(local, local->loc);
- if (!info) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto exit;
- }
- init_inode_info_head(info, local->fd);
- upload_info = _gf_true;
- }
- /*
- * extract metadata
- */
- mtd = dict_get(dict, CRYPTO_FORMAT_PREFIX);
- if (!mtd) {
- local->op_ret = -1;
- local->op_errno = ENOENT;
- gf_log(this->name, GF_LOG_WARNING, "Format string wasn't found");
- goto exit;
- }
- /*
- * authenticate metadata against the path
- */
- ret = open_format((unsigned char *)mtd->data, mtd->len, local->loc, info,
- get_master_cinfo(priv), local, upload_info);
- if (ret) {
- local->op_ret = -1;
- local->op_errno = ret;
- goto exit;
- }
- if (upload_info) {
- ret = init_inode_info_tail(info, get_master_cinfo(priv));
- if (ret) {
- local->op_ret = -1;
- local->op_errno = ret;
- goto exit;
- }
- ret = inode_ctx_put(local->fd->inode, this, (uint64_t)(long)info);
- if (ret == -1) {
- local->op_ret = -1;
- local->op_errno = EIO;
- goto exit;
- }
- }
- if (local->custom_mtd) {
- /*
- * pass the metadata string to the customer
- */
- ret = dict_set_static_bin(local->xdata, CRYPTO_FORMAT_PREFIX, mtd->data,
- mtd->len);
- if (ret) {
- local->op_ret = -1;
- local->op_errno = ret;
- goto exit;
- }
- }
-exit:
- if (!local->custom_mtd)
- crypt_open_tail(frame, this);
- else
- put_one_call_open(frame);
- return 0;
-}
-
-static int32_t
-crypt_open_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (op_ret < 0) {
- gf_log(this->name, GF_LOG_WARNING, "finodelk (LOCK) failed");
- goto exit;
- }
- STACK_WIND(frame, load_mtd_open, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fgetxattr, local->fd,
- CRYPTO_FORMAT_PREFIX, NULL);
- return 0;
-exit:
- put_one_call_open(frame);
- return 0;
-}
-
-/*
- * verify metadata against the specified pathname
- */
-static int32_t
-crypt_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- struct gf_flock lock = {
- 0,
- };
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (local->fd->inode->ia_type == IA_IFLNK)
- goto exit;
- if (op_ret < 0)
- goto exit;
- if (xdata)
- local->xdata = dict_ref(xdata);
- else if (local->custom_mtd) {
- local->xdata = dict_new();
- if (!local->xdata) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- gf_log("crypt", GF_LOG_ERROR,
- "Can not get new dict for mtd string");
- goto exit;
- }
- }
- lock.l_len = 0;
- lock.l_start = 0;
- lock.l_type = local->custom_mtd ? F_WRLCK : F_RDLCK;
- lock.l_whence = SEEK_SET;
-
- STACK_WIND(frame, crypt_open_finodelk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, fd, F_SETLKW,
- &lock, NULL);
- return 0;
-exit:
- put_one_call_open(frame);
- return 0;
-}
-
-static int32_t
-crypt_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, dict_t *xdata)
-{
- int32_t ret = ENOMEM;
- crypt_local_t *local;
-
- local = crypt_alloc_local(frame, this, GF_FOP_OPEN);
- if (!local)
- goto error;
- local->loc = GF_CALLOC(1, sizeof(loc_t), gf_crypt_mt_loc);
- if (!local->loc) {
- ret = ENOMEM;
- goto error;
- }
- ret = loc_copy(local->loc, loc);
- if (ret) {
- GF_FREE(local->loc);
- ret = ENOMEM;
- goto error;
- }
- local->fd = fd_ref(fd);
-
- ret = is_custom_mtd(xdata);
- if (ret < 0) {
- loc_wipe(local->loc);
- GF_FREE(local->loc);
- ret = EINVAL;
- goto error;
- }
- local->custom_mtd = ret;
-
- if ((flags & O_ACCMODE) == O_WRONLY)
- /*
- * we can't open O_WRONLY, because
- * we need to do read-modify-write
- */
- flags = (flags & ~O_ACCMODE) | O_RDWR;
- /*
- * Make sure that out translated offsets
- * and counts won't be ignored
- */
- flags &= ~O_APPEND;
- get_one_call_nolock(frame);
- STACK_WIND(frame, crypt_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
- return 0;
-error:
- CRYPT_STACK_UNWIND(open, frame, -1, ret, NULL, NULL);
- return 0;
-}
-
-static int32_t
-init_inode_info_tail(struct crypt_inode_info *info,
- struct master_cipher_info *master)
-{
- int32_t ret;
- struct object_cipher_info *object = &info->cinfo;
-
-#if DEBUG_CRYPT
- gf_log("crypt", GF_LOG_DEBUG, "Init inode info for object %s",
- uuid_utoa(info->oid));
-#endif
- ret = data_cipher_algs[object->o_alg][object->o_mode].set_private(info,
- master);
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR, "Set private info failed");
- return ret;
- }
- return 0;
-}
-
-/*
- * Init inode info at ->create() time
- */
-static void
-init_inode_info_create(struct crypt_inode_info *info,
- struct master_cipher_info *master, data_t *data)
-{
- struct object_cipher_info *object;
-
- info->nr_minor = CRYPT_XLATOR_ID;
- memcpy(info->oid, data->data, data->len);
-
- object = &info->cinfo;
-
- object->o_alg = master->m_alg;
- object->o_mode = master->m_mode;
- object->o_block_bits = master->m_block_bits;
- object->o_dkey_size = master->m_dkey_size;
-}
-
-static void
-init_inode_info_head(struct crypt_inode_info *info, fd_t *fd)
-{
- memcpy(info->oid, fd->inode->gfid, sizeof(uuid_t));
-}
-
-static int32_t
-crypt_create_done(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_private_t *priv = this->private;
- crypt_local_t *local = frame->local;
- struct crypt_inode_info *info = local->info;
- fd_t *local_fd = local->fd;
- dict_t *local_xdata = local->xdata;
- inode_t *local_inode = local->inode;
-
- if (op_ret < 0) {
- free_inode_info(info);
- goto unwind;
- }
- op_errno = init_inode_info_tail(info, get_master_cinfo(priv));
- if (op_errno) {
- op_ret = -1;
- free_inode_info(info);
- goto unwind;
- }
- /*
- * FIXME: drop major subversion number
- */
- op_ret = inode_ctx_put(local->fd->inode, this, (uint64_t)(long)info);
- if (op_ret == -1) {
- op_errno = EIO;
- free_inode_info(info);
- goto unwind;
- }
-unwind:
- free_format(local);
- CRYPT_STACK_UNWIND(create, frame, op_ret, op_errno, local_fd, local_inode,
- &local->buf, &local->prebuf, &local->postbuf,
- local_xdata);
- fd_unref(local_fd);
- inode_unref(local_inode);
- if (local_xdata)
- dict_unref(local_xdata);
- return 0;
-}
-
-static int
-crypt_create_tail(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- struct gf_flock lock = {
- 0,
- };
- crypt_local_t *local = frame->local;
- fd_t *local_fd = local->fd;
- dict_t *local_xdata = local->xdata;
- inode_t *local_inode = local->inode;
-
- dict_unref(local->xattr);
-
- if (op_ret < 0)
- goto error;
-
- lock.l_type = F_UNLCK;
- lock.l_whence = SEEK_SET;
- lock.l_start = 0;
- lock.l_len = 0;
- lock.l_pid = 0;
-
- STACK_WIND(frame, crypt_create_done, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, local->fd,
- F_SETLKW, &lock, NULL);
- return 0;
-error:
- free_inode_info(local->info);
- free_format(local);
-
- CRYPT_STACK_UNWIND(create, frame, op_ret, op_errno, local_fd, local_inode,
- &local->buf, &local->prebuf, &local->postbuf,
- local_xdata);
-
- fd_unref(local_fd);
- inode_unref(local_inode);
- if (local_xdata)
- dict_unref(local_xdata);
- return 0;
-}
-
-static int32_t
-crypt_create_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- struct crypt_inode_info *info = local->info;
-
- if (op_ret < 0)
- goto error;
-
- STACK_WIND(frame, crypt_create_tail, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetxattr, local->fd,
- local->xattr, /* CRYPTO_FORMAT_PREFIX */
- 0, NULL);
- return 0;
-error:
- free_inode_info(info);
- free_format(local);
- fd_unref(local->fd);
- dict_unref(local->xattr);
- if (local->xdata)
- dict_unref(local->xdata);
-
- CRYPT_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, NULL, NULL,
- NULL, NULL);
- return 0;
-}
-
-/*
- * Create and store crypt-specific format on disk;
- * Populate cache with private inode info
- */
-static int32_t
-crypt_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- struct gf_flock lock = {
- 0,
- };
- crypt_local_t *local = frame->local;
- struct crypt_inode_info *info = local->info;
-
- if (op_ret < 0)
- goto error;
- if (xdata)
- local->xdata = dict_ref(xdata);
- local->inode = inode_ref(inode);
- local->buf = *buf;
- local->prebuf = *preparent;
- local->postbuf = *postparent;
-
- lock.l_len = 0;
- lock.l_start = 0;
- lock.l_type = F_WRLCK;
- lock.l_whence = SEEK_SET;
-
- STACK_WIND(frame, crypt_create_finodelk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, local->fd,
- F_SETLKW, &lock, NULL);
- return 0;
-error:
- free_inode_info(info);
- free_format(local);
- fd_unref(local->fd);
- dict_unref(local->xattr);
-
- CRYPT_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, NULL, NULL,
- NULL, NULL);
- return 0;
-}
-
-static int32_t
-crypt_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
-{
- int ret;
- data_t *data;
- crypt_local_t *local;
- crypt_private_t *priv;
- struct master_cipher_info *master;
- struct crypt_inode_info *info;
-
- priv = this->private;
- master = get_master_cinfo(priv);
-
- if (master_alg_atomic(master)) {
- /*
- * We can't open O_WRONLY, because we
- * need to do read-modify-write.
- */
- if ((flags & O_ACCMODE) == O_WRONLY)
- flags = (flags & ~O_ACCMODE) | O_RDWR;
- /*
- * Make sure that out translated offsets
- * and counts won't be ignored
- */
- flags &= ~O_APPEND;
- }
- local = crypt_alloc_local(frame, this, GF_FOP_CREATE);
- if (!local) {
- ret = ENOMEM;
- goto error;
- }
- data = dict_get(xdata, "gfid-req");
- if (!data) {
- ret = EINVAL;
- gf_log("crypt", GF_LOG_WARNING, "gfid not found");
- goto error;
- }
- if (data->len != sizeof(uuid_t)) {
- ret = EINVAL;
- gf_log("crypt", GF_LOG_WARNING, "bad gfid size (%d), should be %d",
- (int)data->len, (int)sizeof(uuid_t));
- goto error;
- }
- info = alloc_inode_info(local, loc);
- if (!info) {
- ret = ENOMEM;
- goto error;
- }
- /*
- * NOTE:
- * format has to be created BEFORE
- * proceeding to the untrusted server
- */
- ret = alloc_format_create(local);
- if (ret) {
- free_inode_info(info);
- goto error;
- }
- init_inode_info_create(info, master, data);
-
- ret = create_format(local->format, loc, info, master);
- if (ret) {
- free_inode_info(info);
- goto error;
- }
- local->xattr = dict_new();
- if (!local->xattr) {
- free_inode_info(info);
- free_format(local);
- goto error;
- }
- ret = dict_set_static_bin(local->xattr, CRYPTO_FORMAT_PREFIX, local->format,
- new_format_size());
- if (ret) {
- dict_unref(local->xattr);
- free_inode_info(info);
- free_format(local);
- ret = EINVAL;
- goto error;
- }
- ret = dict_set(local->xattr, FSIZE_XATTR_PREFIX, data_from_uint64(0));
- if (ret) {
- dict_unref(local->xattr);
- free_inode_info(info);
- free_format(local);
- ret = ENOMEM;
- goto error;
- }
- local->fd = fd_ref(fd);
-
- STACK_WIND(frame, crypt_create_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
- xdata);
- return 0;
-error:
- gf_log("crypt", GF_LOG_WARNING, "can not create file");
- CRYPT_STACK_UNWIND(create, frame, -1, ret, NULL, NULL, NULL, NULL, NULL,
- NULL);
- return 0;
-}
-
-/*
- * FIXME: this should depends on the version of format string
- */
-static int32_t
-filter_crypt_xattr(dict_t *dict, char *key, data_t *value, void *data)
-{
- dict_del(dict, key);
- return 0;
-}
-
-static int32_t
-crypt_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
- int32_t flags, dict_t *xdata)
-{
- dict_foreach_fnmatch(dict, "trusted.glusterfs.crypt*", filter_crypt_xattr,
- NULL);
- STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
- return 0;
-}
-
-/*
- * TBD: verify file metadata before wind
- */
-static int32_t
-crypt_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
- int32_t flags, dict_t *xdata)
-{
- dict_foreach_fnmatch(dict, "trusted.glusterfs.crypt*", filter_crypt_xattr,
- NULL);
- STACK_WIND(frame, default_setxattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
- return 0;
-}
-
-/*
- * called as flush_cbk()
- */
-static int32_t
-linkop_end(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- linkop_unwind_handler_t unwind_fn;
- unwind_fn = linkop_unwind_dispatch(local->fop);
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (op_ret < 0 && op_errno == ENOENT &&
- local->loc->inode->ia_type == IA_IFLNK) {
- local->op_ret = 0;
- local->op_errno = 0;
- }
- unwind_fn(frame);
- return 0;
-}
-
-/*
- * unpin inode on the server
- */
-static int32_t
-link_flush(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- if (op_ret < 0)
- goto error;
- if (local->xdata) {
- dict_unref(local->xdata);
- local->xdata = NULL;
- }
- if (xdata)
- local->xdata = dict_ref(xdata);
- local->inode = inode_ref(inode);
- local->buf = *buf;
- local->prebuf = *preparent;
- local->postbuf = *postparent;
-
- STACK_WIND(frame, linkop_end, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush, local->fd, NULL);
- return 0;
-error:
- local->op_ret = -1;
- local->op_errno = op_errno;
- link_unwind(frame);
- return 0;
-}
-
-void
-link_unwind(call_frame_t *frame)
-{
- crypt_local_t *local = frame->local;
- dict_t *xdata;
- dict_t *xattr;
- inode_t *inode;
-
- if (!local) {
- CRYPT_STACK_UNWIND(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
- NULL);
- return;
- }
- xdata = local->xdata;
- xattr = local->xattr;
- inode = local->inode;
-
- if (local->loc) {
- loc_wipe(local->loc);
- GF_FREE(local->loc);
- }
- if (local->newloc) {
- loc_wipe(local->newloc);
- GF_FREE(local->newloc);
- }
- if (local->fd)
- fd_unref(local->fd);
- if (local->format)
- GF_FREE(local->format);
-
- CRYPT_STACK_UNWIND(link, frame, local->op_ret, local->op_errno, inode,
- &local->buf, &local->prebuf, &local->postbuf, xdata);
- if (xdata)
- dict_unref(xdata);
- if (xattr)
- dict_unref(xattr);
- if (inode)
- inode_unref(inode);
-}
-
-void
-link_wind(call_frame_t *frame, xlator_t *this)
-{
- crypt_local_t *local = frame->local;
-
- STACK_WIND(frame, link_flush, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->link, local->loc, local->newloc,
- local->xdata);
-}
-
-/*
- * unlink()
- */
-static int32_t
-unlink_flush(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- if (op_ret < 0)
- goto error;
- local->prebuf = *preparent;
- local->postbuf = *postparent;
- if (local->xdata) {
- dict_unref(local->xdata);
- local->xdata = NULL;
- }
- if (xdata)
- local->xdata = dict_ref(xdata);
-
- STACK_WIND(frame, linkop_end, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush, local->fd, NULL);
- return 0;
-error:
- local->op_ret = -1;
- local->op_errno = op_errno;
- unlink_unwind(frame);
- return 0;
-}
-
-void
-unlink_unwind(call_frame_t *frame)
-{
- crypt_local_t *local = frame->local;
- dict_t *xdata;
- dict_t *xattr;
-
- if (!local) {
- CRYPT_STACK_UNWIND(unlink, frame, -1, ENOMEM, NULL, NULL, NULL);
- return;
- }
- xdata = local->xdata;
- xattr = local->xattr;
- if (local->loc) {
- loc_wipe(local->loc);
- GF_FREE(local->loc);
- }
- if (local->fd)
- fd_unref(local->fd);
- if (local->format)
- GF_FREE(local->format);
-
- CRYPT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
- &local->prebuf, &local->postbuf, xdata);
- if (xdata)
- dict_unref(xdata);
- if (xattr)
- dict_unref(xattr);
-}
-
-void
-unlink_wind(call_frame_t *frame, xlator_t *this)
-{
- crypt_local_t *local = frame->local;
-
- STACK_WIND(frame, unlink_flush, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink, local->loc, local->flags,
- local->xdata);
-}
-
-void
-rename_unwind(call_frame_t *frame)
-{
- crypt_local_t *local = frame->local;
- dict_t *xdata;
- dict_t *xattr;
- struct iatt *prenewparent;
- struct iatt *postnewparent;
-
- if (!local) {
- CRYPT_STACK_UNWIND(rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
- NULL, NULL);
- return;
- }
- xdata = local->xdata;
- xattr = local->xattr;
- prenewparent = local->prenewparent;
- postnewparent = local->postnewparent;
-
- if (local->loc) {
- loc_wipe(local->loc);
- GF_FREE(local->loc);
- }
- if (local->newloc) {
- loc_wipe(local->newloc);
- GF_FREE(local->newloc);
- }
- if (local->fd)
- fd_unref(local->fd);
- if (local->format)
- GF_FREE(local->format);
-
- CRYPT_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno,
- &local->buf, &local->prebuf, &local->postbuf,
- prenewparent, postnewparent, xdata);
- if (xdata)
- dict_unref(xdata);
- if (xattr)
- dict_unref(xattr);
- if (prenewparent)
- GF_FREE(prenewparent);
- if (postnewparent)
- GF_FREE(postnewparent);
-}
-
-/*
- * called as flush_cbk()
- */
-static int32_t
-rename_end(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- rename_unwind(frame);
- return 0;
-}
-
-static int32_t
-rename_flush(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *buf, struct iatt *preoldparent,
- struct iatt *postoldparent, struct iatt *prenewparent,
- struct iatt *postnewparent, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- if (op_ret < 0)
- goto error;
- dict_unref(local->xdata);
- local->xdata = NULL;
- if (xdata)
- local->xdata = dict_ref(xdata);
-
- local->buf = *buf;
- local->prebuf = *preoldparent;
- local->postbuf = *postoldparent;
- if (prenewparent) {
- local->prenewparent = GF_CALLOC(1, sizeof(*prenewparent),
- gf_crypt_mt_iatt);
- if (!local->prenewparent) {
- op_errno = ENOMEM;
- goto error;
- }
- *local->prenewparent = *prenewparent;
- }
- if (postnewparent) {
- local->postnewparent = GF_CALLOC(1, sizeof(*postnewparent),
- gf_crypt_mt_iatt);
- if (!local->postnewparent) {
- op_errno = ENOMEM;
- goto error;
- }
- *local->postnewparent = *postnewparent;
- }
- STACK_WIND(frame, rename_end, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush, local->fd, NULL);
- return 0;
-error:
- local->op_ret = -1;
- local->op_errno = op_errno;
- rename_unwind(frame);
- return 0;
-}
-
-void
-rename_wind(call_frame_t *frame, xlator_t *this)
-{
- crypt_local_t *local = frame->local;
-
- STACK_WIND(frame, rename_flush, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename, local->loc, local->newloc,
- local->xdata);
-}
-
-static int32_t
-__do_linkop(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- linkop_wind_handler_t wind_fn;
- linkop_unwind_handler_t unwind_fn;
-
- wind_fn = linkop_wind_dispatch(local->fop);
- unwind_fn = linkop_unwind_dispatch(local->fop);
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (op_ret >= 0)
- wind_fn(frame, this);
- else {
- gf_log(this->name, GF_LOG_WARNING, "mtd unlock failed (%d)", op_errno);
- unwind_fn(frame);
- }
- return 0;
-}
-
-static int32_t
-do_linkop(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
-{
- struct gf_flock lock = {
- 0,
- };
- crypt_local_t *local = frame->local;
- linkop_unwind_handler_t unwind_fn;
-
- unwind_fn = linkop_unwind_dispatch(local->fop);
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- if (op_ret < 0)
- goto error;
-
- lock.l_type = F_UNLCK;
- lock.l_whence = SEEK_SET;
- lock.l_start = 0;
- lock.l_len = 0;
- lock.l_pid = 0;
-
- STACK_WIND(frame, __do_linkop, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, local->fd,
- F_SETLKW, &lock, NULL);
- return 0;
-error:
- unwind_fn(frame);
- return 0;
-}
-
-/*
- * Update the metadata string (against the new pathname);
- * submit the result
- */
-static int32_t
-linkop_begin(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- gf_boolean_t upload_info;
- crypt_local_t *local = frame->local;
- crypt_private_t *priv = this->private;
- struct crypt_inode_info *info;
- data_t *old_mtd;
- uint32_t new_mtd_size;
- uint64_t value = 0;
- void (*unwind_fn)(call_frame_t * frame);
- mtd_op_t mop;
-
- unwind_fn = linkop_unwind_dispatch(local->fop);
- mop = linkop_mtdop_dispatch(local->fop);
-
- if (op_ret < 0) {
- /*
- * verification failed
- */
- goto error;
- } else {
- fd_bind(fd);
- }
-
- old_mtd = dict_get(xdata, CRYPTO_FORMAT_PREFIX);
- if (!old_mtd) {
- op_errno = EIO;
- gf_log(this->name, GF_LOG_DEBUG, "Metadata string wasn't found");
- goto error;
- }
- new_mtd_size = format_size(mop, old_mtd->len);
- op_errno = alloc_format(local, new_mtd_size);
- if (op_errno)
- goto error;
- /*
- * check for cached info
- */
- op_ret = inode_ctx_get(fd->inode, this, &value);
- if (op_ret != -1) {
- info = (struct crypt_inode_info *)(long)value;
- if (info == NULL) {
- gf_log(this->name, GF_LOG_WARNING, "Inode info was not found");
- op_errno = EINVAL;
- goto error;
- }
- /*
- * info was found in the cache
- */
- local->info = info;
- upload_info = _gf_false;
- } else {
- /*
- * info wasn't found in the cache;
- */
- info = alloc_inode_info(local, local->loc);
- if (!info)
- goto error;
- init_inode_info_head(info, fd);
- local->info = info;
- upload_info = _gf_true;
- }
- op_errno = open_format((unsigned char *)old_mtd->data, old_mtd->len,
- local->loc, info, get_master_cinfo(priv), local,
- upload_info);
- if (op_errno)
- goto error;
- if (upload_info == _gf_true) {
- op_errno = init_inode_info_tail(info, get_master_cinfo(priv));
- if (op_errno)
- goto error;
- op_errno = inode_ctx_put(fd->inode, this, (uint64_t)(long)(info));
- if (op_errno == -1) {
- op_errno = EIO;
- goto error;
- }
- }
- /*
- * update the format string (append/update/cup a MAC)
- */
- op_errno = update_format(local->format, (unsigned char *)old_mtd->data,
- old_mtd->len, local->mac_idx, mop, local->newloc,
- info, get_master_cinfo(priv), local);
- if (op_errno)
- goto error;
- /*
- * store the new format string on the server
- */
- if (new_mtd_size) {
- op_errno = dict_set_static_bin(local->xattr, CRYPTO_FORMAT_PREFIX,
- local->format, new_mtd_size);
- if (op_errno)
- goto error;
- }
- STACK_WIND(frame, do_linkop, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setxattr, local->loc, local->xattr, 0,
- NULL);
- return 0;
-error:
- local->op_ret = -1;
- local->op_errno = op_errno;
- unwind_fn(frame);
- return 0;
-}
-
-static int32_t
-linkop_grab_local(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
- loc_t *newloc, int flags, dict_t *xdata, glusterfs_fop_t op)
-{
- int32_t ret = ENOMEM;
- fd_t *fd;
- crypt_local_t *local;
-
- local = crypt_alloc_local(frame, this, op);
- if (!local)
- goto error;
- if (xdata)
- local->xdata = dict_ref(xdata);
-
- fd = fd_create(oldloc->inode, frame->root->pid);
- if (!fd) {
- gf_log(this->name, GF_LOG_ERROR, "Can not create fd");
- goto error;
- }
- local->fd = fd;
- local->flags = flags;
- local->loc = GF_CALLOC(1, sizeof(loc_t), gf_crypt_mt_loc);
- if (!local->loc)
- goto error;
- ret = loc_copy(local->loc, oldloc);
- if (ret) {
- GF_FREE(local->loc);
- local->loc = NULL;
- goto error;
- }
- if (newloc) {
- local->newloc = GF_CALLOC(1, sizeof(loc_t), gf_crypt_mt_loc);
- if (!local->newloc) {
- loc_wipe(local->loc);
- GF_FREE(local->loc);
- goto error;
- }
- ret = loc_copy(local->newloc, newloc);
- if (ret) {
- loc_wipe(local->loc);
- GF_FREE(local->loc);
- GF_FREE(local->newloc);
- goto error;
- }
- }
- local->xattr = dict_new();
- if (!local->xattr) {
- gf_log(this->name, GF_LOG_ERROR, "Can not create dict");
- ret = ENOMEM;
- goto error;
- }
- return 0;
-
-error:
- if (local) {
- if (local->xdata)
- dict_unref(local->xdata);
- if (local->fd)
- fd_unref(local->fd);
- local->fd = 0;
- local->loc = NULL;
- local->newloc = NULL;
- local->op_ret = -1;
- local->op_errno = ret;
- }
-
- return ret;
-}
-
-/*
- * read and verify locked metadata against the old pathname (via open);
- * update the metadata string in accordance with the new pathname;
- * submit modified metadata;
- * wind;
- */
-static int32_t
-linkop(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
- int flags, dict_t *xdata, glusterfs_fop_t op)
-{
- int32_t ret;
- dict_t *dict;
- crypt_local_t *local;
- void (*unwind_fn)(call_frame_t * frame);
- void (*wind_fn)(call_frame_t * frame, xlator_t * this);
-
- wind_fn = linkop_wind_dispatch(op);
- unwind_fn = linkop_unwind_dispatch(op);
-
- ret = linkop_grab_local(frame, this, oldloc, newloc, flags, xdata, op);
- local = frame->local;
- if (ret)
- goto error;
-
- if (local->fd->inode->ia_type == IA_IFLNK)
- goto wind;
-
- dict = dict_new();
- if (!dict) {
- gf_log(this->name, GF_LOG_ERROR, "Can not create dict");
- ret = ENOMEM;
- goto error;
- }
- /*
- * Set a message to crypt_open() that we need
- * locked metadata string.
- * All link operations (link, unlink, rename)
- * need write lock
- */
- msgflags_set_mtd_wlock(&local->msgflags);
- ret = dict_set_static_bin(dict, MSGFLAGS_PREFIX, &local->msgflags,
- sizeof(local->msgflags));
- if (ret) {
- gf_log(this->name, GF_LOG_ERROR, "Can not set dict");
- dict_unref(dict);
- goto error;
- }
- /*
- * verify metadata against the old pathname
- * and retrieve locked metadata string
- */
- STACK_WIND(frame, linkop_begin, this, this->fops->open, /* crypt_open() */
- oldloc, O_RDWR, local->fd, dict);
- dict_unref(dict);
- return 0;
-
-wind:
- wind_fn(frame, this);
- return 0;
-
-error:
- local->op_ret = -1;
- local->op_errno = ret;
- unwind_fn(frame);
- return 0;
-}
-
-static int32_t
-crypt_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
- dict_t *xdata)
-{
- return linkop(frame, this, oldloc, newloc, 0, xdata, GF_FOP_LINK);
-}
-
-static int32_t
-crypt_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
- dict_t *xdata)
-{
- return linkop(frame, this, loc, NULL, flags, xdata, GF_FOP_UNLINK);
-}
-
-static int32_t
-crypt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
- dict_t *xdata)
-{
- return linkop(frame, this, oldloc, newloc, 0, xdata, GF_FOP_RENAME);
-}
-
-static void
-put_one_call_open(call_frame_t *frame)
-{
- crypt_local_t *local = frame->local;
- if (put_one_call(local)) {
- fd_t *fd = local->fd;
- loc_t *loc = local->loc;
- dict_t *xdata = local->xdata;
-
- CRYPT_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, fd,
- xdata);
- fd_unref(fd);
- if (xdata)
- dict_unref(xdata);
- loc_wipe(loc);
- GF_FREE(loc);
- }
-}
-
-static int32_t
-__crypt_readv_done(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- fd_t *local_fd = local->fd;
- dict_t *local_xdata = local->xdata;
- /* read deals with data configs only */
- struct iovec *avec = local->data_conf.avec;
- char **pool = local->data_conf.pool;
- int blocks_in_pool = local->data_conf.blocks_in_pool;
- struct iobref *iobref = local->iobref;
- struct iobref *iobref_data = local->iobref_data;
-
- if (op_ret < 0) {
- gf_log(this->name, GF_LOG_WARNING, "readv unlock failed (%d)",
- op_errno);
- if (local->op_ret >= 0) {
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- }
- }
- dump_plain_text(local, avec);
-
- gf_log("crypt", GF_LOG_DEBUG,
- "readv: ret_to_user: %d, iovec len: %d, ia_size: %llu",
- (int)(local->rw_count > 0 ? local->rw_count : local->op_ret),
- (int)(local->rw_count > 0 ? iov_length(avec, local->data_conf.acount)
- : 0),
- (unsigned long long)local->buf.ia_size);
-
- CRYPT_STACK_UNWIND(
- readv, frame, local->rw_count > 0 ? local->rw_count : local->op_ret,
- local->op_errno, avec, avec ? local->data_conf.acount : 0, &local->buf,
- local->iobref, local_xdata);
-
- free_avec(avec, pool, blocks_in_pool);
- fd_unref(local_fd);
- if (local_xdata)
- dict_unref(local_xdata);
- if (iobref)
- iobref_unref(iobref);
- if (iobref_data)
- iobref_unref(iobref_data);
- return 0;
-}
-
-static void
-crypt_readv_done(call_frame_t *frame, xlator_t *this)
-{
- if (parent_is_crypt_xlator(frame, this))
- /*
- * don't unlock (it will be done by the parent)
- */
- __crypt_readv_done(frame, NULL, this, 0, 0, NULL);
- else {
- crypt_local_t *local = frame->local;
- struct gf_flock lock = {
- 0,
- };
-
- lock.l_type = F_UNLCK;
- lock.l_whence = SEEK_SET;
- lock.l_start = 0;
- lock.l_len = 0;
- lock.l_pid = 0;
-
- STACK_WIND(frame, __crypt_readv_done, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, local->fd,
- F_SETLKW, &lock, NULL);
- }
-}
-
-static void
-put_one_call_readv(call_frame_t *frame, xlator_t *this)
-{
- crypt_local_t *local = frame->local;
- if (put_one_call(local))
- crypt_readv_done(frame, this);
-}
-
-static int32_t
-__crypt_writev_done(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- fd_t *local_fd = local->fd;
- dict_t *local_xdata = local->xdata;
- int32_t ret_to_user;
-
- if (local->xattr)
- dict_unref(local->xattr);
- /*
- * Calculate amount of butes to be returned
- * to user. We need to subtract paddings that
- * have been written as a part of atom.
- */
- /*
- * subtract head padding
- */
- if (local->rw_count == 0)
- /*
- * Nothing has been written, it must be an error
- */
- ret_to_user = local->op_ret;
- else if (local->rw_count <= local->data_conf.off_in_head) {
- gf_log("crypt", GF_LOG_WARNING, "Incomplete write");
- ret_to_user = 0;
- } else
- ret_to_user = local->rw_count - local->data_conf.off_in_head;
- /*
- * subtract tail padding
- */
- if (ret_to_user > local->data_conf.orig_size)
- ret_to_user = local->data_conf.orig_size;
-
- if (local->iobref)
- iobref_unref(local->iobref);
- if (local->iobref_data)
- iobref_unref(local->iobref_data);
- free_avec_data(local);
- free_avec_hole(local);
-
- gf_log("crypt", GF_LOG_DEBUG, "writev: ret_to_user: %d", ret_to_user);
-
- CRYPT_STACK_UNWIND(writev, frame, ret_to_user, local->op_errno,
- &local->prebuf, &local->postbuf, local_xdata);
- fd_unref(local_fd);
- if (local_xdata)
- dict_unref(local_xdata);
- return 0;
-}
-
-static int32_t
-crypt_writev_done(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- if (op_ret < 0)
- gf_log("crypt", GF_LOG_WARNING, "can not update file size");
-
- if (parent_is_crypt_xlator(frame, this))
- /*
- * don't unlock (it will be done by the parent)
- */
- __crypt_writev_done(frame, NULL, this, 0, 0, NULL);
- else {
- struct gf_flock lock = {
- 0,
- };
-
- lock.l_type = F_UNLCK;
- lock.l_whence = SEEK_SET;
- lock.l_start = 0;
- lock.l_len = 0;
- lock.l_pid = 0;
-
- STACK_WIND(frame, __crypt_writev_done, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, local->fd,
- F_SETLKW, &lock, NULL);
- }
- return 0;
-}
-
-static void
-put_one_call_writev(call_frame_t *frame, xlator_t *this)
-{
- crypt_local_t *local = frame->local;
- if (put_one_call(local)) {
- if (local->update_disk_file_size) {
- int32_t ret;
- /*
- * update file size, unlock the file and unwind
- */
- ret = dict_set(local->xattr, FSIZE_XATTR_PREFIX,
- data_from_uint64(local->cur_file_size));
- if (ret) {
- gf_log("crypt", GF_LOG_WARNING,
- "can not set key to update file size");
- crypt_writev_done(frame, NULL, this, 0, 0, NULL);
- return;
- }
- gf_log("crypt", GF_LOG_DEBUG, "Updating disk file size to %llu",
- (unsigned long long)local->cur_file_size);
- STACK_WIND(frame, crypt_writev_done, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetxattr, local->fd,
- local->xattr, /* CRYPTO_FORMAT_PREFIX */
- 0, NULL);
- } else
- crypt_writev_done(frame, NULL, this, 0, 0, NULL);
- }
-}
-
-static int32_t
-__crypt_ftruncate_done(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- fd_t *local_fd = local->fd;
- dict_t *local_xdata = local->xdata;
- char *iobase = local->vec.iov_base;
-
- if (op_ret < 0) {
- gf_log(this->name, GF_LOG_WARNING, "ftruncate unlock failed (%d)",
- op_errno);
- if (local->op_ret >= 0) {
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- }
- }
- if (local->iobref_data)
- iobref_unref(local->iobref_data);
- free_avec_data(local);
- free_avec_hole(local);
-
- gf_log("crypt", GF_LOG_DEBUG,
- "ftruncate, return to user: presize=%llu, postsize=%llu",
- (unsigned long long)local->prebuf.ia_size,
- (unsigned long long)local->postbuf.ia_size);
-
- CRYPT_STACK_UNWIND(ftruncate, frame, ((local->op_ret < 0) ? -1 : 0),
- local->op_errno, &local->prebuf, &local->postbuf,
- local_xdata);
- fd_unref(local_fd);
- if (local_xdata)
- dict_unref(local_xdata);
- if (iobase)
- GF_FREE(iobase);
- return 0;
-}
-
-static int32_t
-crypt_ftruncate_done(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
- struct gf_flock lock = {
- 0,
- };
-
- dict_unref(local->xattr);
- if (op_ret < 0)
- gf_log("crypt", GF_LOG_WARNING, "can not update file size");
-
- lock.l_type = F_UNLCK;
- lock.l_whence = SEEK_SET;
- lock.l_start = 0;
- lock.l_len = 0;
- lock.l_pid = 0;
-
- STACK_WIND(frame, __crypt_ftruncate_done, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, this->name, local->fd,
- F_SETLKW, &lock, NULL);
- return 0;
-}
-
-static void
-put_one_call_ftruncate(call_frame_t *frame, xlator_t *this)
-{
- crypt_local_t *local = frame->local;
- if (put_one_call(local)) {
- if (local->update_disk_file_size) {
- int32_t ret;
- /*
- * update file size, unlock the file and unwind
- */
- ret = dict_set(local->xattr, FSIZE_XATTR_PREFIX,
- data_from_uint64(local->cur_file_size));
- if (ret) {
- gf_log("crypt", GF_LOG_WARNING,
- "can not set key to update file size");
- crypt_ftruncate_done(frame, NULL, this, 0, 0, NULL);
- return;
- }
- gf_log("crypt", GF_LOG_DEBUG, "Updating disk file size to %llu",
- (unsigned long long)local->cur_file_size);
- STACK_WIND(frame, crypt_ftruncate_done, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetxattr, local->fd,
- local->xattr, /* CRYPTO_FORMAT_PREFIX */
- 0, NULL);
- } else
- crypt_ftruncate_done(frame, NULL, this, 0, 0, NULL);
- }
-}
-
-/*
- * load regular file size for some FOPs
- */
-static int32_t
-load_file_size(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
-{
- data_t *data;
- crypt_local_t *local = frame->local;
-
- dict_t *local_xdata = local->xdata;
- inode_t *local_inode = local->inode;
-
- if (op_ret < 0)
- goto unwind;
- /*
- * load regular file size
- */
- data = dict_get(dict, FSIZE_XATTR_PREFIX);
- if (!data) {
- if (local->xdata)
- dict_unref(local->xdata);
- gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
- op_ret = -1;
- op_errno = EIO;
- goto unwind;
- }
- local->buf.ia_size = data_to_uint64(data);
-
- gf_log(this->name, GF_LOG_DEBUG, "FOP %d: Translate regular file to %llu",
- local->fop, (unsigned long long)local->buf.ia_size);
-unwind:
- if (local->fd)
- fd_unref(local->fd);
- if (local->loc) {
- loc_wipe(local->loc);
- GF_FREE(local->loc);
- }
- switch (local->fop) {
- case GF_FOP_FSTAT:
- CRYPT_STACK_UNWIND(fstat, frame, op_ret, op_errno,
- op_ret >= 0 ? &local->buf : NULL, local->xdata);
- break;
- case GF_FOP_STAT:
- CRYPT_STACK_UNWIND(stat, frame, op_ret, op_errno,
- op_ret >= 0 ? &local->buf : NULL, local->xdata);
- break;
- case GF_FOP_LOOKUP:
- CRYPT_STACK_UNWIND(lookup, frame, op_ret, op_errno,
- op_ret >= 0 ? local->inode : NULL,
- op_ret >= 0 ? &local->buf : NULL, local->xdata,
- op_ret >= 0 ? &local->postbuf : NULL);
- break;
- case GF_FOP_READ:
- CRYPT_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, 0,
- op_ret >= 0 ? &local->buf : NULL, NULL, NULL);
- break;
- default:
- gf_log(this->name, GF_LOG_WARNING, "Improper file operation %d",
- local->fop);
- }
- if (local_xdata)
- dict_unref(local_xdata);
- if (local_inode)
- inode_unref(local_inode);
- return 0;
-}
-
-static int32_t
-crypt_stat_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *xdata)
-{
- crypt_local_t *local = frame->local;
-
- if (op_ret < 0)
- goto unwind;
- if (!IA_ISREG(buf->ia_type))
- goto unwind;
-
- local->buf = *buf;
- if (xdata)
- local->xdata = dict_ref(xdata);
-
- switch (local->fop) {
- case GF_FOP_FSTAT:
- STACK_WIND(frame, load_file_size, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fgetxattr, local->fd,
- FSIZE_XATTR_PREFIX, NULL);
- break;
- case GF_FOP_STAT:
- STACK_WIND(frame, load_file_size, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr, local->loc,
- FSIZE_XATTR_PREFIX, NULL);
- break;
- default:
- gf_log(this->name, GF_LOG_WARNING, "Improper file operation %d",
- local->fop);
- }
- return 0;
-unwind:
- if (local->fd)
- fd_unref(local->fd);
- if (local->loc) {
- loc_wipe(local->loc);
- GF_FREE(local->loc);
- }
- switch (local->fop) {
- case GF_FOP_FSTAT:
- CRYPT_STACK_UNWIND(fstat, frame, op_ret, op_errno,
- op_ret >= 0 ? buf : NULL,
- op_ret >= 0 ? xdata : NULL);
- break;
- case GF_FOP_STAT:
- CRYPT_STACK_UNWIND(stat, frame, op_ret, op_errno,
- op_ret >= 0 ? buf : NULL,
- op_ret >= 0 ? xdata : NULL);
- break;
- default:
- gf_log(this->name, GF_LOG_WARNING, "Improper file operation %d",
- local->fop);
- }
- return 0;
-}
-
-static int32_t
-crypt_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- crypt_local_t *local;
-
- local = crypt_alloc_local(frame, this, GF_FOP_FSTAT);
- if (!local)
- goto error;
- local->fd = fd_ref(fd);
- STACK_WIND(frame, crypt_stat_common_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat, fd, xdata);
- return 0;
-error:
- CRYPT_STACK_UNWIND(fstat, frame, -1, ENOMEM, NULL, NULL);
- return 0;
-}
-
-static int32_t
-crypt_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
-{
- int32_t ret;
- crypt_local_t *local;
-
- local = crypt_alloc_local(frame, this, GF_FOP_STAT);
- if (!local)
- goto error;
- local->loc = GF_CALLOC(1, sizeof(loc_t), gf_crypt_mt_loc);
- if (!local->loc)
- goto error;
- ret = loc_copy(local->loc, loc);
- if (ret) {
- GF_FREE(local->loc);
- goto error;
- }
- STACK_WIND(frame, crypt_stat_common_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat, loc, xdata);
- return 0;
-error:
- CRYPT_STACK_UNWIND(stat, frame, -1, ENOMEM, NULL, NULL);
- return 0;
-}
-
-static int32_t
-crypt_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xdata, struct iatt *postparent)
-{
- crypt_local_t *local = frame->local;
-
- if (op_ret < 0)
- goto unwind;
- if (!IA_ISREG(buf->ia_type))
- goto unwind;
-
- local->inode = inode_ref(inode);
- local->buf = *buf;
- local->postbuf = *postparent;
- if (xdata)
- local->xdata = dict_ref(xdata);
- gf_uuid_copy(local->loc->gfid, buf->ia_gfid);
-
- STACK_WIND(frame, load_file_size, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr, local->loc,
- FSIZE_XATTR_PREFIX, NULL);
- return 0;
-unwind:
- loc_wipe(local->loc);
- GF_FREE(local->loc);
- CRYPT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata,
- postparent);
- return 0;
-}
-
-static int32_t
-crypt_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
-{
- int32_t ret;
- crypt_local_t *local;
-
- local = crypt_alloc_local(frame, this, GF_FOP_LOOKUP);
- if (!local)
- goto error;
- local->loc = GF_CALLOC(1, sizeof(loc_t), gf_crypt_mt_loc);
- if (!local->loc)
- goto error;
- ret = loc_copy(local->loc, loc);
- if (ret) {
- GF_FREE(local->loc);
- goto error;
- }
- gf_log(this->name, GF_LOG_DEBUG, "Lookup %s", loc->path);
- STACK_WIND(frame, crypt_lookup_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lookup, loc, xdata);
- return 0;
-error:
- CRYPT_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
- return 0;
-}
-
-/*
- * for every regular directory entry find its real file size
- * and update stat's buf properly
- */
-static int32_t
-crypt_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
- dict_t *xdata)
-{
- gf_dirent_t *entry = NULL;
-
- if (op_ret < 0)
- goto unwind;
-
- list_for_each_entry(entry, (&entries->list), list)
- {
- data_t *data;
-
- if (!IA_ISREG(entry->d_stat.ia_type))
- continue;
- data = dict_get(entry->dict, FSIZE_XATTR_PREFIX);
- if (!data) {
- gf_log("crypt", GF_LOG_WARNING,
- "Regular file size of direntry not found");
- op_errno = EIO;
- op_ret = -1;
- break;
- }
- entry->d_stat.ia_size = data_to_uint64(data);
- }
-unwind:
- CRYPT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata);
- return 0;
-}
-
-/*
- * ->readdirp() fills in-core inodes, so we need to set proper
- * file sizes for all directory entries of the parent @fd.
- * Actual updates take place in ->crypt_readdirp_cbk()
- */
-static int32_t
-crypt_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, dict_t *xdata)
-{
- int32_t ret = ENOMEM;
-
- if (!xdata) {
- xdata = dict_new();
- if (!xdata)
- goto error;
- } else
- dict_ref(xdata);
- /*
- * make sure that we'll have real file sizes at ->readdirp_cbk()
- */
- ret = dict_set(xdata, FSIZE_XATTR_PREFIX, data_from_uint64(0));
- if (ret) {
- dict_unref(xdata);
- ret = ENOMEM;
- goto error;
- }
- STACK_WIND(frame, crypt_readdirp_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata);
- dict_unref(xdata);
- return 0;
-error:
- CRYPT_STACK_UNWIND(readdirp, frame, -1, ret, NULL, NULL);
- return 0;
-}
-
-static int32_t
-crypt_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
- dict_t *xdata)
-{
- gf_log(this->name, GF_LOG_WARNING,
- "NFS mounts of encrypted volumes are unsupported");
- CRYPT_STACK_UNWIND(access, frame, -1, EPERM, NULL);
- return 0;
-}
-
-int32_t
-master_set_block_size(xlator_t *this, crypt_private_t *priv, dict_t *options)
-{
- uint64_t block_size = 0;
- struct master_cipher_info *master = get_master_cinfo(priv);
-
- if (options != NULL)
- GF_OPTION_RECONF("block-size", block_size, options, size_uint64, error);
- else
- GF_OPTION_INIT("block-size", block_size, size_uint64, error);
-
- switch (block_size) {
- case 512:
- master->m_block_bits = 9;
- break;
- case 1024:
- master->m_block_bits = 10;
- break;
- case 2048:
- master->m_block_bits = 11;
- break;
- case 4096:
- master->m_block_bits = 12;
- break;
- default:
- gf_log("crypt", GF_LOG_ERROR, "FATAL: unsupported block size %llu",
- (unsigned long long)block_size);
- goto error;
- }
- return 0;
-error:
- return -1;
-}
-
-int32_t
-master_set_alg(xlator_t *this, crypt_private_t *priv)
-{
- struct master_cipher_info *master = get_master_cinfo(priv);
- master->m_alg = AES_CIPHER_ALG;
- return 0;
-}
-
-int32_t
-master_set_mode(xlator_t *this, crypt_private_t *priv)
-{
- struct master_cipher_info *master = get_master_cinfo(priv);
- master->m_mode = XTS_CIPHER_MODE;
- return 0;
-}
-
-/*
- * set key size in bits to the master info
- * Pre-conditions: cipher mode in the master info is uptodate.
- */
-static int
-master_set_data_key_size(xlator_t *this, crypt_private_t *priv, dict_t *options)
-{
- int32_t ret;
- uint64_t key_size = 0;
- struct master_cipher_info *master = get_master_cinfo(priv);
-
- if (options != NULL)
- GF_OPTION_RECONF("data-key-size", key_size, options, uint64, error);
- else
- GF_OPTION_INIT("data-key-size", key_size, uint64, error);
-
- ret = data_cipher_algs[master->m_alg][master->m_mode].check_key(key_size);
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR,
- "FATAL: wrong bin key size %llu for alg %d mode %d",
- (unsigned long long)key_size, (int)master->m_alg,
- (int)master->m_mode);
- goto error;
- }
- master->m_dkey_size = key_size;
- return 0;
-error:
- return -1;
-}
-
-static int
-is_hex(char *s)
-{
- return ('0' <= *s && *s <= '9') || ('a' <= *s && *s <= 'f');
-}
-
-static int
-parse_hex_buf(xlator_t *this, char *src, unsigned char *dst, int hex_size)
-{
- int i;
- int hex_byte = 0;
-
- for (i = 0; i < (hex_size / 2); i++) {
- if (!is_hex(src + i * 2) || !is_hex(src + i * 2 + 1)) {
- gf_log("crypt", GF_LOG_ERROR, "FATAL: not hex symbol in key");
- return -1;
- }
- if (sscanf(src + i * 2, "%2x", &hex_byte) != 1) {
- gf_log("crypt", GF_LOG_ERROR, "FATAL: can not parse hex key");
- return -1;
- }
- dst[i] = hex_byte & 0xff;
- }
- return 0;
-}
-
-/*
- * Parse options;
- * install master volume key
- */
-int32_t
-master_set_master_vol_key(xlator_t *this, crypt_private_t *priv)
-{
- int32_t ret;
- FILE *file = NULL;
-
- int32_t key_size;
- char *opt_key_file_pathname = NULL;
-
- unsigned char bin_buf[MASTER_VOL_KEY_SIZE];
- char hex_buf[2 * MASTER_VOL_KEY_SIZE];
-
- struct master_cipher_info *master = get_master_cinfo(priv);
- /*
- * extract master key passed via option
- */
- GF_OPTION_INIT("master-key", opt_key_file_pathname, path, bad_key);
-
- if (!opt_key_file_pathname) {
- gf_log(this->name, GF_LOG_ERROR, "FATAL: missing master key");
- return -1;
- }
- gf_log(this->name, GF_LOG_DEBUG, "handling file key %s",
- opt_key_file_pathname);
-
- file = fopen(opt_key_file_pathname, "r");
- if (file == NULL) {
- gf_log(this->name, GF_LOG_ERROR,
- "FATAL: can not open file with master key");
- return -1;
- }
- /*
- * extract hex key
- */
- key_size = fread(hex_buf, 1, sizeof(hex_buf), file);
- if (key_size < sizeof(hex_buf)) {
- gf_log(this->name, GF_LOG_ERROR, "FATAL: master key is too short");
- goto bad_key;
- }
- ret = parse_hex_buf(this, hex_buf, bin_buf, key_size);
- if (ret)
- goto bad_key;
- memcpy(master->m_key, bin_buf, MASTER_VOL_KEY_SIZE);
- memset(hex_buf, 0, sizeof(hex_buf));
- fclose(file);
-
- memset(bin_buf, 0, sizeof(bin_buf));
- return 0;
-bad_key:
- gf_log(this->name, GF_LOG_ERROR, "FATAL: bad master key");
- if (file)
- fclose(file);
- memset(bin_buf, 0, sizeof(bin_buf));
- return -1;
-}
-
-/*
- * Derive volume key for object-id authentication
- */
-int32_t
-master_set_nmtd_vol_key(xlator_t *this, crypt_private_t *priv)
-{
- return get_nmtd_vol_key(get_master_cinfo(priv));
-}
-
-int32_t
-crypt_init_xlator(xlator_t *this)
-{
- int32_t ret;
- crypt_private_t *priv = this->private;
-
- ret = master_set_alg(this, priv);
- if (ret)
- return ret;
- ret = master_set_mode(this, priv);
- if (ret)
- return ret;
- ret = master_set_block_size(this, priv, NULL);
- if (ret)
- return ret;
- ret = master_set_data_key_size(this, priv, NULL);
- if (ret)
- return ret;
- ret = master_set_master_vol_key(this, priv);
- if (ret)
- return ret;
- return master_set_nmtd_vol_key(this, priv);
-}
-
-static int32_t
-crypt_alloc_private(xlator_t *this)
-{
- this->private = GF_CALLOC(1, sizeof(crypt_private_t), gf_crypt_mt_priv);
- if (!this->private) {
- gf_log("crypt", GF_LOG_ERROR,
- "Can not allocate memory for private data");
- return ENOMEM;
- }
- return 0;
-}
-
-static void
-crypt_free_private(xlator_t *this)
-{
- crypt_private_t *priv = this->private;
- if (priv) {
- memset(priv, 0, sizeof(*priv));
- GF_FREE(priv);
- }
-}
-
-int32_t
-mem_acct_init(xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init(this, gf_crypt_mt_end);
-
- if (ret != 0) {
- gf_log(this->name, GF_LOG_ERROR,
- "Memory accounting init"
- "failed");
- return ret;
- }
-
- return ret;
-}
-
-int32_t
-reconfigure(xlator_t *this, dict_t *options)
-{
- int32_t ret = -1;
- crypt_private_t *priv = NULL;
-
- GF_VALIDATE_OR_GOTO("crypt", this, error);
- GF_VALIDATE_OR_GOTO(this->name, this->private, error);
- GF_VALIDATE_OR_GOTO(this->name, options, error);
-
- priv = this->private;
-
- ret = master_set_block_size(this, priv, options);
- if (ret) {
- gf_log("this->name", GF_LOG_ERROR, "Failed to reconfure block size");
- goto error;
- }
- ret = master_set_data_key_size(this, priv, options);
- if (ret) {
- gf_log("this->name", GF_LOG_ERROR, "Failed to reconfure data key size");
- goto error;
- }
- return 0;
-error:
- return ret;
-}
-
-int32_t
-init(xlator_t *this)
-{
- int32_t ret;
-
- if (!this->children || this->children->next) {
- gf_log("crypt", GF_LOG_ERROR,
- "FATAL: crypt should have exactly one child");
- return EINVAL;
- }
- if (!this->parents) {
- gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
- }
- ret = crypt_alloc_private(this);
- if (ret)
- return ret;
- ret = crypt_init_xlator(this);
- if (ret)
- goto error;
- this->local_pool = mem_pool_new(crypt_local_t, 64);
- if (!this->local_pool) {
- gf_log(this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- ret = ENOMEM;
- goto error;
- }
- gf_log("crypt", GF_LOG_INFO, "crypt xlator loaded");
- return 0;
-error:
- crypt_free_private(this);
- return ret;
-}
-
-void
-fini(xlator_t *this)
-{
- crypt_free_private(this);
-}
-
-struct xlator_fops fops = {.readv = crypt_readv,
- .writev = crypt_writev,
- .truncate = crypt_truncate,
- .ftruncate = crypt_ftruncate,
- .setxattr = crypt_setxattr,
- .fsetxattr = crypt_fsetxattr,
- .link = crypt_link,
- .unlink = crypt_unlink,
- .rename = crypt_rename,
- .open = crypt_open,
- .create = crypt_create,
- .stat = crypt_stat,
- .fstat = crypt_fstat,
- .lookup = crypt_lookup,
- .readdirp = crypt_readdirp,
- .access = crypt_access};
-
-struct xlator_cbks cbks = {.forget = crypt_forget};
-
-struct volume_options options[] = {
- {.key = {"master-key"},
- .type = GF_OPTION_TYPE_PATH,
- .description =
- "Pathname of regular file which contains master volume key"},
- {
- .key = {"data-key-size"},
- .type = GF_OPTION_TYPE_SIZET,
- .description = "Data key size (bits)",
- .min = 256,
- .max = 512,
- .default_value = "256",
- },
- {.key = {"block-size"},
- .type = GF_OPTION_TYPE_SIZET,
- .description = "Atom size (bits)",
- .min = 512,
- .max = 4096,
- .default_value = "4096"},
- {.key = {NULL}},
-};
-
-/*
- Local variables:
- c-indentation-style: "K&R"
- mode-name: "LC"
- c-basic-offset: 8
- tab-width: 8
- fill-column: 80
- scroll-step: 1
- End:
-*/
diff --git a/xlators/encryption/crypt/src/crypt.h b/xlators/encryption/crypt/src/crypt.h
deleted file mode 100644
index 390eee831b1..00000000000
--- a/xlators/encryption/crypt/src/crypt.h
+++ /dev/null
@@ -1,931 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef __CRYPT_H__
-#define __CRYPT_H__
-
-#include <openssl/aes.h>
-#include <openssl/evp.h>
-#include <openssl/sha.h>
-#include <openssl/hmac.h>
-#include <openssl/cmac.h>
-#include <openssl/modes.h>
-#include "crypt-mem-types.h"
-#include "compat.h"
-
-#define CRYPT_XLATOR_ID (0)
-
-#define MAX_IOVEC_BITS (3)
-#define MAX_IOVEC (1 << MAX_IOVEC_BITS)
-#define KEY_FACTOR_BITS (6)
-
-#define DEBUG_CRYPT (0)
-#define TRIVIAL_TFM (0)
-
-#define CRYPT_MIN_BLOCK_BITS (9)
-#define CRYPT_MAX_BLOCK_BITS (12)
-
-#define MASTER_VOL_KEY_SIZE (32)
-#define NMTD_VOL_KEY_SIZE (16)
-
-#if !defined(GF_LINUX_HOST_OS)
-typedef off_t loff_t;
-#endif
-
-struct crypt_key {
- uint32_t len;
- const char *label;
-};
-
-/*
- * Add new key types to the end of this
- * enumeration but before LAST_KEY_TYPE
- */
-typedef enum {
- MASTER_VOL_KEY,
- NMTD_VOL_KEY,
- NMTD_LINK_KEY,
- EMTD_FILE_KEY,
- DATA_FILE_KEY_256,
- DATA_FILE_KEY_512,
- LAST_KEY_TYPE
-} crypt_key_type;
-
-struct kderive_context {
- const unsigned char *pkey; /* parent key */
- uint32_t pkey_len; /* parent key size, bits */
- uint32_t ckey_len; /* child key size, bits */
- unsigned char *fid; /* fixed input data, NIST 800-108, 5.1 */
- uint32_t fid_len; /* fid len, bytes */
- unsigned char *out; /* contains child keying material */
- uint32_t out_len; /* out len, bytes */
-};
-
-typedef enum { DATA_ATOM, HOLE_ATOM, LAST_DATA_TYPE } atom_data_type;
-
-typedef enum {
- HEAD_ATOM,
- TAIL_ATOM,
- FULL_ATOM,
- LAST_LOCALITY_TYPE
-} atom_locality_type;
-
-typedef enum {
- MTD_CREATE,
- MTD_APPEND,
- MTD_OVERWRITE,
- MTD_CUT,
- MTD_LAST_OP
-} mtd_op_t;
-
-struct xts128_context {
- void *key1, *key2;
- block128_f block1, block2;
-};
-
-struct object_cipher_info {
- cipher_alg_t o_alg;
- cipher_mode_t o_mode;
- uint32_t o_block_bits;
- uint32_t o_dkey_size; /* raw data key size in bits */
- union {
- struct {
- unsigned char ivec[16];
- AES_KEY dkey[2];
- AES_KEY tkey; /* key used for tweaking */
- XTS128_CONTEXT xts;
- } aes_xts;
- } u;
-};
-
-struct master_cipher_info {
- /*
- * attributes inherited by newly created regular files
- */
- cipher_alg_t m_alg;
- cipher_mode_t m_mode;
- uint32_t m_block_bits;
- uint32_t m_dkey_size; /* raw key size in bits */
- /*
- * master key
- */
- unsigned char m_key[MASTER_VOL_KEY_SIZE];
- /*
- * volume key for oid authentication
- */
- unsigned char m_nmtd_key[NMTD_VOL_KEY_SIZE];
-};
-
-/*
- * This info is not changed during file's life
- */
-struct crypt_inode_info {
-#if DEBUG_CRYPT
- loc_t *loc; /* pathname that the file has been
- opened, or created with */
-#endif
- uint16_t nr_minor;
- uuid_t oid;
- struct object_cipher_info cinfo;
-};
-
-/*
- * this should locate in secure memory
- */
-typedef struct {
- struct master_cipher_info master;
-} crypt_private_t;
-
-static inline struct master_cipher_info *
-get_master_cinfo(crypt_private_t *priv)
-{
- return &priv->master;
-}
-
-static inline struct object_cipher_info *
-get_object_cinfo(struct crypt_inode_info *info)
-{
- return &info->cinfo;
-}
-
-/*
- * this describes layouts and properties
- * of atoms in an aligned vector
- */
-struct avec_config {
- uint32_t atom_size;
- atom_data_type type;
- size_t orig_size;
- off_t orig_offset;
- size_t expanded_size;
- off_t aligned_offset;
-
- uint32_t off_in_head;
- uint32_t off_in_tail;
- uint32_t gap_in_tail;
- uint32_t nr_full_blocks;
-
- struct iovec *avec; /* aligned vector */
- uint32_t acount; /* number of avec components. The same
- * as number of occupied logical blocks */
- char **pool;
- uint32_t blocks_in_pool;
- uint32_t cursor; /* makes sense only for ordered writes,
- * so there is no races on this counter.
- *
- * Cursor is per-config object, we don't
- * reset cursor for atoms of different
- * localities (head, tail, full)
- */
-};
-
-typedef struct {
- glusterfs_fop_t fop; /* code of FOP this local info built for */
- fd_t *fd;
- inode_t *inode;
- loc_t *loc;
- int32_t mac_idx;
- loc_t *newloc;
- int32_t flags;
- int32_t wbflags;
- struct crypt_inode_info *info;
- struct iobref *iobref;
- struct iobref *iobref_data;
- off_t offset;
-
- uint64_t old_file_size; /* per FOP, retrieved under lock held */
- uint64_t cur_file_size; /* per iteration, before issuing IOs */
- uint64_t new_file_size; /* per iteration, after issuing IOs */
-
- uint64_t io_offset; /* offset of IOs issued per iteration */
- uint64_t io_offset_nopad; /* offset of user's data in the atom */
- uint32_t io_size; /* size of IOs issued per iteration */
- uint32_t io_size_nopad; /* size of user's data in the IOs */
- uint32_t eof_padding_size; /* size od EOF padding in the IOs */
-
- gf_lock_t call_lock; /* protect nr_calls from many cbks */
- int32_t nr_calls;
-
- atom_data_type active_setup; /* which setup (hole or date)
- is currently active */
- /* data setup */
- struct avec_config data_conf;
-
- /* hole setup */
- int hole_conv_in_proggress;
- gf_lock_t hole_lock; /* protect hole config from many cbks */
- int hole_handled;
- struct avec_config hole_conf;
- struct iatt buf;
- struct iatt prebuf;
- struct iatt postbuf;
- struct iatt *prenewparent;
- struct iatt *postnewparent;
- int32_t op_ret;
- int32_t op_errno;
- int32_t rw_count; /* total read or written */
- gf_lock_t rw_count_lock; /* protect the counter above */
- unsigned char *format; /* for create, update format string */
- uint32_t format_size;
- uint32_t msgflags; /* messages for crypt_open() */
- dict_t *xdata;
- dict_t *xattr;
- struct iovec vec; /* contains last file's atom for
- read-prune-write sequence */
- gf_boolean_t custom_mtd;
- /*
- * the next 3 fields are used by readdir and friends
- */
- gf_dirent_t *de; /* directory entry */
- char *de_path; /* pathname of directory entry */
- uint32_t de_prefix_len; /* length of the parent's pathname */
- gf_dirent_t *entries;
-
- uint32_t update_disk_file_size : 1;
-} crypt_local_t;
-
-/* This represents a (read)modify-write atom */
-struct rmw_atom {
- atom_locality_type locality;
- /*
- * read-modify-write sequence of the atom
- */
- int32_t (*rmw)(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iovec *vec,
- int32_t count, struct iatt *stbuf, struct iobref *iobref,
- dict_t *xdata);
- /*
- * offset of the logical block in a file
- */
- loff_t (*offset_at)(call_frame_t *frame, struct object_cipher_info *object);
- /*
- * IO offset in an atom
- */
- uint32_t (*offset_in)(call_frame_t *frame,
- struct object_cipher_info *object);
- /*
- * number of bytes of plain text of this atom that user
- * wants to read/write.
- * It can be smaller than atom_size in the case of head
- * or tail atoms.
- */
- uint32_t (*io_size_nopad)(call_frame_t *frame,
- struct object_cipher_info *object);
- /*
- * which iovec represents the atom
- */
- struct iovec *(*get_iovec)(call_frame_t *frame, uint32_t count);
- /*
- * how many bytes of partial block should be uptodated by
- * reading from disk.
- * This is used to perform a read component of RMW (read-modify-write).
- */
- uint32_t (*count_to_uptodate)(call_frame_t *frame,
- struct object_cipher_info *object);
- struct avec_config *(*get_config)(call_frame_t *frame);
-};
-
-struct data_cipher_alg {
- gf_boolean_t atomic; /* true means that algorithm requires
- to pad data before cipher transform */
- gf_boolean_t should_pad; /* true means that algorithm requires
- to pad the end of file with extra-data */
- uint32_t blkbits; /* blksize = 1 << blkbits */
- /*
- * any preliminary sanity checks goes here
- */
- int32_t (*init)(void);
- /*
- * set alg-mode specific inode info
- */
- int32_t (*set_private)(struct crypt_inode_info *info,
- struct master_cipher_info *master);
- /*
- * check alg-mode specific data key
- */
- int32_t (*check_key)(uint32_t key_size);
- void (*set_iv)(off_t offset, struct object_cipher_info *object);
- int32_t (*encrypt)(const unsigned char *from, unsigned char *to,
- size_t length, off_t offset, const int enc,
- struct object_cipher_info *object);
-};
-
-/*
- * version-dependent metadata loader
- */
-struct crypt_mtd_loader {
- /*
- * return core format size
- */
- size_t (*format_size)(mtd_op_t op, size_t old_size);
- /*
- * pack version-specific metadata of an object
- * at ->create()
- */
- int32_t (*create_format)(unsigned char *wire, loc_t *loc,
- struct crypt_inode_info *info,
- struct master_cipher_info *master);
- /*
- * extract version-specific metadata of an object
- * at ->open() time
- */
- int32_t (*open_format)(unsigned char *wire, int32_t len, loc_t *loc,
- struct crypt_inode_info *info,
- struct master_cipher_info *master,
- crypt_local_t *local, gf_boolean_t load_info);
- int32_t (*update_format)(unsigned char *new, unsigned char *old,
- size_t old_len, int32_t mac_idx, mtd_op_t op,
- loc_t *loc, struct crypt_inode_info *info,
- struct master_cipher_info *master,
- crypt_local_t *local);
-};
-
-typedef int32_t (*end_writeback_handler_t)(call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata);
-typedef void (*linkop_wind_handler_t)(call_frame_t *frame, xlator_t *this);
-typedef void (*linkop_unwind_handler_t)(call_frame_t *frame);
-
-/* Declarations */
-
-/* keys.c */
-extern struct crypt_key crypt_keys[LAST_KEY_TYPE];
-int32_t
-get_nmtd_vol_key(struct master_cipher_info *master);
-int32_t
-get_nmtd_link_key(loc_t *loc, struct master_cipher_info *master,
- unsigned char *result);
-int32_t
-get_emtd_file_key(struct crypt_inode_info *info,
- struct master_cipher_info *master, unsigned char *result);
-int32_t
-get_data_file_key(struct crypt_inode_info *info,
- struct master_cipher_info *master, uint32_t keysize,
- unsigned char *key);
-/* data.c */
-extern struct data_cipher_alg data_cipher_algs[LAST_CIPHER_ALG]
- [LAST_CIPHER_MODE];
-void
-encrypt_aligned_iov(struct object_cipher_info *object, struct iovec *vec,
- int count, off_t off);
-void
-decrypt_aligned_iov(struct object_cipher_info *object, struct iovec *vec,
- int count, off_t off);
-int32_t
-align_iov_by_atoms(xlator_t *this, crypt_local_t *local,
- struct object_cipher_info *object,
- struct iovec *vec /* input vector */,
- int32_t count /* number of vec components */,
- struct iovec *avec /* aligned vector */,
- char **blocks /* pool of blocks */,
- uint32_t *blocks_allocated, struct avec_config *conf);
-int32_t
-set_config_avec_data(xlator_t *this, crypt_local_t *local,
- struct avec_config *conf,
- struct object_cipher_info *object, struct iovec *vec,
- int32_t vec_count);
-int32_t
-set_config_avec_hole(xlator_t *this, crypt_local_t *local,
- struct avec_config *conf,
- struct object_cipher_info *object, glusterfs_fop_t fop);
-void
-set_gap_at_end(call_frame_t *frame, struct object_cipher_info *object,
- struct avec_config *conf, atom_data_type dtype);
-void
-set_config_offsets(call_frame_t *frame, xlator_t *this, uint64_t offset,
- uint64_t count, atom_data_type dtype,
- int32_t setup_gap_in_tail);
-
-/* metadata.c */
-extern struct crypt_mtd_loader mtd_loaders[LAST_MTD_LOADER];
-
-int32_t
-alloc_format(crypt_local_t *local, size_t size);
-int32_t
-alloc_format_create(crypt_local_t *local);
-void
-free_format(crypt_local_t *local);
-size_t
-format_size(mtd_op_t op, size_t old_size);
-size_t
-new_format_size(void);
-int32_t
-open_format(unsigned char *str, int32_t len, loc_t *loc,
- struct crypt_inode_info *info, struct master_cipher_info *master,
- crypt_local_t *local, gf_boolean_t load_info);
-int32_t
-update_format(unsigned char *new, unsigned char *old, size_t old_len,
- int32_t mac_idx, mtd_op_t op, loc_t *loc,
- struct crypt_inode_info *info, struct master_cipher_info *master,
- crypt_local_t *local);
-int32_t
-create_format(unsigned char *wire, loc_t *loc, struct crypt_inode_info *info,
- struct master_cipher_info *master);
-
-/* atom.c */
-struct rmw_atom *
-atom_by_types(atom_data_type data, atom_locality_type locality);
-void
-submit_partial(call_frame_t *frame, xlator_t *this, fd_t *fd,
- atom_locality_type ltype);
-void
-submit_full(call_frame_t *frame, xlator_t *this);
-
-/* crypt.c */
-
-end_writeback_handler_t
-dispatch_end_writeback(glusterfs_fop_t fop);
-void
-set_local_io_params_writev(call_frame_t *frame,
- struct object_cipher_info *object,
- struct rmw_atom *atom, off_t io_offset,
- uint32_t io_size);
-void
-link_wind(call_frame_t *frame, xlator_t *this);
-void
-unlink_wind(call_frame_t *frame, xlator_t *this);
-void
-link_unwind(call_frame_t *frame);
-void
-unlink_unwind(call_frame_t *frame);
-void
-rename_wind(call_frame_t *frame, xlator_t *this);
-void
-rename_unwind(call_frame_t *frame);
-
-/* Inline functions */
-
-static inline int32_t
-crypt_xlator_id(void)
-{
- return CRYPT_XLATOR_ID;
-}
-
-static inline mtd_loader_id
-current_mtd_loader(void)
-{
- return MTD_LOADER_V1;
-}
-
-static inline uint32_t
-master_key_size(void)
-{
- return crypt_keys[MASTER_VOL_KEY].len >> 3;
-}
-
-static inline uint32_t
-nmtd_vol_key_size(void)
-{
- return crypt_keys[NMTD_VOL_KEY].len >> 3;
-}
-
-static inline uint32_t
-alg_mode_blkbits(cipher_alg_t alg, cipher_mode_t mode)
-{
- return data_cipher_algs[alg][mode].blkbits;
-}
-
-static inline uint32_t
-alg_mode_blksize(cipher_alg_t alg, cipher_mode_t mode)
-{
- return 1 << alg_mode_blkbits(alg, mode);
-}
-
-static inline gf_boolean_t
-alg_mode_atomic(cipher_alg_t alg, cipher_mode_t mode)
-{
- return data_cipher_algs[alg][mode].atomic;
-}
-
-static inline gf_boolean_t
-alg_mode_should_pad(cipher_alg_t alg, cipher_mode_t mode)
-{
- return data_cipher_algs[alg][mode].should_pad;
-}
-
-static inline uint32_t
-master_alg_blksize(struct master_cipher_info *mr)
-{
- return alg_mode_blksize(mr->m_alg, mr->m_mode);
-}
-
-static inline uint32_t
-master_alg_blkbits(struct master_cipher_info *mr)
-{
- return alg_mode_blkbits(mr->m_alg, mr->m_mode);
-}
-
-static inline gf_boolean_t
-master_alg_atomic(struct master_cipher_info *mr)
-{
- return alg_mode_atomic(mr->m_alg, mr->m_mode);
-}
-
-static inline gf_boolean_t
-master_alg_should_pad(struct master_cipher_info *mr)
-{
- return alg_mode_should_pad(mr->m_alg, mr->m_mode);
-}
-
-static inline uint32_t
-object_alg_blksize(struct object_cipher_info *ob)
-{
- return alg_mode_blksize(ob->o_alg, ob->o_mode);
-}
-
-static inline uint32_t
-object_alg_blkbits(struct object_cipher_info *ob)
-{
- return alg_mode_blkbits(ob->o_alg, ob->o_mode);
-}
-
-static inline gf_boolean_t
-object_alg_atomic(struct object_cipher_info *ob)
-{
- return alg_mode_atomic(ob->o_alg, ob->o_mode);
-}
-
-static inline gf_boolean_t
-object_alg_should_pad(struct object_cipher_info *ob)
-{
- return alg_mode_should_pad(ob->o_alg, ob->o_mode);
-}
-
-static inline uint32_t
-aes_raw_key_size(struct master_cipher_info *master)
-{
- return master->m_dkey_size >> 3;
-}
-
-static inline struct avec_config *
-get_hole_conf(call_frame_t *frame)
-{
- return &(((crypt_local_t *)frame->local)->hole_conf);
-}
-
-static inline struct avec_config *
-get_data_conf(call_frame_t *frame)
-{
- return &(((crypt_local_t *)frame->local)->data_conf);
-}
-
-static inline int32_t
-get_atom_bits(struct object_cipher_info *object)
-{
- return object->o_block_bits;
-}
-
-static inline int32_t
-get_atom_size(struct object_cipher_info *object)
-{
- return 1 << get_atom_bits(object);
-}
-
-static inline int32_t
-has_head_block(struct avec_config *conf)
-{
- return conf->off_in_head || (conf->acount == 1 && conf->off_in_tail);
-}
-
-static inline int32_t
-has_tail_block(struct avec_config *conf)
-{
- return conf->off_in_tail && conf->acount > 1;
-}
-
-static inline int32_t
-has_full_blocks(struct avec_config *conf)
-{
- return conf->nr_full_blocks;
-}
-
-static inline int32_t
-should_submit_head_block(struct avec_config *conf)
-{
- return has_head_block(conf) && (conf->cursor == 0);
-}
-
-static inline int32_t
-should_submit_tail_block(struct avec_config *conf)
-{
- return has_tail_block(conf) && (conf->cursor == conf->acount - 1);
-}
-
-static inline int32_t
-should_submit_full_block(struct avec_config *conf)
-{
- uint32_t start = has_head_block(conf) ? 1 : 0;
-
- return has_full_blocks(conf) && conf->cursor >= start &&
- conf->cursor < start + conf->nr_full_blocks;
-}
-
-#if DEBUG_CRYPT
-static inline void
-crypt_check_input_len(size_t len, struct object_cipher_info *object)
-{
- if (object_alg_should_pad(object) &&
- (len & (object_alg_blksize(object) - 1)))
- gf_log("crypt", GF_LOG_DEBUG, "bad input len: %d", (int)len);
-}
-
-static inline void
-check_head_block(struct avec_config *conf)
-{
- if (!has_head_block(conf))
- gf_log("crypt", GF_LOG_DEBUG, "not a head atom");
-}
-
-static inline void
-check_tail_block(struct avec_config *conf)
-{
- if (!has_tail_block(conf))
- gf_log("crypt", GF_LOG_DEBUG, "not a tail atom");
-}
-
-static inline void
-check_full_block(struct avec_config *conf)
-{
- if (!has_full_blocks(conf))
- gf_log("crypt", GF_LOG_DEBUG, "not a full atom");
-}
-
-static inline void
-check_cursor_head(struct avec_config *conf)
-{
- if (!has_head_block(conf))
- gf_log("crypt", GF_LOG_DEBUG, "Illegal call of head atom method");
- else if (conf->cursor != 0)
- gf_log("crypt", GF_LOG_DEBUG, "Cursor (%d) is not at head atom",
- conf->cursor);
-}
-
-static inline void
-check_cursor_full(struct avec_config *conf)
-{
- if (!has_full_blocks(conf))
- gf_log("crypt", GF_LOG_DEBUG, "Illegal call of full atom method");
- if (has_head_block(conf) && (conf->cursor == 0))
- gf_log("crypt", GF_LOG_DEBUG, "Cursor is not at full atom");
-}
-
-/*
- * FIXME: use avec->iov_len to check setup
- */
-static inline int
-data_local_invariant(crypt_local_t *local)
-{
- return 0;
-}
-
-#else
-#define crypt_check_input_len(len, object) noop
-#define check_head_block(conf) noop
-#define check_tail_block(conf) noop
-#define check_full_block(conf) noop
-#define check_cursor_head(conf) noop
-#define check_cursor_full(conf) noop
-
-#endif /* DEBUG_CRYPT */
-
-static inline struct avec_config *
-conf_by_type(call_frame_t *frame, atom_data_type dtype)
-{
- struct avec_config *conf = NULL;
-
- switch (dtype) {
- case HOLE_ATOM:
- conf = get_hole_conf(frame);
- break;
- case DATA_ATOM:
- conf = get_data_conf(frame);
- break;
- default:
- gf_log("crypt", GF_LOG_DEBUG, "bad atom type");
- }
- return conf;
-}
-
-static inline uint32_t
-nr_calls_head(struct avec_config *conf)
-{
- return has_head_block(conf) ? 1 : 0;
-}
-
-static inline uint32_t
-nr_calls_tail(struct avec_config *conf)
-{
- return has_tail_block(conf) ? 1 : 0;
-}
-
-static inline uint32_t
-nr_calls_full(struct avec_config *conf)
-{
- switch (conf->type) {
- case HOLE_ATOM:
- return has_full_blocks(conf);
- case DATA_ATOM:
- return has_full_blocks(conf)
- ? logical_blocks_occupied(0, conf->nr_full_blocks,
- MAX_IOVEC_BITS)
- : 0;
- default:
- gf_log("crypt", GF_LOG_DEBUG, "bad atom data type");
- return 0;
- }
-}
-
-static inline uint32_t
-nr_calls(struct avec_config *conf)
-{
- return nr_calls_head(conf) + nr_calls_tail(conf) + nr_calls_full(conf);
-}
-
-static inline uint32_t
-nr_calls_data(call_frame_t *frame)
-{
- return nr_calls(get_data_conf(frame));
-}
-
-static inline uint32_t
-nr_calls_hole(call_frame_t *frame)
-{
- return nr_calls(get_hole_conf(frame));
-}
-
-static inline void
-get_one_call_nolock(call_frame_t *frame)
-{
- crypt_local_t *local = frame->local;
-
- ++local->nr_calls;
-
- // gf_log("crypt", GF_LOG_DEBUG, "get %d calls", 1);
-}
-
-static inline void
-get_one_call(call_frame_t *frame)
-{
- crypt_local_t *local = frame->local;
-
- LOCK(&local->call_lock);
- get_one_call_nolock(frame);
- UNLOCK(&local->call_lock);
-}
-
-static inline void
-get_nr_calls_nolock(call_frame_t *frame, int32_t nr)
-{
- crypt_local_t *local = frame->local;
-
- local->nr_calls += nr;
-
- // gf_log("crypt", GF_LOG_DEBUG, "get %d calls", nr);
-}
-
-static inline void
-get_nr_calls(call_frame_t *frame, int32_t nr)
-{
- crypt_local_t *local = frame->local;
-
- LOCK(&local->call_lock);
- get_nr_calls_nolock(frame, nr);
- UNLOCK(&local->call_lock);
-}
-
-static inline int
-put_one_call(crypt_local_t *local)
-{
- uint32_t last = 0;
-
- LOCK(&local->call_lock);
- if (--local->nr_calls == 0)
- last = 1;
-
- // gf_log("crypt", GF_LOG_DEBUG, "put %d calls", 1);
-
- UNLOCK(&local->call_lock);
- return last;
-}
-
-static inline int
-is_appended_write(call_frame_t *frame)
-{
- crypt_local_t *local = frame->local;
- struct avec_config *conf = get_data_conf(frame);
-
- return conf->orig_offset + conf->orig_size > local->old_file_size;
-}
-
-static inline int
-is_ordered_mode(call_frame_t *frame)
-{
-#if 0
- crypt_local_t *local = frame->local;
- return local->fop == GF_FOP_FTRUNCATE ||
- (local->fop == GF_FOP_WRITE && is_appended_write(frame));
-#endif
- return 1;
-}
-
-static inline int32_t
-hole_conv_completed(crypt_local_t *local)
-{
- struct avec_config *conf = &local->hole_conf;
- return conf->cursor == conf->acount;
-}
-
-static inline int32_t
-data_write_in_progress(crypt_local_t *local)
-{
- return local->active_setup == DATA_ATOM;
-}
-
-static inline int32_t
-parent_is_crypt_xlator(call_frame_t *frame, xlator_t *this)
-{
- return frame->parent->this == this;
-}
-
-static inline linkop_wind_handler_t
-linkop_wind_dispatch(glusterfs_fop_t fop)
-{
- switch (fop) {
- case GF_FOP_LINK:
- return link_wind;
- case GF_FOP_UNLINK:
- return unlink_wind;
- case GF_FOP_RENAME:
- return rename_wind;
- default:
- gf_log("crypt", GF_LOG_ERROR, "Bad link operation %d", fop);
- return NULL;
- }
-}
-
-static inline linkop_unwind_handler_t
-linkop_unwind_dispatch(glusterfs_fop_t fop)
-{
- switch (fop) {
- case GF_FOP_LINK:
- return link_unwind;
- case GF_FOP_UNLINK:
- return unlink_unwind;
- case GF_FOP_RENAME:
- return rename_unwind;
- default:
- gf_log("crypt", GF_LOG_ERROR, "Bad link operation %d", fop);
- return NULL;
- }
-}
-
-static inline mtd_op_t
-linkop_mtdop_dispatch(glusterfs_fop_t fop)
-{
- switch (fop) {
- case GF_FOP_LINK:
- return MTD_APPEND;
- case GF_FOP_UNLINK:
- return MTD_CUT;
- case GF_FOP_RENAME:
- return MTD_OVERWRITE;
- default:
- gf_log("crypt", GF_LOG_WARNING, "Bad link operation %d", fop);
- return MTD_LAST_OP;
- }
-}
-
-#define CRYPT_STACK_UNWIND(fop, frame, params...) \
- do { \
- crypt_local_t *__local = NULL; \
- if (frame) { \
- __local = frame->local; \
- frame->local = NULL; \
- } \
- STACK_UNWIND_STRICT(fop, frame, params); \
- if (__local) { \
- GF_FREE(__local); \
- } \
- } while (0)
-
-#endif /* __CRYPT_H__ */
-
-/*
- Local variables:
- c-indentation-style: "K&R"
- mode-name: "LC"
- c-basic-offset: 8
- tab-width: 8
- fill-column: 80
- scroll-step: 1
- End:
-*/
diff --git a/xlators/encryption/crypt/src/data.c b/xlators/encryption/crypt/src/data.c
deleted file mode 100644
index 8e8701b6bf2..00000000000
--- a/xlators/encryption/crypt/src/data.c
+++ /dev/null
@@ -1,715 +0,0 @@
-/*
- Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#include "defaults.h"
-#include "crypt-common.h"
-#include "crypt.h"
-
-static void
-set_iv_aes_xts(off_t offset, struct object_cipher_info *object)
-{
- unsigned char *ivec;
-
- ivec = object->u.aes_xts.ivec;
-
- /* convert the tweak into a little-endian byte
- * array (IEEE P1619/D16, May 2007, section 5.1)
- */
-
- *((uint64_t *)ivec) = htole64(offset);
-
- /* ivec is padded with zeroes */
-}
-
-static int32_t
-aes_set_keys_common(unsigned char *raw_key, uint32_t key_size, AES_KEY *keys)
-{
- int32_t ret;
-
- ret = AES_set_encrypt_key(raw_key, key_size, &keys[AES_ENCRYPT]);
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR, "Set encrypt key failed");
- return ret;
- }
- ret = AES_set_decrypt_key(raw_key, key_size, &keys[AES_DECRYPT]);
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR, "Set decrypt key failed");
- return ret;
- }
- return 0;
-}
-
-/*
- * set private cipher info for xts mode
- */
-static int32_t
-set_private_aes_xts(struct crypt_inode_info *info,
- struct master_cipher_info *master)
-{
- int ret;
- struct object_cipher_info *object = get_object_cinfo(info);
- unsigned char *data_key;
- uint32_t subkey_size;
-
- /* init tweak value */
- memset(object->u.aes_xts.ivec, 0, 16);
-
- data_key = GF_CALLOC(1, object->o_dkey_size, gf_crypt_mt_key);
- if (!data_key)
- return ENOMEM;
-
- /*
- * retrieve data keying material
- */
- ret = get_data_file_key(info, master, object->o_dkey_size, data_key);
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR, "Failed to retrieve data key");
- GF_FREE(data_key);
- return ret;
- }
- /*
- * parse compound xts key
- */
- subkey_size = object->o_dkey_size >> 4; /* (xts-key-size-in-bytes / 2) */
- /*
- * install key for data encryption
- */
- ret = aes_set_keys_common(data_key, subkey_size << 3,
- object->u.aes_xts.dkey);
- if (ret) {
- GF_FREE(data_key);
- return ret;
- }
- /*
- * set up key used to encrypt tweaks
- */
- ret = AES_set_encrypt_key(data_key + subkey_size, object->o_dkey_size / 2,
- &object->u.aes_xts.tkey);
- if (ret < 0)
- gf_log("crypt", GF_LOG_ERROR, "Set tweak key failed");
-
- GF_FREE(data_key);
- return ret;
-}
-
-static int32_t
-aes_xts_init(void)
-{
- cassert(AES_BLOCK_SIZE == (1 << AES_BLOCK_BITS));
- return 0;
-}
-
-static int32_t
-check_key_aes_xts(uint32_t keysize)
-{
- switch (keysize) {
- case 256:
- case 512:
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-static int32_t
-encrypt_aes_xts(const unsigned char *from, unsigned char *to, size_t length,
- off_t offset, const int enc, struct object_cipher_info *object)
-{
- XTS128_CONTEXT ctx;
- if (enc) {
- ctx.key1 = &object->u.aes_xts.dkey[AES_ENCRYPT];
- ctx.block1 = (block128_f)AES_encrypt;
- } else {
- ctx.key1 = &object->u.aes_xts.dkey[AES_DECRYPT];
- ctx.block1 = (block128_f)AES_decrypt;
- }
- ctx.key2 = &object->u.aes_xts.tkey;
- ctx.block2 = (block128_f)AES_encrypt;
-
- return CRYPTO_xts128_encrypt(&ctx, object->u.aes_xts.ivec, from, to, length,
- enc);
-}
-
-/*
- * Cipher input chunk @from of length @len;
- * @to: result of cipher transform;
- * @off: offset in a file (must be cblock-aligned);
- */
-static void
-cipher_data(struct object_cipher_info *object, char *from, char *to, off_t off,
- size_t len, const int enc)
-{
- crypt_check_input_len(len, object);
-
-#if TRIVIAL_TFM && DEBUG_CRYPT
- return;
-#endif
- data_cipher_algs[object->o_alg][object->o_mode].set_iv(off, object);
- data_cipher_algs[object->o_alg][object->o_mode].encrypt(
- (const unsigned char *)from, (unsigned char *)to, len, off, enc,
- object);
-}
-
-#define MAX_CIPHER_CHUNK (1 << 30)
-
-/*
- * Do cipher (encryption/decryption) transform of a
- * continuous region of memory.
- *
- * @len: a number of bytes to transform;
- * @buf: data to transform;
- * @off: offset in a file, should be block-aligned
- * for atomic cipher modes and ksize-aligned
- * for other modes).
- * @dir: direction of transform (encrypt/decrypt).
- */
-static void
-cipher_region(struct object_cipher_info *object, char *from, char *to,
- off_t off, size_t len, int dir)
-{
- while (len > 0) {
- size_t to_cipher;
-
- to_cipher = len;
- if (to_cipher > MAX_CIPHER_CHUNK)
- to_cipher = MAX_CIPHER_CHUNK;
-
- /* this will reset IV */
- cipher_data(object, from, to, off, to_cipher, dir);
- from += to_cipher;
- to += to_cipher;
- off += to_cipher;
- len -= to_cipher;
- }
-}
-
-/*
- * Do cipher transform (encryption/decryption) of
- * plaintext/ciphertext represented by @vec.
- *
- * Pre-conditions: @vec represents a continuous piece
- * of data in a file at offset @off to be ciphered
- * (encrypted/decrypted).
- * @count is the number of vec's components. All the
- * components must be block-aligned, the caller is
- * responsible for this. @dir is "direction" of
- * transform (encrypt/decrypt).
- */
-static void
-cipher_aligned_iov(struct object_cipher_info *object, struct iovec *vec,
- int count, off_t off, int32_t dir)
-{
- int i;
- int len = 0;
-
- for (i = 0; i < count; i++) {
- cipher_region(object, vec[i].iov_base, vec[i].iov_base, off + len,
- vec[i].iov_len, dir);
- len += vec[i].iov_len;
- }
-}
-
-void
-encrypt_aligned_iov(struct object_cipher_info *object, struct iovec *vec,
- int count, off_t off)
-{
- cipher_aligned_iov(object, vec, count, off, 1);
-}
-
-void
-decrypt_aligned_iov(struct object_cipher_info *object, struct iovec *vec,
- int count, off_t off)
-{
- cipher_aligned_iov(object, vec, count, off, 0);
-}
-
-#if DEBUG_CRYPT
-static void
-compound_stream(struct iovec *vec, int count, char *buf, off_t skip)
-{
- int i;
- int off = 0;
- for (i = 0; i < count; i++) {
- memcpy(buf + off, vec[i].iov_base + skip, vec[i].iov_len - skip);
-
- off += (vec[i].iov_len - skip);
- skip = 0;
- }
-}
-
-static void
-check_iovecs(struct iovec *vec, int cnt, struct iovec *avec, int acnt,
- uint32_t off_in_head)
-{
- char *s1, *s2;
- uint32_t size, asize;
-
- size = iov_length(vec, cnt);
- asize = iov_length(avec, acnt) - off_in_head;
- if (size != asize) {
- gf_log("crypt", GF_LOG_DEBUG, "size %d is not eq asize %d", size,
- asize);
- return;
- }
- s1 = GF_CALLOC(1, size, gf_crypt_mt_data);
- if (!s1) {
- gf_log("crypt", GF_LOG_DEBUG, "Can not allocate stream ");
- return;
- }
- s2 = GF_CALLOC(1, asize, gf_crypt_mt_data);
- if (!s2) {
- GF_FREE(s1);
- gf_log("crypt", GF_LOG_DEBUG, "Can not allocate stream ");
- return;
- }
- compound_stream(vec, cnt, s1, 0);
- compound_stream(avec, acnt, s2, off_in_head);
- if (memcmp(s1, s2, size))
- gf_log("crypt", GF_LOG_DEBUG, "chunks of different data");
- GF_FREE(s1);
- GF_FREE(s2);
-}
-
-#else
-#define check_iovecs(vec, count, avec, avecn, off) noop
-#endif /* DEBUG_CRYPT */
-
-static char *
-data_alloc_block(xlator_t *this, crypt_local_t *local, int32_t block_size)
-{
- struct iobuf *iobuf = NULL;
-
- iobuf = iobuf_get2(this->ctx->iobuf_pool, block_size);
- if (!iobuf) {
- gf_log("crypt", GF_LOG_ERROR, "Failed to get iobuf");
- return NULL;
- }
- if (!local->iobref_data) {
- local->iobref_data = iobref_new();
- if (!local->iobref_data) {
- gf_log("crypt", GF_LOG_ERROR, "Failed to get iobref");
- iobuf_unref(iobuf);
- return NULL;
- }
- }
- iobref_add(local->iobref_data, iobuf);
- return iobuf->ptr;
-}
-
-/*
- * Compound @avec, which represent the same data
- * chunk as @vec, but has aligned components of
- * specified block size. Alloc blocks, if needed.
- * In particular, incomplete head and tail blocks
- * must be allocated.
- * Put number of allocated blocks to @num_blocks.
- *
- * Example:
- *
- * input: data chunk represented by 4 components
- * [AB],[BC],[CD],[DE];
- * output: 5 logical blocks (0, 1, 2, 3, 4).
- *
- * A B C D E
- * *-----*+------*-+---*----+--------+-*
- * | || | | | | | |
- * *-+-----+*------+-*---+----*--------*-+------*
- * 0 1 2 3 4
- *
- * 0 - incomplete compound (head);
- * 1, 2 - full compound;
- * 3 - full non-compound (the case of reuse);
- * 4 - incomplete non-compound (tail).
- */
-int32_t
-align_iov_by_atoms(xlator_t *this, crypt_local_t *local,
- struct object_cipher_info *object,
- struct iovec *vec /* input vector */,
- int32_t count /* number of vec components */,
- struct iovec *avec /* aligned vector */,
- char **blocks /* pool of blocks */,
- uint32_t *blocks_allocated, struct avec_config *conf)
-{
- int vecn = 0; /* number of the current component in vec */
- int avecn = 0; /* number of the current component in avec */
- off_t vec_off = 0; /* offset in the current vec component,
- * i.e. the number of bytes have already
- * been copied */
- int32_t block_size = get_atom_size(object);
- size_t to_process; /* number of vec's bytes to copy and(or) re-use */
- int32_t off_in_head = conf->off_in_head;
-
- to_process = iov_length(vec, count);
-
- while (to_process > 0) {
- if (off_in_head || vec[vecn].iov_len - vec_off < block_size) {
- /*
- * less than block_size:
- * the case of incomplete (head or tail),
- * or compound block
- */
- size_t copied = 0;
- /*
- * populate the pool with a new block
- */
- blocks[*blocks_allocated] = data_alloc_block(this, local,
- block_size);
- if (!blocks[*blocks_allocated])
- return -ENOMEM;
- memset(blocks[*blocks_allocated], 0, off_in_head);
- /*
- * fill the block with vec components
- */
- do {
- size_t to_copy;
-
- to_copy = vec[vecn].iov_len - vec_off;
- if (to_copy > block_size - off_in_head)
- to_copy = block_size - off_in_head;
-
- memcpy(blocks[*blocks_allocated] + off_in_head + copied,
- vec[vecn].iov_base + vec_off, to_copy);
-
- copied += to_copy;
- to_process -= to_copy;
-
- vec_off += to_copy;
- if (vec_off == vec[vecn].iov_len) {
- /* finished with this vecn */
- vec_off = 0;
- vecn++;
- }
- } while (copied < (block_size - off_in_head) && to_process > 0);
- /*
- * update avec
- */
- avec[avecn].iov_len = off_in_head + copied;
- avec[avecn].iov_base = blocks[*blocks_allocated];
-
- (*blocks_allocated)++;
- off_in_head = 0;
- } else {
- /*
- * the rest of the current vec component
- * is not less than block_size, so reuse
- * the memory buffer of the component.
- */
- size_t to_reuse;
- to_reuse = (to_process > block_size ? block_size : to_process);
- avec[avecn].iov_len = to_reuse;
- avec[avecn].iov_base = vec[vecn].iov_base + vec_off;
-
- vec_off += to_reuse;
- if (vec_off == vec[vecn].iov_len) {
- /* finished with this vecn */
- vec_off = 0;
- vecn++;
- }
- to_process -= to_reuse;
- }
- avecn++;
- }
- check_iovecs(vec, count, avec, avecn, conf->off_in_head);
- return 0;
-}
-
-/*
- * allocate and setup aligned vector for data submission
- * Pre-condition: @conf is set.
- */
-int32_t
-set_config_avec_data(xlator_t *this, crypt_local_t *local,
- struct avec_config *conf,
- struct object_cipher_info *object, struct iovec *vec,
- int32_t vec_count)
-{
- int32_t ret = ENOMEM;
- struct iovec *avec;
- char **pool;
- uint32_t blocks_in_pool = 0;
-
- conf->type = DATA_ATOM;
-
- avec = GF_CALLOC(conf->acount, sizeof(*avec), gf_crypt_mt_iovec);
- if (!avec)
- return ret;
- pool = GF_CALLOC(conf->acount, sizeof(*pool), gf_crypt_mt_char);
- if (!pool) {
- GF_FREE(avec);
- return ret;
- }
- if (!vec) {
- /*
- * degenerated case: no data
- */
- pool[0] = data_alloc_block(this, local, get_atom_size(object));
- if (!pool[0])
- goto free;
- blocks_in_pool = 1;
- avec->iov_base = pool[0];
- avec->iov_len = conf->off_in_tail;
- } else {
- ret = align_iov_by_atoms(this, local, object, vec, vec_count, avec,
- pool, &blocks_in_pool, conf);
- if (ret)
- goto free;
- }
- conf->avec = avec;
- conf->pool = pool;
- conf->blocks_in_pool = blocks_in_pool;
- return 0;
-free:
- GF_FREE(avec);
- GF_FREE(pool);
- return ret;
-}
-
-/*
- * allocate and setup aligned vector for hole submission
- */
-int32_t
-set_config_avec_hole(xlator_t *this, crypt_local_t *local,
- struct avec_config *conf,
- struct object_cipher_info *object, glusterfs_fop_t fop)
-{
- uint32_t i, idx;
- struct iovec *avec;
- char **pool;
- uint32_t num_blocks;
- uint32_t blocks_in_pool = 0;
-
- conf->type = HOLE_ATOM;
-
- num_blocks = conf->acount -
- (conf->nr_full_blocks ? conf->nr_full_blocks - 1 : 0);
-
- switch (fop) {
- case GF_FOP_WRITE:
- /*
- * hole goes before data
- */
- if (num_blocks == 1 && conf->off_in_tail != 0)
- /*
- * we won't submit a hole which fits into
- * a data atom: this part of hole will be
- * submitted with data write
- */
- return 0;
- break;
- case GF_FOP_FTRUNCATE:
- /*
- * expanding truncate, hole goes after data,
- * and will be submitted in any case.
- */
- break;
- default:
- gf_log("crypt", GF_LOG_WARNING, "bad file operation %d", fop);
- return 0;
- }
- avec = GF_CALLOC(num_blocks, sizeof(*avec), gf_crypt_mt_iovec);
- if (!avec)
- return ENOMEM;
- pool = GF_CALLOC(num_blocks, sizeof(*pool), gf_crypt_mt_char);
- if (!pool) {
- GF_FREE(avec);
- return ENOMEM;
- }
- for (i = 0; i < num_blocks; i++) {
- pool[i] = data_alloc_block(this, local, get_atom_size(object));
- if (pool[i] == NULL)
- goto free;
- blocks_in_pool++;
- }
- if (has_head_block(conf)) {
- /* set head block */
- idx = 0;
- avec[idx].iov_base = pool[idx];
- avec[idx].iov_len = get_atom_size(object);
- memset(avec[idx].iov_base + conf->off_in_head, 0,
- get_atom_size(object) - conf->off_in_head);
- }
- if (has_tail_block(conf)) {
- /* set tail block */
- idx = num_blocks - 1;
- avec[idx].iov_base = pool[idx];
- avec[idx].iov_len = get_atom_size(object);
- memset(avec[idx].iov_base, 0, conf->off_in_tail);
- }
- if (has_full_blocks(conf)) {
- /* set full block */
- idx = conf->off_in_head ? 1 : 0;
- avec[idx].iov_base = pool[idx];
- avec[idx].iov_len = get_atom_size(object);
- /*
- * since we re-use the buffer,
- * zeroes will be set every time
- * before encryption, see submit_full()
- */
- }
- conf->avec = avec;
- conf->pool = pool;
- conf->blocks_in_pool = blocks_in_pool;
- return 0;
-free:
- GF_FREE(avec);
- GF_FREE(pool);
- return ENOMEM;
-}
-
-/* A helper for setting up config of partial atoms (which
- * participate in read-modify-write sequence).
- *
- * Calculate and setup precise amount of "extra-bytes"
- * that should be uptodated at the end of partial (not
- * necessarily tail!) block.
- *
- * Pre-condition: local->old_file_size is valid!
- * @conf contains setup, which is enough for correct calculation
- * of has_tail_block(), ->get_offset().
- */
-void
-set_gap_at_end(call_frame_t *frame, struct object_cipher_info *object,
- struct avec_config *conf, atom_data_type dtype)
-{
- uint32_t to_block;
- crypt_local_t *local = frame->local;
- uint64_t old_file_size = local->old_file_size;
- struct rmw_atom *partial = atom_by_types(
- dtype, has_tail_block(conf) ? TAIL_ATOM : HEAD_ATOM);
-
- if (old_file_size <= partial->offset_at(frame, object))
- to_block = 0;
- else {
- to_block = old_file_size - partial->offset_at(frame, object);
- if (to_block > get_atom_size(object))
- to_block = get_atom_size(object);
- }
- if (to_block > conf->off_in_tail)
- conf->gap_in_tail = to_block - conf->off_in_tail;
- else
- /*
- * nothing to uptodate
- */
- conf->gap_in_tail = 0;
-}
-
-/*
- * fill struct avec_config with offsets layouts
- */
-void
-set_config_offsets(call_frame_t *frame, xlator_t *this, uint64_t offset,
- uint64_t count, atom_data_type dtype, int32_t set_gap)
-{
- crypt_local_t *local;
- struct object_cipher_info *object;
- struct avec_config *conf;
- uint32_t resid;
-
- uint32_t atom_size;
- uint32_t atom_bits;
-
- size_t orig_size;
- off_t orig_offset;
- size_t expanded_size;
- off_t aligned_offset;
-
- uint32_t off_in_head = 0;
- uint32_t off_in_tail = 0;
- uint32_t nr_full_blocks;
- int32_t size_full_blocks;
-
- uint32_t acount; /* number of aligned components to write.
- * The same as number of occupied logical
- * blocks (atoms)
- */
- local = frame->local;
- object = &local->info->cinfo;
- conf = (dtype == DATA_ATOM ? get_data_conf(frame) : get_hole_conf(frame));
-
- orig_offset = offset;
- orig_size = count;
-
- atom_size = get_atom_size(object);
- atom_bits = get_atom_bits(object);
-
- /*
- * Round-down the start,
- * round-up the end.
- */
- resid = offset & (uint64_t)(atom_size - 1);
-
- if (resid)
- off_in_head = resid;
- aligned_offset = offset - off_in_head;
- expanded_size = orig_size + off_in_head;
-
- /* calculate tail,
- expand size forward */
- resid = (offset + orig_size) & (uint64_t)(atom_size - 1);
-
- if (resid) {
- off_in_tail = resid;
- expanded_size += (atom_size - off_in_tail);
- }
- /*
- * calculate number of occupied blocks
- */
- acount = expanded_size >> atom_bits;
- /*
- * calculate number of full blocks
- */
- size_full_blocks = expanded_size;
- if (off_in_head)
- size_full_blocks -= atom_size;
- if (off_in_tail && size_full_blocks > 0)
- size_full_blocks -= atom_size;
- nr_full_blocks = size_full_blocks >> atom_bits;
-
- conf->atom_size = atom_size;
- conf->orig_size = orig_size;
- conf->orig_offset = orig_offset;
- conf->expanded_size = expanded_size;
- conf->aligned_offset = aligned_offset;
-
- conf->off_in_head = off_in_head;
- conf->off_in_tail = off_in_tail;
- conf->nr_full_blocks = nr_full_blocks;
- conf->acount = acount;
- /*
- * Finally, calculate precise amount of
- * "extra-bytes" that should be uptodated
- * at the end.
- * Only if RMW is expected.
- */
- if (off_in_tail && set_gap)
- set_gap_at_end(frame, object, conf, dtype);
-}
-
-struct data_cipher_alg data_cipher_algs[LAST_CIPHER_ALG][LAST_CIPHER_MODE] = {
- [AES_CIPHER_ALG][XTS_CIPHER_MODE] = {.atomic = _gf_true,
- .should_pad = _gf_true,
- .blkbits = AES_BLOCK_BITS,
- .init = aes_xts_init,
- .set_private = set_private_aes_xts,
- .check_key = check_key_aes_xts,
- .set_iv = set_iv_aes_xts,
- .encrypt = encrypt_aes_xts}};
-
-/*
- Local variables:
- c-indentation-style: "K&R"
- mode-name: "LC"
- c-basic-offset: 8
- tab-width: 8
- fill-column: 80
- scroll-step: 1
- End:
-*/
diff --git a/xlators/encryption/crypt/src/keys.c b/xlators/encryption/crypt/src/keys.c
deleted file mode 100644
index a9357005a36..00000000000
--- a/xlators/encryption/crypt/src/keys.c
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#include "defaults.h"
-#include "crypt-common.h"
-#include "crypt.h"
-
-/* Key hierarchy
-
- +----------------+
- | MASTER_VOL_KEY |
- +-------+--------+
- |
- |
- +----------------+----------------+
- | | |
- | | |
- +-------+------+ +-------+-------+ +------+--------+
- | NMTD_VOL_KEY | | EMTD_FILE_KEY | | DATA_FILE_KEY |
- +-------+------+ +---------------+ +---------------+
- |
- |
- +-------+-------+
- | NMTD_LINK_KEY |
- +---------------+
-
- */
-
-#if DEBUG_CRYPT
-static void
-check_prf_iters(uint32_t num_iters)
-{
- if (num_iters == 0)
- gf_log("crypt", GF_LOG_DEBUG, "bad number of prf iterations : %d",
- num_iters);
-}
-#else
-#define check_prf_iters(num_iters) noop
-#endif /* DEBUG_CRYPT */
-
-unsigned char crypt_fake_oid[16] = {0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0};
-
-/*
- * derive key in the counter mode using
- * sha256-based HMAC as PRF, see
- * NIST Special Publication 800-108, 5.1)
- */
-
-#define PRF_OUTPUT_SIZE SHA256_DIGEST_LENGTH
-
-static int32_t
-kderive_init(struct kderive_context *ctx,
- const unsigned char *pkey, /* parent key */
- uint32_t pkey_size, /* parent key size */
- const unsigned char *idctx, /* id-context */
- uint32_t idctx_size, crypt_key_type type /* type of child key */)
-{
- unsigned char *pos;
- uint32_t llen = strlen(crypt_keys[type].label);
- /*
- * Compoud the fixed input data for KDF:
- * [i]_2 || Label || 0x00 || Id-Context || [L]_2),
- * NIST SP 800-108, 5.1
- */
- ctx->fid_len = sizeof(uint32_t) + llen + 1 + idctx_size + sizeof(uint32_t);
-
- ctx->fid = GF_CALLOC(ctx->fid_len, 1, gf_crypt_mt_key);
- if (!ctx->fid)
- return ENOMEM;
- ctx->out_len = round_up(crypt_keys[type].len >> 3, PRF_OUTPUT_SIZE);
- ctx->out = GF_CALLOC(ctx->out_len, 1, gf_crypt_mt_key);
- if (!ctx->out) {
- GF_FREE(ctx->fid);
- return ENOMEM;
- }
- ctx->pkey = pkey;
- ctx->pkey_len = pkey_size;
- ctx->ckey_len = crypt_keys[type].len;
-
- pos = ctx->fid;
-
- /* counter will be set up in kderive_rfn() */
- pos += sizeof(uint32_t);
-
- memcpy(pos, crypt_keys[type].label, llen);
- pos += llen;
-
- /* set up zero octet */
- *pos = 0;
- pos += 1;
-
- memcpy(pos, idctx, idctx_size);
- pos += idctx_size;
-
- *((uint32_t *)pos) = htobe32(ctx->ckey_len);
-
- return 0;
-}
-
-static void
-kderive_update(struct kderive_context *ctx)
-{
- uint32_t i;
-#if (OPENSSL_VERSION_NUMBER < 0x1010002f)
- HMAC_CTX hctx;
-#endif
- HMAC_CTX *phctx = NULL;
- unsigned char *pos = ctx->out;
- uint32_t *p_iter = (uint32_t *)ctx->fid;
- uint32_t num_iters = ctx->out_len / PRF_OUTPUT_SIZE;
-
- check_prf_iters(num_iters);
-
-#if (OPENSSL_VERSION_NUMBER < 0x1010002f)
- HMAC_CTX_init(&hctx);
- phctx = &hctx;
-#else
- phctx = HMAC_CTX_new();
- /* I guess we presume it was successful? */
-#endif
- for (i = 0; i < num_iters; i++) {
- /*
- * update the iteration number in the fid
- */
- *p_iter = htobe32(i);
- HMAC_Init_ex(phctx, ctx->pkey, ctx->pkey_len >> 3, EVP_sha256(), NULL);
- HMAC_Update(phctx, ctx->fid, ctx->fid_len);
- HMAC_Final(phctx, pos, NULL);
-
- pos += PRF_OUTPUT_SIZE;
- }
-#if (OPENSSL_VERSION_NUMBER < 0x1010002f)
- HMAC_CTX_cleanup(phctx);
-#else
- HMAC_CTX_free(phctx);
-#endif
-}
-
-static void
-kderive_final(struct kderive_context *ctx, unsigned char *child)
-{
- memcpy(child, ctx->out, ctx->ckey_len >> 3);
- GF_FREE(ctx->fid);
- GF_FREE(ctx->out);
- memset(ctx, 0, sizeof(*ctx));
-}
-
-/*
- * derive per-volume key for object ids aithentication
- */
-int32_t
-get_nmtd_vol_key(struct master_cipher_info *master)
-{
- int32_t ret;
- struct kderive_context ctx;
-
- ret = kderive_init(&ctx, master->m_key, master_key_size(), crypt_fake_oid,
- sizeof(uuid_t), NMTD_VOL_KEY);
- if (ret)
- return ret;
- kderive_update(&ctx);
- kderive_final(&ctx, master->m_nmtd_key);
- return 0;
-}
-
-/*
- * derive per-link key for aithentication of non-encrypted
- * meta-data (nmtd)
- */
-int32_t
-get_nmtd_link_key(loc_t *loc, struct master_cipher_info *master,
- unsigned char *result)
-{
- int32_t ret;
- struct kderive_context ctx;
-
- ret = kderive_init(&ctx, master->m_nmtd_key, nmtd_vol_key_size(),
- (const unsigned char *)loc->path, strlen(loc->path),
- NMTD_LINK_KEY);
- if (ret)
- return ret;
- kderive_update(&ctx);
- kderive_final(&ctx, result);
- return 0;
-}
-
-/*
- * derive per-file key for encryption and authentication
- * of encrypted part of metadata (emtd)
- */
-int32_t
-get_emtd_file_key(struct crypt_inode_info *info,
- struct master_cipher_info *master, unsigned char *result)
-{
- int32_t ret;
- struct kderive_context ctx;
-
- ret = kderive_init(&ctx, master->m_key, master_key_size(), info->oid,
- sizeof(uuid_t), EMTD_FILE_KEY);
- if (ret)
- return ret;
- kderive_update(&ctx);
- kderive_final(&ctx, result);
- return 0;
-}
-
-static int32_t
-data_key_type_by_size(uint32_t keysize, crypt_key_type *type)
-{
- int32_t ret = 0;
- switch (keysize) {
- case 256:
- *type = DATA_FILE_KEY_256;
- break;
- case 512:
- *type = DATA_FILE_KEY_512;
- break;
- default:
- gf_log("crypt", GF_LOG_ERROR, "Unsupported data key size %d",
- keysize);
- ret = ENOTSUP;
- break;
- }
- return ret;
-}
-
-/*
- * derive per-file key for data encryption
- */
-int32_t
-get_data_file_key(struct crypt_inode_info *info,
- struct master_cipher_info *master, uint32_t keysize,
- unsigned char *key)
-{
- int32_t ret;
- struct kderive_context ctx;
- crypt_key_type type;
-
- ret = data_key_type_by_size(keysize, &type);
- if (ret)
- return ret;
- ret = kderive_init(&ctx, master->m_key, master_key_size(), info->oid,
- sizeof(uuid_t), type);
- if (ret)
- return ret;
- kderive_update(&ctx);
- kderive_final(&ctx, key);
- return 0;
-}
-
-/*
- * NOTE: Don't change existing keys: it will break compatibility;
- */
-struct crypt_key crypt_keys[LAST_KEY_TYPE] = {
- [MASTER_VOL_KEY] =
- {
- .len = MASTER_VOL_KEY_SIZE << 3,
- .label = "volume-master",
- },
- [NMTD_VOL_KEY] = {.len = NMTD_VOL_KEY_SIZE << 3,
- .label = "volume-nmtd-key-generation"},
- [NMTD_LINK_KEY] = {.len = 128, .label = "link-nmtd-authentication"},
- [EMTD_FILE_KEY] = {.len = 128, .label = "file-emtd-encryption-and-auth"},
- [DATA_FILE_KEY_256] = {.len = 256, .label = "file-data-encryption-256"},
- [DATA_FILE_KEY_512] = {.len = 512, .label = "file-data-encryption-512"}};
-
-/*
- Local variables:
- c-indentation-style: "K&R"
- mode-name: "LC"
- c-basic-offset: 8
- tab-width: 8
- fill-column: 80
- scroll-step: 1
- End:
-*/
diff --git a/xlators/encryption/crypt/src/metadata.c b/xlators/encryption/crypt/src/metadata.c
deleted file mode 100644
index 90c53a9f721..00000000000
--- a/xlators/encryption/crypt/src/metadata.c
+++ /dev/null
@@ -1,575 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#include "defaults.h"
-#include "crypt-common.h"
-#include "crypt.h"
-#include "metadata.h"
-
-int32_t
-alloc_format(crypt_local_t *local, size_t size)
-{
- if (size > 0) {
- local->format = GF_CALLOC(1, size, gf_crypt_mt_mtd);
- if (!local->format)
- return ENOMEM;
- }
- local->format_size = size;
- return 0;
-}
-
-int32_t
-alloc_format_create(crypt_local_t *local)
-{
- return alloc_format(local, new_format_size());
-}
-
-void
-free_format(crypt_local_t *local)
-{
- GF_FREE(local->format);
-}
-
-/*
- * Check compatibility with extracted metadata
- */
-static int32_t
-check_file_metadata(struct crypt_inode_info *info)
-{
- struct object_cipher_info *object = &info->cinfo;
-
- if (info->nr_minor != CRYPT_XLATOR_ID) {
- gf_log("crypt", GF_LOG_WARNING, "unsupported minor subversion %d",
- info->nr_minor);
- return EINVAL;
- }
- if (object->o_alg > LAST_CIPHER_ALG) {
- gf_log("crypt", GF_LOG_WARNING, "unsupported cipher algorithm %d",
- object->o_alg);
- return EINVAL;
- }
- if (object->o_mode > LAST_CIPHER_MODE) {
- gf_log("crypt", GF_LOG_WARNING, "unsupported cipher mode %d",
- object->o_mode);
- return EINVAL;
- }
- if (object->o_block_bits < CRYPT_MIN_BLOCK_BITS ||
- object->o_block_bits > CRYPT_MAX_BLOCK_BITS) {
- gf_log("crypt", GF_LOG_WARNING, "unsupported block bits %d",
- object->o_block_bits);
- return EINVAL;
- }
- /* TBD: check data key size */
- return 0;
-}
-
-static size_t
-format_size_v1(mtd_op_t op, size_t old_size)
-{
- switch (op) {
- case MTD_CREATE:
- return sizeof(struct mtd_format_v1);
- case MTD_OVERWRITE:
- return old_size;
- case MTD_APPEND:
- return old_size + NMTD_8_MAC_SIZE;
- case MTD_CUT:
- if (old_size > sizeof(struct mtd_format_v1))
- return old_size - NMTD_8_MAC_SIZE;
- else
- return 0;
- default:
- gf_log("crypt", GF_LOG_WARNING, "Bad mtd operation");
- return 0;
- }
-}
-
-/*
- * Calculate size of the updated format string.
- * Returned zero means that we don't need to update the format string.
- */
-size_t
-format_size(mtd_op_t op, size_t old_size)
-{
- size_t versioned;
-
- versioned = mtd_loaders[current_mtd_loader()].format_size(
- op, old_size - sizeof(struct crypt_format));
- if (versioned != 0)
- return versioned + sizeof(struct crypt_format);
- return 0;
-}
-
-/*
- * size of the format string of newly created file (nr_links = 1)
- */
-size_t
-new_format_size(void)
-{
- return format_size(MTD_CREATE, 0);
-}
-
-/*
- * Calculate per-link MAC by pathname
- */
-static int32_t
-calc_link_mac_v1(struct mtd_format_v1 *fmt, loc_t *loc, unsigned char *result,
- struct crypt_inode_info *info,
- struct master_cipher_info *master)
-{
- int32_t ret;
- unsigned char nmtd_link_key[16];
- CMAC_CTX *cctx;
- size_t len;
-
- ret = get_nmtd_link_key(loc, master, nmtd_link_key);
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR, "Can not get nmtd link key");
- return -1;
- }
- cctx = CMAC_CTX_new();
- if (!cctx) {
- gf_log("crypt", GF_LOG_ERROR, "CMAC_CTX_new failed");
- return -1;
- }
- ret = CMAC_Init(cctx, nmtd_link_key, sizeof(nmtd_link_key),
- EVP_aes_128_cbc(), 0);
- if (!ret) {
- gf_log("crypt", GF_LOG_ERROR, "CMAC_Init failed");
- CMAC_CTX_free(cctx);
- return -1;
- }
- ret = CMAC_Update(cctx, get_NMTD_V1(info), SIZE_OF_NMTD_V1);
- if (!ret) {
- gf_log("crypt", GF_LOG_ERROR, "CMAC_Update failed");
- CMAC_CTX_free(cctx);
- return -1;
- }
- ret = CMAC_Final(cctx, result, &len);
- CMAC_CTX_free(cctx);
- if (!ret) {
- gf_log("crypt", GF_LOG_ERROR, "CMAC_Final failed");
- return -1;
- }
- return 0;
-}
-
-/*
- * Create per-link MAC of index @idx by pathname
- */
-static int32_t
-create_link_mac_v1(struct mtd_format_v1 *fmt, uint32_t idx, loc_t *loc,
- struct crypt_inode_info *info,
- struct master_cipher_info *master)
-{
- int32_t ret;
- unsigned char *mac;
- unsigned char cmac[16];
-
- mac = get_NMTD_V1_MAC(fmt) + idx * SIZE_OF_NMTD_V1_MAC;
-
- ret = calc_link_mac_v1(fmt, loc, cmac, info, master);
- if (ret)
- return -1;
- memcpy(mac, cmac, SIZE_OF_NMTD_V1_MAC);
- return 0;
-}
-
-static int32_t
-create_format_v1(unsigned char *wire, loc_t *loc, struct crypt_inode_info *info,
- struct master_cipher_info *master)
-{
- int32_t ret;
- struct mtd_format_v1 *fmt;
- unsigned char mtd_key[16];
- AES_KEY EMTD_KEY;
- unsigned char nmtd_link_key[16];
- uint32_t ad;
- GCM128_CONTEXT *gctx;
-
- fmt = (struct mtd_format_v1 *)wire;
-
- fmt->minor_id = info->nr_minor;
- fmt->alg_id = AES_CIPHER_ALG;
- fmt->dkey_factor = master->m_dkey_size >> KEY_FACTOR_BITS;
- fmt->block_bits = master->m_block_bits;
- fmt->mode_id = master->m_mode;
- /*
- * retrieve keys for the parts of metadata
- */
- ret = get_emtd_file_key(info, master, mtd_key);
- if (ret)
- return ret;
- ret = get_nmtd_link_key(loc, master, nmtd_link_key);
- if (ret)
- return ret;
-
- AES_set_encrypt_key(mtd_key, sizeof(mtd_key) * 8, &EMTD_KEY);
-
- gctx = CRYPTO_gcm128_new(&EMTD_KEY, (block128_f)AES_encrypt);
-
- /* TBD: Check return values */
-
- CRYPTO_gcm128_setiv(gctx, info->oid, sizeof(uuid_t));
-
- ad = htole32(MTD_LOADER_V1);
- ret = CRYPTO_gcm128_aad(gctx, (const unsigned char *)&ad, sizeof(ad));
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_aad failed");
- CRYPTO_gcm128_release(gctx);
- return ret;
- }
- ret = CRYPTO_gcm128_encrypt(gctx, get_EMTD_V1(fmt), get_EMTD_V1(fmt),
- SIZE_OF_EMTD_V1);
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_encrypt failed");
- CRYPTO_gcm128_release(gctx);
- return ret;
- }
- /*
- * set MAC of encrypted part of metadata
- */
- CRYPTO_gcm128_tag(gctx, get_EMTD_V1_MAC(fmt), SIZE_OF_EMTD_V1_MAC);
- CRYPTO_gcm128_release(gctx);
- /*
- * set the first MAC of non-encrypted part of metadata
- */
- return create_link_mac_v1(fmt, 0, loc, info, master);
-}
-
-/*
- * Called by fops:
- * ->create();
- * ->link();
- *
- * Pack common and version-specific parts of file's metadata
- * Pre-conditions: @info contains valid object-id.
- */
-int32_t
-create_format(unsigned char *wire, loc_t *loc, struct crypt_inode_info *info,
- struct master_cipher_info *master)
-{
- struct crypt_format *fmt = (struct crypt_format *)wire;
-
- fmt->loader_id = current_mtd_loader();
-
- wire += sizeof(struct crypt_format);
- return mtd_loaders[current_mtd_loader()].create_format(wire, loc, info,
- master);
-}
-
-/*
- * Append or overwrite per-link mac of @mac_idx index
- * in accordance with the new pathname
- */
-int32_t
-appov_link_mac_v1(unsigned char *new, unsigned char *old, uint32_t old_size,
- int32_t mac_idx, loc_t *loc, struct crypt_inode_info *info,
- struct master_cipher_info *master, crypt_local_t *local)
-{
- memcpy(new, old, old_size);
- return create_link_mac_v1((struct mtd_format_v1 *)new, mac_idx, loc, info,
- master);
-}
-
-/*
- * Cut per-link mac of @mac_idx index
- */
-static int32_t
-cut_link_mac_v1(unsigned char *new, unsigned char *old, uint32_t old_size,
- int32_t mac_idx, loc_t *loc, struct crypt_inode_info *info,
- struct master_cipher_info *master, crypt_local_t *local)
-{
- memcpy(new, old,
- sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * (mac_idx - 1));
-
- memcpy(
- new + sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE *(mac_idx - 1),
- old + sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * mac_idx,
- old_size - (sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * mac_idx));
- return 0;
-}
-
-int32_t
-update_format_v1(unsigned char *new, unsigned char *old, size_t old_len,
- int32_t mac_idx, /* of old name */
- mtd_op_t op, loc_t *loc, struct crypt_inode_info *info,
- struct master_cipher_info *master, crypt_local_t *local)
-{
- switch (op) {
- case MTD_APPEND:
- mac_idx = 1 + (old_len - sizeof(struct mtd_format_v1)) / 8;
- case MTD_OVERWRITE:
- return appov_link_mac_v1(new, old, old_len, mac_idx, loc, info,
- master, local);
- case MTD_CUT:
- return cut_link_mac_v1(new, old, old_len, mac_idx, loc, info,
- master, local);
- default:
- gf_log("crypt", GF_LOG_ERROR, "Bad mtd operation %d", op);
- return -1;
- }
-}
-
-/*
- * Called by fops:
- *
- * ->link()
- * ->unlink()
- * ->rename()
- *
- */
-int32_t
-update_format(unsigned char *new, unsigned char *old, size_t old_len,
- int32_t mac_idx, mtd_op_t op, loc_t *loc,
- struct crypt_inode_info *info, struct master_cipher_info *master,
- crypt_local_t *local)
-{
- if (!new)
- return 0;
- memcpy(new, old, sizeof(struct crypt_format));
-
- old += sizeof(struct crypt_format);
- new += sizeof(struct crypt_format);
- old_len -= sizeof(struct crypt_format);
-
- return mtd_loaders[current_mtd_loader()].update_format(
- new, old, old_len, mac_idx, op, loc, info, master, local);
-}
-
-/*
- * Perform preliminary checks of found metadata
- * Return < 0 on errors;
- * Return number of object-id MACs (>= 1) on success
- */
-int32_t
-check_format_v1(uint32_t len, unsigned char *wire)
-{
- uint32_t nr_links;
-
- if (len < sizeof(struct mtd_format_v1)) {
- gf_log("crypt", GF_LOG_ERROR, "v1-loader: bad metadata size %d", len);
- goto error;
- }
- len -= sizeof(struct mtd_format_v1);
- if (len % sizeof(nmtd_8_mac_t)) {
- gf_log("crypt", GF_LOG_ERROR, "v1-loader: bad metadata format");
- goto error;
- }
- nr_links = 1 + len / sizeof(nmtd_8_mac_t);
- if (nr_links > _POSIX_LINK_MAX)
- goto error;
- return nr_links;
-error:
- return EIO;
-}
-
-/*
- * Verify per-link MAC specified by index @idx
- *
- * return:
- * -1 on errors;
- * 0 on failed verification;
- * 1 on successful verification
- */
-static int32_t
-verify_link_mac_v1(struct mtd_format_v1 *fmt,
- uint32_t idx /* index of the mac to verify */, loc_t *loc,
- struct crypt_inode_info *info,
- struct master_cipher_info *master)
-{
- int32_t ret;
- unsigned char *mac;
- unsigned char cmac[16];
-
- mac = get_NMTD_V1_MAC(fmt) + idx * SIZE_OF_NMTD_V1_MAC;
-
- ret = calc_link_mac_v1(fmt, loc, cmac, info, master);
- if (ret)
- return -1;
- if (memcmp(cmac, mac, SIZE_OF_NMTD_V1_MAC))
- return 0;
- return 1;
-}
-
-/*
- * Lookup per-link MAC by pathname.
- *
- * return index of the MAC, if it was found;
- * return < 0 on errors, or if the MAC wasn't found
- */
-static int32_t
-lookup_link_mac_v1(struct mtd_format_v1 *fmt, uint32_t nr_macs, loc_t *loc,
- struct crypt_inode_info *info,
- struct master_cipher_info *master)
-{
- int32_t ret;
- uint32_t idx;
-
- for (idx = 0; idx < nr_macs; idx++) {
- ret = verify_link_mac_v1(fmt, idx, loc, info, master);
- if (ret < 0)
- return ret;
- if (ret > 0)
- return idx;
- }
- return -ENOENT;
-}
-
-/*
- * Extract version-specific part of metadata
- */
-static int32_t
-open_format_v1(unsigned char *wire, int32_t len, loc_t *loc,
- struct crypt_inode_info *info, struct master_cipher_info *master,
- crypt_local_t *local, gf_boolean_t load_info)
-{
- int32_t ret;
- int32_t num_nmtd_macs;
- struct mtd_format_v1 *fmt;
- unsigned char mtd_key[16];
- AES_KEY EMTD_KEY;
- GCM128_CONTEXT *gctx;
- uint32_t ad;
- emtd_8_mac_t gmac;
- struct object_cipher_info *object;
-
- num_nmtd_macs = check_format_v1(len, wire);
- if (num_nmtd_macs <= 0)
- return EIO;
-
- ret = lookup_link_mac_v1((struct mtd_format_v1 *)wire, num_nmtd_macs, loc,
- info, master);
- if (ret < 0) {
- gf_log("crypt", GF_LOG_ERROR, "NMTD verification failed");
- return EINVAL;
- }
-
- local->mac_idx = ret;
- if (load_info == _gf_false)
- /* the case of partial open */
- return 0;
-
- fmt = GF_MALLOC(len, gf_crypt_mt_mtd);
- if (!fmt)
- return ENOMEM;
- memcpy(fmt, wire, len);
-
- object = &info->cinfo;
-
- ret = get_emtd_file_key(info, master, mtd_key);
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR, "Can not retrieve metadata key");
- goto out;
- }
- /*
- * decrypt encrypted meta-data
- */
- ret = AES_set_encrypt_key(mtd_key, sizeof(mtd_key) * 8, &EMTD_KEY);
- if (ret < 0) {
- gf_log("crypt", GF_LOG_ERROR, "Can not set encrypt key");
- ret = EIO;
- goto out;
- }
- gctx = CRYPTO_gcm128_new(&EMTD_KEY, (block128_f)AES_encrypt);
- if (!gctx) {
- gf_log("crypt", GF_LOG_ERROR, "Can not alloc gcm context");
- ret = ENOMEM;
- goto out;
- }
- CRYPTO_gcm128_setiv(gctx, info->oid, sizeof(uuid_t));
-
- ad = htole32(MTD_LOADER_V1);
- ret = CRYPTO_gcm128_aad(gctx, (const unsigned char *)&ad, sizeof(ad));
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_aad failed");
- CRYPTO_gcm128_release(gctx);
- ret = EIO;
- goto out;
- }
- ret = CRYPTO_gcm128_decrypt(gctx, get_EMTD_V1(fmt), get_EMTD_V1(fmt),
- SIZE_OF_EMTD_V1);
- if (ret) {
- gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_decrypt failed");
- CRYPTO_gcm128_release(gctx);
- ret = EIO;
- goto out;
- }
- /*
- * verify metadata
- */
- CRYPTO_gcm128_tag(gctx, gmac, sizeof(gmac));
- CRYPTO_gcm128_release(gctx);
- if (memcmp(gmac, get_EMTD_V1_MAC(fmt), SIZE_OF_EMTD_V1_MAC)) {
- gf_log("crypt", GF_LOG_ERROR, "EMTD verification failed");
- ret = EINVAL;
- goto out;
- }
- /*
- * load verified metadata to the private part of inode
- */
- info->nr_minor = fmt->minor_id;
-
- object->o_alg = fmt->alg_id;
- object->o_dkey_size = fmt->dkey_factor << KEY_FACTOR_BITS;
- object->o_block_bits = fmt->block_bits;
- object->o_mode = fmt->mode_id;
-
- ret = check_file_metadata(info);
-out:
- GF_FREE(fmt);
- return ret;
-}
-
-/*
- * perform metadata authentication against @loc->path;
- * extract crypt-specific attribute and populate @info
- * with them (optional)
- */
-int32_t
-open_format(unsigned char *str, int32_t len, loc_t *loc,
- struct crypt_inode_info *info, struct master_cipher_info *master,
- crypt_local_t *local, gf_boolean_t load_info)
-{
- struct crypt_format *fmt;
- if (len < sizeof(*fmt)) {
- gf_log("crypt", GF_LOG_ERROR, "Bad core format");
- return EIO;
- }
- fmt = (struct crypt_format *)str;
-
- if (fmt->loader_id >= LAST_MTD_LOADER) {
- gf_log("crypt", GF_LOG_ERROR, "Unsupported loader id %d",
- fmt->loader_id);
- return EINVAL;
- }
- str += sizeof(*fmt);
- len -= sizeof(*fmt);
-
- return mtd_loaders[fmt->loader_id].open_format(str, len, loc, info, master,
- local, load_info);
-}
-
-struct crypt_mtd_loader mtd_loaders[LAST_MTD_LOADER] = {
- [MTD_LOADER_V1] = {.format_size = format_size_v1,
- .create_format = create_format_v1,
- .open_format = open_format_v1,
- .update_format = update_format_v1}};
-
-/*
- Local variables:
- c-indentation-style: "K&R"
- mode-name: "LC"
- c-basic-offset: 8
- tab-width: 8
- fill-column: 80
- scroll-step: 1
- End:
-*/
diff --git a/xlators/encryption/crypt/src/metadata.h b/xlators/encryption/crypt/src/metadata.h
deleted file mode 100644
index 0bcee1b18c8..00000000000
--- a/xlators/encryption/crypt/src/metadata.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef __METADATA_H__
-#define __METADATA_H__
-
-#define NMTD_8_MAC_SIZE (8)
-#define EMTD_8_MAC_SIZE (8)
-
-typedef uint8_t nmtd_8_mac_t[NMTD_8_MAC_SIZE];
-typedef uint8_t emtd_8_mac_t[EMTD_8_MAC_SIZE];
-
-/*
- * Version "v1" of file's metadata.
- * Metadata of this version has 4 components:
- *
- * 1) EMTD (Encrypted part of MeTaData);
- * 2) NMTD (Non-encrypted part of MeTaData);
- * 3) EMTD_MAC; (EMTD Message Authentication Code);
- * 4) Array of per-link NMTD MACs (for every (hard)link it includes
- * exactly one MAC)
- */
-struct mtd_format_v1 {
- /* EMTD, encrypted part of meta-data */
- uint8_t alg_id; /* cipher algorithm id (only AES for now) */
- uint8_t mode_id; /* cipher mode id; (only XTS for now) */
- uint8_t block_bits; /* encoded block size */
- uint8_t minor_id; /* client translator id */
- uint8_t dkey_factor; /* encoded size of the data key */
- /* MACs */
- emtd_8_mac_t gmac; /* MAC of the encrypted meta-data, 8 bytes */
- nmtd_8_mac_t omac; /* per-link MACs of the non-encrypted
- * meta-data: at least one such MAC is always
- * present */
-} __attribute__((packed));
-
-/*
- * NMTD, the non-encrypted part of metadata of version "v1"
- * is file's gfid, which is generated on trusted machines.
- */
-#define SIZE_OF_NMTD_V1 (sizeof(uuid_t))
-#define SIZE_OF_EMTD_V1 \
- (offsetof(struct mtd_format_v1, gmac) - \
- offsetof(struct mtd_format_v1, alg_id))
-#define SIZE_OF_NMTD_V1_MAC (NMTD_8_MAC_SIZE)
-#define SIZE_OF_EMTD_V1_MAC (EMTD_8_MAC_SIZE)
-
-static inline unsigned char *
-get_EMTD_V1(struct mtd_format_v1 *format)
-{
- return &format->alg_id;
-}
-
-static inline unsigned char *
-get_NMTD_V1(struct crypt_inode_info *info)
-{
- return info->oid;
-}
-
-static inline unsigned char *
-get_EMTD_V1_MAC(struct mtd_format_v1 *format)
-{
- return format->gmac;
-}
-
-static inline unsigned char *
-get_NMTD_V1_MAC(struct mtd_format_v1 *format)
-{
- return format->omac;
-}
-
-#endif /* __METADATA_H__ */
diff --git a/xlators/encryption/rot-13/Makefile.am b/xlators/encryption/rot-13/Makefile.am
deleted file mode 100644
index d471a3f9243..00000000000
--- a/xlators/encryption/rot-13/Makefile.am
+++ /dev/null
@@ -1,3 +0,0 @@
-SUBDIRS = src
-
-CLEANFILES =
diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am
index 6ef19af0860..c57897f11ea 100644
--- a/xlators/features/Makefile.am
+++ b/xlators/features/Makefile.am
@@ -2,10 +2,13 @@ if BUILD_CLOUDSYNC
CLOUDSYNC_DIR = cloudsync
endif
+if BUILD_METADISP
+ METADISP_DIR = metadisp
+endif
+
+SUBDIRS = locks quota read-only quiesce marker index barrier arbiter upcall \
+ compress changelog gfid-access snapview-client snapview-server trash \
+ shard bit-rot leases selinux sdfs namespace $(CLOUDSYNC_DIR) thin-arbiter \
+ utime $(METADISP_DIR)
-SUBDIRS = locks quota read-only quiesce marker index barrier \
- arbiter compress changelog changetimerecorder \
- gfid-access $(GLUPY_SUBDIR) upcall snapview-client snapview-server \
- trash shard bit-rot leases selinux sdfs namespace $(CLOUDSYNC_DIR) thin-arbiter \
- utime
CLEANFILES =
diff --git a/xlators/features/arbiter/src/arbiter-mem-types.h b/xlators/features/arbiter/src/arbiter-mem-types.h
index 0f77cfd05f4..05d18374c46 100644
--- a/xlators/features/arbiter/src/arbiter-mem-types.h
+++ b/xlators/features/arbiter/src/arbiter-mem-types.h
@@ -9,7 +9,7 @@
#ifndef __ARBITER_MEM_TYPES_H__
#define __ARBITER_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
typedef enum gf_arbiter_mem_types_ {
gf_arbiter_mt_inode_ctx_t = gf_common_mt_end + 1,
diff --git a/xlators/features/arbiter/src/arbiter.c b/xlators/features/arbiter/src/arbiter.c
index 436f228a566..83a97e3354b 100644
--- a/xlators/features/arbiter/src/arbiter.c
+++ b/xlators/features/arbiter/src/arbiter.c
@@ -10,9 +10,9 @@
#include "arbiter.h"
#include "arbiter-mem-types.h"
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
static arbiter_inode_ctx_t *
__arbiter_inode_ctx_get(inode_t *inode, xlator_t *this)
@@ -31,7 +31,7 @@ __arbiter_inode_ctx_get(inode_t *inode, xlator_t *this)
if (!ctx)
goto out;
- ret = __inode_ctx_put(inode, this, (uint64_t)ctx);
+ ret = __inode_ctx_put(inode, this, (uint64_t)(uintptr_t)ctx);
if (ret) {
GF_FREE(ctx);
ctx = NULL;
@@ -365,3 +365,16 @@ struct xlator_cbks cbks = {
struct volume_options options[] = {
{.key = {NULL}},
};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "arbiter",
+ .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/arbiter/src/arbiter.h b/xlators/features/arbiter/src/arbiter.h
index ce1c909f70f..546db7b751a 100644
--- a/xlators/features/arbiter/src/arbiter.h
+++ b/xlators/features/arbiter/src/arbiter.h
@@ -11,8 +11,8 @@
#ifndef _ARBITER_H
#define _ARBITER_H
-#include "locking.h"
-#include "common-utils.h"
+#include <glusterfs/locking.h>
+#include <glusterfs/common-utils.h>
typedef struct arbiter_inode_ctx_ {
struct iatt iattbuf;
diff --git a/xlators/features/barrier/src/barrier-mem-types.h b/xlators/features/barrier/src/barrier-mem-types.h
index 93ccab633ce..71ed7898d9c 100644
--- a/xlators/features/barrier/src/barrier-mem-types.h
+++ b/xlators/features/barrier/src/barrier-mem-types.h
@@ -11,7 +11,7 @@
#ifndef __BARRIER_MEM_TYPES_H__
#define __BARRIER_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_barrier_mem_types_ {
gf_barrier_mt_priv_t = gf_common_mt_end + 1,
diff --git a/xlators/features/barrier/src/barrier.c b/xlators/features/barrier/src/barrier.c
index 1c5c5ffdc22..852bbacb99d 100644
--- a/xlators/features/barrier/src/barrier.c
+++ b/xlators/features/barrier/src/barrier.c
@@ -9,10 +9,10 @@
*/
#include "barrier.h"
-#include "defaults.h"
-#include "call-stub.h"
+#include <glusterfs/defaults.h>
+#include <glusterfs/call-stub.h>
-#include "statedump.h"
+#include <glusterfs/statedump.h>
void
barrier_local_set_gfid(call_frame_t *frame, uuid_t gfid, xlator_t *this)
@@ -461,16 +461,14 @@ out:
int
notify(xlator_t *this, int event, void *data, ...)
{
- barrier_priv_t *priv = NULL;
+ barrier_priv_t *priv = this->private;
dict_t *dict = NULL;
- gf_boolean_t past = _gf_false;
int ret = -1;
int barrier_enabled = _gf_false;
struct list_head queue = {
0,
};
- priv = this->private;
GF_ASSERT(priv);
INIT_LIST_HEAD(&queue);
@@ -488,35 +486,27 @@ notify(xlator_t *this, int event, void *data, ...)
LOCK(&priv->lock);
{
- past = priv->barrier_enabled;
-
- switch (past) {
- case _gf_false:
- if (barrier_enabled) {
- ret = __barrier_enable(this, priv);
- if (ret)
- goto unlock;
- } else {
- gf_log(this->name, GF_LOG_ERROR,
- "Already disabled.");
- goto unlock;
- }
- break;
-
- case _gf_true:
- if (!barrier_enabled) {
- __barrier_disable(this, &queue);
- } else {
- gf_log(this->name, GF_LOG_ERROR, "Already enabled");
- goto unlock;
- }
- break;
+ if (!priv->barrier_enabled) {
+ if (barrier_enabled) {
+ ret = __barrier_enable(this, priv);
+ } else {
+ UNLOCK(&priv->lock);
+ gf_log(this->name, GF_LOG_ERROR, "Already disabled.");
+ goto post_unlock;
+ }
+ } else {
+ if (!barrier_enabled) {
+ __barrier_disable(this, &queue);
+ ret = 0;
+ } else {
+ UNLOCK(&priv->lock);
+ gf_log(this->name, GF_LOG_ERROR, "Already enabled");
+ goto post_unlock;
+ }
}
- ret = 0;
}
- unlock:
UNLOCK(&priv->lock);
-
+ post_unlock:
if (!list_empty(&queue))
barrier_dequeue_all(this, &queue);
@@ -536,7 +526,6 @@ int
reconfigure(xlator_t *this, dict_t *options)
{
barrier_priv_t *priv = NULL;
- gf_boolean_t past = _gf_false;
int ret = -1;
gf_boolean_t barrier_enabled = _gf_false;
uint32_t timeout = {
@@ -556,23 +545,17 @@ reconfigure(xlator_t *this, dict_t *options)
LOCK(&priv->lock);
{
- past = priv->barrier_enabled;
-
- switch (past) {
- case _gf_false:
- if (barrier_enabled) {
- ret = __barrier_enable(this, priv);
- if (ret) {
- goto unlock;
- }
+ if (!priv->barrier_enabled) {
+ if (barrier_enabled) {
+ ret = __barrier_enable(this, priv);
+ if (ret) {
+ goto unlock;
}
- break;
-
- case _gf_true:
- if (!barrier_enabled) {
- __barrier_disable(this, &queue);
- }
- break;
+ }
+ } else {
+ if (!barrier_enabled) {
+ __barrier_disable(this, &queue);
+ }
}
priv->timeout.tv_sec = timeout;
ret = 0;
@@ -746,13 +729,13 @@ barrier_dump_priv(xlator_t *this)
gf_proc_dump_build_key(key, "xlator.features.barrier", "priv");
gf_proc_dump_add_section("%s", key);
+ gf_proc_dump_build_key(key, "barrier", "enabled");
LOCK(&priv->lock);
{
- gf_proc_dump_build_key(key, "barrier", "enabled");
gf_proc_dump_write(key, "%d", priv->barrier_enabled);
gf_proc_dump_build_key(key, "barrier", "timeout");
- gf_proc_dump_write(key, "%" PRId64, priv->timeout.tv_sec);
+ gf_proc_dump_write(key, "%ld", priv->timeout.tv_sec);
if (priv->barrier_enabled) {
gf_proc_dump_build_key(key, "barrier", "queue_size");
gf_proc_dump_write(key, "%d", priv->queue_size);
@@ -809,3 +792,18 @@ struct volume_options options[] = {
"blocked acknowledgements are sent to the application"},
{.key = {NULL}},
};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "barrier",
+ .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/barrier/src/barrier.h b/xlators/features/barrier/src/barrier.h
index d11d71d151e..1337f311f7d 100644
--- a/xlators/features/barrier/src/barrier.h
+++ b/xlators/features/barrier/src/barrier.h
@@ -12,9 +12,9 @@
#define __BARRIER_H__
#include "barrier-mem-types.h"
-#include "xlator.h"
-#include "timer.h"
-#include "call-stub.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/timer.h>
+#include <glusterfs/call-stub.h>
#define BARRIER_FOP_CBK(fop_name, label, frame, this, params...) \
do { \
@@ -65,11 +65,12 @@
typedef struct {
gf_timer_t *timer;
- gf_boolean_t barrier_enabled;
gf_lock_t lock;
struct list_head queue;
struct timespec timeout;
uint32_t queue_size;
+ gf_boolean_t barrier_enabled;
+ char _pad[3]; /* manual padding */
} barrier_priv_t;
int
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h b/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h
index 6f59933a31d..5bc5103a27c 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h
@@ -11,7 +11,7 @@
#ifndef _BITROT_BITD_MESSAGES_H_
#define _BITROT_BITD_MESSAGES_H_
-#include "glfs-message-id.h"
+#include <glusterfs/glfs-message-id.h>
/* To add new message IDs, append new identifiers at the end of the list.
*
@@ -47,6 +47,55 @@ GLFS_MSGID(BITROT_BITD, BRB_MSG_FD_CREATE_FAILED, BRB_MSG_READV_FAILED,
BRB_MSG_SCRUB_THREAD_CLEANUP, BRB_MSG_SCRUBBER_CLEANED,
BRB_MSG_GENERIC_SSM_INFO, BRB_MSG_ZERO_TIMEOUT_BUG,
BRB_MSG_BAD_OBJ_READDIR_FAIL, BRB_MSG_SSM_FAILED,
- BRB_MSG_SCRUB_WAIT_FAILED);
+ BRB_MSG_SCRUB_WAIT_FAILED, BRB_MSG_TRIGGER_SIGN_FAILED,
+ BRB_MSG_EVENT_UNHANDLED, BRB_MSG_COULD_NOT_SCHEDULE_SCRUB,
+ BRB_MSG_THREAD_CREATION_FAILED, BRB_MSG_MEM_POOL_ALLOC,
+ BRB_MSG_SAVING_HASH_FAILED);
+#define BRB_MSG_FD_CREATE_FAILED_STR "failed to create fd for the inode"
+#define BRB_MSG_READV_FAILED_STR "readv failed"
+#define BRB_MSG_BLOCK_READ_FAILED_STR "reading block failed"
+#define BRB_MSG_NO_MEMORY_STR "failed to allocate memory"
+#define BRB_MSG_CALC_CHECKSUM_FAILED_STR "calculating checksum failed"
+#define BRB_MSG_GET_SIGN_FAILED_STR "failed to get the signature"
+#define BRB_MSG_SET_SIGN_FAILED_STR "signing failed"
+#define BRB_MSG_OP_FAILED_STR "failed on object"
+#define BRB_MSG_TRIGGER_SIGN_FAILED_STR "Could not trigger signing"
+#define BRB_MSG_READ_AND_SIGN_FAILED_STR "reading and signing of object failed"
+#define BRB_MSG_SET_TIMER_FAILED_STR "Failed to allocate object expiry timer"
+#define BRB_MSG_GET_SUBVOL_FAILED_STR \
+ "failed to get the subvolume for the brick"
+#define BRB_MSG_PATH_FAILED_STR "path failed"
+#define BRB_MSG_SKIP_OBJECT_STR "Entry is marked corrupted. skipping"
+#define BRB_MSG_PARTIAL_VERSION_PRESENCE_STR \
+ "PArtial version xattr presence detected, ignoring"
+#define BRB_MSG_TRIGGER_SIGN_STR "Triggering signing"
+#define BRB_MSG_CRAWLING_START_STR \
+ "Crawling brick, scanning for unsigned objects"
+#define BRB_MSG_CRAWLING_FINISH_STR "Completed crawling brick"
+#define BRB_MSG_REGISTER_FAILED_STR "Register to changelog failed"
+#define BRB_MSG_SPAWN_FAILED_STR "failed to spawn"
+#define BRB_MSG_CONNECTED_TO_BRICK_STR "Connected to brick"
+#define BRB_MSG_LOOKUP_FAILED_STR "lookup on root failed"
+#define BRB_MSG_GET_INFO_FAILED_STR "failed to get stub info"
+#define BRB_MSG_SCRUB_THREAD_CLEANUP_STR "Error cleaning up scanner thread"
+#define BRB_MSG_SCRUBBER_CLEANED_STR "clened up scrubber for brick"
+#define BRB_MSG_SUBVOL_CONNECT_FAILED_STR \
+ "callback handler for subvolume failed"
+#define BRB_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed"
+#define BRB_MSG_EVENT_UNHANDLED_STR "Event unhandled for child"
+#define BRB_MSG_INVALID_SUBVOL_STR "Got event from invalid subvolume"
+#define BRB_MSG_RESCHEDULE_SCRUBBER_FAILED_STR \
+ "on demand scrub schedule failed. Scrubber is not in pending state."
+#define BRB_MSG_COULD_NOT_SCHEDULE_SCRUB_STR \
+ "Could not schedule ondemand scrubbing. Scrubbing will continue " \
+ "according to old frequency."
+#define BRB_MSG_THREAD_CREATION_FAILED_STR "thread creation failed"
+#define BRB_MSG_RATE_LIMIT_INFO_STR "Rate Limit Info"
+#define BRB_MSG_MEM_POOL_ALLOC_STR "failed to allocate mem-pool for timer"
+#define BRB_MSG_NO_CHILD_STR "FATAL: no children"
+#define BRB_MSG_TIMER_WHEEL_UNAVAILABLE_STR "global timer wheel unavailable"
+#define BRB_MSG_BITROT_LOADED_STR "bit-rot xlator loaded"
+#define BRB_MSG_SAVING_HASH_FAILED_STR \
+ "failed to allocate memory for saving hash of the object"
#endif /* !_BITROT_BITD_MESSAGES_H_ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c
index 34e20f9df11..5cef2ffa5e5 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c
@@ -40,21 +40,21 @@ br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat)
}
void
-br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, struct timeval *tv)
+br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time)
{
if (!scrub_stat)
return;
pthread_mutex_lock(&scrub_stat->lock);
{
- scrub_stat->scrub_start_tv.tv_sec = tv->tv_sec;
+ scrub_stat->scrub_start_time = time;
}
pthread_mutex_unlock(&scrub_stat->lock);
}
void
br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr,
- struct timeval *tv)
+ time_t time)
{
int lst_size = 0;
@@ -67,10 +67,10 @@ br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr,
pthread_mutex_lock(&scrub_stat->lock);
{
- scrub_stat->scrub_end_tv.tv_sec = tv->tv_sec;
+ scrub_stat->scrub_end_time = time;
- scrub_stat->scrub_duration = scrub_stat->scrub_end_tv.tv_sec -
- scrub_stat->scrub_start_tv.tv_sec;
+ scrub_stat->scrub_duration = scrub_stat->scrub_end_time -
+ scrub_stat->scrub_start_time;
snprintf(scrub_stat->last_scrub_time, lst_size, "%s", timestr);
}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h
index 24128b90a66..f022aa831eb 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h
@@ -15,20 +15,22 @@
#include <sys/time.h>
#include <pthread.h>
+#include <glusterfs/common-utils.h>
+
struct br_scrub_stats {
- uint64_t scrubbed_files; /* Total number of scrubbed file */
+ uint64_t scrubbed_files; /* Total number of scrubbed files. */
- uint64_t unsigned_files; /* Total number of unsigned file */
+ uint64_t unsigned_files; /* Total number of unsigned files. */
- uint64_t scrub_duration; /* Duration of last scrub */
+ uint64_t scrub_duration; /* Duration of last scrub. */
- char last_scrub_time[1024]; /*last scrub completion time */
+ char last_scrub_time[GF_TIMESTR_SIZE]; /* Last scrub completion time. */
- struct timeval scrub_start_tv; /* Scrubbing starting time*/
+ time_t scrub_start_time; /* Scrubbing starting time. */
- struct timeval scrub_end_tv; /* Scrubbing finishing time */
+ time_t scrub_end_time; /* Scrubbing finishing time. */
- int8_t scrub_running; /* Scrub running or not */
+ int8_t scrub_running; /* Whether scrub running or not. */
pthread_mutex_t lock;
};
@@ -40,9 +42,9 @@ br_inc_unsigned_file_count(br_scrub_stats_t *scrub_stat);
void
br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat);
void
-br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, struct timeval *tv);
+br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time);
void
br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr,
- struct timeval *tv);
+ time_t time);
#endif /* __BIT_ROT_SCRUB_STATUS_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c
index b856c168eb7..289dd53f610 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c
@@ -12,15 +12,15 @@
#include <ctype.h>
#include <sys/uio.h>
-#include "glusterfs.h"
-#include "logging.h"
-#include "common-utils.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
#include "bit-rot-scrub.h"
#include <pthread.h>
#include "bit-rot-bitd-messages.h"
#include "bit-rot-scrub-status.h"
-#include "events.h"
+#include <glusterfs/events.h>
struct br_scrubbers {
pthread_t scrubthread;
@@ -130,6 +130,8 @@ bitd_scrub_post_compute_check(xlator_t *this, br_child_t *child, fd_t *fd,
(void)memcpy(*signature, signptr, sizeof(br_isignature_out_t) + signlen);
+ (*signature)->signaturelen = signlen;
+
unref_dict:
dict_unref(xattr);
out:
@@ -222,7 +224,7 @@ bitd_compare_ckum(xlator_t *this, br_isignature_out_t *sign, unsigned char *md,
GF_VALIDATE_OR_GOTO(this->name, md, out);
GF_VALIDATE_OR_GOTO(this->name, entry, out);
- if (strncmp(sign->signature, (char *)md, strlen(sign->signature)) == 0) {
+ if (strncmp(sign->signature, (char *)md, sign->signaturelen) == 0) {
gf_msg_debug(this->name, 0,
"%s [GFID: %s | Brick: %s] "
"matches calculated checksum",
@@ -599,25 +601,23 @@ br_fsscan_deactivate(xlator_t *this)
static void
br_scrubber_log_time(xlator_t *this, const char *sfx)
{
- char timestr[1024] = {
- 0,
- };
- struct timeval tv = {
+ char timestr[GF_TIMESTR_SIZE] = {
0,
};
br_private_t *priv = NULL;
+ time_t now = 0;
+ now = gf_time();
priv = this->private;
- gettimeofday(&tv, NULL);
- gf_time_fmt(timestr, sizeof(timestr), tv.tv_sec, gf_timefmt_FT);
+ gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT);
if (strcasecmp(sfx, "started") == 0) {
- br_update_scrub_start_time(&priv->scrub_stat, &tv);
+ br_update_scrub_start_time(&priv->scrub_stat, now);
gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_START,
"Scrubbing %s at %s", sfx, timestr);
} else {
- br_update_scrub_finish_time(&priv->scrub_stat, timestr, &tv);
+ br_update_scrub_finish_time(&priv->scrub_stat, timestr, now);
gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_FINISH,
"Scrubbing %s at %s", sfx, timestr);
}
@@ -626,15 +626,13 @@ br_scrubber_log_time(xlator_t *this, const char *sfx)
static void
br_fsscanner_log_time(xlator_t *this, br_child_t *child, const char *sfx)
{
- char timestr[1024] = {
- 0,
- };
- struct timeval tv = {
+ char timestr[GF_TIMESTR_SIZE] = {
0,
};
+ time_t now = 0;
- gettimeofday(&tv, NULL);
- gf_time_fmt(timestr, sizeof(timestr), tv.tv_sec, gf_timefmt_FT);
+ now = gf_time();
+ gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT);
if (strcasecmp(sfx, "started") == 0) {
gf_msg_debug(this->name, 0, "Scrubbing \"%s\" %s at %s",
@@ -718,8 +716,10 @@ br_scrubber_exit_control(xlator_t *this)
if (scrub_monitor->state == BR_SCRUB_STATE_ACTIVE) {
(void)br_fsscan_activate(this);
} else {
+ UNLOCK(&scrub_monitor->lock);
gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
"Volume waiting to get rescheduled..");
+ return;
}
}
UNLOCK(&scrub_monitor->lock);
@@ -915,10 +915,7 @@ br_fsscan_schedule(xlator_t *this)
{
uint32_t timo = 0;
br_private_t *priv = NULL;
- struct timeval tv = {
- 0,
- };
- char timestr[1024] = {
+ char timestr[GF_TIMESTR_SIZE] = {
0,
};
struct br_scrubber *fsscrub = NULL;
@@ -929,8 +926,7 @@ br_fsscan_schedule(xlator_t *this)
fsscrub = &priv->fsscrub;
scrub_monitor = &priv->scrub_monitor;
- (void)gettimeofday(&tv, NULL);
- scrub_monitor->boot = tv.tv_sec;
+ scrub_monitor->boot = gf_time();
timo = br_fsscan_calculate_timeout(fsscrub->frequency);
if (timo == 0) {
@@ -971,12 +967,10 @@ int32_t
br_fsscan_activate(xlator_t *this)
{
uint32_t timo = 0;
- char timestr[1024] = {
- 0,
- };
- struct timeval now = {
+ char timestr[GF_TIMESTR_SIZE] = {
0,
};
+ time_t now = 0;
br_private_t *priv = NULL;
struct br_scrubber *fsscrub = NULL;
struct br_monitor *scrub_monitor = NULL;
@@ -985,7 +979,7 @@ br_fsscan_activate(xlator_t *this)
fsscrub = &priv->fsscrub;
scrub_monitor = &priv->scrub_monitor;
- (void)gettimeofday(&now, NULL);
+ now = gf_time();
timo = br_fsscan_calculate_timeout(fsscrub->frequency);
if (timo == 0) {
gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG,
@@ -999,7 +993,7 @@ br_fsscan_activate(xlator_t *this)
}
pthread_mutex_unlock(&scrub_monitor->donelock);
- gf_time_fmt(timestr, sizeof(timestr), (now.tv_sec + timo), gf_timefmt_FT);
+ gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT);
(void)gf_tw_mod_timer(priv->timer_wheel, scrub_monitor->timer, timo);
_br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING);
@@ -1016,12 +1010,10 @@ br_fsscan_reschedule(xlator_t *this)
{
int32_t ret = 0;
uint32_t timo = 0;
- char timestr[1024] = {
- 0,
- };
- struct timeval now = {
+ char timestr[GF_TIMESTR_SIZE] = {
0,
};
+ time_t now = 0;
br_private_t *priv = NULL;
struct br_scrubber *fsscrub = NULL;
struct br_monitor *scrub_monitor = NULL;
@@ -1033,7 +1025,7 @@ br_fsscan_reschedule(xlator_t *this)
if (!fsscrub->frequency_reconf)
return 0;
- (void)gettimeofday(&now, NULL);
+ now = gf_time();
timo = br_fsscan_calculate_timeout(fsscrub->frequency);
if (timo == 0) {
gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG,
@@ -1041,7 +1033,7 @@ br_fsscan_reschedule(xlator_t *this)
return -1;
}
- gf_time_fmt(timestr, sizeof(timestr), (now.tv_sec + timo), gf_timefmt_FT);
+ gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT);
pthread_mutex_lock(&scrub_monitor->donelock);
{
@@ -1069,23 +1061,19 @@ br_fsscan_ondemand(xlator_t *this)
{
int32_t ret = 0;
uint32_t timo = 0;
- char timestr[1024] = {
- 0,
- };
- struct timeval now = {
+ char timestr[GF_TIMESTR_SIZE] = {
0,
};
+ time_t now = 0;
br_private_t *priv = NULL;
struct br_monitor *scrub_monitor = NULL;
priv = this->private;
scrub_monitor = &priv->scrub_monitor;
- (void)gettimeofday(&now, NULL);
-
+ now = gf_time();
timo = BR_SCRUB_ONDEMAND;
-
- gf_time_fmt(timestr, sizeof(timestr), (now.tv_sec + timo), gf_timefmt_FT);
+ gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT);
pthread_mutex_lock(&scrub_monitor->donelock);
{
@@ -1543,9 +1531,11 @@ br_scrubber_log_option(xlator_t *this, br_private_t *priv,
[BR_SCRUB_THROTTLE_LAZY] = "lazy",
[BR_SCRUB_THROTTLE_NORMAL] = "normal",
[BR_SCRUB_THROTTLE_AGGRESSIVE] = "aggressive",
+ [BR_SCRUB_THROTTLE_STALLED] = "stalled",
};
char *scrub_freq_str[] = {
+ [0] = "",
[BR_FSSCRUB_FREQ_HOURLY] = "hourly",
[BR_FSSCRUB_FREQ_DAILY] = "daily",
[BR_FSSCRUB_FREQ_WEEKLY] = "weekly",
@@ -1558,6 +1548,8 @@ br_scrubber_log_option(xlator_t *this, br_private_t *priv,
return; /* logged as pause */
if (fsscrub->frequency_reconf || fsscrub->throttle_reconf) {
+ if (fsscrub->throttle == BR_SCRUB_THROTTLE_VOID)
+ return;
gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_TUNABLE,
"SCRUB TUNABLES:: [Frequency: %s, Throttle: %s]",
scrub_freq_str[fsscrub->frequency],
@@ -1649,7 +1641,7 @@ br_read_bad_object_dir(xlator_t *this, br_child_t *child, fd_t *fd,
int32_t ret = -1;
off_t offset = 0;
int32_t count = 0;
- char key[PATH_MAX] = {
+ char key[32] = {
0,
};
dict_t *out_dict = NULL;
@@ -1687,7 +1679,7 @@ br_read_bad_object_dir(xlator_t *this, br_child_t *child, fd_t *fd,
}
ret = count;
- ret = dict_set_int32(dict, "count", count);
+ ret = dict_set_int32_sizen(dict, "count", count);
out:
return ret;
@@ -1769,10 +1761,10 @@ br_collect_bad_objects_of_child(xlator_t *this, br_child_t *child, dict_t *dict,
{
int32_t ret = -1;
int32_t count = 0;
- char key[PATH_MAX] = {
+ char key[32] = {
0,
};
- char main_key[PATH_MAX] = {
+ char main_key[32] = {
0,
};
int32_t j = 0;
@@ -1784,15 +1776,15 @@ br_collect_bad_objects_of_child(xlator_t *this, br_child_t *child, dict_t *dict,
char *path = NULL;
int32_t len = 0;
- ret = dict_get_int32(child_dict, "count", &count);
+ ret = dict_get_int32_sizen(child_dict, "count", &count);
if (ret)
goto out;
tmp_count = total_count;
for (j = 0; j < count; j++) {
- snprintf(key, PATH_MAX, "quarantine-%d", j);
- ret = dict_get_str(child_dict, key, &entry);
+ len = snprintf(key, sizeof(key), "quarantine-%d", j);
+ ret = dict_get_strn(child_dict, key, len, &entry);
if (ret)
continue;
@@ -1802,7 +1794,7 @@ br_collect_bad_objects_of_child(xlator_t *this, br_child_t *child, dict_t *dict,
if ((len < 0) || (len >= PATH_MAX)) {
continue;
}
- snprintf(main_key, PATH_MAX, "quarantine-%d", tmp_count);
+ snprintf(main_key, sizeof(main_key), "quarantine-%d", tmp_count);
ret = dict_set_dynstr_with_alloc(dict, main_key, tmp);
if (!ret)
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h
index 7a3c14abb93..4e5f67bc021 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h
@@ -11,7 +11,7 @@
#ifndef __BIT_ROT_SCRUB_H__
#define __BIT_ROT_SCRUB_H__
-#include "xlator.h"
+#include <glusterfs/xlator.h>
#include "bit-rot.h"
void *
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h
index f3fbe2928b7..37b45a42eac 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h
@@ -11,7 +11,7 @@
#ifndef __BIT_ROT_SSM_H__
#define __BIT_ROT_SSM_H__
-#include "xlator.h"
+#include <glusterfs/xlator.h>
typedef enum br_scrub_state {
BR_SCRUB_STATE_INACTIVE = 0,
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
index 40932268c9b..a2f1c343a1d 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -9,12 +9,9 @@
*/
#include <ctype.h>
-#include <sys/uio.h>
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-#include "compat-errno.h"
+#include <glusterfs/logging.h>
+#include <glusterfs/compat-errno.h>
#include "bit-rot.h"
#include "bit-rot-scrub.h"
@@ -244,8 +241,8 @@ br_object_open(xlator_t *this, br_object_t *object, inode_t *inode,
ret = -EINVAL;
fd = fd_create(inode, 0);
if (!fd) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
- "failed to create fd for the inode %s", uuid_utoa(inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
+ "gfid=%s", uuid_utoa(inode->gfid), NULL);
goto out;
}
@@ -296,11 +293,11 @@ br_object_read_block_and_sign(xlator_t *this, fd_t *fd, br_child_t *child,
tbf = priv->tbf;
ret = syncop_readv(child->xl, fd, size, offset, 0, &iovec, &count, &iobref,
- NULL, NULL);
+ NULL, NULL, NULL);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, errno, BRB_MSG_READV_FAILED,
- "readv on %s failed", uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, errno, BRB_MSG_READV_FAILED,
+ "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
ret = -1;
goto out;
}
@@ -350,10 +347,9 @@ br_calculate_obj_checksum(unsigned char *md, br_child_t *child, fd_t *fd,
ret = br_object_read_block_and_sign(this, fd, child, offset, block,
&sha256);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_BLOCK_READ_FAILED,
- "reading block with "
- "offset %lu of object %s failed",
- offset, uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_BLOCK_READ_FAILED,
+ "offset=%" PRIu64, offset, "object-gfid=%s",
+ uuid_utoa(fd->inode->gfid), NULL);
break;
}
@@ -395,28 +391,23 @@ br_object_read_sign(inode_t *linked_inode, fd_t *fd, br_object_t *object,
md = GF_MALLOC(SHA256_DIGEST_LENGTH, gf_common_mt_char);
if (!md) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
- "failed to allocate memory for saving hash of the "
- "object %s",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_SAVING_HASH_FAILED,
+ "object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto out;
}
ret = br_object_checksum(md, object, fd, iatt);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_CALC_CHECKSUM_FAILED,
- "calculating checksum "
- "for the object %s failed",
- uuid_utoa(linked_inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_CALC_CHECKSUM_FAILED,
+ "object-gfid=%s", uuid_utoa(linked_inode->gfid), NULL);
goto free_signature;
}
sign = br_prepare_signature(md, SHA256_DIGEST_LENGTH,
BR_SIGNATURE_TYPE_SHA256, object);
if (!sign) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED,
- "failed to get the signature for the object %s",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED,
+ "object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto free_signature;
}
@@ -424,17 +415,16 @@ br_object_read_sign(inode_t *linked_inode, fd_t *fd, br_object_t *object,
signature_size(SHA256_DIGEST_LENGTH), _gf_true);
if (!xattr) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
- "dict allocation for signing failed for the object %s",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
+ "dict-allocation object-gfid=%s", uuid_utoa(fd->inode->gfid),
+ NULL);
goto free_isign;
}
ret = syncop_fsetxattr(object->child->xl, fd, xattr, 0, NULL, NULL);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
- "fsetxattr of signature to the object %s failed",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
+ "fsetxattr object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto unref_dict;
}
@@ -467,8 +457,8 @@ br_log_object(xlator_t *this, char *op, uuid_t gfid, int32_t op_errno)
"[reason: %s]",
op, uuid_utoa(gfid), strerror(op_errno));
} else {
- gf_msg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED,
- "%s() failed on object %s", op, uuid_utoa(gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED, "op=%s",
+ op, "gfid=%s", uuid_utoa(gfid), NULL);
}
}
@@ -482,8 +472,8 @@ br_log_object_path(xlator_t *this, char *op, const char *path, int32_t op_errno)
"[reason: %s]",
op, path, strerror(op_errno));
} else {
- gf_msg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED,
- "%s() failed on object %s", op, path);
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED, "op=%s",
+ op, "path=%s", path, NULL);
}
}
@@ -512,8 +502,8 @@ br_trigger_sign(xlator_t *this, br_child_t *child, inode_t *linked_inode,
ret = -1;
fd = fd_create(linked_inode, 0);
if (!fd) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
- "Failed to create fd [GFID %s]", uuid_utoa(linked_inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
+ "gfid=%s", uuid_utoa(linked_inode->gfid), NULL);
goto cleanup_dict;
}
@@ -537,9 +527,9 @@ cleanup_dict:
dict_unref(dict);
out:
if (ret) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_TRIGGER_SIGN,
- "Could not trigger signingd for %s (reopen hint: %d)",
- uuid_utoa(linked_inode->gfid), val);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_TRIGGER_SIGN_FAILED,
+ "gfid=%s", uuid_utoa(linked_inode->gfid), "reopen-hint-val=%d",
+ val, NULL);
}
}
@@ -619,10 +609,8 @@ br_sign_object(br_object_t *object)
ret = br_object_read_sign(linked_inode, fd, object, &iatt);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_READ_AND_SIGN_FAILED,
- "reading and signing of "
- "the object %s failed",
- uuid_utoa(linked_inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_READ_AND_SIGN_FAILED,
+ "gfid=%s", uuid_utoa(linked_inode->gfid), NULL);
goto unref_fd;
}
@@ -676,8 +664,8 @@ br_process_object(void *arg)
ret = br_sign_object(object);
if (ret && !br_object_sign_softerror(-ret))
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SIGN_FAILED,
- "SIGNING FAILURE [%s]", uuid_utoa(object->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
+ "gfid=%s", uuid_utoa(object->gfid), NULL);
GF_FREE(object);
}
@@ -779,9 +767,8 @@ br_schedule_object_reopen(xlator_t *this, br_object_t *object,
timer = br_initialize_timer(this, object, child, ev);
if (!timer)
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_TIMER_FAILED,
- "Failed to allocate object expiry timer [GFID: %s]",
- uuid_utoa(object->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_TIMER_FAILED,
+ "gfid=%s", uuid_utoa(object->gfid), NULL);
return timer ? 0 : -1;
}
@@ -828,15 +815,15 @@ br_brick_callback(void *xl, char *brick, void *data, changelog_event_t *ev)
child = br_get_child_from_brick_path(this, brick);
if (!child) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SUBVOL_FAILED,
- "failed to get the subvolume for the brick %s", brick);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SUBVOL_FAILED,
+ "brick=%s", brick, NULL);
goto out;
}
object = br_initialize_object(this, child, ev);
if (!object) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
- "failed to allocate object memory [GFID: %s]", uuid_utoa(gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
+ "object-gfid=%s", uuid_utoa(gfid), NULL);
goto out;
}
@@ -888,8 +875,8 @@ br_check_object_need_sign(xlator_t *this, dict_t *xattr, br_child_t *child)
ret = dict_get_ptr(xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void **)&sign);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED,
- "failed to get object signature info");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED,
+ "object-info", NULL);
goto out;
}
@@ -928,9 +915,9 @@ br_prepare_loc(xlator_t *this, br_child_t *child, loc_t *parent,
ret = inode_path(parent->inode, entry->d_name, (char **)&loc->path);
if (ret < 0 || !loc->path) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_PATH_FAILED,
- "inode_path on %s (parent: %s) failed", entry->d_name,
- uuid_utoa(parent->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_PATH_FAILED,
+ "inode_path=%s", entry->d_name, "parent-gfid=%s",
+ uuid_utoa(parent->inode->gfid), NULL);
goto out;
}
@@ -974,6 +961,7 @@ bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
int32_t ret = -1;
inode_t *linked_inode = NULL;
gf_boolean_t need_signing = _gf_false;
+ gf_boolean_t need_reopen = _gf_true;
GF_VALIDATE_OR_GOTO("bit-rot", subvol, out);
GF_VALIDATE_OR_GOTO("bit-rot", data, out);
@@ -1021,8 +1009,8 @@ bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
*/
if (bitd_is_bad_file(this, child, &loc, NULL)) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SKIP_OBJECT,
- "Entry [%s] is marked corrupted.. skipping.", loc.path);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SKIP_OBJECT, "path=%s",
+ loc.path, NULL);
goto unref_inode;
}
@@ -1039,23 +1027,32 @@ bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
if (op_errno == ENODATA && (iatt.ia_size != 0))
need_signing = _gf_true;
if (op_errno == EINVAL)
- gf_msg(this->name, GF_LOG_WARNING, 0,
- BRB_MSG_PARTIAL_VERSION_PRESENCE,
- "Partial "
- "version xattr presence detected, ignoring "
- "[GFID: %s]",
- uuid_utoa(linked_inode->gfid));
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ BRB_MSG_PARTIAL_VERSION_PRESENCE, "gfid=%s",
+ uuid_utoa(linked_inode->gfid), NULL);
} else {
need_signing = br_check_object_need_sign(this, xattr, child);
+
+ /*
+ * If we are here means, bitrot daemon has started. Is it just
+ * a simple restart of the daemon or is it started because the
+ * feature is enabled is something hard to determine. Hence,
+ * if need_signing is false (because bit-rot version and signature
+ * are present), then still go ahead and sign it.
+ */
+ if (!need_signing) {
+ need_signing = _gf_true;
+ need_reopen = _gf_true;
+ }
}
if (!need_signing)
goto unref_dict;
- gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_TRIGGER_SIGN,
- "Triggering signing for %s [GFID: %s | Brick: %s]", loc.path,
- uuid_utoa(linked_inode->gfid), child->brick_path);
- br_trigger_sign(this, child, linked_inode, &loc, _gf_true);
+ gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_TRIGGER_SIGN, "path=%s",
+ loc.path, "gfid=%s", uuid_utoa(linked_inode->gfid), "Brick-path=%s",
+ child->brick_path, NULL);
+ br_trigger_sign(this, child, linked_inode, &loc, need_reopen);
ret = 0;
@@ -1087,17 +1084,16 @@ br_oneshot_signer(void *arg)
THIS = this;
- gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_START,
- "Crawling brick [%s], scanning for unsigned objects",
- child->brick_path);
+ gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_START, "brick-path=%s",
+ child->brick_path, NULL);
loc.inode = child->table->root;
(void)syncop_ftw_throttle(child->xl, &loc, GF_CLIENT_PID_BITD, child,
bitd_oneshot_crawl, BR_CRAWL_THROTTLE_COUNT,
BR_CRAWL_THROTTLE_ZZZ);
- gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_FINISH,
- "Completed crawling brick [%s]", child->brick_path);
+ gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_FINISH,
+ "brick-path=%s", child->brick_path, NULL);
return NULL;
}
@@ -1141,9 +1137,7 @@ br_enact_signer(xlator_t *this, br_child_t *child, br_stub_init_t *stub)
ret = gf_changelog_register_generic(brick, 1, 1,
this->ctx->cmd_args.log_file, -1, this);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, errno, BRB_MSG_REGISTER_FAILED,
- "Register to changelog "
- "failed");
+ gf_smsg(this->name, GF_LOG_ERROR, errno, BRB_MSG_REGISTER_FAILED, NULL);
goto dealloc;
}
@@ -1151,8 +1145,8 @@ br_enact_signer(xlator_t *this, br_child_t *child, br_stub_init_t *stub)
ret = gf_thread_create(&child->thread, NULL, br_oneshot_signer, child,
"brosign");
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SPAWN_FAILED,
- "failed to spawn FS crawler thread");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SPAWN_FAILED,
+ "FS-crawler-thread", NULL);
else
child->threadrunning = 1;
@@ -1180,9 +1174,9 @@ br_launch_scrubber(xlator_t *this, br_child_t *child, struct br_scanfs *fsscan,
ret = gf_thread_create(&child->thread, NULL, br_fsscanner, child,
"brfsscan");
if (ret != 0) {
- gf_msg(this->name, GF_LOG_ALERT, 0, BRB_MSG_SPAWN_FAILED,
- "failed to spawn bitrot scrubber daemon [Brick: %s]",
- child->brick_path);
+ gf_smsg(this->name, GF_LOG_ALERT, 0, BRB_MSG_SPAWN_FAILED,
+ "bitrot-scrubber-daemon Brick-path=%s", child->brick_path,
+ NULL);
goto error_return;
}
@@ -1270,8 +1264,8 @@ br_child_enaction(xlator_t *this, br_child_t *child, br_stub_init_t *stub)
if (!ret) {
child->witnessed = 1;
_br_set_child_state(child, BR_CHILD_STATE_CONNECTED);
- gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_CONNECTED_TO_BRICK,
- "Connected to brick %s..", child->brick_path);
+ gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CONNECTED_TO_BRICK,
+ "brick-path=%s", child->brick_path, NULL);
}
}
pthread_mutex_unlock(&child->lock);
@@ -1318,8 +1312,8 @@ br_brick_connect(xlator_t *this, br_child_t *child)
if (ret) {
op_errno = -ret;
ret = -1;
- gf_msg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_LOOKUP_FAILED,
- "lookup on root failed");
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_LOOKUP_FAILED,
+ NULL);
goto wipeloc;
}
@@ -1328,15 +1322,14 @@ br_brick_connect(xlator_t *this, br_child_t *child)
if (ret) {
op_errno = -ret;
ret = -1;
- gf_msg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_GET_INFO_FAILED,
- "failed to get stub info");
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_GET_INFO_FAILED,
+ NULL);
goto wipeloc;
}
ret = dict_get_ptr(xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, (void **)&stub);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_INFO_FAILED,
- "failed to extract stub information");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_INFO_FAILED, NULL);
goto free_dict;
}
@@ -1406,11 +1399,10 @@ br_cleanup_scrubber(xlator_t *this, br_child_t *child)
*/
ret = gf_thread_cleanup_xint(child->thread);
if (ret)
- gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_THREAD_CLEANUP,
- "Error cleaning up scanner thread");
+ gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_THREAD_CLEANUP, NULL);
- gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUBBER_CLEANED,
- "Cleaned up scrubber for brick [%s]", child->brick_path);
+ gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUBBER_CLEANED,
+ "brick-path=%s", child->brick_path, NULL);
return 0;
}
@@ -1495,9 +1487,8 @@ br_handle_events(void *arg)
child = childev->child;
ret = childev->call(this, child);
if (ret)
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SUBVOL_CONNECT_FAILED,
- "callback handler for subvolume [%s] failed",
- child->xl->name);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SUBVOL_CONNECT_FAILED,
+ "name=%s", child->xl->name, NULL);
GF_FREE(childev);
}
@@ -1515,8 +1506,7 @@ mem_acct_init(xlator_t *this)
ret = xlator_mem_acct_init(this, gf_br_stub_mt_end + 1);
if (ret != 0) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_MEM_ACNT_FAILED,
- "Memory accounting init failed");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_MEM_ACNT_FAILED, NULL);
return ret;
}
@@ -1533,8 +1523,8 @@ _br_qchild_event(xlator_t *this, br_child_t *child, br_child_handler *call)
childev = GF_CALLOC(1, sizeof(*childev), gf_br_mt_br_child_event_t);
if (!childev) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
- "Event unhandled for child.. [Brick: %s]", child->xl->name);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_EVENT_UNHANDLED,
+ "Brick-name=%s", child->xl->name, NULL);
return;
}
@@ -1629,10 +1619,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
switch (event) {
case GF_EVENT_CHILD_UP:
if (idx < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL,
- "Got event %d from "
- "invalid subvolume",
- event);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL,
+ "event=%d", event, NULL);
goto out;
}
@@ -1660,9 +1648,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
case GF_EVENT_CHILD_DOWN:
if (idx < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- BRB_MSG_INVALID_SUBVOL_CHILD,
- "Got event %d from invalid subvolume", event);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL,
+ "event=%d", event, NULL);
goto out;
}
@@ -1703,11 +1690,9 @@ notify(xlator_t *this, int32_t event, void *data, ...)
"called");
if (scrub_monitor->state != BR_SCRUB_STATE_PENDING) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- BRB_MSG_RESCHEDULE_SCRUBBER_FAILED,
- "on demand scrub schedule failed. Scrubber is "
- "not in pending state. Current state is %d",
- scrub_monitor->state);
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ BRB_MSG_RESCHEDULE_SCRUBBER_FAILED, "Current-state=%d",
+ scrub_monitor->state, NULL);
return -2;
}
@@ -1719,11 +1704,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
pthread_mutex_unlock(&priv->lock);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- BRB_MSG_RESCHEDULE_SCRUBBER_FAILED,
- "Could not schedule ondemand scrubbing. "
- "Scrubbing will continue according to "
- "old frequency.");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ BRB_MSG_COULD_NOT_SCHEDULE_SCRUB, NULL);
}
gf_msg_debug(this->name, 0, "returning %d", ret);
break;
@@ -1735,22 +1717,26 @@ out:
return 0;
}
-/**
- * Initialize signer specific structures, spawn worker threads.
- */
-
static void
br_fini_signer(xlator_t *this, br_private_t *priv)
{
int i = 0;
- for (; i < BR_WORKERS; i++) {
+ if (priv == NULL)
+ return;
+
+ for (; i < priv->signer_th_count; i++) {
(void)gf_thread_cleanup_xint(priv->obj_queue->workers[i]);
}
+ GF_FREE(priv->obj_queue->workers);
pthread_cond_destroy(&priv->object_cond);
}
+/**
+ * Initialize signer specific structures, spawn worker threads.
+ */
+
static int32_t
br_init_signer(xlator_t *this, br_private_t *priv)
{
@@ -1770,13 +1756,17 @@ br_init_signer(xlator_t *this, br_private_t *priv)
goto cleanup_cond;
INIT_LIST_HEAD(&priv->obj_queue->objects);
- for (i = 0; i < BR_WORKERS; i++) {
+ priv->obj_queue->workers = GF_CALLOC(
+ priv->signer_th_count, sizeof(pthread_t), gf_br_mt_br_worker_t);
+ if (!priv->obj_queue->workers)
+ goto cleanup_obj_queue;
+
+ for (i = 0; i < priv->signer_th_count; i++) {
ret = gf_thread_create(&priv->obj_queue->workers[i], NULL,
br_process_object, this, "brpobj");
if (ret != 0) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SPAWN_FAILED,
- "thread creation"
- " failed");
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ BRB_MSG_THREAD_CREATION_FAILED, NULL);
ret = -1;
goto cleanup_threads;
}
@@ -1788,7 +1778,9 @@ cleanup_threads:
for (i--; i >= 0; i--) {
(void)gf_thread_cleanup_xint(priv->obj_queue->workers[i]);
}
+ GF_FREE(priv->obj_queue->workers);
+cleanup_obj_queue:
GF_FREE(priv->obj_queue);
cleanup_cond:
@@ -1841,18 +1833,17 @@ br_rate_limit_signer(xlator_t *this, int child_count, int numbricks)
if (contribution == 0)
contribution = 1;
spec.rate = BR_HASH_CALC_READ_SIZE * contribution;
- spec.maxlimit = BR_WORKERS * BR_HASH_CALC_READ_SIZE;
+ spec.maxlimit = priv->signer_th_count * BR_HASH_CALC_READ_SIZE;
#endif
if (!spec.rate)
- gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO,
- "[Rate Limit Info] \"FULL THROTTLE\"");
+ gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO,
+ "FULL THROTTLE", NULL);
else
- gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO,
- "[Rate Limit Info] \"tokens/sec (rate): %lu, "
- "maxlimit: %lu\"",
- spec.rate, spec.maxlimit);
+ gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO,
+ "tokens/sec-rate=%lu", spec.rate, "maxlimit=%lu", spec.maxlimit,
+ NULL);
priv->tbf = tbf_init(&spec, 1);
return priv->tbf ? 0 : -1;
@@ -1861,11 +1852,16 @@ br_rate_limit_signer(xlator_t *this, int child_count, int numbricks)
static int32_t
br_signer_handle_options(xlator_t *this, br_private_t *priv, dict_t *options)
{
- if (options)
+ if (options) {
GF_OPTION_RECONF("expiry-time", priv->expiry_time, options, uint32,
error_return);
- else
+ GF_OPTION_RECONF("signer-threads", priv->signer_th_count, options,
+ uint32, error_return);
+ } else {
GF_OPTION_INIT("expiry-time", priv->expiry_time, uint32, error_return);
+ GF_OPTION_INIT("signer-threads", priv->signer_th_count, uint32,
+ error_return);
+ }
return 0;
@@ -1881,6 +1877,8 @@ br_signer_init(xlator_t *this, br_private_t *priv)
GF_OPTION_INIT("expiry-time", priv->expiry_time, uint32, error_return);
GF_OPTION_INIT("brick-count", numbricks, int32, error_return);
+ GF_OPTION_INIT("signer-threads", priv->signer_th_count, uint32,
+ error_return);
ret = br_rate_limit_signer(this, priv->child_count, numbricks);
if (ret)
@@ -1967,8 +1965,8 @@ br_init_children(xlator_t *this, br_private_t *priv)
child->timer_pool = mem_pool_new(struct gf_tw_timer_list, 4096);
if (!child->timer_pool) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
- "failed to allocate mem-pool for timer");
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_MEM_POOL_ALLOC,
+ NULL);
errno = ENOMEM;
goto freechild;
}
@@ -1994,15 +1992,13 @@ init(xlator_t *this)
br_private_t *priv = NULL;
if (!this->children) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_NO_CHILD,
- "FATAL: no children");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_NO_CHILD, NULL);
goto out;
}
priv = GF_CALLOC(1, sizeof(*priv), gf_br_mt_br_private_t);
if (!priv) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
- "failed to allocate memory (->priv)");
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, NULL);
goto out;
}
@@ -2020,8 +2016,8 @@ init(xlator_t *this)
priv->timer_wheel = glusterfs_ctx_tw_get(this->ctx);
if (!priv->timer_wheel) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_TIMER_WHEEL_UNAVAILABLE,
- "global timer wheel unavailable");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_TIMER_WHEEL_UNAVAILABLE,
+ NULL);
goto cleanup;
}
@@ -2043,15 +2039,14 @@ init(xlator_t *this)
ret = gf_thread_create(&priv->thread, NULL, br_handle_events, this,
"brhevent");
if (ret != 0) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SPAWN_FAILED,
- "thread creation failed");
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_THREAD_CREATION_FAILED,
+ NULL);
ret = -1;
}
if (!ret) {
- gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_BITROT_LOADED,
- "bit-rot xlator loaded in \"%s\" mode",
- (priv->iamscrubber) ? "SCRUBBER" : "SIGNER");
+ gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_BITROT_LOADED, "mode=%s",
+ (priv->iamscrubber) ? "SCRUBBER" : "SIGNER", NULL);
return 0;
}
@@ -2098,9 +2093,8 @@ br_reconfigure_monitor(xlator_t *this)
ret = br_scrub_state_machine(this, _gf_false);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_RESCHEDULE_SCRUBBER_FAILED,
- "Could not reschedule scrubber for the volume. Scrubbing "
- "will continue according to old frequency.");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_COULD_NOT_SCHEDULE_SCRUB,
+ NULL);
}
}
@@ -2211,5 +2205,28 @@ struct volume_options options[] = {
.description = "Pause/Resume scrub. Upon resume, scrubber "
"continues from where it left off.",
},
+ {
+ .key = {"signer-threads"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = BR_WORKERS,
+ .op_version = {GD_OP_VERSION_8_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .description = "Number of signing process threads. As a best "
+ "practice, set this to the number of processor cores",
+ },
{.key = {NULL}},
};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "bit-rot",
+ .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
index 962b4d717e6..8ac7dcdac3d 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -11,17 +11,17 @@
#ifndef __BIT_ROT_H__
#define __BIT_ROT_H__
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "defaults.h"
-#include "syncop.h"
-#include "syncop-utils.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
#include "changelog.h"
#include "timer-wheel.h"
-#include "throttle-tbf.h"
+#include <glusterfs/throttle-tbf.h>
#include "bit-rot-ssm.h"
#include "bit-rot-common.h"
@@ -30,12 +30,6 @@
#include <openssl/sha.h>
-/**
- * TODO: make this configurable. As a best practice, set this to the
- * number of processor cores.
- */
-#define BR_WORKERS 4
-
typedef enum scrub_throttle {
BR_SCRUB_THROTTLE_VOID = -1,
BR_SCRUB_THROTTLE_LAZY = 0,
@@ -108,12 +102,12 @@ struct br_child {
typedef struct br_child br_child_t;
struct br_obj_n_workers {
- struct list_head objects; /* queue of objects expired from the
- timer wheel and ready to be picked
- up for signing */
- pthread_t workers[BR_WORKERS]; /* Threads which pick up the objects
- from the above queue and start
- signing each object */
+ struct list_head objects; /* queue of objects expired from the
+ timer wheel and ready to be picked
+ up for signing */
+ pthread_t *workers; /* Threads which pick up the objects
+ from the above queue and start
+ signing each object */
};
struct br_scrubber {
@@ -209,6 +203,8 @@ struct br_private {
uint32_t expiry_time; /* objects "wait" time */
+ uint32_t signer_th_count; /* Number of signing process threads */
+
tbf_t *tbf; /* token bucket filter */
gf_boolean_t iamscrubber; /* function as a fs scrubber */
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h
index ef683ac7f9f..20561aa7764 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-common.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h
@@ -11,7 +11,7 @@
#ifndef __BIT_ROT_COMMON_H__
#define __BIT_ROT_COMMON_H__
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "bit-rot-object-version.h"
#define BR_VXATTR_VERSION (1 << 0)
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c b/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c
index cb567297b60..8ac13a09941 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c
@@ -133,8 +133,8 @@ br_stub_add(xlator_t *this, uuid_t gfid)
* show up less number of objects. That's fine as we'll have
* the log files that will have the missing information.
*/
- gf_msg(this->name, GF_LOG_WARNING, errno, BRS_MSG_LINK_FAIL,
- "failed to record gfid [%s]", uuid_utoa(gfid));
+ gf_smsg(this->name, GF_LOG_WARNING, errno, BRS_MSG_LINK_FAIL, "gfid=%s",
+ uuid_utoa(gfid), NULL);
}
return 0;
@@ -157,10 +157,8 @@ br_stub_del(xlator_t *this, uuid_t gfid)
uuid_utoa(gfid));
ret = sys_unlink(gfid_path);
if (ret && (errno != ENOENT)) {
- gf_msg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJ_UNLINK_FAIL,
- "%s: failed to delete bad object link from quarantine "
- "directory",
- gfid_path);
+ gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJ_UNLINK_FAIL,
+ "path=%s", gfid_path, NULL);
ret = -errno;
goto out;
}
@@ -200,13 +198,13 @@ br_stub_check_stub_directory(xlator_t *this, char *fullpath)
}
if (ret)
- gf_msg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
- "failed to create stub directory [%s]", fullpath);
+ gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
+ "create-path=%s", fullpath, NULL);
return ret;
error_return:
- gf_msg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
- "Failed to verify stub directory [%s]", fullpath);
+ gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
+ "verify-path=%s", fullpath, NULL);
return -1;
}
@@ -231,8 +229,8 @@ br_stub_check_stub_file(xlator_t *this, char *path)
goto error_return;
fd = sys_creat(path, 0);
if (fd < 0)
- gf_msg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
- "Failed to create stub file [%s]", path);
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ BRS_MSG_BAD_OBJECT_DIR_FAIL, "create-path=%s", path, NULL);
}
if (fd >= 0) {
@@ -243,8 +241,8 @@ br_stub_check_stub_file(xlator_t *this, char *path)
return ret;
error_return:
- gf_msg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
- "Failed to verify stub file [%s]", path);
+ gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
+ "verify-path=%s", path, NULL);
return -1;
}
@@ -463,12 +461,9 @@ br_stub_fill_readdir(fd_t *fd, br_stub_fd_t *fctx, DIR *dir, off_t off,
seekdir(dir, off);
#ifndef GF_LINUX_HOST_OS
if ((u_long)telldir(dir) != off && off != fctx->bad_object.dir_eof) {
- gf_msg(THIS->name, GF_LOG_ERROR, 0,
- BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL,
- "seekdir(0x%llx) failed on dir=%p: "
- "Invalid argument (offset reused from "
- "another DIR * structure?)",
- off, dir);
+ gf_smsg(THIS->name, GF_LOG_ERROR, 0,
+ BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, "off=(0x%llx)", off,
+ "dir=%p", dir, NULL);
errno = EINVAL;
count = -1;
goto out;
@@ -480,9 +475,9 @@ br_stub_fill_readdir(fd_t *fd, br_stub_fd_t *fctx, DIR *dir, off_t off,
in_case = (u_long)telldir(dir);
if (in_case == -1) {
- gf_msg(THIS->name, GF_LOG_ERROR, 0,
- BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL,
- "telldir failed on dir=%p: %s", dir, strerror(errno));
+ gf_smsg(THIS->name, GF_LOG_ERROR, 0,
+ BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL, "dir=%p", dir, "err=%s",
+ strerror(errno), NULL);
goto out;
}
@@ -490,9 +485,9 @@ br_stub_fill_readdir(fd_t *fd, br_stub_fd_t *fctx, DIR *dir, off_t off,
entry = sys_readdir(dir, scratch);
if (!entry || errno != 0) {
if (errno == EBADF) {
- gf_msg(THIS->name, GF_LOG_WARNING, 0,
- BRS_MSG_BAD_OBJECT_DIR_READ_FAIL,
- "readdir failed on dir=%p: %s", dir, strerror(errno));
+ gf_smsg(THIS->name, GF_LOG_WARNING, 0,
+ BRS_MSG_BAD_OBJECT_DIR_READ_FAIL, "dir=%p", dir,
+ "err=%s", strerror(errno), NULL);
goto out;
}
break;
@@ -514,12 +509,9 @@ br_stub_fill_readdir(fd_t *fd, br_stub_fd_t *fctx, DIR *dir, off_t off,
#ifndef GF_LINUX_HOST_OS
if ((u_long)telldir(dir) != in_case &&
in_case != fctx->bad_object.dir_eof) {
- gf_msg(THIS->name, GF_LOG_ERROR, 0,
- BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL,
- "seekdir(0x%llx) failed on dir=%p: "
- "Invalid argument (offset reused from "
- "another DIR * structure?)",
- in_case, dir);
+ gf_smsg(THIS->name, GF_LOG_ERROR, 0,
+ BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, "in_case=(0x%llx)",
+ in_case, "dir=%p", dir, NULL);
errno = EINVAL;
count = -1;
goto out;
@@ -531,9 +523,9 @@ br_stub_fill_readdir(fd_t *fd, br_stub_fd_t *fctx, DIR *dir, off_t off,
this_entry = gf_dirent_for_name(entry->d_name);
if (!this_entry) {
- gf_msg(THIS->name, GF_LOG_ERROR, 0, BRS_MSG_NO_MEMORY,
- "could not create gf_dirent for entry %s: (%s)",
- entry->d_name, strerror(errno));
+ gf_smsg(THIS->name, GF_LOG_ERROR, 0,
+ BRS_MSG_CREATE_GF_DIRENT_FAILED, "entry-name=%s",
+ entry->d_name, "err=%s", strerror(errno), NULL);
goto out;
}
/*
@@ -580,8 +572,8 @@ br_stub_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd,
fctx = br_stub_fd_ctx_get(this, fd);
if (!fctx) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_GET_FD_CONTEXT_FAILED,
- "pfd is NULL, fd=%p", fd);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_GET_FD_CONTEXT_FAILED,
+ "fd=%p", fd, NULL);
op_errno = -ret;
goto done;
}
@@ -589,8 +581,8 @@ br_stub_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd,
dir = fctx->bad_object.dir;
if (!dir) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_BAD_HANDLE_DIR_NULL,
- "dir is NULL for fd=%p", fd);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_BAD_HANDLE_DIR_NULL,
+ "fd=%p", fd, NULL);
op_errno = EINVAL;
goto done;
}
@@ -680,10 +672,7 @@ br_stub_bad_objects_path(xlator_t *this, fd_t *fd, gf_dirent_t *entries,
* be shown.
*/
if (!tmp_dict) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NO_MEMORY,
- "failed to allocate new dict for saving the paths "
- "of the corrupted objects. Scrub status will only "
- "display the gfid");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_FAILED, NULL);
goto out;
}
}
@@ -707,9 +696,8 @@ br_stub_bad_objects_path(xlator_t *this, fd_t *fd, gf_dirent_t *entries,
uuid_utoa(gfid), hpath);
br_stub_entry_xattr_fill(this, hpath, entry, tmp_dict);
} else
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED,
- "failed to get the path for the inode %s",
- uuid_utoa_r(gfid, str_gfid));
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED,
+ "gfid=%s", uuid_utoa_r(gfid, str_gfid), NULL);
inode = NULL;
hpath = NULL;
@@ -744,10 +732,8 @@ br_stub_get_path_of_gfid(xlator_t *this, inode_t *parent, inode_t *inode,
ret = syncop_gfid_to_path_hard(parent->table, FIRST_CHILD(this), gfid,
inode, path, _gf_true);
if (ret < 0)
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED,
- "failed to get the path xattr from disk for the "
- " gfid %s. Trying to get path from the memory",
- uuid_utoa_r(gfid, gfid_str));
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED,
+ "gfid=%s", uuid_utoa_r(gfid, gfid_str), NULL);
/*
* Try with soft resolution of path if hard resolve fails. Because
@@ -768,9 +754,8 @@ br_stub_get_path_of_gfid(xlator_t *this, inode_t *parent, inode_t *inode,
ret = syncop_gfid_to_path_hard(parent->table, FIRST_CHILD(this), gfid,
inode, path, _gf_false);
if (ret < 0)
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED,
- "failed to get the path from the memory for gfid %s",
- uuid_utoa_r(gfid, gfid_str));
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED,
+ "from-memory gfid=%s", uuid_utoa_r(gfid, gfid_str), NULL);
}
out:
@@ -804,10 +789,8 @@ br_stub_entry_xattr_fill(xlator_t *this, char *hpath, gf_dirent_t *entry,
ret = dict_set_dynstr(dict, entry->d_name, hpath);
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_DICT_SET_FAILED,
- "failed to set the actual path %s as the value in the "
- "dict for the corrupted object %s",
- hpath, entry->d_name);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_DICT_SET_FAILED,
+ "path=%s", hpath, "object-name=%s", entry->d_name, NULL);
out:
return;
}
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
index a3e7b03291e..9d93caf069f 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
@@ -11,7 +11,7 @@
#ifndef _BR_MEM_TYPES_H
#define _BR_MEM_TYPES_H
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum br_mem_types {
gf_br_stub_mt_private_t = gf_common_mt_end + 1,
@@ -29,6 +29,7 @@ enum br_mem_types {
gf_br_stub_mt_sigstub_t,
gf_br_mt_br_child_event_t,
gf_br_stub_mt_misc,
+ gf_br_mt_br_worker_t,
gf_br_stub_mt_end,
};
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h
index cccc3b9c599..6c15a166f18 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h
@@ -11,7 +11,7 @@
#ifndef _BITROT_STUB_MESSAGES_H_
#define _BITROT_STUB_MESSAGES_H_
-#include "glfs-message-id.h"
+#include <glusterfs/glfs-message-id.h>
/* To add new message IDs, append new identifiers at the end of the list.
*
@@ -39,6 +39,79 @@ GLFS_MSGID(BITROT_STUB, BRS_MSG_NO_MEMORY, BRS_MSG_SET_EVENT_FAILED,
BRS_MSG_BAD_HANDLE_DIR_NULL, BRS_MSG_BAD_OBJ_THREAD_FAIL,
BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL, BRS_MSG_LINK_FAIL,
BRS_MSG_BAD_OBJ_UNLINK_FAIL, BRS_MSG_DICT_SET_FAILED,
- BRS_MSG_PATH_GET_FAILED);
+ BRS_MSG_PATH_GET_FAILED, BRS_MSG_NULL_LOCAL,
+ BRS_MSG_SPAWN_SIGN_THRD_FAILED, BRS_MSG_KILL_SIGN_THREAD,
+ BRS_MSG_NON_BITD_PID, BRS_MSG_SIGN_PREPARE_FAIL,
+ BRS_MSG_USING_DEFAULT_THREAD_SIZE, BRS_MSG_ALLOC_MEM_FAILED,
+ BRS_MSG_DICT_ALLOC_FAILED, BRS_MSG_CREATE_GF_DIRENT_FAILED,
+ BRS_MSG_ALLOC_FAILED, BRS_MSG_PATH_XATTR_GET_FAILED,
+ BRS_MSG_VERSION_PREPARE_FAIL);
+#define BRS_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed"
+#define BRS_MSG_BAD_OBJ_THREAD_FAIL_STR "pthread_init failed"
+#define BRS_MSG_USING_DEFAULT_THREAD_SIZE_STR "Using default thread stack size"
+#define BRS_MSG_NO_CHILD_STR "FATAL: no children"
+#define BRS_MSG_SPAWN_SIGN_THRD_FAILED_STR \
+ "failed to create the new thread for signer"
+#define BRS_MSG_BAD_CONTAINER_FAIL_STR \
+ "failed to launch the thread for storing bad gfids"
+#define BRS_MSG_CANCEL_SIGN_THREAD_FAILED_STR \
+ "Could not cancel sign serializer thread"
+#define BRS_MSG_KILL_SIGN_THREAD_STR "killed the signer thread"
+#define BRS_MSG_GET_INODE_CONTEXT_FAILED_STR \
+ "failed to init the inode context for the inode"
+#define BRS_MSG_ADD_FD_TO_INODE_STR "failed to add fd to the inode"
+#define BRS_MSG_NO_MEMORY_STR "local allocation failed"
+#define BRS_MSG_BAD_OBJECT_ACCESS_STR "bad object accessed. Returning"
+#define BRS_MSG_SIGN_VERSION_ERROR_STR "Signing version exceeds current version"
+#define BRS_MSG_NON_BITD_PID_STR \
+ "PID from where signature request came, does not belong to bit-rot " \
+ "daemon. Unwinding the fop"
+#define BRS_MSG_SIGN_PREPARE_FAIL_STR \
+ "failed to prepare the signature. Unwinding the fop"
+#define BRS_MSG_VERSION_PREPARE_FAIL_STR \
+ "failed to prepare the version. Unwinding the fop"
+#define BRS_MSG_STUB_ALLOC_FAILED_STR "failed to allocate stub fop, Unwinding"
+#define BRS_MSG_BAD_OBJ_MARK_FAIL_STR "failed to mark object as bad"
+#define BRS_MSG_NON_SCRUB_BAD_OBJ_MARK_STR \
+ "bad object marking is not from the scrubber"
+#define BRS_MSG_ALLOC_MEM_FAILED_STR "failed to allocate memory"
+#define BRS_MSG_SET_INTERNAL_XATTR_STR "called on the internal xattr"
+#define BRS_MSG_REMOVE_INTERNAL_XATTR_STR "removexattr called on internal xattr"
+#define BRS_MSG_CREATE_ANONYMOUS_FD_FAILED_STR \
+ "failed to create anonymous fd for the inode"
+#define BRS_MSG_ADD_FD_TO_LIST_FAILED_STR "failed add fd to the list"
+#define BRS_MSG_SET_FD_CONTEXT_FAILED_STR \
+ "failed to set the fd context for the file"
+#define BRS_MSG_NULL_LOCAL_STR "local is NULL"
+#define BRS_MSG_DICT_ALLOC_FAILED_STR \
+ "dict allocation failed: cannot send IPC FOP to changelog"
+#define BRS_MSG_SET_EVENT_FAILED_STR "cannot set release event in dict"
+#define BRS_MSG_CREATE_FRAME_FAILED_STR "create_frame() failure"
+#define BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL_STR "closedir error"
+#define BRS_MSG_LINK_FAIL_STR "failed to record gfid"
+#define BRS_MSG_BAD_OBJ_UNLINK_FAIL_STR \
+ "failed to delete bad object link from quaratine directory"
+#define BRS_MSG_BAD_OBJECT_DIR_FAIL_STR "failed stub directory"
+#define BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL_STR \
+ "seekdir failed. Invalid argument (offset reused from another DIR * " \
+ "structure)"
+#define BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL_STR "telldir failed on dir"
+#define BRS_MSG_BAD_OBJECT_DIR_READ_FAIL_STR "readdir failed on dir"
+#define BRS_MSG_CREATE_GF_DIRENT_FAILED_STR "could not create gf_dirent"
+#define BRS_MSG_GET_FD_CONTEXT_FAILED_STR "pfd is NULL"
+#define BRS_MSG_BAD_HANDLE_DIR_NULL_STR "dir if NULL"
+#define BRS_MSG_ALLOC_FAILED_STR \
+ "failed to allocate new dict for saving the paths of the corrupted " \
+ "objects. Scrub status will only display the gfid"
+#define BRS_MSG_PATH_GET_FAILED_STR "failed to get the path"
+#define BRS_MSG_PATH_XATTR_GET_FAILED_STR \
+ "failed to get the path xattr from disk for the gfid. Trying to get path " \
+ "from the memory"
+#define BRS_MSG_DICT_SET_FAILED_STR \
+ "failed to set the actual path as the value in the dict for the " \
+ "corrupted object"
+#define BRS_MSG_SET_CONTEXT_FAILED_STR \
+ "could not set fd context for release callback"
+#define BRS_MSG_CHANGE_VERSION_FAILED_STR "change version failed"
#endif /* !_BITROT_STUB_MESSAGES_H_ */
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
index 2f5cc2b18dd..447dd47ff41 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
@@ -12,12 +12,11 @@
#include <sys/uio.h>
#include <signal.h>
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
#include "changelog.h"
-#include "compat-errno.h"
-#include "call-stub.h"
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/call-stub.h>
#include "bit-rot-stub.h"
#include "bit-rot-stub-mem-types.h"
@@ -26,6 +25,15 @@
#define BR_STUB_REQUEST_COOKIE 0x1
+void
+br_stub_lock_cleaner(void *arg)
+{
+ pthread_mutex_t *clean_mutex = arg;
+
+ pthread_mutex_unlock(clean_mutex);
+ return;
+}
+
void *
br_stub_signth(void *);
@@ -48,8 +56,7 @@ mem_acct_init(xlator_t *this)
ret = xlator_mem_acct_init(this, gf_br_stub_mt_end + 1);
if (ret != 0) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_MEM_ACNT_FAILED,
- "Memory accounting init failed");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_MEM_ACNT_FAILED, NULL);
return ret;
}
@@ -64,29 +71,29 @@ br_stub_bad_object_container_init(xlator_t *this, br_stub_private_t *priv)
ret = pthread_cond_init(&priv->container.bad_cond, NULL);
if (ret != 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
- "pthread_cond_init failed (%d)", ret);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+ "cond_init ret=%d", ret, NULL);
goto out;
}
ret = pthread_mutex_init(&priv->container.bad_lock, NULL);
if (ret != 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
- "pthread_mutex_init failed (%d)", ret);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+ "mutex_init ret=%d", ret, NULL);
goto cleanup_cond;
}
ret = pthread_attr_init(&w_attr);
if (ret != 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
- "pthread_attr_init failed (%d)", ret);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+ "attr_init ret=%d", ret, NULL);
goto cleanup_lock;
}
ret = pthread_attr_setstacksize(&w_attr, BAD_OBJECT_THREAD_STACK_SIZE);
if (ret == EINVAL) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
- "Using default thread stack size");
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ BRS_MSG_USING_DEFAULT_THREAD_SIZE, NULL);
}
INIT_LIST_HEAD(&priv->container.bad_queue);
@@ -122,8 +129,7 @@ init(xlator_t *this)
br_stub_private_t *priv = NULL;
if (!this->children) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NO_CHILD,
- "FATAL: no children");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NO_CHILD, NULL);
goto error_return;
}
@@ -161,16 +167,20 @@ init(xlator_t *this)
* assigned inside the thread. So setting this->private here.
*/
this->private = priv;
+ if (!priv->do_versioning)
+ return 0;
ret = gf_thread_create(&priv->signth, NULL, br_stub_signth, this,
"brssign");
- if (ret != 0)
+ if (ret != 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SPAWN_SIGN_THRD_FAILED,
+ NULL);
goto cleanup_lock;
+ }
ret = br_stub_bad_object_container_init(this, priv);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_CONTAINER_FAIL,
- "failed to launch the thread for storing bad gfids");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_CONTAINER_FAIL, NULL);
goto cleanup_lock;
}
@@ -183,6 +193,7 @@ cleanup_lock:
pthread_mutex_destroy(&priv->lock);
free_mempool:
mem_pool_destroy(priv->local_pool);
+ priv->local_pool = NULL;
free_priv:
GF_FREE(priv);
this->private = NULL;
@@ -211,10 +222,62 @@ reconfigure(xlator_t *this, dict_t *options)
priv = this->private;
- GF_OPTION_RECONF("bitrot", priv->do_versioning, options, bool, out);
+ GF_OPTION_RECONF("bitrot", priv->do_versioning, options, bool, err);
+ if (priv->do_versioning && !priv->signth) {
+ ret = gf_thread_create(&priv->signth, NULL, br_stub_signth, this,
+ "brssign");
+ if (ret != 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ BRS_MSG_SPAWN_SIGN_THRD_FAILED, NULL);
+ goto err;
+ }
+
+ ret = br_stub_bad_object_container_init(this, priv);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_CONTAINER_FAIL,
+ NULL);
+ goto err;
+ }
+ } else {
+ if (priv->signth) {
+ if (gf_thread_cleanup_xint(priv->signth)) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+ } else {
+ gf_smsg(this->name, GF_LOG_INFO, 0, BRS_MSG_KILL_SIGN_THREAD,
+ NULL);
+ priv->signth = 0;
+ }
+ }
+
+ if (priv->container.thread) {
+ if (gf_thread_cleanup_xint(priv->container.thread)) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+ }
+ priv->container.thread = 0;
+ }
+ }
ret = 0;
-out:
+ return ret;
+err:
+ if (priv->signth) {
+ if (gf_thread_cleanup_xint(priv->signth)) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+ }
+ priv->signth = 0;
+ }
+
+ if (priv->container.thread) {
+ if (gf_thread_cleanup_xint(priv->container.thread)) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+ }
+ priv->container.thread = 0;
+ }
+ ret = -1;
return ret;
}
@@ -245,10 +308,13 @@ fini(xlator_t *this)
if (!priv)
return;
+ if (!priv->do_versioning)
+ goto cleanup;
+
ret = gf_thread_cleanup_xint(priv->signth);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CANCEL_SIGN_THREAD_FAILED,
- "Could not cancel sign serializer thread");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CANCEL_SIGN_THREAD_FAILED,
+ NULL);
goto out;
}
priv->signth = 0;
@@ -262,13 +328,10 @@ fini(xlator_t *this)
GF_FREE(sigstub);
}
- pthread_mutex_destroy(&priv->lock);
- pthread_cond_destroy(&priv->cond);
-
ret = gf_thread_cleanup_xint(priv->container.thread);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CANCEL_SIGN_THREAD_FAILED,
- "Could not cancel sign serializer thread");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CANCEL_SIGN_THREAD_FAILED,
+ NULL);
goto out;
}
@@ -280,14 +343,18 @@ fini(xlator_t *this)
call_stub_destroy(stub);
}
+ pthread_mutex_destroy(&priv->container.bad_lock);
+ pthread_cond_destroy(&priv->container.bad_cond);
+
+cleanup:
+ pthread_mutex_destroy(&priv->lock);
+ pthread_cond_destroy(&priv->cond);
+
if (priv->local_pool) {
mem_pool_destroy(priv->local_pool);
priv->local_pool = NULL;
}
- pthread_mutex_destroy(&priv->container.bad_lock);
- pthread_cond_destroy(&priv->container.bad_cond);
-
this->private = NULL;
GF_FREE(priv);
@@ -357,8 +424,8 @@ br_stub_prepare_version_request(xlator_t *this, dict_t *dict,
priv = this->private;
br_set_ongoingversion(obuf, oversion, priv->boot);
- return dict_set_static_bin(dict, BITROT_CURRENT_VERSION_KEY, (void *)obuf,
- sizeof(br_version_t));
+ return dict_set_bin(dict, BITROT_CURRENT_VERSION_KEY, (void *)obuf,
+ sizeof(br_version_t));
}
static int
@@ -369,8 +436,7 @@ br_stub_prepare_signing_request(dict_t *dict, br_signature_t *sbuf,
br_set_signature(sbuf, sign, signaturelen, &size);
- return dict_set_static_bin(dict, BITROT_SIGNING_VERSION_KEY, (void *)sbuf,
- size);
+ return dict_set_bin(dict, BITROT_SIGNING_VERSION_KEY, (void *)sbuf, size);
}
/**
@@ -410,7 +476,7 @@ br_stub_init_inode_versions(xlator_t *this, fd_t *fd, inode_t *inode,
goto free_ctx;
if (ctx_addr)
- *ctx_addr = (uint64_t)ctx;
+ *ctx_addr = (uint64_t)(uintptr_t)ctx;
return 0;
free_ctx:
@@ -510,11 +576,9 @@ br_stub_need_versioning(xlator_t *this, fd_t *fd, gf_boolean_t *versioning,
ret = br_stub_init_inode_versions(this, fd, fd->inode, version,
_gf_true, _gf_false, &ctx_addr);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- BRS_MSG_GET_INODE_CONTEXT_FAILED,
- "failed to "
- " init the inode context for the inode %s",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ BRS_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+ uuid_utoa(fd->inode->gfid), NULL);
goto error_return;
}
}
@@ -548,10 +612,8 @@ br_stub_anon_fd_ctx(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
if (!br_stub_fd) {
ret = br_stub_add_fd_to_inode(this, fd, ctx);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ADD_FD_TO_INODE,
- "failed to add fd to "
- "the inode (gfid: %s)",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ADD_FD_TO_INODE,
+ "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto out;
}
}
@@ -571,9 +633,8 @@ br_stub_versioning_prep(call_frame_t *frame, xlator_t *this, fd_t *fd,
local = br_stub_alloc_local(this);
if (!local) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRS_MSG_NO_MEMORY,
- "local allocation failed (gfid: %s)",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRS_MSG_NO_MEMORY, "gfid=%s",
+ uuid_utoa(fd->inode->gfid), NULL);
goto error_return;
}
@@ -643,8 +704,8 @@ br_stub_check_bad_object(xlator_t *this, inode_t *inode, int32_t *op_ret,
ret = br_stub_is_bad_object(this, inode);
if (ret == -2) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJECT_ACCESS,
- "%s is a bad object. Returning", uuid_utoa(inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJECT_ACCESS,
+ "gfid=%s", uuid_utoa(inode->gfid), NULL);
*op_ret = -1;
*op_errno = EIO;
}
@@ -653,9 +714,9 @@ br_stub_check_bad_object(xlator_t *this, inode_t *inode, int32_t *op_ret,
ret = br_stub_init_inode_versions(this, NULL, inode, version, _gf_true,
_gf_false, NULL);
if (ret) {
- gf_msg(
- this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
- "failed to init inode context for %s", uuid_utoa(inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ BRS_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+ uuid_utoa(inode->gfid), NULL);
*op_ret = -1;
*op_errno = EINVAL;
}
@@ -792,23 +853,27 @@ br_stub_perform_incversioning(xlator_t *this, call_frame_t *frame,
op_errno = ENOMEM;
dict = dict_new();
if (!dict)
- goto done;
+ goto out;
ret = br_stub_alloc_versions(&obuf, NULL, 0);
- if (ret)
- goto dealloc_dict;
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED,
+ "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+ goto out;
+ }
ret = br_stub_prepare_version_request(this, dict, obuf, writeback_version);
- if (ret)
- goto dealloc_versions;
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_VERSION_PREPARE_FAIL,
+ "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+ br_stub_dealloc_versions(obuf);
+ goto out;
+ }
ret = br_stub_fd_versioning(
this, frame, stub, dict, fd, br_stub_fd_incversioning_cbk,
writeback_version, BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE);
-
-dealloc_versions:
- br_stub_dealloc_versions(obuf);
-dealloc_dict:
- dict_unref(dict);
-done:
+out:
+ if (dict)
+ dict_unref(dict);
if (ret) {
if (local)
frame->local = NULL;
@@ -846,6 +911,24 @@ br_stub_signth(void *arg)
THIS = this;
while (1) {
+ /*
+ * Disabling bit-rot feature leads to this particular thread
+ * getting cleaned up by reconfigure via a call to the function
+ * gf_thread_cleanup_xint (which in turn calls pthread_cancel
+ * and pthread_join). But, if this thread had held the mutex
+ * &priv->lock at the time of cancellation, then it leads to
+ * deadlock in future when bit-rot feature is enabled (which
+ * again spawns this thread which cant hold the lock as the
+ * mutex is still held by the previous instance of the thread
+ * which got killed). Also, the br_stub_handle_object_signature
+ * function which is called whenever file has to be signed
+ * also gets blocked as it too attempts to acquire &priv->lock.
+ *
+ * So, arrange for the lock to be unlocked as part of the
+ * cleanup of this thread using pthread_cleanup_push and
+ * pthread_cleanup_pop.
+ */
+ pthread_cleanup_push(br_stub_lock_cleaner, &priv->lock);
pthread_mutex_lock(&priv->lock);
{
while (list_empty(&priv->squeue))
@@ -856,6 +939,7 @@ br_stub_signth(void *arg)
list_del_init(&sigstub->list);
}
pthread_mutex_unlock(&priv->lock);
+ pthread_cleanup_pop(0);
call_resume(sigstub->stub);
@@ -931,10 +1015,9 @@ br_stub_compare_sign_version(xlator_t *this, inode_t *inode,
if (invalid) {
ret = -1;
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SIGN_VERSION_ERROR,
- "Signing version exceeds "
- "current version [%lu > %lu]",
- sbuf->signedversion, ctx->currentversion);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SIGN_VERSION_ERROR,
+ "Signing-ver=%lu", sbuf->signedversion, "current-ver=%lu",
+ ctx->currentversion, NULL);
}
out:
@@ -945,31 +1028,36 @@ static int
br_stub_prepare_signature(xlator_t *this, dict_t *dict, inode_t *inode,
br_isignature_t *sign, int *fakesuccess)
{
- int32_t ret = 0;
+ int32_t ret = -1;
size_t signaturelen = 0;
br_signature_t *sbuf = NULL;
if (!br_is_signature_type_valid(sign->signaturetype))
- goto error_return;
+ goto out;
signaturelen = sign->signaturelen;
ret = br_stub_alloc_versions(NULL, &sbuf, signaturelen);
- if (ret)
- goto error_return;
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED,
+ "gfid=%s", uuid_utoa(inode->gfid), NULL);
+ ret = -1;
+ goto out;
+ }
ret = br_stub_prepare_signing_request(dict, sbuf, sign, signaturelen);
- if (ret)
- goto dealloc_versions;
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SIGN_PREPARE_FAIL,
+ "gfid=%s", uuid_utoa(inode->gfid), NULL);
+ ret = -1;
+ br_stub_dealloc_versions(sbuf);
+ goto out;
+ }
+ /* At this point sbuf has been added to dict, so the memory will be freed
+ * when the data from the dict is destroyed
+ */
ret = br_stub_compare_sign_version(this, inode, sbuf, dict, fakesuccess);
- if (ret)
- goto dealloc_versions;
-
- return 0;
-
-dealloc_versions:
- br_stub_dealloc_versions(sbuf);
-error_return:
- return -1;
+out:
+ return ret;
}
static void
@@ -986,12 +1074,18 @@ br_stub_handle_object_signature(call_frame_t *frame, xlator_t *this, fd_t *fd,
priv = this->private;
- if (frame->root->pid != GF_CLIENT_PID_BITD)
+ if (frame->root->pid != GF_CLIENT_PID_BITD) {
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno, BRS_MSG_NON_BITD_PID,
+ "PID=%d", frame->root->pid, NULL);
goto dofop;
+ }
ret = br_stub_prepare_signature(this, dict, fd->inode, sign, &fakesuccess);
- if (ret)
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SIGN_PREPARE_FAIL,
+ "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto dofop;
+ }
if (fakesuccess) {
op_ret = op_errno = 0;
goto dofop;
@@ -1141,10 +1235,8 @@ br_stub_handle_object_reopen(call_frame_t *frame, xlator_t *this, fd_t *fd,
stub = fop_fsetxattr_cbk_stub(frame, br_stub_fsetxattr_resume, 0, 0, NULL);
if (!stub) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
- "failed to allocate stub for fsetxattr fop (gfid: %s),"
- " unwinding",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
+ "fsetxattr gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto cleanup_local;
}
@@ -1198,9 +1290,8 @@ br_stub_fsetxattr_bad_object_cbk(call_frame_t *frame, void *cookie,
*/
ret = br_stub_mark_object_bad(this, local->u.context.inode);
if (ret)
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_MARK_FAIL,
- "failed to mark object %s as bad",
- uuid_utoa(local->u.context.inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_MARK_FAIL,
+ "gfid=%s", uuid_utoa(local->u.context.inode->gfid), NULL);
ret = br_stub_add(this, local->u.context.inode->gfid);
@@ -1220,18 +1311,15 @@ br_stub_handle_bad_object_key(call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t op_errno = EINVAL;
if (frame->root->pid != GF_CLIENT_PID_SCRUB) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NON_SCRUB_BAD_OBJ_MARK,
- "bad object marking "
- "on %s is not from the scrubber",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NON_SCRUB_BAD_OBJ_MARK,
+ "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto unwind;
}
local = br_stub_alloc_local(this);
if (!local) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NO_MEMORY,
- "failed to allocate memory for fsetxattr on %s",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED,
+ "fsetxattr gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
op_ret = -1;
op_errno = ENOMEM;
goto unwind;
@@ -1270,10 +1358,9 @@ br_stub_handle_internal_xattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t op_ret = -1;
int32_t op_errno = EINVAL;
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_INTERNAL_XATTR,
- "setxattr called"
- " on the internal xattr %s for inode %s",
- key, uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_INTERNAL_XATTR,
+ "setxattr key=%s", key, "inode-gfid=%s", uuid_utoa(fd->inode->gfid),
+ NULL);
STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL);
return 0;
@@ -1291,10 +1378,8 @@ br_stub_dump_xattr(xlator_t *this, dict_t *dict, int *op_errno)
goto out;
}
dict_dump_to_str(dict, dump, BR_STUB_DUMP_STR_SIZE, format);
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_INTERNAL_XATTR,
- "fsetxattr called on "
- "internal xattr %s",
- dump);
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_INTERNAL_XATTR,
+ "fsetxattr dump=%s", dump, NULL);
out:
if (dump) {
GF_FREE(dump);
@@ -1331,6 +1416,8 @@ br_stub_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
/* object signature request */
ret = dict_get_bin(dict, GLUSTERFS_SET_OBJECT_SIGNATURE, (void **)&sign);
if (!ret) {
+ gf_msg_debug(this->name, 0, "got SIGNATURE request on %s",
+ uuid_utoa(fd->inode->gfid));
br_stub_handle_object_signature(frame, this, fd, dict, sign, xdata);
goto done;
}
@@ -1423,10 +1510,8 @@ br_stub_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
if (!strcmp(BITROT_OBJECT_BAD_KEY, name) ||
!strcmp(BITROT_SIGNING_VERSION_KEY, name) ||
!strcmp(BITROT_CURRENT_VERSION_KEY, name)) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_REMOVE_INTERNAL_XATTR,
- "removexattr called"
- " on internal xattr %s for file %s",
- name, loc->path);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_REMOVE_INTERNAL_XATTR,
+ "name=%s", name, "file-path=%s", loc->path, NULL);
goto unwind;
}
@@ -1448,10 +1533,9 @@ br_stub_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
if (!strcmp(BITROT_OBJECT_BAD_KEY, name) ||
!strcmp(BITROT_SIGNING_VERSION_KEY, name) ||
!strcmp(BITROT_CURRENT_VERSION_KEY, name)) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_REMOVE_INTERNAL_XATTR,
- "removexattr called"
- " on internal xattr %s for inode %s",
- name, uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_REMOVE_INTERNAL_XATTR,
+ "name=%s", name, "inode-gfid=%s", uuid_utoa(fd->inode->gfid),
+ NULL);
goto unwind;
}
@@ -1476,7 +1560,7 @@ br_stub_listxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret < 0)
goto unwind;
- br_stub_remove_vxattrs(xattr);
+ br_stub_remove_vxattrs(xattr, _gf_true);
unwind:
STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, xdata);
@@ -1537,10 +1621,8 @@ br_stub_is_object_stale(xlator_t *this, call_frame_t *frame, inode_t *inode,
ret = br_stub_get_inode_ctx(this, inode, &ctx_addr);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
- "failed to get the "
- "inode context for %s",
- uuid_utoa(inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
+ "gfid=%s", uuid_utoa(inode->gfid), NULL);
goto out;
}
@@ -1588,6 +1670,11 @@ br_stub_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
frame->local = NULL;
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto unwind;
+ }
inode = local->u.context.inode;
op_ret = -1;
@@ -1650,14 +1737,12 @@ br_stub_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
op_ret = totallen;
delkeys:
- br_stub_remove_vxattrs(xattr);
+ br_stub_remove_vxattrs(xattr, _gf_true);
unwind:
STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, xdata);
- if (local) {
- br_stub_cleanup_local(local);
- br_stub_dealloc_local(local);
- }
+ br_stub_cleanup_local(local);
+ br_stub_dealloc_local(local);
return 0;
}
@@ -1708,9 +1793,7 @@ br_stub_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *name, dict_t *xdata)
{
void *cookie = NULL;
- uuid_t rootgfid = {
- 0,
- };
+ static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
fop_getxattr_cbk_t cbk = br_stub_getxattr_cbk;
int32_t op_ret = -1;
int32_t op_errno = EINVAL;
@@ -1722,8 +1805,6 @@ br_stub_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
GF_VALIDATE_OR_GOTO(this->name, this->private, unwind);
GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind);
- rootgfid[15] = 1;
-
if (!name) {
cbk = br_stub_listxattr_cbk;
goto wind;
@@ -1793,16 +1874,13 @@ br_stub_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
const char *name, dict_t *xdata)
{
void *cookie = NULL;
- uuid_t rootgfid = {
- 0,
- };
+ static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
fop_fgetxattr_cbk_t cbk = br_stub_getxattr_cbk;
int32_t op_ret = -1;
int32_t op_errno = EINVAL;
br_stub_local_t *local = NULL;
br_stub_private_t *priv = NULL;
- rootgfid[15] = 1;
priv = this->private;
if (!name) {
@@ -2022,10 +2100,8 @@ br_stub_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
offset, flags, iobref, xdata);
if (!stub) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
- "failed to allocate stub for write fop (gfid: %s), "
- "unwinding",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
+ "write gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto cleanup_local;
}
@@ -2138,10 +2214,8 @@ br_stub_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
stub = fop_ftruncate_stub(frame, br_stub_ftruncate_resume, fd, offset,
xdata);
if (!stub) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
- "failed to allocate stub for ftruncate fop (gfid: %s),"
- " unwinding",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
+ "ftruncate gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto cleanup_local;
}
@@ -2245,10 +2319,8 @@ br_stub_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
fd = fd_anonymous(loc->inode);
if (!fd) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CREATE_ANONYMOUS_FD_FAILED,
- "failed to create "
- "anonymous fd for the inode %s",
- uuid_utoa(loc->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CREATE_ANONYMOUS_FD_FAILED,
+ "inode-gfid=%s", uuid_utoa(loc->inode->gfid), NULL);
goto unwind;
}
@@ -2278,10 +2350,8 @@ br_stub_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
stub = fop_truncate_stub(frame, br_stub_truncate_resume, loc, offset,
xdata);
if (!stub) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
- "failed to allocate stub for truncate fop (gfid: %s), "
- "unwinding",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
+ "truncate gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto cleanup_local;
}
@@ -2354,11 +2424,9 @@ br_stub_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
ret = br_stub_init_inode_versions(this, fd, fd->inode, version,
_gf_true, _gf_false, &ctx_addr);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- BRS_MSG_GET_INODE_CONTEXT_FAILED,
- "failed to init the inode context for "
- "the file %s (gfid: %s)",
- loc->path, uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ BRS_MSG_GET_INODE_CONTEXT_FAILED, "path=%s", loc->path,
+ "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto unwind;
}
}
@@ -2377,9 +2445,8 @@ br_stub_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
ret = br_stub_add_fd_to_inode(this, fd, ctx);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ADD_FD_TO_LIST_FAILED,
- "failed add fd to the list (gfid: %s)",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ADD_FD_TO_LIST_FAILED,
+ "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto unwind;
}
@@ -2410,10 +2477,8 @@ br_stub_add_fd_to_inode(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
ret = br_stub_require_release_call(this, fd, &br_stub_fd);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_FD_CONTEXT_FAILED,
- "failed to set the fd "
- "context for the file (gfid: %s)",
- uuid_utoa(fd->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_FD_CONTEXT_FAILED,
+ "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
goto out;
}
@@ -2700,17 +2765,37 @@ br_stub_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (!IA_ISREG(entry->d_stat.ia_type))
continue;
+ /*
+ * Readdirp for most part is a bulk lookup for all the entries
+ * present in the directory being read. Ideally, for each
+ * entry, the handling should be similar to that of a lookup
+ * callback. But for now, just keeping this as it has been
+ * until now (which means, this comment has been added much
+ * later as part of a change that wanted to send the flag
+ * of true/false to br_stub_remove_vxattrs to indicate whether
+ * the bad-object xattr should be removed from the entry->dict
+ * or not). Until this change, the function br_stub_remove_vxattrs
+ * was just removing all the xattrs associated with bit-rot-stub
+ * (like version, bad-object, signature etc). But, there are
+ * scenarios where we only want to send bad-object xattr and not
+ * others. So this comment is part of that change which also
+ * mentions about another possible change that might be needed
+ * in future.
+ * But for now, adding _gf_true means functionally its same as
+ * what this function was doing before. Just remove all the stub
+ * related xattrs.
+ */
ret = br_stub_get_inode_ctx(this, entry->inode, &ctxaddr);
if (ret < 0)
ctxaddr = 0;
if (ctxaddr) { /* already has the context */
- br_stub_remove_vxattrs(entry->dict);
+ br_stub_remove_vxattrs(entry->dict, _gf_true);
continue;
}
ret = br_stub_lookup_version(this, entry->inode->gfid, entry->inode,
entry->dict);
- br_stub_remove_vxattrs(entry->dict);
+ br_stub_remove_vxattrs(entry->dict, _gf_true);
if (ret) {
/**
* there's no per-file granularity support in case of
@@ -2846,13 +2931,22 @@ br_stub_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t ret = 0;
br_stub_private_t *priv = NULL;
gf_boolean_t ver_enabled = _gf_false;
+ gf_boolean_t remove_bad_file_marker = _gf_true;
BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled);
priv = this->private;
if (op_ret < 0) {
(void)br_stub_handle_lookup_error(this, inode, op_errno);
- goto unwind;
+
+ /*
+ * If the lookup error is not ENOENT, then it is better
+ * to send the bad file marker to the higher layer (if
+ * it has been set)
+ */
+ if (op_errno != ENOENT)
+ remove_bad_file_marker = _gf_false;
+ goto delkey;
}
BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), delkey);
@@ -2873,7 +2967,13 @@ br_stub_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (ret) {
op_ret = -1;
op_errno = EIO;
- goto unwind;
+ /*
+ * This flag ensures that in the label @delkey below,
+ * bad file marker is not removed from the dictinary,
+ * but other virtual xattrs (such as version, signature)
+ * are removed.
+ */
+ remove_bad_file_marker = _gf_false;
}
goto delkey;
}
@@ -2897,11 +2997,11 @@ br_stub_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
*/
op_ret = -1;
op_errno = EIO;
- goto unwind;
+ goto delkey;
}
delkey:
- br_stub_remove_vxattrs(xattr);
+ br_stub_remove_vxattrs(xattr, remove_bad_file_marker);
unwind:
STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
postparent);
@@ -3086,6 +3186,10 @@ br_stub_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret < 0)
goto unwind;
+ if (!local) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_NULL_LOCAL, NULL);
+ goto unwind;
+ }
inode = local->u.context.inode;
if (!IA_ISREG(inode->ia_type))
goto unwind;
@@ -3101,9 +3205,8 @@ br_stub_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
* has to be removed manually. Its not a good idea to fail
* the fop, as the object has already been deleted.
*/
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
- "failed to get the context for the inode %s",
- uuid_utoa(inode->gfid));
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
+ "inode-gfid=%s", uuid_utoa(inode->gfid), NULL);
goto unwind;
}
@@ -3146,9 +3249,9 @@ br_stub_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag,
if (!local) {
op_ret = -1;
op_errno = ENOMEM;
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRS_MSG_NO_MEMORY,
- "failed to allocate memory for local (path: %s, gfid: %s)",
- loc->path, uuid_utoa(loc->inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRS_MSG_ALLOC_MEM_FAILED,
+ "local path=%s", loc->path, "gfid=%s",
+ uuid_utoa(loc->inode->gfid), NULL);
goto unwind;
}
@@ -3223,23 +3326,21 @@ br_stub_send_ipc_fop(xlator_t *this, fd_t *fd, unsigned long releaseversion,
xdata = dict_new();
if (!xdata) {
- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, BRS_MSG_NO_MEMORY,
- "dict allocation failed: cannot send IPC FOP "
- "to changelog");
+ gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, BRS_MSG_DICT_ALLOC_FAILED,
+ NULL);
goto out;
}
ret = dict_set_static_bin(xdata, "RELEASE-EVENT", &ev, CHANGELOG_EV_SIZE);
if (ret) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_EVENT_FAILED,
- "cannot set release event in dict");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_EVENT_FAILED, NULL);
goto dealloc_dict;
}
frame = create_frame(this, this->ctx->pool);
if (!frame) {
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_CREATE_FRAME_FAILED,
- "create_frame() failure");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_CREATE_FRAME_FAILED,
+ NULL);
goto dealloc_dict;
}
@@ -3374,8 +3475,8 @@ br_stub_releasedir(xlator_t *this, fd_t *fd)
if (fctx->bad_object.dir) {
ret = sys_closedir(fctx->bad_object.dir);
if (ret)
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL,
- "closedir error: %s", strerror(errno));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL,
+ "error=%s", strerror(errno), NULL);
}
GF_FREE(fctx);
@@ -3403,14 +3504,14 @@ br_stub_ictxmerge(xlator_t *this, fd_t *fd, inode_t *inode,
ret = br_stub_get_inode_ctx(this, inode, &ctxaddr);
if (ret < 0)
goto done;
- ctx = (br_stub_inode_ctx_t *)ctxaddr;
+ ctx = (br_stub_inode_ctx_t *)(uintptr_t)ctxaddr;
LOCK(&linked_inode->lock);
{
ret = __br_stub_get_inode_ctx(this, linked_inode, &lctxaddr);
if (ret < 0)
goto unblock;
- lctx = (br_stub_inode_ctx_t *)lctxaddr;
+ lctx = (br_stub_inode_ctx_t *)(uintptr_t)lctxaddr;
GF_ASSERT(list_is_singular(&ctx->fd_list));
br_stub_fd = list_first_entry(&ctx->fd_list, br_stub_fd_t, list);
@@ -3473,3 +3574,17 @@ struct volume_options options[] = {
.default_value = "{{ brick.path }}"},
{.key = {NULL}},
};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "bitrot-stub",
+ .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
index a15667e323a..edd79a77e4f 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
@@ -10,20 +10,20 @@
#ifndef __BIT_ROT_STUB_H__
#define __BIT_ROT_STUB_H__
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "defaults.h"
-#include "call-stub.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/call-stub.h>
#include "bit-rot-stub-mem-types.h"
-#include "syscall.h"
-#include "common-utils.h"
+#include <glusterfs/syscall.h>
+#include <glusterfs/common-utils.h>
#include "bit-rot-common.h"
#include "bit-rot-stub-messages.h"
#include "glusterfs3-xdr.h"
-#include "syncop.h"
-#include "syncop-utils.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
#define BAD_OBJECT_THREAD_STACK_SIZE ((size_t)(1024 * 1024))
#define BR_STUB_DUMP_STR_SIZE 65536
@@ -222,8 +222,8 @@ br_stub_require_release_call(xlator_t *this, fd_t *fd, br_stub_fd_t **fd_ctx)
ret = br_stub_fd_ctx_set(this, fd, br_stub_fd);
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_CONTEXT_FAILED,
- "could not set fd context (for release callback");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_CONTEXT_FAILED,
+ NULL);
else
*fd_ctx = br_stub_fd;
@@ -255,7 +255,7 @@ br_stub_get_inode_ctx(xlator_t *this, inode_t *inode, uint64_t *ctx)
static inline int
br_stub_set_inode_ctx(xlator_t *this, inode_t *inode, br_stub_inode_ctx_t *ctx)
{
- uint64_t ctx_addr = (uint64_t)ctx;
+ uint64_t ctx_addr = (uint64_t)(uintptr_t)ctx;
return inode_ctx_set(inode, this, &ctx_addr);
}
@@ -273,10 +273,9 @@ __br_stub_set_ongoing_version(br_stub_inode_ctx_t *ctx, unsigned long version)
if (ctx->currentversion < version)
ctx->currentversion = version;
else
- gf_msg("bit-rot-stub", GF_LOG_WARNING, 0, BRS_MSG_CHANGE_VERSION_FAILED,
- "current version: %lu"
- "new version: %lu",
- ctx->currentversion, version);
+ gf_smsg("bit-rot-stub", GF_LOG_WARNING, 0,
+ BRS_MSG_CHANGE_VERSION_FAILED, "current version=%lu",
+ ctx->currentversion, "new version=%lu", version, NULL);
}
static inline int
@@ -359,10 +358,18 @@ br_stub_is_internal_xattr(const char *name)
}
static inline void
-br_stub_remove_vxattrs(dict_t *xattr)
+br_stub_remove_vxattrs(dict_t *xattr, gf_boolean_t remove_bad_marker)
{
if (xattr) {
- dict_del(xattr, BITROT_OBJECT_BAD_KEY);
+ /*
+ * When a file is corrupted, bad-object should be
+ * set in the dict. But, other info such as version,
+ * signature etc should not be set. Hence the flag
+ * remove_bad_marker. The consumer should know whether
+ * to send the bad-object info in the dict or not.
+ */
+ if (remove_bad_marker)
+ dict_del(xattr, BITROT_OBJECT_BAD_KEY);
dict_del(xattr, BITROT_CURRENT_VERSION_KEY);
dict_del(xattr, BITROT_SIGNING_VERSION_KEY);
dict_del(xattr, BITROT_SIGNING_XATTR_SIZE_KEY);
@@ -390,9 +397,8 @@ br_stub_is_bad_object(xlator_t *this, inode_t *inode)
ret = br_stub_get_inode_ctx(this, inode, &ctx_addr);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
- "failed to get the inode context for the inode %s",
- uuid_utoa(inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
+ "inode-gfid=%s", uuid_utoa(inode->gfid), NULL);
bad_object = -1;
goto out;
}
@@ -420,10 +426,8 @@ br_stub_mark_object_bad(xlator_t *this, inode_t *inode)
ret = br_stub_get_inode_ctx(this, inode, &ctx_addr);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
- "failed to get the "
- "inode context for the inode %s",
- uuid_utoa(inode->gfid));
+ gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
+ "inode-gfid=%s", uuid_utoa(inode->gfid), NULL);
goto out;
}
diff --git a/xlators/features/changelog/lib/examples/python/changes.py b/xlators/features/changelog/lib/examples/python/changes.py
index c410d3b000d..c410d3b000d 100644..100755
--- a/xlators/features/changelog/lib/examples/python/changes.py
+++ b/xlators/features/changelog/lib/examples/python/changes.py
diff --git a/xlators/features/changelog/lib/examples/python/libgfchangelog.py b/xlators/features/changelog/lib/examples/python/libgfchangelog.py
index 2cdbf1152b9..2da9f2d2a8c 100644
--- a/xlators/features/changelog/lib/examples/python/libgfchangelog.py
+++ b/xlators/features/changelog/lib/examples/python/libgfchangelog.py
@@ -1,8 +1,10 @@
import os
from ctypes import *
+from ctypes.util import find_library
class Changes(object):
- libgfc = CDLL("libgfchangelog.so", mode=RTLD_GLOBAL, use_errno=True)
+ libgfc = CDLL(find_library("gfchangelog"), mode=RTLD_GLOBAL,
+ use_errno=True)
@classmethod
def geterrno(cls):
diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am
index c4b9a3df692..c933ec53ed2 100644
--- a/xlators/features/changelog/lib/src/Makefile.am
+++ b/xlators/features/changelog/lib/src/Makefile.am
@@ -1,7 +1,7 @@
libgfchangelog_la_CFLAGS = -Wall $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) \
-DDATADIR=\"$(localstatedir)\"
-libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -fpic \
+libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -D__USE_LARGEFILE64 -fpic \
-I../../../src/ -I$(top_srcdir)/libglusterfs/src \
-I$(top_srcdir)/xlators/features/changelog/src \
-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
diff --git a/xlators/features/changelog/lib/src/changelog-lib-messages.h b/xlators/features/changelog/lib/src/changelog-lib-messages.h
index 32b3497d89d..d7fe7274353 100644
--- a/xlators/features/changelog/lib/src/changelog-lib-messages.h
+++ b/xlators/features/changelog/lib/src/changelog-lib-messages.h
@@ -11,7 +11,7 @@
#ifndef _CHANGELOG_LIB_MESSAGES_H_
#define _CHANGELOG_LIB_MESSAGES_H_
-#include "glfs-message-id.h"
+#include <glusterfs/glfs-message-id.h>
/* To add new message IDs, append new identifiers at the end of the list.
*
@@ -34,7 +34,7 @@ GLFS_MSGID(
CHANGELOG_LIB_MSG_MMAP_FAILED, CHANGELOG_LIB_MSG_MUNMAP_FAILED,
CHANGELOG_LIB_MSG_ASCII_ERROR, CHANGELOG_LIB_MSG_STAT_FAILED,
CHANGELOG_LIB_MSG_GET_XATTR_FAILED, CHANGELOG_LIB_MSG_PUBLISH_ERROR,
- CHANGELOG_LIB_MSG_PARSE_ERROR, CHANGELOG_LIB_MSG_TOTAL_LOG_INFO,
+ CHANGELOG_LIB_MSG_PARSE_ERROR, CHANGELOG_LIB_MSG_MIN_MAX_INFO,
CHANGELOG_LIB_MSG_CLEANUP_ERROR, CHANGELOG_LIB_MSG_UNLINK_FAILED,
CHANGELOG_LIB_MSG_NOTIFY_REGISTER_FAILED,
CHANGELOG_LIB_MSG_INVOKE_RPC_FAILED, CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO,
@@ -43,6 +43,32 @@ GLFS_MSGID(
CHANGELOG_LIB_MSG_NOTIFY_REGISTER_INFO,
CHANGELOG_LIB_MSG_THREAD_CLEANUP_WARNING,
CHANGELOG_LIB_MSG_COPY_FROM_BUFFER_FAILED,
- CHANGELOG_LIB_MSG_PTHREAD_JOIN_FAILED, CHANGELOG_LIB_MSG_HIST_FAILED);
+ CHANGELOG_LIB_MSG_PTHREAD_JOIN_FAILED, CHANGELOG_LIB_MSG_HIST_FAILED,
+ CHANGELOG_LIB_MSG_DRAINED_EVENT_INFO, CHANGELOG_LIB_MSG_PARSE_ERROR_CEASED,
+ CHANGELOG_LIB_MSG_REQUESTING_INFO, CHANGELOG_LIB_MSG_FINAL_INFO);
+
+#define CHANGELOG_LIB_MSG_NOTIFY_REGISTER_INFO_STR "Registering brick"
+#define CHANGELOG_LIB_MSG_RENAME_FAILED_STR "error moving changelog file"
+#define CHANGELOG_LIB_MSG_OPEN_FAILED_STR "cannot open changelog file"
+#define CHANGELOG_LIB_MSG_UNLINK_FAILED_STR "failed to unlink"
+#define CHANGELOG_LIB_MSG_FAILED_TO_RMDIR_STR "failed to rmdir"
+#define CHANGELOG_LIB_MSG_STAT_FAILED_STR "stat failed on changelog file"
+#define CHANGELOG_LIB_MSG_PARSE_ERROR_STR "could not parse changelog"
+#define CHANGELOG_LIB_MSG_PARSE_ERROR_CEASED_STR \
+ "parsing error, ceased publishing..."
+#define CHANGELOG_LIB_MSG_HTIME_ERROR_STR "fop failed on htime file"
+#define CHANGELOG_LIB_MSG_GET_XATTR_FAILED_STR \
+ "error extracting max timstamp from htime file"
+#define CHANGELOG_LIB_MSG_MIN_MAX_INFO_STR "changelogs min max"
+#define CHANGELOG_LIB_MSG_REQUESTING_INFO_STR "Requesting historical changelogs"
+#define CHANGELOG_LIB_MSG_FINAL_INFO_STR "FINAL"
+#define CHANGELOG_LIB_MSG_HIST_FAILED_STR \
+ "Requested changelog range is not available"
+#define CHANGELOG_LIB_MSG_GET_TIME_ERROR_STR "wrong result"
+#define CHANGELOG_LIB_MSG_CLEANING_BRICK_ENTRY_INFO_STR \
+ "Cleaning brick entry for brick"
+#define CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO_STR "Draining event"
+#define CHANGELOG_LIB_MSG_DRAINED_EVENT_INFO_STR "Drained event"
+#define CHANGELOG_LIB_MSG_FREEING_ENTRY_INFO_STR "freeing entry"
#endif /* !_CHANGELOG_MESSAGES_H_ */
diff --git a/xlators/features/changelog/lib/src/gf-changelog-api.c b/xlators/features/changelog/lib/src/gf-changelog-api.c
index 1b6e932596d..81a5cbfec10 100644
--- a/xlators/features/changelog/lib/src/gf-changelog-api.c
+++ b/xlators/features/changelog/lib/src/gf-changelog-api.c
@@ -8,10 +8,10 @@
cases as published by the Free Software Foundation.
*/
-#include "compat-uuid.h"
-#include "globals.h"
-#include "glusterfs.h"
-#include "syscall.h"
+#include <glusterfs/compat-uuid.h>
+#include <glusterfs/globals.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/syscall.h>
#include "gf-changelog-helpers.h"
#include "gf-changelog-journal.h"
@@ -56,8 +56,8 @@ gf_changelog_done(char *file)
ret = sys_rename(buffer, to_path);
if (ret) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_LIB_MSG_RENAME_FAILED, "cannot move changelog file",
- "from=%s", file, "to=%s", to_path, NULL);
+ CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", file, "to=%s",
+ to_path, NULL);
goto out;
}
diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.c b/xlators/features/changelog/lib/src/gf-changelog-helpers.c
index fd15ec68ab8..75f8a6dfc08 100644
--- a/xlators/features/changelog/lib/src/gf-changelog-helpers.c
+++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.c
@@ -11,13 +11,7 @@
#include "changelog-mem-types.h"
#include "gf-changelog-helpers.h"
#include "changelog-lib-messages.h"
-#include "syscall.h"
-
-ssize_t
-gf_changelog_read_path(int fd, char *buffer, size_t bufsize)
-{
- return sys_read(fd, buffer, bufsize);
-}
+#include <glusterfs/syscall.h>
size_t
gf_changelog_write(int fd, char *buffer, size_t len)
@@ -64,20 +58,7 @@ gf_rfc3986_encode_space_newline(unsigned char *s, char *enc, char *estr)
* made a part of libglusterfs.
*/
-static pthread_key_t rl_key;
-static pthread_once_t rl_once = PTHREAD_ONCE_INIT;
-
-static void
-readline_destructor(void *ptr)
-{
- GF_FREE(ptr);
-}
-
-static void
-readline_once(void)
-{
- pthread_key_create(&rl_key, readline_destructor);
-}
+static __thread read_line_t thread_tsd = {};
static ssize_t
my_read(read_line_t *tsd, int fd, char *ptr)
@@ -97,27 +78,6 @@ my_read(read_line_t *tsd, int fd, char *ptr)
return 1;
}
-static int
-gf_readline_init_once(read_line_t **tsd)
-{
- if (pthread_once(&rl_once, readline_once) != 0)
- return -1;
-
- *tsd = pthread_getspecific(rl_key);
- if (*tsd)
- goto out;
-
- *tsd = GF_CALLOC(1, sizeof(**tsd), gf_changelog_mt_libgfchangelog_rl_t);
- if (!*tsd)
- return -1;
-
- if (pthread_setspecific(rl_key, *tsd) != 0)
- return -1;
-
-out:
- return 0;
-}
-
ssize_t
gf_readline(int fd, void *vptr, size_t maxlen)
{
@@ -125,10 +85,7 @@ gf_readline(int fd, void *vptr, size_t maxlen)
size_t rc = 0;
char c = ' ';
char *ptr = NULL;
- read_line_t *tsd = NULL;
-
- if (gf_readline_init_once(&tsd))
- return -1;
+ read_line_t *tsd = &thread_tsd;
ptr = vptr;
for (n = 1; n < maxlen; n++) {
@@ -151,10 +108,7 @@ off_t
gf_lseek(int fd, off_t offset, int whence)
{
off_t off = 0;
- read_line_t *tsd = NULL;
-
- if (gf_readline_init_once(&tsd))
- return -1;
+ read_line_t *tsd = &thread_tsd;
off = sys_lseek(fd, offset, whence);
if (off == -1)
@@ -169,10 +123,7 @@ gf_lseek(int fd, off_t offset, int whence)
int
gf_ftruncate(int fd, off_t length)
{
- read_line_t *tsd = NULL;
-
- if (gf_readline_init_once(&tsd))
- return -1;
+ read_line_t *tsd = &thread_tsd;
if (sys_ftruncate(fd, 0))
return -1;
diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.h b/xlators/features/changelog/lib/src/gf-changelog-helpers.h
index cfb26a0081e..9c609d33172 100644
--- a/xlators/features/changelog/lib/src/gf-changelog-helpers.h
+++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.h
@@ -14,9 +14,9 @@
#include <unistd.h>
#include <dirent.h>
#include <limits.h>
-#include "locking.h"
+#include <glusterfs/locking.h>
-#include <xlator.h>
+#include <glusterfs/xlator.h>
#include "changelog.h"
@@ -205,9 +205,6 @@ typedef struct gf_private {
void *
gf_changelog_process(void *data);
-ssize_t
-gf_changelog_read_path(int fd, char *buffer, size_t bufsize);
-
void
gf_rfc3986_encode_space_newline(unsigned char *s, char *enc, char *estr);
diff --git a/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c b/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c
index ef46bf50c97..7f6e2329e71 100644
--- a/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c
+++ b/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c
@@ -8,11 +8,11 @@
cases as published by the Free Software Foundation.
*/
-#include "compat-uuid.h"
-#include "globals.h"
-#include "glusterfs.h"
-#include "syscall.h"
-#include "compat-errno.h"
+#include <glusterfs/compat-uuid.h>
+#include <glusterfs/globals.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/compat-errno.h>
#include "gf-changelog-helpers.h"
@@ -526,9 +526,8 @@ gf_changelog_publish(xlator_t *this, gf_changelog_journal_t *jnl,
ret = sys_rename(to_path, dest);
if (ret) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_LIB_MSG_RENAME_FAILED,
- "error moving changelog to processing dir", "path=%s", to_path,
- NULL);
+ CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", to_path, "to=%s",
+ dest, NULL);
}
out:
@@ -564,14 +563,14 @@ gf_changelog_consume(xlator_t *this, gf_changelog_journal_t *jnl,
if (ret || !S_ISREG(stbuf.st_mode)) {
ret = -1;
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_STAT_FAILED,
- "stat failed on changelog file", "path=%s", from_path, NULL);
+ "path=%s", from_path, NULL);
goto out;
}
fd1 = open(from_path, O_RDONLY);
if (fd1 < 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_OPEN_FAILED,
- "cannot open changelog file", "path=%s", from_path, NULL);
+ "path=%s", from_path, NULL);
goto out;
}
@@ -579,7 +578,7 @@ gf_changelog_consume(xlator_t *this, gf_changelog_journal_t *jnl,
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
if (fd2 < 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_OPEN_FAILED,
- "cannot create ascii changelog file", "path=%s", to_path, NULL);
+ "path=%s", to_path, NULL);
goto close_fd;
} else {
ret = gf_changelog_decode(this, jnl, fd1, fd2, &stbuf, &zerob);
@@ -594,9 +593,8 @@ gf_changelog_consume(xlator_t *this, gf_changelog_journal_t *jnl,
ret = sys_rename(to_path, dest);
if (ret)
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_LIB_MSG_RENAME_FAILED,
- "error moving changelog to processing dir", "path=%s",
- to_path, NULL);
+ CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", to_path,
+ "to=%s", dest, NULL);
}
/* remove it from .current if it's an empty file */
@@ -605,9 +603,8 @@ gf_changelog_consume(xlator_t *this, gf_changelog_journal_t *jnl,
ret = sys_unlink(to_path);
if (ret)
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_LIB_MSG_UNLINK_FAILED,
- "could not unlink empty changelog", "path=%s", to_path,
- NULL);
+ CHANGELOG_LIB_MSG_UNLINK_FAILED, "name=empty changelog",
+ "path=%s", to_path, NULL);
}
}
@@ -828,7 +825,7 @@ gf_changelog_open_dirs(xlator_t *this, gf_changelog_journal_t *jnl)
ret = recursive_rmdir(jnl->jnl_current_dir);
if (ret) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_LIB_MSG_FAILED_TO_RMDIR, "Failed to rmdir", "path=%s",
+ CHANGELOG_LIB_MSG_FAILED_TO_RMDIR, "path=%s",
jnl->jnl_current_dir, NULL);
goto out;
}
@@ -849,7 +846,7 @@ gf_changelog_open_dirs(xlator_t *this, gf_changelog_journal_t *jnl)
ret = recursive_rmdir(jnl->jnl_processing_dir);
if (ret) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_LIB_MSG_FAILED_TO_RMDIR, "Failed to rmdir", "path=%s",
+ CHANGELOG_LIB_MSG_FAILED_TO_RMDIR, "path=%s",
jnl->jnl_processing_dir, NULL);
goto out;
}
diff --git a/xlators/features/changelog/lib/src/gf-changelog-reborp.c b/xlators/features/changelog/lib/src/gf-changelog-reborp.c
index 8dfda4c79c5..56b11cbb705 100644
--- a/xlators/features/changelog/lib/src/gf-changelog-reborp.c
+++ b/xlators/features/changelog/lib/src/gf-changelog-reborp.c
@@ -15,14 +15,14 @@
#include "changelog-rpc-common.h"
#include "changelog-lib-messages.h"
-#include "syscall.h"
+#include <glusterfs/syscall.h>
/**
* Reverse socket: actual data transfer handler. Connection
* initiator is PROBER, data transfer is REBORP.
*/
-struct rpcsvc_program *gf_changelog_reborp_programs[];
+static struct rpcsvc_program *gf_changelog_reborp_programs[];
void *
gf_changelog_connection_janitor(void *arg)
@@ -55,9 +55,8 @@ gf_changelog_connection_janitor(void *arg)
ev = &entry->event;
gf_smsg(this->name, GF_LOG_INFO, 0,
- CHANGELOG_LIB_MSG_CLEANING_BRICK_ENTRY_INFO,
- "Cleaning brick entry for brick", "brick=%s", entry->brick,
- NULL);
+ CHANGELOG_LIB_MSG_CLEANING_BRICK_ENTRY_INFO, "brick=%s",
+ entry->brick, NULL);
/* 0x0: disable rpc-clnt */
rpc_clnt_disable(RPC_PROBER(entry));
@@ -71,21 +70,19 @@ gf_changelog_connection_janitor(void *arg)
while (!list_empty(&ev->events)) {
event = list_first_entry(&ev->events, struct gf_event, list);
gf_smsg(this->name, GF_LOG_INFO, 0,
- CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO, "Draining event",
- "seq=%lu", event->seq, "payload=%d", event->count, NULL);
+ CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO, "seq=%lu",
+ event->seq, "payload=%d", event->count, NULL);
GF_FREE(event);
drained++;
}
gf_smsg(this->name, GF_LOG_INFO, 0,
- CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO, "Drained events",
- "num=%lu", drained, NULL);
+ CHANGELOG_LIB_MSG_DRAINED_EVENT_INFO, "num=%lu", drained, NULL);
/* 0x3: freeup brick entry */
gf_smsg(this->name, GF_LOG_INFO, 0,
- CHANGELOG_LIB_MSG_FREEING_ENTRY_INFO, "freeing entry",
- "entry=%p", entry, NULL);
+ CHANGELOG_LIB_MSG_FREEING_ENTRY_INFO, "entry=%p", entry, NULL);
LOCK_DESTROY(&entry->statelock);
GF_FREE(entry);
}
@@ -112,9 +109,7 @@ gf_changelog_reborp_rpcsvc_notify(rpcsvc_t *rpc, void *mydata,
ret = sys_unlink(RPC_SOCK(entry));
if (ret != 0)
gf_smsg(this->name, GF_LOG_WARNING, errno,
- CHANGELOG_LIB_MSG_UNLINK_FAILED,
- "failed to unlink "
- "reverse socket",
+ CHANGELOG_LIB_MSG_UNLINK_FAILED, "name=reverse socket",
"path=%s", RPC_SOCK(entry), NULL);
if (entry->connected)
GF_CHANGELOG_INVOKE_CBK(this, entry->connected, entry->brick,
@@ -353,7 +348,9 @@ gf_changelog_event_handler(rpcsvc_request_t *req, xlator_t *this,
}
gf_msg_debug(this->name, 0,
- "seq: %lu [%s] (time: %lu.%lu), (vec: %d, len: %zd)",
+ "seq: %" PRIu64 " [%s] (time: %" PRIu64 ".%" PRIu64
+ "), "
+ "(vec: %d, len: %zd)",
rpc_req.seq, entry->brick, rpc_req.tv_sec, rpc_req.tv_usec,
payloadcnt, payloadlen);
@@ -389,11 +386,10 @@ gf_changelog_reborp_handle_event(rpcsvc_request_t *req)
return gf_changelog_event_handler(req, this, entry);
}
-rpcsvc_actor_t gf_changelog_reborp_actors[CHANGELOG_REV_PROC_MAX] = {
+static rpcsvc_actor_t gf_changelog_reborp_actors[CHANGELOG_REV_PROC_MAX] = {
[CHANGELOG_REV_PROC_EVENT] = {"CHANGELOG EVENT HANDLER",
- CHANGELOG_REV_PROC_EVENT,
- gf_changelog_reborp_handle_event, NULL, 0,
- DRC_NA},
+ gf_changelog_reborp_handle_event, NULL,
+ CHANGELOG_REV_PROC_EVENT, DRC_NA, 0},
};
/**
@@ -402,7 +398,7 @@ rpcsvc_actor_t gf_changelog_reborp_actors[CHANGELOG_REV_PROC_MAX] = {
* and that's required to invoke the callback with the appropriate
* brick path and it's private data.
*/
-struct rpcsvc_program gf_changelog_reborp_prog = {
+static struct rpcsvc_program gf_changelog_reborp_prog = {
.progname = "LIBGFCHANGELOG REBORP",
.prognum = CHANGELOG_REV_RPC_PROCNUM,
.progver = CHANGELOG_REV_RPC_PROCVER,
@@ -411,7 +407,7 @@ struct rpcsvc_program gf_changelog_reborp_prog = {
.synctask = _gf_false,
};
-struct rpcsvc_program *gf_changelog_reborp_programs[] = {
+static struct rpcsvc_program *gf_changelog_reborp_programs[] = {
&gf_changelog_reborp_prog,
NULL,
};
diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.h b/xlators/features/changelog/lib/src/gf-changelog-rpc.h
index 975307b99d3..5c82d6f1c08 100644
--- a/xlators/features/changelog/lib/src/gf-changelog-rpc.h
+++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.h
@@ -11,7 +11,7 @@
#ifndef __GF_CHANGELOG_RPC_H
#define __GF_CHANGELOG_RPC_H
-#include "xlator.h"
+#include <glusterfs/xlator.h>
#include "gf-changelog-helpers.h"
#include "changelog-rpc-common.h"
diff --git a/xlators/features/changelog/lib/src/gf-changelog.c b/xlators/features/changelog/lib/src/gf-changelog.c
index c7791c62950..57c3d39ef76 100644
--- a/xlators/features/changelog/lib/src/gf-changelog.c
+++ b/xlators/features/changelog/lib/src/gf-changelog.c
@@ -22,11 +22,11 @@
#endif
#include <string.h>
-#include "globals.h"
-#include "glusterfs.h"
-#include "logging.h"
-#include "defaults.h"
-#include "syncop.h"
+#include <glusterfs/globals.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/syncop.h>
#include "gf-changelog-rpc.h"
#include "gf-changelog-helpers.h"
@@ -100,48 +100,48 @@ gf_changelog_ctx_defaults_init(glusterfs_ctx_t *ctx)
ctx->iobuf_pool = iobuf_pool_new();
if (!ctx->iobuf_pool)
- return -1;
+ goto free_pool;
- ctx->event_pool = event_pool_new(GF_CHANGELOG_EVENT_POOL_SIZE,
- GF_CHANGELOG_EVENT_THREAD_COUNT);
+ ctx->event_pool = gf_event_pool_new(GF_CHANGELOG_EVENT_POOL_SIZE,
+ GF_CHANGELOG_EVENT_THREAD_COUNT);
if (!ctx->event_pool)
- return -1;
+ goto free_pool;
pool = GF_CALLOC(1, sizeof(call_pool_t),
gf_changelog_mt_libgfchangelog_call_pool_t);
if (!pool)
- return -1;
+ goto free_pool;
/* frame_mem_pool size 112 * 64 */
pool->frame_mem_pool = mem_pool_new(call_frame_t, 32);
if (!pool->frame_mem_pool)
- return -1;
+ goto free_pool;
/* stack_mem_pool size 256 * 128 */
pool->stack_mem_pool = mem_pool_new(call_stack_t, 16);
if (!pool->stack_mem_pool)
- return -1;
+ goto free_pool;
ctx->stub_mem_pool = mem_pool_new(call_stub_t, 16);
if (!ctx->stub_mem_pool)
- return -1;
+ goto free_pool;
ctx->dict_pool = mem_pool_new(dict_t, 32);
if (!ctx->dict_pool)
- return -1;
+ goto free_pool;
ctx->dict_pair_pool = mem_pool_new(data_pair_t, 512);
if (!ctx->dict_pair_pool)
- return -1;
+ goto free_pool;
ctx->dict_data_pool = mem_pool_new(data_t, 512);
if (!ctx->dict_data_pool)
- return -1;
+ goto free_pool;
ctx->logbuf_pool = mem_pool_new(log_buf_t, 256);
if (!ctx->logbuf_pool)
- return -1;
+ goto free_pool;
INIT_LIST_HEAD(&pool->all_frames);
LOCK_INIT(&pool->lock);
@@ -158,6 +158,31 @@ gf_changelog_ctx_defaults_init(glusterfs_ctx_t *ctx)
setrlimit(RLIMIT_CORE, &lim);
return 0;
+
+free_pool:
+ if (pool) {
+ GF_FREE(pool->frame_mem_pool);
+
+ GF_FREE(pool->stack_mem_pool);
+
+ GF_FREE(pool);
+ }
+
+ GF_FREE(ctx->stub_mem_pool);
+
+ GF_FREE(ctx->dict_pool);
+
+ GF_FREE(ctx->dict_pair_pool);
+
+ GF_FREE(ctx->dict_data_pool);
+
+ GF_FREE(ctx->logbuf_pool);
+
+ GF_FREE(ctx->iobuf_pool);
+
+ GF_FREE(ctx->event_pool);
+
+ return -1;
}
/* TODO: cleanup ctx defaults */
@@ -212,9 +237,8 @@ gf_changelog_init_master()
{
int ret = 0;
- mem_pools_init_early();
ret = gf_changelog_init_context();
- mem_pools_init_late();
+ mem_pools_init();
return ret;
}
@@ -549,9 +573,8 @@ gf_changelog_register_generic(struct gf_brick_spec *bricks, int count,
brick = bricks;
while (count--) {
gf_smsg(this->name, GF_LOG_INFO, 0,
- CHANGELOG_LIB_MSG_NOTIFY_REGISTER_INFO, "Registering brick",
- "brick=%s", brick->brick_path, "notify_filter=%d",
- brick->filter, NULL);
+ CHANGELOG_LIB_MSG_NOTIFY_REGISTER_INFO, "brick=%s",
+ brick->brick_path, "notify_filter=%d", brick->filter, NULL);
ret = gf_changelog_register_brick(this, brick, need_order, xl);
if (ret != 0) {
diff --git a/xlators/features/changelog/lib/src/gf-history-changelog.c b/xlators/features/changelog/lib/src/gf-history-changelog.c
index c8a31ebbd73..a16219f3664 100644
--- a/xlators/features/changelog/lib/src/gf-history-changelog.c
+++ b/xlators/features/changelog/lib/src/gf-history-changelog.c
@@ -8,10 +8,10 @@
#endif
#include <string.h>
-#include "globals.h"
-#include "glusterfs.h"
-#include "logging.h"
-#include "syscall.h"
+#include <glusterfs/globals.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/syscall.h>
#include "gf-changelog-helpers.h"
#include "gf-changelog-journal.h"
@@ -79,8 +79,8 @@ gf_history_changelog_done(char *file)
ret = sys_rename(buffer, to_path);
if (ret) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_LIB_MSG_RENAME_FAILED, "cannot move changelog file",
- "from=%s", file, "to=%s", to_path, NULL);
+ CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", file, "to=%s",
+ to_path, NULL);
goto out;
}
@@ -522,8 +522,7 @@ gf_changelog_consume_wrap(void *data)
_gf_true);
if (ret) {
gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_PARSE_ERROR,
- "could not parse changelog", "name=%s", ccd->changelog,
- NULL);
+ "name=%s", ccd->changelog, NULL);
goto out;
}
}
@@ -564,9 +563,6 @@ gf_history_consume(void *data)
{0},
};
gf_changelog_consume_data_t *curr = NULL;
- char thread_name[GF_THREAD_NAMEMAX] = {
- 0,
- };
hist_data = (gf_changelog_history_data_t *)data;
if (hist_data == NULL) {
@@ -612,12 +608,10 @@ gf_history_consume(void *data)
curr->retval = 0;
memset(curr->changelog, '\0', PATH_MAX);
- snprintf(thread_name, sizeof(thread_name), "clogc%03hx",
- ((iter + 1) & 0x3ff));
ret = gf_thread_create(&th_id[iter], NULL,
gf_changelog_consume_wrap, curr,
- thread_name);
+ "clogc%03hx", (iter + 1) & 0x3ff);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, ret,
CHANGELOG_LIB_MSG_THREAD_CREATION_FAILED,
@@ -647,9 +641,8 @@ gf_history_consume(void *data)
curr = &ccd[iter];
if (ccd->retval) {
publish = _gf_false;
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_LIB_MSG_PARSE_ERROR,
- "parsing error, ceased publishing...");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_LIB_MSG_PARSE_ERROR_CEASED, NULL);
continue;
}
@@ -728,7 +721,7 @@ gf_changelog_extract_min_max(const char *dname, const char *htime_dir, int *fd,
if (ret) {
ret = -1;
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HTIME_ERROR,
- "stat() failed on htime file", "path=%s", htime_file, NULL);
+ "op=stat", "path=%s", htime_file, NULL);
goto out;
}
@@ -742,7 +735,7 @@ gf_changelog_extract_min_max(const char *dname, const char *htime_dir, int *fd,
if (*fd < 0) {
ret = -1;
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HTIME_ERROR,
- "open() failed for htime file", "path=%s", htime_file, NULL);
+ "op=open", "path=%s", htime_file, NULL);
goto out;
}
@@ -751,17 +744,15 @@ gf_changelog_extract_min_max(const char *dname, const char *htime_dir, int *fd,
if (ret < 0) {
ret = -1;
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_LIB_MSG_GET_XATTR_FAILED,
- "error extracting max timstamp from htime file"
- "path=%s",
- htime_file, NULL);
+ CHANGELOG_LIB_MSG_GET_XATTR_FAILED, "path=%s", htime_file,
+ NULL);
goto out;
}
sscanf(x_value, "%lu:%lu", max_ts, total);
- gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_TOTAL_LOG_INFO,
- "changelogs min max", "min=%lu", *min_ts, "max=%lu", *max_ts,
- "total_changelogs=%lu", *total, NULL);
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_MIN_MAX_INFO,
+ "min=%lu", *min_ts, "max=%lu", *max_ts, "total_changelogs=%lu",
+ *total, NULL);
ret = 0;
@@ -842,15 +833,14 @@ gf_history_changelog(char *changelog_dir, unsigned long start,
goto out;
}
- gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_TOTAL_LOG_INFO,
- "Requesting historical changelogs", "start=%lu", start, "end=%lu",
- end, NULL);
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_REQUESTING_INFO,
+ "start=%lu", start, "end=%lu", end, NULL);
/* basic sanity check */
if (start > end || n_parallel <= 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HIST_FAILED,
- "Sanity check failed", "start=%lu", start, "end=%lu", end,
- "thread_count=%d", n_parallel, NULL);
+ "start=%lu", start, "end=%lu", end, "thread_count=%d",
+ n_parallel, NULL);
ret = -1;
goto out;
}
@@ -864,7 +854,7 @@ gf_history_changelog(char *changelog_dir, unsigned long start,
dirp = sys_opendir(htime_dir);
if (dirp == NULL) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HTIME_ERROR,
- "open dir on htime failed", "path=%s", htime_dir, NULL);
+ "op=opendir", "path=%s", htime_dir, NULL);
ret = -1;
goto out;
}
@@ -876,9 +866,8 @@ gf_history_changelog(char *changelog_dir, unsigned long start,
if (!entry || errno != 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_LIB_MSG_HIST_FAILED,
- "Requested changelog range is not availbale", "start=%lu",
- start, "end=%lu", end, NULL);
+ CHANGELOG_LIB_MSG_HIST_FAILED, "start=%lu", start,
+ "end=%lu", end, NULL);
ret = -2;
break;
}
@@ -916,9 +905,8 @@ gf_history_changelog(char *changelog_dir, unsigned long start,
if (gf_history_check(fd, from, start, len) != 0) {
ret = -1;
gf_smsg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_LIB_MSG_GET_TIME_ERROR,
- "wrong result for start", "start=%lu", start, "idx=%lu",
- from, NULL);
+ CHANGELOG_LIB_MSG_GET_TIME_ERROR, "for=start",
+ "start=%lu", start, "idx=%lu", from, NULL);
goto out;
}
@@ -949,9 +937,8 @@ gf_history_changelog(char *changelog_dir, unsigned long start,
if (gf_history_check(fd, to, end2, len) != 0) {
ret = -1;
gf_smsg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_LIB_MSG_GET_TIME_ERROR,
- "wrong result for end", "start=%lu", end2, "idx=%lu",
- to, NULL);
+ CHANGELOG_LIB_MSG_GET_TIME_ERROR, "for=end",
+ "start=%lu", end2, "idx=%lu", to, NULL);
goto out;
}
@@ -963,9 +950,9 @@ gf_history_changelog(char *changelog_dir, unsigned long start,
if (ret == -1)
goto out;
- gf_smsg(this->name, GF_LOG_INFO, 0,
- CHANGELOG_LIB_MSG_TOTAL_LOG_INFO, "FINAL", "from=%lu", ts1,
- "to=%lu", ts2, "changes=%lu", (to - from + 1), NULL);
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_FINAL_INFO,
+ "from=%lu", ts1, "to=%lu", ts2, "changes=%lu",
+ (to - from + 1), NULL);
hist_data = GF_CALLOC(1, sizeof(gf_changelog_history_data_t),
gf_changelog_mt_history_data_t);
@@ -1003,11 +990,9 @@ gf_history_changelog(char *changelog_dir, unsigned long start,
} else { /* end of range check */
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_LIB_MSG_HIST_FAILED,
- "Requested changelog range is not "
- "available. Retrying next HTIME",
- "start=%lu", start, "end=%lu", end, "chlog_min=%lu", min_ts,
- "chlog_max=%lu", max_ts, NULL);
+ CHANGELOG_LIB_MSG_HIST_FAILED, "start=%lu", start,
+ "end=%lu", end, "chlog_min=%lu", min_ts, "chlog_max=%lu",
+ max_ts, NULL);
}
} /* end of readdir() */
diff --git a/xlators/features/changelog/src/changelog-barrier.c b/xlators/features/changelog/src/changelog-barrier.c
index e8d742404df..0fb89ddb127 100644
--- a/xlators/features/changelog/src/changelog-barrier.c
+++ b/xlators/features/changelog/src/changelog-barrier.c
@@ -10,7 +10,7 @@
#include "changelog-helpers.h"
#include "changelog-messages.h"
-#include "call-stub.h"
+#include <glusterfs/call-stub.h>
/* Enqueue a stub*/
void
@@ -53,14 +53,14 @@ chlog_barrier_dequeue_all(xlator_t *this, struct list_head *queue)
{
call_stub_t *stub = NULL;
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BARRIER_INFO,
- "Dequeuing all the changelog barriered fops");
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS,
+ NULL);
while ((stub = __chlog_barrier_dequeue(this, queue)))
call_resume(stub);
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BARRIER_INFO,
- "Dequeuing changelog barriered fops is finished");
+ gf_smsg(this->name, GF_LOG_INFO, 0,
+ CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_FINISHED, NULL);
return;
}
@@ -80,8 +80,7 @@ chlog_barrier_timeout(void *data)
INIT_LIST_HEAD(&queue);
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_BARRIER_ERROR,
- "Disabling changelog barrier because of the timeout.");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_BARRIER_TIMEOUT, NULL);
LOCK(&priv->lock);
{
@@ -120,8 +119,8 @@ __chlog_barrier_enable(xlator_t *this, changelog_priv_t *priv)
priv->timer = gf_timer_call_after(this->ctx, priv->timeout,
chlog_barrier_timeout, (void *)this);
if (!priv->timer) {
- gf_msg(this->name, GF_LOG_CRITICAL, 0, CHANGELOG_MSG_BARRIER_ERROR,
- "Couldn't add changelog barrier timeout event.");
+ gf_smsg(this->name, GF_LOG_CRITICAL, 0,
+ CHANGELOG_MSG_TIMEOUT_ADD_FAILED, NULL);
goto out;
}
diff --git a/xlators/features/changelog/src/changelog-encoders.h b/xlators/features/changelog/src/changelog-encoders.h
index ca42c4c4fe0..26252696d56 100644
--- a/xlators/features/changelog/src/changelog-encoders.h
+++ b/xlators/features/changelog/src/changelog-encoders.h
@@ -11,8 +11,8 @@
#ifndef _CHANGELOG_ENCODERS_H
#define _CHANGELOG_ENCODERS_H
-#include "xlator.h"
-#include "defaults.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
#include "changelog-helpers.h"
diff --git a/xlators/features/changelog/src/changelog-ev-handle.c b/xlators/features/changelog/src/changelog-ev-handle.c
index 3ed6ff821d9..aa94459de5a 100644
--- a/xlators/features/changelog/src/changelog-ev-handle.c
+++ b/xlators/features/changelog/src/changelog-ev-handle.c
@@ -134,6 +134,8 @@ changelog_rpc_notify(struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
changelog_clnt_t *c_clnt = NULL;
changelog_priv_t *priv = NULL;
changelog_ev_selector_t *selection = NULL;
+ uint64_t clntcnt = 0;
+ uint64_t xprtcnt = 0;
crpc = mydata;
this = crpc->this;
@@ -144,6 +146,7 @@ changelog_rpc_notify(struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
switch (event) {
case RPC_CLNT_CONNECT:
selection = &priv->ev_selection;
+ GF_ATOMIC_INC(priv->clntcnt);
LOCK(&c_clnt->wait_lock);
{
@@ -176,12 +179,23 @@ changelog_rpc_notify(struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
changelog_set_disconnect_flag(crpc, _gf_true);
}
UNLOCK(&crpc->lock);
+ LOCK(&c_clnt->active_lock);
+ {
+ list_del_init(&crpc->list);
+ }
+ UNLOCK(&c_clnt->active_lock);
break;
case RPC_CLNT_MSG:
case RPC_CLNT_DESTROY:
/* Free up mydata */
changelog_rpc_clnt_unref(crpc);
+ clntcnt = GF_ATOMIC_DEC(priv->clntcnt);
+ xprtcnt = GF_ATOMIC_GET(priv->xprtcnt);
+ if (this->cleanup_starting) {
+ if (!clntcnt && !xprtcnt)
+ changelog_process_cleanup_event(this);
+ }
break;
case RPC_CLNT_PING:
break;
@@ -211,8 +225,8 @@ changelog_ev_connector(void *data)
changelog_rpc_notify);
if (!crpc->rpc) {
gf_smsg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_RPC_CONNECT_ERROR,
- "failed to connect back", "path=%s", crpc->sock, NULL);
+ CHANGELOG_MSG_RPC_CONNECT_ERROR, "path=%s", crpc->sock,
+ NULL);
crpc->cleanup(crpc);
goto mutex_unlock;
}
@@ -364,9 +378,8 @@ changelog_ev_dispatch(void *data)
ret = rbuf_wait_for_completion(c_clnt->rbuf, opaque, _dispatcher,
c_clnt);
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0,
- CHANGELOG_MSG_PUT_BUFFER_FAILED,
- "failed to put buffer after consumption");
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ CHANGELOG_MSG_PUT_BUFFER_FAILED, NULL);
}
return NULL;
diff --git a/xlators/features/changelog/src/changelog-ev-handle.h b/xlators/features/changelog/src/changelog-ev-handle.h
index 7e543a0edb3..cc1af58a276 100644
--- a/xlators/features/changelog/src/changelog-ev-handle.h
+++ b/xlators/features/changelog/src/changelog-ev-handle.h
@@ -11,11 +11,11 @@
#ifndef __CHANGELOG_EV_HANDLE_H
#define __CHANGELOG_EV_HANDLE_H
-#include "list.h"
-#include "xlator.h"
+#include <glusterfs/list.h>
+#include <glusterfs/xlator.h>
#include "rpc-clnt.h"
-#include "rot-buffs.h"
+#include <glusterfs/rot-buffs.h>
struct changelog_clnt;
@@ -131,4 +131,6 @@ changelog_ev_queue_connection(changelog_clnt_t *, changelog_rpc_clnt_t *);
void
changelog_ev_cleanup_connections(xlator_t *, changelog_clnt_t *);
+void
+changelog_process_cleanup_event(xlator_t *);
#endif
diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c
index 53219bf2d78..e561997d858 100644
--- a/xlators/features/changelog/src/changelog-helpers.c
+++ b/xlators/features/changelog/src/changelog-helpers.c
@@ -8,11 +8,11 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-#include "logging.h"
-#include "iobuf.h"
-#include "syscall.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/iobuf.h>
+#include <glusterfs/syscall.h>
#include "changelog-helpers.h"
#include "changelog-encoders.h"
@@ -22,6 +22,7 @@
#include "changelog-encoders.h"
#include "changelog-rpc-common.h"
#include <pthread.h>
+#include <time.h>
static void
changelog_cleanup_free_mutex(void *arg_mutex)
@@ -41,16 +42,15 @@ changelog_thread_cleanup(xlator_t *this, pthread_t thr_id)
/* send a cancel request to the thread */
ret = pthread_cancel(thr_id);
if (ret != 0) {
- gf_msg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_PTHREAD_CANCEL_FAILED, "could not cancel thread");
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ CHANGELOG_MSG_PTHREAD_CANCEL_FAILED, NULL);
goto out;
}
ret = pthread_join(thr_id, &retval);
if ((ret != 0) || (retval != PTHREAD_CANCELED)) {
- gf_msg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_PTHREAD_CANCEL_FAILED,
- "cancel request not adhered as expected");
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ CHANGELOG_MSG_PTHREAD_CANCEL_FAILED, NULL);
}
out:
@@ -153,27 +153,6 @@ changelog_init_event_selection(xlator_t *this,
return 0;
}
-int
-changelog_cleanup_event_selection(xlator_t *this,
- changelog_ev_selector_t *selection)
-{
- int j = CHANGELOG_EV_SELECTION_RANGE;
-
- LOCK(&selection->reflock);
- {
- while (j--) {
- if (selection->ref[j] > 0)
- gf_msg(this->name, GF_LOG_WARNING, 0,
- CHANGELOG_MSG_CLEANUP_ON_ACTIVE_REF,
- "changelog event selection cleaning up "
- " on active references");
- }
- }
- UNLOCK(&selection->reflock);
-
- return LOCK_DESTROY(&selection->reflock);
-}
-
static void
changelog_perform_dispatch(xlator_t *this, changelog_priv_t *priv, void *mem,
size_t size)
@@ -263,8 +242,7 @@ changelog_write(int fd, char *buffer, size_t len)
}
int
-htime_update(xlator_t *this, changelog_priv_t *priv, unsigned long ts,
- char *buffer)
+htime_update(xlator_t *this, changelog_priv_t *priv, time_t ts, char *buffer)
{
char changelog_path[PATH_MAX + 1] = {
0,
@@ -277,8 +255,8 @@ htime_update(xlator_t *this, changelog_priv_t *priv, unsigned long ts,
int ret = 0;
if (priv->htime_fd == -1) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR,
- "Htime fd not available for updation");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR,
+ "reason=fd not available", NULL);
ret = -1;
goto out;
}
@@ -288,13 +266,13 @@ htime_update(xlator_t *this, changelog_priv_t *priv, unsigned long ts,
goto out;
}
if (changelog_write(priv->htime_fd, (void *)changelog_path, len + 1) < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR,
- "Htime file content write failed");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR,
+ "reason=write failed", NULL);
ret = -1;
goto out;
}
- len = snprintf(x_value, sizeof(x_value), "%lu:%d", ts,
+ len = snprintf(x_value, sizeof(x_value), "%ld:%d", ts,
priv->rollover_count);
if (len >= sizeof(x_value)) {
ret = -1;
@@ -303,12 +281,12 @@ htime_update(xlator_t *this, changelog_priv_t *priv, unsigned long ts,
if (sys_fsetxattr(priv->htime_fd, HTIME_KEY, x_value, len, XATTR_REPLACE)) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_ERROR,
- "Htime xattr updation failed with XATTR_REPLACE",
+ "reason=xattr updation failed", "XATTR_REPLACE=true",
"changelog=%s", changelog_path, NULL);
if (sys_fsetxattr(priv->htime_fd, HTIME_KEY, x_value, len, 0)) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_ERROR,
- "Htime xattr updation failed", "changelog=%s",
+ "reason=xattr updation failed", "changelog=%s",
changelog_path, NULL);
ret = -1;
goto out;
@@ -346,15 +324,15 @@ cl_is_empty(xlator_t *this, int fd)
ret = sys_fstat(fd, &stbuf);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSTAT_OP_FAILED,
- "Could not stat (CHANGELOG)");
+ gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSTAT_OP_FAILED,
+ NULL);
goto out;
}
ret = sys_lseek(fd, 0, SEEK_SET);
if (ret == -1) {
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_LSEEK_OP_FAILED,
- "Could not lseek (CHANGELOG)");
+ gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_LSEEK_OP_FAILED,
+ NULL);
goto out;
}
@@ -390,8 +368,8 @@ update_path(xlator_t *this, char *cl_path)
found = strstr(cl_path, up_cl);
if (found == NULL) {
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_LSEEK_OP_FAILED,
- "Could not find CHANGELOG in changelog path");
+ gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_PATH_NOT_FOUND,
+ NULL);
goto out;
} else {
memcpy(found, low_cl, sizeof(low_cl) - 1);
@@ -403,18 +381,22 @@ out:
}
static int
-changelog_rollover_changelog(xlator_t *this, changelog_priv_t *priv,
- unsigned long ts)
+changelog_rollover_changelog(xlator_t *this, changelog_priv_t *priv, time_t ts)
{
int ret = -1;
int notify = 0;
int cl_empty_flag = 0;
+ struct tm *gmt;
+ char yyyymmdd[40];
char ofile[PATH_MAX] = {
0,
};
char nfile[PATH_MAX] = {
0,
};
+ char nfile_dir[PATH_MAX] = {
+ 0,
+ };
changelog_event_t ev = {
0,
};
@@ -422,33 +404,37 @@ changelog_rollover_changelog(xlator_t *this, changelog_priv_t *priv,
if (priv->changelog_fd != -1) {
ret = sys_fsync(priv->changelog_fd);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_FSYNC_OP_FAILED, "fsync failed");
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ CHANGELOG_MSG_FSYNC_OP_FAILED, NULL);
}
ret = cl_is_empty(this, priv->changelog_fd);
if (ret == 1) {
cl_empty_flag = 1;
} else if (ret == -1) {
/* Log error but proceed as usual */
- gf_msg(this->name, GF_LOG_WARNING, 0,
- CHANGELOG_MSG_DETECT_EMPTY_CHANGELOG_FAILED,
- "Error detecting empty changelog");
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ CHANGELOG_MSG_DETECT_EMPTY_CHANGELOG_FAILED, NULL);
}
sys_close(priv->changelog_fd);
priv->changelog_fd = -1;
}
+ /* Get GMT time. */
+ gmt = gmtime(&ts);
+
+ strftime(yyyymmdd, sizeof(yyyymmdd), "%Y/%m/%d", gmt);
+
(void)snprintf(ofile, PATH_MAX, "%s/" CHANGELOG_FILE_NAME,
priv->changelog_dir);
- (void)snprintf(nfile, PATH_MAX, "%s/" CHANGELOG_FILE_NAME ".%lu",
- priv->changelog_dir, ts);
+ (void)snprintf(nfile, PATH_MAX, "%s/%s/" CHANGELOG_FILE_NAME ".%ld",
+ priv->changelog_dir, yyyymmdd, ts);
+ (void)snprintf(nfile_dir, PATH_MAX, "%s/%s", priv->changelog_dir, yyyymmdd);
if (cl_empty_flag == 1) {
ret = sys_unlink(ofile);
if (ret) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_UNLINK_OP_FAILED,
- "error unlinking empty changelog", "path=%s", ofile, NULL);
+ CHANGELOG_MSG_UNLINK_OP_FAILED, "path=%s", ofile, NULL);
ret = 0; /* Error in unlinking empty changelog should
not break further changelog operation, so
reset return value to 0*/
@@ -456,13 +442,26 @@ changelog_rollover_changelog(xlator_t *this, changelog_priv_t *priv,
} else {
ret = sys_rename(ofile, nfile);
+ /* Changelog file rename gets ENOENT when parent dir doesn't exist */
+ if (errno == ENOENT) {
+ ret = mkdir_p(nfile_dir, 0600, _gf_true);
+
+ if ((ret == -1) && (EEXIST != errno)) {
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ CHANGELOG_MSG_MKDIR_ERROR, "%s", nfile_dir, NULL);
+ goto out;
+ }
+
+ ret = sys_rename(ofile, nfile);
+ }
+
if (ret && (errno == ENOENT)) {
ret = 0;
goto out;
}
if (ret) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_RENAME_ERROR,
- "error renaming", "from=%s", ofile, "to=%s", nfile, NULL);
+ "from=%s", ofile, "to=%s", nfile, NULL);
}
}
@@ -476,8 +475,8 @@ changelog_rollover_changelog(xlator_t *this, changelog_priv_t *priv,
}
ret = htime_update(this, priv, ts, nfile);
if (ret == -1) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR,
- "could not update htime file");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR,
+ NULL);
goto out;
}
}
@@ -501,15 +500,10 @@ out:
{
if (ret) {
priv->bn.bnotify_error = _gf_true;
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED,
- "Fail snapshot because of "
- "previous errors");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED, NULL);
} else {
gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BNOTIFY_INFO,
- "Explicit "
- "rollover changelog signaling "
- "bnotify",
"changelog=%s", nfile, NULL);
}
priv->bn.bnotify = _gf_false;
@@ -556,8 +550,8 @@ find_current_htime(int ht_dir_fd, const char *ht_dir_path, char *ht_file_bname)
cnt = scandir(ht_dir_path, &namelist, filter_cur_par_dirs, alphasort);
if (cnt < 0) {
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_SCAN_DIR_FAILED,
- "scandir failed");
+ gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_SCAN_DIR_FAILED,
+ NULL);
} else if (cnt > 0) {
if (snprintf(ht_file_bname, NAME_MAX, "%s",
namelist[cnt - 1]->d_name) >= NAME_MAX) {
@@ -566,16 +560,15 @@ find_current_htime(int ht_dir_fd, const char *ht_dir_path, char *ht_file_bname)
}
if (sys_fsetxattr(ht_dir_fd, HTIME_CURRENT, ht_file_bname,
strlen(ht_file_bname), 0)) {
- gf_msg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_FSETXATTR_FAILED,
- "fsetxattr failed: HTIME_CURRENT");
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ CHANGELOG_MSG_FSETXATTR_FAILED, "HTIME_CURRENT", NULL);
ret = -1;
goto out;
}
if (sys_fsync(ht_dir_fd) < 0) {
- gf_msg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_FSYNC_OP_FAILED, "fsync failed");
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ CHANGELOG_MSG_FSYNC_OP_FAILED, NULL);
ret = -1;
goto out;
}
@@ -596,7 +589,7 @@ out:
* returns -1 on failure or error
*/
int
-htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
+htime_open(xlator_t *this, changelog_priv_t *priv, time_t ts)
{
int ht_file_fd = -1;
int ht_dir_fd = -1;
@@ -632,7 +625,7 @@ htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
ht_dir_fd = open(ht_dir_path, O_RDONLY);
if (ht_dir_fd == -1) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
- "open failed", "path=%s", ht_dir_path, NULL);
+ "path=%s", ht_dir_path, NULL);
ret = -1;
goto out;
}
@@ -640,9 +633,8 @@ htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
size = sys_fgetxattr(ht_dir_fd, HTIME_CURRENT, ht_file_bname,
sizeof(ht_file_bname));
if (size < 0) {
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FGETXATTR_FAILED,
- "Error extracting"
- " HTIME_CURRENT.");
+ gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FGETXATTR_FAILED,
+ "name=HTIME_CURRENT", NULL);
/* If upgrade scenario, find the latest HTIME.TSTAMP file
* and use the same. If error, create a new HTIME.TSTAMP
@@ -650,20 +642,18 @@ htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
*/
cnt = find_current_htime(ht_dir_fd, ht_dir_path, ht_file_bname);
if (cnt <= 0) {
- gf_msg(this->name, GF_LOG_INFO, errno, CHANGELOG_MSG_HTIME_INFO,
- "HTIME_CURRENT not found. Changelog enabled"
- " before init");
+ gf_smsg(this->name, GF_LOG_INFO, errno,
+ CHANGELOG_MSG_NO_HTIME_CURRENT, NULL);
sys_close(ht_dir_fd);
return htime_create(this, priv, ts);
}
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_ERROR,
- "Error extracting"
- " HTIME_CURRENT.");
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ CHANGELOG_MSG_HTIME_CURRENT_ERROR, NULL);
}
- gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_HTIME_INFO,
- "HTIME_CURRENT", "path=%s", ht_file_bname, NULL);
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_HTIME_CURRENT, "path=%s",
+ ht_file_bname, NULL);
len = snprintf(ht_file_path, PATH_MAX, "%s/%s", ht_dir_path, ht_file_bname);
if ((len < 0) || (len >= PATH_MAX)) {
ret = -1;
@@ -676,7 +666,7 @@ htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
if (ht_file_fd < 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
- "unable to open htime file", "path=%s", ht_file_path, NULL);
+ "path=%s", ht_file_path, NULL);
ret = -1;
goto out;
}
@@ -686,8 +676,8 @@ htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
ret = sys_fstat(ht_file_fd, &stat_buf);
if (ret < 0) {
- gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_ERROR,
- "unable to stat htime file", "path=%s", ht_file_path, NULL);
+ gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_STAT_ERROR,
+ "path=%s", ht_file_path, NULL);
ret = -1;
goto out;
}
@@ -696,9 +686,7 @@ htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
size = sys_fgetxattr(ht_file_fd, HTIME_KEY, x_value, sizeof(x_value));
if (size < 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FGETXATTR_FAILED,
- "error extracting max"
- " timstamp from htime file",
- "path=%s", ht_file_path, NULL);
+ "name=%s", HTIME_KEY, "path=%s", ht_file_path, NULL);
ret = -1;
goto out;
}
@@ -710,14 +698,11 @@ htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
total1 = stat_buf.st_size / record_len;
if (total != total1) {
gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_TOTAL_LOG_INFO,
- "Mismatch of changelog count. "
- "INIT CASE",
"xattr_total=%lu", total, "size_total=%lu", total1, NULL);
}
- gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_TOTAL_LOG_INFO,
- "INIT CASE", "min=%lu", min_ts, "max=%lu", max_ts,
- "total_changelogs=%lu", total, NULL);
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_TOTAL_LOG_INFO, "min=%lu",
+ min_ts, "max=%lu", max_ts, "total_changelogs=%lu", total, NULL);
if (total < total1)
priv->rollover_count = total1 + 1;
@@ -734,7 +719,7 @@ out:
* returns -1 on failure or error
*/
int
-htime_create(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
+htime_create(xlator_t *this, changelog_priv_t *priv, time_t ts)
{
int ht_file_fd = -1;
int ht_dir_fd = -1;
@@ -751,15 +736,13 @@ htime_create(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
int flags = 0;
int32_t len = 0;
- gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_HTIME_INFO,
- "Changelog enable: Creating new "
- "HTIME file",
- "name=%lu", ts, NULL);
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_NEW_HTIME_FILE,
+ "name=%ld", ts, NULL);
CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, ht_dir_path);
/* get the htime file name in ht_file_path */
- len = snprintf(ht_file_path, PATH_MAX, "%s/%s.%lu", ht_dir_path,
+ len = snprintf(ht_file_path, PATH_MAX, "%s/%s.%ld", ht_dir_path,
HTIME_FILE_NAME, ts);
if ((len < 0) || (len >= PATH_MAX)) {
ret = -1;
@@ -771,23 +754,23 @@ htime_create(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
if (ht_file_fd < 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
- "unable to create htime file", "path=%s", ht_file_path, NULL);
+ "path=%s", ht_file_path, NULL);
ret = -1;
goto out;
}
if (sys_fsetxattr(ht_file_fd, HTIME_KEY, HTIME_INITIAL_VALUE,
sizeof(HTIME_INITIAL_VALUE) - 1, 0)) {
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSETXATTR_FAILED,
- "Htime xattr initialization failed");
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ CHANGELOG_MSG_XATTR_INIT_FAILED, NULL);
ret = -1;
goto out;
}
ret = sys_fsync(ht_file_fd);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSYNC_OP_FAILED,
- "fsync failed");
+ gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSYNC_OP_FAILED,
+ NULL);
goto out;
}
@@ -800,26 +783,25 @@ htime_create(xlator_t *this, changelog_priv_t *priv, unsigned long ts)
ht_dir_fd = open(ht_dir_path, O_RDONLY);
if (ht_dir_fd == -1) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
- "open failed", "path=%s", ht_dir_path, NULL);
+ "path=%s", ht_dir_path, NULL);
ret = -1;
goto out;
}
- (void)snprintf(ht_file_bname, sizeof(ht_file_bname), "%s.%lu",
+ (void)snprintf(ht_file_bname, sizeof(ht_file_bname), "%s.%ld",
HTIME_FILE_NAME, ts);
if (sys_fsetxattr(ht_dir_fd, HTIME_CURRENT, ht_file_bname,
strlen(ht_file_bname), 0)) {
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSETXATTR_FAILED,
- "fsetxattr failed:"
- " HTIME_CURRENT");
+ gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSETXATTR_FAILED,
+ " HTIME_CURRENT", NULL);
ret = -1;
goto out;
}
ret = sys_fsync(ht_dir_fd);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSYNC_OP_FAILED,
- "fsync failed");
+ gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSYNC_OP_FAILED,
+ NULL);
goto out;
}
@@ -873,7 +855,7 @@ changelog_snap_open(xlator_t *this, changelog_priv_t *priv)
fd = open(c_snap_path, flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
if (fd < 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
- "unable to open file", "path=%s", c_snap_path, NULL);
+ "path=%s", c_snap_path, NULL);
ret = -1;
goto out;
}
@@ -905,8 +887,8 @@ changelog_snap_logging_start(xlator_t *this, changelog_priv_t *priv)
int ret = 0;
ret = changelog_snap_open(this, priv);
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_SNAP_INFO,
- "Now starting to log in call path");
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_SNAP_INFO, "starting",
+ NULL);
return ret;
}
@@ -926,8 +908,8 @@ changelog_snap_logging_stop(xlator_t *this, changelog_priv_t *priv)
sys_close(priv->c_snap_fd);
priv->c_snap_fd = -1;
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_SNAP_INFO,
- "Stopped to log in call path");
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_SNAP_INFO, "Stopped",
+ NULL);
return ret;
}
@@ -955,9 +937,6 @@ changelog_open_journal(xlator_t *this, changelog_priv_t *priv)
fd = open(changelog_path, flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
if (fd < 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
- "unable to open/create changelog file."
- " change-logging will be"
- " inactive",
"path=%s", changelog_path, NULL);
goto out;
}
@@ -980,8 +959,8 @@ out:
}
int
-changelog_start_next_change(xlator_t *this, changelog_priv_t *priv,
- unsigned long ts, gf_boolean_t finale)
+changelog_start_next_change(xlator_t *this, changelog_priv_t *priv, time_t ts,
+ gf_boolean_t finale)
{
int ret = -1;
@@ -1002,21 +981,12 @@ changelog_entry_length()
return sizeof(changelog_log_data_t);
}
-int
+void
changelog_fill_rollover_data(changelog_log_data_t *cld, gf_boolean_t is_last)
{
- struct timeval tv = {
- 0,
- };
-
cld->cld_type = CHANGELOG_TYPE_ROLLOVER;
-
- if (gettimeofday(&tv, NULL))
- return -1;
-
- cld->cld_roll_time = (unsigned long)tv.tv_sec;
+ cld->cld_roll_time = gf_time();
cld->cld_finale = is_last;
- return 0;
}
int
@@ -1074,11 +1044,10 @@ changelog_snap_handle_ascii_change(xlator_t *this, changelog_log_data_t *cld)
ret = changelog_snap_write_change(priv, buffer, off);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_WRITE_FAILED,
- "error writing csnap to disk");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_WRITE_FAILED,
+ "csnap", NULL);
}
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_SNAP_INFO,
- "Successfully wrote to csnap");
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_WROTE_TO_CSNAP, NULL);
ret = 0;
out:
return ret;
@@ -1095,9 +1064,8 @@ changelog_handle_change(xlator_t *this, changelog_priv_t *priv,
ret = changelog_start_next_change(this, priv, cld->cld_roll_time,
cld->cld_finale);
if (ret)
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_GET_TIME_OP_FAILED,
- "Problem rolling over changelog(s)");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_GET_TIME_OP_FAILED, NULL);
goto out;
}
@@ -1111,16 +1079,16 @@ changelog_handle_change(xlator_t *this, changelog_priv_t *priv,
if (CHANGELOG_TYPE_IS_FSYNC(cld->cld_type)) {
ret = sys_fsync(priv->changelog_fd);
if (ret < 0) {
- gf_msg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_FSYNC_OP_FAILED, "fsync failed");
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ CHANGELOG_MSG_FSYNC_OP_FAILED, NULL);
}
goto out;
}
ret = priv->ce->encode(this, cld);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_WRITE_FAILED,
- "error writing changelog to disk");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_WRITE_FAILED,
+ "changelog", NULL);
}
out:
@@ -1143,6 +1111,7 @@ changelog_local_init(xlator_t *this, inode_t *inode, uuid_t gfid,
gf_msg_callingfn(this->name, GF_LOG_WARNING, 0,
CHANGELOG_MSG_INODE_NOT_FOUND,
"inode needed for version checking !!!");
+
goto out;
}
@@ -1211,7 +1180,7 @@ changelog_drain_black_fops(xlator_t *this, changelog_priv_t *priv)
ret = pthread_mutex_lock(&priv->dm.drain_black_mutex);
if (ret)
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_PTHREAD_ERROR,
- "pthread error", "error=%d", ret, NULL);
+ "error=%d", ret, NULL);
while (priv->dm.black_fop_cnt > 0) {
gf_msg_debug(this->name, 0, "Conditional wait on black fops: %ld",
priv->dm.black_fop_cnt);
@@ -1220,14 +1189,14 @@ changelog_drain_black_fops(xlator_t *this, changelog_priv_t *priv)
&priv->dm.drain_black_mutex);
if (ret)
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED,
- "pthread cond wait failed", "error=%d", ret, NULL);
+ CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED, "error=%d", ret,
+ NULL);
}
priv->dm.drain_wait_black = _gf_false;
ret = pthread_mutex_unlock(&priv->dm.drain_black_mutex);
if (ret)
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_PTHREAD_ERROR,
- "pthread error", "error=%d", ret, NULL);
+ "error=%d", ret, NULL);
pthread_cleanup_pop(0);
gf_msg_debug(this->name, 0, "Woke up: Conditional wait on black fops");
}
@@ -1247,7 +1216,7 @@ changelog_drain_white_fops(xlator_t *this, changelog_priv_t *priv)
ret = pthread_mutex_lock(&priv->dm.drain_white_mutex);
if (ret)
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_PTHREAD_ERROR,
- "pthread error", "error=%d", ret, NULL);
+ "error=%d", ret, NULL);
while (priv->dm.white_fop_cnt > 0) {
gf_msg_debug(this->name, 0, "Conditional wait on white fops : %ld",
priv->dm.white_fop_cnt);
@@ -1256,14 +1225,14 @@ changelog_drain_white_fops(xlator_t *this, changelog_priv_t *priv)
&priv->dm.drain_white_mutex);
if (ret)
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED,
- "pthread cond wait failed", "error=%d", ret, NULL);
+ CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED, "error=%d", ret,
+ NULL);
}
priv->dm.drain_wait_white = _gf_false;
ret = pthread_mutex_unlock(&priv->dm.drain_white_mutex);
if (ret)
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_PTHREAD_ERROR,
- "pthread error", "error=%d", ret, NULL);
+ "error=%d", ret, NULL);
pthread_cleanup_pop(0);
gf_msg_debug(this->name, 0, "Woke up: Conditional wait on white fops");
}
@@ -1292,7 +1261,7 @@ changelog_rollover(void *data)
while (1) {
(void)pthread_testcancel();
- tv.tv_sec = time(NULL) + priv->rollover_time;
+ tv.tv_sec = gf_time() + priv->rollover_time;
tv.tv_nsec = 0;
ret = 0; /* Reset ret to zero */
@@ -1315,12 +1284,12 @@ changelog_rollover(void *data)
pthread_cleanup_pop(0);
if (ret == 0) {
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BARRIER_INFO,
- "Explicit wakeup on barrier notify");
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BARRIER_INFO,
+ NULL);
priv->explicit_rollover = _gf_true;
} else if (ret && ret != ETIMEDOUT) {
- gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_SELECT_FAILED,
- "pthread_cond_timedwait failed");
+ gf_smsg(this->name, GF_LOG_ERROR, errno,
+ CHANGELOG_MSG_SELECT_FAILED, NULL);
continue;
} else if (ret && ret == ETIMEDOUT) {
gf_msg_debug(this->name, 0, "Wokeup on timeout");
@@ -1373,13 +1342,7 @@ changelog_rollover(void *data)
if (priv->explicit_rollover == _gf_true)
sleep(1);
- ret = changelog_fill_rollover_data(&cld, _gf_false);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_GET_TIME_OP_FAILED,
- "failed to fill rollover data");
- continue;
- }
+ changelog_fill_rollover_data(&cld, _gf_false);
_mask_cancellation();
@@ -1427,9 +1390,8 @@ changelog_fsync_thread(void *data)
ret = changelog_inject_single_event(this, priv, &cld);
if (ret)
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_INJECT_FSYNC_FAILED,
- "failed to inject fsync event");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_INJECT_FSYNC_FAILED, NULL);
_unmask_cancellation();
}
@@ -1466,7 +1428,7 @@ static int
__changelog_inode_ctx_set(xlator_t *this, inode_t *inode,
changelog_inode_ctx_t *ctx)
{
- uint64_t ctx_addr = (uint64_t)ctx;
+ uint64_t ctx_addr = (uint64_t)(uintptr_t)ctx;
return __inode_ctx_set(inode, this, &ctx_addr);
}
@@ -1851,23 +1813,21 @@ changelog_fill_entry_buf(call_frame_t *frame, xlator_t *this, loc_t *loc,
parent = inode_parent(loc->inode, 0, 0);
if (!parent) {
gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_INODE_NOT_FOUND,
- "Parent inode not found", "gfid=%s",
- uuid_utoa(loc->inode->gfid), NULL);
+ "type=parent", "gfid=%s", uuid_utoa(loc->inode->gfid), NULL);
goto err;
}
CHANGELOG_INIT_NOCHECK(this, *local, loc->inode, loc->inode->gfid, 5);
if (!(*local)) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_LOCAL_INIT_FAILED,
- "changelog local"
- " initiatilization failed");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_LOCAL_INIT_FAILED,
+ NULL);
goto err;
}
co = changelog_get_usable_buffer(*local);
if (!co) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_NO_MEMORY,
- "Failed to get buffer");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_GET_BUFFER_FAILED,
+ NULL);
goto err;
}
diff --git a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h
index 10d457e8cf5..38fa7590c32 100644
--- a/xlators/features/changelog/src/changelog-helpers.h
+++ b/xlators/features/changelog/src/changelog-helpers.h
@@ -11,14 +11,14 @@
#ifndef _CHANGELOG_HELPERS_H
#define _CHANGELOG_HELPERS_H
-#include "locking.h"
-#include "timer.h"
+#include <glusterfs/locking.h>
+#include <glusterfs/timer.h>
#include "pthread.h"
-#include "iobuf.h"
-#include "rot-buffs.h"
+#include <glusterfs/iobuf.h>
+#include <glusterfs/rot-buffs.h>
#include "changelog-misc.h"
-#include "call-stub.h"
+#include <glusterfs/call-stub.h>
#include "rpcsvc.h"
#include "changelog-ev-handle.h"
@@ -31,7 +31,7 @@
*/
typedef struct changelog_log_data {
/* rollover related */
- unsigned long cld_roll_time;
+ time_t cld_roll_time;
/* reopen changelog? */
gf_boolean_t cld_finale;
@@ -97,12 +97,6 @@ struct changelog_encoder {
typedef struct changelog_time_slice {
/**
- * just in case we need nanosecond granularity some day.
- * field is unused as of now (maybe we'd need it later).
- */
- struct timeval tv_start;
-
- /**
* version of changelog file, incremented each time changes
* rollover.
*/
@@ -190,8 +184,12 @@ typedef struct changelog_ev_selector {
/* changelog's private structure */
struct changelog_priv {
+ /* changelog journalling */
gf_boolean_t active;
+ /* changelog live notifications */
+ gf_boolean_t rpc_active;
+
/* to generate unique socket file per brick */
char *changelog_brick;
@@ -307,6 +305,24 @@ struct changelog_priv {
/* glusterfind dependency to capture paths on deleted entries*/
gf_boolean_t capture_del_path;
+
+ /* Save total no. of listners */
+ gf_atomic_t listnercnt;
+
+ /* Save total no. of xprt are associated with listner */
+ gf_atomic_t xprtcnt;
+
+ /* Save xprt list */
+ struct list_head xprt_list;
+
+ /* Save total no. of client connection */
+ gf_atomic_t clntcnt;
+
+ /* Save cleanup brick in victim */
+ xlator_t *victim;
+
+ /* Status to save cleanup notify status */
+ gf_boolean_t notify_down;
};
struct changelog_local {
@@ -401,11 +417,11 @@ changelog_local_t *
changelog_local_init(xlator_t *this, inode_t *inode, uuid_t gfid,
int xtra_records, gf_boolean_t update_flag);
int
-changelog_start_next_change(xlator_t *this, changelog_priv_t *priv,
- unsigned long ts, gf_boolean_t finale);
+changelog_start_next_change(xlator_t *this, changelog_priv_t *priv, time_t ts,
+ gf_boolean_t finale);
int
changelog_open_journal(xlator_t *this, changelog_priv_t *priv);
-int
+void
changelog_fill_rollover_data(changelog_log_data_t *cld, gf_boolean_t is_last);
int
changelog_inject_single_event(xlator_t *this, changelog_priv_t *priv,
@@ -429,12 +445,11 @@ changelog_fsync_thread(void *data);
int
changelog_forget(xlator_t *this, inode_t *inode);
int
-htime_update(xlator_t *this, changelog_priv_t *priv, unsigned long ts,
- char *buffer);
+htime_update(xlator_t *this, changelog_priv_t *priv, time_t ts, char *buffer);
int
-htime_open(xlator_t *this, changelog_priv_t *priv, unsigned long ts);
+htime_open(xlator_t *this, changelog_priv_t *priv, time_t ts);
int
-htime_create(xlator_t *this, changelog_priv_t *priv, unsigned long ts);
+htime_create(xlator_t *this, changelog_priv_t *priv, time_t ts);
/* Geo-Rep snapshot dependency changes */
void
@@ -492,8 +507,6 @@ changelog_deselect_event(xlator_t *, changelog_ev_selector_t *, unsigned int);
int
changelog_init_event_selection(xlator_t *, changelog_ev_selector_t *);
int
-changelog_cleanup_event_selection(xlator_t *, changelog_ev_selector_t *);
-int
changelog_ev_selected(xlator_t *, changelog_ev_selector_t *, unsigned int);
void
changelog_dispatch_event(xlator_t *, changelog_priv_t *, changelog_event_t *);
@@ -656,8 +669,8 @@ resolve_pargfid_to_path(xlator_t *this, const uuid_t gfid, char **path,
#define CHANGELOG_NOT_ON_THEN_GOTO(priv, ret, label) \
do { \
if (!priv->active) { \
- gf_msg(this->name, GF_LOG_WARNING, 0, CHANGELOG_MSG_NOT_ACTIVE, \
- "Changelog is not active, return success"); \
+ gf_smsg(this->name, GF_LOG_WARNING, 0, \
+ CHANGELOG_MSG_CHANGELOG_NOT_ACTIVE, NULL); \
ret = 0; \
goto label; \
} \
@@ -668,7 +681,7 @@ resolve_pargfid_to_path(xlator_t *this, const uuid_t gfid, char **path,
do { \
if (ret) { \
gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_PTHREAD_ERROR, \
- "pthread error", "error=%d", ret, NULL); \
+ "error=%d", ret, NULL); \
ret = -1; \
goto label; \
} \
@@ -679,7 +692,7 @@ resolve_pargfid_to_path(xlator_t *this, const uuid_t gfid, char **path,
do { \
if (ret) { \
gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_PTHREAD_ERROR, \
- "pthread error", "error=%d", ret, NULL); \
+ "error=%d", ret, NULL); \
ret = -1; \
flag = _gf_true; \
goto label; \
@@ -691,7 +704,7 @@ resolve_pargfid_to_path(xlator_t *this, const uuid_t gfid, char **path,
do { \
if (ret) { \
gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_PTHREAD_ERROR, \
- "pthread error", "error=%d", ret, NULL); \
+ "error=%d", ret, NULL); \
ret = -1; \
pthread_mutex_unlock(&mutex); \
goto label; \
diff --git a/xlators/features/changelog/src/changelog-mem-types.h b/xlators/features/changelog/src/changelog-mem-types.h
index 1e3786c6298..a2d8a9cbe93 100644
--- a/xlators/features/changelog/src/changelog-mem-types.h
+++ b/xlators/features/changelog/src/changelog-mem-types.h
@@ -11,7 +11,7 @@
#ifndef _CHANGELOG_MEM_TYPES_H
#define _CHANGELOG_MEM_TYPES_H
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_changelog_mem_types {
gf_changelog_mt_priv_t = gf_common_mt_end + 1,
diff --git a/xlators/features/changelog/src/changelog-messages.h b/xlators/features/changelog/src/changelog-messages.h
index dbf133ec836..cb0e16c85d8 100644
--- a/xlators/features/changelog/src/changelog-messages.h
+++ b/xlators/features/changelog/src/changelog-messages.h
@@ -11,7 +11,7 @@
#ifndef _CHANGELOG_MESSAGES_H_
#define _CHANGELOG_MESSAGES_H_
-#include "glfs-message-id.h"
+#include <glusterfs/glfs-message-id.h>
/* To add new message IDs, append new identifiers at the end of the list.
*
@@ -24,7 +24,7 @@
*/
GLFS_MSGID(
- CHANGELOG, CHANGELOG_MSG_OPEN_FAILED, CHANGELOG_MSG_NO_MEMORY,
+ CHANGELOG, CHANGELOG_MSG_OPEN_FAILED, CHANGELOG_MSG_BARRIER_FOP_FAILED,
CHANGELOG_MSG_VOL_MISCONFIGURED, CHANGELOG_MSG_RENAME_ERROR,
CHANGELOG_MSG_READ_ERROR, CHANGELOG_MSG_HTIME_ERROR,
CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED,
@@ -37,11 +37,11 @@ GLFS_MSGID(
CHANGELOG_MSG_FSYNC_OP_FAILED, CHANGELOG_MSG_TOTAL_LOG_INFO,
CHANGELOG_MSG_SNAP_INFO, CHANGELOG_MSG_SELECT_FAILED,
CHANGELOG_MSG_FCNTL_FAILED, CHANGELOG_MSG_BNOTIFY_INFO,
- CHANGELOG_MSG_ENTRY_BUF_INFO, CHANGELOG_MSG_NOT_ACTIVE,
+ CHANGELOG_MSG_ENTRY_BUF_INFO, CHANGELOG_MSG_CHANGELOG_NOT_ACTIVE,
CHANGELOG_MSG_LOCAL_INIT_FAILED, CHANGELOG_MSG_NOTIFY_REGISTER_FAILED,
CHANGELOG_MSG_PROGRAM_NAME_REG_FAILED, CHANGELOG_MSG_HANDLE_PROBE_ERROR,
CHANGELOG_MSG_SET_FD_CONTEXT, CHANGELOG_MSG_FREEUP_FAILED,
- CHANGELOG_MSG_HTIME_INFO, CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED,
+ CHANGELOG_MSG_RECONFIGURE, CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED,
CHANGELOG_MSG_RPC_BUILD_ERROR, CHANGELOG_MSG_RPC_CONNECT_ERROR,
CHANGELOG_MSG_RPC_START_ERROR, CHANGELOG_MSG_BUFFER_STARVATION_ERROR,
CHANGELOG_MSG_SCAN_DIR_FAILED, CHANGELOG_MSG_FSETXATTR_FAILED,
@@ -52,6 +52,121 @@ GLFS_MSGID(
CHANGELOG_MSG_FSTAT_OP_FAILED, CHANGELOG_MSG_LSEEK_OP_FAILED,
CHANGELOG_MSG_STRSTR_OP_FAILED, CHANGELOG_MSG_UNLINK_OP_FAILED,
CHANGELOG_MSG_DETECT_EMPTY_CHANGELOG_FAILED,
- CHANGELOG_MSG_READLINK_OP_FAILED, CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED);
+ CHANGELOG_MSG_READLINK_OP_FAILED, CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED,
+ CHANGELOG_MSG_RPCSVC_NOTIFY_FAILED, CHANGELOG_MSG_MEMORY_INIT_FAILED,
+ CHANGELOG_MSG_NO_MEMORY, CHANGELOG_MSG_HTIME_STAT_ERROR,
+ CHANGELOG_MSG_HTIME_CURRENT_ERROR, CHANGELOG_MSG_BNOTIFY_COND_INFO,
+ CHANGELOG_MSG_NO_HTIME_CURRENT, CHANGELOG_MSG_HTIME_CURRENT,
+ CHANGELOG_MSG_NEW_HTIME_FILE, CHANGELOG_MSG_MKDIR_ERROR,
+ CHANGELOG_MSG_PATH_NOT_FOUND, CHANGELOG_MSG_XATTR_INIT_FAILED,
+ CHANGELOG_MSG_WROTE_TO_CSNAP, CHANGELOG_MSG_UNUSED_0,
+ CHANGELOG_MSG_GET_BUFFER_FAILED, CHANGELOG_MSG_BARRIER_STATE_NOTIFY,
+ CHANGELOG_MSG_BARRIER_DISABLED, CHANGELOG_MSG_BARRIER_ALREADY_DISABLED,
+ CHANGELOG_MSG_BARRIER_ON_ERROR, CHANGELOG_MSG_BARRIER_ENABLE,
+ CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND, CHANGELOG_MSG_ERROR_IN_DICT_GET,
+ CHANGELOG_MSG_UNUSED_1, CHANGELOG_MSG_UNUSED_2,
+ CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS,
+ CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_FINISHED,
+ CHANGELOG_MSG_BARRIER_TIMEOUT, CHANGELOG_MSG_TIMEOUT_ADD_FAILED,
+ CHANGELOG_MSG_CLEANUP_ALREADY_SET);
+#define CHANGELOG_MSG_BARRIER_FOP_FAILED_STR \
+ "failed to barrier FOPs, disabling changelog barrier"
+#define CHANGELOG_MSG_MEMORY_INIT_FAILED_STR "memory accounting init failed"
+#define CHANGELOG_MSG_NO_MEMORY_STR "failed to create local memory pool"
+#define CHANGELOG_MSG_ENTRY_BUF_INFO_STR \
+ "Entry cannot be captured for gfid, Capturing DATA entry."
+#define CHANGELOG_MSG_PTHREAD_ERROR_STR "pthread error"
+#define CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED_STR "pthread_mutex_init failed"
+#define CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED_STR "pthread_cond_init failed"
+#define CHANGELOG_MSG_HTIME_ERROR_STR "failed to update HTIME file"
+#define CHANGELOG_MSG_HTIME_STAT_ERROR_STR "unable to stat htime file"
+#define CHANGELOG_MSG_HTIME_CURRENT_ERROR_STR "Error extracting HTIME_CURRENT."
+#define CHANGELOG_MSG_UNLINK_OP_FAILED_STR "error unlinking empty changelog"
+#define CHANGELOG_MSG_RENAME_ERROR_STR "error renaming"
+#define CHANGELOG_MSG_MKDIR_ERROR_STR "unable to create directory"
+#define CHANGELOG_MSG_BNOTIFY_INFO_STR \
+ "Explicit rollover changelog signaling bnotify"
+#define CHANGELOG_MSG_BNOTIFY_COND_INFO_STR "Woke up: bnotify conditional wait"
+#define CHANGELOG_MSG_RECONFIGURE_STR "Reconfigure: Changelog Enable"
+#define CHANGELOG_MSG_NO_HTIME_CURRENT_STR \
+ "HTIME_CURRENT not found. Changelog enabled before init"
+#define CHANGELOG_MSG_HTIME_CURRENT_STR "HTIME_CURRENT"
+#define CHANGELOG_MSG_NEW_HTIME_FILE_STR \
+ "Changelog enable: Creating new HTIME file"
+#define CHANGELOG_MSG_FGETXATTR_FAILED_STR "fgetxattr failed"
+#define CHANGELOG_MSG_TOTAL_LOG_INFO_STR "changelog info"
+#define CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED_STR "pthread cond wait failed"
+#define CHANGELOG_MSG_INODE_NOT_FOUND_STR "inode not found"
+#define CHANGELOG_MSG_READLINK_OP_FAILED_STR \
+ "could not read the link from the gfid handle"
+#define CHANGELOG_MSG_OPEN_FAILED_STR "unable to open file"
+#define CHANGELOG_MSG_RPC_CONNECT_ERROR_STR "failed to connect back"
+#define CHANGELOG_MSG_BUFFER_STARVATION_ERROR_STR \
+ "Failed to get buffer for RPC dispatch"
+#define CHANGELOG_MSG_PTHREAD_CANCEL_FAILED_STR "could not cancel thread"
+#define CHANGELOG_MSG_FSTAT_OP_FAILED_STR "Could not stat (CHANGELOG)"
+#define CHANGELOG_MSG_LSEEK_OP_FAILED_STR "Could not lseek (changelog)"
+#define CHANGELOG_MSG_PATH_NOT_FOUND_STR \
+ "Could not find CHANGELOG in changelog path"
+#define CHANGELOG_MSG_FSYNC_OP_FAILED_STR "fsync failed"
+#define CHANGELOG_MSG_DETECT_EMPTY_CHANGELOG_FAILED_STR \
+ "Error detecting empty changelog"
+#define CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED_STR \
+ "Fail snapshot because of previous errors"
+#define CHANGELOG_MSG_SCAN_DIR_FAILED_STR "scandir failed"
+#define CHANGELOG_MSG_FSETXATTR_FAILED_STR "fsetxattr failed"
+#define CHANGELOG_MSG_XATTR_INIT_FAILED_STR "Htime xattr initialization failed"
+#define CHANGELOG_MSG_SNAP_INFO_STR "log in call path"
+#define CHANGELOG_MSG_WRITE_FAILED_STR "error writing to disk"
+#define CHANGELOG_MSG_WROTE_TO_CSNAP_STR "Successfully wrote to csnap"
+#define CHANGELOG_MSG_GET_TIME_OP_FAILED_STR "Problem rolling over changelog(s)"
+#define CHANGELOG_MSG_BARRIER_INFO_STR "Explicit wakeup on barrier notify"
+#define CHANGELOG_MSG_SELECT_FAILED_STR "pthread_cond_timedwait failed"
+#define CHANGELOG_MSG_INJECT_FSYNC_FAILED_STR "failed to inject fsync event"
+#define CHANGELOG_MSG_LOCAL_INIT_FAILED_STR \
+ "changelog local initialization failed"
+#define CHANGELOG_MSG_GET_BUFFER_FAILED_STR "Failed to get buffer"
+#define CHANGELOG_MSG_SET_FD_CONTEXT_STR \
+ "could not set fd context(for release cbk)"
+#define CHANGELOG_MSG_DICT_GET_FAILED_STR "Barrier failed"
+#define CHANGELOG_MSG_BARRIER_STATE_NOTIFY_STR "Barrier notification"
+#define CHANGELOG_MSG_BARRIER_ERROR_STR \
+ "Received another barrier off notification while already off"
+#define CHANGELOG_MSG_BARRIER_DISABLED_STR "disabled changelog barrier"
+#define CHANGELOG_MSG_BARRIER_ALREADY_DISABLED_STR \
+ "Changelog barrier already disabled"
+#define CHANGELOG_MSG_BARRIER_ON_ERROR_STR \
+ "Received another barrier on notification when last one is not served yet"
+#define CHANGELOG_MSG_BARRIER_ENABLE_STR "Enabled changelog barrier"
+#define CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND_STR "barrier key not found"
+#define CHANGELOG_MSG_ERROR_IN_DICT_GET_STR \
+ "Something went wrong in dict_get_str_boolean"
+#define CHANGELOG_MSG_DIR_OPTIONS_NOT_SET_STR "changelog-dir option is not set"
+#define CHANGELOG_MSG_FREEUP_FAILED_STR "could not cleanup bootstrapper"
+#define CHANGELOG_MSG_CHILD_MISCONFIGURED_STR \
+ "translator needs a single subvolume"
+#define CHANGELOG_MSG_VOL_MISCONFIGURED_STR \
+ "dangling volume. please check volfile"
+#define CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_STR \
+ "Dequeuing all the changelog barriered fops"
+#define CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_FINISHED_STR \
+ "Dequeuing changelog barriered fops is finished"
+#define CHANGELOG_MSG_BARRIER_TIMEOUT_STR \
+ "Disabling changelog barrier because of the timeout"
+#define CHANGELOG_MSG_TIMEOUT_ADD_FAILED_STR \
+ "Couldn't add changelog barrier timeout event"
+#define CHANGELOG_MSG_RPC_BUILD_ERROR_STR "failed to build rpc options"
+#define CHANGELOG_MSG_NOTIFY_REGISTER_FAILED_STR "failed to register notify"
+#define CHANGELOG_MSG_RPC_START_ERROR_STR "failed to start rpc"
+#define CHANGELOG_MSG_CREATE_FRAME_FAILED_STR "failed to create frame"
+#define CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED_STR "failed to serialize reply"
+#define CHANGELOG_MSG_PROGRAM_NAME_REG_FAILED_STR "cannot register program"
+#define CHANGELOG_MSG_CHANGELOG_NOT_ACTIVE_STR \
+ "Changelog is not active, return success"
+#define CHANGELOG_MSG_PUT_BUFFER_FAILED_STR \
+ "failed to put buffer after consumption"
+#define CHANGELOG_MSG_CLEANUP_ALREADY_SET_STR \
+ "cleanup_starting flag is already set for xl"
+#define CHANGELOG_MSG_HANDLE_PROBE_ERROR_STR "xdr decoding error"
#endif /* !_CHANGELOG_MESSAGES_H_ */
diff --git a/xlators/features/changelog/src/changelog-misc.h b/xlators/features/changelog/src/changelog-misc.h
index 04d1bdeba03..e2addc09414 100644
--- a/xlators/features/changelog/src/changelog-misc.h
+++ b/xlators/features/changelog/src/changelog-misc.h
@@ -11,8 +11,8 @@
#ifndef _CHANGELOG_MISC_H
#define _CHANGELOG_MISC_H
-#include "glusterfs.h"
-#include "common-utils.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/common-utils.h>
#define CHANGELOG_MAX_TYPE 4
#define CHANGELOG_FILE_NAME "CHANGELOG"
diff --git a/xlators/features/changelog/src/changelog-rpc-common.c b/xlators/features/changelog/src/changelog-rpc-common.c
index ce01bf7a133..125246a17e1 100644
--- a/xlators/features/changelog/src/changelog-rpc-common.c
+++ b/xlators/features/changelog/src/changelog-rpc-common.c
@@ -11,7 +11,7 @@
#include "changelog-rpc-common.h"
#include "changelog-messages.h"
-#include "syscall.h"
+#include <glusterfs/syscall.h>
/**
*****************************************************
Client Interface
@@ -28,7 +28,7 @@ changelog_rpc_poller(void *arg)
{
xlator_t *this = arg;
- (void)event_dispatch(this->ctx->event_pool);
+ (void)gf_event_dispatch(this->ctx->event_pool);
return NULL;
}
@@ -47,10 +47,10 @@ changelog_rpc_client_init(xlator_t *this, void *cbkdata, char *sockfile,
if (!options)
goto error_return;
- ret = rpc_transport_unix_options_build(&options, sockfile, 0);
+ ret = rpc_transport_unix_options_build(options, sockfile, 0);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_BUILD_ERROR,
- "failed to build rpc options");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_BUILD_ERROR,
+ NULL);
goto dealloc_dict;
}
@@ -60,19 +60,19 @@ changelog_rpc_client_init(xlator_t *this, void *cbkdata, char *sockfile,
ret = rpc_clnt_register_notify(rpc, fn, cbkdata);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_NOTIFY_REGISTER_FAILED,
- "failed to register notify");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_NOTIFY_REGISTER_FAILED, NULL);
goto dealloc_rpc_clnt;
}
ret = rpc_clnt_start(rpc);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_START_ERROR,
- "failed to start rpc");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_START_ERROR,
+ NULL);
goto dealloc_rpc_clnt;
}
+ dict_unref(options);
return rpc;
dealloc_rpc_clnt:
@@ -164,8 +164,8 @@ changelog_invoke_rpc(xlator_t *this, struct rpc_clnt *rpc,
frame = create_frame(this, this->ctx->pool);
if (!frame) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_CREATE_FRAME_FAILED,
- "failed to create frame");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_CREATE_FRAME_FAILED,
+ NULL);
goto error_return;
}
@@ -238,8 +238,8 @@ changelog_rpc_sumbit_reply(rpcsvc_request_t *req, void *arg,
iob = __changelog_rpc_serialize_reply(req, arg, &iov, xdrproc);
if (!iob)
- gf_msg("", GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED,
- "failed to serialize reply");
+ gf_smsg("", GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED,
+ NULL);
else
iobref_add(iobref, iob);
@@ -260,6 +260,10 @@ changelog_rpc_server_destroy(xlator_t *this, rpcsvc_t *rpc, char *sockfile,
rpcsvc_listener_t *listener = NULL;
rpcsvc_listener_t *next = NULL;
struct rpcsvc_program *prog = NULL;
+ rpc_transport_t *trans = NULL;
+
+ if (!rpc)
+ return;
while (*progs) {
prog = *progs;
@@ -269,22 +273,25 @@ changelog_rpc_server_destroy(xlator_t *this, rpcsvc_t *rpc, char *sockfile,
list_for_each_entry_safe(listener, next, &rpc->listeners, list)
{
- rpcsvc_listener_destroy(listener);
+ if (listener->trans) {
+ trans = listener->trans;
+ rpc_transport_disconnect(trans, _gf_false);
+ }
}
(void)rpcsvc_unregister_notify(rpc, fn, this);
- sys_unlink(sockfile);
- if (rpc->rxpool) {
- mem_pool_destroy(rpc->rxpool);
- rpc->rxpool = NULL;
- }
/* TODO Avoid freeing rpc object in case of brick multiplex
after freeing rpc object svc->rpclock corrupted and it takes
more time to detach a brick
*/
- if (!this->cleanup_starting)
+ if (!this->cleanup_starting) {
+ if (rpc->rxpool) {
+ mem_pool_destroy(rpc->rxpool);
+ rpc->rxpool = NULL;
+ }
GF_FREE(rpc);
+ }
}
rpcsvc_t *
@@ -301,24 +308,23 @@ changelog_rpc_server_init(xlator_t *this, char *sockfile, void *cbkdata,
options = dict_new();
if (!options)
- goto error_return;
+ return NULL;
- ret = rpcsvc_transport_unix_options_build(&options, sockfile);
+ ret = rpcsvc_transport_unix_options_build(options, sockfile);
if (ret)
goto dealloc_dict;
rpc = rpcsvc_init(this, this->ctx, options, 8);
if (rpc == NULL) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_START_ERROR,
- "failed to init rpc");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_START_ERROR,
+ NULL);
goto dealloc_dict;
}
ret = rpcsvc_register_notify(rpc, fn, cbkdata);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_NOTIFY_REGISTER_FAILED,
- "failed to register notify function");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_NOTIFY_REGISTER_FAILED, NULL);
goto dealloc_rpc;
}
@@ -332,11 +338,10 @@ changelog_rpc_server_init(xlator_t *this, char *sockfile, void *cbkdata,
prog = *progs;
ret = rpcsvc_program_register(rpc, prog, _gf_false);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_PROGRAM_NAME_REG_FAILED,
- "cannot register program "
- "(name: %s, prognum: %d, pogver: %d)",
- prog->progname, prog->prognum, prog->progver);
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_PROGRAM_NAME_REG_FAILED, "name%s",
+ prog->progname, "prognum=%d", prog->prognum, "pogver=%d",
+ prog->progver, NULL);
goto dealloc_rpc;
}
@@ -350,6 +355,5 @@ dealloc_rpc:
GF_FREE(rpc);
dealloc_dict:
dict_unref(options);
-error_return:
return NULL;
}
diff --git a/xlators/features/changelog/src/changelog-rpc-common.h b/xlators/features/changelog/src/changelog-rpc-common.h
index 2d3f06e60c0..4d9aa2c694b 100644
--- a/xlators/features/changelog/src/changelog-rpc-common.h
+++ b/xlators/features/changelog/src/changelog-rpc-common.h
@@ -13,8 +13,8 @@
#include "rpcsvc.h"
#include "rpc-clnt.h"
-#include "gf-event.h"
-#include "call-stub.h"
+#include <glusterfs/gf-event.h>
+#include <glusterfs/call-stub.h>
#include "changelog-xdr.h"
#include "xdr-generic.h"
diff --git a/xlators/features/changelog/src/changelog-rpc.c b/xlators/features/changelog/src/changelog-rpc.c
index 828f85e8e45..440b88091a6 100644
--- a/xlators/features/changelog/src/changelog-rpc.c
+++ b/xlators/features/changelog/src/changelog-rpc.c
@@ -8,12 +8,12 @@
cases as published by the Free Software Foundation.
*/
-#include "syscall.h"
+#include <glusterfs/syscall.h>
#include "changelog-rpc.h"
#include "changelog-mem-types.h"
#include "changelog-ev-handle.h"
-struct rpcsvc_program *changelog_programs[];
+static struct rpcsvc_program *changelog_programs[];
static void
changelog_cleanup_dispatchers(xlator_t *this, changelog_priv_t *priv, int count)
@@ -43,9 +43,6 @@ changelog_cleanup_rpc_threads(xlator_t *this, changelog_priv_t *priv)
/** terminate dispatcher thread(s) */
changelog_cleanup_dispatchers(this, priv, priv->nr_dispatchers);
- /* TODO: what about pending and waiting connections? */
- changelog_ev_cleanup_connections(this, conn);
-
/* destroy locks */
ret = pthread_mutex_destroy(&conn->pending_lock);
if (ret != 0)
@@ -72,9 +69,6 @@ changelog_init_rpc_threads(xlator_t *this, changelog_priv_t *priv, rbuf_t *rbuf,
int j = 0;
int ret = 0;
changelog_clnt_t *conn = NULL;
- char thread_name[GF_THREAD_NAMEMAX] = {
- 0,
- };
conn = &priv->connections;
@@ -114,9 +108,9 @@ changelog_init_rpc_threads(xlator_t *this, changelog_priv_t *priv, rbuf_t *rbuf,
/* spawn dispatcher threads */
for (; j < nr_dispatchers; j++) {
- snprintf(thread_name, sizeof(thread_name), "clogd%03hx", (j & 0x3ff));
ret = gf_thread_create(&priv->ev_dispatcher[j], NULL,
- changelog_ev_dispatch, conn, thread_name);
+ changelog_ev_dispatch, conn, "clogd%03hx",
+ j & 0x3ff);
if (ret != 0) {
changelog_cleanup_dispatchers(this, priv, j);
break;
@@ -147,48 +141,146 @@ int
changelog_rpcsvc_notify(rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
void *data)
{
+ xlator_t *this = NULL;
+ rpc_transport_t *trans = NULL;
+ rpc_transport_t *xprt = NULL;
+ rpc_transport_t *xp_next = NULL;
+ changelog_priv_t *priv = NULL;
+ uint64_t listnercnt = 0;
+ uint64_t xprtcnt = 0;
+ uint64_t clntcnt = 0;
+ rpcsvc_listener_t *listener = NULL;
+ rpcsvc_listener_t *next = NULL;
+ gf_boolean_t listner_found = _gf_false;
+ socket_private_t *sockpriv = NULL;
+
+ if (!xl || !data || !rpc) {
+ gf_msg_callingfn("changelog", GF_LOG_WARNING, 0,
+ CHANGELOG_MSG_RPCSVC_NOTIFY_FAILED,
+ "Calling rpc_notify without initializing");
+ goto out;
+ }
+
+ this = xl;
+ trans = data;
+ priv = this->private;
+
+ if (!priv) {
+ gf_msg_callingfn("changelog", GF_LOG_WARNING, 0,
+ CHANGELOG_MSG_RPCSVC_NOTIFY_FAILED,
+ "Calling rpc_notify without priv initializing");
+ goto out;
+ }
+
+ if (event == RPCSVC_EVENT_ACCEPT) {
+ GF_ATOMIC_INC(priv->xprtcnt);
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&trans->list, &priv->xprt_list);
+ }
+ UNLOCK(&priv->lock);
+ goto out;
+ }
+
+ if (event == RPCSVC_EVENT_DISCONNECT) {
+ list_for_each_entry_safe(listener, next, &rpc->listeners, list)
+ {
+ if (listener && listener->trans) {
+ if (listener->trans == trans) {
+ listnercnt = GF_ATOMIC_DEC(priv->listnercnt);
+ listner_found = _gf_true;
+ rpcsvc_listener_destroy(listener);
+ }
+ }
+ }
+
+ if (listnercnt > 0) {
+ goto out;
+ }
+ if (listner_found) {
+ LOCK(&priv->lock);
+ list_for_each_entry_safe(xprt, xp_next, &priv->xprt_list, list)
+ {
+ sockpriv = (socket_private_t *)(xprt->private);
+ gf_log("changelog", GF_LOG_INFO,
+ "Send disconnect"
+ " on socket %d",
+ sockpriv->sock);
+ rpc_transport_disconnect(xprt, _gf_false);
+ }
+ UNLOCK(&priv->lock);
+ goto out;
+ }
+ LOCK(&priv->lock);
+ {
+ list_del_init(&trans->list);
+ }
+ UNLOCK(&priv->lock);
+
+ xprtcnt = GF_ATOMIC_DEC(priv->xprtcnt);
+ clntcnt = GF_ATOMIC_GET(priv->clntcnt);
+ if (!xprtcnt && !clntcnt) {
+ changelog_process_cleanup_event(this);
+ }
+ }
+
+out:
return 0;
}
void
+changelog_process_cleanup_event(xlator_t *this)
+{
+ gf_boolean_t cleanup_notify = _gf_false;
+ changelog_priv_t *priv = NULL;
+ char sockfile[UNIX_PATH_MAX] = {
+ 0,
+ };
+
+ if (!this)
+ return;
+ priv = this->private;
+ if (!priv)
+ return;
+
+ LOCK(&priv->lock);
+ {
+ cleanup_notify = priv->notify_down;
+ priv->notify_down = _gf_true;
+ }
+ UNLOCK(&priv->lock);
+
+ if (priv->victim && !cleanup_notify) {
+ default_notify(this, GF_EVENT_PARENT_DOWN, priv->victim);
+
+ if (priv->rpc) {
+ /* sockfile path could have been saved to avoid this */
+ CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sockfile,
+ UNIX_PATH_MAX);
+ sys_unlink(sockfile);
+ (void)rpcsvc_unregister_notify(priv->rpc, changelog_rpcsvc_notify,
+ this);
+ if (priv->rpc->rxpool) {
+ mem_pool_destroy(priv->rpc->rxpool);
+ priv->rpc->rxpool = NULL;
+ }
+ GF_FREE(priv->rpc);
+ priv->rpc = NULL;
+ }
+ }
+}
+
+void
changelog_destroy_rpc_listner(xlator_t *this, changelog_priv_t *priv)
{
char sockfile[UNIX_PATH_MAX] = {
0,
};
- changelog_clnt_t *c_clnt = &priv->connections;
- changelog_rpc_clnt_t *crpc = NULL;
- int nofconn = 0;
/* sockfile path could have been saved to avoid this */
CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sockfile, UNIX_PATH_MAX);
changelog_rpc_server_destroy(this, priv->rpc, sockfile,
changelog_rpcsvc_notify, changelog_programs);
-
- /* TODO Below approach is not perfect to wait for cleanup
- all active connections without this code brick process
- can be crash in case of brick multiplexing if any in-progress
- request process on rpc by changelog xlator after
- cleanup resources
- */
-
- if (c_clnt) {
- do {
- nofconn = 0;
- LOCK(&c_clnt->active_lock);
- list_for_each_entry(crpc, &c_clnt->active, list) { nofconn++; }
- UNLOCK(&c_clnt->active_lock);
- LOCK(&c_clnt->wait_lock);
- list_for_each_entry(crpc, &c_clnt->waitq, list) { nofconn++; }
- UNLOCK(&c_clnt->wait_lock);
- pthread_mutex_lock(&c_clnt->pending_lock);
- list_for_each_entry(crpc, &c_clnt->pending, list) { nofconn++; }
- pthread_mutex_unlock(&c_clnt->pending_lock);
-
- } while (nofconn); /* Wait for all connection cleanup */
- }
-
- (void)changelog_cleanup_rpc_threads(this, priv);
}
rpcsvc_t *
@@ -287,16 +379,15 @@ changelog_handle_probe(rpcsvc_request_t *req)
this = req->trans->xl;
if (this->cleanup_starting) {
- gf_msg(this->name, GF_LOG_DEBUG, 0, CHANGELOG_MSG_HANDLE_PROBE_ERROR,
- "cleanup_starting flag is already set for xl");
+ gf_smsg(this->name, GF_LOG_DEBUG, 0, CHANGELOG_MSG_CLEANUP_ALREADY_SET,
+ NULL);
return 0;
}
ret = xdr_to_generic(req->msg[0], &rpc_req,
(xdrproc_t)xdr_changelog_probe_req);
if (ret < 0) {
- gf_msg("", GF_LOG_ERROR, 0, CHANGELOG_MSG_HANDLE_PROBE_ERROR,
- "xdr decoding error");
+ gf_smsg("", GF_LOG_ERROR, 0, CHANGELOG_MSG_HANDLE_PROBE_ERROR, NULL);
req->rpc_err = GARBAGE_ARGS;
goto handle_xdr_error;
}
@@ -328,13 +419,13 @@ submit_rpc:
* RPC declarations
*/
-rpcsvc_actor_t changelog_svc_actors[CHANGELOG_RPC_PROC_MAX] = {
+static rpcsvc_actor_t changelog_svc_actors[CHANGELOG_RPC_PROC_MAX] = {
[CHANGELOG_RPC_PROBE_FILTER] = {"CHANGELOG PROBE FILTER",
- CHANGELOG_RPC_PROBE_FILTER,
- changelog_handle_probe, NULL, 0, DRC_NA},
+ changelog_handle_probe, NULL,
+ CHANGELOG_RPC_PROBE_FILTER, DRC_NA, 0},
};
-struct rpcsvc_program changelog_svc_prog = {
+static struct rpcsvc_program changelog_svc_prog = {
.progname = CHANGELOG_RPC_PROGNAME,
.prognum = CHANGELOG_RPC_PROGNUM,
.progver = CHANGELOG_RPC_PROGVER,
@@ -343,7 +434,7 @@ struct rpcsvc_program changelog_svc_prog = {
.synctask = _gf_true,
};
-struct rpcsvc_program *changelog_programs[] = {
+static struct rpcsvc_program *changelog_programs[] = {
&changelog_svc_prog,
NULL,
};
diff --git a/xlators/features/changelog/src/changelog-rpc.h b/xlators/features/changelog/src/changelog-rpc.h
index 8002cea5091..b1707565249 100644
--- a/xlators/features/changelog/src/changelog-rpc.h
+++ b/xlators/features/changelog/src/changelog-rpc.h
@@ -11,7 +11,7 @@
#ifndef __CHANGELOG_RPC_H
#define __CHANGELOG_RPC_H
-#include "xlator.h"
+#include <glusterfs/xlator.h>
#include "changelog-helpers.h"
/* one time */
diff --git a/xlators/features/changelog/src/changelog-rt.c b/xlators/features/changelog/src/changelog-rt.c
index 968c76b8b20..841545ae359 100644
--- a/xlators/features/changelog/src/changelog-rt.c
+++ b/xlators/features/changelog/src/changelog-rt.c
@@ -8,9 +8,9 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-#include "logging.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/logging.h>
#include "changelog-rt.h"
#include "changelog-mem-types.h"
diff --git a/xlators/features/changelog/src/changelog-rt.h b/xlators/features/changelog/src/changelog-rt.h
index df0d5b03487..28b9827d85b 100644
--- a/xlators/features/changelog/src/changelog-rt.h
+++ b/xlators/features/changelog/src/changelog-rt.h
@@ -11,8 +11,8 @@
#ifndef _CHANGELOG_RT_H
#define _CHANGELOG_RT_H
-#include "locking.h"
-#include "timer.h"
+#include <glusterfs/locking.h>
+#include <glusterfs/timer.h>
#include "pthread.h"
#include "changelog-helpers.h"
diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c
index 35a523316ed..6a6e5af859e 100644
--- a/xlators/features/changelog/src/changelog.c
+++ b/xlators/features/changelog/src/changelog.c
@@ -8,11 +8,11 @@
cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-#include "syscall.h"
-#include "logging.h"
-#include "iobuf.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/iobuf.h>
#include "changelog-rt.h"
@@ -34,6 +34,12 @@ static struct changelog_bootstrap cb_bootstrap[] = {
},
};
+static int
+changelog_init_rpc(xlator_t *this, changelog_priv_t *priv);
+
+static int
+changelog_init(xlator_t *this, changelog_priv_t *priv);
+
/* Entry operations - TYPE III */
/**
@@ -149,9 +155,8 @@ changelog_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
goto out;
}
if (barrier_enabled && !stub) {
- gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
- "Failed to barrier FOPs, disabling changelog barrier",
- "fop=rmdir", NULL);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+ CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=rmdir", NULL);
chlog_barrier_dequeue_all(this, &queue);
}
@@ -298,9 +303,8 @@ changelog_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
goto out;
}
if (barrier_enabled && !stub) {
- gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
- "Failed to barrier FOPs, disabling changelog barrier",
- "fop=unlink", NULL);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+ CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=unlink", NULL);
chlog_barrier_dequeue_all(this, &queue);
}
@@ -418,9 +422,8 @@ changelog_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
goto out;
}
if (barrier_enabled && !stub) {
- gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
- "Failed to barrier FOPs, disabling changelog barrier",
- "fop=rename", NULL);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+ CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=rename", NULL);
chlog_barrier_dequeue_all(this, &queue);
}
/* changelog barrier */
@@ -531,8 +534,7 @@ changelog_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
}
if (barrier_enabled && !stub) {
- gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_NO_MEMORY,
- "Failed to barrier FOPs, disabling changelog barrier",
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_BARRIER_FOP_FAILED,
"fop=link", NULL);
chlog_barrier_dequeue_all(this, &queue);
}
@@ -660,9 +662,8 @@ changelog_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
}
if (barrier_enabled && !stub) {
- gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
- "Failed to barrier FOPs, disabling changelog barrier",
- "fop=mkdir", NULL);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+ CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=mkdir", NULL);
chlog_barrier_dequeue_all(this, &queue);
}
@@ -782,9 +783,8 @@ changelog_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
}
if (barrier_enabled && !stub) {
- gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
- "Failed to barrier FOPs, disabling changelog barrier",
- "fop=symlink", NULL);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+ CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=symlink", NULL);
chlog_barrier_dequeue_all(this, &queue);
}
@@ -929,9 +929,8 @@ changelog_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
}
if (barrier_enabled && !stub) {
- gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
- "Failed to barrier FOPs, disabling changelog barrier",
- "fop=mknod", NULL);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+ CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=mknod", NULL);
chlog_barrier_dequeue_all(this, &queue);
}
@@ -972,8 +971,8 @@ changelog_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
CHANGELOG_OP_TYPE_RELEASE)) {
ret = fd_ctx_set(fd, this, (uint64_t)(long)0x1);
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, CHANGELOG_MSG_SET_FD_CONTEXT,
- "could not set fd context (for release cbk)");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, CHANGELOG_MSG_SET_FD_CONTEXT,
+ NULL);
}
changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY);
@@ -1083,9 +1082,8 @@ changelog_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
}
if (barrier_enabled && !stub) {
- gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
- "Failed to barrier FOPs, disabling changelog barrier",
- "fop=create", NULL);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+ CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=create", NULL);
chlog_barrier_dequeue_all(this, &queue);
}
@@ -1388,9 +1386,6 @@ changelog_handle_virtual_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
ret = changelog_fill_entry_buf(frame, this, loc, &local);
if (ret) {
gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_ENTRY_BUF_INFO,
- "Entry cannot be"
- " captured for gfid, Capturing DATA"
- " entry.",
"gfid=%s", uuid_utoa(loc->inode->gfid), NULL);
goto unwind;
}
@@ -1806,8 +1801,8 @@ changelog_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
CHANGELOG_OP_TYPE_RELEASE)) {
ret = fd_ctx_set(fd, this, (uint64_t)(long)0x1);
if (ret)
- gf_msg(this->name, GF_LOG_WARNING, 0, CHANGELOG_MSG_SET_FD_CONTEXT,
- "could not set fd context (for release cbk)");
+ gf_smsg(this->name, GF_LOG_WARNING, 0, CHANGELOG_MSG_SET_FD_CONTEXT,
+ NULL);
}
unwind:
@@ -2004,6 +1999,15 @@ notify(xlator_t *this, int event, void *data, ...)
struct list_head queue = {
0,
};
+ uint64_t xprtcnt = 0;
+ uint64_t clntcnt = 0;
+ changelog_clnt_t *conn = NULL;
+ gf_boolean_t cleanup_notify = _gf_false;
+ char sockfile[UNIX_PATH_MAX] = {
+ 0,
+ };
+ rpcsvc_listener_t *listener = NULL;
+ rpcsvc_listener_t *next = NULL;
INIT_LIST_HEAD(&queue);
@@ -2011,6 +2015,50 @@ notify(xlator_t *this, int event, void *data, ...)
if (!priv)
goto out;
+ if (event == GF_EVENT_PARENT_DOWN) {
+ priv->victim = data;
+ gf_log(this->name, GF_LOG_INFO,
+ "cleanup changelog rpc connection of brick %s",
+ priv->victim->name);
+
+ if (priv->rpc_active) {
+ this->cleanup_starting = 1;
+ changelog_destroy_rpc_listner(this, priv);
+ conn = &priv->connections;
+ if (conn)
+ changelog_ev_cleanup_connections(this, conn);
+ xprtcnt = GF_ATOMIC_GET(priv->xprtcnt);
+ clntcnt = GF_ATOMIC_GET(priv->clntcnt);
+ if (!xprtcnt && !clntcnt) {
+ LOCK(&priv->lock);
+ {
+ cleanup_notify = priv->notify_down;
+ priv->notify_down = _gf_true;
+ }
+ UNLOCK(&priv->lock);
+ if (priv->rpc) {
+ list_for_each_entry_safe(listener, next,
+ &priv->rpc->listeners, list)
+ {
+ if (listener->trans) {
+ rpc_transport_unref(listener->trans);
+ }
+ }
+ rpcsvc_destroy(priv->rpc);
+ priv->rpc = NULL;
+ }
+ CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sockfile,
+ UNIX_PATH_MAX);
+ sys_unlink(sockfile);
+ if (!cleanup_notify)
+ default_notify(this, GF_EVENT_PARENT_DOWN, data);
+ }
+ } else {
+ default_notify(this, GF_EVENT_PARENT_DOWN, data);
+ }
+ goto out;
+ }
+
if (event == GF_EVENT_TRANSLATOR_OP) {
dict = data;
@@ -2018,15 +2066,15 @@ notify(xlator_t *this, int event, void *data, ...)
switch (barrier) {
case DICT_ERROR:
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_DICT_GET_FAILED,
- "Barrier dict_get_str_boolean failed");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_DICT_GET_FAILED, "dict_get_str_boolean",
+ NULL);
ret = -1;
goto out;
case BARRIER_OFF:
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BARRIER_INFO,
- "Barrier off notification");
+ gf_smsg(this->name, GF_LOG_INFO, 0,
+ CHANGELOG_MSG_BARRIER_STATE_NOTIFY, "off", NULL);
CHANGELOG_NOT_ON_THEN_GOTO(priv, ret, out);
LOCK(&priv->c_snap_lock);
@@ -2043,10 +2091,8 @@ notify(xlator_t *this, int event, void *data, ...)
UNLOCK(&priv->bflags.lock);
if (ret == -1) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_BARRIER_ERROR,
- "Received another barrier off"
- " notification while already off");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_BARRIER_ERROR, NULL);
goto out;
}
@@ -2064,13 +2110,11 @@ notify(xlator_t *this, int event, void *data, ...)
*/
if (ret == 0) {
chlog_barrier_dequeue_all(this, &queue);
- gf_msg(this->name, GF_LOG_INFO, 0,
- CHANGELOG_MSG_BARRIER_INFO,
- "Disabled changelog barrier");
+ gf_smsg(this->name, GF_LOG_INFO, 0,
+ CHANGELOG_MSG_BARRIER_DISABLED, NULL);
} else {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_BARRIER_ERROR,
- "Changelog barrier already disabled");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_BARRIER_ALREADY_DISABLED, NULL);
}
LOCK(&priv->bflags.lock);
@@ -2082,8 +2126,8 @@ notify(xlator_t *this, int event, void *data, ...)
goto out;
case BARRIER_ON:
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BARRIER_INFO,
- "Barrier on notification");
+ gf_smsg(this->name, GF_LOG_INFO, 0,
+ CHANGELOG_MSG_BARRIER_STATE_NOTIFY, "on", NULL);
CHANGELOG_NOT_ON_THEN_GOTO(priv, ret, out);
LOCK(&priv->c_snap_lock);
@@ -2102,11 +2146,8 @@ notify(xlator_t *this, int event, void *data, ...)
UNLOCK(&priv->bflags.lock);
if (ret == -1) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_BARRIER_ERROR,
- "Received another barrier on"
- "notification when last one is"
- "not served yet");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_BARRIER_ON_ERROR, NULL);
goto out;
}
@@ -2129,14 +2170,14 @@ notify(xlator_t *this, int event, void *data, ...)
goto out;
}
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BARRIER_INFO,
- "Enabled changelog barrier");
+ gf_smsg(this->name, GF_LOG_INFO, 0,
+ CHANGELOG_MSG_BARRIER_ENABLE, NULL);
ret = changelog_barrier_notify(priv, buf);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_WRITE_FAILED,
- "Explicit roll over: write failed");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_WRITE_FAILED, "Explicit roll over",
+ NULL);
changelog_barrier_cleanup(this, priv, &queue);
ret = -1;
goto out;
@@ -2160,21 +2201,20 @@ notify(xlator_t *this, int event, void *data, ...)
}
ret1 = pthread_mutex_unlock(&priv->bn.bnotify_mutex);
CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret1, out, bclean_req);
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BNOTIFY_INFO,
- "Woke up: bnotify conditional wait");
+ gf_smsg(this->name, GF_LOG_INFO, 0,
+ CHANGELOG_MSG_BNOTIFY_COND_INFO, NULL);
goto out;
case DICT_DEFAULT:
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_DICT_GET_FAILED, "barrier key not found");
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND, NULL);
ret = -1;
goto out;
default:
- gf_msg(this->name, GF_LOG_ERROR, EINVAL,
- CHANGELOG_MSG_DICT_GET_FAILED,
- "Something went bad in dict_get_str_boolean");
+ gf_smsg(this->name, GF_LOG_ERROR, EINVAL,
+ CHANGELOG_MSG_ERROR_IN_DICT_GET, NULL);
ret = -1;
goto out;
}
@@ -2200,9 +2240,8 @@ mem_acct_init(xlator_t *this)
ret = xlator_mem_acct_init(this, gf_changelog_mt_end + 1);
if (ret != 0) {
- gf_msg(this->name, GF_LOG_WARNING, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
- "Memory accounting"
- " init failed");
+ gf_smsg(this->name, GF_LOG_WARNING, ENOMEM,
+ CHANGELOG_MSG_MEMORY_INIT_FAILED, NULL);
return ret;
}
@@ -2213,23 +2252,11 @@ static int
changelog_init(xlator_t *this, changelog_priv_t *priv)
{
int i = 0;
- int ret = -1;
- struct timeval tv = {
- 0,
- };
+ int ret = 0;
changelog_log_data_t cld = {
0,
};
- ret = gettimeofday(&tv, NULL);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_GET_TIME_OP_FAILED, "gettimeofday() failure");
- goto out;
- }
-
- priv->slice.tv_start = tv;
-
priv->maps[CHANGELOG_TYPE_DATA] = "D ";
priv->maps[CHANGELOG_TYPE_METADATA] = "M ";
priv->maps[CHANGELOG_TYPE_METADATA_XATTR] = "M ";
@@ -2248,9 +2275,7 @@ changelog_init(xlator_t *this, changelog_priv_t *priv)
* in case there was an encoding change. so... things are kept
* simple here.
*/
- ret = changelog_fill_rollover_data(&cld, _gf_false);
- if (ret)
- goto out;
+ changelog_fill_rollover_data(&cld, _gf_false);
ret = htime_open(this, priv, cld.cld_roll_time);
/* call htime open with cld's rollover_time */
@@ -2288,8 +2313,8 @@ changelog_barrier_pthread_init(xlator_t *this, changelog_priv_t *priv)
if ((ret = pthread_mutex_init(&priv->bn.bnotify_mutex, NULL)) != 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED,
- "bnotify pthread_mutex_init failed", "ret=%d", ret, NULL);
+ CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, "name=bnotify",
+ "ret=%d", ret, NULL);
ret = -1;
goto out;
}
@@ -2297,8 +2322,8 @@ changelog_barrier_pthread_init(xlator_t *this, changelog_priv_t *priv)
if ((ret = pthread_cond_init(&priv->bn.bnotify_cond, NULL)) != 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED,
- "bnotify pthread_cond_init failed", "ret=%d", ret, NULL);
+ CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, "name=bnotify",
+ "ret=%d", ret, NULL);
ret = -1;
goto out;
}
@@ -2306,8 +2331,8 @@ changelog_barrier_pthread_init(xlator_t *this, changelog_priv_t *priv)
if ((ret = pthread_mutex_init(&priv->dm.drain_black_mutex, NULL)) != 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED,
- "drain_black pthread_mutex_init failed", "ret=%d", ret, NULL);
+ CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, "name=drain_black",
+ "ret=%d", ret, NULL);
ret = -1;
goto out;
}
@@ -2315,8 +2340,8 @@ changelog_barrier_pthread_init(xlator_t *this, changelog_priv_t *priv)
if ((ret = pthread_cond_init(&priv->dm.drain_black_cond, NULL)) != 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED,
- "drain_black pthread_cond_init failed", "ret=%d", ret, NULL);
+ CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, "name=drain_black",
+ "ret=%d", ret, NULL);
ret = -1;
goto out;
}
@@ -2324,8 +2349,8 @@ changelog_barrier_pthread_init(xlator_t *this, changelog_priv_t *priv)
if ((ret = pthread_mutex_init(&priv->dm.drain_white_mutex, NULL)) != 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED,
- "drain_white pthread_mutex_init failed", "ret=%d", ret, NULL);
+ CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, "name=drain_white",
+ "ret=%d", ret, NULL);
ret = -1;
goto out;
}
@@ -2333,8 +2358,8 @@ changelog_barrier_pthread_init(xlator_t *this, changelog_priv_t *priv)
if ((ret = pthread_cond_init(&priv->dm.drain_white_cond, NULL)) != 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
- CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED,
- "drain_white pthread_cond_init failed", "ret=%d", ret, NULL);
+ CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, "name=drain_white",
+ "ret=%d", ret, NULL);
ret = -1;
goto out;
}
@@ -2343,7 +2368,7 @@ changelog_barrier_pthread_init(xlator_t *this, changelog_priv_t *priv)
if ((pthread_mutex_init(&priv->cr.lock, NULL)) != 0) {
gf_smsg(this->name, GF_LOG_ERROR, errno,
CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED,
- "changelog_rollover lock init failed", "ret=%d", ret, NULL);
+ "name=changelog_rollover", "ret=%d", ret, NULL);
ret = -1;
goto out;
}
@@ -2394,6 +2419,22 @@ changelog_barrier_pthread_destroy(changelog_priv_t *priv)
LOCK_DESTROY(&priv->bflags.lock);
}
+static void
+changelog_cleanup_rpc(xlator_t *this, changelog_priv_t *priv)
+{
+ /* terminate rpc server */
+ if (!this->cleanup_starting)
+ changelog_destroy_rpc_listner(this, priv);
+
+ (void)changelog_cleanup_rpc_threads(this, priv);
+ /* cleanup rot buffs */
+ rbuf_dtor(priv->rbuf);
+
+ /* cleanup poller thread */
+ if (priv->poller)
+ (void)changelog_thread_cleanup(this, priv->poller);
+}
+
int
reconfigure(xlator_t *this, dict_t *options)
{
@@ -2402,6 +2443,9 @@ reconfigure(xlator_t *this, dict_t *options)
changelog_priv_t *priv = NULL;
gf_boolean_t active_earlier = _gf_true;
gf_boolean_t active_now = _gf_true;
+ gf_boolean_t rpc_active_earlier = _gf_true;
+ gf_boolean_t rpc_active_now = _gf_true;
+ gf_boolean_t iniate_rpc = _gf_false;
changelog_time_slice_t *slice = NULL;
changelog_log_data_t cld = {
0,
@@ -2412,9 +2456,6 @@ reconfigure(xlator_t *this, dict_t *options)
char csnap_dir[PATH_MAX] = {
0,
};
- struct timeval tv = {
- 0,
- };
uint32_t timeout = 0;
priv = this->private;
@@ -2423,14 +2464,15 @@ reconfigure(xlator_t *this, dict_t *options)
ret = -1;
active_earlier = priv->active;
+ rpc_active_earlier = priv->rpc_active;
/* first stop the rollover and the fsync thread */
changelog_cleanup_helper_threads(this, priv);
GF_OPTION_RECONF("changelog-dir", tmp, options, str, out);
if (!tmp) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_DIR_OPTIONS_NOT_SET,
- "\"changelog-dir\" option is not set");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_DIR_OPTIONS_NOT_SET,
+ NULL);
goto out;
}
@@ -2456,6 +2498,29 @@ reconfigure(xlator_t *this, dict_t *options)
goto out;
GF_OPTION_RECONF("changelog", active_now, options, bool, out);
+ GF_OPTION_RECONF("changelog-notification", rpc_active_now, options, bool,
+ out);
+
+ /* If journalling is enabled, enable rpc notifications */
+ if (active_now && !active_earlier) {
+ if (!rpc_active_earlier)
+ iniate_rpc = _gf_true;
+ }
+
+ if (rpc_active_now && !rpc_active_earlier) {
+ iniate_rpc = _gf_true;
+ }
+
+ /* TODO: Disable of changelog-notifications is not supported for now
+ * as there is no clean way of cleaning up of rpc resources
+ */
+
+ if (iniate_rpc) {
+ ret = changelog_init_rpc(this, priv);
+ if (ret)
+ goto out;
+ priv->rpc_active = _gf_true;
+ }
/**
* changelog_handle_change() handles changes that could possibly
@@ -2482,9 +2547,7 @@ reconfigure(xlator_t *this, dict_t *options)
out);
if (active_now || active_earlier) {
- ret = changelog_fill_rollover_data(&cld, !active_now);
- if (ret)
- goto out;
+ changelog_fill_rollover_data(&cld, !active_now);
slice = &priv->slice;
@@ -2501,15 +2564,9 @@ reconfigure(xlator_t *this, dict_t *options)
if (active_now) {
if (!active_earlier) {
- gf_msg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_HTIME_INFO,
- "Reconfigure: Changelog Enable");
- if (gettimeofday(&tv, NULL)) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CHANGELOG_MSG_HTIME_ERROR, "unable to fetch htime");
- ret = -1;
- goto out;
- }
- htime_create(this, priv, tv.tv_sec);
+ gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_RECONFIGURE,
+ NULL);
+ htime_create(this, priv, gf_time());
}
ret = changelog_spawn_helper_threads(this, priv);
}
@@ -2534,8 +2591,7 @@ changelog_freeup_options(xlator_t *this, changelog_priv_t *priv)
ret = priv->cb->dtor(this, &priv->cd);
if (ret)
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_FREEUP_FAILED,
- "could not cleanup bootstrapper");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_FREEUP_FAILED, NULL);
GF_FREE(priv->changelog_brick);
GF_FREE(priv->changelog_dir);
}
@@ -2587,6 +2643,7 @@ changelog_init_options(xlator_t *this, changelog_priv_t *priv)
goto dealloc_2;
GF_OPTION_INIT("changelog", priv->active, bool, dealloc_2);
+ GF_OPTION_INIT("changelog-notification", priv->rpc_active, bool, dealloc_2);
GF_OPTION_INIT("capture-del-path", priv->capture_del_path, bool, dealloc_2);
GF_OPTION_INIT("op-mode", tmp, str, dealloc_2);
@@ -2625,20 +2682,6 @@ error_return:
return -1;
}
-static void
-changelog_cleanup_rpc(xlator_t *this, changelog_priv_t *priv)
-{
- /* terminate rpc server */
- changelog_destroy_rpc_listner(this, priv);
-
- /* cleanup rot buffs */
- rbuf_dtor(priv->rbuf);
-
- /* cleanup poller thread */
- if (priv->poller)
- (void)changelog_thread_cleanup(this, priv->poller);
-}
-
static int
changelog_init_rpc(xlator_t *this, changelog_priv_t *priv)
{
@@ -2679,14 +2722,14 @@ init(xlator_t *this)
GF_VALIDATE_OR_GOTO("changelog", this, error_return);
if (!this->children || this->children->next) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_CHILD_MISCONFIGURED,
- "translator needs a single subvolume");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_CHILD_MISCONFIGURED,
+ NULL);
goto error_return;
}
if (!this->parents) {
- gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_VOL_MISCONFIGURED,
- "dangling volume. please check volfile");
+ gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_VOL_MISCONFIGURED,
+ NULL);
goto error_return;
}
@@ -2696,13 +2739,18 @@ init(xlator_t *this)
this->local_pool = mem_pool_new(changelog_local_t, 64);
if (!this->local_pool) {
- gf_msg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
- "failed to create local memory pool");
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
+ NULL);
goto cleanup_priv;
}
LOCK_INIT(&priv->lock);
LOCK_INIT(&priv->c_snap_lock);
+ GF_ATOMIC_INIT(priv->listnercnt, 0);
+ GF_ATOMIC_INIT(priv->clntcnt, 0);
+ GF_ATOMIC_INIT(priv->xprtcnt, 0);
+ INIT_LIST_HEAD(&priv->xprt_list);
+ priv->htime_fd = -1;
ret = changelog_init_options(this, priv);
if (ret)
@@ -2730,10 +2778,13 @@ init(xlator_t *this)
INIT_LIST_HEAD(&priv->queue);
priv->barrier_enabled = _gf_false;
- /* RPC ball rolling.. */
- ret = changelog_init_rpc(this, priv);
- if (ret)
- goto cleanup_barrier;
+ if (priv->rpc_active || priv->active) {
+ /* RPC ball rolling.. */
+ ret = changelog_init_rpc(this, priv);
+ if (ret)
+ goto cleanup_barrier;
+ priv->rpc_active = _gf_true;
+ }
ret = changelog_init(this, priv);
if (ret)
@@ -2745,13 +2796,16 @@ init(xlator_t *this)
return 0;
cleanup_rpc:
- changelog_cleanup_rpc(this, priv);
+ if (priv->rpc_active) {
+ changelog_cleanup_rpc(this, priv);
+ }
cleanup_barrier:
changelog_barrier_pthread_destroy(priv);
cleanup_options:
changelog_freeup_options(this, priv);
cleanup_mempool:
mem_pool_destroy(this->local_pool);
+ this->local_pool = NULL;
cleanup_priv:
GF_FREE(priv);
error_return:
@@ -2770,9 +2824,11 @@ fini(xlator_t *this)
priv = this->private;
if (priv) {
- /* terminate RPC server/threads */
- changelog_cleanup_rpc(this, priv);
-
+ if (priv->active || priv->rpc_active) {
+ /* terminate RPC server/threads */
+ changelog_cleanup_rpc(this, priv);
+ GF_FREE(priv->ev_dispatcher);
+ }
/* call barrier_disable to cancel timer */
if (priv->barrier_enabled)
__chlog_barrier_disable(this, &queue);
@@ -2841,6 +2897,13 @@ struct volume_options options[] = {
.flags = OPT_FLAG_SETTABLE,
.level = OPT_STATUS_BASIC,
.tags = {"journal", "georep", "glusterfind"}},
+ {.key = {"changelog-notification"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "enable/disable changelog live notification",
+ .op_version = {3},
+ .level = OPT_STATUS_BASIC,
+ .tags = {"bitrot", "georep"}},
{.key = {"changelog-brick"},
.type = GF_OPTION_TYPE_PATH,
.description = "brick path to generate unique socket file name."
@@ -2910,3 +2973,17 @@ struct volume_options options[] = {
.tags = {"journal", "glusterfind"}},
{.key = {NULL}},
};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "changelog",
+ .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/changetimerecorder/src/Makefile.am b/xlators/features/changetimerecorder/src/Makefile.am
deleted file mode 100644
index 620017e3309..00000000000
--- a/xlators/features/changetimerecorder/src/Makefile.am
+++ /dev/null
@@ -1,26 +0,0 @@
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-
-# changetimerecorder can only get build when libgfdb is enabled
-if BUILD_GFDB
- xlator_LTLIBRARIES = changetimerecorder.la
-endif
-
-changetimerecorder_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
-
-changetimerecorder_la_SOURCES = changetimerecorder.c \
- ctr-helper.c ctr-xlator-ctx.c
-
-changetimerecorder_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\
- $(top_builddir)/libglusterfs/src/gfdb/libgfdb.la
-
-noinst_HEADERS = ctr-messages.h changetimerecorder.h ctr_mem_types.h \
- ctr-helper.h ctr-xlator-ctx.h
-
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
- -I$(top_srcdir)/libglusterfs/src/gfdb \
- -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
- -DDATADIR=\"$(localstatedir)\"
-
-AM_CFLAGS = -Wall $(GF_CFLAGS) $(SQLITE_CFLAGS)
-
-CLEANFILES =
diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.c b/xlators/features/changetimerecorder/src/changetimerecorder.c
deleted file mode 100644
index 04b89e5a540..00000000000
--- a/xlators/features/changetimerecorder/src/changetimerecorder.c
+++ /dev/null
@@ -1,2358 +0,0 @@
-/*
- Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-#include <ctype.h>
-#include <sys/uio.h>
-
-#include "gfdb_sqlite3.h"
-#include "ctr-helper.h"
-#include "ctr-messages.h"
-#include "syscall.h"
-
-#include "changetimerecorder.h"
-#include "tier-ctr-interface.h"
-
-/*******************************inode forget***********************************/
-int
-ctr_forget(xlator_t *this, inode_t *inode)
-{
- fini_ctr_xlator_ctx(this, inode);
- return 0;
-}
-
-/************************** Look up heal **************************************/
-/*
-Problem: The CTR xlator records file meta (heat/hardlinks)
-into the data. This works fine for files which are created
-after ctr xlator is switched ON. But for files which were
-created before CTR xlator is ON, CTR xlator is not able to
-record either of the meta i.e heat or hardlinks. Thus making
-those files immune to promotions/demotions.
-
-Solution: The solution that is implemented in this patch is
-do ctr-db heal of all those pre-existent files, using named lookup.
-For this purpose we use the inode-xlator context variable option
-in gluster.
-The inode-xlator context variable for ctr xlator will have the
-following,
- a. A Lock for the context variable
- b. A hardlink list: This list represents the successful looked
- up hardlinks.
-These are the scenarios when the hardlink list is updated:
-1) Named-Lookup: Whenever a named lookup happens on a file, in the
- wind path we copy all required hardlink and inode information to
- ctr_db_record structure, which resides in the frame->local variable.
- We don't update the database in wind. During the unwind, we read the
- information from the ctr_db_record and ,
- Check if the inode context variable is created, if not we create it.
- Check if the hard link is there in the hardlink list.
- If its not there we add it to the list and send a update to the
- database using libgfdb.
- Please note: The database transaction can fail(and we ignore) as there
- already might be a record in the db. This update to the db is to heal
- if its not there.
- If its there in the list we ignore it.
-2) Inode Forget: Whenever an inode forget hits we clear the hardlink list in
- the inode context variable and delete the inode context variable.
- Please note: An inode forget may happen for two reason,
- a. when the inode is delete.
- b. the in-memory inode is evicted from the inode table due to cache limits.
-3) create: whenever a create happens we create the inode context variable and
- add the hardlink. The database updation is done as usual by ctr.
-4) link: whenever a hardlink is created for the inode, we create the inode
- context variable, if not present, and add the hardlink to the list.
-5) unlink: whenever a unlink happens we delete the hardlink from the list.
-6) mknod: same as create.
-7) rename: whenever a rename happens we update the hardlink in list. if the
- hardlink was not present for updation, we add the hardlink to the list.
-
-What is pending:
-1) This solution will only work for named lookups.
-2) We don't track afr-self-heal/dht-rebalancer traffic for healing.
-
-*/
-
-/* This function does not write anything to the db,
- * just created the local variable
- * for the frame and sets values for the ctr_db_record */
-static int
-ctr_lookup_wind(call_frame_t *frame, xlator_t *this,
- gf_ctr_inode_context_t *ctr_inode_cx)
-{
- int ret = -1;
- gf_ctr_private_t *_priv = NULL;
- gf_ctr_local_t *ctr_local = NULL;
-
- GF_ASSERT(frame);
- GF_ASSERT(frame->root);
- GF_ASSERT(this);
- IS_CTR_INODE_CX_SANE(ctr_inode_cx);
-
- _priv = this->private;
- GF_ASSERT(_priv);
-
- if (_priv->ctr_record_wind && ctr_inode_cx->ia_type != IA_IFDIR) {
- frame->local = init_ctr_local_t(this);
- if (!frame->local) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
- "WIND: Error while creating ctr local");
- goto out;
- };
- ctr_local = frame->local;
- /*Definitely no internal fops will reach here*/
- ctr_local->is_internal_fop = _gf_false;
- /*Don't record counters*/
- CTR_DB_REC(ctr_local).do_record_counters = _gf_false;
- /*Don't record time at all*/
- CTR_DB_REC(ctr_local).do_record_times = _gf_false;
-
- /* Copy gfid into db record*/
- gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, *(ctr_inode_cx->gfid));
-
- /* Set fop_path and fop_type, required by libgfdb to make
- * decision while inserting the record */
- CTR_DB_REC(ctr_local).gfdb_fop_path = ctr_inode_cx->fop_path;
- CTR_DB_REC(ctr_local).gfdb_fop_type = ctr_inode_cx->fop_type;
-
- /* Copy hard link info*/
- gf_uuid_copy(CTR_DB_REC(ctr_local).pargfid,
- *((NEW_LINK_CX(ctr_inode_cx))->pargfid));
- if (snprintf(CTR_DB_REC(ctr_local).file_name,
- sizeof(CTR_DB_REC(ctr_local).file_name), "%s",
- NEW_LINK_CX(ctr_inode_cx)->basename) >=
- sizeof(CTR_DB_REC(ctr_local).file_name)) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
- "WIND: Error copying filename of ctr local");
- goto out;
- }
- /* Since we are in lookup we can ignore errors while
- * Inserting in the DB, because there may be many
- * to write to the DB attempts for healing.
- * We don't want to log all failed attempts and
- * bloat the log*/
- ctr_local->gfdb_db_record.ignore_errors = _gf_true;
- }
-
- ret = 0;
-
-out:
-
- if (ret) {
- free_ctr_local(ctr_local);
- frame->local = NULL;
- }
-
- return ret;
-}
-
-/* This function inserts the ctr_db_record populated by ctr_lookup_wind
- * in to the db. It also destroys the frame->local created by ctr_lookup_wind */
-static int
-ctr_lookup_unwind(call_frame_t *frame, xlator_t *this)