summaryrefslogtreecommitdiffstats
path: root/libglusterfs/src/common-utils.c
diff options
context:
space:
mode:
authorPoornima G <pgurusid@redhat.com>2016-01-22 11:44:21 -0500
committerKaleb KEITHLEY <kkeithle@redhat.com>2017-01-23 19:13:15 -0500
commitf2e7b6800b812e8bbc9bdbcea4c400a1784e31dc (patch)
tree2809f95a28204a4769e4dac07db8fe7b79a980f9 /libglusterfs/src/common-utils.c
parent4008a48080b9910b9948c35892e762e91198d134 (diff)
glusterd: add a cli command to trigger a statedump on a client
With this, we will be able to trigger statedumps on remote Gluster clients, mainly targetted for applications using libgfapi. Design: SIGUSR signal is the most comman way of taking a statedump in Gluster. But it cannot be used for libgfapi based processes, as the process loading the library might have already consumed SIGUSR signal. Hence going by the command way. One has to issue a Gluster command to initiate a statedump on the libgfapi based client. The command takes hostname and PID as an argument. All the glusterds in the cluster, check if they are connected to the specified hostname, and send an RPC request to all the connected clients from that hostname (via the mgmt connection). URL: http://review.gluster.org/16357 Change-Id: Icbe4d2f026b32a2c7d5535e1bfb2cdaaff042e91 BUG: 1169302 Signed-off-by: Poornima G <pgurusid@redhat.com> [ndevos: minor fixes and split patch in smaller pieces] Reviewed-on: https://review.gluster.org/9228 Reviewed-by: Niels de Vos <ndevos@redhat.com> Tested-by: Niels de Vos <ndevos@redhat.com> Smoke: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> Reviewed-by: Kaleb KEITHLEY <kkeithle@redhat.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Samikshan Bairagya <samikshan@gmail.com>
Diffstat (limited to 'libglusterfs/src/common-utils.c')
-rw-r--r--libglusterfs/src/common-utils.c21
1 files changed, 21 insertions, 0 deletions
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
index 18c2a39d60e..0486409a849 100644
--- a/libglusterfs/src/common-utils.c
+++ b/libglusterfs/src/common-utils.c
@@ -3669,6 +3669,27 @@ out:
return running;
}
+/* Check if the pid is > 0 */
+gf_boolean_t
+gf_valid_pid (const char *pid, int length)
+{
+ gf_boolean_t ret = _gf_true;
+ pid_t value = 0;
+ char *end_ptr = NULL;
+
+ if (length <= 0) {
+ ret = _gf_false;
+ goto out;
+ }
+
+ value = strtol (pid, &end_ptr, 10);
+ if (value <= 0) {
+ ret = _gf_false;
+ }
+out:
+ return ret;
+}
+
static int
dht_is_linkfile_key (dict_t *this, char *key, data_t *value, void *data)
{
-dir-write.h?id2=413b2a5f9b77fd3d7f3b26c848482ec7b914102f'>xlators/cluster/afr/src/afr-dir-write.h60
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c2271
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.h54
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c3263
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.h104
-rw-r--r--xlators/cluster/afr/src/afr-lk-common.c2638
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h64
-rw-r--r--xlators/cluster/afr/src/afr-messages.h167
-rw-r--r--xlators/cluster/afr/src/afr-open.c873
-rw-r--r--xlators/cluster/afr/src/afr-read-txn.c494
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.c1072
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.h60
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c3915
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.h73
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c1746
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c3263
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c1153
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-name.c616
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h389
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c1716
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.h75
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c3448
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h84
-rw-r--r--xlators/cluster/afr/src/afr.c2193
-rw-r--r--xlators/cluster/afr/src/afr.h1985
-rw-r--r--xlators/cluster/afr/src/pump.c2528
-rw-r--r--xlators/cluster/afr/src/pump.h97
-rw-r--r--xlators/cluster/dht/src/Makefile.am40
-rw-r--r--xlators/cluster/dht/src/dht-common.c14441
-rw-r--r--xlators/cluster/dht/src/dht-common.h1581
-rw-r--r--xlators/cluster/dht/src/dht-diskusage.c656
-rw-r--r--xlators/cluster/dht/src/dht-hashfn.c167
-rw-r--r--xlators/cluster/dht/src/dht-helper.c2475
-rw-r--r--xlators/cluster/dht/src/dht-inode-read.c1658
-rw-r--r--xlators/cluster/dht/src/dht-inode-write.c1404
-rw-r--r--xlators/cluster/dht/src/dht-layout.c1260
-rw-r--r--xlators/cluster/dht/src/dht-linkfile.c475
-rw-r--r--xlators/cluster/dht/src/dht-lock.c1392
-rw-r--r--xlators/cluster/dht/src/dht-lock.h91
-rw-r--r--xlators/cluster/dht/src/dht-mem-types.h58
-rw-r--r--xlators/cluster/dht/src/dht-messages.h386
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c4702
-rw-r--r--xlators/cluster/dht/src/dht-rename.c2418
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c2926
-rw-r--r--xlators/cluster/dht/src/dht-shared.c1104
-rw-r--r--xlators/cluster/dht/src/dht.c645
-rw-r--r--xlators/cluster/dht/src/nufa.c1233
-rw-r--r--xlators/cluster/dht/src/switch.c1697
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_mock.c73
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_unittest.c127
-rw-r--r--xlators/cluster/ec/Makefile.am (renamed from xlators/cluster/ha/Makefile.am)0
-rw-r--r--xlators/cluster/ec/src/Makefile.am83
-rw-r--r--xlators/cluster/ec/src/ec-code-avx.c109
-rw-r--r--xlators/cluster/ec/src/ec-code-avx.h18
-rw-r--r--xlators/cluster/ec/src/ec-code-c.c11679
-rw-r--r--xlators/cluster/ec/src/ec-code-c.h27
-rw-r--r--xlators/cluster/ec/src/ec-code-intel.c594
-rw-r--r--xlators/cluster/ec/src/ec-code-intel.h191
-rw-r--r--xlators/cluster/ec/src/ec-code-sse.c101
-rw-r--r--xlators/cluster/ec/src/ec-code-sse.h18
-rw-r--r--xlators/cluster/ec/src/ec-code-x64.c144
-rw-r--r--xlators/cluster/ec/src/ec-code-x64.h18
-rw-r--r--xlators/cluster/ec/src/ec-code.c1060
-rw-r--r--xlators/cluster/ec/src/ec-code.h44
-rw-r--r--xlators/cluster/ec/src/ec-combine.c995
-rw-r--r--xlators/cluster/ec/src/ec-combine.h44
-rw-r--r--xlators/cluster/ec/src/ec-common.c3042
-rw-r--r--xlators/cluster/ec/src/ec-common.h234
-rw-r--r--xlators/cluster/ec/src/ec-data.c288
-rw-r--r--xlators/cluster/ec/src/ec-data.h35
-rw-r--r--xlators/cluster/ec/src/ec-dir-read.c647
-rw-r--r--xlators/cluster/ec/src/ec-dir-write.c1487
-rw-r--r--xlators/cluster/ec/src/ec-fops.h254
-rw-r--r--xlators/cluster/ec/src/ec-galois.c183
-rw-r--r--xlators/cluster/ec/src/ec-galois.h32
-rw-r--r--xlators/cluster/ec/src/ec-generic.c1591
-rw-r--r--xlators/cluster/ec/src/ec-gf8.c5882
-rw-r--r--xlators/cluster/ec/src/ec-gf8.h18
-rw-r--r--xlators/cluster/ec/src/ec-heal.c3367
-rw-r--r--xlators/cluster/ec/src/ec-heald.c681
-rw-r--r--xlators/cluster/ec/src/ec-heald.h30
-rw-r--r--xlators/cluster/ec/src/ec-helpers.c867
-rw-r--r--xlators/cluster/ec/src/ec-helpers.h200
-rw-r--r--xlators/cluster/ec/src/ec-inode-read.c2046
-rw-r--r--xlators/cluster/ec/src/ec-inode-write.c2369
-rw-r--r--xlators/cluster/ec/src/ec-locks.c1128
-rw-r--r--xlators/cluster/ec/src/ec-mem-types.h30
-rw-r--r--xlators/cluster/ec/src/ec-messages.h61
-rw-r--r--xlators/cluster/ec/src/ec-method.c433
-rw-r--r--xlators/cluster/ec/src/ec-method.h48
-rw-r--r--xlators/cluster/ec/src/ec-types.h690
-rw-r--r--xlators/cluster/ec/src/ec.c1873
-rw-r--r--xlators/cluster/ec/src/ec.h34
-rw-r--r--xlators/cluster/ha/src/Makefile.am15
-rw-r--r--xlators/cluster/ha/src/ha-helpers.c204
-rw-r--r--xlators/cluster/ha/src/ha-mem-types.h37
-rw-r--r--xlators/cluster/ha/src/ha.c4023
-rw-r--r--xlators/cluster/ha/src/ha.h63
-rw-r--r--xlators/cluster/map/src/Makefile.am15
-rw-r--r--xlators/cluster/map/src/map-helper.c358
-rw-r--r--xlators/cluster/map/src/map-mem-types.h35
-rw-r--r--xlators/cluster/map/src/map.c2577
-rw-r--r--xlators/cluster/map/src/map.h77
-rw-r--r--xlators/cluster/stripe/src/Makefile.am16
-rw-r--r--xlators/cluster/stripe/src/stripe-mem-types.h40
-rw-r--r--xlators/cluster/stripe/src/stripe.c4002
-rw-r--r--xlators/cluster/stripe/src/stripe.h159
-rw-r--r--xlators/cluster/unify/src/Makefile.am16
-rw-r--r--xlators/cluster/unify/src/unify-mem-types.h41
-rw-r--r--xlators/cluster/unify/src/unify-self-heal.c1239
-rw-r--r--xlators/cluster/unify/src/unify.c4589
-rw-r--r--xlators/cluster/unify/src/unify.h146
-rw-r--r--xlators/debug/Makefile.am2
-rw-r--r--xlators/debug/delay-gen/Makefile.am (renamed from xlators/features/access-control/Makefile.am)0
-rw-r--r--xlators/debug/delay-gen/src/Makefile.am11
-rw-r--r--xlators/debug/delay-gen/src/delay-gen-mem-types.h21
-rw-r--r--xlators/debug/delay-gen/src/delay-gen-messages.h26
-rw-r--r--xlators/debug/delay-gen/src/delay-gen.c697
-rw-r--r--xlators/debug/delay-gen/src/delay-gen.h27
-rw-r--r--xlators/debug/error-gen/src/Makefile.am10
-rw-r--r--xlators/debug/error-gen/src/error-gen-mem-types.h20
-rw-r--r--xlators/debug/error-gen/src/error-gen.c3075
-rw-r--r--xlators/debug/error-gen/src/error-gen.h60
-rw-r--r--xlators/debug/io-stats/src/Makefile.am10
-rw-r--r--xlators/debug/io-stats/src/io-stats-mem-types.h36
-rw-r--r--xlators/debug/io-stats/src/io-stats.c5122
-rw-r--r--xlators/debug/sink/Makefile.am (renamed from xlators/cluster/stripe/Makefile.am)1
-rw-r--r--xlators/debug/sink/src/Makefile.am14
-rw-r--r--xlators/debug/sink/src/sink.c94
-rw-r--r--xlators/debug/trace/src/Makefile.am9
-rw-r--r--xlators/debug/trace/src/trace-mem-types.h20
-rw-r--r--xlators/debug/trace/src/trace.c4884
-rw-r--r--xlators/debug/trace/src/trace.h55
-rw-r--r--xlators/encryption/Makefile.am3
-rw-r--r--xlators/encryption/rot-13/src/rot-13.c208
-rw-r--r--xlators/encryption/rot-13/src/rot-13.h33
-rw-r--r--xlators/features/Makefile.am15
-rw-r--r--xlators/features/access-control/src/Makefile.am13
-rw-r--r--xlators/features/access-control/src/access-control.c1862
-rw-r--r--xlators/features/access-control/src/access-control.h55
-rw-r--r--xlators/features/arbiter/Makefile.am (renamed from xlators/cluster/unify/Makefile.am)2
-rw-r--r--xlators/features/arbiter/src/Makefile.am19
-rw-r--r--xlators/features/arbiter/src/arbiter-mem-types.h18
-rw-r--r--xlators/features/arbiter/src/arbiter.c380
-rw-r--r--xlators/features/arbiter/src/arbiter.h21
-rw-r--r--xlators/features/barrier/Makefile.am (renamed from xlators/features/filter/Makefile.am)2
-rw-r--r--xlators/features/barrier/src/Makefile.am17
-rw-r--r--xlators/features/barrier/src/barrier-mem-types.h20
-rw-r--r--xlators/features/barrier/src/barrier.c809
-rw-r--r--xlators/features/barrier/src/barrier.h89
-rw-r--r--xlators/features/bit-rot/Makefile.am (renamed from xlators/protocol/legacy/transport/ib-verbs/Makefile.am)0
-rw-r--r--xlators/features/bit-rot/src/Makefile.am1
-rw-r--r--xlators/features/bit-rot/src/bitd/Makefile.am23
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h101
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c78
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h50
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub.c2070
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub.h46
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-ssm.c124
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-ssm.h38
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c2232
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.h302
-rw-r--r--xlators/features/bit-rot/src/stub/Makefile.am20
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-common.h178
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-object-version.h30
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c796
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h36
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h117
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.c3590
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.h515
-rw-r--r--xlators/features/changelog/Makefile.am3
-rw-r--r--xlators/features/changelog/lib/Makefile.am (renamed from xlators/encryption/rot-13/Makefile.am)2
-rw-r--r--xlators/features/changelog/lib/examples/c/get-changes-multi.c90
-rw-r--r--xlators/features/changelog/lib/examples/c/get-changes.c93
-rw-r--r--xlators/features/changelog/lib/examples/c/get-history.c116
-rwxr-xr-xxlators/features/changelog/lib/examples/python/changes.py34
-rw-r--r--xlators/features/changelog/lib/examples/python/libgfchangelog.py71
-rw-r--r--xlators/features/changelog/lib/src/Makefile.am35
-rw-r--r--xlators/features/changelog/lib/src/changelog-lib-messages.h74
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-api.c224
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.c170
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.h255
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-journal-handler.c1029
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-journal.h116
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-reborp.c413
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-rpc.c98
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-rpc.h28
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog.c652
-rw-r--r--xlators/features/changelog/lib/src/gf-history-changelog.c1020
-rw-r--r--xlators/features/changelog/src/Makefile.am29
-rw-r--r--xlators/features/changelog/src/changelog-barrier.c131
-rw-r--r--xlators/features/changelog/src/changelog-encoders.c232
-rw-r--r--xlators/features/changelog/src/changelog-encoders.h50
-rw-r--r--xlators/features/changelog/src/changelog-ev-handle.c412
-rw-r--r--xlators/features/changelog/src/changelog-ev-handle.h136
-rw-r--r--xlators/features/changelog/src/changelog-helpers.c1977
-rw-r--r--xlators/features/changelog/src/changelog-helpers.h716
-rw-r--r--xlators/features/changelog/src/changelog-mem-types.h34
-rw-r--r--xlators/features/changelog/src/changelog-messages.h172
-rw-r--r--xlators/features/changelog/src/changelog-misc.h131
-rw-r--r--xlators/features/changelog/src/changelog-rpc-common.c359
-rw-r--r--xlators/features/changelog/src/changelog-rpc-common.h85
-rw-r--r--xlators/features/changelog/src/changelog-rpc.c440
-rw-r--r--xlators/features/changelog/src/changelog-rpc.h31
-rw-r--r--xlators/features/changelog/src/changelog-rt.c66
-rw-r--r--xlators/features/changelog/src/changelog-rt.h33
-rw-r--r--xlators/features/changelog/src/changelog.c2989
-rw-r--r--xlators/features/cloudsync/Makefile.am3
-rw-r--r--xlators/features/cloudsync/src/Makefile.am46
-rw-r--r--xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.c30
-rw-r--r--xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.h24
-rw-r--r--xlators/features/cloudsync/src/cloudsync-common.c60
-rw-r--r--xlators/features/cloudsync/src/cloudsync-common.h134
-rwxr-xr-xxlators/features/cloudsync/src/cloudsync-fops-c.py324
-rwxr-xr-xxlators/features/cloudsync/src/cloudsync-fops-h.py31
-rw-r--r--xlators/features/cloudsync/src/cloudsync-mem-types.h22
-rw-r--r--xlators/features/cloudsync/src/cloudsync-messages.h16
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/Makefile.am3
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/Makefile.am11
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/Makefile.am3
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/Makefile.am12
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3-mem-types.h19
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c584
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.h50
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym1
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/Makefile.am3
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/Makefile.am12
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/archivestore.h203
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/cvlt-messages.h30
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym1
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt-mem-types.h19
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.c842
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.h84
-rw-r--r--xlators/features/cloudsync/src/cloudsync.c2076
-rw-r--r--xlators/features/cloudsync/src/cloudsync.h123
-rw-r--r--xlators/features/compress/Makefile.am3
-rw-r--r--xlators/features/compress/src/Makefile.am19
-rw-r--r--xlators/features/compress/src/cdc-helper.c527
-rw-r--r--xlators/features/compress/src/cdc-mem-types.h23
-rw-r--r--xlators/features/compress/src/cdc.c348
-rw-r--r--xlators/features/compress/src/cdc.h99
-rw-r--r--xlators/features/filter/src/Makefile.am15
-rw-r--r--xlators/features/filter/src/filter-mem-types.h30
-rw-r--r--xlators/features/filter/src/filter.c1744
-rw-r--r--xlators/features/gfid-access/Makefile.am (renamed from xlators/bindings/python/Makefile.am)0
-rw-r--r--xlators/features/gfid-access/src/Makefile.am16
-rw-r--r--xlators/features/gfid-access/src/gfid-access-mem-types.h22
-rw-r--r--xlators/features/gfid-access/src/gfid-access.c1420
-rw-r--r--xlators/features/gfid-access/src/gfid-access.h107
-rw-r--r--xlators/features/index/Makefile.am3
-rw-r--r--xlators/features/index/src/Makefile.am19
-rw-r--r--xlators/features/index/src/index-mem-types.h23
-rw-r--r--xlators/features/index/src/index-messages.h33
-rw-r--r--xlators/features/index/src/index.c2682
-rw-r--r--xlators/features/index/src/index.h86
-rw-r--r--xlators/features/leases/Makefile.am3
-rw-r--r--xlators/features/leases/src/Makefile.am20
-rw-r--r--xlators/features/leases/src/leases-internal.c1412
-rw-r--r--xlators/features/leases/src/leases-mem-types.h27
-rw-r--r--xlators/features/leases/src/leases-messages.h33
-rw-r--r--xlators/features/leases/src/leases.c1168
-rw-r--r--xlators/features/leases/src/leases.h259
-rw-r--r--xlators/features/locks/src/Makefile.am23
-rw-r--r--xlators/features/locks/src/clear.c460
-rw-r--r--xlators/features/locks/src/clear.h73
-rw-r--r--xlators/features/locks/src/common.c1978
-rw-r--r--xlators/features/locks/src/common.h265
-rw-r--r--xlators/features/locks/src/entrylk.c1517
-rw-r--r--xlators/features/locks/src/inodelk.c1524
-rw-r--r--xlators/features/locks/src/locks-mem-types.h44
-rw-r--r--xlators/features/locks/src/locks.h335
-rw-r--r--xlators/features/locks/src/pl-messages.h29
-rw-r--r--xlators/features/locks/src/posix.c5846
-rw-r--r--xlators/features/locks/src/reservelk.c618
-rw-r--r--xlators/features/locks/tests/unit-test.c118
-rw-r--r--xlators/features/mac-compat/Makefile.am3
-rw-r--r--xlators/features/mac-compat/src/Makefile.am13
-rw-r--r--xlators/features/mac-compat/src/mac-compat.c247
-rw-r--r--xlators/features/marker/Makefile.am3
-rw-r--r--xlators/features/marker/src/Makefile.am24
-rw-r--r--xlators/features/marker/src/marker-common.c57
-rw-r--r--xlators/features/marker/src/marker-common.h19
-rw-r--r--xlators/features/marker/src/marker-mem-types.h28
-rw-r--r--xlators/features/marker/src/marker-quota-helper.c380
-rw-r--r--xlators/features/marker/src/marker-quota-helper.h66
-rw-r--r--xlators/features/marker/src/marker-quota.c2297
-rw-r--r--xlators/features/marker/src/marker-quota.h140
-rw-r--r--xlators/features/marker/src/marker.c3568
-rw-r--r--xlators/features/marker/src/marker.h148
-rw-r--r--xlators/features/metadisp/Makefile.am3
-rw-r--r--xlators/features/metadisp/src/Makefile.am38
-rw-r--r--xlators/features/metadisp/src/backend.c45
-rw-r--r--xlators/features/metadisp/src/fops-tmpl.c10
-rw-r--r--xlators/features/metadisp/src/gen-fops.py160
-rw-r--r--xlators/features/metadisp/src/metadisp-create.c101
-rw-r--r--xlators/features/metadisp/src/metadisp-fops.h51
-rw-r--r--xlators/features/metadisp/src/metadisp-fsync.c54
-rw-r--r--xlators/features/metadisp/src/metadisp-lookup.c90
-rw-r--r--xlators/features/metadisp/src/metadisp-open.c70
-rw-r--r--xlators/features/metadisp/src/metadisp-readdir.c65
-rw-r--r--xlators/features/metadisp/src/metadisp-setattr.c90
-rw-r--r--xlators/features/metadisp/src/metadisp-stat.c124
-rw-r--r--xlators/features/metadisp/src/metadisp-unlink.c160
-rw-r--r--xlators/features/metadisp/src/metadisp.c46
-rw-r--r--xlators/features/metadisp/src/metadisp.h45
-rw-r--r--xlators/features/namespace/Makefile.am3
-rw-r--r--xlators/features/namespace/src/Makefile.am17
-rw-r--r--xlators/features/namespace/src/namespace.c1344
-rw-r--r--xlators/features/namespace/src/namespace.h23
-rw-r--r--xlators/features/path-convertor/Makefile.am3
-rw-r--r--xlators/features/path-convertor/src/Makefile.am14
-rw-r--r--xlators/features/path-convertor/src/path-mem-types.h32
-rw-r--r--xlators/features/path-convertor/src/path.c1239
-rw-r--r--xlators/features/quiesce/src/Makefile.am10
-rw-r--r--xlators/features/quiesce/src/quiesce-mem-types.h27
-rw-r--r--xlators/features/quiesce/src/quiesce-messages.h28
-rw-r--r--xlators/features/quiesce/src/quiesce.c3235
-rw-r--r--xlators/features/quiesce/src/quiesce.h74
-rw-r--r--xlators/features/quota/src/Makefile.am30
-rw-r--r--xlators/features/quota/src/quota-enforcer-client.c503
-rw-r--r--xlators/features/quota/src/quota-mem-types.h39
-rw-r--r--xlators/features/quota/src/quota-messages.h39
-rw-r--r--xlators/features/quota/src/quota.c5867
-rw-r--r--xlators/features/quota/src/quota.h266
-rw-r--r--xlators/features/quota/src/quotad-aggregator.c494
-rw-r--r--xlators/features/quota/src/quotad-aggregator.h38
-rw-r--r--xlators/features/quota/src/quotad-helpers.c107
-rw-r--r--xlators/features/quota/src/quotad-helpers.h24
-rw-r--r--xlators/features/quota/src/quotad.c245
-rw-r--r--xlators/features/read-only/src/Makefile.am20
-rw-r--r--xlators/features/read-only/src/read-only-common.c406
-rw-r--r--xlators/features/read-only/src/read-only-common.h121
-rw-r--r--xlators/features/read-only/src/read-only-mem-types.h20
-rw-r--r--xlators/features/read-only/src/read-only.c374
-rw-r--r--xlators/features/read-only/src/read-only.h37
-rw-r--r--xlators/features/read-only/src/worm-helper.c395
-rw-r--r--xlators/features/read-only/src/worm-helper.h44
-rw-r--r--xlators/features/read-only/src/worm.c722
-rw-r--r--xlators/features/sdfs/Makefile.am3
-rw-r--r--xlators/features/sdfs/src/Makefile.am19
-rw-r--r--xlators/features/sdfs/src/sdfs-messages.h67
-rw-r--r--xlators/features/sdfs/src/sdfs.c1479
-rw-r--r--xlators/features/sdfs/src/sdfs.h49
-rw-r--r--xlators/features/selinux/Makefile.am3
-rw-r--r--xlators/features/selinux/src/Makefile.am20
-rw-r--r--xlators/features/selinux/src/selinux-mem-types.h19
-rw-r--r--xlators/features/selinux/src/selinux-messages.h30
-rw-r--r--xlators/features/selinux/src/selinux.c323
-rw-r--r--xlators/features/selinux/src/selinux.h24
-rw-r--r--xlators/features/shard/Makefile.am3
-rw-r--r--xlators/features/shard/src/Makefile.am17
-rw-r--r--xlators/features/shard/src/shard-mem-types.h24
-rw-r--r--xlators/features/shard/src/shard-messages.h39
-rw-r--r--xlators/features/shard/src/shard.c7382
-rw-r--r--xlators/features/shard/src/shard.h348
-rw-r--r--xlators/features/snapview-client/Makefile.am (renamed from xlators/performance/stat-prefetch/Makefile.am)0
-rw-r--r--xlators/features/snapview-client/src/Makefile.am16
-rw-r--r--xlators/features/snapview-client/src/snapview-client-mem-types.h24
-rw-r--r--xlators/features/snapview-client/src/snapview-client-messages.h71
-rw-r--r--xlators/features/snapview-client/src/snapview-client.c2791
-rw-r--r--xlators/features/snapview-client/src/snapview-client.h101
-rw-r--r--xlators/features/snapview-server/Makefile.am1
-rw-r--r--xlators/features/snapview-server/src/Makefile.am25
-rw-r--r--xlators/features/snapview-server/src/snapview-server-helpers.c715
-rw-r--r--xlators/features/snapview-server/src/snapview-server-mem-types.h25
-rw-r--r--xlators/features/snapview-server/src/snapview-server-messages.h54
-rw-r--r--xlators/features/snapview-server/src/snapview-server-mgmt.c524
-rw-r--r--xlators/features/snapview-server/src/snapview-server.c2720
-rw-r--r--xlators/features/snapview-server/src/snapview-server.h255
-rw-r--r--xlators/features/thin-arbiter/Makefile.am3
-rw-r--r--xlators/features/thin-arbiter/src/Makefile.am22
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter-mem-types.h19
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter-messages.h28
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter.c661
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter.h59
-rw-r--r--xlators/features/trash/src/Makefile.am12
-rw-r--r--xlators/features/trash/src/trash-mem-types.h33
-rw-r--r--xlators/features/trash/src/trash.c3761
-rw-r--r--xlators/features/trash/src/trash.h128
-rw-r--r--xlators/features/upcall/Makefile.am3
-rw-r--r--xlators/features/upcall/src/Makefile.am23
-rw-r--r--xlators/features/upcall/src/upcall-cache-invalidation.h18
-rw-r--r--xlators/features/upcall/src/upcall-internal.c689
-rw-r--r--xlators/features/upcall/src/upcall-mem-types.h23
-rw-r--r--xlators/features/upcall/src/upcall-messages.h29
-rw-r--r--xlators/features/upcall/src/upcall.c2505
-rw-r--r--xlators/features/upcall/src/upcall.h131
-rw-r--r--xlators/features/utime/Makefile.am3
-rw-r--r--xlators/features/utime/src/Makefile.am41
-rw-r--r--xlators/features/utime/src/utime-autogen-fops-tmpl.c28
-rw-r--r--xlators/features/utime/src/utime-autogen-fops-tmpl.h22
-rwxr-xr-xxlators/features/utime/src/utime-gen-fops-c.py147
-rwxr-xr-xxlators/features/utime/src/utime-gen-fops-h.py35
-rw-r--r--xlators/features/utime/src/utime-helpers.c110
-rw-r--r--xlators/features/utime/src/utime-helpers.h25
-rw-r--r--xlators/features/utime/src/utime-mem-types.h21
-rw-r--r--xlators/features/utime/src/utime-messages.h29
-rw-r--r--xlators/features/utime/src/utime.c392
-rw-r--r--xlators/features/utime/src/utime.h23
-rw-r--r--xlators/lib/src/libxlator.c490
-rw-r--r--xlators/lib/src/libxlator.h147
-rw-r--r--xlators/meta/Makefile.am2
-rw-r--r--xlators/meta/src/Makefile.am48
-rw-r--r--xlators/meta/src/active-link.c34
-rw-r--r--xlators/meta/src/cmdline-file.c39
-rw-r--r--xlators/meta/src/frames-file.c107
-rw-r--r--xlators/meta/src/graph-dir.c98
-rw-r--r--xlators/meta/src/graphs-dir.c67
-rw-r--r--xlators/meta/src/history-file.c44
-rw-r--r--xlators/meta/src/logfile-link.c34
-rw-r--r--xlators/meta/src/logging-dir.c44
-rw-r--r--xlators/meta/src/loglevel-file.c50
-rw-r--r--xlators/meta/src/mallinfo-file.c36
-rw-r--r--xlators/meta/src/measure-file.c49
-rw-r--r--xlators/meta/src/meminfo-file.c44
-rw-r--r--xlators/meta/src/meta-defaults.c655
-rw-r--r--xlators/meta/src/meta-helpers.c332
-rw-r--r--xlators/meta/src/meta-hooks.h48
-rw-r--r--xlators/meta/src/meta-mem-types.h36
-rw-r--r--xlators/meta/src/meta.c1355
-rw-r--r--xlators/meta/src/meta.h162
-rw-r--r--xlators/meta/src/misc.c67
-rw-r--r--xlators/meta/src/misc.h31
-rw-r--r--xlators/meta/src/name-file.c44
-rw-r--r--xlators/meta/src/option-file.c45
-rw-r--r--xlators/meta/src/options-dir.c65
-rw-r--r--xlators/meta/src/private-file.c44
-rw-r--r--xlators/meta/src/process_uuid-file.c37
-rw-r--r--xlators/meta/src/profile-file.c44
-rw-r--r--xlators/meta/src/root-dir.c77
-rw-r--r--xlators/meta/src/subvolume-link.c56
-rw-r--r--xlators/meta/src/subvolumes-dir.c62
-rw-r--r--xlators/meta/src/top-link.c40
-rw-r--r--xlators/meta/src/tree.c179
-rw-r--r--xlators/meta/src/tree.h35
-rw-r--r--xlators/meta/src/type-file.c44
-rw-r--r--xlators/meta/src/version-file.c37
-rw-r--r--xlators/meta/src/view-dir.c33
-rw-r--r--xlators/meta/src/view.c258
-rw-r--r--xlators/meta/src/view.h32
-rw-r--r--xlators/meta/src/volfile-file.c79
-rw-r--r--xlators/meta/src/xlator-dir.c97
-rw-r--r--xlators/mgmt/glusterd/src/Makefile.am85
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitd-svc.c206
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitd-svc.h40
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitrot.c822
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-brick-ops.c2796
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-helper.c21
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-helper.h21
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-mgmt.c191
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-mgmt.h53
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-errno.h33
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-ganesha.c927
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.c6782
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.h52
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.c235
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.h51
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c478
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.h47
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handler.c8741
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handshake.c2667
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.c641
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.h88
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.c870
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.h57
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-log-ops.c290
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mem-types.h106
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-messages.h451
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c1144
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.c3114
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.h97
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mountbroker.c721
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mountbroker.h37
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-nfs-svc.c228
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-nfs-svc.h27
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.c11618
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.h360
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-peer-utils.c1058
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-peer-utils.h82
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.c909
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.h83
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c152
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-proc-mgmt.h44
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quota.c2259
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quota.h17
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quotad-svc.c217
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quotad-svc.h31
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rcu.h36
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rebalance.c1720
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-replace-brick.c716
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-reset-brick.c376
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rpc-ops.c2448
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-scrub-svc.c207
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-scrub-svc.h45
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-server-quorum.c486
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-server-quorum.h46
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.c153
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.h42
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc.c796
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc.h45
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.c2032
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.h267
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.c75
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.h32
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc.c478
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc.h42
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c4290
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot-utils.h169
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot.c10087
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-statedump.c243
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-statedump.h18
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.c5789
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.h247
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-helper.c1047
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-helper.h72
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c536
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-mgmt.h112
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.c2043
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.h93
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-tierd-svc-helper.c207
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.c16075
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.h887
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.c7346
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.h348
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-ops.c3033
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c3146
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.c2475
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.h1476
-rw-r--r--xlators/mgmt/glusterd/src/glusterd3_1-mops.c1590
-rw-r--r--xlators/mount/fuse/src/Makefile.am28
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.c8983
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.h712
-rw-r--r--xlators/mount/fuse/src/fuse-helpers.c830
-rw-r--r--xlators/mount/fuse/src/fuse-mem-types.h42
-rw-r--r--xlators/mount/fuse/src/fuse-resolve.c1024
-rw-r--r--xlators/mount/fuse/utils/Makefile.am9
-rwxr-xr-xxlators/mount/fuse/utils/mount.glusterfs.in891
-rwxr-xr-xxlators/mount/fuse/utils/mount_glusterfs.in625
-rw-r--r--xlators/nfs/lib/src/auth-null.c72
-rw-r--r--xlators/nfs/lib/src/auth-unix.c92
-rw-r--r--xlators/nfs/lib/src/msg-nfs3.c554
-rw-r--r--xlators/nfs/lib/src/msg-nfs3.h186
-rw-r--r--xlators/nfs/lib/src/rpc-socket.c360
-rw-r--r--xlators/nfs/lib/src/rpc-socket.h65
-rw-r--r--xlators/nfs/lib/src/rpcsvc-auth.c400
-rw-r--r--xlators/nfs/lib/src/rpcsvc.c2803
-rw-r--r--xlators/nfs/lib/src/rpcsvc.h732
-rw-r--r--xlators/nfs/lib/src/xdr-common.h48
-rw-r--r--xlators/nfs/lib/src/xdr-nfs3.c1897
-rw-r--r--xlators/nfs/lib/src/xdr-nfs3.h1206
-rw-r--r--xlators/nfs/lib/src/xdr-rpc.c229
-rw-r--r--xlators/nfs/lib/src/xdr-rpc.h82
-rw-r--r--xlators/nfs/server/src/Makefile.am41
-rw-r--r--xlators/nfs/server/src/acl3.c933
-rw-r--r--xlators/nfs/server/src/acl3.h42
-rw-r--r--xlators/nfs/server/src/auth-cache.c496
-rw-r--r--xlators/nfs/server/src/auth-cache.h52
-rw-r--r--xlators/nfs/server/src/exports.c1484
-rw-r--r--xlators/nfs/server/src/exports.h93
-rw-r--r--xlators/nfs/server/src/mount3-auth.c642
-rw-r--r--xlators/nfs/server/src/mount3-auth.h59
-rw-r--r--xlators/nfs/server/src/mount3.c5008
-rw-r--r--xlators/nfs/server/src/mount3.h213
-rw-r--r--xlators/nfs/server/src/mount3udp_svc.c238
-rw-r--r--xlators/nfs/server/src/netgroups.c1161
-rw-r--r--xlators/nfs/server/src/netgroups.h53
-rw-r--r--xlators/nfs/server/src/nfs-common.c605
-rw-r--r--xlators/nfs/server/src/nfs-common.h78
-rw-r--r--xlators/nfs/server/src/nfs-fops.c2204
-rw-r--r--xlators/nfs/server/src/nfs-fops.h324
-rw-r--r--xlators/nfs/server/src/nfs-generics.c354
-rw-r--r--xlators/nfs/server/src/nfs-generics.h171
-rw-r--r--xlators/nfs/server/src/nfs-inodes.c819
-rw-r--r--xlators/nfs/server/src/nfs-inodes.h84
-rw-r--r--xlators/nfs/server/src/nfs-mem-types.h73
-rw-r--r--xlators/nfs/server/src/nfs-messages.h102
-rw-r--r--xlators/nfs/server/src/nfs.c2524
-rw-r--r--xlators/nfs/server/src/nfs.h168
-rw-r--r--xlators/nfs/server/src/nfs3-fh.c352
-rw-r--r--xlators/nfs/server/src/nfs3-fh.h139
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.c5460
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.h304
-rw-r--r--xlators/nfs/server/src/nfs3.c8840
-rw-r--r--xlators/nfs/server/src/nfs3.h389
-rw-r--r--xlators/nfs/server/src/nfsserver.sym12
-rw-r--r--xlators/nfs/server/src/nlm4.c2786
-rw-r--r--xlators/nfs/server/src/nlm4.h111
-rw-r--r--xlators/nfs/server/src/nlmcbk_svc.c134
-rw-r--r--xlators/performance/Makefile.am3
-rw-r--r--xlators/performance/io-cache/src/Makefile.am11
-rw-r--r--xlators/performance/io-cache/src/io-cache-messages.h69
-rw-r--r--xlators/performance/io-cache/src/io-cache.c3326
-rw-r--r--xlators/performance/io-cache/src/io-cache.h440
-rw-r--r--xlators/performance/io-cache/src/ioc-inode.c319
-rw-r--r--xlators/performance/io-cache/src/ioc-mem-types.h47
-rw-r--r--xlators/performance/io-cache/src/page.c1515
-rw-r--r--xlators/performance/io-threads/src/Makefile.am10
-rw-r--r--xlators/performance/io-threads/src/io-threads-messages.h41
-rw-r--r--xlators/performance/io-threads/src/io-threads.c3110
-rw-r--r--xlators/performance/io-threads/src/io-threads.h113
-rw-r--r--xlators/performance/io-threads/src/iot-mem-types.h30
-rw-r--r--xlators/performance/md-cache/Makefile.am1
-rw-r--r--xlators/performance/md-cache/src/Makefile.am29
-rw-r--r--xlators/performance/md-cache/src/md-cache-mem-types.h23
-rw-r--r--xlators/performance/md-cache/src/md-cache-messages.h29
-rw-r--r--xlators/performance/md-cache/src/md-cache.c4020
-rw-r--r--xlators/performance/nl-cache/Makefile.am3
-rw-r--r--xlators/performance/nl-cache/src/Makefile.am12
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-helper.c1201
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-mem-types.h27
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-messages.h29
-rw-r--r--xlators/performance/nl-cache/src/nl-cache.c840
-rw-r--r--xlators/performance/nl-cache/src/nl-cache.h175
-rw-r--r--xlators/performance/open-behind/Makefile.am1
-rw-r--r--xlators/performance/open-behind/src/Makefile.am16
-rw-r--r--xlators/performance/open-behind/src/open-behind-mem-types.h22
-rw-r--r--xlators/performance/open-behind/src/open-behind-messages.h32
-rw-r--r--xlators/performance/open-behind/src/open-behind.c1101
-rw-r--r--xlators/performance/quick-read/src/Makefile.am10
-rw-r--r--xlators/performance/quick-read/src/quick-read-mem-types.h38
-rw-r--r--xlators/performance/quick-read/src/quick-read-messages.h31
-rw-r--r--xlators/performance/quick-read/src/quick-read.c3711
-rw-r--r--xlators/performance/quick-read/src/quick-read.h135
-rw-r--r--xlators/performance/read-ahead/src/Makefile.am10
-rw-r--r--xlators/performance/read-ahead/src/page.c871
-rw-r--r--xlators/performance/read-ahead/src/read-ahead-mem-types.h40
-rw-r--r--xlators/performance/read-ahead/src/read-ahead-messages.h31
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.c1825
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.h200
-rw-r--r--xlators/performance/readdir-ahead/Makefile.am3
-rw-r--r--xlators/performance/readdir-ahead/src/Makefile.am18
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h24
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-messages.h30
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.c1382
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.h98
-rw-r--r--xlators/performance/stat-prefetch/src/Makefile.am14
-rw-r--r--xlators/performance/stat-prefetch/src/stat-prefetch-mem-types.h37
-rw-r--r--xlators/performance/stat-prefetch/src/stat-prefetch.c3727
-rw-r--r--xlators/performance/stat-prefetch/src/stat-prefetch.h106
-rw-r--r--xlators/performance/symlink-cache/Makefile.am3
-rw-r--r--xlators/performance/symlink-cache/src/Makefile.am12
-rw-r--r--xlators/performance/symlink-cache/src/symlink-cache.c407
-rw-r--r--xlators/performance/write-behind/src/Makefile.am10
-rw-r--r--xlators/performance/write-behind/src/write-behind-mem-types.h37
-rw-r--r--xlators/performance/write-behind/src/write-behind-messages.h31
-rw-r--r--xlators/performance/write-behind/src/write-behind.c5217
-rw-r--r--xlators/playground/Makefile.am2
-rw-r--r--xlators/playground/rot-13/Makefile.am (renamed from xlators/cluster/map/Makefile.am)0
-rw-r--r--xlators/playground/rot-13/src/Makefile.am (renamed from xlators/encryption/rot-13/src/Makefile.am)8
-rw-r--r--xlators/playground/rot-13/src/rot-13.c166
-rw-r--r--xlators/playground/rot-13/src/rot-13.h18
-rw-r--r--xlators/playground/template/Makefile.am2
-rw-r--r--xlators/playground/template/src/Makefile.am17
-rw-r--r--xlators/playground/template/src/template.c186
-rw-r--r--xlators/playground/template/src/template.h43
-rw-r--r--xlators/protocol/auth/addr/src/Makefile.am11
-rw-r--r--xlators/protocol/auth/addr/src/addr.c522
-rw-r--r--xlators/protocol/auth/login/src/Makefile.am10
-rw-r--r--xlators/protocol/auth/login/src/login.c296
-rw-r--r--xlators/protocol/client/src/Makefile.am18
-rw-r--r--xlators/protocol/client/src/client-callback.c330
-rw-r--r--xlators/protocol/client/src/client-common.c3589
-rw-r--r--xlators/protocol/client/src/client-common.h630
-rw-r--r--xlators/protocol/client/src/client-handshake.c2233
-rw-r--r--xlators/protocol/client/src/client-helpers.c966
-rw-r--r--xlators/protocol/client/src/client-lk.c1133
-rw-r--r--xlators/protocol/client/src/client-mem-types.h38
-rw-r--r--xlators/protocol/client/src/client-messages.h174
-rw-r--r--xlators/protocol/client/src/client-rpc-fops.c6079
-rw-r--r--xlators/protocol/client/src/client-rpc-fops_v2.c6177
-rw-r--r--xlators/protocol/client/src/client.c4202
-rw-r--r--xlators/protocol/client/src/client.h502
-rw-r--r--xlators/protocol/client/src/client3_1-fops.c4762
-rw-r--r--xlators/protocol/legacy/Makefile.am3
-rw-r--r--xlators/protocol/legacy/client/Makefile.am3
-rw-r--r--xlators/protocol/legacy/client/src/Makefile.am21
-rw-r--r--xlators/protocol/legacy/client/src/client-mem-types.h43
-rw-r--r--xlators/protocol/legacy/client/src/client-protocol.c6683
-rw-r--r--xlators/protocol/legacy/client/src/client-protocol.h178
-rw-r--r--xlators/protocol/legacy/client/src/saved-frames.c196
-rw-r--r--xlators/protocol/legacy/client/src/saved-frames.h79
-rw-r--r--xlators/protocol/legacy/lib/Makefile.am3
-rw-r--r--xlators/protocol/legacy/lib/src/Makefile.am14
-rw-r--r--xlators/protocol/legacy/lib/src/protocol.c108
-rw-r--r--xlators/protocol/legacy/lib/src/protocol.h1118
-rw-r--r--xlators/protocol/legacy/lib/src/transport.c422
-rw-r--r--xlators/protocol/legacy/lib/src/transport.h106
-rw-r--r--xlators/protocol/legacy/server/Makefile.am3
-rw-r--r--xlators/protocol/legacy/server/src/Makefile.am27
-rw-r--r--xlators/protocol/legacy/server/src/authenticate.c249
-rw-r--r--xlators/protocol/legacy/server/src/authenticate.h60
-rw-r--r--xlators/protocol/legacy/server/src/server-helpers.c622
-rw-r--r--xlators/protocol/legacy/server/src/server-helpers.h48
-rw-r--r--xlators/protocol/legacy/server/src/server-mem-types.h39
-rw-r--r--xlators/protocol/legacy/server/src/server-protocol.c6587
-rw-r--r--xlators/protocol/legacy/server/src/server-protocol.h191
-rw-r--r--xlators/protocol/legacy/server/src/server-resolve.c658
-rw-r--r--xlators/protocol/legacy/transport/Makefile.am3
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/Makefile.am19
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/ib-verbs-mem-types.h39
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/ib-verbs.c2625
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/ib-verbs.h220
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/name.c712
-rw-r--r--xlators/protocol/legacy/transport/ib-verbs/src/name.h47
-rw-r--r--xlators/protocol/legacy/transport/socket/Makefile.am1
-rw-r--r--xlators/protocol/legacy/transport/socket/src/Makefile.am19
-rw-r--r--xlators/protocol/legacy/transport/socket/src/name.c740
-rw-r--r--xlators/protocol/legacy/transport/socket/src/name.h44
-rw-r--r--xlators/protocol/legacy/transport/socket/src/socket-mem-types.h36
-rw-r--r--xlators/protocol/legacy/transport/socket/src/socket.c1625
-rw-r--r--xlators/protocol/legacy/transport/socket/src/socket.h129
-rw-r--r--xlators/protocol/server/src/Makefile.am31
-rw-r--r--xlators/protocol/server/src/authenticate.c395
-rw-r--r--xlators/protocol/server/src/authenticate.h58
-rw-r--r--xlators/protocol/server/src/server-common.c842
-rw-r--r--xlators/protocol/server/src/server-common.h199
-rw-r--r--xlators/protocol/server/src/server-handshake.c1314
-rw-r--r--xlators/protocol/server/src/server-helpers.c2352
-rw-r--r--xlators/protocol/server/src/server-helpers.h129
-rw-r--r--xlators/protocol/server/src/server-mem-types.h43
-rw-r--r--xlators/protocol/server/src/server-messages.h168
-rw-r--r--xlators/protocol/server/src/server-resolve.c994
-rw-r--r--xlators/protocol/server/src/server-rpc-fops.c6084
-rw-r--r--xlators/protocol/server/src/server-rpc-fops_v2.c6031
-rw-r--r--xlators/protocol/server/src/server.c2370
-rw-r--r--xlators/protocol/server/src/server.h332
-rw-r--r--xlators/protocol/server/src/server3_1-fops.c5182
-rw-r--r--xlators/storage/Makefile.am2
-rw-r--r--xlators/storage/bdb/Makefile.am3
-rw-r--r--xlators/storage/bdb/src/Makefile.am18
-rw-r--r--xlators/storage/bdb/src/bctx.c341
-rw-r--r--xlators/storage/bdb/src/bdb-ll.c1464
-rw-r--r--xlators/storage/bdb/src/bdb-mem-types.h42
-rw-r--r--xlators/storage/bdb/src/bdb.c3603
-rw-r--r--xlators/storage/bdb/src/bdb.h530
-rw-r--r--xlators/storage/posix/src/Makefile.am25
-rw-r--r--xlators/storage/posix/src/posix-aio.c556
-rw-r--r--xlators/storage/posix/src/posix-aio.h34
-rw-r--r--xlators/storage/posix/src/posix-common.c1524
-rw-r--r--xlators/storage/posix/src/posix-entry-ops.c2496
-rw-r--r--xlators/storage/posix/src/posix-gfid-path.c243
-rw-r--r--xlators/storage/posix/src/posix-gfid-path.h28
-rw-r--r--xlators/storage/posix/src/posix-handle.c1020
-rw-r--r--xlators/storage/posix/src/posix-handle.h221
-rw-r--r--xlators/storage/posix/src/posix-helpers.c3666
-rw-r--r--xlators/storage/posix/src/posix-inode-fd-ops.c6004
-rw-r--r--xlators/storage/posix/src/posix-inode-handle.h118
-rw-r--r--xlators/storage/posix/src/posix-mem-types.h39
-rw-r--r--xlators/storage/posix/src/posix-messages.h74
-rw-r--r--xlators/storage/posix/src/posix-metadata-disk.h31
-rw-r--r--xlators/storage/posix/src/posix-metadata.c916
-rw-r--r--xlators/storage/posix/src/posix-metadata.h71
-rw-r--r--xlators/storage/posix/src/posix.c4603
-rw-r--r--xlators/storage/posix/src/posix.h702
-rw-r--r--xlators/system/Makefile.am1
-rw-r--r--xlators/system/posix-acl/Makefile.am1
-rw-r--r--xlators/system/posix-acl/src/Makefile.am28
-rw-r--r--xlators/system/posix-acl/src/posix-acl-mem-types.h23
-rw-r--r--xlators/system/posix-acl/src/posix-acl-messages.h28
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.c173
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.h29
-rw-r--r--xlators/system/posix-acl/src/posix-acl.c2246
-rw-r--r--xlators/system/posix-acl/src/posix-acl.h35
-rw-r--r--xlators/xlator.sym1
778 files changed, 425695 insertions, 173900 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am
index 4c94f5e44c1..ef20cbb64fa 100644
--- a/xlators/Makefile.am
+++ b/xlators/Makefile.am
@@ -1,3 +1,13 @@
-SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt
+if BUILD_GNFS
+ GNFS_DIR = nfs
+endif
+
+DIST_SUBDIRS = cluster storage protocol performance debug features \
+ mount nfs mgmt system playground meta
+
+SUBDIRS = cluster storage protocol performance debug features \
+ mount ${GNFS_DIR} mgmt system playground meta
+
+EXTRA_DIST = xlator.sym
CLEANFILES =
diff --git a/xlators/bindings/Makefile.am b/xlators/bindings/Makefile.am
deleted file mode 100644
index f7766580257..00000000000
--- a/xlators/bindings/Makefile.am
+++ /dev/null
@@ -1 +0,0 @@
-SUBDIRS = $(BINDINGS_SUBDIRS)
diff --git a/xlators/bindings/python/src/Makefile.am b/xlators/bindings/python/src/Makefile.am
deleted file mode 100644
index c0b9141c667..00000000000
--- a/xlators/bindings/python/src/Makefile.am
+++ /dev/null
@@ -1,19 +0,0 @@
-
-xlator_PROGRAMS = python.so
-
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/bindings
-
-python_PYTHON = gluster.py glustertypes.py glusterstack.py
-
-pythondir = $(xlatordir)/python
-
-python_so_SOURCES = python.c
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \
- $(PYTHON_CPPLAGS) -DGLUSTER_PYTHON_PATH=\"$(pythondir)\"
-
-AM_LDFLAGS = $(PYTHON_LDFLAGS)
-
-CLEANFILES =
-
diff --git a/xlators/bindings/python/src/gluster.py b/xlators/bindings/python/src/gluster.py
deleted file mode 100644
index ee0eb131011..00000000000
--- a/xlators/bindings/python/src/gluster.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-# This file is part of GlusterFS.
-#
-# GlusterFS is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published
-# by the Free Software Foundation; either version 3 of the License,
-# or (at your option) any later version.
-#
-# GlusterFS is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see
-# <http://www.gnu.org/licenses/>.
-from ctypes import *
-from glustertypes import *
-from glusterstack import *
-import sys
-import inspect
-
-libglusterfs = CDLL("libglusterfs.so")
-_gf_log = libglusterfs._gf_log
-_gf_log.restype = c_int32
-_gf_log.argtypes = [c_char_p, c_char_p, c_char_p, c_int32, c_int, c_char_p]
-
-gf_log_loglevel = c_int.in_dll(libglusterfs, "gf_log_loglevel")
-
-GF_LOG_NONE = 0
-GF_LOG_CRITICAL = 1
-GF_LOG_ERROR = 2
-GF_LOG_WARNING = 3
-GF_LOG_DEBUG = 4
-
-def gf_log(module, level, fmt, *params):
- if level <= gf_log_loglevel:
- frame = sys._getframe(1)
- _gf_log(module, frame.f_code.co_filename, frame.f_code.co_name,
- frame.f_lineno, level, fmt, *params)
-
-class ComplexTranslator(object):
- def __init__(self, xlator):
- self.xlator = xlator_t.from_address(xlator)
-
- def __getattr__(self, item):
- return getattr(self.xlator, item)
diff --git a/xlators/bindings/python/src/glusterstack.py b/xlators/bindings/python/src/glusterstack.py
deleted file mode 100644
index ba24c81652e..00000000000
--- a/xlators/bindings/python/src/glusterstack.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-# This file is part of GlusterFS.
-#
-# GlusterFS is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published
-# by the Free Software Foundation; either version 3 of the License,
-# or (at your option) any later version.
-#
-# GlusterFS is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see
-# <http://www.gnu.org/licenses/>.
-from ctypes import *
-from glustertypes import *
-
-libc = CDLL("libc.so.6")
-calloc = libc.calloc
-calloc.argtypes = [c_int, c_int]
-calloc.restype = c_void_p
-
-# TODO: Can these be done in C somehow?
-def stack_wind(frame, rfn, obj, fn, *params):
- """Frame is a frame object"""
- _new = cast(calloc(1, sizeof(call_frame_t)), POINTER(call_frame_t))
- _new[0].root = frame.root
- _new[0].next = frame.root[0].frames.next
- _new[0].prev = pointer(frame.root[0].frames)
- if frame.root[0].frames.next:
- frame.root[0].frames.next[0].prev = _new
- frame.root[0].frames.next = _new
- _new[0].this = obj
- # TODO: Type checking like tmp_cbk?
- _new[0].ret = rfn
- _new[0].parent = pointer(frame)
- _new[0].cookie = cast(_new, c_void_p)
- # TODO: Initialize lock
- #_new.lock.init()
- frame.ref_count += 1
- fn(_new, obj, *params)
-
-def stack_unwind(frame, *params):
- """Frame is a frame object"""
- fn = frame[0].ret
- parent = frame[0].parent[0]
- parent.ref_count -= 1
-
- op_ret = params[0]
- op_err = params[1]
- params = params[2:]
- fn(parent, call_frame_t.from_address(frame[0].cookie), parent.this,
- op_ret, op_err, *params)
diff --git a/xlators/bindings/python/src/glustertypes.py b/xlators/bindings/python/src/glustertypes.py
deleted file mode 100644
index e9069d07c72..00000000000
--- a/xlators/bindings/python/src/glustertypes.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-# This file is part of GlusterFS.
-#
-# GlusterFS is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published
-# by the Free Software Foundation; either version 3 of the License,
-# or (at your option) any later version.
-#
-# GlusterFS is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see
-# <http://www.gnu.org/licenses/>.
-from ctypes import *
-import collections
-
-#
-# Forward declaration of some gluster types
-#
-class call_frame_t(Structure):
- pass
-
-class call_ctx_t(Structure):
- pass
-
-class call_pool_t(Structure):
- pass
-
-class xlator_t(Structure):
- def _getFirstChild(self):
- return self.children[0].xlator
- firstChild = property(_getFirstChild)
-
-class xlator_list_t(Structure):
- pass
-
-class xlator_fops(Structure):
- pass
-
-class xlator_mops(Structure):
- pass
-
-class glusterfs_ctx_t(Structure):
- pass
-
-class list_head(Structure):
- pass
-
-class dict_t(Structure):
- pass
-
-class inode_table_t(Structure):
- pass
-
-class fd_t(Structure):
- pass
-
-class iovec(Structure):
- _fields_ = [
- ("iov_base", c_void_p),
- ("iov_len", c_size_t),
- ]
-
- def __init__(self, s):
- self.iov_base = cast(c_char_p(s), c_void_p)
- self.iov_len = len(s)
-
- def getBytes(self):
- return string_at(self.iov_base, self.iov_len)
-
-# This is a pthread_spinlock_t
-# TODO: what happens to volatile-ness?
-gf_lock_t = c_int
-
-uid_t = c_uint32
-gid_t = c_uint32
-pid_t = c_int32
-
-off_t = c_int64
-
-#
-# Function pointer types
-#
-ret_fn_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(call_frame_t),
- POINTER(xlator_t), c_int32, c_int32)
-
-fini_fn_t = CFUNCTYPE(None, POINTER(xlator_t))
-init_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t))
-event_notify_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t), c_int32, c_void_p)
-
-list_head._fields_ = [
- ("next", POINTER(list_head)),
- ("prev", POINTER(list_head)),
- ]
-
-call_frame_t._fields_ = [
- ("root", POINTER(call_ctx_t)),
- ("parent", POINTER(call_frame_t)),
- ("next", POINTER(call_frame_t)),
- ("prev", POINTER(call_frame_t)),
- ("local", c_void_p),
- ("this", POINTER(xlator_t)),
- ("ret", ret_fn_t),
- ("ref_count", c_int32),
- ("lock", gf_lock_t),
- ("cookie", c_void_p),
- ("op", c_int32),
- ("type", c_int8),
- ]
-
-call_ctx_t._fields_ = [
- ("all_frames", list_head),
- ("trans", c_void_p),
- ("pool", call_pool_t),
- ("unique", c_uint64),
- ("state", c_void_p),
- ("uid", uid_t),
- ("gid", gid_t),
- ("pid", pid_t),
- ("frames", call_frame_t),
- ("req_refs", POINTER(dict_t)),
- ("rsp_refs", POINTER(dict_t)),
- ]
-
-xlator_t._fields_ = [
- ("name", c_char_p),
- ("type", c_char_p),
- ("next", POINTER(xlator_t)),
- ("prev", POINTER(xlator_t)),
- ("parent", POINTER(xlator_t)),
- ("children", POINTER(xlator_list_t)),
- ("fops", POINTER(xlator_fops)),
- ("mops", POINTER(xlator_mops)),
- ("fini", fini_fn_t),
- ("init", init_fn_t),
- ("notify", event_notify_fn_t),
- ("options", POINTER(dict_t)),
- ("ctx", POINTER(glusterfs_ctx_t)),
- ("itable", POINTER(inode_table_t)),
- ("ready", c_char),
- ("private", c_void_p),
- ]
-
-xlator_list_t._fields_ = [
- ("xlator", POINTER(xlator_t)),
- ("next", POINTER(xlator_list_t)),
- ]
-
-fop_functions = collections.defaultdict(lambda: c_void_p)
-fop_function_names = ['lookup', 'forget', 'stat', 'fstat', 'chmod', 'fchmod',
- 'chown', 'fchown', 'truncate', 'ftruncate', 'utimens', 'access',
- 'readlink', 'mknod', 'mkdir', 'unlink', 'rmdir', 'symlink',
- 'rename', 'link', 'create', 'open', 'readv', 'writev', 'flush',
- 'close', 'fsync', 'opendir', 'readdir', 'closedir', 'fsyncdir',
- 'statfs', 'setxattr', 'getxattr', 'removexattr', 'lk', 'writedir',
- # TODO: Call backs?
- ]
-
-fop_writev_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(xlator_t),
- POINTER(fd_t), POINTER(iovec), c_int32,
- off_t)
-
-fop_functions['writev'] = fop_writev_t
-xlator_fops._fields_ = [(f, fop_functions[f]) for f in fop_function_names]
diff --git a/xlators/bindings/python/src/python.c b/xlators/bindings/python/src/python.c
deleted file mode 100644
index 3310a211513..00000000000
--- a/xlators/bindings/python/src/python.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- Copyright (c) 2007-2010 Chris AtLee <chris@atlee.ca>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <Python.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-#include "defaults.h"
-
-typedef struct
-{
- char *scriptname;
- PyObject *pXlator;
- PyObject *pScriptModule;
- PyObject *pGlusterModule;
- PyThreadState *pInterp;
-
- PyObject *pFrameType, *pVectorType, *pFdType;
-} python_private_t;
-
-int32_t
-python_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t offset)
-{
- python_private_t *priv = (python_private_t *)this->private;
- gf_log("python", GF_LOG_DEBUG, "In writev");
- if (PyObject_HasAttrString(priv->pXlator, "writev"))
- {
-
- PyObject *retval = PyObject_CallMethod(priv->pXlator, "writev",
- "O O O i l",
- PyObject_CallMethod(priv->pFrameType, "from_address", "O&", PyLong_FromVoidPtr, frame),
- PyObject_CallMethod(priv->pFdType, "from_address", "O&", PyLong_FromVoidPtr, fd),
- PyObject_CallMethod(priv->pVectorType, "from_address", "O&", PyLong_FromVoidPtr, vector),
- count,
- offset);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- }
- Py_XDECREF(retval);
- }
- else
- {
- return default_writev(frame, this, fd, vector, count, offset);
- }
- return 0;
-}
-
-struct xlator_fops fops = {
- .writev = python_writev
-};
-
-static PyObject *
-AnonModule_FromFile (const char* fname)
-{
- // Get the builtins
- PyThreadState* pThread = PyThreadState_Get();
- PyObject *pBuiltins = pThread->interp->builtins;
-
- if (PyErr_Occurred())
- {
- PyErr_Print();
- return NULL;
- }
-
- // Create a new dictionary for running code in
- PyObject *pModuleDict = PyDict_New();
- PyDict_SetItemString(pModuleDict, "__builtins__", pBuiltins);
- Py_INCREF(pBuiltins);
-
- // Run the file in the new context
- FILE* fp = fopen(fname, "r");
- PyRun_File(fp, fname, Py_file_input, pModuleDict, pModuleDict);
- fclose(fp);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- return NULL;
- }
-
- // Create an object to hold the new context
- PyRun_String("class ModuleWrapper(object):\n\tpass\n", Py_single_input, pModuleDict, pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- return NULL;
- }
- PyObject *pModule = PyRun_String("ModuleWrapper()", Py_eval_input, pModuleDict, pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- Py_XDECREF(pModule);
- return NULL;
- }
-
- // Set the new context's dictionary to the one we used to run the code
- // inside
- PyObject_SetAttrString(pModule, "__dict__", pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- Py_DECREF(pModule);
- return NULL;
- }
-
- return pModule;
-}
-
-int32_t
-init (xlator_t *this)
-{
- // This is ok to call more than once per process
- Py_InitializeEx(0);
-
- if (!this->children) {
- gf_log ("python", GF_LOG_ERROR,
- "FATAL: python should have exactly one child");
- return -1;
- }
-
- python_private_t *priv = CALLOC (sizeof (python_private_t), 1);
- ERR_ABORT (priv);
-
- data_t *scriptname = dict_get (this->options, "scriptname");
- if (scriptname) {
- priv->scriptname = data_to_str(scriptname);
- } else {
- gf_log("python", GF_LOG_ERROR,
- "FATAL: python requires the scriptname parameter");
- return -1;
- }
-
- priv->pInterp = Py_NewInterpreter();
-
- // Adjust python's path
- PyObject *syspath = PySys_GetObject("path");
- PyObject *path = PyString_FromString(GLUSTER_PYTHON_PATH);
- PyList_Append(syspath, path);
- Py_DECREF(path);
-
- gf_log("python", GF_LOG_DEBUG,
- "Loading gluster module");
-
- priv->pGlusterModule = PyImport_ImportModule("gluster");
- if (PyErr_Occurred())
- {
- PyErr_Print();
- return -1;
- }
-
- priv->pFrameType = PyObject_GetAttrString(priv->pGlusterModule, "call_frame_t");
- priv->pFdType = PyObject_GetAttrString(priv->pGlusterModule, "fd_t");
- priv->pVectorType = PyObject_GetAttrString(priv->pGlusterModule, "iovec");
-
- gf_log("python", GF_LOG_DEBUG, "Loading script...%s", priv->scriptname);
-
- priv->pScriptModule = AnonModule_FromFile(priv->scriptname);
- if (!priv->pScriptModule || PyErr_Occurred())
- {
- gf_log("python", GF_LOG_ERROR, "Error loading %s", priv->scriptname);
- PyErr_Print();
- return -1;
- }
-
- if (!PyObject_HasAttrString(priv->pScriptModule, "xlator"))
- {
- gf_log("python", GF_LOG_ERROR, "%s does not have a xlator attribute", priv->scriptname);
- return -1;
- }
- gf_log("python", GF_LOG_DEBUG, "Instantiating translator");
- priv->pXlator = PyObject_CallMethod(priv->pScriptModule, "xlator", "O&",
- PyLong_FromVoidPtr, this);
- if (PyErr_Occurred() || !priv->pXlator)
- {
- PyErr_Print();
- return -1;
- }
-
- this->private = priv;
-
- gf_log ("python", GF_LOG_DEBUG, "python xlator loaded");
- return 0;
-}
-
-void
-fini (xlator_t *this)
-{
- python_private_t *priv = (python_private_t*)(this->private);
- Py_DECREF(priv->pXlator);
- Py_DECREF(priv->pScriptModule);
- Py_DECREF(priv->pGlusterModule);
- Py_DECREF(priv->pFrameType);
- Py_DECREF(priv->pFdType);
- Py_DECREF(priv->pVectorType);
- Py_EndInterpreter(priv->pInterp);
- return;
-}
diff --git a/xlators/bindings/python/src/testxlator.py b/xlators/bindings/python/src/testxlator.py
deleted file mode 100644
index 507455c856a..00000000000
--- a/xlators/bindings/python/src/testxlator.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-# This file is part of GlusterFS.
-#
-# GlusterFS is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published
-# by the Free Software Foundation; either version 3 of the License,
-# or (at your option) any later version.
-#
-# GlusterFS is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see
-# <http://www.gnu.org/licenses/>.
-
-"""
-This is a test translator written in python.
-
-Important things to note:
- This file must be import-able from glusterfsd. This probably means
- setting PYTHONPATH to where this file is located.
-
- This file must have a top-level xlator class object that will be
- used to instantiate individual translators.
-"""
-from gluster import *
-
-class MyXlator(ComplexTranslator):
- name = "MyXlator"
- def writev_cbk(self, frame, cookie, op_ret, op_errno, buf):
- stack_unwind(frame, op_ret, op_errno, buf)
- return 0
-
- def writev(self, frame, fd, vector, count, offset):
- gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
- # TODO: Use cookie to pass this to writev_cbk
- old_count = vector.iov_len
-
- data = vector.getBytes().encode("zlib")
-
- vector = iovec(data)
- gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
-
- @ret_fn_t
- def rfn(frame, prev, this, op_ret, op_errno, *params):
- if len(params) == 0:
- params = [0]
- return self.writev_cbk(frame, prev, old_count, op_errno, *params)
-
- stack_wind(frame, rfn, self.firstChild,
- self.firstChild[0].fops[0].writev, fd, vector, count, offset)
- return 0
-
-xlator = MyXlator
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
index 0990822a7d3..8e067d5ab58 100644
--- a/xlators/cluster/Makefile.am
+++ b/xlators/cluster/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = stripe afr dht
+SUBDIRS = afr dht ec
CLEANFILES =
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
index 4e4b4c75260..610819b28fc 100644
--- a/xlators/cluster/afr/src/Makefile.am
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -1,26 +1,35 @@
-xlator_LTLIBRARIES = afr.la pump.la
+xlator_LTLIBRARIES = afr.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c afr-lk-common.c
+afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \
+ afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \
+ afr-read-txn.c \
+ $(top_builddir)/xlators/lib/src/libxlator.c
-afr_la_LDFLAGS = -module -avoidversion
-afr_la_SOURCES = $(afr_common_source) afr.c
+AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \
+ afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \
+ afr-self-heal-name.c
+
+afr_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c
afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-pump_la_LDFLAGS = -module -avoidversion
-pump_la_SOURCES = $(afr_common_source) pump.c
-pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \
+ afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \
+ afr-common.c afr-self-heald.h \
+ $(top_builddir)/xlators/lib/src/libxlator.h afr-messages.h
-noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
- -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/contrib/md5 -shared -nostartfiles $(GF_CFLAGS)
+AM_CFLAGS = -Wall $(GF_CFLAGS)
CLEANFILES =
uninstall-local:
rm -f $(DESTDIR)$(xlatordir)/replicate.so
- rm -f $(DESTDIR)$(xlatordir)/pump.so
install-data-hook:
ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 4ef11cb1bc8..032ab5c8001 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#include <libgen.h>
@@ -24,28 +15,20 @@
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "statedump.h"
-
-#include "fd.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/hashfn.h>
+#include <glusterfs/list.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/events.h>
+#include <glusterfs/upcall-utils.h>
#include "afr-inode-read.h"
#include "afr-inode-write.h"
@@ -53,2559 +36,7843 @@
#include "afr-dir-write.h"
#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "pump.h"
-
-#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL
-#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL
-#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL
+#include "afr-self-heald.h"
+#include "afr-messages.h"
int32_t
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid)
+afr_quorum_errno(afr_private_t *priv)
{
- int ret = 0;
-
- GF_ASSERT (gfid);
+ return ENOTCONN;
+}
- ret = dict_set_static_bin (dict, "gfid-req", gfid, 16);
- if (ret)
- gf_log (THIS->name, GF_LOG_DEBUG, "gfid set failed");
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid)
+{
+ if (!__is_root_gfid(pargfid)) {
+ return _gf_false;
+ }
+
+ if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) {
+ /*For backward compatibility /.landfill is private*/
+ return _gf_true;
+ }
+
+ if (pid == GF_CLIENT_PID_GSYNCD) {
+ /*geo-rep needs to create/sync private directory on slave because
+ * it appears in changelog*/
+ return _gf_false;
+ }
+
+ if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) {
+ if (strcmp(name, priv->anon_inode_name) == 0) {
+ /* anonymous-inode dir is private*/
+ return _gf_true;
+ }
+ } else {
+ if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) ==
+ 0) {
+ /* anonymous-inode dir prefix is private for geo-rep to work*/
+ return _gf_true;
+ }
+ }
- return ret;
+ return _gf_false;
}
-uint64_t
-afr_is_split_brain (xlator_t *this, inode_t *inode)
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies)
{
- int ret = 0;
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ replies[i] = 1;
+ } else {
+ replies[i] = 0;
+ }
+ }
+}
- uint64_t ctx = 0;
- uint64_t split_brain = 0;
+int
+afr_fav_child_reset_sink_xattrs(void *opaque);
- VALIDATE_OR_GOTO (inode, out);
+int
+afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque);
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+static void
+afr_discover_done(call_frame_t *frame, xlator_t *this);
- if (ret < 0)
- goto unlock;
+int
+afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ local->cont.lk.dom_lock_op_ret[i] = op_ret;
+ local->cont.lk.dom_lock_op_errno[i] = op_errno;
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to acquire %s on %s",
+ uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM,
+ priv->children[i]->name);
+ } else {
+ local->cont.lk.dom_locked_nodes[i] = 1;
+ }
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
- split_brain = ctx & AFR_ICTX_SPLIT_BRAIN_MASK;
+int
+afr_dom_lock_acquire(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ int i = 0;
+
+ priv = frame->this->private;
+ local = frame->local;
+ local->cont.lk.dom_locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.locked_nodes),
+ gf_afr_mt_char);
+ if (!local->cont.lk.dom_locked_nodes) {
+ return -ENOMEM;
+ }
+ local->cont.lk.dom_lock_op_ret = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_ret) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ local->cont.lk.dom_lock_op_errno = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_errno) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ flock.l_type = F_WRLCK;
+
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLK, &flock, NULL);
+
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL))
+ goto blocking_lock;
+
+ /*If any of the bricks returned EAGAIN, we still need blocking locks.*/
+ if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) !=
+ priv->child_count) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.dom_lock_op_ret[i] == -1 &&
+ local->cont.lk.dom_lock_op_errno[i] == EAGAIN)
+ goto blocking_lock;
}
-unlock:
- UNLOCK (&inode->lock);
+ }
-out:
- return split_brain;
+ return 0;
+
+blocking_lock:
+ afr_dom_lock_release(frame);
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLKW, &flock, NULL);
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) {
+ afr_dom_lock_release(frame);
+ return -afr_quorum_errno(priv);
+ }
+
+ return 0;
}
+int
+afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to release %s on %s", local->loc.path,
+ AFR_LK_HEAL_DOM, priv->children[i]->name);
+ }
+ local->cont.lk.dom_locked_nodes[i] = 0;
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set)
+afr_dom_lock_release(call_frame_t *frame)
{
- uint64_t ctx = 0;
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = frame->this->private;
+ locked_on = local->cont.lk.dom_locked_nodes;
+ if (AFR_COUNT(locked_on, priv->child_count) == 0)
+ return;
+ flock.l_type = F_UNLCK;
- VALIDATE_OR_GOTO (inode, out);
+ AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk,
+ AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL);
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+ return;
+}
- if (ret < 0) {
- ctx = 0;
- }
+static void
+afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info)
+{
+ if (!info)
+ return;
+ if (info->xdata_req)
+ dict_unref(info->xdata_req);
+ if (info->fd)
+ fd_unref(info->fd);
+ GF_FREE(info->locked_nodes);
+ GF_FREE(info->child_up_event_gen);
+ GF_FREE(info->child_down_event_gen);
+ GF_FREE(info);
+}
- if (set) {
- ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx)
- | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK);
- } else {
- ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx);
- }
- __inode_ctx_put (inode, this, ctx);
+static int
+afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -ENOMEM;
+
+ info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t);
+ if (!info) {
+ goto cleanup;
+ }
+ INIT_LIST_HEAD(&info->pos);
+ info->fd = fd_ref(local->fd);
+ info->cmd = local->cont.lk.cmd;
+ info->pid = frame->root->pid;
+ info->flock = local->cont.lk.user_flock;
+ info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL);
+ if (!info->xdata_req) {
+ goto cleanup;
+ }
+ info->lk_owner = frame->root->lk_owner;
+ info->locked_nodes = GF_MALLOC(
+ sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char);
+ if (!info->locked_nodes) {
+ goto cleanup;
+ }
+ memcpy(info->locked_nodes, local->cont.lk.locked_nodes,
+ sizeof(*info->locked_nodes) * priv->child_count);
+ info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen),
+ priv->child_count, gf_afr_mt_int32_t);
+ if (!info->child_up_event_gen) {
+ goto cleanup;
+ }
+ info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!info->child_down_event_gen) {
+ goto cleanup;
+ }
+
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(local->fd, this);
+ if (fd_ctx)
+ fd_ctx->lk_heal_info = info;
+ }
+ UNLOCK(&local->fd->lock);
+ if (!fd_ctx) {
+ goto cleanup;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&info->pos, &priv->saved_locks);
+ }
+ UNLOCK(&priv->lock);
+
+ return 0;
+cleanup:
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to add lock to healq",
+ uuid_utoa(local->fd->inode->gfid));
+ if (info) {
+ afr_lk_heal_info_cleanup(info);
+ if (fd_ctx) {
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx->lk_heal_info = NULL;
+ }
+ UNLOCK(&local->fd->lock);
}
- UNLOCK (&inode->lock);
+ }
+ return ret;
+}
+
+static int
+afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ struct gf_flock flock = local->cont.lk.user_flock;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -EINVAL;
+
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx || !fd_ctx->lk_heal_info) {
+ goto out;
+ }
+
+ info = fd_ctx->lk_heal_info;
+ if ((info->flock.l_start != flock.l_start) ||
+ (info->flock.l_whence != flock.l_whence) ||
+ (info->flock.l_len != flock.l_len)) {
+ /*TODO: Compare lkowners too.*/
+ goto out;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ UNLOCK(&priv->lock);
+
+ afr_lk_heal_info_cleanup(info);
+ fd_ctx->lk_heal_info = NULL;
+ ret = 0;
out:
- return;
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to remove lock from healq",
+ uuid_utoa(local->fd->inode->gfid));
+ return ret;
}
+int
+afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed to heal lock on child %d for %s", i,
+ uuid_utoa(local->fd->inode->gfid));
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
-uint64_t
-afr_is_opendir_done (xlator_t *this, inode_t *inode)
+int
+afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
{
- int ret = 0;
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid));
+ } else {
+ local->cont.lk.getlk_rsp[i] = *lock;
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
- uint64_t ctx = 0;
- uint64_t opendir_done = 0;
+static gf_boolean_t
+afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ afr_local_t *local = frame->local;
+ struct gf_flock flock = {
+ 0,
+ };
+ gf_boolean_t ret = _gf_true;
+ char *wind_on = alloca0(priv->child_count);
+ unsigned char *success_replies = alloca0(priv->child_count);
+ local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp),
+ priv->child_count, gf_afr_mt_gf_lock);
+
+ flock = info->flock;
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ wind_on[i] = 1;
+ }
+
+ AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock,
+ info->xdata_req);
+
+ afr_fill_success_replies(local, priv, success_replies);
+ if (AFR_COUNT(success_replies, priv->child_count) == 0) {
+ ret = _gf_false;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid || local->replies[i].op_ret != 0)
+ continue;
+ if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK)
+ continue;
+ /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/
+ if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner,
+ &info->lk_owner)) {
+ ret = _gf_false;
+ break;
+ }
+ }
+out:
+ afr_local_replies_wipe(local, priv);
+ GF_FREE(local->cont.lk.getlk_rsp);
+ local->cont.lk.getlk_rsp = NULL;
+ return ret;
+}
- VALIDATE_OR_GOTO (inode, out);
+static void
+afr_mark_fd_bad(fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+ if (!fd)
+ return;
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(fd, this);
+ if (fd_ctx) {
+ fd_ctx->is_fd_bad = _gf_true;
+ fd_ctx->lk_heal_info = NULL;
+ }
+ }
+ UNLOCK(&fd->lock);
+}
- if (ret < 0)
- goto unlock;
+static void
+afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info)
+{
+ LOCK(&priv->lock);
+ {
+ list_del(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+}
- opendir_done = ctx & AFR_ICTX_OPENDIR_DONE_MASK;
+static void
+afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ int op_errno = 0;
+ int32_t *current_event_gen = NULL;
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ char *wind_on = alloca0(priv->child_count);
+ gf_boolean_t retry = _gf_true;
+
+ frame->root->pid = info->pid;
+ lk_owner_copy(&frame->root->lk_owner, &info->lk_owner);
+
+ op_errno = -afr_dom_lock_acquire(frame);
+ if ((op_errno != 0)) {
+ goto release;
+ }
+
+ if (!afr_does_lk_owner_match(frame, priv, info)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM,
+ "Ignoring lock heal for %s since lk-onwers mismatch. "
+ "Lock possibly pre-empted by another client.",
+ uuid_utoa(info->fd->inode->gfid));
+ goto release;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ continue;
+ wind_on[i] = 1;
+ }
+
+ current_event_gen = alloca(priv->child_count);
+ memcpy(current_event_gen, info->child_up_event_gen,
+ priv->child_count * sizeof *current_event_gen);
+ AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd,
+ &info->flock, info->xdata_req);
+
+ LOCK(&priv->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) {
+ continue;
+ }
+
+ if ((current_event_gen[i] == info->child_up_event_gen[i]) &&
+ (current_event_gen[i] > info->child_down_event_gen[i])) {
+ info->locked_nodes[i] = 1;
+ retry = _gf_false;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->saved_locks);
+ } else {
+ /*We received subsequent child up/down events while heal was in
+ * progress; don't mark child as healed. Attempt again on the
+ * new child up*/
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM,
+ "Event gen mismatch: skipped healing lock on child %d "
+ "for %s.",
+ i, uuid_utoa(info->fd->inode->gfid));
+ }
}
-unlock:
- UNLOCK (&inode->lock);
+ }
+ UNLOCK(&priv->lock);
+
+release:
+ afr_dom_lock_release(frame);
+ if (retry)
+ afr_add_lock_to_lkhealq(priv, info);
+ return;
+}
-out:
- return opendir_done;
+static int
+afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+ STACK_DESTROY(frame->root);
+ return 0;
}
+static int
+afr_lock_heal(void *opaque)
+{
+ call_frame_t *frame = (call_frame_t *)opaque;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+ struct list_head healq = {
+ 0,
+ };
+ int ret = 0;
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame) {
+ return ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&healq);
+ LOCK(&priv->lock);
+ {
+ list_splice_init(&priv->lk_healq, &healq);
+ }
+ UNLOCK(&priv->lock);
+
+ list_for_each_entry_safe(info, tmp, &healq, pos)
+ {
+ GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) <
+ priv->child_count));
+ ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd);
+ afr_lock_heal_do(iter_frame, priv, info);
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = ENOTCONN;
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN,
+ AFR_MSG_LK_HEAL_DOM,
+ "Aborting processing of lk_healq."
+ "Healing will be reattempted on next child up for locks "
+ "that are still in quorum.");
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&healq, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+ break;
+ }
+ }
+
+ AFR_STACK_DESTROY(iter_frame);
+ return ret;
+}
-void
-afr_set_opendir_done (xlator_t *this, inode_t *inode)
+static int
+__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child)
{
- uint64_t ctx = 0;
- int ret = 0;
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
- VALIDATE_OR_GOTO (inode, out);
+ if (priv->shd.iamshd)
+ return 0;
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_up_event_gen[child] = priv->event_generation;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
- if (ret < 0) {
- ctx = 0;
- }
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ return -1;
- ctx = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx)
- | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK);
+ ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame,
+ frame);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM,
+ "Failed to launch lock heal synctask");
- __inode_ctx_put (inode, this, ctx);
- }
- UNLOCK (&inode->lock);
-out:
- return;
+ return ret;
}
-
-uint64_t
-afr_read_child (xlator_t *this, inode_t *inode)
+static int
+__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child)
{
- int ret = 0;
-
- uint64_t ctx = 0;
- uint64_t read_child = 0;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
- VALIDATE_OR_GOTO (inode, out);
+ if (priv->shd.iamshd)
+ return 0;
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_down_event_gen[child] = priv->event_generation;
+ if (info->locked_nodes[child] == 1)
+ info->locked_nodes[child] = 0;
+ if (!afr_has_quorum(info->locked_nodes, this, NULL)) {
+ /* Since the lock was lost on quorum no. of nodes, we should
+ * not attempt to heal it anymore. Some other client could have
+ * acquired the lock, modified data and released it and this
+ * client wouldn't know about it if we heal it.*/
+ afr_mark_fd_bad(info->fd, this);
+ list_del(&info->pos);
+ afr_lk_heal_info_cleanup(info);
+ /* We're not winding an unlock on the node where the lock is still
+ * present because when fencing logic switches over to the new
+ * client (since we marked the fd bad), it should preempt any
+ * existing lock. */
+ }
+ }
+ return 0;
+}
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+gf_boolean_t
+afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
+ int32_t *op_errno)
+{
+ if (priv->consistent_io && local->call_count != priv->child_count) {
+ gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN,
+ "All subvolumes are not up");
+ if (op_errno)
+ *op_errno = ENOTCONN;
+ return _gf_false;
+ }
+ return _gf_true;
+}
- if (ret < 0)
- goto unlock;
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata)
+{
+ int ret = 0;
+ uint32_t lk_mode = GF_LK_ADVISORY;
- read_child = ctx & AFR_ICTX_READ_CHILD_MASK;
- }
-unlock:
- UNLOCK (&inode->lock);
+ ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode);
+ if (!ret && lk_mode == GF_LK_MANDATORY)
+ return _gf_true;
-out:
- return read_child;
+ return _gf_false;
}
+call_frame_t *
+afr_copy_frame(call_frame_t *base)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int op_errno = 0;
+
+ frame = copy_frame(base);
+ if (!frame)
+ return NULL;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local) {
+ AFR_STACK_DESTROY(frame);
+ return NULL;
+ }
+
+ return frame;
+}
-void
-afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child)
+/* Check if an entry or inode could be undergoing a transaction. */
+gf_boolean_t
+afr_is_possibly_under_txn(afr_transaction_type type, afr_local_t *local,
+ xlator_t *this)
{
- uint64_t ctx = 0;
- int ret = 0;
+ int i = 0;
+ int tmp = 0;
+ afr_private_t *priv = NULL;
+ GF_UNUSED char *key = NULL;
+ int keylen = 0;
+
+ priv = this->private;
+
+ if (type == AFR_ENTRY_TRANSACTION) {
+ key = GLUSTERFS_PARENT_ENTRYLK;
+ keylen = SLEN(GLUSTERFS_PARENT_ENTRYLK);
+ } else if (type == AFR_DATA_TRANSACTION) {
+ /*FIXME: Use GLUSTERFS_INODELK_DOM_COUNT etc. once
+ * pl_inodelk_xattr_fill supports separate keys for different
+ * domains.*/
+ key = GLUSTERFS_INODELK_COUNT;
+ keylen = SLEN(GLUSTERFS_INODELK_COUNT);
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].xdata)
+ continue;
+ if (dict_get_int32n(local->replies[i].xdata, key, keylen, &tmp) == 0)
+ if (tmp)
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
- VALIDATE_OR_GOTO (inode, out);
+static void
+afr_inode_ctx_destroy(afr_inode_ctx_t *ctx)
+{
+ int i = 0;
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+ if (!ctx)
+ return;
- if (ret < 0) {
- ctx = 0;
- }
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ GF_FREE(ctx->pre_op_done[i]);
+ }
- ctx = (~AFR_ICTX_READ_CHILD_MASK & ctx)
- | (AFR_ICTX_READ_CHILD_MASK & read_child);
+ GF_FREE(ctx);
+}
- __inode_ctx_put (inode, this, ctx);
+int
+__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
+{
+ uint64_t ctx_int = 0;
+ int ret = -1;
+ int i = -1;
+ int num_locks = -1;
+ afr_inode_ctx_t *ictx = NULL;
+ afr_lock_t *lock = NULL;
+ afr_private_t *priv = this->private;
+
+ ret = __inode_ctx_get(inode, this, &ctx_int);
+ if (ret == 0) {
+ *ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int;
+ return 0;
+ }
+
+ ictx = GF_CALLOC(1, sizeof(afr_inode_ctx_t), gf_afr_mt_inode_ctx_t);
+ if (!ictx)
+ goto out;
+
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ ictx->pre_op_done[i] = GF_CALLOC(sizeof *ictx->pre_op_done[i],
+ priv->child_count, gf_afr_mt_int32_t);
+ if (!ictx->pre_op_done[i]) {
+ ret = -ENOMEM;
+ goto out;
}
- UNLOCK (&inode->lock);
-
+ }
+
+ num_locks = sizeof(ictx->lock) / sizeof(afr_lock_t);
+ for (i = 0; i < num_locks; i++) {
+ lock = &ictx->lock[i];
+ INIT_LIST_HEAD(&lock->post_op);
+ INIT_LIST_HEAD(&lock->frozen);
+ INIT_LIST_HEAD(&lock->waiting);
+ INIT_LIST_HEAD(&lock->owners);
+ }
+
+ ctx_int = (uint64_t)(uintptr_t)ictx;
+ ret = __inode_ctx_set(inode, this, &ctx_int);
+ if (ret) {
+ goto out;
+ }
+
+ ictx->spb_choice = -1;
+ ictx->read_subvol = 0;
+ ictx->write_subvol = 0;
+ ictx->lock_count = 0;
+ ret = 0;
+ *ctx = ictx;
out:
- return;
+ if (ret) {
+ afr_inode_ctx_destroy(ictx);
+ }
+ return ret;
}
-
-/**
- * afr_local_cleanup - cleanup everything in frame->local
+/*
+ * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS:
+ *
+ * |<---------- 64bit ------------>|
+ * 63 32 31 16 15 0
+ * | EVENT_GEN | DATA | METADATA |
+ *
+ *
+ * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which
+ * metadata can be attempted to be read.
+ *
+ * bit-0 => priv->subvolumes[0]
+ * bit-1 => priv->subvolumes[1]
+ * ... etc. till bit-15
+ *
+ * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data
+ * can be attempted to be read.
+ *
+ * bit-16 => priv->subvolumes[0]
+ * bit-17 => priv->subvolumes[1]
+ * ... etc. till bit-31
+ *
+ * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation)
+ * when DATA and METADATA was last updated.
+ *
+ * If EVENT_GEN is < priv->event_generation,
+ * or is 0, it means afr_inode_refresh() needs
+ * to be called to recalculate the bitmaps.
*/
-void
-afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
+int
+__afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
+ inode_t *inode)
{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ int i = 0;
+ int txn_type = 0;
+ int count = 0;
+ int index = -1;
+ uint16_t datamap_old = 0;
+ uint16_t metadatamap_old = 0;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint16_t tmp_map = 0;
+ uint16_t mask = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ txn_type = local->transaction.type;
+
+ if (txn_type == AFR_DATA_TRANSACTION)
+ val = local->inode_ctx->write_subvol;
+ else
+ val = local->inode_ctx->read_subvol;
+
+ metadatamap_old = metadatamap = (val & 0x000000000000ffff);
+ datamap_old = datamap = (val & 0x00000000ffff0000) >> 16;
+ event = (val & 0xffffffff00000000) >> 32;
+
+ if (txn_type == AFR_DATA_TRANSACTION)
+ tmp_map = datamap;
+ else if (txn_type == AFR_METADATA_TRANSACTION)
+ tmp_map = metadatamap;
+
+ count = gf_bits_count(tmp_map);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i])
+ continue;
+
+ mask = 1 << i;
+ if (txn_type == AFR_METADATA_TRANSACTION)
+ metadatamap &= ~mask;
+ else if (txn_type == AFR_DATA_TRANSACTION)
+ datamap &= ~mask;
+ }
+
+ switch (txn_type) {
+ case AFR_METADATA_TRANSACTION:
+ if ((metadatamap_old != 0) && (metadatamap == 0) && (count == 1)) {
+ index = gf_bits_index(tmp_map);
+ local->transaction.in_flight_sb_errno = local->replies[index]
+ .op_errno;
+ local->transaction.in_flight_sb = _gf_true;
+ metadatamap |= (1 << index);
+ }
+ if (metadatamap_old != metadatamap) {
+ __afr_inode_need_refresh_set(inode, this);
+ }
+ break;
+
+ case AFR_DATA_TRANSACTION:
+ if ((datamap_old != 0) && (datamap == 0) && (count == 1)) {
+ index = gf_bits_index(tmp_map);
+ local->transaction.in_flight_sb_errno = local->replies[index]
+ .op_errno;
+ local->transaction.in_flight_sb = _gf_true;
+ datamap |= (1 << index);
+ }
+ if (datamap_old != datamap)
+ __afr_inode_need_refresh_set(inode, this);
+ break;
+ default:
+ break;
+ }
- sh = &local->self_heal;
- priv = this->private;
+ val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+ (((uint64_t)event) << 32);
- if (sh->buf)
- GF_FREE (sh->buf);
+ if (txn_type == AFR_DATA_TRANSACTION)
+ local->inode_ctx->write_subvol = val;
+ local->inode_ctx->read_subvol = val;
- if (sh->xattr) {
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
- }
- GF_FREE (sh->xattr);
- }
-
- if (sh->child_errno)
- GF_FREE (sh->child_errno);
+ return 0;
+}
- if (sh->pending_matrix) {
- for (i = 0; i < priv->child_count; i++) {
- GF_FREE (sh->pending_matrix[i]);
- }
- GF_FREE (sh->pending_matrix);
+gf_boolean_t
+afr_is_symmetric_error(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int op_errno = 0;
+ int i_errno = 0;
+ gf_boolean_t matching_errors = _gf_true;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret != -1) {
+ /* Operation succeeded on at least one subvol,
+ so it is not a failed-everywhere situation.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
+ i_errno = local->replies[i].op_errno;
+
+ if (i_errno == ENOTCONN) {
+ /* ENOTCONN is not a symmetric error. We do not
+ know if the operation was performed on the
+ backend or not.
+ */
+ matching_errors = _gf_false;
+ break;
}
- if (sh->delta_matrix) {
- for (i = 0; i < priv->child_count; i++) {
- GF_FREE (sh->delta_matrix[i]);
- }
- GF_FREE (sh->delta_matrix);
+ if (!op_errno) {
+ op_errno = i_errno;
+ } else if (op_errno != i_errno) {
+ /* Mismatching op_errno's */
+ matching_errors = _gf_false;
+ break;
}
+ }
- if (sh->sources)
- GF_FREE (sh->sources);
+ return matching_errors;
+}
- if (sh->success)
- GF_FREE (sh->success);
+int
+afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame, inode_t *inode)
+{
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- if (sh->locked_nodes)
- GF_FREE (sh->locked_nodes);
+ priv = this->private;
+ local = frame->local;
- if (sh->healing_fd && !sh->healing_fd_opened) {
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
- }
+ /* If this transaction saw no failures, then exit. */
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) == 0)
+ return 0;
- if (sh->linkname)
- GF_FREE ((char *)sh->linkname);
+ if (afr_is_symmetric_error(frame, this))
+ return 0;
- loc_wipe (&sh->parent_loc);
-}
+ LOCK(&inode->lock);
+ {
+ ret = __afr_set_in_flight_sb_status(this, local, inode);
+ }
+ UNLOCK(&inode->lock);
+ return ret;
+}
-void
-afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
+int
+__afr_inode_read_subvol_get_small(inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int *event_p)
{
- int i = 0;
- afr_private_t * priv = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ int i = 0;
+ afr_inode_ctx_t *ctx = NULL;
+
+ priv = this->private;
+
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret < 0)
+ return ret;
- priv = this->private;
+ val = ctx->read_subvol;
- for (i = 0; i < priv->child_count; i++) {
- if (local->pending && local->pending[i])
- GF_FREE (local->pending[i]);
- }
+ metadatamap = (val & 0x000000000000ffff);
+ datamap = (val & 0x00000000ffff0000) >> 16;
+ event = (val & 0xffffffff00000000) >> 32;
- GF_FREE (local->pending);
+ for (i = 0; i < priv->child_count; i++) {
+ if (metadata)
+ metadata[i] = (metadatamap >> i) & 1;
+ if (data)
+ data[i] = (datamap >> i) & 1;
+ }
- if (local->internal_lock.locked_nodes)
- GF_FREE (local->internal_lock.locked_nodes);
+ if (event_p)
+ *event_p = event;
+ return ret;
+}
- if (local->internal_lock.inode_locked_nodes)
- GF_FREE (local->internal_lock.inode_locked_nodes);
+int
+__afr_inode_read_subvol_set_small(inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int event)
+{
+ afr_private_t *priv = NULL;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint64_t val = 0;
+ int i = 0;
+ int ret = -1;
+ afr_inode_ctx_t *ctx = NULL;
- if (local->internal_lock.entry_locked_nodes)
- GF_FREE (local->internal_lock.entry_locked_nodes);
+ priv = this->private;
- if (local->internal_lock.lower_locked_nodes)
- GF_FREE (local->internal_lock.lower_locked_nodes);
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (data[i])
+ datamap |= (1 << i);
+ if (metadata[i])
+ metadatamap |= (1 << i);
+ }
- GF_FREE (local->transaction.child_errno);
- GF_FREE (local->child_errno);
+ val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+ (((uint64_t)event) << 32);
- GF_FREE (local->transaction.basename);
- GF_FREE (local->transaction.new_basename);
+ ctx->read_subvol = val;
- loc_wipe (&local->transaction.parent_loc);
- loc_wipe (&local->transaction.new_parent_loc);
+ ret = 0;
+out:
+ return ret;
}
-
-void
-afr_local_cleanup (afr_local_t *local, xlator_t *this)
+int
+__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int *event_p)
{
- int i;
- afr_private_t * priv = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
- if (!local)
- return;
+ priv = this->private;
- afr_local_sh_cleanup (local, this);
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_get_small(inode, this, data, metadata,
+ event_p);
+ else
+ /* TBD: allocate structure with array and read from it */
+ ret = -1;
- afr_local_transaction_cleanup (local, this);
+ return ret;
+}
- priv = this->private;
+int
+__afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
+ int *spb_choice)
+{
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- loc_wipe (&local->loc);
- loc_wipe (&local->newloc);
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret < 0)
+ return ret;
- if (local->fd)
- fd_unref (local->fd);
-
- if (local->xattr_req)
- dict_unref (local->xattr_req);
-
- GF_FREE (local->child_up);
-
- { /* lookup */
- if (local->cont.lookup.xattrs) {
- for (i = 0; i < priv->child_count; i++) {
- if (local->cont.lookup.xattrs[i]) {
- dict_unref (local->cont.lookup.xattrs[i]);
- local->cont.lookup.xattrs[i] = NULL;
- }
- }
- GF_FREE (local->cont.lookup.xattrs);
- local->cont.lookup.xattrs = NULL;
- }
+ *spb_choice = ctx->spb_choice;
+ return 0;
+}
- if (local->cont.lookup.xattr) {
- dict_unref (local->cont.lookup.xattr);
- }
+int
+__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
- if (local->cont.lookup.inode) {
- inode_unref (local->cont.lookup.inode);
- }
- }
+ priv = this->private;
- { /* getxattr */
- if (local->cont.getxattr.name)
- GF_FREE (local->cont.getxattr.name);
- }
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_set_small(inode, this, data, metadata,
+ event);
+ else
+ ret = -1;
- { /* lk */
- if (local->cont.lk.locked_nodes)
- GF_FREE (local->cont.lk.locked_nodes);
- }
+ return ret;
+}
- { /* create */
- if (local->cont.create.fd)
- fd_unref (local->cont.create.fd);
- if (local->cont.create.params)
- dict_unref (local->cont.create.params);
- }
+int
+__afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
+ int spb_choice)
+{
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- { /* mknod */
- if (local->cont.mknod.params)
- dict_unref (local->cont.mknod.params);
- }
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret)
+ goto out;
- { /* mkdir */
- if (local->cont.mkdir.params)
- dict_unref (local->cont.mkdir.params);
- }
+ ctx->spb_choice = spb_choice;
- { /* symlink */
- if (local->cont.symlink.params)
- dict_unref (local->cont.symlink.params);
- }
+ ret = 0;
+out:
+ return ret;
+}
- { /* writev */
- GF_FREE (local->cont.writev.vector);
- }
+int
+afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int *event_p)
+{
+ int ret = -1;
- { /* setxattr */
- if (local->cont.setxattr.dict)
- dict_unref (local->cont.setxattr.dict);
- }
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- { /* removexattr */
- GF_FREE (local->cont.removexattr.name);
- }
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_get(inode, this, data, metadata, event_p);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
+}
- { /* symlink */
- GF_FREE (local->cont.symlink.linkpath);
+int
+afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+ unsigned char *data = alloca0(priv->child_count);
+ unsigned char *metadata = alloca0(priv->child_count);
+ int data_count = 0;
+ int metadata_count = 0;
+ int event_generation = 0;
+ int ret = 0;
+
+ ret = afr_inode_read_subvol_get(inode, this, data, metadata,
+ &event_generation);
+ if (ret == -1)
+ return -EIO;
+
+ data_count = AFR_COUNT(data, priv->child_count);
+ metadata_count = AFR_COUNT(metadata, priv->child_count);
+
+ if (inode->ia_type == IA_IFDIR) {
+ /* For directories, allow even if it is in data split-brain. */
+ if (type == AFR_METADATA_TRANSACTION || local->op == GF_FOP_STAT ||
+ local->op == GF_FOP_FSTAT) {
+ if (!metadata_count)
+ return -EIO;
}
-
- { /* opendir */
- if (local->cont.opendir.checksum)
- GF_FREE (local->cont.opendir.checksum);
+ } else {
+ /* For files, abort in case of data/metadata split-brain. */
+ if (!data_count || !metadata_count) {
+ return -EIO;
}
+ }
+
+ if (type == AFR_METADATA_TRANSACTION && readable)
+ memcpy(readable, metadata, priv->child_count * sizeof *metadata);
+ if (type == AFR_DATA_TRANSACTION && readable) {
+ if (!data_count)
+ memcpy(readable, local->child_up,
+ priv->child_count * sizeof *readable);
+ else
+ memcpy(readable, data, priv->child_count * sizeof *data);
+ }
+ if (event_p)
+ *event_p = event_generation;
+ return 0;
}
+static int
+afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
+ int *spb_choice)
+{
+ int ret = -1;
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_split_brain_choice_get(inode, this, spb_choice);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
+}
+/*
+ * frame is used to get the favourite policy. Since
+ * afr_inode_split_brain_choice_get was called with afr_open, it is possible to
+ * have a frame with out local->replies. So in that case, frame is passed as
+ * null, hence this function will handle the frame NULL case.
+ */
int
-afr_frame_return (call_frame_t *frame)
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol)
{
- afr_local_t *local = NULL;
- int call_count = 0;
+ int ret = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
+ GF_VALIDATE_OR_GOTO("afr", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out);
- LOCK (&frame->lock);
- {
- call_count = --local->call_count;
+ priv = this->private;
+
+ ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol);
+ if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) {
+ local = frame->local;
+ *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+ NULL);
+ if (*spb_subvol >= 0) {
+ ret = 0;
}
- UNLOCK (&frame->lock);
+ }
- return call_count;
+out:
+ return ret;
}
+int
+afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
+{
+ int ret = -1;
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
-/**
- * up_children_count - return the number of children that are up
- */
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_set(inode, this, data, metadata, event);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
+}
int
-afr_up_children_count (int child_count, unsigned char *child_up)
+afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice)
{
- int i = 0;
- int ret = 0;
+ int ret = -1;
- for (i = 0; i < child_count; i++)
- if (child_up[i])
- ret++;
- return ret;
-}
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_split_brain_choice_set(inode, this, spb_choice);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
+}
-ino64_t
-afr_itransform (ino64_t ino, int child_count, int child_index)
+/* The caller of this should perform afr_inode_refresh, if this function
+ * returns _gf_true
+ */
+gf_boolean_t
+afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1,
+ int event_gen2)
{
- ino64_t scaled_ino = -1;
+ gf_boolean_t need_refresh = _gf_false;
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- if (ino == ((uint64_t) -1)) {
- scaled_ino = ((uint64_t) -1);
- goto out;
- }
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- scaled_ino = (ino * child_count) + child_index;
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret)
+ goto unlock;
+
+ need_refresh = ctx->need_refresh;
+ /* Hoping that the caller will do inode_refresh followed by
+ * this, hence setting the need_refresh to false */
+ ctx->need_refresh = _gf_false;
+ }
+unlock:
+ UNLOCK(&inode->lock);
+ if (event_gen1 != event_gen2)
+ need_refresh = _gf_true;
out:
- return scaled_ino;
+ return need_refresh;
}
-
int
-afr_deitransform_orig (ino64_t ino, int child_count)
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
{
- int index = -1;
+ int ret = -1;
+ afr_inode_ctx_t *ctx = NULL;
- index = ino % child_count;
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret == 0) {
+ ctx->need_refresh = _gf_true;
+ }
- return index;
+ return ret;
}
-
int
-afr_deitransform (ino64_t ino, int child_count)
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
{
- return 0;
-}
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_need_refresh_set(inode, this);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
+}
int
-afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this)
+afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- local = frame->local;
+ if (!inode)
+ return ret;
- if (local->govinda_gOvinda) {
- afr_set_split_brain (this, local->cont.lookup.inode, _gf_true);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret < 0 || !ctx) {
+ UNLOCK(&inode->lock);
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+ "Failed to cancel split-brain choice timer.");
+ goto out;
}
-
- AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->cont.lookup.inode,
- &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
-
- return 0;
+ ctx->spb_choice = -1;
+ if (ctx->timer) {
+ gf_timer_call_cancel(this->ctx, ctx->timer);
+ ctx->timer = NULL;
+ }
+ ret = 0;
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
}
-
-static void
-afr_lookup_collect_xattr (afr_local_t *local, xlator_t *this,
- int child_index, dict_t *xattr)
+void
+afr_set_split_brain_choice_cbk(void *data)
{
- uint32_t inodelk_count = 0;
- uint32_t entrylk_count = 0;
+ inode_t *inode = data;
+ xlator_t *this = THIS;
- int ret = 0;
+ afr_spb_choice_timeout_cancel(this, inode);
+ inode_invalidate(inode);
+ inode_unref(inode);
+ return;
+}
- if (afr_sh_has_metadata_pending (xattr, child_index, this)) {
- local->self_heal.need_metadata_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "metadata self-heal is pending for %s.",
- local->loc.path);
+int
+afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque)
+{
+ int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+ inode_t *inode = NULL;
+ loc_t *loc = NULL;
+ xlator_t *this = NULL;
+ afr_spbc_timeout_t *data = opaque;
+ struct timespec delta = {
+ 0,
+ };
+ gf_boolean_t timer_set = _gf_false;
+ gf_boolean_t timer_cancelled = _gf_false;
+ gf_boolean_t timer_reset = _gf_false;
+ int old_spb_choice = -1;
+
+ frame = data->frame;
+ loc = data->loc;
+ this = frame->this;
+ priv = this->private;
+
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ delta.tv_sec = priv->spb_choice_timeout;
+ delta.tv_nsec = 0;
+
+ if (!loc->inode) {
+ ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ if (!(data->d_spb || data->m_spb)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+ "Cannot set "
+ "replica.split-brain-choice on %s. File is"
+ " not in data/metadata split-brain.",
+ uuid_utoa(loc->gfid));
+ ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ /*
+ * we're ref'ing the inode before LOCK like it is done elsewhere in the
+ * code. If we ref after LOCK, coverity complains of possible deadlocks.
+ */
+ inode = inode_ref(loc->inode);
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret) {
+ UNLOCK(&inode->lock);
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+ "Failed to get inode_ctx for %s", loc->name);
+ goto post_unlock;
}
- if (afr_sh_has_entry_pending (xattr, child_index, this)) {
- local->self_heal.need_entry_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "entry self-heal is pending for %s.", local->loc.path);
+ old_spb_choice = ctx->spb_choice;
+ ctx->spb_choice = data->spb_child_index;
+
+ /* Possible changes in spb-choice :
+ * valid to -1 : cancel timer and unref
+ * valid to valid : cancel timer and inject new one
+ * -1 to -1 : unref and do not do anything
+ * -1 to valid : inject timer
+ */
+
+ /* ctx->timer is NULL iff previous value of
+ * ctx->spb_choice is -1
+ */
+ if (ctx->timer) {
+ if (ctx->spb_choice == -1) {
+ if (!gf_timer_call_cancel(this->ctx, ctx->timer)) {
+ ctx->timer = NULL;
+ timer_cancelled = _gf_true;
+ }
+ /* If timer cancel failed here it means that the
+ * previous cbk will be executed which will set
+ * spb_choice to -1. So we can consider the
+ * 'valid to -1' case to be a success
+ * (i.e. ret = 0) and goto unlock.
+ */
+ goto unlock;
+ }
+ goto reset_timer;
+ } else {
+ if (ctx->spb_choice == -1)
+ goto unlock;
+ goto set_timer;
}
- if (afr_sh_has_data_pending (xattr, child_index, this)) {
- local->self_heal.need_data_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "data self-heal is pending for %s.", local->loc.path);
+ reset_timer:
+ ret = gf_timer_call_cancel(this->ctx, ctx->timer);
+ if (ret != 0) {
+ /* We need to bail out now instead of launching a new
+ * timer. Otherwise the cbk of the previous timer event
+ * will cancel the new ctx->timer.
+ */
+ ctx->spb_choice = old_spb_choice;
+ ret = -1;
+ op_errno = EAGAIN;
+ goto unlock;
}
-
- ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT,
- &inodelk_count);
- if (ret == 0)
- local->inodelk_count += inodelk_count;
-
- ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT,
- &entrylk_count);
- if (ret == 0)
- local->entrylk_count += entrylk_count;
+ ctx->timer = NULL;
+ timer_reset = _gf_true;
+
+ set_timer:
+ ctx->timer = gf_timer_call_after(this->ctx, delta,
+ afr_set_split_brain_choice_cbk, inode);
+ if (!ctx->timer) {
+ ctx->spb_choice = old_spb_choice;
+ ret = -1;
+ op_errno = ENOMEM;
+ }
+ if (!timer_reset && ctx->timer)
+ timer_set = _gf_true;
+ if (timer_reset && !ctx->timer)
+ timer_cancelled = _gf_true;
+ }
+unlock:
+ UNLOCK(&inode->lock);
+post_unlock:
+ if (!timer_set)
+ inode_unref(inode);
+ if (timer_cancelled)
+ inode_unref(inode);
+ /*
+ * We need to invalidate the inode to prevent the kernel from serving
+ * reads from an older cached value despite a change in spb_choice to
+ * a new value.
+ */
+ inode_invalidate(inode);
+out:
+ GF_FREE(data);
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ return 0;
}
-
-static void
-afr_lookup_self_heal_check (xlator_t *this, afr_local_t *local,
- struct iatt *buf, struct iatt *lookup_buf)
+int
+afr_accused_fill(xlator_t *this, dict_t *xdata, unsigned char *accused,
+ afr_transaction_type type)
{
- if (FILETYPE_DIFFERS (buf, lookup_buf)) {
- /* mismatching filetypes with same name
- */
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int idx = afr_index_for_transaction_type(type);
+ void *pending_raw = NULL;
+ int pending[3];
+ int ret = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_get_ptr(xdata, priv->pending_key[i], &pending_raw);
+ if (ret) /* no pending flags */
+ continue;
+ memcpy(pending, pending_raw, sizeof(pending));
+
+ if (ntoh32(pending[idx]))
+ accused[i] = 1;
+ }
+
+ return 0;
+}
- gf_log (this->name, GF_LOG_NORMAL,
- "filetype differs for %s ", local->loc.path);
+int
+afr_accuse_smallfiles(xlator_t *this, struct afr_reply *replies,
+ unsigned char *data_accused)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t maxsize = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].xdata &&
+ dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))
+ continue;
+ if (data_accused[i])
+ continue;
+ if (replies[i].poststat.ia_size > maxsize)
+ maxsize = replies[i].poststat.ia_size;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i])
+ continue;
+ if (AFR_IS_ARBITER_BRICK(priv, i))
+ continue;
+ if (replies[i].poststat.ia_size < maxsize)
+ data_accused[i] = 1;
+ }
+
+ return 0;
+}
- local->govinda_gOvinda = 1;
+int
+afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *data_accused, unsigned char *metadata_accused,
+ unsigned char *data_readable,
+ unsigned char *metadata_readable, struct afr_reply *replies)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+ int ret = 0;
+ ia_type_t ia_type = IA_INVAL;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ data_readable[i] = 1;
+ metadata_readable[i] = 1;
+ }
+ if (AFR_IS_ARBITER_BRICK(priv, ARBITER_BRICK_INDEX)) {
+ data_readable[ARBITER_BRICK_INDEX] = 0;
+ metadata_readable[ARBITER_BRICK_INDEX] = 0;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies) { /* Lookup */
+ if (!replies[i].valid || replies[i].op_ret == -1 ||
+ (replies[i].xdata &&
+ dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))) {
+ data_readable[i] = 0;
+ metadata_readable[i] = 0;
+ continue;
+ }
+
+ xdata = replies[i].xdata;
+ ia_type = replies[i].poststat.ia_type;
+ } else { /* pre-op xattrop */
+ xdata = local->transaction.changelog_xdata[i];
+ ia_type = inode->ia_type;
}
- if (PERMISSION_DIFFERS (buf, lookup_buf)) {
- /* mismatching permissions */
- gf_log (this->name, GF_LOG_NORMAL,
- "permissions differ for %s ", local->loc.path);
- local->self_heal.need_metadata_self_heal = _gf_true;
+ if (!xdata)
+ continue; /* mkdir_cbk sends NULL xdata_rsp. */
+ afr_accused_fill(this, xdata, data_accused,
+ (ia_type == IA_IFDIR) ? AFR_ENTRY_TRANSACTION
+ : AFR_DATA_TRANSACTION);
+
+ afr_accused_fill(this, xdata, metadata_accused,
+ AFR_METADATA_TRANSACTION);
+ }
+
+ if (replies && ia_type != IA_INVAL && ia_type != IA_IFDIR &&
+ /* We want to accuse small files only when we know for
+ * sure that there is no IO happening. Otherwise, the
+ * ia_sizes obtained in post-refresh replies may
+ * mismatch due to a race between inode-refresh and
+ * ongoing writes, causing spurious heal launches*/
+ !afr_is_possibly_under_txn(AFR_DATA_TRANSACTION, local, this)) {
+ afr_accuse_smallfiles(this, replies, data_accused);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i]) {
+ data_readable[i] = 0;
+ ret = 1;
}
-
- if (OWNERSHIP_DIFFERS (buf, lookup_buf)) {
- /* mismatching permissions */
- local->self_heal.need_metadata_self_heal = _gf_true;
- gf_log (this->name, GF_LOG_NORMAL,
- "ownership differs for %s ", local->loc.path);
+ if (metadata_accused[i]) {
+ metadata_readable[i] = 0;
+ ret = 1;
}
+ }
+ return ret;
+}
- if (SIZE_DIFFERS (buf, lookup_buf)
- && IA_ISREG (buf->ia_type)) {
- gf_log (this->name, GF_LOG_NORMAL,
- "size differs for %s ", local->loc.path);
- local->self_heal.need_data_self_heal = _gf_true;
+int
+afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ gf_boolean_t *start_heal)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int event_generation = 0;
+ int i = 0;
+ unsigned char *data_accused = NULL;
+ unsigned char *metadata_accused = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+ event_generation = local->event_generation;
+
+ data_accused = alloca0(priv->child_count);
+ data_readable = alloca0(priv->child_count);
+ metadata_accused = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+
+ ret = afr_readables_fill(frame, this, inode, data_accused, metadata_accused,
+ data_readable, metadata_readable, replies);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (start_heal && priv->child_up[i] &&
+ (data_accused[i] || metadata_accused[i])) {
+ *start_heal = _gf_true;
+ break;
}
-
+ }
+ afr_inode_read_subvol_set(inode, this, data_readable, metadata_readable,
+ event_generation);
+ return ret;
}
+int
+afr_refresh_selfheal_done(int ret, call_frame_t *heal, void *opaque)
+{
+ if (heal)
+ AFR_STACK_DESTROY(heal);
+ return 0;
+}
-static void
-afr_lookup_done (call_frame_t *frame, xlator_t *this, struct iatt *lookup_buf)
+int
+afr_inode_refresh_err(call_frame_t *frame, xlator_t *this)
{
- int unwind = 1;
- int source = -1;
- int up_count = 0;
- char sh_type_str[256] = {0,};
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int err = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && !local->replies[i].op_ret) {
+ err = 0;
+ goto ret;
+ }
+ }
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ err = afr_final_errno(local, priv);
+ret:
+ return err;
+}
- priv = this->private;
- local = frame->local;
+gf_boolean_t
+afr_selfheal_enabled(const xlator_t *this)
+{
+ const afr_private_t *priv = this->private;
- local->cont.lookup.postparent.ia_ino = local->cont.lookup.parent_ino;
+ return priv->data_self_heal || priv->metadata_self_heal ||
+ priv->entry_self_heal;
+}
- if (local->cont.lookup.ino) {
- local->cont.lookup.buf.ia_ino = local->cont.lookup.ino;
+int
+afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
+{
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *heal_local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ int event_generation = 0;
+ int read_subvol = -1;
+ int ret = 0;
+
+ local = frame->local;
+ inode = local->inode;
+ priv = this->private;
+
+ if (err)
+ goto refresh_done;
+
+ if (local->op == GF_FOP_LOOKUP)
+ goto refresh_done;
+
+ ret = afr_inode_get_readable(frame, inode, this, local->readable,
+ &event_generation, local->transaction.type);
+
+ if (ret == -EIO) {
+ /* No readable subvolume even after refresh ==> splitbrain.*/
+ if (!priv->fav_child_policy) {
+ err = EIO;
+ goto refresh_done;
}
-
- up_count = afr_up_children_count (priv->child_count, priv->child_up);
- if (up_count == 1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Only 1 child up - do not attempt to detect self heal");
- goto unwind;
+ read_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+ NULL);
+ if (read_subvol == -1) {
+ err = EIO;
+ goto refresh_done;
}
- if (local->op_ret == 0) {
- /* KLUDGE: assuming DHT will not itransform in
- revalidate */
- if (local->cont.lookup.inode->ino) {
- local->cont.lookup.buf.ia_ino =
- local->cont.lookup.inode->ino;
- }
+ heal_frame = afr_frame_create(this, NULL);
+ if (!heal_frame) {
+ err = EIO;
+ goto refresh_done;
}
-
- if (local->success_count && local->enoent_count) {
- local->self_heal.need_metadata_self_heal = _gf_true;
- local->self_heal.need_data_self_heal = _gf_true;
- local->self_heal.need_entry_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_NORMAL,
- "entries are missing in lookup of %s.",
- local->loc.path);
+ heal_local = heal_frame->local;
+ heal_local->xdata_req = dict_new();
+ if (!heal_local->xdata_req) {
+ err = EIO;
+ AFR_STACK_DESTROY(heal_frame);
+ goto refresh_done;
}
+ heal_local->heal_frame = frame;
+ ret = synctask_new(this->ctx->env, afr_fav_child_reset_sink_xattrs,
+ afr_fav_child_reset_sink_xattrs_cbk, heal_frame,
+ heal_frame);
+ return 0;
+ }
- if (local->success_count) {
- /* check for split-brain case in previous lookup */
- if (afr_is_split_brain (this,
- local->cont.lookup.inode)) {
- local->self_heal.need_data_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_NORMAL,
- "split brain detected during lookup of "
- "%s.", local->loc.path);
- }
+refresh_done:
+ afr_local_replies_wipe(local, this->private);
+ local->refreshfn(frame, this, err);
+
+ return 0;
+}
+
+int
+afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
+{
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t start_heal = _gf_false;
+ afr_local_t *heal_local = NULL;
+ unsigned char *success_replies = NULL;
+ int ret = 0;
+
+ if (error != 0) {
+ goto refresh_done;
+ }
+
+ local = frame->local;
+ priv = this->private;
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+
+ if (priv->thin_arbiter_count && local->is_read_txn &&
+ AFR_COUNT(success_replies, priv->child_count) != priv->child_count) {
+ /* We need to query the good bricks and/or thin-arbiter.*/
+ if (success_replies[0]) {
+ local->read_txn_query_child = AFR_CHILD_ZERO;
+ } else if (success_replies[1]) {
+ local->read_txn_query_child = AFR_CHILD_ONE;
+ }
+ error = EINVAL;
+ goto refresh_done;
+ }
+
+ if (!afr_has_quorum(success_replies, this, frame)) {
+ error = afr_final_errno(frame->local, this->private);
+ if (!error)
+ error = afr_quorum_errno(priv);
+ goto refresh_done;
+ }
+
+ ret = afr_replies_interpret(frame, this, local->refreshinode, &start_heal);
+
+ if (ret && afr_selfheal_enabled(this) && start_heal) {
+ heal_frame = afr_frame_create(this, NULL);
+ if (!heal_frame)
+ goto refresh_done;
+ heal_local = heal_frame->local;
+ heal_local->refreshinode = inode_ref(local->refreshinode);
+ heal_local->heal_frame = heal_frame;
+ if (!afr_throttled_selfheal(heal_frame, this)) {
+ AFR_STACK_DESTROY(heal_frame);
+ goto refresh_done;
}
+ }
- if ((local->self_heal.need_metadata_self_heal
- || local->self_heal.need_data_self_heal
- || local->self_heal.need_entry_self_heal)
- && ((!local->cont.lookup.is_revalidate)
- || (local->op_ret != -1))) {
+refresh_done:
+ afr_txn_refresh_done(frame, this, error);
- if (local->inodelk_count || local->entrylk_count) {
+ return 0;
+}
- /* Someone else is doing self-heal on this file.
- So just make a best effort to set the read-subvolume
- and return */
+void
+afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *buf,
+ dict_t *xdata, struct iatt *par)
+{
+ afr_local_t *local = NULL;
+ int call_child = (long)cookie;
+ int8_t need_heal = 1;
+ int call_count = 0;
+ int ret = 0;
+
+ local = frame->local;
+ local->replies[call_child].valid = 1;
+ local->replies[call_child].op_ret = op_ret;
+ local->replies[call_child].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[call_child].poststat = *buf;
+ if (par)
+ local->replies[call_child].postparent = *par;
+ if (xdata)
+ local->replies[call_child].xdata = dict_ref(xdata);
+ }
+
+ if (xdata) {
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to get link count");
+ }
+ }
+
+ local->replies[call_child].need_heal = need_heal;
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ afr_set_need_heal(this, local);
+ ret = afr_inode_refresh_err(frame, this);
+ if (ret) {
+ gf_msg_debug(this->name, ret, "afr_inode_refresh_err failed");
+ }
+ afr_inode_refresh_done(frame, this, ret);
+ }
+}
- if (IA_ISREG (local->cont.lookup.inode->ia_type)) {
- source = afr_self_heal_get_source (this, local, local->cont.lookup.xattrs);
+int
+afr_inode_refresh_subvol_with_lookup_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata,
+ struct iatt *par)
+{
+ afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ xdata, par);
+ return 0;
+}
- if (source >= 0) {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- source);
- }
- }
- } else {
- if (!local->cont.lookup.inode->ia_type) {
- /* fix for RT #602 */
- local->cont.lookup.inode->ia_type =
- lookup_buf->ia_type;
- }
+int
+afr_inode_refresh_subvol_with_lookup(call_frame_t *frame, xlator_t *this, int i,
+ inode_t *inode, uuid_t gfid, dict_t *xdata)
+{
+ loc_t loc = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ loc.inode = inode;
+ if (gf_uuid_is_null(inode->gfid) && gfid) {
+ /* To handle setattr/setxattr on yet to be linked inode from
+ * dht */
+ gf_uuid_copy(loc.gfid, gfid);
+ } else {
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ }
+
+ STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_lookup_cbk,
+ (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->lookup, &loc, xdata);
+ return 0;
+}
- local->self_heal.background = _gf_true;
- local->self_heal.type = local->cont.lookup.buf.ia_type;
- local->self_heal.unwind = afr_self_heal_lookup_unwind;
+int
+afr_inode_refresh_subvol_with_fstat_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ xdata, NULL);
+ return 0;
+}
- unwind = 0;
+int
+afr_inode_refresh_subvol_with_fstat(call_frame_t *frame, xlator_t *this, int i,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- afr_self_heal_type_str_get(&local->self_heal,
- sh_type_str,
- sizeof(sh_type_str));
+ priv = this->private;
+ local = frame->local;
- gf_log (this->name, GF_LOG_NORMAL, "background %s "
- "self-heal triggered. path: %s",
- sh_type_str, local->loc.path);
+ STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_fstat_cbk,
+ (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->fstat, local->fd, xdata);
+ return 0;
+}
- afr_self_heal (frame, this);
- }
+int
+afr_inode_refresh_do(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ int ret = 0;
+ dict_t *xdata = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ unsigned char *wind_subvols = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ wind_subvols = alloca0(priv->child_count);
+
+ afr_local_replies_wipe(local, priv);
+
+ if (local->fd) {
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx) {
+ afr_inode_refresh_done(frame, this, EINVAL);
+ return 0;
}
+ }
-unwind:
- if (unwind) {
- AFR_STACK_UNWIND (lookup, frame, local->op_ret,
- local->op_errno,
- local->cont.lookup.inode,
- &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
- }
-}
+ xdata = dict_new();
+ if (!xdata) {
+ afr_inode_refresh_done(frame, this, ENOMEM);
+ return 0;
+ }
+ ret = afr_xattr_req_prepare(this, xdata);
+ if (ret != 0) {
+ dict_unref(xdata);
+ afr_inode_refresh_done(frame, this, -ret);
+ return 0;
+ }
-/*
- * During a lookup, some errors are more "important" than
- * others in that they must be given higher priority while
- * returning to the user.
- *
- * The hierarchy is ESTALE > ENOENT > others
- *
- */
+ ret = dict_set_sizen_str_sizen(xdata, "link-count", GF_XATTROP_INDEX_COUNT);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+ }
-static gf_boolean_t
-__error_more_important (int32_t old_errno, int32_t new_errno)
-{
- gf_boolean_t ret = _gf_true;
+ ret = dict_set_str_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT, this->name);
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "Unable to set inodelk-dom-count in dict ");
+ }
+
+ if (local->fd) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] && fd_ctx->opened_on[i] == AFR_FD_OPENED)
+ wind_subvols[i] = 1;
+ }
+ } else {
+ memcpy(wind_subvols, local->child_up,
+ sizeof(*local->child_up) * priv->child_count);
+ }
+
+ local->call_count = AFR_COUNT(wind_subvols, priv->child_count);
+
+ call_count = local->call_count;
+ if (!call_count) {
+ dict_unref(xdata);
+ if (local->fd && AFR_COUNT(local->child_up, priv->child_count))
+ afr_inode_refresh_done(frame, this, EBADFD);
+ else
+ afr_inode_refresh_done(frame, this, ENOTCONN);
+ return 0;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_subvols[i])
+ continue;
- /* Nothing should ever overwrite ESTALE */
- if (old_errno == ESTALE)
- ret = _gf_false;
+ if (local->fd)
+ afr_inode_refresh_subvol_with_fstat(frame, this, i, xdata);
+ else
+ afr_inode_refresh_subvol_with_lookup(
+ frame, this, i, local->refreshinode, local->refreshgfid, xdata);
- /* Nothing should overwrite ENOENT, except ESTALE */
- else if ((old_errno == ENOENT) && (new_errno != ESTALE))
- ret = _gf_false;
+ if (!--call_count)
+ break;
+ }
- return ret;
-}
+ dict_unref(xdata);
+ return 0;
+}
int
-afr_fresh_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
+afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, afr_inode_refresh_cbk_t refreshfn)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- struct iatt * lookup_buf = NULL;
+ afr_local_t *local = NULL;
- int call_count = -1;
- int child_index = -1;
- int first_up_child = -1;
+ local = frame->local;
- child_index = (long) cookie;
- priv = this->private;
+ local->refreshfn = refreshfn;
- LOCK (&frame->lock);
- {
- local = frame->local;
+ if (local->refreshinode) {
+ inode_unref(local->refreshinode);
+ local->refreshinode = NULL;
+ }
- lookup_buf = &local->cont.lookup.buf;
+ local->refreshinode = inode_ref(inode);
- if (op_ret == -1) {
- if (op_errno == ENOENT)
- local->enoent_count++;
+ if (gfid)
+ gf_uuid_copy(local->refreshgfid, gfid);
+ else
+ gf_uuid_clear(local->refreshgfid);
- if (__error_more_important (local->op_errno, op_errno))
- local->op_errno = op_errno;
+ afr_inode_refresh_do(frame, this);
- if (local->op_errno == ESTALE) {
- local->op_ret = -1;
- }
+ return 0;
+}
- goto unlock;
- }
+int
+afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
- afr_lookup_collect_xattr (local, this, child_index, xattr);
+ priv = this->private;
- first_up_child = afr_first_up_child (priv);
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_uint64(xattr_req, priv->pending_key[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret < 0)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Unable to set dict value for %s", priv->pending_key[i]);
+ /* 3 = data+metadata+entry */
+ }
+ ret = dict_set_uint64(xattr_req, AFR_DIRTY,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "failed to set dirty "
+ "query flag");
+ }
+
+ ret = dict_set_int32_sizen(xattr_req, "list-xattr", 1);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set list-xattr in dict ");
+ }
+
+ return ret;
+}
- if (child_index == first_up_child) {
- local->cont.lookup.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- first_up_child);
- }
+int
+afr_lookup_xattr_req_prepare(afr_local_t *local, xlator_t *this,
+ dict_t *xattr_req, loc_t *loc)
+{
+ int ret = -ENOMEM;
+
+ if (!local->xattr_req)
+ local->xattr_req = dict_new();
+
+ if (!local->xattr_req)
+ goto out;
+
+ if (xattr_req && (xattr_req != local->xattr_req))
+ dict_copy(xattr_req, local->xattr_req);
+
+ ret = afr_xattr_req_prepare(this, local->xattr_req);
+
+ ret = dict_set_uint64(local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Unable to set dict value for %s", loc->path,
+ GLUSTERFS_INODELK_COUNT);
+ }
+ ret = dict_set_uint64(local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Unable to set dict value for %s", loc->path,
+ GLUSTERFS_ENTRYLK_COUNT);
+ }
+
+ ret = dict_set_uint32(local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Unable to set dict value for %s", loc->path,
+ GLUSTERFS_PARENT_ENTRYLK);
+ }
+
+ ret = dict_set_sizen_str_sizen(local->xattr_req, "link-count",
+ GF_XATTROP_INDEX_COUNT);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
- if (local->success_count == 0) {
- if (local->op_errno != ESTALE)
- local->op_ret = op_ret;
-
- local->cont.lookup.inode = inode_ref (inode);
- local->cont.lookup.xattr = dict_ref (xattr);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparent = *postparent;
-
- if (priv->first_lookup && inode->ino == 1) {
- gf_log (this->name, GF_LOG_NORMAL,
- "added root inode");
- priv->root_inode = inode_ref (inode);
- priv->first_lookup = 0;
- }
-
- *lookup_buf = *buf;
-
- lookup_buf->ia_ino = afr_itransform (buf->ia_ino,
- priv->child_count,
- child_index);
- if (priv->read_child >= 0) {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- priv->read_child);
- } else {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- child_index);
- }
-
- } else {
- afr_lookup_self_heal_check (this, local, buf, lookup_buf);
-
- if (child_index == local->read_child_index) {
- /*
- lookup has succeeded on the read child.
- So use its inode number
- */
- if (local->cont.lookup.xattr)
- dict_unref (local->cont.lookup.xattr);
-
- local->cont.lookup.xattr = dict_ref (xattr);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparent = *postparent;
-
- *lookup_buf = *buf;
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- priv->read_child);
- } else {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- local->read_child_index);
- }
- }
+int
+afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable)
+{
+ int i = 0;
+ int child = -1;
+ int64_t read_iter = -1;
+ int64_t pending_read = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i])
+ continue;
+ read_iter = GF_ATOMIC_GET(priv->pending_reads[i]);
+ if (child == -1 || read_iter < pending_read) {
+ pending_read = read_iter;
+ child = i;
+ }
+ }
- }
+ return child;
+}
- local->success_count++;
- }
-unlock:
- UNLOCK (&frame->lock);
+static int32_t
+afr_least_latency_child(afr_private_t *priv, unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
- call_count = afr_frame_return (frame);
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
- if (call_count == 0) {
- afr_lookup_done (frame, this, lookup_buf);
+ if (child == -1 ||
+ priv->child_latency[i] < priv->child_latency[child]) {
+ child = i;
}
-
- return 0;
+ }
+ return child;
}
+static int32_t
+afr_least_latency_times_pending_reads_child(afr_private_t *priv,
+ unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
+ int64_t pending_read = 0;
+ int64_t latency = -1;
+ int64_t least_latency = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
+
+ pending_read = GF_ATOMIC_GET(priv->pending_reads[i]);
+ latency = (pending_read + 1) * priv->child_latency[i];
+
+ if (child == -1 || latency < least_latency) {
+ least_latency = latency;
+ child = i;
+ }
+ }
+ return child;
+}
int
-afr_revalidate_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
+afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv,
+ unsigned char *readable)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- struct iatt * lookup_buf = NULL;
-
- int call_count = -1;
- int child_index = -1;
- int first_up_child = -1;
-
- child_index = (long) cookie;
- priv = this->private;
+ uuid_t gfid_copy = {
+ 0,
+ };
+ pid_t pid;
+ int child = -1;
+
+ switch (priv->hash_mode) {
+ case AFR_READ_POLICY_FIRST_UP:
+ break;
+ case AFR_READ_POLICY_GFID_HASH:
+ gf_uuid_copy(gfid_copy, args->gfid);
+ child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
+ priv->child_count;
+ break;
+ case AFR_READ_POLICY_GFID_PID_HASH:
+ if (args->ia_type != IA_IFDIR) {
+ /*
+ * Why getpid? Because it's one of the cheapest calls
+ * available - faster than gethostname etc. - and
+ * returns a constant-length value that's sure to be
+ * shorter than a UUID. It's still very unlikely to be
+ * the same across clients, so it still provides good
+ * mixing. We're not trying for perfection here. All we
+ * need is a low probability that multiple clients
+ * won't converge on the same subvolume.
+ */
+ gf_uuid_copy(gfid_copy, args->gfid);
+ pid = getpid();
+ *(pid_t *)gfid_copy ^= pid;
+ }
+ child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
+ priv->child_count;
+ break;
+ case AFR_READ_POLICY_LESS_LOAD:
+ child = afr_least_pending_reads_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LEAST_LATENCY:
+ child = afr_least_latency_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LOAD_LATENCY_HYBRID:
+ child = afr_least_latency_times_pending_reads_child(priv, readable);
+ break;
+ }
+
+ return child;
+}
- LOCK (&frame->lock);
- {
- local = frame->local;
+int
+afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
+ unsigned char *readable,
+ afr_read_subvol_args_t *args)
+{
+ int i = 0;
+ int read_subvol = -1;
+ afr_private_t *priv = NULL;
+ afr_read_subvol_args_t local_args = {
+ 0,
+ };
+
+ priv = this->private;
+
+ /* first preference - explicitly specified or local subvolume */
+ if (priv->read_child >= 0 && readable[priv->read_child])
+ return priv->read_child;
+
+ if (inode_is_linked(inode)) {
+ gf_uuid_copy(local_args.gfid, inode->gfid);
+ local_args.ia_type = inode->ia_type;
+ } else if (args) {
+ local_args = *args;
+ }
+
+ /* second preference - use hashed mode */
+ read_subvol = afr_hash_child(&local_args, priv, readable);
+ if (read_subvol >= 0 && readable[read_subvol])
+ return read_subvol;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (readable[i])
+ return i;
+ }
+
+ /* no readable subvolumes, either split brain or all subvols down */
+
+ return -1;
+}
- lookup_buf = &local->cont.lookup.buf;
+int
+afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type)
+{
+ int ret = -1;
- if (op_ret == -1) {
- if (op_errno == ENOENT)
- local->enoent_count++;
+ if (type == AFR_METADATA_TRANSACTION)
+ ret = afr_inode_read_subvol_get(inode, this, 0, readable, event_p);
+ else
+ ret = afr_inode_read_subvol_get(inode, this, readable, 0, event_p);
+ return ret;
+}
- if (__error_more_important (local->op_errno, op_errno))
- local->op_errno = op_errno;
+void
+afr_readables_intersect_get(inode_t *inode, xlator_t *this, int *event,
+ unsigned char *intersection)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ unsigned char *intersect = NULL;
+
+ priv = this->private;
+ data_readable = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+ intersect = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable,
+ event);
+
+ AFR_INTERSECT(intersect, data_readable, metadata_readable,
+ priv->child_count);
+ if (intersection)
+ memcpy(intersection, intersect,
+ sizeof(*intersection) * priv->child_count);
+}
- if (local->op_errno == ESTALE) {
- local->op_ret = -1;
- }
+int
+afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p,
+ unsigned char *readables, int *event_p,
+ afr_transaction_type type, afr_read_subvol_args_t *args)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *readable = NULL;
+ unsigned char *intersection = NULL;
+ int subvol = -1;
+ int event = 0;
+
+ priv = this->private;
+
+ readable = alloca0(priv->child_count);
+ intersection = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_type_get(inode, this, readable, &event, type);
+
+ afr_readables_intersect_get(inode, this, &event, intersection);
+
+ if (AFR_COUNT(intersection, priv->child_count) > 0)
+ subvol = afr_read_subvol_select_by_policy(inode, this, intersection,
+ args);
+ else
+ subvol = afr_read_subvol_select_by_policy(inode, this, readable, args);
+ if (subvol_p)
+ *subvol_p = subvol;
+ if (event_p)
+ *event_p = event;
+ if (readables)
+ memcpy(readables, readable, sizeof(*readables) * priv->child_count);
+ return subvol;
+}
- goto unlock;
- }
+void
+afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
- afr_lookup_collect_xattr (local, this, child_index, xattr);
+ priv = this->private;
- first_up_child = afr_first_up_child (priv);
+ afr_matrix_cleanup(local->pending, priv->child_count);
- if (child_index == first_up_child) {
- local->cont.lookup.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- first_up_child);
- }
+ GF_FREE(local->internal_lock.lower_locked_nodes);
- /* in case of revalidate, we need to send stat of the
- * child whose stat was sent during the first lookup.
- * (so that time stamp does not vary with revalidate.
- * in case it is down, stat of the fist success will
- * be replied */
-
- /* inode number should be preserved across revalidates */
-
- if (local->success_count == 0) {
- if (local->op_errno != ESTALE)
- local->op_ret = op_ret;
-
- local->cont.lookup.inode = inode_ref (inode);
- local->cont.lookup.xattr = dict_ref (xattr);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparent = *postparent;
-
- *lookup_buf = *buf;
-
- lookup_buf->ia_ino = afr_itransform (buf->ia_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- priv->read_child);
- } else {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- child_index);
- }
-
- } else {
- afr_lookup_self_heal_check (this, local, buf, lookup_buf);
-
- if (child_index == local->read_child_index) {
-
- /*
- lookup has succeeded on the read child.
- So use its inode number
- */
-
- if (local->cont.lookup.xattr)
- dict_unref (local->cont.lookup.xattr);
-
- local->cont.lookup.xattr = dict_ref (xattr);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparent = *postparent;
-
- *lookup_buf = *buf;
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- priv->read_child);
- } else {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- local->read_child_index);
- }
- }
+ afr_lockees_cleanup(&local->internal_lock);
- }
+ GF_FREE(local->transaction.pre_op);
- local->success_count++;
+ GF_FREE(local->transaction.pre_op_sources);
+ if (local->transaction.changelog_xdata) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.changelog_xdata[i])
+ continue;
+ dict_unref(local->transaction.changelog_xdata[i]);
}
-unlock:
- UNLOCK (&frame->lock);
+ GF_FREE(local->transaction.changelog_xdata);
+ }
- call_count = afr_frame_return (frame);
+ GF_FREE(local->transaction.failed_subvols);
- if (call_count == 0) {
- afr_lookup_done (frame, this, lookup_buf);
- }
+ GF_FREE(local->transaction.basename);
+ GF_FREE(local->transaction.new_basename);
- return 0;
+ loc_wipe(&local->transaction.parent_loc);
+ loc_wipe(&local->transaction.new_parent_loc);
}
+void
+afr_reply_wipe(struct afr_reply *reply)
+{
+ if (reply->xdata) {
+ dict_unref(reply->xdata);
+ reply->xdata = NULL;
+ }
+
+ if (reply->xattr) {
+ dict_unref(reply->xattr);
+ reply->xattr = NULL;
+ }
+}
-int
-afr_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
+void
+afr_replies_wipe(struct afr_reply *replies, int count)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
+ int i = 0;
- fop_lookup_cbk_t callback;
+ for (i = 0; i < count; i++) {
+ afr_reply_wipe(&replies[i]);
+ }
+}
- int call_count = 0;
+void
+afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv)
+{
+ if (!local->replies)
+ return;
- uint64_t ctx;
+ afr_replies_wipe(local->replies, priv->child_count);
- int32_t op_errno = 0;
+ memset(local->replies, 0, sizeof(*local->replies) * priv->child_count);
+}
- priv = this->private;
+static gf_boolean_t
+afr_fop_lock_is_unlock(call_frame_t *frame)
+{
+ afr_local_t *local = frame->local;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ if ((F_UNLCK == local->cont.inodelk.in_flock.l_type) &&
+ (local->cont.inodelk.in_cmd == F_SETLKW ||
+ local->cont.inodelk.in_cmd == F_SETLK))
+ return _gf_true;
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ if (ENTRYLK_UNLOCK == local->cont.entrylk.in_cmd)
+ return _gf_true;
+ break;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+static gf_boolean_t
+afr_lk_is_unlock(int32_t cmd, struct gf_flock *flock)
+{
+ switch (cmd) {
+ case F_RESLK_UNLCK:
+ return _gf_true;
+ break;
- local->op_ret = -1;
+#if F_SETLKW != F_SETLKW64
+ case F_SETLKW64:
+#endif
+ case F_SETLKW:
- frame->local = local;
+#if F_SETLK != F_SETLK64
+ case F_SETLK64:
+#endif
+ case F_SETLK:
+ if (F_UNLCK == flock->l_type)
+ return _gf_true;
+ break;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}
- if (!strcmp (loc->path, "/" GF_REPLICATE_TRASH_DIR)) {
- op_errno = ENOENT;
- goto out;
- }
+void
+afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret,
+ int32_t *op_errno)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- loc_copy (&local->loc, loc);
+ if (!frame || !frame->this || !frame->local || !frame->this->private)
+ return;
- ret = inode_ctx_get (loc->inode, this, &ctx);
- if (ret == 0) {
- /* lookup is a revalidate */
+ if (*op_ret < 0)
+ return;
- callback = afr_revalidate_lookup_cbk;
+ /* Failing inodelk/entrylk/lk here is not a good idea because we
+ * need to cleanup the locks on the other bricks if we choose to fail
+ * the fop here. The brick may go down just after unwind happens as well
+ * so anyways the fop will fail when the next fop is sent so leaving
+ * it like this for now.*/
+ local = frame->local;
+ switch (local->op) {
+ case GF_FOP_LOOKUP:
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ case GF_FOP_LK:
+ return;
+ default:
+ break;
+ }
- local->cont.lookup.is_revalidate = _gf_true;
- local->read_child_index = afr_read_child (this,
- loc->inode);
- } else {
- callback = afr_fresh_lookup_cbk;
+ priv = frame->this->private;
+ if (!priv->consistent_io)
+ return;
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
- }
+ if (local->event_generation &&
+ (local->event_generation != priv->event_generation))
+ goto inconsistent;
- if (loc->parent)
- local->cont.lookup.parent_ino = loc->parent->ino;
+ return;
+inconsistent:
+ *op_ret = -1;
+ *op_errno = ENOTCONN;
+}
- local->child_up = memdup (priv->child_up, priv->child_count);
+void
+afr_local_cleanup(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
- local->cont.lookup.xattrs = GF_CALLOC (priv->child_count,
- sizeof (*local->cont.lookup.xattr),
- gf_afr_mt_dict_t);
+ if (!local)
+ return;
- local->call_count = afr_up_children_count (priv->child_count,
- local->child_up);
- call_count = local->call_count;
+ syncbarrier_destroy(&local->barrier);
- if (local->call_count == 0) {
- ret = -1;
- op_errno = ENOTCONN;
- goto out;
- }
+ afr_local_transaction_cleanup(local, this);
- /* By default assume ENOTCONN. On success it will be set to 0. */
- local->op_errno = ENOTCONN;
+ priv = this->private;
- if (xattr_req == NULL)
- local->xattr_req = dict_new ();
- else
- local->xattr_req = dict_ref (xattr_req);
+ loc_wipe(&local->loc);
+ loc_wipe(&local->newloc);
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (local->xattr_req, priv->pending_key[i],
- 3 * sizeof(int32_t));
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- /* 3 = data+metadata+entry */
- }
+ if (local->fd)
+ fd_unref(local->fd);
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- }
+ if (local->xattr_req)
+ dict_unref(local->xattr_req);
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- }
+ if (local->xattr_rsp)
+ dict_unref(local->xattr_rsp);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, callback, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- loc, local->xattr_req);
- if (!--call_count)
- break;
- }
- }
+ if (local->dict)
+ dict_unref(local->dict);
- ret = 0;
-out:
- if (ret == -1)
- AFR_STACK_UNWIND (lookup, frame, -1, op_errno,
- NULL, NULL, NULL, NULL);
+ afr_local_replies_wipe(local, priv);
+ GF_FREE(local->replies);
- return 0;
-}
+ GF_FREE(local->child_up);
+ GF_FREE(local->read_attempted);
+
+ GF_FREE(local->readable);
+ GF_FREE(local->readable2);
+
+ if (local->inode)
+ inode_unref(local->inode);
-/* {{{ open */
+ if (local->parent)
+ inode_unref(local->parent);
+
+ if (local->parent2)
+ inode_unref(local->parent2);
+
+ if (local->refreshinode)
+ inode_unref(local->refreshinode);
+
+ { /* getxattr */
+ GF_FREE(local->cont.getxattr.name);
+ }
+
+ { /* lk */
+ GF_FREE(local->cont.lk.locked_nodes);
+ GF_FREE(local->cont.lk.dom_locked_nodes);
+ GF_FREE(local->cont.lk.dom_lock_op_ret);
+ GF_FREE(local->cont.lk.dom_lock_op_errno);
+ }
+
+ { /* create */
+ if (local->cont.create.fd)
+ fd_unref(local->cont.create.fd);
+ if (local->cont.create.params)
+ dict_unref(local->cont.create.params);
+ }
+
+ { /* mknod */
+ if (local->cont.mknod.params)
+ dict_unref(local->cont.mknod.params);
+ }
+
+ { /* mkdir */
+ if (local->cont.mkdir.params)
+ dict_unref(local->cont.mkdir.params);
+ }
+
+ { /* symlink */
+ if (local->cont.symlink.params)
+ dict_unref(local->cont.symlink.params);
+ }
+
+ { /* writev */
+ GF_FREE(local->cont.writev.vector);
+ if (local->cont.writev.iobref)
+ iobref_unref(local->cont.writev.iobref);
+ }
+
+ { /* setxattr */
+ if (local->cont.setxattr.dict)
+ dict_unref(local->cont.setxattr.dict);
+ }
+
+ { /* fsetxattr */
+ if (local->cont.fsetxattr.dict)
+ dict_unref(local->cont.fsetxattr.dict);
+ }
+
+ { /* removexattr */
+ GF_FREE(local->cont.removexattr.name);
+ }
+ { /* xattrop */
+ if (local->cont.xattrop.xattr)
+ dict_unref(local->cont.xattrop.xattr);
+ }
+ { /* symlink */
+ GF_FREE(local->cont.symlink.linkpath);
+ }
+
+ { /* opendir */
+ GF_FREE(local->cont.opendir.checksum);
+ }
+
+ { /* open */
+ if (local->cont.open.fd)
+ fd_unref(local->cont.open.fd);
+ }
+
+ { /* readdirp */
+ if (local->cont.readdir.dict)
+ dict_unref(local->cont.readdir.dict);
+ }
+
+ { /* inodelk */
+ GF_FREE(local->cont.inodelk.volume);
+ if (local->cont.inodelk.xdata)
+ dict_unref(local->cont.inodelk.xdata);
+ }
+
+ { /* entrylk */
+ GF_FREE(local->cont.entrylk.volume);
+ GF_FREE(local->cont.entrylk.basename);
+ if (local->cont.entrylk.xdata)
+ dict_unref(local->cont.entrylk.xdata);
+ }
+
+ if (local->xdata_req)
+ dict_unref(local->xdata_req);
+
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+}
int
-afr_fd_ctx_set (xlator_t *this, fd_t *fd)
+afr_frame_return(call_frame_t *frame)
{
- afr_private_t * priv = NULL;
+ afr_local_t *local = NULL;
+ int call_count = 0;
- int ret = 0;
+ local = frame->local;
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
+ LOCK(&frame->lock);
+ {
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (fd, out);
+ return call_count;
+}
- priv = this->private;
+static char *afr_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
- LOCK (&fd->lock);
- {
- ret = __fd_ctx_get (fd, this, &ctx);
+gf_boolean_t
+afr_is_xattr_ignorable(char *key)
+{
+ int i = 0;
+
+ if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX)))
+ return _gf_true;
+ for (i = 0; afr_ignore_xattrs[i]; i++) {
+ if (!strcmp(key, afr_ignore_xattrs[i]))
+ return _gf_true;
+ }
+ return _gf_false;
+}
- if (ret == 0)
- goto unlock;
+static gf_boolean_t
+afr_xattr_match_needed(dict_t *this, char *key1, data_t *value1, void *data)
+{
+ /* Ignore all non-disk (i.e. virtual) xattrs right away. */
+ if (!gf_is_valid_xattr_namespace(key1))
+ return _gf_false;
- fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t),
- gf_afr_mt_afr_fd_ctx_t);
- if (!fd_ctx) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
+ /* Ignore on-disk xattrs that AFR doesn't need to heal. */
+ if (!afr_is_xattr_ignorable(key1))
+ return _gf_true;
- ret = -ENOMEM;
- goto unlock;
- }
+ return _gf_false;
+}
- fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->pre_op_done) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -ENOMEM;
- goto unlock;
- }
+gf_boolean_t
+afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2)
+{
+ return are_dicts_equal(dict1, dict2, afr_xattr_match_needed, NULL);
+}
- fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->pre_op_piggyback) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -ENOMEM;
- goto unlock;
- }
+static int
+afr_get_parent_read_subvol(xlator_t *this, inode_t *parent,
+ struct afr_reply *replies, unsigned char *readable)
+{
+ int i = 0;
+ int par_read_subvol = -1;
+ int par_read_subvol_iter = -1;
+ afr_private_t *priv = NULL;
- fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->opened_on) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -ENOMEM;
- goto unlock;
- }
+ priv = this->private;
- fd_ctx->up_count = priv->up_count;
- fd_ctx->down_count = priv->down_count;
-
- fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->locked_on) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -ENOMEM;
- goto unlock;
- }
+ if (parent)
+ par_read_subvol = afr_data_subvol_get(parent, this, NULL, NULL, NULL,
+ NULL);
- ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
- INIT_LIST_HEAD (&fd_ctx->entries);
+ if (replies[i].op_ret < 0)
+ continue;
+
+ if (par_read_subvol_iter == -1) {
+ par_read_subvol_iter = i;
+ continue;
}
-unlock:
- UNLOCK (&fd->lock);
-out:
- return ret;
-}
-/* {{{ flush */
+ if ((par_read_subvol_iter != par_read_subvol) && readable[i])
+ par_read_subvol_iter = i;
+
+ if (i == par_read_subvol)
+ par_read_subvol_iter = i;
+ }
+ /* At the end of the for-loop, the only reason why @par_read_subvol_iter
+ * could be -1 is when this LOOKUP has failed on all sub-volumes.
+ * So it is okay to send an arbitrary subvolume (0 in this case)
+ * as parent read subvol.
+ */
+ if (par_read_subvol_iter == -1)
+ par_read_subvol_iter = 0;
+
+ return par_read_subvol_iter;
+}
int
-afr_flush_unwind (call_frame_t *frame, xlator_t *this)
+afr_read_subvol_decide(inode_t *inode, xlator_t *this,
+ afr_read_subvol_args_t *args, unsigned char *readable)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ int event = 0;
+ afr_private_t *priv = NULL;
+ unsigned char *intersection = NULL;
- local = frame->local;
+ priv = this->private;
+ intersection = alloca0(priv->child_count);
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ afr_readables_intersect_get(inode, this, &event, intersection);
- if (main_frame) {
- AFR_STACK_UNWIND (flush, main_frame,
- local->op_ret, local->op_errno);
- }
+ if (AFR_COUNT(intersection, priv->child_count) <= 0) {
+ /* TODO: If we have one brick with valid data_readable and
+ * another with metadata_readable, try to send an iatt with
+ * valid bits from both.*/
+ return -1;
+ }
- return 0;
+ memcpy(readable, intersection, sizeof(*readable) * priv->child_count);
+
+ return afr_read_subvol_select_by_policy(inode, this, intersection, args);
}
+static inline int
+afr_first_up_child(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
-int
-afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+ for (i = 0; i < priv->child_count; i++)
+ if (local->replies[i].valid && local->replies[i].op_ret == 0)
+ return i;
+ return -1;
+}
+
+static void
+afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
+ unsigned char *success_replies,
+ unsigned char *data_readable, int *read_subvol)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int spb_subvol = -1;
+ int child_count = -1;
- int call_count = -1;
- int child_index = (long) cookie;
- int need_unwind = 0;
+ if (*read_subvol != -1)
+ return;
- local = frame->local;
- priv = this->private;
+ priv = this->private;
+ local = frame->local;
+ child_count = priv->child_count;
+
+ afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol);
+ if ((spb_subvol >= 0) &&
+ (AFR_COUNT(success_replies, child_count) == child_count)) {
+ *read_subvol = spb_subvol;
+ } else if (!priv->quorum_count ||
+ frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) {
+ *read_subvol = afr_first_up_child(frame, this);
+ } else if (priv->quorum_count &&
+ afr_has_quorum(data_readable, this, NULL)) {
+ /* read_subvol is guaranteed to be valid if we hit this path. */
+ *read_subvol = afr_first_up_child(frame, this);
+ } else {
+ /* If quorum is enabled and we do not have a
+ readable yet, it means all good copies are down.
+ */
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR,
+ "no read "
+ "subvols for %s",
+ local->loc.path);
+ }
+ if (*read_subvol >= 0)
+ dict_del_sizen(local->replies[*read_subvol].xdata, GF_CONTENT_KEY);
+}
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
+static void
+afr_lookup_done(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ int op_errno = 0;
+ int read_subvol = 0;
+ int par_read_subvol = 0;
+ int ret = -1;
+ unsigned char *readable = NULL;
+ unsigned char *success_replies = NULL;
+ int event = 0;
+ struct afr_reply *replies = NULL;
+ uuid_t read_gfid = {
+ 0,
+ };
+ gf_boolean_t locked_entry = _gf_false;
+ gf_boolean_t in_flight_create = _gf_false;
+ gf_boolean_t can_interpret = _gf_true;
+ inode_t *parent = NULL;
+ ia_type_t ia_type = IA_INVAL;
+ afr_read_subvol_args_t args = {
+ 0,
+ };
+ char *gfid_heal_msg = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ replies = local->replies;
+ parent = local->loc.parent;
+
+ locked_entry = afr_is_possibly_under_txn(AFR_ENTRY_TRANSACTION, local,
+ this);
+
+ readable = alloca0(priv->child_count);
+ success_replies = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(parent, this, readable, NULL, &event);
+ par_read_subvol = afr_get_parent_read_subvol(this, parent, replies,
+ readable);
+
+ /* First, check if we have a gfid-change from somewhere,
+ If so, propagate that so that a fresh lookup can be
+ issued
+ */
+ if (local->cont.lookup.needs_fresh_lookup) {
+ local->op_ret = -1;
+ local->op_errno = ESTALE;
+ goto error;
+ }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ op_errno = afr_final_errno(frame->local, this->private);
+ local->op_errno = op_errno;
- if (need_unwind)
- afr_flush_unwind (frame, this);
+ read_subvol = -1;
+ afr_fill_success_replies(local, priv, success_replies);
- call_count = afr_frame_return (frame);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
+ if (replies[i].op_ret == -1) {
+ if (locked_entry && replies[i].op_errno == ENOENT) {
+ in_flight_create = _gf_true;
+ }
+ continue;
}
- return 0;
-}
+ if (read_subvol == -1 || !readable[read_subvol]) {
+ read_subvol = i;
+ gf_uuid_copy(read_gfid, replies[i].poststat.ia_gfid);
+ ia_type = replies[i].poststat.ia_type;
+ local->op_ret = 0;
+ }
+ }
+ if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ goto error;
+ }
+
+ if (read_subvol == -1)
+ goto error;
+ /* We now have a read_subvol, which is readable[] (if there
+ were any). Next we look for GFID mismatches. We don't
+ consider a GFID mismatch as an error if read_subvol is
+ readable[] but the mismatching GFID subvol is not.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1) {
+ continue;
+ }
-int
-afr_flush_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ if (!gf_uuid_compare(replies[i].poststat.ia_gfid, read_gfid))
+ continue;
- int i = 0;
- int call_count = -1;
+ can_interpret = _gf_false;
- local = frame->local;
- priv = this->private;
+ if (locked_entry)
+ continue;
+
+ /* Now GFIDs mismatch. It's OK as long as this subvol
+ is not readable[] but read_subvol is */
+ if (readable[read_subvol] && !readable[i])
+ continue;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ /* If we were called from glfsheal and there is still a gfid
+ * mismatch, succeed the lookup and let glfsheal print the
+ * response via gfid-heal-msg.*/
+ if (!dict_get_str_sizen(local->xattr_req, "gfid-heal-msg",
+ &gfid_heal_msg))
+ goto cant_interpret;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
+ /* LOG ERROR */
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto error;
+ }
+
+ /* Forth, for the finalized GFID, pick the best subvolume
+ to return stats from.
+ */
+ read_subvol = -1;
+ memset(readable, 0, sizeof(*readable) * priv->child_count);
+ if (can_interpret) {
+ if (!afr_has_quorum(success_replies, this, NULL))
+ goto cant_interpret;
+ /* It is safe to call afr_replies_interpret() because we have
+ a response from all the UP subvolumes and all of them resolved
+ to the same GFID
+ */
+ gf_uuid_copy(args.gfid, read_gfid);
+ args.ia_type = ia_type;
+ ret = afr_replies_interpret(frame, this, local->inode, NULL);
+ read_subvol = afr_read_subvol_decide(local->inode, this, &args,
+ readable);
+ if (read_subvol == -1)
+ goto cant_interpret;
+ if (ret) {
+ afr_inode_need_refresh_set(local->inode, this);
+ dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY);
+ }
+ } else {
+ cant_interpret:
+ afr_attempt_readsubvol_set(frame, this, success_replies, readable,
+ &read_subvol);
+ if (read_subvol == -1) {
+ goto error;
}
+ }
- local->call_count = call_count;
+ afr_handle_quota_size(frame, this);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_flush_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- local->fd);
-
- if (!--call_count)
- break;
- }
+ afr_set_need_heal(this, local);
+ if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg_debug(this->name, 0,
+ "Arbiter cannot be a read subvol "
+ "for %s",
+ local->loc.path);
+ goto error;
+ }
+
+ ret = dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg);
+ if (!ret) {
+ ret = dict_set_str_sizen(local->replies[read_subvol].xdata,
+ "gfid-heal-msg", gfid_heal_msg);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "Error setting gfid-heal-msg dict");
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
}
+ }
- return 0;
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[par_read_subvol].postparent);
+ return;
+
+error:
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL,
+ NULL, NULL);
}
+/*
+ * During a lookup, some errors are more "important" than
+ * others in that they must be given higher priority while
+ * returning to the user.
+ *
+ * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC others
+ */
int
-afr_flush_done (call_frame_t *frame, xlator_t *this)
+afr_higher_errno(int32_t old_errno, int32_t new_errno)
{
- afr_local_t *local = NULL;
+ if (old_errno == ENODATA || new_errno == ENODATA)
+ return ENODATA;
+ if (old_errno == ENOENT || new_errno == ENOENT)
+ return ENOENT;
+ if (old_errno == ESTALE || new_errno == ESTALE)
+ return ESTALE;
+ if (old_errno == ENOSPC || new_errno == ENOSPC)
+ return ENOSPC;
+
+ return new_errno;
+}
- local = frame->local;
+int
+afr_final_errno(afr_local_t *local, afr_private_t *priv)
+{
+ int i = 0;
+ int op_errno = 0;
+ int tmp_errno = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret >= 0)
+ continue;
+ tmp_errno = local->replies[i].op_errno;
+ op_errno = afr_higher_errno(op_errno, tmp_errno);
+ }
+
+ return op_errno;
+}
- local->transaction.unwind (frame, this);
+static int32_t
+afr_local_discovery_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ int ret = 0;
+ char *pathinfo = NULL;
+ gf_boolean_t is_local = _gf_false;
+ afr_private_t *priv = NULL;
+ int32_t child_index = -1;
+
+ if (op_ret != 0) {
+ goto out;
+ }
+
+ priv = this->private;
+ child_index = (int32_t)(long)cookie;
+
+ ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
+ if (ret != 0) {
+ goto out;
+ }
+
+ ret = glusterfs_is_local_pathinfo(pathinfo, &is_local);
+ if (ret) {
+ goto out;
+ }
+
+ /*
+ * Note that one local subvolume will override another here. The only
+ * way to avoid that would be to retain extra information about whether
+ * the previous read_child is local, and it's just not worth it. Even
+ * the slowest local subvolume is far preferable to a remote one.
+ */
+ if (is_local) {
+ priv->local[child_index] = 1;
+ /* Don't set arbiter as read child. */
+ if (AFR_IS_ARBITER_BRICK(priv, child_index))
+ goto out;
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_LOCAL_CHILD,
+ "selecting local read_child %s",
+ priv->children[child_index]->name);
+
+ priv->read_child = child_index;
+ }
+out:
+ STACK_DESTROY(frame->root);
+ return 0;
+}
- AFR_STACK_DESTROY (frame);
+static void
+afr_attempt_local_discovery(xlator_t *this, int32_t child_index)
+{
+ call_frame_t *newframe = NULL;
+ loc_t tmploc = {
+ 0,
+ };
+ afr_private_t *priv = this->private;
+
+ newframe = create_frame(this, this->ctx->pool);
+ if (!newframe) {
+ return;
+ }
- return 0;
+ tmploc.gfid[sizeof(tmploc.gfid) - 1] = 1;
+ STACK_WIND_COOKIE(newframe, afr_local_discovery_cbk,
+ (void *)(long)child_index, priv->children[child_index],
+ priv->children[child_index]->fops->getxattr, &tmploc,
+ GF_XATTR_PATHINFO_KEY, NULL);
}
-
int
-afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+afr_lookup_sh_metadata_wrap(void *opaque)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ inode_t *inode = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0, first = -1;
+ int ret = -1;
+ dict_t *dict = NULL;
+
+ local = frame->local;
+ this = frame->this;
+ priv = this->private;
+ replies = local->replies;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ first = i;
+ break;
+ }
+ if (first == -1)
+ goto out;
+
+ if (afr_selfheal_metadata_by_stbuf(this, &replies[first].poststat))
+ goto out;
+
+ afr_local_replies_wipe(local, this->private);
+
+ dict = dict_new();
+ if (!dict)
+ goto out;
+ if (local->xattr_req) {
+ dict_copy(local->xattr_req, dict);
+ }
+
+ ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+ }
+
+ if (loc_is_nameless(&local->loc)) {
+ ret = afr_selfheal_unlocked_discover_on(frame, local->inode,
+ local->loc.gfid, local->replies,
+ local->child_up, dict);
+ } else {
+ inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
+ local->loc.name, local->replies,
+ local->child_up, dict);
+ }
+ if (inode)
+ inode_unref(inode);
+out:
+ if (loc_is_nameless(&local->loc))
+ afr_discover_done(frame, this);
+ else
+ afr_lookup_done(frame, this);
- call_frame_t * transaction_frame = NULL;
+ if (dict)
+ dict_unref(dict);
- int ret = -1;
+ return 0;
+}
- int op_ret = -1;
- int op_errno = 0;
+gf_boolean_t
+afr_is_pending_set(xlator_t *this, dict_t *xdata, int type)
+{
+ int idx = -1;
+ afr_private_t *priv = NULL;
+ void *pending_raw = NULL;
+ int *pending_int = NULL;
+ int i = 0;
- int call_count = 0;
+ priv = this->private;
+ idx = afr_index_for_transaction_type(type);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw) == 0) {
+ if (pending_raw) {
+ pending_int = pending_raw;
- priv = this->private;
+ if (ntoh32(pending_int[idx]))
+ return _gf_true;
+ }
+ }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+ continue;
+ if (!pending_raw)
+ continue;
+ pending_int = pending_raw;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ if (ntoh32(pending_int[idx]))
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+static gf_boolean_t
+afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0, first = -1;
+ gf_boolean_t start = _gf_false;
+ struct iatt stbuf = {
+ 0,
+ };
+
+ local = frame->local;
+ replies = local->replies;
+ priv = this->private;
+
+ if (!priv->metadata_self_heal)
+ return _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (first == -1) {
+ first = i;
+ stbuf = replies[i].poststat;
+ continue;
}
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ if (afr_is_pending_set(this, replies[i].xdata,
+ AFR_METADATA_TRANSACTION)) {
+ /* Let shd do the heal so that lookup is not blocked
+ * on getting metadata lock/doing the heal */
+ start = _gf_false;
+ break;
+ }
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
+ if (gf_uuid_compare(stbuf.ia_gfid, replies[i].poststat.ia_gfid)) {
+ start = _gf_false;
+ break;
+ }
+ if (!IA_EQUAL(stbuf, replies[i].poststat, type)) {
+ start = _gf_false;
+ break;
}
- transaction_frame->local = local;
+ /*Check if iattrs need heal*/
+ if ((!IA_EQUAL(stbuf, replies[i].poststat, uid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, gid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, prot))) {
+ start = _gf_true;
+ continue;
+ }
- local->op = GF_FOP_FLUSH;
+ /*Check if xattrs need heal*/
+ if (!afr_xattrs_are_equal(replies[first].xdata, replies[i].xdata))
+ start = _gf_true;
+ }
- local->transaction.fop = afr_flush_wind;
- local->transaction.done = afr_flush_done;
- local->transaction.unwind = afr_flush_unwind;
+ return start;
+}
- local->fd = fd_ref (fd);
+int
+afr_lookup_metadata_heal_check(call_frame_t *frame, xlator_t *this)
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+{
+ call_frame_t *heal = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ if (!afr_can_start_metadata_self_heal(frame, this))
+ goto out;
+
+ heal = afr_frame_create(this, &ret);
+ if (!heal) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = synctask_new(this->ctx->env, afr_lookup_sh_metadata_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret)
+ goto out;
+ return ret;
+out:
+ if (loc_is_nameless(&local->loc))
+ afr_discover_done(frame, this);
+ else
+ afr_lookup_done(frame, this);
+ if (heal)
+ AFR_STACK_DESTROY(heal);
+ return ret;
+}
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+int
+afr_lookup_selfheal_wrap(void *opaque)
+{
+ int ret = 0;
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ inode_t *inode = NULL;
+ uuid_t pargfid = {
+ 0,
+ };
+
+ local = frame->local;
+ this = frame->this;
+ loc_pargfid(&local->loc, pargfid);
+
+ ret = afr_selfheal_name(frame->this, pargfid, local->loc.name,
+ &local->cont.lookup.gfid_req, local->xattr_req);
+ if (ret == -EIO)
+ goto unwind;
+
+ afr_local_replies_wipe(local, this->private);
+
+ inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
+ local->loc.name, local->replies,
+ local->child_up, local->xattr_req);
+ if (inode)
+ inode_unref(inode);
+
+ afr_lookup_metadata_heal_check(frame, this);
+ return 0;
+unwind:
+ AFR_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+ return 0;
+}
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+int
+afr_lookup_entry_heal(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *heal = NULL;
+ int i = 0, first = -1;
+ gf_boolean_t name_state_mismatch = _gf_false;
+ struct afr_reply *replies = NULL;
+ int ret = 0;
+ unsigned char *par_readables = NULL;
+ unsigned char *success = NULL;
+ int32_t op_errno = 0;
+ uuid_t gfid = {0};
+
+ local = frame->local;
+ replies = local->replies;
+ priv = this->private;
+ par_readables = alloca0(priv->child_count);
+ success = alloca0(priv->child_count);
+
+ ret = afr_inode_read_subvol_get(local->loc.parent, this, par_readables,
+ NULL, NULL);
+ if (ret < 0 || AFR_COUNT(par_readables, priv->child_count) == 0) {
+ /* In this case set par_readables to all 1 so that name_heal
+ * need checks at the end of this function will flag missing
+ * entry when name state mismatches*/
+ memset(par_readables, 1, priv->child_count);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret == 0) {
+ if (gf_uuid_is_null(gfid)) {
+ gf_uuid_copy(gfid, replies[i].poststat.ia_gfid);
+ }
+ success[i] = 1;
+ } else {
+ if ((replies[i].op_errno != ENOTCONN) &&
+ (replies[i].op_errno != ENOENT) &&
+ (replies[i].op_errno != ESTALE)) {
+ op_errno = replies[i].op_errno;
+ }
+ }
- AFR_STACK_UNWIND (flush, frame, op_ret, op_errno);
+ /*gfid is missing, needs heal*/
+ if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) {
+ goto name_heal;
}
- return 0;
-}
+ if (first == -1) {
+ first = i;
+ continue;
+ }
-/* }}} */
+ if (replies[i].op_ret != replies[first].op_ret) {
+ name_state_mismatch = _gf_true;
+ }
+
+ if (replies[i].op_ret == 0) {
+ /* Rename after this lookup may succeed if we don't do
+ * a name-heal and the destination may not have pending xattrs
+ * to indicate which name is good and which is bad so always do
+ * this heal*/
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid, gfid)) {
+ goto name_heal;
+ }
+ }
+ }
+
+ if (name_state_mismatch) {
+ if (!priv->quorum_count)
+ goto name_heal;
+ if (!afr_has_quorum(success, this, NULL))
+ goto name_heal;
+ if (op_errno)
+ goto name_heal;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (par_readables[i] && replies[i].op_ret < 0 &&
+ replies[i].op_errno != ENOTCONN) {
+ goto name_heal;
+ }
+ }
+ }
+
+ goto metadata_heal;
+name_heal:
+ heal = afr_frame_create(this, NULL);
+ if (!heal)
+ goto metadata_heal;
+
+ ret = synctask_new(this->ctx->env, afr_lookup_selfheal_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret) {
+ AFR_STACK_DESTROY(heal);
+ goto metadata_heal;
+ }
+ return ret;
+
+metadata_heal:
+ ret = afr_lookup_metadata_heal_check(frame, this);
+
+ return ret;
+}
int
-afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
+afr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
{
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = 0;
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = -1;
+ GF_UNUSED int ret = 0;
+ int8_t need_heal = 1;
+
+ child_index = (long)cookie;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ /*
+ * On revalidate lookup if the gfid-changed, afr should unwind the fop
+ * with ESTALE so that a fresh lookup will be sent by the top xlator.
+ * So remember it.
+ */
+ if (xdata && dict_get_sizen(xdata, "gfid-changed"))
+ local->cont.lookup.needs_fresh_lookup = _gf_true;
+
+ if (xdata) {
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ local->replies[child_index].need_heal = need_heal;
+ } else {
+ local->replies[child_index].need_heal = need_heal;
+ }
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+ }
+
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ afr_set_need_heal(this, local);
+ afr_lookup_entry_heal(frame, this);
+ }
+
+ return 0;
+}
- ret = fd_ctx_get (fd, this, &ctx);
+static void
+afr_discover_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int read_subvol = -1;
+ int ret = 0;
+ unsigned char *data_readable = NULL;
+ unsigned char *success_replies = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ data_readable = alloca0(priv->child_count);
+ success_replies = alloca0(priv->child_count);
+
+ afr_fill_success_replies(local, priv, success_replies);
+ if (AFR_COUNT(success_replies, priv->child_count) > 0)
+ local->op_ret = 0;
+
+ if (local->op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(frame->local, this->private);
+ goto error;
+ }
- if (ret < 0)
- goto out;
+ if (!afr_has_quorum(success_replies, this, frame))
+ goto unwind;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ ret = afr_replies_interpret(frame, this, local->inode, NULL);
+ if (ret) {
+ afr_inode_need_refresh_set(local->inode, this);
+ }
- if (fd_ctx) {
- if (fd_ctx->pre_op_done)
- GF_FREE (fd_ctx->pre_op_done);
+ read_subvol = afr_read_subvol_decide(local->inode, this, NULL,
+ data_readable);
- if (fd_ctx->opened_on)
- GF_FREE (fd_ctx->opened_on);
+unwind:
+ afr_attempt_readsubvol_set(frame, this, success_replies, data_readable,
+ &read_subvol);
+ if (read_subvol == -1)
+ goto error;
- if (fd_ctx->locked_on)
- GF_FREE (fd_ctx->locked_on);
+ if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg_debug(this->name, 0,
+ "Arbiter cannot be a read subvol "
+ "for %s",
+ local->loc.path);
+ }
+
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[read_subvol].postparent);
+ return;
+
+error:
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL,
+ NULL, NULL);
+}
- GF_FREE (fd_ctx);
- }
+static int
+afr_ta_id_file_check(void *opaque)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *this = NULL;
+ loc_t loc = {
+ 0,
+ };
+ struct iatt stbuf = {
+ 0,
+ };
+ dict_t *dict = NULL;
+ uuid_t gfid = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ int ret = 0;
+
+ this = opaque;
+ priv = this->private;
+
+ ret = afr_fill_ta_loc(this, &loc, _gf_false);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ goto out;
+ }
+
+ ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, &stbuf,
+ 0, 0, 0);
+ if (ret == 0) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ fd = fd_create(loc.inode, getpid());
+ if (!fd)
+ goto out;
+ dict = dict_new();
+ if (!dict)
+ goto out;
+ gf_uuid_generate(gfid);
+ ret = dict_set_gfuuid(dict, "gfid-req", gfid, true);
+ ret = syncop_create(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ O_RDWR, 0664, fd, &stbuf, dict, NULL);
+ }
out:
- return 0;
+ if (ret == 0) {
+ gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to lookup/create thin-arbiter id file.");
+ }
+ if (dict)
+ dict_unref(dict);
+ if (fd)
+ fd_unref(fd);
+ loc_wipe(&loc);
+
+ return 0;
+}
+
+static int
+afr_ta_id_file_check_cbk(int ret, call_frame_t *ta_frame, void *opaque)
+{
+ return 0;
}
+static void
+afr_discover_done(call_frame_t *frame, xlator_t *this)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ if (!priv->thin_arbiter_count)
+ goto unwind;
+ if (!gf_uuid_is_null(priv->ta_gfid))
+ goto unwind;
+
+ ret = synctask_new(this->ctx->env, afr_ta_id_file_check,
+ afr_ta_id_file_check_cbk, NULL, this);
+ if (ret)
+ goto unwind;
+unwind:
+ afr_discover_unwind(frame, this);
+}
int
-afr_release (xlator_t *this, fd_t *fd)
+afr_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
{
- afr_locked_fd_t *locked_fd = NULL;
- afr_locked_fd_t *tmp = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = -1;
+ GF_UNUSED int ret = 0;
+ int8_t need_heal = 1;
+
+ child_index = (long)cookie;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+ }
+
+ if (local->do_discovery && (op_ret == 0))
+ afr_attempt_local_discovery(this, child_index);
+
+ if (xdata) {
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ local->replies[child_index].need_heal = need_heal;
+ } else {
+ local->replies[child_index].need_heal = need_heal;
+ }
+
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ afr_set_need_heal(this, local);
+ afr_lookup_metadata_heal_check(frame, this);
+ }
+
+ return 0;
+}
- priv = this->private;
+int
+afr_discover_do(call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err) {
+ local->op_errno = err;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT(local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req,
+ &local->loc);
+ if (ret) {
+ local->op_errno = -ret;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(
+ frame, afr_discover_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->lookup, &local->loc, local->xattr_req);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
+}
- afr_cleanup_fd_ctx (this, fd);
+int
+afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int event = 0;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ if (__is_root_gfid(loc->inode->gfid)) {
+ if (!priv->root_inode)
+ priv->root_inode = inode_ref(loc->inode);
+
+ if (priv->choose_local && !priv->did_discovery) {
+ /* Logic to detect which subvolumes of AFR are
+ local, in order to prefer them for reads
+ */
+ local->do_discovery = _gf_true;
+ priv->did_discovery = _gf_true;
+ }
+ }
- list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds,
- list) {
+ local->op = GF_FOP_LOOKUP;
- if (locked_fd->fd == fd) {
- list_del_init (&locked_fd->list);
- GF_FREE (locked_fd);
- }
+ loc_copy(&local->loc, loc);
+
+ local->inode = inode_ref(loc->inode);
+ if (xattr_req) {
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_copy_with_ref(xattr_req, NULL);
+ if (!local->xattr_req) {
+ op_errno = ENOMEM;
+ goto out;
}
+ }
+ if (gf_uuid_is_null(loc->inode->gfid)) {
+ afr_discover_do(frame, this, 0);
return 0;
-}
+ }
+ afr_read_subvol_get(loc->inode, this, NULL, NULL, &event,
+ AFR_DATA_TRANSACTION, NULL);
-/* {{{ fsync */
+ afr_discover_do(frame, this, 0);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
+}
int
-afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
+afr_lookup_do(call_frame_t *frame, xlator_t *this, int err)
{
- afr_local_t *local = NULL;
-
- int call_count = -1;
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err < 0) {
+ local->op_errno = err;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT(local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req,
+ &local->loc);
+ if (ret) {
+ local->op_errno = -ret;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(
+ frame, afr_lookup_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->lookup, &local->loc, local->xattr_req);
+ if (!--call_count)
+ break;
+ }
+ }
+ return 0;
+out:
+ AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
+}
- int child_index = (long) cookie;
- int read_child = 0;
+/*
+ * afr_lookup()
+ *
+ * The goal here is to figure out what the element getting looked up is.
+ * i.e what is the GFID, inode type and a conservative estimate of the
+ * inode attributes are.
+ *
+ * As we lookup, operations may be underway on the entry name and the
+ * inode. In lookup() we are primarily concerned only with the entry
+ * operations. If the entry is getting unlinked or renamed, we detect
+ * what operation is underway by querying for on-going transactions and
+ * pending self-healing on the entry through xdata.
+ *
+ * If the entry is a file/dir, it may need self-heal and/or in a
+ * split-brain condition. Lookup is not the place to worry about these
+ * conditions. Outcast marking will naturally handle them in the read
+ * paths.
+ *
+ * Here is a brief goal of what we are trying to achieve:
+ *
+ * - LOOKUP on all subvolumes concurrently, querying on-going transaction
+ * and pending self-heal info from the servers.
+ *
+ * - If all servers reply the same inode type and GFID, the overall call
+ * MUST be a success.
+ *
+ * - If inode types or GFIDs mismatch, and there IS either an on-going
+ * transaction or pending self-heal, inspect what the nature of the
+ * transaction or pending heal is, and select the appropriate subvolume's
+ * reply as the winner.
+ *
+ * - If inode types or GFIDs mismatch, and there are no on-going transactions
+ * or pending self-heal on the entry name on any of the servers, fail the
+ * lookup with EIO. Something has gone wrong beyond reasonable action.
+ */
- local = frame->local;
+int
+afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int event = 0;
+ int ret = 0;
+
+ if (loc_is_nameless(loc)) {
+ if (xattr_req)
+ dict_del_sizen(xattr_req, "gfid-req");
+ afr_discover(frame, this, loc, xattr_req);
+ return 0;
+ }
- read_child = afr_read_child (this, local->fd->inode);
+ if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name,
+ frame->root->pid)) {
+ op_errno = EPERM;
+ goto out;
+ }
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- if (op_ret == 0) {
- local->op_ret = 0;
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- if (local->success_count == 0) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
- }
+ local->op = GF_FOP_LOOKUP;
- if (child_index == read_child) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
- }
+ loc_copy(&local->loc, loc);
- local->success_count++;
- }
+ local->inode = inode_ref(loc->inode);
- local->op_errno = op_errno;
+ if (xattr_req) {
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_copy_with_ref(xattr_req, NULL);
+ if (!local->xattr_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ ret = dict_get_gfuuid(local->xattr_req, "gfid-req",
+ &local->cont.lookup.gfid_req);
+ if (ret == 0) {
+ dict_del_sizen(local->xattr_req, "gfid-req");
}
- UNLOCK (&frame->lock);
+ }
- call_count = afr_frame_return (frame);
+ afr_read_subvol_get(loc->parent, this, NULL, NULL, &event,
+ AFR_DATA_TRANSACTION, NULL);
- if (call_count == 0) {
- local->cont.fsync.prebuf.ia_ino = local->cont.fsync.ino;
- local->cont.fsync.postbuf.ia_ino = local->cont.fsync.ino;
+ afr_lookup_do(frame, this, 0);
- AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno,
- &local->cont.fsync.prebuf,
- &local->cont.fsync.postbuf);
- }
+ return 0;
+out:
+ AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
+void
+_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx)
+{
+ afr_private_t *priv = this->private;
+
+ if (fd_ctx->lk_heal_info) {
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info);
+ fd_ctx->lk_heal_info = NULL;
+ }
+ GF_FREE(fd_ctx->opened_on);
+ GF_FREE(fd_ctx);
+ return;
+}
int
-afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync)
+afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ uint64_t ctx = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = 0;
- int ret = -1;
+ ret = fd_ctx_get(fd, this, &ctx);
+ if (ret < 0)
+ goto out;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ fd_ctx = (afr_fd_ctx_t *)(long)ctx;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ if (fd_ctx) {
+ _afr_cleanup_fd_ctx(this, fd_ctx);
+ }
- priv = this->private;
+out:
+ return 0;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+int
+afr_release(xlator_t *this, fd_t *fd)
+{
+ afr_cleanup_fd_ctx(this, fd);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ return 0;
+}
- call_count = local->call_count;
- frame->local = local;
+afr_fd_ctx_t *
+__afr_fd_ctx_get(fd_t *fd, xlator_t *this)
+{
+ uint64_t ctx = 0;
+ int ret = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
- local->fd = fd_ref (fd);
- local->cont.fsync.ino = fd->inode->ino;
+ ret = __fd_ctx_get(fd, this, &ctx);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_fsync_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fsync,
- fd, datasync);
- if (!--call_count)
- break;
- }
- }
+ if (ret < 0) {
+ ret = __afr_fd_ctx_set(this, fd);
+ if (ret < 0)
+ goto out;
+
+ ret = __fd_ctx_get(fd, this, &ctx);
+ if (ret < 0)
+ goto out;
+ }
- op_ret = 0;
+ fd_ctx = (afr_fd_ctx_t *)(long)ctx;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL);
- }
- return 0;
+ return fd_ctx;
}
-/* }}} */
+afr_fd_ctx_t *
+afr_fd_ctx_get(fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
-/* {{{ fsync */
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(fd, this);
+ }
+ UNLOCK(&fd->lock);
-int32_t
-afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ return fd_ctx;
+}
+
+int
+__afr_fd_ctx_set(xlator_t *this, fd_t *fd)
{
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ uint64_t ctx = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int i = 0;
+
+ VALIDATE_OR_GOTO(this->private, out);
+ VALIDATE_OR_GOTO(fd, out);
+
+ priv = this->private;
+
+ ret = __fd_ctx_get(fd, this, &ctx);
+
+ if (ret == 0)
+ goto out;
+
+ fd_ctx = GF_CALLOC(1, sizeof(afr_fd_ctx_t), gf_afr_mt_afr_fd_ctx_t);
+ if (!fd_ctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ fd_ctx->opened_on = GF_CALLOC(sizeof(*fd_ctx->opened_on), priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!fd_ctx->opened_on) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_is_anonymous(fd))
+ fd_ctx->opened_on[i] = AFR_FD_OPENED;
+ else
+ fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
+ }
- int call_count = -1;
+ fd_ctx->readdir_subvol = -1;
+ fd_ctx->lk_heal_info = NULL;
- local = frame->local;
+ ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx);
+ if (ret)
+ gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd);
+out:
+ if (ret && fd_ctx)
+ _afr_cleanup_fd_ctx(this, fd_ctx);
+ return ret;
+}
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+/* {{{ flush */
+
+int
+afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
- local->op_errno = op_errno;
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ } else {
+ local->op_errno = op_errno;
}
- UNLOCK (&frame->lock);
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- call_count = afr_frame_return (frame);
+ if (call_count == 0)
+ AFR_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
- if (call_count == 0)
- AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret,
- local->op_errno);
+ return 0;
+}
- return 0;
+static int
+afr_flush_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+
+ priv = this->private;
+ local = frame->local;
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, afr_flush_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->flush,
+ local->fd, xdata);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
}
+afr_local_t *
+afr_wakeup_same_fd_delayed_op(xlator_t *this, afr_lock_t *lock, fd_t *fd)
+{
+ afr_local_t *local = NULL;
+
+ if (lock->delay_timer) {
+ local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ if (fd == local->fd) {
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+ local = NULL;
+ } else {
+ lock->delay_timer = NULL;
+ }
+ } else {
+ local = NULL;
+ }
+ }
+
+ return local;
+}
-int32_t
-afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync)
+void
+afr_delayed_changelog_wake_resume(xlator_t *this, inode_t *inode,
+ call_stub_t *stub)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+ afr_lock_t *lock = NULL;
+ afr_local_t *metadata_local = NULL;
+ afr_local_t *data_local = NULL;
+ LOCK(&inode->lock);
+ {
+ (void)__afr_inode_ctx_get(this, inode, &ctx);
+ lock = &ctx->lock[AFR_DATA_TRANSACTION];
+ data_local = afr_wakeup_same_fd_delayed_op(this, lock, stub->args.fd);
+ lock = &ctx->lock[AFR_METADATA_TRANSACTION];
+ metadata_local = afr_wakeup_same_fd_delayed_op(this, lock,
+ stub->args.fd);
+ }
+ UNLOCK(&inode->lock);
+
+ if (data_local) {
+ data_local->transaction.resume_stub = stub;
+ } else if (metadata_local) {
+ metadata_local->transaction.resume_stub = stub;
+ } else {
+ call_resume(stub);
+ }
+ if (data_local) {
+ afr_delayed_changelog_wake_up_cbk(data_local);
+ }
+ if (metadata_local) {
+ afr_delayed_changelog_wake_up_cbk(metadata_local);
+ }
+}
- int ret = -1;
+int
+afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int op_errno = ENOMEM;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local->op = GF_FOP_FLUSH;
+ if (!afr_is_consistent_io_possible(local, this->private, &op_errno))
+ goto out;
- priv = this->private;
+ local->fd = fd_ref(fd);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ stub = fop_flush_stub(frame, afr_flush_wrapper, fd, xdata);
+ if (!stub)
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ afr_delayed_changelog_wake_resume(this, fd->inode, stub);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+ return 0;
+}
+
+int
+afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ } else {
+ local->op_errno = op_errno;
}
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- call_count = local->call_count;
- frame->local = local;
+ if (call_count == 0)
+ AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fsyncdir_cbk,
- priv->children[i],
- priv->children[i]->fops->fsyncdir,
- fd, datasync);
- if (!--call_count)
- break;
- }
+ return 0;
+}
+
+int
+afr_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_FSYNCDIR;
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+
+ call_count = local->call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND(frame, afr_fsyncdir_cbk, priv->children[i],
+ priv->children[i]->fops->fsyncdir, fd, datasync, xdata);
+ if (!--call_count)
+ break;
}
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno);
- }
- return 0;
+ AFR_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL);
+
+ return 0;
}
/* }}} */
-/* {{{ xattrop */
+static int
+afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this);
-int32_t
-afr_xattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr)
+static gf_boolean_t
+afr_is_conflicting_lock_present(int32_t op_ret, int32_t op_errno)
{
- afr_local_t *local = NULL;
+ if (op_ret == -1 && op_errno == EAGAIN)
+ return _gf_true;
+ return _gf_false;
+}
- int call_count = -1;
+static void
+afr_fop_lock_unwind(call_frame_t *frame, glusterfs_fop_t op, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ switch (op) {
+ case GF_FOP_INODELK:
+ AFR_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata);
+ break;
+ case GF_FOP_FINODELK:
+ AFR_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata);
+ break;
+ case GF_FOP_ENTRYLK:
+ AFR_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata);
+ break;
+ case GF_FOP_FENTRYLK:
+ AFR_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, xdata);
+ break;
+ default:
+ break;
+ }
+}
- local = frame->local;
+static void
+afr_fop_lock_wind(call_frame_t *frame, xlator_t *this, int child_index,
+ int32_t (*lock_cbk)(call_frame_t *, void *, xlator_t *,
+ int32_t, int32_t, dict_t *))
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = child_index;
+
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->inodelk,
+ (const char *)local->cont.inodelk.volume, &local->loc,
+ local->cont.inodelk.cmd, &local->cont.inodelk.flock,
+ local->cont.inodelk.xdata);
+ break;
+ case GF_FOP_FINODELK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->finodelk,
+ (const char *)local->cont.inodelk.volume, local->fd,
+ local->cont.inodelk.cmd, &local->cont.inodelk.flock,
+ local->cont.inodelk.xdata);
+ break;
+ case GF_FOP_ENTRYLK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->entrylk, local->cont.entrylk.volume,
+ &local->loc, local->cont.entrylk.basename,
+ local->cont.entrylk.cmd, local->cont.entrylk.type,
+ local->cont.entrylk.xdata);
+ break;
+ case GF_FOP_FENTRYLK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->fentrylk, local->cont.entrylk.volume,
+ local->fd, local->cont.entrylk.basename,
+ local->cont.entrylk.cmd, local->cont.entrylk.type,
+ local->cont.entrylk.xdata);
+ break;
+ default:
+ break;
+ }
+}
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+void
+afr_fop_lock_proceed(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local = frame->local;
+ priv = frame->this->private;
+
+ if (local->fop_lock_state != AFR_FOP_LOCK_PARALLEL) {
+ afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return;
+ }
+ /* At least one child is up */
+ /*
+ * Non-blocking locks also need to be serialized. Otherwise there is
+ * a chance that both the mounts which issued same non-blocking inodelk
+ * may endup not acquiring the lock on any-brick.
+ * Ex: Mount1 and Mount2
+ * request for full length lock on file f1. Mount1 afr may acquire the
+ * partial lock on brick-1 and may not acquire the lock on brick-2
+ * because Mount2 already got the lock on brick-2, vice versa. Since
+ * both the mounts only got partial locks, afr treats them as failure in
+ * gaining the locks and unwinds with EAGAIN errno.
+ */
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+ local->fop_lock_state = AFR_FOP_LOCK_SERIAL;
+ afr_local_replies_wipe(local, priv);
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.cmd = local->cont.inodelk.in_cmd;
+ local->cont.inodelk.flock = local->cont.inodelk.in_flock;
+ if (local->cont.inodelk.xdata)
+ dict_unref(local->cont.inodelk.xdata);
+ local->cont.inodelk.xdata = NULL;
+ if (local->xdata_req)
+ local->cont.inodelk.xdata = dict_ref(local->xdata_req);
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.cmd = local->cont.entrylk.in_cmd;
+ if (local->cont.entrylk.xdata)
+ dict_unref(local->cont.entrylk.xdata);
+ local->cont.entrylk.xdata = NULL;
+ if (local->xdata_req)
+ local->cont.entrylk.xdata = dict_ref(local->xdata_req);
+ break;
+ default:
+ break;
+ }
+ afr_serialized_lock_wind(frame, frame->this);
+}
- call_count = afr_frame_return (frame);
+static int32_t
+afr_unlock_partial_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
- if (call_count == 0)
- AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno,
- xattr);
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int child_index = (long)cookie;
+ uuid_t gfid = {0};
- return 0;
+ local = frame->local;
+ priv = this->private;
+
+ if (op_ret < 0 && op_errno != ENOTCONN) {
+ if (local->fd)
+ gf_uuid_copy(gfid, local->fd->inode->gfid);
+ else
+ loc_gfid(&local->loc, gfid);
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "%s: Failed to unlock %s on %s "
+ "with lk_owner: %s",
+ uuid_utoa(gfid), gf_fop_list[local->op],
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
+
+ call_count = afr_frame_return(frame);
+ if (call_count == 0)
+ afr_fop_lock_proceed(frame);
+
+ return 0;
}
+static int32_t
+afr_unlock_locks_and_proceed(call_frame_t *frame, xlator_t *this,
+ int call_count)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ if (call_count == 0) {
+ afr_fop_lock_proceed(frame);
+ goto out;
+ }
+
+ local = frame->local;
+ priv = this->private;
+ local->call_count = call_count;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.flock.l_type = F_UNLCK;
+ local->cont.inodelk.cmd = F_SETLK;
+ if (local->cont.inodelk.xdata)
+ dict_unref(local->cont.inodelk.xdata);
+ local->cont.inodelk.xdata = NULL;
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.cmd = ENTRYLK_UNLOCK;
+ if (local->cont.entrylk.xdata)
+ dict_unref(local->cont.entrylk.xdata);
+ local->cont.entrylk.xdata = NULL;
+ break;
+ default:
+ break;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+
+ if (local->replies[i].op_ret == -1)
+ continue;
+
+ afr_fop_lock_wind(frame, this, i, afr_unlock_partial_lock_cbk);
+
+ if (!--call_count)
+ break;
+ }
+
+out:
+ return 0;
+}
int32_t
-afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t optype, dict_t *xattr)
+afr_fop_lock_done(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ int i = 0;
+ int lock_count = 0;
+ unsigned char *success = NULL;
- int ret = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ local = frame->local;
+ priv = this->private;
+ success = alloca0(priv->child_count);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
- priv = this->private;
+ if (local->replies[i].op_ret == 0) {
+ lock_count++;
+ success[i] = 1;
+ }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ if (local->op_ret == -1 && local->op_errno == EAGAIN)
+ continue;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ if ((local->replies[i].op_ret == -1) &&
+ (local->replies[i].op_errno == EAGAIN)) {
+ local->op_ret = -1;
+ local->op_errno = EAGAIN;
+ continue;
}
- call_count = local->call_count;
- frame->local = local;
+ if (local->replies[i].op_ret == 0)
+ local->op_ret = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_xattrop_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- loc, optype, xattr);
- if (!--call_count)
- break;
- }
+ local->op_errno = local->replies[i].op_errno;
+ }
+
+ if (afr_fop_lock_is_unlock(frame))
+ goto unwind;
+
+ if (afr_is_conflicting_lock_present(local->op_ret, local->op_errno)) {
+ afr_unlock_locks_and_proceed(frame, this, lock_count);
+ } else if (priv->quorum_count && !afr_has_quorum(success, this, NULL)) {
+ local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED;
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ if (local->op_errno == 0)
+ local->op_errno = afr_quorum_errno(priv);
+ afr_unlock_locks_and_proceed(frame, this, lock_count);
+ } else {
+ goto unwind;
+ }
+
+ return 0;
+unwind:
+ afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
+
+static int
+afr_common_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret == 0 && xdata) {
+ local->replies[child_index].xdata = dict_ref(xdata);
+ LOCK(&frame->lock);
+ {
+ if (!local->xdata_rsp)
+ local->xdata_rsp = dict_ref(xdata);
}
+ UNLOCK(&frame->lock);
+ }
+ return 0;
+}
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (xattrop, frame, op_ret, op_errno, NULL);
+static int32_t
+afr_serialized_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = (long)cookie;
+ int next_child = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+
+ for (next_child = child_index + 1; next_child < priv->child_count;
+ next_child++) {
+ if (local->child_up[next_child])
+ break;
+ }
+
+ if (afr_is_conflicting_lock_present(op_ret, op_errno) ||
+ (next_child == priv->child_count)) {
+ afr_fop_lock_done(frame, this);
+ } else {
+ afr_fop_lock_wind(frame, this, next_child, afr_serialized_lock_cbk);
+ }
+
+ return 0;
+}
+
+static int
+afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ afr_fop_lock_wind(frame, this, i, afr_serialized_lock_cbk);
+ break;
}
- return 0;
+ }
+ return 0;
}
-/* }}} */
+static int32_t
+afr_parallel_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+ int call_count = 0;
-/* {{{ fxattrop */
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
-int32_t
-afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr)
+ call_count = afr_frame_return(frame);
+ if (call_count == 0)
+ afr_fop_lock_done(frame, this);
+
+ return 0;
+}
+
+static int
+afr_parallel_lock_wind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+ afr_fop_lock_wind(frame, this, i, afr_parallel_lock_cbk);
+ if (!--call_count)
+ break;
+ }
+ return 0;
+}
- int call_count = -1;
+static int
+afr_fop_handle_lock(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = frame->local;
+ int op_errno = 0;
- local = frame->local;
+ if (!afr_fop_lock_is_unlock(frame)) {
+ if (!afr_is_consistent_io_possible(local, this->private, &op_errno))
+ goto out;
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.cmd = F_SETLK;
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.cmd = ENTRYLK_LOCK_NB;
+ break;
+ default:
+ break;
+ }
+ }
- local->op_errno = op_errno;
+ if (local->xdata_req) {
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.xdata = dict_ref(local->xdata_req);
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.xdata = dict_ref(local->xdata_req);
+ break;
+ default:
+ break;
}
- UNLOCK (&frame->lock);
+ }
- call_count = afr_frame_return (frame);
+ local->fop_lock_state = AFR_FOP_LOCK_PARALLEL;
+ afr_parallel_lock_wind(frame, this);
+out:
+ return -op_errno;
+}
- if (call_count == 0)
- AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno,
- xattr);
+static int32_t
+afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = ENOMEM;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = fop;
+ if (loc)
+ loc_copy(&local->loc, loc);
+ if (fd && (flock->l_type != F_UNLCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local->fd = fd_ref(fd);
+ }
+
+ local->cont.inodelk.volume = gf_strdup(volume);
+ if (!local->cont.inodelk.volume) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->cont.inodelk.in_cmd = cmd;
+ local->cont.inodelk.cmd = cmd;
+ local->cont.inodelk.in_flock = *flock;
+ local->cont.inodelk.flock = *flock;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ op_errno = -afr_fop_handle_lock(frame, frame->this);
+ if (op_errno)
+ goto out;
+ return 0;
+out:
+ afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL);
- return 0;
+ return 0;
}
+int32_t
+afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd,
+ flock, xdata);
+ return 0;
+}
int32_t
-afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t optype, dict_t *xattr)
+afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd,
+ flock, xdata);
+ return 0;
+}
- int ret = -1;
+static int
+afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = ENOMEM;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = fop;
+ if (loc)
+ loc_copy(&local->loc, loc);
+ if (fd && (cmd != ENTRYLK_UNLOCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local->fd = fd_ref(fd);
+ }
+ local->cont.entrylk.cmd = cmd;
+ local->cont.entrylk.in_cmd = cmd;
+ local->cont.entrylk.type = type;
+ local->cont.entrylk.volume = gf_strdup(volume);
+ local->cont.entrylk.basename = gf_strdup(basename);
+ if (!local->cont.entrylk.volume || !local->cont.entrylk.basename) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+ op_errno = -afr_fop_handle_lock(frame, frame->this);
+ if (op_errno)
+ goto out;
+
+ return 0;
+out:
+ afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL);
+ return 0;
+}
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+int
+afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename,
+ cmd, type, xdata);
+ return 0;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+int
+afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename,
+ cmd, type, xdata);
+ return 0;
+}
- priv = this->private;
+int
+afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ struct statvfs *buf = NULL;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local = frame->local;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ LOCK(&frame->lock);
+ {
+ if (op_ret != 0) {
+ local->op_errno = op_errno;
+ goto unlock;
}
- call_count = local->call_count;
- frame->local = local;
+ local->op_ret = op_ret;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fxattrop_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- fd, optype, xattr);
- if (!--call_count)
- break;
+ buf = &local->cont.statfs.buf;
+ if (local->cont.statfs.buf_set) {
+ if (statvfs->f_bavail < buf->f_bavail) {
+ *buf = *statvfs;
+ if (xdata) {
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = dict_ref(xdata);
}
+ }
+ } else {
+ *buf = *statvfs;
+ local->cont.statfs.buf_set = 1;
+ if (xdata)
+ local->xdata_rsp = dict_ref(xdata);
}
+ }
+unlock:
+ call_count = --local->call_count;
+ UNLOCK(&frame->lock);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, NULL);
- }
- return 0;
+ if (call_count == 0)
+ AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
+ &local->cont.statfs.buf, local->xdata_rsp);
+
+ return 0;
}
-/* }}} */
+int
+afr_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_STATFS;
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+
+ if (priv->arbiter_count == 1 && local->child_up[ARBITER_BRICK_INDEX])
+ local->call_count--;
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ if (AFR_IS_ARBITER_BRICK(priv, i))
+ continue;
+ STACK_WIND(frame, afr_statfs_cbk, priv->children[i],
+ priv->children[i]->fops->statfs, loc, xdata);
+ if (!--call_count)
+ break;
+ }
+ }
+ return 0;
+out:
+ AFR_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
int32_t
-afr_inodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ int call_count = -1;
+ int child_index = (long)cookie;
+
+ local = frame->local;
+
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "gfid=%s: unlock failed on subvolume %s "
+ "with lock owner %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
+
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
+ local->xdata_rsp);
+ }
+
+ return 0;
+}
+int32_t
+afr_lk_unlock(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
- int call_count = -1;
+ local = frame->local;
+ priv = this->private;
- local = frame->local;
+ call_count = afr_locked_nodes_count(local->cont.lk.locked_nodes,
+ priv->child_count);
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+ if (call_count == 0) {
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
+ local->xdata_rsp);
+ return 0;
+ }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local->call_count = call_count;
- call_count = afr_frame_return (frame);
+ local->cont.lk.user_flock.l_type = F_UNLCK;
- if (call_count == 0)
- AFR_STACK_UNWIND (inodelk, frame, local->op_ret,
- local->op_errno);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.locked_nodes[i]) {
+ STACK_WIND_COOKIE(frame, afr_lk_unlock_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->lk,
+ local->fd, F_SETLK, &local->cont.lk.user_flock,
+ NULL);
- return 0;
-}
+ if (!--call_count)
+ break;
+ }
+ }
+ return 0;
+}
int32_t
-afr_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock)
+afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = -1;
- int ret = -1;
+ local = frame->local;
+ priv = this->private;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ child_index = (long)cookie;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret < 0 && op_errno == EAGAIN) {
+ local->op_ret = -1;
+ local->op_errno = EAGAIN;
- priv = this->private;
+ afr_lk_unlock(frame, this);
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ local->cont.lk.ret_flock = *lock;
+ }
+
+ child_index++;
+
+ if (child_index < priv->child_count) {
+ STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->lk, local->fd,
+ local->cont.lk.cmd, &local->cont.lk.user_flock,
+ local->xdata_req);
+ } else if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ afr_lk_unlock(frame, this);
+ } else {
+ if (local->op_ret < 0)
+ local->op_errno = afr_final_errno(local, priv);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, local->xdata_rsp);
+ }
- call_count = local->call_count;
- frame->local = local;
+ return 0;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_inodelk_cbk,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- volume, loc, cmd, flock);
-
- if (!--call_count)
- break;
- }
- }
+int
+afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+ return 0;
+}
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (inodelk, frame, op_ret, op_errno);
- }
- return 0;
+int
+afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = -1;
+
+ local = frame->local;
+ child_index = (long)cookie;
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ local->cont.lk.ret_flock = *lock;
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
}
+int
+afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int child_index = (long)cookie;
+
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "gfid=%s: unlock failed on subvolume %s "
+ "with lock owner %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
+ return 0;
+}
+int
+afr_lk_transaction(void *opaque)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ char *wind_on = NULL;
+ int op_errno = 0;
+ int i = 0;
+ int ret = 0;
+
+ frame = (call_frame_t *)opaque;
+ local = frame->local;
+ this = frame->this;
+ priv = this->private;
+ wind_on = alloca0(priv->child_count);
+
+ if (priv->arbiter_count || priv->child_count != 3) {
+ op_errno = ENOTSUP;
+ gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Lock healing supported only for replica 3 volumes.",
+ uuid_utoa(local->fd->inode->gfid));
+ goto err;
+ }
+
+ op_errno = -afr_dom_lock_acquire(frame); // Released during
+ // AFR_STACK_UNWIND
+ if (op_errno != 0) {
+ goto err;
+ }
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) {
+ op_errno = afr_final_errno(local, priv);
+ goto err;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i])
+ wind_on[i] = 1;
+ }
+ AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd,
+ local->cont.lk.cmd, &local->cont.lk.user_flock,
+ local->xdata_req);
+
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ goto unlock;
+ } else {
+ if (local->cont.lk.user_flock.l_type == F_UNLCK)
+ ret = afr_remove_lock_from_saved_locks(local, this);
+ else
+ ret = afr_add_lock_to_saved_locks(frame, this);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ goto unlock;
+ }
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, local->xdata_rsp);
+ }
+
+ return 0;
-int32_t
-afr_finodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+unlock:
+ local->cont.lk.user_flock.l_type = F_UNLCK;
+ AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk,
+ local->fd, F_SETLK, &local->cont.lk.user_flock, NULL);
+err:
+ AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+ return -1;
+}
+int
+afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ int i = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_LK;
+ if (!afr_lk_is_unlock(cmd, flock)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+ }
+
+ local->cont.lk.locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.locked_nodes),
+ gf_afr_mt_char);
+
+ if (!local->cont.lk.locked_nodes) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->fd = fd_ref(fd);
+ local->cont.lk.cmd = cmd;
+ local->cont.lk.user_flock = *flock;
+ local->cont.lk.ret_flock = *flock;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ if (afr_is_lock_mode_mandatory(xdata)) {
+ ret = synctask_new(this->ctx->env, afr_lk_transaction,
+ afr_lk_transaction_cbk, frame, frame);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ return 0;
+ }
- int call_count = -1;
+ STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i],
+ priv->children[i]->fops->lk, fd, cmd, flock,
+ local->xdata_req);
- local = frame->local;
+ return 0;
+out:
+ AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+ return 0;
+}
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+int32_t
+afr_lease_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_lease *lease,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
- call_count = afr_frame_return (frame);
+ local = frame->local;
+ call_count = afr_frame_return(frame);
- if (call_count == 0)
- AFR_STACK_UNWIND (finodelk, frame, local->op_ret,
- local->op_errno);
+ if (call_count == 0)
+ AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, lease,
+ xdata);
- return 0;
+ return 0;
}
-
int32_t
-afr_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock)
+afr_lease_unlock(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
- int ret = -1;
+ local = frame->local;
+ priv = this->private;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ call_count = afr_locked_nodes_count(local->cont.lease.locked_nodes,
+ priv->child_count);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ if (call_count == 0) {
+ AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno,
+ &local->cont.lease.ret_lease, NULL);
+ return 0;
+ }
- priv = this->private;
+ local->call_count = call_count;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->cont.lease.user_lease.cmd = GF_UNLK_LEASE;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lease.locked_nodes[i]) {
+ STACK_WIND(frame, afr_lease_unlock_cbk, priv->children[i],
+ priv->children[i]->fops->lease, &local->loc,
+ &local->cont.lease.user_lease, NULL);
+
+ if (!--call_count)
+ break;
}
+ }
- call_count = local->call_count;
- frame->local = local;
+ return 0;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_finodelk_cbk,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- volume, fd, cmd, flock);
-
- if (!--call_count)
- break;
- }
- }
+int32_t
+afr_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_lease *lease, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = -1;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno);
- }
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long)cookie;
+
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret < 0 && op_errno == EAGAIN) {
+ local->op_ret = -1;
+ local->op_errno = EAGAIN;
+
+ afr_lease_unlock(frame, this);
return 0;
-}
+ }
+
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lease.locked_nodes[child_index] = 1;
+ local->cont.lease.ret_lease = *lease;
+ }
+
+ child_index++;
+ if (child_index < priv->child_count) {
+ STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->lease, &local->loc,
+ &local->cont.lease.user_lease, xdata);
+ } else if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lease.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ afr_lease_unlock(frame, this);
+ } else {
+ if (local->op_ret < 0)
+ local->op_errno = afr_final_errno(local, priv);
+ AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno,
+ &local->cont.lease.ret_lease, NULL);
+ }
-int32_t
-afr_entrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ return 0;
+}
+int
+afr_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct gf_lease *lease, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int32_t op_errno = ENOMEM;
- int call_count = -1;
+ priv = this->private;
- local = frame->local;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+ local->op = GF_FOP_LEASE;
+ local->cont.lease.locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lease.locked_nodes),
+ gf_afr_mt_char);
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ if (!local->cont.lease.locked_nodes) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- call_count = afr_frame_return (frame);
+ loc_copy(&local->loc, loc);
+ local->cont.lease.user_lease = *lease;
+ local->cont.lease.ret_lease = *lease;
- if (call_count == 0)
- AFR_STACK_UNWIND (entrylk, frame, local->op_ret,
- local->op_errno);
+ STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)0, priv->children[0],
+ priv->children[0]->fops->lease, loc, lease, xdata);
- return 0;
-}
+ return 0;
+out:
+ AFR_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
-int32_t
-afr_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc,
- const char *basename, entrylk_cmd cmd, entrylk_type type)
+int
+afr_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
+ int call_count = 0;
+ gf_boolean_t failed = _gf_false;
+ gf_boolean_t succeeded = _gf_false;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+
+ call_count = afr_frame_return(frame);
+ if (call_count)
+ goto out;
+ /* If any of the subvolumes failed with other than ENOTCONN
+ * return error else return success unless all the subvolumes
+ * failed.
+ * TODO: In case of failure, we need to unregister the xattrs
+ * from the other subvolumes where it succeeded (once upcall
+ * fixes the Bz-1371622)*/
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno != ENOTCONN) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ if (local->replies[i].xdata) {
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ failed = _gf_true;
+ break;
+ }
+ if (local->replies[i].op_ret == 0) {
+ succeeded = _gf_true;
+ local->op_ret = 0;
+ local->op_errno = 0;
+ if (!local->xdata_rsp && local->replies[i].xdata) {
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ }
+ }
- int ret = -1;
+ if (!succeeded && !failed) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ }
+
+ AFR_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+out:
+ return 0;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+int
+afr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_cnt = -1;
- priv = this->private;
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ if (op != GF_IPC_TARGET_UPCALL)
+ goto wind_default;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ VALIDATE_OR_GOTO(this->private, err);
+ priv = this->private;
- call_count = local->call_count;
- frame->local = local;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto err;
+ call_cnt = local->call_count;
+
+ if (xdata) {
for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_entrylk_cbk,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- volume, loc, basename, cmd, type);
-
- if (!--call_count)
- break;
- }
+ if (dict_set_int8(xdata, priv->pending_key[i], 0) < 0)
+ goto err;
}
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ STACK_WIND_COOKIE(frame, afr_ipc_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->ipc, op,
+ xdata);
+ if (!--call_cnt)
+ break;
+ }
+ return 0;
+
+err:
+ if (op_errno == -1)
+ op_errno = errno;
+ AFR_STACK_UNWIND(ipc, frame, -1, op_errno, NULL);
+
+ return 0;
+
+wind_default:
+ STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ipc, op, xdata);
+ return 0;
+}
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno);
- }
+int
+afr_forget(xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx_int = 0;
+ afr_inode_ctx_t *ctx = NULL;
+
+ afr_spb_choice_timeout_cancel(this, inode);
+ inode_ctx_del(inode, this, &ctx_int);
+ if (!ctx_int)
return 0;
+
+ ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int;
+ afr_inode_ctx_destroy(ctx);
+ return 0;
}
+int
+afr_priv_dump(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ char key[GF_DUMP_MAX_BUF_LEN];
+ int i = 0;
+
+ GF_ASSERT(this);
+ priv = this->private;
+
+ GF_ASSERT(priv);
+ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+ gf_proc_dump_add_section("%s", key_prefix);
+ gf_proc_dump_write("child_count", "%u", priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ sprintf(key, "child_up[%d]", i);
+ gf_proc_dump_write(key, "%d", priv->child_up[i]);
+ sprintf(key, "pending_key[%d]", i);
+ gf_proc_dump_write(key, "%s", priv->pending_key[i]);
+ sprintf(key, "pending_reads[%d]", i);
+ gf_proc_dump_write(key, "%" PRId64,
+ GF_ATOMIC_GET(priv->pending_reads[i]));
+ sprintf(key, "child_latency[%d]", i);
+ gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]);
+ sprintf(key, "halo_child_up[%d]", i);
+ gf_proc_dump_write(key, "%d", priv->halo_child_up[i]);
+ }
+ gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal);
+ gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
+ gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal);
+ gf_proc_dump_write("read_child", "%d", priv->read_child);
+ gf_proc_dump_write("wait_count", "%u", priv->wait_count);
+ gf_proc_dump_write("heal-wait-queue-length", "%d", priv->heal_wait_qlen);
+ gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters);
+ gf_proc_dump_write("background-self-heal-count", "%d",
+ priv->background_self_heal_count);
+ gf_proc_dump_write("healers", "%d", priv->healers);
+ gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
+ gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode);
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ gf_proc_dump_write("quorum-type", "auto");
+ } else if (priv->quorum_count == 0) {
+ gf_proc_dump_write("quorum-type", "none");
+ } else {
+ gf_proc_dump_write("quorum-type", "fixed");
+ gf_proc_dump_write("quorum-count", "%d", priv->quorum_count);
+ }
+ gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this, NULL));
+ if (priv->thin_arbiter_count) {
+ gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up);
+ gf_proc_dump_write("ta_bad_child_index", "%d",
+ priv->ta_bad_child_index);
+ gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64,
+ priv->ta_notify_dom_lock_offset);
+ }
+
+ return 0;
+}
+/**
+ * find_child_index - find the child's index in the array of subvolumes
+ * @this: AFR
+ * @child: child
+ */
-int32_t
-afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+static int
+afr_find_child_index(xlator_t *this, xlator_t *child)
+{
+ afr_private_t *priv = NULL;
+ int child_count = -1;
+ int i = -1;
+
+ priv = this->private;
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ child_count++;
+ }
+
+ for (i = 0; i < child_count; i++) {
+ if ((xlator_t *)child == priv->children[i])
+ break;
+ }
+
+ return i;
+}
+int
+__afr_get_up_children_count(afr_private_t *priv)
{
- afr_local_t *local = NULL;
+ int up_children = 0;
+ int i = 0;
- int call_count = -1;
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 1)
+ up_children++;
- local = frame->local;
+ return up_children;
+}
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+static int
+__get_heard_from_all_status(xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ int i;
- local->op_errno = op_errno;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->last_event[i]) {
+ return 0;
}
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
+ }
+ if (priv->thin_arbiter_count && !priv->ta_child_up) {
+ return 0;
+ }
+ return 1;
+}
- if (call_count == 0)
- AFR_STACK_UNWIND (fentrylk, frame, local->op_ret,
- local->op_errno);
+glusterfs_event_t
+__afr_transform_event_from_state(xlator_t *this)
+{
+ int i = 0;
+ int up_children = 0;
+ afr_private_t *priv = this->private;
+
+ if (__get_heard_from_all_status(this))
+ /* have_heard_from_all. Let afr_notify() do the propagation. */
+ return GF_EVENT_MAXVAL;
+
+ up_children = __afr_get_up_children_count(priv);
+ /* Treat the children with pending notification, as having sent a
+ * GF_EVENT_CHILD_DOWN. i.e. set the event as GF_EVENT_SOME_DESCENDENT_DOWN,
+ * as done in afr_notify() */
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->last_event[i])
+ continue;
+ priv->last_event[i] = GF_EVENT_SOME_DESCENDENT_DOWN;
+ priv->child_up[i] = 0;
+ }
+
+ if (up_children)
+ /* We received at least one child up */
+ return GF_EVENT_CHILD_UP;
+ else
+ return GF_EVENT_CHILD_DOWN;
+
+ return GF_EVENT_MAXVAL;
+}
- return 0;
+static void
+afr_notify_cbk(void *data)
+{
+ xlator_t *this = data;
+ afr_private_t *priv = this->private;
+ glusterfs_event_t event = GF_EVENT_MAXVAL;
+ gf_boolean_t propagate = _gf_false;
+
+ LOCK(&priv->lock);
+ {
+ if (!priv->timer) {
+ /*
+ * Either child_up/child_down is already sent to parent.
+ * This is a spurious wake up.
+ */
+ goto unlock;
+ }
+ priv->timer = NULL;
+ event = __afr_transform_event_from_state(this);
+ if (event != GF_EVENT_MAXVAL)
+ propagate = _gf_true;
+ }
+unlock:
+ UNLOCK(&priv->lock);
+ if (propagate)
+ default_notify(this, event, NULL);
}
+static void
+__afr_launch_notify_timer(xlator_t *this, afr_private_t *priv)
+{
+ struct timespec delay = {
+ 0,
+ };
+
+ gf_msg_debug(this->name, 0, "Initiating child-down timer");
+ delay.tv_sec = 10;
+ delay.tv_nsec = 0;
+ priv->timer = gf_timer_call_after(this->ctx, delay, afr_notify_cbk, this);
+ if (priv->timer == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL,
+ "Cannot create timer for delayed initialization");
+ }
+}
-int32_t
-afr_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd,
- const char *basename, entrylk_cmd cmd, entrylk_type type)
+static int
+find_best_down_child(xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int32_t best_child = -1;
+ int64_t best_latency = INT64_MAX;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i] && priv->child_latency[i] >= 0 &&
+ priv->child_latency[i] < best_latency) {
+ best_child = i;
+ best_latency = priv->child_latency[i];
+ }
+ }
+ if (best_child >= 0) {
+ gf_msg_debug(this->name, 0,
+ "Found best down child (%d) @ %" PRId64 " ms latency",
+ best_child, best_latency);
+ }
+ return best_child;
+}
- int ret = -1;
+int
+find_worst_up_child(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int32_t worst_child = -1;
+ int64_t worst_latency = INT64_MIN;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] && priv->child_latency[i] >= 0 &&
+ priv->child_latency[i] > worst_latency) {
+ worst_child = i;
+ worst_latency = priv->child_latency[i];
+ }
+ }
+ if (worst_child >= 0) {
+ gf_msg_debug(this->name, 0,
+ "Found worst up child (%d) @ %" PRId64 " ms latency",
+ worst_child, worst_latency);
+ }
+ return worst_child;
+}
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+void
+__afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx,
+ int64_t halo_max_latency_msec, int32_t *event,
+ int64_t child_latency_msec)
+{
+ afr_private_t *priv = NULL;
+ int up_children = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ priv = this->private;
- priv = this->private;
+ priv->child_latency[idx] = child_latency_msec;
+ gf_msg_debug(child_xlator->name, 0, "Client ping @ %" PRId64 " ms",
+ child_latency_msec);
+ if (priv->shd.iamshd)
+ return;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ up_children = __afr_get_up_children_count(priv);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ if (child_latency_msec > halo_max_latency_msec &&
+ priv->child_up[idx] == 1 && up_children > priv->halo_min_replicas) {
+ if ((up_children - 1) < priv->halo_min_replicas) {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Overriding halo threshold, "
+ "min replicas: %d",
+ priv->halo_min_replicas);
+ } else {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Child latency (%" PRId64
+ " ms) "
+ "exceeds halo threshold (%" PRId64
+ "), "
+ "marking child down.",
+ child_latency_msec, halo_max_latency_msec);
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_DOWN;
+ }
}
+ } else if (child_latency_msec < halo_max_latency_msec &&
+ priv->child_up[idx] == 0) {
+ if (up_children < priv->halo_max_replicas) {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Child latency (%" PRId64
+ " ms) "
+ "below halo threshold (%" PRId64
+ "), "
+ "marking child up.",
+ child_latency_msec, halo_max_latency_msec);
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_UP;
+ }
+ } else {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Not marking child %d up, "
+ "max replicas (%d) reached.",
+ idx, priv->halo_max_replicas);
+ }
+ }
+}
- call_count = local->call_count;
- frame->local = local;
+static int64_t
+afr_get_halo_latency(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int64_t halo_max_latency_msec = 0;
+
+ priv = this->private;
+
+ if (priv->shd.iamshd) {
+ halo_max_latency_msec = priv->shd.halo_max_latency_msec;
+ } else if (priv->nfsd.iamnfsd) {
+ halo_max_latency_msec = priv->nfsd.halo_max_latency_msec;
+ } else {
+ halo_max_latency_msec = priv->halo_max_latency_msec;
+ }
+ gf_msg_debug(this->name, 0, "Using halo latency %" PRId64,
+ halo_max_latency_msec);
+ return halo_max_latency_msec;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fentrylk_cbk,
- priv->children[i],
- priv->children[i]->fops->fentrylk,
- volume, fd, basename, cmd, type);
-
- if (!--call_count)
- break;
- }
+void
+__afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
+ const int idx, int64_t child_latency_msec,
+ int32_t *event, int32_t *call_psh,
+ int32_t *up_child)
+{
+ afr_private_t *priv = NULL;
+ int up_children = 0;
+ int worst_up_child = -1;
+ int64_t halo_max_latency_msec = afr_get_halo_latency(this);
+
+ priv = this->private;
+
+ /*
+ * This only really counts if the child was never up
+ * (value = -1) or had been down (value = 0). See
+ * comment at GF_EVENT_CHILD_DOWN for a more detailed
+ * explanation.
+ */
+ if (priv->child_up[idx] != 1) {
+ priv->event_generation++;
+ }
+ priv->child_up[idx] = 1;
+
+ *call_psh = 1;
+ *up_child = idx;
+ up_children = __afr_get_up_children_count(priv);
+ /*
+ * If this is an _actual_ CHILD_UP event, we
+ * want to set the child_latency to MAX to indicate
+ * the child needs ping data to be available before doing child-up
+ */
+ if (!priv->halo_enabled)
+ goto out;
+
+ if (child_latency_msec < 0) {
+ /*set to INT64_MAX-1 so that it is found for best_down_child*/
+ priv->halo_child_up[idx] = 1;
+ if (priv->child_latency[idx] < 0) {
+ priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
+ }
+ }
+
+ /*
+ * Handle the edge case where we exceed
+ * halo_min_replicas and we've got a child which is
+ * marked up as it was helping to satisfy the
+ * halo_min_replicas even though it's latency exceeds
+ * halo_max_latency_msec.
+ */
+ if (up_children > priv->halo_min_replicas) {
+ worst_up_child = find_worst_up_child(this);
+ if (worst_up_child >= 0 &&
+ priv->child_latency[worst_up_child] > halo_max_latency_msec) {
+ gf_msg_debug(this->name, 0,
+ "Marking child %d down, "
+ "doesn't meet halo threshold (%" PRId64
+ "), and > "
+ "halo_min_replicas (%d)",
+ worst_up_child, halo_max_latency_msec,
+ priv->halo_min_replicas);
+ priv->child_up[worst_up_child] = 0;
+ up_children--;
}
+ }
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno);
+ if (up_children > priv->halo_max_replicas && !priv->shd.iamshd) {
+ worst_up_child = find_worst_up_child(this);
+ if (worst_up_child < 0) {
+ worst_up_child = idx;
}
- return 0;
+ priv->child_up[worst_up_child] = 0;
+ up_children--;
+ gf_msg_debug(this->name, 0,
+ "Marking child %d down, "
+ "up_children (%d) > halo_max_replicas (%d)",
+ worst_up_child, up_children, priv->halo_max_replicas);
+ }
+out:
+ if (up_children == 1) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,
+ "Subvolume '%s' came back up; "
+ "going online.",
+ child_xlator->name);
+ gf_event(EVENT_AFR_SUBVOL_UP, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ } else {
+ *event = GF_EVENT_SOME_DESCENDENT_UP;
+ }
+
+ priv->last_event[idx] = *event;
}
-int32_t
-afr_statfs_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct statvfs *statvfs)
+void
+__afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
+ int64_t child_latency_msec, int32_t *event,
+ int32_t *call_psh, int32_t *up_child)
{
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int up_children = 0;
+ int down_children = 0;
+ int best_down_child = -1;
+
+ priv = this->private;
+
+ /*
+ * If a brick is down when we start, we'll get a
+ * CHILD_DOWN to indicate its initial state. There
+ * was never a CHILD_UP in this case, so if we
+ * increment "down_count" the difference between than
+ * and "up_count" will no longer be the number of
+ * children that are currently up. This has serious
+ * implications e.g. for quorum enforcement, so we
+ * don't increment these values unless the event
+ * represents an actual state transition between "up"
+ * (value = 1) and anything else.
+ */
+ if (priv->child_up[idx] == 1) {
+ priv->event_generation++;
+ }
+
+ /*
+ * If this is an _actual_ CHILD_DOWN event, we
+ * want to set the child_latency to < 0 to indicate
+ * the child is really disconnected.
+ */
+ if (child_latency_msec < 0) {
+ priv->child_latency[idx] = child_latency_msec;
+ priv->halo_child_up[idx] = 0;
+ }
+ priv->child_up[idx] = 0;
+
+ up_children = __afr_get_up_children_count(priv);
+ /*
+ * Handle the edge case where we need to find the
+ * next best child (to mark up) as marking this child
+ * down would cause us to fall below halo_min_replicas.
+ * We will also force the SHD to heal this child _now_
+ * as we want it to be up to date if we are going to
+ * begin using it synchronously.
+ */
+ if (priv->halo_enabled && up_children < priv->halo_min_replicas) {
+ best_down_child = find_best_down_child(this);
+ if (best_down_child >= 0) {
+ gf_msg_debug(this->name, 0,
+ "Swapping out child %d for "
+ "child %d to satisfy halo_min_replicas (%d).",
+ idx, best_down_child, priv->halo_min_replicas);
+ priv->child_up[best_down_child] = 1;
+ *call_psh = 1;
+ *up_child = best_down_child;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 0)
+ down_children++;
+ if (down_children == priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN,
+ "All subvolumes are down. Going "
+ "offline until at least one of them "
+ "comes back up.");
+ gf_event(EVENT_AFR_SUBVOLS_DOWN, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ } else {
+ *event = GF_EVENT_SOME_DESCENDENT_DOWN;
+ }
+ priv->last_event[idx] = *event;
+}
- int call_count = 0;
+void
+afr_ta_lock_release_synctask(xlator_t *this)
+{
+ call_frame_t *ta_frame = NULL;
+ int ret = 0;
- LOCK (&frame->lock);
- {
- local = frame->local;
-
- if (op_ret == 0) {
- local->op_ret = op_ret;
-
- if (local->cont.statfs.buf_set) {
- if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail)
- local->cont.statfs.buf = *statvfs;
- } else {
- local->cont.statfs.buf = *statvfs;
- local->cont.statfs.buf_set = 1;
- }
- }
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ return;
+ }
+
+ ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta,
+ afr_ta_lock_release_done, ta_frame, this);
+ if (ret) {
+ STACK_DESTROY(ta_frame->root);
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to release "
+ "AFR_TA_DOM_NOTIFY lock.");
+ }
+}
- if (op_ret == -1)
- local->op_errno = op_errno;
+static void
+afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)
+{
+ struct gf_upcall_inodelk_contention *lc = NULL;
+ unsigned int inmem_count = 0;
+ unsigned int onwire_count = 0;
+ afr_private_t *priv = this->private;
- }
- UNLOCK (&frame->lock);
+ lc = upcall->data;
- call_count = afr_frame_return (frame);
+ if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0)
+ return;
- if (call_count == 0)
- AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno,
- &local->cont.statfs.buf);
+ if (priv->shd.iamshd) {
+ /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */
+ return;
+ }
+ LOCK(&priv->lock);
+ {
+ if (priv->release_ta_notify_dom_lock == _gf_true) {
+ /* Ignore multiple release requests from shds.*/
+ UNLOCK(&priv->lock);
+ return;
+ }
+ priv->release_ta_notify_dom_lock = _gf_true;
+ inmem_count = priv->ta_in_mem_txn_count;
+ onwire_count = priv->ta_on_wire_txn_count;
+ }
+ UNLOCK(&priv->lock);
+ if (inmem_count || onwire_count)
+ /* lock release will happen in txn code path after
+ * in-memory or on-wire txns are over.*/
+ return;
- return 0;
+ afr_ta_lock_release_synctask(this);
}
+static void
+afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall)
+{
+ struct gf_upcall_cache_invalidation *up_ci = NULL;
+ afr_private_t *priv = this->private;
+ inode_t *inode = NULL;
+ inode_table_t *itable = NULL;
+ int i = 0;
+
+ switch (upcall->event_type) {
+ case GF_UPCALL_INODELK_CONTENTION:
+ afr_handle_inodelk_contention(this, upcall);
+ break;
+ case GF_UPCALL_CACHE_INVALIDATION:
+ up_ci = (struct gf_upcall_cache_invalidation *)upcall->data;
+
+ /* Since md-cache will be aggressively filtering
+ * lookups, the stale read issue will be more
+ * pronounced. Hence when a pending xattr is set notify
+ * all the md-cache clients to invalidate the existing
+ * stat cache and send the lookup next time */
+ if (!up_ci->dict)
+ break;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!dict_get(up_ci->dict, priv->pending_key[i]))
+ continue;
+ up_ci->flags |= UP_INVAL_ATTR;
+ itable = ((xlator_t *)this->graph->top)->itable;
+ /*Internal processes may not have itable for
+ *top xlator*/
+ if (itable)
+ inode = inode_find(itable, upcall->gfid);
+ if (inode)
+ afr_inode_need_refresh_set(inode, this);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+}
int32_t
-afr_statfs (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
{
- afr_private_t * priv = NULL;
- int child_count = 0;
- afr_local_t * local = NULL;
- int i = 0;
-
- int ret = -1;
- int call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_private_t *priv = NULL;
+ xlator_t *child_xlator = NULL;
+ int i = -1;
+ int propagate = 0;
+ int had_heard_from_all = 0;
+ int have_heard_from_all = 0;
+ int idx = -1;
+ int ret = -1;
+ int call_psh = 0;
+ int up_child = -1;
+ dict_t *input = NULL;
+ dict_t *output = NULL;
+ gf_boolean_t had_quorum = _gf_false;
+ gf_boolean_t has_quorum = _gf_false;
+ int64_t halo_max_latency_msec = 0;
+ int64_t child_latency_msec = -1;
+
+ child_xlator = (xlator_t *)data;
+
+ priv = this->private;
+
+ if (!priv)
+ return 0;
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
+ /*
+ * We need to reset this in case children come up in "staggered"
+ * fashion, so that we discover a late-arriving local subvolume. Note
+ * that we could end up issuing N lookups to the first subvolume, and
+ * O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
+ */
+ priv->did_discovery = _gf_false;
+
+ /* parent xlators don't need to know about every child_up, child_down
+ * because of afr ha. If all subvolumes go down, child_down has
+ * to be triggered. In that state when 1 subvolume comes up child_up
+ * needs to be triggered. dht optimizes revalidate lookup by sending
+ * it only to one of its subvolumes. When child up/down happens
+ * for afr's subvolumes dht should be notified by child_modified. The
+ * subsequent revalidate lookup happens on all the dht's subvolumes
+ * which triggers afr self-heals if any.
+ */
+ idx = afr_find_child_index(this, child_xlator);
+ if (idx < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
+ "Received child_up from invalid subvolume");
+ goto out;
+ }
+
+ had_quorum = priv->quorum_count &&
+ afr_has_quorum(priv->child_up, this, NULL);
+ if (event == GF_EVENT_CHILD_PING) {
+ child_latency_msec = (int64_t)(uintptr_t)data2;
+ if (priv->halo_enabled) {
+ halo_max_latency_msec = afr_get_halo_latency(this);
+
+ /* Calculates the child latency and sets event
+ */
+ LOCK(&priv->lock);
+ {
+ __afr_handle_ping_event(this, child_xlator, idx,
+ halo_max_latency_msec, &event,
+ child_latency_msec);
+ }
+ UNLOCK(&priv->lock);
+ } else {
+ LOCK(&priv->lock);
+ {
+ priv->child_latency[idx] = child_latency_msec;
+ }
+ UNLOCK(&priv->lock);
+ }
+ }
- priv = this->private;
- child_count = priv->child_count;
+ if (event == GF_EVENT_CHILD_PING) {
+ /* This is the only xlator that handles PING, no reason to
+ * propagate.
+ */
+ goto out;
+ }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ if (event == GF_EVENT_TRANSLATOR_OP) {
+ LOCK(&priv->lock);
+ {
+ had_heard_from_all = __get_heard_from_all_status(this);
+ }
+ UNLOCK(&priv->lock);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ if (!had_heard_from_all) {
+ ret = -1;
+ } else {
+ input = data;
+ output = data2;
+ ret = afr_xl_op(this, input, output);
}
+ goto out;
+ }
+
+ if (event == GF_EVENT_UPCALL) {
+ afr_handle_upcall_event(this, data);
+ }
- frame->local = local;
- call_count = local->call_count;
+ LOCK(&priv->lock);
+ {
+ had_heard_from_all = __get_heard_from_all_status(this);
+ switch (event) {
+ case GF_EVENT_PARENT_UP:
+ __afr_launch_notify_timer(this, priv);
+ propagate = 1;
+ break;
+ case GF_EVENT_CHILD_UP:
+ if (priv->thin_arbiter_count &&
+ (idx == AFR_CHILD_THIN_ARBITER)) {
+ priv->ta_child_up = 1;
+ priv->ta_event_gen++;
+ break;
+ }
+ __afr_handle_child_up_event(this, child_xlator, idx,
+ child_latency_msec, &event,
+ &call_psh, &up_child);
+ __afr_lock_heal_synctask(this, priv, idx);
+ break;
- for (i = 0; i < child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_statfs_cbk,
- priv->children[i],
- priv->children[i]->fops->statfs,
- loc);
- if (!--call_count)
- break;
+ case GF_EVENT_CHILD_DOWN:
+ if (priv->thin_arbiter_count &&
+ (idx == AFR_CHILD_THIN_ARBITER)) {
+ priv->ta_child_up = 0;
+ priv->ta_event_gen++;
+ afr_ta_locked_priv_invalidate(priv);
+ break;
}
+ __afr_handle_child_down_event(this, child_xlator, idx,
+ child_latency_msec, &event,
+ &call_psh, &up_child);
+ __afr_mark_pending_lk_heal(this, priv, idx);
+ break;
+
+ case GF_EVENT_CHILD_CONNECTING:
+ priv->last_event[idx] = event;
+
+ break;
+
+ case GF_EVENT_SOME_DESCENDENT_DOWN:
+ priv->last_event[idx] = event;
+ break;
+ default:
+ propagate = 1;
+ break;
}
+ have_heard_from_all = __get_heard_from_all_status(this);
+ if (!had_heard_from_all && have_heard_from_all) {
+ if (priv->timer) {
+ gf_timer_call_cancel(this->ctx, priv->timer);
+ priv->timer = NULL;
+ }
+ /* This is the first event which completes aggregation
+ of events from all subvolumes. If at least one subvol
+ had come up, propagate CHILD_UP, but only this time
+ */
+ event = GF_EVENT_CHILD_DOWN;
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
+ event = GF_EVENT_CHILD_UP;
+ break;
+ }
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (statfs, frame, op_ret, op_errno, NULL);
+ if (priv->last_event[i] == GF_EVENT_CHILD_CONNECTING) {
+ event = GF_EVENT_CHILD_CONNECTING;
+ /* continue to check other events for CHILD_UP */
+ }
+ }
}
- return 0;
+ }
+ UNLOCK(&priv->lock);
+
+ if (priv->quorum_count) {
+ has_quorum = afr_has_quorum(priv->child_up, this, NULL);
+ if (!had_quorum && has_quorum) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET,
+ "Client-quorum is met");
+ gf_event(EVENT_AFR_QUORUM_MET, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ }
+ if (had_quorum && !has_quorum) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
+ "Client-quorum is not met");
+ gf_event(EVENT_AFR_QUORUM_FAIL, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ }
+ }
+
+ /* if all subvols have reported status, no need to hide anything
+ or wait for anything else. Just propagate blindly */
+ if (have_heard_from_all)
+ propagate = 1;
+
+ ret = 0;
+ if (propagate)
+ ret = default_notify(this, event, data);
+
+ if ((!had_heard_from_all) || call_psh) {
+ /* Launch self-heal on all local subvolumes if:
+ * a) We have_heard_from_all for the first time
+ * b) Already heard from everyone, but we now got a child-up
+ * event.
+ */
+ if (have_heard_from_all) {
+ afr_selfheal_childup(this, priv);
+ }
+ }
+out:
+ return ret;
}
-
-int32_t
-afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
+int
+afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
{
- afr_local_t * local = NULL;
+ int __ret = -1;
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+
+ __ret = syncbarrier_init(&local->barrier);
+ if (__ret) {
+ if (op_errno)
+ *op_errno = __ret;
+ goto out;
+ }
+
+ local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up),
+ gf_afr_mt_char);
+ if (!local->child_up) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ memcpy(local->child_up, priv->child_up,
+ sizeof(*local->child_up) * priv->child_count);
+ local->call_count = AFR_COUNT(local->child_up, priv->child_count);
+ if (local->call_count == 0) {
+ gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN,
+ "no subvolumes up");
+ if (op_errno)
+ *op_errno = ENOTCONN;
+ goto out;
+ }
+
+ local->event_generation = priv->event_generation;
+
+ local->read_attempted = GF_CALLOC(priv->child_count, sizeof(char),
+ gf_afr_mt_char);
+ if (!local->read_attempted) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->readable = GF_CALLOC(priv->child_count, sizeof(char),
+ gf_afr_mt_char);
+ if (!local->readable) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->readable2 = GF_CALLOC(priv->child_count, sizeof(char),
+ gf_afr_mt_char);
+ if (!local->readable2) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->read_subvol = -1;
+
+ local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
+ gf_afr_mt_reply_t);
+ if (!local->replies) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->need_full_crawl = _gf_false;
+ if (priv->thin_arbiter_count) {
+ local->ta_child_up = priv->ta_child_up;
+ local->ta_failed_subvol = AFR_CHILD_UNKNOWN;
+ local->read_txn_query_child = AFR_CHILD_UNKNOWN;
+ local->ta_event_gen = priv->ta_event_gen;
+ local->fop_state = TA_SUCCESS;
+ }
+ local->is_new_entry = _gf_false;
+
+ INIT_LIST_HEAD(&local->healer);
+ return 0;
+out:
+ return -1;
+}
- int call_count = -1;
+int
+afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count)
+{
+ int ret = -ENOMEM;
- local = frame->local;
- call_count = afr_frame_return (frame);
+ lk->lower_locked_nodes = GF_CALLOC(sizeof(*lk->lower_locked_nodes),
+ child_count, gf_afr_mt_char);
+ if (NULL == lk->lower_locked_nodes)
+ goto out;
- if (call_count == 0)
- AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
- lock);
+ lk->lock_op_ret = -1;
+ lk->lock_op_errno = EUCLEAN;
- return 0;
+ ret = 0;
+out:
+ return ret;
}
-
-int32_t
-afr_lk_unlock (call_frame_t *frame, xlator_t *this)
+void
+afr_matrix_cleanup(int32_t **matrix, unsigned int m)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ int i = 0;
- int i;
- int call_count = 0;
+ if (!matrix)
+ goto out;
+ for (i = 0; i < m; i++) {
+ GF_FREE(matrix[i]);
+ }
- local = frame->local;
- priv = this->private;
+ GF_FREE(matrix);
+out:
+ return;
+}
- call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes,
- priv->child_count);
+int32_t **
+afr_matrix_create(unsigned int m, unsigned int n)
+{
+ int32_t **matrix = NULL;
+ int i = 0;
+
+ matrix = GF_CALLOC(sizeof(*matrix), m, gf_afr_mt_int32_t);
+ if (!matrix)
+ goto out;
+
+ for (i = 0; i < m; i++) {
+ matrix[i] = GF_CALLOC(sizeof(*matrix[i]), n, gf_afr_mt_int32_t);
+ if (!matrix[i])
+ goto out;
+ }
+ return matrix;
+out:
+ afr_matrix_cleanup(matrix, m);
+ return NULL;
+}
- if (call_count == 0) {
- AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
- &local->cont.lk.ret_flock);
- return 0;
- }
+int
+afr_transaction_local_init(afr_local_t *local, xlator_t *this)
+{
+ int ret = -ENOMEM;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ INIT_LIST_HEAD(&local->transaction.wait_list);
+ INIT_LIST_HEAD(&local->transaction.owner_list);
+ INIT_LIST_HEAD(&local->ta_waitq);
+ INIT_LIST_HEAD(&local->ta_onwireq);
+ ret = afr_internal_lock_init(&local->internal_lock, priv->child_count);
+ if (ret < 0)
+ goto out;
+
+ ret = -ENOMEM;
+ local->pre_op_compat = priv->pre_op_compat;
+
+ local->transaction.pre_op = GF_CALLOC(sizeof(*local->transaction.pre_op),
+ priv->child_count, gf_afr_mt_char);
+ if (!local->transaction.pre_op)
+ goto out;
+
+ local->transaction.changelog_xdata = GF_CALLOC(
+ sizeof(*local->transaction.changelog_xdata), priv->child_count,
+ gf_afr_mt_dict_t);
+ if (!local->transaction.changelog_xdata)
+ goto out;
+
+ if (priv->arbiter_count == 1) {
+ local->transaction.pre_op_sources = GF_CALLOC(
+ sizeof(*local->transaction.pre_op_sources), priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.pre_op_sources)
+ goto out;
+ }
+
+ local->transaction.failed_subvols = GF_CALLOC(
+ sizeof(*local->transaction.failed_subvols), priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.failed_subvols)
+ goto out;
+
+ local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!local->pending)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
- local->call_count = call_count;
+void
+afr_set_low_priority(call_frame_t *frame)
+{
+ frame->root->pid = LOW_PRIO_PROC_PID;
+}
+
+void
+afr_priv_destroy(afr_private_t *priv)
+{
+ int i = 0;
+ int child_count = -1;
- local->cont.lk.user_flock.l_type = F_UNLCK;
+ if (!priv)
+ goto out;
- for (i = 0; i < priv->child_count; i++) {
- if (local->cont.lk.locked_nodes[i]) {
- STACK_WIND (frame, afr_lk_unlock_cbk,
- priv->children[i],
- priv->children[i]->fops->lk,
- local->fd, F_SETLK,
- &local->cont.lk.user_flock);
-
- if (!--call_count)
- break;
- }
- }
+ GF_FREE(priv->sh_domain);
+ GF_FREE(priv->last_event);
- return 0;
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ child_count++;
+ }
+ if (priv->pending_key) {
+ for (i = 0; i < child_count; i++)
+ GF_FREE(priv->pending_key[i]);
+ }
+
+ GF_FREE(priv->pending_reads);
+ GF_FREE(priv->local);
+ GF_FREE(priv->pending_key);
+ GF_FREE(priv->children);
+ GF_FREE(priv->anon_inode);
+ GF_FREE(priv->child_up);
+ GF_FREE(priv->halo_child_up);
+ GF_FREE(priv->child_latency);
+ LOCK_DESTROY(&priv->lock);
+
+ GF_FREE(priv);
+out:
+ return;
}
+int **
+afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending,
+ dict_t *xattr, ia_type_t iat)
+{
+ int i = 0;
+ int **changelog = NULL;
+ int idx = -1;
+ int m_idx = 0;
+ int d_idx = 0;
+ int ret = 0;
+
+ m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION);
+ d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+ idx = afr_index_from_ia_type(iat);
+
+ changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!changelog)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!pending[i])
+ continue;
+
+ changelog[i][m_idx] = hton32(1);
+ if (idx != -1)
+ changelog[i][idx] = hton32(1);
+ /* If the newentry marking is on a newly created directory,
+ * then mark it with the full-heal indicator.
+ */
+ if ((IA_ISDIR(iat)) && (priv->esh_granular))
+ changelog[i][d_idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict(priv, xattr, changelog);
+ if (ret < 0) {
+ afr_matrix_cleanup(changelog, priv->child_count);
+ return NULL;
+ }
+out:
+ return changelog;
+}
-int32_t
-afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
+static dict_t *
+afr_set_heal_info(char *status)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-/* int ret = 0; */
+ dict_t *dict = NULL;
+ int ret = -1;
+
+ dict = dict_new();
+ if (!dict) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_dynstr_sizen(dict, "heal-info", status);
+ if (ret)
+ gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set heal-info key to "
+ "%s",
+ status);
+out:
+ /* Any error other than EINVAL, dict_set_dynstr frees status */
+ if (ret == -ENOMEM || ret == -EINVAL) {
+ GF_FREE(status);
+ }
+
+ if (ret && dict) {
+ dict_unref(dict);
+ dict = NULL;
+ }
+ return dict;
+}
- int child_index = -1;
+static gf_boolean_t
+afr_is_dirty_count_non_unary_for_txn(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ afr_private_t *priv = this->private;
+ int *dirty = alloca0(priv->child_count * sizeof(int));
+ int i = 0;
- local = frame->local;
- priv = this->private;
+ afr_selfheal_extract_xattr(this, replies, type, dirty, NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (dirty[i] > 1)
+ return _gf_true;
+ }
- child_index = (long) cookie;
+ return _gf_false;
+}
- if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) {
- local->op_ret = -1;
- local->op_errno = op_errno;
+static gf_boolean_t
+afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies,
+ ia_type_t ia_type)
+{
+ gf_boolean_t data_chk = _gf_false;
+ gf_boolean_t mdata_chk = _gf_false;
+ gf_boolean_t entry_chk = _gf_false;
+
+ switch (ia_type) {
+ case IA_IFDIR:
+ mdata_chk = _gf_true;
+ entry_chk = _gf_true;
+ break;
+ case IA_IFREG:
+ mdata_chk = _gf_true;
+ data_chk = _gf_true;
+ break;
+ default:
+ /*IA_IFBLK, IA_IFCHR, IA_IFLNK, IA_IFIFO, IA_IFSOCK*/
+ mdata_chk = _gf_true;
+ break;
+ }
+
+ if (data_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_DATA_TRANSACTION)) {
+ return _gf_true;
+ } else if (mdata_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_METADATA_TRANSACTION)) {
+ return _gf_true;
+ } else if (entry_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_ENTRY_TRANSACTION)) {
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
- afr_lk_unlock (frame, this);
- return 0;
+static int
+afr_update_heal_status(xlator_t *this, struct afr_reply *replies,
+ ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh,
+ gf_boolean_t *msh, unsigned char pending)
+{
+ int ret = -1;
+ GF_UNUSED int ret1 = 0;
+ int i = 0;
+ int io_domain_lk_count = 0;
+ int shd_domain_lk_count = 0;
+ afr_private_t *priv = NULL;
+ char *key1 = NULL;
+ char *key2 = NULL;
+
+ priv = this->private;
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((replies[i].valid != 1) || (replies[i].op_ret != 0))
+ continue;
+ if (!io_domain_lk_count) {
+ ret1 = dict_get_int32(replies[i].xdata, key1, &io_domain_lk_count);
}
+ if (!shd_domain_lk_count) {
+ ret1 = dict_get_int32(replies[i].xdata, key2, &shd_domain_lk_count);
+ }
+ }
- if (op_ret == 0) {
- local->op_ret = 0;
- local->op_errno = 0;
- local->cont.lk.locked_nodes[child_index] = 1;
- local->cont.lk.ret_flock = *lock;
+ if (!pending) {
+ if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) ||
+ (!io_domain_lk_count)) {
+ /* Needs heal. */
+ ret = 0;
+ } else {
+ /* No heal needed. */
+ *dsh = *esh = *msh = 0;
+ }
+ } else {
+ if (shd_domain_lk_count) {
+ ret = -EAGAIN; /*For 'possibly-healing'. */
+ } else {
+ ret = 0; /*needs heal. Just set a non -ve value so that it is
+ assumed as the source index.*/
}
+ }
+ return ret;
+}
- child_index++;
+/*return EIO, EAGAIN or pending*/
+int
+afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **inode, gf_boolean_t *entry_selfheal,
+ gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal, unsigned char *pending)
+{
+ int ret = -1;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ gf_boolean_t dsh = _gf_false;
+ gf_boolean_t msh = _gf_false;
+ gf_boolean_t esh = _gf_false;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *valid_on = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+ sources = alloca0(sizeof(*sources) * priv->child_count);
+ sinks = alloca0(sizeof(*sinks) * priv->child_count);
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ valid_on = alloca0(sizeof(*valid_on) * priv->child_count);
+
+ ret = afr_selfheal_unlocked_inspect(frame, this, gfid, inode, &dsh, &msh,
+ &esh, replies);
+ if (ret)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ valid_on[i] = 1;
+ }
+ }
+ if (msh) {
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_METADATA_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
+ }
+ if (dsh) {
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_DATA_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
+ }
+ if (esh) {
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
+ }
- if (child_index < priv->child_count) {
- STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->lk,
- local->fd, local->cont.lk.cmd,
- &local->cont.lk.user_flock);
- } else if (local->op_ret == -1) {
- /* all nodes have gone down */
+ ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh,
+ &msh, *pending);
+out:
+ *data_selfheal = dsh;
+ *entry_selfheal = esh;
+ *metadata_selfheal = msh;
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ return ret;
+}
- AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN,
- &local->cont.lk.ret_flock);
+int
+afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ gf_boolean_t data_selfheal = _gf_false;
+ gf_boolean_t metadata_selfheal = _gf_false;
+ gf_boolean_t entry_selfheal = _gf_false;
+ unsigned char pending = 0;
+ dict_t *dict = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+ inode_t *inode = NULL;
+ char *substr = NULL;
+ char *status = NULL;
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *heal_local = NULL;
+
+ /*Use frame with lk-owner set*/
+ heal_frame = afr_frame_create(frame->this, &op_errno);
+ if (!heal_frame) {
+ ret = -1;
+ goto out;
+ }
+ heal_local = heal_frame->local;
+ heal_frame->local = frame->local;
+
+ ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode,
+ &entry_selfheal, &data_selfheal,
+ &metadata_selfheal, &pending);
+
+ if (ret == -ENOMEM) {
+ ret = -1;
+ goto out;
+ }
+
+ if (pending & PFLAG_PENDING) {
+ gf_asprintf(&substr, "-pending");
+ if (!substr)
+ goto out;
+ }
+
+ if (ret == -EIO) {
+ ret = gf_asprintf(&status, "split-brain%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ } else if (ret == -EAGAIN) {
+ ret = gf_asprintf(&status, "possibly-healing%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ } else if (ret >= 0) {
+ /* value of ret = source index
+ * so ret >= 0 and at least one of the 3 booleans set to
+ * true means a source is identified; heal is required.
+ */
+ if (!data_selfheal && !entry_selfheal && !metadata_selfheal) {
+ status = gf_strdup("no-heal");
+ if (!status) {
+ ret = -1;
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
} else {
- /* locking has succeeded on all nodes that are up */
+ ret = gf_asprintf(&status, "heal%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ }
+ } else if (ret < 0) {
+ /* Apart from above checked -ve ret values, there are
+ * other possible ret values like ENOTCONN
+ * (returned when number of valid replies received are
+ * less than 2)
+ * in which case heal is required when one of the
+ * selfheal booleans is set.
+ */
+ if (data_selfheal || entry_selfheal || metadata_selfheal) {
+ ret = gf_asprintf(&status, "heal%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ }
+ }
- /* temporarily
- ret = afr_mark_locked_nodes (this, local->fd,
- local->cont.lk.locked_nodes);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "Could not save locked nodes info in fdctx");
+ ret = 0;
+ op_errno = 0;
- ret = afr_save_locked_fd (this, local->fd);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "Could not save locked fd");
+out:
+ if (heal_frame) {
+ heal_frame->local = heal_local;
+ AFR_STACK_DESTROY(heal_frame);
+ }
+ AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+ if (dict)
+ dict_unref(dict);
+ if (inode)
+ inode_unref(inode);
+ GF_FREE(substr);
+ return ret;
+}
- */
- AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
- &local->cont.lk.ret_flock);
- }
+int
+_afr_is_split_brain(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies, afr_transaction_type type,
+ gf_boolean_t *spb)
+{
+ afr_private_t *priv = NULL;
+ uint64_t *witness = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ int sources_count = 0;
+ int ret = 0;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ witness = alloca0(priv->child_count * sizeof(*witness));
+
+ ret = afr_selfheal_find_direction(frame, this, replies, type,
+ priv->child_up, sources, sinks, witness,
+ NULL);
+ if (ret)
+ return ret;
- return 0;
-}
+ sources_count = AFR_COUNT(sources, priv->child_count);
+ if (!sources_count)
+ *spb = _gf_true;
+ return ret;
+}
int
-afr_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t cmd,
- struct gf_flock *flock)
+afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
- int i = 0;
+ priv = this->private;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ replies = alloca0(sizeof(*replies) * priv->child_count);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies);
+ if (ret)
+ goto out;
- priv = this->private;
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ ret = -EAGAIN;
+ goto out;
+ }
- ALLOC_OR_GOTO (local, afr_local_t, out);
- AFR_LOCAL_INIT (local, priv);
+ ret = _afr_is_split_brain(frame, this, replies, AFR_DATA_TRANSACTION,
+ d_spb);
+ if (ret)
+ goto out;
- frame->local = local;
+ ret = _afr_is_split_brain(frame, this, replies, AFR_METADATA_TRANSACTION,
+ m_spb);
+out:
+ if (replies) {
+ afr_replies_wipe(replies, priv->child_count);
+ replies = NULL;
+ }
+ return ret;
+}
- local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count,
- sizeof (*local->cont.lk.locked_nodes),
- gf_afr_mt_char);
+int
+afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+ GF_FREE(opaque);
+ return 0;
+}
- if (!local->cont.lk.locked_nodes) {
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
- op_errno = ENOMEM;
- goto out;
+int
+afr_get_split_brain_status(void *opaque)
+{
+ gf_boolean_t d_spb = _gf_false;
+ gf_boolean_t m_spb = _gf_false;
+ int ret = -1;
+ int op_errno = 0;
+ int i = 0;
+ char *choices = NULL;
+ char *status = NULL;
+ dict_t *dict = NULL;
+ inode_t *inode = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ loc_t *loc = NULL;
+ afr_spb_status_t *data = NULL;
+
+ data = opaque;
+ frame = data->frame;
+ this = frame->this;
+ loc = data->loc;
+ priv = this->private;
+ children = priv->children;
+
+ inode = afr_inode_find(this, loc->gfid);
+ if (!inode)
+ goto out;
+
+ dict = dict_new();
+ if (!dict) {
+ op_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ /* Calculation for string length :
+ * (child_count X length of child-name) + SLEN(" Choices :")
+ * child-name consists of :
+ * a) 251 = max characters for volname according to GD_VOLUME_NAME_MAX
+ * b) strlen("-client-00,") assuming 16 replicas
+ */
+ choices = alloca0(priv->child_count * (256 + SLEN("-client-00,")) +
+ SLEN(" Choices:"));
+
+ ret = afr_is_split_brain(frame, this, inode, loc->gfid, &d_spb, &m_spb);
+ if (ret) {
+ op_errno = -ret;
+ if (ret == -EAGAIN) {
+ ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS,
+ SBRAIN_HEAL_NO_GO_MSG);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ AFR_MSG_DICT_SET_FAILED,
+ "Failed to set GF_AFR_SBRAIN_STATUS in dict");
+ }
}
+ ret = -1;
+ goto out;
+ }
- local->fd = fd_ref (fd);
- local->cont.lk.cmd = cmd;
- local->cont.lk.user_flock = *flock;
- local->cont.lk.ret_flock = *flock;
+ if (d_spb || m_spb) {
+ sprintf(choices, " Choices:");
+ for (i = 0; i < priv->child_count; i++) {
+ strcat(choices, children[i]->name);
+ strcat(choices, ",");
+ }
+ choices[strlen(choices) - 1] = '\0';
- STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0,
- priv->children[i],
- priv->children[i]->fops->lk,
- fd, cmd, flock);
+ ret = gf_asprintf(&status,
+ "data-split-brain:%s "
+ "metadata-split-brain:%s%s",
+ (d_spb) ? "yes" : "no", (m_spb) ? "yes" : "no",
+ choices);
+
+ if (-1 == ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ ret = dict_set_dynstr_sizen(dict, GF_AFR_SBRAIN_STATUS, status);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS,
+ SFILE_NOT_UNDER_DATA);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ }
- op_ret = 0;
+ ret = 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL);
+ AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+ if (dict)
+ dict_unref(dict);
+ if (inode)
+ inode_unref(inode);
+ return ret;
+}
+
+int32_t
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ int ret = 0;
+ int op_errno = 0;
+ dict_t *dict = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *heal_local = NULL;
+ call_frame_t *heal_frame = NULL;
+
+ local = frame->local;
+ dict = dict_new();
+ if (!dict) {
+ op_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ heal_frame = afr_frame_create(this, &op_errno);
+ if (!heal_frame) {
+ ret = -1;
+ goto out;
+ }
+ heal_local = heal_frame->local;
+ heal_frame->local = frame->local;
+ /*Initiate heal with heal_frame with lk-owner set so that inodelk/entrylk
+ * work correctly*/
+ ret = afr_selfheal_do(heal_frame, this, loc->gfid);
+
+ if (ret == 1 || ret == 2) {
+ ret = dict_set_sizen_str_sizen(dict, "sh-fail-msg",
+ SFILE_NOT_IN_SPLIT_BRAIN);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set sh-fail-msg in dict");
+ ret = 0;
+ goto out;
+ } else {
+ if (local->xdata_rsp) {
+ /* 'sh-fail-msg' has been set in the dict during self-heal.*/
+ dict_copy(local->xdata_rsp, dict);
+ ret = 0;
+ } else if (ret < 0) {
+ op_errno = -ret;
+ ret = -1;
}
- return 0;
+ }
+
+out:
+ if (heal_frame) {
+ heal_frame->local = heal_local;
+ AFR_STACK_DESTROY(heal_frame);
+ }
+ if (local->op == GF_FOP_GETXATTR)
+ AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+ else if (local->op == GF_FOP_SETXATTR)
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ if (dict)
+ dict_unref(dict);
+ return ret;
}
int
-afr_priv_dump (xlator_t *this)
+afr_get_child_index_from_name(xlator_t *this, char *name)
{
- afr_private_t *priv = NULL;
- char key_prefix[GF_DUMP_MAX_BUF_LEN];
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
+ afr_private_t *priv = this->private;
+ int index = -1;
+
+ for (index = 0; index < priv->child_count; index++) {
+ if (!strcmp(priv->children[index]->name, name))
+ goto out;
+ }
+ index = -1;
+out:
+ return index;
+}
+void
+afr_priv_need_heal_set(afr_private_t *priv, gf_boolean_t need_heal)
+{
+ LOCK(&priv->lock);
+ {
+ priv->need_heal = need_heal;
+ }
+ UNLOCK(&priv->lock);
+}
- GF_ASSERT (this);
- priv = this->private;
+void
+afr_set_need_heal(xlator_t *this, afr_local_t *local)
+{
+ int i = 0;
+ afr_private_t *priv = this->private;
+ gf_boolean_t need_heal = _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].need_heal) {
+ need_heal = _gf_true;
+ break;
+ }
+ }
+ afr_priv_need_heal_set(priv, need_heal);
+ return;
+}
- GF_ASSERT (priv);
- snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
- gf_proc_dump_add_section(key_prefix);
- gf_proc_dump_build_key(key, key_prefix, "child_count");
- gf_proc_dump_write(key, "%u", priv->child_count);
- gf_proc_dump_build_key(key, key_prefix, "read_child_rr");
- gf_proc_dump_write(key, "%u", priv->read_child_rr);
- for (i = 0; i < priv->child_count; i++) {
- gf_proc_dump_build_key(key, key_prefix, "child_up[%d]", i);
- gf_proc_dump_write(key, "%d", priv->child_up[i]);
- gf_proc_dump_build_key(key, key_prefix,
- "pending_key[%d]", i);
- gf_proc_dump_write(key, "%s", priv->pending_key[i]);
- }
- gf_proc_dump_build_key(key, key_prefix, "data_self_heal");
- gf_proc_dump_write(key, "%d", priv->data_self_heal);
- gf_proc_dump_build_key(key, key_prefix, "metadata_self_heal");
- gf_proc_dump_write(key, "%d", priv->metadata_self_heal);
- gf_proc_dump_build_key(key, key_prefix, "entry_self_heal");
- gf_proc_dump_write(key, "%d", priv->entry_self_heal);
- gf_proc_dump_build_key(key, key_prefix, "data_change_log");
- gf_proc_dump_write(key, "%d", priv->data_change_log);
- gf_proc_dump_build_key(key, key_prefix, "metadata_change_log");
- gf_proc_dump_write(key, "%d", priv->metadata_change_log);
- gf_proc_dump_build_key(key, key_prefix, "entry_change_log");
- gf_proc_dump_write(key, "%d", priv->entry_change_log);
- gf_proc_dump_build_key(key, key_prefix, "read_child");
- gf_proc_dump_write(key, "%d", priv->read_child);
- gf_proc_dump_build_key(key, key_prefix, "favorite_child");
- gf_proc_dump_write(key, "%u", priv->favorite_child);
- gf_proc_dump_build_key(key, key_prefix, "data_lock_server_count");
- gf_proc_dump_write(key, "%u", priv->data_lock_server_count);
- gf_proc_dump_build_key(key, key_prefix, "metadata_lock_server_count");
- gf_proc_dump_write(key, "%u", priv->metadata_lock_server_count);
- gf_proc_dump_build_key(key, key_prefix, "entry_lock_server_count");
- gf_proc_dump_write(key, "%u", priv->entry_lock_server_count);
- gf_proc_dump_build_key(key, key_prefix, "wait_count");
- gf_proc_dump_write(key, "%u", priv->wait_count);
+gf_boolean_t
+afr_get_need_heal(xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ gf_boolean_t need_heal = _gf_true;
+
+ LOCK(&priv->lock);
+ {
+ need_heal = priv->need_heal;
+ }
+ UNLOCK(&priv->lock);
+ return need_heal;
+}
- return 0;
+int
+afr_get_msg_id(char *op_type)
+{
+ if (!strcmp(op_type, GF_AFR_REPLACE_BRICK))
+ return AFR_MSG_REPLACE_BRICK_STATUS;
+ else if (!strcmp(op_type, GF_AFR_ADD_BRICK))
+ return AFR_MSG_ADD_BRICK_STATUS;
+ return -1;
}
+int
+afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *heal_frame,
+ void *opaque)
+{
+ call_frame_t *txn_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *heal_local = NULL;
+ xlator_t *this = NULL;
-/**
- * find_child_index - find the child's index in the array of subvolumes
- * @this: AFR
- * @child: child
+ heal_local = heal_frame->local;
+ txn_frame = heal_local->heal_frame;
+ local = txn_frame->local;
+ this = txn_frame->this;
+
+ /* Refresh the inode agan and proceed with the transaction.*/
+ afr_inode_refresh(txn_frame, this, local->inode, NULL, local->refreshfn);
+
+ AFR_STACK_DESTROY(heal_frame);
+
+ return 0;
+}
+
+int
+afr_fav_child_reset_sink_xattrs(void *opaque)
+{
+ call_frame_t *heal_frame = NULL;
+ call_frame_t *txn_frame = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t d_spb = _gf_false;
+ gf_boolean_t m_spb = _gf_false;
+ afr_local_t *heal_local = NULL;
+ afr_local_t *txn_local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ unsigned char *locked_on = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int ret = 0;
+
+ heal_frame = (call_frame_t *)opaque;
+ heal_local = heal_frame->local;
+ txn_frame = heal_local->heal_frame;
+ txn_local = txn_frame->local;
+ this = txn_frame->this;
+ inode = txn_local->inode;
+ priv = this->private;
+ locked_on = alloca0(priv->child_count);
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = _afr_is_split_brain(txn_frame, this, txn_local->replies,
+ AFR_DATA_TRANSACTION, &d_spb);
+
+ ret = _afr_is_split_brain(txn_frame, this, txn_local->replies,
+ AFR_METADATA_TRANSACTION, &m_spb);
+
+ /* Take appropriate locks and reset sink xattrs. */
+ if (d_spb) {
+ ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0,
+ locked_on);
+ {
+ if (ret < priv->child_count)
+ goto data_unlock;
+ ret = __afr_selfheal_data_prepare(
+ heal_frame, this, inode, locked_on, sources, sinks,
+ healed_sinks, undid_pending, locked_replies, NULL);
+ }
+ data_unlock:
+ afr_selfheal_uninodelk(heal_frame, this, inode, this->name, 0, 0,
+ locked_on);
+ }
+
+ if (m_spb) {
+ memset(locked_on, 0, sizeof(*locked_on) * priv->child_count);
+ memset(undid_pending, 0, sizeof(*undid_pending) * priv->child_count);
+ ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name,
+ LLONG_MAX - 1, 0, locked_on);
+ {
+ if (ret < priv->child_count)
+ goto mdata_unlock;
+ ret = __afr_selfheal_metadata_prepare(
+ heal_frame, this, inode, locked_on, sources, sinks,
+ healed_sinks, undid_pending, locked_replies, NULL);
+ }
+ mdata_unlock:
+ afr_selfheal_uninodelk(heal_frame, this, inode, this->name,
+ LLONG_MAX - 1, 0, locked_on);
+ }
+
+ return ret;
+}
+
+/*
+ * Concatenates the xattrs in local->replies separated by a delimiter.
*/
+int
+afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
+ char *buf, const char *default_str,
+ int32_t *serz_len, char delimiter)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ char *xattr = NULL;
+ int i = 0;
+ int len = 0;
+ int keylen = 0;
+ size_t str_len = 0;
+ int ret = -1;
+
+ priv = this->private;
+ local = frame->local;
+
+ keylen = strlen(local->cont.getxattr.name);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid || local->replies[i].op_ret) {
+ str_len = strlen(default_str);
+ buf = strncat(buf, default_str, str_len);
+ len += str_len;
+ buf[len++] = delimiter;
+ buf[len] = '\0';
+ } else {
+ ret = dict_get_strn(local->replies[i].xattr,
+ local->cont.getxattr.name, keylen, &xattr);
+ if (ret) {
+ gf_msg("TEST", GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "Failed to get the node_uuid of brick "
+ "%d",
+ i);
+ goto out;
+ }
+ str_len = strlen(xattr);
+ buf = strncat(buf, xattr, str_len);
+ len += str_len;
+ buf[len++] = delimiter;
+ buf[len] = '\0';
+ }
+ }
+ buf[--len] = '\0'; /*remove the last delimiter*/
+ if (serz_len)
+ *serz_len = ++len;
+ ret = 0;
-static int
-find_child_index (xlator_t *this, xlator_t *child)
+out:
+ return ret;
+}
+
+uint64_t
+afr_write_subvol_get(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ uint64_t write_subvol = 0;
- int i = -1;
+ local = frame->local;
+ LOCK(&local->inode->lock);
+ write_subvol = local->inode_ctx->write_subvol;
+ UNLOCK(&local->inode->lock);
- priv = this->private;
+ return write_subvol;
+}
- for (i = 0; i < priv->child_count; i++) {
- if ((xlator_t *) child == priv->children[i])
- break;
+int
+afr_write_subvol_set(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *data_accused = NULL;
+ unsigned char *metadata_accused = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint64_t val = 0;
+ int event = 0;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+ data_accused = alloca0(priv->child_count);
+ metadata_accused = alloca0(priv->child_count);
+ data_readable = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+ event = local->event_generation;
+
+ afr_readables_fill(frame, this, local->inode, data_accused,
+ metadata_accused, data_readable, metadata_readable,
+ NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_readable[i])
+ datamap |= (1 << i);
+ if (metadata_readable[i])
+ metadatamap |= (1 << i);
+ }
+
+ val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+ (((uint64_t)event) << 32);
+
+ LOCK(&local->inode->lock);
+ {
+ if (local->inode_ctx->write_subvol == 0 &&
+ local->transaction.type == AFR_DATA_TRANSACTION) {
+ local->inode_ctx->write_subvol = val;
}
+ }
+ UNLOCK(&local->inode->lock);
- return i;
+ return 0;
}
-int32_t
-afr_notify (xlator_t *this, int32_t event,
- void *data, ...)
+int
+afr_write_subvol_reset(call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = NULL;
- unsigned char * child_up = NULL;
+ afr_local_t *local = NULL;
- int i = -1;
- int up_children = 0;
+ local = frame->local;
+ LOCK(&local->inode->lock);
+ {
+ GF_ASSERT(local->inode_ctx->lock_count > 0);
+ local->inode_ctx->lock_count--;
- priv = this->private;
+ if (!local->inode_ctx->lock_count)
+ local->inode_ctx->write_subvol = 0;
+ }
+ UNLOCK(&local->inode->lock);
- if (!priv)
- return 0;
+ return 0;
+}
- child_up = priv->child_up;
+int
+afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode)
+{
+ int ret = 0;
+
+ local->inode = inode_ref(inode);
+ LOCK(&local->inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, local->inode, &local->inode_ctx);
+ }
+ UNLOCK(&local->inode->lock);
+ if (ret < 0) {
+ gf_msg_callingfn(
+ this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_INODE_CTX_GET_FAILED,
+ "Error getting inode ctx %s", uuid_utoa(local->inode->gfid));
+ }
+ return ret;
+}
- switch (event) {
- case GF_EVENT_CHILD_UP:
- i = find_child_index (this, data);
+gf_boolean_t
+afr_ta_is_fop_called_from_synctask(xlator_t *this)
+{
+ struct synctask *task = NULL;
+ gf_lkowner_t tmp_owner = {
+ 0,
+ };
- /* temporarily
- afr_attempt_lock_recovery (this, i);
- */
+ task = synctask_get();
+ if (!task)
+ return _gf_false;
- child_up[i] = 1;
+ set_lk_owner_from_ptr(&tmp_owner, (void *)this);
- LOCK (&priv->lock);
- {
- priv->up_count++;
- }
- UNLOCK (&priv->lock);
+ if (!is_same_lkowner(&tmp_owner, &task->frame->root->lk_owner))
+ return _gf_false;
- /*
- if all the children were down, and one child came up,
- send notify to parent
- */
+ return _gf_true;
+}
- for (i = 0; i < priv->child_count; i++)
- if (child_up[i])
- up_children++;
+int
+afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
+{
+ int ret = 0;
+ uuid_t gfid = {
+ 0,
+ };
+ afr_private_t *priv = this->private;
+ gf_boolean_t locked = _gf_false;
+ struct gf_flock flock1 = {
+ 0,
+ };
+ struct gf_flock flock2 = {
+ 0,
+ };
+ int32_t cmd = 0;
+
+ /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock
+ * has been released in afr_notify due to upcall notification from shd.
+ */
+ GF_ASSERT(priv->ta_notify_dom_lock_offset == 0);
+
+ if (!priv->shd.iamshd)
+ GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+ flock1.l_type = F_WRLCK;
+
+ while (!locked) {
+ if (priv->shd.iamshd) {
+ cmd = F_SETLKW;
+ flock1.l_start = 0;
+ flock1.l_len = 0;
+ } else {
+ cmd = F_SETLK;
+ gf_uuid_generate(gfid);
+ flock1.l_start = gfid_to_ino(gfid);
+ if (flock1.l_start < 0)
+ flock1.l_start = -flock1.l_start;
+ flock1.l_len = 1;
+ }
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, loc, cmd, &flock1, NULL, NULL);
+ if (!ret) {
+ locked = _gf_true;
+ priv->ta_notify_dom_lock_offset = flock1.l_start;
+ } else if (ret == -EAGAIN) {
+ continue;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to get "
+ "AFR_TA_DOM_NOTIFY lock on %s.",
+ loc->name);
+ goto out;
+ }
+ }
+
+ flock2.l_type = F_WRLCK;
+ flock2.l_start = 0;
+ flock2.l_len = 0;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name);
+ flock1.l_type = F_UNLCK;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL,
+ NULL);
+ }
+out:
+ return ret;
+}
- if (up_children == 1) {
- gf_log (this->name, GF_LOG_NORMAL,
- "Subvolume '%s' came back up; "
- "going online.", ((xlator_t *)data)->name);
+int
+afr_ta_post_op_unlock(xlator_t *this, loc_t *loc)
+{
+ afr_private_t *priv = this->private;
+ struct gf_flock flock = {
+ 0,
+ };
+ int ret = 0;
+
+ if (!priv->shd.iamshd)
+ GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+ flock.l_type = F_UNLCK;
+ flock.l_start = 0;
+ flock.l_len = 0;
+
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_MODIFY lock.");
+ goto out;
+ }
+
+ if (!priv->shd.iamshd)
+ /* Mounts (clients) will not release the AFR_TA_DOM_NOTIFY lock
+ * in post-op as they use it as a notification mechanism. When
+ * shd sends a lock request on TA during heal, the clients will
+ * receive a lock-contention upcall notification upon which they
+ * will release the AFR_TA_DOM_NOTIFY lock after completing the
+ * in flight I/O.*/
+ goto out;
+
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+ }
+out:
+ return ret;
+}
- default_notify (this, event, data);
- }
+call_frame_t *
+afr_ta_frame_create(xlator_t *this)
+{
+ call_frame_t *frame = NULL;
+ void *lk_owner = NULL;
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ return NULL;
+ lk_owner = (void *)this;
+ afr_set_lk_owner(frame, this, lk_owner);
+ return frame;
+}
- break;
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local)
+{
+ int data_count = 0;
- case GF_EVENT_CHILD_DOWN:
- i = find_child_index (this, data);
+ data_count = AFR_COUNT(local->child_up, priv->child_count);
+ if (data_count == 2) {
+ return _gf_true;
+ } else if (data_count == 1 && local->ta_child_up) {
+ return _gf_true;
+ }
- child_up[i] = 0;
+ return _gf_false;
+}
- LOCK (&priv->lock);
- {
- priv->down_count++;
- }
- UNLOCK (&priv->lock);
+static gf_boolean_t
+afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
- /*
- if all children are down, and this was the last to go down,
- send notify to parent
- */
+ if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT)
+ return _gf_false;
- for (i = 0; i < priv->child_count; i++)
- if (child_up[i])
- up_children++;
+ local = frame->local;
- if (up_children == 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "All subvolumes are down. Going offline "
- "until atleast one of them comes back up.");
+ if (local->op != GF_FOP_LOOKUP)
+ /* TODO:If the replica count is being increased on a plain distribute
+ * volume that was never mounted, we need to allow setxattr on '/' with
+ * GF_CLIENT_PID_NO_ROOT_SQUASH to accomodate for DHT layout setting */
+ return _gf_false;
- default_notify (this, event, data);
- }
+ if (local->inode == NULL)
+ return _gf_false;
- break;
+ if (!__is_root_gfid(local->inode->gfid))
+ return _gf_false;
- default:
- default_notify (this, event, data);
- }
+ return _gf_true;
+}
- return 0;
+gf_boolean_t
+afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count)
+{
+ if (frame && (up_children_count > 0) &&
+ afr_is_add_replica_mount_lookup_on_root(frame))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+void
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ unsigned char *success_replies = NULL;
+
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+
+ if (priv->quorum_count && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_errno = afr_final_errno(local, priv);
+ if (!local->op_errno)
+ local->op_errno = afr_quorum_errno(priv);
+ local->op_ret = -1;
+ }
+}
+
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child)
+{
+ int *pending = NULL;
+ int ret = 0;
+ int i = 0;
+
+ ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending);
+ if (ret == 0) {
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ /* Not doing a ntoh32(pending) as we just want to check
+ * if it is non-zero or not. */
+ if (pending[i]) {
+ return _gf_true;
+ }
+ }
+ }
+ return _gf_false;
}
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index f52054eaafe..f8bf8340dab 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -1,753 +1,346 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "checksum.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/list.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
#include "afr.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-
-int
-afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- afr_set_opendir_done (this, local->fd->inode);
-
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd);
-
- return 0;
-}
-
-
-gf_boolean_t
-__checksums_differ (uint32_t *checksum, int child_count,
- unsigned char *child_up)
-{
- int ret = _gf_false;
- int i = 0;
-
- uint32_t cksum;
-
- cksum = checksum[0];
-
- for (i = 0; i < child_count; i++) {
- if (!child_up[i])
- continue;
-
- if (cksum != checksum[i]) {
- ret = _gf_true;
- break;
- }
-
- cksum = checksum[i];
- }
-
- return ret;
-}
-
+#include "afr-transaction.h"
int32_t
-afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries)
+afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
-
- gf_dirent_t * entry = NULL;
- gf_dirent_t * tmp = NULL;
-
- int child_index = 0;
-
- uint32_t entry_cksum;
-
- int call_count = 0;
- off_t last_offset = 0;
- char sh_type_str[256] = {0,};
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int32_t child_index = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+ child_index = (long)cookie;
- child_index = (long) cookie;
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ LOCK(&frame->lock);
+ {
if (op_ret == -1) {
- local->op_ret = -1;
- local->op_ret = op_errno;
- goto out;
+ local->op_errno = op_errno;
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
+ local->op_ret = op_ret;
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
}
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- if (op_ret == 0)
- goto out;
+ if (call_count == 0) {
+ afr_handle_replies_quorum(frame, this);
+ AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno,
+ local->fd, NULL);
+ }
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- entry_cksum = gf_rsync_weak_checksum (entry->d_name,
- strlen (entry->d_name));
- local->cont.opendir.checksum[child_index] ^= entry_cksum;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- }
-
- /* read more entries */
-
- STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->readdir,
- local->fd, 131072, last_offset);
-
-out:
- if ((op_ret == 0) || (op_ret == -1)) {
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (__checksums_differ (local->cont.opendir.checksum,
- priv->child_count,
- local->child_up)) {
-
- sh->need_entry_self_heal = _gf_true;
- sh->forced_merge = _gf_true;
- sh->type = local->fd->inode->ia_type;
- sh->background = _gf_false;
- sh->unwind = afr_examine_dir_sh_unwind;
-
- afr_self_heal_type_str_get(&local->self_heal,
- sh_type_str,
- sizeof(sh_type_str));
- gf_log (this->name, GF_LOG_NORMAL,
- "%s self-heal triggered. path: %s, "
- "reason: checksums of directory differ,"
- " forced merge option set",
- sh_type_str, local->loc.path);
-
- afr_self_heal (frame, this);
- } else {
- afr_set_opendir_done (this, local->fd->inode);
-
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd);
- }
- }
- }
-
- return 0;
+ return 0;
}
-
int
-afr_examine_dir (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
-
- int i;
- int call_count = 0;
-
- local = frame->local;
- priv = this->private;
-
- local->cont.opendir.checksum = GF_CALLOC (priv->child_count,
- sizeof (*local->cont.opendir.checksum),
- gf_afr_mt_int32_t);
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->readdir,
- local->fd, 131072, 0);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int32_t
-afr_opendir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int32_t up_children_count = 0;
- int ret = -1;
-
- int call_count = -1;
-
- priv = this->private;
- local = frame->local;
-
- up_children_count = afr_up_children_count (priv->child_count,
- local->child_up);
-
- LOCK (&frame->lock);
- {
- if (op_ret >= 0)
- local->op_ret = op_ret;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (local->op_ret == 0) {
-
- ret = afr_fd_ctx_set (this, local->fd);
-
- if (ret) {
- local->op_ret = -1;
- local->op_errno = -1;
- gf_log (this->name, GF_LOG_ERROR, " failed to "
- "set fd ctx for fd %d", local->fd);
- goto out;
- }
- if (!afr_is_opendir_done (this, local->fd->inode) &&
- up_children_count > 1) {
-
- /*
- * This is the first opendir on this inode. We need
- * to check if the directory's entries are the same
- * on all subvolumes. This is needed in addition
- * to regular entry self-heal because the readdir
- * call is sent only to the first subvolume, and
- * thus files that exist only there will never be healed
- * otherwise (assuming changelog shows no anamolies).
- */
-
- gf_log (this->name, GF_LOG_TRACE,
- "reading contents of directory %s looking for mismatch",
- local->loc.path);
-
- afr_examine_dir (frame, this);
-
- } else {
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd);
- }
- } else {
-out:
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd);
- }
- }
-
- return 0;
-}
-
-
-int32_t
-afr_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd)
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
-
- int child_count = 0;
- int i = 0;
-
- int ret = -1;
- int call_count = -1;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = -1;
+ int32_t op_errno = ENOMEM;
+ afr_fd_ctx_t *fd_ctx = NULL;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ priv = this->private;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- priv = this->private;
+ local->op = GF_FOP_OPENDIR;
- child_count = priv->child_count;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- loc_copy (&local->loc, loc);
-
- frame->local = local;
- local->fd = fd_ref (fd);
-
- call_count = local->call_count;
-
- for (i = 0; i < child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_opendir_cbk,
- priv->children[i],
- priv->children[i]->fops->opendir,
- loc, fd);
-
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (opendir, frame, op_ret, op_errno, fd);
- }
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ op_errno = afr_quorum_errno(priv);
+ goto out;
+ }
- return 0;
-}
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx)
+ goto out;
-/**
- * Common algorithm for directory read calls:
- *
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- * try the next child
- *
- * Applicable to: readdir
- */
+ loc_copy(&local->loc, loc);
+ local->fd = fd_ref(fd);
+ local->fd_ctx = fd_ctx;
-struct entry_name {
- char *name;
- struct list_head list;
-};
+ call_count = local->call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, afr_opendir_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->opendir, loc, fd, NULL);
-static gf_boolean_t
-remembered_name (const char *name, struct list_head *entries)
-{
- struct entry_name *e;
- gf_boolean_t ret = _gf_false;
-
- list_for_each_entry (e, entries, list) {
- if (!strcmp (name, e->name)) {
- ret = _gf_true;
- goto out;
- }
+ if (!--call_count)
+ break;
}
+ }
+ return 0;
out:
- return ret;
+ AFR_STACK_UNWIND(opendir, frame, -1, op_errno, fd, NULL);
+ return 0;
}
-
-static void
-afr_remember_entries (gf_dirent_t *entries, fd_t *fd)
+static int
+afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol)
{
- struct entry_name *n = NULL;
- gf_dirent_t * entry = NULL;
-
- int ret = 0;
-
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "could not get fd ctx for fd=%p", fd);
- return;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- list_for_each_entry (entry, &entries->list, list) {
- n = GF_CALLOC (1, sizeof (*n), gf_afr_mt_entry_name);
- n->name = gf_strdup (entry->d_name);
- INIT_LIST_HEAD (&n->list);
+ int gen = 0;
+ int entry_read_subvol = 0;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ data_readable = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable,
+ &gen);
+
+ if (gen != priv->event_generation || !data_readable[par_read_subvol] ||
+ !metadata_readable[par_read_subvol])
+ return -1;
+
+ /* Once the control reaches the following statement, it means that the
+ * parent's read subvol is perfectly readable. So calling
+ * either afr_data_subvol_get() or afr_metadata_subvol_get() would
+ * yield the same result. Hence, choosing afr_data_subvol_get() below.
+ */
+
+ if (!priv->consistent_metadata)
+ return 0;
- list_add (&n->list, &fd_ctx->entries);
- }
+ /* For an inode fetched through readdirp which is yet to be linked,
+ * inode ctx would not be initialised (yet). So this function returns
+ * -1 above due to gen being 0, which is why it is OK to pass NULL for
+ * read_subvol_args here.
+ */
+ entry_read_subvol = afr_data_subvol_get(inode, this, NULL, NULL, NULL,
+ NULL);
+ if (entry_read_subvol != par_read_subvol)
+ return -1;
+
+ return 0;
}
-
-static off_t
-afr_filter_entries (gf_dirent_t *entries, fd_t *fd)
+static void
+afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries,
+ int subvol, gf_dirent_t *entries, fd_t *fd)
{
- gf_dirent_t *entry, *tmp;
- int ret = 0;
-
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx;
-
- off_t offset = 0;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "could not get fd ctx for fd=%p", fd);
- return -1;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- offset = entry->d_off;
-
- if (remembered_name (entry->d_name, &fd_ctx->entries)) {
- list_del (&entry->list);
- GF_FREE (entry);
- }
- }
+ int ret = -1;
+ gf_dirent_t *entry = NULL;
+ gf_dirent_t *tmp = NULL;
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t need_heal = _gf_false;
+ gf_boolean_t validate_subvol = _gf_false;
+
+ this = THIS;
+ priv = this->private;
+
+ need_heal = afr_get_need_heal(this);
+ validate_subvol = need_heal | priv->consistent_metadata;
+
+ list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list)
+ {
+ if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name,
+ frame->root->pid)) {
+ continue;
+ }
- return offset;
-}
+ list_del_init(&entry->list);
+ list_add_tail(&entry->list, &entries->list);
+ if (!validate_subvol)
+ continue;
-static void
-afr_forget_entries (fd_t *fd)
-{
- struct entry_name *entry, *tmp;
- int ret = 0;
-
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "could not get fd ctx for fd=%p", fd);
- return;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) {
- GF_FREE (entry->name);
- list_del (&entry->list);
- GF_FREE (entry);
- }
+ if (entry->inode) {
+ ret = afr_validate_read_subvol(entry->inode, this, subvol);
+ if (ret == -1) {
+ inode_unref(entry->inode);
+ entry->inode = NULL;
+ continue;
+ }
+ }
+ }
}
-
int32_t
-afr_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries)
+afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
+ gf_dirent_t entries;
- gf_dirent_t * entry = NULL;
- gf_dirent_t * tmp = NULL;
+ INIT_LIST_HEAD(&entries.list);
- int child_index = -1;
+ local = frame->local;
- priv = this->private;
- local = frame->local;
- child_index = (long) cookie;
+ if (op_ret < 0 && !local->cont.readdir.offset) {
+ /* failover only if this was first readdir, detected
+ by offset == 0 */
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- if (op_ret != -1) {
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- entry->d_ino = afr_itransform (entry->d_ino,
- priv->child_count,
- child_index);
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- if ((local->fd->inode == local->fd->inode->table->root)
- && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
- list_del_init (&entry->list);
- GF_FREE (entry);
- }
- }
- }
+ if (op_ret >= 0)
+ afr_readdir_transform_entries(frame, subvol_entries, (long)cookie,
+ &entries, local->fd);
- AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries);
+ AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata);
- return 0;
-}
+ gf_dirent_free(&entries);
+ return 0;
+}
-int32_t
-afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries)
+int
+afr_readdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- ino_t inum = 0;
-
- int call_child = 0;
- int ret = 0;
-
- gf_dirent_t * entry = NULL;
- gf_dirent_t * tmp = NULL;
-
- int child_index = -1;
-
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- off_t offset = 0;
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
-
- child_index = (long) cookie;
-
- if (priv->strict_readdir) {
- ret = fd_ctx_get (local->fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not get fd ctx for fd=%p", local->fd);
- op_ret = -1;
- op_errno = -ret;
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if (child_went_down (op_ret, op_errno)) {
- if (all_tried (child_index, priv->child_count)) {
- goto out;
- }
-
- call_child = ++child_index;
-
- gf_log (this->name, GF_LOG_TRACE,
- "starting readdir afresh on child %d, offset %"PRId64,
- call_child, (uint64_t) 0);
-
- fd_ctx->failed_over = _gf_true;
-
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readdirp, local->fd,
- local->cont.readdir.size, 0);
- return 0;
- }
- }
-
- if (op_ret != -1) {
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- inum = afr_itransform (entry->d_ino, priv->child_count,
- child_index);
- entry->d_ino = inum;
- inum = afr_itransform (entry->d_stat.ia_ino,
- priv->child_count, child_index);
- entry->d_stat.ia_ino = inum;
-
- if ((local->fd->inode == local->fd->inode->table->root)
- && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
- list_del_init (&entry->list);
- GF_FREE (entry);
- }
- }
- }
-
- if (priv->strict_readdir) {
- if (fd_ctx->failed_over) {
- if (list_empty (&entries->list)) {
- goto out;
- }
-
- offset = afr_filter_entries (entries, local->fd);
-
- afr_remember_entries (entries, local->fd);
-
- if (list_empty (&entries->list)) {
- /* All the entries we got were duplicate. We
- shouldn't send an empty list now, because
- that'll make the application stop reading. So
- try to get more entries */
-
- gf_log (this->name, GF_LOG_TRACE,
- "trying to fetch non-duplicate entries from offset %"PRId64", child %s",
- offset, children[child_index]->name);
-
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) child_index,
- children[child_index],
- children[child_index]->fops->readdirp,
- local->fd, local->cont.readdir.size, offset);
- return 0;
- }
- } else {
- afr_remember_entries (entries, local->fd);
- }
- }
-
-out:
- AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries);
-
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx) {
+ local->op_errno = EINVAL;
+ local->op_ret = -1;
+ }
+
+ if (subvol == -1 || !fd_ctx) {
+ AFR_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, 0, 0);
return 0;
+ }
+
+ fd_ctx->readdir_subvol = subvol;
+
+ if (local->op == GF_FOP_READDIR)
+ STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdir, local->fd,
+ local->cont.readdir.size, local->cont.readdir.offset,
+ local->xdata_req);
+ else
+ STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdirp, local->fd,
+ local->cont.readdir.size, local->cont.readdir.offset,
+ local->xdata_req);
+ return 0;
}
-
-int32_t
-afr_do_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, int whichop)
+int
+afr_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, int whichop, dict_t *dict)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
-
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx;
-
- int ret = -1;
-
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- children = priv->children;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- frame->local = local;
-
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_DEBUG,
- "no child is up");
- goto out;
- }
-
- local->fd = fd_ref (fd);
- local->cont.readdir.size = size;
-
- if (priv->strict_readdir) {
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not get fd ctx for fd=%p", fd);
- op_errno = -ret;
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if (fd_ctx->last_tried != call_child) {
- gf_log (this->name, GF_LOG_TRACE,
- "first up child has changed from %d to %d, restarting readdir from offset 0",
- fd_ctx->last_tried, call_child);
-
- fd_ctx->failed_over = _gf_true;
- offset = 0;
- }
-
- fd_ctx->last_tried = call_child;
- }
-
- if (whichop == GF_FOP_READDIR)
- STACK_WIND_COOKIE (frame, afr_readdir_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readdir, fd,
- size, offset);
- else
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readdirp, fd,
- size, offset);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int subvol = -1;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ local->op = whichop;
+ local->fd = fd_ref(fd);
+ local->cont.readdir.size = size;
+ local->cont.readdir.offset = offset;
+ local->xdata_req = (dict) ? dict_ref(dict) : NULL;
+
+ subvol = fd_ctx->readdir_subvol;
+
+ if (offset == 0 || subvol == -1) {
+ /* First readdir has option of failing over and selecting
+ an appropriate read subvolume */
+ afr_read_txn(frame, this, fd->inode, afr_readdir_wind,
+ AFR_DATA_TRANSACTION);
+ } else {
+ /* But continued readdirs MUST stick to the same subvolume
+ without an option to failover */
+ afr_readdir_wind(frame, this, subvol);
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, NULL);
- }
- return 0;
+ AFR_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
-
int32_t
-afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
+afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
{
- afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR);
- return 0;
-}
+ afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIR, xdata);
+ return 0;
+}
int32_t
-afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
+afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *dict)
{
- afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP);
- return 0;
-}
+ afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIRP, dict);
+ return 0;
+}
int32_t
-afr_releasedir (xlator_t *this, fd_t *fd)
+afr_releasedir(xlator_t *this, fd_t *fd)
{
- afr_forget_entries (fd);
- afr_cleanup_fd_ctx (this, fd);
+ afr_cleanup_fd_ctx(this, fd);
- return 0;
+ return 0;
}
diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h
index 40c7b6aef28..773e925ec6c 100644
--- a/xlators/cluster/afr/src/afr-dir-read.h
+++ b/xlators/cluster/afr/src/afr-dir-read.h
@@ -1,50 +1,33 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __DIR_READ_H__
#define __DIR_READ_H__
-
int32_t
-afr_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd);
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata);
int32_t
-afr_releasedir (xlator_t *this, fd_t *fd);
+afr_releasedir(xlator_t *this, fd_t *fd);
int32_t
-afr_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset);
-
+afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata);
int32_t
-afr_readdirp (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset);
+afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *dict);
int32_t
-afr_getdents (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, int32_t flag);
-
-
-int32_t
-afr_checksum (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags);
-
+afr_checksum(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ dict_t *xdata);
#endif /* __DIR_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index af42e7e06a0..b7cceb79158 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -1,1349 +1,937 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
#include "afr.h"
#include "afr-transaction.h"
-
void
-afr_build_parent_loc (loc_t *parent, loc_t *child)
+afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this);
+
+int
+afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno)
{
- char *tmp = NULL;
+ int ret = -1;
+ char *child_path = NULL;
+
+ if (!child->parent) {
+ if (op_errno)
+ *op_errno = EINVAL;
+ goto out;
+ }
+
+ child_path = gf_strdup(child->path);
+ if (!child_path) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ parent->path = gf_strdup(dirname(child_path));
+ if (!parent->path) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ parent->inode = inode_ref(child->parent);
+ gf_uuid_copy(parent->gfid, child->pargfid);
+
+ ret = 0;
+out:
+ GF_FREE(child_path);
- if (!child->parent) {
- loc_copy (parent, child);
- return;
- }
+ return ret;
+}
- tmp = gf_strdup (child->path);
- parent->path = gf_strdup (dirname (tmp));
- GF_FREE (tmp);
+static void
+__afr_dir_write_finalize(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int inode_read_subvol = -1;
+ int parent_read_subvol = -1;
+ int parent2_read_subvol = -1;
+ int i = 0;
+ afr_read_subvol_args_t args = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == -1)
+ continue;
+ gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid);
+ args.ia_type = local->replies[i].poststat.ia_type;
+ break;
+ }
+
+ if (local->inode) {
+ if (local->op != GF_FOP_RENAME && local->op != GF_FOP_LINK)
+ afr_replies_interpret(frame, this, local->inode, NULL);
+
+ inode_read_subvol = afr_data_subvol_get(local->inode, this, NULL, NULL,
+ NULL, &args);
+ }
+
+ if (local->parent)
+ parent_read_subvol = afr_data_subvol_get(local->parent, this, NULL,
+ local->readable, NULL, NULL);
+
+ if (local->parent2)
+ parent2_read_subvol = afr_data_subvol_get(local->parent2, this, NULL,
+ local->readable2, NULL, NULL);
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ afr_pick_error_xdata(local, priv, local->parent, local->readable,
+ local->parent2, local->readable2);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0) {
+ if (local->inode)
+ afr_inode_need_refresh_set(local->inode, this);
+ if (local->parent)
+ afr_inode_need_refresh_set(local->parent, this);
+ if (local->parent2)
+ afr_inode_need_refresh_set(local->parent2, this);
+ continue;
+ }
- parent->name = strrchr (parent->path, '/');
- if (parent->name)
- parent->name++;
+ if (local->op_ret == -1) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.dir_fop.buf = local->replies[i].poststat;
+ local->cont.dir_fop.preparent = local->replies[i].preparent;
+ local->cont.dir_fop.postparent = local->replies[i].postparent;
+ local->cont.dir_fop.prenewparent = local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent = local->replies[i].postparent2;
+ if (local->xdata_rsp) {
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ }
+
+ if (local->replies[i].xdata)
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ continue;
+ }
- parent->inode = inode_ref (child->parent);
- parent->parent = inode_parent (parent->inode, 0, NULL);
- parent->ino = parent->inode->ino;
-}
+ if (i == inode_read_subvol) {
+ local->cont.dir_fop.buf = local->replies[i].poststat;
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ }
-/* {{{ create */
+ if (i == parent_read_subvol) {
+ local->cont.dir_fop.preparent = local->replies[i].preparent;
+ local->cont.dir_fop.postparent = local->replies[i].postparent;
+ }
-int
-afr_create_unwind (call_frame_t *frame, xlator_t *this)
-{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.create.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.create.read_child_buf;
- } else {
- unwind_buf = &local->cont.create.buf;
- }
-
- unwind_buf->ia_ino = local->cont.create.ino;
-
- local->cont.create.preparent.ia_ino = local->cont.create.parent_ino;
- local->cont.create.postparent.ia_ino = local->cont.create.parent_ino;
-
- AFR_STACK_UNWIND (create, main_frame,
- local->op_ret, local->op_errno,
- local->cont.create.fd,
- local->cont.create.inode,
- unwind_buf, &local->cont.create.preparent,
- &local->cont.create.postparent);
+ if (i == parent2_read_subvol) {
+ local->cont.dir_fop.prenewparent = local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent = local->replies[i].postparent2;
}
-
- return 0;
+ }
}
-
-int
-afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- fd_t *fd, inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+static void
+__afr_dir_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno, struct iatt *poststat,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx;
-
- int ret = 0;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- ret = afr_fd_ctx_set (this, fd);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not set ctx on fd=%p", fd);
-
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not get fd ctx for fd=%p", fd);
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->opened_on[child_index] = 1;
- fd_ctx->flags = local->cont.create.flags;
-
- if (local->success_count == 0) {
- local->cont.create.buf = *buf;
-
- local->cont.create.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->first_up_child) {
- local->cont.create.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- local->first_up_child);
- }
-
- if (child_index == local->read_child_index) {
- local->cont.create.read_child_buf = *buf;
- local->cont.create.preparent = *preparent;
- local->cont.create.postparent = *postparent;
- }
-
- local->cont.create.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
-
-unlock:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+
+ if (op_ret >= 0) {
+ if (poststat)
+ local->replies[child_index].poststat = *poststat;
+ if (preparent)
+ local->replies[child_index].preparent = *preparent;
+ if (postparent)
+ local->replies[child_index].postparent = *postparent;
+ if (preparent2)
+ local->replies[child_index].preparent2 = *preparent2;
+ if (postparent2)
+ local->replies[child_index].postparent2 = *postparent2;
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ } else {
+ if (op_errno != ENOTEMPTY)
+ afr_transaction_fop_failed(frame, this, child_index);
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ }
+
+ return;
}
-
-int
-afr_create_wind (call_frame_t *frame, xlator_t *this)
+static int
+__afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_create_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->create,
- &local->loc,
- local->cont.create.flags,
- local->cont.create.mode,
- local->cont.create.fd,
- local->cont.create.params);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ __afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf,
+ preparent, postparent, preparent2, postparent2,
+ xdata);
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
+
+ if (call_count == 0) {
+ __afr_dir_write_finalize(frame, this);
+
+ if (afr_txn_nothing_failed(frame, this)) {
+ /*if it did pre-op, it will do post-op changing ctime*/
+ if (priv->consistent_metadata && afr_needs_changelog_update(local))
+ afr_zero_fill_stat(local);
+ local->transaction.unwind(frame, this);
+ }
+
+ afr_mark_entry_pending_changelog(frame, this);
+ afr_transaction_resume(frame, this);
+ }
+
+ return 0;
+}
int
-afr_create_done (call_frame_t *frame, xlator_t *this)
+afr_mark_new_entry_changelog_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
{
- afr_local_t * local = NULL;
+ int call_count = 0;
- local = frame->local;
+ call_count = afr_frame_return(frame);
- local->transaction.unwind (frame, this);
+ if (call_count == 0)
+ AFR_STACK_DESTROY(frame);
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ return 0;
}
-
-int
-afr_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd, dict_t *params)
+void
+afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
+ call_frame_t *new_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *new_local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int32_t **changelog = NULL;
+ int i = 0;
+ int op_errno = ENOMEM;
+ unsigned char *pending = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ new_frame = copy_frame(frame);
+ if (!new_frame)
+ goto out;
+
+ new_local = AFR_FRAME_INIT(new_frame, op_errno);
+ if (!new_local)
+ goto out;
+
+ xattr = dict_new();
+ if (!xattr)
+ goto out;
+
+ pending = alloca0(priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ !local->transaction.failed_subvols[i]) {
+ call_count++;
+ continue;
+ }
+ pending[i] = 1;
+ }
- int op_ret = -1;
- int op_errno = 0;
+ changelog = afr_mark_pending_changelog(priv, pending, xattr,
+ local->cont.dir_fop.buf.ia_type);
+ if (!changelog)
+ goto out;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ new_local->pending = changelog;
+ gf_uuid_copy(new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid);
+ new_local->loc.inode = inode_ref(local->inode);
- priv = this->private;
+ new_local->call_count = call_count;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (pending[i])
+ continue;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ STACK_WIND_COOKIE(new_frame, afr_mark_new_entry_changelog_cbk,
+ (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->xattrop, &new_local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL);
+ if (!--call_count)
+ break;
+ }
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ new_frame = NULL;
+out:
+ if (new_frame)
+ AFR_STACK_DESTROY(new_frame);
+ if (xattr)
+ dict_unref(xattr);
+ return;
+}
- transaction_frame->local = local;
+void
+afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int pre_op_count = 0;
+ int failed_count = 0;
+ unsigned char *success_replies = NULL;
- loc_copy (&local->loc, loc);
+ local = frame->local;
+ priv = this->private;
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ if (local->op_ret < 0)
+ return;
- local->cont.create.flags = flags;
- local->cont.create.mode = mode;
- local->cont.create.fd = fd_ref (fd);
- if (params)
- local->cont.create.params = dict_ref (params);
+ if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD &&
+ local->op != GF_FOP_MKDIR)
+ return;
- if (loc->parent)
- local->cont.create.parent_ino = loc->parent->ino;
+ pre_op_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+ failed_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
- local->transaction.fop = afr_create_wind;
- local->transaction.done = afr_create_done;
- local->transaction.unwind = afr_create_unwind;
+ /* FOP succeeded on all bricks. */
+ if (pre_op_count == priv->child_count && !failed_count)
+ return;
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
+ /* FOP did not suceed on quorum no. of bricks. */
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+ if (!afr_has_quorum(success_replies, this, NULL))
+ return;
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
+ if (priv->thin_arbiter_count) {
+ /*Mark new entry using ta file*/
+ local->is_new_entry = _gf_true;
+ return;
+ }
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ afr_mark_new_entry_changelog(frame, this);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (create, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
-
- return 0;
+ return;
}
-/* }}} */
-
-/* {{{ mknod */
+/* {{{ create */
int
-afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
+afr_create_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.mknod.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.mknod.read_child_buf;
- } else {
- unwind_buf = &local->cont.mknod.buf;
- }
-
- unwind_buf->ia_ino = local->cont.mknod.ino;
-
- local->cont.mknod.preparent.ia_ino = local->cont.mknod.parent_ino;
- local->cont.mknod.postparent.ia_ino = local->cont.mknod.parent_ino;
-
- AFR_STACK_UNWIND (mknod, main_frame,
- local->op_ret, local->op_errno,
- local->cont.mknod.inode,
- unwind_buf, &local->cont.mknod.preparent,
- &local->cont.mknod.postparent);
- }
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- return 0;
-}
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+
+ if (!main_frame)
+ return 0;
+
+ AFR_STACK_UNWIND(create, main_frame, local->op_ret, local->op_errno,
+ local->cont.create.fd, local->inode,
+ &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
+afr_create_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0){
- local->cont.mknod.buf = *buf;
- local->cont.mknod.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->first_up_child) {
- local->cont.mknod.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- local->first_up_child);
- }
-
- if (child_index == local->read_child_index) {
- local->cont.mknod.read_child_buf = *buf;
- local->cont.mknod.preparent = *preparent;
- local->cont.mknod.postparent = *postparent;
- }
-
- local->cont.mknod.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
-int32_t
-afr_mknod_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_create_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mknod,
- &local->loc, local->cont.mknod.mode,
- local->cont.mknod.dev,
- local->cont.mknod.params);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_create_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->create, &local->loc,
+ local->cont.create.flags, local->cont.create.mode,
+ local->umask, local->cont.create.fd, local->xdata_req);
+ return 0;
}
-
int
-afr_mknod_done (call_frame_t *frame, xlator_t *this)
+afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
- AFR_STACK_DESTROY (frame);
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+
+ local->fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!local->fd_ctx)
+ goto out;
+
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->op = GF_FOP_CREATE;
+ local->cont.create.flags = flags;
+ local->fd_ctx->flags = flags;
+ local->cont.create.mode = mode;
+ local->cont.create.fd = fd_ref(fd);
+ local->umask = umask;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_create_wind;
+ local->transaction.unwind = afr_create_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
+/* }}} */
+
+/* {{{ mknod */
int
-afr_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t dev, dict_t *params)
+afr_mknod_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
-
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = frame->local;
- transaction_frame->local = local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
-
- local->cont.mknod.mode = mode;
- local->cont.mknod.dev = dev;
- if (params)
- local->cont.mknod.params = dict_ref (params);
-
- if (loc->parent)
- local->cont.mknod.parent_ino = loc->parent->ino;
-
- local->transaction.fop = afr_mknod_wind;
- local->transaction.done = afr_mknod_done;
- local->transaction.unwind = afr_mknod_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
+ AFR_STACK_UNWIND(mknod, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
+int
+afr_mknod_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
+}
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+int
+afr_mknod_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_mknod_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mknod, &local->loc,
+ local->cont.mknod.mode, local->cont.mknod.dev,
+ local->umask, local->xdata_req);
+ return 0;
+}
- op_ret = 0;
+int
+afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t dev, mode_t umask, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->op = GF_FOP_MKNOD;
+ local->cont.mknod.mode = mode;
+ local->cont.mknod.dev = dev;
+ local->umask = umask;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_mknod_wind;
+ local->transaction.unwind = afr_mknod_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (mknod, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL);
- }
-
- return 0;
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
+
+ AFR_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
/* {{{ mkdir */
-
int
-afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
+afr_mkdir_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.mkdir.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.mkdir.read_child_buf;
- } else {
- unwind_buf = &local->cont.mkdir.buf;
- }
-
- unwind_buf->ia_ino = local->cont.mkdir.ino;
-
- local->cont.mkdir.preparent.ia_ino = local->cont.mkdir.parent_ino;
- local->cont.mkdir.postparent.ia_ino = local->cont.mkdir.parent_ino;
-
- AFR_STACK_UNWIND (mkdir, main_frame,
- local->op_ret, local->op_errno,
- local->cont.mkdir.inode,
- unwind_buf, &local->cont.mkdir.preparent,
- &local->cont.mkdir.postparent);
- }
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- return 0;
-}
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
-int
-afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0) {
- local->cont.mkdir.buf = *buf;
-
- local->cont.mkdir.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->first_up_child) {
- local->cont.mkdir.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- local->first_up_child);
- }
-
- if (child_index == local->read_child_index) {
- local->cont.mkdir.read_child_buf = *buf;
- local->cont.mkdir.preparent = *preparent;
- local->cont.mkdir.postparent = *postparent;
- }
-
- local->cont.mkdir.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND(mkdir, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
-
int
-afr_mkdir_wind (call_frame_t *frame, xlator_t *this)
+afr_mkdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mkdir,
- &local->loc, local->cont.mkdir.mode,
- local->cont.mkdir.params);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_mkdir_done (call_frame_t *frame, xlator_t *this)
+afr_mkdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
+ priv = this->private;
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ STACK_WIND_COOKIE(frame, afr_mkdir_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mkdir, &local->loc,
+ local->cont.mkdir.mode, local->umask, local->xdata_req);
+ return 0;
}
-
int
-afr_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dict_t *params)
+afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
-
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
-
- local->cont.mkdir.mode = mode;
- if (params)
- local->cont.mkdir.params = dict_ref (params);
-
- if (loc->parent)
- local->cont.mkdir.parent_ino = loc->parent->ino;
-
- local->transaction.fop = afr_mkdir_wind;
- local->transaction.done = afr_mkdir_done;
- local->transaction.unwind = afr_mkdir_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->cont.mkdir.mode = mode;
+ local->umask = umask;
+
+ if (!xdata || !dict_get_sizen(xdata, "gfid-req")) {
+ op_errno = EPERM;
+ gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_GFID_NULL,
+ "mkdir: %s is received "
+ "without gfid-req %p",
+ loc->path, xdata);
+ goto out;
+ }
+
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ if (!local->xdata_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->op = GF_FOP_MKDIR;
+ local->transaction.wind = afr_mkdir_wind;
+ local->transaction.unwind = afr_mkdir_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
-
- AFR_STACK_UNWIND (mkdir, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
/* {{{ link */
-
-int
-afr_link_unwind (call_frame_t *frame, xlator_t *this)
-{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.link.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.link.read_child_buf;
- } else {
- unwind_buf = &local->cont.link.buf;
- }
-
- unwind_buf->ia_ino = local->cont.link.ino;
-
- local->cont.link.preparent.ia_ino = local->cont.link.parent_ino;
- local->cont.link.postparent.ia_ino = local->cont.link.parent_ino;
-
- AFR_STACK_UNWIND (link, main_frame,
- local->op_ret, local->op_errno,
- local->cont.link.inode,
- unwind_buf, &local->cont.link.preparent,
- &local->cont.link.postparent);
- }
-
- return 0;
-}
-
-
int
-afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
+afr_link_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0) {
- local->cont.link.buf = *buf;
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->read_child_index) {
- local->cont.link.read_child_buf = *buf;
- local->cont.link.preparent = *preparent;
- local->cont.link.postparent = *postparent;
- }
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local->cont.link.inode = inode;
+ local = frame->local;
- local->success_count++;
- }
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND(link, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
-
int
-afr_link_wind (call_frame_t *frame, xlator_t *this)
+afr_link_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->link,
- &local->loc,
- &local->newloc);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_link_done (call_frame_t *frame, xlator_t *this)
+afr_link_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
+ priv = this->private;
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ STACK_WIND_COOKIE(frame, afr_link_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->link, &local->loc,
+ &local->newloc, local->xdata_req);
+ return 0;
}
-
int
-afr_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
+afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- int ret = -1;
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- int op_ret = -1;
- int op_errno = 0;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ loc_copy(&local->loc, oldloc);
+ loc_copy(&local->newloc, newloc);
- priv = this->private;
+ local->inode = inode_ref(oldloc->inode);
+ local->parent = inode_ref(newloc->parent);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ if (!local->xdata_req)
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->op = GF_FOP_LINK;
- transaction_frame->local = local;
+ local->transaction.wind = afr_link_wind;
+ local->transaction.unwind = afr_link_unwind;
- loc_copy (&local->loc, oldloc);
- loc_copy (&local->newloc, newloc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, newloc,
+ &op_errno);
+ if (ret)
+ goto out;
- local->cont.link.ino = oldloc->inode->ino;
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(newloc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- if (oldloc->parent)
- local->cont.link.parent_ino = newloc->parent->ino;
-
- local->transaction.fop = afr_link_wind;
- local->transaction.done = afr_link_done;
- local->transaction.unwind = afr_link_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (oldloc->path);
- local->transaction.new_basename = AFR_BASENAME (newloc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (link, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL);
- }
-
- return 0;
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
+
+ AFR_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
/* {{{ symlink */
-
int
-afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
+afr_symlink_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.symlink.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.symlink.read_child_buf;
- } else {
- unwind_buf = &local->cont.symlink.buf;
- }
-
- unwind_buf->ia_ino = local->cont.symlink.ino;
-
- local->cont.symlink.preparent.ia_ino = local->cont.symlink.parent_ino;
- local->cont.symlink.postparent.ia_ino = local->cont.symlink.parent_ino;
-
- AFR_STACK_UNWIND (symlink, main_frame,
- local->op_ret, local->op_errno,
- local->cont.symlink.inode,
- unwind_buf, &local->cont.symlink.preparent,
- &local->cont.symlink.postparent);
- }
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- return 0;
-}
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
-int
-afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0) {
- local->cont.symlink.buf = *buf;
- local->cont.symlink.ino =
- afr_itransform (buf->ia_ino, priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->first_up_child) {
- local->cont.symlink.ino =
- afr_itransform (buf->ia_ino,
- priv->child_count,
- local->first_up_child);
- }
-
- if (child_index == local->read_child_index) {
- local->cont.symlink.read_child_buf = *buf;
- local->cont.symlink.preparent = *preparent;
- local->cont.symlink.postparent = *postparent;
- }
-
- local->cont.symlink.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND(symlink, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
-
int
-afr_symlink_wind (call_frame_t *frame, xlator_t *this)
+afr_symlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->symlink,
- local->cont.symlink.linkpath,
- &local->loc,
- local->cont.symlink.params);
-
- if (!--call_count)
- break;
-
- }
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_symlink_done (call_frame_t *frame, xlator_t *this)
+afr_symlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_symlink_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->symlink,
+ local->cont.symlink.linkpath, &local->loc, local->umask,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *loc, dict_t *params)
+afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
-
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
-
- local->cont.symlink.linkpath = gf_strdup (linkpath);
- if (params)
- local->cont.symlink.params = dict_ref (params);
-
- if (loc->parent)
- local->cont.symlink.parent_ino = loc->parent->ino;
-
- local->transaction.fop = afr_symlink_wind;
- local->transaction.done = afr_symlink_done;
- local->transaction.unwind = afr_symlink_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->cont.symlink.linkpath = gf_strdup(linkpath);
+ local->umask = umask;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_SYMLINK;
+ local->transaction.wind = afr_symlink_wind;
+ local->transaction.unwind = afr_symlink_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (symlink, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL);
- }
-
- return 0;
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
+
+ AFR_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
@@ -1351,230 +939,118 @@ out:
/* {{{ rename */
int
-afr_rename_unwind (call_frame_t *frame, xlator_t *this)
+afr_rename_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- struct iatt *unwind_buf = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- if (local->cont.rename.read_child_buf.ia_ino) {
- unwind_buf = &local->cont.rename.read_child_buf;
- } else {
- unwind_buf = &local->cont.rename.buf;
- }
-
- unwind_buf->ia_ino = local->cont.rename.ino;
-
- local->cont.rename.preoldparent.ia_ino = local->cont.rename.oldparent_ino;
- local->cont.rename.postoldparent.ia_ino = local->cont.rename.oldparent_ino;
- local->cont.rename.prenewparent.ia_ino = local->cont.rename.newparent_ino;
- local->cont.rename.postnewparent.ia_ino = local->cont.rename.newparent_ino;
-
- AFR_STACK_UNWIND (rename, main_frame,
- local->op_ret, local->op_errno,
- unwind_buf,
- &local->cont.rename.preoldparent,
- &local->cont.rename.postoldparent,
- &local->cont.rename.prenewparent,
- &local->cont.rename.postnewparent);
- }
-
- return 0;
-}
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ local = frame->local;
-int
-afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent)
-{
- afr_local_t * local = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY)
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
-
- if (buf) {
- local->cont.rename.buf = *buf;
- }
-
- local->success_count++;
- }
-
- if (child_index == local->read_child_index) {
- local->cont.rename.read_child_buf = *buf;
-
- local->cont.rename.preoldparent = *preoldparent;
- local->cont.rename.postoldparent = *postoldparent;
- local->cont.rename.prenewparent = *prenewparent;
- local->cont.rename.postnewparent = *postnewparent;
- }
- }
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND(rename, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent,
+ &local->cont.dir_fop.prenewparent,
+ &local->cont.dir_fop.postnewparent, local->xdata_rsp);
+ return 0;
}
-
-int32_t
-afr_rename_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_rename_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_rename_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rename,
- &local->loc,
- &local->newloc);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preoldparent, postoldparent, prenewparent,
+ postnewparent, xdata);
}
-
int
-afr_rename_done (call_frame_t *frame, xlator_t *this)
+afr_rename_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
+ priv = this->private;
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ STACK_WIND_COOKIE(frame, afr_rename_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rename, &local->loc,
+ &local->newloc, local->xdata_req);
+ return 0;
}
-
int
-afr_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
+afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
-
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, oldloc);
- loc_copy (&local->newloc, newloc);
-
- local->read_child_index = afr_read_child (this, oldloc->inode);
-
- local->cont.rename.ino = oldloc->inode->ino;
-
- if (oldloc->parent)
- local->cont.rename.oldparent_ino = oldloc->parent->ino;
- if (newloc->parent)
- local->cont.rename.newparent_ino = newloc->parent->ino;
-
- local->transaction.fop = afr_rename_wind;
- local->transaction.done = afr_rename_done;
- local->transaction.unwind = afr_rename_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
- afr_build_parent_loc (&local->transaction.new_parent_loc, newloc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (oldloc->path);
- local->transaction.new_basename = AFR_BASENAME (newloc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, oldloc);
+ loc_copy(&local->newloc, newloc);
+
+ local->inode = inode_ref(oldloc->inode);
+ local->parent = inode_ref(oldloc->parent);
+ local->parent2 = inode_ref(newloc->parent);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_RENAME;
+ local->transaction.wind = afr_rename_wind;
+ local->transaction.unwind = afr_rename_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, oldloc,
+ &op_errno);
+ if (ret)
+ goto out;
+ ret = afr_build_parent_loc(&local->transaction.new_parent_loc, newloc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(oldloc->path);
+ local->transaction.new_basename = AFR_BASENAME(newloc->path);
+ ret = afr_transaction(transaction_frame, this,
+ AFR_ENTRY_RENAME_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
-
- AFR_STACK_UNWIND (rename, frame, op_ret, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
@@ -1582,412 +1058,205 @@ out:
/* {{{ unlink */
int
-afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
-{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- local->cont.unlink.preparent.ia_ino = local->cont.unlink.parent_ino;
- local->cont.unlink.postparent.ia_ino = local->cont.unlink.parent_ino;
-
- AFR_STACK_UNWIND (unlink, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.unlink.preparent,
- &local->cont.unlink.postparent);
- }
-
- return 0;
-}
-
-
-int
-afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent)
+afr_unlink_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (child_index == local->read_child_index) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.unlink.preparent = *preparent;
- local->cont.unlink.postparent = *postparent;
- }
+ local = frame->local;
- if (child_index == local->read_child_index) {
- local->cont.unlink.preparent = *preparent;
- local->cont.unlink.postparent = *postparent;
- }
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND(unlink, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
-
-int32_t
-afr_unlink_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_unlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->unlink,
- &local->loc);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
-
-int32_t
-afr_unlink_done (call_frame_t *frame, xlator_t *this)
+int
+afr_unlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
+ priv = this->private;
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ STACK_WIND_COOKIE(frame, afr_unlink_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->unlink, &local->loc,
+ local->xflag, local->xdata_req);
+ return 0;
}
-
-int32_t
-afr_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+int
+afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
-
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- if (loc->parent)
- local->cont.unlink.parent_ino = loc->parent->ino;
-
- local->transaction.fop = afr_unlink_wind;
- local->transaction.done = afr_unlink_done;
- local->transaction.unwind = afr_unlink_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->xflag = xflag;
+
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_UNLINK;
+ local->transaction.wind = afr_unlink_wind;
+ local->transaction.unwind = afr_unlink_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (unlink, frame, op_ret, op_errno,
- NULL, NULL);
- }
-
- return 0;
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
+
+ AFR_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
/* {{{ rmdir */
-
-
int
-afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
+afr_rmdir_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- local->cont.rmdir.preparent.ia_ino = local->cont.rmdir.parent_ino;
- local->cont.rmdir.postparent.ia_ino = local->cont.rmdir.parent_ino;
-
- AFR_STACK_UNWIND (rmdir, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.rmdir.preparent,
- &local->cont.rmdir.postparent);
- }
-
- return 0;
-}
-
-
-int
-afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
- int read_child = 0;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.rmdir.preparent = *preparent;
- local->cont.rmdir.postparent = *postparent;
-
- }
-
- if (child_index == read_child) {
- local->cont.rmdir.preparent = *preparent;
- local->cont.rmdir.postparent = *postparent;
- }
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
-
int
-afr_rmdir_wind (call_frame_t *frame, xlator_t *this)
+afr_rmdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rmdir,
- &local->loc, local->cont.rmdir.flags);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_rmdir_done (call_frame_t *frame, xlator_t *this)
+afr_rmdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
+ priv = this->private;
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ STACK_WIND_COOKIE(frame, afr_rmdir_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rmdir, &local->loc,
+ local->cont.rmdir.flags, local->xdata_req);
+ return 0;
}
-
int
-afr_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags)
+afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
-
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- transaction_frame->local = local;
-
- local->cont.rmdir.flags = flags;
- loc_copy (&local->loc, loc);
-
- if (loc->parent)
- local->cont.rmdir.parent_ino = loc->parent->ino;
-
- local->transaction.fop = afr_rmdir_wind;
- local->transaction.done = afr_rmdir_done;
- local->transaction.unwind = afr_rmdir_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->cont.rmdir.flags = flags;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_RMDIR;
+ local->transaction.wind = afr_rmdir_wind;
+ local->transaction.unwind = afr_rmdir_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (rmdir, frame, op_ret, op_errno,
- NULL, NULL);
- }
-
- return 0;
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
+
+ AFR_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
-
diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h
index e589efa3794..1d88c3b9b26 100644
--- a/xlators/cluster/afr/src/afr-dir-write.h
+++ b/xlators/cluster/afr/src/afr-dir-write.h
@@ -1,60 +1,46 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __DIR_WRITE_H__
#define __DIR_WRITE_H__
int32_t
-afr_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd, dict_t *params);
+afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
int32_t
-afr_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t dev, dict_t *params);
+afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t dev, mode_t umask, dict_t *xdata);
int32_t
-afr_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dict_t *params);
+afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata);
int32_t
-afr_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc);
+afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata);
int32_t
-afr_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags);
+afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata);
int32_t
-afr_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc);
+afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
int32_t
-afr_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc);
+afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
int
-afr_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *oldloc, dict_t *params);
-
-int32_t
-afr_setdents (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count);
+afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *oldloc, mode_t umask, dict_t *params);
#endif /* __DIR_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index 3998607672a..c5521704de2 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -1,22 +1,12 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#include <libgen.h>
#include <unistd.h>
@@ -25,863 +15,1880 @@
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-
+#include <glusterfs/glusterfs.h>
#include "afr.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/quota-common-utils.h>
+
+#include "afr-transaction.h"
+#include "afr-messages.h"
-
-/**
- * Common algorithm for inode read calls:
- *
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- * try the next child
- *
- * Applicable to: access, stat, fstat, readlink, getxattr
- */
+/*
+ * Quota size xattrs are not maintained by afr. There is a
+ * possibility that they differ even when both the directory changelog xattrs
+ * suggest everything is fine. So if there is at least one 'source' check among
+ * the sources which has the maximum quota size. Otherwise check among all the
+ * available ones for maximum quota size. This way if there is a source and
+ * stale copies it always votes for the 'source'.
+ * */
+
+int
+afr_handle_quota_size(call_frame_t *frame, xlator_t *this)
+{
+ unsigned char *readable = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0;
+ int ret = 0;
+ quota_meta_t size = {
+ 0,
+ };
+ quota_meta_t max_size = {
+ 0,
+ };
+ int readable_cnt = 0;
+ int read_subvol = -1;
+
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+
+ readable = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(local->inode, this, readable, 0, 0);
+
+ readable_cnt = AFR_COUNT(readable, priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ ret = quota_dict_get_meta(replies[i].xdata, QUOTA_SIZE_KEY,
+ SLEN(QUOTA_SIZE_KEY), &size);
+ if (ret == -1)
+ continue;
+ if (read_subvol == -1)
+ read_subvol = i;
+ if (size.size > max_size.size ||
+ (size.file_count + size.dir_count) >
+ (max_size.file_count + max_size.dir_count))
+ read_subvol = i;
+
+ if (size.size > max_size.size)
+ max_size.size = size.size;
+ if (size.file_count > max_size.file_count)
+ max_size.file_count = size.file_count;
+ if (size.dir_count > max_size.dir_count)
+ max_size.dir_count = size.dir_count;
+ }
+
+ if (max_size.size == 0 && max_size.file_count == 0 &&
+ max_size.dir_count == 0)
+ return read_subvol;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ quota_dict_set_meta(replies[i].xdata, QUOTA_SIZE_KEY, &max_size,
+ IA_IFDIR);
+ }
+
+ return read_subvol;
+}
/* {{{ access */
-int32_t
-afr_access_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+int
+afr_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ local = frame->local;
- priv = this->private;
- children = priv->children;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- local = frame->local;
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- read_child = (long) cookie;
+ AFR_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.access.last_tried;
+ return 0;
+}
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.access.last_tried;
+int
+afr_access_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(access, frame, local->op_ret, local->op_errno, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_access_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->access, &local->loc,
+ local->cont.access.mask, local->xdata_req);
+ return 0;
+}
- if (this_try == read_child) {
- goto retry;
- }
+int
+afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int mask,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = 0;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- unwind = 0;
+ local->op = GF_FOP_ACCESS;
+ loc_copy(&local->loc, loc);
+ local->cont.access.mask = mask;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- STACK_WIND_COOKIE (frame, afr_access_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->access,
- &local->loc, local->cont.access.mask);
- }
+ afr_read_txn(frame, this, loc->inode, afr_access_wind,
+ AFR_METADATA_TRANSACTION);
+ return 0;
out:
- if (unwind) {
- AFR_STACK_UNWIND (access, frame, op_ret, op_errno);
- }
+ AFR_STACK_UNWIND(access, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
+/* }}} */
-int32_t
-afr_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask)
-{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
+/* {{{ stat */
- int32_t read_child = -1;
+int
+afr_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ local = frame->local;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ AFR_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
- children = priv->children;
+ return 0;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+int
+afr_stat_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- read_child = afr_read_child (this, loc->inode);
+ priv = this->private;
+ local = frame->local;
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, 0, 0);
+ return 0;
+ }
- local->cont.access.last_tried = -1;
+ STACK_WIND_COOKIE(
+ frame, afr_stat_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->stat, &local->loc, local->xdata_req);
+ return 0;
+}
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_DEBUG,
- "no child is up");
- goto out;
- }
+int
+afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- local->cont.access.last_tried = call_child;
- }
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- loc_copy (&local->loc, loc);
- local->cont.access.mask = mask;
+ local->op = GF_FOP_STAT;
+ loc_copy(&local->loc, loc);
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- STACK_WIND_COOKIE (frame, afr_access_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->access,
- loc, mask);
+ afr_read_txn(frame, this, loc->inode, afr_stat_wind, AFR_DATA_TRANSACTION);
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (access, frame, op_ret, op_errno);
- }
- return 0;
-}
+ AFR_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
/* }}} */
-/* {{{ stat */
+/* {{{ fstat */
-int32_t
-afr_stat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf)
+int
+afr_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ local = frame->local;
- priv = this->private;
- children = priv->children;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- read_child = (long) cookie;
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- local = frame->local;
+ AFR_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata);
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.stat.last_tried;
-
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.stat.last_tried;
-
- if (this_try == read_child) {
- goto retry;
- }
-
- unwind = 0;
+ return 0;
+}
- STACK_WIND_COOKIE (frame, afr_stat_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->stat,
- &local->loc);
- }
+int
+afr_fstat_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
-out:
- if (unwind) {
- if (buf)
- buf->ia_ino = local->cont.stat.ino;
+ priv = this->private;
+ local = frame->local;
- AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf);
- }
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, 0, 0);
+ return 0;
+ }
- return 0;
+ STACK_WIND_COOKIE(
+ frame, afr_fstat_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->fstat, local->fd, local->xdata_req);
+ return 0;
}
-
int32_t
-afr_stat (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- int32_t read_child = -1;
- int call_child = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ local->op = GF_FOP_FSTAT;
+ local->fd = fd_ref(fd);
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_fix_open(fd, this);
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ afr_read_txn(frame, this, fd->inode, afr_fstat_wind, AFR_DATA_TRANSACTION);
- children = priv->children;
+ return 0;
+out:
+ AFR_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ return 0;
+}
- frame->local = local;
+/* }}} */
- read_child = afr_read_child (this, loc->inode);
+/* {{{ readlink */
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+int
+afr_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, const char *buf,
+ struct iatt *sbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- local->cont.stat.last_tried = -1;
+ local = frame->local;
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_DEBUG,
- "no child is up");
- goto out;
- }
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- local->cont.stat.last_tried = call_child;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- loc_copy (&local->loc, loc);
+ AFR_STACK_UNWIND(readlink, frame, op_ret, op_errno, buf, sbuf, xdata);
+ return 0;
+}
- local->cont.stat.ino = loc->inode->ino;
+int
+afr_readlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(readlink, frame, local->op_ret, local->op_errno, 0, 0,
+ 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_readlink_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readlink, &local->loc,
+ local->cont.readlink.size, local->xdata_req);
+ return 0;
+}
- STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->stat,
- loc);
+int
+afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, NULL);
- }
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- return 0;
-}
+ local->op = GF_FOP_READLINK;
+ loc_copy(&local->loc, loc);
+ local->cont.readlink.size = size;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+ afr_read_txn(frame, this, loc->inode, afr_readlink_wind,
+ AFR_DATA_TRANSACTION);
-/* }}} */
+ return 0;
+out:
+ AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0);
-/* {{{ fstat */
+ return 0;
+}
-int32_t
-afr_fstat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+/* }}} */
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+/* {{{ getxattr */
- priv = this->private;
- children = priv->children;
+struct _xattr_key {
+ char *key;
+ struct list_head list;
+};
- local = frame->local;
+int
+__gather_xattr_keys(dict_t *dict, char *key, data_t *value, void *data)
+{
+ struct list_head *list = data;
+ struct _xattr_key *xkey = NULL;
- read_child = (long) cookie;
+ if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) {
+ xkey = GF_MALLOC(sizeof(*xkey), gf_afr_mt_xattr_key);
+ if (!xkey)
+ return -1;
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.fstat.last_tried;
+ xkey->key = key;
+ INIT_LIST_HEAD(&xkey->list);
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.fstat.last_tried;
+ list_add_tail(&xkey->list, list);
+ }
+ return 0;
+}
- if (this_try == read_child) {
- goto retry;
- }
+void
+afr_filter_xattrs(dict_t *dict)
+{
+ struct list_head keys = {
+ 0,
+ };
+ struct _xattr_key *key = NULL;
+ struct _xattr_key *tmp = NULL;
- unwind = 0;
+ INIT_LIST_HEAD(&keys);
- STACK_WIND_COOKIE (frame, afr_fstat_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->fstat,
- local->fd);
- }
+ dict_foreach(dict, __gather_xattr_keys, (void *)&keys);
-out:
- if (unwind) {
- if (buf)
- buf->ia_ino = local->cont.fstat.ino;
+ list_for_each_entry_safe(key, tmp, &keys, list)
+ {
+ dict_del(dict, key->key);
- AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf);
- }
+ list_del_init(&key->list);
- return 0;
+ GF_FREE(key);
+ }
}
+static gf_boolean_t
+afr_getxattr_ignorable_errnos(int32_t op_errno)
+{
+ if (op_errno == ENODATA || op_errno == ENOTSUP || op_errno == ERANGE ||
+ op_errno == ENAMETOOLONG)
+ return _gf_true;
-int32_t
-afr_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd)
+ return _gf_false;
+}
+int
+afr_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
- int call_child = 0;
- int32_t read_child = -1;
+ local = frame->local;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ if (op_ret < 0 && !afr_getxattr_ignorable_errnos(op_errno)) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ if (dict)
+ afr_filter_xattrs(dict);
- children = priv->children;
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ return 0;
+}
- frame->local = local;
+int
+afr_getxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_getxattr_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->getxattr, &local->loc,
+ local->cont.getxattr.name, local->xdata_req);
+ return 0;
+}
- VALIDATE_OR_GOTO (fd->inode, out);
+int32_t
+afr_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict,
+ dict_t *xdata)
- read_child = afr_read_child (this, fd->inode);
+{
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+int32_t
+afr_fgetxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ dict_t *xattr = NULL;
+ char *tmp_report = NULL;
+ char lk_summary[1024] = {
+ 0,
+ };
+ int serz_len = 0;
+ int32_t callcnt = 0;
+ long int cky = 0;
+ int ret = 0;
+ int keylen = 0;
+ int children_keylen = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+ cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ children_keylen = strlen(children[cky]->name);
+
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == -1)
+ local->replies[cky].op_errno = op_errno;
+
+ if (!local->dict)
+ local->dict = dict_new();
+ if (local->dict) {
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen,
+ &tmp_report);
+ if (ret)
+ goto unlock;
+ ret = dict_set_dynstrn(local->dict, children[cky]->name,
+ children_keylen, gf_strdup(tmp_report));
+ if (ret)
+ goto unlock;
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (!callcnt) {
+ xattr = dict_new();
+ if (!xattr) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ ret = dict_serialize_value_with_delim(local->dict, lk_summary,
+ &serz_len, '\n');
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ if (serz_len == -1)
+ snprintf(lk_summary, sizeof(lk_summary), "No locks cleared.");
+ ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen,
+ gf_strdup(lk_summary));
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED,
+ "Error setting dictionary");
+ goto unwind;
+ }
- local->cont.fstat.last_tried = -1;
- } else {
- call_child = afr_first_up_child (priv);
+ op_errno = afr_final_errno(local, priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_DEBUG,
- "no child is up");
- goto out;
- }
+ unwind:
+ AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata);
+ if (xattr)
+ dict_unref(xattr);
+ }
- local->cont.fstat.last_tried = call_child;
+ return ret;
+}
+
+int32_t
+afr_getxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ dict_t *xattr = NULL;
+ char *tmp_report = NULL;
+ char lk_summary[1024] = {
+ 0,
+ };
+ int serz_len = 0;
+ int32_t callcnt = 0;
+ long int cky = 0;
+ int ret = 0;
+ int keylen = 0;
+ int children_keylen = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+ cky = (long)cookie;
+
+ keylen = strlen(local->cont.getxattr.name);
+ children_keylen = strlen(children[cky]->name);
+
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == -1)
+ local->replies[cky].op_errno = op_errno;
+
+ if (!local->dict)
+ local->dict = dict_new();
+ if (local->dict) {
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen,
+ &tmp_report);
+ if (ret)
+ goto unlock;
+ ret = dict_set_dynstrn(local->dict, children[cky]->name,
+ children_keylen, gf_strdup(tmp_report));
+ if (ret)
+ goto unlock;
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (!callcnt) {
+ xattr = dict_new();
+ if (!xattr) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ ret = dict_serialize_value_with_delim(local->dict, lk_summary,
+ &serz_len, '\n');
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ if (serz_len == -1)
+ snprintf(lk_summary, sizeof(lk_summary), "No locks cleared.");
+ ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen,
+ gf_strdup(lk_summary));
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED,
+ "Error setting dictionary");
+ goto unwind;
}
- local->cont.fstat.ino = fd->inode->ino;
- local->fd = fd_ref (fd);
+ op_errno = afr_final_errno(local, priv);
- STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->fstat,
- fd);
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, NULL);
- }
+ if (xattr)
+ dict_unref(xattr);
+ }
- return 0;
+ return ret;
}
-/* }}} */
-
-/* {{{ readlink */
+/**
+ * node-uuid cbk uses next child querying mechanism
+ */
+int32_t
+afr_getxattr_node_uuid_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ xlator_t **children = NULL;
+ int unwind = 1;
+ int curr_call_child = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+
+ if (op_ret == -1) { /** query the _next_ child */
+
+ /**
+ * _current_ becomes _next_
+ * If done with all children and yet no success; give up !
+ */
+ curr_call_child = (int)((long)cookie);
+ if (++curr_call_child == priv->child_count)
+ goto unwind;
+
+ gf_msg_debug(this->name, op_errno,
+ "op_ret (-1): Re-querying afr-child (%d/%d)",
+ curr_call_child, priv->child_count);
+
+ unwind = 0;
+ STACK_WIND_COOKIE(
+ frame, afr_getxattr_node_uuid_cbk, (void *)(long)curr_call_child,
+ children[curr_call_child],
+ children[curr_call_child]->fops->getxattr, &local->loc,
+ local->cont.getxattr.name, local->xdata_req);
+ }
+
+unwind:
+ if (unwind)
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+
+ return 0;
+}
+/**
+ * list-node-uuids cbk returns the list of node_uuids for the subvolume.
+ */
int32_t
-afr_readlink_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- const char *buf, struct iatt *sbuf)
+afr_getxattr_list_node_uuids_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr_serz = NULL;
+ long cky = 0;
+ int32_t tlen = 0;
+
+ local = frame->local;
+ priv = this->private;
+ cky = (long)cookie;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+ local->replies[cky].valid = 1;
+ local->replies[cky].op_ret = op_ret;
+ local->replies[cky].op_errno = op_errno;
+
+ if (op_ret < 0)
+ goto unlock;
+
+ local->op_ret = 0;
+
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ local->replies[cky].xattr = dict_ref(dict);
+ }
+
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (!callcnt) {
+ if (local->op_ret != 0) {
+ /* All bricks gave an error. */
+ local->op_errno = afr_final_errno(local, priv);
+ goto unwind;
+ }
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ /*Since we store the UUID0_STR as node uuid for down bricks and
+ *for non zero op_ret, assigning length to priv->child_count
+ *number of uuids*/
+ local->cont.getxattr.xattr_len = (SLEN(UUID0_STR) + 2) *
+ priv->child_count;
+
+ if (!local->dict)
+ local->dict = dict_new();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
- priv = this->private;
- children = priv->children;
+ xattr_serz = GF_CALLOC(local->cont.getxattr.xattr_len, sizeof(char),
+ gf_common_mt_char);
- local = frame->local;
+ if (!xattr_serz) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
- read_child = (long) cookie;
+ ret = afr_serialize_xattrs_with_delimiter(frame, this, xattr_serz,
+ UUID0_STR, &tlen, ' ');
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ GF_FREE(xattr_serz);
+ goto unwind;
+ }
+ ret = dict_set_dynstr_sizen(local->dict, GF_XATTR_LIST_NODE_UUIDS_KEY,
+ xattr_serz);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set node_uuid key in dict");
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
+ } else {
+ local->op_ret = local->cont.getxattr.xattr_len - 1;
+ local->op_errno = 0;
+ }
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.readlink.last_tried;
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ local->dict, local->xdata_rsp);
+ }
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.readlink.last_tried;
+ return ret;
+}
- if (this_try == read_child) {
- goto retry;
- }
+int32_t
+afr_getxattr_quota_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ int idx = (long)cookie;
+ int call_count = 0;
+ afr_local_t *local = frame->local;
+ int read_subvol = -1;
+
+ local->replies[idx].valid = 1;
+ local->replies[idx].op_ret = op_ret;
+ local->replies[idx].op_errno = op_errno;
+ if (dict)
+ local->replies[idx].xdata = dict_ref(dict);
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ local->inode = inode_ref(local->loc.inode);
+ read_subvol = afr_handle_quota_size(frame, this);
+ if (read_subvol != -1) {
+ op_ret = local->replies[read_subvol].op_ret;
+ op_errno = local->replies[read_subvol].op_errno;
+ dict = local->replies[read_subvol].xdata;
+ }
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+ }
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_readlink_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->readlink,
- &local->loc,
- local->cont.readlink.size);
- }
+ return 0;
+}
-out:
- if (unwind) {
- if (sbuf)
- sbuf->ia_ino = local->cont.readlink.ino;
+int32_t
+afr_getxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ int call_cnt = 0, len = 0;
+ char *lockinfo_buf = NULL;
+ dict_t *lockinfo = NULL, *newdict = NULL;
+ afr_local_t *local = NULL;
- AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf);
- }
+ LOCK(&frame->lock);
+ {
+ local = frame->local;
- return 0;
-}
+ call_cnt = --local->call_count;
+ if ((op_ret < 0) || (!dict && !xdata)) {
+ goto unlock;
+ }
-int32_t
-afr_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size)
-{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
+ if (xdata) {
+ if (!local->xdata_rsp) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+
+ if (!dict) {
+ goto unlock;
+ }
+
+ op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY,
+ (void **)&lockinfo_buf, &len);
- int32_t read_child = -1;
+ if (!lockinfo_buf) {
+ goto unlock;
+ }
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (lockinfo_buf != NULL) {
+ lockinfo = dict_new();
+ if (lockinfo == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ } else {
+ op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ if (lockinfo && local->dict) {
+ dict_copy(lockinfo, local->dict);
+ }
+ }
+ }
+
+ if (xdata && local->xdata_rsp) {
+ dict_copy(xdata, local->xdata_rsp);
+ }
+
+ if (!call_cnt) {
+ newdict = dict_new();
+ if (!newdict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ op_ret = dict_allocate_and_serialize(
+ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+ if (op_ret != 0) {
+ local->op_ret = -1;
+ goto unwind;
+ }
- children = priv->children;
+ op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
+ (void *)lockinfo_buf, len);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto unwind;
+ }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, newdict,
+ local->xdata_rsp);
+ }
- frame->local = local;
+ dict_unref(lockinfo);
- read_child = afr_read_child (this, loc->inode);
+ return 0;
+}
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+int32_t
+afr_fgetxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ int call_cnt = 0, len = 0;
+ char *lockinfo_buf = NULL;
+ dict_t *lockinfo = NULL, *newdict = NULL;
+ afr_local_t *local = NULL;
- local->cont.readlink.last_tried = -1;
+ LOCK(&frame->lock);
+ {
+ local = frame->local;
- } else {
- call_child = afr_first_up_child (priv);
+ call_cnt = --local->call_count;
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_DEBUG,
- "no child is up");
- goto out;
+ if ((op_ret < 0) || (!dict && !xdata)) {
+ goto unlock;
+ }
+
+ if (xdata) {
+ if (!local->xdata_rsp) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
}
+ }
+ }
- local->cont.readlink.last_tried = call_child;
+ if (!dict) {
+ goto unlock;
}
- loc_copy (&local->loc, loc);
+ op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY,
+ (void **)&lockinfo_buf, &len);
- local->cont.readlink.size = size;
- local->cont.readlink.ino = loc->inode->ino;
+ if (!lockinfo_buf) {
+ goto unlock;
+ }
- STACK_WIND_COOKIE (frame, afr_readlink_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->readlink,
- loc, size);
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (lockinfo_buf != NULL) {
+ lockinfo = dict_new();
+ if (lockinfo == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ } else {
+ op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, NULL, NULL);
- }
- return 0;
-}
+ if (lockinfo && local->dict) {
+ dict_copy(lockinfo, local->dict);
+ }
+ }
+ }
+
+ if (xdata && local->xdata_rsp) {
+ dict_copy(xdata, local->xdata_rsp);
+ }
+
+ if (!call_cnt) {
+ newdict = dict_new();
+ if (!newdict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
+ op_ret = dict_allocate_and_serialize(
+ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+ if (op_ret != 0) {
+ local->op_ret = -1;
+ goto unwind;
+ }
-/* }}} */
+ op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
+ (void *)lockinfo_buf, len);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto unwind;
+ }
-/* {{{ getxattr */
+ unwind:
+ AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, newdict,
+ local->xdata_rsp);
+ }
-struct _xattr_key {
- char *key;
- struct list_head list;
-};
+ dict_unref(lockinfo);
+ return 0;
+}
-void
-__gather_xattr_keys (dict_t *dict, char *key, data_t *value,
- void *data)
+int32_t
+afr_fgetxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- struct list_head * list = data;
- struct _xattr_key * xkey = NULL;
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr = NULL;
+ char *xattr_serz = NULL;
+ int keylen = 0;
+ char xattr_cky[1024] = {
+ 0,
+ };
+ int xattr_cky_len = 0;
+ dict_t *nxattr = NULL;
+ long cky = 0;
+ int32_t padding = 0;
+ int32_t tlen = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+ cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
+ local->cont.getxattr.name, cky);
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ }
- if (!strncmp (key, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
+ if (!dict || (op_ret < 0))
+ goto unlock;
- xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key);
- if (!xkey)
- return;
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict)
+ goto unlock;
+ }
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr);
+ if (ret)
+ goto unlock;
+
+ xattr = gf_strdup(xattr);
+
+ ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr);
+ if (ret) {
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set xattr cookie key");
+ goto post_unlock;
+ }
- xkey->key = key;
- INIT_LIST_HEAD (&xkey->list);
+ local->cont.getxattr.xattr_len += strlen(xattr) + 1;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!callcnt) {
+ if (!local->cont.getxattr.xattr_len)
+ goto unwind;
+
+ nxattr = dict_new();
+ if (!nxattr)
+ goto unwind;
+
+ /* extra bytes for decorations (brackets and <>'s) */
+ padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4;
+ local->cont.getxattr.xattr_len += (padding + 2);
+
+ xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len,
+ gf_common_mt_char);
+
+ if (!xattr_serz)
+ goto unwind;
+
+ /* the xlator info */
+ int xattr_serz_len = sprintf(
+ xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name);
+
+ /* actual series of pathinfo */
+ ret = dict_serialize_value_with_delim(
+ local->dict, xattr_serz + xattr_serz_len, &tlen, ' ');
+ if (ret) {
+ GF_FREE(xattr_serz);
+ goto unwind;
+ }
- list_add_tail (&xkey->list, list);
+ /* closing part */
+ *(xattr_serz + padding + tlen) = ')';
+ *(xattr_serz + padding + tlen + 1) = '\0';
+
+ ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen,
+ xattr_serz);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set pathinfo key in dict");
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
}
-}
+ unwind:
+ AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno,
+ nxattr, local->xdata_rsp);
-void
-__filter_xattrs (dict_t *dict)
-{
- struct list_head keys;
+ if (nxattr)
+ dict_unref(nxattr);
+ }
- struct _xattr_key *key;
- struct _xattr_key *tmp;
+out:
+ return ret;
+}
- INIT_LIST_HEAD (&keys);
+int32_t
+afr_getxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr = NULL;
+ char *xattr_serz = NULL;
+ char xattr_cky[1024] = {
+ 0,
+ };
+ int keylen = 0;
+ int xattr_cky_len = 0;
+ dict_t *nxattr = NULL;
+ long cky = 0;
+ int32_t padding = 0;
+ int32_t tlen = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+ cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
+ local->cont.getxattr.name, cky);
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ }
- dict_foreach (dict, __gather_xattr_keys,
- (void *) &keys);
+ if (!dict || (op_ret < 0))
+ goto unlock;
- list_for_each_entry_safe (key, tmp, &keys, list) {
- dict_del (dict, key->key);
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict)
+ goto unlock;
+ }
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr);
+ if (ret)
+ goto unlock;
+
+ xattr = gf_strdup(xattr);
+
+ ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr);
+ if (ret) {
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set xattr cookie key");
+ goto post_unlock;
+ }
- list_del_init (&key->list);
+ local->cont.getxattr.xattr_len += strlen(xattr) + 1;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!callcnt) {
+ if (!local->cont.getxattr.xattr_len)
+ goto unwind;
+
+ nxattr = dict_new();
+ if (!nxattr)
+ goto unwind;
+
+ /* extra bytes for decorations (brackets and <>'s) */
+ padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4;
+ local->cont.getxattr.xattr_len += (padding + 2);
+
+ xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len,
+ gf_common_mt_char);
+
+ if (!xattr_serz)
+ goto unwind;
+
+ /* the xlator info */
+ int xattr_serz_len = sprintf(
+ xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name);
+
+ /* actual series of pathinfo */
+ ret = dict_serialize_value_with_delim(
+ local->dict, xattr_serz + xattr_serz_len, &tlen, ' ');
+ if (ret) {
+ GF_FREE(xattr_serz);
+ goto unwind;
+ }
- GF_FREE (key);
+ /* closing part */
+ *(xattr_serz + padding + tlen) = ')';
+ *(xattr_serz + padding + tlen + 1) = '\0';
+
+ ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen,
+ xattr_serz);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set pathinfo key in dict");
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
}
+
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ nxattr, local->xdata_rsp);
+
+ if (nxattr)
+ dict_unref(nxattr);
+ }
+
+out:
+ return ret;
}
+static int
+afr_aggregate_stime_xattr(dict_t *this, char *key, data_t *value, void *data)
+{
+ int ret = 0;
+
+ if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
+ ret = gf_get_max_stime(THIS, data, key, value);
+ return ret;
+}
int32_t
-afr_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict)
+afr_common_getxattr_stime_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ if (!frame || !frame->local || !this) {
+ gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+ goto out;
+ }
- priv = this->private;
- children = priv->children;
+ local = frame->local;
- local = frame->local;
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.getxattr.last_tried;
+ if (!dict || (op_ret < 0)) {
+ local->op_errno = op_errno;
+ goto cleanup;
+ }
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.getxattr.last_tried;
+ if (!local->dict)
+ local->dict = dict_copy_with_ref(dict, NULL);
+ else
+ dict_foreach(dict, afr_aggregate_stime_xattr, local->dict);
+ local->op_ret = 0;
+ }
- if (this_try == read_child) {
- goto retry;
- }
+cleanup:
+ UNLOCK(&frame->lock);
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->getxattr,
- &local->loc,
- local->cont.getxattr.name);
- }
+ if (!callcnt) {
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ local->dict, xdata);
+ }
out:
- if (unwind) {
- if (op_ret >= 0 && dict)
- __filter_xattrs (dict);
+ return 0;
+}
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict);
- }
+static gf_boolean_t
+afr_is_special_xattr(const char *name, fop_getxattr_cbk_t *cbk,
+ gf_boolean_t is_fgetxattr)
+{
+ gf_boolean_t is_spl = _gf_true;
+
+ GF_ASSERT(cbk);
+ if (!cbk || !name) {
+ is_spl = _gf_false;
+ goto out;
+ }
+
+ if (!strcmp(name, GF_XATTR_PATHINFO_KEY) ||
+ !strcmp(name, GF_XATTR_USER_PATHINFO_KEY)) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_pathinfo_cbk;
+ } else {
+ *cbk = afr_getxattr_pathinfo_cbk;
+ }
+ } else if (!strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD))) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_clrlk_cbk;
+ } else {
+ *cbk = afr_getxattr_clrlk_cbk;
+ }
+ } else if (!strncmp(name, GF_XATTR_LOCKINFO_KEY,
+ SLEN(GF_XATTR_LOCKINFO_KEY))) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_lockinfo_cbk;
+ } else {
+ *cbk = afr_getxattr_lockinfo_cbk;
+ }
+ } else if (fnmatch(GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) {
+ *cbk = afr_common_getxattr_stime_cbk;
+ } else if (strcmp(name, QUOTA_SIZE_KEY) == 0) {
+ *cbk = afr_getxattr_quota_size_cbk;
+ } else if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) {
+ *cbk = afr_getxattr_list_node_uuids_cbk;
+ } else {
+ is_spl = _gf_false;
+ }
- return 0;
+out:
+ return is_spl;
}
-int32_t
-afr_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
+static void
+afr_getxattr_all_subvols(xlator_t *this, call_frame_t *frame, const char *name,
+ loc_t *loc, fop_getxattr_cbk_t cbk)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t * local = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ priv = this->private;
+
+ local = frame->local;
+ // local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+ if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) {
+ GF_FREE(local->cont.getxattr.name);
+ local->cont.getxattr.name = gf_strdup(GF_XATTR_NODE_UUID_KEY);
+ }
+
+ // If up-children count is 0, afr_local_init would have failed already
+ // and the call would have unwound so not handling it here.
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->getxattr, loc,
+ local->cont.getxattr.name, NULL);
+ if (!--call_count)
+ break;
+ }
+ }
+ return;
+}
- int read_child = -1;
+int
+afr_marker_populate_args(call_frame_t *frame, int type, int *gauge,
+ xlator_t **subvols)
+{
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ memcpy(subvols, priv->children, sizeof(*subvols) * priv->child_count);
+ if (type == MARKER_XTIME_TYPE) {
+ /*Don't error out on ENOENT/ENOTCONN */
+ gauge[MCNT_NOTFOUND] = 0;
+ gauge[MCNT_ENOTCONN] = 0;
+ }
+ return priv->child_count;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+static int
+afr_handle_heal_xattrs(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *heal_op)
+{
+ int ret = -1;
+ afr_spb_status_t *data = NULL;
+
+ if (!strcmp(heal_op, GF_HEAL_INFO)) {
+ afr_get_heal_info(frame, this, loc);
+ ret = 0;
+ goto out;
+ }
+
+ if (!strcmp(heal_op, GF_AFR_HEAL_SBRAIN)) {
+ afr_heal_splitbrain_file(frame, this, loc);
+ ret = 0;
+ goto out;
+ }
+
+ if (!strcmp(heal_op, GF_AFR_SBRAIN_STATUS)) {
+ data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spb_status_t);
+ if (!data) {
+ ret = 1;
+ goto out;
+ }
+ data->frame = frame;
+ data->loc = loc;
+ ret = synctask_new(this->ctx->env, afr_get_split_brain_status,
+ afr_get_split_brain_status_cbk, NULL, data);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ "Failed to create"
+ " synctask. Unable to fetch split-brain status"
+ " for %s.",
+ loc->name);
+ ret = 1;
+ goto out;
+ }
+ goto out;
+ }
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+out:
+ if (ret == 1) {
+ AFR_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL);
+ if (data)
+ GF_FREE(data);
+ ret = 0;
+ }
+ return ret;
+}
- children = priv->children;
+int32_t
+afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ xlator_t **children = NULL;
+ int i = 0;
+ int32_t op_errno = 0;
+ int ret = -1;
+ fop_getxattr_cbk_t cbk = NULL;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ priv = this->private;
+
+ children = priv->children;
+
+ loc_copy(&local->loc, loc);
+
+ local->op = GF_FOP_GETXATTR;
+
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ if (!name)
+ goto no_name;
+
+ local->cont.getxattr.name = gf_strdup(name);
+
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (!strncmp(name, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) {
+ op_errno = ENODATA;
+ goto out;
+ }
+
+ if (cluster_handle_marker_getxattr(frame, loc, name, priv->vol_uuid,
+ afr_getxattr_unwind,
+ afr_marker_populate_args) == 0)
+ return 0;
+
+ ret = afr_handle_heal_xattrs(frame, this, &local->loc, name);
+ if (ret == 0)
+ return 0;
+
+ /*
+ * Heal daemons don't have IO threads ... and as a result they
+ * send this getxattr down and eventually crash :(
+ */
+ op_errno = -1;
+ GF_CHECK_XATTR_KEY_AND_GOTO(name, IO_THREADS_QUEUE_SIZE_KEY, op_errno, out);
+
+ /*
+ * Special xattrs which need responses from all subvols
+ */
+ if (afr_is_special_xattr(name, &cbk, 0)) {
+ afr_getxattr_all_subvols(this, frame, name, loc, cbk);
+ return 0;
+ }
+
+ if (XATTR_IS_NODE_UUID(name)) {
+ i = 0;
+ STACK_WIND_COOKIE(frame, afr_getxattr_node_uuid_cbk, (void *)(long)i,
+ children[i], children[i]->fops->getxattr, loc, name,
+ xdata);
+ return 0;
+ }
+
+no_name:
+
+ afr_read_txn(frame, this, local->loc.inode, afr_getxattr_wind,
+ AFR_METADATA_TRANSACTION);
+
+ ret = 0;
+out:
+ if (ret < 0)
+ AFR_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
- frame->local = local;
+/* {{{ fgetxattr */
- if (name) {
- if (!strncmp (name, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
+int32_t
+afr_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- op_errno = ENODATA;
- goto out;
- }
+ local = frame->local;
- }
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- read_child = afr_read_child (this, loc->inode);
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+ if (dict)
+ afr_filter_xattrs(dict);
- local->cont.getxattr.last_tried = -1;
- } else {
- call_child = afr_first_up_child (priv);
+ AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_DEBUG,
- "no child is up");
- goto out;
- }
+ return 0;
+}
- local->cont.getxattr.last_tried = call_child;
- }
+int
+afr_fgetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_fgetxattr_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fgetxattr, local->fd,
+ local->cont.getxattr.name, local->xdata_req);
+ return 0;
+}
- loc_copy (&local->loc, loc);
- if (name)
- local->cont.getxattr.name = gf_strdup (name);
+static void
+afr_fgetxattr_all_subvols(xlator_t *this, call_frame_t *frame,
+ fop_fgetxattr_cbk_t cbk)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ priv = this->private;
+
+ local = frame->local;
+ // local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+ // If up-children count is 0, afr_local_init would have failed already
+ // and the call would have unwound so not handling it here.
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->fgetxattr, local->fd,
+ local->cont.getxattr.name, NULL);
+ if (!--call_count)
+ break;
+ }
+ }
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->getxattr,
- loc, name);
+ return;
+}
- op_ret = 0;
+int
+afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ fop_fgetxattr_cbk_t cbk = NULL;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_FGETXATTR;
+ local->fd = fd_ref(fd);
+ if (name) {
+ local->cont.getxattr.name = gf_strdup(name);
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ }
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ /* pathinfo gets handled only in getxattr(), but we need to handle
+ * lockinfo.
+ * If we are doing fgetxattr with lockinfo as the key then we
+ * collect information from all children.
+ */
+ if (afr_is_special_xattr(name, &cbk, 1)) {
+ afr_fgetxattr_all_subvols(this, frame, cbk);
+ return 0;
+ }
+
+ afr_fix_open(fd, this);
+
+ afr_read_txn(frame, this, fd->inode, afr_fgetxattr_wind,
+ AFR_METADATA_TRANSACTION);
+
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL);
- }
- return 0;
-}
+ AFR_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
/* }}} */
/* {{{ readv */
-/**
- * read algorithm:
- *
- * if the user has specified a read subvolume, use it
- * otherwise -
- * use the inode number to hash it to one of the subvolumes, and
- * read from there (to balance read load)
- *
- * if any of the above read's fail, try the children in sequence
- * beginning at the beginning
- */
-
-int32_t
-afr_readv_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct iatt *buf,
- struct iobref *iobref)
+int
+afr_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iovec *vector, int32_t count,
+ struct iatt *buf, struct iobref *iobref, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
-
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ afr_local_t *local = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = frame->local;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- children = priv->children;
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- local = frame->local;
+ AFR_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf, iobref,
+ xdata);
+ return 0;
+}
- read_child = (long) cookie;
+int
+afr_readv_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, 0, 0, 0,
+ 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(
+ frame, afr_readv_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->readv, local->fd, local->cont.readv.size,
+ local->cont.readv.offset, local->cont.readv.flags, local->xdata_req);
+ return 0;
+}
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.readv.last_tried;
+int
+afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.readv.last_tried;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- if (this_try == read_child) {
- /*
- skip the read child since if we are here
- we must have already tried that child
- */
- goto retry;
- }
+ local->op = GF_FOP_READ;
+ local->fd = fd_ref(fd);
+ local->cont.readv.size = size;
+ local->cont.readv.offset = offset;
+ local->cont.readv.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- unwind = 0;
+ afr_fix_open(fd, this);
- STACK_WIND_COOKIE (frame, afr_readv_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->readv,
- local->fd, local->cont.readv.size,
- local->cont.readv.offset);
- }
+ afr_read_txn(frame, this, fd->inode, afr_readv_wind, AFR_DATA_TRANSACTION);
+ return 0;
out:
- if (unwind) {
- if (buf && local)
- buf->ia_ino = local->cont.readv.ino;
-
- AFR_STACK_UNWIND (readv, frame, op_ret, op_errno,
- vector, count, buf, iobref);
- }
+ AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
- return 0;
+ return 0;
}
+/* }}} */
-int32_t
-afr_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset)
+/* {{{ seek */
+
+int
+afr_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, off_t offset, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
- int32_t read_child = -1;
- int call_child = 0;
+ local = frame->local;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (fd, out);
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- priv = this->private;
- children = priv->children;
+ AFR_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata);
+ return 0;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+int
+afr_seek_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(seek, frame, local->op_ret, local->op_errno, 0, NULL);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(
+ frame, afr_seek_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->seek, local->fd, local->cont.seek.offset,
+ local->cont.seek.what, local->xdata_req);
+ return 0;
+}
- frame->local = local;
+int
+afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
- read_child = afr_read_child (this, fd->inode);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- if ((read_child >= 0) && (priv->child_up[read_child])) {
- call_child = read_child;
+ local->op = GF_FOP_SEEK;
+ local->fd = fd_ref(fd);
+ local->cont.seek.offset = offset;
+ local->cont.seek.what = what;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- /*
- if read fails from the read child, we try
- all children starting with the first one
- */
- local->cont.readv.last_tried = -1;
+ afr_fix_open(fd, this);
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_DEBUG,
- "no child is up");
- goto out;
- }
-
- local->cont.readv.last_tried = call_child;
- }
-
- local->fd = fd_ref (fd);
-
- local->cont.readv.ino = fd->inode->ino;
- local->cont.readv.size = size;
- local->cont.readv.offset = offset;
-
- STACK_WIND_COOKIE (frame, afr_readv_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readv,
- fd, size, offset);
-
- op_ret = 0;
+ afr_read_txn(frame, this, fd->inode, afr_seek_wind, AFR_DATA_TRANSACTION);
+
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, NULL, 0, NULL,
- NULL);
- }
- return 0;
-}
+ AFR_STACK_UNWIND(seek, frame, -1, op_errno, 0, NULL);
+ return 0;
+}
/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h
index acc814fb7dc..8c982bc7e6f 100644
--- a/xlators/cluster/afr/src/afr-inode-read.h
+++ b/xlators/cluster/afr/src/afr-inode-read.h
@@ -1,47 +1,45 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __INODE_READ_H__
#define __INODE_READ_H__
int32_t
-afr_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask);
+afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata);
int32_t
-afr_stat (call_frame_t *frame, xlator_t *this,
- loc_t *loc);
+afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
int32_t
-afr_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd);
+afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
int32_t
-afr_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size);
+afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata);
int32_t
-afr_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset);
+afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
int32_t
-afr_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name);
+afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+ dict_t *xdata);
+int32_t
+afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata);
+
+int
+afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata);
+int
+afr_handle_quota_size(call_frame_t *frame, xlator_t *this);
#endif /* __INODE_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 050a4f0e9ef..1d6e4f3570a 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -1,1594 +1,2565 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
-#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-
+#include <glusterfs/glusterfs.h>
#include "afr.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include "protocol-common.h"
+#include <glusterfs/byte-order.h>
#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-messages.h"
-/* {{{ writev */
+static void
+__afr_inode_write_finalize(call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int ret = 0;
+ int read_subvol = 0;
+ struct iatt *stbuf = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_read_subvol_args_t args = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, local->inode, out);
+
+ /*This code needs to stay till DHT sends fops on linked
+ * inodes*/
+ if (!inode_is_linked(local->inode)) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == -1)
+ continue;
+ if (!gf_uuid_is_null(local->replies[i].poststat.ia_gfid)) {
+ gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid);
+ args.ia_type = local->replies[i].poststat.ia_type;
+ break;
+ } else {
+ ret = dict_get_bin(local->replies[i].xdata,
+ DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+ if (ret)
+ continue;
+ gf_uuid_copy(args.gfid, stbuf->ia_gfid);
+ args.ia_type = stbuf->ia_type;
+ break;
+ }
+ }
+ }
+
+ if (local->transaction.type == AFR_METADATA_TRANSACTION) {
+ read_subvol = afr_metadata_subvol_get(local->inode, this, NULL,
+ local->readable, NULL, &args);
+ } else {
+ read_subvol = afr_data_subvol_get(local->inode, this, NULL,
+ local->readable, NULL, &args);
+ }
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ afr_pick_error_xdata(local, priv, local->inode, local->readable, NULL,
+ NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0)
+ continue;
+
+ /* Order of checks in the compound conditional
+ below is important.
+
+ - Highest precedence: largest op_ret
+ - Next precedence: if all op_rets are equal, read subvol
+ - Least precedence: any succeeded subvol
+ */
+ if ((local->op_ret < local->replies[i].op_ret) ||
+ ((local->op_ret == local->replies[i].op_ret) &&
+ (i == read_subvol))) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.inode_wfop.prebuf = local->replies[i].prestat;
+ local->cont.inode_wfop.postbuf = local->replies[i].poststat;
+
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ if (local->replies[i].xattr) {
+ if (local->xattr_rsp)
+ dict_unref(local->xattr_rsp);
+ local->xattr_rsp = dict_ref(local->replies[i].xattr);
+ }
+ }
+ }
+
+ afr_set_in_flight_sb_status(this, frame, local->inode);
+out:
+ return;
+}
-int
-afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+static void
+__afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xattr, dict_t *xdata)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ local->replies[child_index].valid = 1;
+
+ if (AFR_IS_ARBITER_BRICK(priv, child_index) && op_ret == 1)
+ op_ret = iov_length(local->cont.writev.vector,
+ local->cont.writev.count);
+
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+
+ if (op_ret >= 0) {
+ if (prebuf)
+ local->replies[child_index].prestat = *prebuf;
+ if (postbuf)
+ local->replies[child_index].poststat = *postbuf;
+ if (xattr)
+ local->replies[child_index].xattr = dict_ref(xattr);
+ } else {
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
+
+ return;
+}
- local = frame->local;
+static int
+__afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+ afr_private_t *priv = NULL;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ priv = this->private;
+ local = frame->local;
- if (main_frame) {
- local->cont.writev.prebuf.ia_ino = local->cont.writev.ino;
- local->cont.writev.postbuf.ia_ino = local->cont.writev.ino;
+ LOCK(&frame->lock);
+ {
+ __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno,
+ prebuf, postbuf, xattr, xdata);
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- AFR_STACK_UNWIND (writev, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.writev.prebuf,
- &local->cont.writev.postbuf);
+ if (call_count == 0) {
+ __afr_inode_write_finalize(frame, this);
+
+ if (afr_txn_nothing_failed(frame, this)) {
+ /*if it did pre-op, it will do post-op changing ctime*/
+ if (priv->consistent_metadata && afr_needs_changelog_update(local))
+ afr_zero_fill_stat(local);
+ local->transaction.unwind(frame, this);
}
- return 0;
+
+ afr_transaction_resume(frame, this);
+ }
+
+ return 0;
}
+/* {{{ writev */
-int
-afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
+void
+afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame)
{
- afr_local_t * local = NULL;
-
- int child_index = (long) cookie;
- int call_count = -1;
- int read_child = 0;
+ afr_local_t *src_local = NULL;
+ afr_local_t *dst_local = NULL;
+
+ src_local = src_frame->local;
+ dst_local = dst_frame->local;
+
+ dst_local->op_ret = src_local->op_ret;
+ dst_local->op_errno = src_local->op_errno;
+ dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;
+ dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf;
+ if (src_local->xdata_rsp)
+ dst_local->xdata_rsp = dict_ref(src_local->xdata_rsp);
+}
- local = frame->local;
+void
+afr_writev_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
- read_child = afr_read_child (this, local->fd->inode);
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ if (priv->consistent_metadata)
+ afr_zero_fill_stat(local);
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
+ AFR_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+}
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
+int
+afr_transaction_writev_unwind(call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *fop_frame = NULL;
- if (child_index == read_child) {
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
- }
+ fop_frame = afr_transaction_detach_fop_frame(frame);
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
+ if (fop_frame) {
+ afr_writev_copy_outvars(frame, fop_frame);
+ afr_writev_unwind(fop_frame, this);
+ }
+ return 0;
+}
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
+static void
+afr_writev_handle_short_writes(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+ /*
+ * We already have the best case result of the writev calls staged
+ * as the return value. Any writev that returns some value less
+ * than the best case is now out of sync, so mark the fop as
+ * failed. Note that fops that have returned with errors have
+ * already been marked as failed.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!local->replies[i].valid) || (local->replies[i].op_ret == -1))
+ continue;
+
+ if (local->replies[i].op_ret < local->op_ret)
+ afr_transaction_fop_failed(frame, this, i);
+ }
+}
- local->transaction.resume (frame, this);
- }
- return 0;
+void
+afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int ret = 0;
+ afr_local_t *local = frame->local;
+ uint32_t open_fd_count = 0;
+ uint32_t write_is_append = 0;
+ int32_t num_inodelks = 0;
+
+ LOCK(&frame->lock);
+ {
+ __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno,
+ prebuf, postbuf, NULL, xdata);
+ if (op_ret == -1 || !xdata)
+ goto unlock;
+
+ write_is_append = 0;
+ ret = dict_get_uint32(xdata, GLUSTERFS_WRITE_IS_APPEND,
+ &write_is_append);
+ if (ret || !write_is_append)
+ local->append_write = _gf_false;
+
+ ret = dict_get_uint32(xdata, GLUSTERFS_ACTIVE_FD_COUNT, &open_fd_count);
+ if (ret < 0)
+ goto unlock;
+ if (open_fd_count > local->open_fd_count) {
+ local->open_fd_count = open_fd_count;
+ local->update_open_fd_count = _gf_true;
+ }
+
+ ret = dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT,
+ &num_inodelks);
+ if (ret < 0)
+ goto unlock;
+ if (num_inodelks > local->num_inodelks) {
+ local->num_inodelks = num_inodelks;
+ local->update_num_inodelks = _gf_true;
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
}
+void
+afr_process_post_writev(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+
+ local = frame->local;
+
+ if (!local->stable_write && !local->append_write)
+ /* An appended write removes the necessity to
+ fsync() the file. This is because self-heal
+ has the logic to check for larger file when
+ the xattrs are not reliably pointing at
+ a stale file.
+ */
+ afr_fd_report_unstable_write(this, local);
+
+ __afr_inode_write_finalize(frame, this);
+
+ afr_writev_handle_short_writes(frame, this);
+
+ if (local->update_open_fd_count)
+ local->inode_ctx->open_fd_count = local->open_fd_count;
+ if (local->update_num_inodelks &&
+ local->transaction.type == AFR_DATA_TRANSACTION) {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ lock->num_inodelks = local->num_inodelks;
+ }
+}
int
-afr_writev_wind (call_frame_t *frame, xlator_t *this)
+afr_writev_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ call_frame_t *fop_frame = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
- int i = 0;
- int call_count = -1;
+ afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, prebuf,
+ postbuf, xdata);
- local = frame->local;
- priv = this->private;
+ call_count = afr_frame_return(frame);
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ if (call_count == 0) {
+ afr_process_post_writev(frame, this);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->writev,
- local->fd,
- local->cont.writev.vector,
- local->cont.writev.count,
- local->cont.writev.offset,
- local->cont.writev.iobref);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ if (!afr_txn_nothing_failed(frame, this)) {
+ // Don't unwind until post-op is complete
+ afr_transaction_resume(frame, this);
+ } else {
+ /*
+ * Generally inode-write fops do transaction.unwind then
+ * transaction.resume, but writev needs to make sure that
+ * delayed post-op frame is placed in fdctx before unwind
+ * happens. This prevents the race of flush doing the
+ * changelog wakeup first in fuse thread and then this
+ * writev placing its delayed post-op frame in fdctx.
+ * This helps flush make sure all the delayed post-ops are
+ * completed.
+ */
+
+ fop_frame = afr_transaction_detach_fop_frame(frame);
+ afr_writev_copy_outvars(frame, fop_frame);
+ afr_transaction_resume(frame, this);
+ afr_writev_unwind(fop_frame, this);
+ }
+ }
+ return 0;
}
+static int
+afr_arbiter_writev_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ static char byte = 0xFF;
+ static struct iovec vector = {&byte, 1};
+ int32_t count = 1;
+
+ STACK_WIND_COOKIE(
+ frame, afr_writev_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol], priv->children[subvol]->fops->writev, local->fd,
+ &vector, count, local->cont.writev.offset, local->cont.writev.flags,
+ local->cont.writev.iobref, local->xdata_req);
+
+ return 0;
+}
int
-afr_writev_done (call_frame_t *frame, xlator_t *this)
+afr_writev_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- iobref_unref (local->cont.writev.iobref);
- local->cont.writev.iobref = NULL;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ priv = this->private;
+ if (AFR_IS_ARBITER_BRICK(priv, subvol)) {
+ afr_arbiter_writev_wind(frame, this, subvol);
return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_writev_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->writev, local->fd,
+ local->cont.writev.vector, local->cont.writev.count,
+ local->cont.writev.offset, local->cont.writev.flags,
+ local->cont.writev.iobref, local->xdata_req);
+ return 0;
}
+int
+afr_do_writev(call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = frame->local;
+ transaction_frame->local = local;
+ frame->local = NULL;
+
+ if (!AFR_FRAME_INIT(frame, op_errno))
+ goto out;
+
+ local->op = GF_FOP_WRITE;
+
+ local->transaction.wind = afr_writev_wind;
+ local->transaction.unwind = afr_transaction_writev_unwind;
+
+ local->transaction.main_frame = frame;
+
+ if (local->fd->flags & O_APPEND) {
+ /*
+ * Backend vfs ignores the 'offset' for append mode fd so
+ * locking just the region provided for the writev does not
+ * give consistency guarantee. The actual write may happen at a
+ * completely different range than the one provided by the
+ * offset, len in the fop. So lock the entire file.
+ */
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ } else {
+ local->transaction.start = local->cont.writev.offset;
+ local->transaction.len = iov_length(local->cont.writev.vector,
+ local->cont.writev.count);
+ }
+
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
+
+ AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
int
-afr_do_writev (call_frame_t *frame, xlator_t *this)
+afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
{
- call_frame_t * transaction_frame = NULL;
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
+ int op_errno = ENOMEM;
+ int ret = -1;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->cont.writev.vector = iov_dup(vector, count);
+ if (!local->cont.writev.vector)
+ goto out;
+ local->cont.writev.count = count;
+ local->cont.writev.offset = offset;
+ local->cont.writev.flags = flags;
+ local->cont.writev.iobref = iobref_ref(iobref);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
+
+ if (dict_set_uint32(local->xdata_req, GLUSTERFS_ACTIVE_FD_COUNT, 4)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (dict_set_str_sizen(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT,
+ this->name)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (dict_set_uint32(local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ /* Set append_write to be true speculatively. If on any
+ server it turns not be true, we unset it in the
+ callback.
+ */
+ local->append_write = _gf_true;
+
+ /* detect here, but set it in writev_wind_cbk *after* the unstable
+ write is performed
+ */
+ local->stable_write = !!((fd->flags | flags) & (O_SYNC | O_DSYNC));
+
+ afr_fix_open(fd, this);
+
+ afr_do_writev(frame, this);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
- int op_ret = -1;
- int op_errno = 0;
+ return 0;
+}
- local = frame->local;
+/* }}} */
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- op_errno = ENOMEM;
- goto out;
- }
+/* {{{ truncate */
- transaction_frame->local = local;
- frame->local = NULL;
+int
+afr_truncate_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local->op = GF_FOP_WRITE;
+ local = frame->local;
- local->success_count = 0;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- local->transaction.fop = afr_writev_wind;
- local->transaction.done = afr_writev_done;
- local->transaction.unwind = afr_writev_unwind;
+ AFR_STACK_UNWIND(truncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
- local->transaction.main_frame = frame;
- if (local->fd->flags & O_APPEND) {
- local->transaction.start = 0;
- local->transaction.len = 0;
- } else {
- local->transaction.start = local->cont.writev.offset;
- local->transaction.len = iov_length (local->cont.writev.vector,
- local->cont.writev.count);
- }
+int
+afr_truncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ local = frame->local;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL);
- }
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- return 0;
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
}
-
int
-afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- struct iobref *iobref)
+afr_truncate_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- int ret = -1;
+ local = frame->local;
+ priv = this->private;
- int op_ret = -1;
- int op_errno = 0;
+ STACK_WIND_COOKIE(frame, afr_truncate_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->truncate, &local->loc,
+ local->cont.truncate.offset, local->xdata_req);
+ return 0;
+}
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx = NULL;
+int
+afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- priv = this->private;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->cont.truncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ if (!local->xdata_req)
+ goto out;
- frame->local = local;
+ local->transaction.wind = afr_truncate_wind;
+ local->transaction.unwind = afr_truncate_unwind;
- local->cont.writev.vector = iov_dup (vector, count);
- local->cont.writev.count = count;
- local->cont.writev.offset = offset;
- local->cont.writev.ino = fd->inode->ino;
- local->cont.writev.iobref = iobref_ref (iobref);
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- local->fd = fd_ref (fd);
+ local->op = GF_FOP_TRUNCATE;
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- goto out;
- }
+ local->transaction.main_frame = frame;
+ local->transaction.start = offset;
+ local->transaction.len = 0;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ /* Set it true speculatively, will get reset in afr_truncate_wind_cbk
+ if truncate was not a NOP */
+ local->stable_write = _gf_true;
- if (fd_ctx->up_count < priv->up_count) {
- local->openfd_flush_cbk = afr_do_writev;
- afr_openfd_flush (frame, this, fd);
- } else {
- afr_do_writev (frame, this);
- }
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-
/* }}} */
-/* {{{ truncate */
+/* {{{ ftruncate */
int
-afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
+afr_ftruncate_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- local->cont.truncate.prebuf.ia_ino = local->cont.truncate.ino;
- local->cont.truncate.postbuf.ia_ino = local->cont.truncate.ino;
-
- AFR_STACK_UNWIND (truncate, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.truncate.prebuf,
- &local->cont.truncate.postbuf);
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(ftruncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+afr_ftruncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_read_child (this, local->loc.inode);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG)
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- call_count = afr_frame_return (frame);
+ local = frame->local;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- return 0;
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
}
-
-int32_t
-afr_truncate_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_ftruncate_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->truncate,
- &local->loc,
- local->cont.truncate.offset);
-
- if (!--call_count)
- break;
- }
- }
+ local = frame->local;
+ priv = this->private;
- return 0;
+ STACK_WIND_COOKIE(frame, afr_ftruncate_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->ftruncate, local->fd,
+ local->cont.ftruncate.offset, local->xdata_req);
+ return 0;
}
-
int
-afr_truncate_done (call_frame_t *frame, xlator_t *this)
+afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- local = frame->local;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local->transaction.unwind (frame, this);
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
+ local->cont.ftruncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+ if (!local->xdata_req)
+ goto out;
-int
-afr_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- int ret = -1;
+ local->op = GF_FOP_FTRUNCATE;
- int op_ret = -1;
- int op_errno = 0;
+ local->transaction.wind = afr_ftruncate_wind;
+ local->transaction.unwind = afr_ftruncate_unwind;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local->transaction.main_frame = frame;
- priv = this->private;
+ local->transaction.start = local->cont.ftruncate.offset;
+ local->transaction.len = 0;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ afr_fix_open(fd, this);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk
+ if truncate was not a NOP */
+ local->stable_write = _gf_true;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- transaction_frame->local = local;
+ return 0;
+out:
+ AFR_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
- local->op_ret = -1;
+ return 0;
+}
- local->cont.truncate.offset = offset;
- local->cont.truncate.ino = loc->inode->ino;
+/* }}} */
- local->transaction.fop = afr_truncate_wind;
- local->transaction.done = afr_truncate_done;
- local->transaction.unwind = afr_truncate_unwind;
+/* {{{ setattr */
- loc_copy (&local->loc, loc);
+int
+afr_setattr_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = offset;
+ local = frame->local;
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (truncate, frame, op_ret, op_errno, NULL, NULL);
- }
+ AFR_STACK_UNWIND(setattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
- return 0;
+int
+afr_setattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preop,
+ struct iatt *postop, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop,
+ postop, NULL, xdata);
}
+int
+afr_setattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_setattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setattr, &local->loc,
+ &local->cont.setattr.in_buf, local->cont.setattr.valid,
+ local->xdata_req);
+ return 0;
+}
-/* }}} */
+int
+afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+ int32_t valid, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->cont.setattr.in_buf = *buf;
+ local->cont.setattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_setattr_wind;
+ local->transaction.unwind = afr_setattr_unwind;
+
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
+
+ local->op = GF_FOP_SETATTR;
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
+
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
-/* {{{ ftruncate */
+ AFR_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+/* {{{ fsetattr */
int
-afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
+afr_fsetattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- local->cont.ftruncate.prebuf.ia_ino = local->cont.ftruncate.ino;
- local->cont.ftruncate.postbuf.ia_ino = local->cont.ftruncate.ino;
+ AFR_STACK_UNWIND(fsetattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
- AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.ftruncate.prebuf,
- &local->cont.ftruncate.postbuf);
- }
- return 0;
+int
+afr_fsetattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preop,
+ struct iatt *postop, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop,
+ postop, NULL, xdata);
}
+int
+afr_fsetattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_fsetattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetattr, local->fd,
+ &local->cont.fsetattr.in_buf, local->cont.fsetattr.valid,
+ local->xdata_req);
+ return 0;
+}
int
-afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
+afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
+ int32_t valid, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- int child_index = (long) cookie;
- int call_count = -1;
- int need_unwind = 0;
- int read_child = 0;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = frame->local;
- priv = this->private;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- read_child = afr_read_child (this, local->fd->inode);
+ local->cont.fsetattr.in_buf = *buf;
+ local->cont.fsetattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ if (!local->xdata_req)
+ goto out;
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
+ local->transaction.wind = afr_fsetattr_wind;
+ local->transaction.unwind = afr_fsetattr_unwind;
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- if (child_index == read_child) {
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
+ local->op = GF_FOP_FSETATTR;
- local->success_count++;
+ afr_fix_open(fd, this);
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- call_count = afr_frame_return (frame);
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ AFR_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+/* {{{ setxattr */
+
+int
+afr_setxattr_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+ local = frame->local;
+
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
+
+ AFR_STACK_UNWIND(setxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
}
+int
+afr_setxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
+}
int
-afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
+afr_setxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_setxattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setxattr, &local->loc,
+ local->cont.setxattr.dict, local->cont.setxattr.flags,
+ local->xdata_req);
+ return 0;
+}
- int call_count = -1;
- int i = 0;
+int
+afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
- local = frame->local;
- priv = this->private;
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i, ret = 0;
+ char *op_type = NULL;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local = frame->local;
+ priv = this->private;
+ i = (long)cookie;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- local->call_count = call_count;
+ ret = dict_get_str_sizen(local->xdata_req, "replicate-brick-op", &op_type);
+ if (ret)
+ goto out;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- local->fd, local->cont.ftruncate.offset);
-
- if (!--call_count)
- break;
- }
- }
+ gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO,
+ op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s",
+ priv->children[i]->name, "op_ret=%s",
+ op_ret ? "failed" : "succeeded", NULL);
- return 0;
+out:
+ syncbarrier_wake(&local->barrier);
+ return 0;
}
-
int
-afr_ftruncate_done (call_frame_t *frame, xlator_t *this)
+afr_emptyb_set_pending_changelog(call_frame_t *frame, xlator_t *this,
+ unsigned char *locked_nodes)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int ret = 0, i = 0;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- local->transaction.unwind (frame, this);
+ AFR_ONLIST(locked_nodes, frame, afr_emptyb_set_pending_changelog_cbk,
+ xattrop, &local->loc, GF_XATTROP_ADD_ARRAY, local->xattr_req,
+ NULL);
- AFR_STACK_DESTROY (frame);
+ /* It is sufficient if xattrop was successful on one child */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
- return 0;
+ if (local->replies[i].op_ret == 0) {
+ ret = 0;
+ goto out;
+ } else {
+ ret = afr_higher_errno(ret, local->replies[i].op_errno);
+ }
+ }
+out:
+ return -ret;
}
+static int
+_afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ int empty_index, afr_transaction_type type,
+ char *op_type, const int op_type_len)
+{
+ int count = 0;
+ int ret = -ENOMEM;
+ int idx = -1;
+ int d_idx = -1;
+ unsigned char *locked_nodes = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ locked_nodes = alloca0(priv->child_count);
+
+ idx = afr_index_for_transaction_type(type);
+ d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+ local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!local->pending)
+ goto out;
+
+ local->pending[empty_index][idx] = hton32(1);
+
+ if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION))
+ local->pending[empty_index][d_idx] = hton32(1);
+
+ local->xdata_req = dict_new();
+ if (!local->xdata_req)
+ goto out;
+
+ ret = dict_set_nstrn(local->xdata_req, "replicate-brick-op",
+ SLEN("replicate-brick-op"), op_type, op_type_len);
+ if (ret)
+ goto out;
+
+ local->xattr_req = dict_new();
+ if (!local->xattr_req)
+ goto out;
+
+ ret = afr_set_pending_dict(priv, local->xattr_req, local->pending);
+ if (ret < 0)
+ goto out;
+
+ if (AFR_ENTRY_TRANSACTION == type) {
+ count = afr_selfheal_entrylk(frame, this, loc->inode, this->name, NULL,
+ locked_nodes);
+ } else {
+ count = afr_selfheal_inodelk(frame, this, loc->inode, this->name,
+ LLONG_MAX - 1, 0, locked_nodes);
+ }
+
+ if (!count) {
+ gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS,
+ NULL);
+ ret = -EAGAIN;
+ goto unlock;
+ }
+
+ ret = afr_emptyb_set_pending_changelog(frame, this, locked_nodes);
+ if (ret)
+ goto unlock;
+ ret = 0;
+unlock:
+ if (AFR_ENTRY_TRANSACTION == type) {
+ afr_selfheal_unentrylk(frame, this, loc->inode, this->name, NULL,
+ locked_nodes, NULL);
+ } else {
+ afr_selfheal_uninodelk(frame, this, loc->inode, this->name,
+ LLONG_MAX - 1, 0, locked_nodes);
+ }
+out:
+ return ret;
+}
-int
-afr_do_ftruncate (call_frame_t *frame, xlator_t *this)
+void
+afr_brick_args_cleanup(void *opaque)
{
- call_frame_t * transaction_frame = NULL;
- afr_local_t * local = NULL;
+ afr_empty_brick_args_t *data = NULL;
- int op_ret = -1;
- int op_errno = 0;
+ data = opaque;
+ loc_wipe(&data->loc);
+ GF_FREE(data);
+}
- local = frame->local;
+int
+_afr_handle_empty_brick_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+ afr_brick_args_cleanup(opaque);
+ return 0;
+}
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+int
+_afr_handle_empty_brick(void *opaque)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int empty_index = -1;
+ int ret = -1;
+ int op_errno = ENOMEM;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ char *op_type = NULL;
+ int op_type_len = 0;
+ afr_empty_brick_args_t *data = NULL;
+ call_frame_t *op_frame = NULL;
+
+ data = opaque;
+ frame = data->frame;
+ empty_index = data->empty_index;
+ if (!data->op_type)
+ goto out;
+
+ op_frame = copy_frame(frame);
+ if (!op_frame) {
+ ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_type = data->op_type;
+ op_type_len = strlen(op_type);
+ this = op_frame->this;
+ priv = this->private;
+
+ afr_set_lk_owner(op_frame, this, op_frame->root);
+ local = AFR_FRAME_INIT(op_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, &data->loc);
+
+ gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s",
+ priv->children[empty_index]->name, NULL);
+
+ ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
+ AFR_METADATA_TRANSACTION, op_type,
+ op_type_len);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ dict_unref(local->xdata_req);
+ dict_unref(local->xattr_req);
+ afr_matrix_cleanup(local->pending, priv->child_count);
+ local->pending = NULL;
+ local->xattr_req = NULL;
+ local->xdata_req = NULL;
+
+ ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
+ AFR_ENTRY_TRANSACTION, op_type,
+ op_type_len);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+out:
+ if (op_frame) {
+ AFR_STACK_DESTROY(op_frame);
+ }
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ return 0;
+}
- transaction_frame->local = local;
- frame->local = NULL;
+int
+afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ char *data)
+{
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ local = frame->local;
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_int32_sizen(local->xdata_req, "heal-op",
+ GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_str_sizen(local->xdata_req, "child-name", data);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ /* set spb choice to -1 whether heal succeeds or not:
+ * If heal succeeds : spb-choice should be set to -1 as
+ * it is no longer valid; file is not
+ * in split-brain anymore.
+ * If heal doesn't succeed:
+ * spb-choice should be set to -1
+ * otherwise reads will be served
+ * from spb-choice which is misleading.
+ */
+ ret = afr_inode_split_brain_choice_set(loc->inode, this, -1);
+ if (ret)
+ gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+ NULL);
+ afr_heal_splitbrain_file(frame, this, loc);
+ ret = 0;
+out:
+ if (ret < 0)
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+ return 0;
+}
- local->op = GF_FOP_FTRUNCATE;
+int
+afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len)
+{
+ int spb_child_index = -1;
+ char *spb_child_str = NULL;
- local->transaction.fop = afr_ftruncate_wind;
- local->transaction.done = afr_ftruncate_done;
- local->transaction.unwind = afr_ftruncate_unwind;
+ spb_child_str = alloca0(len + 1);
+ memcpy(spb_child_str, value, len);
- local->transaction.main_frame = frame;
+ if (!strcmp(spb_child_str, "none"))
+ return -2;
- local->transaction.start = 0;
- local->transaction.len = local->cont.ftruncate.offset;
+ spb_child_index = afr_get_child_index_from_name(this, spb_child_str);
+ if (spb_child_index < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "subvol=%s", spb_child_str, NULL);
+ }
+ return spb_child_index;
+}
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+int
+afr_can_set_split_brain_choice(void *opaque)
+{
+ afr_spbc_timeout_t *data = opaque;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ loc_t *loc = NULL;
+ int ret = -1;
+
+ frame = data->frame;
+ loc = data->loc;
+ this = frame->this;
+
+ ret = afr_is_split_brain(frame, this, loc->inode, loc->gfid, &data->d_spb,
+ &data->m_spb);
+
+ if (ret)
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, "gfid=%s",
+ uuid_utoa(loc->gfid), NULL);
+ return ret;
+}
- op_ret = 0;
+int
+afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ dict_t *dict)
+{
+ void *choice_value = NULL;
+ void *resolve_value = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_spbc_timeout_t *data = NULL;
+ int len = 0;
+ int spb_child_index = -1;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ priv = this->private;
+
+ ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &choice_value, &len);
+ ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &resolve_value,
+ &len);
+ if (!choice_value && !resolve_value) {
+ ret = -1;
+ goto out;
+ }
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local) {
+ ret = 1;
+ goto out;
+ }
+
+ local->op = GF_FOP_SETXATTR;
+
+ if (choice_value) {
+ spb_child_index = afr_get_split_brain_child_index(this, choice_value,
+ len);
+ if (spb_child_index < 0) {
+ /* Case where value was "none" */
+ if (spb_child_index == -2)
+ spb_child_index = -1;
+ else {
+ ret = 1;
+ op_errno = EINVAL;
+ goto out;
+ }
+ }
+
+ data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spbc_timeout_t);
+ if (!data) {
+ ret = 1;
+ goto out;
+ }
+ data->spb_child_index = spb_child_index;
+ data->frame = frame;
+ loc_copy(&local->loc, loc);
+ data->loc = &local->loc;
+ ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice,
+ afr_set_split_brain_choice, NULL, data);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ "name=%s", loc->name, NULL);
+ ret = 1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+ ret = 0;
+ goto out;
+ }
+
+ if (resolve_value) {
+ spb_child_index = afr_get_split_brain_child_index(this, resolve_value,
+ len);
+ if (spb_child_index < 0) {
+ ret = 1;
+ goto out;
+ }
+
+ afr_split_brain_resolve_do(frame, this, loc,
+ priv->children[spb_child_index]->name);
+ ret = 0;
+ }
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL);
- }
-
- return 0;
+ /* key was correct but value was invalid when ret == 1 */
+ if (ret == 1) {
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+ if (data)
+ GF_FREE(data);
+ ret = 0;
+ }
+ return ret;
}
-
int
-afr_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset)
+afr_handle_spb_choice_timeout(xlator_t *this, call_frame_t *frame, dict_t *dict)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = 0;
+ uint64_t timeout = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
- int ret = -1;
+ ret = dict_get_uint64(dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout);
+ if (!ret) {
+ priv->spb_choice_timeout = timeout * 60;
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ }
- int op_ret = -1;
- int op_errno = 0;
+ return ret;
+}
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx = NULL;
+int
+afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ dict_t *dict)
+{
+ int ret = -1;
+ int ab_ret = -1;
+ int empty_index = -1;
+ int op_errno = EPERM;
+ char *empty_brick = NULL;
+ char *op_type = NULL;
+ afr_empty_brick_args_t *data = NULL;
+
+ ret = dict_get_str_sizen(dict, GF_AFR_REPLACE_BRICK, &empty_brick);
+ if (!ret)
+ op_type = GF_AFR_REPLACE_BRICK;
+
+ ab_ret = dict_get_str_sizen(dict, GF_AFR_ADD_BRICK, &empty_brick);
+ if (!ab_ret)
+ op_type = GF_AFR_ADD_BRICK;
+
+ if (ret && ab_ret)
+ goto out;
+
+ if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) {
+ gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR,
+ "op_type=%s", op_type, NULL);
+ ret = 1;
+ goto out;
+ }
+ empty_index = afr_get_child_index_from_name(this, empty_brick);
+
+ if (empty_index < 0) {
+ /* Didn't belong to this replica pair
+ * Just do a no-op
+ */
+ AFR_STACK_UNWIND(setxattr, frame, 0, 0, NULL);
+ return 0;
+ } else {
+ data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_empty_brick_t);
+ if (!data) {
+ ret = 1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+ data->frame = frame;
+ loc_copy(&data->loc, loc);
+ data->empty_index = empty_index;
+ data->op_type = op_type;
+ ret = synctask_new(this->ctx->env, _afr_handle_empty_brick,
+ _afr_handle_empty_brick_cbk, NULL, data);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ NULL);
+ ret = 1;
+ op_errno = ENOMEM;
+ afr_brick_args_cleanup(data);
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ if (ret == 1) {
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+ ret = 0;
+ }
+ return ret;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+static int
+afr_handle_special_xattr(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ dict_t *dict)
+{
+ int ret = -1;
- priv = this->private;
+ ret = afr_handle_split_brain_commands(this, frame, loc, dict);
+ if (ret == 0)
+ goto out;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ ret = afr_handle_spb_choice_timeout(this, frame, dict);
+ if (ret == 0)
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ /* Applicable for replace-brick and add-brick commands */
+ ret = afr_handle_empty_brick(this, frame, loc, dict);
+out:
+ return ret;
+}
- frame->local = local;
+int
+afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
- local->cont.ftruncate.offset = offset;
- local->cont.ftruncate.ino = fd->inode->ino;
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out);
- local->fd = fd_ref (fd);
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- goto out;
- }
+ ret = afr_handle_special_xattr(this, frame, loc, dict);
+ if (ret == 0)
+ return 0;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- if (fd_ctx->up_count < priv->up_count) {
- local->openfd_flush_cbk = afr_do_ftruncate;
- afr_openfd_flush (frame, this, fd);
- } else {
- afr_do_ftruncate (frame, this);
- }
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL);
- }
+ local->cont.setxattr.dict = dict_ref(dict);
+ local->cont.setxattr.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- return 0;
-}
+ if (!local->xdata_req)
+ goto out;
-/* }}} */
+ local->transaction.wind = afr_setxattr_wind;
+ local->transaction.unwind = afr_setxattr_unwind;
-/* {{{ setattr */
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
-int
-afr_setattr_unwind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- local = frame->local;
+ local->op = GF_FOP_SETXATTR;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- if (main_frame) {
- local->cont.setattr.preop_buf.ia_ino = local->cont.setattr.ino;
- local->cont.setattr.postop_buf.ia_ino = local->cont.setattr.ino;
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (setattr, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.setattr.preop_buf,
- &local->cont.setattr.postop_buf);
- }
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
+/* {{{ fsetxattr */
int
-afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
+afr_fsetxattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- read_child = afr_read_child (this, local->loc.inode);
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
+ AFR_STACK_UNWIND(fsetxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
+int
+afr_fsetxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
+}
- if (child_index == read_child) {
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
+int
+afr_fsetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_fsetxattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetxattr, local->fd,
+ local->cont.fsetxattr.dict, local->cont.fsetxattr.flags,
+ local->xdata_req);
+ return 0;
+}
- local->success_count++;
+int
+afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out);
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
- call_count = afr_frame_return (frame);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- return 0;
-}
+ local->cont.fsetxattr.dict = dict_ref(dict);
+ local->cont.fsetxattr.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
-int32_t
-afr_setattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ if (!local->xdata_req)
+ goto out;
- int call_count = -1;
- int i = 0;
+ local->transaction.wind = afr_fsetxattr_wind;
+ local->transaction.unwind = afr_fsetxattr_unwind;
- local = frame->local;
- priv = this->private;
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local->op = GF_FOP_FSETXATTR;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- local->call_count = call_count;
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc,
- &local->cont.setattr.in_buf,
- local->cont.setattr.valid);
-
- if (!--call_count)
- break;
- }
- }
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
+ return 0;
}
+/* }}} */
+
+/* {{{ removexattr */
int
-afr_setattr_done (call_frame_t *frame, xlator_t *this)
+afr_removexattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
+
+ AFR_STACK_UNWIND(removexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
}
+int
+afr_removexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
+}
int
-afr_setattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct iatt *buf, int32_t valid)
+afr_removexattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- int ret = -1;
+ local = frame->local;
+ priv = this->private;
- int op_ret = -1;
- int op_errno = 0;
+ STACK_WIND_COOKIE(frame, afr_removexattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->removexattr, &local->loc,
+ local->cont.removexattr.name, local->xdata_req);
+ return 0;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+int
+afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- priv = this->private;
+ GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- transaction_frame->local = local;
+ local->cont.removexattr.name = gf_strdup(name);
- local->op_ret = -1;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- local->cont.setattr.ino = loc->inode->ino;
+ if (!local->xdata_req)
+ goto out;
- local->cont.setattr.in_buf = *buf;
- local->cont.setattr.valid = valid;
+ local->transaction.wind = afr_removexattr_wind;
+ local->transaction.unwind = afr_removexattr_unwind;
- local->transaction.fop = afr_setattr_wind;
- local->transaction.done = afr_setattr_done;
- local->transaction.unwind = afr_setattr_unwind;
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- loc_copy (&local->loc, loc);
+ local->op = GF_FOP_REMOVEXATTR;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (setattr, frame, op_ret, op_errno, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+ return 0;
}
-/* {{{ fsetattr */
-
+/* ffremovexattr */
int
-afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_fremovexattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- local->cont.fsetattr.preop_buf.ia_ino =
- local->cont.fsetattr.ino;
- local->cont.fsetattr.postop_buf.ia_ino =
- local->cont.fsetattr.ino;
-
- AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.fsetattr.preop_buf,
- &local->cont.fsetattr.postop_buf);
- }
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(fremovexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
int
-afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
+afr_fremovexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_read_child (this, local->fd->inode);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
+}
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
+int
+afr_fremovexattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
+ local = frame->local;
+ priv = this->private;
- if (child_index == read_child) {
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
+ STACK_WIND_COOKIE(frame, afr_fremovexattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fremovexattr, local->fd,
+ local->cont.removexattr.name, local->xdata_req);
+ return 0;
+}
- local->success_count++;
+int
+afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out);
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
- call_count = afr_frame_return (frame);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- return 0;
-}
+ local->cont.removexattr.name = gf_strdup(name);
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+ if (!local->xdata_req)
+ goto out;
-int32_t
-afr_fsetattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ local->transaction.wind = afr_fremovexattr_wind;
+ local->transaction.unwind = afr_fremovexattr_unwind;
- int call_count = -1;
- int i = 0;
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- local = frame->local;
- priv = this->private;
+ local->op = GF_FOP_FREMOVEXATTR;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- local->call_count = call_count;
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fsetattr,
- local->fd,
- &local->cont.fsetattr.in_buf,
- local->cont.fsetattr.valid);
-
- if (!--call_count)
- break;
- }
- }
+ AFR_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
int
-afr_fsetattr_done (call_frame_t *frame, xlator_t *this)
+afr_fallocate_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(fallocate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_fsetattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iatt *buf, int32_t valid)
+afr_fallocate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
-
- int ret = -1;
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
+}
- int op_ret = -1;
- int op_errno = 0;
+int
+afr_fallocate_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_fallocate_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fallocate, local->fd,
+ local->cont.fallocate.mode, local->cont.fallocate.offset,
+ local->cont.fallocate.len, local->xdata_req);
+ return 0;
+}
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+int
+afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- priv = this->private;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->cont.fallocate.mode = mode;
+ local->cont.fallocate.offset = offset;
+ local->cont.fallocate.len = len;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- transaction_frame->local = local;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- local->op_ret = -1;
+ if (!local->xdata_req)
+ goto out;
- local->cont.fsetattr.ino = fd->inode->ino;
+ local->op = GF_FOP_FALLOCATE;
- local->cont.fsetattr.in_buf = *buf;
- local->cont.fsetattr.valid = valid;
+ local->transaction.wind = afr_fallocate_wind;
+ local->transaction.unwind = afr_fallocate_unwind;
- local->transaction.fop = afr_fsetattr_wind;
- local->transaction.done = afr_fsetattr_done;
- local->transaction.unwind = afr_fsetattr_unwind;
+ local->transaction.main_frame = frame;
- local->fd = fd_ref (fd);
+ local->transaction.start = local->cont.fallocate.offset;
+ local->transaction.len = 0;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ afr_fix_open(fd, this);
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
+/* }}} */
-/* {{{ setxattr */
-
+/* {{{ discard */
int
-afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_discard_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ local = frame->local;
- if (main_frame) {
- AFR_STACK_UNWIND (setxattr, main_frame,
- local->op_ret, local->op_errno)
- }
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_discard_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int need_unwind = 0;
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
+}
- local = frame->local;
- priv = this->private;
+int
+afr_discard_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_discard_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->discard, local->fd,
+ local->cont.discard.offset, local->cont.discard.len,
+ local->xdata_req);
+ return 0;
+}
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
+int
+afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
- }
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ local->cont.discard.offset = offset;
+ local->cont.discard.len = len;
- call_count = afr_frame_return (frame);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- return 0;
-}
+ if (!local->xdata_req)
+ goto out;
+ local->op = GF_FOP_DISCARD;
-int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ local->transaction.wind = afr_discard_wind;
+ local->transaction.unwind = afr_discard_unwind;
- int call_count = -1;
- int i = 0;
+ local->transaction.main_frame = frame;
- local = frame->local;
- priv = this->private;
+ local->transaction.start = local->cont.discard.offset;
+ local->transaction.len = 0;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ afr_fix_open(fd, this);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc,
- local->cont.setxattr.dict,
- local->cont.setxattr.flags);
-
- if (!--call_count)
- break;
- }
- }
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
+/* {{{ zerofill */
int
-afr_setxattr_done (call_frame_t *frame, xlator_t *this)
+afr_zerofill_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
+
+ AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
}
int
-afr_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags)
+afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
+}
- int ret = -1;
+int
+afr_zerofill_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_zerofill_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->zerofill, local->fd,
+ local->cont.zerofill.offset, local->cont.zerofill.len,
+ local->xdata_req);
+ return 0;
+}
- int op_ret = -1;
- int op_errno = 0;
+int
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- priv = this->private;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->cont.zerofill.offset = offset;
+ local->cont.zerofill.len = len;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- transaction_frame->local = local;
+ if (!local->xdata_req)
+ goto out;
- local->op_ret = -1;
+ local->op = GF_FOP_ZEROFILL;
- local->cont.setxattr.dict = dict_ref (dict);
- local->cont.setxattr.flags = flags;
+ local->transaction.wind = afr_zerofill_wind;
+ local->transaction.unwind = afr_zerofill_unwind;
- local->transaction.fop = afr_setxattr_wind;
- local->transaction.done = afr_setxattr_done;
- local->transaction.unwind = afr_setxattr_unwind;
+ local->transaction.main_frame = frame;
- loc_copy (&local->loc, loc);
+ local->transaction.start = local->cont.zerofill.offset;
+ local->transaction.len = len;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ afr_fix_open(fd, this);
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
-/* {{{ removexattr */
+int32_t
+afr_xattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, xattr, xdata);
+}
+int
+afr_xattrop_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_xattrop_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->xattrop, &local->loc,
+ local->cont.xattrop.optype, local->cont.xattrop.xattr,
+ local->xdata_req);
+ return 0;
+}
int
-afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_xattrop_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (removexattr, main_frame,
- local->op_ret, local->op_errno)
- }
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(xattrop, main_frame, local->op_ret, local->op_errno,
+ local->xattr_rsp, local->xdata_rsp);
+ return 0;
+}
-int
-afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+int32_t
+afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- int call_count = -1;
- int need_unwind = 0;
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = frame->local;
- priv = this->private;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
+ local->cont.xattrop.xattr = dict_ref(xattr);
+ local->cont.xattrop.optype = optype;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
+ local->transaction.wind = afr_xattrop_wind;
+ local->transaction.unwind = afr_xattrop_unwind;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ local->op = GF_FOP_XATTROP;
- call_count = afr_frame_return (frame);
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
-}
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
+ AFR_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
int32_t
-afr_removexattr_wind (call_frame_t *frame, xlator_t *this)
+afr_fxattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->removexattr,
- &local->loc,
- local->cont.removexattr.name);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, xattr, xdata);
}
-
int
-afr_removexattr_done (call_frame_t *frame, xlator_t *this)
+afr_fxattrop_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_fxattrop_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fxattrop, local->fd,
+ local->cont.xattrop.optype, local->cont.xattrop.xattr,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
+afr_fxattrop_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- int ret = -1;
+ local = frame->local;
- int op_ret = -1;
- int op_errno = 0;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
+ AFR_STACK_UNWIND(fxattrop, main_frame, local->op_ret, local->op_errno,
+ local->xattr_rsp, local->xdata_rsp);
+ return 0;
+}
- priv = this->private;
+int32_t
+afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->cont.xattrop.xattr = dict_ref(xattr);
+ local->cont.xattrop.optype = optype;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ local->transaction.wind = afr_fxattrop_wind;
+ local->transaction.unwind = afr_fxattrop_unwind;
+
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
+
+ local->op = GF_FOP_FXATTROP;
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
+
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ AFR_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
+int
+afr_fsync_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = frame->local;
- transaction_frame->local = local;
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
+ return 0;
- local->op_ret = -1;
+ AFR_STACK_UNWIND(fsync, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
- local->cont.removexattr.name = gf_strdup (name);
+ return 0;
+}
- local->transaction.fop = afr_removexattr_wind;
- local->transaction.done = afr_removexattr_done;
- local->transaction.unwind = afr_removexattr_unwind;
+int
+afr_fsync_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
+}
- loc_copy (&local->loc, loc);
+int
+afr_fsync_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local = frame->local;
+ priv = this->private;
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ STACK_WIND_COOKIE(frame, afr_fsync_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsync, local->fd,
+ local->cont.fsync.datasync, local->xdata_req);
+ return 0;
+}
- op_ret = 0;
+int
+afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int32_t op_errno = ENOMEM;
+ int8_t last_fsync = 0;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (xdata) {
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) {
+ if (last_fsync) {
+ local->transaction.disable_delayed_post_op = _gf_true;
+ }
+ }
+ } else {
+ local->xdata_req = dict_new();
+ }
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
+
+ local->op = GF_FOP_FSYNC;
+ local->cont.fsync.datasync = datasync;
+
+ if (afr_fd_has_witnessed_unstable_write(this, fd->inode)) {
+ /* don't care. we only wanted to CLEAR the bit */
+ }
+
+ local->transaction.wind = afr_fsync_wind;
+ local->transaction.unwind = afr_fsync_unwind;
+
+ local->transaction.main_frame = frame;
+
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (removexattr, frame, op_ret, op_errno);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- return 0;
+ AFR_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
}
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
index 47589872206..a787069b7a1 100644
--- a/xlators/cluster/afr/src/afr-inode-write.h
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -1,72 +1,94 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __INODE_WRITE_H__
#define __INODE_WRITE_H__
int32_t
-afr_chmod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode);
+afr_chmod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dict_t *xdata);
int32_t
-afr_chown (call_frame_t *frame, xlator_t *this,
- loc_t *loc, uid_t uid, gid_t gid);
+afr_chown(call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, gid_t gid,
+ dict_t *xdata);
int
-afr_fchown (call_frame_t *frame, xlator_t *this,
- fd_t *fd, uid_t uid, gid_t gid);
+afr_fchown(call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, gid_t gid,
+ dict_t *xdata);
+
+int32_t
+afr_fchmod(call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode,
+ dict_t *xdata);
+
+int32_t
+afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata);
+
+int32_t
+afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata);
+
+int32_t
+afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata);
int32_t
-afr_fchmod (call_frame_t *frame, xlator_t *this,
- fd_t *fd, mode_t mode);
+afr_utimens(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct timespec tv[2], dict_t *xdata);
+
+int
+afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+ int32_t valid, dict_t *xdata);
+
+int
+afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
+ int32_t valid, dict_t *xdata);
int32_t
-afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- struct iobref *iobref);
+afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata);
int32_t
-afr_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset);
+afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata);
int32_t
-afr_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset);
+afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata);
int32_t
-afr_utimens (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct timespec tv[2]);
+afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata);
+
+int
+afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
int
-afr_setattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct iatt *buf, int32_t valid);
+afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata);
int
-afr_fsetattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iatt *buf, int32_t valid);
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata);
int32_t
-afr_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags);
+afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
int32_t
-afr_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name);
+afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
+int
+afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata);
#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index c8b0b66009e..bc8eabe0f43 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -1,2183 +1,791 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include "dict.h"
-#include "byte-order.h"
-#include "common-utils.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
#include "afr.h"
#include "afr-transaction.h"
+#include "afr-messages.h"
#include <signal.h>
-
-#define LOCKED_NO 0x0 /* no lock held */
-#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */
-#define LOCKED_LOWER 0x2 /* for lower path */
-
-int
-afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
-
-static uint64_t afr_lock_number = 1;
-
-static uint64_t
-get_afr_lock_number ()
-{
- return (++afr_lock_number);
-}
-
-int
-afr_set_lock_number (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_number = get_afr_lock_number ();
-
- return 0;
-}
+#define LOCKED_NO 0x0 /* no lock held */
+#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */
+#define LOCKED_LOWER 0x2 /* for lower path */
void
-afr_set_lk_owner (call_frame_t *frame, xlator_t *this)
-{
- if (!frame->root->lk_owner) {
- gf_log (this->name, GF_LOG_TRACE,
- "Setting lk-owner=%llu",
- (unsigned long long) (unsigned long)frame->root);
- frame->root->lk_owner = (uint64_t) (unsigned long)frame->root;
- }
-}
-
-static int
-is_afr_lock_selfheal (afr_local_t *local)
+afr_lockee_cleanup(afr_lockee_t *lockee)
{
- afr_internal_lock_t *int_lock = NULL;
- int ret = -1;
-
- int_lock = &local->internal_lock;
-
- switch (int_lock->selfheal_lk_type) {
- case AFR_DATA_SELF_HEAL_LK:
- case AFR_METADATA_SELF_HEAL_LK:
- ret = 1;
- break;
- case AFR_ENTRY_SELF_HEAL_LK:
- ret = 0;
- break;
- }
+ if (lockee->fd) {
+ fd_unref(lockee->fd);
+ lockee->fd = NULL;
+ } else {
+ loc_wipe(&lockee->loc);
+ }
- return ret;
+ GF_FREE(lockee->basename);
+ lockee->basename = NULL;
+ GF_FREE(lockee->locked_nodes);
+ lockee->locked_nodes = NULL;
-}
-
-int32_t
-internal_lock_count (call_frame_t *frame, xlator_t *this,
- afr_fd_ctx_t *fd_ctx)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int32_t call_count = 0;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- if (fd_ctx) {
- GF_ASSERT (local->fd);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] && fd_ctx->opened_on[i])
- ++call_count;
- }
- } else {
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i])
- ++call_count;
- }
- }
-
- return call_count;
-}
-
-static void
-afr_print_inodelk (char *str, int size, int cmd,
- struct gf_flock *flock, uint64_t owner)
-{
- char *cmd_str = NULL;
- char *type_str = NULL;
-
- switch (cmd) {
-#if F_GETLK != F_GETLK64
- case F_GETLK64:
-#endif
- case F_GETLK:
- cmd_str = "GETLK";
- break;
-
-#if F_SETLK != F_SETLK64
- case F_SETLK64:
-#endif
- case F_SETLK:
- cmd_str = "SETLK";
- break;
-
-#if F_SETLKW != F_SETLKW64
- case F_SETLKW64:
-#endif
- case F_SETLKW:
- cmd_str = "SETLKW";
- break;
-
- default:
- cmd_str = "<null>";
- break;
- }
-
- switch (flock->l_type) {
- case F_RDLCK:
- type_str = "READ";
- break;
- case F_WRLCK:
- type_str = "WRITE";
- break;
- case F_UNLCK:
- type_str = "UNLOCK";
- break;
- default:
- type_str = "UNKNOWN";
- break;
- }
-
- snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, "
- "start=%llu, len=%llu, pid=%llu, lk-owner=%llu",
- cmd_str, type_str, (unsigned long long) flock->l_start,
- (unsigned long long) flock->l_len,
- (unsigned long long) flock->l_pid,
- (unsigned long long) owner);
-
-}
-
-static void
-afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd,
- int child_index)
-{
- snprintf (str, size, "path=%s, fd=%p, child=%d",
- loc->path ? loc->path : "<nul>",
- fd ? fd : NULL,
- child_index);
+ return;
}
void
-afr_print_entrylk (char *str, int size, const char *basename,
- uint64_t owner)
+afr_lockees_cleanup(afr_internal_lock_t *int_lock)
{
- snprintf (str, size, "Basename=%s, lk-owner=%llu",
- basename ? basename : "<nul>",
- (unsigned long long)owner);
-}
+ int i = 0;
-static void
-afr_print_verdict (int op_ret, int op_errno, char *str)
-{
- if (op_ret < 0) {
- if (op_errno == EAGAIN)
- strcpy (str, "EAGAIN");
- else
- strcpy (str, "FAILED");
- }
- else
- strcpy (str, "GRANTED");
-}
-
-static void
-afr_set_lock_call_type (afr_lock_call_type_t lock_call_type,
- char *lock_call_type_str,
- afr_internal_lock_t *int_lock)
-{
- switch (lock_call_type) {
- case AFR_INODELK_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL");
- break;
- case AFR_INODELK_NB_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL");
- break;
- case AFR_ENTRYLK_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL");
- break;
- case AFR_ENTRYLK_NB_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL");
- break;
- default:
- strcpy (lock_call_type_str, "UNKNOWN");
- break;
- }
-
-}
-
-static void
-afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, struct gf_flock *flock,
- int op_ret, int op_errno, int32_t child_index)
-{
- xlator_t *this = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- char lockee[256];
- char lock_call_type_str[256];
- char verdict[16];
-
- this = THIS;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (!priv->inodelk_trace) {
- return;
- }
-
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- afr_print_verdict (op_ret, op_errno, verdict);
-
- gf_log (this->name, GF_LOG_NORMAL,
- "[%s %s] [%s] Lockee={%s} Number={%llu}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY",
- verdict,
- lockee,
- (unsigned long long) int_lock->lock_number);
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ afr_lockee_cleanup(&int_lock->lockee[i]);
+ }
+ return;
}
-
-static void
-afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, struct gf_flock *flock,
- int32_t cmd, int32_t child_index)
-{
- xlator_t *this = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
-
- char lock[256];
- char lockee[256];
- char lock_call_type_str[256];
-
- this = THIS;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (!priv->inodelk_trace) {
- return;
- }
-
- afr_print_inodelk (lock, 256, cmd, flock, frame->root->lk_owner);
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- gf_log (this->name, GF_LOG_NORMAL,
- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST",
- lock, lockee,
- (unsigned long long) int_lock->lock_number);
-
+int
+afr_entry_lockee_cmp(const void *l1, const void *l2)
+{
+ const afr_lockee_t *r1 = l1;
+ const afr_lockee_t *r2 = l2;
+ int ret = 0;
+ uuid_t gfid1 = {0};
+ uuid_t gfid2 = {0};
+
+ loc_gfid((loc_t *)&r1->loc, gfid1);
+ loc_gfid((loc_t *)&r2->loc, gfid2);
+ ret = gf_uuid_compare(gfid1, gfid2);
+ /*Entrylks with NULL basename are the 'smallest'*/
+ if (ret == 0) {
+ if (!r1->basename)
+ return -1;
+ if (!r2->basename)
+ return 1;
+ ret = strcmp(r1->basename, r2->basename);
+ }
+
+ if (ret <= 0)
+ return -1;
+ else
+ return 1;
}
-static void
-afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, const char *basename,
- int32_t child_index)
-{
- xlator_t *this = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
-
- char lock[256];
- char lockee[256];
- char lock_call_type_str[256];
-
- this = THIS;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (!priv->entrylk_trace) {
- return;
- }
-
- afr_print_entrylk (lock, 256, basename, frame->root->lk_owner);
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- gf_log (this->name, GF_LOG_NORMAL,
- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST",
- lock, lockee,
- (unsigned long long) int_lock->lock_number);
-}
+int
+afr_lock_blocking(call_frame_t *frame, xlator_t *this, int child_index);
-static void
-afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, const char *basename, int op_ret,
- int op_errno, int32_t child_index)
+void
+afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner)
{
- xlator_t *this = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- char lock[256];
- char lockee[256];
- char lock_call_type_str[256];
- char verdict[16];
-
- this = THIS;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (!priv->entrylk_trace) {
- return;
- }
-
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- afr_print_verdict (op_ret, op_errno, verdict);
-
- gf_log (this->name, GF_LOG_NORMAL,
- "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY",
- verdict,
- lock, lockee,
- (unsigned long long) int_lock->lock_number);
+ gf_msg_trace(this->name, 0, "Setting lk-owner=%llu",
+ (unsigned long long)(unsigned long)lk_owner);
+ set_lk_owner_from_ptr(&frame->root->lk_owner, lk_owner);
}
-static int
-transaction_lk_op (afr_local_t *local)
+int32_t
+internal_lock_count(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- int ret = -1;
-
- int_lock = &local->internal_lock;
-
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "lk op is for a transaction");
- ret = 1;
- }
- else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "lk op is for a self heal");
-
- ret = 0;
- }
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int32_t call_count = 0;
+ int i = 0;
- if (ret == -1)
- gf_log (THIS->name, GF_LOG_DEBUG,
- "lk op is not set");
+ local = frame->local;
+ priv = this->private;
- return ret;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i])
+ ++call_count;
+ }
+ return call_count;
}
-static int
-is_afr_lock_transaction (afr_local_t *local)
+int
+afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
+ int child_count)
{
- int ret = 0;
+ int ret = -ENOMEM;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count];
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- ret = 1;
- break;
+ GF_ASSERT(int_lock->lockee_count < AFR_LOCKEE_COUNT_MAX);
+ loc_copy(&lockee->loc, loc);
+ lockee->basename = (basename) ? gf_strdup(basename) : NULL;
+ if (basename && !lockee->basename)
+ goto out;
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- ret = 0;
- break;
+ lockee->locked_count = 0;
+ lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes),
+ gf_afr_mt_afr_node_character);
- }
+ if (!lockee->locked_nodes)
+ goto out;
- return ret;
+ ret = 0;
+ int_lock->lockee_count++;
+out:
+ if (ret) {
+ afr_lockee_cleanup(lockee);
+ }
+ return ret;
}
-static int
-initialize_entrylk_variables (call_frame_t *frame, xlator_t *this)
+int
+afr_add_inode_lockee(afr_local_t *local, int child_count)
{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
+ int ret = -ENOMEM;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count];
- int i = 0;
+ if (local->fd) {
+ lockee->fd = fd_ref(local->fd);
+ } else {
+ loc_copy(&lockee->loc, &local->loc);
+ }
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
+ lockee->locked_count = 0;
+ lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes),
+ gf_afr_mt_afr_node_character);
- int_lock->entrylk_lock_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
+ if (!lockee->locked_nodes)
+ goto out;
- for (i = 0; i < priv->child_count; i++) {
- int_lock->entry_locked_nodes[i] = 0;
- }
-
- return 0;
+ ret = 0;
+ int_lock->lockee_count++;
+out:
+ if (ret) {
+ afr_lockee_cleanup(lockee);
+ }
+ return ret;
}
static int
-initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
+initialize_internal_lock_variables(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
- int i = 0;
+ int i = 0;
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- int_lock->inodelk_lock_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
+ int_lock->lock_count = 0;
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_op_errno = 0;
+ int_lock->lk_attempted_count = 0;
- for (i = 0; i < priv->child_count; i++) {
- int_lock->inode_locked_nodes[i] = 0;
- }
+ for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) {
+ if (!int_lock->lockee[i].locked_nodes)
+ break;
+ int_lock->lockee[i].locked_count = 0;
+ memset(int_lock->lockee[i].locked_nodes, 0,
+ sizeof(*int_lock->lockee[i].locked_nodes) * priv->child_count);
+ }
- return 0;
-}
-
-loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2)
-{
- int ret = 0;
-
- ret = strcmp (l1->path, l2->path);
-
- if (ret == 0)
- ret = strcmp (b1, b2);
-
- if (ret <= 0)
- return l1;
- else
- return l2;
+ return 0;
}
int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count)
-
+afr_lockee_locked_nodes_count(afr_internal_lock_t *int_lock)
{
- int i;
- int call_count = 0;
+ int call_count = 0;
+ int i = 0;
- for (i = 0; i < child_count; i++) {
- if (locked_nodes[i] & LOCKED_YES)
- call_count++;
- }
-
- return call_count;
-}
-
-/* FIXME: What if UNLOCK fails */
-static int32_t
-afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- int call_count = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK (&frame->lock);
- {
- call_count = --int_lock->lk_call_count;
- }
- UNLOCK (&frame->lock);
-
- if (call_count == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "All internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- }
-
- return 0;
-}
-
-static int32_t
-afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION,
- AFR_UNLOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
-
- if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
- gf_log (this->name, GF_LOG_TRACE,
- "Unlock failed for some reason");
- }
-
- afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno);
-
- return 0;
-
-}
-
-static int
-afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- struct gf_flock flock;
- int call_count = 0;
- int i = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = F_UNLCK;
-
- call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes,
- priv->child_count);
-
- int_lock->lk_call_count = call_count;
-
- if (!call_count) {
- gf_log (this->name, GF_LOG_TRACE,
- "No internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->inode_locked_nodes[i] & LOCKED_YES) {
- if (local->fd) {
- afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION,
- AFR_UNLOCK_OP, &flock, F_SETLK, i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
- (void *) (long)i,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- this->name, local->fd,
- F_SETLK, &flock);
-
- if (!--call_count)
- break;
-
- } else {
- afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION,
- AFR_UNLOCK_OP, &flock, F_SETLK, i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
- (void *) (long)i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- this->name, &local->loc,
- F_SETLK, &flock);
-
- if (!--call_count)
- break;
+ for (i = 0; i < int_lock->lockee_count; i++)
+ call_count += int_lock->lockee[i].locked_count;
- }
-
- }
-
- }
-
-out:
- return 0;
+ return call_count;
}
-static int32_t
-afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_UNLOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
-
- afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno);
-
- return 0;
-}
+int
+afr_locked_nodes_count(unsigned char *locked_nodes, int child_count)
-static int
-afr_unlock_entrylk (call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
-
- int call_count = 0;
- int i = -1;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
-
- call_count = afr_locked_nodes_count (int_lock->entry_locked_nodes,
- priv->child_count);
- int_lock->lk_call_count = call_count;
-
- if (!call_count){
- gf_log (this->name, GF_LOG_TRACE,
- "No internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- goto out;
- }
+ int i = 0;
+ int call_count = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->entry_locked_nodes[i] & LOCKED_YES) {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_UNLOCK_OP, basename, i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- loc, basename,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
-
- if (!--call_count)
- break;
- }
- }
-
-out:
- return 0;
+ for (i = 0; i < child_count; i++) {
+ if (locked_nodes[i] & LOCKED_YES)
+ call_count++;
+ }
+ return call_count;
}
-static int32_t
-afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int done = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/posix-locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
- done = 1;
- }
-
- local->child_up[child_index] = 0;
- local->op_errno = op_errno;
- int_lock->lock_op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- if ((op_ret == -1) &&
- (op_errno == ENOSYS)) {
- afr_unlock (frame, this);
- } else {
- if (op_ret == 0) {
- int_lock->locked_nodes[child_index]
- |= LOCKED_YES;
- int_lock->lock_count++;
- }
- afr_lock_blocking (frame, this, child_index + 1);
- }
-
- return 0;
-}
-
-static int32_t
-afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static void
+afr_log_locks_failure(call_frame_t *frame, char *where, char *what,
+ int op_errno)
{
- afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
+ xlator_t *this = frame->this;
+ gf_lkowner_t *lk_owner = &frame->root->lk_owner;
+ afr_local_t *local = frame->local;
+ const char *fop = NULL;
+ char *gfid = NULL;
+ const char *name = NULL;
- afr_lock_cbk (frame, cookie, this, op_ret, op_errno);
- return 0;
+ fop = gf_fop_list[local->op];
+ switch (local->transaction.type) {
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ case AFR_ENTRY_TRANSACTION:
+ switch (local->op) {
+ case GF_FOP_LINK:
+ gfid = uuid_utoa(local->newloc.pargfid);
+ name = local->newloc.name;
+ break;
+ default:
+ gfid = uuid_utoa(local->loc.pargfid);
+ name = local->loc.name;
+ break;
+ }
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_INTERNAL_LKS_FAILED,
+ "Unable to do entry %s with lk-owner:%s on %s "
+ "while attempting %s on {pgfid:%s, name:%s}.",
+ what, lkowner_utoa(lk_owner), where, fop, gfid, name);
+ break;
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ gfid = uuid_utoa(local->inode->gfid);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_INTERNAL_LKS_FAILED,
+ "Unable to do inode %s with lk-owner:%s on %s "
+ "while attempting %s on gfid:%s.",
+ what, lkowner_utoa(lk_owner), where, fop, gfid);
+ break;
+ }
}
static int32_t
-afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_unlock_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *lower_name = NULL;
- const char *higher_name = NULL;
-
- int child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ int lockee_num = 0;
+ int call_count = 0;
+ int child_index = 0;
+ int ret = 0;
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
-
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/posix-locks xlator on server");
-
- local->op_ret = op_ret;
- }
-
- local->child_up[child_index] = 0;
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- if (op_ret != 0) {
- afr_unlock (frame, this);
- goto out;
- } else {
- int_lock->lower_locked_nodes[child_index] |= LOCKED_LOWER;
- int_lock->lock_count++;
- }
-
- /* The lower path has been locked. Now lock the higher path */
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+ lockee_num = (int)((long)cookie) / priv->child_count;
+ child_index = (int)((long)cookie) % priv->child_count;
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ afr_log_locks_failure(frame, priv->children[child_index]->name,
+ "unlock", op_errno);
+ }
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
+ int_lock->lockee[lockee_num].locked_nodes[child_index] &= LOCKED_NO;
+ if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1)
+ ret = afr_write_subvol_reset(frame, this);
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
+ LOCK(&frame->lock);
+ {
+ call_count = --int_lock->lk_call_count;
+ }
+ UNLOCK(&frame->lock);
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, higher_name, child_index);
+ if (call_count == 0) {
+ int_lock->lock_cbk(frame, this);
+ }
-
- STACK_WIND_COOKIE (frame, afr_lock_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name, higher, higher_name,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
-
-out:
- return 0;
+ return ret;
}
-static int32_t
-afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long)cookie);
-
- afr_lock_cbk (frame, cookie, this, op_ret, op_errno);
- return 0;
-}
-
-static int
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- memcpy (int_lock->inode_locked_nodes,
- int_lock->locked_nodes,
- priv->child_count);
- int_lock->inodelk_lock_count = int_lock->lock_count;
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
+void
+afr_internal_lock_wind(call_frame_t *frame,
+ int32_t (*cbk)(call_frame_t *, void *, xlator_t *,
+ int32_t, int32_t, dict_t *),
+ void *cookie, int child, int lockee_num,
+ gf_boolean_t blocking, gf_boolean_t unlock)
+{
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ entrylk_cmd cmd = ENTRYLK_LOCK_NB;
+ int32_t cmd1 = F_SETLK;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ switch (local->transaction.type) {
case AFR_ENTRY_TRANSACTION:
- memcpy (int_lock->entry_locked_nodes,
- int_lock->locked_nodes,
- priv->child_count);
- int_lock->entrylk_lock_count = int_lock->lock_count;
- break;
- }
-
- return 0;
-
-}
-
-int
-afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *lower_name = NULL;
- const char *higher_name = NULL;
-
- struct gf_flock flock;
- uint64_t ctx;
- int ret = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
-
- if (local->fd) {
- ret = fd_ctx_get (local->fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unable to get fd ctx for fd=%p",
- local->fd);
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
- local->op_errno = EINVAL;
- int_lock->lock_op_errno = EINVAL;
-
- afr_copy_locked_nodes (frame, this);
-
- afr_unlock (frame, this);
-
- return 0;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- /* skip over children that or down
- or don't have the fd open */
-
- while ((child_index < priv->child_count)
- && (!local->child_up[child_index]
- || !fd_ctx->opened_on[child_index]))
-
- child_index++;
- } else {
- /* skip over children that are down */
- while ((child_index < priv->child_count)
- && !local->child_up[child_index])
- child_index++;
- }
-
- if ((child_index == priv->child_count) &&
- int_lock->lock_count == 0) {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "unable to lock on even one child");
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
- local->op_errno = EAGAIN;
- int_lock->lock_op_errno = EAGAIN;
-
- afr_copy_locked_nodes (frame, this);
-
- afr_unlock(frame, this);
-
- return 0;
-
- }
-
- if ((child_index == priv->child_count)
- || (int_lock->lock_count ==
- afr_up_children_count (priv->child_count,
- local->child_up))) {
-
- /* we're done locking */
-
- gf_log (this->name, GF_LOG_DEBUG,
- "we're done locking");
-
- afr_copy_locked_nodes (frame, this);
-
- int_lock->lock_op_ret = 0;
- int_lock->lock_cbk (frame, this);
- return 0;
- }
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
-
- if (local->fd) {
- afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLKW,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->finodelk,
- this->name, local->fd,
- F_SETLKW, &flock);
-
- } else {
- afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLKW,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->inodelk,
- this->name, &local->loc,
- F_SETLKW, &flock);
- }
-
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, lower_name, child_index);
-
-
- STACK_WIND_COOKIE (frame, afr_lock_lower_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name, lower, lower_name,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
-
- break;
- }
-
- case AFR_ENTRY_TRANSACTION:
- if (local->fd) {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, local->transaction.basename,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->fentrylk,
- this->name, local->fd,
- local->transaction.basename,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
- } else {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, local->transaction.basename,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name,
- &local->transaction.parent_loc,
- local->transaction.basename,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
- }
-
- break;
- }
-
- return 0;
-
-
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ if (unlock) {
+ cmd = ENTRYLK_UNLOCK;
+ } else if (blocking) { /*Doesn't make sense to have blocking
+ unlock*/
+ cmd = ENTRYLK_LOCK;
+ }
+
+ if (local->fd) {
+ STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->fentrylk,
+ int_lock->domain,
+ int_lock->lockee[lockee_num].fd,
+ int_lock->lockee[lockee_num].basename, cmd,
+ ENTRYLK_WRLCK, NULL);
+ } else {
+ STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->entrylk,
+ int_lock->domain,
+ &int_lock->lockee[lockee_num].loc,
+ int_lock->lockee[lockee_num].basename, cmd,
+ ENTRYLK_WRLCK, NULL);
+ }
+ break;
+
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ flock = int_lock->lockee[lockee_num].flock;
+ if (unlock) {
+ flock.l_type = F_UNLCK;
+ } else if (blocking) { /*Doesn't make sense to have blocking
+ unlock*/
+ cmd1 = F_SETLKW;
+ }
+
+ if (local->fd) {
+ STACK_WIND_COOKIE(
+ frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->finodelk, int_lock->domain,
+ int_lock->lockee[lockee_num].fd, cmd1, &flock, NULL);
+ } else {
+ STACK_WIND_COOKIE(
+ frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->inodelk, int_lock->domain,
+ &int_lock->lockee[lockee_num].loc, cmd1, &flock, NULL);
+ }
+ break;
+ }
}
-int32_t
-afr_blocking_lock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- initialize_inodelk_variables (frame, this);
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- initialize_entrylk_variables (frame, this);
+static int
+afr_unlock_now(call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int child_index = 0;
+ int lockee_num = 0;
+ int i = -1;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ call_count = afr_lockee_locked_nodes_count(int_lock);
+
+ int_lock->lk_call_count = call_count;
+
+ if (!call_count) {
+ gf_msg_trace(this->name, 0, "No internal locks unlocked");
+ int_lock->lock_cbk(frame, this);
+ goto out;
+ }
+
+ for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
+ lockee_num = i / priv->child_count;
+ child_index = i % priv->child_count;
+ if (int_lock->lockee[lockee_num].locked_nodes[child_index] &
+ LOCKED_YES) {
+ afr_internal_lock_wind(frame, afr_unlock_common_cbk,
+ (void *)(long)i, child_index, lockee_num,
+ _gf_false, _gf_true);
+ if (!--call_count)
break;
}
+ }
- afr_lock_blocking (frame, this, 0);
-
- return 0;
+out:
+ return 0;
}
static int32_t
-afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
-
- LOCK (&frame->lock);
- {
- call_count = --int_lock->lk_call_count;
- }
- UNLOCK (&frame->lock);
-
- if (op_ret < 0 ) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/posix-locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
-
- local->child_up[child_index] = 0;
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- } else if (op_ret == 0) {
- int_lock->entry_locked_nodes[child_index]
- |= LOCKED_YES;
- int_lock->entrylk_lock_count++;
- }
-
- if (call_count == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "Last locking reply received");
- /* all locks successfull. Proceed to call FOP */
- if (int_lock->entrylk_lock_count ==
- afr_up_children_count (priv->child_count, local->child_up)) {
- gf_log (this->name, GF_LOG_TRACE,
- "All servers locked. Calling the cbk");
- int_lock->lock_op_ret = 0;
- int_lock->lock_cbk (frame, this);
- }
- /* Not all locks were successfull. Unlock and try locking
- again, this time with serially blocking locks */
- else {
- gf_log (this->name, GF_LOG_TRACE,
- "%d servers locked. Trying again with blocking calls",
- int_lock->lock_count);
-
- afr_unlock(frame, this);
+afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int cky = (long)cookie;
+ int child_index = 0;
+ int lockee_num = 0;
+
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ child_index = ((int)cky) % priv->child_count;
+ lockee_num = ((int)cky) / priv->child_count;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_msg(this->name, GF_LOG_ERROR, ENOSYS,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED,
+ "subvolume does not support locking. "
+ "please load features/locks xlator on server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+ }
+
+ local->op_errno = op_errno;
+ int_lock->lock_op_errno = op_errno;
+ }
+
+ int_lock->lk_attempted_count++;
+ }
+ UNLOCK(&frame->lock);
+
+ if ((op_ret == -1) && (op_errno == ENOSYS)) {
+ afr_unlock_now(frame, this);
+ } else {
+ if (op_ret == 0) {
+ int_lock->lockee[lockee_num]
+ .locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_num].locked_count++;
+ int_lock->lock_count++;
+ if (local->transaction.type == AFR_DATA_TRANSACTION) {
+ LOCK(&local->inode->lock);
+ {
+ local->inode_ctx->lock_count++;
}
+ UNLOCK(&local->inode->lock);
+ }
}
+ afr_lock_blocking(frame, this, cky + 1);
+ }
- return 0;
+ return 0;
}
-int
-afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+_is_lock_wind_needed(afr_local_t *local, int child_index)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
-
- int32_t call_count = 0;
- int i = 0;
- uint64_t ctx;
- int ret = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- initialize_entrylk_variables (frame, this);
-
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
-
- if (local->fd) {
- ret = fd_ctx_get (local->fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unable to get fd ctx for fd=%p",
- local->fd);
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
- local->op_errno = EINVAL;
- int_lock->lock_op_errno = EINVAL;
-
- return -1;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- call_count = internal_lock_count (frame, this, fd_ctx);
- int_lock->lk_call_count = call_count;
-
- if (!call_count) {
- gf_log (this->name, GF_LOG_DEBUG,
- "fd not open on any subvolumes. aborting.");
- afr_unlock (frame, this);
- goto out;
- }
-
- /* Send non-blocking entrylk calls only on up children
- and where the fd has been opened */
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] && fd_ctx->opened_on[i]) {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_LOCK_OP, basename, i);
-
- STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fentrylk,
- this->name, local->fd,
- basename,
- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
- }
- }
- } else {
- GF_ASSERT (loc);
-
- call_count = internal_lock_count (frame, this, NULL);
- int_lock->lk_call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_LOCK_OP, basename, i);
-
- STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name, loc, basename,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
-
- if (!--call_count)
- break;
+ if (!local->child_up[child_index])
+ return _gf_false;
- }
- }
- }
-out:
- return 0;
+ return _gf_true;
}
-int32_t
-afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static gf_boolean_t
+is_blocking_locks_count_sufficient(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ int child = 0;
+ int nlockee = 0;
+ int lockee_count = 0;
+ gf_boolean_t ret = _gf_true;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- afr_trace_inodelk_out (frame, AFR_INODELK_NB_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
-
- LOCK (&frame->lock);
- {
- call_count = --int_lock->lk_call_count;
- }
- UNLOCK (&frame->lock);
-
- if (op_ret < 0 ) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/posix-locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
- local->child_up[child_index] = 0;
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- } else if (op_ret == 0) {
- int_lock->inode_locked_nodes[child_index]
- |= LOCKED_YES;
- int_lock->inodelk_lock_count++;
- }
+ local = frame->local;
+ priv = this->private;
+ int_lock = &local->internal_lock;
+ lockee_count = int_lock->lockee_count;
- if (call_count == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "Last inode locking reply received");
- /* all locks successfull. Proceed to call FOP */
- if (int_lock->inodelk_lock_count ==
- afr_up_children_count (priv->child_count, local->child_up)) {
- gf_log (this->name, GF_LOG_TRACE,
- "All servers locked. Calling the cbk");
- int_lock->lock_op_ret = 0;
- int_lock->lock_cbk (frame, this);
- }
- /* Not all locks were successfull. Unlock and try locking
- again, this time with serially blocking locks */
- else {
- gf_log (this->name, GF_LOG_TRACE,
- "%d servers locked. Trying again with blocking calls",
- int_lock->lock_count);
-
- afr_unlock(frame, this);
- }
+ if (int_lock->lock_count == 0) {
+ afr_log_locks_failure(frame, "any subvolume", "lock",
+ int_lock->lock_op_errno);
+ return _gf_false;
+ }
+ /* For FOPS that take multiple sets of locks (mkdir, rename),
+ * there must be at least one brick on which the locks from
+ * all lock sets were successful. */
+ for (child = 0; child < priv->child_count; child++) {
+ ret = _gf_true;
+ for (nlockee = 0; nlockee < lockee_count; nlockee++) {
+ if (!(int_lock->lockee[nlockee].locked_nodes[child] & LOCKED_YES))
+ ret = _gf_false;
}
+ if (ret)
+ return ret;
+ }
+ if (!ret)
+ afr_log_locks_failure(frame, "all", "lock", int_lock->lock_op_errno);
- return 0;
+ return ret;
}
int
-afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
+afr_lock_blocking(call_frame_t *frame, xlator_t *this, int cookie)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- int32_t call_count = 0;
- uint64_t ctx = 0;
- int i = 0;
- int ret = 0;
- struct gf_flock flock;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
-
- initialize_inodelk_variables (frame, this);
-
- if (local->fd) {
- ret = fd_ctx_get (local->fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unable to get fd ctx for fd=%p",
- local->fd);
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
- local->op_errno = EINVAL;
- int_lock->lock_op_errno = EINVAL;
-
- ret = -1;
- goto out;
- }
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ uint64_t ctx = 0;
+ int ret = 0;
+ int child_index = 0;
+ int lockee_num = 0;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+ child_index = cookie % priv->child_count;
+ lockee_num = cookie / priv->child_count;
- call_count = internal_lock_count (frame, this, fd_ctx);
- int_lock->lk_call_count = call_count;
+ if (local->fd) {
+ ret = fd_ctx_get(local->fd, this, &ctx);
- if (!call_count) {
- gf_log (this->name, GF_LOG_DEBUG,
- "fd not open on any subvolumes. aborting.");
- afr_unlock (frame, this);
- goto out;
- }
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED,
+ "unable to get fd ctx for fd=%p", local->fd);
- /* Send non-blocking inodelk calls only on up children
- and where the fd has been opened */
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] && fd_ctx->opened_on[i]) {
- afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLK, i);
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
- STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- this->name, local->fd,
- F_SETLK, &flock);
+ afr_unlock_now(frame, this);
- if (!--call_count)
- break;
-
- }
-
- }
- } else {
- call_count = internal_lock_count (frame, this, NULL);
- int_lock->lk_call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLK, i);
-
- STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- this->name, &local->loc,
- F_SETLK, &flock);
-
- if (!--call_count)
- break;
-
- }
- }
+ return 0;
}
+ }
-out:
- return ret;
-}
-
-static int
-__is_lower_locked (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+ if (!is_blocking_locks_count_sufficient(frame, this)) {
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
- int count = 0;
- int i = 0;
+ afr_unlock_now(frame, this);
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER)
- count++;
+ return 0;
}
+ }
- return count;
-
-}
-
-static int
-__is_higher_locked (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+ /* we're done locking */
- int count = 0;
- int i = 0;
+ gf_msg_debug(this->name, 0, "we're done locking");
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->locked_nodes[i] & LOCKED_YES)
- count++;
- }
-
- return count;
-
-}
-
-static int
-afr_unlock_lower_entrylk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
-
- int call_count = 0;
- int i = -1;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
-
- call_count = __is_lower_locked (frame, this);
- int_lock->lk_call_count = call_count;
-
- if (!call_count){
- gf_log (this->name, GF_LOG_TRACE,
- "No internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) {
- afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_UNLOCK_OP, basename, i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- loc, basename,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
-
- if (!--call_count)
- break;
-
- }
- }
-
-out:
+ int_lock->lock_op_ret = 0;
+ int_lock->lock_cbk(frame, this);
return 0;
+ }
-}
-
-
-static int
-afr_post_unlock_higher_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.done (frame, this);
- return 0;
-}
-
-static int
-afr_post_unlock_lower_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *lower_name = NULL;
- const char *higher_name = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- if (__is_higher_locked (frame, this)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking higher");
- int_lock->lk_basename = higher_name;
- int_lock->lk_loc = higher;
- int_lock->lock_cbk = afr_post_unlock_higher_cbk;
-
- afr_unlock_entrylk (frame, this);
- } else
- local->transaction.done (frame, this);
-
- return 0;
-}
-
-static int
-afr_rename_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *lower_name = NULL;
- const char *higher_name = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
-
- if (__is_lower_locked (frame, this)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking lower");
- int_lock->lk_basename = lower_name;
- int_lock->lk_loc = lower;
- int_lock->lock_cbk = afr_post_unlock_lower_cbk;
-
- afr_unlock_lower_entrylk (frame, this);
- } else
- afr_post_unlock_lower_cbk (frame, this);
-
+ if (!_is_lock_wind_needed(local, child_index)) {
+ afr_lock_blocking(frame, this, cookie + 1);
return 0;
-}
-
-static int
-afr_rename_transaction (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
+ }
- return (local->transaction.type ==
- AFR_ENTRY_RENAME_TRANSACTION);
+ afr_internal_lock_wind(frame, afr_lock_cbk, (void *)(long)cookie,
+ child_index, lockee_num, _gf_true, _gf_false);
+ return 0;
}
int32_t
-afr_unlock (call_frame_t *frame, xlator_t *this)
+afr_blocking_lock(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- if (transaction_lk_op (local)) {
- if (is_afr_lock_transaction (local))
- afr_unlock_inodelk (frame, this);
- else
- if (!afr_rename_transaction (frame, this))
- afr_unlock_entrylk (frame, this);
- else
- afr_rename_unlock (frame, this);
- } else {
- if (is_afr_lock_selfheal (local))
- afr_unlock_inodelk (frame, this);
- else
- afr_unlock_entrylk (frame, this);
- }
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int up_count = 0;
- return 0;
-}
-
-int
-afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
- unsigned char *locked_nodes)
-{
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- priv = this->private;
+ up_count = AFR_COUNT(local->child_up, priv->child_count);
+ int_lock->lk_call_count = int_lock->lk_expected_count =
+ (int_lock->lockee_count * up_count);
+ initialize_internal_lock_variables(frame, this);
- afr_fd_ctx_set (this, fd);
- if (ret < 0)
- goto out;
+ afr_lock_blocking(frame, this, 0);
- ret = fd_ctx_get (fd, this, &tmp);
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- GF_ASSERT (fdctx->locked_on);
-
- memcpy (fdctx->locked_on, locked_nodes,
- priv->child_count);
-
-out:
- return ret;
+ return 0;
}
-static int
-__is_fd_saved (xlator_t *this, fd_t *fd)
+static int32_t
+afr_nb_internal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_locked_fd_t *locked_fd = NULL;
- afr_private_t *priv = NULL;
- int found = 0;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ int child_index = 0;
+ int lockee_num = 0;
+ afr_private_t *priv = NULL;
- priv = this->private;
+ priv = this->private;
- list_for_each_entry (locked_fd, &priv->saved_fds, list) {
- if (locked_fd->fd == fd) {
- found = 1;
- break;
- }
- }
+ child_index = ((long)cookie) % priv->child_count;
+ lockee_num = ((long)cookie) / priv->child_count;
- return found;
-}
+ local = frame->local;
+ int_lock = &local->internal_lock;
-static int
-__afr_save_locked_fd (xlator_t *this, fd_t *fd)
-{
- afr_private_t *priv = NULL;
- afr_locked_fd_t *locked_fd = NULL;
- int ret = 0;
-
- priv = this->private;
-
- locked_fd = GF_CALLOC (1, sizeof (*locked_fd),
- gf_afr_mt_locked_fd);
- if (!locked_fd) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- locked_fd->fd = fd;
- INIT_LIST_HEAD (&locked_fd->list);
-
- list_add_tail (&locked_fd->list, &priv->saved_fds);
-
-out:
- return ret;
-}
-
-int
-afr_save_locked_fd (xlator_t *this, fd_t *fd)
-{
- afr_private_t *priv = NULL;
- int ret = 0;
-
- priv = this->private;
-
- pthread_mutex_lock (&priv->mutex);
+ if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) {
+ LOCK(&local->inode->lock);
{
- if (__is_fd_saved (this, fd)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "fd=%p already saved", fd);
- goto unlock;
- }
-
- ret = __afr_save_locked_fd (this, fd);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "fd=%p could not be saved");
- goto unlock;
- }
+ local->inode_ctx->lock_count++;
}
-unlock:
- pthread_mutex_unlock (&priv->mutex);
-
- return ret;
-}
-
-static int
-afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_locked_fd_t *locked_fd = NULL;
-
- local = frame->local;
-
- locked_fd = local->locked_fd;
-
- STACK_DESTROY (frame->root);
- afr_local_cleanup (local, this);
-
- afr_save_locked_fd (this, locked_fd->fd);
-
- return 0;
-
-}
-
-static int
-afr_get_source_lock_recovery (xlator_t *this, fd_t *fd)
-{
- afr_fd_ctx_t *fdctx = NULL;
- afr_private_t *priv = NULL;
- uint64_t tmp = 0;
- int i = 0;
- int source_child = -1;
- int ret = 0;
-
- priv = this->private;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- for (i = 0; i < priv->child_count; i++) {
- if (fdctx->locked_on[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Found lock recovery source=%d",
- i);
- source_child = i;
- break;
- }
-
- }
-
-out:
- return source_child;
-
-}
+ UNLOCK(&local->inode->lock);
+ }
-int32_t
-afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock);
-int32_t
-afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int32_t source_child = 0;
- struct gf_flock flock = {0,};
-
- local = frame->local;
- priv = this->private;
-
- if (op_ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "lock recovery failed");
- goto cleanup;
+ LOCK(&frame->lock);
+ {
+ if (op_ret < 0) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_msg(this->name, GF_LOG_ERROR, ENOSYS,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED,
+ "subvolume does not support "
+ "locking. please load features/locks"
+ " xlator on server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+
+ int_lock->lock_op_errno = op_errno;
+ local->op_errno = op_errno;
+ }
+ } else if (op_ret == 0) {
+ int_lock->lockee[lockee_num]
+ .locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_num].locked_count++;
+ int_lock->lock_count++;
}
- source_child = local->source_child;
-
- memcpy (&flock, lock, sizeof (*lock));
-
- STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk,
- (void *) (long) source_child,
- priv->children[source_child],
- priv->children[source_child]->fops->lk,
- local->fd, F_GETLK_FD, &flock);
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
- return 0;
-}
-
-int
-afr_recover_lock (call_frame_t *frame, xlator_t *this,
- struct gf_flock *flock)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int32_t lock_recovery_child = 0;
-
- priv = this->private;
- local = frame->local;
-
- lock_recovery_child = local->lock_recovery_child;
+ call_count = --int_lock->lk_call_count;
+ }
+ UNLOCK(&frame->lock);
- frame->root->lk_owner = flock->l_owner;
-
- STACK_WIND_COOKIE (frame, afr_recover_lock_cbk,
- (void *) (long) lock_recovery_child,
- priv->children[lock_recovery_child],
- priv->children[lock_recovery_child]->fops->lk,
- local->fd, F_SETLK, flock);
-
- return 0;
-}
-
-static int
-is_afr_lock_eol (struct gf_flock *lock)
-{
- int ret = 0;
-
- if ((lock->l_type == GF_LK_EOL))
- ret = 1;
-
- return ret;
-}
-
-int32_t
-afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
-{
- if (op_ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Failed to get locks on fd");
- goto cleanup;
+ if (call_count == 0) {
+ gf_msg_trace(this->name, 0, "Last locking reply received");
+ /* all locks successful. Proceed to call FOP */
+ if (int_lock->lock_count == int_lock->lk_expected_count) {
+ gf_msg_trace(this->name, 0, "All servers locked. Calling the cbk");
+ int_lock->lock_op_ret = 0;
+ int_lock->lock_cbk(frame, this);
}
+ /* Not all locks were successful. Unlock and try locking
+ again, this time with serially blocking locks */
+ else {
+ gf_msg_trace(this->name, 0,
+ "%d servers locked. Trying again "
+ "with blocking calls",
+ int_lock->lock_count);
- gf_log (this->name, GF_LOG_DEBUG,
- "Got a lock on fd");
-
- if (is_afr_lock_eol (lock)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reached EOL on locks on fd");
- goto cleanup;
+ afr_unlock_now(frame, this);
}
+ }
- afr_recover_lock (frame, this, lock);
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
-
- return 0;
+ return 0;
}
-static int
-afr_lock_recovery (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- int ret = 0;
- int32_t source_child = 0;
- struct gf_flock flock = {0,};
-
- priv = this->private;
- local = frame->local;
-
- fd = local->fd;
-
- source_child = afr_get_source_lock_recovery (this, fd);
- if (source_child < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not recover locks due to lock "
- "split brain");
- ret = -1;
- goto out;
+int
+afr_lock_nonblocking(call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int child = 0;
+ int lockee_num = 0;
+ int32_t call_count = 0;
+ int i = 0;
+ int ret = 0;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ initialize_internal_lock_variables(frame, this);
+
+ if (local->fd) {
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED,
+ "unable to get fd ctx for fd=%p", local->fd);
+
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
+ local->op_errno = EINVAL;
+ int_lock->lock_op_errno = EINVAL;
+
+ afr_unlock_now(frame, this);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ call_count = int_lock->lockee_count * internal_lock_count(frame, this);
+ int_lock->lk_call_count = call_count;
+ int_lock->lk_expected_count = call_count;
+
+ if (!call_count) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON,
+ "fd not open on any subvolumes. aborting.");
+ afr_unlock_now(frame, this);
+ goto out;
+ }
+
+ /* Send non-blocking lock calls only on up children
+ and where the fd has been opened */
+ for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
+ child = i % priv->child_count;
+ lockee_num = i / priv->child_count;
+ if (local->child_up[child]) {
+ afr_internal_lock_wind(frame, afr_nb_internal_lock_cbk,
+ (void *)(long)i, child, lockee_num,
+ _gf_false, _gf_false);
+ if (!--call_count)
+ break;
}
-
- local->source_child = source_child;
-
- /* the flock can be zero filled as we're querying incrementally
- the locks held on the fd.
- */
- STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk,
- (void *) (long) source_child,
- priv->children[source_child],
- priv->children[source_child]->fops->lk,
- local->fd, F_GETLK_FD, &flock);
-
+ }
out:
- return ret;
-}
-
-
-static int
-afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index)
-{
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- fdctx->opened_on[child_index] = 1;
-
-out:
- return ret;
+ return ret;
}
int32_t
-afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- int32_t child_index = (long )cookie;
- int ret = 0;
-
- if (op_ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reopen during lock-recovery failed");
- goto cleanup;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Open succeeded => proceed to recover locks");
-
- ret = afr_lock_recovery (frame, this);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Lock recovery failed");
- goto cleanup;
- }
-
- ret = afr_mark_fd_opened (this, fd, child_index);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Marking fd open failed");
- goto cleanup;
- }
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
- return 0;
-}
-
-static int
-afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- uint64_t tmp = 0;
- afr_fd_ctx_t *fdctx = NULL;
- loc_t loc = {0,};
- int32_t child_index = 0;
- int ret = 0;
-
- priv = this->private;
- local = frame->local;
-
- GF_ASSERT (local && local->fd);
-
- ret = fd_ctx_get (local->fd, this, &tmp);
- fdctx = (afr_fd_ctx_t *) (long) tmp;
- GF_ASSERT (fdctx);
-
- child_index = local->lock_recovery_child;
-
- inode_path (local->fd->inode, NULL, (char **)&loc.path);
- loc.name = strrchr (loc.path, '/');
- loc.inode = inode_ref (local->fd->inode);
- loc.parent = inode_parent (local->fd->inode, 0, NULL);
-
-
- STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk,
- (void *)(long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->open,
- &loc, fdctx->flags, local->fd,
- fdctx->wbflags);
-
+afr_unlock(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+
+ local = frame->local;
+
+ if (!local->transaction.eager_lock_on)
+ goto out;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ list_del_init(&local->transaction.owner_list);
+ if (list_empty(&lock->owners) && list_empty(&lock->post_op)) {
+ local->transaction.do_eager_unlock = _gf_true;
+ /*TODO: Need to get metadata use on_disk and inherit/uninherit
+ *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]);
+ *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]);
+ */
+ GF_ASSERT(lock->release);
+ }
+ }
+ UNLOCK(&local->inode->lock);
+ if (!local->transaction.do_eager_unlock) {
+ local->internal_lock.lock_cbk(frame, this);
return 0;
-}
-
-static int
-is_fd_opened (fd_t *fd, int32_t child_index)
-{
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
-
- ret = fd_ctx_get (fd, THIS, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- if (fdctx->opened_on[child_index])
- ret = 1;
-
-out:
- return ret;
-}
-
-int
-afr_attempt_lock_recovery (xlator_t *this, int32_t child_index)
-{
- call_frame_t *frame = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_locked_fd_t *locked_fd = NULL;
- afr_locked_fd_t *tmp = NULL;
- int ret = 0;
- struct list_head locks_list;
-
-
- priv = this->private;
-
- if (list_empty (&priv->saved_fds))
- goto out;
-
- frame = create_frame (this, this->ctx->pool);
- if (!frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- local = GF_CALLOC (1, sizeof (*local),
- gf_afr_mt_afr_local_t);
- if (!local) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- AFR_LOCAL_INIT (local, priv);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- frame->local = local;
-
- INIT_LIST_HEAD (&locks_list);
-
- pthread_mutex_lock (&priv->mutex);
- {
- list_splice_init (&priv->saved_fds, &locks_list);
- }
- pthread_mutex_unlock (&priv->mutex);
-
- list_for_each_entry_safe (locked_fd, tmp,
- &locks_list, list) {
-
- list_del_init (&locked_fd->list);
-
- local->fd = fd_ref (locked_fd->fd);
- local->lock_recovery_child = child_index;
- local->locked_fd = locked_fd;
-
- if (!is_fd_opened (locked_fd->fd, child_index)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "attempting open before lock "
- "recovery");
- afr_lock_recovery_preopen (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "attempting lock recovery "
- "without a preopen");
- afr_lock_recovery (frame, this);
- }
- }
+ }
out:
- return ret;
+ afr_unlock_now(frame, this);
+ return 0;
}
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index 14064ebcd76..816065fb57a 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -1,48 +1,38 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef __AFR_MEM_TYPES_H__
#define __AFR_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_afr_mem_types_ {
- gf_afr_mt_iovec = gf_common_mt_end + 1,
- gf_afr_mt_afr_fd_ctx_t,
- gf_afr_mt_afr_local_t,
- gf_afr_mt_afr_private_t,
- gf_afr_mt_int32_t,
- gf_afr_mt_char,
- gf_afr_mt_xattr_key,
- gf_afr_mt_dict_t,
- gf_afr_mt_xlator_t,
- gf_afr_mt_iatt,
- gf_afr_mt_int,
- gf_afr_mt_afr_node_character,
- gf_afr_mt_sh_diff_loop_state,
- gf_afr_mt_uint8_t,
- gf_afr_mt_loc_t,
- gf_afr_mt_entry_name,
- gf_afr_mt_pump_priv,
- gf_afr_mt_locked_fd,
- gf_afr_mt_end
+ gf_afr_mt_afr_fd_ctx_t = gf_common_mt_end + 1,
+ gf_afr_mt_afr_private_t,
+ gf_afr_mt_int32_t,
+ gf_afr_mt_char,
+ gf_afr_mt_xattr_key,
+ gf_afr_mt_dict_t,
+ gf_afr_mt_xlator_t,
+ gf_afr_mt_afr_node_character,
+ gf_afr_mt_inode_ctx_t,
+ gf_afr_mt_shd_event_t,
+ gf_afr_mt_reply_t,
+ gf_afr_mt_subvol_healer_t,
+ gf_afr_mt_spbc_timeout_t,
+ gf_afr_mt_spb_status_t,
+ gf_afr_mt_empty_brick_t,
+ gf_afr_mt_child_latency_t,
+ gf_afr_mt_atomic_t,
+ gf_afr_mt_lk_heal_info_t,
+ gf_afr_mt_gf_lock,
+ gf_afr_mt_end
};
#endif
-
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
new file mode 100644
index 00000000000..e73fd997765
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -0,0 +1,167 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _AFR_MESSAGES_H_
+#define _AFR_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+ AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE,
+ AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN,
+ AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL,
+ AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL,
+ AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS,
+ AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED,
+ AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED,
+ AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO,
+ AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA,
+ AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED,
+ AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED,
+ AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG,
+ AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB,
+ AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED,
+ AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, AFR_MSG_UNKNOWN_SET,
+ AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START,
+ AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT,
+ AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED,
+ AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG,
+ AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED,
+ AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH,
+ AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED,
+ AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA,
+ SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE,
+ SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED,
+ AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED,
+ AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED,
+ AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED,
+ AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED,
+ AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN,
+ AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN,
+ AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP,
+ AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+ AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR,
+ AFR_MSG_INTERNAL_ATTR);
+
+#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed"
+#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed"
+#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed"
+#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed"
+#define AFR_MSG_INVALID_ARG_STR "Invalid argument"
+#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on "
+#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file"
+#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict."
+#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed."
+#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR \
+ "Failed to populate loc for thin-arbiter"
+#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending"
+#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the god shd. skipping."
+#define AFR_MSG_UNKNOWN_SET_STR "Unknown set"
+#define AFR_MSG_NO_XL_ID_STR "xl does not have id"
+#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on"
+#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on"
+#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter."
+#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output"
+#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time"
+#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict"
+#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR \
+ "No majority to resolve gfid split brain"
+#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch dectected"
+#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal"
+#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch"
+#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR \
+ "Size policy is not applicable to directories."
+#define AFR_MSG_NO_CHILD_SELECTED_STR \
+ "No child selected by favorite-child policy"
+#define AFR_MSG_INVALID_CHILD_STR "Invalid child"
+#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR \
+ "selected as authentic to resolve conflicting data"
+#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick"
+#define SNO_DIFF_IN_MTIME_STR "No difference in mtime"
+#define SNO_BIGGER_FILE_STR "No bigger file"
+#define SALL_BRICKS_UP_TO_RESOLVE_STR \
+ "All the bricks should be up to resolve the gfid split brain"
+#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock"
+#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed"
+#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame"
+#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop"
+#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed"
+#define AFR_MSG_FSYNC_FAILED_STR "fsync failed"
+#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met"
+#define AFR_MSG_FOP_FAILED_STR "Failing Fop"
+#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume"
+#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling"
+#define AFR_MSG_CHILD_MISCONFIGURED_STR \
+ "replicate translator needs more than one subvolume defined"
+#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads"
+#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count"
+#define AFR_MSG_UNABLE_TO_FETCH_STR \
+ "Unable to fetch afr-pending-xattr option from volfile. Falling back to " \
+ "using client translator names"
+#define AFR_MSG_NULL_DEREF_STR "possible NULL deref"
+#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key"
+#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask"
+#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up"
+#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR \
+ "Failed to cancel split-brain choice"
+#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR \
+ "Cannot set replica. File is not in data/metadata split-brain"
+#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx"
+#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols"
+#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child"
+#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file"
+#define AFR_MSG_TIMER_CREATE_FAIL_STR \
+ "Cannot create timer for delayed initialization"
+#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online"
+#define AFR_MSG_ALL_SUBVOLS_DOWN_STR \
+ "All subvolumes are down. Going offline until atleast one of them is up"
+#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock"
+#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume"
+#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met"
+#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir"
+#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid"
+#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative mergeon the file"
+#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain"
+#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down"
+#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask"
+#define AFR_MSG_READ_SUBVOL_NOT_UP_STR \
+ "read subvolume in this generation is not up"
+#define AFR_MSG_INTERNAL_LKS_FAILED_STR \
+ "Unable to work with lk-owner while attempting fop"
+#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR \
+ "subvolume does not support locking. please load features/locks xlator " \
+ "on server."
+#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx"
+#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting."
+#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child."
+#define AFR_MSG_NEW_BRICK_STR "New brick"
+#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR \
+ "Failed to set split-brain choice to -1"
+#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR \
+ "Failed to determine split-brain. Aborting split-brain-choice set"
+#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume"
+#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr"
+#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute"
+#endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index 44ba22ee77a..64856042b65 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -1,642 +1,353 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "statedump.h"
-
-#include "fd.h"
-
-#include "afr-inode-read.h"
-#include "afr-inode-write.h"
-#include "afr-dir-read.h"
-#include "afr-dir-write.h"
-#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-
-int
-afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- afr_local_t * local = frame->local;
-
- AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
- local->fd);
- return 0;
-}
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+#include "afr-transaction.h"
-int
-afr_open_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd)
+gf_boolean_t
+afr_is_fd_fixable(fd_t *fd)
{
- afr_local_t * local = NULL;
-
- int child_index = (long) cookie;
-
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx;
-
- int ret = 0;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- local->success_count++;
-
- ret = afr_fd_ctx_set (this, fd);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not set fd ctx for fd=%p",
- fd);
-
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not get fd ctx for fd=%p", fd);
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->opened_on[child_index] = 1;
- fd_ctx->flags = local->cont.open.flags;
- fd_ctx->wbflags = local->cont.open.wbflags;
- }
- }
-unlock:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if ((local->cont.open.flags & O_TRUNC)
- && (local->op_ret >= 0)) {
- STACK_WIND (frame, afr_open_ftruncate_cbk,
- this, this->fops->ftruncate,
- fd, 0);
- } else {
- AFR_STACK_UNWIND (open, frame, local->op_ret,
- local->op_errno, local->fd);
- }
- }
-
- return 0;
+ if (!fd || !fd->inode)
+ return _gf_false;
+ else if (fd_is_anonymous(fd))
+ return _gf_false;
+ else if (gf_uuid_is_null(fd->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
}
-
int
-afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, int32_t wbflags)
+afr_open_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
-
- int i = 0;
- int ret = -1;
-
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t wind_flags = flags & (~O_TRUNC);
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
-
- if (afr_is_split_brain (this, loc->inode)) {
- /* self-heal failed */
- op_errno = EIO;
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- frame->local = local;
- call_count = local->call_count;
-
- loc_copy (&local->loc, loc);
-
- local->cont.open.flags = flags;
- local->cont.open.wbflags = wbflags;
-
- local->fd = fd_ref (fd);
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- loc, wind_flags, fd, wbflags);
-
- if (!--call_count)
- break;
- }
- }
+ afr_local_t *local = frame->local;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd);
- }
-
- return 0;
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno,
+ local->cont.open.fd, xdata);
+ return 0;
}
-
int
-afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
+afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int ret = 0;
-
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx;
-
- int call_count = 0;
- int child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK (&frame->lock);
- {
- if (op_ret >= 0) {
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0) {
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->opened_on[child_index] = 1;
-
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened successfully on subvolume %s",
- local->loc.path, priv->children[child_index]->name);
- }
- }
-out:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- int_lock->lock_cbk = local->transaction.done;
- local->transaction.resume (frame, this);
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = (long)cookie;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
+ local->op_ret = op_ret;
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
}
-
- return 0;
-}
-
-
-static int
-__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up)
-{
- int i;
- int count = 0;
-
- for (i = 0; i < child_count; i++) {
- if (!opened_on[i] && child_up[i])
- count++;
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
+
+ if (call_count == 0) {
+ afr_handle_replies_quorum(frame, this);
+ if (local->op_ret == -1) {
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
+ } else if (fd_ctx->flags & O_TRUNC) {
+ STACK_WIND(frame, afr_open_ftruncate_cbk, this,
+ this->fops->ftruncate, fd, 0, NULL);
+ } else {
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno,
+ local->cont.open.fd, local->xdata_rsp);
}
+ }
- return count;
+ return 0;
}
-
int
-afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this)
+afr_open_continue(call_frame_t *frame, xlator_t *this, int err)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx;
-
- int abandon = 0;
- int ret = 0;
- int i;
- int call_count = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
- priv = this->private;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- /*
- * Some subvolumes might have come up on which we never
- * opened this fd in the first place. Re-open fd's on those
- * subvolumes now.
- */
-
- ret = fd_ctx_get (local->fd, this, &ctx);
-
- if (ret < 0) {
- abandon = 1;
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- LOCK (&local->fd->lock);
- {
- call_count = __unopened_count (priv->child_count,
- fd_ctx->opened_on,
- local->child_up);
- for (i = 0; i < priv->child_count; i++) {
- fd_ctx->pre_op_done[i] = 0;
- fd_ctx->pre_op_piggyback[i] = 0;
- }
- }
- UNLOCK (&local->fd->lock);
-
- if (call_count == 0) {
- abandon = 1;
- goto out;
- }
-
- local->call_count = call_count;
+ if (err) {
+ AFR_STACK_UNWIND(open, frame, -1, err, NULL, NULL);
+ } else {
+ local->call_count = AFR_COUNT(local->child_up, priv->child_count);
+ call_count = local->call_count;
for (i = 0; i < priv->child_count; i++) {
- if (!fd_ctx->opened_on[i] && local->child_up[i]) {
- gf_log (this->name, GF_LOG_TRACE,
- "opening fd for %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk,
- (void *)(long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc, fd_ctx->flags, local->fd,
- fd_ctx->wbflags);
-
- if (!--call_count)
- break;
- }
- }
-
-out:
- if (abandon)
- local->transaction.resume (frame, this);
-
- return 0;
-}
-
-
-static int
-afr_prepare_loc (call_frame_t *frame, fd_t *fd)
-{
- afr_local_t *local = NULL;
- char *name = NULL;
- char *path = NULL;
- int ret = 0;
-
- if (!fd)
- return -1;
-
- local = frame->local;
- ret = inode_path (fd->inode, NULL, (char **)&path);
- if (ret == -1)
- return -1;
-
- if (local->loc.path) {
- if (strcmp (path, local->loc.path))
- gf_log (frame->this->name, GF_LOG_DEBUG,
- "overwriting old loc->path %s with %s",
- local->loc.path, path);
- GF_FREE ((char *)local->loc.path);
- }
- local->loc.path = path;
-
- name = strrchr (local->loc.path, '/');
- if (name)
- name++;
- local->loc.name = name;
-
- if (local->loc.inode) {
- inode_unref (local->loc.inode);
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, afr_open_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->open, &local->loc,
+ (local->cont.open.flags & ~O_TRUNC),
+ local->cont.open.fd, local->xdata_req);
+ if (!--call_count)
+ break;
+ }
}
- local->loc.inode = inode_ref (fd->inode);
-
- if (local->loc.parent) {
- inode_unref (local->loc.parent);
- }
-
- local->loc.parent = inode_parent (local->loc.inode, 0, NULL);
-
- return 0;
-}
-
-
-int
-afr_openfd_sh (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- char sh_type_str[256] = {0,};
-
- local = frame->local;
- sh = &local->self_heal;
-
- afr_prepare_loc (frame, local->fd);
-
- /* forcibly trigger missing-entries self-heal */
-
- local->success_count = 1;
- local->enoent_count = 1;
-
- sh->data_lock_held = _gf_true;
- sh->need_data_self_heal = _gf_true;
- sh->type = local->fd->inode->ia_type;
- sh->background = _gf_false;
- sh->unwind = afr_openfd_sh_unwind;
-
- afr_self_heal_type_str_get(&local->self_heal,
- sh_type_str,
- sizeof(sh_type_str));
- gf_log (this->name, GF_LOG_NORMAL, "%s self-heal triggered. "
- "path: %s, reason: Replicate up down flush, data lock is held",
- sh_type_str, local->loc.path);
-
- afr_self_heal (frame, this);
-
- return 0;
+ }
+ return 0;
}
-
int
-afr_openfd_flush_done (call_frame_t *frame, xlator_t *this)
+afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- int _ret = -1;
-
- priv = this->private;
- local = frame->local;
-
- LOCK (&local->fd->lock);
- {
- _ret = __fd_ctx_get (local->fd, this, &ctx);
-
- if (_ret < 0) {
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->down_count = priv->down_count;
- fd_ctx->up_count = priv->up_count;
- }
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int spb_subvol = 0;
+ int event_generation = 0;
+ int ret = 0;
+ int32_t op_errno = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ // We can't let truncation to happen outside transaction.
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_OPEN;
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ op_errno = afr_quorum_errno(priv);
+ goto out;
+ }
+
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+
+ local->inode = inode_ref(loc->inode);
+ loc_copy(&local->loc, loc);
+ local->fd_ctx = fd_ctx;
+ fd_ctx->flags = flags;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ local->cont.open.flags = flags;
+ local->cont.open.fd = fd_ref(fd);
+
+ ret = afr_inode_get_readable(frame, local->inode, this, NULL,
+ &event_generation, AFR_DATA_TRANSACTION);
+ if ((ret < 0) &&
+ (afr_split_brain_read_subvol_get(local->inode, this, NULL,
+ &spb_subvol) == 0) &&
+ spb_subvol < 0) {
+ afr_inode_refresh(frame, this, local->inode, local->inode->gfid,
+ afr_open_continue);
+ } else {
+ afr_open_continue(frame, this, 0);
+ }
+
+ return 0;
out:
- UNLOCK (&local->fd->lock);
-
- afr_local_transaction_cleanup (local, this);
-
- gf_log (this->name, GF_LOG_TRACE,
- "The up/down flush is over");
-
- fd_unref (local->fd);
- local->openfd_flush_cbk (frame, this);
+ AFR_STACK_UNWIND(open, frame, -1, op_errno, fd, NULL);
- return 0;
+ return 0;
}
-
-
int
-afr_openfd_xaction (call_frame_t *frame, xlator_t *this, fd_t *fd)
+afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
{
- afr_local_t * local = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- local = frame->local;
-
- local->op = GF_FOP_FLUSH;
-
- local->transaction.fop = afr_openfd_sh;
- local->transaction.done = afr_openfd_flush_done;
-
- local->transaction.start = 0;
- local->transaction.len = 0;
-
- gf_log (this->name, GF_LOG_TRACE,
- "doing up/down flush on fd=%p",
- fd);
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int call_count = 0;
+ int child_index = (long)cookie;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (op_ret >= 0) {
+ gf_msg_debug(this->name, 0,
+ "fd for %s opened "
+ "successfully on subvolume %s",
+ local->loc.path, priv->children[child_index]->name);
+ } else {
+ gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno,
+ AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s",
+ priv->children[child_index]->name, NULL);
+ }
+
+ fd_ctx = local->fd_ctx;
+
+ LOCK(&local->fd->lock);
+ {
+ if (op_ret >= 0) {
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ } else {
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ }
+ }
+ UNLOCK(&local->fd->lock);
- afr_transaction (frame, this, AFR_DATA_TRANSACTION);
+ call_count = afr_frame_return(frame);
+ if (call_count == 0)
+ AFR_STACK_DESTROY(frame);
-out:
- return 0;
+ return 0;
}
-
-
-int
-afr_openfd_xaction_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
+static int
+afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int ret = 0;
-
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int count = 0;
- int call_count = 0;
- int child_index = (long) cookie;
+ priv = this->private;
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK (&frame->lock);
- {
- if (op_ret >= 0) {
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0) {
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->opened_on[child_index] = 1;
-
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened successfully on subvolume %s",
- local->loc.path, priv->children[child_index]->name);
- }
- }
-out:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx)
+ return 0;
- if (call_count == 0) {
- afr_openfd_xaction (frame, this, local->fd);
+ LOCK(&fd->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED &&
+ priv->child_up[i]) {
+ fd_ctx->opened_on[i] = AFR_FD_OPENING;
+ need_open[i] = 1;
+ count++;
+ } else {
+ need_open[i] = 0;
+ }
}
+ }
+ UNLOCK(&fd->lock);
- return 0;
+ return count;
}
-
-int
-afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+void
+afr_fix_open(fd_t *fd, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- uint64_t ctx;
- afr_fd_ctx_t *fd_ctx;
-
- int no_open = 0;
- int ret = 0;
- int i;
- int call_count = 0;
-
- priv = this->private;
- local = frame->local;
-
- /*
- * Some subvolumes might have come up on which we never
- * opened this fd in the first place. Re-open fd's on those
- * subvolumes now.
- */
-
- local->fd = fd_ref (fd);
-
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0) {
- no_open = 1;
- goto out;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int32_t op_errno = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ unsigned char *need_open = NULL;
+ int call_count = 0;
+
+ priv = this->private;
+
+ if (!afr_is_fd_fixable(fd))
+ goto out;
+
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx)
+ goto out;
+
+ need_open = alloca0(priv->child_count);
+
+ call_count = afr_fd_ctx_need_open(fd, this, need_open);
+ if (!call_count)
+ goto out;
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->loc.inode = inode_ref(fd->inode);
+ ret = loc_path(&local->loc, NULL);
+ if (ret < 0)
+ goto out;
+
+ local->fd = fd_ref(fd);
+ local->fd_ctx = fd_ctx;
+
+ local->call_count = call_count;
+
+ gf_msg_debug(this->name, 0, "need open count: %d", call_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!need_open[i])
+ continue;
+
+ if (IA_IFDIR == fd->inode->ia_type) {
+ gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->opendir, &local->loc,
+ local->fd, NULL);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "opening fd for file %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->open,
+ &local->loc, fd_ctx->flags & (~O_TRUNC),
+ local->fd, NULL);
}
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- LOCK (&local->fd->lock);
- {
- call_count = __unopened_count (priv->child_count,
- fd_ctx->opened_on,
- local->child_up);
- }
- UNLOCK (&local->fd->lock);
-
- if (call_count == 0) {
- no_open = 1;
- goto out;
- }
-
- afr_prepare_loc (frame, fd);
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!fd_ctx->opened_on[i] && local->child_up[i]) {
- gf_log (this->name, GF_LOG_TRACE,
- "opening fd for %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_openfd_xaction_open_cbk,
- (void *)(long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc, fd_ctx->flags, fd,
- fd_ctx->wbflags);
-
- if (!--call_count)
- break;
- }
- }
+ if (!--call_count)
+ break;
+ }
+ return;
out:
- if (no_open)
- afr_openfd_xaction (frame, this, fd);
-
- return 0;
+ if (frame)
+ AFR_STACK_DESTROY(frame);
}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
new file mode 100644
index 00000000000..6fc2c75145c
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -0,0 +1,494 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "afr.h"
+#include "afr-transaction.h"
+#include "afr-messages.h"
+
+void
+afr_pending_read_increment(afr_private_t *priv, int child_index)
+{
+ if (child_index < 0 || child_index > priv->child_count)
+ return;
+
+ GF_ATOMIC_INC(priv->pending_reads[child_index]);
+}
+
+void
+afr_pending_read_decrement(afr_private_t *priv, int child_index)
+{
+ if (child_index < 0 || child_index > priv->child_count)
+ return;
+
+ GF_ATOMIC_DEC(priv->pending_reads[child_index]);
+}
+
+void
+afr_read_txn_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ afr_pending_read_decrement(priv, local->read_subvol);
+ local->read_subvol = subvol;
+ afr_pending_read_increment(priv, subvol);
+ local->readfn(frame, this, subvol);
+}
+
+int
+afr_read_txn_next_subvol(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int subvol = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->readable[i]) {
+ /* don't even bother trying here.
+ just mark as attempted and move on. */
+ local->read_attempted[i] = 1;
+ continue;
+ }
+
+ if (!local->read_attempted[i]) {
+ subvol = i;
+ break;
+ }
+ }
+
+ /* If no more subvols were available for reading, we leave
+ @subvol as -1, which is an indication we have run out of
+ readable subvols. */
+ if (subvol != -1)
+ local->read_attempted[subvol] = 1;
+ afr_read_txn_wind(frame, this, subvol);
+
+ return 0;
+}
+
+static int
+afr_ta_read_txn_done(int ret, call_frame_t *ta_frame, void *opaque)
+{
+ STACK_DESTROY(ta_frame->root);
+ return 0;
+}
+
+static int
+afr_ta_read_txn(void *opaque)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ int read_subvol = -1;
+ int query_child = AFR_CHILD_UNKNOWN;
+ int possible_bad_child = AFR_CHILD_UNKNOWN;
+ int ret = 0;
+ int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ dict_t *xdata_req = NULL;
+ dict_t *xdata_rsp = NULL;
+ int **pending = NULL;
+ loc_t loc = {
+ 0,
+ };
+
+ frame = (call_frame_t *)opaque;
+ this = frame->this;
+ local = frame->local;
+ priv = this->private;
+ query_child = local->read_txn_query_child;
+
+ if (query_child == AFR_CHILD_ZERO) {
+ possible_bad_child = AFR_CHILD_ONE;
+ } else if (query_child == AFR_CHILD_ONE) {
+ possible_bad_child = AFR_CHILD_ZERO;
+ } else {
+ /*read_txn_query_child is AFR_CHILD_UNKNOWN*/
+ goto out;
+ }
+
+ /* Ask the query_child to see if it blames the possibly bad one. */
+ xdata_req = dict_new();
+ if (!xdata_req)
+ goto out;
+
+ pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!pending)
+ goto out;
+
+ ret = afr_set_pending_dict(priv, xdata_req, pending);
+ if (ret < 0)
+ goto out;
+
+ if (local->fd) {
+ ret = syncop_fxattrop(priv->children[query_child], local->fd,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+ NULL);
+ } else {
+ ret = syncop_xattrop(priv->children[query_child], &local->loc,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+ NULL);
+ }
+ if (ret || !xdata_rsp) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed xattrop for gfid %s on %s",
+ uuid_utoa(local->inode->gfid),
+ priv->children[query_child]->name);
+ op_errno = -ret;
+ goto out;
+ }
+
+ if (afr_ta_dict_contains_pending_xattr(xdata_rsp, priv,
+ possible_bad_child)) {
+ read_subvol = query_child;
+ goto out;
+ }
+ dict_unref(xdata_rsp);
+ xdata_rsp = NULL;
+
+ /* It doesn't. So query thin-arbiter to see if it blames any data brick. */
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ goto out;
+ }
+ flock.l_type = F_WRLCK; /*start and length are already zero. */
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.",
+ uuid_utoa(local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ op_errno = -ret;
+ goto out;
+ }
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+ NULL);
+ if (ret || !xdata_rsp) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed xattrop on %s.", uuid_utoa(local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ op_errno = -ret;
+ goto unlock;
+ }
+
+ if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, query_child)) {
+ read_subvol = query_child;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+ "Failing read for gfid %s since good brick %s is down",
+ uuid_utoa(local->inode->gfid),
+ priv->children[possible_bad_child]->name);
+ op_errno = EIO;
+ }
+
+unlock:
+ flock.l_type = F_UNLCK;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on "
+ "%s.",
+ uuid_utoa(local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ }
+out:
+ if (xdata_req)
+ dict_unref(xdata_req);
+ if (xdata_rsp)
+ dict_unref(xdata_rsp);
+ if (pending)
+ afr_matrix_cleanup(pending, priv->child_count);
+ loc_wipe(&loc);
+
+ if (read_subvol == -1) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ }
+ afr_read_txn_wind(frame, this, read_subvol);
+ return ret;
+}
+
+void
+afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *ta_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ goto out;
+ }
+ ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done,
+ ta_frame, frame);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to launch "
+ "afr_ta_read_txn synctask for gfid %s.",
+ uuid_utoa(local->inode->gfid));
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ STACK_DESTROY(ta_frame->root);
+ goto out;
+ }
+ return;
+out:
+ afr_read_txn_wind(frame, this, -1);
+}
+
+int
+afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int read_subvol = -1;
+ inode_t *inode = NULL;
+ int ret = -1;
+ int spb_subvol = -1;
+
+ local = frame->local;
+ inode = local->inode;
+ priv = this->private;
+
+ if (err) {
+ if (!priv->thin_arbiter_count)
+ goto readfn;
+ if (err != EINVAL)
+ goto readfn;
+ /* We need to query the good bricks and/or thin-arbiter.*/
+ afr_ta_read_txn_synctask(frame, this);
+ return 0;
+ }
+
+ read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable,
+ NULL);
+ if (read_subvol == -1) {
+ err = EIO;
+ goto readfn;
+ }
+
+ if (local->read_attempted[read_subvol]) {
+ afr_read_txn_next_subvol(frame, this);
+ return 0;
+ }
+
+ local->read_attempted[read_subvol] = 1;
+readfn:
+ if (read_subvol == -1) {
+ ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol);
+ if ((ret == 0) && spb_subvol >= 0)
+ read_subvol = spb_subvol;
+ }
+
+ if (read_subvol == -1) {
+ AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
+ }
+ afr_read_txn_wind(frame, this, read_subvol);
+
+ return 0;
+}
+
+int
+afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (!local->refreshed) {
+ local->refreshed = _gf_true;
+ afr_inode_refresh(frame, this, local->inode, NULL,
+ afr_read_txn_refresh_done);
+ } else {
+ afr_read_txn_next_subvol(frame, this);
+ }
+
+ return 0;
+}
+
+/* afr_read_txn_wipe:
+
+ clean internal variables in @local in order to make
+ it possible to call afr_read_txn() multiple times from
+ the same frame
+*/
+
+void
+afr_read_txn_wipe(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ local->readfn = NULL;
+
+ if (local->inode)
+ inode_unref(local->inode);
+
+ for (i = 0; i < priv->child_count; i++) {
+ local->read_attempted[i] = 0;
+ local->readable[i] = 0;
+ }
+}
+
+/*
+ afr_read_txn:
+
+ This is the read transaction function. The way it works:
+
+ - Determine read-subvolume from inode ctx.
+
+ - If read-subvolume's generation was stale, refresh ctx once by
+ calling afr_inode_refresh()
+
+ Else make an attempt to read on read-subvolume.
+
+ - If attempted read on read-subvolume fails, refresh ctx once
+ by calling afr_inode_refresh()
+
+ - After ctx refresh, query read-subvolume freshly and attempt
+ read once.
+
+ - If read fails, try every other readable[] subvolume before
+ finally giving up. readable[] elements are set by afr_inode_refresh()
+ based on dirty and pending flags.
+
+ - If file is in split brain in the backend, generation will be
+ kept 0 by afr_inode_refresh() and readable[] will be set 0 for
+ all elements. Therefore reads always fail.
+*/
+
+int
+afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *data = NULL;
+ unsigned char *metadata = NULL;
+ int read_subvol = -1;
+ int event_generation = 0;
+ int ret = -1;
+
+ priv = this->private;
+ local = frame->local;
+ data = alloca0(priv->child_count);
+ metadata = alloca0(priv->child_count);
+
+ afr_read_txn_wipe(frame, this);
+
+ local->readfn = readfn;
+ local->inode = inode_ref(inode);
+ local->is_read_txn = _gf_true;
+ local->transaction.type = type;
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_quorum_errno(priv);
+ goto read;
+ }
+
+ if (!afr_is_consistent_io_possible(local, priv, &local->op_errno)) {
+ local->op_ret = -1;
+ goto read;
+ }
+
+ if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+ local->op_ret = -1;
+ local->op_errno = -afr_quorum_errno(priv);
+ goto read;
+ }
+
+ if (priv->thin_arbiter_count &&
+ AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ if (local->child_up[0]) {
+ local->read_txn_query_child = AFR_CHILD_ZERO;
+ } else if (local->child_up[1]) {
+ local->read_txn_query_child = AFR_CHILD_ONE;
+ }
+ afr_ta_read_txn_synctask(frame, this);
+ return 0;
+ }
+
+ ret = afr_inode_read_subvol_get(inode, this, data, metadata,
+ &event_generation);
+ if (ret == -1)
+ /* very first transaction on this inode */
+ goto refresh;
+ AFR_INTERSECT(local->readable, data, metadata, priv->child_count);
+
+ gf_msg_debug(this->name, 0,
+ "%s: generation now vs cached: %d, "
+ "%d",
+ uuid_utoa(inode->gfid), local->event_generation,
+ event_generation);
+ if (afr_is_inode_refresh_reqd(inode, this, local->event_generation,
+ event_generation))
+ /* servers have disconnected / reconnected, and possibly
+ rebooted, very likely changing the state of freshness
+ of copies */
+ goto refresh;
+
+ read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable,
+ NULL);
+
+ if (read_subvol < 0 || read_subvol > priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "Unreadable subvolume %d found "
+ "with event generation %d for gfid %s.",
+ read_subvol, event_generation, uuid_utoa(inode->gfid));
+ goto refresh;
+ }
+
+ if (!local->child_up[read_subvol]) {
+ /* should never happen, just in case */
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR,
+ "subvolume %d is the "
+ "read subvolume in this generation, but is not up",
+ read_subvol);
+ goto refresh;
+ }
+
+ local->read_attempted[read_subvol] = 1;
+
+read:
+ afr_read_txn_wind(frame, this, read_subvol);
+
+ return 0;
+
+refresh:
+ afr_inode_refresh(frame, this, inode, NULL, afr_read_txn_refresh_done);
+
+ return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
deleted file mode 100644
index e61f6defa70..00000000000
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ /dev/null
@@ -1,1072 +0,0 @@
-/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-#include "glusterfs.h"
-#include "afr.h"
-#include "xlator.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "md5.h"
-
-#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "afr-self-heal-algorithm.h"
-
-/*
- This file contains the various self-heal algorithms
-*/
-
-
-/*
- The "full" algorithm. Copies the entire file from
- source to sinks.
-*/
-
-
-static void
-sh_full_private_cleanup (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_full_private_t *sh_priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = sh->private;
-
- if (sh_priv)
- GF_FREE (sh_priv);
-}
-
-
-static int
-sh_full_loop_driver (call_frame_t *frame, xlator_t *this);
-
-static int
-sh_full_loop_return (call_frame_t *rw_frame, xlator_t *this, off_t offset)
-{
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * rw_sh = NULL;
-
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_full_private_t *sh_priv = NULL;
-
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- sh_priv = sh->private;
-
- LOCK (&sh_priv->lock);
- {
- sh_priv->loops_running--;
- }
- UNLOCK (&sh_priv->lock);
-
- gf_log (this->name, GF_LOG_TRACE,
- "loop for offset %"PRId64" returned", offset);
-
- AFR_STACK_DESTROY (rw_frame);
-
- sh_full_loop_driver (sh_frame, this);
-
- return 0;
-}
-
-
-static int
-sh_full_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t *rw_sh = NULL;
-
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
-
- int child_index = (long) cookie;
- int call_count = 0;
-
- priv = this->private;
-
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- gf_log (this->name, GF_LOG_TRACE,
- "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
- op_ret, sh_local->loc.path, child_index,
- rw_sh->offset - op_ret);
-
- LOCK (&sh_frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "write to %s failed on subvolume %s (%s)",
- sh_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->op_failed = 1;
- }
- }
- UNLOCK (&sh_frame->lock);
-
- call_count = afr_frame_return (rw_frame);
-
- if (call_count == 0) {
- sh_full_loop_return (rw_frame, this, rw_sh->offset - op_ret);
- }
-
- return 0;
-}
-
-
-static int
-sh_full_read_cbk (call_frame_t *rw_frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct iatt *buf,
- struct iobref *iobref)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t *rw_sh = NULL;
-
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
-
- int i = 0;
- int call_count = 0;
-
- off_t offset = (long) cookie;
-
- priv = this->private;
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- call_count = sh->active_sinks;
-
- rw_local->call_count = call_count;
-
- gf_log (this->name, GF_LOG_TRACE,
- "read %d bytes of data from %s, offset %"PRId64"",
- op_ret, sh_local->loc.path, offset);
-
- if (op_ret <= 0) {
- sh->op_failed = 1;
-
- sh_full_loop_return (rw_frame, this, offset);
- return 0;
- }
-
- rw_sh->offset += op_ret;
-
- if (sh->file_has_holes) {
- if (iov_0filled (vector, count) == 0) {
- /* the iter function depends on the
- sh->offset already being updated
- above
- */
-
- sh_full_loop_return (rw_frame, this, offset);
- goto out;
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !sh_local->child_up[i])
- continue;
-
- /* this is a sink, so write to it */
-
- STACK_WIND_COOKIE (rw_frame, sh_full_write_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->writev,
- sh->healing_fd, vector, count, offset,
- iobref);
-
- if (!--call_count)
- break;
- }
-
-out:
- return 0;
-}
-
-
-static int
-sh_full_read_write (call_frame_t *frame, xlator_t *this, off_t offset)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t *rw_sh = NULL;
- afr_self_heal_t *sh = NULL;
-
- call_frame_t *rw_frame = NULL;
-
- int32_t op_errno = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- rw_frame = copy_frame (frame);
- if (!rw_frame)
- goto out;
-
- ALLOC_OR_GOTO (rw_local, afr_local_t, out);
-
- rw_frame->local = rw_local;
- rw_sh = &rw_local->self_heal;
-
- rw_sh->offset = offset;
- rw_sh->sh_frame = frame;
-
- STACK_WIND_COOKIE (rw_frame, sh_full_read_cbk,
- (void *) (long) offset,
- priv->children[sh->source],
- priv->children[sh->source]->fops->readv,
- sh->healing_fd, sh->block_size,
- offset);
- return 0;
-
-out:
- sh->op_failed = 1;
-
- sh_full_loop_driver (frame, this);
-
- return 0;
-}
-
-
-static int
-sh_full_loop_driver (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_full_private_t *sh_priv = NULL;
-
- int loop = 0;
- int recurse = 0;
-
- off_t offset = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- sh_priv = sh->private;
-
- if (sh->op_failed) {
- if (sh_priv->loops_running == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "full self-heal aborting on %s",
- local->loc.path);
-
- sh_full_private_cleanup (frame, this);
- local->self_heal.algo_abort_cbk (frame, this);
- }
-
- goto out;
- }
-
- if (sh_priv->offset >= sh->file_size) {
- if (sh_priv->loops_running == 0) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "full self-heal completed on %s",
- local->loc.path);
-
- sh_full_private_cleanup (frame, this);
- local->self_heal.algo_completion_cbk (frame, this);
- }
-
- goto out;
- }
-
-spawn:
- loop = 0;
- recurse = 0;
-
- LOCK (&sh_priv->lock);
- {
- if ((sh_priv->loops_running < priv->data_self_heal_window_size)
- && (sh_priv->offset < sh->file_size)) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "spawning a loop for offset %"PRId64,
- sh_priv->offset);
-
- offset = sh_priv->offset;
- sh_priv->offset += sh->block_size;
-
- sh_priv->loops_running++;
-
- loop = 1;
-
- if (sh_priv->offset < sh->file_size)
- recurse = 1;
- }
- }
- UNLOCK (&sh_priv->lock);
-
- if (loop) {
- sh_full_read_write (frame, this, offset);
- if (recurse)
- goto spawn;
- }
-
-out:
- return 0;
-}
-
-
-int
-afr_sh_algo_full (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_full_private_t *sh_priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = GF_CALLOC (1, sizeof (*sh_priv),
- gf_afr_mt_afr_private_t);
-
- LOCK_INIT (&sh_priv->lock);
-
- sh->private = sh_priv;
-
- local->call_count = 0;
-
- sh_full_loop_driver (frame, this);
- return 0;
-}
-
-
-/*
- * The "diff" algorithm. Copies only those blocks whose checksums
- * don't match with those of source.
- */
-
-
-static void
-sh_diff_private_cleanup (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_diff_private_t *sh_priv = NULL;
-
- int i;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = sh->private;
-
- for (i = 0; i < priv->data_self_heal_window_size; i++) {
- if (sh_priv->loops[i]) {
- if (sh_priv->loops[i]->write_needed)
- GF_FREE (sh_priv->loops[i]->write_needed);
-
- if (sh_priv->loops[i]->checksum)
- GF_FREE (sh_priv->loops[i]->checksum);
-
- GF_FREE (sh_priv->loops[i]);
- }
- }
-
- if (sh_priv) {
- if (sh_priv->loops)
- GF_FREE (sh_priv->loops);
-
- GF_FREE (sh_priv);
- }
-
-
-}
-
-
-static uint32_t
-__make_cookie (int loop_index, int child_index)
-{
- uint32_t ret = (loop_index << 16) | child_index;
- return ret;
-}
-
-
-static int
-__loop_index (uint32_t cookie)
-{
- return (cookie & 0xFFFF0000) >> 16;
-}
-
-
-static int
-__child_index (uint32_t cookie)
-{
- return (cookie & 0x0000FFFF);
-}
-
-
-static void
-sh_diff_loop_state_reset (struct sh_diff_loop_state *loop_state, int child_count)
-{
- loop_state->active = _gf_false;
-// loop_state->offset = 0;
-
- memset (loop_state->write_needed,
- 0, sizeof (*loop_state->write_needed) * child_count);
-
- memset (loop_state->checksum,
- 0, MD5_DIGEST_LEN * child_count);
-}
-
-
-static int
-sh_diff_number_of_writes_needed (unsigned char *write_needed, int child_count)
-{
- int writes = 0;
- int i;
-
- for (i = 0; i < child_count; i++) {
- if (write_needed[i])
- writes++;
- }
-
- return writes;
-}
-
-
-static int
-sh_diff_loop_driver (call_frame_t *frame, xlator_t *this);
-
-
-static int
-sh_diff_loop_return (call_frame_t *rw_frame, xlator_t *this,
- struct sh_diff_loop_state *loop_state)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * rw_sh = NULL;
-
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_diff_private_t *sh_priv = NULL;
-
- priv = this->private;
-
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- sh_priv = sh->private;
-
- gf_log (this->name, GF_LOG_TRACE,
- "loop for offset %"PRId64" returned", loop_state->offset);
-
- LOCK (&sh_priv->lock);
- {
- sh_priv->loops_running--;
- sh_diff_loop_state_reset (loop_state, priv->child_count);
- }
- UNLOCK (&sh_priv->lock);
-
- AFR_STACK_DESTROY (rw_frame);
-
- sh_diff_loop_driver (sh_frame, this);
-
- return 0;
-}
-
-
-static int
-sh_diff_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *postbuf)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * rw_sh = NULL;
-
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
-
- afr_sh_algo_diff_private_t *sh_priv;
- struct sh_diff_loop_state *loop_state;
-
- int call_count = 0;
- int child_index = 0;
- int loop_index = 0;
-
- priv = this->private;
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- sh_priv = sh->private;
-
- child_index = __child_index ((uint32_t) (long) cookie);
- loop_index = __loop_index ((uint32_t) (long) cookie);
- loop_state = sh_priv->loops[loop_index];
-
- gf_log (this->name, GF_LOG_TRACE,
- "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
- op_ret, sh_local->loc.path, child_index,
- loop_state->offset);
-
- LOCK (&sh_frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "write to %s failed on subvolume %s (%s)",
- sh_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->op_failed = 1;
- }
- }
- UNLOCK (&sh_frame->lock);
-
- call_count = afr_frame_return (rw_frame);
-
- if (call_count == 0) {
- sh_diff_loop_return (rw_frame, this, loop_state);
- }
-
- return 0;
-}
-
-
-static int
-sh_diff_read_cbk (call_frame_t *rw_frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct iatt *buf,
- struct iobref *iobref)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * rw_sh = NULL;
-
- afr_sh_algo_diff_private_t * sh_priv = NULL;
-
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
-
- int loop_index;
- struct sh_diff_loop_state *loop_state;
-
- uint32_t wcookie;
-
- int i = 0;
- int call_count = 0;
-
- priv = this->private;
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- sh_priv = sh->private;
-
- loop_index = __loop_index ((uint32_t) (long) cookie);
- loop_state = sh_priv->loops[loop_index];
-
- call_count = sh_diff_number_of_writes_needed (loop_state->write_needed,
- priv->child_count);
-
- rw_local->call_count = call_count;
-
- gf_log (this->name, GF_LOG_TRACE,
- "read %d bytes of data from %s, offset %"PRId64"",
- op_ret, sh_local->loc.path, loop_state->offset);
-
- if ((op_ret <= 0) ||
- (call_count == 0)) {
- sh_diff_loop_return (rw_frame, this, loop_state);
-
- return 0;
- }
-
- if (sh->file_has_holes) {
- if (iov_0filled (vector, count) == 0) {
-
- sh_diff_loop_return (rw_frame, this, loop_state);
- goto out;
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (loop_state->write_needed[i]) {
- wcookie = __make_cookie (loop_index, i);
-
- STACK_WIND_COOKIE (rw_frame, sh_diff_write_cbk,
- (void *) (long) wcookie,
- priv->children[i],
- priv->children[i]->fops->writev,
- sh->healing_fd, vector, count,
- loop_state->offset, iobref);
-
- if (!--call_count)
- break;
- }
- }
-
-out:
- return 0;
-}
-
-
-static int
-sh_diff_read (call_frame_t *rw_frame, xlator_t *this,
- int loop_index)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * rw_sh = NULL;
-
- afr_sh_algo_diff_private_t * sh_priv = NULL;
- struct sh_diff_loop_state *loop_state;
-
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
-
- uint32_t cookie;
-
- priv = this->private;
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- sh_priv = sh->private;
-
- loop_state = sh_priv->loops[loop_index];
-
- cookie = __make_cookie (loop_index, sh->source);
-
- STACK_WIND_COOKIE (rw_frame, sh_diff_read_cbk,
- (void *) (long) cookie,
- priv->children[sh->source],
- priv->children[sh->source]->fops->readv,
- sh->healing_fd, sh_priv->block_size,
- loop_state->offset);
-
- return 0;
-}
-
-
-static int
-sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- uint32_t weak_checksum, uint8_t *strong_checksum)
-{
- afr_private_t * priv = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t *rw_sh = NULL;
-
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
-
- afr_sh_algo_diff_private_t * sh_priv = NULL;
-
- int loop_index = 0;
- int child_index = 0;
- struct sh_diff_loop_state *loop_state;
-
- int call_count = 0;
- int i = 0;
- int write_needed = 0;
-
- priv = this->private;
-
- rw_local = rw_frame->local;
- rw_sh = &rw_local->self_heal;
-
- sh_frame = rw_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- sh_priv = sh->private;
-
- child_index = __child_index ((uint32_t) (long) cookie);
- loop_index = __loop_index ((uint32_t) (long) cookie);
-
- loop_state = sh_priv->loops[loop_index];
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "checksum on %s failed on subvolume %s (%s)",
- sh_local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->op_failed = 1;
- } else {
- memcpy (loop_state->checksum + child_index * MD5_DIGEST_LEN,
- strong_checksum,
- MD5_DIGEST_LEN);
- }
-
- call_count = afr_frame_return (rw_frame);
-
- if (call_count == 0) {
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !sh_local->child_up[i])
- continue;
-
- if (memcmp (loop_state->checksum + (i * MD5_DIGEST_LEN),
- loop_state->checksum + (sh->source * MD5_DIGEST_LEN),
- MD5_DIGEST_LEN)) {
- /*
- Checksums differ, so this block
- must be written to this sink
- */
-
- gf_log (this->name, GF_LOG_TRACE,
- "checksum on subvolume %s at offset %"
- PRId64" differs from that on source",
- priv->children[i]->name, loop_state->offset);
-
- write_needed = loop_state->write_needed[i] = 1;
- }
- }
-
- LOCK (&sh_priv->lock);
- {
- sh_priv->total_blocks++;
- if (write_needed)
- sh_priv->diff_blocks++;
- }
- UNLOCK (&sh_priv->lock);
-
- if (write_needed && !sh->op_failed) {
- sh_diff_read (rw_frame, this, loop_index);
- } else {
- sh->offset += sh_priv->block_size;
-
- sh_diff_loop_return (rw_frame, this, loop_state);
- }
- }
-
- return 0;
-}
-
-
-static int
-sh_diff_find_unused_loop (afr_sh_algo_diff_private_t *sh_priv, int max)
-{
- int i;
-
- LOCK (&sh_priv->lock);
- {
- for (i = 0; i < max; i++) {
- if (sh_priv->loops[i]->active == _gf_false) {
- sh_priv->loops[i]->active = _gf_true;
- break;
- }
- }
- }
- UNLOCK (&sh_priv->lock);
-
- if (i == max) {
- gf_log ("[sh-diff]", GF_LOG_ERROR,
- "no free loops found! This shouldn't happen. Please"
- " report this to gluster-devel@nongnu.org");
- }
-
- return i;
-}
-
-
-static int
-sh_diff_checksum (call_frame_t *frame, xlator_t *this, off_t offset)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_local_t * rw_local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_self_heal_t * rw_sh = NULL;
-
- afr_sh_algo_diff_private_t * sh_priv = NULL;
-
- call_frame_t *rw_frame = NULL;
-
- uint32_t cookie;
- int loop_index = 0;
- struct sh_diff_loop_state *loop_state = NULL;
-
- int32_t op_errno = 0;
-
- int call_count = 0;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = sh->private;
-
- rw_frame = copy_frame (frame);
- if (!rw_frame)
- goto out;
-
- ALLOC_OR_GOTO (rw_local, afr_local_t, out);
-
- rw_frame->local = rw_local;
- rw_sh = &rw_local->self_heal;
-
- rw_sh->offset = sh->offset;
- rw_sh->sh_frame = frame;
-
- call_count = sh->active_sinks + 1; /* sinks and source */
-
- rw_local->call_count = call_count;
-
- loop_index = sh_diff_find_unused_loop (sh_priv, priv->data_self_heal_window_size);
-
- loop_state = sh_priv->loops[loop_index];
- loop_state->offset = offset;
-
- /* we need to send both the loop index and child index,
- so squeeze them both into a 32-bit number */
-
- cookie = __make_cookie (loop_index, sh->source);
-
- STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk,
- (void *) (long) cookie,
- priv->children[sh->source],
- priv->children[sh->source]->fops->rchecksum,
- sh->healing_fd,
- offset, sh_priv->block_size);
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !local->child_up[i])
- continue;
-
- cookie = __make_cookie (loop_index, i);
-
- STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk,
- (void *) (long) cookie,
- priv->children[i],
- priv->children[i]->fops->rchecksum,
- sh->healing_fd,
- offset, sh_priv->block_size);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-
-out:
- sh->op_failed = 1;
-
- sh_diff_loop_driver (frame, this);
-
- return 0;
-}
-
-
-static int
-sh_diff_loop_driver (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_diff_private_t *sh_priv = NULL;
-
- int loop = 0;
- int recurse = 0;
-
- off_t offset = 0;
- char sh_type_str[256] = {0,};
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- sh_priv = sh->private;
-
- afr_self_heal_type_str_get(sh, sh_type_str, sizeof(sh_type_str));
-
- if (sh->op_failed) {
- if (sh_priv->loops_running == 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "diff %s self-heal aborting on %s",
- sh_type_str, local->loc.path);
-
- sh_diff_private_cleanup (frame, this);
- local->self_heal.algo_abort_cbk (frame, this);
- }
-
- goto out;
- }
-
- if (sh_priv->offset >= sh->file_size) {
- if (sh_priv->loops_running == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "diff %s self-heal completed on %s",
- sh_type_str, local->loc.path);
-
-
- gf_log (this->name, GF_LOG_NORMAL,
- "diff %s self-heal on %s: %d blocks of %d were different (%.2f%%)",
- sh_type_str, local->loc.path,
- sh_priv->diff_blocks, sh_priv->total_blocks,
- ((sh_priv->diff_blocks * 1.0)/sh_priv->total_blocks) * 100);
-
- sh_diff_private_cleanup (frame, this);
- local->self_heal.algo_completion_cbk (frame, this);
- }
-
- goto out;
- }
-
-spawn:
- loop = 0;
- recurse = 0;
-
- LOCK (&sh_priv->lock);
- {
- if ((sh_priv->loops_running < priv->data_self_heal_window_size)
- && (sh_priv->offset < sh->file_size)) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "spawning a loop for offset %"PRId64,
- sh_priv->offset);
-
- offset = sh_priv->offset;
- sh_priv->offset += sh_priv->block_size;
-
- sh_priv->loops_running++;
-
- loop = 1;
-
- if (sh_priv->offset < sh->file_size)
- recurse = 1;
- }
- }
- UNLOCK (&sh_priv->lock);
-
- if (loop) {
- sh_diff_checksum (frame, this, offset);
- if (recurse)
- goto spawn;
- }
-
-out:
- return 0;
-}
-
-
-int
-afr_sh_algo_diff (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_diff_private_t *sh_priv = NULL;
-
- int i;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = GF_CALLOC (1, sizeof (*sh_priv),
- gf_afr_mt_afr_private_t);
-
- sh_priv->block_size = this->ctx->page_size;
-
- sh->private = sh_priv;
-
- LOCK_INIT (&sh_priv->lock);
-
- local->call_count = 0;
-
- sh_priv->loops = GF_CALLOC (priv->data_self_heal_window_size,
- sizeof (*sh_priv->loops),
- gf_afr_mt_sh_diff_loop_state);
-
- for (i = 0; i < priv->data_self_heal_window_size; i++) {
- sh_priv->loops[i] = GF_CALLOC (1, sizeof (*sh_priv->loops[i]),
- gf_afr_mt_sh_diff_loop_state);
-
- sh_priv->loops[i]->checksum = GF_CALLOC (priv->child_count,
- MD5_DIGEST_LEN, gf_afr_mt_uint8_t);
- sh_priv->loops[i]->write_needed = GF_CALLOC (priv->child_count,
- sizeof (*sh_priv->loops[i]->write_needed),
- gf_afr_mt_char);
- }
-
- sh_diff_loop_driver (frame, this);
-
- return 0;
-}
-
-
-struct afr_sh_algorithm afr_self_heal_algorithms[] = {
- {.name = "full", .fn = afr_sh_algo_full},
- {.name = "diff", .fn = afr_sh_algo_diff},
- {0, 0},
-};
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h
deleted file mode 100644
index e45621b0ecd..00000000000
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __AFR_SELF_HEAL_ALGORITHM_H__
-#define __AFR_SELF_HEAL_ALGORITHM_H__
-
-
-typedef int (*afr_sh_algo_fn) (call_frame_t *frame,
- xlator_t *this);
-
-struct afr_sh_algorithm {
- const char *name;
- afr_sh_algo_fn fn;
-};
-
-extern struct afr_sh_algorithm afr_self_heal_algorithms[3];
-
-typedef struct {
- gf_lock_t lock;
- unsigned int loops_running;
- off_t offset;
-} afr_sh_algo_full_private_t;
-
-struct sh_diff_loop_state {
- off_t offset;
- unsigned char *write_needed;
- uint8_t *checksum;
- gf_boolean_t active;
-};
-
-typedef struct {
- size_t block_size;
-
- gf_lock_t lock;
- unsigned int loops_running;
- off_t offset;
-
- int32_t total_blocks;
- int32_t diff_blocks;
-
- struct sh_diff_loop_state **loops;
-} afr_sh_algo_diff_private_t;
-
-#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 1bb395cc387..a580a1584cc 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1,1657 +1,2934 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
-#include "byte-order.h"
-
#include "afr.h"
-#include "afr-transaction.h"
-#include "afr-self-heal-common.h"
#include "afr-self-heal.h"
-#include "pump.h"
+#include <glusterfs/byte-order.h>
+#include "protocol-common.h"
+#include "afr-messages.h"
+#include <glusterfs/events.h>
-/**
- * select_source - select a source and return it
- */
+void
+afr_heal_synctask(xlator_t *this, afr_local_t *local);
int
-afr_sh_select_source (int sources[], int child_count)
+afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
+ inode_t *inode, struct afr_reply *replies, int source,
+ unsigned char *sources, void *gfid, int *gfid_idx)
{
- int i;
- for (i = 0; i < child_count; i++)
- if (sources[i])
- return i;
+ afr_private_t *priv = NULL;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_on = NULL;
+ ia_type_t ia_type = IA_INVAL;
+ dict_t *xdata = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
+ int i = 0;
+
+ priv = this->private;
+ wind_on = alloca0(priv->child_count);
+ if (source >= 0 && replies[source].valid && replies[source].op_ret == 0)
+ ia_type = replies[source].poststat.ia_type;
+
+ if (ia_type != IA_INVAL)
+ goto heal;
+
+ /* If ia_type is still invalid, it means either
+ * (a)'source' was -1, i.e. parent dir pending xattrs are in split-brain
+ * (or) (b) The parent dir pending xattrs are all zeroes (i.e. all bricks
+ * are sources) and the 'source' we selected earlier might be the one where
+ * the file is not actually present.
+ *
+ * In both cases, let us pick a brick with a successful reply and use its
+ * ia_type.
+ * */
+ for (i = 0; i < priv->child_count; i++) {
+ if (source == -1) {
+ /* case (a) above. */
+ if (replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
+ ia_type = replies[i].poststat.ia_type;
+ break;
+ }
+ } else {
+ /* case (b) above. */
+ if (i == source)
+ continue;
+ if (sources[i] && replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
+ ia_type = replies[i].poststat.ia_type;
+ break;
+ }
+ }
+ }
- return -1;
-}
+heal:
+ /* gfid heal on those subvolumes that do not have gfid associated
+ * with the inode and update those replies.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+ if (gf_uuid_is_null(gfid) &&
+ !gf_uuid_is_null(replies[i].poststat.ia_gfid) &&
+ replies[i].poststat.ia_type == ia_type)
+ gfid = replies[i].poststat.ia_gfid;
-/**
- * sink_count - return number of sinks in sources array
- */
+ if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) ||
+ replies[i].poststat.ia_type != ia_type)
+ continue;
+
+ wind_on[i] = 1;
+ }
+
+ if (AFR_COUNT(wind_on, priv->child_count) == 0)
+ return 0;
+
+ xdata = dict_new();
+ if (!xdata) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_gfuuid(xdata, "gfid-req", gfid, true);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ local = frame->local;
+ loc.parent = inode_ref(parent);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
+
+ AFR_ONLIST(wind_on, frame, afr_selfheal_discover_cbk, lookup, &loc, xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ afr_reply_wipe(&replies[i]);
+ afr_reply_copy(&replies[i], &local->replies[i]);
+ }
+ if (gfid_idx && (*gfid_idx == -1)) {
+ /*Pick a brick where the gifd heal was successful.*/
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ if (replies[i].valid && replies[i].op_ret == 0 &&
+ !gf_uuid_is_null(replies[i].poststat.ia_gfid)) {
+ *gfid_idx = i;
+ break;
+ }
+ }
+ }
+out:
+ if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) {
+ ret = -afr_final_errno(local, priv);
+ }
+ loc_wipe(&loc);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ if (xdata)
+ dict_unref(xdata);
+
+ return ret;
+}
int
-afr_sh_sink_count (int sources[], int child_count)
+afr_gfid_sbrain_source_from_src_brick(xlator_t *this, struct afr_reply *replies,
+ char *src_brick)
{
- int i;
- int sinks = 0;
- for (i = 0; i < child_count; i++)
- if (!sources[i])
- sinks++;
- return sinks;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (strcmp(priv->children[i]->name, src_brick) == 0)
+ return i;
+ }
+ return -1;
}
int
-afr_sh_source_count (int sources[], int child_count)
+afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies,
+ int child_count)
{
- int i;
- int nsource = 0;
+ int j = 0;
+ int i = 0;
+ int votes;
+
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+
+ votes = 1;
+ for (j = i + 1; j < child_count; j++) {
+ if ((!gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[j].poststat.ia_gfid)))
+ votes++;
+ if (votes > child_count / 2)
+ return i;
+ }
+ }
- for (i = 0; i < child_count; i++)
- if (sources[i])
- nsource++;
- return nsource;
+ return -1;
}
-
int
-afr_sh_supress_errenous_children (int sources[], int child_errno[],
- int child_count)
+afr_gfid_sbrain_source_from_bigger_file(struct afr_reply *replies,
+ int child_count)
{
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (child_errno[i] && sources[i]) {
- sources[i] = 0;
- }
- }
-
- return 0;
+ int i = 0;
+ int src = -1;
+ uint64_t size = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (size < replies[i].poststat.ia_size) {
+ src = i;
+ size = replies[i].poststat.ia_size;
+ } else if (replies[i].poststat.ia_size == size) {
+ src = -1;
+ }
+ }
+ return src;
}
-
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
+int
+afr_gfid_sbrain_source_from_latest_mtime(struct afr_reply *replies,
+ int child_count)
{
- afr_private_t * priv = this->private;
-
- char *buf = NULL;
- char *ptr = NULL;
-
- int i, j;
+ int i = 0;
+ int src = -1;
+ uint32_t mtime = 0;
+ uint32_t mtime_nsec = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+ if ((mtime < replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
+ src = i;
+ mtime = replies[i].poststat.ia_mtime;
+ mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ } else if ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) {
+ src = -1;
+ }
+ }
+ return src;
+}
- /* 10 digits per entry + 1 space + '[' and ']' */
- buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char);
+int
+afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, uuid_t pargfid, const char *bname,
+ int src_idx, int child_idx,
+ unsigned char *locked_on, int *src, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ char g1[64] = {
+ 0,
+ };
+ char g2[64] = {
+ 0,
+ };
+ int up_count = 0;
+ int heal_op = -1;
+ int ret = -1;
+ char *src_brick = NULL;
+
+ *src = -1;
+ priv = this->private;
+ up_count = AFR_COUNT(locked_on, priv->child_count);
+ if (up_count != priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "All the bricks should be up to resolve the gfid split "
+ "barin");
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SALL_BRICKS_UP_TO_RESOLVE);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "Error setting"
+ " gfid-heal-msg dict");
+ }
+ goto out;
+ }
- for (i = 0; i < priv->child_count; i++) {
- ptr = buf;
- ptr += sprintf (ptr, "[ ");
- for (j = 0; j < priv->child_count; j++) {
- ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
- }
- sprintf (ptr, "]");
- gf_log (this->name, GF_LOG_TRACE,
- "pending_matrix: %s", buf);
- }
+ if (xdata) {
+ ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op);
+ if (ret)
+ goto fav_child;
+ } else {
+ goto fav_child;
+ }
+
+ switch (heal_op) {
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ *src = afr_gfid_sbrain_source_from_bigger_file(replies,
+ priv->child_count);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SNO_BIGGER_FILE);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SNO_BIGGER_FILE);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ " setting gfid-heal-msg dict");
+ }
+ }
+ break;
+
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+ *src = afr_gfid_sbrain_source_from_latest_mtime(replies,
+ priv->child_count);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SNO_DIFF_IN_MTIME);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SNO_DIFF_IN_MTIME);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ "setting gfid-heal-msg dict");
+ }
+ }
+ break;
+
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+ ret = dict_get_str_sizen(xdata, "child-name", &src_brick);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Error getting the source "
+ "brick");
+ break;
+ }
+ *src = afr_gfid_sbrain_source_from_src_brick(this, replies,
+ src_brick);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SERROR_GETTING_SRC_BRICK);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SERROR_GETTING_SRC_BRICK);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ " setting gfid-heal-msg dict");
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+ goto out;
+
+fav_child:
+ switch (priv->fav_child_policy) {
+ case AFR_FAV_CHILD_BY_SIZE:
+ *src = afr_sh_fav_by_size(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_MTIME:
+ *src = afr_sh_fav_by_mtime(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_CTIME:
+ *src = afr_sh_fav_by_ctime(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_MAJORITY:
+ if (priv->child_count != 2)
+ *src = afr_selfheal_gfid_mismatch_by_majority(
+ replies, priv->child_count);
+ else
+ *src = -1;
+
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "No majority to resolve "
+ "gfid split brain");
+ }
+ break;
+ default:
+ break;
+ }
- GF_FREE (buf);
+out:
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and"
+ " %s on %s.",
+ uuid_utoa(pargfid), bname,
+ uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1),
+ priv->children[child_idx]->name,
+ uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2),
+ priv->children[src_idx]->name);
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=gfid;file="
+ "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;"
+ "child-%d=%s;gfid-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid),
+ bname, child_idx, priv->children[child_idx]->name, child_idx,
+ uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx,
+ priv->children[src_idx]->name, src_idx,
+ uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2));
+ return -1;
+ }
+ return 0;
}
+int
+afr_selfheal_post_op_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
-void
-afr_sh_build_pending_matrix (afr_private_t *priv,
- int32_t *pending_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type)
-{
- int i, j, k;
-
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3];
- void *pending_raw = NULL;
- int ret = -1;
-
- unsigned char *ignorant_subvols = NULL;
-
- ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count,
- gf_afr_mt_char);
-
- /* start clean */
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- pending_matrix[i][j] = 0;
- }
- }
-
- for (i = 0; i < child_count; i++) {
- pending_raw = NULL;
-
- for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], priv->pending_key[j],
- &pending_raw);
-
- if (ret != 0) {
- /*
- * There is no xattr present. This means this
- * subvolume should be considered an 'ignorant'
- * subvolume.
- */
-
- ignorant_subvols[i] = 1;
- continue;
- }
-
- memcpy (pending, pending_raw, sizeof(pending));
- k = afr_index_for_transaction_type (type);
-
- pending_matrix[i][j] = ntoh32 (pending[k]);
- }
- }
-
- /*
- * Make all non-ignorant subvols point towards the ignorant
- * subvolumes.
- */
+ local = frame->local;
- for (i = 0; i < child_count; i++) {
- if (ignorant_subvols[i]) {
- for (j = 0; j < child_count; j++) {
- if (!ignorant_subvols[j])
- pending_matrix[j][i] += 1;
- }
- }
- }
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ syncbarrier_wake(&local->barrier);
- GF_FREE (ignorant_subvols);
+ return 0;
}
+int
+afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
-/**
- * mark_sources: Mark all 'source' nodes and return number of source
- * nodes found
- *
- * A node (a row in the pending matrix) belongs to one of
- * three categories:
- *
- * M is the pending matrix.
- *
- * 'innocent' - M[i] is all zeroes
- * 'fool' - M[i] has i'th element = 1 (self-reference)
- * 'wise' - M[i] has i'th element = 0, others are 1 or 0.
- *
- * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is
- * needed.
- *
- * A 'wise' node can be a source. If two 'wise' nodes conflict, it is
- * a split-brain. If one wise node refers to the other but the other doesn't
- * refer back, the referrer is a source.
- *
- * All fools are sinks, unless there are no 'wise' nodes. In that case,
- * one of the fools is made a source.
- */
+ priv = this->private;
+ local = frame->local;
-typedef enum {
- AFR_NODE_INNOCENT,
- AFR_NODE_FOOL,
- AFR_NODE_WISE
-} afr_node_type;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
-typedef struct {
- afr_node_type type;
- int wisdom;
-} afr_node_character;
+ local->op_ret = 0;
+ STACK_WIND(frame, afr_selfheal_post_op_cbk, priv->children[subvol],
+ priv->children[subvol]->fops->xattrop, &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, xdata);
-static int
-afr_sh_is_innocent (int32_t *array, int child_count)
-{
- int i = 0;
- int ret = 1; /* innocent until proven guilty */
+ syncbarrier_wait(&local->barrier, 1);
+ if (local->op_ret < 0)
+ ret = -local->op_errno;
- for (i = 0; i < child_count; i++) {
- if (array[i]) {
- ret = 0;
- break;
- }
- }
+ loc_wipe(&loc);
+ local->op_ret = 0;
- return ret;
+ return ret;
}
-
-static int
-afr_sh_is_fool (int32_t *array, int i, int child_count)
+int
+afr_check_stale_error(struct afr_reply *replies, afr_private_t *priv)
{
- return array[i]; /* fool if accuses itself */
+ int i = 0;
+ int op_errno = 0;
+ int tmp_errno = 0;
+ int stale_count = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ tmp_errno = replies[i].op_errno;
+ if (tmp_errno == ENOENT || tmp_errno == ESTALE) {
+ op_errno = afr_higher_errno(op_errno, tmp_errno);
+ stale_count++;
+ }
+ }
+ if (stale_count != priv->child_count)
+ return -ENOTCONN;
+ else
+ return -op_errno;
}
-
-static int
-afr_sh_is_wise (int32_t *array, int i, int child_count)
+int
+afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
{
- return !array[i]; /* wise if does not accuse itself */
-}
+ int i = (long)cookie;
+ afr_local_t *local = NULL;
+ local = frame->local;
-static int
-afr_sh_all_nodes_innocent (afr_node_character *characters,
- int child_count)
-{
- int i = 0;
- int ret = 1;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (pre)
+ local->replies[i].prestat = *pre;
+ if (post)
+ local->replies[i].poststat = *post;
+ if (xdata)
+ local->replies[i].xdata = dict_ref(xdata);
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_INNOCENT) {
- ret = 0;
- break;
- }
- }
+ syncbarrier_wake(&local->barrier);
- return ret;
+ return 0;
}
-
-static int
-afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count)
+int
+afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- int i = 0;
- int ret = 0;
+ loc_t loc = {
+ 0,
+ };
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- ret = 1;
- break;
- }
- }
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- return ret;
-}
+ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc,
+ &replies[source].poststat,
+ (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME),
+ NULL);
+ loc_wipe(&loc);
-/*
- * The 'wisdom' of a wise node is 0 if any other wise node accuses it.
- * It is 1 if no other wise node accuses it.
- * Only wise nodes with wisdom 1 are sources.
- *
- * If no nodes with wisdom 1 exist, a split-brain has occured.
- */
-
-static void
-afr_sh_compute_wisdom (int32_t *pending_matrix[],
- afr_node_character characters[], int child_count)
-{
- int i = 0;
- int j = 0;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- characters[i].wisdom = 1;
-
- for (j = 0; j < child_count; j++) {
- if ((characters[j].type == AFR_NODE_WISE)
- && pending_matrix[j][i]) {
-
- characters[i].wisdom = 0;
- }
- }
- }
- }
+ return 0;
}
-
-static int
-afr_sh_wise_nodes_conflict (afr_node_character *characters,
- int child_count)
+dict_t *
+afr_selfheal_output_xattr(xlator_t *this, gf_boolean_t is_full_crawl,
+ afr_transaction_type type, int *output_dirty,
+ int **output_matrix, int subvol,
+ int **full_heal_mtx_out)
{
- int i = 0;
- int ret = 1;
-
- for (i = 0; i < child_count; i++) {
- if ((characters[i].type == AFR_NODE_WISE)
- && characters[i].wisdom == 1) {
-
- /* There is atleast one bona-fide wise node */
- ret = 0;
- break;
- }
+ int j = 0;
+ int idx = 0;
+ int d_idx = 0;
+ int ret = 0;
+ int *raw = 0;
+ dict_t *xattr = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ idx = afr_index_for_transaction_type(type);
+ d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+ xattr = dict_new();
+ if (!xattr)
+ return NULL;
+
+ /* clear dirty */
+ raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
+
+ raw[idx] = hton32(output_dirty[subvol]);
+ ret = dict_set_bin(xattr, AFR_DIRTY, raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ GF_FREE(raw);
+ goto err;
+ }
+
+ /* clear/set pending */
+ for (j = 0; j < priv->child_count; j++) {
+ raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
+
+ raw[idx] = hton32(output_matrix[subvol][j]);
+ if (is_full_crawl)
+ raw[d_idx] = hton32(full_heal_mtx_out[subvol][j]);
+
+ ret = dict_set_bin(xattr, priv->pending_key[j], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ GF_FREE(raw);
+ goto err;
}
+ }
- return ret;
+ return xattr;
+err:
+ if (xattr)
+ dict_unref(xattr);
+ return NULL;
}
+int
+afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type, struct afr_reply *replies,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int j = 0;
+ unsigned char *pending = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int **full_heal_mtx_in = NULL;
+ int **full_heal_mtx_out = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ pending = alloca0(priv->child_count);
+
+ input_dirty = alloca0(priv->child_count * sizeof(int));
+ input_matrix = ALLOC_MATRIX(priv->child_count, int);
+ full_heal_mtx_in = ALLOC_MATRIX(priv->child_count, int);
+ full_heal_mtx_out = ALLOC_MATRIX(priv->child_count, int);
+ output_dirty = alloca0(priv->child_count * sizeof(int));
+ output_matrix = ALLOC_MATRIX(priv->child_count, int);
+
+ xdata = dict_new();
+ if (!xdata)
+ return -1;
+
+ afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
+
+ if (local->need_full_crawl)
+ afr_selfheal_extract_xattr(this, replies, AFR_DATA_TRANSACTION, NULL,
+ full_heal_mtx_in);
+
+ for (i = 0; i < priv->child_count; i++)
+ if (sinks[i] && !healed_sinks[i])
+ pending[i] = 1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (pending[j]) {
+ output_matrix[i][j] = 1;
+ if (type == AFR_ENTRY_TRANSACTION)
+ full_heal_mtx_out[i][j] = 1;
+ } else if (locked_on[j]) {
+ output_matrix[i][j] = -input_matrix[i][j];
+ if (type == AFR_ENTRY_TRANSACTION)
+ full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j];
+ }
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!pending[i])
+ output_dirty[i] = -input_dirty[i];
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!locked_on[i])
+ /* perform post-op only on subvols we had locked
+ and inspected on.
+ */
+ continue;
+ if (undid_pending[i])
+ /* We already unset the pending xattrs in
+ * _afr_fav_child_reset_sink_xattrs(). */
+ continue;
+
+ xattr = afr_selfheal_output_xattr(this, local->need_full_crawl, type,
+ output_dirty, output_matrix, i,
+ full_heal_mtx_out);
+ if (!xattr) {
+ continue;
+ }
-static int
-afr_sh_mark_wisest_as_sources (int sources[],
- afr_node_character *characters,
- int child_count)
-{
- int nsources = 0;
-
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].wisdom == 1) {
- sources[i] = 1;
- nsources++;
- }
+ if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) {
+ if (xdata && dict_set_int8(xdata, GF_XATTROP_PURGE_INDEX, 1))
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set"
+ " dict value for %s",
+ GF_XATTROP_PURGE_INDEX);
}
- return nsources;
+ afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
+ dict_unref(xattr);
+ }
+
+ if (xdata)
+ dict_unref(xdata);
+
+ return 0;
}
+void
+afr_reply_copy(struct afr_reply *dst, struct afr_reply *src)
+{
+ dict_t *xdata = NULL;
+
+ dst->valid = src->valid;
+ dst->op_ret = src->op_ret;
+ dst->op_errno = src->op_errno;
+ dst->prestat = src->prestat;
+ dst->poststat = src->poststat;
+ dst->preparent = src->preparent;
+ dst->postparent = src->postparent;
+ dst->preparent2 = src->preparent2;
+ dst->postparent2 = src->postparent2;
+ if (src->xdata)
+ xdata = dict_ref(src->xdata);
+ else
+ xdata = NULL;
+ if (dst->xdata)
+ dict_unref(dst->xdata);
+ dst->xdata = xdata;
+ if (xdata && dict_get_str_boolean(xdata, "fips-mode-rchecksum",
+ _gf_false) == _gf_true) {
+ memcpy(dst->checksum, src->checksum, SHA256_DIGEST_LENGTH);
+ } else {
+ memcpy(dst->checksum, src->checksum, MD5_DIGEST_LENGTH);
+ }
+ dst->fips_mode_rchecksum = src->fips_mode_rchecksum;
+}
-static int
-afr_sh_mark_if_size_differs (afr_self_heal_t *sh, int child_count)
+void
+afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count)
{
- int32_t ** pending_matrix;
- int i, j;
+ int i = 0;
- int size_differs = 0;
+ if (dst == src)
+ return;
- pending_matrix = sh->pending_matrix;
+ for (i = 0; i < count; i++) {
+ afr_reply_copy(&dst[i], &src[i]);
+ }
+}
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- if (!sh->buf)
- break;
+int
+afr_selfheal_fill_dirty(xlator_t *this, int *dirty, int subvol, int idx,
+ dict_t *xdata)
+{
+ void *pending_raw = NULL;
+ int pending[3] = {
+ 0,
+ };
- if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[j])
- && (pending_matrix[i][j] == 0)
- && (pending_matrix[j][i] == 0)) {
+ if (!dirty)
+ return 0;
- pending_matrix[i][j] = 1;
- pending_matrix[j][i] = 1;
+ if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw))
+ return -1;
- size_differs = 1;
- }
- }
- }
+ if (!pending_raw)
+ return -1;
- return size_differs;
+ memcpy(pending, pending_raw, sizeof(pending));
+
+ dirty[subvol] = ntoh32(pending[idx]);
+
+ return 0;
}
-
-static int
-afr_sh_mark_biggest_fool_as_source (afr_self_heal_t *sh,
- afr_node_character *characters,
- int child_count)
+int
+afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx,
+ dict_t *xdata)
{
- int i = 0;
- int biggest = 0;
+ int i = 0;
+ void *pending_raw = NULL;
+ int pending[3] = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_FOOL) {
- biggest = i;
- break;
- }
- }
+ priv = this->private;
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
+ if (!matrix)
+ return 0;
- if (!sh->buf)
- break;
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+ continue;
- if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) {
- biggest = i;
- }
- }
+ if (!pending_raw)
+ continue;
- sh->sources[biggest] = 1;
+ memcpy(pending, pending_raw, sizeof(pending));
- return 1;
-}
+ matrix[subvol][i] = ntoh32(pending[idx]);
+ }
+ return 0;
+}
-static int
-afr_sh_mark_biggest_as_source (afr_self_heal_t *sh, int child_count)
+int
+afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix)
{
- int biggest = 0;
- int i;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ dict_t *xdata = NULL;
+ int idx = -1;
- for (i = 0; i < child_count; i++) {
- if (!sh->buf)
- break;
+ idx = afr_index_for_transaction_type(type);
- if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) {
- biggest = i;
- }
- }
+ priv = this->private;
- sh->sources[biggest] = 1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
- return 1;
-}
+ if (!replies[i].xdata)
+ continue;
+ xdata = replies[i].xdata;
-static int
-afr_sh_mark_loweia_uid_as_source (afr_self_heal_t *sh, int child_count)
-{
- uid_t smallest = 0;
- int i;
+ afr_selfheal_fill_dirty(this, dirty, i, idx, xdata);
+ afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
+ }
- for (i = 0; i < child_count; i++) {
- if (!sh->buf)
- break;
+ return 0;
+}
- if (sh->buf[i].ia_uid < sh->buf[smallest].ia_uid) {
- smallest = i;
- }
+/*
+ * If by chance there are multiple sources with differing sizes, select
+ * the largest file as the source.
+ *
+ * This can happen if data was directly modified in the backend or for snapshots
+ */
+void
+afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t size = 0;
+
+ /* Find source with biggest file size */
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (!replies[i].valid || replies[i].op_ret != 0) {
+ sources[i] = 0;
+ continue;
}
-
- sh->sources[smallest] = 1;
-
- return 1;
+ if (size <= replies[i].poststat.ia_size) {
+ size = replies[i].poststat.ia_size;
+ }
+ }
+
+ /* Mark sources with less size as not source */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (size > replies[i].poststat.ia_size)
+ sources[i] = 0;
+ }
}
-
-int
-afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
- afr_self_heal_type type)
+void
+afr_mark_latest_mtime_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
{
- int i = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint32_t mtime = 0;
+ uint32_t mtime_nsec = 0;
+
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (!replies[i].valid || replies[i].op_ret != 0) {
+ sources[i] = 0;
+ continue;
+ }
+ if ((mtime < replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
+ mtime = replies[i].poststat.ia_mtime;
+ mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if ((mtime > replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) {
+ sources[i] = 0;
+ }
+ }
+}
- int32_t ** pending_matrix;
- int * sources;
+void
+afr_mark_active_sinks(xlator_t *this, unsigned char *sources,
+ unsigned char *locked_on, unsigned char *sinks)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
- int size_differs = 0;
+ priv = this->private;
- pending_matrix = sh->pending_matrix;
- sources = sh->sources;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] && locked_on[i])
+ sinks[i] = 1;
+ else
+ sinks[i] = 0;
+ }
+}
- int nsources = 0;
+gf_boolean_t
+afr_dict_contains_heal_op(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ dict_t *xdata_req = NULL;
+ int ret = 0;
+ int heal_op = -1;
+
+ local = frame->local;
+ xdata_req = local->xdata_req;
+ ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
+ if (ret)
+ return _gf_false;
+ if (local->xdata_rsp == NULL) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp)
+ return _gf_true;
+ }
+ ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg",
+ SFILE_NOT_IN_SPLIT_BRAIN);
+
+ return _gf_true;
+}
- /* stores the 'characters' (innocent, fool, wise) of the nodes */
- afr_node_character *
- characters = GF_CALLOC (sizeof (afr_node_character),
- child_count,
- gf_afr_mt_afr_node_character) ;
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks(struct afr_reply *replies,
+ int child_count)
+{
+ int i = 0;
- /* start clean */
- for (i = 0; i < child_count; i++) {
- sources[i] = 0;
- }
-
- for (i = 0; i < child_count; i++) {
- if (afr_sh_is_innocent (pending_matrix[i], child_count)) {
- characters[i].type = AFR_NODE_INNOCENT;
+ for (i = 0; i < child_count; i++)
+ if (replies[i].valid != 1 || replies[i].op_ret != 0)
+ return _gf_false;
- } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) {
- characters[i].type = AFR_NODE_FOOL;
+ return _gf_true;
+}
- } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) {
- characters[i].type = AFR_NODE_WISE;
+int
+afr_mark_split_brain_source_sinks_by_heal_op(
+ call_frame_t *frame, xlator_t *this, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type, int heal_op)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata_req = NULL;
+ dict_t *xdata_rsp = NULL;
+ int ret = 0;
+ int i = 0;
+ char *name = NULL;
+ int source = -1;
+
+ local = frame->local;
+ priv = this->private;
+ xdata_req = local->xdata_req;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i])
+ if (sources[i] || !sinks[i] || !healed_sinks[i]) {
+ ret = -1;
+ goto out;
+ }
+ }
+ if (local->xdata_rsp == NULL) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ ret = -1;
+ goto out;
+ }
+ }
+ xdata_rsp = local->xdata_rsp;
+
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SBRAIN_HEAL_NO_GO_MSG);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++)
+ if (locked_on[i])
+ sources[i] = 1;
+ switch (heal_op) {
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ if (type == AFR_METADATA_TRANSACTION) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SUSE_SOURCE_BRICK_TO_HEAL);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ afr_mark_largest_file_as_source(this, sources, replies);
+ if (AFR_COUNT(sources, priv->child_count) != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SNO_BIGGER_FILE);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ break;
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+ if (type == AFR_METADATA_TRANSACTION) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SUSE_SOURCE_BRICK_TO_HEAL);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ afr_mark_latest_mtime_file_as_source(this, sources, replies);
+ if (AFR_COUNT(sources, priv->child_count) != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SNO_DIFF_IN_MTIME);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ break;
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+ ret = dict_get_str_sizen(xdata_req, "child-name", &name);
+ if (ret)
+ goto out;
+ source = afr_get_child_index_from_name(this, name);
+ if (source < 0) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SINVALID_BRICK_NAME);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ if (locked_on[source] != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SBRICK_IS_NOT_UP);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ sources[source] = 1;
+ break;
+ default:
+ ret = -1;
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+ sinks[source] = 0;
+ healed_sinks[source] = 0;
+ ret = source;
+out:
+ if (ret < 0)
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ return ret;
+}
- } else {
- gf_log ("[module:replicate]", GF_LOG_ERROR,
- "Could not determine the state of subvolume %d!"
- " (This message should never appear."
- " Please file a bug report to "
- "<gluster-devel@nongnu.org>.)", i);
+int
+afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode)
+{
+ afr_private_t *priv;
+ int vote_count = -1;
+ int fav_child = -1;
+ int i = 0;
+ int k = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64
+ " for gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_mtime,
+ replies[i].poststat.ia_size, uuid_utoa(inode->gfid));
+ vote_count = 0;
+ for (k = 0; k < priv->child_count; k++) {
+ if ((replies[k].poststat.ia_mtime ==
+ replies[i].poststat.ia_mtime) &&
+ (replies[k].poststat.ia_size ==
+ replies[i].poststat.ia_size)) {
+ vote_count++;
}
+ }
+ if (vote_count > priv->child_count / 2) {
+ fav_child = i;
+ break;
+ }
}
+ }
+ return fav_child;
+}
- if (type == AFR_SELF_HEAL_DATA) {
- size_differs = afr_sh_mark_if_size_differs (sh, child_count);
+/*
+ * afr_sh_fav_by_mtime: Choose favorite child by mtime.
+ */
+int
+afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint32_t cmp_mtime = 0;
+ uint32_t cmp_mtime_nsec = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s mtime = %" PRId64
+ ", mtime_nsec = %d for "
+ "gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_mtime,
+ replies[i].poststat.ia_mtime_nsec,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_mtime > cmp_mtime) {
+ cmp_mtime = replies[i].poststat.ia_mtime;
+ cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ fav_child = i;
+ } else if ((replies[i].poststat.ia_mtime == cmp_mtime) &&
+ (replies[i].poststat.ia_mtime_nsec > cmp_mtime_nsec)) {
+ cmp_mtime = replies[i].poststat.ia_mtime;
+ cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ fav_child = i;
+ }
}
+ }
+ return fav_child;
+}
- if ((type == AFR_SELF_HEAL_METADATA)
- && afr_sh_all_nodes_innocent (characters, child_count)) {
+/*
+ * afr_sh_fav_by_ctime: Choose favorite child by ctime.
+ */
+int
+afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint32_t cmp_ctime = 0;
+ uint32_t cmp_ctime_nsec = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s ctime = %" PRId64
+ ", ctime_nsec = %d for "
+ "gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_ctime,
+ replies[i].poststat.ia_ctime_nsec,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_ctime > cmp_ctime) {
+ cmp_ctime = replies[i].poststat.ia_ctime;
+ cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
+ fav_child = i;
+ } else if ((replies[i].poststat.ia_ctime == cmp_ctime) &&
+ (replies[i].poststat.ia_ctime_nsec > cmp_ctime_nsec)) {
+ cmp_ctime = replies[i].poststat.ia_ctime;
+ cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
+ fav_child = i;
+ }
+ }
+ }
+ return fav_child;
+}
- nsources = afr_sh_mark_loweia_uid_as_source (sh, child_count);
- goto out;
+/*
+ * afr_sh_fav_by_size: Choose favorite child by size
+ * when not all files are of zero size.
+ */
+int
+afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint64_t cmp_sz = 0;
+
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid) {
+ continue;
}
+ gf_msg_debug(this->name, 0,
+ "Child:%s file size = %" PRIu64 " for gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_size,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_type == IA_IFDIR) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Cannot perform selfheal on %s. "
+ "Size policy is not applicable to directories.",
+ uuid_utoa(inode->gfid));
+ break;
+ }
+ if (replies[i].poststat.ia_size > cmp_sz) {
+ cmp_sz = replies[i].poststat.ia_size;
+ fav_child = i;
+ } else if (replies[i].poststat.ia_size == cmp_sz) {
+ fav_child = -1;
+ }
+ }
+ if (fav_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "No bigger file");
+ }
+ return fav_child;
+}
- if (afr_sh_all_nodes_innocent (characters, child_count)) {
- if (size_differs) {
- nsources = afr_sh_mark_biggest_as_source (sh,
- child_count);
- }
+int
+afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, char **policy_str)
+{
+ afr_private_t *priv = NULL;
+ int fav_child = -1;
+
+ priv = this->private;
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ return -1;
+ }
+
+ switch (priv->fav_child_policy) {
+ case AFR_FAV_CHILD_BY_SIZE:
+ fav_child = afr_sh_fav_by_size(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "SIZE";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_CTIME:
+ fav_child = afr_sh_fav_by_ctime(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "CTIME";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_MTIME:
+ fav_child = afr_sh_fav_by_mtime(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "MTIME";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_MAJORITY:
+ fav_child = afr_sh_fav_by_majority(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "MAJORITY";
+ }
+ break;
+ case AFR_FAV_CHILD_NONE:
+ default:
+ break;
+ }
+
+ return fav_child;
+}
- } else if (afr_sh_wise_nodes_exist (characters, child_count)) {
- afr_sh_compute_wisdom (pending_matrix, characters, child_count);
+int
+afr_mark_split_brain_source_sinks_by_policy(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type)
+{
+ afr_private_t *priv = NULL;
+ int fav_child = -1;
+ char mtime_str[256];
+ char ctime_str[256];
+ char *policy_str = NULL;
+ struct tm *tm_ptr;
+ time_t time;
+
+ priv = this->private;
+
+ fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str);
+ if (fav_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "No child selected by favorite-child policy.");
+ } else if (fav_child > priv->child_count - 1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Invalid child (%d) "
+ "selected by policy %s.",
+ fav_child, policy_str);
+ } else if (fav_child >= 0) {
+ time = replies[fav_child].poststat.ia_mtime;
+ tm_ptr = localtime(&time);
+ strftime(mtime_str, sizeof(mtime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
+ time = replies[fav_child].poststat.ia_ctime;
+ tm_ptr = localtime(&time);
+ strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Source %s selected as authentic to resolve conflicting data "
+ "in file (gfid:%s) by %s (%" PRIu64
+ " bytes @ %s mtime, %s "
+ "ctime).",
+ priv->children[fav_child]->name, uuid_utoa(inode->gfid),
+ policy_str, replies[fav_child].poststat.ia_size, mtime_str,
+ ctime_str);
+
+ sources[fav_child] = 1;
+ sinks[fav_child] = 0;
+ healed_sinks[fav_child] = 0;
+ }
+ return fav_child;
+}
- if (afr_sh_wise_nodes_conflict (characters, child_count)) {
- /* split-brain */
+gf_boolean_t
+afr_is_file_empty_on_all_children(afr_private_t *priv,
+ struct afr_reply *replies)
+{
+ int i = 0;
- nsources = -1;
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!replies[i].valid) || (replies[i].op_ret != 0) ||
+ (replies[i].poststat.ia_size != 0))
+ return _gf_false;
+ }
- } else {
- nsources = afr_sh_mark_wisest_as_sources (sources,
- characters,
- child_count);
- }
- } else {
- nsources = afr_sh_mark_biggest_fool_as_source (sh, characters,
- child_count);
- }
+ return _gf_true;
+}
-out:
- GF_FREE (characters);
+int
+afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ int source = -1;
+ int i = 0;
+ afr_private_t *priv = this->private;
+ struct iatt stbuf = {
+ 0,
+ };
+
+ if ((AFR_COUNT(locked_on, priv->child_count) < priv->child_count) ||
+ (afr_success_count(replies, priv->child_count) < priv->child_count))
+ return -1;
+
+ if (type == AFR_DATA_TRANSACTION) {
+ if (!afr_is_file_empty_on_all_children(priv, replies))
+ return -1;
+ goto mark;
+ }
+
+ /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/
+ stbuf = replies[0].poststat;
+ for (i = 1; i < priv->child_count; i++) {
+ if ((!IA_EQUAL(stbuf, replies[i].poststat, type)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, uid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, gid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, prot)))
+ return -1;
+ }
+ for (i = 1; i < priv->child_count; i++) {
+ if (!afr_xattrs_are_equal(replies[0].xdata, replies[i].xdata))
+ return -1;
+ }
+
+mark:
+ /* data/metadata is same on all bricks. Pick one of them as source. Rest
+ * are sinks.*/
+ for (i = 0; i < priv->child_count; i++) {
+ if (source == -1) {
+ source = i;
+ sources[i] = 1;
+ sinks[i] = 0;
+ healed_sinks[i] = 0;
+ continue;
+ }
+ sources[i] = 0;
+ sinks[i] = 1;
+ healed_sinks[i] = 1;
+ }
- return nsources;
+ return source;
}
+/* Return a source depending on the type of heal_op, and set sources[source],
+ * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
+ * only if the following condition is met:
+ * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
+ * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
+ * sinks[node] are 1. This should be the case if the file is in split-brain.
+ */
+int
+afr_mark_split_brain_source_sinks(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata_req = NULL;
+ int heal_op = -1;
+ int ret = -1;
+ int source = -1;
+
+ local = frame->local;
+ priv = this->private;
+ xdata_req = local->xdata_req;
+
+ source = afr_mark_source_sinks_if_file_empty(
+ this, sources, sinks, healed_sinks, locked_on, replies, type);
+ if (source >= 0)
+ return source;
+
+ ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
+ if (ret)
+ goto autoheal;
+
+ source = afr_mark_split_brain_source_sinks_by_heal_op(
+ frame, this, sources, sinks, healed_sinks, locked_on, replies, type,
+ heal_op);
+ return source;
+
+autoheal:
+ /* Automatically heal if fav_child_policy is set. */
+ if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
+ source = afr_mark_split_brain_source_sinks_by_policy(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ replies, type);
+ if (source != -1) {
+ ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1);
+ if (ret)
+ return -1;
+ }
+ }
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], int success[],
- int child_count, afr_transaction_type type)
-{
- int i = 0;
- int j = 0;
- int k = 0;
-
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3];
- void * pending_raw = NULL;
- int ret = 0;
-
- /* start clean */
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- delta_matrix[i][j] = 0;
- }
- }
-
- for (i = 0; i < child_count; i++) {
- pending_raw = NULL;
-
- for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], priv->pending_key[j],
- &pending_raw);
- if (ret < 0)
- gf_log ("afr_sh_pending_to_delta",
- GF_LOG_WARNING,
- "Unable to get dict value.");
-
- if (!success[j])
- continue;
-
- k = afr_index_for_transaction_type (type);
-
- if (pending_raw) {
- memcpy (pending, pending_raw, sizeof(pending));
- delta_matrix[i][j] = -(ntoh32 (pending[k]));
- } else {
- delta_matrix[i][j] = 0;
- }
-
- }
- }
+ return source;
}
-
int
-afr_sh_delta_to_xattr (afr_private_t *priv,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type)
+_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
{
- int i = 0;
- int j = 0;
- int k = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (!dict_get_sizen(local->xdata_req, "fav-child-policy"))
+ return 0;
- int ret = 0;
+ xdata = dict_new();
+ if (!xdata)
+ return -1;
- int32_t *pending = 0;
+ input_dirty = alloca0(priv->child_count * sizeof(int));
+ input_matrix = ALLOC_MATRIX(priv->child_count, int);
+ output_dirty = alloca0(priv->child_count * sizeof(int));
+ output_matrix = ALLOC_MATRIX(priv->child_count, int);
- for (i = 0; i < child_count; i++) {
- if (!xattr[i])
- continue;
+ afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
- for (j = 0; j < child_count; j++) {
- pending = GF_CALLOC (sizeof (int32_t), 3,
- gf_afr_mt_int32_t);
- /* 3 = data+metadata+entry */
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
+ output_dirty[i] = -input_dirty[i];
+ output_matrix[i][source] = -input_matrix[i][source];
+ }
- k = afr_index_for_transaction_type (type);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] || !locked_on[i])
+ continue;
+ xattr = afr_selfheal_output_xattr(this, _gf_false, type, output_dirty,
+ output_matrix, i, NULL);
- pending[k] = hton32 (delta_matrix[i][j]);
+ afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
- ret = dict_set_bin (xattr[i], priv->pending_key[j],
- pending,
- 3 * sizeof (int32_t));
- if (ret < 0)
- gf_log ("afr_sh_delta_to_xattr",
- GF_LOG_WARNING,
- "Unable to set dict value.");
- }
- }
- return 0;
-}
+ undid_pending[i] = 1;
+ dict_unref(xattr);
+ }
+ if (xdata)
+ dict_unref(xdata);
-int
-afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)
+ return 0;
+}
+
+gf_boolean_t
+afr_does_witness_exist(xlator_t *this, uint64_t *witness)
{
- afr_private_t *priv = NULL;
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3];
- void *pending_raw = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
+ priv = this->private;
- priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (witness[i])
+ return _gf_true;
+ }
+ return _gf_false;
+}
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
+unsigned int
+afr_get_quorum_count(afr_private_t *priv)
+{
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ return priv->child_count / 2 + 1;
+ } else {
+ return priv->quorum_count;
+ }
+}
- if (ret != 0)
- return 0;
+void
+afr_selfheal_post_op_failure_accounting(afr_private_t *priv, char *accused,
+ unsigned char *sources,
+ unsigned char *locked_on)
+{
+ int i = 0;
+ unsigned int quorum_count = 0;
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
+ if (AFR_COUNT(sources, priv->child_count) != 0)
+ return;
- if (pending[j])
- return 1;
+ quorum_count = afr_get_quorum_count(priv);
+ for (i = 0; i < priv->child_count; i++) {
+ if ((accused[i] < quorum_count) && locked_on[i]) {
+ sources[i] = 1;
}
-
- return 0;
+ }
+ return;
}
+/*
+ * This function determines if a self-heal is required for a given inode,
+ * and if needed, in what direction.
+ *
+ * locked_on[] is the array representing servers which have been locked and
+ * from which xattrs have been fetched for analysis.
+ *
+ * The output of the function is by filling the arrays sources[] and sinks[].
+ *
+ * sources[i] is set if i'th server is an eligible source for a selfheal.
+ *
+ * sinks[i] is set if i'th server needs to be healed.
+ *
+ * if sources[0..N] are all set, there is no need for a selfheal.
+ *
+ * if sinks[0..N] are all set, the inode is in split brain.
+ *
+ */
int
-afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ uint64_t *witness, unsigned char *pflag)
{
- afr_private_t *priv = NULL;
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3];
- void *pending_raw = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int j = 0;
+ int *dirty = NULL; /* Denotes if dirty xattr is set */
+ int **matrix = NULL; /* Changelog matrix */
+ char *accused = NULL; /* Accused others without any self-accusal */
+ char *pending = NULL; /* Have pending operations on others */
+ char *self_accused = NULL; /* Accused itself */
+
+ priv = this->private;
+
+ dirty = alloca0(priv->child_count * sizeof(int));
+ accused = alloca0(priv->child_count);
+ pending = alloca0(priv->child_count);
+ self_accused = alloca0(priv->child_count);
+ matrix = ALLOC_MATRIX(priv->child_count, int);
+ memset(witness, 0, sizeof(*witness) * priv->child_count);
+
+ /* First construct the pending matrix for further analysis */
+ afr_selfheal_extract_xattr(this, replies, type, dirty, matrix);
+
+ if (pflag) {
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++)
+ if (matrix[i][j])
+ *pflag |= PFLAG_PENDING;
+ if (*pflag)
+ break;
+ }
+ }
+
+ if (afr_success_count(replies, priv->child_count) < priv->child_count) {
+ /* Treat this just like locks not being acquired */
+ return -ENOTCONN;
+ }
+
+ /* short list all self-accused */
+ for (i = 0; i < priv->child_count; i++) {
+ if (matrix[i][i])
+ self_accused[i] = 1;
+ }
+
+ /* Next short list all accused to exclude them from being sources */
+ /* Self-accused can't accuse others as they are FOOLs */
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j]) {
+ if (!self_accused[i])
+ accused[j] += 1;
+ if (i != j)
+ pending[i] += 1;
+ }
+ }
+ }
- int ret = -1;
- int i = 0;
- int j = 0;
+ /* Short list all non-accused as sources */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!accused[i] && locked_on[i])
+ sources[i] = 1;
+ else
+ sources[i] = 0;
+ }
+
+ /* Everyone accused by non-self-accused sources are sinks */
+ memset(sinks, 0, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (self_accused[i])
+ continue;
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j])
+ sinks[j] = 1;
+ }
+ }
+
+ /* For breaking ties provide with number of fops they witnessed */
+
+ /*
+ * count the pending fops witnessed from itself to others when it is
+ * self-accused
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!self_accused[i])
+ continue;
+ for (j = 0; j < priv->child_count; j++) {
+ if (i == j)
+ continue;
+ witness[i] += matrix[i][j];
+ }
+ }
- priv = this->private;
+ if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
+ afr_selfheal_post_op_failure_accounting(priv, accused, sources,
+ locked_on);
+ /* If no sources, all locked nodes are sinks - split brain */
+ if (AFR_COUNT(sources, priv->child_count) == 0) {
for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
-
- if (ret != 0)
- return 0;
-
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
-
- if (pending[j])
- return 1;
+ if (locked_on[i])
+ sinks[i] = 1;
}
+ if (pflag)
+ *pflag |= PFLAG_SBRAIN;
+ }
+
+ /* One more class of witness similar to dirty in v2 is where no pending
+ * exists but we have self-accusing markers. This can happen in afr-v1
+ * if the brick crashes just after doing xattrop on self but
+ * before xattrop on the other xattrs on the brick in pre-op. */
+ if (AFR_COUNT(pending, priv->child_count) == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (self_accused[i])
+ witness[i] += matrix[i][i];
+ }
+ } else {
+ /* In afr-v1 if a file is self-accused and has pending
+ * operations on others then it is similar to 'dirty' in afr-v2.
+ * Consider such cases as witness.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (self_accused[i] && pending[i])
+ witness[i] += matrix[i][i];
+ }
+ }
+
+ /* count the number of dirty fops witnessed */
+ for (i = 0; i < priv->child_count; i++)
+ witness[i] += dirty[i];
- return 0;
+ return 0;
}
+void
+afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ char *status = NULL;
+ char *sinks_str = NULL;
+ char *p = NULL;
+ char *sources_str = NULL;
+ char *q = NULL;
+ afr_private_t *priv = NULL;
+ gf_loglevel_t loglevel = GF_LOG_NONE;
+ int i = 0;
+
+ priv = this->private;
+ sinks_str = alloca0(priv->child_count * 8);
+ p = sinks_str;
+ sources_str = alloca0(priv->child_count * 8);
+ q = sources_str;
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i])
+ p += sprintf(p, "%d ", i);
+ if (sources[i]) {
+ if (source == i) {
+ q += sprintf(q, "[%d] ", i);
+ } else {
+ q += sprintf(q, "%d ", i);
+ }
+ }
+ }
+
+ if (ret < 0) {
+ status = "Failed";
+ loglevel = GF_LOG_DEBUG;
+ } else {
+ status = "Completed";
+ loglevel = GF_LOG_INFO;
+ }
+
+ gf_msg(this->name, loglevel, 0, AFR_MSG_SELF_HEAL_INFO,
+ "%s %s selfheal on %s. "
+ "sources=%s sinks=%s",
+ status, type, uuid_utoa(gfid), sources_str, sinks_str);
+}
int
-afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *parbuf)
{
- afr_private_t *priv = NULL;
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3];
- void *pending_raw = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ GF_UNUSED int ret = -1;
+ int8_t need_heal = 1;
+
+ local = frame->local;
+ i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (buf)
+ local->replies[i].poststat = *buf;
+ if (parbuf)
+ local->replies[i].postparent = *parbuf;
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ }
+
+ local->replies[i].need_heal = need_heal;
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
- int ret = -1;
- int i = 0;
- int j = 0;
+inode_t *
+afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on, dict_t *xattr)
+{
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
- priv = this->private;
+ local = frame->local;
+ priv = frame->this->private;
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
+ xattr_req = dict_new();
+ if (!xattr_req)
+ return NULL;
- if (ret != 0)
- return 0;
-
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION);
+ if (xattr)
+ dict_copy(xattr, xattr_req);
- if (pending[j])
- return 1;
- }
+ if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
+ dict_unref(xattr_req);
+ return NULL;
+ }
- return 0;
-}
+ inode = inode_new(parent->table);
+ if (!inode) {
+ dict_unref(xattr_req);
+ return NULL;
+ }
+ loc.parent = inode_ref(parent);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
-/**
- * is_matrix_zero - return true if pending matrix is all zeroes
- */
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
-int
-afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count)
-{
- int i, j;
+ afr_replies_copy(replies, local->replies, priv->child_count);
+
+ loc_wipe(&loc);
+ dict_unref(xattr_req);
- for (i = 0; i < child_count; i++)
- for (j = 0; j < child_count; j++)
- if (pending_matrix[i][j])
- return 0;
- return 1;
+ return inode;
}
+static int
+afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ char *key1 = NULL;
+ char *key2 = NULL;
+
+ priv = this->private;
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
+
+ ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1);
+ if (ret)
+ return ret;
+
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ ret = dict_set_uint32(dict, key1, 1);
+ if (ret)
+ return ret;
+
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
+ ret = dict_set_uint32(dict, key2, 1);
+ if (ret)
+ return ret;
+
+ return 0;
+}
int
-afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
+afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on, dict_t *dict)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ local = frame->local;
+ priv = frame->this->private;
-// memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
- memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count);
+ xattr_req = dict_new();
+ if (!xattr_req)
+ return -ENOMEM;
+ if (dict)
+ dict_copy(dict, xattr_req);
- for (i = 0; i < priv->child_count; i++) {
- sh->locked_nodes[i] = 0;
- }
+ if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
+ dict_unref(xattr_req);
+ return -ENOMEM;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i])
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
+ if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) {
+ dict_unref(xattr_req);
+ return -1;
+ }
- if (local->govinda_gOvinda) {
- gf_log (this->name, GF_LOG_TRACE,
- "aborting selfheal of %s",
- local->loc.path);
- sh->completion_cbk (frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to metadata check on %s",
- local->loc.path);
- afr_self_heal_metadata (frame, this);
- }
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, gfid);
- return 0;
-}
+ AFR_ONLIST(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
+ afr_replies_copy(replies, local->replies, priv->child_count);
-static int
-sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)
+ loc_wipe(&loc);
+ dict_unref(xattr_req);
+
+ return 0;
+}
+
+int
+afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
+ struct afr_reply *replies)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ dict_t *dict = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
- int_lock->lock_cbk = afr_sh_missing_entries_done;
- afr_unlock (frame, this);
+ if (local->xattr_req)
+ dict = local->xattr_req;
- return 0;
+ return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
+ local->child_up, dict);
}
-
-static int
-sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int op_errno,
- struct iatt *preop, struct iatt *postop)
+unsigned int
+afr_success_count(struct afr_reply *replies, unsigned int count)
{
- afr_local_t *local = NULL;
+ int i = 0;
+ unsigned int success = 0;
- loc_t *parent_loc = cookie;
+ for (i = 0; i < count; i++)
+ if (replies[i].valid && replies[i].op_ret == 0)
+ success++;
+ return success;
+}
- int call_count = 0;
+int
+afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int i = 0;
- local = frame->local;
+ local = frame->local;
+ i = (long)cookie;
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setattr on %s failed: %s",
- local->loc.path, strerror (op_errno));
- }
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- if (parent_loc) {
- loc_wipe (parent_loc);
- GF_FREE (parent_loc);
- }
+ syncbarrier_wake(&local->barrier);
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- STACK_DESTROY (frame->root);
- }
-
- return 0;
+ return 0;
}
+int
+afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ locked_on[i] = 1;
+ count++;
+ } else {
+ locked_on[i] = 0;
+ }
+ }
-static int
-sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- call_frame_t *setattr_frame = NULL;
- int call_count = 0;
- int child_index = 0;
-
- loc_t *parent_loc = NULL;
-
- struct iatt stbuf;
- int32_t valid;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- stbuf.ia_atime = sh->buf[sh->source].ia_atime;
- stbuf.ia_atime_nsec = sh->buf[sh->source].ia_atime_nsec;
- stbuf.ia_mtime = sh->buf[sh->source].ia_mtime;
- stbuf.ia_mtime_nsec = sh->buf[sh->source].ia_mtime_nsec;
-
- stbuf.ia_uid = sh->buf[sh->source].ia_uid;
- stbuf.ia_gid = sh->buf[sh->source].ia_gid;
-
- valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
-
- if (op_ret == 0) {
- setattr_frame = copy_frame (frame);
-
- setattr_frame->local = GF_CALLOC (1, sizeof (afr_local_t),
- gf_afr_mt_afr_local_t);
-
- ((afr_local_t *)setattr_frame->local)->call_count = 2;
-
- gf_log (this->name, GF_LOG_TRACE,
- "setattr (%s) on subvolume %s",
- local->loc.path, priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (setattr_frame, sh_destroy_cbk,
- (void *) (long) 0,
- priv->children[child_index],
- priv->children[child_index]->fops->setattr,
- &local->loc, &stbuf, valid);
-
- valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
- parent_loc = GF_CALLOC (1, sizeof (*parent_loc),
- gf_afr_mt_loc_t);
- afr_build_parent_loc (parent_loc, &local->loc);
-
- STACK_WIND_COOKIE (setattr_frame, sh_destroy_cbk,
- (void *) (long) parent_loc,
- priv->children[child_index],
- priv->children[child_index]->fops->setattr,
- parent_loc, &sh->parentbuf, valid);
- }
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- sh_missing_entries_finish (frame, this);
- }
-
- return 0;
+ return count;
}
+int
+afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
+{
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
-static int
-sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int ret = 0;
- int enoent_count = 0;
- int call_count = 0;
- mode_t st_mode = 0;
- dev_t ia_dev = 0;
- dict_t *dict = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++)
- if (sh->child_errno[i] == ENOENT)
- enoent_count++;
-
- call_count = enoent_count;
- local->call_count = call_count;
-
- st_mode = st_mode_from_ia (sh->buf[sh->source].ia_prot,
- sh->buf[sh->source].ia_type);
- ia_dev = sh->buf[sh->source].ia_dev;
-
- gf_log (this->name, GF_LOG_TRACE,
- "mknod %s mode 0%o on %d subvolumes",
- local->loc.path, st_mode, enoent_count);
-
- dict = dict_new ();
- if (!dict)
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
-
- ret = afr_set_dict_gfid (dict, sh->buf[sh->source].ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG, "gfid set failed");
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- for (i = 0; i < priv->child_count; i++) {
- if (sh->child_errno[i] == ENOENT) {
- STACK_WIND_COOKIE (frame,
- sh_missing_entries_newentry_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mknod,
- &local->loc, st_mode, ia_dev, dict);
- if (!--call_count)
- break;
- }
- }
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
- if (dict)
- dict_unref (dict);
+ loc_wipe(&loc);
- return 0;
+ return afr_locked_fill(frame, this, locked_on);
}
+int
+afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
+{
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
+
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_uninodelk(frame, this, inode, dom, off, size,
+ locked_on);
+
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
+ &flock, NULL);
+ break;
+ }
+ }
-static int
-sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- dict_t *dict = NULL;
- int i = 0;
- int ret = 0;
- int enoent_count = 0;
- int call_count = 0;
- mode_t st_mode = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++)
- if (sh->child_errno[i] == ENOENT)
- enoent_count++;
-
- call_count = enoent_count;
- local->call_count = call_count;
-
- st_mode = st_mode_from_ia (sh->buf[sh->source].ia_prot,
- sh->buf[sh->source].ia_type);
-
- dict = dict_new ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- sh_missing_entries_finish (frame, this);
- return 0;
+ loc_wipe(&loc);
+
+ return afr_locked_fill(frame, this, locked_on);
+}
+
+static void
+afr_get_lock_and_eagain_counts(afr_private_t *priv, struct afr_reply *replies,
+ int *lock_count, int *eagain_count)
+{
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == 0) {
+ (*lock_count)++;
+ } else if (replies[i].op_ret == -1 && replies[i].op_errno == EAGAIN) {
+ (*eagain_count)++;
}
+ }
+}
- ret = afr_set_dict_gfid (dict, sh->buf[sh->source].ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "inode gfid set failed");
+/*Do blocking locks if number of locks acquired is majority and there were some
+ * EAGAINs. Useful for odd-way replication*/
+int
+afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, off_t off,
+ size_t size, unsigned char *locked_on)
+{
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lock_count = 0;
+ int eagain_count = 0;
+ priv = this->private;
+ local = frame->local;
- gf_log (this->name, GF_LOG_TRACE,
- "mkdir %s mode 0%o on %d subvolumes",
- local->loc.path, st_mode, enoent_count);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- for (i = 0; i < priv->child_count; i++) {
- if (sh->child_errno[i] == ENOENT) {
- if (!strcmp (local->loc.path, "/")) {
- /* We shouldn't try to create "/" */
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- sh_missing_entries_finish (frame, this);
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
- return 0;
- } else {
- STACK_WIND_COOKIE (frame,
- sh_missing_entries_newentry_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mkdir,
- &local->loc, st_mode, dict);
- if (!--call_count)
- break;
- }
- }
- }
+ afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
+ &eagain_count);
- if (dict)
- dict_unref (dict);
+ if (lock_count > priv->child_count / 2 && eagain_count) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_uninodelk(frame, this, inode, dom, off, size, locked_on);
- return 0;
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
+ &flock, NULL);
+ }
+
+ loc_wipe(&loc);
+
+ return afr_locked_fill(frame, this, locked_on);
}
+int
+afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on)
+{
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
-static int
-sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this,
- const char *link, struct iatt *buf)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- dict_t *dict = NULL;
- int i = 0;
- int ret = 0;
- int enoent_count = 0;
- int call_count = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++)
- if (sh->child_errno[i] == ENOENT)
- enoent_count++;
-
- call_count = enoent_count;
- local->call_count = call_count;
-
- dict = dict_new ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- sh_missing_entries_finish (frame, this);
- return 0;
- }
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- ret = afr_set_dict_gfid (dict, buf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "dict gfid set failed");
+ flock.l_type = F_UNLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- gf_log (this->name, GF_LOG_TRACE,
- "symlink %s -> %s on %d subvolumes",
- local->loc.path, link, enoent_count);
+ AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, inodelk, dom, &loc,
+ F_SETLK, &flock, NULL);
- for (i = 0; i < priv->child_count; i++) {
- if (sh->child_errno[i] == ENOENT) {
- STACK_WIND_COOKIE (frame,
- sh_missing_entries_newentry_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->symlink,
- link, &local->loc, dict);
- if (!--call_count)
- break;
- }
- }
+ loc_wipe(&loc);
- return 0;
+ return 0;
}
-
-static int
-sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *link, struct iatt *sbuf)
+int
+afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- if (op_ret > 0)
- sh_missing_entries_symlink (frame, this, link, sbuf);
- else
- sh_missing_entries_finish (frame, this);
+ loc_t loc = {
+ 0,
+ };
- return 0;
-}
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
-static int
-sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
+ loc_wipe(&loc);
+ return afr_locked_fill(frame, this, locked_on);
+}
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+int
+afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
+{
+ loc_t loc = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on,
+ NULL);
+
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ break;
+ }
+ }
- STACK_WIND (frame, sh_missing_entries_readlink_cbk,
- priv->children[sh->source],
- priv->children[sh->source]->fops->readlink,
- &local->loc, 4096);
+ loc_wipe(&loc);
- return 0;
+ return afr_locked_fill(frame, this, locked_on);
}
+int
+afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, const char *name,
+ unsigned char *locked_on)
+{
+ loc_t loc = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lock_count = 0;
+ int eagain_count = 0;
-static int
-sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int type = 0;
- int i = 0;
- afr_private_t *priv = NULL;
- int enoent_count = 0;
- int govinda_gOvinda = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
-
- if (sh->child_errno[i]) {
- if (sh->child_errno[i] == ENOENT)
- enoent_count++;
- } else {
- if (type) {
- if (type != sh->buf[i].ia_type) {
- gf_log (this->name, GF_LOG_TRACE,
- "file %s is govinda!",
- local->loc.path);
-
- govinda_gOvinda = 1;
- }
- } else {
- sh->source = i;
- type = sh->buf[i].ia_type;
- }
- }
- }
-
- if (govinda_gOvinda) {
- gf_log (this->name, GF_LOG_ERROR,
- "conflicting filetypes exist for path %s. returning.",
- local->loc.path);
-
- local->govinda_gOvinda = 1;
- sh_missing_entries_finish (frame, this);
- return 0;
- }
-
- if (!type) {
- gf_log (this->name, GF_LOG_ERROR,
- "no source found for %s. all nodes down?. returning.",
- local->loc.path);
- /* subvolumes down and/or file does not exist */
- sh_missing_entries_finish (frame, this);
- return 0;
- }
-
- if (enoent_count == 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "no missing files - %s. proceeding to metadata check",
- local->loc.path);
- /* proceed to next step - metadata self-heal */
- sh_missing_entries_finish (frame, this);
- return 0;
- }
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- sh_missing_entries_mknod (frame, this);
- break;
- case IA_IFLNK:
- sh_missing_entries_readlink (frame, this);
- break;
- case IA_IFDIR:
- sh_missing_entries_mkdir (frame, this);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "unknown file type: 0%o", type);
- local->govinda_gOvinda = 1;
- sh_missing_entries_finish (frame, this);
- }
-
- return 0;
-}
+ priv = this->private;
+ local = frame->local;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
-static int
-sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- int child_index = 0;
- afr_local_t *local = NULL;
- int call_count = 0;
- afr_private_t *priv = NULL;
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
+ afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
+ &eagain_count);
- local = frame->local;
- priv = this->private;
+ if (lock_count > priv->child_count / 2 && eagain_count) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, NULL);
- child_index = (long) cookie;
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ }
- LOCK (&frame->lock);
- {
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "path %s on subvolume %s is of mode 0%o",
- local->loc.path,
- priv->children[child_index]->name,
- buf->ia_type);
+ loc_wipe(&loc);
- local->self_heal.buf[child_index] = *buf;
- local->self_heal.parentbuf = *postparent;
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "path %s on subvolume %s => -1 (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
+ return afr_locked_fill(frame, this, locked_on);
+}
- local->self_heal.child_errno[child_index] = op_errno;
- }
+int
+afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on,
+ dict_t *xdata)
+{
+ loc_t loc = {
+ 0,
+ };
- }
- UNLOCK (&frame->lock);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- call_count = afr_frame_return (frame);
+ AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, entrylk, dom, &loc,
+ name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
- if (call_count == 0) {
- sh_missing_entries_create (frame, this);
- }
+ loc_wipe(&loc);
- return 0;
+ return 0;
}
+gf_boolean_t
+afr_is_data_set(xlator_t *this, dict_t *xdata)
+{
+ return afr_is_pending_set(this, xdata, AFR_DATA_TRANSACTION);
+}
-static int
-sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_is_metadata_set(xlator_t *this, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_private_t *priv = NULL;
- dict_t *xattr_req = NULL;
- int ret = -1;
+ return afr_is_pending_set(this, xdata, AFR_METADATA_TRANSACTION);
+}
- local = frame->local;
- priv = this->private;
+gf_boolean_t
+afr_is_entry_set(xlator_t *this, dict_t *xdata)
+{
+ return afr_is_pending_set(this, xdata, AFR_ENTRY_TRANSACTION);
+}
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
+/*
+ * This function inspects the looked up replies (in an unlocked manner)
+ * and decides whether a locked verification and possible healing is
+ * required or not. It updates the three booleans for each type
+ * of healing. If the boolean flag gets set to FALSE, then we are sure
+ * no healing is required. If the boolean flag gets set to TRUE then
+ * we have to proceed with locked reinspection.
+ */
- local->call_count = call_count;
+int
+afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **link_inode, gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal,
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies_dst)
+{
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ int i = 0;
+ int valid_cnt = 0;
+ struct iatt first = {
+ 0,
+ };
+ int first_idx = 0;
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ inode = afr_inode_find(this, gfid);
+ if (!inode)
+ goto out;
+
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+
+ ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == -1)
+ continue;
+
+ /* The data segment of the changelog can be non-zero to indicate
+ * the directory needs a full heal. So the check below ensures
+ * it's not a directory before setting the data_selfheal boolean.
+ */
+ if (data_selfheal && !IA_ISDIR(replies[i].poststat.ia_type) &&
+ afr_is_data_set(this, replies[i].xdata))
+ *data_selfheal = _gf_true;
- xattr_req = dict_new();
+ if (metadata_selfheal && afr_is_metadata_set(this, replies[i].xdata))
+ *metadata_selfheal = _gf_true;
- if (xattr_req) {
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req,
- priv->pending_key[i],
- 3 * sizeof(int32_t));
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- }
+ if (entry_selfheal && afr_is_entry_set(this, replies[i].xdata))
+ *entry_selfheal = _gf_true;
+
+ valid_cnt++;
+ if (valid_cnt == 1) {
+ first = replies[i].poststat;
+ first_idx = i;
+ continue;
}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
+ if (!IA_EQUAL(first, replies[i].poststat, type)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "TYPE mismatch %d vs %d on %s for gfid:%s",
+ (int)first.ia_type, (int)replies[i].poststat.ia_type,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;"
+ "type=file;gfid=%s;"
+ "ia_type-%d=%s;ia_type-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(replies[i].poststat.ia_gfid), first_idx,
+ gf_inode_type_to_str(first.ia_type), i,
+ gf_inode_type_to_str(replies[i].poststat.ia_type));
+ ret = -EIO;
+ goto out;
+ }
- STACK_WIND_COOKIE (frame,
- sh_missing_entries_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &local->loc, xattr_req);
+ if (!IA_EQUAL(first, replies[i].poststat, uid)) {
+ gf_msg_debug(this->name, 0,
+ "UID mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
- if (!--call_count)
- break;
- }
- }
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
- if (xattr_req)
- dict_unref (xattr_req);
+ if (!IA_EQUAL(first, replies[i].poststat, gid)) {
+ gf_msg_debug(this->name, 0,
+ "GID mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
- return 0;
-}
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+ if (!IA_EQUAL(first, replies[i].poststat, prot)) {
+ gf_msg_debug(this->name, 0,
+ "MODE mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)st_mode_from_ia(first.ia_prot, 0),
+ (int)st_mode_from_ia(replies[i].poststat.ia_prot, 0),
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+ if (IA_ISREG(first.ia_type) &&
+ !IA_EQUAL(first, replies[i].poststat, size)) {
+ gf_msg_debug(this->name, 0,
+ "SIZE mismatch "
+ "%lld vs %lld on %s for gfid:%s",
+ (long long)first.ia_size,
+ (long long)replies[i].poststat.ia_size,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (data_selfheal)
+ *data_selfheal = _gf_true;
+ }
+ }
-int
-afr_sh_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
+ if (valid_cnt > 0 && link_inode) {
+ *link_inode = inode_link(inode, NULL, NULL, &first);
+ if (!*link_inode) {
+ ret = -EINVAL;
+ goto out;
+ }
+ } else if (valid_cnt < 2) {
+ ret = afr_check_stale_error(replies, priv);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (replies && replies_dst)
+ afr_replies_copy(replies_dst, replies, priv->child_count);
+ if (inode)
+ inode_unref(inode);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+
+ return ret;
+}
+
+inode_t *
+afr_inode_find(xlator_t *this, uuid_t gfid)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ inode_table_t *table = NULL;
+ inode_t *inode = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ table = this->itable;
+ if (!table)
+ return NULL;
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks failed.");
- afr_sh_missing_entries_done (frame, this);
- } else {
+ inode = inode_find(table, gfid);
+ if (inode)
+ return inode;
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- sh_missing_entries_lookup (frame, this);
- }
+ inode = inode_new(table);
+ if (!inode)
+ return NULL;
- return 0;
+ gf_uuid_copy(inode->gfid, gfid);
+
+ return inode;
}
-static int
-afr_sh_entrylk (call_frame_t *frame, xlator_t *this)
+call_frame_t *
+afr_frame_create(xlator_t *this, int32_t *op_errno)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ pid_t pid = GF_CLIENT_PID_SELF_HEALD;
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ return NULL;
+ }
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK;
+ local = AFR_FRAME_INIT(frame, (*op_errno));
+ if (!local) {
+ STACK_DESTROY(frame->root);
+ return NULL;
+ }
- afr_set_lock_number (frame, this);
+ syncopctx_setfspid(&pid);
- int_lock->lk_basename = local->loc.name;
- int_lock->lk_loc = &sh->parent_loc;
- int_lock->lock_cbk = afr_sh_post_nonblocking_entrylk_cbk;
+ frame->root->pid = pid;
- afr_nonblocking_entrylk (frame, this);
+ afr_set_lk_owner(frame, this, frame->root);
- return 0;
+ return frame;
}
-static int
-afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this)
+int
+afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, struct afr_reply *replies,
+ unsigned char *sources, unsigned char *newentry)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int **changelog = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
- priv = this->private;
+ priv = this->private;
- gf_log (this->name, GF_LOG_TRACE,
- "attempting to recreate missing entries for path=%s",
- local->loc.path);
+ gf_uuid_copy(inode->gfid, replies[source].poststat.ia_gfid);
- afr_build_parent_loc (&sh->parent_loc, &local->loc);
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
- afr_sh_entrylk (frame, this);
- return 0;
+ changelog = afr_mark_pending_changelog(priv, newentry, xattr,
+ replies[source].poststat.ia_type);
+
+ if (!changelog) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ ret |= afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+ }
+out:
+ if (changelog)
+ afr_matrix_cleanup(changelog, priv->child_count);
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
}
-afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)
+int
+afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid)
{
- afr_private_t *priv = NULL;
- afr_local_t *lc = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *shc = NULL;
+ int ret = -1;
+ int entry_ret = 1;
+ int metadata_ret = 1;
+ int data_ret = 1;
+ int or_ret = 0;
+ inode_t *inode = NULL;
+ fd_t *fd = NULL;
+ gf_boolean_t data_selfheal = _gf_false;
+ gf_boolean_t metadata_selfheal = _gf_false;
+ gf_boolean_t entry_selfheal = _gf_false;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode,
+ &data_selfheal, &metadata_selfheal,
+ &entry_selfheal, NULL);
+ if (ret)
+ goto out;
+
+ if (!(data_selfheal || metadata_selfheal || entry_selfheal)) {
+ ret = 2;
+ goto out;
+ }
+
+ if (inode->ia_type == IA_IFREG) {
+ ret = afr_selfheal_data_open(this, inode, &fd);
+ if (!fd) {
+ ret = -EIO;
+ goto out;
+ }
+ }
+ if (data_selfheal && priv->data_self_heal)
+ data_ret = afr_selfheal_data(frame, this, fd);
- priv = this->private;
+ if (metadata_selfheal && priv->metadata_self_heal)
+ metadata_ret = afr_selfheal_metadata(frame, this, inode);
- sh = &l->self_heal;
+ if (entry_selfheal && priv->entry_self_heal)
+ entry_ret = afr_selfheal_entry(frame, this, inode);
- lc = GF_CALLOC (1, sizeof (afr_local_t),
- gf_afr_mt_afr_local_t);
+ or_ret = (data_ret | metadata_ret | entry_ret);
- shc = &lc->self_heal;
+ if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO)
+ ret = -EIO;
+ else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1)
+ ret = 1;
+ else if (or_ret < 0)
+ ret = or_ret;
+ else
+ ret = 0;
- shc->unwind = sh->unwind;
- shc->need_data_self_heal = sh->need_data_self_heal;
- shc->need_metadata_self_heal = sh->need_metadata_self_heal;
- shc->need_entry_self_heal = sh->need_entry_self_heal;
- shc->forced_merge = sh->forced_merge;
- shc->healing_fd_opened = sh->healing_fd_opened;
- shc->data_lock_held = sh->data_lock_held;
- if (sh->healing_fd && !sh->healing_fd_opened)
- shc->healing_fd = fd_ref (sh->healing_fd);
- else
- shc->healing_fd = sh->healing_fd;
- shc->background = sh->background;
- shc->type = sh->type;
-
- if (l->loc.path)
- loc_copy (&lc->loc, &l->loc);
-
- lc->child_up = memdup (l->child_up, priv->child_count);
- if (l->xattr_req)
- lc->xattr_req = dict_ref (l->xattr_req);
-
- if (l->cont.lookup.inode)
- lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode);
- if (l->cont.lookup.xattr)
- lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr);
- if (l->internal_lock.inode_locked_nodes)
- lc->internal_lock.inode_locked_nodes =
- memdup (l->internal_lock.inode_locked_nodes,
- priv->child_count);
- else
- lc->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- if (l->internal_lock.entry_locked_nodes)
- lc->internal_lock.entry_locked_nodes =
- memdup (l->internal_lock.entry_locked_nodes,
- priv->child_count);
- else
- lc->internal_lock.entry_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- if (l->internal_lock.locked_nodes)
- lc->internal_lock.locked_nodes =
- memdup (l->internal_lock.locked_nodes,
- priv->child_count);
- else
- lc->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+out:
+ if (inode)
+ inode_unref(inode);
+ if (fd)
+ fd_unref(fd);
+ return ret;
+}
+/*
+ * This is the entry point for healing a given GFID. The return values for this
+ * function are as follows:
+ * '0' if the self-heal is successful
+ * '1' if the afr-xattrs are non-zero (due to on-going IO) and no heal is needed
+ * '2' if the afr-xattrs are all-zero and no heal is needed
+ * $errno if the heal on the gfid failed.
+ */
- lc->internal_lock.inodelk_lock_count =
- l->internal_lock.inodelk_lock_count;
- lc->internal_lock.entrylk_lock_count =
- l->internal_lock.entrylk_lock_count;
+int
+afr_selfheal(xlator_t *this, uuid_t gfid)
+{
+ int ret = -1;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
- return lc;
-}
+ frame = afr_frame_create(this, NULL);
+ if (!frame)
+ return ret;
+ local = frame->local;
+ local->xdata_req = dict_new();
-int
-afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- char sh_type_str[256] = {0,};
-
- priv = this->private;
- local = bgsh_frame->local;
- sh = &local->self_heal;
-
- if (local->govinda_gOvinda) {
- afr_set_split_brain (this, local->cont.lookup.inode, _gf_true);
- } else {
- afr_set_split_brain (this, local->cont.lookup.inode, _gf_false);
- }
-
- afr_self_heal_type_str_get(sh, sh_type_str,
- sizeof(sh_type_str));
- gf_log (this->name, GF_LOG_NORMAL,
- "background %s self-heal completed on %s", sh_type_str,
- local->loc.path);
-
- if (!sh->unwound) {
- sh->unwind (sh->orig_frame, this);
- }
+ ret = afr_selfheal_do(frame, this, gfid);
- if (sh->background) {
- LOCK (&priv->lock);
- {
- priv->background_self_heals_started--;
- }
- UNLOCK (&priv->lock);
- }
+ if (frame)
+ AFR_STACK_DESTROY(frame);
- AFR_STACK_DESTROY (bgsh_frame);
+ return ret;
+}
- return 0;
+afr_local_t *
+__afr_dequeue_heals(afr_private_t *priv)
+{
+ afr_local_t *local = NULL;
+
+ if (list_empty(&priv->heal_waiting))
+ goto none;
+ if ((priv->background_self_heal_count > 0) &&
+ (priv->healers >= priv->background_self_heal_count))
+ goto none;
+
+ local = list_entry(priv->heal_waiting.next, afr_local_t, healer);
+ priv->heal_waiters--;
+ GF_ASSERT(priv->heal_waiters >= 0);
+ list_del_init(&local->healer);
+ list_add(&local->healer, &priv->healing);
+ priv->healers++;
+ return local;
+none:
+ gf_msg_debug(THIS->name, 0,
+ "Nothing dequeued. "
+ "Num healers: %d, Num Waiters: %d",
+ priv->healers, priv->heal_waiters);
+ return NULL;
}
int
-afr_self_heal (call_frame_t *frame, xlator_t *this)
+afr_refresh_selfheal_wrap(void *opaque)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ call_frame_t *heal_frame = opaque;
+ afr_local_t *local = heal_frame->local;
+ int ret = 0;
- call_frame_t *sh_frame = NULL;
- afr_local_t *sh_local = NULL;
+ ret = afr_selfheal(heal_frame->this, local->refreshinode->gfid);
+ return ret;
+}
+int
+afr_refresh_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+ call_frame_t *heal_frame = opaque;
+ xlator_t *this = heal_frame->this;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = heal_frame->local;
+
+ LOCK(&priv->lock);
+ {
+ list_del_init(&local->healer);
+ priv->healers--;
+ GF_ASSERT(priv->healers >= 0);
+ local = __afr_dequeue_heals(priv);
+ }
+ UNLOCK(&priv->lock);
+
+ AFR_STACK_DESTROY(heal_frame);
+
+ if (local)
+ afr_heal_synctask(this, local);
+ return 0;
+}
- local = frame->local;
- priv = this->private;
+void
+afr_heal_synctask(xlator_t *this, afr_local_t *local)
+{
+ int ret = 0;
+ call_frame_t *heal_frame = NULL;
+
+ heal_frame = local->heal_frame;
+ ret = synctask_new(this->ctx->env, afr_refresh_selfheal_wrap,
+ afr_refresh_heal_done, heal_frame, heal_frame);
+ if (ret < 0)
+ /* Heal not launched. Will be queued when the next inode
+ * refresh happens and shd hasn't healed it yet. */
+ afr_refresh_heal_done(ret, heal_frame, heal_frame);
+}
- afr_set_lk_owner (frame, this);
+gf_boolean_t
+afr_throttled_selfheal(call_frame_t *frame, xlator_t *this)
+{
+ gf_boolean_t can_heal = _gf_true;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+
+ LOCK(&priv->lock);
+ {
+ if ((priv->background_self_heal_count > 0) &&
+ (priv->heal_wait_qlen + priv->background_self_heal_count) >
+ (priv->heal_waiters + priv->healers)) {
+ list_add_tail(&local->healer, &priv->heal_waiting);
+ priv->heal_waiters++;
+ local = __afr_dequeue_heals(priv);
+ } else {
+ can_heal = _gf_false;
+ }
+ }
+ UNLOCK(&priv->lock);
- if (local->self_heal.background) {
- LOCK (&priv->lock);
- {
- if (priv->background_self_heals_started
- > priv->background_self_heal_count) {
+ if (can_heal) {
+ if (local)
+ afr_heal_synctask(this, local);
+ else
+ gf_msg_debug(this->name, 0,
+ "Max number of heals are "
+ "pending, background self-heal rejected.");
+ }
- local->self_heal.background = _gf_false;
+ return can_heal;
+}
- } else {
- priv->background_self_heals_started++;
- }
- }
- UNLOCK (&priv->lock);
+int
+afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
+ afr_transaction_type type)
+{
+ int source = -1;
+ int i = 0;
+
+ /* Give preference to local child to save on bandwidth */
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->local[i] && sources[i]) {
+ if ((type == AFR_DATA_TRANSACTION) && AFR_IS_ARBITER_BRICK(priv, i))
+ continue;
+
+ source = i;
+ goto out;
}
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "performing self heal on %s (metadata=%d data=%d entry=%d)",
- local->loc.path,
- local->self_heal.need_metadata_self_heal,
- local->self_heal.need_data_self_heal,
- local->self_heal.need_entry_self_heal);
-
- sh_frame = copy_frame (frame);
- sh_local = afr_local_copy (local, this);
- sh_frame->local = sh_local;
- sh = &sh_local->self_heal;
-
- sh->orig_frame = frame;
-
- sh->completion_cbk = afr_self_heal_completion_cbk;
-
-
- sh->buf = GF_CALLOC (priv->child_count, sizeof (struct iatt),
- gf_afr_mt_iatt);
- sh->child_errno = GF_CALLOC (priv->child_count, sizeof (int),
- gf_afr_mt_int);
- sh->success = GF_CALLOC (priv->child_count, sizeof (int),
- gf_afr_mt_int);
- sh->xattr = GF_CALLOC (priv->child_count, sizeof (dict_t *),
- gf_afr_mt_dict_t);
- sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count,
- gf_afr_mt_int);
- sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes),
- priv->child_count,
- gf_afr_mt_int);
-
- sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count,
- gf_afr_mt_int32_t);
-
- for (i = 0; i < priv->child_count; i++) {
- sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t),
- priv->child_count,
- gf_afr_mt_int32_t);
- }
-
- sh->delta_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count,
- gf_afr_mt_int32_t);
- for (i = 0; i < priv->child_count; i++) {
- sh->delta_matrix[i] = GF_CALLOC (sizeof (int32_t),
- priv->child_count,
- gf_afr_mt_int32_t);
- }
-
- if (local->success_count && local->enoent_count) {
- afr_self_heal_missing_entries (sh_frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to metadata check on %s",
- local->loc.path);
-
- afr_sh_missing_entries_done (sh_frame, this);
- }
-
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ goto out;
+ }
+ }
+out:
+ return source;
}
-void
-afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
- size_t size)
+static int
+afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- GF_ASSERT (str && (size > 0));
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->replies[i].poststat = *buf;
+ local->replies[i].preparent = *preparent;
+ local->replies[i].postparent = *postparent;
+ }
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
- if (self_heal_p->need_metadata_self_heal) {
- snprintf(str, size, " meta-data");
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ unsigned char *mkdir_on = alloca0(priv->child_count);
+ unsigned char *lookup_on = alloca0(priv->child_count);
+ loc_t loc = {0};
+ int32_t op_errno = 0;
+ int32_t child_op_errno = 0;
+ struct iatt iatt = {0};
+ dict_t *xdata = NULL;
+ uuid_t anon_inode_gfid = {0};
+ int mkdir_count = 0;
+ int i = 0;
+
+ /*Try to mkdir everywhere and return success if the dir exists on 'child'
+ */
+
+ if (!priv->use_anon_inode) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &op_errno);
+ if (op_errno) {
+ goto out;
+ }
+ local = frame->local;
+ if (!local->child_up[child]) {
+ /*Other bricks may need mkdir so don't error out yet*/
+ child_op_errno = ENOTCONN;
+ }
+ gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ if (priv->anon_inode[i]) {
+ mkdir_on[i] = 0;
+ } else {
+ mkdir_on[i] = 1;
+ mkdir_count++;
}
+ }
- if (self_heal_p->need_data_self_heal) {
- snprintf(str + strlen(str), size - strlen(str),
- " data");
+ if (mkdir_count == 0) {
+ *linked_inode = inode_find(this->itable, anon_inode_gfid);
+ if (*linked_inode) {
+ op_errno = 0;
+ goto out;
+ }
+ }
+
+ loc.parent = inode_ref(this->itable->root);
+ loc.name = priv->anon_inode_name;
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true);
+ if (op_errno) {
+ goto out;
+ }
+
+ if (mkdir_count == 0) {
+ memcpy(lookup_on, local->child_up, priv->child_count);
+ goto lookup;
+ }
+
+ AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0,
+ xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!mkdir_on[i]) {
+ continue;
}
- if (self_heal_p->need_entry_self_heal) {
- snprintf(str + strlen(str), size - strlen(str),
- " entry");
+ if (local->replies[i].op_ret == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno == EEXIST) {
+ lookup_on[i] = 1;
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+
+ if (AFR_COUNT(lookup_on, priv->child_count) == 0) {
+ goto link;
+ }
+
+lookup:
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xdata);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!lookup_on[i]) {
+ continue;
}
+
+ if (local->replies[i].op_ret == 0) {
+ if (gf_uuid_compare(anon_inode_gfid,
+ local->replies[i].poststat.ia_gfid) == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else {
+ if (i == child)
+ child_op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA,
+ "%s has gfid: %s", priv->anon_inode_name,
+ uuid_utoa(local->replies[i].poststat.ia_gfid));
+ }
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+link:
+ if (!gf_uuid_is_null(iatt.ia_gfid)) {
+ *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt);
+ if (*linked_inode) {
+ op_errno = 0;
+ inode_lookup(*linked_inode);
+ } else {
+ op_errno = ENOMEM;
+ }
+ goto out;
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ /*child_op_errno takes precedence*/
+ if (child_op_errno == 0) {
+ child_op_errno = op_errno;
+ }
+
+ if (child_op_errno && *linked_inode) {
+ inode_unref(*linked_inode);
+ *linked_inode = NULL;
+ }
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return -child_op_errno;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
deleted file mode 100644
index 6431feaff35..00000000000
--- a/xlators/cluster/afr/src/afr-self-heal-common.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __AFR_SELF_HEAL_COMMON_H__
-#define __AFR_SELF_HEAL_COMMON_H__
-
-#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512))
-
-typedef enum {
- AFR_SELF_HEAL_ENTRY,
- AFR_SELF_HEAL_METADATA,
- AFR_SELF_HEAL_DATA,
-} afr_self_heal_type;
-
-int
-afr_sh_select_source (int sources[], int child_count);
-
-int
-afr_sh_sink_count (int sources[], int child_count);
-
-int
-afr_sh_source_count (int sources[], int child_count);
-
-int
-afr_sh_supress_errenous_children (int sources[], int child_errno[],
- int child_count);
-
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
-
-void
-afr_sh_build_pending_matrix (afr_private_t *priv,
- int32_t *pending_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type);
-
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], int success[],
- int child_count, afr_transaction_type type);
-
-int
-afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
- afr_self_heal_type type);
-
-int
-afr_sh_delta_to_xattr (afr_private_t *priv,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type);
-
-int
-afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count);
-
-void
-afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
- size_t size);
-
-#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 0fd8dae69f9..37bcc2b3f9e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -1,1113 +1,891 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "afr-self-heal-algorithm.h"
-
+#include <glusterfs/byte-order.h>
+#include "protocol-common.h"
+#include "afr-messages.h"
+#include <glusterfs/events.h>
-int
-afr_sh_data_done (call_frame_t *frame, xlator_t *this)
+#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size))
+static int
+__checksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, uint32_t weak, uint8_t *strong, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- /*
- TODO: cleanup sh->*
- */
-
- if (sh->healing_fd && !sh->healing_fd_opened) {
- /* unref only if we created the fd ourselves */
-
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
+ afr_local_t *local = NULL;
+ struct afr_reply *replies = NULL;
+ int i = (long)cookie;
+
+ local = frame->local;
+ replies = local->replies;
+
+ replies[i].valid = 1;
+ replies[i].op_ret = op_ret;
+ replies[i].op_errno = op_errno;
+ if (xdata) {
+ replies[i].buf_has_zeroes = dict_get_str_boolean(
+ xdata, "buf-has-zeroes", _gf_false);
+ replies[i].fips_mode_rchecksum = dict_get_str_boolean(
+ xdata, "fips-mode-rchecksum", _gf_false);
+ }
+ if (strong) {
+ if (replies[i].fips_mode_rchecksum) {
+ memcpy(local->replies[i].checksum, strong, SHA256_DIGEST_LENGTH);
+ } else {
+ memcpy(local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
}
+ }
-/* for (i = 0; i < priv->child_count; i++) */
-/* sh->locked_nodes[i] = 0; */
-
- gf_log (this->name, GF_LOG_TRACE,
- "self heal of %s completed",
- local->loc.path);
-
- sh->completion_cbk (frame, this);
-
- return 0;
+ syncbarrier_wake(&local->barrier);
+ return 0;
}
-
-int
-afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static gf_boolean_t
+__afr_can_skip_data_block_heal(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size, struct iatt *poststat)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
-
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "flush or setattr failed on %s on subvolume %s: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_subvols = NULL;
+ gf_boolean_t checksum_match = _gf_true;
+ struct afr_reply *replies = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+ replies = local->replies;
+
+ xdata = dict_new();
+ if (!xdata)
+ goto out;
+ if (dict_set_int32_sizen(xdata, "check-zero-filled", 1)) {
+ dict_unref(xdata);
+ goto out;
+ }
+
+ wind_subvols = alloca0(priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || healed_sinks[i])
+ wind_subvols[i] = 1;
+ }
+
+ AFR_ONLIST(wind_subvols, frame, __checksum_cbk, rchecksum, fd, offset, size,
+ xdata);
+ if (xdata)
+ dict_unref(xdata);
+
+ if (!replies[source].valid || replies[source].op_ret != 0)
+ return _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source)
+ continue;
+ if (replies[i].valid) {
+ if (memcmp(replies[source].checksum, replies[i].checksum,
+ replies[source].fips_mode_rchecksum
+ ? SHA256_DIGEST_LENGTH
+ : MD5_DIGEST_LENGTH)) {
+ checksum_match = _gf_false;
+ break;
+ }
+ }
+ }
- if (call_count == 0) {
- afr_sh_data_done (frame, this);
- }
+ if (checksum_match) {
+ if (HAS_HOLES(poststat))
+ return _gf_true;
- return 0;
+ /* For non-sparse files, we might be better off writing the
+ * zeroes to sinks to avoid mismatch of disk-usage in bricks. */
+ if (local->replies[source].buf_has_zeroes)
+ return _gf_false;
+ else
+ return _gf_true;
+ }
+out:
+ return _gf_false;
}
-
-int
-afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *statpre, struct iatt *statpost)
+static gf_boolean_t
+__afr_is_sink_zero_filled(xlator_t *this, fd_t *fd, size_t size, off_t offset,
+ int sink)
{
- afr_sh_data_flush_cbk (frame, cookie, this, op_ret, op_errno);
-
- return 0;
+ afr_private_t *priv = NULL;
+ struct iobref *iobref = NULL;
+ struct iovec *iovec = NULL;
+ int count = 0;
+ int ret = 0;
+ gf_boolean_t zero_filled = _gf_false;
+
+ priv = this->private;
+ ret = syncop_readv(priv->children[sink], fd, size, offset, 0, &iovec,
+ &count, &iobref, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ ret = iov_0filled(iovec, count);
+ if (!ret)
+ zero_filled = _gf_true;
+out:
+ if (iovec)
+ GF_FREE(iovec);
+ if (iobref)
+ iobref_unref(iobref);
+ return zero_filled;
}
-
-int
-afr_sh_data_close (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size,
+ struct afr_reply *replies, int type)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- int i = 0;
- int call_count = 0;
- int source = 0;
- int32_t valid = 0;
-
- struct iatt stbuf = {0,};
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME);
-
- stbuf.ia_atime = sh->buf[source].ia_atime;
- stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec;
- stbuf.ia_mtime = sh->buf[source].ia_mtime;
- stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec;
-
- if (sh->healing_fd_opened) {
- /* not our job to close the fd */
+ struct iovec *iovec = NULL;
+ int count = 0;
+ struct iobref *iobref = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ ret = syncop_readv(priv->children[source], fd, size, offset, 0, &iovec,
+ &count, &iobref, NULL, NULL, NULL);
+ if (ret <= 0)
+ return ret;
- afr_sh_data_done (frame, this);
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+
+ /*
+ * TODO: Use fiemap() and discard() to heal holes
+ * in the future.
+ *
+ * For now,
+ *
+ * - if the source had any holes at all,
+ * AND
+ * - if we are writing past the original file size
+ * of the sink
+ * AND
+ * - is NOT the last block of the source file. if
+ * the block contains EOF, it has to be written
+ * in order to set the file size even if the
+ * last block is 0-filled.
+ * AND
+ * - if the read buffer is filled with only 0's
+ *
+ * then, skip writing to this source. We don't depend
+ * on the write to happen to update the size as we
+ * have performed an ftruncate() upfront anyways.
+ */
+#define is_last_block(o, b, s) ((s >= o) && (s <= (o + b)))
+ if (HAS_HOLES((&replies[source].poststat)) &&
+ offset >= replies[i].poststat.ia_size &&
+ !is_last_block(offset, size, replies[source].poststat.ia_size) &&
+ (iov_0filled(iovec, count) == 0))
+ continue;
+
+ /* Avoid filling up sparse regions of the sink with 0-filled
+ * writes.*/
+ if (type == AFR_SELFHEAL_DATA_FULL &&
+ HAS_HOLES((&replies[source].poststat)) &&
+ ((offset + size) <= replies[i].poststat.ia_size) &&
+ (iov_0filled(iovec, count) == 0) &&
+ __afr_is_sink_zero_filled(this, fd, size, offset, i)) {
+ continue;
}
- if (!sh->healing_fd) {
- afr_sh_data_done (frame, this);
- return 0;
- }
-
- call_count = (sh->active_sinks + 1) * 2;
- local->call_count = call_count;
-
- /* closed source */
- gf_log (this->name, GF_LOG_TRACE,
- "closing fd of %s on %s",
- local->loc.path, priv->children[sh->source]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
- (void *) (long) sh->source,
- priv->children[sh->source],
- priv->children[sh->source]->fops->flush,
- sh->healing_fd);
- call_count--;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk,
- (void *) (long) sh->source,
- priv->children[sh->source],
- priv->children[sh->source]->fops->setattr,
- &local->loc, &stbuf, valid);
-
- call_count--;
-
- if (call_count == 0)
- return 0;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "closing fd of %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- sh->healing_fd);
-
- call_count--;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc, &stbuf, valid);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
+ ret = syncop_writev(priv->children[i], fd, iovec, count, offset, iobref,
+ 0, NULL, NULL, NULL, NULL);
+ if (ret != iov_length(iovec, count)) {
+ /* write() failed on this sink. unset the corresponding
+ member in sinks[] (which is healed_sinks[] in the
+ caller) so that this server does NOT get considered
+ as successfully healed.
+ */
+ healed_sinks[i] = 0;
+ }
+ }
+ if (iovec)
+ GF_FREE(iovec);
+ if (iobref)
+ iobref_unref(iobref);
-int
-afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t * local = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "locking inode of %s on child %d failed: %s",
- local->loc.path, child_index,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "inode of %s on child %d locked",
- local->loc.path, child_index);
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_close (frame, this);
- }
-
- return 0;
+ return ret;
}
-
-int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source,
+ unsigned char *healed_sinks)
{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = this->private;
+ int i = 0;
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
+ if (!locked_on[source])
+ return _gf_false;
- GF_ASSERT (!sh->data_lock_held);
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i] && locked_on[i])
+ return _gf_true;
+ }
- int_lock->lock_cbk = afr_sh_data_close;
- afr_unlock (frame, this);
-
- return 0;
+ return _gf_false;
}
-
-int
-afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks, off_t offset,
+ size_t size, int type, struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- gf_log (this->name, GF_LOG_TRACE,
- "finishing data selfheal of %s", local->loc.path);
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ unsigned char *data_lock = NULL;
+
+ priv = this->private;
+ data_lock = alloca0(priv->child_count);
+
+ ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size,
+ data_lock);
+ {
+ if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
- if (!sh->data_lock_held)
- afr_sh_data_unlock (frame, this);
- else
- afr_sh_data_close (frame, this);
+ if (type == AFR_SELFHEAL_DATA_DIFF &&
+ __afr_can_skip_data_block_heal(frame, this, fd, source,
+ healed_sinks, offset, size,
+ &replies[source].poststat)) {
+ ret = 0;
+ goto unlock;
+ }
- return 0;
+ ret = __afr_selfheal_data_read_write(
+ frame, this, fd, source, healed_sinks, offset, size, replies, type);
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, fd->inode, this->name, offset, size,
+ data_lock);
+ return ret;
}
-
-int
-afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
+static int
+afr_selfheal_data_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *healed_sinks)
{
- int call_count = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- call_count = afr_frame_return (frame);
+ local = frame->local;
+ priv = this->private;
- if (call_count == 0)
- afr_sh_data_finish (frame, this);
-
- return 0;
-}
+ if (!priv->ensure_durability)
+ return 0;
+ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, fsync, fd, 0, NULL);
-int
-afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success,
- priv->child_count, AFR_DATA_TRANSACTION);
-
- erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count,
- gf_afr_mt_dict_t);
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
-
- erase_xattr[i] = get_new_dict();
- dict_ref (erase_xattr[i]);
- }
- }
-
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
- priv->child_count, AFR_DATA_TRANSACTION);
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "erasing pending flags from %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- sh->healing_fd,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
- if (!--call_count)
- break;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
- }
- }
- GF_FREE (erase_xattr);
-
- return 0;
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret != 0)
+ /* fsync() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+ return 0;
}
-
-int
-afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
+static int
+afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks,
+ int source, struct afr_reply *replies)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int call_count = 0;
- int child_index = 0;
-
- priv = this->private;
- local = frame->local;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1)
- gf_log (this->name, GF_LOG_DEBUG,
- "ftruncate of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- else
- gf_log (this->name, GF_LOG_TRACE,
- "ftruncate of %s on subvolume %s completed",
- local->loc.path,
- priv->children[child_index]->name);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_erase_pending (frame, this);
- }
-
- return 0;
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int i = 0;
+
+ if (priv->data_self_heal_algorithm == AFR_SELFHEAL_DATA_DYNAMIC) {
+ type = AFR_SELFHEAL_DATA_FULL;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] && i != source)
+ continue;
+ if (replies[i].poststat.ia_size) {
+ type = AFR_SELFHEAL_DATA_DIFF;
+ break;
+ }
+ }
+ } else {
+ type = priv->data_self_heal_algorithm;
+ }
+ return type;
}
-
-int
-afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
+ unsigned char *healed_sinks, struct afr_reply *replies)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
- int *sources = NULL;
- int call_count = 0;
- int i = 0;
-
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sources = sh->sources;
- call_count = sh->active_sinks;
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- sh->healing_fd, sh->file_size);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
+ afr_private_t *priv = NULL;
+ off_t off = 0;
+ size_t block = 0;
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int ret = -1;
+ call_frame_t *iter_frame = NULL;
+ unsigned char arbiter_sink_status = 0;
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing data selfheal on %s", uuid_utoa(fd->inode->gfid));
+
+ priv = this->private;
+ if (priv->arbiter_count) {
+ arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
+ healed_sinks[ARBITER_BRICK_INDEX] = 0;
+ }
+
+ block = 128 * 1024 * priv->data_self_heal_window_size;
+
+ type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies);
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (off = 0; off < replies[source].poststat.ia_size; off += block) {
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+ ret = afr_selfheal_data_block(iter_frame, this, fd, source,
+ healed_sinks, off, block, type, replies);
+ if (ret < 0)
+ goto out;
-static struct afr_sh_algorithm *
-sh_algo_from_name (xlator_t *this, char *name)
-{
- int i = 0;
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+ }
- while (afr_self_heal_algorithms[i].name) {
- if (!strcmp (name, afr_self_heal_algorithms[i].name)) {
- return &afr_self_heal_algorithms[i];
- }
+ ret = afr_selfheal_data_fsync(frame, this, fd, healed_sinks);
- i++;
- }
+out:
+ if (arbiter_sink_status)
+ healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status;
- return NULL;
+ if (iter_frame)
+ AFR_STACK_DESTROY(iter_frame);
+ return ret;
}
-
static int
-sh_zero_byte_files_exist (afr_self_heal_t *sh, int child_count)
+__afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *healed_sinks, uint64_t size)
{
- int i;
- int ret = 0;
-
- for (i = 0; i < child_count; i++) {
- if (sh->buf[i].ia_size == 0) {
- ret = 1;
- break;
- }
- }
-
- return ret;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ /* This will send truncate on the arbiter brick as well if it is marked as
+ * sink. If changelog is enabled on the volume it captures truncate as a
+ * data transactions on the arbiter brick. This will help geo-rep to
+ * properly sync the data from master to slave if arbiter is the ACTIVE
+ * brick during syncing and which had got some entries healed for data as
+ * part of self heal.
+ */
+ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size,
+ NULL);
+
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret == -1)
+ /* truncate() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+
+ return 0;
}
-
-struct afr_sh_algorithm *
-afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_has_source_witnesses(xlator_t *this, unsigned char *sources,
+ uint64_t *witness)
{
- afr_private_t * priv = NULL;
- struct afr_sh_algorithm * algo = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- algo = sh_algo_from_name (this, priv->data_self_heal_algorithm);
-
- if (algo == NULL) {
- /* option not set, so fall back on heuristics */
-
- if ((local->enoent_count != 0)
- || sh_zero_byte_files_exist (sh, priv->child_count)
- || (sh->file_size <= (priv->data_self_heal_window_size * this->ctx->page_size))) {
-
- /*
- * If the file does not exist on one of the subvolumes,
- * or a zero-byte file exists (created by entry self-heal)
- * the entire content has to be copied anyway, so there
- * is no benefit from using the "diff" algorithm.
- *
- * If the file size is about the same as page size,
- * the entire file can be read and written with a few
- * (pipelined) STACK_WINDs, which will be faster
- * than "diff" which has to read checksums and then
- * read and write.
- */
-
- algo = sh_algo_from_name (this, "full");
-
- } else {
- algo = sh_algo_from_name (this, "diff");
- }
- }
-
- return algo;
-}
+ int i = 0;
+ afr_private_t *priv = NULL;
+ priv = this->private;
-int
-afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
- int source = 0;
- int i = 0;
-
- struct afr_sh_algorithm *sh_algo = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
- }
- }
- sh->success[source] = 1;
-
- if (active_sinks == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_data_finish (frame, this);
- return 0;
- }
- sh->active_sinks = active_sinks;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing file %s from subvolume %s to %d other",
- local->loc.path, priv->children[source]->name, active_sinks);
-
- sh->algo_completion_cbk = afr_sh_data_trim_sinks;
- sh->algo_abort_cbk = afr_sh_data_finish;
-
- sh_algo = afr_sh_data_pick_algo (frame, this);
-
- sh_algo->fn (frame, this);
-
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] && witness[i])
+ return _gf_true;
+ }
+ return _gf_false;
}
-
-int
-afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+afr_does_size_mismatch(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_local_t * orig_local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct iatt *min = NULL;
+ struct iatt *max = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int source = 0;
- int i = 0;
+ priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr,
- priv->child_count, AFR_DATA_TRANSACTION);
+ if (replies[i].op_ret < 0)
+ continue;
- afr_sh_print_pending_matrix (sh->pending_matrix, this);
+ if (!sources[i])
+ continue;
- nsources = afr_sh_mark_sources (sh, priv->child_count,
- AFR_SELF_HEAL_DATA);
+ if (AFR_IS_ARBITER_BRICK(priv, i) && (replies[i].poststat.ia_size == 0))
+ continue;
- afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
- priv->child_count);
+ if (!min)
+ min = &replies[i].poststat;
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "No self-heal needed for %s",
- local->loc.path);
+ if (!max)
+ max = &replies[i].poststat;
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Picking favorite child %s as authentic source to resolve conflicting data of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
- }
-
- if (nsources == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to self-heal contents of '%s' (possible split-brain). "
- "Please delete the file from all but the preferred "
- "subvolume.", local->loc.path);
-
- local->govinda_gOvinda = 1;
-
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
-
- if (source == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No active sources found.");
-
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- sh->source = source;
- sh->block_size = 65536;
- sh->file_size = sh->buf[source].ia_size;
-
- if (FILE_HAS_HOLES (&sh->buf[source]))
- sh->file_has_holes = 1;
-
- orig_local = sh->orig_frame->local;
- orig_local->cont.lookup.buf.ia_size = sh->buf[source].ia_size;
+ if (min->ia_size > replies[i].poststat.ia_size)
+ min = &replies[i].poststat;
- /* detect changes not visible through pending flags -- JIC */
- for (i = 0; i < priv->child_count; i++) {
- if (i == source || sh->child_errno[i])
- continue;
+ if (max->ia_size < replies[i].poststat.ia_size)
+ max = &replies[i].poststat;
+ }
- if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
- }
+ if (min && max) {
+ if (min->ia_size != max->ia_size)
+ return _gf_true;
+ }
- afr_set_read_child (this, local->loc.inode, sh->source);
-
- /*
- quick-read might have read the file, so send xattr from
- the source subvolume (http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=815)
- */
-
- dict_unref (orig_local->cont.lookup.xattr);
- if (orig_local->cont.lookup.xattrs)
- orig_local->cont.lookup.xattr = dict_ref (orig_local->cont.lookup.xattrs[sh->source]);
-
- if (sh->background) {
- sh->unwind (sh->orig_frame, this);
- sh->unwound = _gf_true;
- }
-
- afr_sh_data_sync_prepare (frame, this);
-
- return 0;
+ return _gf_false;
}
-
-int
-afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr)
+static void
+afr_mark_biggest_witness_as_source(xlator_t *this, unsigned char *sources,
+ uint64_t *witness)
{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- int source = 0;
- int i = 0;
-
- sh = &local->self_heal;
- priv = this->private;
-
- sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count,
- gf_afr_mt_int32_t);
- for (i = 0; i < priv->child_count; i++) {
- sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t),
- priv->child_count,
- gf_afr_mt_int32_t);
- }
-
- sh->sources = GF_CALLOC (priv->child_count, sizeof (*sh->sources),
- gf_afr_mt_int32_t);
-
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, xattr,
- priv->child_count, AFR_DATA_TRANSACTION);
-
- (void)afr_sh_mark_sources (sh, priv->child_count, AFR_SELF_HEAL_DATA);
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
-
- return source;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t biggest_witness = 0;
+
+ priv = this->private;
+ /* Find source with biggest witness count */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (biggest_witness < witness[i])
+ biggest_witness = witness[i];
+ }
+
+ /* Mark files with less witness count as not source */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (witness[i] < biggest_witness)
+ sources[i] = 0;
+ }
+
+ return;
}
-
-int
-afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf)
+/* This is a tie breaker function. Only one source be assigned here */
+static void
+afr_mark_newest_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "fstat of %s on %s succeeded",
- local->loc.path,
- priv->children[child_index]->name);
-
- sh->buf[child_index] = *buf;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_fix (frame, this);
- }
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ uint32_t max_ctime = 0;
+
+ priv = this->private;
+ /* Find source with latest ctime */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+
+ if (max_ctime <= replies[i].poststat.ia_ctime) {
+ source = i;
+ max_ctime = replies[i].poststat.ia_ctime;
+ }
+ }
- return 0;
+ /* Only mark one of the files as source to break ties */
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ sources[source] = 1;
}
-
-int
-afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_data_finalize_source(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ unsigned char *undid_pending, struct afr_reply *replies, uint64_t *witness)
{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = 0;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fstat,
- sh->healing_fd);
-
- if (!--call_count)
- break;
- }
- }
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int sources_count = 0;
+ priv = this->private;
+
+ sources_count = AFR_COUNT(sources, priv->child_count);
+
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count) {
+ /* split brain */
+ source = afr_mark_split_brain_source_sinks(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ replies, AFR_DATA_TRANSACTION);
+ if (source < 0) {
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=data;"
+ "file=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(inode->gfid));
+ return -EIO;
+ }
- return 0;
+ _afr_fav_child_reset_sink_xattrs(
+ frame, this, inode, source, healed_sinks, undid_pending,
+ AFR_DATA_TRANSACTION, locked_on, replies);
+ goto out;
+ }
+
+ /* No split brain at this point. If we were called from
+ * afr_heal_splitbrain_file(), abort.*/
+ if (afr_dict_contains_heal_op(frame))
+ return -EIO;
+
+ /* If there are no witnesses/size-mismatches on sources we are done*/
+ if (!afr_does_size_mismatch(this, sources, replies) &&
+ !afr_has_source_witnesses(this, sources, witness))
+ goto out;
+
+ afr_mark_largest_file_as_source(this, sources, replies);
+ afr_mark_biggest_witness_as_source(this, sources, witness);
+ afr_mark_newest_file_as_source(this, sources, replies);
+ if (priv->arbiter_count)
+ /* Choose non-arbiter brick as source for empty files. */
+ afr_mark_source_sinks_if_file_empty(this, sources, sinks, healed_sinks,
+ locked_on, replies,
+ AFR_DATA_TRANSACTION);
+
+out:
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ source = afr_choose_source_by_policy(priv, sources, AFR_DATA_TRANSACTION);
+
+ return source;
}
-
+/*
+ * __afr_selfheal_data_prepare:
+ *
+ * This function inspects the on-disk xattrs and determines which subvols
+ * are sources and sinks.
+ *
+ * The return value is the index of the subvolume to be used as the source
+ * for self-healing, or -1 if no healing is necessary/split brain.
+ */
int
-afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr)
+__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *pflag)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ uint64_t *witness = NULL;
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "fxattrop of %s on %s succeeded",
- local->loc.path,
- priv->children[child_index]->name);
+ priv = this->private;
- sh->xattr[child_index] = dict_ref (xattr);
- }
- }
- UNLOCK (&frame->lock);
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
- call_count = afr_frame_return (frame);
+ if (ret)
+ return ret;
- if (call_count == 0) {
- afr_sh_data_fstat (frame, this);
- }
+ witness = alloca0(priv->child_count * sizeof(*witness));
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_DATA_TRANSACTION, locked_on, sources,
+ sinks, witness, pflag);
+ if (ret)
+ return ret;
- return 0;
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+ source = __afr_selfheal_data_finalize_source(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ undid_pending, replies, witness);
+ if (source < 0)
+ return -EIO;
+
+ return source;
}
-
-int
-afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t *xattr_req = NULL;
-
- int32_t zero_pending[3] = {0, 0, 0};
-
- int call_count = 0;
- int i = 0;
- int ret = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
-
- local->call_count = call_count;
-
- xattr_req = dict_new();
- if (xattr_req) {
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_static_bin (xattr_req, priv->pending_key[i],
- zero_pending, 3 * sizeof(int32_t));
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value");
- }
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+ gf_boolean_t did_sh = _gf_true;
+ gf_boolean_t is_arbiter_the_only_sink = _gf_false;
+ gf_boolean_t empty_file = _gf_false;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ data_lock = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "self-heal as only %d number "
+ "of subvolumes "
+ "could be locked",
+ uuid_utoa(fd->inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- sh->healing_fd, GF_XATTROP_ADD_ARRAY,
- xattr_req);
-
- if (!--call_count)
- break;
- }
- }
-
- if (xattr_req)
- dict_unref (xattr_req);
+ ret = __afr_selfheal_data_prepare(frame, this, fd->inode, data_lock,
+ sources, sinks, healed_sinks,
+ undid_pending, locked_replies, NULL);
+ if (ret < 0)
+ goto unlock;
- return 0;
-}
-
-
-int
-afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ did_sh = _gf_false;
+ goto unlock;
+ }
- local = frame->local;
- int_lock = &local->internal_lock;
+ source = ret;
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non Blocking inodelks failed.");
- afr_sh_data_done (frame, this);
- } else {
+ if (AFR_IS_ARBITER_BRICK(priv, source)) {
+ empty_file = afr_is_file_empty_on_all_children(priv,
+ locked_replies);
+ if (empty_file)
+ goto restore_time;
- gf_log (this->name, GF_LOG_DEBUG,
- "Non Blocking inodelks done. Proceeding to FOP");
- afr_sh_data_fxattrop (frame, this);
+ did_sh = _gf_false;
+ goto unlock;
}
- return 0;
+ ret = __afr_selfheal_truncate_sinks(
+ frame, this, fd, healed_sinks,
+ locked_replies[source].poststat.ia_size);
+ if (ret < 0)
+ goto unlock;
+
+ if (priv->arbiter_count &&
+ AFR_COUNT(healed_sinks, priv->child_count) == 1 &&
+ healed_sinks[ARBITER_BRICK_INDEX]) {
+ is_arbiter_the_only_sink = _gf_true;
+ goto restore_time;
+ }
+ ret = 0;
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock);
+ if (ret < 0)
+ goto out;
+
+ if (!did_sh)
+ goto out;
+
+ ret = afr_selfheal_data_do(frame, this, fd, source, healed_sinks,
+ locked_replies);
+ if (ret)
+ goto out;
+restore_time:
+ afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
+ locked_replies);
+
+ if (!is_arbiter_the_only_sink && !empty_file) {
+ ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ if (ret < priv->child_count) {
+ ret = -ENOTCONN;
+ did_sh = _gf_false;
+ goto skip_undo_pending;
+ }
+ }
+ ret = afr_selfheal_undo_pending(
+ frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
+ AFR_DATA_TRANSACTION, locked_replies, data_lock);
+skip_undo_pending:
+ afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock);
+out:
+
+ if (did_sh)
+ afr_log_selfheal(fd->inode->gfid, this, ret, "data", source, sources,
+ healed_sinks);
+ else
+ ret = 1;
+
+ if (locked_replies)
+ afr_replies_wipe(locked_replies, priv->child_count);
+
+ return ret;
}
int
-afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this)
+afr_selfheal_data_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
+ afr_local_t *local = NULL;
+ int i = (long)cookie;
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK;
+ local = frame->local;
- afr_set_lock_number (frame, this);
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- int_lock->lk_flock.l_start = 0;
- int_lock->lk_flock.l_len = 0;
- int_lock->lk_flock.l_type = F_WRLCK;
- int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk;
+ syncbarrier_wake(&local->barrier);
- afr_nonblocking_inodelk (frame, this);
-
-
- return 0;
+ return 0;
}
-
int
-afr_sh_data_lock (call_frame_t *frame, xlator_t *this)
+afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t * sh = NULL;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (sh->data_lock_held) {
- /* caller has held the lock already,
- so skip locking */
-
- afr_sh_data_fxattrop (frame, this);
- return 0;
+ int ret = 0;
+ fd_t *fd_tmp = NULL;
+ loc_t loc = {
+ 0,
+ };
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+
+ fd_tmp = fd_create(inode, 0);
+ if (!fd_tmp)
+ return -ENOMEM;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ fd_unref(fd_tmp);
+ goto out;
+ }
+ local = frame->local;
+
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_data_open_cbk, open, &loc,
+ O_RDWR | O_LARGEFILE, fd_tmp, NULL);
+
+ ret = -ENOTCONN;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+
+ if (local->replies[i].op_ret < 0) {
+ ret = -local->replies[i].op_errno;
+ continue;
}
- return afr_sh_data_lock_rec (frame, this);
+ ret = 0;
+ break;
+ }
+
+ if (ret < 0) {
+ fd_unref(fd_tmp);
+ goto out;
+ } else {
+ fd_bind(fd_tmp);
+ }
+
+ *fd = fd_tmp;
+out:
+ loc_wipe(&loc);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return ret;
}
-
-int
-afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "open of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "open of %s succeeded on child %s",
- local->loc.path,
- priv->children[child_index]->name);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened, commencing sync",
- local->loc.path);
-
- afr_sh_data_lock (frame, this);
- }
-
- return 0;
-}
-
-
int
-afr_sh_data_open (call_frame_t *frame, xlator_t *this)
+afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd)
{
- int i = 0;
- int call_count = 0;
-
- fd_t *fd = NULL;
-
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (sh->healing_fd_opened) {
- /* caller has opened the fd for us already, so skip open */
-
- afr_sh_data_lock (frame, this);
- return 0;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ int ret = 0;
+ inode_t *inode = fd->inode;
+
+ priv = this->private;
+
+ locked_on = alloca0(priv->child_count);
+
+ ret = afr_selfheal_tie_breaker_inodelk(frame, this, inode, priv->sh_domain,
+ 0, 0, locked_on);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "self-heal as only %d number of "
+ "subvolumes could be locked",
+ uuid_utoa(fd->inode->gfid), ret);
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
}
- call_count = afr_up_children_count (priv->child_count, local->child_up);
- local->call_count = call_count;
-
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
+ ret = __afr_selfheal_data(frame, this, fd, locked_on);
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, inode, priv->sh_domain, 0, 0,
+ locked_on);
- /* open sinks */
- for (i = 0; i < priv->child_count; i++) {
- if(!local->child_up[i])
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc,
- O_RDWR|O_LARGEFILE, fd, 0);
-
- if (!--call_count)
- break;
- }
-
- return 0;
+ return ret;
}
-
-
-int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = this->private;
-
-
- local = frame->local;
- sh = &local->self_heal;
-
- if (sh->need_data_self_heal && priv->data_self_heal) {
- afr_sh_data_open (frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "not doing data self heal on %s",
- local->loc.path);
- afr_sh_data_done (frame, this);
- }
-
- return 0;
-}
-
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 2010ae648f5..64893f441e3 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -1,2273 +1,1276 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "inode.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-int
-afr_sh_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- /*
- TODO: cleanup sh->*
- */
-
- if (sh->healing_fd)
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
-
-/* for (i = 0; i < priv->child_count; i++) { */
-/* sh->locked_nodes[i] = 0; */
-/* } */
-
- gf_log (this->name, GF_LOG_TRACE,
- "self heal of %s completed",
- local->loc.path);
-
- sh->completion_cbk (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_cbk = afr_sh_entry_done;
- afr_unlock (frame, this);
-
- return 0;
-}
-
+#include <glusterfs/byte-order.h>
+#include "afr-transaction.h"
+#include "afr-messages.h"
+#include <glusterfs/syncop-utils.h>
+#include <glusterfs/events.h>
int
-afr_sh_entry_finish (call_frame_t *frame, xlator_t *this)
+afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child,
+ struct afr_reply *replies,
+ gf_boolean_t *anon_inode)
{
- afr_local_t *local = NULL;
-
- local = frame->local;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ int i = 0;
+ char g[64] = {0};
+ unsigned char *lookup_success = NULL;
+ call_frame_t *frame = NULL;
+ loc_t loc2 = {
+ 0,
+ };
+ loc_t loc = {
+ 0,
+ };
+
+ priv = this->private;
+ subvol = priv->children[child];
+ lookup_success = alloca0(priv->child_count);
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (replies[child].poststat.ia_type == IA_IFDIR) {
+ /* This directory may have sub-directory hierarchy which may need to
+ * be preserved for subsequent heals. So unconditionally move the
+ * directory to anonymous-inode directory*/
+ *anon_inode = _gf_true;
+ goto anon_inode;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid);
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ lookup_success[i] = 1;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ ret = -local->replies[i].op_errno;
+ }
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "finishing entry selfheal of %s", local->loc.path);
+ if (priv->quorum_count) {
+ if (afr_has_quorum(lookup_success, this, NULL)) {
+ *anon_inode = _gf_true;
+ }
+ } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) {
+ *anon_inode = _gf_true;
+ } else if (ret) {
+ goto out;
+ }
+
+anon_inode:
+ if (!*anon_inode) {
+ ret = 0;
+ goto out;
+ }
+
+ loc.parent = inode_ref(dir);
+ gf_uuid_copy(loc.pargfid, dir->gfid);
+ loc.name = name;
+
+ ret = afr_anon_inode_create(this, child, &loc2.parent);
+ if (ret < 0)
+ goto out;
+
+ loc2.name = g;
+ ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s failed",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s successful",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
+ }
- afr_sh_entry_unlock (frame, this);
+out:
+ loc_wipe(&loc);
+ loc_wipe(&loc2);
+ if (frame) {
+ AFR_STACK_DESTROY(frame);
+ }
- return 0;
+ return ret;
}
-
int
-afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies)
{
- int call_count = 0;
+ char g[64] = {0};
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ gf_boolean_t anon_inode = _gf_false;
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ if ((!replies[child].valid) || (replies[child].op_ret < 0)) {
+ /*Nothing to do*/
+ ret = 0;
+ goto out;
+ }
+
+ if (priv->use_anon_inode) {
+ ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child,
+ replies, &anon_inode);
+ if (ret < 0 || anon_inode)
+ goto out;
+ }
+
+ loc.parent = inode_ref(dir);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ loc.name = name;
+ switch (replies[child].poststat.ia_type) {
+ case IA_IFDIR:
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+ break;
+ default:
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
+ name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_unlink(subvol, &loc, NULL, NULL);
+ break;
+ }
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_entry_finish (frame, this);
-
- return 0;
+out:
+ loc_wipe(&loc);
+ return ret;
}
-
int
-afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
+afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
+ unsigned char *sources, inode_t *dir,
+ const char *name, inode_t *inode,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
- int need_unwind = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success,
- priv->child_count, AFR_ENTRY_TRANSACTION);
-
- erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count,
- gf_afr_mt_dict_t);
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
-
- erase_xattr[i] = get_new_dict();
- dict_ref (erase_xattr[i]);
- }
- }
-
- if (call_count == 0)
- need_unwind = 1;
-
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
- priv->child_count, AFR_ENTRY_TRANSACTION);
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "erasing pending flags from %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
- if (!--call_count)
- break;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
- }
- }
- GF_FREE (erase_xattr);
-
- if (need_unwind)
- afr_sh_entry_finish (frame, this);
-
- return 0;
-}
-
-
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ loc_t srcloc = {
+ 0,
+ };
+ loc_t anonloc = {
+ 0,
+ };
+ xlator_t *this = frame->this;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ struct iatt *iatt = NULL;
+ char *linkname = NULL;
+ mode_t mode = 0;
+ struct iatt newent = {
+ 0,
+ };
+ unsigned char *newentry = NULL;
+ char iatt_uuid_str[64] = {0};
+ char dir_uuid_str[64] = {0};
+
+ priv = this->private;
+ iatt = &replies[source].poststat;
+ uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str);
+ if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED,
+ "Invalid ia_type (%d) or gfid(%s). source brick=%d, "
+ "pargfid=%s, name=%s",
+ iatt->ia_type, iatt_uuid_str, source,
+ uuid_utoa_r(dir->gfid, dir_uuid_str), name);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata)
+ return -ENOMEM;
+ newentry = alloca0(priv->child_count);
+ loc.parent = inode_ref(dir);
+ gf_uuid_copy(loc.pargfid, dir->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
+
+ ret = afr_selfheal_entry_delete(this, dir, name, inode, dst, replies);
+ if (ret)
+ goto out;
+
+ ret = dict_set_gfuuid(xdata, "gfid-req", replies[source].poststat.ia_gfid,
+ true);
+ if (ret)
+ goto out;
+
+ srcloc.inode = inode_ref(inode);
+ gf_uuid_copy(srcloc.gfid, iatt->ia_gfid);
+ ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
+ if (ret == -ENOENT || ret == -ESTALE) {
+ newentry[dst] = 1;
+ ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies,
+ sources, newentry);
+ if (ret)
+ goto out;
+ } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) {
+ // Try rename from hidden directory
+ ret = afr_anon_inode_create(this, dst, &anonloc.parent);
+ if (ret < 0)
+ goto out;
+ anonloc.inode = inode_ref(inode);
+ anonloc.name = iatt_uuid_str;
+ ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = -1; /*This sets 'mismatch' to true*/
+ goto out;
+ }
+
+ mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type);
+
+ switch (iatt->ia_type) {
+ case IA_IFDIR:
+ ret = syncop_mkdir(priv->children[dst], &loc, mode, 0, xdata, NULL);
+ break;
+ case IA_IFLNK:
+ if (!newentry[dst]) {
+ ret = syncop_link(priv->children[dst], &srcloc, &loc, &newent,
+ NULL, NULL);
+ } else {
+ ret = syncop_readlink(priv->children[source], &srcloc,
+ &linkname, 4096, NULL, NULL);
+ if (ret <= 0)
+ goto out;
+ ret = syncop_symlink(priv->children[dst], &loc, linkname, NULL,
+ xdata, NULL);
+ }
+ break;
+ default:
+ ret = dict_set_int32_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+ if (ret)
+ goto out;
+ ret = syncop_mknod(
+ priv->children[dst], &loc, mode,
+ makedev(ia_major(iatt->ia_rdev), ia_minor(iatt->ia_rdev)),
+ &newent, xdata, NULL);
+ break;
+ }
-static int
-next_active_source (call_frame_t *frame, xlator_t *this,
- int current_active_source)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int source = -1;
- int next_active_source = -1;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- source = sh->source;
-
- if (source != -1) {
- if (current_active_source != source)
- next_active_source = source;
- goto out;
- }
-
- /*
- the next active sink becomes the source for the
- 'conservative decision' of merging all entries
- */
-
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0)
- && (local->child_up[i] == 1)
- && (i > current_active_source)) {
-
- next_active_source = i;
- break;
- }
- }
out:
- return next_active_source;
+ if (xdata)
+ dict_unref(xdata);
+ GF_FREE(linkname);
+ loc_wipe(&loc);
+ loc_wipe(&srcloc);
+ loc_wipe(&anonloc);
+ return ret;
}
-
-
static int
-next_active_sink (call_frame_t *frame, xlator_t *this,
- int current_active_sink)
+__afr_selfheal_heal_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int next_active_sink = -1;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- /*
- the next active sink becomes the source for the
- 'conservative decision' of merging all entries
- */
-
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0)
- && (local->child_up[i] == 1)
- && (i > current_active_sink)) {
-
- next_active_sink = i;
- break;
- }
- }
-
- return next_active_sink;
-}
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ priv = this->private;
-int
-build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
-{
- int ret = -1;
-
- if (!child) {
- goto out;
- }
-
- if (strcmp (parent->path, "/") == 0)
- ret = gf_asprintf ((char **)&child->path, "/%s", name);
- else
- ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path,
- name);
-
- if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed while setting child path");
- }
-
- if (!child->path) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ if (!replies[source].valid)
+ return -EIO;
- child->name = strrchr (child->path, '/');
- if (child->name)
- child->name++;
-
- child->parent = inode_ref (parent->inode);
- child->inode = inode_new (parent->inode->table);
-
- if (!child->inode) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- ret = 0;
-out:
- if (ret == -1)
- loc_wipe (child);
-
- return ret;
-}
+ /* Skip healing this entry if the last lookup on it failed for reasons
+ * other than ENOENT.
+ */
+ if ((replies[source].op_ret < 0) && (replies[source].op_errno != ENOENT))
+ return -replies[source].op_errno;
+ if (replies[source].op_ret == 0) {
+ ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
+ source, sources,
+ &replies[source].poststat.ia_gfid, NULL);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+ if (replies[source].op_ret == -1 &&
+ replies[source].op_errno == ENOENT) {
+ ret = afr_selfheal_entry_delete(this, fd->inode, name, inode, i,
+ replies);
+ } else {
+ if (!gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[source].poststat.ia_gfid))
+ continue;
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src);
-
-int
-afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this,
- int active_src)
-{
- int call_count = 0;
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_entry_expunge_subvol (frame, this, active_src);
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
-
- int active_src = (long) cookie;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
-
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setattr on parent directory of %s on subvolume %s failed: %s",
- expunge_local->loc.path,
- priv->children[active_src]->name, strerror (op_errno));
+ ret = afr_selfheal_recreate_entry(frame, i, source, sources,
+ fd->inode, name, inode, replies);
}
+ if (ret < 0)
+ break;
+ }
- AFR_STACK_DESTROY (expunge_frame);
- afr_sh_entry_expunge_entry_done (frame, this, active_src);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int active_src = 0;
- call_frame_t *frame = NULL;
-
- int32_t valid = 0;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
-
- active_src = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "removed %s on %s",
- expunge_local->loc.path,
- priv->children[active_src]->name);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "removing %s on %s failed (%s)",
- expunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- }
-
- valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
- afr_build_parent_loc (&expunge_sh->parent_loc, &expunge_local->loc);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->setattr,
- &expunge_sh->parent_loc,
- &expunge_sh->parentbuf,
- valid);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "expunging file %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->unlink,
- &expunge_local->loc);
-
- return 0;
+ return ret;
}
-
-
-int
-afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
+static int
+afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this,
+ struct afr_reply *replies,
+ inode_t *inode, uuid_t pargfid,
+ char *bname, int src_idx,
+ unsigned char *locked_on, int *src)
{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
+ int i = 0;
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ void *gfid = NULL;
+ ia_type_t ia_type = IA_INVAL;
- priv = this->private;
- expunge_local = expunge_frame->local;
+ priv = this->private;
+ gfid = &replies[src_idx].poststat.ia_gfid;
+ ia_type = replies[src_idx].poststat.ia_type;
- gf_log (this->name, GF_LOG_DEBUG,
- "expunging directory %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == src_idx)
+ continue;
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->rmdir,
- &expunge_local->loc, 1);
+ if (!replies[i].valid)
+ continue;
- return 0;
-}
+ if (replies[i].op_ret != 0)
+ continue;
+ if (gf_uuid_is_null(replies[i].poststat.ia_gfid))
+ continue;
-int
-afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
- int active_src, struct iatt *buf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int source = 0;
- call_frame_t *frame = NULL;
- int type = 0;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- source = expunge_sh->source;
-
- type = buf->ia_type;
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- case IA_IFLNK:
- afr_sh_entry_expunge_unlink (expunge_frame, this, active_src);
- break;
- case IA_IFDIR:
- afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- expunge_local->loc.path,
- priv->children[source]->name, type);
- goto out;
- break;
- }
-
- return 0;
-out:
- AFR_STACK_DESTROY (expunge_frame);
- afr_sh_entry_expunge_entry_done (frame, this, active_src);
+ if (replies[i].poststat.ia_type == IA_INVAL)
+ continue;
- return 0;
-}
+ if (ia_type == IA_INVAL || gf_uuid_is_null(gfid)) {
+ src_idx = i;
+ ia_type = replies[src_idx].poststat.ia_type;
+ gfid = &replies[src_idx].poststat.ia_gfid;
+ continue;
+ }
+ if (gf_uuid_compare(gfid, replies[i].poststat.ia_gfid) &&
+ (ia_type == replies[i].poststat.ia_type)) {
+ ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
+ bname, src_idx, i, locked_on, src,
+ NULL);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Skipping conservative merge on the "
+ "file.");
+ return ret;
+ }
-int
-afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *x,
- struct iatt *postparent)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- active_src = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "lookup of %s on %s failed (%s)",
- expunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf);
-
- return 0;
-out:
- AFR_STACK_DESTROY (expunge_frame);
- afr_sh_entry_expunge_entry_done (frame, this, active_src);
+ if (ia_type != replies[i].poststat.ia_type) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Type mismatch detected "
+ "for <gfid:%s>/%s>, %s on %s and %s on %s. "
+ "Skipping conservative merge on the file.",
+ uuid_utoa(pargfid), bname,
+ gf_inode_type_to_str(replies[i].poststat.ia_type),
+ priv->children[i]->name,
+ gf_inode_type_to_str(replies[src_idx].poststat.ia_type),
+ priv->children[src_idx]->name);
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=file;"
+ "file=<gfid:%s>/%s>;count=2;child-%d=%s;type-"
+ "%d=%s;child-%d=%s;type-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(pargfid), bname, i, priv->children[i]->name, i,
+ gf_inode_type_to_str(replies[i].poststat.ia_type), src_idx,
+ priv->children[src_idx]->name, src_idx,
+ gf_inode_type_to_str(replies[src_idx].poststat.ia_type));
+ return -1;
+ }
+ }
- return 0;
+ return 0;
}
-
-int
-afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
+static int
+__afr_selfheal_merge_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->lookup,
- &expunge_local->loc, 0);
-
- return 0;
-}
-
+ int ret = 0;
+ int i = 0;
+ int source = -1;
+ int src = -1;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ source = i;
+ break;
+ }
+ }
-int
-afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *x,
- struct iatt *postparent)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int source = 0;
- call_frame_t *frame = NULL;
- int active_src = 0;
-
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- active_src = expunge_sh->active_source;
- source = (long) cookie;
-
- if (op_ret == -1 && op_errno == ENOENT && postparent) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "missing entry %s on %s",
- expunge_local->loc.path,
- priv->children[source]->name);
-
- expunge_sh->parentbuf = *postparent;
-
- afr_sh_entry_expunge_purge (expunge_frame, this, active_src);
-
- return 0;
- }
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "%s exists under %s",
- expunge_local->loc.path,
- priv->children[source]->name);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s under %s failed (%s)",
- expunge_local->loc.path,
- priv->children[source]->name,
- strerror (op_errno));
- }
-
- AFR_STACK_DESTROY (expunge_frame);
- afr_sh_entry_expunge_entry_done (frame, this, active_src);
-
- return 0;
-}
+ if (source == -1) {
+ /* entry got deleted in the mean time? */
+ return 0;
+ }
+ /* Set all the sources as 1, otheriwse newentry_mark won't be set */
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ sources[i] = 1;
+ }
+ }
+
+ ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
+ source, sources,
+ &replies[source].poststat.ia_gfid, NULL);
+ if (ret)
+ return ret;
+
+ /* In case of type mismatch / unable to resolve gfid mismatch on the
+ * entry, return -1.*/
+ ret = afr_selfheal_detect_gfid_and_type_mismatch(
+ this, replies, inode, fd->inode->gfid, name, source, locked_on, &src);
+
+ if (ret < 0)
+ return ret;
+ if (src != -1) {
+ source = src;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i != src && replies[i].valid &&
+ gf_uuid_compare(replies[src].poststat.ia_gfid,
+ replies[i].poststat.ia_gfid)) {
+ sources[i] = 0;
+ }
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
+
+ if (src != -1) {
+ if (!gf_uuid_compare(replies[src].poststat.ia_gfid,
+ replies[i].poststat.ia_gfid))
+ continue;
+ } else if (replies[i].op_errno != ENOENT) {
+ continue;
+ }
-int
-afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
- char *name)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = -1;
- call_frame_t *expunge_frame = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int active_src = 0;
- int source = 0;
- int op_errno = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
- source = sh->source;
-
- if ((strcmp (name, ".") == 0)
- || (strcmp (name, "..") == 0)
- || ((strcmp (local->loc.path, "/") == 0)
- && (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0))) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- name, local->loc.path);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "inspecting existance of %s under %s",
- name, local->loc.path);
-
- expunge_frame = copy_frame (frame);
- if (!expunge_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- ALLOC_OR_GOTO (expunge_local, afr_local_t, out);
-
- expunge_frame->local = expunge_local;
- expunge_sh = &expunge_local->self_heal;
- expunge_sh->sh_frame = frame;
- expunge_sh->active_source = active_src;
-
- ret = build_child_loc (this, &expunge_local->loc, &local->loc, name);
- if (ret != 0) {
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s", expunge_local->loc.path,
- priv->children[source]->name);
-
- STACK_WIND_COOKIE (expunge_frame,
- afr_sh_entry_expunge_entry_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->lookup,
- &expunge_local->loc, 0);
-
- ret = 0;
-out:
- if (ret == -1)
- afr_sh_entry_expunge_entry_done (frame, this, active_src);
+ ret |= afr_selfheal_recreate_entry(frame, i, source, sources, fd->inode,
+ name, inode, replies);
+ }
- return 0;
+ return ret;
}
-
-int
-afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries)
+static int
+__afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- gf_dirent_t *entry = NULL;
- off_t last_offset = 0;
- int active_src = 0;
- int entry_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "readdir of %s on subvolume %s complete",
- local->loc.path,
- priv->children[active_src]->name);
- }
-
- afr_sh_entry_expunge_all (frame, this);
- return 0;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- entry_count++;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "readdir'ed %d entries from %s",
- entry_count, priv->children[active_src]->name);
-
- sh->offset = last_offset;
- local->call_count = entry_count;
-
- list_for_each_entry (entry, &entries->list, list) {
- afr_sh_entry_expunge_entry (frame, this, entry->d_name);
- }
-
- return 0;
+ int ret = -1;
+
+ if (source < 0)
+ ret = __afr_selfheal_merge_dirent(frame, this, fd, name, inode, sources,
+ healed_sinks, locked_on, replies);
+ else
+ ret = __afr_selfheal_heal_dirent(frame, this, fd, name, inode, source,
+ sources, healed_sinks, locked_on,
+ replies);
+ return ret;
}
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src)
+static gf_boolean_t
+is_full_heal_marker_present(xlator_t *this, dict_t *xdata, int idx)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk,
- priv->children[active_src],
- priv->children[active_src]->fops->readdir,
- sh->healing_fd, sh->block_size, sh->offset);
-
- return 0;
+ int i = 0;
+ int pending[3] = {
+ 0,
+ };
+ void *pending_raw = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ if (!xdata)
+ return _gf_false;
+
+ /* Iterate over each of the priv->pending_keys[] elements and then
+ * see if any of them have data segment non-zero. If they do, return
+ * true. Else return false.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+ continue;
+
+ if (!pending_raw)
+ continue;
+
+ memcpy(pending, pending_raw, sizeof(pending));
+ if (ntoh32(pending[idx]))
+ return _gf_true;
+ }
+
+ return _gf_false;
}
-
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+afr_need_full_heal(xlator_t *this, struct afr_reply *replies, int source,
+ unsigned char *healed_sinks, afr_transaction_type type)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int active_src = -1;
+ int i = 0;
+ int idx = 0;
+ afr_private_t *priv = NULL;
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
+ priv = this->private;
- sh->offset = 0;
+ if (!priv->esh_granular)
+ return _gf_true;
- if (sh->source == -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "no active sources for %s to expunge entries",
- local->loc.path);
- goto out;
- }
+ if (type != AFR_ENTRY_TRANSACTION)
+ return _gf_true;
- active_src = next_active_sink (frame, this, sh->active_source);
- sh->active_source = active_src;
+ priv = this->private;
+ idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
- if (sh->op_failed) {
- goto out;
- }
+ /* If there is a clear source, check whether the full-heal-indicator
+ * is present in its xdata. Otherwise, we need to examine all the
+ * participating bricks and then figure if *even* one of them has a
+ * full-heal-indicator.
+ */
- if (active_src == -1) {
- /* completed creating missing files on all subvolumes */
- goto out;
- }
+ if (source != -1) {
+ if (is_full_heal_marker_present(this, replies[source].xdata, idx))
+ return _gf_true;
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "expunging entries of %s on %s to other sinks",
- local->loc.path, priv->children[active_src]->name);
+ /* else ..*/
- afr_sh_entry_expunge_subvol (frame, this, active_src);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
- return 0;
-out:
- afr_sh_entry_erase_pending (frame, this);
- return 0;
+ if (is_full_heal_marker_present(this, replies[i].xdata, idx))
+ return _gf_true;
+ }
+ return _gf_false;
}
-
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src);
-
-int
-afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this,
- int active_src)
+static int
+__afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ uint64_t *witness)
{
- int call_count = 0;
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_entry_impunge_subvol (frame, this, active_src);
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int sources_count = 0;
+ int i = 0;
+
+ priv = this->private;
+
+ sources_count = AFR_COUNT(sources, priv->child_count);
+
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count || afr_does_witness_exist(this, witness)) {
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ return -1;
+ }
+
+ source = afr_choose_source_by_policy(priv, sources, AFR_ENTRY_TRANSACTION);
+
+ /*If the selected source does not blame any other brick, then mark
+ * everything as sink to trigger conservative merge.
+ */
+ if (source != -1 && !AFR_COUNT(healed_sinks, priv->child_count)) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i]) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+ return -1;
+ }
- return 0;
+ return source;
}
-
int
-afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
+__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ struct afr_reply *replies, int *source_p,
+ unsigned char *pflag)
{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
- int child_index = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
- local = frame->local;
- sh = &local->self_heal;
- active_src = sh->active_source;
- child_index = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "setattr done for %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "setattr (%s) on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- }
-
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
- }
-
- return 0;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+ if (ret)
+ return ret;
+
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, locked_on, sources,
+ sinks, witness, pflag);
+ if (ret)
+ return ret;
+
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+ source = __afr_selfheal_entry_finalize_source(this, sources, healed_sinks,
+ locked_on, replies, witness);
+
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source;
+
+ return ret;
}
-
-int
-afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- dict_t *xattr)
+static int
+afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *parent_idx_inode,
+ xlator_t *subvol, gf_boolean_t full_crawl)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = 0;
-
- struct iatt stbuf;
- int32_t valid = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- child_index = (long) cookie;
-
- gf_log (this->name, GF_LOG_TRACE,
- "setting ownership of %s on %s to %d/%d",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- impunge_local->cont.lookup.buf.ia_uid,
- impunge_local->cont.lookup.buf.ia_gid);
-
- stbuf.ia_atime = impunge_local->cont.lookup.buf.ia_atime;
- stbuf.ia_atime_nsec = impunge_local->cont.lookup.buf.ia_atime_nsec;
- stbuf.ia_mtime = impunge_local->cont.lookup.buf.ia_mtime;
- stbuf.ia_mtime_nsec = impunge_local->cont.lookup.buf.ia_mtime_nsec;
-
- stbuf.ia_uid = impunge_local->cont.lookup.buf.ia_uid;
- stbuf.ia_gid = impunge_local->cont.lookup.buf.ia_gid;
-
- valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_setattr_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->setattr,
- &impunge_local->loc,
- &stbuf, valid);
+ int ret = 0;
+ int source = -1;
+ unsigned char *locked_on = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ inode_t *inode = NULL;
+ struct afr_reply *replies = NULL;
+ struct afr_reply *par_replies = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+
+ priv = this->private;
+
+ if (afr_is_private_directory(priv, fd->inode->gfid, name,
+ GF_CLIENT_PID_SELF_HEALD)) {
return 0;
-}
-
-
-int
-afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
-{
- loc_t *parent_loc = cookie;
-
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setattr on parent directory failed: %s",
- strerror (op_errno));
+ }
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+ ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
+ if (ret) {
+ dict_unref(xattr);
+ return -1;
+ }
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ locked_on = alloca0(priv->child_count);
+
+ replies = alloca0(priv->child_count * sizeof(*replies));
+ par_replies = alloca0(priv->child_count * sizeof(*par_replies));
+
+ ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+ locked_on);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "entry self-heal as only %d sub-volumes "
+ " could be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret, this->name);
+ ret = -ENOTCONN;
+ goto unlock;
}
- loc_wipe (parent_loc);
-
- GF_FREE (parent_loc);
-
- AFR_STACK_DESTROY (setattr_frame);
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
- int child_index = 0;
- int pending_array[3] = {0, };
- dict_t *xattr = NULL;
- int ret = 0;
- int idx = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- call_frame_t *setattr_frame = NULL;
- int32_t valid = 0;
- loc_t *parent_loc = NULL;
- struct iatt parentbuf;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
- local = frame->local;
- sh = &local->self_heal;
- active_src = sh->active_source;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "creation of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
- inode->ia_type = stbuf->ia_type;
-
- xattr = get_new_dict ();
- dict_ref (xattr);
-
- idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
- pending_array[idx] = hton32 (1);
- if (IA_ISDIR (stbuf->ia_type))
- idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION);
- else
- idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
- pending_array[idx] = hton32 (1);
-
- ret = dict_set_static_bin (xattr, priv->pending_key[child_index],
- pending_array, sizeof (pending_array));
+ ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, locked_on,
+ sources, sinks, healed_sinks,
+ par_replies, &source, NULL);
if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
-
- valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
- parentbuf = impunge_sh->parentbuf;
- setattr_frame = copy_frame (impunge_frame);
-
- parent_loc = GF_CALLOC (1, sizeof (*parent_loc),
- gf_afr_mt_loc_t);
- afr_build_parent_loc (parent_loc, &impunge_local->loc);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk,
- (void *) (long) child_index,
- priv->children[active_src],
- priv->children[active_src]->fops->xattrop,
- &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr);
-
- STACK_WIND_COOKIE (setattr_frame, afr_sh_entry_impunge_parent_setattr_cbk,
- (void *) (long) parent_loc,
- priv->children[child_index],
- priv->children[child_index]->fops->setattr,
- parent_loc, &parentbuf, valid);
-
- dict_unref (xattr);
-
- return 0;
-
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
-
- int ret = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing file %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- dict = dict_new ();
- if (!dict)
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
-
- ret = afr_set_dict_gfid (dict, stbuf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG, "gfid set failed");
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->mknod,
- &impunge_local->loc,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type),
- stbuf->ia_rdev, dict);
-
- if (dict)
- dict_unref (dict);
-
- return 0;
-}
-
-
-
-int
-afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
-
- int ret = 0;
+ goto unlock;
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- dict = dict_new ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- return 0;
+ inode = afr_selfheal_unlocked_lookup_on(frame, fd->inode, name, replies,
+ locked_on, xattr);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
}
- ret = afr_set_dict_gfid (dict, stbuf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG, "gfid set failed");
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing directory %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->mkdir,
- &impunge_local->loc,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type),
- dict);
-
- if (dict)
- dict_unref (dict);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, const char *linkname)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
- struct iatt *buf = NULL;
-
- int ret = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- buf = &impunge_local->cont.symlink.buf;
-
- dict = dict_new ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- afr_sh_entry_impunge_entry_done (impunge_frame, this, 0);
+ ret = __afr_selfheal_entry_dirent(frame, this, fd, name, inode, source,
+ sources, healed_sinks, locked_on,
+ replies);
+
+ if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) {
+ ret = afr_shd_entry_purge(subvol, parent_idx_inode, name,
+ inode->ia_type);
+ /* Why is ret force-set to 0? We do not care about
+ * index purge failing for full heal as it is quite
+ * possible during replace-brick that not all files
+ * and directories have their name indices present in
+ * entry-changes/.
+ */
+ ret = 0;
}
-
- ret = afr_set_dict_gfid (dict, buf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "dict set gfid failed");
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing symlink %s -> %s on %s",
- impunge_local->loc.path, linkname,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->symlink,
- linkname, &impunge_local->loc, dict);
-
- if (dict)
- dict_unref (dict);
-
- return 0;
+ }
+
+unlock:
+ afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, locked_on,
+ NULL);
+ if (inode)
+ inode_unref(inode);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ if (par_replies)
+ afr_replies_wipe(par_replies, priv->child_count);
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
}
-
-int
-afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
+static inode_t *
+afr_shd_entry_changes_index_inode(xlator_t *this, xlator_t *subvol,
+ uuid_t pargfid)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- call_frame_t *frame = NULL;
- int call_count = -1;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_impunge_symlink (impunge_frame, this, child_index,
- impunge_sh->linkname);
+ int ret = -1;
+ void *index_gfid = NULL;
+ loc_t rootloc = {
+ 0,
+ };
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+ inode_t *inode = NULL;
+ struct iatt iatt = {
+ 0,
+ };
+
+ rootloc.inode = inode_ref(this->itable->root);
+ gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr(subvol, &rootloc, &xattr,
+ GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL);
+ if (ret || !xattr) {
+ errno = -ret;
+ goto out;
+ }
+
+ ret = dict_get_ptr(xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid);
+ if (ret) {
+ errno = EINVAL;
+ goto out;
+ }
+
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ gf_uuid_copy(loc.pargfid, index_gfid);
+ loc.name = gf_strdup(uuid_utoa(pargfid));
+
+ ret = syncop_lookup(subvol, &loc, &iatt, NULL, NULL, NULL);
+ if (ret < 0) {
+ errno = -ret;
+ goto out;
+ }
+
+ inode = inode_link(loc.inode, NULL, NULL, &iatt);
- return 0;
out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
- }
-
- return 0;
-}
+ if (xattr)
+ dict_unref(xattr);
+ loc_wipe(&rootloc);
+ GF_FREE((char *)loc.name);
+ loc_wipe(&loc);
-
-int
-afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "unlinking symlink %s with wrong target on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->unlink,
- &impunge_local->loc);
-
- return 0;
+ return inode;
}
-
-int
-afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *linkname, struct iatt *sbuf)
+static int
+afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int child)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- call_frame_t *frame = NULL;
- int call_count = -1;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "readlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- /* symlink doesn't exist on the sink */
-
- if ((op_ret == -1) && (op_errno == ENOENT)) {
- afr_sh_entry_impunge_symlink (impunge_frame, this,
- child_index, impunge_sh->linkname);
- return 0;
- }
-
-
- /* symlink exists on the sink, so check if targets match */
-
- if (strcmp (linkname, impunge_sh->linkname) == 0) {
- /* targets match, nothing to do */
-
- goto out;
- } else {
- /*
- * Hah! Sneaky wolf in sheep's clothing!
- */
- afr_sh_entry_impunge_symlink_unlink (impunge_frame, this,
- child_index);
- return 0;
+ int ret = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ off_t offset = 0;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t mismatch = _gf_false;
+ afr_local_t *local = NULL;
+ loc_t loc = {
+ 0,
+ };
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ INIT_LIST_HEAD(&entries.list);
+
+ local = frame->local;
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame)
+ return -ENOMEM;
+
+ loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
+ fd->inode->gfid);
+
+ while ((ret = syncop_readdir(subvol, fd, 131072, offset, &entries, NULL,
+ NULL))) {
+ if (ret > 0)
+ ret = 0;
+ list_for_each_entry(entry, &entries.list, list)
+ {
+ offset = entry->d_off;
+
+ if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+ continue;
+
+ ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name,
+ loc.inode, subvol,
+ local->need_full_crawl);
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = -ENOTCONN;
+ break;
+ }
+
+ if (ret == -1) {
+ /* gfid or type mismatch. */
+ mismatch = _gf_true;
+ ret = 0;
+ }
+ if (ret)
+ break;
}
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "checking symlink target of %s on %s",
- impunge_local->loc.path, priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->readlink,
- &impunge_local->loc, 4096);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *linkname, struct iatt *sbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- call_frame_t *frame = NULL;
- int call_count = -1;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "readlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- impunge_sh->linkname = gf_strdup (linkname);
- afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index);
-
- return 0;
-
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
- }
-
- return 0;
-}
+ gf_dirent_free(&entries);
+ if (ret)
+ break;
+ }
+ loc_wipe(&loc);
-int
-afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
- impunge_local->cont.symlink.buf = *stbuf;
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk,
- (void *) (long) child_index,
- priv->children[active_src],
- priv->children[active_src]->fops->readlink,
- &impunge_local->loc, 4096);
-
- return 0;
+ AFR_STACK_DESTROY(iter_frame);
+ if (mismatch == _gf_true)
+ /* undo pending will be skipped */
+ ret = -1;
+ return ret;
}
-
-int
-afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- dict_t *xattr,struct iatt *postparent)
+static int
+afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry,
+ loc_t *parent, void *data)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = 0;
- int type = 0;
- int child_index = 0;
- call_frame_t *frame = NULL;
- int call_count = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
-
- child_index = (long) cookie;
-
- active_src = impunge_sh->active_source;
-
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s (for %s) failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
- impunge_sh->parentbuf = *postparent;
-
- impunge_local->cont.lookup.buf = *buf;
- type = buf->ia_type;
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- afr_sh_entry_impunge_mknod (impunge_frame, this,
- child_index, buf);
- break;
- case IA_IFLNK:
- afr_sh_entry_impunge_readlink (impunge_frame, this,
- child_index, buf);
- break;
- case IA_IFDIR:
- afr_sh_entry_impunge_mkdir (impunge_frame, this,
- child_index, buf);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- impunge_local->loc.path,
- priv->children[active_src]->name, type);
- goto out;
- break;
- }
-
- return 0;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ struct iatt iatt = {
+ 0,
+ };
+ afr_granular_esh_args_t *args = data;
+
+ /* Look up the actual inode associated with entry. If the lookup returns
+ * ESTALE or ENOENT, then it means we have a stale index. Remove it.
+ * This is analogous to the check in afr_shd_index_heal() except that
+ * here it is achieved through LOOKUP and in afr_shd_index_heal() through
+ * a GETXATTR.
+ */
+
+ loc.inode = inode_new(args->xl->itable);
+ loc.parent = inode_ref(args->heal_fd->inode);
+ gf_uuid_copy(loc.pargfid, loc.parent->gfid);
+ loc.name = entry->d_name;
+
+ ret = syncop_lookup(args->xl, &loc, &iatt, NULL, NULL, NULL);
+ if ((ret == -ENOENT) || (ret == -ESTALE)) {
+ /* The name indices under the pgfid index dir are guaranteed
+ * to be regular files. Hence the hardcoding.
+ */
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
+ ret = 0;
+ goto out;
+ }
+ /* TBD: afr_shd_zero_xattrop? */
+
+ ret = afr_selfheal_entry_dirent(args->frame, args->xl, args->heal_fd,
+ entry->d_name, parent->inode, subvol,
+ _gf_false);
+ AFR_STACK_RESET(args->frame);
+ if (args->frame->local == NULL)
+ ret = -ENOTCONN;
+
+ if (ret == -1)
+ args->mismatch = _gf_true;
out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
- }
-
- return 0;
+ loc_wipe(&loc);
+ return 0;
}
-
-int
-afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
+static int
+afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int subvol_idx, gf_boolean_t is_src)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = 0;
-
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- active_src = impunge_sh->active_source;
-
- STACK_WIND_COOKIE (impunge_frame,
- afr_sh_entry_impunge_recreate_lookup_cbk,
- (void *) (long) child_index,
- priv->children[active_src],
- priv->children[active_src]->fops->lookup,
- &impunge_local->loc, 0);
-
- return 0;
-}
-
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ afr_granular_esh_args_t args = {
+ 0,
+ };
+
+ priv = this->private;
+ subvol = priv->children[subvol_idx];
+
+ args.frame = afr_copy_frame(frame);
+ if (!args.frame)
+ goto out;
+ args.xl = this;
+ /* args.heal_fd represents the fd associated with the original directory
+ * on which entry heal is being attempted.
+ */
+ args.heal_fd = fd;
+
+ /* @subvol here represents the subvolume of AFR where
+ * indices/entry-changes/<pargfid> will be processed
+ */
+ loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
+ fd->inode->gfid);
+ if (!loc.inode) {
+ /* If granular heal failed on the sink (as it might sometimes
+ * because it is the src that would mostly contain the granular
+ * changelogs and the sink's entry-changes would be empty),
+ * do not treat heal as failure.
+ */
+ if (is_src)
+ ret = -errno;
+ else
+ ret = 0;
+ goto out;
+ }
-int
-afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *x,
- struct iatt *postparent)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int call_count = 0;
- int child_index = 0;
- call_frame_t *frame = NULL;
- int active_src = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
- child_index = (long) cookie;
- active_src = impunge_sh->active_source;
-
- if ((op_ret == -1 && op_errno == ENOENT)
- || (IA_ISLNK (impunge_sh->impunging_entry_mode))) {
-
- /*
- * A symlink's target might have changed, so
- * always go down the recreate path for them.
- */
-
- /* decrease call_count in recreate-callback */
-
- gf_log (this->name, GF_LOG_TRACE,
- "missing entry %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- afr_sh_entry_impunge_recreate (impunge_frame, this,
- child_index);
- return 0;
- }
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "%s exists under %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- impunge_sh->parentbuf = *postparent;
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s under %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- }
-
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
- }
-
- return 0;
-}
+ ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args,
+ afr_selfheal_entry_granular_dirent);
+ loc_wipe(&loc);
-int
-afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
- gf_dirent_t *entry)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = -1;
- call_frame_t *impunge_frame = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = 0;
- int i = 0;
- int call_count = 0;
- int op_errno = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if ((strcmp (entry->d_name, ".") == 0)
- || (strcmp (entry->d_name, "..") == 0)
- || ((strcmp (local->loc.path, "/") == 0)
- && (strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR) == 0))) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- entry->d_name, local->loc.path);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "inspecting existance of %s under %s",
- entry->d_name, local->loc.path);
-
- impunge_frame = copy_frame (frame);
- if (!impunge_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- ALLOC_OR_GOTO (impunge_local, afr_local_t, out);
-
- impunge_frame->local = impunge_local;
- impunge_sh = &impunge_local->self_heal;
- impunge_sh->sh_frame = frame;
- impunge_sh->active_source = active_src;
-
- impunge_sh->impunging_entry_mode =
- st_mode_from_ia (entry->d_stat.ia_prot, entry->d_stat.ia_type);
-
- ret = build_child_loc (this, &impunge_local->loc, &local->loc, entry->d_name);
- if (ret != 0) {
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (i == active_src)
- continue;
- if (local->child_up[i] == 0)
- continue;
- if (sh->sources[i] == 1)
- continue;
- call_count++;
- }
-
- impunge_local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (i == active_src)
- continue;
- if (local->child_up[i] == 0)
- continue;
- if (sh->sources[i] == 1)
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s", impunge_local->loc.path,
- priv->children[i]->name);
-
- STACK_WIND_COOKIE (impunge_frame,
- afr_sh_entry_impunge_entry_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &impunge_local->loc, 0);
-
- if (!--call_count)
- break;
- }
-
- ret = 0;
+ if (args.mismatch == _gf_true)
+ ret = -1;
out:
- if (ret == -1)
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- gf_dirent_t *entry = NULL;
- off_t last_offset = 0;
- int active_src = 0;
- int entry_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "readdir of %s on subvolume %s complete",
- local->loc.path,
- priv->children[active_src]->name);
- }
-
- afr_sh_entry_impunge_all (frame, this);
- return 0;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- entry_count++;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "readdir'ed %d entries from %s",
- entry_count, priv->children[active_src]->name);
-
- sh->offset = last_offset;
- local->call_count = entry_count;
-
- list_for_each_entry (entry, &entries->list, list) {
- afr_sh_entry_impunge_entry (frame, this, entry);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk,
- priv->children[active_src],
- priv->children[active_src]->fops->readdirp,
- sh->healing_fd, sh->block_size, sh->offset);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh->offset = 0;
-
- active_src = next_active_source (frame, this, sh->active_source);
- sh->active_source = active_src;
-
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
-
- if (active_src == -1) {
- /* completed creating missing files on all subvolumes */
- afr_sh_entry_expunge_all (frame, this);
- return 0;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "impunging entries of %s on %s to other sinks",
- local->loc.path, priv->children[active_src]->name);
-
- afr_sh_entry_impunge_subvol (frame, this, active_src);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "opendir of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened, commencing sync",
- local->loc.path);
-
- sh->active_source = -1;
- afr_sh_entry_impunge_all (frame, this);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_open (call_frame_t *frame, xlator_t *this)
-{
- int i = 0;
- int call_count = 0;
-
- int source = -1;
- int *sources = NULL;
-
- fd_t *fd = NULL;
-
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = local->self_heal.source;
- sources = local->self_heal.sources;
-
- sh->block_size = 65536; //131072
- sh->offset = 0;
-
- call_count = sh->active_sinks;
- if (source != -1)
- call_count++;
-
- local->call_count = call_count;
-
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
-
- if (source != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "opening directory %s on subvolume %s (source)",
- local->loc.path, priv->children[source]->name);
-
- /* open source */
- STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->opendir,
- &local->loc, fd);
- call_count--;
- }
-
- /* open sinks */
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "opening directory %s on subvolume %s (sink)",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->opendir,
- &local->loc, fd);
-
- if (!--call_count)
- break;
- }
-
- return 0;
+ if (args.frame)
+ AFR_STACK_DESTROY(args.frame);
+ return ret;
}
-
-int
-afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
- }
- }
- if (source != -1)
- sh->success[source] = 1;
-
- if (active_sinks == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "no active sinks for self-heal on dir %s",
- local->loc.path);
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- if (source == -1 && active_sinks < 2) {
- gf_log (this->name, GF_LOG_TRACE,
- "cannot sync with 0 sources and 1 sink on dir %s",
- local->loc.path);
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- sh->active_sinks = active_sinks;
-
- if (source != -1)
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing directory %s from subvolume %s to "
- "%d other",
- local->loc.path, priv->children[source]->name,
- active_sinks);
- else
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sources for %s found. "
- "merging all entries as a conservative decision",
- local->loc.path);
-
- afr_sh_entry_open (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
+ unsigned char *sources, unsigned char *healed_sinks)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- int nsources = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (sh->forced_merge) {
- sh->source = -1;
- goto heal;
- }
-
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr,
- priv->child_count, AFR_ENTRY_TRANSACTION);
-
- afr_sh_print_pending_matrix (sh->pending_matrix, this);
-
- nsources = afr_sh_mark_sources (sh, priv->child_count,
- AFR_SELF_HEAL_ENTRY);
-
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "No self-heal needed for %s",
- local->loc.path);
+ int i = 0;
+ int ret = 0;
+ gf_boolean_t mismatch = _gf_false;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing entry selfheal on %s", uuid_utoa(fd->inode->gfid));
+
+ for (i = 0; i < priv->child_count; i++) {
+ /* Expunge */
+ if (!healed_sinks[i])
+ continue;
+
+ if (!local->need_full_crawl)
+ /* Why call afr_selfheal_entry_granular() on a "healed sink",
+ * given that it is the source that contains the granular
+ * indices?
+ * If the index for this directory is non-existent or empty on
+ * this subvol (=> clear sink), then it will return early
+ * without failure status.
+ * If the index is non-empty and it is yet a 'healed sink', then
+ * it is due to a split-brain in which case we anyway need to
+ * crawl the indices/entry-changes/pargfid directory.
+ */
+ ret = afr_selfheal_entry_granular(frame, this, fd, i, _gf_false);
+ else
+ ret = afr_selfheal_entry_do_subvol(frame, this, fd, i);
- afr_sh_entry_finish (frame, this);
- return 0;
+ if (ret == -1) {
+ /* gfid or type mismatch. */
+ mismatch = _gf_true;
+ ret = 0;
}
+ if (ret)
+ break;
+ }
- afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
- priv->child_count);
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
-
- sh->source = source;
-
-heal:
- afr_sh_entry_sync_prepare (frame, this);
-
- return 0;
-}
-
-
-
-int
-afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- sh->xattr[child_index] = dict_ref (xattr);
- sh->buf[child_index] = *buf;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_entry_fix (frame, this);
- }
-
- return 0;
+ if (!ret && source != -1) {
+ /* Impunge */
+ if (local->need_full_crawl)
+ ret = afr_selfheal_entry_do_subvol(frame, this, fd, source);
+ else
+ ret = afr_selfheal_entry_granular(frame, this, fd, source,
+ _gf_true);
+ }
+
+ if (mismatch == _gf_true)
+ /* undo pending will be skipped */
+ ret = -1;
+ return ret;
}
-
-
-int
-afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- dict_t *xattr_req = NULL;
- int ret = 0;
- int call_count = 0;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
-
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
-
- local->call_count = call_count;
-
- xattr_req = dict_new();
- if (xattr_req) {
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req,
- priv->pending_key[i],
- 3 * sizeof(int32_t));
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- }
+ int ret = -1;
+ int source = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *postop_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t did_sh = _gf_true;
+
+ priv = this->private;
+ local = frame->local;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+ data_lock = alloca0(priv->child_count);
+ postop_lock = alloca0(priv->child_count);
+
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+ data_lock);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "entry self-heal as only %d sub-volumes could "
+ "be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret, this->name);
+ ret = -ENOTCONN;
+ goto unlock;
}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame,
- afr_sh_entry_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &local->loc, xattr_req);
- if (!--call_count)
- break;
- }
- }
-
- if (xattr_req)
- dict_unref (xattr_req);
-
- return 0;
-}
-
-int
-afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non Blocking entrylks failed.");
- afr_sh_entry_done (frame, this);
- } else {
+ ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies, &source, NULL);
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ did_sh = _gf_false;
+ goto unlock;
+ }
- gf_log (this->name, GF_LOG_DEBUG,
- "Non Blocking entrylks done. Proceeding to FOP");
- afr_sh_entry_lookup(frame, this);
+ local->need_full_crawl = afr_need_full_heal(
+ this, locked_replies, source, healed_sinks, AFR_ENTRY_TRANSACTION);
+ }
+unlock:
+ afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, data_lock,
+ NULL);
+ if (ret < 0)
+ goto out;
+
+ if (!did_sh)
+ goto out;
+
+ ret = afr_selfheal_entry_do(frame, this, fd, source, sources, healed_sinks);
+ if (ret)
+ goto out;
+
+ /* Take entrylks in xlator domain before doing post-op (undo-pending) in
+ * entry self-heal. This is to prevent a parallel name self-heal on
+ * an entry under @fd->inode from reading pending xattrs while it is
+ * being modified by SHD after entry sh below, given that
+ * name self-heal takes locks ONLY in xlator domain and is free to read
+ * pending changelog in the absence of the following locking.
+ */
+ ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+ postop_lock);
+ {
+ if (AFR_CMP(data_lock, postop_lock, priv->child_count) != 0) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "post-op after entry self-heal as %d "
+ "sub-volumes, as opposed to %d, "
+ "could be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret,
+ AFR_COUNT(data_lock, priv->child_count), this->name);
+ ret = -ENOTCONN;
+ goto postop_unlock;
}
- return 0;
+ afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
+ locked_replies);
+ ret = afr_selfheal_undo_pending(
+ frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
+ AFR_ENTRY_TRANSACTION, locked_replies, postop_lock);
+ }
+postop_unlock:
+ afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL,
+ postop_lock, NULL);
+out:
+ if (did_sh)
+ afr_log_selfheal(fd->inode->gfid, this, ret, "entry", source, sources,
+ healed_sinks);
+ else
+ ret = 1;
+
+ if (locked_replies)
+ afr_replies_wipe(locked_replies, priv->child_count);
+ return ret;
}
-int
-afr_sh_entry_lock (call_frame_t *frame, xlator_t *this)
+static fd_t *
+afr_selfheal_data_opendir(xlator_t *this, inode_t *inode)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK;
-
- afr_set_lock_number (frame, this);
-
- int_lock->lk_basename = NULL;
- int_lock->lk_loc = &local->loc;
- int_lock->lock_cbk = afr_sh_post_nonblocking_entry_cbk;
-
- afr_nonblocking_entrylk (frame, this);
-
-
- return 0;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
+ fd_t *fd = NULL;
+
+ fd = fd_create(inode, 0);
+ if (!fd)
+ return NULL;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ ret = syncop_opendir(this, &loc, fd, NULL, NULL);
+ if (ret) {
+ fd_unref(fd);
+ fd = NULL;
+ } else {
+ fd_bind(fd);
+ }
+
+ loc_wipe(&loc);
+ return fd;
}
-
int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
+afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ fd_t *fd = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ fd = afr_selfheal_data_opendir(this, inode);
+ if (!fd)
+ return -EIO;
+
+ locked_on = alloca0(priv->child_count);
+
+ ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
+ NULL, locked_on);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "entry self-heal as only %d sub-volumes could "
+ "be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret, priv->sh_domain);
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
- priv = this->private;
- local = frame->local;
+ ret = __afr_selfheal_entry(frame, this, fd, locked_on);
+ }
+unlock:
+ afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL, locked_on,
+ NULL);
- if (local->self_heal.need_entry_self_heal && priv->entry_self_heal) {
- afr_sh_entry_lock (frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to completion on %s",
- local->loc.path);
- afr_sh_entry_done (frame, this);
- }
+ if (fd)
+ fd_unref(fd);
- return 0;
+ return ret;
}
-
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index e76d58850cd..03f43bad16e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -1,731 +1,546 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
+#include <glusterfs/byte-order.h>
+#include "protocol-common.h"
+#include <glusterfs/events.h>
+#define AFR_HEAL_ATTR (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE)
-int
-afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+_afr_ignorable_key_match(dict_t *d, char *k, data_t *val, void *mdata)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
-// memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
- memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count);
- memset (sh->success, 0, sizeof (int) * priv->child_count);
-
-/* for (i = 0; i < priv->child_count; i++) { */
-/* sh->locked_nodes[i] = 1; */
-/* } */
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i])
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
-
- if (local->govinda_gOvinda) {
- gf_log (this->name, GF_LOG_DEBUG,
- "aborting selfheal of %s",
- local->loc.path);
- sh->completion_cbk (frame, this);
- } else {
- if (IA_ISREG (sh->type)) {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to data check on %s",
- local->loc.path);
- afr_self_heal_data (frame, this);
- return 0;
- }
-
- if (IA_ISDIR (sh->type)) {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to entry check on %s",
- local->loc.path);
- afr_self_heal_entry (frame, this);
- return 0;
- }
- gf_log (this->name, GF_LOG_DEBUG,
- "completed self heal of %s",
- local->loc.path);
-
- sh->completion_cbk (frame, this);
- }
-
- return 0;
+ return afr_is_xattr_ignorable(k);
}
-
-int
-afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+void
+afr_delete_ignorable_xattrs(dict_t *xattr)
{
- int call_count = 0;
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_metadata_done (frame, this);
-
- return 0;
+ dict_foreach_match(xattr, _afr_ignorable_key_match, NULL,
+ dict_remove_foreach_fn, NULL);
}
int
-afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this)
+__afr_selfheal_metadata_do(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *locked_replies)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_cbk = afr_sh_metadata_done;
- afr_unlock (frame, this);
-
- return 0;
-}
-
-int
-afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_sh_inode_unlock (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
-{
- afr_local_t *local = NULL;
- int call_count = 0;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
+ int ret = -1;
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+ dict_t *old_xattr = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing metadata selfheal on %s", uuid_utoa(inode->gfid));
+
+ ret = syncop_getxattr(priv->children[source], &loc, &xattr, NULL, NULL,
+ NULL);
+ if (ret < 0) {
+ ret = -EIO;
+ goto out;
+ }
+
+ afr_delete_ignorable_xattrs(xattr);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (old_xattr) {
+ dict_unref(old_xattr);
+ old_xattr = NULL;
+ }
- if (call_count == 0)
- afr_sh_metadata_finish (frame, this);
+ if (!healed_sinks[i])
+ continue;
+
+ ret = syncop_setattr(priv->children[i], &loc,
+ &locked_replies[source].poststat, AFR_HEAL_ATTR,
+ NULL, NULL, NULL, NULL);
+ if (ret)
+ healed_sinks[i] = 0;
+
+ ret = syncop_getxattr(priv->children[i], &loc, &old_xattr, 0, NULL,
+ NULL);
+ if (old_xattr) {
+ afr_delete_ignorable_xattrs(old_xattr);
+ ret = syncop_removexattr(priv->children[i], &loc, "", old_xattr,
+ NULL);
+ if (ret)
+ healed_sinks[i] = 0;
+ }
- return 0;
-}
+ ret = syncop_setxattr(priv->children[i], &loc, xattr, 0, NULL, NULL);
+ if (ret)
+ healed_sinks[i] = 0;
+ }
+ ret = 0;
+out:
+ loc_wipe(&loc);
+ if (xattr)
+ dict_unref(xattr);
+ if (old_xattr)
+ dict_unref(old_xattr);
-int
-afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix,
- sh->success, priv->child_count,
- AFR_METADATA_TRANSACTION);
-
- erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count,
- gf_afr_mt_dict_t);
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
-
- erase_xattr[i] = get_new_dict();
- dict_ref (erase_xattr[i]);
- }
- }
-
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
- priv->child_count, AFR_METADATA_TRANSACTION);
-
- local->call_count = call_count;
-
- if (call_count == 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "metadata of %s not healed on any subvolume",
- local->loc.path);
-
- afr_sh_metadata_finish (frame, this);
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "erasing pending flags from %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
- if (!--call_count)
- break;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
- }
- }
- GF_FREE (erase_xattr);
-
- return 0;
+ return ret;
}
-
-int
-afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static uint64_t
+mtime_ns(struct iatt *ia)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setting attributes failed for %s on %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->success[child_index] = 0;
- }
- }
- UNLOCK (&frame->lock);
+ uint64_t ret;
- call_count = afr_frame_return (frame);
+ ret = (((uint64_t)(ia->ia_mtime)) * 1000000000) +
+ (uint64_t)(ia->ia_mtime_nsec);
- if (call_count == 0)
- afr_sh_metadata_erase_pending (frame, this);
-
- return 0;
+ return ret;
}
-
-int
-afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
+/*
+ * When directory content is modified, [mc]time is updated. On
+ * Linux, the filesystem does it, while at least on NetBSD, the
+ * kernel file-system independent code does it. This means that
+ * when entries are added while bricks are down, the kernel sends
+ * a SETATTR [mc]time which will cause metadata split brain for
+ * the directory. In this case, clear the split brain by finding
+ * the source with the most recent modification date.
+ */
+static int
+afr_dirtime_splitbrain_source(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ unsigned char *locked_on)
{
- afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
-
- return 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ struct iatt source_ia;
+ struct iatt child_ia;
+ uint64_t mtime = 0;
+ int i;
+ int ret = -1;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!locked_on[i])
+ continue;
+
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret != 0)
+ continue;
+
+ if (mtime_ns(&replies[i].poststat) <= mtime)
+ continue;
+
+ mtime = mtime_ns(&replies[i].poststat);
+ source = i;
+ }
+
+ if (source == -1)
+ goto out;
+
+ source_ia = replies[source].poststat;
+ if (source_ia.ia_type != IA_IFDIR)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source)
+ continue;
+
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret != 0)
+ continue;
+
+ child_ia = replies[i].poststat;
+
+ if (!IA_EQUAL(source_ia, child_ia, gfid) ||
+ !IA_EQUAL(source_ia, child_ia, type) ||
+ !IA_EQUAL(source_ia, child_ia, prot) ||
+ !IA_EQUAL(source_ia, child_ia, uid) ||
+ !IA_EQUAL(source_ia, child_ia, gid) ||
+ !afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata))
+ goto out;
+ }
+
+ /*
+ * Metadata split brain is just about [amc]time
+ * We return our source.
+ */
+ ret = source;
+out:
+ return ret;
}
-
-int
-afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static int
+__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode,
+ struct afr_reply *replies,
+ unsigned char *sources)
{
- afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
-
- return 0;
-}
+ int ret = 0;
+ int i = 0;
+ int m_idx = 0;
+ afr_private_t *priv = NULL;
+ int raw[AFR_NUM_CHANGE_LOGS] = {0};
+ dict_t *xattr = NULL;
+
+ priv = this->private;
+ m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION);
+ raw[m_idx] = 1;
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i])
+ continue;
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO,
+ "Failed to set pending metadata xattr on child %d for %s", i,
+ uuid_utoa(inode->gfid));
+ goto out;
+ }
+ }
+ afr_replies_wipe(replies, priv->child_count);
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
-int
-afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
- int active_sinks = 0;
- int call_count = 0;
- int i = 0;
-
- struct iatt stbuf;
- int32_t valid = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
- active_sinks = sh->active_sinks;
-
- /*
- * 2 calls per sink - setattr, setxattr
- */
- if (xattr)
- call_count = active_sinks * 2;
- else
- call_count = active_sinks;
-
- local->call_count = call_count;
-
- stbuf.ia_atime = sh->buf[source].ia_atime;
- stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec;
- stbuf.ia_mtime = sh->buf[source].ia_mtime;
- stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec;
-
- stbuf.ia_uid = sh->buf[source].ia_uid;
- stbuf.ia_gid = sh->buf[source].ia_gid;
-
- stbuf.ia_type = sh->buf[source].ia_type;
- stbuf.ia_prot = sh->buf[source].ia_prot;
-
- valid = GF_SET_ATTR_MODE |
- GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
-
- for (i = 0; i < priv->child_count; i++) {
- if (call_count == 0) {
- break;
- }
- if (sh->sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing metadata of %s from %s to %s",
- local->loc.path, priv->children[source]->name,
- priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc, &stbuf, valid);
-
- call_count--;
-
- if (!xattr)
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc, xattr, 0);
- call_count--;
- }
-
- return 0;
+out:
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
}
-
-int
-afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
+/*
+ * Look for mismatching uid/gid or mode or user xattrs even if
+ * AFR xattrs don't say so, and pick one arbitrarily as winner. */
+
+static int
+__afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- int i;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "getxattr of %s failed on subvolume %s (%s). proceeding without xattr",
- local->loc.path, priv->children[source]->name,
- strerror (op_errno));
-
- afr_sh_metadata_sync (frame, this, NULL);
- } else {
- for (i = 0; i < priv->child_count; i++) {
- dict_del (xattr, priv->pending_key[i]);
- }
-
- afr_sh_metadata_sync (frame, this, xattr);
- }
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct iatt srcstat = {
+ 0,
+ };
+ int source = -1;
+ int sources_count = 0;
+ int ret = 0;
+
+ priv = this->private;
+
+ sources_count = AFR_COUNT(sources, priv->child_count);
+
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count) {
+ source = afr_mark_split_brain_source_sinks(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ replies, AFR_METADATA_TRANSACTION);
+ if (source >= 0) {
+ _afr_fav_child_reset_sink_xattrs(
+ frame, this, inode, source, healed_sinks, undid_pending,
+ AFR_METADATA_TRANSACTION, locked_on, replies);
+ goto out;
+ }
- return 0;
-}
+ /* If this is a directory mtime/ctime only split brain
+ use the most recent */
+ source = afr_dirtime_splitbrain_source(frame, this, replies, locked_on);
+ if (source != -1) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SPLIT_BRAIN,
+ "clear time "
+ "split brain on %s",
+ uuid_utoa(replies[source].poststat.ia_gfid));
+ sources[source] = 1;
+ healed_sinks[source] = 0;
+ goto out;
+ }
+ if (!priv->metadata_splitbrain_forced_heal) {
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;"
+ "type=metadata;file=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(inode->gfid));
+ return -EIO;
+ }
-int
-afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
- }
- }
- sh->success[source] = 1;
-
- if (active_sinks == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_metadata_finish (frame, this);
- return 0;
- }
- sh->active_sinks = active_sinks;
-
- gf_log (this->name, GF_LOG_TRACE,
- "syncing metadata of %s from subvolume %s to %d active sinks",
- local->loc.path, priv->children[source]->name, active_sinks);
-
- STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
- priv->children[source],
- priv->children[source]->fops->getxattr,
- &local->loc, NULL);
-
- return 0;
+ /* Metadata split brain, select one subvol
+ arbitrarily */
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i] && healed_sinks[i]) {
+ sources[i] = 1;
+ healed_sinks[i] = 0;
+ break;
+ }
+ }
+ }
+
+ /* No split brain at this point. If we were called from
+ * afr_heal_splitbrain_file(), abort.*/
+ if (afr_dict_contains_heal_op(frame))
+ return -EIO;
+
+ source = afr_choose_source_by_policy(priv, sources,
+ AFR_METADATA_TRANSACTION);
+ srcstat = replies[source].poststat;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] || i == source)
+ continue;
+ if (!IA_EQUAL(srcstat, replies[i].poststat, type) ||
+ !IA_EQUAL(srcstat, replies[i].poststat, uid) ||
+ !IA_EQUAL(srcstat, replies[i].poststat, gid) ||
+ !IA_EQUAL(srcstat, replies[i].poststat, prot)) {
+ gf_msg_debug(this->name, 0,
+ "%s: iatt mismatch "
+ "for source(%d) vs (%d)",
+ uuid_utoa(replies[source].poststat.ia_gfid), source,
+ i);
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] || i == source)
+ continue;
+ if (!afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) {
+ gf_msg_debug(this->name, 0,
+ "%s: xattr mismatch "
+ "for source(%d) vs (%d)",
+ uuid_utoa(replies[source].poststat.ia_gfid), source,
+ i);
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+ if ((sources_count == priv->child_count) && (source > -1) &&
+ (AFR_COUNT(healed_sinks, priv->child_count) != 0)) {
+ ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode,
+ replies, sources);
+ if (ret < 0)
+ return ret;
+ }
+out:
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ return source;
}
-
int
-afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this)
+__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *pflag)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr,
- priv->child_count,
- AFR_METADATA_TRANSACTION);
-
- afr_sh_print_pending_matrix (sh->pending_matrix, this);
-
- nsources = afr_sh_mark_sources (sh, priv->child_count,
- AFR_SELF_HEAL_METADATA);
-
- afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
- priv->child_count);
-
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_metadata_finish (frame, this);
- return 0;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+ if (ret)
+ return ret;
+
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_METADATA_TRANSACTION, locked_on,
+ sources, sinks, witness, pflag);
+ if (ret)
+ return ret;
+
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+ /* If any source has witness, pick first
+ * witness source and make everybody else sinks */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] && witness[i]) {
+ source = i;
+ break;
}
-
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_WARNING,
- "Picking favorite child %s as authentic source to resolve conflicting metadata of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
- }
-
- if (nsources == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to self-heal permissions/ownership of '%s' "
- "(possible split-brain). Please fix the file on "
- "all backend volumes", local->loc.path);
-
- local->govinda_gOvinda = 1;
-
- afr_sh_metadata_finish (frame, this);
- return 0;
- }
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
-
- if (source == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No active sources found.");
-
- afr_sh_metadata_finish (frame, this);
- return 0;
+ }
+
+ if (source != -1) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (i != source && sources[i]) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
}
+ }
- sh->source = source;
-
- /* detect changes not visible through pending flags -- JIC */
- for (i = 0; i < priv->child_count; i++) {
- if (i == source || sh->child_errno[i])
- continue;
+ source = __afr_selfheal_metadata_finalize_source(
+ frame, this, inode, sources, sinks, healed_sinks, undid_pending,
+ locked_on, replies);
- if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
+ if (source < 0)
+ return -EIO;
- if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
- }
-
- afr_sh_metadata_sync_prepare (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "path %s on subvolume %s is of mode 0%o",
- local->loc.path,
- priv->children[child_index]->name,
- buf->ia_type);
-
- sh->buf[child_index] = *buf;
- if (xattr)
- sh->xattr[child_index] = dict_ref (xattr);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "path %s on subvolume %s => -1 (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_metadata_fix (frame, this);
-
- return 0;
+ return source;
}
-
int
-afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this)
+afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = 0;
- dict_t *xattr_req = NULL;
- int ret = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
- local->call_count = call_count;
-
- xattr_req = dict_new();
-
- if (xattr_req) {
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req,
- priv->pending_key[i],
- 3 * sizeof(int32_t));
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- }
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ gf_boolean_t did_sh = _gf_true;
+ int source = -1;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+ data_lock = alloca0(priv->child_count);
+
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
+ data_lock);
+ {
+ if (ret < priv->child_count) {
+ ret = -ENOTCONN;
+ goto unlock;
}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &local->loc, xattr_req);
- if (!--call_count)
- break;
- }
- }
-
- if (xattr_req)
- dict_unref (xattr_req);
-
- return 0;
-}
-
-int
-afr_sh_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ ret = __afr_selfheal_metadata_prepare(
+ frame, this, inode, data_lock, sources, sinks, healed_sinks,
+ undid_pending, locked_replies, NULL);
+ if (ret < 0)
+ goto unlock;
- local = frame->local;
- int_lock = &local->internal_lock;
+ source = ret;
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non Blocking inodelks failed.");
- afr_sh_metadata_done (frame, this);
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Non Blocking inodelks done. Proceeding to FOP");
- afr_sh_metadata_lookup (frame, this);
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ did_sh = _gf_false;
+ goto unlock;
}
- return 0;
+ ret = __afr_selfheal_metadata_do(frame, this, inode, source,
+ healed_sinks, locked_replies);
+ if (ret)
+ goto unlock;
+
+ afr_selfheal_restore_time(frame, this, inode, source, healed_sinks,
+ locked_replies);
+
+ ret = afr_selfheal_undo_pending(
+ frame, this, inode, sources, sinks, healed_sinks, undid_pending,
+ AFR_METADATA_TRANSACTION, locked_replies, data_lock);
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
+ data_lock);
+
+ if (did_sh)
+ afr_log_selfheal(inode->gfid, this, ret, "metadata", source, sources,
+ healed_sinks);
+ else
+ ret = 1;
+
+ if (locked_replies)
+ afr_replies_wipe(locked_replies, priv->child_count);
+ return ret;
}
int
-afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
+afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK;
-
- afr_set_lock_number (frame, this);
-
- int_lock->lk_flock.l_start = 0;
- int_lock->lk_flock.l_len = 0;
- int_lock->lk_flock.l_type = F_WRLCK;
- int_lock->lock_cbk = afr_sh_post_nonblocking_inodelk_cbk;
-
- afr_nonblocking_inodelk (frame, this);
-
- return 0;
-}
-
-
-int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = this->private;
-
-
- local = frame->local;
-
- if (local->self_heal.need_metadata_self_heal && priv->metadata_self_heal) {
- afr_sh_metadata_lock (frame, this);
- } else {
- afr_sh_metadata_done (frame, this);
- }
-
- return 0;
+ inode_t *inode = NULL;
+ inode_t *link_inode = NULL;
+ call_frame_t *frame = NULL;
+ int ret = 0;
+
+ if (gf_uuid_is_null(stbuf->ia_gfid)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ inode = inode_new(this->itable);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ link_inode = inode_link(inode, NULL, NULL, stbuf);
+ if (!link_inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = afr_selfheal_metadata(frame, this, link_inode);
+out:
+ if (inode)
+ inode_unref(inode);
+ if (link_inode)
+ inode_unref(link_inode);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return ret;
}
-
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
new file mode 100644
index 00000000000..834aac86d48
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -0,0 +1,616 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/events.h>
+#include "afr.h"
+#include "afr-self-heal.h"
+#include "afr-messages.h"
+
+int
+__afr_selfheal_assign_gfid(xlator_t *this, inode_t *parent, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ struct afr_reply *replies, void *gfid,
+ unsigned char *locked_on, int source,
+ unsigned char *sources, gf_boolean_t is_gfid_absent,
+ int *gfid_idx)
+{
+ int ret = 0;
+ int up_count = 0;
+ int locked_count = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ gf_uuid_copy(parent->gfid, pargfid);
+
+ if (is_gfid_absent) {
+ /* Ensure all children of AFR are up before performing gfid heal, to
+ * guard against the possibility of gfid split brain. */
+
+ up_count = AFR_COUNT(priv->child_up, priv->child_count);
+ if (up_count != priv->child_count) {
+ ret = -EIO;
+ goto out;
+ }
+
+ locked_count = AFR_COUNT(locked_on, priv->child_count);
+ if (locked_count != priv->child_count) {
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+ ret = afr_lookup_and_heal_gfid(this, parent, bname, inode, replies, source,
+ sources, gfid, gfid_idx);
+
+out:
+ return ret;
+}
+
+int
+__afr_selfheal_name_impunge(call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid, const char *bname,
+ inode_t *inode, struct afr_reply *replies,
+ int gfid_idx)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ unsigned char *sources = NULL;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+
+ gf_uuid_copy(parent->gfid, pargfid);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[gfid_idx].poststat.ia_gfid) == 0) {
+ sources[i] = 1;
+ continue;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i])
+ continue;
+
+ ret |= afr_selfheal_recreate_entry(frame, i, gfid_idx, sources, parent,
+ bname, inode, replies);
+ }
+
+ return ret;
+}
+
+int
+__afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret)
+ continue;
+
+ ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i,
+ replies);
+ }
+
+ return ret;
+}
+
+static gf_boolean_t
+afr_selfheal_name_need_heal_check(xlator_t *this, struct afr_reply *replies)
+{
+ int i = 0;
+ int first_idx = -1;
+ gf_boolean_t need_heal = _gf_false;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA))
+ need_heal = _gf_true;
+
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first_idx].op_ret)
+ need_heal = _gf_true;
+
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid))
+ need_heal = _gf_true;
+
+ if ((replies[i].op_ret == 0) &&
+ (gf_uuid_is_null(replies[i].poststat.ia_gfid)))
+ need_heal = _gf_true;
+ }
+
+ return need_heal;
+}
+
+static int
+afr_selfheal_name_type_mismatch_check(xlator_t *this, struct afr_reply *replies,
+ int source, unsigned char *sources,
+ uuid_t pargfid, const char *bname)
+{
+ int i = 0;
+ int type_idx = -1;
+ ia_type_t inode_type = IA_INVAL;
+ ia_type_t inode_type1 = IA_INVAL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+
+ if (replies[i].poststat.ia_type == IA_INVAL)
+ continue;
+
+ if (inode_type == IA_INVAL) {
+ inode_type = replies[i].poststat.ia_type;
+ type_idx = i;
+ continue;
+ }
+ inode_type1 = replies[i].poststat.ia_type;
+ if (sources[i] || source == -1) {
+ if ((sources[type_idx] || source == -1) &&
+ (inode_type != inode_type1)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN,
+ "Type mismatch for <gfid:%s>/%s: "
+ "%s on %s and %s on %s",
+ uuid_utoa(pargfid), bname,
+ gf_inode_type_to_str(inode_type1),
+ priv->children[i]->name,
+ gf_inode_type_to_str(inode_type),
+ priv->children[type_idx]->name);
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=file;"
+ "file=<gfid:%s>/%s;count=2;"
+ "child-%d=%s;type-%d=%s;child-%d=%s;"
+ "type-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(pargfid), bname, i, priv->children[i]->name,
+ i, gf_inode_type_to_str(inode_type1), type_idx,
+ priv->children[type_idx]->name, type_idx,
+ gf_inode_type_to_str(inode_type));
+ return -EIO;
+ }
+ inode_type = replies[i].poststat.ia_type;
+ type_idx = i;
+ }
+ }
+ return 0;
+}
+
+static int
+afr_selfheal_name_gfid_mismatch_check(xlator_t *this, struct afr_reply *replies,
+ int source, unsigned char *sources,
+ int *gfid_idx, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ unsigned char *locked_on, dict_t *xdata)
+{
+ int i = 0;
+ int gfid_idx_iter = -1;
+ int ret = -1;
+ void *gfid = NULL;
+ void *gfid1 = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+
+ if (gf_uuid_is_null(replies[i].poststat.ia_gfid))
+ continue;
+
+ if (!gfid) {
+ gfid = &replies[i].poststat.ia_gfid;
+ gfid_idx_iter = i;
+ continue;
+ }
+
+ gfid1 = &replies[i].poststat.ia_gfid;
+ if (sources[i] || source == -1) {
+ if ((sources[gfid_idx_iter] || source == -1) &&
+ gf_uuid_compare(gfid, gfid1)) {
+ ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
+ bname, gfid_idx_iter, i,
+ locked_on, gfid_idx, xdata);
+ if (!ret && *gfid_idx >= 0) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ "GFID split-brain resolved");
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error setting gfid-"
+ "heal-msg dict");
+ }
+ return ret;
+ }
+ gfid = &replies[i].poststat.ia_gfid;
+ gfid_idx_iter = i;
+ }
+ }
+
+ *gfid_idx = gfid_idx_iter;
+ return 0;
+}
+
+static gf_boolean_t
+afr_selfheal_name_source_empty_check(xlator_t *this, struct afr_reply *replies,
+ unsigned char *sources, int source)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ gf_boolean_t source_is_empty = _gf_true;
+
+ priv = this->private;
+
+ if (source == -1) {
+ source_is_empty = _gf_false;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+
+ if (replies[i].op_ret == -1 && replies[i].op_errno == ENOENT)
+ continue;
+
+ source_is_empty = _gf_false;
+ break;
+ }
+out:
+ return source_is_empty;
+}
+
+int
+__afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, int source,
+ unsigned char *locked_on, struct afr_reply *replies,
+ void *gfid_req, dict_t *xdata)
+{
+ int gfid_idx = -1;
+ int ret = -1;
+ void *gfid = NULL;
+ gf_boolean_t source_is_empty = _gf_true;
+ gf_boolean_t need_heal = _gf_false;
+ gf_boolean_t is_gfid_absent = _gf_false;
+
+ need_heal = afr_selfheal_name_need_heal_check(this, replies);
+ if (!need_heal)
+ return 0;
+
+ source_is_empty = afr_selfheal_name_source_empty_check(this, replies,
+ sources, source);
+ if (source_is_empty) {
+ ret = __afr_selfheal_name_expunge(this, parent, pargfid, bname, inode,
+ replies);
+ if (ret == -EIO)
+ ret = -1;
+ return ret;
+ }
+
+ ret = afr_selfheal_name_type_mismatch_check(this, replies, source, sources,
+ pargfid, bname);
+ if (ret)
+ return ret;
+
+ ret = afr_selfheal_name_gfid_mismatch_check(this, replies, source, sources,
+ &gfid_idx, pargfid, bname,
+ inode, locked_on, xdata);
+ if (ret)
+ return ret;
+
+ if (gfid_idx == -1) {
+ if (!gfid_req || gf_uuid_is_null(gfid_req))
+ return -1;
+ gfid = gfid_req;
+ } else {
+ gfid = &replies[gfid_idx].poststat.ia_gfid;
+ if (source == -1)
+ /* Either entry split-brain or dirty xattrs are present on parent.*/
+ source = gfid_idx;
+ }
+
+ is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false;
+ ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode,
+ replies, gfid, locked_on, source, sources,
+ is_gfid_absent, &gfid_idx);
+ if (ret || (gfid_idx < 0))
+ return ret;
+
+ ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname,
+ inode, replies, gfid_idx);
+ if (ret == -EIO)
+ ret = -1;
+
+ return ret;
+}
+
+int
+__afr_selfheal_name_finalize_source(xlator_t *this, unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on, uint64_t *witness)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int sources_count = 0;
+
+ priv = this->private;
+
+ sources_count = AFR_COUNT(sources, priv->child_count);
+
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count || afr_does_witness_exist(this, witness)) {
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ return -1;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+
+ return source;
+}
+
+int
+__afr_selfheal_name_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ int *source_p)
+{
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+
+ replies = alloca0(priv->child_count * sizeof(*replies));
+
+ ret = afr_selfheal_unlocked_discover(frame, parent, pargfid, replies);
+ if (ret)
+ goto out;
+
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, locked_on, sources,
+ sinks, witness, NULL);
+ if (ret)
+ goto out;
+
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+ source = __afr_selfheal_name_finalize_source(this, sources, healed_sinks,
+ locked_on, witness);
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source;
+
+out:
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+
+ return ret;
+}
+
+int
+afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, void *gfid_req,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *locked_on = NULL;
+ int source = -1;
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+ inode_t *inode = NULL;
+ dict_t *xattr = NULL;
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+
+ ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
+ if (ret) {
+ dict_unref(xattr);
+ return -1;
+ }
+
+ priv = this->private;
+
+ locked_on = alloca0(priv->child_count);
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+
+ replies = alloca0(priv->child_count * sizeof(*replies));
+
+ ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname,
+ locked_on);
+ {
+ if (ret < priv->child_count) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_name_prepare(frame, this, parent, pargfid,
+ locked_on, sources, sinks,
+ healed_sinks, &source);
+ if (ret)
+ goto unlock;
+
+ inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
+ locked_on, xattr);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_name_do(frame, this, parent, pargfid, bname, inode,
+ sources, sinks, healed_sinks, source,
+ locked_on, replies, gfid_req, xdata);
+ }
+unlock:
+ afr_selfheal_unentrylk(frame, this, parent, this->name, bname, locked_on,
+ NULL);
+ if (inode)
+ inode_unref(inode);
+
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
+}
+
+int
+afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid,
+ const char *bname, gf_boolean_t *need_heal)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ struct afr_reply *replies = NULL;
+ inode_t *inode = NULL;
+ int first_idx = -1;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+
+ inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
+ local->child_up, NULL);
+ if (!inode)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) {
+ *need_heal = _gf_true;
+ break;
+ }
+
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first_idx].op_ret) {
+ *need_heal = _gf_true;
+ break;
+ }
+
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid)) {
+ *need_heal = _gf_true;
+ break;
+ }
+ }
+
+ if (inode)
+ inode_unref(inode);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ return 0;
+}
+
+int
+afr_selfheal_name(xlator_t *this, uuid_t pargfid, const char *bname,
+ void *gfid_req, dict_t *xdata)
+{
+ inode_t *parent = NULL;
+ call_frame_t *frame = NULL;
+ int ret = -1;
+ gf_boolean_t need_heal = _gf_false;
+
+ parent = afr_inode_find(this, pargfid);
+ if (!parent)
+ goto out;
+
+ frame = afr_frame_create(this, NULL);
+ if (!frame)
+ goto out;
+
+ ret = afr_selfheal_name_unlocked_inspect(frame, this, parent, pargfid,
+ bname, &need_heal);
+ if (ret)
+ goto out;
+
+ if (need_heal) {
+ ret = afr_selfheal_name_do(frame, this, parent, pargfid, bname,
+ gfid_req, xdata);
+ if (ret)
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (parent)
+ inode_unref(parent);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+
+ return ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index b10ae3fc037..48e6dbcfb18 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -1,54 +1,377 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#ifndef __AFR_SELF_HEAL_H__
-#define __AFR_SELF_HEAL_H__
+#ifndef _AFR_SELFHEAL_H
+#define _AFR_SELFHEAL_H
+
+/* Perform fop on all UP subvolumes and wait for all callbacks to return */
+
+#define AFR_ONALL(frame, rfn, fop, args...) \
+ do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0, __count = 0; \
+ unsigned char *__child_up = alloca(__priv->child_count); \
+ \
+ memcpy(__child_up, __priv->child_up, \
+ sizeof(*__child_up) * __priv->child_count); \
+ __count = AFR_COUNT(__child_up, __priv->child_count); \
+ \
+ __local->barrier.waitfor = __count; \
+ afr_local_replies_wipe(__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__child_up[__i]) \
+ continue; \
+ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ } \
+ syncbarrier_wait(&__local->barrier, __count); \
+ } while (0)
+
+/* Perform fop on all subvolumes represented by list[] array and wait
+ for all callbacks to return */
+
+#define AFR_ONLIST(list, frame, rfn, fop, args...) \
+ do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0; \
+ int __count = 0; \
+ unsigned char *__list = alloca(__priv->child_count); \
+ \
+ memcpy(__list, list, sizeof(*__list) * __priv->child_count); \
+ __count = AFR_COUNT(__list, __priv->child_count); \
+ __local->barrier.waitfor = __count; \
+ afr_local_replies_wipe(__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__list[__i]) \
+ continue; \
+ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ } \
+ syncbarrier_wait(&__local->barrier, __count); \
+ } while (0)
+
+#define AFR_SEQ(frame, rfn, fop, args...) \
+ do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0; \
+ \
+ afr_local_replies_wipe(__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__priv->child_up[__i]) \
+ continue; \
+ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ syncbarrier_wait(&__local->barrier, 1); \
+ } \
+ } while (0)
+
+#define ALLOC_MATRIX(n, type) \
+ ({ \
+ int __i; \
+ type **__ptr = alloca(n * sizeof(type *)); \
+ \
+ for (__i = 0; __i < n; __i++) \
+ __ptr[__i] = alloca0(n * sizeof(type)); \
+ __ptr; \
+ })
+
+#define IA_EQUAL(f, s, field) \
+ (memcmp(&(f.ia_##field), &(s.ia_##field), sizeof(s.ia_##field)) == 0)
+
+#define SBRAIN_HEAL_NO_GO_MSG \
+ "Failed to obtain replies from all bricks of " \
+ "the replica (are they up?). Cannot resolve split-brain."
+#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain"
+#define SNO_BIGGER_FILE "No bigger file"
+#define SNO_DIFF_IN_MTIME "No difference in mtime"
+#define SUSE_SOURCE_BRICK_TO_HEAL \
+ "Use source-brick option to heal metadata" \
+ " split-brain"
+#define SINVALID_BRICK_NAME "Invalid brick name"
+#define SBRICK_IS_NOT_UP "Brick is not up"
+#define SBRICK_NOT_CONNECTED "Brick is not connected"
+#define SLESS_THAN2_BRICKS_in_REP "< 2 bricks in replica are up"
+#define SBRICK_IS_REMOTE "Brick is remote"
+#define SSTARTED_SELF_HEAL "Started self-heal"
+#define SOP_NOT_SUPPORTED "Operation Not Supported"
+#define SFILE_NOT_UNDER_DATA \
+ "The file is not under data or metadata " \
+ "split-brain"
+#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain"
+#define SALL_BRICKS_UP_TO_RESOLVE \
+ "All the bricks should be up to resolve the" \
+ " gfid split brain"
+#define SERROR_GETTING_SRC_BRICK "Error getting the source brick"
+int
+afr_selfheal(xlator_t *this, uuid_t gfid);
+
+gf_boolean_t
+afr_throttled_selfheal(call_frame_t *frame, xlator_t *this);
+
+int
+afr_selfheal_name(xlator_t *this, uuid_t gfid, const char *name, void *gfid_req,
+ dict_t *xdata);
+
+int
+afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd);
+
+int
+afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+int
+afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+int
+afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
+ inode_t *inode, struct afr_reply *replies, int source,
+ unsigned char *sources, void *gfid, int *gfid_idx);
+
+int
+afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
+
+int
+afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
+
+int
+afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, off_t off,
+ size_t size, unsigned char *locked_on);
+
+int
+afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on);
+
+int
+afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
+
+int
+afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
+
+int
+afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, const char *name,
+ unsigned char *locked_on);
+
+int
+afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on,
+ dict_t *xdata);
+
+int
+afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
+ struct afr_reply *replies);
+
+int
+afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on, dict_t *dict);
+inode_t *
+afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on, dict_t *xattr);
+
+int
+afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ uint64_t *witness, unsigned char *flag);
+int
+afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx,
+ dict_t *xdata);
+
+int
+afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix);
+
+int
+afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata);
+
+int
+afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies);
+int
+afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type, struct afr_reply *replies,
+ unsigned char *locked_on);
+
+int
+afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
+ unsigned char *sources, inode_t *dir,
+ const char *name, inode_t *inode,
+ struct afr_reply *replies);
+
+int
+afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr, dict_t *xdata);
+
+call_frame_t *
+afr_frame_create(xlator_t *this, int32_t *op_errno);
+
+inode_t *
+afr_inode_find(xlator_t *this, uuid_t gfid);
+
+int
+afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *parbuf);
+void
+afr_reply_copy(struct afr_reply *dst, struct afr_reply *src);
+
+void
+afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count);
+
+int
+afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, struct afr_reply *replies,
+ unsigned char *sources, unsigned char *newentry);
+
+unsigned int
+afr_success_count(struct afr_reply *replies, unsigned int count);
-#include <sys/stat.h>
+void
+afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source,
+ unsigned char *sources, unsigned char *healed_sinks);
-#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type)
-#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type))
-#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid))
-#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size)
+void
+afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies);
+void
+afr_mark_active_sinks(xlator_t *this, unsigned char *sources,
+ unsigned char *locked_on, unsigned char *sinks);
-#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size)
+gf_boolean_t
+afr_dict_contains_heal_op(call_frame_t *frame);
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks(struct afr_reply *replies,
+ int child_count);
int
-afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_mark_split_brain_source_sinks(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type);
+
+int
+afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, char **policy_str);
+
int
-afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this);
+_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type,
+ unsigned char *locked_on,
+ struct afr_reply *replies);
+
int
-afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_get_child_index_from_name(xlator_t *this, char *name);
+
+gf_boolean_t
+afr_does_witness_exist(xlator_t *this, uint64_t *witness);
int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this);
+__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *flag);
int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this);
+__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *flag);
+int
+__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ struct afr_reply *replies, int *source_p,
+ unsigned char *flag);
int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this);
+afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **link_inode, gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal,
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies);
int
-afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr);
+afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid);
+
+int
+afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
+
+int
+afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on);
+int
+afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
+ afr_transaction_type type);
int
-afr_self_heal (call_frame_t *frame, xlator_t *this);
+afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf);
-#endif /* __AFR_SELF_HEAL_H__ */
+int
+afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+int
+afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+int
+afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+
+int
+afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, uuid_t pargfid, const char *bname,
+ int src_idx, int child_idx,
+ unsigned char *locked_on, int *src, dict_t *xdata);
+int
+afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type);
+
+gf_boolean_t
+afr_is_file_empty_on_all_children(afr_private_t *priv,
+ struct afr_reply *replies);
+
+int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies);
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode);
+#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
new file mode 100644
index 00000000000..109fd4b7421
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -0,0 +1,1716 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "afr.h"
+#include "afr-self-heal.h"
+#include "afr-self-heald.h"
+#include "protocol-common.h"
+#include <glusterfs/syncop-utils.h>
+#include "afr-messages.h"
+#include <glusterfs/byte-order.h>
+
+#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
+#define AFR_STATISTICS_HISTORY_SIZE 50
+
+#define ASSERT_LOCAL(this, healer) \
+ if (!afr_shd_is_subvol_local(this, healer->subvol)) { \
+ healer->local = _gf_false; \
+ if (safe_break(healer)) { \
+ break; \
+ } else { \
+ continue; \
+ } \
+ } else { \
+ healer->local = _gf_true; \
+ }
+
+#define NTH_INDEX_HEALER(this, n) \
+ &((((afr_private_t *)this->private))->shd.index_healers[n])
+#define NTH_FULL_HEALER(this, n) \
+ &((((afr_private_t *)this->private))->shd.full_healers[n])
+
+char *
+afr_subvol_name(xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ if (subvol < 0 || subvol > priv->child_count)
+ return NULL;
+
+ return priv->children[subvol]->name;
+}
+
+void
+afr_destroy_crawl_event_data(void *data)
+{
+ return;
+}
+
+void
+afr_destroy_shd_event_data(void *data)
+{
+ shd_event_t *shd_event = data;
+
+ if (!shd_event)
+ return;
+ GF_FREE(shd_event->path);
+
+ return;
+}
+
+gf_boolean_t
+afr_shd_is_subvol_local(xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ gf_boolean_t is_local = _gf_false;
+ loc_t loc = {
+ 0,
+ };
+
+ loc.inode = this->itable->root;
+ gf_uuid_copy(loc.gfid, loc.inode->gfid);
+ priv = this->private;
+ syncop_is_subvol_local(priv->children[subvol], &loc, &is_local);
+ return is_local;
+}
+
+int
+__afr_shd_healer_wait(struct subvol_healer *healer)
+{
+ afr_private_t *priv = NULL;
+ struct timespec wait_till = {
+ 0,
+ };
+ int ret = 0;
+
+ priv = healer->this->private;
+
+disabled_loop:
+ wait_till.tv_sec = gf_time() + priv->shd.timeout;
+
+ while (!healer->rerun) {
+ ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till);
+ if (ret == ETIMEDOUT)
+ break;
+ }
+
+ ret = healer->rerun;
+ healer->rerun = 0;
+
+ if (!priv->shd.enabled)
+ goto disabled_loop;
+
+ return ret;
+}
+
+int
+afr_shd_healer_wait(struct subvol_healer *healer)
+{
+ int ret = 0;
+
+ pthread_mutex_lock(&healer->mutex);
+ {
+ ret = __afr_shd_healer_wait(healer);
+ }
+ pthread_mutex_unlock(&healer->mutex);
+
+ return ret;
+}
+
+gf_boolean_t
+safe_break(struct subvol_healer *healer)
+{
+ gf_boolean_t ret = _gf_false;
+
+ pthread_mutex_lock(&healer->mutex);
+ {
+ if (healer->rerun)
+ goto unlock;
+
+ healer->running = _gf_false;
+ ret = _gf_true;
+ }
+unlock:
+ pthread_mutex_unlock(&healer->mutex);
+
+ return ret;
+}
+
+inode_t *
+afr_shd_inode_find(xlator_t *this, xlator_t *subvol, uuid_t gfid)
+{
+ int ret = 0;
+ uint64_t val = IA_INVAL;
+ dict_t *xdata = NULL;
+ dict_t *rsp_dict = NULL;
+ inode_t *inode = NULL;
+
+ xdata = dict_new();
+ if (!xdata)
+ goto out;
+
+ ret = dict_set_int8(xdata, GF_INDEX_IA_TYPE_GET_REQ, 1);
+ if (ret)
+ goto out;
+
+ ret = syncop_inode_find(this, subvol, gfid, &inode, xdata, &rsp_dict);
+ if (ret < 0)
+ goto out;
+
+ if (rsp_dict) {
+ ret = dict_get_uint64(rsp_dict, GF_INDEX_IA_TYPE_GET_RSP, &val);
+ if (ret)
+ goto out;
+ }
+ ret = inode_ctx_set2(inode, subvol, 0, &val);
+out:
+ if (ret && inode) {
+ inode_unref(inode);
+ inode = NULL;
+ }
+ if (xdata)
+ dict_unref(xdata);
+ if (rsp_dict)
+ dict_unref(rsp_dict);
+ return inode;
+}
+
+inode_t *
+afr_shd_index_inode(xlator_t *this, xlator_t *subvol, char *vgfid)
+{
+ loc_t rootloc = {
+ 0,
+ };
+ inode_t *inode = NULL;
+ int ret = 0;
+ dict_t *xattr = NULL;
+ void *index_gfid = NULL;
+
+ rootloc.inode = inode_ref(this->itable->root);
+ gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr(subvol, &rootloc, &xattr, vgfid, NULL, NULL);
+ if (ret || !xattr) {
+ errno = -ret;
+ goto out;
+ }
+
+ ret = dict_get_ptr(xattr, vgfid, &index_gfid);
+ if (ret)
+ goto out;
+
+ gf_msg_debug(this->name, 0, "%s dir gfid for %s: %s", vgfid, subvol->name,
+ uuid_utoa(index_gfid));
+
+ inode = afr_shd_inode_find(this, subvol, index_gfid);
+
+out:
+ loc_wipe(&rootloc);
+
+ if (xattr)
+ dict_unref(xattr);
+
+ return inode;
+}
+
+int
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
+ ia_type_t type)
+{
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+
+ loc.parent = inode_ref(inode);
+ loc.name = name;
+
+ if (IA_ISDIR(type))
+ ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+ else
+ ret = syncop_unlink(subvol, &loc, NULL, NULL);
+
+ loc_wipe(&loc);
+ return ret;
+}
+
+void
+afr_shd_zero_xattrop(xlator_t *this, uuid_t gfid)
+{
+ call_frame_t *frame = NULL;
+ inode_t *inode = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int ret = 0;
+ int i = 0;
+ int raw[AFR_NUM_CHANGE_LOGS] = {0};
+
+ priv = this->private;
+ frame = afr_frame_create(this, NULL);
+ if (!frame)
+ goto out;
+ inode = afr_inode_find(this, gfid);
+ if (!inode)
+ goto out;
+ xattr = dict_new();
+ if (!xattr)
+ goto out;
+ ret = dict_set_static_bin(xattr, AFR_DIRTY, raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret)
+ goto out;
+ }
+
+ /*Send xattrop to all bricks. Doing a lookup to see if bricks are up or
+ * has valid repies for this gfid seems a bit of an overkill.*/
+ for (i = 0; i < priv->child_count; i++)
+ afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ if (inode)
+ inode_unref(inode);
+ if (xattr)
+ dict_unref(xattr);
+ return;
+}
+
+int
+afr_shd_selfheal_name(struct subvol_healer *healer, int child, uuid_t parent,
+ const char *bname)
+{
+ int ret = -1;
+
+ ret = afr_selfheal_name(THIS, parent, bname, NULL, NULL);
+
+ return ret;
+}
+
+int
+afr_shd_selfheal(struct subvol_healer *healer, int child, uuid_t gfid)
+{
+ int ret = 0;
+ eh_t *eh = NULL;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ shd_event_t *shd_event = NULL;
+ char *path = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ crawl_event_t *crawl_event = NULL;
+
+ this = healer->this;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = &healer->crawl_event;
+
+ subvol = priv->children[child];
+
+ // If this fails with ENOENT/ESTALE index is stale
+ ret = syncop_gfid_to_path(this->itable, subvol, gfid, &path);
+ if (ret < 0)
+ return ret;
+
+ ret = afr_selfheal(this, gfid);
+
+ LOCK(&priv->lock);
+ {
+ if (ret == -EIO) {
+ eh = shd->split_brain;
+ crawl_event->split_brain_count++;
+ } else if (ret < 0) {
+ crawl_event->heal_failed_count++;
+ } else if (ret == 0) {
+ crawl_event->healed_count++;
+ }
+ }
+ UNLOCK(&priv->lock);
+
+ if (eh) {
+ shd_event = GF_CALLOC(1, sizeof(*shd_event), gf_afr_mt_shd_event_t);
+ if (!shd_event)
+ goto out;
+
+ shd_event->child = child;
+ shd_event->path = path;
+
+ if (eh_save_history(eh, shd_event) < 0)
+ goto out;
+
+ shd_event = NULL;
+ path = NULL;
+ }
+out:
+ GF_FREE(shd_event);
+ GF_FREE(path);
+ return ret;
+}
+
+void
+afr_shd_sweep_prepare(struct subvol_healer *healer)
+{
+ crawl_event_t *event = NULL;
+
+ event = &healer->crawl_event;
+
+ event->healed_count = 0;
+ event->split_brain_count = 0;
+ event->heal_failed_count = 0;
+
+ event->start_time = gf_time();
+ event->end_time = 0;
+ _mask_cancellation();
+}
+
+void
+afr_shd_sweep_done(struct subvol_healer *healer)
+{
+ crawl_event_t *event = NULL;
+ crawl_event_t *history = NULL;
+ afr_self_heald_t *shd = NULL;
+
+ event = &healer->crawl_event;
+ shd = &(((afr_private_t *)healer->this->private)->shd);
+
+ event->end_time = gf_time();
+ history = gf_memdup(event, sizeof(*event));
+ event->start_time = 0;
+
+ if (!history)
+ return;
+
+ if (eh_save_history(shd->statistics[healer->subvol], history) < 0)
+ GF_FREE(history);
+ _unmask_cancellation();
+}
+
+int
+afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ afr_private_t *priv = NULL;
+ uuid_t gfid = {0};
+ int ret = 0;
+ uint64_t val = IA_INVAL;
+
+ priv = healer->this->private;
+ if (!priv->shd.enabled)
+ return -EBUSY;
+
+ gf_msg_debug(healer->this->name, 0, "got entry: %s from %s", entry->d_name,
+ priv->children[healer->subvol]->name);
+
+ ret = gf_uuid_parse(entry->d_name, gfid);
+ if (ret)
+ return 0;
+
+ inode_ctx_get2(parent->inode, subvol, NULL, &val);
+
+ ret = afr_shd_selfheal(healer, healer->subvol, gfid);
+
+ if (ret == -ENOENT || ret == -ESTALE)
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val);
+
+ if (ret == 2)
+ /* If bricks crashed in pre-op after creating indices/xattrop
+ * link but before setting afr changelogs, we end up with stale
+ * xattrop links but zero changelogs. Remove such entries by
+ * sending a post-op with zero changelogs.
+ */
+ afr_shd_zero_xattrop(healer->this, gfid);
+
+ return 0;
+}
+
+int
+afr_shd_index_sweep(struct subvol_healer *healer, char *vgfid)
+{
+ loc_t loc = {0};
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ xlator_t *subvol = NULL;
+ dict_t *xdata = NULL;
+ call_frame_t *frame = NULL;
+
+ priv = healer->this->private;
+ subvol = priv->children[healer->subvol];
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ loc.inode = afr_shd_index_inode(healer->this, subvol, vgfid);
+ if (!loc.inode) {
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_INDEX_DIR_GET_FAILED, "unable to get index-dir on %s",
+ subvol->name);
+ ret = -errno;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata || dict_set_int32_sizen(xdata, "get-gfid-type", 1)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_mt_dir_scan(frame, subvol, &loc, GF_CLIENT_PID_SELF_HEALD,
+ healer, afr_shd_index_heal, xdata,
+ priv->shd.max_threads, priv->shd.wait_qlength);
+
+ if (ret == 0)
+ ret = healer->crawl_event.healed_count;
+
+out:
+ loc_wipe(&loc);
+
+ if (xdata)
+ dict_unref(xdata);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return ret;
+}
+
+int
+afr_shd_index_sweep_all(struct subvol_healer *healer)
+{
+ int ret = 0;
+ int count = 0;
+
+ ret = afr_shd_index_sweep(healer, GF_XATTROP_INDEX_GFID);
+ if (ret < 0)
+ goto out;
+ count = ret;
+
+ ret = afr_shd_index_sweep(healer, GF_XATTROP_DIRTY_GFID);
+ if (ret < 0)
+ goto out;
+ count += ret;
+
+ ret = afr_shd_index_sweep(healer, GF_XATTROP_ENTRY_CHANGES_GFID);
+ if (ret < 0)
+ goto out;
+ count += ret;
+out:
+ if (ret < 0)
+ return ret;
+ else
+ return count;
+}
+
+int
+afr_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ xlator_t *this = healer->this;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ if (this->cleanup_starting) {
+ return -ENOTCONN;
+ }
+
+ if (!priv->shd.enabled)
+ return -EBUSY;
+
+ afr_shd_selfheal_name(healer, healer->subvol, parent->inode->gfid,
+ entry->d_name);
+
+ afr_shd_selfheal(healer, healer->subvol, entry->d_stat.ia_gfid);
+
+ return 0;
+}
+
+int
+afr_shd_full_sweep(struct subvol_healer *healer, inode_t *inode)
+{
+ afr_private_t *priv = NULL;
+ loc_t loc = {0};
+
+ priv = healer->this->private;
+ loc.inode = inode;
+ return syncop_ftw(priv->children[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer, afr_shd_full_heal);
+}
+
+int
+afr_shd_fill_ta_loc(xlator_t *this, loc_t *loc)
+{
+ afr_private_t *priv = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ int ret = -1;
+
+ priv = this->private;
+ loc->parent = inode_ref(this->itable->root);
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+ loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ loc->inode = inode_new(loc->parent->table);
+ GF_CHECK_ALLOC(loc->inode, ret, out);
+
+ if (!gf_uuid_is_null(priv->ta_gfid))
+ goto assign_gfid;
+
+ ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf,
+ 0, 0, 0);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed lookup on file %s.", loc->name);
+ goto out;
+ }
+
+ gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+
+assign_gfid:
+ gf_uuid_copy(loc->gfid, priv->ta_gfid);
+ ret = 0;
+
+out:
+ if (ret)
+ loc_wipe(loc);
+
+ return ret;
+}
+
+int
+_afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
+{
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int raw[AFR_NUM_CHANGE_LOGS] = {
+ 0,
+ };
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+
+ xattr = dict_new();
+ if (!xattr) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_GET_FAILED,
+ "Failed to create dict.");
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], &raw,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret)
+ goto out;
+ }
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL);
+ if (ret || !(*xdata)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Xattrop failed on %s.", loc->name);
+ }
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
+}
+
+void
+afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer,
+ dict_t **xdata)
+{
+ int ret = 0;
+
+ loc_wipe(loc);
+ if (afr_shd_fill_ta_loc(this, loc)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc->name);
+ ret = -1;
+ goto out;
+ }
+
+ ret = afr_ta_post_op_lock(this, loc);
+ if (ret)
+ goto out;
+
+ ret = _afr_shd_ta_get_xattrs(this, loc, xdata);
+ if (ret) {
+ if (*xdata) {
+ dict_unref(*xdata);
+ *xdata = NULL;
+ }
+ }
+
+ afr_ta_post_op_unlock(this, loc);
+
+out:
+ if (ret)
+ healer->rerun = 1;
+}
+
+int
+afr_shd_ta_unset_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
+{
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ gf_boolean_t need_xattrop = _gf_false;
+ void *pending_raw = NULL;
+ int *raw = NULL;
+ int pending[AFR_NUM_CHANGE_LOGS] = {
+ 0,
+ };
+ int i = 0;
+ int j = 0;
+ int val = 0;
+ int ret = -1;
+
+ priv = this->private;
+
+ xattr = dict_new();
+ if (!xattr) {
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
+ if (!raw) {
+ goto out;
+ }
+
+ ret = dict_get_ptr(*xdata, priv->pending_key[i], &pending_raw);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "Error getting value "
+ "of pending key %s",
+ priv->pending_key[i]);
+ GF_FREE(raw);
+ goto out;
+ }
+
+ memcpy(pending, pending_raw, sizeof(pending));
+ for (j = 0; j < AFR_NUM_CHANGE_LOGS; j++) {
+ val = ntoh32(pending[j]);
+ if (val) {
+ if (i == healer) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_THIN_ARB,
+ "I am "
+ "not the good shd. Skipping. "
+ "SHD = %d.",
+ healer);
+ ret = 0;
+ GF_FREE(raw);
+ goto out;
+ }
+ need_xattrop = _gf_true;
+ raw[j] = hton32(-val);
+ }
+ }
+
+ ret = dict_set_bin(xattr, priv->pending_key[i], raw,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret) {
+ GF_FREE(raw);
+ goto out;
+ }
+
+ if (need_xattrop)
+ break;
+ }
+
+ if (!need_xattrop) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Xattrop failed.");
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
+}
+
+void
+afr_shd_ta_check_and_unset_xattrs(xlator_t *this, loc_t *loc,
+ struct subvol_healer *healer,
+ dict_t *pre_crawl_xdata)
+{
+ int ret_lock = 0;
+ int ret = 0;
+ dict_t *post_crawl_xdata = NULL;
+
+ ret_lock = afr_ta_post_op_lock(this, loc);
+ if (ret_lock)
+ goto unref;
+
+ ret = _afr_shd_ta_get_xattrs(this, loc, &post_crawl_xdata);
+ if (ret)
+ goto unref;
+
+ if (!are_dicts_equal(pre_crawl_xdata, post_crawl_xdata, NULL, NULL)) {
+ ret = -1;
+ goto unref;
+ }
+
+ ret = afr_shd_ta_unset_xattrs(this, loc, &post_crawl_xdata, healer->subvol);
+
+unref:
+ if (post_crawl_xdata) {
+ dict_unref(post_crawl_xdata);
+ post_crawl_xdata = NULL;
+ }
+
+ if (ret || ret_lock)
+ healer->rerun = 1;
+
+ if (!ret_lock)
+ afr_ta_post_op_unlock(this, loc);
+}
+
+gf_boolean_t
+afr_bricks_available_for_heal(afr_private_t *priv)
+{
+ int up_children = 0;
+
+ up_children = __afr_get_up_children_count(priv);
+ if (up_children < 2) {
+ return _gf_false;
+ }
+ return _gf_true;
+}
+
+static gf_boolean_t
+afr_shd_ta_needs_heal(xlator_t *this, struct subvol_healer *healer)
+{
+ dict_t *xdata = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = -1;
+ int i = 0;
+ gf_boolean_t need_heal = _gf_false;
+
+ priv = this->private;
+
+ ret = afr_shd_fill_ta_loc(this, &loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ healer->rerun = 1;
+ goto out;
+ }
+
+ if (_afr_shd_ta_get_xattrs(this, &loc, &xdata)) {
+ healer->rerun = 1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (afr_ta_dict_contains_pending_xattr(xdata, priv, i)) {
+ need_heal = _gf_true;
+ break;
+ }
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+
+ return need_heal;
+}
+
+static int
+afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ afr_private_t *priv = healer->this->private;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ loc_t loc = {0};
+ int count = 0;
+ int i = 0;
+ int op_errno = 0;
+ struct iatt *iatt = NULL;
+ gf_boolean_t multiple_links = _gf_false;
+ unsigned char *gfid_present = alloca0(priv->child_count);
+ unsigned char *entry_present = alloca0(priv->child_count);
+ char *type = "file";
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ gf_msg_debug(healer->this->name, 0,
+ "Not all bricks are up. Skipping "
+ "cleanup of %s on %s",
+ entry->d_name, subvol->name);
+ ret = 0;
+ goto out;
+ }
+
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = gf_uuid_parse(entry->d_name, loc.gfid);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ gfid_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ if (iatt->ia_type == IA_IFDIR) {
+ type = "dir";
+ }
+
+ if (i == healer->subvol) {
+ if (local->replies[i].poststat.ia_nlink > 1) {
+ multiple_links = _gf_true;
+ }
+ }
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ /*We don't have complete view. Skip the entry*/
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ /*Inode is deleted from subvol*/
+ if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) {
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type,
+ priv->anon_inode_name, entry->d_name, subvol->name);
+ ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name,
+ iatt->ia_type);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = 0;
+ } else if (count > 1) {
+ loc_wipe(&loc);
+ loc.parent = inode_ref(parent->inode);
+ loc.name = entry->d_name;
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup,
+ &loc, NULL);
+ count = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ entry_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ /*We don't have complete view. Skip the entry*/
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (gfid_present[i] && !entry_present[i]) {
+ /*Entry is not anonymous on at least one subvol*/
+ gf_msg_debug(healer->this->name, 0,
+ "Valid entry present on %s "
+ "Skipping cleanup of %s on %s",
+ priv->children[i]->name, entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging %s %s/%s on all subvols", type, priv->anon_inode_name,
+ entry->d_name);
+ ret = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent,
+ entry->d_name, iatt->ia_type);
+ if (op_errno != ENOENT && op_errno != ESTALE) {
+ ret |= -op_errno;
+ }
+ }
+ }
+
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return ret;
+}
+
+static void
+afr_cleanup_anon_inode_dir(struct subvol_healer *healer)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_private_t *priv = healer->this->private;
+ loc_t loc = {0};
+
+ ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode);
+ if (ret)
+ goto out;
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer,
+ afr_shd_anon_inode_cleaner, NULL,
+ priv->shd.max_threads, priv->shd.wait_qlength);
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return;
+}
+
+void *
+afr_shd_index_healer(void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ dict_t *pre_crawl_xdata = NULL;
+ loc_t loc = {
+ 0,
+ };
+
+ healer = data;
+ THIS = this = healer->this;
+ priv = this->private;
+
+ for (;;) {
+ afr_shd_healer_wait(healer);
+
+ if (!afr_bricks_available_for_heal(priv))
+ continue;
+
+ ASSERT_LOCAL(this, healer);
+ priv->local[healer->subvol] = healer->local;
+
+ if (priv->thin_arbiter_count) {
+ if (afr_shd_ta_needs_heal(this, healer))
+ afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
+ }
+
+ do {
+ gf_msg_debug(this->name, 0, "starting index sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+
+ afr_shd_sweep_prepare(healer);
+
+ ret = afr_shd_index_sweep_all(healer);
+
+ afr_shd_sweep_done(healer);
+ /*
+ As long as at least one gfid was
+ healed, keep retrying. We may have
+ just healed a directory and thereby
+ created entries for other gfids which
+ could not be healed thus far.
+ */
+
+ gf_msg_debug(this->name, 0, "finished index sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+ /*
+ Give a pause before retrying to avoid a busy loop
+ in case the only entry in index is because of
+ an ongoing I/O.
+ */
+ sleep(1);
+ } while (ret > 0);
+
+ if (ret == 0) {
+ afr_cleanup_anon_inode_dir(healer);
+ }
+
+ if (ret == 0 && pre_crawl_xdata &&
+ !healer->crawl_event.heal_failed_count) {
+ afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
+ pre_crawl_xdata);
+ }
+
+ if (pre_crawl_xdata) {
+ dict_unref(pre_crawl_xdata);
+ pre_crawl_xdata = NULL;
+ }
+ }
+
+ return NULL;
+}
+
+void *
+afr_shd_full_healer(void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int run = 0;
+
+ healer = data;
+ THIS = this = healer->this;
+
+ for (;;) {
+ pthread_mutex_lock(&healer->mutex);
+ {
+ run = __afr_shd_healer_wait(healer);
+ if (!run)
+ healer->running = _gf_false;
+ }
+ pthread_mutex_unlock(&healer->mutex);
+
+ if (!run)
+ break;
+
+ ASSERT_LOCAL(this, healer);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "starting full sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+
+ afr_shd_sweep_prepare(healer);
+
+ afr_shd_full_sweep(healer, this->itable->root);
+
+ afr_shd_sweep_done(healer);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "finished full sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+ }
+
+ return NULL;
+}
+
+int
+afr_shd_healer_init(xlator_t *this, struct subvol_healer *healer)
+{
+ int ret = 0;
+
+ ret = pthread_mutex_init(&healer->mutex, NULL);
+ if (ret)
+ goto out;
+
+ ret = pthread_cond_init(&healer->cond, NULL);
+ if (ret)
+ goto out;
+
+ healer->this = this;
+ healer->running = _gf_false;
+ healer->rerun = _gf_false;
+ healer->local = _gf_false;
+out:
+ return ret;
+}
+
+int
+afr_shd_healer_spawn(xlator_t *this, struct subvol_healer *healer,
+ void *(threadfn)(void *))
+{
+ int ret = 0;
+
+ pthread_mutex_lock(&healer->mutex);
+ {
+ if (healer->running) {
+ pthread_cond_signal(&healer->cond);
+ } else {
+ ret = gf_thread_create(&healer->thread, NULL, threadfn, healer,
+ "shdheal");
+ if (ret)
+ goto unlock;
+ healer->running = 1;
+ }
+
+ healer->rerun = 1;
+ }
+unlock:
+ pthread_mutex_unlock(&healer->mutex);
+
+ return ret;
+}
+
+int
+afr_shd_full_healer_spawn(xlator_t *this, int subvol)
+{
+ return afr_shd_healer_spawn(this, NTH_FULL_HEALER(this, subvol),
+ afr_shd_full_healer);
+}
+
+int
+afr_shd_index_healer_spawn(xlator_t *this, int subvol)
+{
+ return afr_shd_healer_spawn(this, NTH_INDEX_HEALER(this, subvol),
+ afr_shd_index_healer);
+}
+
+int
+afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
+ crawl_event_t *crawl_event)
+{
+ int ret = 0;
+ uint64_t count = 0;
+ char key[128] = {0};
+ int keylen = 0;
+ char suffix[64] = {0};
+ int xl_id = 0;
+ uint64_t healed_count = 0;
+ uint64_t split_brain_count = 0;
+ uint64_t heal_failed_count = 0;
+ char *start_time_str = 0;
+ char *end_time_str = NULL;
+ char *crawl_type = NULL;
+ int progress = -1;
+ int child = -1;
+
+ child = crawl_event->child;
+ healed_count = crawl_event->healed_count;
+ split_brain_count = crawl_event->split_brain_count;
+ heal_failed_count = crawl_event->heal_failed_count;
+ crawl_type = crawl_event->crawl_type;
+
+ if (!crawl_event->start_time)
+ goto out;
+
+ start_time_str = gf_strdup(ctime(&crawl_event->start_time));
+
+ if (crawl_event->end_time)
+ end_time_str = gf_strdup(ctime(&crawl_event->end_time));
+
+ ret = dict_get_int32(output, this->name, &xl_id);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "xl does not have id");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_get_uint64(output, key, &count);
+
+ snprintf(suffix, sizeof(suffix), "%d-%d-%" PRIu64, xl_id, child, count);
+ snprintf(key, sizeof(key), "statistics_healed_cnt-%s", suffix);
+ ret = dict_set_uint64(output, key, healed_count);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_healed_count to output");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics_sb_cnt-%s", suffix);
+ ret = dict_set_uint64(output, key, split_brain_count);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_split_brain_count to output");
+ goto out;
+ }
+
+ keylen = snprintf(key, sizeof(key), "statistics_crawl_type-%s", suffix);
+ ret = dict_set_strn(output, key, keylen, crawl_type);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_crawl_type to output");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics_heal_failed_cnt-%s", suffix);
+ ret = dict_set_uint64(output, key, heal_failed_count);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_healed_failed_count to output");
+ goto out;
+ }
+
+ keylen = snprintf(key, sizeof(key), "statistics_strt_time-%s", suffix);
+ ret = dict_set_dynstrn(output, key, keylen, start_time_str);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_crawl_start_time to output");
+ goto out;
+ } else {
+ start_time_str = NULL;
+ }
+
+ if (!end_time_str)
+ progress = 1;
+ else
+ progress = 0;
+
+ keylen = snprintf(key, sizeof(key), "statistics_end_time-%s", suffix);
+ if (!end_time_str)
+ end_time_str = gf_strdup("Could not determine the end time");
+ ret = dict_set_dynstrn(output, key, keylen, end_time_str);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_crawl_end_time to output");
+ goto out;
+ } else {
+ end_time_str = NULL;
+ }
+
+ keylen = snprintf(key, sizeof(key), "statistics_inprogress-%s", suffix);
+
+ ret = dict_set_int32n(output, key, keylen, progress);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_inprogress to output");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_set_uint64(output, key, count + 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not increment the counter.");
+ goto out;
+ }
+out:
+ GF_FREE(start_time_str);
+ GF_FREE(end_time_str);
+ return ret;
+}
+
+int
+afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path,
+ struct timeval *tv)
+{
+ int ret = -1;
+ uint64_t count = 0;
+ char key[64] = {0};
+ int keylen = 0;
+ char xl_id_child_str[32] = {0};
+ int xl_id = 0;
+
+ ret = dict_get_int32(output, this->name, &xl_id);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "xl does not have id");
+ goto out;
+ }
+
+ snprintf(xl_id_child_str, sizeof(xl_id_child_str), "%d-%d", xl_id, child);
+ snprintf(key, sizeof(key), "%s-count", xl_id_child_str);
+ ret = dict_get_uint64(output, key, &count);
+
+ keylen = snprintf(key, sizeof(key), "%s-%" PRIu64, xl_id_child_str, count);
+ ret = dict_set_dynstrn(output, key, keylen, path);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Could not add to output", path);
+ goto out;
+ }
+
+ if (tv) {
+ snprintf(key, sizeof(key), "%s-%" PRIu64 "-time", xl_id_child_str,
+ count);
+ ret = dict_set_uint32(output, key, tv->tv_sec);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Could not set time", path);
+ goto out;
+ }
+ }
+
+ snprintf(key, sizeof(key), "%s-count", xl_id_child_str);
+
+ ret = dict_set_uint64(output, key, count + 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not increment count");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+afr_add_shd_event(circular_buffer_t *cb, void *data)
+{
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ shd_event_t *shd_event = NULL;
+ char *path = NULL;
+
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ shd_event = cb->data;
+
+ if (!shd->index_healers[shd_event->child].local)
+ return 0;
+
+ path = gf_strdup(shd_event->path);
+ if (!path)
+ return -ENOMEM;
+
+ afr_shd_dict_add_path(this, output, shd_event->child, path, &cb->tv);
+ return 0;
+}
+
+int
+afr_add_crawl_event(circular_buffer_t *cb, void *data)
+{
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ crawl_event_t *crawl_event = NULL;
+
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = cb->data;
+
+ if (!shd->index_healers[crawl_event->child].local)
+ return 0;
+
+ afr_shd_dict_add_crawl_event(this, output, crawl_event);
+
+ return 0;
+}
+
+int
+afr_selfheal_daemon_init(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ shd->index_healers = GF_CALLOC(sizeof(*shd->index_healers),
+ priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->index_healers)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ shd->index_healers[i].subvol = i;
+ ret = afr_shd_healer_init(this, &shd->index_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->full_healers = GF_CALLOC(sizeof(*shd->full_healers), priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->full_healers)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ shd->full_healers[i].subvol = i;
+ ret = afr_shd_healer_init(this, &shd->full_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->split_brain = eh_new(AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->split_brain)
+ goto out;
+
+ shd->statistics = GF_CALLOC(sizeof(eh_t *), priv->child_count,
+ gf_common_mt_eh_t);
+ if (!shd->statistics)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ shd->statistics[i] = eh_new(AFR_STATISTICS_HISTORY_SIZE, _gf_false,
+ afr_destroy_crawl_event_data);
+ if (!shd->statistics[i])
+ goto out;
+ shd->full_healers[i].crawl_event.child = i;
+ shd->full_healers[i].crawl_event.crawl_type = "FULL";
+ shd->index_healers[i].crawl_event.child = i;
+ shd->index_healers[i].crawl_event.crawl_type = "INDEX";
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv)
+{
+ int subvol = 0;
+
+ if (!priv->shd.iamshd)
+ return;
+ for (subvol = 0; subvol < priv->child_count; subvol++)
+ if (priv->child_up[subvol])
+ afr_shd_index_healer_spawn(this, subvol);
+
+ return;
+}
+
+int
+afr_shd_get_index_count(xlator_t *this, int i, uint64_t *count)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ loc_t rootloc = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+ int ret = -1;
+
+ priv = this->private;
+ subvol = priv->children[i];
+
+ rootloc.inode = inode_ref(this->itable->root);
+ gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr(subvol, &rootloc, &xattr, GF_XATTROP_INDEX_COUNT,
+ NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ ret = dict_get_uint64(xattr, GF_XATTROP_INDEX_COUNT, count);
+ if (ret)
+ goto out;
+
+ ret = 0;
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+ loc_wipe(&rootloc);
+
+ return ret;
+}
+
+int
+afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
+{
+ gf_xl_afr_op_t op = GF_SHD_OP_INVALID;
+ int ret = 0;
+ int xl_id = 0;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ struct subvol_healer *healer = NULL;
+ int i = 0;
+ char key[64];
+ int keylen = 0;
+ int this_name_len = 0;
+ int op_ret = 0;
+ uint64_t cnt = 0;
+
+#define AFR_SET_DICT_AND_LOG(name, output, key, keylen, dict_str, \
+ dict_str_len) \
+ { \
+ int ret; \
+ \
+ ret = dict_set_nstrn(output, key, keylen, dict_str, dict_str_len); \
+ if (ret) { \
+ gf_smsg(name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, \
+ "key=%s", key, "value=%s", dict_str, NULL); \
+ } \
+ }
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ ret = dict_get_int32_sizen(input, "xl-op", (int32_t *)&op);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "key=xl-op", NULL);
+ goto out;
+ }
+ this_name_len = strlen(this->name);
+ ret = dict_get_int32n(input, this->name, this_name_len, &xl_id);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "key=%s", this->name, NULL);
+ goto out;
+ }
+ ret = dict_set_int32n(output, this->name, this_name_len, xl_id);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "key=%s", this->name, NULL);
+ goto out;
+ }
+ switch (op) {
+ case GF_SHD_OP_HEAL_INDEX:
+ op_ret = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
+ op_ret = -1;
+ } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SLESS_THAN2_BRICKS_in_REP,
+ SLEN(SLESS_THAN2_BRICKS_in_REP));
+ op_ret = -1;
+ } else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_IS_REMOTE,
+ SLEN(SBRICK_IS_REMOTE));
+ } else {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SSTARTED_SELF_HEAL,
+ SLEN(SSTARTED_SELF_HEAL));
+
+ ret = afr_shd_index_healer_spawn(this, i);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+ }
+ }
+ }
+ break;
+ case GF_SHD_OP_HEAL_FULL:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->full_healers[i];
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
+ } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SLESS_THAN2_BRICKS_in_REP,
+ SLEN(SLESS_THAN2_BRICKS_in_REP));
+ } else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_IS_REMOTE,
+ SLEN(SBRICK_IS_REMOTE));
+ } else {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SSTARTED_SELF_HEAL,
+ SLEN(SSTARTED_SELF_HEAL));
+
+ ret = afr_shd_full_healer_spawn(this, i);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+ }
+ op_ret = 0;
+ }
+ }
+ break;
+ case GF_SHD_OP_INDEX_SUMMARY:
+ /* this case has been handled in glfs-heal.c */
+ break;
+ case GF_SHD_OP_SPLIT_BRAIN_FILES:
+ eh_dump(shd->split_brain, output, afr_add_shd_event);
+ break;
+ case GF_SHD_OP_STATISTICS:
+ for (i = 0; i < priv->child_count; i++) {
+ eh_dump(shd->statistics[i], output, afr_add_crawl_event);
+ ret = afr_shd_dict_add_crawl_event(
+ this, output, &shd->index_healers[i].crawl_event);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+ }
+
+ ret = afr_shd_dict_add_crawl_event(
+ this, output, &shd->full_healers[i].crawl_event);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+ }
+ }
+ break;
+ case GF_SHD_OP_STATISTICS_HEAL_COUNT:
+ case GF_SHD_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i]) {
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id,
+ i);
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
+ } else {
+ snprintf(key, sizeof(key), "%d-%d-hardlinks", xl_id, i);
+ ret = afr_shd_get_index_count(this, i, &cnt);
+ if (ret == 0) {
+ ret = dict_set_uint64(output, key, cnt);
+ }
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_DICT_SET_FAILED, NULL);
+ }
+ op_ret = 0;
+ }
+ }
+
+ break;
+
+ default:
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "op=%d",
+ op, NULL);
+ break;
+ }
+out:
+ dict_deln(output, this->name, this_name_len);
+ return op_ret;
+
+#undef AFR_SET_DICT_AND_LOG
+}
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
new file mode 100644
index 00000000000..18db728ea7b
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -0,0 +1,75 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _AFR_SELF_HEALD_H
+#define _AFR_SELF_HEALD_H
+
+#include <pthread.h>
+
+typedef struct {
+ char *path;
+ int child;
+} shd_event_t;
+
+typedef struct {
+ uint64_t healed_count;
+ uint64_t split_brain_count;
+ uint64_t heal_failed_count;
+
+ /* If start_time is 0, it means crawler is not in progress
+ and stats are not valid */
+ time_t start_time;
+ /* If start_time is NOT 0 and end_time is 0, it means
+ cralwer is in progress */
+ time_t end_time;
+ char *crawl_type;
+ int child;
+} crawl_event_t;
+
+struct subvol_healer {
+ xlator_t *this;
+ crawl_event_t crawl_event;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ pthread_t thread;
+ int subvol;
+ gf_boolean_t local;
+ gf_boolean_t running;
+ gf_boolean_t rerun;
+};
+
+typedef struct {
+ struct subvol_healer *index_healers;
+ struct subvol_healer *full_healers;
+
+ eh_t *split_brain;
+ eh_t **statistics;
+ int timeout;
+ uint32_t max_threads;
+ uint32_t wait_qlength;
+ uint32_t halo_max_latency_msec;
+ gf_boolean_t iamshd;
+ gf_boolean_t enabled;
+} afr_self_heald_t;
+
+int
+afr_selfheal_daemon_init(xlator_t *this);
+
+int
+afr_xl_op(xlator_t *this, dict_t *input, dict_t *output);
+
+int
+afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid,
+ char **path_p);
+
+int
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
+ ia_type_t type);
+#endif /* !_AFR_SELF_HEALD_H */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index ff9c88badd3..a51f79b1f43 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -1,1199 +1,2927 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include "dict.h"
-#include "byte-order.h"
-#include "common-utils.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/timer.h>
#include "afr.h"
#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-messages.h"
#include <signal.h>
+typedef enum {
+ AFR_TRANSACTION_PRE_OP,
+ AFR_TRANSACTION_POST_OP,
+} afr_xattrop_type_t;
-#define LOCKED_NO 0x0 /* no lock held */
-#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path
- of RENAME */
-#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */
+static void
+afr_lock_resume_shared(struct list_head *list);
+static void
+afr_post_op_handle_success(call_frame_t *frame, xlator_t *this);
-afr_fd_ctx_t *
-afr_fd_ctx_get (fd_t *fd, xlator_t *this)
-{
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = 0;
+static void
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno);
- ret = fd_ctx_get (fd, this, &ctx);
+void
+__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared);
- if (ret < 0)
- goto out;
+void
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this);
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+int
+afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this);
-out:
- return fd_ctx;
-}
+gf_boolean_t
+afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this);
+gf_boolean_t
+afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this);
+
+int
+afr_changelog_call_count(afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned char *failed_subvols,
+ unsigned int child_count);
+int
+afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume,
+ afr_xattrop_type_t op);
static void
-afr_pid_save (call_frame_t *frame)
-{
- afr_local_t * local = NULL;
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this);
- local = frame->local;
+static int
+afr_ta_post_op_do(void *opaque);
- local->saved_pid = frame->root->pid;
-}
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local);
+static int
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this);
static void
-afr_pid_restore (call_frame_t *frame)
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno);
+
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv)
+{
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ priv->ta_notify_dom_lock_offset = 0;
+}
+
+static void
+afr_ta_process_waitq(xlator_t *this)
{
- afr_local_t * local = NULL;
+ afr_local_t *entry = NULL;
+ afr_private_t *priv = this->private;
+ struct list_head waitq = {
+ 0,
+ };
+
+ INIT_LIST_HEAD(&waitq);
+ LOCK(&priv->lock);
+ list_splice_init(&priv->ta_waitq, &waitq);
+ UNLOCK(&priv->lock);
+ list_for_each_entry(entry, &waitq, ta_waitq)
+ {
+ afr_ta_decide_post_op_state(entry->transaction.frame, this);
+ }
+}
- local = frame->local;
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque)
+{
+ afr_ta_process_waitq(ta_frame->this);
+ STACK_DESTROY(ta_frame->root);
+ return 0;
+}
- frame->root->pid = local->saved_pid;
+int
+afr_release_notify_lock_for_ta(void *opaque)
+{
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ int ret = -1;
+
+ this = (xlator_t *)opaque;
+ priv = this->private;
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate loc for thin-arbiter.");
+ goto out;
+ }
+ flock.l_type = F_UNLCK;
+ flock.l_start = priv->ta_notify_dom_lock_offset;
+ flock.l_len = 1;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+ }
+
+ LOCK(&priv->lock);
+ {
+ afr_ta_locked_priv_invalidate(priv);
+ }
+ UNLOCK(&priv->lock);
+out:
+ loc_wipe(&loc);
+ return ret;
}
+void
+afr_zero_fill_stat(afr_local_t *local)
+{
+ if (!local)
+ return;
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION) {
+ gf_zero_fill_stat(&local->cont.inode_wfop.prebuf);
+ gf_zero_fill_stat(&local->cont.inode_wfop.postbuf);
+ } else if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ gf_zero_fill_stat(&local->cont.dir_fop.buf);
+ gf_zero_fill_stat(&local->cont.dir_fop.preparent);
+ gf_zero_fill_stat(&local->cont.dir_fop.postparent);
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION)
+ return;
+ gf_zero_fill_stat(&local->cont.dir_fop.prenewparent);
+ gf_zero_fill_stat(&local->cont.dir_fop.postnewparent);
+ }
+}
-static void
-__mark_all_pending (int32_t *pending[], int child_count,
- afr_transaction_type type)
+/* In case of errors afr needs to choose which xdata from lower xlators it needs
+ * to unwind with. The way it is done is by checking if there are
+ * any good subvols which failed. Give preference to errnos other than
+ * ENOTCONN even if the child is source */
+void
+afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1,
+ unsigned char *readable1, inode_t *inode2,
+ unsigned char *readable2)
{
- int i;
- int j;
+ int s = -1; /*selection*/
+ int i = 0;
+ unsigned char *readable = NULL;
+
+ if (local->xdata_rsp) {
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ }
+
+ readable = alloca0(priv->child_count * sizeof(*readable));
+ if (inode2 && readable2) { /*rename fop*/
+ AFR_INTERSECT(readable, readable1, readable2, priv->child_count);
+ } else {
+ memcpy(readable, readable1, sizeof(*readable) * priv->child_count);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+
+ if (local->replies[i].op_ret >= 0)
+ continue;
+
+ if (local->replies[i].op_errno == ENOTCONN)
+ continue;
+
+ /*Order is important in the following condition*/
+ if ((s < 0) || (!readable[s] && readable[i]))
+ s = i;
+ }
+
+ if (s != -1 && local->replies[s].xdata) {
+ local->xdata_rsp = dict_ref(local->replies[s].xdata);
+ } else if (s == -1) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+
+ if (local->replies[i].op_ret >= 0)
+ continue;
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
- pending[i][j] = hton32 (1);
+ if (!local->replies[i].xdata)
+ continue;
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ break;
}
+ }
}
+gf_boolean_t
+afr_needs_changelog_update(afr_local_t *local)
+{
+ if (local->transaction.type == AFR_DATA_TRANSACTION)
+ return _gf_true;
+ if (!local->optimistic_change_log)
+ return _gf_true;
+ return _gf_false;
+}
-static void
-__mark_child_dead (int32_t *pending[], int child_count, int child,
- afr_transaction_type type)
+gf_boolean_t
+afr_changelog_has_quorum(afr_local_t *local, xlator_t *this)
{
- int j;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ unsigned char *success_children = NULL;
- j = afr_index_for_transaction_type (type);
+ priv = this->private;
+ success_children = alloca0(priv->child_count);
- pending[child][j] = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i]) {
+ success_children[i] = 1;
+ }
+ }
+
+ if (afr_has_quorum(success_children, this, NULL)) {
+ return _gf_true;
+ }
+
+ return _gf_false;
}
+gf_boolean_t
+afr_is_write_subvol_valid(call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ uint64_t write_subvol = 0;
+ unsigned char *writable = NULL;
+ uint16_t datamap = 0;
+
+ local = frame->local;
+ priv = this->private;
+ writable = alloca0(priv->child_count);
+
+ write_subvol = afr_write_subvol_get(frame, this);
+ datamap = (write_subvol & 0x00000000ffff0000) >> 16;
+ for (i = 0; i < priv->child_count; i++) {
+ if (datamap & (1 << i))
+ writable[i] = 1;
+
+ if (writable[i] && !local->transaction.failed_subvols[i])
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
-static void
-__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
+int
+afr_transaction_fop(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ unsigned char *failed_subvols = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ failed_subvols = local->transaction.failed_subvols;
+ call_count = priv->child_count -
+ AFR_COUNT(failed_subvols, priv->child_count);
+ /* Fail if pre-op did not succeed on quorum no. of bricks. */
+ if (!afr_changelog_has_quorum(local, this) || !call_count) {
+ local->op_ret = -1;
+ /* local->op_errno is already captured in changelog cbk. */
+ afr_transaction_resume(frame, this);
+ return 0;
+ }
+
+ /* Fail if at least one writeable brick isn't up.*/
+ if (local->transaction.type == AFR_DATA_TRANSACTION &&
+ !afr_is_write_subvol_valid(frame, this)) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ afr_transaction_resume(frame, this);
+ return 0;
+ }
- local = frame->local;
+ local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] && !failed_subvols[i]) {
+ local->transaction.wind(frame, this, i);
- if (!local->fd)
- return;
+ if (!--call_count)
+ break;
+ }
+ }
- fd_ctx = afr_fd_ctx_get (local->fd, this);
+ return 0;
+}
- if (!fd_ctx)
- goto out;
+int
+afr_transaction_done(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t unwind = _gf_false;
+ afr_lock_t *lock = NULL;
+ afr_local_t *lock_local = NULL;
+
+ priv = this->private;
+ local = frame->local;
- LOCK (&local->fd->lock);
+ if (priv->consistent_metadata) {
+ LOCK(&frame->lock);
{
- if (local->transaction.type == AFR_DATA_TRANSACTION)
- fd_ctx->pre_op_done[child_index]++;
+ unwind = (local->transaction.main_frame != NULL);
}
- UNLOCK (&local->fd->lock);
-out:
- return;
+ UNLOCK(&frame->lock);
+ if (unwind) /*It definitely did post-op*/
+ afr_zero_fill_stat(local);
+ }
+
+ if (local->transaction.do_eager_unlock) {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ lock->acquired = _gf_false;
+ lock->release = _gf_false;
+ list_splice_init(&lock->frozen, &lock->waiting);
+ if (list_empty(&lock->waiting))
+ goto unlock;
+ lock_local = list_entry(lock->waiting.next, afr_local_t,
+ transaction.wait_list);
+ list_del_init(&lock_local->transaction.wait_list);
+ list_add(&lock_local->transaction.owner_list, &lock->owners);
+ }
+ unlock:
+ UNLOCK(&local->inode->lock);
+ }
+ if (lock_local) {
+ afr_lock(lock_local->transaction.frame,
+ lock_local->transaction.frame->this);
+ }
+ local->transaction.unwind(frame, this);
+
+ GF_ASSERT(list_empty(&local->transaction.owner_list));
+ GF_ASSERT(list_empty(&local->transaction.wait_list));
+ AFR_STACK_DESTROY(frame);
+
+ return 0;
}
+static void
+afr_lock_fail_shared(afr_local_t *local, struct list_head *list)
+{
+ afr_local_t *each = NULL;
+
+ while (!list_empty(list)) {
+ each = list_entry(list->next, afr_local_t, transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ each->op_ret = -1;
+ each->op_errno = local->op_errno;
+ afr_transaction_done(each->transaction.frame,
+ each->transaction.frame->this);
+ }
+}
static void
-__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
+afr_handle_lock_acquire_failure(afr_local_t *local)
{
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
- local = frame->local;
+ if (!local->transaction.eager_lock_on)
+ goto out;
- if (!local->fd)
- return;
+ lock = &local->inode_ctx->lock[local->transaction.type];
- fd_ctx = afr_fd_ctx_get (local->fd, this);
+ INIT_LIST_HEAD(&shared);
+ LOCK(&local->inode->lock);
+ {
+ lock->release = _gf_true;
+ list_splice_init(&lock->waiting, &shared);
+ }
+ UNLOCK(&local->inode->lock);
- if (!fd_ctx)
- goto out;
-
- LOCK (&local->fd->lock);
- {
- if (local->transaction.type == AFR_DATA_TRANSACTION)
- fd_ctx->pre_op_done[child_index]--;
- }
- UNLOCK (&local->fd->lock);
+ afr_lock_fail_shared(local, &shared);
+ local->transaction.do_eager_unlock = _gf_true;
out:
- return;
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ afr_unlock(local->transaction.frame, local->transaction.frame->this);
}
-
-static void
-__mark_down_children (int32_t *pending[], int child_count,
- unsigned char *child_up, afr_transaction_type type)
+call_frame_t *
+afr_transaction_detach_fop_frame(call_frame_t *frame)
{
- int i;
- int j;
+ afr_local_t *local = NULL;
+ call_frame_t *fop_frame = NULL;
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
+ local = frame->local;
- if (!child_up[i])
- pending[i][j] = 0;
- }
-}
+ afr_handle_inconsistent_fop(frame, &local->op_ret, &local->op_errno);
+ LOCK(&frame->lock);
+ {
+ fop_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK(&frame->lock);
+ return fop_frame;
+}
static void
-__mark_all_success (int32_t *pending[], int child_count,
- afr_transaction_type type)
+afr_save_lk_owner(call_frame_t *frame)
{
- int i;
- int j;
+ afr_local_t *local = NULL;
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
- pending[i][j] = hton32 (-1);
- }
-}
+ local = frame->local;
+ local->saved_lk_owner = frame->root->lk_owner;
+}
-static int
-__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
+static void
+afr_restore_lk_owner(call_frame_t *frame)
{
- int ret = 0;
-
- switch (type) {
- case AFR_DATA_TRANSACTION:
- if (priv->data_change_log)
- ret = 1;
+ afr_local_t *local = NULL;
- break;
+ local = frame->local;
- case AFR_METADATA_TRANSACTION:
- if (priv->metadata_change_log)
- ret = 1;
+ frame->root->lk_owner = local->saved_lk_owner;
+}
- break;
+void
+__mark_all_success(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i;
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- if (priv->entry_change_log)
- ret = 1;
+ local = frame->local;
+ priv = this->private;
- break;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ local->transaction.failed_subvols[i] = 0;
+ }
+}
- return ret;
+void
+afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_transaction_type type = -1;
+ dict_t *xdata = NULL;
+ int **matrix = NULL;
+ int idx = -1;
+ int i = 0;
+ int j = 0;
+
+ priv = this->private;
+ local = frame->local;
+ type = local->transaction.type;
+ idx = afr_index_for_transaction_type(type);
+ matrix = ALLOC_MATRIX(priv->child_count, int);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.changelog_xdata[i])
+ continue;
+ xdata = local->transaction.changelog_xdata[i];
+ afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
+ }
+
+ memset(local->transaction.pre_op_sources, 1, priv->child_count);
+
+ /*If lock or pre-op failed on a brick, it is not a source. */
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->transaction.pre_op_sources[i] = 0;
+ }
+
+ /* If brick is blamed by others, it is not a source. */
+ for (i = 0; i < priv->child_count; i++)
+ for (j = 0; j < priv->child_count; j++)
+ if (matrix[i][j] != 0)
+ local->transaction.pre_op_sources[j] = 0;
}
+void
+afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int pre_op_sources_count = 0;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_compute_pre_op_sources(frame, this);
+ pre_op_sources_count = AFR_COUNT(local->transaction.pre_op_sources,
+ priv->child_count);
+
+ /* If arbiter is the only source, do not proceed. */
+ if (pre_op_sources_count < 2 &&
+ local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ for (i = 0; i < priv->child_count; i++)
+ local->transaction.failed_subvols[i] = 1;
+ }
+
+ afr_transaction_fop(frame, this);
+
+ return;
+}
-static int
-__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
+int
+afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int ret = 0;
+ int failure_count = 0;
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ INIT_LIST_HEAD(&shared);
+ if (local->transaction.type == AFR_DATA_TRANSACTION &&
+ !local->transaction.inherited) {
+ ret = afr_write_subvol_set(frame, this);
+ if (ret) {
+ /*act as if operation failed on all subvols*/
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ for (i = 0; i < priv->child_count; i++)
+ local->transaction.failed_subvols[i] = 1;
+ }
+ }
+
+ if (local->pre_op_compat)
+ /* old mode, pre-op was done as afr_changelog_do()
+ just now, before OP */
+ afr_changelog_pre_op_update(frame, this);
+
+ if (!local->transaction.eager_lock_on || local->transaction.inherited)
+ goto fop;
+ failure_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (failure_count == priv->child_count) {
+ afr_handle_lock_acquire_failure(local);
+ return 0;
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ lock->acquired = _gf_true;
+ __afr_transaction_wake_shared(local, &shared);
+ }
+ UNLOCK(&local->inode->lock);
+ }
+
+fop:
+ /* Perform fops with the lk-owner from top xlator.
+ * Eg: lk-owner of posix-lk and flush should be same,
+ * flush cant clear the posix-lks without that lk-owner.
+ */
+ afr_save_lk_owner(frame);
+ frame->root->lk_owner = local->transaction.main_frame->root->lk_owner;
+
+ if (priv->arbiter_count == 1) {
+ afr_txn_arbitrate_fop(frame, this);
+ } else {
+ afr_transaction_fop(frame, this);
+ }
+
+ afr_lock_resume_shared(&shared);
+ return 0;
+}
- int op_ret = 0;
+int
+afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)
+{
+ int i = 0;
+ int ret = 0;
- priv = this->private;
- local = frame->local;
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], pending[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ /* 3 = data+metadata+entry */
- if (__changelog_enabled (priv, local->transaction.type)) {
- switch (local->op) {
+ if (ret)
+ break;
+ }
- case GF_FOP_WRITE:
- case GF_FOP_FTRUNCATE:
- op_ret = 1;
- break;
+ return ret;
+}
- case GF_FOP_FLUSH:
- op_ret = 0;
- break;
+static void
+afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ unsigned int inmem_count = 0;
+ unsigned int onwire_count = 0;
+ gf_boolean_t release = _gf_false;
+
+ LOCK(&priv->lock);
+ {
+ /*Once we get notify lock release upcall notification,
+ if any of the fop state counters are non-zero, we will
+ not release the lock.
+ */
+ onwire_count = priv->ta_on_wire_txn_count;
+ inmem_count = priv->ta_in_mem_txn_count;
+ switch (fop_state) {
+ case TA_GET_INFO_FROM_TA_FILE:
+ onwire_count = --priv->ta_on_wire_txn_count;
+ break;
+ case TA_INFO_IN_MEMORY_SUCCESS:
+ case TA_INFO_IN_MEMORY_FAILED:
+ inmem_count = --priv->ta_in_mem_txn_count;
+ break;
+ case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+ GF_ASSERT(0);
+ break;
+ case TA_SUCCESS:
+ break;
+ }
+ release = priv->release_ta_notify_dom_lock;
+ }
+ UNLOCK(&priv->lock);
- default:
- op_ret = 1;
- }
- }
+ if (inmem_count != 0 || release == _gf_false || onwire_count != 0)
+ return;
- return op_ret;
+ afr_ta_lock_release_synctask(this);
}
+static void
+afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *entry = NULL;
+ int bad_child = AFR_CHILD_UNKNOWN;
+
+ struct list_head onwireq = {
+ 0,
+ };
+ INIT_LIST_HEAD(&onwireq);
+
+ LOCK(&priv->lock);
+ {
+ bad_child = priv->ta_bad_child_index;
+ if (bad_child == AFR_CHILD_UNKNOWN) {
+ /*The previous on-wire ta_post_op was a failure. Just dequeue
+ *one element to wind on-wire again. */
+ entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq);
+ list_del_init(&entry->ta_onwireq);
+ } else {
+ /* Prepare to process all fops based on bad_child_index. */
+ list_splice_init(&priv->ta_onwireq, &onwireq);
+ }
+ }
+ UNLOCK(&priv->lock);
+
+ if (entry) {
+ afr_ta_post_op_synctask(this, entry);
+ return;
+ } else {
+ while (!list_empty(&onwireq)) {
+ entry = list_entry(onwireq.next, afr_local_t, ta_onwireq);
+ list_del_init(&entry->ta_onwireq);
+ if (entry->ta_failed_subvol == bad_child) {
+ afr_post_op_handle_success(entry->transaction.frame, this);
+ } else {
+ afr_post_op_handle_failure(entry->transaction.frame, this, EIO);
+ }
+ }
+ }
+}
-static int
-__changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
+int
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+ int_lock = &local->internal_lock;
+
+ if (priv->thin_arbiter_count) {
+ /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */
+ afr_ta_dom_lock_check_and_release(local->fop_state, this);
+ }
+
+ /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */
+ if (!afr_changelog_has_quorum(local, this)) {
+ local->op_ret = -1;
+ /*local->op_errno is already captured in changelog cbk*/
+ }
+
+ if (local->transaction.resume_stub) {
+ call_resume(local->transaction.resume_stub);
+ local->transaction.resume_stub = NULL;
+ }
+
+ int_lock->lock_cbk = afr_transaction_done;
+ afr_unlock(frame, this);
+
+ return 0;
+}
- int op_ret = 0;
- afr_transaction_type type = -1;
+static void
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno)
+{
+ afr_local_t *local = frame->local;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- priv = this->private;
- local = frame->local;
- type = local->transaction.type;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB,
+ "Failing %s for gfid %s. Fop state is:%d", gf_fop_list[local->op],
+ uuid_utoa(local->inode->gfid), local->fop_state);
- if (__changelog_enabled (priv, type)) {
- switch (local->op) {
+ afr_changelog_post_op_done(frame, this);
+}
- case GF_FOP_WRITE:
- case GF_FOP_FTRUNCATE:
- op_ret = 1;
- break;
+unsigned char *
+afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock)
+{
+ /*Because same set of subvols participate in all lockee
+ * entities*/
+ return int_lock->lockee[0].locked_nodes;
+}
- case GF_FOP_FLUSH:
- op_ret = 0;
- break;
+int
+afr_changelog_call_count(afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned char *failed_subvols,
+ unsigned int child_count)
+{
+ int i = 0;
+ int call_count = 0;
- default:
- op_ret = 1;
- }
+ for (i = 0; i < child_count; i++) {
+ if (pre_op_subvols[i] && !failed_subvols[i]) {
+ call_count++;
}
+ }
- return op_ret;
-}
+ if (type == AFR_ENTRY_RENAME_TRANSACTION)
+ call_count *= 2;
+ return call_count;
+}
-static int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending)
+gf_boolean_t
+afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this)
{
- int i;
- int ret = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (priv->thin_arbiter_count) {
+ /* We need to perform post-op even if 1 data brick was down
+ * before the txn started.*/
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count))
+ return _gf_false;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ local->transaction.failed_subvols[i])
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_static_bin (xattr, priv->pending_key[i],
- pending[i], 3 * sizeof (int32_t));
- /* 3 = data+metadata+entry */
+void
+afr_handle_symmetric_errors(call_frame_t *frame, xlator_t *this)
+{
+ if (afr_is_symmetric_error(frame, this))
+ __mark_all_success(frame, this);
+}
- if (ret < 0)
- goto out;
+gf_boolean_t
+afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame)
+{
+ unsigned int quorum_count = 0;
+ afr_private_t *priv = NULL;
+ unsigned int up_children_count = 0;
+
+ priv = this->private;
+ up_children_count = AFR_COUNT(subvols, priv->child_count);
+
+ if (afr_lookup_has_quorum(frame, up_children_count))
+ return _gf_true;
+
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ /*
+ * Special case for auto-quorum with an even number of nodes.
+ *
+ * A replica set with even count N can only handle the same
+ * number of failures as odd N-1 before losing "vanilla"
+ * quorum, and the probability of more simultaneous failures is
+ * actually higher. For example, with a 1% chance of failure
+ * we'd have a 0.03% chance of two simultaneous failures with
+ * N=3 but a 0.06% chance with N=4. However, the special case
+ * is necessary for N=2 because there's no real quorum in that
+ * case (i.e. can't normally survive *any* failures). In that
+ * case, we treat the first node as a tie-breaker, allowing
+ * quorum to be retained in some cases while still honoring the
+ * all-important constraint that there can not simultaneously
+ * be two partitioned sets of nodes each believing they have
+ * quorum. Of two equally sized sets, the one without that
+ * first node will lose.
+ *
+ * It turns out that the special case is beneficial for higher
+ * values of N as well. Continuing the example above, the
+ * probability of losing quorum with N=4 and this type of
+ * quorum is (very) slightly lower than with N=3 and vanilla
+ * quorum. The difference becomes even more pronounced with
+ * higher N. Therefore, even though such replica counts are
+ * unlikely to be seen in practice, we might as well use the
+ * "special" quorum then as well.
+ */
+ if ((up_children_count * 2) == priv->child_count) {
+ return subvols[0];
}
+ }
-out:
- return ret;
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ quorum_count = priv->child_count / 2 + 1;
+ } else {
+ quorum_count = priv->quorum_count;
+ }
+
+ if (up_children_count >= quorum_count)
+ return _gf_true;
+
+ return _gf_false;
}
+static gf_boolean_t
+afr_has_fop_quorum(call_frame_t *frame)
+{
+ xlator_t *this = frame->this;
+ afr_local_t *local = frame->local;
+ unsigned char *locked_nodes = NULL;
-static int
-afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
- afr_transaction_type type)
+ locked_nodes = afr_locked_nodes_get(local->transaction.type,
+ &local->internal_lock);
+ return afr_has_quorum(locked_nodes, this, NULL);
+}
+
+static gf_boolean_t
+afr_has_fop_cbk_quorum(call_frame_t *frame)
{
- int i;
- int ret = 0;
- int *arr = NULL;
- int index = 0;
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ unsigned char *success = alloca0(priv->child_count);
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i])
+ if (!local->transaction.failed_subvols[i])
+ success[i] = 1;
+ }
+
+ return afr_has_quorum(success, this, NULL);
+}
- index = afr_index_for_transaction_type (type);
+gf_boolean_t
+afr_need_dirty_marking(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = NULL;
+ gf_boolean_t need_dirty = _gf_false;
- for (i = 0; i < priv->child_count; i++) {
- arr = GF_CALLOC (3 * sizeof (int32_t), priv->child_count,
- gf_afr_mt_char);
- if (!arr) {
- ret = -1;
- goto out;
- }
+ local = frame->local;
- memcpy (arr, pending[i], 3 * sizeof (int32_t));
+ if (!priv->quorum_count || !local->optimistic_change_log)
+ return _gf_false;
- arr[index]++;
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION)
+ return _gf_false;
- ret = dict_set_bin (xattr, priv->pending_key[i],
- arr, 3 * sizeof (int32_t));
- /* 3 = data+metadata+entry */
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) ==
+ priv->child_count)
+ return _gf_false;
- if (ret < 0)
- goto out;
- }
+ if (!afr_has_fop_cbk_quorum(frame))
+ need_dirty = _gf_true;
-out:
- return ret;
+ return need_dirty;
}
-
-int
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
+void
+afr_handle_quorum(call_frame_t *frame, xlator_t *this)
{
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ const char *file = NULL;
+ uuid_t gfid = {0};
+
+ local = frame->local;
+ priv = frame->this->private;
- switch (type) {
- case AFR_DATA_TRANSACTION:
- ret = priv->child_count;
- break;
+ if (priv->quorum_count == 0)
+ return;
- case AFR_METADATA_TRANSACTION:
- ret = priv->child_count;
- break;
+ /* If the fop already failed return right away to preserve errno */
+ if (local->op_ret == -1)
+ return;
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- ret = priv->child_count;
- break;
- }
+ /*
+ * Network split may happen just after the fops are unwound, so check
+ * if the fop succeeded in a way it still follows quorum. If it doesn't,
+ * mark the fop as failure, mark the changelogs so it reflects that
+ * failure.
+ *
+ * Scenario:
+ * There are 3 mounts on 3 machines(node1, node2, node3) all writing to
+ * single file. Network split happened in a way that node1 can't see
+ * node2, node3. Node2, node3 both of them can't see node1. Now at the
+ * time of sending write all the bricks are up. Just after write fop is
+ * wound on node1, network split happens. Node1 thinks write fop failed
+ * on node2, node3 so marks pending changelog for those 2 extended
+ * attributes on node1. Node2, node3 thinks writes failed on node1 so
+ * they mark pending changelog for node1. When the network is stable
+ * again the file already is in split-brain. These checks prevent
+ * marking pending changelog on other subvolumes if the fop doesn't
+ * succeed in a way it is still following quorum. So with this fix what
+ * is happening is, node1 will have all pending changelog(FOOL) because
+ * the write succeeded only on node1 but failed on node2, node3 so
+ * instead of marking pending changelogs on node2, node3 it just treats
+ * the fop as failure and goes into DIRTY state. Where as node2, node3
+ * say they are sources and have pending changelog to node1 so there is
+ * no split-brain with the fix. The problem is eliminated completely.
+ */
+
+ if (afr_has_fop_cbk_quorum(frame))
+ return;
- return ret;
+ if (afr_need_dirty_marking(frame, this))
+ goto set_response;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i])
+ afr_transaction_fop_failed(frame, frame->this, i);
+ }
+
+set_response:
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ if (local->op_errno == 0)
+ local->op_errno = afr_quorum_errno(priv);
+
+ if (local->fd) {
+ gf_uuid_copy(gfid, local->fd->inode->gfid);
+ file = uuid_utoa(gfid);
+ } else {
+ loc_path(&local->loc, local->loc.name);
+ file = local->loc.path;
+ }
+
+ gf_msg(frame->this->name, GF_LOG_WARNING, local->op_errno,
+ AFR_MSG_QUORUM_FAIL, "%s: Failing %s as quorum is not met", file,
+ gf_fop_list[local->op]);
+
+ switch (local->transaction.type) {
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ afr_pick_error_xdata(local, priv, local->parent, local->readable,
+ local->parent2, local->readable2);
+ break;
+ default:
+ afr_pick_error_xdata(local, priv, local->inode, local->readable,
+ NULL, NULL);
+ break;
+ }
}
-/* {{{ pending */
+int
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ loc->parent = inode_ref(priv->root_inode);
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+ loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) {
+ /* Except afr_ta_id_file_check() which is path based, all other gluster
+ * FOPS need gfid.*/
+ return -EINVAL;
+ }
+ gf_uuid_copy(loc->gfid, priv->ta_gfid);
+ loc->inode = inode_new(loc->parent->table);
+ if (!loc->inode) {
+ loc_wipe(loc);
+ return -ENOMEM;
+ }
+ return 0;
+}
-int32_t
-afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
+static int
+afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int child_index = 0;
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *txn_frame = NULL;
+ afr_ta_fop_state_t fop_state;
+
+ local = (afr_local_t *)opaque;
+ fop_state = local->fop_state;
+ txn_frame = local->transaction.frame;
+ this = frame->this;
+
+ if (ret == 0) {
+ /*Mark pending xattrs on the up data brick.*/
+ afr_post_op_handle_success(txn_frame, this);
+ } else {
+ afr_post_op_handle_failure(txn_frame, this, -ret);
+ }
+
+ STACK_DESTROY(frame->root);
+ afr_ta_process_onwireq(fop_state, this);
+
+ return 0;
+}
- int call_count = -1;
+int **
+afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending,
+ dict_t *xattr, afr_local_t *local)
+{
+ int **changelog = NULL;
+ int idx = 0;
+ int ret = 0;
+ int i;
+
+ if (local->is_new_entry == _gf_true) {
+ changelog = afr_mark_pending_changelog(priv, pending, xattr,
+ local->cont.dir_fop.buf.ia_type);
+ } else {
+ idx = afr_index_for_transaction_type(local->transaction.type);
+ changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!changelog) {
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ changelog[i][idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict(priv, xattr, changelog);
+ if (ret < 0) {
+ afr_matrix_cleanup(changelog, priv->child_count);
+ return NULL;
+ }
+ }
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
+out:
+ return changelog;
+}
- child_index = (long) cookie;
+static void
+afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local,
+ gf_boolean_t *valid)
+{
+ if (priv->ta_event_gen > local->ta_event_gen) {
+ /* We can't trust the ta's response anymore.*/
+ afr_ta_locked_priv_invalidate(priv);
+ *valid = _gf_false;
+ return;
+ }
+ return;
+}
- if (op_ret == 1) {
+static int
+afr_ta_post_op_do(void *opaque)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t *this = NULL;
+ dict_t *xattr = NULL;
+ unsigned char *pending = NULL;
+ int **changelog = NULL;
+ int failed_subvol = -1;
+ int success_subvol = -1;
+ loc_t loc = {
+ 0,
+ };
+ int i = 0;
+ int ret = 0;
+ gf_boolean_t valid = _gf_true;
+
+ local = (afr_local_t *)opaque;
+ this = local->transaction.frame->this;
+ priv = this->private;
+
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate loc for thin-arbiter.");
+ goto out;
+ }
+
+ xattr = dict_new();
+ if (!xattr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pending = alloca0(priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i]) {
+ pending[i] = 1;
+ failed_subvol = i;
+ } else {
+ success_subvol = i;
}
-
- if (op_ret == 0) {
- __mark_pre_op_undone_on_fd (frame, this, child_index);
+ }
+
+ changelog = afr_set_changelog_xattr(priv, pending, xattr, local);
+
+ if (!changelog) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = afr_ta_post_op_lock(this, &loc);
+ if (ret)
+ goto out;
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s failed for gfid %s.",
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+ uuid_utoa(local->inode->gfid));
+ }
+ LOCK(&priv->lock);
+ {
+ if (ret == 0) {
+ priv->ta_bad_child_index = failed_subvol;
+ } else if (ret == -EINVAL) {
+ priv->ta_bad_child_index = success_subvol;
+ ret = -EIO; /* TA failed the fop. Return EIO to application. */
}
- LOCK (&frame->lock);
- {
- call_count = --local->call_count;
- }
- UNLOCK (&frame->lock);
+ afr_ta_locked_xattrop_validate(priv, local, &valid);
+ }
+ UNLOCK(&priv->lock);
+ if (valid == _gf_false) {
+ gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s for gfid %s invalidated due "
+ "to event-gen mismatch.",
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+ uuid_utoa(local->inode->gfid));
+ ret = -EIO;
+ }
+
+ afr_ta_post_op_unlock(this, &loc);
+out:
+ if (xattr)
+ dict_unref(xattr);
- if (call_count == 0) {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- }
- }
+ if (changelog)
+ afr_matrix_cleanup(changelog, priv->child_count);
- return 0;
-}
+ loc_wipe(&loc);
+ return ret;
+}
-void
-afr_update_read_child (call_frame_t *frame, xlator_t *this, inode_t *inode,
- afr_transaction_type type)
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local)
{
- int curr_read_child = -1;
- int new_read_child = -1;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int **pending = NULL;
- int idx = 0;
+ call_frame_t *ta_frame = NULL;
+ int ret = 0;
+
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ goto err;
+ }
+ ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done,
+ ta_frame, local);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to launch post-op on thin arbiter for gfid %s",
+ uuid_utoa(local->inode->gfid));
+ STACK_DESTROY(ta_frame->root);
+ goto err;
+ }
+
+ return ret;
+err:
+ afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM);
+ return ret;
+}
- idx = afr_index_for_transaction_type (type);
+static void
+afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local,
+ int *on_wire_count)
+{
+ LOCK(&priv->lock);
+ {
+ if (priv->release_ta_notify_dom_lock == _gf_true) {
+ /* Put the fop in waitq until notify dom lock is released.*/
+ local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL;
+ list_add_tail(&local->ta_waitq, &priv->ta_waitq);
+ } else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) {
+ /* Post-op on thin-arbiter to decide success/failure. */
+ local->fop_state = TA_GET_INFO_FROM_TA_FILE;
+ *on_wire_count = ++priv->ta_on_wire_txn_count;
+ if (*on_wire_count > 1) {
+ /*Avoid sending multiple on-wire post-ops on TA*/
+ list_add_tail(&local->ta_onwireq, &priv->ta_onwireq);
+ }
+ } else if (local->ta_failed_subvol == priv->ta_bad_child_index) {
+ /* Post-op on TA not needed as the fop failed on the in-memory bad
+ * brick. Just mark pending xattrs on the good data brick.*/
+ local->fop_state = TA_INFO_IN_MEMORY_SUCCESS;
+ priv->ta_in_mem_txn_count++;
+ } else {
+ /* Post-op on TA not needed as the fop succeeded only on the
+ * in-memory bad data brick and not the good one. Fail the fop.*/
+ local->fop_state = TA_INFO_IN_MEMORY_FAILED;
+ priv->ta_in_mem_txn_count++;
+ }
+ }
+ UNLOCK(&priv->lock);
+}
- priv = this->private;
- local = frame->local;
- curr_read_child = afr_read_child (this, inode);
- pending = local->pending;
+static void
+afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local)
+{
+ int i = 0;
- if (pending[curr_read_child][idx] != 0)
- return;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i]) {
+ local->ta_failed_subvol = i;
+ break;
+ }
+ }
+}
- /* need to set new read_child */
- for (new_read_child = 0; new_read_child < priv->child_count;
- new_read_child++) {
+static void
+afr_post_op_handle_success(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
- if (!priv->child_up[new_read_child])
- /* child is down */
- continue;
+ local = frame->local;
+ if (local->is_new_entry == _gf_true) {
+ afr_mark_new_entry_changelog(frame, this);
+ }
+ afr_changelog_post_op_do(frame, this);
- if (pending[new_read_child][idx] == 0)
- /* op just failed */
- continue;
+ return;
+}
- break;
- }
+static void
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno)
+{
+ afr_changelog_post_op_fail(frame, this, op_errno);
- if (new_read_child == priv->child_count)
- /* all children uneligible. leave as-is */
- return;
+ return;
+}
- afr_set_read_child (this, inode, new_read_child);
+static void
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int on_wire_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_ta_set_fop_state(priv, local, &on_wire_count);
+
+ switch (local->fop_state) {
+ case TA_GET_INFO_FROM_TA_FILE:
+ if (on_wire_count == 1)
+ afr_ta_post_op_synctask(this, local);
+ /*else, fop is queued in ta_onwireq.*/
+ break;
+ case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+ /*Post releasing the notify lock, we will act on this queue*/
+ break;
+ case TA_INFO_IN_MEMORY_SUCCESS:
+ afr_post_op_handle_success(frame, this);
+ break;
+ case TA_INFO_IN_MEMORY_FAILED:
+ afr_post_op_handle_failure(frame, this, EIO);
+ break;
+ default:
+ break;
+ }
+ return;
}
+static void
+afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+
+ afr_ta_fill_failed_subvol(priv, local);
+ gf_msg_debug(this->name, 0,
+ "Fop failed on data brick (%s) for gfid=%s. "
+ "ta info needed to decide fop result.",
+ priv->children[local->ta_failed_subvol]->name,
+ uuid_utoa(local->inode->gfid));
+ afr_ta_decide_post_op_state(frame, this);
+}
-int
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
+void
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = this->private;
- afr_internal_lock_t *int_lock = NULL;
- int ret = 0;
- int i = 0;
- int call_count = 0;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = NULL;
+ dict_t *xattr = NULL;
+ int i = 0;
+ int ret = 0;
+ int idx = 0;
+ int nothing_failed = 1;
+ gf_boolean_t need_undirty = _gf_false;
+
+ afr_handle_quorum(frame, this);
+ local = frame->local;
+ idx = afr_index_for_transaction_type(local->transaction.type);
+
+ xattr = dict_new();
+ if (!xattr) {
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
+ goto out;
+ }
+
+ nothing_failed = afr_txn_nothing_failed(frame, this);
+
+ if (afr_changelog_pre_op_uninherit(frame, this))
+ need_undirty = _gf_false;
+ else
+ need_undirty = _gf_true;
+
+ if (local->op_ret < 0 && !nothing_failed) {
+ if (afr_need_dirty_marking(frame, this)) {
+ local->dirty[idx] = hton32(1);
+ goto set_dirty;
+ }
- afr_local_t * local = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- dict_t **xattr = NULL;
- int piggyback = 0;
- int index = 0;
- int nothing_failed = 1;
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
+
+ if (nothing_failed && !need_undirty) {
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
+
+ if (local->transaction.in_flight_sb) {
+ afr_changelog_post_op_fail(frame, this,
+ local->transaction.in_flight_sb_errno);
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->pending[i][idx] = hton32(1);
+ }
+
+ ret = afr_set_pending_dict(priv, xattr, local->pending);
+ if (ret < 0) {
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
+ goto out;
+ }
+
+ if (need_undirty)
+ local->dirty[idx] = hton32(-1);
+ else
+ local->dirty[idx] = hton32(0);
+
+set_dirty:
+ ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
+ goto out;
+ }
+
+ afr_changelog_do(frame, this, xattr, afr_changelog_post_op_done,
+ AFR_TRANSACTION_POST_OP);
+out:
+ if (xattr)
+ dict_unref(xattr);
- local = frame->local;
- int_lock = &local->internal_lock;
+ return;
+}
- __mark_down_children (local->pending, priv->child_count,
- local->child_up, local->transaction.type);
+static int
+afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int failed_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (priv->thin_arbiter_count) {
+ failed_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (failed_count == 1) {
+ afr_handle_failure_using_thin_arbiter(frame, this);
+ return 0;
+ } else {
+ /* Txn either succeeded or failed on both data bricks. Let
+ * post_op_do handle it as the case might be. */
+ }
+ }
- if (local->fd)
- afr_update_read_child (frame, this, local->fd->inode,
- local->transaction.type);
+ afr_changelog_post_op_do(frame, this);
+ return 0;
+}
- xattr = alloca (priv->child_count * sizeof (*xattr));
- memset (xattr, 0, (priv->child_count * sizeof (*xattr)));
- for (i = 0; i < priv->child_count; i++) {
- xattr[i] = get_new_dict ();
- dict_ref (xattr[i]);
+gf_boolean_t
+afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
+
+ local = frame->local;
+ priv = this->private;
+ ctx = local->inode_ctx;
+
+ type = afr_index_for_transaction_type(local->transaction.type);
+ if (type != AFR_DATA_TRANSACTION)
+ return !local->transaction.dirtied;
+
+ if (local->transaction.no_uninherit)
+ return _gf_false;
+
+ /* This function must be idempotent. So check if we
+ were called before and return the same answer again.
+
+ It is important to keep this function idempotent for
+ the call in afr_changelog_post_op_safe() to not have
+ side effects on the call from afr_changelog_post_op_now()
+ */
+ if (local->transaction.uninherit_done)
+ return local->transaction.uninherit_value;
+
+ LOCK(&local->inode->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] != ctx->pre_op_done[type][i]) {
+ ret = !local->transaction.dirtied;
+ goto unlock;
+ }
}
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- call_count *= 2;
+ if (ctx->inherited[type]) {
+ ret = _gf_true;
+ ctx->inherited[type]--;
+ } else if (ctx->on_disk[type]) {
+ ret = _gf_false;
+ ctx->on_disk[type]--;
+ } else {
+ /* ASSERT */
+ ret = _gf_false;
}
- local->call_count = call_count;
+ if (!ctx->inherited[type] && !ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ ctx->pre_op_done[type][i] = 0;
+ }
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
- if (local->fd)
- fdctx = afr_fd_ctx_get (local->fd, this);
+ local->transaction.uninherit_done = _gf_true;
+ local->transaction.uninherit_value = ret;
- if (call_count == 0) {
- /* no child is up */
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
- }
+ return ret;
+}
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- return 0;
- }
+gf_boolean_t
+afr_changelog_pre_op_inherit(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return _gf_false;
+
+ type = afr_index_for_transaction_type(local->transaction.type);
+
+ LOCK(&local->inode->lock);
+ {
+ if (!local->inode_ctx->on_disk[type]) {
+ /* nothing to inherit yet */
+ ret = _gf_false;
+ goto unlock;
+ }
- /* check if something has failed, to handle piggybacking */
- nothing_failed = 1;
- index = afr_index_for_transaction_type (local->transaction.type);
for (i = 0; i < priv->child_count; i++) {
- if (local->pending[i][index] == 0) {
- nothing_failed = 0;
- break;
- }
+ if (local->transaction.pre_op[i] !=
+ local->inode_ctx->pre_op_done[type][i]) {
+ /* either inherit exactly, or don't */
+ ret = _gf_false;
+ goto unlock;
+ }
}
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
+ local->inode_ctx->inherited[type]++;
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
-
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- {
- if (!fdctx) {
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- break;
- }
-
- LOCK (&local->fd->lock);
- {
- piggyback = 0;
- if (fdctx->pre_op_piggyback[i]) {
- fdctx->pre_op_piggyback[i]--;
- piggyback = 1;
- }
- }
- UNLOCK (&local->fd->lock);
-
- if (piggyback && !nothing_failed)
- ret = afr_set_piggyback_dict (priv, xattr[i],
- local->pending,
- local->transaction.type);
-
- if (nothing_failed && piggyback) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i]);
- } else {
- STACK_WIND_COOKIE (frame,
- afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- }
- break;
- case AFR_METADATA_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
+ ret = _gf_true;
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- call_count--;
- }
+ local->transaction.inherited = _gf_true;
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
+ return ret;
+}
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
+gf_boolean_t
+afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
+ local = frame->local;
+ priv = this->private;
- /* fall through */
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+ return _gf_false;
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
- }
+ if (local->transaction.inherited)
+ /* was already inherited in afr_changelog_pre_op */
+ return _gf_false;
- if (!--call_count)
- break;
- }
+ if (!local->transaction.dirtied)
+ return _gf_false;
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
+ if (!afr_txn_nothing_failed(frame, this))
+ return _gf_false;
+
+ type = afr_index_for_transaction_type(local->transaction.type);
+
+ ret = _gf_false;
+
+ LOCK(&local->inode->lock);
+ {
+ if (!local->inode_ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ local->inode_ctx->pre_op_done[type][i] =
+ (!local->transaction.failed_subvols[i]);
+ } else {
+ for (i = 0; i < priv->child_count; i++)
+ if (local->inode_ctx->pre_op_done[type][i] !=
+ (!local->transaction.failed_subvols[i])) {
+ local->transaction.no_uninherit = 1;
+ goto unlock;
+ }
}
+ local->inode_ctx->on_disk[type]++;
- return 0;
-}
+ ret = _gf_true;
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
+ return ret;
+}
-int32_t
-afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
+int
+afr_changelog_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = this->private;
- loc_t * loc = NULL;
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = -1;
- int call_count = -1;
- int child_index = (long) cookie;
+ local = frame->local;
+ child_index = (long)cookie;
- local = frame->local;
- loc = &local->loc;
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
- LOCK (&frame->lock);
- {
- if (op_ret == 1) {
- /* special op_ret for piggyback */
- }
+ if (xattr)
+ local->transaction.changelog_xdata[child_index] = dict_ref(xattr);
+
+ call_count = afr_frame_return(frame);
+
+ if (call_count == 0) {
+ local->transaction.changelog_resume(frame, this);
+ }
- if (op_ret == 0) {
- __mark_pre_op_done_on_fd (frame, this, child_index);
+ return 0;
+}
+
+void
+afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op,
+ dict_t **xdata, dict_t **newloc_xdata)
+{
+ int i = 0;
+ int ret = 0;
+ char *key = NULL;
+ int keylen = 0;
+ const char *name = NULL;
+ dict_t *xdata1 = NULL;
+ dict_t *xdata2 = NULL;
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t need_entry_key_set = _gf_true;
+
+ local = frame->local;
+ this = THIS;
+ priv = this->private;
+
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION)
+ goto out;
+
+ if (!priv->esh_granular)
+ goto out;
+
+ xdata1 = dict_new();
+ if (!xdata1)
+ goto out;
+
+ name = local->loc.name;
+ if (local->op == GF_FOP_LINK)
+ name = local->newloc.name;
+
+ switch (op) {
+ case AFR_TRANSACTION_PRE_OP:
+ key = GF_XATTROP_ENTRY_IN_KEY;
+ break;
+ case AFR_TRANSACTION_POST_OP:
+ if (afr_txn_nothing_failed(frame, this)) {
+ key = GF_XATTROP_ENTRY_OUT_KEY;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i])
+ continue;
+ need_entry_key_set = _gf_false;
+ break;
}
+ /* If the transaction itself did not fail and there
+ * are no failed subvolumes, check whether the fop
+ * failed due to a symmetric error. If it did, do
+ * not set the ENTRY_OUT xattr which would end up
+ * deleting a name index which was created possibly by
+ * an earlier entry txn that may have failed on some
+ * of the sub-volumes.
+ */
+ if (local->op_ret)
+ need_entry_key_set = _gf_false;
+ } else {
+ key = GF_XATTROP_ENTRY_IN_KEY;
+ }
+ break;
+ }
+
+ if (need_entry_key_set) {
+ keylen = strlen(key);
+ ret = dict_set_strn(xdata1, key, keylen, (char *)name);
+ if (ret)
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "%s/%s: Could not set %s key during xattrop",
+ uuid_utoa(local->loc.pargfid), local->loc.name, key);
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ xdata2 = dict_new();
+ if (!xdata2)
+ goto out;
- if (op_ret == -1) {
- local->child_up[child_index] = 0;
+ ret = dict_set_strn(xdata2, key, keylen,
+ (char *)local->newloc.name);
+ if (ret)
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "%s/%s: Could not set %s key during "
+ "xattrop",
+ uuid_utoa(local->newloc.pargfid), local->newloc.name,
+ key);
+ }
+ }
- if (op_errno == ENOTSUP) {
- gf_log (this->name, GF_LOG_ERROR,
- "xattrop not supported by %s",
- priv->children[child_index]->name);
- local->op_ret = -1;
+ *xdata = xdata1;
+ *newloc_xdata = xdata2;
+ xdata1 = xdata2 = NULL;
+out:
+ if (xdata1)
+ dict_unref(xdata1);
+ return;
+}
- } else if (!child_went_down (op_ret, op_errno)) {
- gf_log (this->name, GF_LOG_ERROR,
- "xattrop failed on child %s: %s",
- priv->children[child_index]->name,
- strerror (op_errno));
- }
- local->op_errno = op_errno;
- }
+int
+afr_changelog_prepare(xlator_t *this, call_frame_t *frame, int *call_count,
+ afr_changelog_resume_t changelog_resume,
+ afr_xattrop_type_t op, dict_t **xdata,
+ dict_t **newloc_xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- call_count = --local->call_count;
- }
- UNLOCK (&frame->lock);
+ local = frame->local;
+ priv = this->private;
- if (call_count == 0) {
- if ((local->op_ret == -1) &&
- (local->op_errno == ENOTSUP)) {
- local->transaction.resume (frame, this);
- } else {
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
+ *call_count = afr_changelog_call_count(
+ local->transaction.type, local->transaction.pre_op,
+ local->transaction.failed_subvols, priv->child_count);
- afr_pid_restore (frame);
+ if (*call_count == 0) {
+ changelog_resume(frame, this);
+ return -1;
+ }
- local->transaction.fop (frame, this);
- }
- }
+ afr_changelog_populate_xdata(frame, op, xdata, newloc_xdata);
+ local->call_count = *call_count;
- return 0;
+ local->transaction.changelog_resume = changelog_resume;
+ return 0;
}
-
int
-afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
+afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume, afr_xattrop_type_t op)
{
- afr_private_t * priv = this->private;
- int i = 0;
- int ret = 0;
- int call_count = 0;
- dict_t **xattr = NULL;
- afr_fd_ctx_t *fdctx = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ dict_t *newloc_xdata = NULL;
+ int i = 0;
+ int call_count = 0;
+ int ret = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.changelog_xdata[i]) {
+ dict_unref(local->transaction.changelog_xdata[i]);
+ local->transaction.changelog_xdata[i] = NULL;
+ }
+ }
+
+ ret = afr_changelog_prepare(this, frame, &call_count, changelog_resume, op,
+ &xdata, &newloc_xdata);
- afr_local_t *local = NULL;
- int piggyback = 0;
+ if (ret)
+ return 0;
- local = frame->local;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i] ||
+ local->transaction.failed_subvols[i])
+ continue;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ if (!local->fd) {
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->xattrop,
+ &local->loc, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ } else {
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fxattrop,
+ local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ }
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
- xattr = alloca (priv->child_count * sizeof (*xattr));
- memset (xattr, 0, (priv->child_count * sizeof (*xattr)));
+ STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr, newloc_xdata);
+ call_count--;
- for (i = 0; i < priv->child_count; i++) {
- xattr[i] = get_new_dict ();
- dict_ref (xattr[i]);
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ if (local->fd)
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fxattrop,
+ local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ else
+ STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ break;
}
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
+ if (!--call_count)
+ break;
+ }
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- call_count *= 2;
- }
+ if (xdata)
+ dict_unref(xdata);
+ if (newloc_xdata)
+ dict_unref(newloc_xdata);
+ return 0;
+}
- if (call_count == 0) {
- /* no child is up */
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
- }
+static void
+afr_init_optimistic_changelog_for_txn(xlator_t *this, afr_local_t *local)
+{
+ int locked_count = 0;
+ afr_private_t *priv = NULL;
- local->internal_lock.lock_cbk =
- local->transaction.done;
- afr_unlock (frame, this);
- return 0;
- }
+ priv = this->private;
- local->call_count = call_count;
+ locked_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+ if (priv->optimistic_change_log && locked_count == priv->child_count)
+ local->optimistic_change_log = 1;
- __mark_all_pending (local->pending, priv->child_count,
- local->transaction.type);
+ return;
+}
- if (local->fd)
- fdctx = afr_fd_ctx_get (local->fd, this);
+int
+afr_changelog_pre_op(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ int i = 0;
+ int ret = 0;
+ int call_count = 0;
+ int op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ unsigned char *locked_nodes = NULL;
+ int idx = -1;
+ gf_boolean_t pre_nop = _gf_true;
+ dict_t *xdata_req = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ idx = afr_index_for_transaction_type(local->transaction.type);
+
+ locked_nodes = afr_locked_nodes_get(local->transaction.type, int_lock);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_nodes[i]) {
+ local->transaction.pre_op[i] = 1;
+ call_count++;
+ } else {
+ local->transaction.failed_subvols[i] = 1;
+ }
+ }
+
+ afr_init_optimistic_changelog_for_txn(this, local);
+
+ if (afr_changelog_pre_op_inherit(frame, this))
+ goto next;
+
+ /* This condition should not be met with present code, as
+ * transaction.done will be called if locks are not acquired on even a
+ * single node.
+ */
+ if (call_count == 0) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+
+ /* Check if the fop can be performed on at least
+ * quorum number of nodes.
+ */
+ if (priv->quorum_count && !afr_has_fop_quorum(frame)) {
+ op_errno = int_lock->lock_op_errno;
+ if (op_errno == 0)
+ op_errno = afr_quorum_errno(priv);
+ goto err;
+ }
+
+ xdata_req = dict_new();
+ if (!xdata_req) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (call_count < priv->child_count)
+ pre_nop = _gf_false;
+
+ /* Set an all-zero pending changelog so that in the cbk, we can get the
+ * current on-disk values. In a replica 3 volume with arbiter enabled,
+ * these values are needed to arrive at a go/ no-go of the fop phase to
+ * avoid ending up in split-brain.*/
+
+ ret = afr_set_pending_dict(priv, xdata_req, local->pending);
+ if (ret < 0) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (afr_needs_changelog_update(local)) {
+ local->dirty[idx] = hton32(1);
+
+ ret = dict_set_static_bin(xdata_req, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
-
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- {
- if (!fdctx) {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- break;
- }
-
- LOCK (&local->fd->lock);
- {
- piggyback = 0;
- if (fdctx->pre_op_done[i]) {
- fdctx->pre_op_piggyback[i]++;
- piggyback = 1;
- fdctx->hit++;
- } else {
- fdctx->miss++;
- }
- }
- UNLOCK (&local->fd->lock);
-
- if (piggyback)
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i]);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
- case AFR_METADATA_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
+ pre_nop = _gf_false;
+ local->transaction.dirtied = 1;
+ }
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
-
- call_count--;
- }
+ if (pre_nop)
+ goto next;
+ if (!local->pre_op_compat) {
+ dict_copy(xdata_req, local->xdata_req);
+ goto next;
+ }
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
+ afr_changelog_do(frame, this, xdata_req, afr_transaction_perform_fop,
+ AFR_TRANSACTION_PRE_OP);
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
+ if (xdata_req)
+ dict_unref(xdata_req);
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
+ return 0;
+next:
+ afr_transaction_perform_fop(frame, this);
- /* fall through */
+ if (xdata_req)
+ dict_unref(xdata_req);
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
- }
+ return 0;
+err:
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- if (!--call_count)
- break;
- }
+ afr_handle_lock_acquire_failure(local);
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
- }
+ if (xdata_req)
+ dict_unref(xdata_req);
- return 0;
+ return 0;
}
+int
+afr_post_nonblocking_lock_cbk(call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ /* Initiate blocking locks if non-blocking has failed */
+ if (int_lock->lock_op_ret < 0) {
+ gf_msg_debug(this->name, 0,
+ "Non blocking locks failed. Proceeding to blocking");
+ int_lock->lock_cbk = afr_internal_lock_finish;
+ afr_blocking_lock(frame, this);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Non blocking locks done. Proceeding to FOP");
+
+ afr_internal_lock_finish(frame, this);
+ }
+
+ return 0;
+}
int
-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
+afr_post_blocking_rename_cbk(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Blocking inodelks failed.");
- local->transaction.done (frame, this);
- } else {
+ if (int_lock->lock_op_ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INTERNAL_LKS_FAILED,
+ "Blocking entrylks failed.");
- gf_log (this->name, GF_LOG_DEBUG,
- "Blocking inodelks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
+ afr_transaction_done(frame, this);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Blocking entrylks done. Proceeding to FOP");
- return 0;
+ afr_internal_lock_finish(frame, this);
+ }
+ return 0;
}
-
int
-afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
+afr_post_lower_unlock_cbk(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- /* Initiate blocking locks if non-blocking has failed */
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking inodelks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_post_blocking_inodelk_cbk;
- afr_blocking_lock (frame, this);
- } else {
+ GF_ASSERT(!int_lock->higher_locked);
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking inodelks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
+ int_lock->lock_cbk = afr_post_blocking_rename_cbk;
+ afr_blocking_lock(frame, this);
- return 0;
+ return 0;
}
+int
+afr_set_transaction_flock(xlator_t *this, afr_local_t *local,
+ afr_lockee_t *lockee)
+{
+ afr_private_t *priv = NULL;
+ struct gf_flock *flock = NULL;
+
+ priv = this->private;
+ flock = &lockee->flock;
+
+ if ((priv->arbiter_count || local->transaction.eager_lock_on ||
+ priv->full_lock) &&
+ local->transaction.type == AFR_DATA_TRANSACTION) {
+ /*Lock entire file to avoid network split brains.*/
+ flock->l_len = 0;
+ flock->l_start = 0;
+ } else {
+ flock->l_len = local->transaction.len;
+ flock->l_start = local->transaction.start;
+ }
+ flock->l_type = F_WRLCK;
+
+ return 0;
+}
int
-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
+afr_lock(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ int_lock->lock_cbk = afr_post_nonblocking_lock_cbk;
+ int_lock->domain = this->name;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ afr_set_transaction_flock(this, local, &int_lock->lockee[i]);
+ }
+
+ break;
+
+ case AFR_ENTRY_TRANSACTION:
+ int_lock->lk_basename = local->transaction.basename;
+ if (local->transaction.parent_loc.path)
+ int_lock->lk_loc = &local->transaction.parent_loc;
+ else
+ GF_ASSERT(local->fd);
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ break;
+ }
+ afr_lock_nonblocking(frame, this);
- local = frame->local;
- int_lock = &local->internal_lock;
+ return 0;
+}
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Blocking entrylks failed.");
- local->transaction.done (frame, this);
- } else {
+static gf_boolean_t
+afr_locals_overlap(afr_local_t *local1, afr_local_t *local2)
+{
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
+
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
+
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
+
+ return ((end1 >= start2) && (end2 >= start1));
+}
- gf_log (this->name, GF_LOG_DEBUG,
- "Blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
+gf_boolean_t
+afr_has_lock_conflict(afr_local_t *local, gf_boolean_t waitlist_check)
+{
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = NULL;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ /*
+ * Once full file lock is acquired in eager-lock phase, overlapping
+ * writes do not compete for inode-locks, instead are transferred to the
+ * next writes. Because of this overlapping writes are not ordered.
+ * This can cause inconsistencies in replication.
+ * Example:
+ * Two overlapping writes w1, w2 are sent in parallel on same fd
+ * in two threads t1, t2.
+ * Both threads can execute afr_writev_wind in the following manner.
+ * t1 winds w1 on brick-0
+ * t2 winds w2 on brick-0
+ * t2 winds w2 on brick-1
+ * t1 winds w1 on brick-1
+ *
+ * This check makes sure the locks are not transferred for
+ * overlapping writes.
+ */
+ list_for_each_entry(each, &lock->owners, transaction.owner_list)
+ {
+ if (afr_locals_overlap(each, local)) {
+ return _gf_true;
}
-
- return 0;
+ }
+
+ if (!waitlist_check)
+ return _gf_false;
+ list_for_each_entry(each, &lock->waiting, transaction.wait_list)
+ {
+ if (afr_locals_overlap(each, local)) {
+ return _gf_true;
+ }
+ }
+ return _gf_false;
}
+/* }}} */
+static void
+afr_copy_inodelk_vars(afr_internal_lock_t *dst, afr_internal_lock_t *src,
+ xlator_t *this, int lockee_num)
+{
+ afr_private_t *priv = this->private;
+ afr_lockee_t *sl = &src->lockee[lockee_num];
+ afr_lockee_t *dl = &dst->lockee[lockee_num];
+
+ dst->domain = src->domain;
+ dl->flock.l_len = sl->flock.l_len;
+ dl->flock.l_start = sl->flock.l_start;
+ dl->flock.l_type = sl->flock.l_type;
+ dl->locked_count = sl->locked_count;
+ memcpy(dl->locked_nodes, sl->locked_nodes,
+ priv->child_count * sizeof(*dl->locked_nodes));
+}
-int
-afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
+void
+__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ gf_boolean_t conflict = _gf_false;
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
+
+ while (!conflict) {
+ if (list_empty(&lock->waiting))
+ return;
+ each = list_entry(lock->waiting.next, afr_local_t,
+ transaction.wait_list);
+ if (afr_has_lock_conflict(each, _gf_false)) {
+ conflict = _gf_true;
+ }
+ if (conflict && !list_empty(&lock->owners))
+ return;
+ afr_copy_inodelk_vars(&each->internal_lock, &local->internal_lock,
+ each->transaction.frame->this, 0);
+ list_move_tail(&each->transaction.wait_list, shared);
+ list_add_tail(&each->transaction.owner_list, &lock->owners);
+ }
+}
- local = frame->local;
- int_lock = &local->internal_lock;
+static void
+afr_lock_resume_shared(struct list_head *list)
+{
+ afr_local_t *each = NULL;
+
+ while (!list_empty(list)) {
+ each = list_entry(list->next, afr_local_t, transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ afr_changelog_pre_op(each->transaction.frame,
+ each->transaction.frame->this);
+ }
+}
- /* Initiate blocking locks if non-blocking has failed */
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_post_blocking_entrylk_cbk;
- afr_blocking_lock (frame, this);
+int
+afr_internal_lock_finish(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
+
+ local->internal_lock.lock_cbk = NULL;
+ if (!local->transaction.eager_lock_on) {
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_transaction_done(frame, this);
+ return 0;
+ }
+ afr_changelog_pre_op(frame, this);
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_handle_lock_acquire_failure(local);
} else {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
+ lock->event_generation = local->event_generation;
+ afr_changelog_pre_op(frame, this);
}
+ }
- return 0;
+ return 0;
}
-
-int
-afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_are_conflicting_ops_waiting(afr_local_t *local, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+ /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
+ * is taken mount2 opened the same file, it won't be able to
+ * perform any {meta,}data operations until mount1 releases eager-lock.
+ * To avoid such scenario do not enable eager-lock for this transaction
+ * if open-fd-count is > 1 for metadata transactions and if num-inodelks > 1
+ * for data transactions
+ */
+
+ if (local->transaction.type == AFR_METADATA_TRANSACTION) {
+ if (local->inode_ctx->open_fd_count > 1) {
+ return _gf_true;
+ }
+ } else if (local->transaction.type == AFR_DATA_TRANSACTION) {
+ if (lock->num_inodelks > 1) {
+ return _gf_true;
+ }
+ }
- local = frame->local;
- int_lock = &local->internal_lock;
+ return _gf_false;
+}
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Blocking entrylks failed.");
- local->transaction.done (frame, this);
- } else {
+gf_boolean_t
+afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this,
+ int delay)
+{
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t res = _gf_false;
+
+ local = frame->local;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+ if (!afr_txn_nothing_failed(frame, this)) {
+ lock->release = _gf_true;
+ goto out;
+ }
+
+ if (afr_are_conflicting_ops_waiting(local, this)) {
+ lock->release = _gf_true;
+ goto out;
+ }
+
+ if (!list_empty(&lock->owners))
+ goto out;
+ else
+ GF_ASSERT(list_empty(&lock->waiting));
+
+ if (lock->release) {
+ goto out;
+ }
+
+ if (!delay) {
+ goto out;
+ }
+
+ if (local->transaction.disable_delayed_post_op) {
+ goto out;
+ }
+
+ if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) &&
+ (local->op != GF_FOP_FSYNC)) {
+ /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so
+ * they are fine too*/
+ goto out;
+ }
+
+ res = _gf_true;
+out:
+ return res;
+}
- gf_log (this->name, GF_LOG_DEBUG,
- "Blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
+void
+afr_delayed_changelog_wake_up_cbk(void *data)
+{
+ afr_lock_t *lock = NULL;
+ afr_local_t *local = data;
+ afr_local_t *timer_local = NULL;
+ struct list_head shared;
+
+ INIT_LIST_HEAD(&shared);
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ if (list_empty(&lock->owners) && (local == timer_local)) {
+ GF_ASSERT(list_empty(&lock->waiting));
+ /*Last owner*/
+ lock->release = _gf_true;
+ lock->delay_timer = NULL;
}
- return 0;
+ }
+ UNLOCK(&local->inode->lock);
+ afr_changelog_post_op_now(local->transaction.frame,
+ local->transaction.frame->this);
}
-
+/* SET operation */
int
-afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this)
+afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
+ LOCK(&local->inode->lock);
+ {
+ local->inode_ctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&local->inode->lock);
- GF_ASSERT (!int_lock->higher_locked);
-
- int_lock->lock_cbk = afr_post_blocking_rename_cbk;
- afr_blocking_lock (frame, this);
-
- return 0;
+ return 0;
}
-
-int
-afr_set_transaction_flock (afr_local_t *local)
+/* TEST and CLEAR operation */
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode)
{
- afr_internal_lock_t *int_lock = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+ gf_boolean_t witness = _gf_false;
- int_lock = &local->internal_lock;
+ LOCK(&inode->lock);
+ {
+ (void)__afr_inode_ctx_get(this, inode, &ctx);
- int_lock->lk_flock.l_len = local->transaction.len;
- int_lock->lk_flock.l_start = local->transaction.start;
- int_lock->lk_flock.l_type = F_WRLCK;
+ if (ctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ ctx->witnessed_unstable_write = _gf_false;
+ }
+ }
+ UNLOCK(&inode->lock);
- return 0;
+ return witness;
}
int
-afr_lock_rec (call_frame_t *frame, xlator_t *this)
+afr_changelog_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ priv = this->private;
+ local = frame->local;
- int_lock->transaction_lk_type = AFR_TRANSACTION_LK;
+ if (op_ret != 0) {
+ /* Failure of fsync() is as good as failure of previous
+ write(). So treat it like one.
+ */
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, AFR_MSG_FSYNC_FAILED,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name, gf_fop_list[local->op]);
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- afr_set_transaction_flock (local);
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
- int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk;
+ call_count = afr_frame_return(frame);
- afr_nonblocking_inodelk (frame, this);
- break;
+ if (call_count == 0)
+ afr_changelog_post_op_now(frame, this);
- case AFR_ENTRY_RENAME_TRANSACTION:
+ return 0;
+}
- int_lock->lock_cbk = afr_post_blocking_rename_cbk;
- afr_blocking_lock (frame, this);
- break;
+int
+afr_changelog_fsync(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = -1;
- case AFR_ENTRY_TRANSACTION:
- int_lock->lk_basename = local->transaction.basename;
- if (&local->transaction.parent_loc)
- int_lock->lk_loc = &local->transaction.parent_loc;
- else
- GF_ASSERT (local->fd);
+ local = frame->local;
+ priv = this->private;
- int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
- afr_nonblocking_entrylk (frame, this);
- break;
- }
+ call_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now(frame, this);
return 0;
-}
+ }
+ local->call_count = call_count;
-int
-afr_lock (call_frame_t *frame, xlator_t *this)
-{
- afr_pid_save (frame);
+ xdata = dict_new();
+ if (xdata) {
+ ret = dict_set_int32_sizen(xdata, "batch-fsync", 1);
+ ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ }
- frame->root->pid = (long) frame->root;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
- afr_set_lk_owner (frame, this);
+ STACK_WIND_COOKIE(frame, afr_changelog_fsync_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fsync,
+ local->fd, 1, xdata);
+ if (!--call_count)
+ break;
+ }
- afr_set_lock_number (frame, this);
+ if (xdata)
+ dict_unref(xdata);
- return afr_lock_rec (frame, this);
+ return 0;
}
-
-/* }}} */
-
int
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- priv = this->private;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- if (__changelog_needed_pre_op (frame, this)) {
- afr_changelog_pre_op (frame, this);
- } else {
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
+
+ if (afr_changelog_pre_op_uninherit(frame, this) &&
+ afr_txn_nothing_failed(frame, this)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
+ */
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
+
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+ Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+ responsible operation that what it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+ therefore be updated in such a way it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+ This means we need take matter into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write(this, local->inode)) {
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
+
+ /* Check whether users want durability and perform fsync/post-op
+ * accordingly.
+ */
+ if (priv->ensure_durability) {
+ /* Time to fsync() */
+ afr_changelog_fsync(frame, this);
+ } else {
+ afr_changelog_post_op_now(frame, this);
+ }
+
+ return 0;
+}
- afr_pid_restore (frame);
+void
+afr_changelog_post_op(call_frame_t *frame, xlator_t *this)
+{
+ struct timespec delta = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t post_op = _gf_true;
+ struct list_head shared;
+
+ priv = this->private;
+ delta.tv_sec = priv->post_op_delay_secs;
+ delta.tv_nsec = 0;
+
+ INIT_LIST_HEAD(&shared);
+ if (!local->transaction.eager_lock_on)
+ goto out;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ list_del_init(&local->transaction.owner_list);
+ list_add(&local->transaction.owner_list, &lock->post_op);
+ __afr_transaction_wake_shared(local, &shared);
+
+ if (!afr_is_delayed_changelog_post_op_needed(frame, this,
+ delta.tv_sec)) {
+ if (list_empty(&lock->owners))
+ lock->release = _gf_true;
+ goto unlock;
+ }
- local->transaction.fop (frame, this);
+ GF_ASSERT(lock->delay_timer == NULL);
+ lock->delay_timer = gf_timer_call_after(
+ this->ctx, delta, afr_delayed_changelog_wake_up_cbk, local);
+ if (!lock->delay_timer) {
+ lock->release = _gf_true;
+ } else {
+ post_op = _gf_false;
}
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
- return 0;
-}
+ if (!list_empty(&shared)) {
+ afr_lock_resume_shared(&shared);
+ }
+out:
+ if (post_op) {
+ if (!local->transaction.eager_lock_on || lock->release) {
+ afr_changelog_post_op_safe(frame, this);
+ } else {
+ afr_changelog_post_op_now(frame, this);
+ }
+ }
+}
int
-afr_transaction_resume (call_frame_t *frame, xlator_t *this)
+afr_transaction_resume(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
+ local = frame->local;
- if (__changelog_needed_post_op (frame, this)) {
- afr_changelog_post_op (frame, this);
- } else {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- }
- }
+ afr_restore_lk_owner(frame);
- return 0;
-}
+ afr_handle_symmetric_errors(frame, this);
+ if (!local->pre_op_compat)
+ /* new mode, pre-op was done along
+ with OP */
+ afr_changelog_pre_op_update(frame, this);
+
+ afr_changelog_post_op(frame, this);
+
+ return 0;
+}
/**
* afr_transaction_fop_failed - inform that an fop failed
*/
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index)
+afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this, int child_index)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- __mark_child_dead (local->pending, priv->child_count,
- child_index, local->transaction.type);
+ local->transaction.failed_subvols[child_index] = 1;
}
+static gf_boolean_t
+__need_previous_lock_unlocked(afr_local_t *local)
+{
+ afr_lock_t *lock = NULL;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (!lock->acquired)
+ return _gf_false;
+ if (lock->acquired && lock->event_generation != local->event_generation)
+ return _gf_true;
+ return _gf_false;
+}
-int
-afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+void
+__afr_eager_lock_handle(afr_local_t *local, gf_boolean_t *take_lock,
+ gf_boolean_t *do_pre_op, afr_local_t **timer_local)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ afr_lock_t *lock = NULL;
+ afr_local_t *owner_local = NULL;
+ xlator_t *this = local->transaction.frame->this;
+
+ local->transaction.eager_lock_on = _gf_true;
+ afr_set_lk_owner(local->transaction.frame, this, local->inode);
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (__need_previous_lock_unlocked(local)) {
+ if (!list_empty(&lock->owners)) {
+ lock->release = _gf_true;
+ } else if (lock->delay_timer) {
+ lock->release = _gf_true;
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+ /* It will be put in frozen list
+ * in the code flow below*/
+ } else {
+ *timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ lock->delay_timer = NULL;
+ }
+ }
+ }
+
+ if (lock->release) {
+ list_add_tail(&local->transaction.wait_list, &lock->frozen);
+ *take_lock = _gf_false;
+ goto out;
+ }
+
+ if (lock->delay_timer) {
+ *take_lock = _gf_false;
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+ list_add_tail(&local->transaction.wait_list, &lock->frozen);
+ } else {
+ *timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars(&local->internal_lock,
+ &(*timer_local)->internal_lock, this, 0);
+ lock->delay_timer = NULL;
+ *do_pre_op = _gf_true;
+ list_add_tail(&local->transaction.owner_list, &lock->owners);
+ }
+ goto out;
+ }
+
+ if (!list_empty(&lock->owners)) {
+ if (!lock->acquired || afr_has_lock_conflict(local, _gf_true)) {
+ list_add_tail(&local->transaction.wait_list, &lock->waiting);
+ *take_lock = _gf_false;
+ goto out;
+ }
+ owner_local = list_entry(lock->owners.next, afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars(&local->internal_lock,
+ &owner_local->internal_lock, this, 0);
+ *take_lock = _gf_false;
+ *do_pre_op = _gf_true;
+ }
+
+ if (lock->acquired)
+ GF_ASSERT(!(*take_lock));
+ list_add_tail(&local->transaction.owner_list, &lock->owners);
+out:
+ return;
+}
- local = frame->local;
- priv = this->private;
+void
+afr_transaction_start(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ gf_boolean_t take_lock = _gf_true;
+ gf_boolean_t do_pre_op = _gf_false;
+ afr_local_t *timer_local = NULL;
+
+ priv = this->private;
+
+ if (local->transaction.type != AFR_DATA_TRANSACTION &&
+ local->transaction.type != AFR_METADATA_TRANSACTION)
+ goto lock_phase;
+
+ if (!priv->eager_lock)
+ goto lock_phase;
+
+ LOCK(&local->inode->lock);
+ {
+ __afr_eager_lock_handle(local, &take_lock, &do_pre_op, &timer_local);
+ }
+ UNLOCK(&local->inode->lock);
+lock_phase:
+ if (!local->transaction.eager_lock_on) {
+ afr_set_lk_owner(local->transaction.frame, this,
+ local->transaction.frame->root);
+ }
+
+ if (take_lock) {
+ afr_lock(local->transaction.frame, this);
+ } else if (do_pre_op) {
+ afr_changelog_pre_op(local->transaction.frame, this);
+ }
+ /*Always call delayed_changelog_wake_up_cbk after calling pre-op above
+ * so that any inheriting can happen*/
+ if (timer_local)
+ afr_delayed_changelog_wake_up_cbk(timer_local);
+}
- afr_transaction_local_init (local, priv);
+int
+afr_write_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
+{
+ afr_local_t *local = frame->local;
+
+ if (err) {
+ AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
+ goto fail;
+ }
+
+ afr_transaction_start(local, this);
+ return 0;
+fail:
+ local->transaction.unwind(frame, this);
+ AFR_STACK_DESTROY(frame);
+ return 0;
+}
- local->transaction.resume = afr_transaction_resume;
- local->transaction.type = type;
+int
+afr_transaction_lockee_init(call_frame_t *frame)
+{
+ afr_local_t *local = frame->local;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_private_t *priv = frame->this->private;
+ int ret = 0;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ ret = afr_add_inode_lockee(local, priv->child_count);
+ break;
+
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ ret = afr_add_entry_lockee(local, &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
+ if (local->op == GF_FOP_RENAME) {
+ ret = afr_add_entry_lockee(
+ local, &local->transaction.new_parent_loc,
+ local->transaction.new_basename, priv->child_count);
+ if (ret) {
+ goto out;
+ }
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- afr_internal_lock_finish (frame, this);
- } else {
- afr_lock (frame, this);
- }
+ if (local->newloc.inode &&
+ IA_ISDIR(local->newloc.inode->ia_type)) {
+ ret = afr_add_entry_lockee(local, &local->newloc, NULL,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
+ }
+ } else if (local->op == GF_FOP_RMDIR) {
+ ret = afr_add_entry_lockee(local, &local->loc, NULL,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
+ }
+
+ if (int_lock->lockee_count > 1) {
+ qsort(int_lock->lockee, int_lock->lockee_count,
+ sizeof(*int_lock->lockee), afr_entry_lockee_cmp);
+ }
+ break;
+ }
+out:
+ return ret;
+}
- return 0;
+int
+afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ int event_generation = 0;
+
+ local = frame->local;
+ priv = this->private;
+ local->transaction.frame = frame;
+
+ local->transaction.type = type;
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ ret = -afr_quorum_errno(priv);
+ goto out;
+ }
+
+ if (!afr_is_consistent_io_possible(local, priv, &ret)) {
+ ret = -ret; /*op_errno to ret conversion*/
+ goto out;
+ }
+
+ if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+ ret = -afr_quorum_errno(priv);
+ goto out;
+ }
+
+ ret = afr_transaction_local_init(local, this);
+ if (ret < 0)
+ goto out;
+
+ ret = afr_transaction_lockee_init(frame);
+ if (ret)
+ goto out;
+
+ if (type != AFR_METADATA_TRANSACTION) {
+ goto txn_start;
+ }
+
+ ret = afr_inode_get_readable(frame, local->inode, this, local->readable,
+ &event_generation, type);
+ if (ret < 0 ||
+ afr_is_inode_refresh_reqd(local->inode, this, priv->event_generation,
+ event_generation)) {
+ afr_inode_refresh(frame, this, local->inode, local->loc.gfid,
+ afr_write_txn_refresh_done);
+ ret = 0;
+ goto out;
+ }
+
+txn_start:
+ ret = 0;
+ afr_transaction_start(local, this);
+out:
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index 84cf31d633e..beefa26f4a6 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -1,33 +1,75 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __TRANSACTION_H__
#define __TRANSACTION_H__
+#include "afr.h"
+
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
- int child_index);
+afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this,
+ int child_index);
+
+int32_t
+afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type);
int
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
+afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int32_t **pending);
-int32_t
-afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
+void
+afr_delayed_changelog_wake_up(xlator_t *this, fd_t *fd);
+
+void
+__mark_all_success(call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this);
+
+int
+afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type);
+
+int
+afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol);
+
+void
+afr_pending_read_increment(afr_private_t *priv, int child_index);
+void
+afr_pending_read_decrement(afr_private_t *priv, int child_index);
+
+call_frame_t *
+afr_transaction_detach_fop_frame(call_frame_t *frame);
+gf_boolean_t
+afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame);
+gf_boolean_t
+afr_needs_changelog_update(afr_local_t *local);
+void
+afr_zero_fill_stat(afr_local_t *local);
+
+void
+afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1,
+ unsigned char *readable1, inode_t *inode2,
+ unsigned char *readable2);
+int
+afr_transaction_resume(call_frame_t *frame, xlator_t *this);
+
+int
+afr_lock(call_frame_t *frame, xlator_t *this);
+
+void
+afr_delayed_changelog_wake_up_cbk(void *data);
+
+int
+afr_release_notify_lock_for_ta(void *opaque);
+
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque);
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 775a53a8fba..df7366f0a65 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#include <libgen.h>
@@ -24,990 +15,1330 @@
#include <stdlib.h>
#include <signal.h>
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
#include "afr-common.c"
+#include "afr-messages.h"
+
+struct volume_options options[];
+
+static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = {
+ [AFR_FAV_CHILD_NONE] = "none",
+ [AFR_FAV_CHILD_BY_SIZE] = "size",
+ [AFR_FAV_CHILD_BY_CTIME] = "ctime",
+ [AFR_FAV_CHILD_BY_MTIME] = "mtime",
+ [AFR_FAV_CHILD_BY_MAJORITY] = "majority",
+ [AFR_FAV_CHILD_POLICY_MAX] = NULL,
+};
int32_t
-notify (xlator_t *this, int32_t event,
- void *data, ...)
+notify(xlator_t *this, int32_t event, void *data, ...)
{
- int ret = -1;
+ int ret = -1;
+ va_list ap;
+ void *data2 = NULL;
- ret = afr_notify (this, event, data);
+ va_start(ap, data);
+ data2 = va_arg(ap, dict_t *);
+ va_end(ap);
+ ret = afr_notify(this, event, data, data2);
- return ret;
+ return ret;
}
int32_t
-mem_acct_init (xlator_t *this)
+mem_acct_init(xlator_t *this)
{
- int ret = -1;
-
- if (!this)
- return ret;
+ int ret = -1;
- ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1);
+ if (!this)
+ return ret;
- if (ret != 0) {
- gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
+ ret = xlator_mem_acct_init(this, gf_afr_mt_end + 1);
+ if (ret != 0) {
return ret;
+ }
+
+ return ret;
}
+
int
-validate_options (xlator_t *this, dict_t *options, char **op_errstr)
+xlator_subvolume_index(xlator_t *this, xlator_t *subvol)
{
+ int index = -1;
+ int i = 0;
+ xlator_list_t *list = NULL;
-
- gf_boolean_t metadata_self_heal;
- gf_boolean_t entry_self_heal;
- gf_boolean_t data_self_heal;
- gf_boolean_t data_change_log;
- gf_boolean_t metadata_change_log;
- gf_boolean_t entry_change_log;
- gf_boolean_t strict_readdir;
-
- xlator_list_t * trav = NULL;
-
- char * read_subvol = NULL;
- char * self_heal = NULL;
- char * change_log = NULL;
- char * str_readdir = NULL;
-
- int32_t background_count = 0;
- int32_t window_size = 0;
-
- int read_ret = -1;
- int dict_ret = -1;
- int flag = 1;
- int ret = 0;
- int temp_ret = -1;
-
-
-
- dict_ret = dict_get_int32 (options, "background-self-heal-count",
- &background_count);
- if (dict_ret == 0) {
- if (background_count < 0) {
- *op_errstr = gf_strdup ("Error, option should be >= 0");
- ret = -1;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "validated background self-heal count to %d",
- background_count);
- }
+ list = this->children;
- dict_ret = dict_get_str (options, "metadata-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (self_heal, &metadata_self_heal);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "validation failed 'option metadata"
- "-self-heal %s'.not correct.",
- self_heal);
- *op_errstr = gf_strdup ("Error, option should be boolean");
- ret = -1;
- goto out;
- }
-
-
-
+ while (list) {
+ if (subvol == list->xlator ||
+ strcmp(subvol->name, list->xlator->name) == 0) {
+ index = i;
+ break;
}
+ list = list->next;
+ i++;
+ }
- dict_ret = dict_get_str (options, "data-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (self_heal, &data_self_heal);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Validation failed for data self heal",
- self_heal);
- *op_errstr = gf_strdup ("Error, option should be boolean");
- ret = -1;
- goto out;
- }
-
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option data"
- "-self-heal %s'.", self_heal);
- }
+ return index;
+}
- dict_ret = dict_get_str (options, "entry-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (self_heal, &entry_self_heal);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Validation faled for entry-self-heal",
- self_heal);
- *op_errstr = gf_strdup ("Error, option should be boolean");
- ret = -1;
- goto out;
- }
-
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Validated 'option entry"
- "-self-heal %s'.", self_heal);
- }
+static void
+fix_quorum_options(xlator_t *this, afr_private_t *priv, char *qtype,
+ dict_t *options)
+{
+ if (dict_get_sizen(options, "quorum-type") == NULL) {
+ /* If user doesn't configure anything enable auto-quorum if the
+ * replica has more than two subvolumes */
+ if (priv->child_count > 2)
+ qtype = "auto";
+ }
+
+ if (priv->quorum_count && strcmp(qtype, "fixed")) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_OVERRIDE,
+ "quorum-type %s overriding quorum-count %u", qtype,
+ priv->quorum_count);
+ }
+
+ if (!strcmp(qtype, "none")) {
+ priv->quorum_count = 0;
+ } else if (!strcmp(qtype, "auto")) {
+ priv->quorum_count = AFR_QUORUM_AUTO;
+ }
+}
+int
+afr_set_favorite_child_policy(afr_private_t *priv, char *policy)
+{
+ int index = -1;
- dict_ret = dict_get_str (options, "strict-readdir",
- &str_readdir);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (str_readdir, &strict_readdir);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Validation faled for strict_readdir",
- str_readdir);
- *op_errstr = gf_strdup ("Error, option should be boolean");
- ret = -1;
- goto out;
- }
-
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Validated 'option strict"
- "-readdir %s'.", str_readdir);
- }
+ index = gf_get_index_by_elem(afr_favorite_child_policies, policy);
+ if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX)
+ return -1;
- dict_ret = dict_get_int32 (options, "data-self-heal-window-size",
- &window_size);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "validated data self-heal window size to %d",
- window_size);
+ priv->fav_child_policy = index;
- if (window_size < 0) {
- *op_errstr = gf_strdup ("Error, option should be >= 0");
- ret = -1;
- goto out;
- }
+ return 0;
+}
- if (window_size > 1024) {
- *op_errstr = gf_strdup ("Error, option should be <= 1024");
- ret = -1;
- goto out;
- }
+static void
+set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
+{
+ if (!algo) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
+ } else if (strcmp(algo, "full") == 0) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_FULL;
+ } else if (strcmp(algo, "diff") == 0) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DIFF;
+ } else {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
+ }
+}
+void
+afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options)
+{
+ char *volfile_id_str = NULL;
+ uuid_t anon_inode_gfid = {0};
+
+ /*If volume id is not present don't enable anything*/
+ if (dict_get_str(options, "volume-id", &volfile_id_str))
+ return;
+ GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX);
+ /*anon_inode_name is not supposed to change once assigned*/
+ if (!priv->anon_inode_name[0]) {
+ snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s",
+ AFR_ANON_DIR_PREFIX, volfile_id_str);
+ gf_uuid_parse(volfile_id_str, anon_inode_gfid);
+ /*Flip a bit to make sure volfile-id and anon-gfid are not same*/
+ anon_inode_gfid[0] ^= 1;
+ uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str);
+ }
+}
- }
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *read_subvol = NULL;
+ int read_subvol_index = -1;
+ int timeout_old = 0;
+ int ret = -1;
+ int index = -1;
+ char *qtype = NULL;
+ char *fav_child_policy = NULL;
+ char *data_self_heal = NULL;
+ char *data_self_heal_algorithm = NULL;
+ char *locking_scheme = NULL;
+ gf_boolean_t consistent_io = _gf_false;
+ gf_boolean_t choose_local_old = _gf_false;
+ gf_boolean_t enabled_old = _gf_false;
- dict_ret = dict_get_str (options, "data-change-log",
- &change_log);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (change_log, &data_change_log);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Validation faled for data-change-log");
- *op_errstr = gf_strdup ("Error, option should be boolean");
- ret = -1;
- goto out;
- }
-
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Validated 'option data-"
- "change-log %s'.", change_log);
- }
+ priv = this->private;
- dict_ret = dict_get_str (options, "metadata-change-log",
- &change_log);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (change_log,
- &metadata_change_log);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Validation faild for metadata-change-log");
- *op_errstr = gf_strdup ("Error, option should be boolean");
- ret = -1;
- goto out;
- }
-
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Validated 'option metadata-"
- "change-log %s'.", change_log);
- }
+ GF_OPTION_RECONF("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, options, bool, out);
- dict_ret = dict_get_str (options, "entry-change-log",
- &change_log);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (change_log, &entry_change_log);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Validation faild for entr-change-log");
- *op_errstr = gf_strdup ("Error, option should be boolean");
- ret = -1;
- goto out;
- }
-
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Validated 'option entry-"
- "change-log %s'.", change_log);
- }
+ GF_OPTION_RECONF("background-self-heal-count",
+ priv->background_self_heal_count, options, uint32, out);
- read_ret = dict_get_str (options, "read-subvolume", &read_subvol);
-
- if (read_ret)
- goto next;// No need to traverse, hence set the next option
-
- trav = this->children;
- flag = 0;
- while (trav) {
- if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Validated Subvolume '%s' as read child.",
- trav->xlator->name);
-
- flag = 1;
- ret = 0;
- goto out;
- }
-
-
- trav = trav->next;
- }
+ GF_OPTION_RECONF("heal-wait-queue-length", priv->heal_wait_qlen, options,
+ uint32, out);
- if (flag == 0 ) {
+ GF_OPTION_RECONF("metadata-self-heal", priv->metadata_self_heal, options,
+ bool, out);
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option read-subvolume %s', no such subvolume"
- , read_subvol);
- *op_errstr = gf_strdup ("Error, the sub-volume is not right");
- ret = -1;
- goto out;
- }
+ GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
-next:
-out:
- return ret;
+ GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool,
+ out);
+ GF_OPTION_RECONF("data-self-heal-window-size",
+ priv->data_self_heal_window_size, options, uint32, out);
-}
+ GF_OPTION_RECONF("data-self-heal-algorithm", data_self_heal_algorithm,
+ options, str, out);
+ set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
+ GF_OPTION_RECONF("halo-enabled", priv->halo_enabled, options, bool, out);
-int
-reconfigure (xlator_t *this, dict_t *options)
-{
+ GF_OPTION_RECONF("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+ options, uint32, out);
+
+ GF_OPTION_RECONF("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
+ options, uint32, out);
+
+ GF_OPTION_RECONF("halo-max-latency", priv->halo_max_latency_msec, options,
+ uint32, out);
+
+ GF_OPTION_RECONF("halo-max-replicas", priv->halo_max_replicas, options,
+ uint32, out);
+
+ GF_OPTION_RECONF("halo-min-replicas", priv->halo_min_replicas, options,
+ uint32, out);
- gf_boolean_t metadata_self_heal; /* on/off */
- gf_boolean_t entry_self_heal;
- gf_boolean_t data_self_heal;
- gf_boolean_t data_change_log; /* on/off */
- gf_boolean_t metadata_change_log; /* on/off */
- gf_boolean_t entry_change_log; /* on/off */
- gf_boolean_t strict_readdir;
-
- afr_private_t * priv = NULL;
- xlator_list_t * trav = NULL;
-
- char * read_subvol = NULL;
- char * self_heal = NULL;
- char * change_log = NULL;
- char * str_readdir = NULL;
-
- int32_t background_count = 0;
- int32_t window_size = 0;
-
- int read_ret = -1;
- int dict_ret = -1;
- int flag = 1;
- int ret = 0;
- int temp_ret = -1;
-
- priv = this->private;
-
- dict_ret = dict_get_int32 (options, "background-self-heal-count",
- &background_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring background self-heal count to %d",
- background_count);
-
- priv->background_self_heal_count = background_count;
- }
-
- dict_ret = dict_get_str (options, "metadata-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (self_heal, &metadata_self_heal);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Reconfiguration Invalid 'option metadata"
- "-self-heal %s'. Defaulting to old value.",
- self_heal);
- ret = -1;
- goto out;
- }
-
- priv->metadata_self_heal = metadata_self_heal;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option metadata"
- "-self-heal %s'.",
- self_heal);
- }
-
- dict_ret = dict_get_str (options, "data-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (self_heal, &data_self_heal);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Reconfiguration Invalid 'option data"
- "-self-heal %s'. Defaulting to old value.",
- self_heal);
- ret = -1;
- goto out;
- }
-
- priv->data_self_heal = data_self_heal;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option data"
- "-self-heal %s'.", self_heal);
- }
-
- dict_ret = dict_get_str (options, "entry-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (self_heal, &entry_self_heal);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Reconfiguration Invalid 'option data"
- "-self-heal %s'. Defaulting to old value.",
- self_heal);
- ret = -1;
- goto out;
- }
-
- priv->entry_self_heal = entry_self_heal;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option entry"
- "-self-heal %s'.", self_heal);
- }
-
-
- dict_ret = dict_get_str (options, "strict-readdir",
- &str_readdir);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (str_readdir, &strict_readdir);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option strict-readdir %s'. "
- "Defaulting to old value.",
- str_readdir);
- ret = -1;
- goto out;
- }
-
- priv->strict_readdir = strict_readdir;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option strict"
- "-readdir %s'.", str_readdir);
- }
-
- dict_ret = dict_get_int32 (options, "data-self-heal-window-size",
- &window_size);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring, Setting data self-heal window size to %d",
- window_size);
-
- priv->data_self_heal_window_size = window_size;
- }
- else {
- priv->data_self_heal_window_size = 16;
+ GF_OPTION_RECONF("read-subvolume", read_subvol, options, xlator, out);
+
+ choose_local_old = priv->choose_local;
+ GF_OPTION_RECONF("choose-local", priv->choose_local, options, bool, out);
+
+ if (choose_local_old != priv->choose_local) {
+ priv->read_child = -1;
+ if (choose_local_old == _gf_false)
+ priv->did_discovery = _gf_false;
+ }
+
+ GF_OPTION_RECONF("read-hash-mode", priv->hash_mode, options, uint32, out);
+
+ if (read_subvol) {
+ index = xlator_subvolume_index(this, read_subvol);
+ if (index == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%s not a subvolume", read_subvol->name);
+ goto out;
}
-
-
- dict_ret = dict_get_str (options, "data-change-log",
- &change_log);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (change_log, &data_change_log);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Reconfiguration Invalid 'option data-"
- "change-log %s'. Defaulting to old value.",
- change_log);
- ret = -1;
- goto out;
- }
-
- priv->data_change_log = data_change_log;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option data-"
- "change-log %s'.", change_log);
- }
-
- dict_ret = dict_get_str (options, "metadata-change-log",
- &change_log);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (change_log,
- &metadata_change_log);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option metadata-change-log %s'. "
- "Defaulting to metadata-change-log as 'off'.",
- change_log);
- ret = -1;
- goto out;
- }
-
- priv->metadata_change_log = metadata_change_log;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option metadata-"
- "change-log %s'.", change_log);
- }
-
- dict_ret = dict_get_str (options, "entry-change-log",
- &change_log);
- if (dict_ret == 0) {
- temp_ret = gf_string2boolean (change_log, &entry_change_log);
- if (temp_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option entry-change-log %s'. "
- "Defaulting to entry-change-log as 'on'.",
- change_log);
- ret = -1;
- goto out;
- }
-
- priv->entry_change_log = entry_change_log;
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfiguring 'option entry-"
- "change-log %s'.", change_log);
- }
-
- read_ret = dict_get_str (options, "read-subvolume", &read_subvol);
-
- if (read_ret < 0)
- goto next;// No need to traverse, hence set the next option
-
- trav = this->children;
- flag = 0;
- while (trav) {
- if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Subvolume '%s' specified as read child.",
- trav->xlator->name);
-
- flag = 1;
- ret = -1;
- goto out;
- }
-
-
- trav = trav->next;
- }
-
- if (flag == 0 ) {
-
- gf_log (this->name, GF_LOG_ERROR,
- "Invalid 'option read-subvolume %s', no such subvolume"
- , read_subvol);
- ret = -1;
- goto out;
- }
-
-next:
+ priv->read_child = index;
+ }
+
+ GF_OPTION_RECONF("read-subvolume-index", read_subvol_index, options, int32,
+ out);
+
+ if (read_subvol_index > -1) {
+ index = read_subvol_index;
+ if (index >= priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%d not a subvolume-index", index);
+ goto out;
+ }
+ priv->read_child = index;
+ }
+
+ GF_OPTION_RECONF("pre-op-compat", priv->pre_op_compat, options, bool, out);
+ GF_OPTION_RECONF("locking-scheme", locking_scheme, options, str, out);
+ priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
+ GF_OPTION_RECONF("full-lock", priv->full_lock, options, bool, out);
+ GF_OPTION_RECONF("granular-entry-heal", priv->esh_granular, options, bool,
+ out);
+
+ GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out);
+ GF_OPTION_RECONF("optimistic-change-log", priv->optimistic_change_log,
+ options, bool, out);
+ GF_OPTION_RECONF("quorum-type", qtype, options, str, out);
+ GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out);
+ fix_quorum_options(this, priv, qtype, options);
+ if (priv->quorum_count && !afr_has_quorum(priv->child_up, this, NULL))
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
+ "Client-quorum is not met");
+
+ GF_OPTION_RECONF("post-op-delay-secs", priv->post_op_delay_secs, options,
+ uint32, out);
+
+ GF_OPTION_RECONF(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options,
+ size_uint64, out);
+ /* Reset this so we re-discover in case the topology changed. */
+ GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options,
+ bool, out);
+
+ enabled_old = priv->shd.enabled;
+ GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out);
+
+ GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool,
+ out);
+
+ timeout_old = priv->shd.timeout;
+ GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out);
+
+ GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options,
+ bool, out);
+
+ GF_OPTION_RECONF("shd-max-threads", priv->shd.max_threads, options, uint32,
+ out);
+
+ GF_OPTION_RECONF("shd-wait-qlength", priv->shd.wait_qlength, options,
+ uint32, out);
+
+ GF_OPTION_RECONF("favorite-child-policy", fav_child_policy, options, str,
+ out);
+ if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
+ goto out;
+
+ priv->did_discovery = _gf_false;
+
+ GF_OPTION_RECONF("consistent-io", consistent_io, options, bool, out);
+ if (priv->quorum_count != 0)
+ consistent_io = _gf_false;
+ priv->consistent_io = consistent_io;
+
+ afr_handle_anon_inode_options(priv, options);
+
+ GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool,
+ out);
+ if (priv->shd.enabled) {
+ if ((priv->shd.enabled != enabled_old) ||
+ (timeout_old != priv->shd.timeout))
+ afr_selfheal_childup(this, priv);
+ }
+
+ ret = 0;
out:
- return ret;
+ return ret;
+}
+static int
+afr_pending_xattrs_init(afr_private_t *priv, xlator_t *this)
+{
+ int ret = -1;
+ int i = 0;
+ char *ptr = NULL;
+ char *ptr1 = NULL;
+ char *xattrs_list = NULL;
+ xlator_list_t *trav = NULL;
+ int child_count = -1;
+
+ trav = this->children;
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ /* priv->pending_key[THIN_ARBITER_BRICK_INDEX] is used as the
+ * name of the thin arbiter file for persistence across add/
+ * removal of DHT subvols.*/
+ child_count++;
+ }
+
+ GF_OPTION_INIT("afr-pending-xattr", xattrs_list, str, out);
+ priv->pending_key = GF_CALLOC(sizeof(*priv->pending_key), child_count,
+ gf_afr_mt_char);
+ if (!priv->pending_key) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (!xattrs_list) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_NO_CHANGELOG,
+ "Unable to fetch afr-pending-xattr option from volfile."
+ " Falling back to using client translator names. ");
+
+ while (i < child_count) {
+ ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
+ trav->xlator->name);
+ if (ret == -1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ trav = trav->next;
+ i++;
+ }
+ ret = 0;
+ goto out;
+ }
+
+ ptr = ptr1 = gf_strdup(xattrs_list);
+ if (!ptr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0, ptr = strtok(ptr, ","); ptr; ptr = strtok(NULL, ",")) {
+ ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
+ ptr);
+ if (ret == -1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ i++;
+ }
+ ret = 0;
+
+out:
+ GF_FREE(ptr1);
+ return ret;
}
+void
+afr_ta_init(afr_private_t *priv)
+{
+ priv->thin_arbiter_count = 1;
+ priv->child_count--;
+ priv->ta_child_up = 0;
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->ta_notify_dom_lock_offset = 0;
+ priv->ta_in_mem_txn_count = 0;
+ priv->ta_on_wire_txn_count = 0;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ INIT_LIST_HEAD(&priv->ta_waitq);
+ INIT_LIST_HEAD(&priv->ta_onwireq);
+ gf_uuid_clear(priv->ta_gfid);
+}
-static const char *favorite_child_warning_str = "You have specified subvolume '%s' "
- "as the 'favorite child'. This means that if a discrepancy in the content "
- "or attributes (ownership, permission, etc.) of a file is detected among "
- "the subvolumes, the file on '%s' will be considered the definitive "
- "version and its contents will OVERWRITE the contents of the file on other "
- "subvolumes. All versions of the file except that on '%s' "
- "WILL BE LOST.";
-
-static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. "
- "This means correctness is NO LONGER GUARANTEED in all cases. If two or more "
- "applications write to the same region of a file, there is a possibility that "
- "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you "
- "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS "
- "RESPONSIBLE for inconsistent data. If you are in doubt, set it to a value "
- "greater than 0.";
-
-int32_t
-init (xlator_t *this)
+int32_t
+init(xlator_t *this)
{
- afr_private_t * priv = NULL;
- int child_count = 0;
- xlator_list_t * trav = NULL;
- int i = 0;
- int ret = -1;
- int op_errno = 0;
-
- char * read_subvol = NULL;
- char * fav_child = NULL;
- char * self_heal = NULL;
- char * algo = NULL;
- char * change_log = NULL;
- char * strict_readdir = NULL;
- char * inodelk_trace = NULL;
- char * entrylk_trace = NULL;
-
- int32_t background_count = 0;
- int32_t lock_server_count = 1;
- int32_t window_size = 0;
-
- int fav_ret = -1;
- int read_ret = -1;
- int dict_ret = -1;
-
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "replicate translator needs more than one "
- "subvolume defined.");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "Volume is dangling.");
- }
-
-
- ALLOC_OR_GOTO (this->private, afr_private_t, out);
-
- priv = this->private;
-
- read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol);
- priv->read_child = -1;
-
- fav_ret = dict_get_str (this->options, "favorite-child", &fav_child);
- priv->favorite_child = -1;
-
- priv->background_self_heal_count = 16;
-
- dict_ret = dict_get_int32 (this->options, "background-self-heal-count",
- &background_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Setting background self-heal count to %d",
- background_count);
-
- priv->background_self_heal_count = background_count;
- }
-
- /* Default values */
-
- priv->data_self_heal = 1;
- priv->metadata_self_heal = 1;
- priv->entry_self_heal = 1;
-
- dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal);
- if (dict_ret == 0) {
- ret = gf_string2boolean (self_heal, &priv->data_self_heal);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option data-self-heal %s'. "
- "Defaulting to data-self-heal as 'on'",
- self_heal);
- priv->data_self_heal = 1;
- }
- }
-
- priv->data_self_heal_algorithm = "";
-
- dict_ret = dict_get_str (this->options, "data-self-heal-algorithm",
- &algo);
- if (dict_ret == 0) {
- priv->data_self_heal_algorithm = gf_strdup (algo);
+ afr_private_t *priv = NULL;
+ int child_count = 0;
+ xlator_list_t *trav = NULL;
+ int i = 0;
+ int ret = -1;
+ GF_UNUSED int op_errno = 0;
+ xlator_t *read_subvol = NULL;
+ int read_subvol_index = -1;
+ char *qtype = NULL;
+ char *fav_child_policy = NULL;
+ char *thin_arbiter = NULL;
+ char *data_self_heal = NULL;
+ char *locking_scheme = NULL;
+ char *data_self_heal_algorithm = NULL;
+
+ if (!this->children) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_CHILD_MISCONFIGURED,
+ "replicate translator needs more than one "
+ "subvolume defined.");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_VOL_MISCONFIGURED,
+ "Volume is dangling.");
+ }
+
+ this->private = GF_CALLOC(1, sizeof(afr_private_t),
+ gf_afr_mt_afr_private_t);
+ if (!this->private)
+ goto out;
+
+ priv = this->private;
+ INIT_LIST_HEAD(&priv->saved_locks);
+ INIT_LIST_HEAD(&priv->lk_healq);
+ LOCK_INIT(&priv->lock);
+
+ child_count = xlator_subvolume_count(this);
+
+ priv->child_count = child_count;
+
+ priv->read_child = -1;
+
+ GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);
+ GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);
+ if (thin_arbiter && strlen(thin_arbiter) > 0) {
+ afr_ta_init(priv);
+ }
+ INIT_LIST_HEAD(&priv->healing);
+ INIT_LIST_HEAD(&priv->heal_waiting);
+
+ priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;
+
+ GF_OPTION_INIT("afr-dirty-xattr", priv->afr_dirty, str, out);
+
+ GF_OPTION_INIT("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, bool, out);
+
+ GF_OPTION_INIT("read-subvolume", read_subvol, xlator, out);
+ if (read_subvol) {
+ priv->read_child = xlator_subvolume_index(this, read_subvol);
+ if (priv->read_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%s not a subvolume", read_subvol->name);
+ goto out;
+ }
+ }
+ GF_OPTION_INIT("read-subvolume-index", read_subvol_index, int32, out);
+ if (read_subvol_index > -1) {
+ if (read_subvol_index >= priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%d not a subvolume-index", read_subvol_index);
+ goto out;
}
+ priv->read_child = read_subvol_index;
+ }
+ GF_OPTION_INIT("choose-local", priv->choose_local, bool, out);
+
+ priv->pending_reads = GF_CALLOC(sizeof(*priv->pending_reads),
+ priv->child_count, gf_afr_mt_atomic_t);
+
+ GF_OPTION_INIT("read-hash-mode", priv->hash_mode, uint32, out);
+
+ priv->favorite_child = -1;
+
+ GF_OPTION_INIT("favorite-child-policy", fav_child_policy, str, out);
+ if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
+ goto out;
+
+ GF_OPTION_INIT("shd-max-threads", priv->shd.max_threads, uint32, out);
+
+ GF_OPTION_INIT("shd-wait-qlength", priv->shd.wait_qlength, uint32, out);
+
+ GF_OPTION_INIT("background-self-heal-count",
+ priv->background_self_heal_count, uint32, out);
+
+ GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out);
+
+ GF_OPTION_INIT("data-self-heal", data_self_heal, str, out);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
+
+ GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str,
+ out);
+ set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
+
+ GF_OPTION_INIT("data-self-heal-window-size",
+ priv->data_self_heal_window_size, uint32, out);
+
+ GF_OPTION_INIT("metadata-self-heal", priv->metadata_self_heal, bool, out);
+
+ GF_OPTION_INIT("entry-self-heal", priv->entry_self_heal, bool, out);
+
+ GF_OPTION_INIT("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+ uint32, out);
+
+ GF_OPTION_INIT("halo-max-latency", priv->halo_max_latency_msec, uint32,
+ out);
+ GF_OPTION_INIT("halo-max-replicas", priv->halo_max_replicas, uint32, out);
+ GF_OPTION_INIT("halo-min-replicas", priv->halo_min_replicas, uint32, out);
+
+ GF_OPTION_INIT("halo-enabled", priv->halo_enabled, bool, out);
+
+ GF_OPTION_INIT("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
+ uint32, out);
+
+ GF_OPTION_INIT("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out);
+
+ GF_OPTION_INIT("optimistic-change-log", priv->optimistic_change_log, bool,
+ out);
+
+ GF_OPTION_INIT("pre-op-compat", priv->pre_op_compat, bool, out);
+ GF_OPTION_INIT("locking-scheme", locking_scheme, str, out);
+ priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
+ GF_OPTION_INIT("full-lock", priv->full_lock, bool, out);
+ GF_OPTION_INIT("granular-entry-heal", priv->esh_granular, bool, out);
+
+ GF_OPTION_INIT("eager-lock", priv->eager_lock, bool, out);
+ GF_OPTION_INIT("quorum-type", qtype, str, out);
+ GF_OPTION_INIT("quorum-count", priv->quorum_count, uint32, out);
+ GF_OPTION_INIT(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
+ out);
+ fix_quorum_options(this, priv, qtype, this->options);
+
+ GF_OPTION_INIT("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
+ GF_OPTION_INIT("ensure-durability", priv->ensure_durability, bool, out);
+
+ GF_OPTION_INIT("self-heal-daemon", priv->shd.enabled, bool, out);
+ GF_OPTION_INIT("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
+ GF_OPTION_INIT("heal-timeout", priv->shd.timeout, int32, out);
- priv->data_self_heal_window_size = 16;
-
- dict_ret = dict_get_int32 (this->options, "data-self-heal-window-size",
- &window_size);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Setting data self-heal window size to %d",
- window_size);
-
- priv->data_self_heal_window_size = window_size;
- }
-
- dict_ret = dict_get_str (this->options, "metadata-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- ret = gf_string2boolean (self_heal, &priv->metadata_self_heal);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option metadata-self-heal %s'. "
- "Defaulting to metadata-self-heal as 'on'.",
- self_heal);
- priv->metadata_self_heal = 1;
- }
- }
-
- dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal);
- if (dict_ret == 0) {
- ret = gf_string2boolean (self_heal, &priv->entry_self_heal);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option entry-self-heal %s'. "
- "Defaulting to entry-self-heal as 'on'.",
- self_heal);
- priv->entry_self_heal = 1;
- }
- }
-
- /* Change log options */
-
- priv->data_change_log = 1;
- priv->metadata_change_log = 1;
- priv->entry_change_log = 1;
-
- dict_ret = dict_get_str (this->options, "data-change-log",
- &change_log);
- if (dict_ret == 0) {
- ret = gf_string2boolean (change_log, &priv->data_change_log);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option data-change-log %s'. "
- "Defaulting to data-change-log as 'on'.",
- change_log);
- priv->data_change_log = 1;
- }
- }
-
- dict_ret = dict_get_str (this->options, "metadata-change-log",
- &change_log);
- if (dict_ret == 0) {
- ret = gf_string2boolean (change_log,
- &priv->metadata_change_log);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option metadata-change-log %s'. "
- "Defaulting to metadata-change-log as 'off'.",
- change_log);
- priv->metadata_change_log = 0;
- }
- }
-
- dict_ret = dict_get_str (this->options, "entry-change-log",
- &change_log);
- if (dict_ret == 0) {
- ret = gf_string2boolean (change_log, &priv->entry_change_log);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option entry-change-log %s'. "
- "Defaulting to entry-change-log as 'on'.",
- change_log);
- priv->entry_change_log = 1;
- }
- }
-
- /* Locking options */
-
- priv->inodelk_trace = 0;
- priv->entrylk_trace = 0;
-
- dict_ret = dict_get_str (this->options, "inodelk-trace",
- &inodelk_trace);
- if (dict_ret == 0) {
- ret = gf_string2boolean (inodelk_trace, &priv->inodelk_trace);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option inodelk-trace %s' ",
- inodelk_trace);
-
- priv->inodelk_trace = 0;
- }
- }
-
-
- dict_ret = dict_get_str (this->options, "entrylk-trace",
- &entrylk_trace);
- if (dict_ret == 0) {
- ret = gf_string2boolean (entrylk_trace, &priv->entrylk_trace);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option entrylk-trace %s' ",
- inodelk_trace);
-
- priv->entrylk_trace = 0;
- }
- }
-
-
- priv->data_lock_server_count = 1;
- priv->metadata_lock_server_count = 0;
- priv->entry_lock_server_count = 1;
-
- dict_ret = dict_get_int32 (this->options, "data-lock-server-count",
- &lock_server_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Setting data lock server count to %d.",
- lock_server_count);
-
- if (lock_server_count == 0)
- gf_log (this->name, GF_LOG_WARNING, "%s",
- no_lock_servers_warning_str);
-
- priv->data_lock_server_count = lock_server_count;
- }
-
-
- dict_ret = dict_get_int32 (this->options,
- "metadata-lock-server-count",
- &lock_server_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Setting metadata lock server count to %d.",
- lock_server_count);
- priv->metadata_lock_server_count = lock_server_count;
- }
-
-
- dict_ret = dict_get_int32 (this->options, "entry-lock-server-count",
- &lock_server_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Setting entry lock server count to %d.",
- lock_server_count);
-
- priv->entry_lock_server_count = lock_server_count;
- }
-
- priv->strict_readdir = _gf_false;
-
- dict_ret = dict_get_str (this->options, "strict-readdir",
- &strict_readdir);
- if (dict_ret == 0) {
- ret = gf_string2boolean (strict_readdir, &priv->strict_readdir);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Invalid 'option strict-readdir %s'. "
- "Defaulting to strict-readdir as 'off'.",
- strict_readdir);
- }
- }
-
- trav = this->children;
- while (trav) {
- if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Subvolume '%s' specified as read child.",
- trav->xlator->name);
-
- priv->read_child = child_count;
- }
-
- if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) {
- gf_log (this->name, GF_LOG_WARNING,
- favorite_child_warning_str, trav->xlator->name,
- trav->xlator->name, trav->xlator->name);
- priv->favorite_child = child_count;
- }
-
- child_count++;
- trav = trav->next;
- }
-
- priv->wait_count = 1;
-
- priv->child_count = child_count;
-
- LOCK_INIT (&priv->lock);
- LOCK_INIT (&priv->read_child_lock);
-
- priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
+ GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);
+ GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);
+ afr_handle_anon_inode_options(priv, this->options);
+
+ GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out);
+ if (priv->quorum_count != 0)
+ priv->consistent_io = _gf_false;
+
+ priv->wait_count = 1;
+
+ priv->local = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char);
+ if (!priv->local) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
+
+ priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
+
+ priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count,
+ gf_afr_mt_child_latency_t);
+ priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count,
gf_afr_mt_char);
- if (!priv->child_up) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- ret = -ENOMEM;
- goto out;
- }
-
- priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
- gf_afr_mt_xlator_t);
- if (!priv->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- ret = -ENOMEM;
- goto out;
- }
-
- priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key),
- child_count,
- gf_afr_mt_char);
- if (!priv->pending_key) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- ret = -ENOMEM;
- goto out;
+
+ if (!priv->child_up || !priv->child_latency || !priv->halo_child_up ||
+ !priv->anon_inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /*Initialize to -ve ping timeout so that they are not considered
+ * in child-up events until ping-event comes*/
+ for (i = 0; i < child_count; i++)
+ priv->child_latency[i] = -1;
+
+ priv->children = GF_CALLOC(sizeof(xlator_t *), child_count,
+ gf_afr_mt_xlator_t);
+ if (!priv->children) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = afr_pending_xattrs_init(priv, this);
+ if (ret)
+ goto out;
+
+ trav = this->children;
+ i = 0;
+ while (i < child_count) {
+ priv->children[i] = trav->xlator;
+ trav = trav->next;
+ i++;
+ }
+
+ ret = gf_asprintf(&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, this->name);
+ if (-1 == ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ priv->last_event = GF_CALLOC(child_count, sizeof(*priv->last_event),
+ gf_afr_mt_int32_t);
+ if (!priv->last_event) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this);
+ if (!this->itable) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (priv->shd.iamshd) {
+ ret = afr_selfheal_daemon_init(this);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
}
+ }
+
+ /* keep more local here as we may need them for self-heal etc */
+ this->local_pool = mem_pool_new(afr_local_t, 512);
+ if (!this->local_pool) {
+ ret = -1;
+ goto out;
+ }
+
+ priv->root_inode = NULL;
- trav = this->children;
- i = 0;
- while (i < child_count) {
- priv->children[i] = trav->xlator;
-
- ret = gf_asprintf (&priv->pending_key[i], "%s.%s",
- AFR_XATTR_PREFIX,
- trav->xlator->name);
- if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed to set pending key");
- ret = -ENOMEM;
- goto out;
- }
-
- trav = trav->next;
- i++;
- }
-
- LOCK_INIT (&priv->root_inode_lk);
- priv->first_lookup = 1;
- priv->root_inode = NULL;
-
- pthread_mutex_init (&priv->mutex, NULL);
- INIT_LIST_HEAD (&priv->saved_fds);
-
- ret = 0;
+ ret = 0;
out:
- return ret;
+ return ret;
+}
+void
+afr_destroy_healer_object(xlator_t *this, struct subvol_healer *healer)
+{
+ int ret = -1;
+
+ if (!healer)
+ return;
+
+ if (healer->running) {
+ /*
+ * If there are any resources to cleanup, We need
+ * to do that gracefully using pthread_cleanup_push
+ */
+ ret = gf_thread_cleanup_xint(healer->thread);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SELF_HEAL_FAILED,
+ "Failed to clean up healer threads.");
+ healer->thread = 0;
+ }
+ pthread_cond_destroy(&healer->cond);
+ pthread_mutex_destroy(&healer->mutex);
}
-
-int
-fini (xlator_t *this)
+void
+afr_selfheal_daemon_fini(xlator_t *this)
{
- return 0;
+ struct subvol_healer *healer = NULL;
+ afr_self_heald_t *shd = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+ if (!priv)
+ return;
+
+ shd = &priv->shd;
+ if (!shd->iamshd)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ afr_destroy_healer_object(this, healer);
+
+ healer = &shd->full_healers[i];
+ afr_destroy_healer_object(this, healer);
+
+ if (shd->statistics[i])
+ eh_destroy(shd->statistics[i]);
+ }
+ GF_FREE(shd->index_healers);
+ GF_FREE(shd->full_healers);
+ GF_FREE(shd->statistics);
+ if (shd->split_brain)
+ eh_destroy(shd->split_brain);
}
+void
+fini(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ afr_selfheal_daemon_fini(this);
+ GF_ASSERT(list_empty(&priv->saved_locks));
+ LOCK(&priv->lock);
+ if (priv->timer != NULL) {
+ gf_timer_call_cancel(this->ctx, priv->timer);
+ priv->timer = NULL;
+ }
+ UNLOCK(&priv->lock);
+
+ if (this->local_pool != NULL) {
+ mem_pool_destroy(this->local_pool);
+ this->local_pool = NULL;
+ }
+
+ this->private = NULL;
+ afr_priv_destroy(priv);
+ if (this->itable) {
+ inode_table_destroy(this->itable);
+ this->itable = NULL;
+ }
+
+ return;
+}
struct xlator_fops fops = {
- .lookup = afr_lookup,
- .open = afr_open,
- .lk = afr_lk,
- .flush = afr_flush,
- .statfs = afr_statfs,
- .fsync = afr_fsync,
- .fsyncdir = afr_fsyncdir,
- .xattrop = afr_xattrop,
- .fxattrop = afr_fxattrop,
- .inodelk = afr_inodelk,
- .finodelk = afr_finodelk,
- .entrylk = afr_entrylk,
- .fentrylk = afr_fentrylk,
-
- /* inode read */
- .access = afr_access,
- .stat = afr_stat,
- .fstat = afr_fstat,
- .readlink = afr_readlink,
- .getxattr = afr_getxattr,
- .readv = afr_readv,
-
- /* inode write */
- .writev = afr_writev,
- .truncate = afr_truncate,
- .ftruncate = afr_ftruncate,
- .setxattr = afr_setxattr,
- .setattr = afr_setattr,
- .fsetattr = afr_fsetattr,
- .removexattr = afr_removexattr,
-
- /* dir read */
- .opendir = afr_opendir,
- .readdir = afr_readdir,
- .readdirp = afr_readdirp,
-
- /* dir write */
- .create = afr_create,
- .mknod = afr_mknod,
- .mkdir = afr_mkdir,
- .unlink = afr_unlink,
- .rmdir = afr_rmdir,
- .link = afr_link,
- .symlink = afr_symlink,
- .rename = afr_rename,
+ .lookup = afr_lookup,
+ .lk = afr_lk,
+ .flush = afr_flush,
+ .statfs = afr_statfs,
+ .fsyncdir = afr_fsyncdir,
+ .inodelk = afr_inodelk,
+ .finodelk = afr_finodelk,
+ .entrylk = afr_entrylk,
+ .fentrylk = afr_fentrylk,
+ .ipc = afr_ipc,
+ .lease = afr_lease,
+
+ /* inode read */
+ .access = afr_access,
+ .stat = afr_stat,
+ .fstat = afr_fstat,
+ .readlink = afr_readlink,
+ .getxattr = afr_getxattr,
+ .fgetxattr = afr_fgetxattr,
+ .readv = afr_readv,
+ .seek = afr_seek,
+
+ /* inode write */
+ .writev = afr_writev,
+ .truncate = afr_truncate,
+ .ftruncate = afr_ftruncate,
+ .setxattr = afr_setxattr,
+ .fsetxattr = afr_fsetxattr,
+ .setattr = afr_setattr,
+ .fsetattr = afr_fsetattr,
+ .removexattr = afr_removexattr,
+ .fremovexattr = afr_fremovexattr,
+ .fallocate = afr_fallocate,
+ .discard = afr_discard,
+ .zerofill = afr_zerofill,
+ .xattrop = afr_xattrop,
+ .fxattrop = afr_fxattrop,
+ .fsync = afr_fsync,
+
+ /*inode open*/
+ .opendir = afr_opendir,
+ .open = afr_open,
+
+ /* dir read */
+ .readdir = afr_readdir,
+ .readdirp = afr_readdirp,
+
+ /* dir write */
+ .create = afr_create,
+ .mknod = afr_mknod,
+ .mkdir = afr_mkdir,
+ .unlink = afr_unlink,
+ .rmdir = afr_rmdir,
+ .link = afr_link,
+ .symlink = afr_symlink,
+ .rename = afr_rename,
};
-
struct xlator_dumpops dumpops = {
- .priv = afr_priv_dump,
+ .priv = afr_priv_dump,
};
-
struct xlator_cbks cbks = {
- .release = afr_release,
- .releasedir = afr_releasedir,
+ .release = afr_release,
+ .releasedir = afr_releasedir,
+ .forget = afr_forget,
};
-
struct volume_options options[] = {
- { .key = {"read-subvolume" },
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = {"favorite-child"},
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = {"background-self-heal-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
- },
- { .key = {"data-self-heal"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"data-self-heal-algorithm"},
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"data-self-heal-window-size"},
- .type = GF_OPTION_TYPE_INT,
- .min = 1,
- .max = 1024
- },
- { .key = {"metadata-self-heal"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"entry-self-heal"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"data-change-log"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"metadata-change-log"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"entry-change-log"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"data-lock-server-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
- },
- { .key = {"metadata-lock-server-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
- },
- { .key = {"entry-lock-server-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
- },
- { .key = {"strict-readdir"},
- .type = GF_OPTION_TYPE_BOOL,
- },
- { .key = {NULL} },
+ {.key = {"read-subvolume"},
+ .type = GF_OPTION_TYPE_XLATOR,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. Afr will prefer the one specified using "
+ "this option if it is not stale. Option value must be "
+ "one of the xlator names of the children. "
+ "Ex: <volname>-client-0 till "
+ "<volname>-client-<number-of-bricks - 1>"},
+ {.key = {"read-subvolume-index"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "-1",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. AFR will prefer the one specified using "
+ "this option if it is not stale. allowed options"
+ " include -1 till replica-count - 1"},
+ {.key = {"read-hash-mode"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 5,
+ .default_value = "1",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description =
+ "inode-read fops happen only on one of the bricks in "
+ "replicate. AFR will prefer the one computed using "
+ "the method specified using this option.\n"
+ "0 = first readable child of AFR, starting from 1st child.\n"
+ "1 = hash by GFID of file (all clients use "
+ "same subvolume).\n"
+ "2 = hash by GFID of file and client PID.\n"
+ "3 = brick having the least outstanding read requests.\n"
+ "4 = brick having the least network ping latency.\n"
+ "5 = Hybrid mode between 3 and 4, ie least value among "
+ "network-latency multiplied by outstanding-read-requests."},
+ {
+ .key = {"choose-local"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "true",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Choose a local subvolume (i.e. Brick) to read from"
+ " if read-subvolume is not explicitly set.",
+ },
+ {.key = {"background-self-heal-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 256,
+ .default_value = "8",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This specifies the number of per client self-heal "
+ "jobs that can perform parallel heals in the "
+ "background."},
+ {.key = {"halo-shd-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "99999",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Maximum latency for shd halo replication in msec."},
+ {.key = {"halo-enabled"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "False",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Enable Halo (geo) replication mode."},
+ {.key = {"halo-nfsd-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "5",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Maximum latency for nfsd halo replication in msec."},
+ {.key = {"halo-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = AFR_HALO_MAX_LATENCY,
+ .default_value = "5",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Maximum latency for halo replication in msec."},
+ {.key = {"halo-max-replicas"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "99999",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "The maximum number of halo replicas; replicas"
+ " beyond this value will be written asynchronously"
+ "via the SHD."},
+ {.key = {"halo-min-replicas"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "2",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "The minimmum number of halo replicas, before adding "
+ "out of region replicas."},
+ {.key = {"heal-wait-queue-length"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
+ .default_value = "128",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .op_version = {GD_OP_VERSION_3_7_10},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This specifies the number of heals that can be queued"
+ " for the parallel background self heal jobs."},
+ {.key = {"data-self-heal"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false",
+ "disable", "open"},
+ .default_value = "off",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Using this option we can enable/disable data "
+ "self-heal on the file. \"open\" means data "
+ "self-heal action will only be triggered by file "
+ "open operations."},
+ {.key = {"data-self-heal-algorithm"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Select between \"full\", \"diff\". The "
+ "\"full\" algorithm copies the entire file from "
+ "source to sink. The \"diff\" algorithm copies to "
+ "sink only those blocks whose checksums don't match "
+ "with those of source. If no option is configured "
+ "the option is chosen dynamically as follows: "
+ "If the file does not exist on one of the sinks "
+ "or empty file exists or if the source file size is "
+ "about the same as page size the entire file will "
+ "be read and written i.e \"full\" algo, "
+ "otherwise \"diff\" algo is chosen.",
+ .value = {"diff", "full"}},
+ {.key = {"data-self-heal-window-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 1024,
+ .default_value = "1",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Maximum number blocks per file for which self-heal "
+ "process would be applied simultaneously."},
+ {.key = {"metadata-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.validate_fn = validate_replica*/
+ .description = "Using this option we can enable/disable metadata "
+ "i.e. Permissions, ownerships, xattrs self-heal on "
+ "the file/directory."},
+ {.key = {"entry-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.validate_fn = validate_replica*/
+ .description = "Using this option we can enable/disable entry "
+ "self-heal on the directory."},
+ {.key = {"data-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"metadata-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"entry-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"optimistic-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Entry/Metadata fops will not perform "
+ "pre fop changelog operations in afr transaction "
+ "if this option is enabled."},
+ {.key = {"inodelk-trace"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enabling this option logs inode lock/unlocks"},
+ {.key = {"entrylk-trace"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enabling this option logs entry lock/unlocks"},
+ {.key = {"pre-op-compat"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Use separate pre-op xattrop() FOP rather than "
+ "overloading xdata of the OP"},
+ {.key = {"eager-lock"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description =
+ "Enable/Disable eager lock for replica volume. "
+ "Lock phase of a transaction has two sub-phases. "
+ "First is an attempt to acquire locks in parallel by "
+ "broadcasting non-blocking lock requests. If lock "
+ "acquisition fails on any server, then the held locks "
+ "are unlocked and we revert to a blocking locks mode "
+ "sequentially on one server after another. If this "
+ "option is enabled the initial broadcasting lock "
+ "request attempts to acquire a full lock on the entire file. "
+ "If this fails, we revert back to the sequential "
+ "\"regional\" blocking locks as before. In the case "
+ "where such an \"eager\" lock is granted in the "
+ "non-blocking phase, it gives rise to an opportunity "
+ "for optimization. i.e, if the next write transaction "
+ "on the same FD arrives before the unlock phase of "
+ "the first transaction, it \"takes over\" the full "
+ "file lock. Similarly if yet another data transaction "
+ "arrives before the unlock phase of the \"optimized\" "
+ "transaction, that in turn \"takes over\" the lock as "
+ "well. The actual unlock now happens at the end of "
+ "the last \"optimized\" transaction."
+
+ },
+ {.key = {"self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ /*.validate_fn = validate_replica_heal_enable_disable*/
+ .description = "This option applies to only self-heal-daemon. "
+ "Index directory crawl and automatic healing of files "
+ "will not be performed if this option is turned off."},
+ {.key = {"iam-self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the replicate "
+ "translator is running as part of self-heal-daemon "
+ "or not."},
+ {.key = {"iam-nfs-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the replicate "
+ "translator is running as part of an NFS daemon "
+ "or not."},
+ {
+ .key = {"quorum-type"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"none", "auto", "fixed"},
+ .default_value = "none",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.option = quorum-type*/
+ .description = "If value is \"fixed\" only allow writes if "
+ "quorum-count bricks are present. If value is "
+ "\"auto\" only allow writes if more than half of "
+ "bricks, or exactly half including the first, are "
+ "present.",
+ },
+ {
+ .key = {"quorum-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = INT_MAX,
+ .default_value = 0,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.option = quorum-count*/
+ /*.validate_fn = validate_quorum_count*/
+ .description = "If quorum-type is \"fixed\" only allow writes if "
+ "this many bricks are present. Other quorum types "
+ "will OVERWRITE this value.",
+ },
+ {
+ .key = {"quorum-reads"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option has been removed. Reads are not allowed "
+ "if quorum is not met.",
+ },
+ {
+ .key = {"node-uuid"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Local glusterd uuid string, used in starting "
+ "self-heal-daemon so that it can crawl only on "
+ "local index directories.",
+ },
+ {
+ .key = {"post-op-delay-secs"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = INT_MAX,
+ .default_value = "1",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Time interval induced artificially before "
+ "post-operation phase of the transaction to "
+ "enhance overlap of adjacent write operations.",
+ },
+ {
+ .key = {AFR_SH_READDIR_SIZE_KEY},
+ .type = GF_OPTION_TYPE_SIZET,
+ .description = "readdirp size for performing entry self-heal",
+ .min = 1024,
+ .max = 131072,
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .default_value = "1KB",
+ },
+ {
+ .key = {"ensure-durability"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .op_version = {3},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Afr performs fsyncs for transactions if this "
+ "option is on to make sure the changelogs/data is "
+ "written to the disk",
+ .default_value = "on",
+ },
+ {
+ .key = {"afr-dirty-xattr"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = AFR_DIRTY_DEFAULT,
+ },
+ {.key = {"afr-pending-xattr"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Comma separated list of xattrs that are used to "
+ "capture information on pending heals."},
+ {
+ .key = {"metadata-splitbrain-forced-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
+ {.key = {"heal-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 5,
+ .max = INT_MAX,
+ .default_value = "600",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "time interval for checking the need to self-heal "
+ "in self-heal-daemon"},
+ {
+ .key = {"consistent-metadata"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "If this option is enabled, readdirp will force "
+ "lookups on those entries read whose read child is "
+ "not the same as that of the parent. This will "
+ "guarantee that all read operations on a file serve "
+ "attributes from the same subvol as long as it holds "
+ " a good copy of the file/dir.",
+ },
+ {.key = {"arbiter-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .description = "subset of child_count. Has to be 0 or 1."},
+ {
+ .key = {"thin-arbiter"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {GD_OP_VERSION_4_1_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "contains host:path of thin abriter brick",
+ },
+ {.key = {"shd-max-threads"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 64,
+ .default_value = "1",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Maximum number of parallel heals SHD can do per "
+ "local brick. This can substantially lower heal times"
+ ", but can also crush your bricks if you don't have "
+ "the storage hardware to support this."},
+ {
+ .key = {"shd-wait-qlength"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 655536,
+ .default_value = "1024",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option can be used to control number of heals"
+ " that can wait in SHD per subvolume",
+ },
+ {
+ .key = {"locking-scheme"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"full", "granular"},
+ .default_value = "full",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "If this option is set to granular, self-heal will "
+ "stop being compatible with afr-v1, which helps afr "
+ "be more granular while self-healing",
+ },
+ {.key = {"full-lock"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "yes",
+ .op_version = {GD_OP_VERSION_3_13_2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "If this option is disabled, then the IOs will take "
+ "range locks same as versions till 3.13.1."},
+ {
+ .key = {"granular-entry-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_8_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "If this option is enabled, self-heal will resort to "
+ "granular way of recording changelogs and doing entry "
+ "self-heal.",
+ },
+ {
+ .key = {"favorite-child-policy"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"none", "size", "ctime", "mtime", "majority"},
+ .default_value = "none",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option can be used to automatically resolve "
+ "split-brains using various policies without user "
+ "intervention. \"size\" picks the file with the "
+ "biggest size as the source. \"ctime\" and \"mtime\" "
+ "pick the file with the latest ctime and mtime "
+ "respectively as the source. \"majority\" picks a file"
+ " with identical mtime and size in more than half the "
+ "number of bricks in the replica.",
+ },
+ {
+ .key = {"consistent-io"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .description = "If this option is enabled, i/o will fail even if "
+ "one of the bricks is down in the replicas",
+ },
+ {.key = {"use-compound-fops"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_8_4},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"use-anonymous-inode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_8_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "Setting this option heals directory renames efficiently"},
+
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "replicate",
+ .category = GF_MAINTAINED,
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 758ac789aff..d62f9a9caf2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1,922 +1,1423 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
#ifndef __AFR_H__
#define __AFR_H__
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "call-stub.h"
-#include "compat-errno.h"
+#include <glusterfs/call-stub.h>
+#include <glusterfs/compat-errno.h>
#include "afr-mem-types.h"
-#define AFR_XATTR_PREFIX "trusted.afr"
+#include "libxlator.h"
+#include <glusterfs/timer.h>
+#include <glusterfs/syncop.h>
+
+#include "afr-self-heald.h"
+#include "afr-messages.h"
+
+#define SHD_INODE_LRU_LIMIT 1
+#define AFR_PATHINFO_HEADER "REPLICATE:"
+#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
+#define AFR_DIRTY (((afr_private_t *)(THIS->private))->afr_dirty)
+
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
+#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
+#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/
+
+#define ARBITER_BRICK_INDEX 2
+#define THIN_ARBITER_BRICK_INDEX 2
+#define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify"
+#define AFR_TA_DOM_MODIFY "afr.ta.dom-modify"
+
+#define AFR_LK_HEAL_DOM "afr.lock-heal.domain"
+
+#define AFR_HALO_MAX_LATENCY 99999
+#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode"
+
+#define PFLAG_PENDING (1 << 0)
+#define PFLAG_SBRAIN (1 << 1)
+
+typedef int (*afr_lock_cbk_t)(call_frame_t *frame, xlator_t *this);
+
+typedef int (*afr_read_txn_wind_t)(call_frame_t *frame, xlator_t *this,
+ int subvol);
+
+typedef int (*afr_inode_refresh_cbk_t)(call_frame_t *frame, xlator_t *this,
+ int err);
+
+typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);
+
+#define AFR_COUNT(array, max) \
+ ({ \
+ int __i; \
+ int __res = 0; \
+ for (__i = 0; __i < max; __i++) \
+ if (array[__i]) \
+ __res++; \
+ __res; \
+ })
+#define AFR_INTERSECT(dst, src1, src2, max) \
+ ({ \
+ int __i; \
+ for (__i = 0; __i < max; __i++) \
+ dst[__i] = src1[__i] && src2[__i]; \
+ })
+#define AFR_CMP(a1, a2, len) \
+ ({ \
+ int __cmp = 0; \
+ int __i; \
+ for (__i = 0; __i < len; __i++) \
+ if (a1[__i] != a2[__i]) { \
+ __cmp = 1; \
+ break; \
+ } \
+ __cmp; \
+ })
+#define AFR_IS_ARBITER_BRICK(priv, index) \
+ ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX))
+
+#define AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(ret, errnum) \
+ do { \
+ local->op_ret = ret; \
+ local->op_errno = errnum; \
+ if (local->op_errno == EIO) \
+ gf_msg(this->name, GF_LOG_ERROR, local->op_errno, \
+ AFR_MSG_SPLIT_BRAIN, \
+ "Failing %s on gfid %s: " \
+ "split-brain observed.", \
+ gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \
+ } while (0)
+
+#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \
+ do { \
+ afr_fd_ctx_t *__fd_ctx = NULL; \
+ __fd_ctx = afr_fd_ctx_get(__fd, __this); \
+ if (__fd_ctx && __fd_ctx->is_fd_bad) { \
+ __error = EBADF; \
+ goto __label; \
+ } \
+ } while (0)
-struct _pump_private;
+typedef enum {
+ AFR_READ_POLICY_FIRST_UP,
+ AFR_READ_POLICY_GFID_HASH,
+ AFR_READ_POLICY_GFID_PID_HASH,
+ AFR_READ_POLICY_LESS_LOAD,
+ AFR_READ_POLICY_LEAST_LATENCY,
+ AFR_READ_POLICY_LOAD_LATENCY_HYBRID,
+} afr_read_hash_mode_t;
-typedef struct _afr_private {
- gf_lock_t lock; /* to guard access to child_count, etc */
- unsigned int child_count; /* total number of children */
+typedef enum {
+ AFR_FAV_CHILD_NONE,
+ AFR_FAV_CHILD_BY_SIZE,
+ AFR_FAV_CHILD_BY_CTIME,
+ AFR_FAV_CHILD_BY_MTIME,
+ AFR_FAV_CHILD_BY_MAJORITY,
+ AFR_FAV_CHILD_POLICY_MAX,
+} afr_favorite_child_policy;
- unsigned int read_child_rr; /* round-robin index of the read_child */
- gf_lock_t read_child_lock; /* lock to protect above */
+typedef enum {
+ AFR_SELFHEAL_DATA_FULL = 0,
+ AFR_SELFHEAL_DATA_DIFF,
+ AFR_SELFHEAL_DATA_DYNAMIC,
+} afr_data_self_heal_type_t;
- xlator_t **children;
+typedef enum {
+ AFR_CHILD_UNKNOWN = -1,
+ AFR_CHILD_ZERO,
+ AFR_CHILD_ONE,
+ AFR_CHILD_THIN_ARBITER,
+} afr_child_index;
- gf_lock_t root_inode_lk;
- int first_lookup;
- inode_t *root_inode;
+typedef enum {
+ TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall
+ notification and waiting for its release.*/
+ TA_GET_INFO_FROM_TA_FILE, /*FOP needs post-op on ta file to get
+ *info about which brick is bad.*/
+ TA_INFO_IN_MEMORY_SUCCESS, /*Bad brick info is in memory and fop failed
+ *on BAD brick - Success*/
+ TA_INFO_IN_MEMORY_FAILED, /*Bad brick info is in memory and fop failed
+ *on GOOD brick - Failed*/
+ TA_SUCCESS, /*FOP succeeded on both data bricks.*/
+} afr_ta_fop_state_t;
+
+struct afr_nfsd {
+ uint32_t halo_max_latency_msec;
+ gf_boolean_t iamnfsd;
+};
+
+typedef struct _afr_lk_heal_info {
+ fd_t *fd;
+ int32_t cmd;
+ struct gf_flock flock;
+ dict_t *xdata_req;
+ unsigned char *locked_nodes;
+ struct list_head pos;
+ gf_lkowner_t lk_owner;
+ pid_t pid;
+ int32_t *child_up_event_gen;
+ int32_t *child_down_event_gen;
+} afr_lk_heal_info_t;
- unsigned char *child_up;
+typedef struct _afr_private {
+ gf_lock_t lock; /* to guard access to child_count, etc */
+ unsigned int child_count; /* total number of children */
+ unsigned int arbiter_count; /*subset of child_count.
+ Has to be 0 or 1.*/
+
+ xlator_t **children;
+
+ inode_t *root_inode;
+
+ int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
+ /* For thin-arbiter. */
+ uuid_t ta_gfid;
+ unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
+ int ta_bad_child_index;
+ int ta_event_gen;
+ unsigned int ta_in_mem_txn_count;
+ unsigned int ta_on_wire_txn_count;
+ struct list_head ta_waitq;
+ struct list_head ta_onwireq;
+
+ unsigned char *anon_inode;
+ unsigned char *child_up;
+ unsigned char *halo_child_up;
+ int64_t *child_latency;
+ unsigned char *local;
+
+ char **pending_key;
+
+ afr_data_self_heal_type_t data_self_heal_algorithm;
+ unsigned int data_self_heal_window_size; /* max number of pipelined
+ read/writes */
+
+ struct list_head heal_waiting; /*queue for files that need heal*/
+ uint32_t heal_wait_qlen; /*configurable queue length for heal_waiting*/
+ int32_t heal_waiters; /* No. of elements currently in wait queue.*/
+
+ struct list_head healing; /* queue for files that are undergoing
+ background heal*/
+ uint32_t background_self_heal_count; /*configurable queue length for
+ healing queue*/
+ int32_t healers; /* No. of elements currently undergoing background
+ heal*/
+
+ gf_boolean_t release_ta_notify_dom_lock;
+
+ gf_boolean_t metadata_self_heal; /* on/off */
+ gf_boolean_t entry_self_heal; /* on/off */
+
+ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
+ int read_child; /* read-subvolume */
+ gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
+
+ gf_timer_t *timer; /* launched when parent up is received */
+
+ unsigned int wait_count; /* # of servers to wait for success */
+
+ unsigned char ta_child_up;
+ gf_boolean_t optimistic_change_log;
+ gf_boolean_t eager_lock;
+ gf_boolean_t pre_op_compat; /* on/off */
+ uint32_t post_op_delay_secs;
+ unsigned int quorum_count;
+
+ off_t ta_notify_dom_lock_offset;
+ afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic
+ resolution of split-brains.*/
+ afr_read_hash_mode_t hash_mode; /* for when read_child is not set */
+
+ int32_t *last_event;
+
+ /* @event_generation: Keeps count of number of events received which can
+ potentially impact consistency decisions. The events are CHILD_UP
+ and CHILD_DOWN, when we have to recalculate the freshness/staleness
+ of copies to detect if changes had happened while the other server
+ was down. CHILD_DOWN and CHILD_UP can also be received on network
+ disconnect/reconnects and not necessarily server going down/up.
+ Recalculating freshness/staleness on network events is equally
+ important as we might have had a network split brain.
+ */
+ uint32_t event_generation;
+ char vol_uuid[UUID_SIZE + 1];
+
+ gf_boolean_t choose_local;
+ gf_boolean_t did_discovery;
+ gf_boolean_t ensure_durability;
+ gf_boolean_t halo_enabled;
+ gf_boolean_t consistent_metadata;
+ gf_boolean_t need_heal;
+ gf_boolean_t granular_locks;
+ uint64_t sh_readdir_size;
+ char *sh_domain;
+ char *afr_dirty;
+
+ uint64_t spb_choice_timeout;
+
+ afr_self_heald_t shd;
+ struct afr_nfsd nfsd;
+
+ uint32_t halo_max_latency_msec;
+ uint32_t halo_max_replicas;
+ uint32_t halo_min_replicas;
+
+ gf_boolean_t full_lock;
+ gf_boolean_t esh_granular;
+ gf_boolean_t consistent_io;
+ gf_boolean_t data_self_heal; /* on/off */
+ gf_boolean_t use_anon_inode;
+
+ /*For lock healing.*/
+ struct list_head saved_locks;
+ struct list_head lk_healq;
+
+ /*For anon-inode handling */
+ char anon_inode_name[NAME_MAX + 1];
+ char anon_gfid_str[UUID_SIZE + 1];
+} afr_private_t;
- char **pending_key;
+typedef enum {
+ AFR_DATA_TRANSACTION, /* truncate, write, ... */
+ AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
+ AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
+ AFR_ENTRY_RENAME_TRANSACTION, /* rename */
+} afr_transaction_type;
- gf_boolean_t data_self_heal; /* on/off */
- char * data_self_heal_algorithm; /* name of algorithm */
- unsigned int data_self_heal_window_size; /* max number of pipelined
- read/writes */
+/*
+ xattr format: trusted.afr.volume = [x y z]
+ x - data pending
+ y - metadata pending
+ z - entry pending
+*/
- unsigned int background_self_heal_count;
- unsigned int background_self_heals_started;
- gf_boolean_t metadata_self_heal; /* on/off */
- gf_boolean_t entry_self_heal; /* on/off */
+static inline int
+afr_index_for_transaction_type(afr_transaction_type type)
+{
+ switch (type) {
+ case AFR_DATA_TRANSACTION:
+ return 0;
- gf_boolean_t data_change_log; /* on/off */
- gf_boolean_t metadata_change_log; /* on/off */
- gf_boolean_t entry_change_log; /* on/off */
+ case AFR_METADATA_TRANSACTION:
+ return 1;
- int read_child; /* read-subvolume */
- unsigned int favorite_child; /* subvolume to be preferred in resolving
- split-brain cases */
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ return 2;
+ }
- unsigned int data_lock_server_count;
- unsigned int metadata_lock_server_count;
- unsigned int entry_lock_server_count;
+ return -1; /* make gcc happy */
+}
- gf_boolean_t inodelk_trace;
- gf_boolean_t entrylk_trace;
+static inline int
+afr_index_from_ia_type(ia_type_t type)
+{
+ switch (type) {
+ case IA_IFDIR:
+ return afr_index_for_transaction_type(AFR_ENTRY_TRANSACTION);
+ case IA_IFREG:
+ return afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+ default:
+ return -1;
+ }
+}
- gf_boolean_t strict_readdir;
+typedef struct {
+ struct gf_flock flock;
+ loc_t loc;
+ fd_t *fd;
+ char *basename;
+ unsigned char *locked_nodes;
+ int locked_count;
- unsigned int wait_count; /* # of servers to wait for success */
+} afr_lockee_t;
- uint64_t up_count; /* number of CHILD_UPs we have seen */
- uint64_t down_count; /* number of CHILD_DOWNs we have seen */
+int
+afr_entry_lockee_cmp(const void *l1, const void *l2);
- struct _pump_private *pump_private; /* Set if we are loaded as pump */
- int use_afr_in_pump;
+typedef struct {
+ loc_t *lk_loc;
- pthread_mutex_t mutex;
- struct list_head saved_fds; /* list of fds on which locks have succeeded */
-} afr_private_t;
+ afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
-typedef struct {
- /* External interface: These are variables (some optional) that
- are set by whoever has triggered self-heal */
+ const char *lk_basename;
+ const char *lower_basename;
+ const char *higher_basename;
- gf_boolean_t need_data_self_heal;
- gf_boolean_t need_metadata_self_heal;
- gf_boolean_t need_entry_self_heal;
+ unsigned char *lower_locked_nodes;
- gf_boolean_t forced_merge; /* Is this a self-heal triggered to
- forcibly merge the directories? */
+ afr_lock_cbk_t lock_cbk;
- gf_boolean_t healing_fd_opened; /* true if caller has already
- opened fd */
+ int lockee_count;
- gf_boolean_t data_lock_held; /* true if caller has already
- acquired 0-0 lock */
+ int32_t lk_call_count;
+ int32_t lk_expected_count;
+ int32_t lk_attempted_count;
- fd_t *healing_fd; /* set if callers has opened fd */
+ int32_t lock_op_ret;
+ int32_t lock_op_errno;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+ int32_t lock_count;
+ char lower_locked;
+ char higher_locked;
+} afr_internal_lock_t;
- gf_boolean_t background; /* do self-heal in background
- if possible */
+struct afr_reply {
+ int valid;
+ int32_t op_ret;
+ dict_t *xattr; /*For xattrop*/
+ dict_t *xdata;
+ struct iatt poststat;
+ struct iatt postparent;
+ struct iatt prestat;
+ struct iatt preparent;
+ struct iatt preparent2;
+ struct iatt postparent2;
+ int32_t op_errno;
+ /* For rchecksum */
+ uint8_t checksum[SHA256_DIGEST_LENGTH];
+ gf_boolean_t buf_has_zeroes;
+ gf_boolean_t fips_mode_rchecksum;
+ /* For lookup */
+ int8_t need_heal;
+};
- ia_type_t type; /* st_mode of the entry we're doing
- self-heal on */
+typedef enum {
+ AFR_FD_NOT_OPENED,
+ AFR_FD_OPENED,
+ AFR_FD_OPENING
+} afr_fd_open_status_t;
- /* Function to call to unwind. If self-heal is being done in the
- background, this function will be called as soon as possible. */
+typedef struct {
+ afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
+ int flags;
+
+ /* the subvolume on which the latest sequence of readdirs (starting
+ at offset 0) has begun. Till the next readdir request with 0 offset
+ arrives, we continue to read off this subvol.
+ */
+ int readdir_subvol;
+ /* lock-healing related members. */
+ gf_boolean_t is_fd_bad;
+ afr_lk_heal_info_t *lk_heal_info;
- int (*unwind) (call_frame_t *frame, xlator_t *this);
+} afr_fd_ctx_t;
- /* End of external interface members */
+typedef enum {
+ AFR_FOP_LOCK_PARALLEL,
+ AFR_FOP_LOCK_SERIAL,
+ AFR_FOP_LOCK_QUORUM_FAILED,
+} afr_fop_lock_state_t;
+
+typedef struct _afr_inode_lock_t {
+ /* @num_inodelks:
+ Number of inodelks queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ int32_t num_inodelks;
+ unsigned int event_generation;
+ gf_timer_t *delay_timer;
+ struct list_head owners; /*Transactions that are performing fop*/
+ struct list_head post_op; /*Transactions that are done with the fop
+ *So can not conflict with the fops*/
+ struct list_head waiting; /*Transaction that are waiting for
+ *conflicting transactions to complete*/
+ struct list_head frozen; /*Transactions that need to go as part of
+ * next batch of eager-lock*/
+ gf_boolean_t release;
+ gf_boolean_t acquired;
+} afr_lock_t;
+
+typedef struct _afr_inode_ctx {
+ uint64_t read_subvol;
+ uint64_t write_subvol;
+ int lock_count;
+ int spb_choice;
+ gf_timer_t *timer;
+ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+ int inherited[AFR_NUM_CHANGE_LOGS];
+ int on_disk[AFR_NUM_CHANGE_LOGS];
+ /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
+ afr_lock_t lock[2];
+
+ /* @open_fd_count:
+ Number of open FDs queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ uint32_t open_fd_count;
+ gf_boolean_t need_refresh;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+} afr_inode_ctx_t;
+typedef struct _afr_local {
+ glusterfs_fop_t op;
+ unsigned int call_count;
- /* array of stat's, one for each child */
- struct iatt *buf;
- struct iatt parentbuf;
+ /* @event_generation: copy of priv->event_generation taken at the
+ time of starting the transaction. The copy is made so that we
+ have a stable value through the various phases of the transaction.
+ */
+ unsigned int event_generation;
- /* array of xattr's, one for each child */
- dict_t **xattr;
+ uint32_t open_fd_count;
+ int32_t num_inodelks;
- /* array of errno's, one for each child */
- int *child_errno;
+ int32_t op_ret;
+ int32_t op_errno;
- int32_t **pending_matrix;
- int32_t **delta_matrix;
+ int dirty[AFR_NUM_CHANGE_LOGS];
- int *sources;
- int source;
- int active_source;
- int active_sinks;
- int *success;
- unsigned char *locked_nodes;
- int lock_count;
+ int32_t **pending;
- mode_t impunging_entry_mode;
- const char *linkname;
+ loc_t loc;
+ loc_t newloc;
- int op_failed;
+ fd_t *fd;
+ afr_fd_ctx_t *fd_ctx;
- int file_has_holes;
- blksize_t block_size;
- off_t file_size;
- off_t offset;
+ /* @child_up: copy of priv->child_up taken at the time of transaction
+ start. The copy is taken so that we have a stable child_up array
+ through the phases of the transaction as priv->child_up[i] can keep
+ changing through time.
+ */
+ unsigned char *child_up;
- loc_t parent_loc;
+ /* @read_attempted:
+ array of flags representing subvolumes where read operations of
+ the read transaction have already been attempted. The array is
+ first pre-filled with down subvolumes, and as reads are performed
+ on other subvolumes, those are set as well. This way if the read
+ operation fails we do not retry on that subvolume again.
+ */
+ unsigned char *read_attempted;
- call_frame_t *orig_frame;
- gf_boolean_t unwound;
+ /* @readfn:
- /* private data for the particular self-heal algorithm */
- void *private;
+ pointer to function which will perform the read operation on a given
+ subvolume. Used in read transactions.
+ */
- int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this);
+ afr_read_txn_wind_t readfn;
- int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);
+ /* @inode:
- call_frame_t *sh_frame;
-} afr_self_heal_t;
+ the inode on which the read txn is performed on. ref'ed and copied
+ from either fd->inode or loc.inode
+ */
+ inode_t *inode;
-typedef enum {
- AFR_DATA_TRANSACTION, /* truncate, write, ... */
- AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
- AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
- AFR_ENTRY_RENAME_TRANSACTION, /* rename */
-} afr_transaction_type;
+ /* @parent[2]:
-typedef enum {
- AFR_TRANSACTION_LK,
- AFR_SELFHEAL_LK,
-} transaction_lk_type_t;
+ parent inode[s] on which directory transactions are performed.
+ */
-typedef enum {
- AFR_LOCK_OP,
- AFR_UNLOCK_OP,
-} afr_lock_op_type_t;
+ inode_t *parent;
+ inode_t *parent2;
-typedef enum {
- AFR_DATA_SELF_HEAL_LK,
- AFR_METADATA_SELF_HEAL_LK,
- AFR_ENTRY_SELF_HEAL_LK,
-}selfheal_lk_type_t;
+ /* @readable:
-typedef enum {
- AFR_INODELK_TRANSACTION,
- AFR_INODELK_NB_TRANSACTION,
- AFR_ENTRYLK_TRANSACTION,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_INODELK_SELFHEAL,
- AFR_INODELK_NB_SELFHEAL,
- AFR_ENTRYLK_SELFHEAL,
- AFR_ENTRYLK_NB_SELFHEAL,
-} afr_lock_call_type_t;
+ array of flags representing servers from which a read can be
+ performed. This is the output of afr_inode_refresh()
+ */
+ unsigned char *readable;
+ unsigned char *readable2; /*For rename transaction*/
-/*
- xattr format: trusted.afr.volume = [x y z]
- x - data pending
- y - metadata pending
- z - entry pending
-*/
+ afr_inode_refresh_cbk_t refreshfn;
-static inline int
-afr_index_for_transaction_type (afr_transaction_type type)
-{
- switch (type) {
+ /* @refreshinode:
- case AFR_DATA_TRANSACTION:
- return 0;
+ Inode currently getting refreshed.
+ */
+ inode_t *refreshinode;
- case AFR_METADATA_TRANSACTION:
- return 1;
+ dict_t *xattr_req;
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- return 2;
- }
+ dict_t *dict;
- return -1; /* make gcc happy */
-}
+ int read_subvol; /* Current read subvolume */
+ int optimistic_change_log;
-typedef struct {
- loc_t *lk_loc;
- struct gf_flock lk_flock;
+ afr_internal_lock_t internal_lock;
- const char *lk_basename;
- const char *lower_basename;
- const char *higher_basename;
- char lower_locked;
- char higher_locked;
+ /*To handle setattr/setxattr on yet to be linked inode from dht*/
+ uuid_t refreshgfid;
- unsigned char *locked_nodes;
- unsigned char *lower_locked_nodes;
- unsigned char *inode_locked_nodes;
- unsigned char *entry_locked_nodes;
+ /* @refreshed:
- selfheal_lk_type_t selfheal_lk_type;
- transaction_lk_type_t transaction_lk_type;
+ the inode was "refreshed" (i.e, pending xattrs from all subvols
+ freshly inspected and inode ctx updated accordingly) as part of
+ this transaction already.
+ */
+ gf_boolean_t refreshed;
- int32_t lock_count;
- int32_t inodelk_lock_count;
- int32_t entrylk_lock_count;
+ gf_boolean_t update_num_inodelks;
+ gf_boolean_t update_open_fd_count;
- uint64_t lock_number;
- int32_t lk_call_count;
+ /*
+ @pre_op_compat:
- int32_t lock_op_ret;
- int32_t lock_op_errno;
+ compatibility mode of pre-op. send a separate pre-op and
+ op operations as part of transaction, rather than combining
+ */
- int (*lock_cbk) (call_frame_t *, xlator_t *);
+ gf_boolean_t pre_op_compat;
-} afr_internal_lock_t;
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
+
+ /* This write appended to the file. Nnot necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
+
+ /*
+ This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
+ */
+
+ struct {
+ struct {
+ struct statvfs buf;
+ unsigned char buf_set;
+ } statfs;
+
+ struct {
+ fd_t *fd;
+ int32_t flags;
+ } open;
+
+ struct {
+ struct gf_flock user_flock;
+ struct gf_flock ret_flock;
+ unsigned char *locked_nodes;
+ int32_t cmd;
+ /*For lock healing only.*/
+ unsigned char *dom_locked_nodes;
+ int32_t *dom_lock_op_ret;
+ int32_t *dom_lock_op_errno;
+ struct gf_flock *getlk_rsp;
+ } lk;
+
+ /* inode read */
+
+ struct {
+ int32_t mask;
+ int last_index; /* index of the child we tried previously */
+ } access;
+
+ struct {
+ int last_index;
+ } stat;
+
+ struct {
+ int last_index;
+ } fstat;
+
+ struct {
+ size_t size;
+ int last_index;
+ } readlink;
+
+ struct {
+ char *name;
+ long xattr_len;
+ int last_index;
+ } getxattr;
+
+ struct {
+ size_t size;
+ off_t offset;
+ int last_index;
+ uint32_t flags;
+ } readv;
+
+ /* dir read */
+
+ struct {
+ uint32_t *checksum;
+ int success_count;
+ int32_t op_ret;
+ int32_t op_errno;
+ } opendir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ size_t size;
+ off_t offset;
+ dict_t *dict;
+ int last_index;
+ gf_boolean_t failed;
+ } readdir;
+ /* inode write */
+
+ struct {
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } inode_wfop; // common structure for all inode-write-fops
+
+ struct {
+ struct iovec *vector;
+ struct iobref *iobref;
+ off_t offset;
+ int32_t op_ret;
+ int32_t count;
+ uint32_t flags;
+ } writev;
+
+ struct {
+ off_t offset;
+ } truncate;
+
+ struct {
+ off_t offset;
+ } ftruncate;
+
+ struct {
+ struct iatt in_buf;
+ int32_t valid;
+ } setattr;
+
+ struct {
+ struct iatt in_buf;
+ int32_t valid;
+ } fsetattr;
+
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } setxattr;
+
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } fsetxattr;
+
+ struct {
+ char *name;
+ } removexattr;
+
+ struct {
+ dict_t *xattr;
+ gf_xattrop_flags_t optype;
+ } xattrop;
+
+ /* dir write */
+
+ struct {
+ inode_t *inode;
+ struct iatt buf;
+ struct iatt preparent;
+ struct iatt postparent;
+ struct iatt prenewparent;
+ struct iatt postnewparent;
+ } dir_fop; // common structure for all dir fops
+
+ struct {
+ fd_t *fd;
+ dict_t *params;
+ int32_t flags;
+ mode_t mode;
+ } create;
+
+ struct {
+ dict_t *params;
+ dev_t dev;
+ mode_t mode;
+ } mknod;
+
+ struct {
+ dict_t *params;
+ int32_t mode;
+ } mkdir;
+
+ struct {
+ dict_t *params;
+ char *linkpath;
+ } symlink;
+
+ struct {
+ off_t offset;
+ size_t len;
+ int32_t mode;
+ } fallocate;
+
+ struct {
+ off_t offset;
+ size_t len;
+ } discard;
+
+ struct {
+ off_t offset;
+ off_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
+
+ struct {
+ char *volume;
+ int32_t cmd;
+ int32_t in_cmd;
+ struct gf_flock in_flock;
+ struct gf_flock flock;
+ void *xdata;
+ } inodelk;
+
+ struct {
+ char *volume;
+ char *basename;
+ void *xdata;
+ entrylk_cmd in_cmd;
+ entrylk_cmd cmd;
+ entrylk_type type;
+ } entrylk;
+
+ struct {
+ off_t offset;
+ gf_seek_what_t what;
+ } seek;
+
+ struct {
+ struct gf_lease user_lease;
+ struct gf_lease ret_lease;
+ unsigned char *locked_nodes;
+ } lease;
+
+ struct {
+ int flags;
+ } rmdir;
+
+ struct {
+ int32_t datasync;
+ } fsync;
+
+ struct {
+ uuid_t gfid_req;
+ gf_boolean_t needs_fresh_lookup;
+ } lookup;
+
+ } cont;
+
+ struct {
+ char *basename;
+ char *new_basename;
+
+ loc_t parent_loc;
+ loc_t new_parent_loc;
+
+ /* stub to resume on destruction
+ of the transaction frame */
+ call_stub_t *resume_stub;
+
+ struct list_head owner_list;
+ struct list_head wait_list;
+
+ unsigned char *pre_op;
+
+ /* Changelog xattr dict for [f]xattrop*/
+ dict_t **changelog_xdata;
+ unsigned char *pre_op_sources;
+
+ /* @failed_subvols: subvolumes on which a pre-op or a
+ FOP failed. */
+ unsigned char *failed_subvols;
+
+ call_frame_t *main_frame; /*Fop frame*/
+ call_frame_t *frame; /*Transaction frame*/
+
+ int (*wind)(call_frame_t *frame, xlator_t *this, int subvol);
+
+ int (*unwind)(call_frame_t *frame, xlator_t *this);
+
+ off_t start, len;
+
+ afr_transaction_type type;
+
+ int32_t in_flight_sb_errno; /* This is where the cause of the
+ failure on the last good copy of
+ the file is stored.
+ */
+
+ /* @changelog_resume: function to be called after changlogging
+ (either pre-op or post-op) is done
+ */
+ afr_changelog_resume_t changelog_resume;
+
+ gf_boolean_t eager_lock_on;
+ gf_boolean_t do_eager_unlock;
+
+ /* @dirtied: flag which indicates whether we set dirty flag
+ in the OP. Typically true when we are performing operation
+ on more than one subvol and optimistic changelog is disabled
+
+ A 'true' value set in @dirtied flag means an 'undirtying'
+ has to be done in POST-OP phase.
+ */
+ gf_boolean_t dirtied;
+
+ /* @inherited: flag which indicates that the dirty flags
+ of the previous transaction were inherited
+ */
+ gf_boolean_t inherited;
+
+ /*
+ @no_uninherit: flag which indicates that a pre_op_uninherit()
+ must _not_ be attempted (and returned as failure) always. This
+ flag is set when a hard pre-op is performed, but not accounted
+ for it in fd_ctx->on_disk[]. Such transactions are "isolated"
+ from the pre-op piggybacking entirely and therefore uninherit
+ must not be attempted.
+ */
+ gf_boolean_t no_uninherit;
+
+ gf_boolean_t in_flight_sb; /* Indicator for occurrence of
+ split-brain while in the middle of
+ a txn. */
+
+ /* @uninherit_done:
+ @uninherit_value:
+
+ The above pair variables make pre_op_uninherit() idempotent.
+ Both are FALSE initially. The first call to pre_op_uninherit
+ sets @uninherit_done to TRUE and the return value to
+ @uninherit_value. Further calls will check for @uninherit_done
+ to be TRUE and if so will simply return @uninherit_value.
+ */
+ gf_boolean_t uninherit_done;
+ gf_boolean_t uninherit_value;
+
+ gf_boolean_t disable_delayed_post_op;
+ } transaction;
+
+ syncbarrier_t barrier;
-typedef struct _afr_locked_fd {
- fd_t *fd;
- struct list_head list;
-} afr_locked_fd_t;
+ /* extra data for fops */
+ dict_t *xdata_req;
+ dict_t *xdata_rsp;
-typedef struct _afr_local {
- unsigned int call_count;
- unsigned int success_count;
- unsigned int enoent_count;
+ dict_t *xattr_rsp; /*for [f]xattrop*/
+
+ mode_t umask;
+ int xflag;
+ struct afr_reply *replies;
+
+ /* For client side background heals. */
+ struct list_head healer;
+ call_frame_t *heal_frame;
+
+ afr_inode_ctx_t *inode_ctx;
+
+ /*For thin-arbiter transactions.*/
+ int ta_failed_subvol;
+ int ta_event_gen;
+ struct list_head ta_waitq;
+ struct list_head ta_onwireq;
+ afr_ta_fop_state_t fop_state;
+ afr_fop_lock_state_t fop_lock_state;
+ gf_lkowner_t saved_lk_owner;
+ unsigned char read_txn_query_child;
+ unsigned char ta_child_up;
+ gf_boolean_t do_discovery;
+ gf_boolean_t need_full_crawl;
+ gf_boolean_t is_read_txn;
+ gf_boolean_t is_new_entry;
+} afr_local_t;
+typedef struct afr_spbc_timeout {
+ call_frame_t *frame;
+ loc_t *loc;
+ int spb_child_index;
+ gf_boolean_t d_spb;
+ gf_boolean_t m_spb;
+} afr_spbc_timeout_t;
+
+typedef struct afr_spb_status {
+ call_frame_t *frame;
+ loc_t *loc;
+} afr_spb_status_t;
+
+typedef struct afr_empty_brick_args {
+ call_frame_t *frame;
+ char *op_type;
+ loc_t loc;
+ int empty_index;
+} afr_empty_brick_args_t;
+
+typedef struct afr_read_subvol_args {
+ ia_type_t ia_type;
+ uuid_t gfid;
+} afr_read_subvol_args_t;
+
+typedef struct afr_granular_esh_args {
+ fd_t *heal_fd;
+ xlator_t *xl;
+ call_frame_t *frame;
+ gf_boolean_t mismatch; /* flag to represent occurrence of type/gfid
+ mismatch */
+} afr_granular_esh_args_t;
- unsigned int govinda_gOvinda;
+int
+afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type);
+int
+afr_inode_read_subvol_get(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
+int
+__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
- unsigned int read_child_index;
- unsigned char read_child_returned;
- unsigned int first_up_child;
+int
+__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvol,
+ int event_generation);
+int
+afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int event_generation);
- pid_t saved_pid;
+int
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
- int32_t op_ret;
- int32_t op_errno;
+int
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
- int32_t **pending;
+int
+afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
+ unsigned char *readable,
+ afr_read_subvol_args_t *args);
- loc_t loc;
- loc_t newloc;
+int
+afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type);
+int
+afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p,
+ unsigned char *readables, int *event_p,
+ afr_transaction_type type, afr_read_subvol_args_t *args);
- fd_t *fd;
+#define afr_data_subvol_get(i, t, s, r, e, a) \
+ afr_read_subvol_get(i, t, s, r, e, AFR_DATA_TRANSACTION, a)
- glusterfs_fop_t fop;
+#define afr_metadata_subvol_get(i, t, s, r, e, a) \
+ afr_read_subvol_get(i, t, s, r, e, AFR_METADATA_TRANSACTION, a)
- unsigned char *child_up;
+int
+afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, afr_inode_refresh_cbk_t cbk);
- int32_t *child_errno;
+int32_t
+afr_notify(xlator_t *this, int32_t event, void *data, void *data2);
- dict_t *xattr_req;
+int
+xattr_is_equal(dict_t *this, char *key1, data_t *value1, void *data);
- int32_t inodelk_count;
- int32_t entrylk_count;
+int
+afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
+ int child_count);
- afr_internal_lock_t internal_lock;
+int
+afr_add_inode_lockee(afr_local_t *local, int child_count);
- afr_locked_fd_t *locked_fd;
- int32_t source_child;
- int32_t lock_recovery_child;
+void
+afr_lockees_cleanup(afr_internal_lock_t *int_lock);
- dict_t *dict;
+int
+afr_attempt_lock_recovery(xlator_t *this, int32_t child_index);
- int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this);
+int
+afr_mark_locked_nodes(xlator_t *this, fd_t *fd, unsigned char *locked_nodes);
- /*
- This struct contains the arguments for the "continuation"
- (scheme-like) of fops
- */
+void
+afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner);
- int op;
- struct {
- struct {
- unsigned char buf_set;
- struct statvfs buf;
- } statfs;
+int
+afr_set_lock_number(call_frame_t *frame, xlator_t *this);
- struct {
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt postparent;
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- dict_t *xattr;
- dict_t **xattrs;
- gf_boolean_t is_revalidate;
- } lookup;
+int32_t
+afr_unlock(call_frame_t *frame, xlator_t *this);
- struct {
- int32_t flags;
- int32_t wbflags;
- } open;
+int
+afr_lock_nonblocking(call_frame_t *frame, xlator_t *this);
- struct {
- int32_t cmd;
- struct gf_flock user_flock;
- struct gf_flock ret_flock;
- unsigned char *locked_nodes;
- } lk;
-
- /* inode read */
+int
+afr_blocking_lock(call_frame_t *frame, xlator_t *this);
- struct {
- int32_t mask;
- int last_tried; /* index of the child we tried previously */
- } access;
-
- struct {
- int last_tried;
- ino_t ino;
- } stat;
-
- struct {
- int last_tried;
- ino_t ino;
- } fstat;
-
- struct {
- size_t size;
- int last_tried;
- ino_t ino;
- } readlink;
-
- struct {
- char *name;
- int last_tried;
- } getxattr;
-
- struct {
- ino_t ino;
- size_t size;
- off_t offset;
- int last_tried;
- } readv;
-
- /* dir read */
-
- struct {
- int success_count;
- int32_t op_ret;
- int32_t op_errno;
-
- uint32_t *checksum;
- } opendir;
-
- struct {
- int32_t op_ret;
- int32_t op_errno;
- size_t size;
- off_t offset;
-
- gf_boolean_t failed;
- int last_tried;
- } readdir;
-
- struct {
- int32_t op_ret;
- int32_t op_errno;
-
- size_t size;
- off_t offset;
- int32_t flag;
-
- int last_tried;
- } getdents;
-
- /* inode write */
-
- struct {
- ino_t ino;
- struct iatt prebuf;
- struct iatt postbuf;
-
- int32_t op_ret;
-
- struct iovec *vector;
- struct iobref *iobref;
- int32_t count;
- off_t offset;
- } writev;
-
- struct {
- ino_t ino;
- struct iatt prebuf;
- struct iatt postbuf;
- } fsync;
-
- struct {
- ino_t ino;
- off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
- } truncate;
-
- struct {
- ino_t ino;
- off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
- } ftruncate;
-
- struct {
- ino_t ino;
- struct iatt in_buf;
- int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
- } setattr;
-
- struct {
- ino_t ino;
- struct iatt in_buf;
- int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
- } fsetattr;
-
- struct {
- dict_t *dict;
- int32_t flags;
- } setxattr;
-
- struct {
- char *name;
- } removexattr;
-
- /* dir write */
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- fd_t *fd;
- dict_t *params;
- int32_t flags;
- mode_t mode;
- inode_t *inode;
- struct iatt buf;
- struct iatt preparent;
- struct iatt postparent;
- struct iatt read_child_buf;
- } create;
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- dev_t dev;
- mode_t mode;
- dict_t *params;
- inode_t *inode;
- struct iatt buf;
- struct iatt preparent;
- struct iatt postparent;
- struct iatt read_child_buf;
- } mknod;
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- int32_t mode;
- dict_t *params;
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preparent;
- struct iatt postparent;
- } mkdir;
-
- struct {
- ino_t parent_ino;
- int32_t op_ret;
- int32_t op_errno;
- struct iatt preparent;
- struct iatt postparent;
- } unlink;
-
- struct {
- int flags;
- ino_t parent_ino;
- int32_t op_ret;
- int32_t op_errno;
- struct iatt preparent;
- struct iatt postparent;
- } rmdir;
-
- struct {
- ino_t oldparent_ino;
- ino_t newparent_ino;
- ino_t ino;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preoldparent;
- struct iatt prenewparent;
- struct iatt postoldparent;
- struct iatt postnewparent;
- } rename;
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preparent;
- struct iatt postparent;
- } link;
-
- struct {
- ino_t ino;
- uint64_t gen;
- ino_t parent_ino;
- inode_t *inode;
- dict_t *params;
- struct iatt buf;
- struct iatt read_child_buf;
- char *linkpath;
- struct iatt preparent;
- struct iatt postparent;
- } symlink;
-
- struct {
- int32_t flags;
- dir_entry_t *entries;
- int32_t count;
- } setdents;
- } cont;
-
- struct {
- off_t start, len;
-
- char *basename;
- char *new_basename;
-
- loc_t parent_loc;
- loc_t new_parent_loc;
-
- afr_transaction_type type;
-
- int success_count;
- int erase_pending;
- int failure_count;
-
- int last_tried;
- int32_t *child_errno;
-
- call_frame_t *main_frame;
-
- int (*fop) (call_frame_t *frame, xlator_t *this);
-
- int (*done) (call_frame_t *frame, xlator_t *this);
-
- int (*resume) (call_frame_t *frame, xlator_t *this);
-
- int (*unwind) (call_frame_t *frame, xlator_t *this);
-
- /* post-op hook */
- } transaction;
-
- afr_self_heal_t self_heal;
-} afr_local_t;
+int
+afr_internal_lock_finish(call_frame_t *frame, xlator_t *this);
+int
+__afr_fd_ctx_set(xlator_t *this, fd_t *fd);
-typedef struct {
- unsigned int *pre_op_done;
- unsigned int *opened_on; /* which subvolumes the fd is open on */
- unsigned int *pre_op_piggyback;
+afr_fd_ctx_t *
+afr_fd_ctx_get(fd_t *fd, xlator_t *this);
- int flags;
- int32_t wbflags;
- uint64_t up_count; /* number of CHILD_UPs this fd has seen */
- uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
+int
+afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno);
- int32_t last_tried;
+int
+afr_locked_nodes_count(unsigned char *locked_nodes, int child_count);
- int hit, miss;
- gf_boolean_t failed_over;
- struct list_head entries; /* needed for readdir failover */
+int
+afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ gf_boolean_t *start_heal);
- unsigned char *locked_on; /* which subvolumes locks have been successful */
-} afr_fd_ctx_t;
+void
+afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv);
+void
+afr_local_cleanup(afr_local_t *local, xlator_t *this);
-/* try alloc and if it fails, goto label */
-#define ALLOC_OR_GOTO(var, type, label) do { \
- var = GF_CALLOC (sizeof (type), 1, \
- gf_afr_mt_##type); \
- if (!var) { \
- gf_log (this->name, GF_LOG_ERROR, \
- "out of memory :("); \
- op_errno = ENOMEM; \
- goto label; \
- } \
- } while (0);
+int
+afr_frame_return(call_frame_t *frame);
+int
+afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata);
-/* did a call fail due to a child failing? */
-#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
- ((op_errno == ENOTCONN) || \
- (op_errno == EBADFD)))
+void
+afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this);
-#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1)
+int
+afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
+
+#define AFR_STACK_UNWIND(fop, frame, op_ret, op_errno, params...) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ int32_t __op_ret = 0; \
+ int32_t __op_errno = 0; \
+ \
+ __op_ret = op_ret; \
+ __op_errno = op_errno; \
+ if (frame) { \
+ __local = frame->local; \
+ __this = frame->this; \
+ afr_handle_inconsistent_fop(frame, &__op_ret, &__op_errno); \
+ if (__local && __local->is_read_txn) \
+ afr_pending_read_decrement(__this->private, \
+ __local->read_subvol); \
+ if (__local && __local->xdata_req && \
+ afr_is_lock_mode_mandatory(__local->xdata_req)) \
+ afr_dom_lock_release(frame); \
+ frame->local = NULL; \
+ } \
+ \
+ STACK_UNWIND_STRICT(fop, frame, __op_ret, __op_errno, params); \
+ if (__local) { \
+ afr_local_cleanup(__local, __this); \
+ mem_put(__local); \
+ } \
+ } while (0)
+
+#define AFR_STACK_DESTROY(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ STACK_DESTROY(frame->root); \
+ if (__local) { \
+ afr_local_cleanup(__local, __this); \
+ mem_put(__local); \
+ } \
+ } while (0);
+
+#define AFR_FRAME_INIT(frame, op_errno) \
+ ({ \
+ frame->local = mem_get0(THIS->local_pool); \
+ if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \
+ afr_local_cleanup(frame->local, frame->this); \
+ mem_put(frame->local); \
+ frame->local = NULL; \
+ }; \
+ frame->local; \
+ })
+
+#define AFR_STACK_RESET(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ int __opr; \
+ STACK_RESET(frame->root); \
+ if (__local) { \
+ afr_local_cleanup(__local, __this); \
+ mem_put(__local); \
+ } \
+ AFR_FRAME_INIT(frame, __opr); \
+ } while (0)
-/* have we tried all children? */
-#define all_tried(i, count) ((i) == (count) - 1)
+/* allocate and return a string that is the basename of argument */
+static inline char *
+AFR_BASENAME(const char *str)
+{
+ char *__tmp_str = NULL;
+ char *__basename_str = NULL;
+ __tmp_str = gf_strdup(str);
+ __basename_str = gf_strdup(basename(__tmp_str));
+ GF_FREE(__tmp_str);
+ return __basename_str;
+}
-int32_t
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid);
+call_frame_t *
+afr_copy_frame(call_frame_t *base);
int
-pump_command_reply (call_frame_t *frame, xlator_t *this);
+afr_transaction_local_init(afr_local_t *local, xlator_t *this);
int32_t
-afr_notify (xlator_t *this, int32_t event,
- void *data, ...);
+afr_marker_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, afr_local_t *local, afr_private_t *priv);
int
-afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
+afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
int
-afr_save_locked_fd (xlator_t *this, fd_t *fd);
+afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count);
int
-afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
- unsigned char *locked_nodes);
+afr_higher_errno(int32_t old_errno, int32_t new_errno);
+
+int
+afr_final_errno(afr_local_t *local, afr_private_t *priv);
+
+int
+afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req);
void
-afr_set_lk_owner (call_frame_t *frame, xlator_t *this);
+afr_fix_open(fd_t *fd, xlator_t *this);
+
+afr_fd_ctx_t *
+afr_fd_ctx_get(fd_t *fd, xlator_t *this);
+void
+afr_set_low_priority(call_frame_t *frame);
int
-afr_set_lock_number (call_frame_t *frame, xlator_t *this);
+afr_child_fd_ctx_set(xlator_t *this, fd_t *fd, int32_t child, int flags);
+void
+afr_matrix_cleanup(int32_t **pending, unsigned int m);
-loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2);
+int32_t **
+afr_matrix_create(unsigned int m, unsigned int n);
-int32_t
-afr_unlock (call_frame_t *frame, xlator_t *this);
+int **
+afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending,
+ dict_t *xattr, ia_type_t iat);
+
+void
+afr_filter_xattrs(dict_t *xattr);
+
+/*
+ * Special value indicating we should use the "auto" quorum method instead of
+ * a fixed value (including zero to turn off quorum enforcement).
+ */
+#define AFR_QUORUM_AUTO INT_MAX
int
-afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this);
+afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local);
+
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode);
+
+void
+afr_reply_wipe(struct afr_reply *reply);
+
+void
+afr_replies_wipe(struct afr_reply *replies, int count);
+
+gf_boolean_t
+afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2);
+
+gf_boolean_t
+afr_is_xattr_ignorable(char *key);
int
-afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this);
+afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc);
int
-afr_blocking_lock (call_frame_t *frame, xlator_t *this);
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc);
int
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
+afr_get_split_brain_status(void *opaque);
+int
+afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque);
-int pump_start (call_frame_t *frame, xlator_t *this);
+int
+afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
+ int spb_choice);
+int
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol);
+int
+afr_get_child_index_from_name(xlator_t *this, char *name);
int
-afr_fd_ctx_set (xlator_t *this, fd_t *fd);
+afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb);
+int
+afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode);
-uint64_t
-afr_read_child (xlator_t *this, inode_t *inode);
+int
+afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque);
-void
-afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child);
+gf_boolean_t
+afr_get_need_heal(xlator_t *this);
void
-afr_build_parent_loc (loc_t *parent, loc_t *child);
+afr_set_need_heal(xlator_t *this, afr_local_t *local);
int
-afr_up_children_count (int child_count, unsigned char *child_up);
+afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd);
int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
-
-ino64_t
-afr_itransform (ino64_t ino, int child_count, int child_index);
+afr_get_msg_id(char *op_type);
int
-afr_deitransform (ino64_t ino, int child_count);
+afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame,
+ inode_t *inode);
+int32_t
+afr_quorum_errno(afr_private_t *priv);
+
+gf_boolean_t
+afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
+ int32_t *op_errno);
void
-afr_local_cleanup (afr_local_t *local, xlator_t *this);
+afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret,
+ int32_t *op_errno);
-int
-afr_frame_return (call_frame_t *frame);
+void
+afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata);
+void
+afr_process_post_writev(call_frame_t *frame, xlator_t *this);
-uint64_t
-afr_is_split_brain (xlator_t *this, inode_t *inode);
+void
+afr_writev_unwind(call_frame_t *frame, xlator_t *this);
void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set);
+afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame);
+
+void
+afr_update_uninodelk(afr_local_t *local, afr_internal_lock_t *int_lock,
+ int32_t child_index);
+afr_fd_ctx_t *
+__afr_fd_ctx_get(fd_t *fd, xlator_t *this);
+
+gf_boolean_t
+afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1,
+ int event_gen2);
int
-afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, int32_t wbflags);
+afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
+ char *buf, const char *default_str,
+ int32_t *serz_len, char delimiter);
+gf_boolean_t
+afr_is_symmetric_error(call_frame_t *frame, xlator_t *this);
-void
-afr_set_opendir_done (xlator_t *this, inode_t *inode);
+int
+__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx);
uint64_t
-afr_is_opendir_done (xlator_t *this, inode_t *inode);
+afr_write_subvol_get(call_frame_t *frame, xlator_t *this);
-void
-afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
-
-int
-afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
-
-int
-afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd);
-
-#define AFR_STACK_UNWIND(fop, frame, params ...) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- if (frame) { \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- } \
- STACK_UNWIND_STRICT (fop, frame, params); \
- afr_local_cleanup (__local, __this); \
- GF_FREE (__local); \
- } while (0);
-
-#define AFR_STACK_DESTROY(frame) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- STACK_DESTROY (frame->root); \
- afr_local_cleanup (__local, __this); \
- GF_FREE (__local); \
- } while (0);
+int
+afr_write_subvol_set(call_frame_t *frame, xlator_t *this);
-/* allocate and return a string that is the basename of argument */
-static inline char *
-AFR_BASENAME (const char *str)
-{
- char *__tmp_str = NULL;
- char *__basename_str = NULL;
- __tmp_str = gf_strdup (str);
- __basename_str = gf_strdup (basename (__tmp_str));
- GF_FREE (__tmp_str);
- return __basename_str;
-}
+int
+afr_write_subvol_reset(call_frame_t *frame, xlator_t *this);
-/* initialize local_t */
-static inline int
-AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
-{
- local->child_up = GF_CALLOC (sizeof (*local->child_up),
- priv->child_count,
- gf_afr_mt_char);
- if (!local->child_up) {
- return -ENOMEM;
- }
+int
+afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode);
+
+int
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop);
- memcpy (local->child_up, priv->child_up,
- sizeof (*local->child_up) * priv->child_count);
+int
+afr_ta_post_op_lock(xlator_t *this, loc_t *loc);
+int
+afr_ta_post_op_unlock(xlator_t *this, loc_t *loc);
- local->call_count = afr_up_children_count (priv->child_count, local->child_up);
- if (local->call_count == 0)
- return -ENOTCONN;
+gf_boolean_t
+afr_is_pending_set(xlator_t *this, dict_t *xdata, int type);
- local->transaction.erase_pending = 1;
+int
+__afr_get_up_children_count(afr_private_t *priv);
- local->op_ret = -1;
- local->op_errno = EUCLEAN;
+call_frame_t *
+afr_ta_frame_create(xlator_t *this);
- local->internal_lock.lock_op_ret = -1;
- local->internal_lock.lock_op_errno = EUCLEAN;
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local);
+void
+afr_ta_lock_release_synctask(xlator_t *this);
- return 0;
-}
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv);
+gf_boolean_t
+afr_lookup_has_quorum(call_frame_t *frame,
+ const unsigned int up_children_count);
-/**
- * first_up_child - return the index of the first child that is up
- */
+void
+afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this);
-static inline int
-afr_first_up_child (afr_private_t *priv)
-{
- xlator_t ** children = NULL;
- int ret = -1;
- int i = 0;
-
- LOCK (&priv->lock);
- {
- children = priv->children;
- for (i = 0; i < priv->child_count; i++) {
- if (priv->child_up[i]) {
- ret = i;
- break;
- }
- }
- }
- UNLOCK (&priv->lock);
-
- return ret;
-}
+void
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this);
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv,
+ int child);
-static inline int
-afr_transaction_local_init (afr_local_t *local, afr_private_t *priv)
-{
- int i;
-
- local->first_up_child = afr_first_up_child (priv);
-
- local->child_errno = GF_CALLOC (sizeof (*local->child_errno),
- priv->child_count,
- gf_afr_mt_int32_t);
- if (!local->child_errno) {
- return -ENOMEM;
- }
-
- local->pending = GF_CALLOC (sizeof (*local->pending),
- priv->child_count,
- gf_afr_mt_int32_t);
-
- if (!local->pending) {
- return -ENOMEM;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]),
- 3, /* data + metadata + entry */
- gf_afr_mt_int32_t);
- if (!local->pending[i])
- return -ENOMEM;
- }
-
- local->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- local->internal_lock.entry_locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.entry_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- local->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*local->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- local->internal_lock.lower_locked_nodes
- = GF_CALLOC (sizeof (*local->internal_lock.lower_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- local->transaction.child_errno = GF_CALLOC (sizeof (*local->transaction.child_errno),
- priv->child_count,
- gf_afr_mt_int32_t);
-
- local->internal_lock.transaction_lk_type = AFR_TRANSACTION_LK;
-
- return 0;
-}
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv);
+
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata);
+
+void
+afr_dom_lock_release(call_frame_t *frame);
+
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies);
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid);
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c
deleted file mode 100644
index a162d3fa164..00000000000
--- a/xlators/cluster/afr/src/pump.c
+++ /dev/null
@@ -1,2528 +0,0 @@
-/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <unistd.h>
-#include <sys/time.h>
-#include <stdlib.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "afr-common.c"
-#include "defaults.c"
-
-static int
-pump_mark_start_pending (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- pump_priv->pump_start_pending = 1;
-
- return 0;
-}
-
-static int
-is_pump_start_pending (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- return (pump_priv->pump_start_pending);
-}
-
-static int
-pump_remove_start_pending (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- pump_priv->pump_start_pending = 0;
-
- return 0;
-}
-
-static pump_state_t
-pump_get_state ()
-{
- xlator_t *this = NULL;
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- pump_state_t ret;
-
- this = THIS;
- priv = this->private;
- pump_priv = priv->pump_private;
-
- LOCK (&pump_priv->pump_state_lock);
- {
- ret = pump_priv->pump_state;
- }
- UNLOCK (&pump_priv->pump_state_lock);
-
- return ret;
-}
-
-int
-pump_change_state (xlator_t *this, pump_state_t state)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- pump_state_t state_old;
- pump_state_t state_new;
-
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (pump_priv);
-
- LOCK (&pump_priv->pump_state_lock);
- {
- state_old = pump_priv->pump_state;
- state_new = state;
-
- pump_priv->pump_state = state;
-
- }
- UNLOCK (&pump_priv->pump_state_lock);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Pump changing state from %d to %d",
- state_old,
- state_new);
-
- return 0;
-}
-
-static int
-pump_set_resume_path (xlator_t *this, const char *path)
-{
- int ret = 0;
-
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (pump_priv);
-
- LOCK (&pump_priv->resume_path_lock);
- {
- pump_priv->resume_path = strdup (path);
- if (!pump_priv->resume_path)
- ret = -1;
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- return ret;
-}
-
-static void
-build_child_loc (loc_t *parent, loc_t *child, char *path, char *name)
-{
- child->path = path;
- child->name = name;
-
- child->parent = inode_ref (parent->inode);
- child->inode = inode_new (parent->inode->table);
-}
-
-static char *
-build_file_path (loc_t *loc, gf_dirent_t *entry)
-{
- xlator_t *this = NULL;
- char *file_path = NULL;
- int pathlen = 0;
- int total_size = 0;
-
- this = THIS;
-
- pathlen = STRLEN_0 (loc->path);
-
- if (IS_ROOT_PATH (loc->path)) {
- total_size = pathlen + entry->d_len;
- file_path = GF_CALLOC (1, total_size, gf_afr_mt_char);
- if (!file_path) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- return NULL;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "constructing file path of size=%d"
- "pathlen=%d, d_len=%d",
- total_size, pathlen,
- entry->d_len);
-
- snprintf(file_path, total_size, "%s%s", loc->path, entry->d_name);
-
- } else {
- total_size = pathlen + entry->d_len + 1; /* for the extra '/' in the path */
- file_path = GF_CALLOC (1, total_size + 1, gf_afr_mt_char);
- if (!file_path) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- return NULL;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "constructing file path of size=%d"
- "pathlen=%d, d_len=%d",
- total_size, pathlen,
- entry->d_len);
-
- snprintf(file_path, total_size, "%s/%s", loc->path, entry->d_name);
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "path=%s and d_name=%s", loc->path, entry->d_name);
- gf_log (this->name, GF_LOG_TRACE,
- "constructed file_path=%s of size=%d", file_path, total_size);
-
- return file_path;
-}
-
-static int
-pump_save_path (xlator_t *this, const char *path)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
- pump_state_t state;
- dict_t *dict = NULL;
- loc_t loc;
- int dict_ret = 0;
- int ret = -1;
-
- state = pump_get_state ();
- if (state == PUMP_STATE_RESUME)
- return 0;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (priv->root_inode);
-
- build_root_loc (priv->root_inode, &loc);
-
- dict = dict_new ();
- dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path);
-
- ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setxattr failed - could not save path=%s", path);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "setxattr succeeded - saved path=%s", path);
- gf_log (this->name, GF_LOG_DEBUG,
- "Saving path for status info");
- }
-
- dict_unref (dict);
-
- return 0;
-}
-
-static int
-pump_check_and_update_status (xlator_t *this)
-{
- pump_state_t state;
- int ret = -1;
-
- state = pump_get_state ();
-
- switch (state) {
-
- case PUMP_STATE_RESUME:
- case PUMP_STATE_RUNNING:
- {
- ret = 0;
- break;
- }
- case PUMP_STATE_PAUSE:
- {
- ret = -1;
- break;
- }
- case PUMP_STATE_ABORT:
- {
- pump_save_path (this, "/");
- ret = -1;
- break;
- }
- default:
- {
- gf_log (this->name, GF_LOG_DEBUG,
- "Unknown pump state");
- ret = -1;
- break;
- }
-
- }
-
- return ret;
-}
-
-static const char *
-pump_get_resume_path (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- const char *resume_path = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- resume_path = pump_priv->resume_path;
-
- return resume_path;
-}
-
-static int
-pump_update_resume_state (xlator_t *this, const char *path)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- pump_state_t state;
- const char *resume_path = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- state = pump_get_state ();
-
- if (state == PUMP_STATE_RESUME) {
- resume_path = pump_get_resume_path (this);
- if (strcmp (resume_path, "/") == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reached the resume path (/). Proceeding to change state"
- " to running");
- pump_change_state (this, PUMP_STATE_RUNNING);
- } else if (strcmp (resume_path, path) == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reached the resume path. Proceeding to change state"
- " to running");
- pump_change_state (this, PUMP_STATE_RUNNING);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not yet hit the resume path:res-path=%s,path=%s",
- resume_path, path);
- }
- }
-
- return 0;
-}
-
-static gf_boolean_t
-is_pump_traversal_allowed (xlator_t *this, const char *path)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- pump_state_t state;
- const char *resume_path = NULL;
- gf_boolean_t ret = _gf_true;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- state = pump_get_state ();
-
- if (state == PUMP_STATE_RESUME) {
- resume_path = pump_get_resume_path (this);
- if (strstr (resume_path, path)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "On the right path to resumption path");
- ret = _gf_true;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not the right path to resuming=> ignoring traverse");
- ret = _gf_false;
- }
- }
-
- return ret;
-}
-
-static int
-pump_save_file_stats (xlator_t *this, const char *path)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- LOCK (&pump_priv->resume_path_lock);
- {
- pump_priv->number_files_pumped++;
-
- strncpy (pump_priv->current_file, path,
- PATH_MAX);
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- return 0;
-}
-
-static int
-gf_pump_traverse_directory (loc_t *loc)
-{
- xlator_t *this = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
-
- off_t offset = 0;
- loc_t entry_loc;
- gf_dirent_t *entry = NULL;
- gf_dirent_t *tmp = NULL;
- gf_dirent_t entries;
-
- struct iatt iatt, parent;
- dict_t *xattr_rsp;
-
- int source = 0;
-
- char *file_path = NULL;
- int ret = 0;
-
- INIT_LIST_HEAD (&entries.list);
- this = THIS;
- priv = this->private;
-
- GF_ASSERT (loc->inode);
-
- fd = fd_create (loc->inode, PUMP_PID);
- if (!fd) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to create fd for %s", loc->path);
- goto out;
- }
-
- ret = syncop_opendir (priv->children[source], loc, fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "opendir failed on %s", loc->path);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "pump opendir on %s returned=%d",
- loc->path, ret);
-
- while (syncop_readdirp (priv->children[source], fd, 131072, offset, &entries)) {
-
- if (list_empty (&entries.list)) {
- gf_log (this->name, GF_LOG_TRACE,
- "no more entries in directory");
- goto out;
- }
-
- list_for_each_entry_safe (entry, tmp, &entries.list, list) {
- gf_log (this->name, GF_LOG_DEBUG,
- "found readdir entry=%s", entry->d_name);
-
- file_path = build_file_path (loc, entry);
- if (!file_path) {
- gf_log (this->name, GF_LOG_DEBUG,
- "file path construction failed");
- goto out;
- }
-
- build_child_loc (loc, &entry_loc, file_path, entry->d_name);
-
- if (!IS_ENTRY_CWD (entry->d_name) &&
- !IS_ENTRY_PARENT (entry->d_name)) {
-
- ret = syncop_lookup (this, &entry_loc, NULL,
- &iatt, &xattr_rsp, &parent);
-
- entry_loc.ino = iatt.ia_ino;
- entry_loc.inode->ino = iatt.ia_ino;
- memcpy (entry_loc.inode->gfid, iatt.ia_gfid, 16);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup %s => %"PRId64,
- entry_loc.path,
- iatt.ia_ino);
-
- ret = syncop_lookup (this, &entry_loc, NULL,
- &iatt, &xattr_rsp, &parent);
-
-
- gf_log (this->name, GF_LOG_DEBUG,
- "second lookup ret=%d: %s => %"PRId64,
- ret,
- entry_loc.path,
- iatt.ia_ino);
-
- pump_update_resume_state (this, entry_loc.path);
-
- pump_save_path (this, entry_loc.path);
- pump_save_file_stats (this, entry_loc.path);
-
- ret = pump_check_and_update_status (this);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Pump beginning to exit out");
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "type of file=%d, IFDIR=%d",
- iatt.ia_type, IA_IFDIR);
-
- if (IA_ISDIR (iatt.ia_type)) {
- if (is_pump_traversal_allowed (this, entry_loc.path)) {
- gf_log (this->name, GF_LOG_TRACE,
- "entering dir=%s",
- entry->d_name);
- gf_pump_traverse_directory (&entry_loc);
- }
- }
- }
- offset = entry->d_off;
- loc_wipe (&entry_loc);
- }
-
- gf_dirent_free (&entries);
- gf_log (this->name, GF_LOG_TRACE,
- "offset incremented to %d",
- (int32_t ) offset);
-
- }
-
-out:
- return 0;
-
-}
-
-void
-build_root_loc (inode_t *inode, loc_t *loc)
-{
- loc->path = "/";
- loc->name = "";
- loc->inode = inode;
- loc->ino = 1;
- loc->inode->ino = 1;
- memset (loc->inode->gfid, 0, 16);
- loc->inode->gfid[15] = 1;
-
-}
-
-static int
-pump_update_resume_path (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- const char *resume_path = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- resume_path = pump_get_resume_path (this);
-
- if (resume_path) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Found a path to resume from: %s",
- resume_path);
-
- }else {
- gf_log (this->name, GF_LOG_DEBUG,
- "Did not find a path=> setting to '/'");
- pump_set_resume_path (this, "/");
- }
-
- pump_change_state (this, PUMP_STATE_RESUME);
-
- return 0;
-}
-
-static int
-pump_complete_migration (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
- dict_t *dict = NULL;
- pump_state_t state;
- loc_t loc;
- int dict_ret = 0;
- int ret = -1;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (priv->root_inode);
-
- build_root_loc (priv->root_inode, &loc);
-
- dict = dict_new ();
-
- state = pump_get_state ();
- if (state == PUMP_STATE_RUNNING) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Pump finished pumping");
-
- pump_priv->pump_finished = _gf_true;
-
- dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon");
-
- ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setxattr failed - while notifying source complete");
- }
- dict_ret = dict_set_str (dict, PUMP_SINK_COMPLETE, "jargon");
-
- ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setxattr failed - while notifying sink complete");
- }
- }
-
- return 0;
-}
-
-static int
-pump_set_root_gfid (dict_t *dict)
-{
- uuid_t gfid;
- int ret = 0;
-
- memset (gfid, 0, 16);
- gfid[15] = 1;
-
- ret = afr_set_dict_gfid (dict, gfid);
-
- return ret;
-}
-
-static int
-pump_lookup_sink (loc_t *loc)
-{
- xlator_t *this = NULL;
- struct iatt iatt, parent;
- dict_t *xattr_rsp;
- dict_t *xattr_req = NULL;
- int ret = 0;
-
- this = THIS;
-
- xattr_req = dict_new ();
-
- ret = pump_set_root_gfid (xattr_req);
- if (ret)
- goto out;
-
- ret = syncop_lookup (PUMP_SINK_CHILD (this), loc,
- xattr_req, &iatt, &xattr_rsp, &parent);
-
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Lookup on sink child failed");
- goto out;
- }
-
-out:
- if (xattr_req)
- dict_unref (xattr_req);
-
- return ret;
-}
-
-static int
-pump_task (void *data)
-{
- xlator_t *this = NULL;
- afr_private_t *priv = NULL;
-
-
- loc_t loc;
- struct iatt iatt, parent;
- dict_t *xattr_rsp = NULL;
- dict_t *xattr_req = NULL;
-
- int ret = -1;
-
- this = THIS;
- priv = this->private;
-
- GF_ASSERT (priv->root_inode);
-
- build_root_loc (priv->root_inode, &loc);
- xattr_req = dict_new ();
- if (!xattr_req) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- pump_set_root_gfid (xattr_req);
- ret = syncop_lookup (this, &loc, xattr_req,
- &iatt, &xattr_rsp, &parent);
-
- gf_log (this->name, GF_LOG_TRACE,
- "lookup: ino=%"PRId64", path=%s",
- loc.ino,
- loc.path);
-
- ret = pump_check_and_update_status (this);
- if (ret < 0) {
- goto out;
- }
-
- pump_update_resume_path (this);
-
- pump_set_root_gfid (xattr_req);
- ret = pump_lookup_sink (&loc);
- if (ret) {
- pump_update_resume_path (this);
- goto out;
- }
-
- gf_pump_traverse_directory (&loc);
-
- pump_complete_migration (this);
-out:
- if (xattr_req)
- dict_unref (xattr_req);
-
- return 0;
-}
-
-
-static int
-pump_task_completion (int ret, void *data)
-{
- xlator_t *this = NULL;
- call_frame_t *frame = NULL;
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- this = THIS;
-
- frame = (call_frame_t *) data;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- inode_unref (priv->root_inode);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Pump xlator exiting");
- return 0;
-}
-
-int
-pump_start (call_frame_t *pump_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- int ret = -1;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- if (!pump_frame->root->lk_owner)
- pump_frame->root->lk_owner = PUMP_LK_OWNER;
-
- ret = synctask_new (pump_priv->env, pump_task,
- pump_task_completion,
- pump_frame);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "starting pump failed");
- pump_change_state (this, PUMP_STATE_ABORT);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "setting pump as started");
-
- priv->use_afr_in_pump = 1;
-out:
- return ret;
-}
-
-static int
-pump_start_synctask (xlator_t *this)
-{
- call_frame_t *frame = NULL;
- int ret = 0;
-
- frame = create_frame (this, this->ctx->pool);
- if (!frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- pump_change_state (this, PUMP_STATE_RUNNING);
-
- ret = pump_start (frame, this);
-
-out:
- return ret;
-}
-
-int32_t
-pump_cmd_start_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-
-{
- afr_local_t *local = NULL;
- int ret = 0;
-
- local = frame->local;
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not initiate destination "
- "brick connect");
- ret = op_ret;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Successfully initiated destination "
- "brick connect");
-
- pump_mark_start_pending (this);
-
-out:
- local->op_ret = ret;
- pump_command_reply (frame, this);
-
- return 0;
-}
-
-static int
-pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t *dict = NULL;
- char *dst_brick = NULL;
- loc_t loc;
-
- int ret = 0;
-
- priv = this->private;
- local = frame->local;
-
- GF_ASSERT (priv->root_inode);
-
- build_root_loc (priv->root_inode, &loc);
-
- ret = dict_get_str (local->dict, PUMP_CMD_START, &dst_brick);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not get destination brick value");
- goto out;
- }
-
- dict = dict_new ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- GF_ASSERT (dst_brick);
- gf_log (this->name, GF_LOG_DEBUG,
- "Got destination brick as %s", dst_brick);
-
- ret = dict_set_str (dict, CLIENT_CMD_CONNECT, dst_brick);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not inititiate destination brick "
- "connect");
- goto out;
- }
-
- STACK_WIND (frame,
- pump_cmd_start_setxattr_cbk,
- PUMP_SINK_CHILD(this),
- PUMP_SINK_CHILD(this)->fops->setxattr,
- &loc,
- dict,
- 0);
-
- ret = 0;
-
- dict_unref (dict);
-out:
- return ret;
-}
-
-static int
-is_pump_aborted (xlator_t *this)
-{
- pump_state_t state;
-
- state = pump_get_state ();
-
- return ((state == PUMP_STATE_ABORT));
-}
-
-int32_t
-pump_cmd_start_getxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *dict)
-{
- afr_local_t *local = NULL;
- char *path = NULL;
-
- pump_state_t state;
- int ret = 0;
- int need_unwind = 0;
- int dict_ret = -1;
-
- local = frame->local;
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "getxattr failed - changing pump "
- "state to RUNNING with '/'");
- path = "/";
- ret = op_ret;
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "getxattr succeeded");
-
- dict_ret = dict_get_str (dict, PUMP_PATH, &path);
- if (dict_ret < 0)
- path = "/";
- }
-
- state = pump_get_state ();
- if ((state == PUMP_STATE_RUNNING) ||
- (state == PUMP_STATE_RESUME)) {
- gf_log (this->name, GF_LOG_ERROR,
- "Pump is already started");
- ret = -1;
- goto out;
- }
-
- pump_set_resume_path (this, path);
-
- if (is_pump_aborted (this))
- /* We're re-starting pump afresh */
- ret = pump_initiate_sink_connect (frame, this);
- else {
- /* We're re-starting pump from a previous
- pause */
- gf_log (this->name, GF_LOG_DEBUG,
- "about to start synctask");
- ret = pump_start_synctask (this);
- need_unwind = 1;
- }
-
-out:
- if ((ret < 0) || (need_unwind == 1)) {
- local->op_ret = ret;
- pump_command_reply (frame, this);
- }
- return 0;
-}
-
-int
-pump_execute_status (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- uint64_t number_files = 0;
-
- char filename[PATH_MAX];
- char *dict_str = NULL;
-
- int32_t op_ret = 0;
- int32_t op_errno = 0;
-
- dict_t *dict = NULL;
- int ret = -1;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- LOCK (&pump_priv->resume_path_lock);
- {
- number_files = pump_priv->number_files_pumped;
- strncpy (filename, pump_priv->current_file, PATH_MAX);
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- dict_str = GF_CALLOC (1, PATH_MAX + 256, gf_afr_mt_char);
- if (!dict_str) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- if (pump_priv->pump_finished) {
- snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Migration complete ",
- number_files);
- } else {
- snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Current file= %s ",
- number_files, filename);
- }
-
- dict = dict_new ();
-
- ret = dict_set_str (dict, PUMP_CMD_STATUS, dict_str);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "dict_set_str returned negative value");
- }
-
- op_ret = 0;
-
-out:
-
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict);
-
- dict_unref (dict);
- GF_FREE (dict_str);
-
- return 0;
-}
-
-int
-pump_execute_pause (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- pump_change_state (this, PUMP_STATE_PAUSE);
-
- local->op_ret = 0;
- pump_command_reply (frame, this);
-
- return 0;
-}
-
-int
-pump_execute_start (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = 0;
- loc_t loc;
-
- priv = this->private;
- local = frame->local;
-
- if (!priv->root_inode) {
- gf_log (this->name, GF_LOG_ERROR,
- "Pump xlator cannot be started without an initial "
- "lookup");
- ret = -1;
- goto out;
- }
-
- GF_ASSERT (priv->root_inode);
-
- build_root_loc (priv->root_inode, &loc);
-
- STACK_WIND (frame,
- pump_cmd_start_getxattr_cbk,
- PUMP_SOURCE_CHILD(this),
- PUMP_SOURCE_CHILD(this)->fops->getxattr,
- &loc,
- PUMP_PATH);
-
- ret = 0;
-
-out:
- if (ret < 0) {
- local->op_ret = ret;
- pump_command_reply (frame, this);
- }
-
- return 0;
-}
-
-int
-pump_execute_abort (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
- local = frame->local;
-
- pump_change_state (this, PUMP_STATE_ABORT);
-
- LOCK (&pump_priv->resume_path_lock);
- {
- pump_priv->number_files_pumped = 0;
- pump_priv->current_file[0] = '\0';
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- local->op_ret = 0;
- pump_command_reply (frame, this);
-
- return 0;
-}
-
-gf_boolean_t
-pump_command_status (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, PUMP_CMD_STATUS, &cmd);
- if (dict_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not a pump status command");
- ret = _gf_false;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Hit a pump command - status");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-gf_boolean_t
-pump_command_pause (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, PUMP_CMD_PAUSE, &cmd);
- if (dict_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not a pump pause command");
- ret = _gf_false;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Hit a pump command - pause");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-gf_boolean_t
-pump_command_abort (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, PUMP_CMD_ABORT, &cmd);
- if (dict_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not a pump abort command");
- ret = _gf_false;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Hit a pump command - abort");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-gf_boolean_t
-pump_command_start (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, PUMP_CMD_START, &cmd);
- if (dict_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Not a pump start command");
- ret = _gf_false;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Hit a pump command - start");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-struct _xattr_key {
- char *key;
- struct list_head list;
-};
-
-static void
-__gather_xattr_keys (dict_t *dict, char *key, data_t *value,
- void *data)
-{
- struct list_head * list = data;
- struct _xattr_key * xkey = NULL;
-
- if (!strncmp (key, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
-
- xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key);
- if (!xkey)
- return;
-
- xkey->key = key;
- INIT_LIST_HEAD (&xkey->list);
-
- list_add_tail (&xkey->list, list);
- }
-}
-
-static void
-__filter_xattrs (dict_t *dict)
-{
- struct list_head keys;
-
- struct _xattr_key *key;
- struct _xattr_key *tmp;
-
- INIT_LIST_HEAD (&keys);
-
- dict_foreach (dict, __gather_xattr_keys,
- (void *) &keys);
-
- list_for_each_entry_safe (key, tmp, &keys, list) {
- dict_del (dict, key->key);
-
- list_del_init (&key->list);
-
- GF_FREE (key);
- }
-}
-
-int32_t
-pump_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
-
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
-
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.getxattr.last_tried;
-
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.getxattr.last_tried;
-
- if (this_try == read_child) {
- goto retry;
- }
-
- unwind = 0;
- STACK_WIND_COOKIE (frame, pump_getxattr_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->getxattr,
- &local->loc,
- local->cont.getxattr.name);
- }
-
-out:
- if (unwind) {
- if (op_ret >= 0 && dict)
- __filter_xattrs (dict);
-
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict);
- }
-
- return 0;
-}
-
-int32_t
-pump_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
-{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t * local = NULL;
-
- int read_child = -1;
-
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- frame->local = local;
-
- if (name) {
- if (!strncmp (name, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
-
- op_errno = ENODATA;
- goto out;
- }
-
- if (!strcmp (name, PUMP_CMD_STATUS)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Hit pump command - status");
- pump_execute_status (frame, this);
- op_ret = 0;
- goto out;
- }
- }
-
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_getxattr_cbk,
- FIRST_CHILD (this),
- (FIRST_CHILD (this))->fops->getxattr,
- loc, name);
- return 0;
- }
-
- read_child = afr_read_child (this, loc->inode);
-
- if (read_child >= 0) {
- call_child = read_child;
-
- local->cont.getxattr.last_tried = -1;
- } else {
- call_child = afr_first_up_child (priv);
-
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_DEBUG,
- "no child is up");
- goto out;
- }
-
- local->cont.getxattr.last_tried = call_child;
- }
-
- loc_copy (&local->loc, loc);
- if (name)
- local->cont.getxattr.name = gf_strdup (name);
-
- STACK_WIND_COOKIE (frame, pump_getxattr_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->getxattr,
- loc, name);
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL);
- }
- return 0;
-}
-
-static int
-afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (setxattr, main_frame,
- local->op_ret, local->op_errno)
- }
- return 0;
-}
-
-static int
-afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
-}
-
-static int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc,
- local->cont.setxattr.dict,
- local->cont.setxattr.flags);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-static int
-afr_setxattr_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int32_t
-pump_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame,
- op_ret,
- op_errno);
- return 0;
-}
-
-int
-pump_command_reply (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- if (local->op_ret < 0)
- gf_log (this->name, GF_LOG_NORMAL,
- "Command failed");
- else
- gf_log (this->name, GF_LOG_NORMAL,
- "Command succeeded");
-
- dict_unref (local->dict);
-
- AFR_STACK_UNWIND (setxattr,
- frame,
- local->op_ret,
- local->op_errno);
-
- return 0;
-}
-
-int
-pump_parse_command (call_frame_t *frame, xlator_t *this,
- afr_local_t *local, dict_t *dict)
-{
-
- int ret = -1;
-
- if (pump_command_start (this, dict)) {
- frame->local = local;
- local->dict = dict_ref (dict);
- ret = pump_execute_start (frame, this);
-
- } else if (pump_command_pause (this, dict)) {
- frame->local = local;
- local->dict = dict_ref (dict);
- ret = pump_execute_pause (frame, this);
-
- } else if (pump_command_abort (this, dict)) {
- frame->local = local;
- local->dict = dict_ref (dict);
- ret = pump_execute_abort (frame, this);
- }
- return ret;
-}
-
-int
-pump_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
-
- int ret = -1;
-
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- ret = pump_parse_command (frame, this,
- local, dict);
- if (ret >= 0) {
- op_ret = 0;
- goto out;
- }
-
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_setxattr_cbk,
- FIRST_CHILD (this),
- (FIRST_CHILD (this))->fops->setxattr,
- loc, dict, flags);
- return 0;
- }
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
- transaction_frame->local = local;
-
- local->op_ret = -1;
-
- local->cont.setxattr.dict = dict_ref (dict);
- local->cont.setxattr.flags = flags;
-
- local->transaction.fop = afr_setxattr_wind;
- local->transaction.done = afr_setxattr_done;
- local->transaction.unwind = afr_setxattr_unwind;
-
- loc_copy (&local->loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
-
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno);
- }
-
- return 0;
-}
-
-/* Defaults */
-static int32_t
-pump_lookup (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *xattr_req)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_lookup_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lookup,
- loc,
- xattr_req);
- return 0;
- }
-
- afr_lookup (frame, this, loc, xattr_req);
- return 0;
-}
-
-
-static int32_t
-pump_truncate (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- off_t offset)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- loc,
- offset);
- return 0;
- }
-
- afr_truncate (frame, this, loc, offset);
- return 0;
-}
-
-
-static int32_t
-pump_ftruncate (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- off_t offset)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_ftruncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate,
- fd,
- offset);
- return 0;
- }
-
- afr_ftruncate (frame, this, fd, offset);
- return 0;
-}
-
-
-
-
-int
-pump_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t rdev, dict_t *parms)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_mknod_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod,
- loc, mode, rdev, parms);
- return 0;
- }
- afr_mknod (frame, this, loc, mode, rdev, parms);
- return 0;
-
-}
-
-
-
-int
-pump_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dict_t *params)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_mkdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mkdir,
- loc, mode, params);
- return 0;
- }
- afr_mkdir (frame, this, loc, mode, params);
- return 0;
-
-}
-
-
-static int32_t
-pump_unlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_unlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink,
- loc);
- return 0;
- }
- afr_unlink (frame, this, loc);
- return 0;
-
-}
-
-
-static int
-pump_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags)
-{
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_rmdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rmdir,
- loc, flags);
- return 0;
- }
-
- afr_rmdir (frame, this, loc, flags);
- return 0;
-
-}
-
-
-
-int
-pump_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *loc, dict_t *params)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_symlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->symlink,
- linkpath, loc, params);
- return 0;
- }
- afr_symlink (frame, this, linkpath, loc, params);
- return 0;
-
-}
-
-
-static int32_t
-pump_rename (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_rename_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename,
- oldloc, newloc);
- return 0;
- }
- afr_rename (frame, this, oldloc, newloc);
- return 0;
-
-}
-
-
-static int32_t
-pump_link (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_link_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->link,
- oldloc, newloc);
- return 0;
- }
- afr_link (frame, this, oldloc, newloc);
- return 0;
-
-}
-
-
-static int32_t
-pump_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd, dict_t *params)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_create_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create,
- loc, flags, mode, fd, params);
- return 0;
- }
- afr_create (frame, this, loc, flags, mode, fd, params);
- return 0;
-
-}
-
-
-static int32_t
-pump_open (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags, fd_t *fd,
- int32_t wbflags)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_open_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open,
- loc, flags, fd, wbflags);
- return 0;
- }
- afr_open (frame, this, loc, flags, fd, wbflags);
- return 0;
-
-}
-
-
-static int32_t
-pump_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t off,
- struct iobref *iobref)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_writev_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev,
- fd,
- vector,
- count,
- off,
- iobref);
- return 0;
- }
- afr_writev (frame, this, fd, vector, count, off, iobref);
- return 0;
-
-}
-
-
-static int32_t
-pump_flush (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_flush_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd);
- return 0;
- }
- afr_flush (frame, this, fd);
- return 0;
-
-}
-
-
-static int32_t
-pump_fsync (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fsync_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsync,
- fd,
- flags);
- return 0;
- }
- afr_fsync (frame, this, fd, flags);
- return 0;
-
-}
-
-
-static int32_t
-pump_opendir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc, fd_t *fd)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_opendir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->opendir,
- loc, fd);
- return 0;
- }
- afr_opendir (frame, this, loc, fd);
- return 0;
-
-}
-
-
-static int32_t
-pump_fsyncdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fsyncdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsyncdir,
- fd,
- flags);
- return 0;
- }
- afr_fsyncdir (frame, this, fd, flags);
- return 0;
-
-}
-
-
-static int32_t
-pump_xattrop (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- gf_xattrop_flags_t flags,
- dict_t *dict)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_xattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->xattrop,
- loc,
- flags,
- dict);
- return 0;
- }
- afr_xattrop (frame, this, loc, flags, dict);
- return 0;
-
-}
-
-static int32_t
-pump_fxattrop (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- gf_xattrop_flags_t flags,
- dict_t *dict)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fxattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fxattrop,
- fd,
- flags,
- dict);
- return 0;
- }
- afr_fxattrop (frame, this, fd, flags, dict);
- return 0;
-
-}
-
-
-static int32_t
-pump_removexattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_removexattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr,
- loc,
- name);
- return 0;
- }
- afr_removexattr (frame, this, loc, name);
- return 0;
-
-}
-
-
-
-static int32_t
-pump_readdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t off)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_readdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdir,
- fd, size, off);
- return 0;
- }
- afr_readdir (frame, this, fd, size, off);
- return 0;
-
-}
-
-
-static int32_t
-pump_readdirp (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t off)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_readdirp_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdirp,
- fd, size, off);
- return 0;
- }
- afr_readdirp (frame, this, fd, size, off);
- return 0;
-
-}
-
-
-
-static int32_t
-pump_releasedir (xlator_t *this,
- fd_t *fd)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (priv->use_afr_in_pump)
- afr_releasedir (this, fd);
- return 0;
-
-}
-
-static int32_t
-pump_release (xlator_t *this,
- fd_t *fd)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (priv->use_afr_in_pump)
- afr_release (this, fd);
- return 0;
-
-}
-
-
-static int32_t
-pump_setattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- struct iatt *stbuf,
- int32_t valid)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_setattr_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->setattr,
- loc, stbuf, valid);
- return 0;
- }
- afr_setattr (frame, this, loc, stbuf, valid);
- return 0;
-
-}
-
-
-static int32_t
-pump_fsetattr (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iatt *stbuf,
- int32_t valid)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fsetattr_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetattr,
- fd, stbuf, valid);
- return 0;
- }
- afr_fsetattr (frame, this, fd, stbuf, valid);
- return 0;
-
-}
-
-
-/* End of defaults */
-
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1);
-
- if (ret != 0) {
- gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-
- return ret;
-}
-
-static int
-is_xlator_pump_sink (xlator_t *child)
-{
- return (child == PUMP_SINK_CHILD(THIS));
-}
-
-static int
-is_xlator_pump_source (xlator_t *child)
-{
- return (child == PUMP_SOURCE_CHILD(THIS));
-}
-
-int32_t
-notify (xlator_t *this, int32_t event,
- void *data, ...)
-{
- int ret = -1;
- xlator_t *child_xl = NULL;
-
- child_xl = (xlator_t *) data;
-
- ret = afr_notify (this, event, data);
-
- switch (event) {
- case GF_EVENT_CHILD_DOWN:
- if (is_xlator_pump_source (child_xl))
- pump_change_state (this, PUMP_STATE_ABORT);
- break;
-
- case GF_EVENT_CHILD_UP:
- if (is_xlator_pump_sink (child_xl))
- if (is_pump_start_pending (this)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "about to start synctask");
- ret = pump_start_synctask (this);
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "Could not start pump "
- "synctask");
- else
- pump_remove_start_pending (this);
- }
- }
-
- return ret;
-}
-
-int32_t
-init (xlator_t *this)
-{
- afr_private_t * priv = NULL;
- pump_private_t *pump_priv = NULL;
- int child_count = 0;
- xlator_list_t * trav = NULL;
- int i = 0;
- int ret = -1;
- int op_errno = 0;
-
- int source_child = 0;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "pump translator needs a source and sink"
- "subvolumes defined.");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "Volume is dangling.");
- }
-
- ALLOC_OR_GOTO (this->private, afr_private_t, out);
-
- priv = this->private;
-
- priv->read_child = source_child;
- priv->favorite_child = source_child;
- priv->background_self_heal_count = 0;
-
- priv->data_self_heal = 1;
- priv->metadata_self_heal = 1;
- priv->entry_self_heal = 1;
-
- priv->data_self_heal_algorithm = "";
-
- priv->data_self_heal_window_size = 16;
-
- priv->data_change_log = 1;
- priv->metadata_change_log = 1;
- priv->entry_change_log = 1;
- priv->use_afr_in_pump = 1;
-
- /* Locking options */
-
- /* Lock server count infact does not matter. Locks are held
- on all subvolumes, in this case being the source
- and the sink.
- */
-
- priv->data_lock_server_count = 2;
- priv->metadata_lock_server_count = 2;
- priv->entry_lock_server_count = 2;
-
- priv->strict_readdir = _gf_false;
-
- trav = this->children;
- while (trav) {
- child_count++;
- trav = trav->next;
- }
-
- priv->wait_count = 1;
-
- if (child_count != 2) {
- gf_log (this->name, GF_LOG_ERROR,
- "There should be exactly 2 children - one source "
- "and one sink");
- return -1;
- }
- priv->child_count = child_count;
-
- LOCK_INIT (&priv->lock);
- LOCK_INIT (&priv->read_child_lock);
-
- priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
- gf_afr_mt_char);
- if (!priv->child_up) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- op_errno = ENOMEM;
- goto out;
- }
-
- priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
- gf_afr_mt_xlator_t);
- if (!priv->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- op_errno = ENOMEM;
- goto out;
- }
-
- priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key),
- child_count,
- gf_afr_mt_char);
- if (!priv->pending_key) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- op_errno = ENOMEM;
- goto out;
- }
-
- trav = this->children;
- i = 0;
- while (i < child_count) {
- priv->children[i] = trav->xlator;
-
- ret = asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
- trav->xlator->name);
- if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed to set pending key");
- op_errno = ENOMEM;
- goto out;
- }
-
- trav = trav->next;
- i++;
- }
-
- priv->first_lookup = 1;
- priv->root_inode = NULL;
-
- pump_priv = GF_CALLOC (1, sizeof (*pump_priv),
- gf_afr_mt_pump_priv);
- if (!pump_priv) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto out;
- }
-
- LOCK_INIT (&pump_priv->resume_path_lock);
- LOCK_INIT (&pump_priv->pump_state_lock);
-
- pump_priv->resume_path = GF_CALLOC (1, PATH_MAX,
- gf_afr_mt_char);
- if (!pump_priv->resume_path) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- pump_priv->env = syncenv_new (0);
- if (!pump_priv->env) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not create new sync-environment");
- ret = -1;
- goto out;
- }
-
- priv->pump_private = pump_priv;
-
- pthread_mutex_init (&priv->mutex, NULL);
- INIT_LIST_HEAD (&priv->saved_fds);
-
- pump_change_state (this, PUMP_STATE_ABORT);
-
- ret = 0;
-out:
- return ret;
-}
-
-int
-fini (xlator_t *this)
-{
- return 0;
-}
-
-
-struct xlator_fops fops = {
- .lookup = pump_lookup,
- .open = pump_open,
- .flush = pump_flush,
- .fsync = pump_fsync,
- .fsyncdir = pump_fsyncdir,
- .xattrop = pump_xattrop,
- .fxattrop = pump_fxattrop,
- .getxattr = pump_getxattr,
-
- /* inode write */
- .writev = pump_writev,
- .truncate = pump_truncate,
- .ftruncate = pump_ftruncate,
- .setxattr = pump_setxattr,
- .setattr = pump_setattr,
- .fsetattr = pump_fsetattr,
- .removexattr = pump_removexattr,
-
- /* dir read */
- .opendir = pump_opendir,
- .readdir = pump_readdir,
- .readdirp = pump_readdirp,
-
- /* dir write */
- .create = pump_create,
- .mknod = pump_mknod,
- .mkdir = pump_mkdir,
- .unlink = pump_unlink,
- .rmdir = pump_rmdir,
- .link = pump_link,
- .symlink = pump_symlink,
- .rename = pump_rename,
-};
-
-struct xlator_dumpops dumpops = {
- .priv = afr_priv_dump,
-};
-
-
-struct xlator_cbks cbks = {
- .release = pump_release,
- .releasedir = pump_releasedir,
-};
-
-struct volume_options options[] = {
- { .key = {NULL} },
-};
diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h
deleted file mode 100644
index a46f9d7a542..00000000000
--- a/xlators/cluster/afr/src/pump.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __PUMP_H__
-#define __PUMP_H__
-
-#include "syncop.h"
-
-/* FIXME: Needs to be defined in a common file */
-#define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect"
-#define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect"
-
-#define PUMP_PID 696969
-#define PUMP_LK_OWNER 696969
-
-#define IS_ROOT_PATH(path) (!strcmp (path, "/"))
-#define IS_ENTRY_CWD(entry) (!strcmp (entry, "."))
-#define IS_ENTRY_PARENT(entry) (!strcmp (entry, ".."))
-
-#define PUMP_CMD_START "trusted.glusterfs.pump.start"
-#define PUMP_CMD_ABORT "trusted.glusterfs.pump.abort"
-#define PUMP_CMD_PAUSE "trusted.glusterfs.pump.pause"
-#define PUMP_CMD_STATUS "trusted.glusterfs.pump.status"
-
-#define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete"
-#define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete"
-
-#define PUMP_PATH "trusted.glusterfs.pump-path"
-
-#define PUMP_SOURCE_CHILD(xl) (xl->children->xlator)
-#define PUMP_SINK_CHILD(xl) (xl->children->next->xlator)
-
-typedef enum {
- PUMP_STATE_RUNNING, /* Pump is running and migrating files */
- PUMP_STATE_RESUME, /* Pump is resuming from a previous pause */
- PUMP_STATE_PAUSE, /* Pump is paused */
- PUMP_STATE_ABORT, /* Pump is aborted */
-} pump_state_t;
-
-typedef struct _pump_private {
- struct syncenv *env; /* The env pointer to the pump synctask */
- const char *resume_path; /* path to resume from the last pause */
- gf_lock_t resume_path_lock; /* Synchronize resume_path changes */
- gf_lock_t pump_state_lock; /* Synchronize pump_state changes */
- pump_state_t pump_state; /* State of pump */
- char current_file[PATH_MAX]; /* Current file being pumped */
- uint64_t number_files_pumped; /* Number of files pumped */
- gf_boolean_t pump_finished; /* Boolean to indicate pump termination */
- char pump_start_pending; /* Boolean to mark start pending until
- CHILD_UP */
-} pump_private_t;
-
-void
-build_root_loc (inode_t *inode, loc_t *loc);
-int pump_start (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_start (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_start (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_pause (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_pause (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_abort (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_abort (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_status (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_status (call_frame_t *frame, xlator_t *this);
-
-#endif /* __PUMP_H__ */
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index 4b69aa07100..56f1f2ad7c8 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -1,34 +1,48 @@
-
xlator_LTLIBRARIES = dht.la nufa.la switch.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+AM_CFLAGS = -Wall $(GF_CFLAGS)
-dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \
- dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-dht_la_SOURCES = $(dht_common_source) dht.c
+dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \
+ dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \
+ dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \
+ dht-lock.c $(top_builddir)/xlators/lib/src/libxlator.c
+
+dht_la_SOURCES = $(dht_common_source) dht.c
nufa_la_SOURCES = $(dht_common_source) nufa.c
switch_la_SOURCES = $(dht_common_source) switch.c
-dht_la_LDFLAGS = -module -avoidversion
+dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-nufa_la_LDFLAGS = -module -avoidversion
+nufa_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-switch_la_LDFLAGS = -module -avoidversion
+switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = dht-common.h dht-common.c dht-mem-types.h
+noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \
+ dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(top_srcdir)/xlators/lib/src \
+ -DDATADIR=\"$(localstatedir)\" \
+ -DLIBDIR=\"$(libdir)\"
-CLEANFILES =
+CLEANFILES =
uninstall-local:
rm -f $(DESTDIR)$(xlatordir)/distribute.so
install-data-hook:
- ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so \ No newline at end of file
+ ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so
+
+if UNITTEST
+CLEANFILES += *.gcda *.gcno *_xunit.xml
+noinst_PROGRAMS =
+TESTS =
+endif
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index d1e04b16ae7..8ba0cc4c732 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -1,4944 +1,11391 @@
/*
- Copyright (c) 2009-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
/* TODO: add NS locking */
-#include "glusterfs.h"
-#include "xlator.h"
+#include "libxlator.h"
#include "dht-common.h"
-#include "defaults.h"
+#include "dht-lock.h"
+#include <glusterfs/byte-order.h>
+#include <glusterfs/quota-common-utils.h>
+#include <glusterfs/upcall-utils.h>
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include <glusterfs/common-utils.h>
#include <sys/time.h>
#include <libgen.h>
+#include <signal.h>
-/* TODO:
- - use volumename in xattr instead of "dht"
- - use NS locks
- - handle all cases in self heal layout reconstruction
- - complete linkfile selfheal
-*/
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries,
+ dict_t *xdata);
+static int
+dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
-int
-dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int op_ret, int op_errno)
-{
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int ret = 0;
+static int
+dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req);
- local = frame->local;
- ret = op_ret;
+static int
+dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this);
- dht_frame_su_undo (frame);
+static int
+dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
- if (ret == 0) {
- layout = local->selfheal.layout;
- ret = dht_layout_set (this, local->inode, layout);
+static int
+dht_rmdir_unlock(call_frame_t *frame, xlator_t *this);
- if (local->ia_ino) {
- local->stbuf.ia_ino = local->ia_ino;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not find hashed subvolume for %s",
- local->loc.path);
- }
+static const char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL};
- if (local->loc.parent)
- local->postparent.ia_ino = local->loc.parent->ino;
- }
+/* Check the xdata to make sure EBADF has been set by client xlator */
+int32_t
+dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno)
+{
+ if (op_ret == -1 && (op_errno == EBADF || op_errno == EBADFD) &&
+ !(local->fd_checked)) {
+ return 1;
+ }
+ return 0;
+}
- WIPE (&local->postparent);
+/* Sets the blocks and size values to fixed values. This is to be called
+ * only for dirs. The caller is responsible for checking the type
+ */
+int32_t
+dht_set_fixed_dir_stat(struct iatt *stat)
+{
+ if (stat) {
+ stat->ia_blocks = DHT_DIR_STAT_BLOCKS;
+ stat->ia_size = DHT_DIR_STAT_SIZE;
+ return 0;
+ }
+ return -1;
+}
- DHT_STACK_UNWIND (lookup, frame, ret, local->op_errno, local->inode,
- &local->stbuf, local->xattr, &local->postparent);
+/* Return true if key exists in array
+ */
+static gf_boolean_t
+dht_match_xattr(const char *key)
+{
+ char **xattrs_to_heal = get_xattrs_to_heal();
- return 0;
+ return gf_get_index_by_elem(xattrs_to_heal, (char *)key) >= 0;
}
+static int
+dht_aggregate_quota_xattr(dict_t *dst, char *key, data_t *value)
+{
+ int ret = -1;
+ quota_meta_t *meta_dst = NULL;
+ quota_meta_t *meta_src = NULL;
+ int64_t *size = NULL;
+ int64_t dst_dir_count = 0;
+ int64_t src_dir_count = 0;
+
+ if (value == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL,
+ "data value is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_bin(dst, key, (void **)&meta_dst);
+ if (ret < 0) {
+ meta_dst = GF_CALLOC(1, sizeof(quota_meta_t), gf_common_quota_meta_t);
+ if (meta_dst == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, ENOMEM, DHT_MSG_NO_MEMORY,
+ "Memory allocation failed");
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_bin(dst, key, meta_dst, sizeof(quota_meta_t));
+ if (ret < 0) {
+ gf_msg("dht", GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED,
+ "dht aggregate dict set failed");
+ GF_FREE(meta_dst);
+ ret = -1;
+ goto out;
+ }
+ }
-int
-dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- int ret = 0;
- int is_dir = 0;
-
- conf = this->private;
- local = frame->local;
- prev = cookie;
-
- layout = local->layout;
-
- if (!op_ret && uuid_is_null (local->gfid))
- memcpy (local->gfid, stbuf->ia_gfid, 16);
-
- LOCK (&frame->lock);
- {
- /* TODO: assert equal mode on stbuf->st_mode and
- local->stbuf->st_mode
-
- else mkdir/chmod/chown and fix
- */
- ret = dht_layout_merge (this, layout, prev->this,
- op_ret, op_errno, xattr);
-
- if (op_ret == -1) {
- local->op_errno = ENOENT;
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup of %s on %s returned error (%s)",
- local->loc.path, prev->this->name,
- strerror (op_errno));
-
- goto unlock;
- }
-
- is_dir = check_is_dir (inode, stbuf, xattr);
- if (!is_dir) {
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup of %s on %s returned non dir 0%o",
- local->loc.path, prev->this->name,
- stbuf->ia_type);
- local->need_selfheal = 1;
- goto unlock;
- }
-
- local->op_ret = 0;
- if (local->xattr == NULL)
- local->xattr = dict_ref (xattr);
- if (local->inode == NULL)
- local->inode = inode_ref (inode);
+ if (value->len > sizeof(int64_t)) {
+ meta_src = data_to_bin(value);
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
+ meta_dst->size = hton64(ntoh64(meta_dst->size) +
+ ntoh64(meta_src->size));
+ meta_dst->file_count = hton64(ntoh64(meta_dst->file_count) +
+ ntoh64(meta_src->file_count));
- if (prev->this == dht_first_up_subvol (this)) {
- local->ia_ino = local->stbuf.ia_ino;
- }
+ if (value->len > (2 * sizeof(int64_t))) {
+ dst_dir_count = ntoh64(meta_dst->dir_count);
+ src_dir_count = ntoh64(meta_src->dir_count);
+ if (src_dir_count > dst_dir_count)
+ meta_dst->dir_count = meta_src->dir_count;
+ } else {
+ meta_dst->dir_count = 0;
}
-unlock:
- UNLOCK (&frame->lock);
+ } else {
+ size = data_to_bin(value);
+ meta_dst->size = hton64(ntoh64(meta_dst->size) + ntoh64(*size));
+ }
+ ret = 0;
+out:
+ return ret;
+}
- this_call_cnt = dht_frame_return (frame);
-
- if (is_last_call (this_call_cnt)) {
- if (local->need_selfheal) {
- local->need_selfheal = 0;
- dht_lookup_everywhere (frame, this, &local->loc);
- return 0;
- }
+static int
+add_opt(char **optsp, const char *opt)
+{
+ char *newopts = NULL;
+ unsigned oldsize = 0;
+ unsigned newsize = 0;
+
+ if (*optsp == NULL)
+ newopts = gf_strdup(opt);
+ else {
+ oldsize = strlen(*optsp);
+ newsize = oldsize + 1 + strlen(opt) + 1;
+ newopts = GF_REALLOC(*optsp, newsize);
+ if (newopts)
+ sprintf(newopts + oldsize, ",%s", opt);
+ }
+ if (newopts == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to add choices in buffer in add_opt");
+ return -1;
+ }
+ *optsp = newopts;
+ return 0;
+}
- if (local->op_ret == 0) {
- ret = dht_layout_normalize (this, &local->loc, layout);
+/* Return Choice list from Split brain status */
+static char *
+getChoices(const char *value)
+{
+ int i = 0;
+ char *ptr = NULL;
+ char *tok = NULL;
+ char *result = NULL;
+ char *newval = NULL;
+
+ ptr = strstr(value, "Choices:");
+ if (!ptr) {
+ result = ptr;
+ goto out;
+ }
+
+ newval = gf_strdup(ptr);
+ if (!newval) {
+ result = newval;
+ goto out;
+ }
+
+ tok = strtok(newval, ":");
+ if (!tok) {
+ result = tok;
+ goto out;
+ }
+
+ while (tok) {
+ i++;
+ if (i == 2)
+ break;
+ tok = strtok(NULL, ":");
+ }
+
+ result = gf_strdup(tok);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "fixing assignment on %s",
- local->loc.path);
- goto selfheal;
- }
+out:
+ if (newval)
+ GF_FREE(newval);
- dht_layout_set (this, local->inode, layout);
+ return result;
+}
- if (local->ia_ino) {
- local->stbuf.ia_ino = local->ia_ino;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not find hashed subvol for %s",
- local->loc.path);
- }
+/* This function prepare a list of choices for key
+ (replica.split-brain-status) in case of metadata split brain
+ only on the basis of key-value passed to this function.
+ After prepare the list of choices it update the same key in dict
+ with this value to reflect the same in
+ replica.split-brain-status attr for file.
- if (local->loc.parent)
- local->postparent.ia_ino =
- local->loc.parent->ino;
- }
+*/
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
+static int
+dht_aggregate_split_brain_xattr(dict_t *dst, char *key, data_t *value)
+{
+ int ret = 0;
+ char *oldvalue = NULL;
+ char *old_choice = NULL;
+ char *new_choice = NULL;
+ char *full_choice = NULL;
+ char *status = NULL;
+
+ if (value == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL,
+ "GF_AFR_SBRAIN_STATUS value is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_str(dst, key, &oldvalue);
+ if (ret)
+ goto out;
+
+ /* skip code that is irrelevant if !oldvalue */
+ if (!oldvalue)
+ goto out;
+
+ if (strstr(oldvalue, "not")) {
+ gf_msg_debug("dht", 0, "Need to update split-brain status in dict");
+ ret = -1;
+ goto out;
+ }
+ if (strstr(oldvalue, "metadata-split-brain:yes") &&
+ (strstr(oldvalue, "data-split-brain:no"))) {
+ if (strstr(value->data, "not")) {
+ gf_msg_debug("dht", 0, "No need to update split-brain status");
+ ret = 0;
+ goto out;
}
+ if (strstr(value->data, "yes") &&
+ (strncmp(oldvalue, value->data, strlen(oldvalue)))) {
+ old_choice = getChoices(oldvalue);
+ if (!old_choice) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to get choices");
+ ret = -1;
+ goto out;
+ }
- return 0;
+ ret = add_opt(&full_choice, old_choice);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to add choices");
+ ret = -1;
+ goto out;
+ }
-selfheal:
- dht_frame_su_do (frame);
- ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk,
- &local->loc, layout);
+ new_choice = getChoices(value->data);
+ if (!new_choice) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to get choices");
+ ret = -1;
+ goto out;
+ }
- return 0;
+ ret = add_opt(&full_choice, new_choice);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to add choices ");
+ ret = -1;
+ goto out;
+ }
+ ret = gf_asprintf(&status,
+ "data-split-brain:%s "
+ "metadata-split-brain:%s Choices:%s",
+ "no", "yes", full_choice);
+
+ if (-1 == ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to prepare status ");
+ goto out;
+ }
+ ret = dict_set_dynstr(dst, key, status);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set full choice");
+ }
+ }
+ }
+
+out:
+ if (old_choice)
+ GF_FREE(old_choice);
+ if (new_choice)
+ GF_FREE(new_choice);
+ if (full_choice)
+ GF_FREE(full_choice);
+
+ return ret;
}
-int
-dht_lookup_root_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- int ret = 0;
- int is_dir = 0;
-
- conf = this->private;
- local = frame->local;
- prev = cookie;
-
- layout = local->layout;
-
- LOCK (&frame->lock);
- {
- ret = dht_layout_merge (this, layout, prev->this,
- op_ret, op_errno, xattr);
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_ERROR,
- "lookup of %s on %s returned error (%s)",
- local->loc.path, prev->this->name,
- strerror (op_errno));
- goto unlock;
- }
-
- is_dir = check_is_dir (inode, stbuf, xattr);
- if (!is_dir) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "lookup of %s on %s returned non dir 0%o",
- local->loc.path, prev->this->name,
- stbuf->ia_type);
- goto unlock;
- }
+static int
+dht_aggregate(dict_t *this, char *key, data_t *value, void *data)
+{
+ dict_t *dst = NULL;
+ int32_t ret = -1;
+ data_t *dict_data = NULL;
+
+ dst = data;
+
+ /* compare split brain xattr only */
+ if (strcmp(key, GF_AFR_SBRAIN_STATUS) == 0) {
+ ret = dht_aggregate_split_brain_xattr(dst, key, value);
+ if (!ret)
+ goto out;
+ } else if (strcmp(key, QUOTA_SIZE_KEY) == 0) {
+ ret = dht_aggregate_quota_xattr(dst, key, value);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0,
+ DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED,
+ "Failed to aggregate quota xattr");
+ }
+ goto out;
+ } else if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+ ret = gf_get_min_stime(THIS, dst, key, value);
+ goto out;
+ } else {
+ /* compare user xattrs only */
+ if (!strncmp(key, "user.", SLEN("user."))) {
+ ret = dict_lookup(dst, key, &dict_data);
+ if (!ret && dict_data && value) {
+ ret = is_data_equal(dict_data, value);
+ if (!ret)
+ gf_msg_debug("dht", 0, "xattr mismatch for %s", key);
+ }
+ }
+ }
- local->op_ret = 0;
- if (local->xattr == NULL)
- local->xattr = dict_ref (xattr);
- if (local->inode == NULL)
- local->inode = inode_ref (inode);
+ ret = dict_set(dst, key, value);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s", key);
+ }
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
+out:
+ return ret;
+}
- if (prev->this == dht_first_up_subvol (this)) {
- local->ia_ino = local->stbuf.ia_ino;
- }
+static void
+dht_aggregate_xattr(dict_t *dst, dict_t *src)
+{
+ if ((dst == NULL) || (src == NULL)) {
+ goto out;
+ }
+
+ dict_foreach(src, dht_aggregate, dst);
+out:
+ return;
+}
+/* Code to save hashed subvol on inode ctx as a mds subvol
+ */
+int
+dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+ uint64_t ctx_int = 0;
+ gf_boolean_t ctx_free = _gf_false;
+
+ LOCK(&inode->lock);
+ {
+ ret = __inode_ctx_get(inode, this, &ctx_int);
+ if (ctx_int) {
+ ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int;
+ ctx->mds_subvol = mds_subvol;
+ } else {
+ ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
+ if (!ctx)
+ goto unlock;
+ ctx->mds_subvol = mds_subvol;
+ ctx_free = _gf_true;
+ ctx_int = (long)ctx;
+ ret = __inode_ctx_set(inode, this, &ctx_int);
}
+ }
unlock:
- UNLOCK (&frame->lock);
+ UNLOCK(&inode->lock);
+ if (ret && ctx_free)
+ GF_FREE(ctx);
+ return ret;
+}
+/*Code to get mds subvol from inode ctx */
- this_call_cnt = dht_frame_return (frame);
+int
+dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- if (is_last_call (this_call_cnt)) {
- if (local->op_ret == 0) {
- ret = dht_layout_normalize (this, &local->loc, layout);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_INFO,
- "fixing assignment on %s",
- local->loc.path);
- }
+ if (!mdsvol)
+ return ret;
+
+ if (__is_root_gfid(inode->gfid)) {
+ (*mdsvol) = FIRST_CHILD(this);
+ return 0;
+ }
- dht_layout_set (this, local->inode, layout);
- }
+ ret = dht_inode_ctx_get(inode, this, &ctx);
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
+ if (!ret && ctx) {
+ if (ctx->mds_subvol) {
+ *mdsvol = ctx->mds_subvol;
+ ret = 0;
+ } else {
+ ret = -1;
}
+ }
- return 0;
+ return ret;
}
+/* TODO:
+ - use volumename in xattr instead of "dht"
+ - use NS locks
+ - handle all cases in self heal layout reconstruction
+ - complete linkfile selfheal
+*/
+
static int
-dht_do_fresh_lookup_on_root (xlator_t *this, call_frame_t *frame)
+dht_lookup_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int call_cnt = 0;
- int i = 0;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
- local = frame->local;
- conf = this->private;
- if (!conf)
- goto err;
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+
+ local = frame->local;
+ conf = this->private;
+ ret = op_ret;
+
+ FRAME_SU_UNDO(frame, dht_local_t);
+
+ if (ret == 0) {
+ layout = local->selfheal.layout;
+ ret = dht_layout_set(this, local->inode, layout);
+ }
+
+ dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, &local->postparent,
+ 1);
+ }
- if (local->layout) {
- dht_layout_unref (this, local->layout);
- local->layout = NULL;
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ /* Delete mds xattr at the time of STACK UNWIND */
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
+
+ DHT_STACK_UNWIND(lookup, frame, ret, local->op_errno, local->inode,
+ &local->stbuf, local->xattr, &local->postparent);
+
+out:
+ return ret;
+}
+
+static int
+dht_discover_complete(xlator_t *this, call_frame_t *discover_frame)
+{
+ dht_local_t *local = NULL;
+ dht_local_t *heal_local = NULL;
+ call_frame_t *main_frame = NULL;
+ call_frame_t *heal_frame = NULL;
+ int op_errno = 0;
+ int ret = -1;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ uint32_t vol_commit_hash = 0;
+ xlator_t *source = NULL;
+ int heal_path = 0;
+ int error_while_marking_mds = 0;
+ int i = 0;
+ loc_t loc = {0};
+ int8_t is_read_only = 0, layout_anomalies = 0;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+
+ local = discover_frame->local;
+ layout = local->layout;
+ conf = this->private;
+ gf_uuid_unparse(local->gfid, gfid_local);
+
+ LOCK(&discover_frame->lock);
+ {
+ main_frame = local->main_frame;
+ local->main_frame = NULL;
+ }
+ UNLOCK(&discover_frame->lock);
+
+ if (!main_frame)
+ return 0;
+
+ /* Code to update all extended attributed from
+ subvol to local->xattr on that internal xattr has found
+ */
+ if (conf->subvolume_cnt == 1)
+ local->need_xattr_heal = 0;
+ if (local->need_xattr_heal && (local->mds_xattr)) {
+ dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr,
+ NULL, NULL);
+ dict_unref(local->mds_xattr);
+ local->mds_xattr = NULL;
+ }
+
+ ret = dict_get_int8(local->xattr_req, QUOTA_READ_ONLY_KEY, &is_read_only);
+ if (ret < 0)
+ gf_msg_debug(this->name, 0, "key = %s not present in dict",
+ QUOTA_READ_ONLY_KEY);
+
+ if (local->file_count && local->dir_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+ "path %s exists as a file on one subvolume "
+ "and directory on another. "
+ "Please fix it manually",
+ local->loc.path);
+ op_errno = EIO;
+ goto out;
+ }
+
+ if (local->cached_subvol) {
+ ret = dht_layout_preset(this, local->cached_subvol, local->inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to set layout for subvolume %s",
+ local->cached_subvol ? local->cached_subvol->name : "<nil>");
+ op_errno = EINVAL;
+ goto out;
+ }
+ } else {
+ ret = dht_layout_normalize(this, &local->loc, layout);
+ if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) {
+ /* either the layout is incorrect or the directory is
+ * not found even in one subvolume.
+ */
+ gf_msg_debug(this->name, 0,
+ "normalizing failed on %s "
+ "(overlaps/holes present: %s, "
+ "ENOENT errors: %d)",
+ local->loc.path, (ret < 0) ? "yes" : "no",
+ (ret > 0) ? ret : 0);
+ layout_anomalies = 1;
+ } else if (local->inode) {
+ dht_layout_set(this, local->inode, layout);
}
+ }
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set the dict entry for dht");
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32(local->xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
- call_cnt = local->call_cnt = conf->subvolume_cnt;
+ if (IA_ISDIR(local->stbuf.ia_type) && !is_read_only) {
+ for (i = 0; i < layout->cnt; i++) {
+ if (!source && !layout->list[i].err)
+ source = layout->list[i].xlator;
+ if (layout->list[i].err == ENOENT ||
+ layout->list[i].err == ESTALE) {
+ heal_path = 1;
+ }
+
+ if (source && heal_path)
+ break;
+ }
+ }
+
+ if (IA_ISDIR(local->stbuf.ia_type)) {
+ /* Call function to save hashed subvol on inode ctx if
+ internal mds xattr is not present and all subvols are up
+ */
+ if (!local->op_ret && !__is_root_gfid(local->stbuf.ia_gfid))
+ (void)dht_common_mark_mdsxattr(discover_frame,
+ &error_while_marking_mds, 1);
+
+ if (local->need_xattr_heal && !heal_path) {
+ local->need_xattr_heal = 0;
+ ret = dht_dir_xattr_heal(this, local, &op_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "xattr heal failed for "
+ "directory gfid is %s ",
+ gfid_local);
+ }
+ }
+ }
- local->layout = dht_layout_new (this,
- conf->subvolume_cnt);
- if (!local->layout) {
- local->op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
+ if (source && (heal_path || layout_anomalies || error_while_marking_mds)) {
+ gf_uuid_copy(loc.gfid, local->gfid);
+ if (gf_uuid_is_null(loc.gfid)) {
+ goto done;
}
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_root_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
+ if (local->inode)
+ loc.inode = inode_ref(local->inode);
+ else
+ goto done;
+
+ heal_frame = create_frame(this, this->ctx->pool);
+ if (heal_frame) {
+ heal_local = dht_local_init(heal_frame, &loc, NULL, 0);
+ if (!heal_local)
+ goto cleanup;
+
+ gf_uuid_copy(heal_local->gfid, local->gfid);
+ heal_frame->cookie = source;
+ heal_local->xattr = dict_ref(local->xattr);
+ heal_local->stbuf = local->stbuf;
+ heal_local->postparent = local->postparent;
+ heal_local->inode = inode_ref(loc.inode);
+ heal_local->main_frame = main_frame;
+ FRAME_SU_DO(heal_frame, dht_local_t);
+ ret = synctask_new(this->ctx->env, dht_heal_full_path,
+ dht_heal_full_path_done, heal_frame, heal_frame);
+ if (!ret) {
+ loc_wipe(&loc);
+ return 0;
+ }
+ /*
+ * Failed to spawn the synctask. Returning
+ * with out doing heal.
+ */
+ cleanup:
+ loc_wipe(&loc);
+ DHT_STACK_DESTROY(heal_frame);
}
+ }
+done:
+ dht_set_fixed_dir_stat(&local->postparent);
+ /* Delete mds xattr at the time of STACK UNWIND */
+ if (local->xattr)
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
- return 0;
-err:
- DHT_STACK_UNWIND (lookup, frame, -1, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
- return 0;
+ DHT_STACK_UNWIND(lookup, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(lookup, main_frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+ return ret;
}
-int
-dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int is_dir = 0;
- int is_linkfile = 0;
- unsigned char root_gfid[16] = {0,};
+static int
+dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = cookie;
+ int ret = -1;
+ dht_conf_t *conf = 0;
+ dht_layout_t *layout = NULL;
+ int32_t mds_heal_fresh_lookup = 0;
+
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+
+ local = frame->local;
+ conf = this->private;
+ layout = local->selfheal.layout;
+ mds_heal_fresh_lookup = local->mds_heal_fresh_lookup;
+
+ if (op_ret) {
+ gf_msg_debug(this->name, op_ret,
+ "Failed to set %s on the MDS %s for path %s. ",
+ conf->mds_xattr_key, prev->name, local->loc.path);
+ } else {
+ /* Save mds subvol on inode ctx */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set mds subvol on inode ctx"
+ " %s for %s ",
+ prev->name, local->loc.path);
+ }
+ }
+ if (!local->mds_heal_fresh_lookup && layout) {
+ dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffffff,
+ layout);
+ }
+out:
+ if (mds_heal_fresh_lookup)
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
- local = frame->local;
- prev = cookie;
- conf = this->private;
- if (!conf)
+static xlator_t *
+dht_inode_get_hashed_subvol(inode_t *inode, xlator_t *this, loc_t *loc)
+{
+ char *path = NULL;
+ loc_t populate_loc = {
+ 0,
+ };
+ char *name = NULL;
+ xlator_t *hash_subvol = NULL;
+
+ if (!inode)
+ return hash_subvol;
+
+ if (loc && loc->parent && loc->path) {
+ if (!loc->name) {
+ name = strrchr(loc->path, '/');
+ if (name) {
+ loc->name = name + 1;
+ } else {
goto out;
+ }
+ }
+ hash_subvol = dht_subvol_get_hashed(this, loc);
+ goto out;
+ }
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
-
- if ((op_errno != ENOTCONN)
- && (op_errno != ENOENT)
- && (op_errno != ESTALE)) {
- gf_log (this->name, GF_LOG_INFO,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- }
-
- if (op_errno == ESTALE) {
- /* propogate the ESTALE to parent.
- * setting local->layout_mismatch would send
- * ESTALE to parent. */
- local->layout_mismatch = 1;
- }
+ if (!gf_uuid_is_null(inode->gfid)) {
+ populate_loc.inode = inode_ref(inode);
+ populate_loc.parent = inode_parent(populate_loc.inode, NULL, NULL);
+ inode_path(populate_loc.inode, NULL, &path);
- goto unlock;
- }
-
- if (stbuf->ia_type != local->inode->ia_type) {
- gf_log (this->name, GF_LOG_INFO,
- "mismatching filetypes 0%o v/s 0%o for %s",
- (stbuf->ia_type), (local->inode->ia_type),
- local->loc.path);
-
- local->op_ret = -1;
- local->op_errno = EINVAL;
-
- goto unlock;
- }
-
- layout = local->layout;
-
- is_dir = check_is_dir (inode, stbuf, xattr);
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
-
- if (is_linkfile) {
- gf_log (this->name, GF_LOG_INFO,
- "linkfile found in revalidate for %s",
- local->loc.path);
- local->layout_mismatch = 1;
-
- goto unlock;
- }
-
- if (is_dir) {
- ret = dht_layout_dir_mismatch (this, layout,
- prev->this, &local->loc,
- xattr);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_INFO,
- "mismatching layouts for %s",
- local->loc.path);
-
- local->layout_mismatch = 1;
-
- goto unlock;
- }
- }
-
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
-
- local->op_ret = 0;
- local->stbuf.ia_ino = local->ia_ino;
-
- if (local->loc.parent)
- local->postparent.ia_ino = local->loc.parent->ino;
-
- if (!local->xattr)
- local->xattr = dict_ref (xattr);
- }
-unlock:
- UNLOCK (&frame->lock);
+ if (!path)
+ goto out;
+
+ populate_loc.path = path;
+ if (!populate_loc.name && populate_loc.path) {
+ name = strrchr(populate_loc.path, '/');
+ if (name) {
+ populate_loc.name = name + 1;
+
+ } else {
+ goto out;
+ }
+ }
+ hash_subvol = dht_subvol_get_hashed(this, &populate_loc);
+ }
out:
- this_call_cnt = dht_frame_return (frame);
-
- if (is_last_call (this_call_cnt)) {
- if (!IA_ISDIR (local->stbuf.ia_type)
- && (local->hashed_subvol != local->cached_subvol)
- && (local->stbuf.ia_nlink == 1)
- && (conf && conf->unhashed_sticky_bit)) {
- local->stbuf.ia_prot.sticky = 1;
- }
-
- if (local->layout_mismatch) {
- local->op_ret = -1;
- local->op_errno = ESTALE;
-
- /* Because for 'root' inode, there is no FRESH lookup
- * sent from FUSE layer upon ESTALE, we need to handle
- * that one case here */
- root_gfid[15] = 1;
- if (!local->loc.parent &&
- !uuid_compare (local->loc.inode->gfid, root_gfid)) {
- dht_do_fresh_lookup_on_root (this, frame);
- return 0;
- }
- }
+ if (populate_loc.inode)
+ loc_wipe(&populate_loc);
+ return hash_subvol;
+}
- WIPE (&local->postparent);
+/* Common function call by revalidate/selfheal code path to populate
+ internal xattr if it is not present, mark_during_fresh_lookup value
+ determines either function is call by revalidate_cbk(discover_complete)
+ or call by selfheal code path while fresh lookup.
+ Here we do wind a call serially in case of fresh lookup and
+ for other lookup code path we do wind a call parallel.The reason
+ to wind a call serially is at the time of fresh lookup directory is not
+ discovered and at the time of revalidate_lookup directory is
+ already discovered. So, revalidate codepath can race with setxattr
+ codepath and can get into spurious heals because of an ongoing setxattr.
+ This can slow down revalidates, if healing happens in foreground.
+ However, if healing happens in background, there is no direct performance
+ penalty.
+*/
+int
+dht_common_mark_mdsxattr(call_frame_t *frame, int *errst,
+ int mark_during_fresh_lookup)
+{
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ xlator_t *hashed_subvol = NULL;
+ int ret = 0;
+ int i = 0;
+ dict_t *xattrs = NULL;
+ char gfid_local[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+ int32_t zero[1] = {0};
+ dht_conf_t *conf = 0;
+ dht_layout_t *layout = NULL;
+ dht_local_t *copy_local = NULL;
+ call_frame_t *xattr_frame = NULL;
+ gf_boolean_t vol_down = _gf_false;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ this = frame->this;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ local = frame->local;
+ conf = this->private;
+ layout = local->selfheal.layout;
+ local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
+
+ gf_uuid_unparse(local->gfid, gfid_local);
+
+ /* Code to update hashed subvol consider as a mds subvol
+ and wind a setxattr call on hashed subvol to update
+ internal xattr
+ */
+ if (!local->xattr || !dict_get(local->xattr, conf->mds_xattr_key)) {
+ /* It means no internal MDS xattr has been set yet
+ */
+ /* Check the status of all subvol are up while call
+ this function call by lookup code path
+ */
+ if (mark_during_fresh_lookup) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ vol_down = _gf_true;
+ break;
+ }
+ }
+ if (vol_down) {
+ gf_msg_debug(this->name, 0,
+ "subvol %s is down. Unable to "
+ " save mds subvol on inode for "
+ " path %s gfid is %s ",
+ conf->subvolumes[i]->name, local->loc.path,
+ gfid_local);
+ goto out;
+ }
+ }
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
- }
+ /* Calculate hashed subvol based on inode and parent node
+ */
+ hashed_subvol = dht_inode_get_hashed_subvol(local->inode, this,
+ &local->loc);
+ if (!hashed_subvol) {
+ gf_msg(this->name, GF_LOG_DEBUG, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for path %s"
+ "gfid is %s ",
+ local->loc.path, gfid_local);
+ if (errst)
+ (*errst) = 1;
+ ret = -1;
+ goto out;
+ }
+ xattrs = dict_new();
+ if (!xattrs) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "dict_new failed");
+ ret = -1;
+ goto out;
+ }
+ /* Add internal MDS xattr on disk for hashed subvol
+ */
+ ret = dht_dict_set_array(xattrs, conf->mds_xattr_key, zero, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary"
+ " value:key = %s for "
+ "path %s",
+ conf->mds_xattr_key, local->loc.path);
+ ret = -1;
+ goto out;
+ }
+ /* Create a new frame to wind a call only while
+ this function call by revalidate_cbk code path
+ To wind a call parallel need to create a new frame
+ */
+ if (mark_during_fresh_lookup) {
+ xattr_frame = create_frame(this, this->ctx->pool);
+ if (!xattr_frame) {
+ ret = -1;
+ goto out;
+ }
+ copy_local = dht_local_init(xattr_frame, &(local->loc), NULL, 0);
+ if (!copy_local) {
+ ret = -1;
+ DHT_STACK_DESTROY(xattr_frame);
+ goto out;
+ }
+ copy_local->stbuf = local->stbuf;
+ copy_local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
+ if (!copy_local->inode)
+ copy_local->inode = inode_ref(local->inode);
+ gf_uuid_copy(copy_local->loc.gfid, local->gfid);
+ FRAME_SU_DO(xattr_frame, dht_local_t);
+ STACK_WIND_COOKIE(xattr_frame, dht_common_mark_mdsxattr_cbk,
+ hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->setxattr, &local->loc,
+ xattrs, 0, NULL);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_common_mark_mdsxattr_cbk,
+ (void *)hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->setxattr, &local->loc,
+ xattrs, 0, NULL);
+ }
+ } else {
+ gf_msg_debug(this->name, 0,
+ "internal xattr %s is present on subvol"
+ "on path %s gfid is %s ",
+ conf->mds_xattr_key, local->loc.path, gfid_local);
+ if (!mark_during_fresh_lookup)
+ dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf,
+ 0xffffffff, layout);
+ }
- return 0;
+out:
+ if (xattrs)
+ dict_unref(xattrs);
+ return ret;
}
+/* Get the value of key from dict in the bytewise and save in array after
+ convert from network byte order to host byte order
+*/
+static int32_t
+dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size,
+ int *errst)
+{
+ void *ptr = NULL;
+ int32_t len = -1;
+ int32_t vindex = -1;
+ int32_t err = -1;
+ int ret = 0;
+
+ if (dict == NULL) {
+ (*errst) = -1;
+ return -EINVAL;
+ }
+ err = dict_get_ptr_and_len(dict, key, &ptr, &len);
+ if (err != 0) {
+ (*errst) = -1;
+ return err;
+ }
+
+ if (len != (size * sizeof(int32_t))) {
+ (*errst) = -1;
+ return -EINVAL;
+ }
+
+ for (vindex = 0; vindex < size; vindex++) {
+ value[vindex] = ntoh32(*((int32_t *)ptr + vindex));
+ if (value[vindex] < 0)
+ ret = -1;
+ }
+
+ return ret;
+}
-int
-dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- xlator_t *cached_subvol = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
-
- local = frame->local;
- cached_subvol = local->cached_subvol;
- conf = this->private;
-
- ret = dht_layout_preset (this, local->cached_subvol, inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set layout for subvolume %s",
- cached_subvol ? cached_subvol->name : "<nil>");
- local->op_ret = -1;
- local->op_errno = EINVAL;
- goto unwind;
- }
+static int
+dht_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ int is_dir = 0;
+ int32_t check_mds = 0;
+ int is_linkfile = 0;
+ int attempt_unwind = 0;
+ dht_conf_t *conf = 0;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char gfid_node[GF_UUID_BUF_SIZE] = {0};
+ int32_t mds_xattr_val[1] = {0};
+ int errst = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ layout = local->layout;
+
+ /* Check if the gfid is different for file from other node */
+ if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+ gf_uuid_unparse(stbuf->ia_gfid, gfid_node);
+ gf_uuid_unparse(local->gfid, gfid_local);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid different on %s, gfid local = %s"
+ "gfid other = %s",
+ local->loc.path, prev->name, gfid_local, gfid_node);
+ }
+
+ LOCK(&frame->lock);
+ {
+ /* TODO: assert equal mode on stbuf->st_mode and
+ local->stbuf->st_mode
+
+ else mkdir/chmod/chown and fix
+ */
+
+ ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ "%s: failed to merge layouts for subvol %s", local->loc.path,
+ prev->name);
- local->op_ret = 0;
- if ((local->stbuf.ia_nlink == 1)
- && (conf && conf->unhashed_sticky_bit)) {
- local->stbuf.ia_prot.sticky = 1;
- }
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ gf_msg_debug(this->name, op_errno,
+ "lookup of %s on %s returned error", local->loc.path,
+ prev->name);
- if (local->loc.parent)
- local->postparent.ia_ino = local->loc.parent->ino;
+ goto unlock;
+ }
-unwind:
- WIPE (&local->postparent);
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr,
+ conf->link_xattr_name);
+ is_dir = check_is_dir(inode, stbuf, xattr);
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
- return 0;
-}
+ if (is_dir) {
+ local->dir_count++;
+ } else {
+ local->file_count++;
+
+ if (!is_linkfile && !local->cached_subvol) {
+ /* real file */
+ /* Ok, we somehow managed to find a file on
+ * more than one subvol. ignore this or we
+ * will end up overwriting information while a
+ * a thread is potentially unwinding from
+ * dht_discover_complete
+ */
+ local->cached_subvol = prev;
+ attempt_unwind = 1;
+ } else {
+ goto unlock;
+ }
+ }
+ local->op_ret = 0;
-int
-dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- int is_linkfile = 0;
- int is_dir = 0;
- xlator_t *subvol = NULL;
- loc_t *loc = NULL;
- xlator_t *link_subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
- int ret = -1;
-
- conf = this->private;
-
- local = frame->local;
- loc = &local->loc;
-
- prev = cookie;
- subvol = prev->this;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno != ENOENT)
- local->op_errno = op_errno;
- goto unlock;
- }
- if (uuid_is_null (local->gfid))
- memcpy (local->gfid, buf->ia_gfid, 16);
-
- is_linkfile = check_is_linkfile (inode, buf, xattr);
- is_dir = check_is_dir (inode, buf, xattr);
-
- if (is_linkfile) {
- link_subvol = dht_linkfile_subvol (this, inode, buf,
- xattr);
- gf_log (this->name, GF_LOG_DEBUG,
- "found on %s linkfile %s (-> %s)",
- subvol->name, loc->path,
- link_subvol ? link_subvol->name : "''");
- goto unlock;
- }
-
- if (is_dir) {
- local->dir_count++;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "found on %s directory %s",
- subvol->name, loc->path);
- } else {
- local->file_count++;
-
- if (!local->cached_subvol) {
- /* found one file */
- dht_iatt_merge (this, &local->stbuf, buf,
- subvol);
- local->xattr = dict_ref (xattr);
- local->cached_subvol = subvol;
- gf_log (this->name, GF_LOG_DEBUG,
- "found on %s file %s",
- subvol->name, loc->path);
-
- dht_iatt_merge (this, &local->postparent,
- postparent, subvol);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "multiple subvolumes (%s and %s) have "
- "file %s", local->cached_subvol->name,
- subvol->name, local->loc.path);
- }
- }
- }
-unlock:
- UNLOCK (&frame->lock);
-
- if (is_linkfile) {
- gf_log (this->name, GF_LOG_DEBUG,
- "deleting stale linkfile %s on %s",
- loc->path, subvol->name);
- dht_linkfile_unlink (frame, this, subvol, loc);
- }
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- hashed_subvol = local->hashed_subvol;
- cached_subvol = local->cached_subvol;
-
- if (local->file_count && local->dir_count) {
- gf_log (this->name, GF_LOG_ERROR,
- "path %s exists as a file on one subvolume "
- "and directory on another. "
- "Please fix it manually",
- loc->path);
- DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL,
- NULL);
- return 0;
- }
+ if (local->xattr == NULL) {
+ local->xattr = dict_ref(xattr);
+ } else {
+ /* Don't aggregate for files. See BZ#1484709 */
+ if (is_dir)
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
- if (local->dir_count) {
- dht_lookup_directory (frame, this, &local->loc);
- return 0;
- }
+ if (local->inode == NULL)
+ local->inode = inode_ref(inode);
- if (!cached_subvol) {
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
- NULL);
- return 0;
- }
-
- if (!hashed_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "cannot create linkfile file for %s on %s: "
- "hashed subvolume cannot be found.",
- loc->path, cached_subvol->name);
-
- local->op_ret = 0;
- local->op_errno = 0;
-
- ret = dht_layout_preset (frame->this, cached_subvol,
- local->inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set layout for subvol %s",
- cached_subvol ? cached_subvol->name :
- "<nil>");
- local->op_ret = -1;
- local->op_errno = EINVAL;
- }
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->postparent, postparent);
- if (local->loc.parent)
- local->postparent.ia_ino =
- local->loc.parent->ino;
+ if (!dict_get(xattr, conf->mds_xattr_key)) {
+ goto unlock;
+ } else {
+ gf_msg_debug(this->name, 0,
+ "internal xattr %s is present on subvol"
+ "on path %s gfid is %s ",
+ conf->mds_xattr_key, local->loc.path, gfid_local);
+ }
+ check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+ mds_xattr_val, 1, &errst);
+ /* save mds subvol on inode ctx */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set hashed subvol for %s vol is %s",
+ local->loc.path, prev->name);
+ }
- WIPE (&local->postparent);
+ if ((check_mds < 0) && !errst) {
+ local->mds_xattr = dict_ref(xattr);
+ gf_msg_debug(this->name, 0,
+ "Value of %s is not zero on mds subvol"
+ "so xattr needs to be healed on non mds"
+ " path is %s and vol name is %s "
+ " gfid is %s",
+ conf->mds_xattr_key, local->loc.path, prev->name,
+ gfid_local);
+ local->need_xattr_heal = 1;
+ local->mds_subvol = prev;
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+out:
+ /* Make sure, the thread executing dht_discover_complete is the one
+ * which calls STACK_DESTROY (frame). In the case of "attempt_unwind",
+ * this makes sure that the thread don't call dht_frame_return, till
+ * call to dht_discover_complete is done.
+ */
+ if (attempt_unwind) {
+ dht_discover_complete(this, frame);
+ }
- DHT_STACK_UNWIND (lookup, frame, local->op_ret,
- local->op_errno, local->inode,
- &local->stbuf, local->xattr,
- &local->postparent);
- return 0;
- }
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt) && !attempt_unwind) {
+ dht_discover_complete(this, frame);
+ }
- gf_log (this->name, GF_LOG_DEBUG,
- "linking file %s existing on %s to %s (hash)",
- loc->path, cached_subvol->name,
- hashed_subvol->name);
-
- dht_linkfile_create (frame,
- dht_lookup_linkfile_create_cbk,
- cached_subvol, hashed_subvol, loc);
- }
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_DESTROY(frame);
- return 0;
+ return 0;
}
+static int
+dht_set_file_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ int ret = -EINVAL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf) {
+ goto err;
+ }
+
+ if (!xattr_req) {
+ goto err;
+ }
+
+ /* Used to check whether this is a linkto file.
+ */
+ ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->link_xattr_name, loc->path);
+ goto err;
+ }
+
+ /* This is used to make sure we don't unlink linkto files
+ * which are the target of an ongoing file migration.
+ */
+ ret = dict_set_uint32(xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ GLUSTERFS_OPEN_FD_COUNT, loc->path);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ return ret;
+}
-int
-dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc)
+/* This is a gfid based nameless lookup. Without a name, the hashed subvol
+ * cannot be calculated so a lookup is sent to all subvols.
+ */
+static int
+dht_do_discover(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int i = 0;
- int call_cnt = 0;
+ int ret;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int call_cnt = 0;
+ int op_errno = EINVAL;
+ int i = 0;
+ call_frame_t *discover_frame = NULL;
+
+ conf = this->private;
+ local = frame->local;
+
+ /* As we do not know if this is a file or directory, request
+ * both file and directory xattrs
+ */
+ ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ goto err;
+ }
+
+ ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ goto err;
+ }
+
+ if (loc_is_root(loc)) {
+ /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash)
+ * set on the brick root.
+ */
+ ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
- conf = this->private;
- local = frame->local;
- if (!conf)
- goto out;
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
- if (!local->inode)
- local->inode = inode_ref (loc->inode);
+ if (!local->layout) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_everywhere_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- loc, local->xattr_req);
- }
+ gf_uuid_copy(local->gfid, loc->gfid);
- return 0;
+ discover_frame = copy_frame(frame);
+ if (!discover_frame) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ discover_frame->local = local;
+ frame->local = NULL;
+ local->main_frame = frame;
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(discover_frame, dht_discover_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
+ }
+
+ return 0;
+
+err:
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
+
+/* Code to call syntask to heal custom xattr from hashed subvol
+ to non hashed subvol
+*/
+int
+dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno)
+{
+ dht_local_t *copy_local = NULL;
+ call_frame_t *copy = NULL;
+ int ret = -1;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+
+ if (gf_uuid_is_null(local->gfid)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "No gfid exists for path %s "
+ "so healing xattr is not possible",
+ local->loc.path);
+ *op_errno = EIO;
+ goto out;
+ }
+
+ gf_uuid_unparse(local->gfid, gfid_local);
+ copy = create_frame(this, this->ctx->pool);
+ if (copy) {
+ copy_local = dht_local_init(copy, &(local->loc), NULL, 0);
+ if (!copy_local) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "Memory allocation failed "
+ "for path %s gfid %s ",
+ local->loc.path, gfid_local);
+ *op_errno = ENOMEM;
+ DHT_STACK_DESTROY(copy);
+ } else {
+ copy_local->stbuf = local->stbuf;
+ gf_uuid_copy(copy_local->loc.gfid, local->gfid);
+ copy_local->mds_subvol = local->mds_subvol;
+ FRAME_SU_DO(copy, dht_local_t);
+ ret = synctask_new(this->ctx->env, dht_dir_heal_xattrs,
+ dht_dir_heal_xattrs_done, copy, copy);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "Synctask creation failed to heal xattr "
+ "for path %s gfid %s ",
+ local->loc.path, gfid_local);
+ *op_errno = ENOMEM;
+ DHT_STACK_DESTROY(copy);
+ }
+ }
+ }
out:
- DHT_STACK_UNWIND (lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL);
- return 0;
+ return ret;
}
+static int
+dht_needs_selfheal(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int needs_selfheal = 0;
+ int ret = 0;
+
+ local = frame->local;
+ layout = local->layout;
+
+ if (local->need_attrheal || local->need_xattr_heal ||
+ local->need_selfheal) {
+ needs_selfheal = 1;
+ }
+
+ ret = dht_layout_normalize(this, &local->loc, layout);
+
+ if (ret != 0) {
+ gf_msg_debug(this->name, 0, "fixing assignment on %s", local->loc.path);
+ needs_selfheal = 1;
+ }
+ return needs_selfheal;
+}
+
+static int
+is_permission_different(ia_prot_t *prot1, ia_prot_t *prot2)
+{
+ if ((prot1->owner.read != prot2->owner.read) ||
+ (prot1->owner.write != prot2->owner.write) ||
+ (prot1->owner.exec != prot2->owner.exec) ||
+ (prot1->group.read != prot2->group.read) ||
+ (prot1->group.write != prot2->group.write) ||
+ (prot1->group.exec != prot2->group.exec) ||
+ (prot1->other.read != prot2->other.read) ||
+ (prot1->other.write != prot2->other.write) ||
+ (prot1->other.exec != prot2->other.exec) ||
+ (prot1->suid != prot2->suid) || (prot1->sgid != prot2->sgid) ||
+ (prot1->sticky != prot2->sticky)) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
int
-dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- call_frame_t *prev = NULL;
- dht_local_t *local = NULL;
- xlator_t *subvol = NULL;
- loc_t *loc = NULL;
- dht_conf_t *conf = NULL;
- int ret = 0;
-
- prev = cookie;
- subvol = prev->this;
- conf = this->private;
- local = frame->local;
- loc = &local->loc;
+dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ int is_dir = 0;
+ int32_t check_mds = 0;
+ int errst = 0;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char gfid_node[GF_UUID_BUF_SIZE] = {0};
+ int32_t mds_xattr_val[1] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ layout = local->layout;
+ gf_msg_debug(this->name, op_errno,
+ "%s: lookup on %s returned with op_ret = %d, op_errno = %d",
+ local->loc.path, prev->name, op_ret, op_errno);
+
+ /* The first successful lookup*/
+ if (!op_ret && gf_uuid_is_null(local->gfid)) {
+ memcpy(local->gfid, stbuf->ia_gfid, 16);
+ }
+ if (!gf_uuid_is_null(local->gfid)) {
+ gf_uuid_unparse(local->gfid, gfid_local);
+ }
+
+ /* Check if the gfid is different for file from other node */
+ if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+ gf_uuid_unparse(stbuf->ia_gfid, gfid_node);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid different on %s."
+ " gfid local = %s, gfid subvol = %s",
+ local->loc.path, prev->name, gfid_local, gfid_node);
+ }
+
+ LOCK(&frame->lock);
+ {
+ /* TODO: assert equal mode on stbuf->st_mode and
+ local->stbuf->st_mode
+ else mkdir/chmod/chown and fix
+ */
+ ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup of %s on %s (following linkfile) failed (%s)",
- local->loc.path, subvol->name, strerror (op_errno));
- goto err;
- }
+ local->op_errno = op_errno;
- if (check_is_dir (inode, stbuf, xattr)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup of %s on %s (following linkfile) reached dir",
- local->loc.path, subvol->name);
- goto err;
+ /* The GFID is missing on this subvol. Force a heal. */
+ if (op_errno == ENODATA) {
+ local->need_lookup_everywhere = 1;
+ }
+ goto unlock;
}
- if (check_is_linkfile (inode, stbuf, xattr)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup of %s on %s (following linkfile) reached link",
- local->loc.path, subvol->name);
- goto err;
+ is_dir = check_is_dir(inode, stbuf, xattr);
+ if (!is_dir) {
+ gf_msg_debug(this->name, 0,
+ "%s: lookup on %s returned non dir 0%o"
+ "calling lookup_everywhere",
+ local->loc.path, prev->name, stbuf->ia_type);
+
+ local->need_lookup_everywhere = 1;
+ goto unlock;
}
- if ((stbuf->ia_nlink == 1)
- && (conf && conf->unhashed_sticky_bit)) {
- stbuf->ia_prot.sticky = 1;
- }
- dht_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino);
- if (local->loc.parent)
- postparent->ia_ino = local->loc.parent->ino;
-
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set layout for subvolume %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ local->op_ret = 0;
+ if (local->xattr == NULL) {
+ local->xattr = dict_ref(xattr);
+ } else {
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
-out:
- WIPE (postparent);
+ if (__is_root_gfid(stbuf->ia_gfid)) {
+ ret = dht_dir_has_layout(xattr, conf->xattr_name);
+ if (ret >= 0) {
+ if (is_greater_time(local->prebuf.ia_ctime,
+ local->prebuf.ia_ctime_nsec,
+ stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
+ /* Choose source */
+ local->prebuf.ia_gid = stbuf->ia_gid;
+ local->prebuf.ia_uid = stbuf->ia_uid;
+
+ local->prebuf.ia_ctime = stbuf->ia_ctime;
+ local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec;
+ local->prebuf.ia_prot = stbuf->ia_prot;
+ }
+ }
+ }
- DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
- postparent);
+ if (local->stbuf.ia_type != IA_INVAL) {
+ /* This is not the first subvol to respond
+ * Compare values to see if attrs need to be healed
+ */
+ if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
+ (local->stbuf.ia_uid != stbuf->ia_uid) ||
+ (is_permission_different(&local->stbuf.ia_prot,
+ &stbuf->ia_prot))) {
+ local->need_attrheal = 1;
+ }
+ }
- return 0;
+ if (local->inode == NULL)
+ local->inode = inode_ref(inode);
-err:
- dht_lookup_everywhere (frame, this, loc);
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->postparent, postparent);
- return 0;
-}
+ if (!dict_get(xattr, conf->mds_xattr_key)) {
+ gf_msg_debug(this->name, 0,
+ "%s: mds xattr %s is not present "
+ "on %s(gfid = %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid_local);
+ goto unlock;
+ }
+ /* Save the mds subvol info and stbuf. This is the value that will
+ * be used for healing
+ */
+ local->mds_subvol = prev;
+ local->mds_stbuf = *stbuf;
-int
-dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- int call_cnt = 0;
- int i = 0;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
+ /* Save mds subvol on inode ctx */
- conf = this->private;
- local = frame->local;
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "%s: Failed to set mds (%s)", local->loc.path, prev->name);
+ }
+ check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+ mds_xattr_val, 1, &errst);
+ if ((check_mds < 0) && !errst) {
+ /* Check if xattrs need to be healed on the directories */
+ local->mds_xattr = dict_ref(xattr);
+ gf_msg_debug(this->name, 0,
+ "%s: %s is not zero on %s. Xattrs need to be healed."
+ "(gfid = %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid_local);
+ local->need_xattr_heal = 1;
+ }
+ }
- if (!conf)
- goto unwind;
+unlock:
+ UNLOCK(&frame->lock);
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
+ this_call_cnt = dht_frame_return(frame);
- local->layout = dht_layout_new (this, conf->subvolume_cnt);
- if (!local->layout) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto unwind;
+ if (is_last_call(this_call_cnt)) {
+ /* If the mds subvol is not set correctly*/
+ if (!__is_root_gfid(local->gfid) &&
+ (!dict_get(local->xattr, conf->mds_xattr_key))) {
+ local->need_selfheal = 1;
}
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
+ /* No need to call xattr heal code if volume count is 1
+ */
+ if (conf->subvolume_cnt == 1) {
+ local->need_xattr_heal = 0;
}
- return 0;
-unwind:
- DHT_STACK_UNWIND (lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
- return 0;
-}
+ if (local->need_selfheal || local->need_lookup_everywhere) {
+ /* Set the gfid-req so posix will set the GFID*/
+ if (!gf_uuid_is_null(local->gfid)) {
+ /* Ok, this should _never_ happen */
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid, 16);
+ } else {
+ if (!gf_uuid_is_null(local->gfid_req))
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid_req, 16);
+ }
+ }
+ if (local->need_lookup_everywhere) {
+ local->need_lookup_everywhere = 0;
+ dht_lookup_everywhere(frame, this, &local->loc);
+ return 0;
+ }
-int
-dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- char is_linkfile = 0;
- char is_dir = 0;
- xlator_t *subvol = NULL;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- loc_t *loc = NULL;
- call_frame_t *prev = NULL;
- int ret = 0;
- uint64_t tmp_layout = 0;
- dht_layout_t *parent_layout = NULL;
-
- conf = this->private;
- if (!conf)
- goto out;
+ if (local->op_ret == 0) {
+ if (dht_needs_selfheal(frame, this)) {
+ goto selfheal;
+ }
- prev = cookie;
- local = frame->local;
- loc = &local->loc;
+ dht_layout_set(this, local->inode, layout);
+ if (local->inode) {
+ dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+ }
- /* This is required for handling stale linkfile deletion,
- * or any more call which happens from this 'loc'.
- */
- if (uuid_is_null (local->gfid) && !op_ret)
- memcpy (local->gfid, stbuf->ia_gfid, 16);
-
- if (ENTRY_MISSING (op_ret, op_errno)) {
- if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_ON) {
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
- if ((conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) &&
- (loc->parent)) {
- ret = inode_ctx_get (loc->parent, this, &tmp_layout);
- parent_layout = (dht_layout_t *)(long)tmp_layout;
- if (parent_layout->search_unhashed) {
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
- }
- }
-
- if (op_ret == 0) {
- is_dir = check_is_dir (inode, stbuf, xattr);
- if (is_dir) {
- local->inode = inode_ref (inode);
- local->xattr = dict_ref (xattr);
- }
- }
-
- if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) {
- dht_lookup_directory (frame, this, &local->loc);
- return 0;
- }
-
- if (op_ret == -1)
- goto out;
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+ }
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
- is_dir = check_is_dir (inode, stbuf, xattr);
-
- if (!is_dir && !is_linkfile) {
- /* non-directory and not a linkfile */
-
- dht_itransform (this, prev->this, stbuf->ia_ino,
- &stbuf->ia_ino);
- if (loc->parent)
- postparent->ia_ino = loc->parent->ino;
-
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not set pre-set layout for subvolume %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
- goto out;
- }
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ /* Delete mds xattr at the time of STACK UNWIND */
+ if (local->xattr)
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
- if (is_linkfile) {
- subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
-
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "linkfile not having link subvolume. path=%s",
- loc->path);
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ }
- STACK_WIND (frame, dht_lookup_linkfile_cbk,
- subvol, subvol->fops->lookup,
- &local->loc, local->xattr_req);
- }
+ return 0;
- return 0;
+selfheal:
+ FRAME_SU_DO(frame, dht_local_t);
+ ret = dht_selfheal_directory(frame, dht_lookup_selfheal_cbk, &local->loc,
+ layout);
+out:
+ return ret;
+}
+static int
+dht_lookup_directory(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ int call_cnt = 0;
+ int i = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int ret = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, unwind);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, unwind);
+ GF_VALIDATE_OR_GOTO("dht", this->private, unwind);
+ GF_VALIDATE_OR_GOTO("dht", loc, unwind);
+
+ conf = this->private;
+ local = frame->local;
+
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ goto unwind;
+ }
+
+ if (local->xattr != NULL) {
+ dict_unref(local->xattr);
+ local->xattr = NULL;
+ }
+
+ if (!gf_uuid_is_null(local->gfid)) {
+ /* use this gfid in order to heal any missing ones */
+ ret = dict_set_gfuuid(local->xattr_req, "gfid-req", local->gfid, true);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: Failed to set dictionary value:"
+ " key = gfid-req",
+ local->loc.path);
+ }
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(
+ frame, dht_lookup_dir_cbk, conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc, local->xattr_req);
+ }
+ return 0;
+unwind:
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
out:
- /*
- * FIXME: postparent->ia_size and postparent->st_blocks do not have
- * correct values. since, postparent corresponds to a directory these
- * two members should have values equal to sum of corresponding values
- * from each of the subvolume. See dht_iatt_merge for reference.
+ return 0;
+}
+
+int
+dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int is_dir = 0;
+ int is_linkfile = 0;
+ int follow_link = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ uint32_t vol_commit_hash = 0;
+ xlator_t *subvol = NULL;
+ int32_t check_mds = 0;
+ int errst = 0, i = 0;
+ int32_t mds_xattr_val[1] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, err);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, err);
+ GF_VALIDATE_OR_GOTO("dht", cookie, err);
+ GF_VALIDATE_OR_GOTO("dht", this->private, err);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (!conf->vch_forced) {
+ /* Update the commithash value if available
*/
+ ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
- WIPE (postparent);
+ gf_uuid_unparse(local->loc.gfid, gfid);
- DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
- postparent);
- return 0;
-}
+ gf_msg_debug(this->name, op_errno,
+ "%s: revalidate lookup on %s returned op_ret %d",
+ local->loc.path, prev->name, op_ret);
+ LOCK(&frame->lock);
+ {
+ if (gf_uuid_is_null(local->gfid)) {
+ memcpy(local->gfid, local->loc.gfid, 16);
+ }
-int
-dht_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
-{
- xlator_t *subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int op_errno = -1;
- dht_layout_t *layout = NULL;
- int i = 0;
- int call_cnt = 0;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- conf = this->private;
- if (!conf)
- goto err;
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+
+ if ((op_errno != ENOTCONN) && (op_errno != ENOENT) &&
+ (op_errno != ESTALE)) {
+ gf_msg(this->name, GF_LOG_INFO, op_errno,
+ DHT_MSG_REVALIDATE_CBK_INFO,
+ "Revalidate: subvolume %s for %s "
+ "(gfid = %s) returned -1",
+ prev->name, local->loc.path, gfid);
+ }
+ if (op_errno == ESTALE) {
+ /* propagate the ESTALE to parent.
+ * setting local->return_estale would send
+ * ESTALE to parent. */
+ local->return_estale = 1;
+ }
+
+ /* if it is ENOENT, we may have to do a
+ * 'lookup_everywhere()' to make sure
+ * the file is not migrated */
+ if (op_errno == ENOENT) {
+ if (IA_ISREG(local->loc.inode->ia_type)) {
+ gf_msg_debug(this->name, 0,
+ "found ENOENT for %s. "
+ "Setting "
+ "need_lookup_everywhere"
+ " flag to 1",
+ local->loc.path);
+
+ local->need_lookup_everywhere = 1;
+ } else if (IA_ISDIR(local->loc.inode->ia_type)) {
+ layout = local->layout;
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == prev) {
+ layout->list[i].err = op_errno;
+ break;
+ }
+ }
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
- if (!dht_filter_loc_subvol_key (this, loc, &local->loc,
- &hashed_subvol)) {
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "copying location failed for path=%s",
- loc->path);
- goto err;
+ local->need_selfheal = 1;
}
+ }
+
+ /* The GFID is missing on this subvol. Lookup everywhere to force a
+ * gfid heal
+ */
+ if ((op_errno == ENODATA) &&
+ (IA_ISDIR(local->loc.inode->ia_type))) {
+ local->need_lookup_everywhere = 1;
+ }
+
+ goto unlock;
}
- if (xattr_req) {
- local->xattr_req = dict_ref (xattr_req);
- } else {
- local->xattr_req = dict_new ();
- }
+ if ((!IA_ISINVAL(local->inode->ia_type)) &&
+ stbuf->ia_type != local->inode->ia_type) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+ "mismatching filetypes 0%o v/s 0%o for %s,"
+ " gfid = %s",
+ (stbuf->ia_type), (local->inode->ia_type), local->loc.path,
+ gfid);
- if (!hashed_subvol)
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- cached_subvol = dht_subvol_get_cached (this, loc->inode);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
- local->cached_subvol = cached_subvol;
- local->hashed_subvol = hashed_subvol;
+ goto unlock;
+ }
- if (is_revalidate (loc)) {
- local->layout = layout = dht_layout_get (this, loc->inode);
+ layout = local->layout;
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "revalidate without cache. path=%s",
- loc->path);
- op_errno = EINVAL;
- goto err;
+ is_dir = check_is_dir(inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr,
+ conf->link_xattr_name);
+ if (is_linkfile) {
+ follow_link = 1;
+ goto unlock;
+ }
+ if (is_dir) {
+ ret = dht_dir_has_layout(xattr, conf->xattr_name);
+ if (ret >= 0) {
+ if (is_greater_time(local->prebuf.ia_ctime,
+ local->prebuf.ia_ctime_nsec,
+ stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
+ /* Choose source */
+ local->prebuf.ia_gid = stbuf->ia_gid;
+ local->prebuf.ia_uid = stbuf->ia_uid;
+
+ local->prebuf.ia_ctime = stbuf->ia_ctime;
+ local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec;
+
+ if (__is_root_gfid(stbuf->ia_gfid))
+ local->prebuf.ia_prot = stbuf->ia_prot;
}
+ }
+
+ if (local->stbuf.ia_type != IA_INVAL) {
+ if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
+ (local->stbuf.ia_uid != stbuf->ia_uid) ||
+ is_permission_different(&local->stbuf.ia_prot,
+ &stbuf->ia_prot)) {
+ local->need_attrheal = 1;
+ }
+ }
+
+ if (!dict_get(xattr, conf->mds_xattr_key)) {
+ gf_msg_debug(this->name, 0,
+ "%s: internal xattr %s is not present"
+ " on subvol %s(gfid is %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid);
+ } else {
+ check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+ mds_xattr_val, 1, &errst);
+ local->mds_subvol = prev;
+ local->mds_stbuf.ia_gid = stbuf->ia_gid;
+ local->mds_stbuf.ia_uid = stbuf->ia_uid;
+ local->mds_stbuf.ia_prot = stbuf->ia_prot;
+
+ /* save mds subvol on inode ctx */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set MDS subvol for %s vol is %s",
+ local->loc.path, prev->name);
+ }
+ if ((check_mds < 0) && !errst) {
+ /* Check if xattrs need to be healed on the directory
+ */
+ local->mds_xattr = dict_ref(xattr);
+ gf_msg_debug(this->name, 0,
+ "Value of %s is not zero on "
+ "hashed subvol so xattr needs to"
+ " be healed on non hashed"
+ " path is %s and vol name is %s "
+ " gfid is %s",
+ conf->mds_xattr_key, local->loc.path,
+ prev->name, gfid);
+ local->need_xattr_heal = 1;
+ }
+ }
+ ret = dht_layout_dir_mismatch(this, layout, prev, &local->loc,
+ xattr);
+ if (ret != 0) {
+ /* In memory layout does not match on-disk layout.
+ */
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_MISMATCH,
+ "Mismatching layouts for %s, gfid = %s", local->loc.path,
+ gfid);
+
+ local->layout_mismatch = 1;
+
+ goto unlock;
+ }
+ }
- if (layout->gen && (layout->gen < conf->gen)) {
- gf_log (this->name, GF_LOG_TRACE,
- "incomplete layout failure for path=%s",
- loc->path);
-
- dht_layout_unref (this, local->layout);
- local->layout = NULL;
- goto do_fresh_lookup;
- }
-
- local->inode = inode_ref (loc->inode);
- local->ia_ino = loc->inode->ino;
-
- local->call_cnt = layout->cnt;
- call_cnt = local->call_cnt;
-
- /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute,
- * revalidates directly go to the cached-subvolume.
- */
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
-
- for (i = 0; i < layout->cnt; i++) {
- subvol = layout->list[i].xlator;
-
- STACK_WIND (frame, dht_revalidate_cbk,
- subvol, subvol->fops->lookup,
- &local->loc, local->xattr_req);
-
- if (!--call_cnt)
- break;
- }
+ gf_uuid_copy(local->stbuf.ia_gfid, stbuf->ia_gfid);
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->postparent, postparent);
+
+ local->op_ret = 0;
+
+ if (!local->xattr) {
+ local->xattr = dict_ref(xattr);
+ } else if (is_dir) {
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (follow_link) {
+ /* Found a linkto file. Follow it to see if the target file exists
+ */
+ gf_uuid_copy(local->gfid, stbuf->ia_gfid);
+
+ subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
+ if (!subvol) {
+ op_errno = ESTALE;
+ local->op_ret = -1;
} else {
- do_fresh_lookup:
- /* TODO: remove the hard-coding */
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
-
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
-
- if (!hashed_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s, "
- "checking on all the subvols to see if "
- "it is a directory", loc->path);
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
-
- local->layout = dht_layout_new (this,
- conf->subvolume_cnt);
- if (!local->layout) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
- }
- return 0;
+ STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc,
+ local->xattr_req);
+ return 0;
+ }
+ }
+
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt)) {
+ if (!IA_ISDIR(local->stbuf.ia_type) &&
+ (local->hashed_subvol != local->cached_subvol) &&
+ (local->stbuf.ia_nlink == 1) &&
+ (conf && conf->unhashed_sticky_bit)) {
+ local->stbuf.ia_prot.sticky = 1;
+ }
+ /* No need to call heal code if volume count is 1
+ */
+ if (conf->subvolume_cnt == 1)
+ local->need_xattr_heal = 0;
+
+ if (IA_ISDIR(local->stbuf.ia_type)) {
+ /* No mds xattr found. Trigger a heal to set it */
+ if (!__is_root_gfid(local->loc.inode->gfid) &&
+ (!dict_get(local->xattr, conf->mds_xattr_key)))
+ local->need_selfheal = 1;
+
+ if (dht_needs_selfheal(frame, this)) {
+ if (!__is_root_gfid(local->loc.inode->gfid)) {
+ if (local->mds_subvol) {
+ local->stbuf.ia_gid = local->mds_stbuf.ia_gid;
+ local->stbuf.ia_uid = local->mds_stbuf.ia_uid;
+ local->stbuf.ia_prot = local->mds_stbuf.ia_prot;
+ }
+ } else {
+ local->stbuf.ia_gid = local->prebuf.ia_gid;
+ local->stbuf.ia_uid = local->prebuf.ia_uid;
+ local->stbuf.ia_prot = local->prebuf.ia_prot;
}
- STACK_WIND (frame, dht_lookup_cbk,
- hashed_subvol, hashed_subvol->fops->lookup,
- loc, local->xattr_req);
+ layout = local->layout;
+ dht_selfheal_directory(frame, dht_lookup_selfheal_cbk,
+ &local->loc, layout);
+ return 0;
+ }
}
- return 0;
+ if (local->layout_mismatch) {
+ /* Found layout mismatch in the directory, need to
+ fix this in the inode context */
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
+
+ if (local->need_lookup_everywhere) {
+ /* As the current layout gave ENOENT error, we would
+ need a new layout */
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
+
+ /* We know that current cached subvol is no longer
+ valid, get the new one */
+ local->cached_subvol = NULL;
+ if (local->xattr_req) {
+ if (!gf_uuid_is_null(local->gfid)) {
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid, 16);
+ }
+ }
+
+ dht_lookup_everywhere(frame, this, &local->loc);
+ return 0;
+ }
+ if (local->return_estale) {
+ local->op_ret = -1;
+ local->op_errno = ESTALE;
+ }
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+
+ /* local->stbuf is updated only from subvols which have a layout
+ * The reason is to avoid choosing attr heal source from newly
+ * added bricks. In case e.g we have only one subvol and for
+ * some reason layout is not present on it, then local->stbuf
+ * will be EINVAL. This is an indication that the subvols
+ * active in the cluster do not have layouts on disk.
+ * Unwind with ESTALE to trigger a fresh lookup */
+ if (is_dir && local->stbuf.ia_type == IA_INVAL) {
+ local->op_ret = -1;
+ local->op_errno = ESTALE;
+ }
+ /* Delete mds xattr at the time of STACK UNWIND */
+ if (local->xattr)
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
+
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ }
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
+ return ret;
}
+static int
+dht_lookup_linkfile_create_cbk(call_frame_t *frame, void *cooie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+ local = frame->local;
+ cached_subvol = local->cached_subvol;
+ conf = this->private;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ if (local->locked)
+ dht_unlock_namespace(frame, &local->lock[0]);
+
+ ret = dht_layout_preset(this, local->cached_subvol, local->loc.inode);
+ if (ret < 0) {
+ gf_msg_debug(this->name, EINVAL,
+ "Failed to set layout for subvolume %s, "
+ "(gfid = %s)",
+ cached_subvol ? cached_subvol->name : "<nil>", gfid);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ goto unwind;
+ }
+
+ local->op_ret = 0;
+ if ((local->stbuf.ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) {
+ local->stbuf.ia_prot.sticky = 1;
+ }
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
-int
-dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- local->op_ret = -1;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
-
- dht_iatt_merge (this, &local->prebuf, prebuf, prev->this);
- dht_iatt_merge (this, &local->stbuf, postbuf, prev->this);
-
- if (local->inode) {
- local->stbuf.ia_ino = local->inode->ino;
- local->prebuf.ia_ino = local->inode->ino;
- }
+unwind:
+ gf_msg_debug(this->name, 0,
+ "creation of linkto on hashed subvol:%s, "
+ "returned with op_ret %d and op_errno %d: %s",
+ local->hashed_subvol->name, op_ret, op_errno,
+ uuid_utoa(local->loc.gfid));
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
+ if (local->linked == _gf_true)
+ dht_linkfile_attr_heal(frame, this);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (truncate, frame, local->op_ret, local->op_errno,
- &local->prebuf, &local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
- return 0;
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+out:
+ return ret;
}
+static int
+dht_lookup_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ const char *path = NULL;
+ local = (dht_local_t *)frame->local;
+ path = local->loc.path;
+ FRAME_SU_UNDO(frame, dht_local_t);
-int
-dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *stbuf)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
-
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
-
- if (local->inode)
- local->stbuf.ia_ino = local->inode->ino;
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+ "lookup_unlink returned with "
+ "op_ret -> %d and op-errno -> %d for %s",
+ op_ret, op_errno, ((path == NULL) ? "null" : path));
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno,
- &local->stbuf);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_lookup_everywhere_done(frame, this);
+ }
- return 0;
+ return 0;
}
-
-int
-dht_stat (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+static int
+dht_lookup_unlink_of_false_linkto_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ const char *path = NULL;
+ local = (dht_local_t *)frame->local;
+ path = local->loc.path;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
+ FRAME_SU_UNDO(frame, dht_local_t);
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+ "lookup_unlink returned with "
+ "op_ret -> %d and op-errno -> %d for %s",
+ op_ret, op_errno, ((path == NULL) ? "null" : path));
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ if ((op_ret == 0) || ((op_errno != EBUSY) && (op_errno != ENOTCONN))) {
+ dht_lookup_everywhere_done(frame, this);
+ } else {
+ /*When dht_lookup_everywhere is performed, one cached
+ *and one hashed file was found and hashed file does
+ *not point to the above mentioned cached node. So it
+ *was considered as stale and an unlink was performed.
+ *But unlink fails. So may be rebalance is in progress.
+ *now ideally we have two data-files. One obtained during
+ *lookup_everywhere and one where unlink-failed. So
+ *at this point in time we cannot decide which one to
+ *choose because there are chances of first cached
+ *file is truncated after rebalance and if it is chosen
+ *as cached node, application will fail. So return EIO.*/
+
+ if (op_errno == EBUSY) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_UNLINK_FAILED,
+ "Could not unlink the linkto file as "
+ "either fd is open and/or linkto xattr "
+ "is set for %s",
+ ((path == NULL) ? "null" : path));
+ }
+ DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+ }
+ }
- local->layout = layout = dht_layout_get (this, loc->inode);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no layout for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ return 0;
+}
- local->inode = inode_ref (loc->inode);
- local->call_cnt = layout->cnt;
+static int
+dht_lookup_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ const char *path = NULL;
- for (i = 0; i < layout->cnt; i++) {
- subvol = layout->list[i].xlator;
+ /* NOTE:
+ * If stale file unlink fails either there is an open-fd or is not an
+ * dht-linkto-file then posix_unlink returns EBUSY, which is overwritten
+ * to ENOENT
+ */
- STACK_WIND (frame, dht_attr_cbk,
- subvol, subvol->fops->stat,
- loc);
- }
+ local = frame->local;
- return 0;
+ if (local) {
+ FRAME_SU_UNDO(frame, dht_local_t);
+ if (local->loc.path)
+ path = local->loc.path;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL);
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+ "Returned with op_ret %d and "
+ "op_errno %d for %s",
+ op_ret, op_errno, ((path == NULL) ? "null" : path));
+
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
+static int
+dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_t *dict)
+{
+ int ret = 0;
+
+ ret = dict_set_int32_sizen(dict, DHT_SKIP_NON_LINKTO_UNLINK, 1);
-int
-dht_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->layout = layout = dht_layout_get (this, fd->inode);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no layout for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
- local->inode = inode_ref (fd->inode);
- local->call_cnt = layout->cnt;;
-
- for (i = 0; i < layout->cnt; i++) {
- subvol = layout->list[i].xlator;
- STACK_WIND (frame, dht_attr_cbk,
- subvol, subvol->fops->fstat,
- fd);
- }
-
- return 0;
+ if (ret)
+ return -1;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL);
+ ret = dict_set_int32_sizen(dict, DHT_SKIP_OPEN_FD_UNLINK, 1);
- return 0;
+ if (ret)
+ return -1;
+
+ return 0;
}
+static int32_t
+dht_linkfile_create_lookup_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int call_cnt = 0, ret = 0;
+ xlator_t *subvol = NULL;
+ uuid_t gfid = {
+ 0,
+ };
+ char gfid_str[GF_UUID_BUF_SIZE] = {0};
+
+ subvol = cookie;
+ local = frame->local;
+
+ if (subvol == local->hashed_subvol) {
+ if ((op_ret == 0) || (op_errno != ENOENT))
+ local->dont_create_linkto = _gf_true;
+ } else {
+ if (gf_uuid_is_null(local->gfid))
+ gf_uuid_copy(gfid, local->loc.gfid);
+ else
+ gf_uuid_copy(gfid, local->gfid);
+
+ if ((op_ret == 0) && gf_uuid_compare(gfid, buf->ia_gfid)) {
+ gf_uuid_unparse(gfid, gfid_str);
+ gf_msg_debug(this->name, 0,
+ "gfid (%s) different on cached subvol "
+ "(%s) and looked up inode (%s), not "
+ "creating linkto",
+ uuid_utoa(buf->ia_gfid), subvol->name, gfid_str);
+ local->dont_create_linkto = _gf_true;
+ } else if (op_ret == -1) {
+ local->dont_create_linkto = _gf_true;
+ }
+ }
+
+ call_cnt = dht_frame_return(frame);
+ if (is_last_call(call_cnt)) {
+ if (local->dont_create_linkto)
+ goto no_linkto;
+ else {
+ gf_msg_debug(this->name, 0,
+ "Creating linkto file on %s(hash) to "
+ "%s on %s (gfid = %s)",
+ local->hashed_subvol->name, local->loc.path,
+ local->cached_subvol->name, gfid_str);
+
+ ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk,
+ this, local->cached_subvol,
+ local->hashed_subvol, &local->loc);
+
+ if (ret < 0)
+ goto no_linkto;
+ }
+ }
-int
-dht_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset)
+ return 0;
+
+no_linkto:
+ gf_msg_debug(this->name, 0,
+ "skipped linkto creation (path:%s) (gfid:%s) "
+ "(hashed-subvol:%s) (cached-subvol:%s)",
+ local->loc.path, gfid_str, local->hashed_subvol->name,
+ local->cached_subvol->name);
+
+ dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode,
+ &local->stbuf, &local->preparent,
+ &local->postparent, local->xattr);
+ return 0;
+}
+
+static int32_t
+dht_call_lookup_linkfile_create(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ int i = 0;
+ xlator_t *subvol = NULL;
+ local = frame->local;
+ if (gf_uuid_is_null(local->gfid))
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ else
+ gf_uuid_unparse(local->gfid, gfid);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "protecting namespace failed, skipping linkto "
+ "creation (path:%s)(gfid:%s)(hashed-subvol:%s)"
+ "(cached-subvol:%s)",
+ local->loc.path, gfid, local->hashed_subvol->name,
+ local->cached_subvol->name);
+ goto err;
+ }
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ local->locked = _gf_true;
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
+ local->call_cnt = 2;
- local->inode = inode_ref (loc->inode);
- local->call_cnt = 1;
+ for (i = 0; i < 2; i++) {
+ subvol = (subvol == NULL) ? local->hashed_subvol : local->cached_subvol;
- STACK_WIND (frame, dht_truncate_cbk,
- subvol, subvol->fops->truncate,
- loc, offset);
+ STACK_WIND_COOKIE(frame, dht_linkfile_create_lookup_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, NULL);
+ }
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL);
-
- return 0;
+ dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode,
+ &local->stbuf, &local->preparent,
+ &local->postparent, local->xattr);
+ return 0;
}
+/* Rebalance is performed from cached_node to hashed_node. Initial cached_node
+ * contains a non-linkto file. After migration it is converted to linkto and
+ * then unlinked. And at hashed_subvolume, first a linkto file is present,
+ * then after migration it is converted to a non-linkto file.
+ *
+ * Lets assume a file is present on cached subvolume and a new brick is added
+ * and new brick is the new_hashed subvolume. So fresh lookup on newly added
+ * hashed subvolume will fail and dht_lookup_everywhere gets called. If just
+ * before sending the dht_lookup_everywhere request rebalance is in progress,
+ *
+ * from cached subvolume it may see: Nonlinkto or linkto or No file
+ * from hashed subvolume it may see: No file or linkto file or non-linkto file
+ *
+ * So this boils down to 9 cases:
+ * at cached_subvol at hashed_subvol
+ * ---------------- -----------------
+ *
+ *a) No file No file
+ * [request reached after [Request reached before
+ * migration] Migration]
+ *
+ *b) No file Linkto File
+ *
+ *c) No file Non-Linkto File
+ *
+ *d) Linkto No-File
+ *
+ *e) Linkto Linkto
+ *
+ *f) Linkto Non-Linkto
+ *
+ *g) NonLinkto No-File
+ *
+ *h) NonLinkto Linkto
+ *
+ *i) NonLinkto NonLinkto
+ *
+ * dht_lookup_everywhere_done takes decision based on any of the above case
+ */
-int
-dht_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset)
+static int
+dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ int ret = 0;
+ dht_local_t *local = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_layout_t *layout = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ gf_boolean_t found_non_linkto_on_hashed = _gf_false;
+
+ local = frame->local;
+ hashed_subvol = local->hashed_subvol;
+ cached_subvol = local->cached_subvol;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ if (local->file_count && local->dir_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+ "path %s (gfid = %s)exists as a file on one "
+ "subvolume and directory on another. "
+ "Please fix it manually",
+ local->loc.path, gfid);
+ DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+ return 0;
+ }
+ if (local->op_ret && local->gfid_missing) {
+ if (gf_uuid_is_null(local->gfid_req)) {
+ DHT_STACK_UNWIND(lookup, frame, -1, ENODATA, NULL, NULL, NULL,
+ NULL);
+ return 0;
+ }
+ /* A hack */
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
+ if (local->dir_count) {
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
+
+ gf_msg_debug(this->name, 0,
+ "STATUS: hashed_subvol %s "
+ "cached_subvol %s",
+ (hashed_subvol == NULL) ? "null" : hashed_subvol->name,
+ (cached_subvol == NULL) ? "null" : cached_subvol->name);
+
+ if (!cached_subvol) {
+ if (local->skip_unlink.handle_valid_link && hashed_subvol) {
+ /*Purpose of "DHT_SKIP_NON_LINKTO_UNLINK":
+ * If this lookup is performed by rebalance and this
+ * rebalance process detected hashed file and by
+ * the time it sends the lookup request to cached node,
+ * file got migrated and now at initial hashed_node,
+ * final migrated file is present. With current logic,
+ * because this process fails to find the cached_node,
+ * it will unlink the file at initial hashed_node.
+ *
+ * So we avoid this by setting key, and checking at the
+ * posix_unlink that unlink the file only if file is a
+ * linkto file and not a migrated_file.
+ */
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(
+ local->xattr_req);
+
+ if (ret) {
+ /* If for some reason, setting key in the dict
+ * fails, return with ENOENT, as with respect to
+ * this process, it detected only a stale link
+ * file.
+ *
+ * Next lookup will delete it.
+ *
+ * Performing deletion of stale link file when
+ * setting key in dict fails, may cause the data
+ * loss because of the above mentioned race.
+ */
+
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+ NULL);
+ } else {
+ local->skip_unlink.handle_valid_link = _gf_false;
+
+ gf_msg_debug(this->name, 0,
+ "No Cached was found and "
+ "unlink on hashed was skipped"
+ " so performing now: %s",
+ local->loc.path);
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND(frame, dht_lookup_unlink_stale_linkto_cbk,
+ hashed_subvol, hashed_subvol->fops->unlink,
+ &local->loc, 0, local->xattr_req);
+ }
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "There was no cached file and "
+ "unlink on hashed is not skipped %s",
+ local->loc.path);
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL);
+ }
+ return 0;
+ }
+
+ /* At the time of dht_lookup, no file was found on hashed and that is
+ * why dht_lookup_everywhere is called, but by the time
+ * dht_lookup_everywhere
+ * reached to server, file might have already migrated. In that case we
+ * will find a migrated file at the hashed_node. In this case store the
+ * layout in context and return successfully.
+ */
+
+ if (hashed_subvol || local->need_lookup_everywhere) {
+ if (local->need_lookup_everywhere) {
+ found_non_linkto_on_hashed = _gf_true;
+
+ } else if ((local->file_count == 1) &&
+ (hashed_subvol == cached_subvol)) {
+ gf_msg_debug(this->name, 0,
+ "found cached file on hashed subvolume "
+ "so store in context and return for %s",
+ local->loc.path);
+
+ found_non_linkto_on_hashed = _gf_true;
+ }
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
+ if (found_non_linkto_on_hashed)
+ goto preset_layout;
+ }
- local->inode = inode_ref (fd->inode);
- local->call_cnt = 1;
+ if (hashed_subvol) {
+ if (local->skip_unlink.handle_valid_link == _gf_true) {
+ if (cached_subvol == local->skip_unlink.hash_links_to) {
+ if (gf_uuid_compare(local->skip_unlink.cached_gfid,
+ local->skip_unlink.hashed_gfid)) {
+ /*GFID different, return error*/
+ DHT_STACK_UNWIND(lookup, frame, -1, ESTALE, NULL, NULL,
+ NULL, NULL);
- STACK_WIND (frame, dht_truncate_cbk,
- subvol, subvol->fops->ftruncate,
- fd, offset);
+ return 0;
+ }
- return 0;
+ ret = dht_layout_preset(this, cached_subvol, local->loc.inode);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_INFO, 0,
+ DHT_MSG_LAYOUT_PRESET_FAILED,
+ "Could not set pre-set layout "
+ "for subvolume %s",
+ cached_subvol->name);
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL);
+ local->op_ret = (ret == 0) ? ret : -1;
+ local->op_errno = (ret == 0) ? ret : EINVAL;
- return 0;
-}
+ /* Presence of local->cached_subvol validates
+ * that lookup from cached node is successful
+ */
+ if (!local->op_ret && local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
-int
-dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_ret = -1;
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
-
- preparent->ia_ino = local->loc.parent->ino;
- postparent->ia_ino = local->loc.parent->ino;
- local->op_ret = 0;
-
- local->postparent = *postparent;
- local->preparent = *preparent;
-
- WIPE (&local->postparent);
- WIPE (&local->preparent);
- }
-unlock:
- UNLOCK (&frame->lock);
+ gf_msg_debug(this->name, 0,
+ "Skipped unlinking linkto file "
+ "on the hashed subvolume. "
+ "Returning success as it is a "
+ "valid linkto file. Path:%s",
+ local->loc.path);
+
+ goto unwind_hashed_and_cached;
+ } else {
+ local->skip_unlink.handle_valid_link = _gf_false;
+
+ gf_msg_debug(this->name, 0,
+ "Linkto file found on hashed "
+ "subvol "
+ "and data file found on cached "
+ "subvolume. But linkto points to "
+ "different cached subvolume (%s) "
+ "path %s",
+ (local->skip_unlink.hash_links_to
+ ? local->skip_unlink.hash_links_to->name
+ : " <nil>"),
+ local->loc.path);
+
+ if (local->skip_unlink.opend_fd_count == 0) {
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(
+ local->xattr_req);
+
+ if (ret) {
+ DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL,
+ NULL, NULL);
+ } else {
+ local->call_cnt = 1;
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND(frame, dht_lookup_unlink_of_false_linkto_cbk,
+ hashed_subvol, hashed_subvol->fops->unlink,
+ &local->loc, 0, local->xattr_req);
+ }
+
+ return 0;
+ }
+ }
+ }
+ }
+
+preset_layout:
+
+ if (found_non_linkto_on_hashed) {
+ if (local->need_lookup_everywhere) {
+ if (gf_uuid_compare(local->gfid, local->inode->gfid)) {
+ /* GFID different, return error */
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+ NULL);
+ return 0;
+ }
+ }
- DHT_STACK_UNWIND (unlink, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent);
+ local->op_ret = 0;
+ local->op_errno = 0;
+ layout = dht_layout_for_subvol(this, cached_subvol);
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "%s: no pre-set layout for subvolume %s,"
+ " gfid = %s",
+ local->loc.path,
+ (cached_subvol ? cached_subvol->name : "<nil>"), gfid);
+ }
+ ret = dht_layout_set(this, local->inode, layout);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "%s: failed to set layout for subvol %s, "
+ "gfid = %s",
+ local->loc.path,
+ (cached_subvol ? cached_subvol->name : "<nil>"), gfid);
+ }
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
return 0;
+ }
+
+ if (!hashed_subvol) {
+ gf_msg_debug(this->name, 0,
+ "Cannot create linkfile for %s on %s: "
+ "hashed subvolume cannot be found, gfid = %s.",
+ local->loc.path, cached_subvol->name, gfid);
+
+ local->op_ret = 0;
+ local->op_errno = 0;
+
+ ret = dht_layout_preset(frame->this, cached_subvol, local->inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+ "Failed to set layout for subvol %s"
+ ", gfid = %s",
+ cached_subvol ? cached_subvol->name : "<nil>", gfid);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ }
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ return 0;
+ }
+
+ if (frame->root->op != GF_FOP_RENAME) {
+ local->current = &local->lock[0];
+ ret = dht_protect_namespace(frame, &local->loc, hashed_subvol,
+ &local->current->ns,
+ dht_call_lookup_linkfile_create);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Creating linkto file on %s(hash) to %s on %s "
+ "(gfid = %s)",
+ hashed_subvol->name, local->loc.path, cached_subvol->name,
+ gfid);
+
+ ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk, this,
+ cached_subvol, hashed_subvol, &local->loc);
+ }
+
+ return ret;
+
+unwind_hashed_and_cached:
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ return 0;
}
+static int
+dht_lookup_everywhere_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ int is_linkfile = 0;
+ int is_dir = 0;
+ loc_t *loc = NULL;
+ xlator_t *link_subvol = NULL;
+ int ret = -1;
+ int32_t fd_count = 0;
+ dht_conf_t *conf = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ dict_t *dict_req = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+ local = frame->local;
+ loc = &local->loc;
+ conf = this->private;
+
+ prev = cookie;
+
+ gf_msg_debug(this->name, 0,
+ "returned with op_ret %d and op_errno %d (%s) "
+ "from subvol %s",
+ op_ret, op_errno, loc->path, prev->name);
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno != ENOENT)
+ local->op_errno = op_errno;
+ if (op_errno == ENODATA)
+ local->gfid_missing = _gf_true;
+ goto unlock;
+ }
-int
-dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- xlator_t *cached_subvol = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
-
- local->op_ret = 0;
- }
+ if (gf_uuid_is_null(local->gfid))
+ gf_uuid_copy(local->gfid, buf->ia_gfid);
+
+ gf_uuid_unparse(local->gfid, gfid);
+
+ if (gf_uuid_compare(local->gfid, buf->ia_gfid)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid differs on subvolume %s,"
+ " gfid local = %s, gfid node = %s",
+ loc->path, prev->name, gfid, uuid_utoa(buf->ia_gfid));
+ }
+
+ is_linkfile = check_is_linkfile(inode, buf, xattr,
+ conf->link_xattr_name);
+
+ if (is_linkfile) {
+ link_subvol = dht_linkfile_subvol(this, inode, buf, xattr);
+ gf_msg_debug(this->name, 0, "found on %s linkfile %s (-> %s)",
+ prev->name, loc->path,
+ link_subvol ? link_subvol->name : "''");
+ goto unlock;
+ }
+
+ is_dir = check_is_dir(inode, buf, xattr);
+
+ /* non linkfile GFID takes precedence but don't overwrite
+ gfid if we have already found a cached file*/
+ if (!local->cached_subvol)
+ gf_uuid_copy(local->gfid, buf->ia_gfid);
+
+ if (is_dir) {
+ local->dir_count++;
+
+ gf_msg_debug(this->name, 0, "found on %s directory %s", prev->name,
+ loc->path);
+ } else {
+ local->file_count++;
+
+ gf_msg_debug(this->name, 0, "found cached file on %s for %s",
+ prev->name, loc->path);
+
+ if (!local->cached_subvol) {
+ /* found one file */
+ dht_iatt_merge(this, &local->stbuf, buf);
+
+ local->xattr = dict_ref(xattr);
+ local->cached_subvol = prev;
+
+ gf_msg_debug(this->name, 0,
+ "storing cached on %s file"
+ " %s",
+ prev->name, loc->path);
+
+ dht_iatt_merge(this, &local->postparent, postparent);
+
+ gf_uuid_copy(local->skip_unlink.cached_gfid, buf->ia_gfid);
+ } else {
+ /* This is where we need 'rename' both entries logic */
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_FILE_ON_MULT_SUBVOL,
+ "multiple subvolumes (%s and %s) have "
+ "file %s (preferably rename the file "
+ "in the backend,and do a fresh lookup)",
+ local->cached_subvol->name, prev->name, local->loc.path);
+ }
+ }
+ }
unlock:
- UNLOCK (&frame->lock);
+ UNLOCK(&frame->lock);
+
+ if (is_linkfile) {
+ ret = dict_get_int32(xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count);
+
+ /* Any linkto file found on the non-hashed subvolume should
+ * be unlinked (performed in the "else if" block below)
+ *
+ * But if a linkto file is found on hashed subvolume, it may be
+ * pointing to valid cached node. So unlinking of linkto
+ * file on hashed subvolume is skipped and inside
+ * dht_lookup_everywhere_done, checks are performed. If this
+ * linkto file is found as stale linkto file, it is deleted
+ * otherwise unlink is skipped.
+ */
- if (op_ret == -1)
- goto err;
+ if (local->hashed_subvol && local->hashed_subvol == prev) {
+ local->skip_unlink.handle_valid_link = _gf_true;
+ local->skip_unlink.opend_fd_count = fd_count;
+ local->skip_unlink.hash_links_to = link_subvol;
+ gf_uuid_copy(local->skip_unlink.hashed_gfid, buf->ia_gfid);
+
+ gf_msg_debug(this->name, 0,
+ "Found"
+ " one linkto file on hashed subvol %s "
+ "for %s: Skipping unlinking till "
+ "everywhere_done",
+ prev->name, loc->path);
+
+ } else if (!ret && (fd_count == 0)) {
+ dict_req = dict_new();
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_req);
+
+ if (ret) {
+ /* Skip unlinking for dict_failure
+ *File is found as a linkto file on non-hashed,
+ *subvolume. In the current implementation,
+ *finding a linkto-file on non-hashed does not
+ *always implies that it is stale. So deletion
+ *of file should be done only when both fd is
+ *closed and linkto-xattr is set. In case of
+ *dict_set failure, avoid skipping of file.
+ *NOTE: dht_frame_return should get called for
+ * this block.
+ */
+
+ dict_unref(dict_req);
+
+ } else {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "attempting deletion of stale linkfile "
+ "%s on %s (hashed subvol is %s)",
+ loc->path, prev->name,
+ (local->hashed_subvol ? local->hashed_subvol->name
+ : "<null>"));
+ /* *
+ * These stale files may be created using root
+ * user. Hence deletion will work only with
+ * root.
+ */
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND(frame, dht_lookup_unlink_cbk, prev,
+ prev->fops->unlink, loc, 0, dict_req);
+
+ dict_unref(dict_req);
- cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
- if (!cached_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s",
- local->loc.path);
- local->op_errno = EINVAL;
- goto err;
+ return 0;
+ }
}
+ }
- STACK_WIND (frame, dht_unlink_cbk,
- cached_subvol, cached_subvol->fops->unlink,
- &local->loc);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_lookup_everywhere_done(frame, this);
+ }
- return 0;
+out:
+ return ret;
+}
+
+int
+dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int i = 0;
+ int call_cnt = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+ GF_VALIDATE_OR_GOTO("dht", loc, out);
+
+ conf = this->private;
+ local = frame->local;
+
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+ if (!local->inode)
+ local->inode = inode_ref(loc->inode);
+
+ gf_msg_debug(this->name, 0, "winding lookup call to %d subvols", call_cnt);
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_lookup_everywhere_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, loc,
+ local->xattr_req);
+ }
+
+ return 0;
+out:
+ DHT_STACK_UNWIND(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL);
err:
- DHT_STACK_UNWIND (unlink, frame, -1, local->op_errno,
- NULL, NULL);
- return 0;
+ return -1;
}
-
int
-dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, struct iatt *prebuf, struct iatt *postbuf)
+dht_lookup_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ xlator_t *prev = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ loc_t *loc = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, unwind);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, unwind);
+ GF_VALIDATE_OR_GOTO("dht", this->private, unwind);
+ GF_VALIDATE_OR_GOTO("dht", cookie, unwind);
+
+ prev = cookie;
+ subvol = prev;
+ conf = this->private;
+ local = frame->local;
+ loc = &local->loc;
+
+ gf_uuid_unparse(loc->gfid, gfid);
+
+ if (op_ret == -1) {
+ gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ "Lookup of %s on %s (following linkfile) failed "
+ ",gfid = %s",
+ local->loc.path, subvol->name, gfid);
+
+ /* If cached subvol returned ENOTCONN, do not do
+ lookup_everywhere. We need to make sure linkfile does not get
+ removed, which can take away the namespace, and subvol is
+ anyways down. */
+
+ local->cached_subvol = NULL;
+ if (op_errno != ENOTCONN)
+ goto err;
+ else
+ goto unwind;
+ }
+
+ if (check_is_dir(inode, stbuf, xattr)) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ "Lookup of %s on %s (following linkfile) reached dir,"
+ " gfid = %s",
+ local->loc.path, subvol->name, gfid);
+ goto err;
+ }
+
+ if (check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ "lookup of %s on %s (following linkfile) reached link,"
+ "gfid = %s",
+ local->loc.path, subvol->name, gfid);
+ goto err;
+ }
+
+ if (gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid different on data file on %s,"
+ " gfid local = %s, gfid node = %s ",
+ local->loc.path, subvol->name, gfid, uuid_utoa(stbuf->ia_gfid));
+ goto err;
+ }
+
+ if ((stbuf->ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) {
+ stbuf->ia_prot.sticky = 1;
+ }
+
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+ "Failed to set layout for subvolume %s,"
+ "gfid = %s",
+ prev->name, gfid);
+ op_ret = -1;
+ op_errno = EINVAL;
+ }
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+unwind:
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(postparent);
+ DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+ postparent);
- local = frame->local;
- prev = cookie;
+ return 0;
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+err:
+ dht_lookup_everywhere(frame, this, loc);
+out:
+ return 0;
+}
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
+/* Code to get hashed subvol based on inode and loc
+ First it check if loc->parent and loc->path exist then it get
+ hashed subvol based on loc.
+*/
+
+static gf_boolean_t
+dht_should_lookup_everywhere(xlator_t *this, dht_conf_t *conf, loc_t *loc)
+{
+ dht_layout_t *parent_layout = NULL;
+ int ret = 0;
+ gf_boolean_t lookup_everywhere = _gf_true;
+
+ /* lookup-optimize supersedes lookup-unhashed settings.
+ * If it is set, do not process search_unhashed
+ * If lookup-optimize if enabled, lookup everywhere if:
+ * - this is the rebalance daemon.
+ * - loc->parent is unavailable.
+ * - parent_layout is unavailable
+ * - parent_layout->commit_hash != conf->vol_commit_hash
+ */
+
+ if (conf->lookup_optimize) {
+ if (!conf->defrag && loc->parent) {
+ ret = dht_inode_ctx_layout_get(loc->parent, this, &parent_layout);
+ if (!ret && parent_layout &&
+ (parent_layout->commit_hash == conf->vol_commit_hash)) {
+ lookup_everywhere = _gf_false;
+ }
+ }
+ goto out;
+ } else {
+ if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
+ if (loc->parent) {
+ ret = dht_inode_ctx_layout_get(loc->parent, this,
+ &parent_layout);
+ if (ret || !parent_layout ||
+ (!parent_layout->search_unhashed)) {
+ lookup_everywhere = _gf_false;
+ }
+ } else {
+ lookup_everywhere = _gf_false;
+ }
+
+ goto out;
+ }
+ }
+out:
+ return lookup_everywhere;
+}
+
+int
+dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ char is_linkfile = 0;
+ char is_dir = 0;
+ xlator_t *subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+ xlator_t *prev = NULL;
+ int ret = 0;
+ uint32_t vol_commit_hash = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+ conf = this->private;
+
+ prev = cookie;
+ local = frame->local;
+ loc = &local->loc;
+
+ gf_msg_debug(this->name, op_errno,
+ "%s: fresh_lookup on %s returned with op_ret %d", loc->path,
+ prev->name, op_ret);
+
+ if (op_ret == -1) {
+ if (ENTRY_MISSING(op_ret, op_errno)) {
+ if (1 == conf->subvolume_cnt) {
+ /* No need to lookup again */
+ goto out;
+ }
+
+ gf_msg_debug(this->name, 0, "Entry %s missing on subvol %s",
+ loc->path, prev->name);
+
+ if (dht_should_lookup_everywhere(this, conf, loc)) {
+ local->op_errno = ENOENT;
+ dht_lookup_everywhere(frame, this, loc);
+ return 0;
+ }
- if (local && (op_ret == 0)) {
- prebuf->ia_ino = local->ia_ino;
- postbuf->ia_ino = local->ia_ino;
+ } else {
+ /* posix returns ENODATA if the gfid is not set but the client and
+ * server protocol layers do not send the stbuf. We need to
+ * heal this so check if this is a directory on the other subvols.
+ */
+ if ((op_errno == ENOTCONN) || (op_errno == ENODATA)) {
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
}
+ gf_msg_debug(this->name, op_errno, "%s: Lookup on subvolume %s failed",
+ loc->path, prev->name);
+ goto out;
+ }
+
+ /* Lookup succeeded - op_ret = 0 */
+
+ /* This is required for handling stale linkfile deletion,
+ * or any more call which happens from this 'loc'.
+ */
+ if (gf_uuid_is_null(local->gfid)) {
+ /*This is set from the first successful response*/
+ memcpy(local->gfid, stbuf->ia_gfid, 16);
+ }
+
+ if (!conf->vch_forced) {
+ /* Update the commit hash in conf if it is found */
+ ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+
+ is_dir = check_is_dir(inode, stbuf, xattr);
+ if (is_dir) {
+ /* A directory is present on all subvols, send the lookup to
+ * all subvols now */
+ local->inode = inode_ref(inode);
+ local->xattr = dict_ref(xattr);
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno,
- prebuf, postbuf);
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
+ if (!is_linkfile) {
+ /* non-directory and not a linkto file. This is a data file
+ * Update the layout to point to the cached subvol
+ */
+
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+ "%s: could not set pre-set layout for subvolume %s",
+ loc->path, prev->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+ goto out;
+ }
+
+ /* This is a linkto file. Get the value of the target subvol from the
+ * linkto xattr and lookup there to see if the file exists
+ */
+ subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
+ if (!subvol) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "%s: No link subvol for linkto", loc->path);
+ dht_lookup_everywhere(frame, this, loc);
return 0;
+ }
+
+ gf_msg_debug(this->name, 0, "%s: Calling lookup on linkto target %s",
+ loc->path, subvol->name);
+
+ STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, local->xattr_req);
+
+ return 0;
+
+out:
+ /*
+ * FIXME: postparent->ia_size and postparent->st_blocks do not have
+ * correct values. since, postparent corresponds to a directory these
+ * two members should have values equal to sum of corresponding values
+ * from each of the subvolume. See dht_iatt_merge for reference.
+ */
+
+ if (!op_ret && local && local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(postparent);
+ DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+ postparent);
+err:
+ return 0;
}
+/* For directories, check if acl xattrs have been requested (by the acl
+ * xlator), if not, request for them. These xattrs are needed for dht dir
+ * self-heal to perform proper self-healing of dirs
+ */
+static void
+dht_check_and_set_acl_xattr_req(xlator_t *this, dict_t *xattr_req)
+{
+ int ret = 0;
+ GF_ASSERT(xattr_req);
-int
-dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno)
+ if (!dict_get(xattr_req, POSIX_ACL_ACCESS_XATTR)) {
+ ret = dict_set_int8(xattr_req, POSIX_ACL_ACCESS_XATTR, 0);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s",
+ POSIX_ACL_ACCESS_XATTR);
+ }
+
+ if (!dict_get(xattr_req, POSIX_ACL_DEFAULT_XATTR)) {
+ ret = dict_set_int8(xattr_req, POSIX_ACL_DEFAULT_XATTR, 0);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s",
+ POSIX_ACL_DEFAULT_XATTR);
+ }
+
+ return;
+}
+
+/* for directories, we need the following info:
+ * the layout : trusted.glusterfs.dht
+ * the mds information : trusted.glusterfs.dht.mds
+ * the acl info: See above
+ */
+static int
+dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ int ret = -EINVAL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf) {
+ goto err;
+ }
+
+ if (!xattr_req) {
+ goto err;
+ }
+
+ /* Xattr to get the layout for a directory
+ */
+ ret = dict_set_uint32(xattr_req, conf->xattr_name, 4 * 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->xattr_name, loc->path);
+ goto err;
+ }
+
+ /*Non-fatal failure */
+ ret = dict_set_uint32(xattr_req, conf->mds_xattr_key, 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->mds_xattr_key, loc->path);
+ }
+
+ dht_check_and_set_acl_xattr_req(this, xattr_req);
+ ret = 0;
+err:
+ return ret;
+}
+/* If the hashed subvol is present, send the lookup to only that subvol first.
+ * If no hashed subvol, send a lookup to all subvols and proceed based on the
+ * responses.
+ */
+static int
+dht_do_fresh_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ xlator_t *hashed_subvol = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int call_cnt = 0;
+ int i = 0;
+
+ conf = this->private;
+ if (!conf) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = frame->local;
+ if (!local) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /* Since we don't know whether this is a file or a directory,
+ * request all xattrs*/
+ ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ /* Fuse sets a random value in gfid-req. If the gfid is missing
+ * on one or more subvols, posix will set the gfid to this value,
+ * causing GFID mismatches for directories. Remove the value fuse
+ * has sent before sending the lookup.
+ */
+ ret = dict_get_gfuuid(local->xattr_req, "gfid-req", &local->gfid_req);
+ if (ret) {
+ gf_msg_debug(this->name, 0, "%s: No gfid-req available", loc->path);
+ } else {
+ dict_del(local->xattr_req, "gfid-req");
+ }
+ /* This should have been set in dht_lookup */
+ hashed_subvol = local->hashed_subvol;
+
+ if (!hashed_subvol) {
+ gf_msg_debug(this->name, 0,
+ "%s: no subvolume in layout for path, "
+ "checking on all the subvols to see if "
+ "it is a directory",
+ loc->path);
+
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
- local = frame->local;
- prev = cookie;
+ /* Allocate a layout. This will be populated and saved in
+ * the dht inode_ctx on successful lookup
+ */
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+ gf_msg_debug(this->name, 0,
+ "%s: Found null hashed subvol. Calling lookup"
+ " on all nodes.",
+ loc->path);
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
+ }
+ return 0;
+ }
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (setxattr, frame, local->op_ret, local->op_errno);
+ /* if the hashed_subvol is non-null, send the lookup there first so
+ * as to see whether we have a file or a directory */
+ gf_msg_debug(this->name, 0, "%s: Calling fresh lookup on %s", loc->path,
+ hashed_subvol->name);
+
+ STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->lookup, loc, local->xattr_req);
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
+}
+
+static int
+dht_do_revalidate(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ xlator_t *subvol = NULL;
+ xlator_t *mds_subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int call_cnt = 0;
+ int gen = 0;
+
+ conf = this->private;
+ if (!conf) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = frame->local;
+ if (!local) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0,
+ "path = %s. No layout found in the inode ctx.", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /* Generation number has changed. This layout may be stale. */
+ if (layout->gen && (layout->gen < conf->gen)) {
+ gen = layout->gen;
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
+ local->cached_subvol = NULL;
+
+ gf_msg_debug(this->name, 0,
+ "path = %s. In memory layout may be stale."
+ "(layout->gen (%d) is less than "
+ "conf->gen (%d)). Calling fresh lookup.",
+ loc->path, gen, conf->gen);
+
+ dht_do_fresh_lookup(frame, this, loc);
+ return 0;
+ }
+
+ local->inode = inode_ref(loc->inode);
+
+ /* Since we don't know whether this has changed,
+ * request all xattrs*/
+ ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ if (IA_ISDIR(local->inode->ia_type)) {
+ ret = dht_inode_ctx_mdsvol_get(local->inode, this, &mds_subvol);
+ if (ret || !mds_subvol) {
+ gf_msg_debug(this->name, 0, "path = %s. No mds subvol in inode ctx",
+ local->loc.path);
}
+ local->mds_subvol = mds_subvol;
+ local->call_cnt = conf->subvolume_cnt;
+
+ /* local->call_cnt will change as responses are processed. Always use a
+ * local copy to loop through the STACK_WIND calls
+ */
+ call_cnt = local->call_cnt;
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_revalidate_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, loc,
+ local->xattr_req);
+ }
return 0;
+ }
+
+ /* If not a dir, this should be 1 */
+ local->call_cnt = layout->cnt;
+ call_cnt = local->call_cnt;
+
+ for (i = 0; i < call_cnt; i++) {
+ subvol = layout->list[i].xlator;
+
+ gf_msg_debug(this->name, 0,
+ "path = %s. Calling "
+ "revalidate lookup on %s",
+ loc->path, subvol->name);
+
+ STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, local->xattr_req);
+ }
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
}
+/* Depending on the input, decide if this is a:
+ * fresh-lookup: loc->name is provided but no dht inode ctx
+ * revalidation: loc->name is provided, dht inode ctx is present
+ * discover: gfid based nameless lookup.
+ */
int
-dht_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask)
+dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ xlator_t *hashed_subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ loc_t new_loc = {
+ 0,
+ };
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ conf = this->private;
+ if (!conf)
+ goto err;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ ret = dht_filter_loc_subvol_key(this, loc, &new_loc, &hashed_subvol);
+ if (ret) {
+ loc_wipe(&local->loc);
+ ret = loc_dup(&new_loc, &local->loc);
+
+ /* we no longer need 'new_loc' entries */
+ loc_wipe(&new_loc);
+
+ /* check if loc_dup() is successful */
+ if (ret == -1) {
+ op_errno = errno;
+ gf_msg_debug(this->name, errno,
+ "copying location failed for path=%s", loc->path);
+ goto err;
+ }
+ }
+
+ if (xattr_req) {
+ local->xattr_req = dict_ref(xattr_req);
+ } else {
+ local->xattr_req = dict_new();
+ }
+
+ /* Nameless lookup */
+
+ /* This is usually sent by NFS. Lookups are done based on the gfid and
+ * no name information is available. Without the name, dht cannot calculate
+ * the hash and has to send a lookup to all subvols.
+ */
+ if (gf_uuid_is_null(loc->pargfid) && !gf_uuid_is_null(loc->gfid) &&
+ !__is_root_gfid(loc->inode->gfid)) {
+ local->cached_subvol = NULL;
+ dht_do_discover(frame, this, loc);
+ return 0;
+ }
+ if (loc_is_root(loc)) {
+ /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash)
+ * set on the brick root.
+ */
+ ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
+ if (!hashed_subvol)
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ local->hashed_subvol = hashed_subvol;
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ if (is_revalidate(loc)) {
+ /* The entry has been looked up before and has a dht inode_ctx
+ */
+ dht_do_revalidate(frame, this, loc);
+ return 0;
+ } else {
+ /* Entry has not been looked up before
+ */
+ dht_do_fresh_lookup(frame, this, loc);
+ return 0;
+ }
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
+}
- local->call_cnt = 1;
+static int
+dht_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK(&frame->lock);
+ {
+ if ((op_ret == -1) &&
+ !((op_errno == ENOENT) || (op_errno == ENOTCONN))) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno,
+ "Unlink link: subvolume %s returned -1", prev->name);
+ goto post_unlock;
+ }
- STACK_WIND (frame, dht_err_cbk,
- subvol, subvol->fops->access,
- loc, mask);
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, xdata);
- return 0;
+ return 0;
+}
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (access, frame, -1, op_errno);
+static int
+dht_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *hashed_subvol = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno != ENOENT) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno,
+ "Unlink: subvolume %s returned -1", prev->name);
+ goto post_unlock;
+ }
+
+ local->op_ret = 0;
+
+ local->postparent = *postparent;
+ local->preparent = *preparent;
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->preparent, 0);
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!local->op_ret) {
+ hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+ if (hashed_subvol && hashed_subvol != local->cached_subvol) {
+ /*
+ * If hashed and cached are different, then we need
+ * to unlink linkfile from hashed subvol if data
+ * file is deleted successfully
+ */
+ STACK_WIND_COOKIE(frame, dht_unlink_linkfile_cbk, hashed_subvol,
+ hashed_subvol, hashed_subvol->fops->unlink,
+ &local->loc, local->flags, xdata);
+ return 0;
+ }
+ }
- return 0;
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, xdata);
+
+ return 0;
}
+static int
+dht_common_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
-int
-dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, const char *path, struct iatt *sbuf)
+static int
+dht_fix_layout_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ if (op_ret == 0) {
+ /* update the layout in the inode ctx */
local = frame->local;
- if (op_ret == -1)
- goto err;
+ layout = local->selfheal.layout;
- if (local) {
- sbuf->ia_ino = local->ia_ino;
- } else {
- op_ret = -1;
- op_errno = EINVAL;
+ dht_layout_set(this, local->loc.inode, layout);
+ }
+
+ DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+static int
+dht_err_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
}
-err:
- DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, sbuf);
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ if ((local->fop == GF_FOP_SETXATTR) ||
+ (local->fop == GF_FOP_FSETXATTR)) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
+ if ((local->fop == GF_FOP_REMOVEXATTR) ||
+ (local->fop == GF_FOP_FREMOVEXATTR)) {
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ }
+ }
- return 0;
+out:
+ return 0;
}
+/* Set the value[] of key into dict after convert from
+ host byte order to network byte order
+*/
+int32_t
+dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size)
+{
+ int ret = -1;
+ int32_t *ptr = NULL;
+ int32_t vindex;
+
+ if (value == NULL) {
+ return -EINVAL;
+ }
+
+ ptr = GF_MALLOC(sizeof(int32_t) * size, gf_common_mt_char);
+ if (ptr == NULL) {
+ return -ENOMEM;
+ }
+ for (vindex = 0; vindex < size; vindex++) {
+ ptr[vindex] = hton32(value[vindex]);
+ }
+ ret = dict_set_bin(dict, key, ptr, sizeof(int32_t) * size);
+ if (ret)
+ GF_FREE(ptr);
+ return ret;
+}
-int
-dht_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->ia_ino = loc->inode->ino;
-
- STACK_WIND (frame, dht_readlink_cbk,
- subvol, subvol->fops->readlink,
- loc, size);
-
- return 0;
+static int
+dht_common_mds_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = cookie;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL);
+ local = frame->local;
+
+ if (op_ret)
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
+
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, 0, op_errno, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, 0, op_errno, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
- return 0;
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, 0, op_errno, NULL);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, 0, op_errno, NULL);
+ }
+
+out:
+ return 0;
}
+/* Code to wind a xattrop call to add 1 on current mds internal xattr
+ value
+*/
+static int
+dht_setxattr_non_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int ret = 0;
+ dict_t *xattrop = NULL;
+ int32_t addone[1] = {1};
+ call_frame_t *prev = NULL;
+ dht_conf_t *conf = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret && !local->op_ret) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
+ goto post_unlock;
+ }
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt)) {
+ if (!local->op_ret) {
+ xattrop = dict_new();
+ if (!xattrop) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0,
+ "dictionary creation failed");
+ ret = -1;
+ goto out;
+ }
+ ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, addone, 1);
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "dictionary set array failed ");
+ ret = -1;
+ goto out;
+ }
+ if ((local->fop == GF_FOP_SETXATTR) ||
+ (local->fop == GF_FOP_REMOVEXATTR)) {
+ STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol,
+ local->mds_subvol->fops->xattrop, &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+ } else {
+ STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fxattrop, local->fd,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+ }
+ } else {
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+ }
+ }
+out:
+ if (ret) {
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
-int
-dht_fix_layout_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL);
+ }
+ }
+just_return:
+ if (xattrop)
+ dict_unref(xattrop);
+ return 0;
+}
+
+static int
+dht_setxattr_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- DHT_STACK_UNWIND (getxattr, frame, -1, ENODATA, NULL);
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *mds_subvol = NULL;
+ int i = 0;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+ mds_subvol = local->mds_subvol;
+
+ if (op_ret == -1) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
+ goto out;
+ }
+
+ local->op_ret = 0;
+ local->call_cnt = conf->subvolume_cnt - 1;
+ local->xdata = dict_ref(xdata);
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (mds_subvol && (mds_subvol == conf->subvolumes[i]))
+ continue;
+ if (local->fop == GF_FOP_SETXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->setxattr, &local->loc,
+ local->xattr, local->flags, local->xattr_req);
+ }
- return 0;
+ if (local->fop == GF_FOP_FSETXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->fsetxattr, local->fd,
+ local->xattr, local->flags, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->removexattr, &local->loc,
+ local->key, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->fremovexattr, local->fd,
+ local->key, local->xattr_req);
+ }
+ }
+
+ return 0;
+out:
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ }
+
+just_return:
+ return 0;
+}
+
+static int
+dht_xattrop_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ local->op_ret = op_ret;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
+ goto out;
+ }
+
+ if (local->fop == GF_FOP_SETXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->setxattr, &local->loc, local->xattr,
+ local->flags, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fsetxattr, local->fd, local->xattr,
+ local->flags, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->removexattr, &local->loc,
+ local->key, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fremovexattr, local->fd, local->key,
+ local->xattr_req);
+ }
+
+ return 0;
+out:
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ }
+
+just_return:
+ return 0;
}
static void
-fill_layout_info (dht_layout_t *layout, char *buf)
+fill_layout_info(dht_layout_t *layout, char *buf)
{
- int i = 0;
- char tmp_buf[128] = {0,};
+ int i = 0;
+ char tmp_buf[128] = {
+ 0,
+ };
+
+ for (i = 0; i < layout->cnt; i++) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "(%s %u %u)",
+ layout->list[i].xlator->name, layout->list[i].start,
+ layout->list[i].stop);
+ if (i)
+ strcat(buf, " ");
+ strcat(buf, tmp_buf);
+ }
+}
- for (i = 0; i < layout->cnt; i++) {
- snprintf (tmp_buf, 128, "(%s %u %u)",
- layout->list[i].xlator->name,
- layout->list[i].start,
- layout->list[i].stop);
- if (i)
- strcat (buf, " ");
- strcat (buf, tmp_buf);
+static void
+dht_fill_pathinfo_xattr(xlator_t *this, dht_local_t *local, char *xattr_buf,
+ int32_t alloc_len, int flag, char *layout_buf)
+{
+ if (flag) {
+ if (local->xattr_val) {
+ snprintf(xattr_buf, alloc_len,
+ "((<" DHT_PATHINFO_HEADER "%s> %s) (%s-layout %s))",
+ this->name, local->xattr_val, this->name, layout_buf);
+ } else {
+ snprintf(xattr_buf, alloc_len, "(%s-layout %s)", this->name,
+ layout_buf);
}
+ } else if (local->xattr_val) {
+ snprintf(xattr_buf, alloc_len, "(<" DHT_PATHINFO_HEADER "%s> %s)",
+ this->name, local->xattr_val);
+ } else {
+ xattr_buf[0] = '\0';
+ }
}
-int
-dht_pathinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr)
-{
- dht_local_t *local = NULL;
- int ret = 0;
- int flag = 0;
- int this_call_cnt = 0;
- char *value_got = NULL;
- char layout_buf[8192] = {0,};
- char xattr_buf[8192 + 1024] = {0,};
- dict_t *dict = NULL;
+static int
+dht_vgetxattr_alloc_and_fill(dht_local_t *local, dict_t *xattr, xlator_t *this,
+ int op_errno)
+{
+ int ret = -1;
+ char *value = NULL;
+
+ ret = dict_get_str(xattr, local->xsel, &value);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED,
+ "Subvolume %s returned -1", this->name);
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto out;
+ }
+
+ local->alloc_len += strlen(value);
+
+ if (!local->xattr_val) {
+ local->alloc_len += (SLEN(DHT_PATHINFO_HEADER) + 10);
+ local->xattr_val = GF_MALLOC(local->alloc_len, gf_common_mt_char);
+ if (!local->xattr_val) {
+ ret = -1;
+ goto out;
+ }
+ local->xattr_val[0] = '\0';
+ }
+
+ int plen = strlen(local->xattr_val);
+ if (plen) {
+ /* extra byte(s) for \0 to be safe */
+ local->alloc_len += (plen + 2);
+ local->xattr_val = GF_REALLOC(local->xattr_val, local->alloc_len);
+ if (!local->xattr_val) {
+ ret = -1;
+ goto out;
+ }
+ }
- local = frame->local;
+ (void)strcat(local->xattr_val, value);
+ (void)strcat(local->xattr_val, " ");
+ local->op_ret = 0;
- if (op_ret != -1) {
- ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value_got);
- if (!ret) {
- if (!local->pathinfo)
- local->pathinfo = GF_CALLOC (8192, sizeof (char),
- gf_common_mt_char);
- if (local->pathinfo)
- strcat (local->pathinfo, value_got);
- }
- }
+ ret = 0;
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- if (local->layout->cnt > 1) {
- /* Set it for directory */
- fill_layout_info (local->layout, layout_buf);
- flag = 1;
- }
+out:
+ return ret;
+}
- dict = dict_new ();
+static int
+dht_vgetxattr_fill_and_set(dht_local_t *local, dict_t **dict, xlator_t *this,
+ gf_boolean_t flag)
+{
+ int ret = -1;
+ char *xattr_buf = NULL;
+ char layout_buf[8192] = {
+ 0,
+ };
+
+ if (flag)
+ fill_layout_info(local->layout, layout_buf);
+
+ *dict = dict_new();
+ if (!*dict)
+ goto out;
+
+ local->xattr_val[strlen(local->xattr_val) - 1] = '\0';
+
+ /* we would need max this many bytes to create xattr string
+ * extra 40 bytes is just an estimated amount of additional
+ * space required as we include translator name and some
+ * spaces, brackets etc. when forming the pathinfo string.
+ *
+ * For node-uuid we just don't have all the pretty formatting,
+ * but since this is a generic routine for pathinfo & node-uuid
+ * we don't have conditional space allocation and try to be
+ * generic
+ */
+ local->alloc_len += (2 * strlen(this->name)) + strlen(layout_buf) + 40;
+ xattr_buf = GF_MALLOC(local->alloc_len, gf_common_mt_char);
+ if (!xattr_buf)
+ goto out;
+
+ if (XATTR_IS_PATHINFO(local->xsel)) {
+ (void)dht_fill_pathinfo_xattr(this, local, xattr_buf, local->alloc_len,
+ flag, layout_buf);
+ } else if ((XATTR_IS_NODE_UUID(local->xsel)) ||
+ (XATTR_IS_NODE_UUID_LIST(local->xsel))) {
+ (void)snprintf(xattr_buf, local->alloc_len, "%s", local->xattr_val);
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GET_XATTR_FAILED,
+ "Unknown local->xsel (%s)", local->xsel);
+ GF_FREE(xattr_buf);
+ goto out;
+ }
+
+ ret = dict_set_dynstr(*dict, local->xsel, xattr_buf);
+ if (ret)
+ GF_FREE(xattr_buf);
+ GF_FREE(local->xattr_val);
- if (flag && local->pathinfo)
- snprintf (xattr_buf, 9216, "((%s %s) (%s-layout %s))",
- this->name, local->pathinfo, this->name,
- layout_buf);
- else if (local->pathinfo)
- snprintf (xattr_buf, 9216, "(%s %s)",
- this->name, local->pathinfo);
- else if (flag)
- snprintf (xattr_buf, 9216, "(%s-layout %s)",
- this->name, layout_buf);
+out:
+ return ret;
+}
- ret = dict_set_str (dict, GF_XATTR_PATHINFO_KEY,
- xattr_buf);
+static int
+dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *prev = NULL;
+ int this_call_cnt = 0;
+ int ret = 0;
+ char *uuid_str = NULL;
+ char *uuid_list = NULL;
+ char *next_uuid_str = NULL;
+ char *saveptr = NULL;
+ uuid_t node_uuid = {
+ 0,
+ };
+ char *uuid_list_copy = NULL;
+ int count = 0;
+ int i = 0;
+ int index = 0;
+ int found = 0;
+ nodeuuid_info_t *tmp_ptr = NULL;
+
+ VALIDATE_OR_GOTO(frame, out);
+ VALIDATE_OR_GOTO(frame->local, out);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ VALIDATE_OR_GOTO(conf->defrag, out);
+
+ gf_msg_debug(this->name, 0, "subvol %s returned", prev->name);
+
+ LOCK(&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ if (op_errno == ENODATA)
+ gf_msg_debug(this->name, 0, "failed to get node-uuid");
+ else
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "failed to get node-uuid");
+ goto post_unlock;
+ }
+
+ ret = dict_get_str(xattr, local->xsel, &uuid_list);
- if (local->pathinfo)
- GF_FREE (local->pathinfo);
- GF_FREE (local->key);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_GET_FAILED,
+ "Failed to get %s", local->xsel);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ goto unlock;
+ }
- DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict);
+ /* As DHT will not know details of its child xlators
+ * we need to parse this twice to get the count first
+ * and allocate memory later.
+ */
+ count = 0;
+ index = conf->local_subvols_cnt;
- if (dict)
- dict_unref (dict);
+ uuid_list_copy = gf_strdup(uuid_list);
+ if (!uuid_list_copy)
+ goto unlock;
- return 0;
+ for (uuid_str = strtok_r(uuid_list, " ", &saveptr); uuid_str;
+ uuid_str = next_uuid_str) {
+ next_uuid_str = strtok_r(NULL, " ", &saveptr);
+ if (gf_uuid_parse(uuid_str, node_uuid)) {
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UUID_PARSE_ERROR,
+ "Failed to parse uuid for %s", prev->name);
+ goto post_unlock;
+ }
+
+ count++;
+ if (gf_uuid_compare(node_uuid, conf->defrag->node_uuid)) {
+ gf_msg_debug(this->name, 0,
+ "subvol %s does not"
+ "belong to this node",
+ prev->name);
+ } else {
+ /* handle multiple bricks of the same replica
+ * on the same node */
+ if (found)
+ continue;
+ conf->local_subvols[(conf->local_subvols_cnt)++] = prev;
+ found = 1;
+ gf_msg_debug(this->name, 0,
+ "subvol %s belongs to"
+ " this node",
+ prev->name);
+ }
}
- if (local->pathinfo)
- strcat (local->pathinfo, " Link: ");
- if (local->hashed_subvol) {
- /* This will happen if there pending */
- STACK_WIND (frame, dht_pathinfo_getxattr_cbk, local->hashed_subvol,
- local->hashed_subvol->fops->getxattr,
- &local->loc, local->key);
+ if (!found) {
+ local->op_ret = 0;
+ goto unlock;
+ }
- return 0;
+ conf->local_nodeuuids[index].count = count;
+ conf->local_nodeuuids[index].elements = GF_CALLOC(
+ count, sizeof(nodeuuid_info_t), 1);
+
+ /* The node-uuids are guaranteed to be returned in the same
+ * order as the bricks
+ * A null node-uuid is returned for a brick that is down.
+ */
+
+ saveptr = NULL;
+ i = 0;
+
+ for (uuid_str = strtok_r(uuid_list_copy, " ", &saveptr); uuid_str;
+ uuid_str = next_uuid_str) {
+ next_uuid_str = strtok_r(NULL, " ", &saveptr);
+ tmp_ptr = &(conf->local_nodeuuids[index].elements[i]);
+ gf_uuid_parse(uuid_str, tmp_ptr->uuid);
+
+ if (!gf_uuid_compare(tmp_ptr->uuid, conf->defrag->node_uuid)) {
+ tmp_ptr->info = REBAL_NODEUUID_MINE;
+ }
+ i++;
+ tmp_ptr = NULL;
}
+ }
- gf_log ("this->name", GF_LOG_ERROR, "Unable to find hashed_subvol for path"
- " %s", local->pathinfo);
-
- DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, dict);
- return 0;
+ local->op_ret = 0;
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!is_last_call(this_call_cnt))
+ goto out;
+
+ if (local->op_ret == -1) {
+ goto unwind;
+ }
+
+ DHT_STACK_UNWIND(getxattr, frame, 0, 0, xattr, xdata);
+ goto out;
+
+unwind:
+
+ GF_FREE(conf->local_nodeuuids[index].elements);
+ conf->local_nodeuuids[index].elements = NULL;
+
+ DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, xdata);
+out:
+ GF_FREE(uuid_list_copy);
+ return 0;
}
-int
-dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr)
-{
- int ret = 0;
- char *value = NULL;
-
- if (op_ret != -1) {
- ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value);
- if (!ret) {
- ret = dict_set_str (xattr, GF_XATTR_LINKINFO_KEY, value);
- if (!ret)
- gf_log (this->name, GF_LOG_TRACE,
- "failed to set linkinfo");
- }
+static int
+dht_vgetxattr_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ int ret = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ dict_t *dict = NULL;
+
+ VALIDATE_OR_GOTO(frame, out);
+ VALIDATE_OR_GOTO(frame->local, out);
+
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
+ if (op_ret < 0) {
+ if (op_errno != ENOTCONN) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "getxattr err for dir");
+ goto post_unlock;
+ }
+
+ goto unlock;
}
- DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr);
+ ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno);
+ if (ret) {
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_DICT_SET_FAILED,
+ "alloc or fill failure");
+ goto post_unlock;
+ }
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!is_last_call(this_call_cnt))
+ goto out;
- return 0;
+ /* -- last call: do patch ups -- */
+
+ if (local->op_ret == -1) {
+ goto unwind;
+ }
+
+ ret = dht_vgetxattr_fill_and_set(local, &dict, this, _gf_true);
+ if (ret)
+ goto unwind;
+
+ DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata);
+ goto cleanup;
+
+unwind:
+ DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL);
+cleanup:
+ if (dict)
+ dict_unref(dict);
+out:
+ return 0;
}
-int
-dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr)
+static int
+dht_vgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata)
{
- if (op_ret != -1) {
- if (dict_get (xattr, "trusted.glusterfs.dht")) {
- dict_del (xattr, "trusted.glusterfs.dht");
- }
+ dht_local_t *local = NULL;
+ int ret = 0;
+ dict_t *dict = NULL;
+ xlator_t *prev = NULL;
+ gf_boolean_t flag = _gf_true;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED,
+ "vgetxattr: Subvolume %s returned -1", prev->name);
+ goto unwind;
+ }
+
+ ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "Allocation or fill failure");
+ goto unwind;
+ }
+
+ flag = (local->layout->cnt > 1) ? _gf_true : _gf_false;
+
+ ret = dht_vgetxattr_fill_and_set(local, &dict, this, flag);
+ if (ret)
+ goto unwind;
+
+ DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata);
+ goto cleanup;
+
+unwind:
+ DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL);
+cleanup:
+ if (dict)
+ dict_unref(dict);
+
+ return 0;
+}
+
+static int
+dht_linkinfo_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ int ret = 0;
+ char *value = NULL;
+
+ if (op_ret != -1) {
+ ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value);
+ if (!ret) {
+ ret = dict_set_str(xattr, GF_XATTR_LINKINFO_KEY, value);
+ if (!ret)
+ gf_msg_trace(this->name, 0, "failed to set linkinfo");
}
+ }
- DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr);
+ DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
- return 0;
+ return 0;
}
+static int
+dht_mds_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(frame->local, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+ local = frame->local;
+
+ if (!xattr || (op_ret == -1)) {
+ local->op_ret = op_ret;
+ goto out;
+ }
+ dict_del(xattr, conf->xattr_name);
+ local->op_ret = 0;
+
+ if (!local->xattr) {
+ local->xattr = dict_copy_with_ref(xattr, NULL);
+ }
+
+out:
+ DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr,
+ xdata);
+ return 0;
+err:
+ DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL);
+ return 0;
+}
int
-dht_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *key)
-{
- xlator_t *subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int op_errno = -1;
- int ret = 0;
- int flag = 0;
- int i = 0;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- conf = this->private;
- layout = dht_layout_get (this, loc->inode);
- if (!layout) {
- gf_log (this->name, GF_LOG_ERROR,
- "layout is NULL");
- op_errno = ENOENT;
- goto err;
+dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(frame->local, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+ local = frame->local;
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto err;
+ return 0;
+ }
+
+ LOCK(&frame->lock);
+ {
+ if (!xattr || (op_ret == -1)) {
+ local->op_ret = op_ret;
+ goto unlock;
}
- if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)) {
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- cached_subvol = dht_subvol_get_cached (this, loc->inode);
+ dict_del(xattr, conf->xattr_name);
+ dict_del(xattr, conf->mds_xattr_key);
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
+ dict_del(xattr, conf->commithash_xattr_name);
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
- local->key = gf_strdup (key);
- if (!local->key) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
- local->layout = layout;
+ if (frame->root->pid >= 0) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+ }
- local->call_cnt = 1;
- if (hashed_subvol != cached_subvol) {
- local->call_cnt = 2;
- local->hashed_subvol = hashed_subvol;
- }
+ local->op_ret = 0;
- STACK_WIND (frame, dht_pathinfo_getxattr_cbk, cached_subvol,
- cached_subvol->fops->getxattr, loc, key);
+ if (!local->xattr) {
+ local->xattr = dict_copy_with_ref(xattr, NULL);
+ } else {
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
- return 0;
+ if (!local->xdata) {
+ local->xdata = dict_ref(xdata);
+ } else if ((local->inode && IA_ISDIR(local->inode->ia_type)) ||
+ (local->fd && IA_ISDIR(local->fd->inode->ia_type))) {
+ dht_aggregate_xattr(local->xdata, xdata);
}
- if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) {
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- cached_subvol = dht_subvol_get_cached (this, loc->inode);
- if (hashed_subvol == cached_subvol) {
- op_errno = ENODATA;
- goto err;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ /* If we have a valid xattr received from any one of the
+ * subvolume, let's return it */
+ if (local->xattr) {
+ local->op_ret = 0;
+ }
+
+ DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr,
+ local->xdata);
+ }
+ return 0;
+err:
+ DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL);
+ return 0;
+}
+
+static int32_t
+dht_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+static int
+dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ if (local->op_errno == EOPNOTSUPP) {
+ /* Nothing to do here, we have already found
+ * a subvol which does not have the get_real_filename
+ * optimization. If condition is for simple logic.
+ */
+ goto unlock;
+ }
+
+ if (op_ret == -1) {
+ if (op_errno == EOPNOTSUPP) {
+ /* This subvol does not have the optimization.
+ * Better let the user know we don't support it.
+ * Remove previous results if any.
+ */
+
+ if (local->xattr) {
+ dict_unref(local->xattr);
+ local->xattr = NULL;
}
- if (hashed_subvol) {
- STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol,
- hashed_subvol->fops->getxattr, loc,
- GF_XATTR_PATHINFO_KEY);
- return 0;
+
+ if (local->xattr_req) {
+ dict_unref(local->xattr_req);
+ local->xattr_req = NULL;
}
- op_errno = ENODATA;
- goto err;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_UPGRADE_BRICKS,
+ "At least "
+ "one of the bricks does not support "
+ "this operation. Please upgrade all "
+ "bricks.");
+ goto post_unlock;
+ }
+
+ if (op_errno == ENOATTR) {
+ /* Do nothing, our defaults are set to this.
+ */
+ goto unlock;
+ }
+
+ /* This is a place holder for every other error
+ * case. I am not sure of how to interpret
+ * ENOTCONN etc. As of now, choosing to ignore
+ * down subvol and return a good result(if any)
+ * from other subvol.
+ */
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "Failed to get real filename.");
+ goto post_unlock;
}
- if (key && (strcmp (key, GF_XATTR_FIX_LAYOUT_KEY) == 0)) {
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].start == layout->list[i].stop) {
- flag = 1;
- break;
- }
- }
- if ((layout->cnt < conf->subvolume_cnt) || flag) {
- gf_log (this->name, GF_LOG_INFO,
- "expanding layout of %s from %d to %d",
- loc->path, layout->cnt, conf->subvolume_cnt);
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
- local->layout = layout;
- //layout = dht_layout_new (this, conf->subvolume_cnt);
+ /* This subvol has the required file.
+ * There could be other subvols which have returned
+ * success already, choosing to return the latest good
+ * result.
+ */
+ if (local->xattr)
+ dict_unref(local->xattr);
+ local->xattr = dict_ref(xattr);
- dht_selfheal_new_directory (frame, dht_fix_layout_cbk,
- layout);
- return 0;
- }
- op_errno = ENODATA;
- goto err;
+ if (local->xattr_req) {
+ dict_unref(local->xattr_req);
+ local->xattr_req = NULL;
}
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->op_ret = op_ret;
+ local->op_errno = 0;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, 0, "Found a matching file.");
+ goto post_unlock;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ DHT_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ local->xattr, local->xattr_req);
+ }
+
+ return 0;
+}
- STACK_WIND (frame, dht_getxattr_cbk,
- subvol, subvol->fops->getxattr,
- loc, key);
+static int
+dht_getxattr_get_real_filename(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
+ int cnt = 0;
+ xlator_t *subvol = NULL;
- return 0;
+ local = frame->local;
+ layout = local->layout;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL);
+ cnt = local->call_cnt = layout->cnt;
- return 0;
+ local->op_ret = -1;
+ local->op_errno = ENOATTR;
+
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_getxattr_get_real_filename_cbk, subvol,
+ subvol->fops->getxattr, loc, key, xdata);
+ }
+
+ return 0;
}
+static int
+dht_marker_populate_args(call_frame_t *frame, int type, int *gauge,
+ xlator_t **subvols)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
-int
-dht_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr, int flags)
-{
- xlator_t *subvol = NULL;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
- int op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- conf = this->private;
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->layout = layout = dht_layout_get (this, loc->inode);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no layout for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local->call_cnt = layout->cnt;
-
- for (i = 0; i < layout->cnt; i++) {
- STACK_WIND (frame, dht_err_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->setxattr,
- loc, xattr, flags);
- }
-
- return 0;
+ local = frame->local;
+ layout = local->layout;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (setxattr, frame, -1, op_errno);
+ for (i = 0; i < layout->cnt; i++)
+ subvols[i] = layout->list[i].xlator;
- return 0;
+ return layout->cnt;
}
-
-int
-dht_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *key)
+static int
+dht_is_debug_xattr_key(const char **array, char *key)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ int i = 0;
+ for (i = 0; array[i]; i++) {
+ if (fnmatch(array[i], key, FNM_NOESCAPE) == 0)
+ return i;
+ }
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
+ return -1;
+}
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+/* Note we already have frame->local initialised here*/
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
+static int
+dht_handle_debug_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENODATA;
+ char *value = NULL;
+ loc_t file_loc = {0};
+ const char *name = NULL;
+
+ local = frame->local;
+
+ if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) == -1) {
+ goto out;
+ }
+
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (strncmp(key, DHT_DBG_HASHED_SUBVOL_KEY,
+ SLEN(DHT_DBG_HASHED_SUBVOL_KEY)) == 0) {
+ name = key + strlen(DHT_DBG_HASHED_SUBVOL_KEY);
+ if (strlen(name) == 0) {
+ op_errno = EINVAL;
+ goto out;
+ }
- local->call_cnt = 1;
+ ret = dht_build_child_loc(this, &file_loc, loc, (char *)name);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- STACK_WIND (frame, dht_err_cbk,
- subvol, subvol->fops->removexattr,
- loc, key);
+ local->hashed_subvol = dht_subvol_get_hashed(this, &file_loc);
+ if (local->hashed_subvol == NULL) {
+ op_errno = ENODATA;
+ goto out;
+ }
- return 0;
+ value = gf_strdup(local->hashed_subvol->name);
+ if (!value) {
+ op_errno = ENOMEM;
+ goto out;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (removexattr, frame, -1, op_errno);
+ ret = dict_set_dynstr(local->xattr, (char *)key, value);
+ if (ret < 0) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+ goto out;
+ }
- return 0;
+out:
+ loc_wipe(&file_loc);
+ DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL);
+ return 0;
}
+/* Virtual Xattr which returns 1 if all subvols are up,
+ else returns 0. Geo-rep then uses this virtual xattr
+ after a fresh mount and starts the I/O.
+*/
+
+enum dht_vxattr_subvol {
+ DHT_VXATTR_SUBVOLS_UP = 1,
+ DHT_VXATTR_SUBVOLS_DOWN = 0,
+};
int
-dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd)
+dht_vgetxattr_subvol_status(call_frame_t *frame, xlator_t *this,
+ const char *key)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ dht_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENODATA;
+ int value = DHT_VXATTR_SUBVOLS_UP;
+ int i = 0;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ local = frame->local;
+
+ if (!key) {
+ op_errno = EINVAL;
+ goto out;
+ }
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ value = DHT_VXATTR_SUBVOLS_DOWN;
+ gf_msg_debug(this->name, 0, "subvol %s is down ",
+ conf->subvolumes[i]->name);
+ break;
+ }
+ }
+ ret = dict_set_int8(local->xattr, (char *)key, value);
+ if (ret < 0) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+out:
+ DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL);
+ return 0;
+}
- local = frame->local;
- prev = cookie;
+int
+dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
+ dict_t *xdata)
+#define DHT_IS_DIR(layout) (layout->cnt > 1)
+{
+ xlator_t *subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *mds_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int op_errno = -1;
+ int i = 0;
+ int cnt = 0;
+ char *node_uuid_key = NULL;
+ int ret = -1;
+
+ GF_CHECK_XATTR_KEY_AND_GOTO(key, IO_THREADS_QUEUE_SIZE_KEY, op_errno, err);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_GETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL,
+ "Layout is NULL");
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ /* skip over code which is irrelevant without a valid key */
+ if (!key)
+ goto no_key;
+
+ local->key = gf_strdup(key);
+ if (!local->key) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ if (strncmp(key, DHT_SUBVOL_STATUS_KEY, SLEN(DHT_SUBVOL_STATUS_KEY)) == 0) {
+ dht_vgetxattr_subvol_status(frame, this, key);
+ return 0;
+ }
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
+ /* skip over code which is irrelevant if !DHT_IS_DIR(layout) */
+ if (!DHT_IS_DIR(layout))
+ goto no_dht_is_dir;
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
+ if ((strncmp(key, GF_XATTR_GET_REAL_FILENAME_KEY,
+ SLEN(GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) &&
+ DHT_IS_DIR(layout)) {
+ dht_getxattr_get_real_filename(frame, this, loc, key, xdata);
+ return 0;
+ }
+
+ if (!strcmp(key, GF_REBAL_FIND_LOCAL_SUBVOL)) {
+ ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_LIST_NODE_UUIDS_KEY);
+ if (ret == -1 || !node_uuid_key) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "Failed to copy node uuid key");
+ op_errno = ENOMEM;
+ goto err;
+ }
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key);
+ cnt = local->call_cnt = conf->subvolume_cnt;
+ for (i = 0; i < cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->getxattr, loc,
+ node_uuid_key, xdata);
+ }
+ if (node_uuid_key)
+ GF_FREE(node_uuid_key);
+ return 0;
+ }
+
+ if (!strcmp(key, GF_REBAL_OLD_FIND_LOCAL_SUBVOL)) {
+ ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_NODE_UUID_KEY);
+ if (ret == -1 || !node_uuid_key) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "Failed to copy node uuid key");
+ op_errno = ENOMEM;
+ goto err;
+ }
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key);
+ cnt = local->call_cnt = conf->subvolume_cnt;
+ for (i = 0; i < cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->getxattr, loc,
+ node_uuid_key, xdata);
+ }
+ if (node_uuid_key)
+ GF_FREE(node_uuid_key);
+ return 0;
+ }
+
+ /* for file use cached subvolume (obviously!): see if {}
+ * below
+ * for directory:
+ * wind to all subvolumes and exclude subvolumes which
+ * return ENOTCONN (in callback)
+ *
+ * NOTE: Don't trust inode here, as that may not be valid
+ * (until inode_link() happens)
+ */
+
+ if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0) ||
+ (strcmp(key, GF_XATTR_LIST_NODE_UUIDS_KEY) == 0)) {
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key);
+ cnt = local->call_cnt = layout->cnt;
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_vgetxattr_dir_cbk, subvol,
+ subvol->fops->getxattr, loc, key, xdata);
+ }
+ return 0;
+ }
+
+no_dht_is_dir:
+ /* node-uuid or pathinfo for files */
+ if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0)) {
+ cached_subvol = local->cached_subvol;
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key);
+ local->call_cnt = 1;
+ STACK_WIND_COOKIE(frame, dht_vgetxattr_cbk, cached_subvol,
+ cached_subvol, cached_subvol->fops->getxattr, loc,
+ key, xdata);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
- local->fd);
+ return 0;
+ }
+
+ if (strcmp(key, GF_XATTR_LINKINFO_KEY) == 0) {
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (!hashed_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for %s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ cached_subvol = dht_subvol_get_cached(this, loc->inode);
+ if (!cached_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_CACHED_SUBVOL_GET_FAILED,
+ "Failed to get cached subvol for %s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (hashed_subvol == cached_subvol) {
+ op_errno = ENODATA;
+ goto err;
+ }
+
+ STACK_WIND(frame, dht_linkinfo_getxattr_cbk, hashed_subvol,
+ hashed_subvol->fops->getxattr, loc, GF_XATTR_PATHINFO_KEY,
+ xdata);
return 0;
-}
+ }
+ if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) >= 0) {
+ dht_handle_debug_getxattr(frame, this, loc, key);
+ return 0;
+ }
+
+no_key:
+ if (cluster_handle_marker_getxattr(frame, loc, key, conf->vol_uuid,
+ dht_getxattr_unwind,
+ dht_marker_populate_args) == 0)
+ return 0;
+
+ if (DHT_IS_DIR(layout)) {
+ local->call_cnt = conf->subvolume_cnt;
+ cnt = conf->subvolume_cnt;
+ ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+ if (!mds_subvol) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Cannot determine MDS, fetching xattr %s randomly"
+ " from a subvol for path %s ",
+ key, loc->path);
+ } else {
+ /* TODO need to handle it, As of now we are
+ choosing availability instead of chossing
+ consistencty, in case of mds_subvol is
+ down winding a getxattr call on other subvol
+ and return xattr
+ */
+ local->mds_subvol = mds_subvol;
+ for (i = 0; i < cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_INFO, 0,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS %s is down for path"
+ " path %s so fetching xattr "
+ "%s randomly from a subvol ",
+ local->mds_subvol->name, loc->path, key);
+ ret = 1;
+ }
+ }
+ }
+ }
+
+ if (!ret && key && local->mds_subvol && dht_match_xattr(key)) {
+ STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol,
+ local->mds_subvol->fops->getxattr, loc, key, xdata);
+
+ return 0;
+ }
+ } else {
+ cnt = local->call_cnt = 1;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, loc,
+ key, xdata);
+ }
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+#undef DHT_IS_DIR
int
-dht_open (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags, fd_t *fd, int wbflags)
-{
- xlator_t *subvol = NULL;
- int ret = -1;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->fd = fd_ref (fd);
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->call_cnt = 1;
-
- STACK_WIND (frame, dht_fd_cbk,
- subvol, subvol->fops->open,
- loc, flags, fd, wbflags);
-
- return 0;
+dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int op_errno = -1;
+ int i = 0;
+ int cnt = 0;
+ xlator_t *mds_subvol = NULL;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(fd->inode, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FGETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL,
+ "Layout is NULL");
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ if (key) {
+ local->key = gf_strdup(key);
+ if (!local->key) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ }
+
+ gf_uuid_unparse(fd->inode->gfid, gfid);
+
+ if ((fd->inode->ia_type == IA_IFDIR) && key &&
+ (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) !=
+ 0)) {
+ local->call_cnt = conf->subvolume_cnt;
+ cnt = conf->subvolume_cnt;
+ ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol);
+
+ if (!mds_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "cannot determine MDS, fetching xattr %s "
+ " randomly from a subvol for gfid %s ",
+ key, gfid);
+ } else {
+ /* TODO need to handle it, As of now we are
+ choosing availability instead of chossing
+ consistencty, in case of hashed_subvol is
+ down winding a getxattr call on other subvol
+ and return xattr
+ */
+ local->mds_subvol = mds_subvol;
+ for (i = 0; i < cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS subvolume %s is down"
+ " for gfid %s so fetching xattr "
+ " %s randomly from a subvol ",
+ local->mds_subvol->name, gfid, key);
+ ret = 1;
+ }
+ }
+ }
+ }
+
+ if (!ret && key && local->mds_subvol && dht_match_xattr(key)) {
+ STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fgetxattr, fd, key, NULL);
+
+ return 0;
+ }
+
+ } else {
+ cnt = local->call_cnt = 1;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, fd,
+ key, NULL);
+ }
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
+static int
+dht_setxattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ if (!frame || !frame->local)
+ goto err;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto err;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ if (local->fop == GF_FOP_SETXATTR) {
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->setxattr, &local->loc,
+ local->rebalance.xattr, local->rebalance.flags,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->fsetxattr, local->fd,
+ local->rebalance.xattr, local->rebalance.flags,
+ local->xattr_req);
+ }
+
+ return 0;
+
+err:
+ DHT_STACK_UNWIND(setxattr, frame, (local ? local->op_ret : -1), op_errno,
+ NULL);
+ return 0;
+}
int
-dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- struct iovec *vector, int count, struct iatt *stbuf,
- struct iobref *iobref)
+dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = frame->local;
+ int ret = -1;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ struct iatt *stbuf = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol1 = NULL, *subvol2 = NULL;
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
+ local = frame->local;
+ prev = cookie;
+
+ local->op_errno = op_errno;
+
+ if ((local->fop == GF_FOP_FSETXATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.",
+ prev->name);
+ goto out;
+ }
+
+ if (local->call_cnt != 1)
+ goto out;
+
+ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+
+ if ((!op_ret) && !stbuf) {
+ goto out;
+ }
+
+ local->op_ret = op_ret;
+ local->rebalance.target_op_fn = dht_setxattr2;
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Phase 1 of migration */
+ if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ dht_setxattr2(this, subvol2, frame, 0);
+ return 0;
}
- if (op_ret != -1)
- stbuf->ia_ino = local->ia_ino;
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
out:
- DHT_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf,
- iobref);
- return 0;
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+ } else {
+ DHT_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+ }
+
+ return 0;
}
+/* Function is call by dict_foreach_fnmatch if key is match with
+ user.* and set boolean flag to true
+*/
+static int
+dht_is_user_xattr(dict_t *this, char *key, data_t *value, void *data)
+{
+ gf_boolean_t *user_xattr_found = data;
+ *user_xattr_found = _gf_true;
+ return 0;
+}
-int
-dht_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
- local = dht_local_init (frame);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
- op_errno = ENOMEM;
+/* Common code to wind a (f)(set|remove)xattr call to set xattr on directory
+ */
+static int
+dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ fd_t *fd, dict_t *xattr, int flags,
+ dict_t *xdata, int *op_errno)
+
+{
+ dict_t *xattrop = NULL;
+ int32_t subone[1] = {-1};
+ gf_boolean_t uxattr_key_found = _gf_false;
+ xlator_t *mds_subvol = NULL;
+ xlator_t *travvol = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int i = 0;
+ int call_cnt = 0;
+ dht_local_t *local = NULL;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char **xattrs_to_heal;
+
+ conf = this->private;
+ local = frame->local;
+ call_cnt = conf->subvolume_cnt;
+ local->flags = flags;
+ xattrs_to_heal = get_xattrs_to_heal();
+
+ if (!gf_uuid_is_null(local->gfid)) {
+ gf_uuid_unparse(local->gfid, gfid_local);
+ }
+
+ if ((local->fop == GF_FOP_SETXATTR) || (local->fop == GF_FOP_FSETXATTR)) {
+ /* Check if any user xattr present in xattr
+ */
+ dict_foreach_fnmatch(xattr, "user*", dht_is_user_xattr,
+ &uxattr_key_found);
+
+ /* Check if any custom key xattr present in dict xattr
+ and start index from 1 because user xattr already
+ checked in previous line
+ */
+ for (i = 1; xattrs_to_heal[i]; i++)
+ if (dict_get(xattr, xattrs_to_heal[i]))
+ uxattr_key_found = _gf_true;
+ }
+
+ if ((local->fop == GF_FOP_REMOVEXATTR) ||
+ (local->fop == GF_FOP_FREMOVEXATTR)) {
+ /* Check if any custom key xattr present in local->key
+ */
+ for (i = 0; xattrs_to_heal[i]; i++)
+ if (strstr(local->key, xattrs_to_heal[i]))
+ uxattr_key_found = _gf_true;
+ }
+
+ /* If there is no custom key xattr present or gfid is root
+ or call_cnt is 1 then wind a (f)setxattr call on all subvols
+ */
+ if (!uxattr_key_found || __is_root_gfid(local->gfid) || call_cnt == 1) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ travvol = conf->subvolumes[i];
+ if ((local->fop == GF_FOP_SETXATTR) ||
+ (local->fop == GF_FOP_FSETXATTR)) {
+ if (fd) {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->fsetxattr, fd, xattr,
+ flags, xdata);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->setxattr, loc, xattr,
+ flags, xdata);
+ }
+ }
+
+ if ((local->fop == GF_FOP_REMOVEXATTR) ||
+ (local->fop == GF_FOP_FREMOVEXATTR)) {
+ if (fd) {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->fremovexattr, fd,
+ local->key, local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->removexattr, loc,
+ local->key, local->xattr_req);
+ }
+ }
+ }
+
+ return 0;
+ }
+
+ /* Calculate hash subvol based on inode and parent inode
+ */
+ if (fd) {
+ ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol);
+ } else {
+ ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+ }
+ if (ret || !mds_subvol) {
+ if (fd) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get mds subvol for fd %p"
+ "gfid is %s ",
+ fd, gfid_local);
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "%s: Failed to get mds subvol. (gfid is %s)", loc->path,
+ gfid_local);
+ }
+ (*op_errno) = ENOENT;
+ goto err;
+ }
+
+ local->mds_subvol = mds_subvol;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS subvol is down for path "
+ " %s gfid is %s Unable to set xattr ",
+ local->loc.path, gfid_local);
+ (*op_errno) = ENOTCONN;
goto err;
+ }
}
+ }
+
+ if (uxattr_key_found) {
+ xattrop = dict_new();
+ if (!xattrop) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0,
+ "dictionary creation failed for path %s "
+ "for gfid is %s ",
+ local->loc.path, gfid_local);
+ (*op_errno) = ENOMEM;
+ goto err;
+ }
+ local->xattr = dict_ref(xattr);
+ /* Subtract current MDS xattr value to -1 , value of MDS
+ xattr represents no. of times xattr modification failed
+ on non MDS subvols.
+ */
+ ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, subone, 1);
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "dictionary set array failed for path %s "
+ "for gfid is %s ",
+ local->loc.path, gfid_local);
+ if (xattrop)
+ dict_unref(xattrop);
+ (*op_errno) = ret;
+ goto err;
+ }
+ /* Wind a xattrop call to use ref counting approach
+ update mds xattr to -1 before update xattr on
+ hashed subvol and update mds xattr to +1 after update
+ xattr on all non hashed subvol
+ */
+ if (fd) {
+ STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fxattrop, fd,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+ } else {
+ STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->xattrop, loc,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+ }
+ if (xattrop)
+ dict_unref(xattrop);
+ }
+
+ return 0;
+err:
+ return -1;
+}
- local->ia_ino = fd->inode->ino;
- STACK_WIND (frame, dht_readv_cbk,
- subvol, subvol->fops->readv,
- fd, size, off);
+int
+dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
+ int flags, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(fd->inode, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FSETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+ local->call_cnt = call_cnt = layout->cnt;
+
+ if (IA_ISDIR(fd->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, xattr,
+ flags, xdata, &op_errno);
+ if (ret)
+ goto err;
+ } else {
+ local->call_cnt = 1;
+ local->rebalance.xattr = dict_ref(xattr);
+ local->rebalance.flags = flags;
+
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+ if (ret) {
+ gf_msg_debug(this->name, 0,
+ "Failed to set dictionary key %s for fd=%p",
+ DHT_IATT_IN_XDATA_KEY, fd);
+ }
- return 0;
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->fsetxattr, fd, xattr, flags,
+ local->xattr_req);
+ }
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+static int
+dht_checking_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ int i = -1;
+ int ret = -1;
+ char *value = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *prev = NULL;
+ int this_call_cnt = 0;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (op_ret == -1)
+ goto out;
+
+ ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value);
+ if (ret)
+ goto out;
+
+ if (!strcmp(value, local->key)) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev)
+ conf->decommissioned_bricks[i] = prev;
+ }
+ }
- return 0;
+out:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, ENOTSUP, NULL);
+ }
+ return 0;
}
+static int
+dht_nuke_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, NULL);
+ return 0;
+}
+
+static int
+dht_nuke_dir(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *tmp)
+{
+ if (!IA_ISDIR(loc->inode->ia_type)) {
+ DHT_STACK_UNWIND(setxattr, frame, -1, ENOTSUP, NULL);
+ return 0;
+ }
+
+ /* Setxattr didn't need the parent, but rmdir does. */
+ loc->parent = inode_parent(loc->inode, NULL, NULL);
+ if (!loc->parent) {
+ DHT_STACK_UNWIND(setxattr, frame, -1, ENOENT, NULL);
+ return 0;
+ }
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+
+ if (!loc->name && loc->path) {
+ loc->name = strrchr(loc->path, '/');
+ if (loc->name) {
+ ++(loc->name);
+ }
+ }
+
+ /*
+ * We do this instead of calling dht_rmdir_do directly for two reasons.
+ * The first is that we want to reuse all of the initialization that
+ * dht_rmdir does, so if it ever changes we'll just follow along. The
+ * second (i.e. why we don't use STACK_WIND_TAIL) is so that we don't
+ * obscure the fact that we came in via this path instead of a genuine
+ * rmdir. That makes debugging just a tiny bit easier.
+ */
+ STACK_WIND(frame, dht_nuke_dir_cbk, this, this->fops->rmdir, loc, 1, NULL);
+
+ return 0;
+}
int
-dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
+dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr,
+ int flags, dict_t *xdata)
{
- dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int op_errno = EINVAL;
+ int ret = -1;
+ data_t *tmp = NULL;
+ uint32_t dir_spread = 0;
+ char value[4096] = {
+ 0,
+ };
+ gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA;
+ int call_cnt = 0;
+ uint32_t new_hash = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, err);
+
+ methods = &(conf->methods);
+
+ /* Rebalance daemon is allowed to set internal keys */
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_SETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->call_cnt = call_cnt = layout->cnt;
+ tmp = dict_get(xattr, conf->mds_xattr_key);
+ if (tmp) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ tmp = dict_get(xattr, GF_XATTR_FILE_MIGRATE_KEY);
+ if (tmp) {
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
- if (op_ret == -1) {
- goto out;
+ /* TODO: need to interpret the 'value' for more meaning
+ (ie, 'target' subvolume given there, etc) */
+ memcpy(value, tmp->data, tmp->len);
+ if (strcmp(value, "force") == 0)
+ forced_rebalance = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS;
+
+ if (conf->decommission_in_progress)
+ forced_rebalance = GF_DHT_MIGRATE_HARDLINK;
+
+ if (!loc->path) {
+ op_errno = EINVAL;
+ goto err;
}
- local = frame->local;
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
-
- prebuf->ia_ino = local->ia_ino;
- postbuf->ia_ino = local->ia_ino;
+ if (!local->loc.name)
+ local->loc.name = strrchr(local->loc.path, '/') + 1;
-out:
- DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf);
+ if (!local->loc.parent)
+ local->loc.parent = inode_parent(local->loc.inode, NULL, NULL);
+
+ if ((!local->loc.name) || (!local->loc.parent)) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (gf_uuid_is_null(local->loc.pargfid))
+ gf_uuid_copy(local->loc.pargfid, local->loc.parent->gfid);
+
+ methods->migration_get_dst_subvol(this, local);
+
+ if (!local->rebalance.target_node) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for %s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->rebalance.from_subvol = local->cached_subvol;
+ if (local->rebalance.target_node == local->rebalance.from_subvol) {
+ op_errno = EEXIST;
+ goto err;
+ }
+ if (local->rebalance.target_node) {
+ local->flags = forced_rebalance;
+
+ frame->root->pid = GF_CLIENT_PID_DEFRAG;
+
+ ret = dht_start_rebalance_task(this, frame);
+ if (!ret)
+ return 0;
+
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED,
+ "%s: failed to create a new rebalance synctask", loc->path);
+ }
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ tmp = dict_get(xattr, "decommission-brick");
+ if (tmp) {
+ /* This operation should happen only on '/' */
+ if (!__is_root_gfid(loc->inode->gfid)) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ memcpy(value, tmp->data, min(tmp->len, 4095));
+ local->key = gf_strdup(value);
+ local->call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* Get the pathinfo, and then compare */
+ STACK_WIND_COOKIE(frame, dht_checking_pathinfo_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->getxattr, loc,
+ GF_XATTR_PATHINFO_KEY, NULL);
+ }
return 0;
-}
+ }
+
+ tmp = dict_get(xattr, GF_XATTR_FIX_LAYOUT_KEY);
+ if (tmp) {
+ ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash);
+ if (ret == 0) {
+ gf_msg_debug(this->name, 0,
+ "updating commit hash for %s from %u to %u",
+ uuid_utoa(loc->gfid), layout->commit_hash, new_hash);
+ layout->commit_hash = new_hash;
+
+ ret = dht_update_commit_hash_for_layout(frame);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
+ }
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_FIX_LAYOUT_INFO,
+ "fixing the layout of %s", loc->path);
-int
-dht_writev (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iovec *vector, int count, off_t off,
- struct iobref *iobref)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
- local = dht_local_init (frame);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
+ ret = dht_fix_directory_layout(frame, dht_fix_layout_setxattr_cbk,
+ layout);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
+ }
+
+ tmp = dict_get(xattr, "distribute.directory-spread-count");
+ if (tmp) {
+ /* Setxattr value is packed as 'binary', not string */
+ memcpy(value, tmp->data, min(tmp->len, 4095));
+ ret = gf_string2uint32(value, &dir_spread);
+ if (!ret && ((dir_spread <= conf->subvolume_cnt) && (dir_spread > 0))) {
+ layout->spread_cnt = dir_spread;
+
+ ret = dht_fix_directory_layout(frame, dht_common_setxattr_cbk,
+ layout);
+ if (ret) {
+ op_errno = ENOTCONN;
goto err;
+ }
+ return ret;
}
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_OPERATION_NOT_SUP,
+ "wrong 'directory-spread-count' value (%s)", value);
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ tmp = dict_get(xattr, "glusterfs.dht.nuke");
+ if (tmp) {
+ return dht_nuke_dir(frame, this, loc, tmp);
+ }
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, xattr,
+ flags, xdata, &op_errno);
+ if (ret)
+ goto err;
+ } else {
+ local->rebalance.xattr = dict_ref(xattr);
+ local->rebalance.flags = flags;
+ local->call_cnt = 1;
- local->ia_ino = fd->inode->ino;
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
- STACK_WIND (frame, dht_writev_cbk,
- subvol, subvol->fops->writev,
- fd, vector, count, off, iobref);
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->setxattr, loc, xattr, flags,
+ local->xattr_req);
+ }
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
-int
-dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+static int
+dht_removexattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+ if (!frame || !frame->local)
+ goto err;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ local = frame->local;
+ op_errno = local->op_errno;
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ local->call_cnt = 2; /* This is the second attempt */
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ local->rebalance.xdata);
+ return 0;
+ }
- local->fd = fd_ref (fd);
- local->call_cnt = 1;
+ if (subvol == NULL)
+ goto err;
- STACK_WIND (frame, dht_err_cbk,
- subvol, subvol->fops->flush, fd);
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->removexattr, &local->loc, local->key,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->fremovexattr, local->fd, local->key,
+ local->xattr_req);
+ }
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (flush, frame, -1, op_errno);
-
- return 0;
+ DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+ return 0;
}
-
int
-dht_fsync (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int datasync)
+dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ int ret = -1;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ struct iatt *stbuf = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol1 = NULL, *subvol2 = NULL;
+ local = frame->local;
+ prev = cookie;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ local->op_errno = op_errno;
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ if ((local->fop == GF_FOP_FREMOVEXATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
- local->call_cnt = 1;
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
- local->ia_ino = fd->inode->ino;
+ if (local->call_cnt != 1)
+ goto out;
- STACK_WIND (frame, dht_fsync_cbk,
- subvol, subvol->fops->fsync,
- fd, datasync);
+ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
- return 0;
+ if ((!op_ret) && !stbuf) {
+ goto out;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL);
+ local->op_ret = 0;
- return 0;
-}
+ local->rebalance.target_op_fn = dht_removexattr2;
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Phase 1 of migration */
+ if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ dht_removexattr2(this, subvol2, frame, 0);
+ return 0;
+ }
+
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+ } else {
+ DHT_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
+ }
+ return 0;
+}
int
-dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct gf_flock *flock)
+dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata)
{
- DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock);
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int call_cnt = 0;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err);
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_REMOVEXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!local->layout) {
+ gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+
+ local->call_cnt = call_cnt = layout->cnt;
+ local->key = gf_strdup(key);
+
+ if (key && (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0)) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, NULL, 0,
+ local->xattr_req, &op_errno);
+ if (ret)
+ goto err;
+
+ } else {
+ local->call_cnt = 1;
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to "
+ "set dictionary key %s for %s",
+ DHT_IATT_IN_XDATA_KEY, loc->path);
+ }
- return 0;
-}
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->removexattr, loc, key,
+ local->xattr_req);
+ }
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+
+ return 0;
+}
int
-dht_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int cmd, struct gf_flock *flock)
+dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int call_cnt = 0;
+ dht_conf_t *conf = 0;
+ int ret = 0;
+
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err);
+
+ VALIDATE_OR_GOTO(frame, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FREMOVEXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for inode=%s",
+ uuid_utoa(fd->inode->gfid));
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!local->layout) {
+ gf_msg_debug(this->name, 0, "no layout for inode=%s",
+ uuid_utoa(fd->inode->gfid));
+ op_errno = EINVAL;
+ goto err;
+ }
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+
+ local->call_cnt = call_cnt = layout->cnt;
+ local->key = gf_strdup(key);
+
+ if (IA_ISDIR(fd->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, NULL, 0,
+ local->xattr_req, &op_errno);
+ if (ret)
+ goto err;
+
+ } else {
+ local->call_cnt = 1;
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to "
+ "set dictionary key %s for fd=%p",
+ DHT_IATT_IN_XDATA_KEY, fd);
+ }
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->fremovexattr, fd, key,
+ local->xattr_req);
+ }
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ return 0;
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
- STACK_WIND (frame, dht_lk_cbk,
- subvol, subvol->fops->lk,
- fd, cmd, flock);
+ return 0;
+}
- return 0;
+int
+dht_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, fd_t *fd, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL);
+ local = frame->local;
+ prev = cookie;
- return 0;
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
+ }
+
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, local->fd,
+ NULL);
+
+ return 0;
}
/*
* dht_normalize_stats -
*/
static void
-dht_normalize_stats (struct statvfs *buf, unsigned long bsize,
- unsigned long frsize)
+dht_normalize_stats(struct statvfs *buf, unsigned long bsize,
+ unsigned long frsize)
{
- double factor = 0;
+ double factor = 0;
+
+ if (buf->f_bsize != bsize) {
+ buf->f_bsize = bsize;
+ }
+
+ if (buf->f_frsize != frsize) {
+ factor = ((double)buf->f_frsize) / frsize;
+ buf->f_frsize = frsize;
+ buf->f_blocks = (fsblkcnt_t)(factor * buf->f_blocks);
+ buf->f_bfree = (fsblkcnt_t)(factor * buf->f_bfree);
+ buf->f_bavail = (fsblkcnt_t)(factor * buf->f_bavail);
+ }
+}
- if (buf->f_bsize != bsize) {
- buf->f_bsize = bsize;
+static int
+dht_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
+{
+ gf_boolean_t event = _gf_false;
+ qdstatfs_action_t action = qdstatfs_action_OFF;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int bsize = 0;
+ int frsize = 0;
+ GF_UNUSED int ret = 0;
+ unsigned long new_usage = 0;
+ unsigned long cur_usage = 0;
+
+ local = frame->local;
+ GF_ASSERT(local);
+
+ if (xdata)
+ ret = dict_get_int8(xdata, "quota-deem-statfs", (int8_t *)&event);
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto unlock;
+ }
+ if (!statvfs) {
+ op_errno = EINVAL;
+ local->op_ret = -1;
+ goto unlock;
+ }
+ local->op_ret = 0;
+
+ if (local->quota_deem_statfs) {
+ if (event == _gf_true) {
+ action = qdstatfs_action_COMPARE;
+ } else {
+ action = qdstatfs_action_NEGLECT;
+ }
+ } else {
+ if (event == _gf_true) {
+ action = qdstatfs_action_REPLACE;
+ local->quota_deem_statfs = _gf_true;
+ }
}
- if (buf->f_frsize != frsize) {
- factor = ((double) buf->f_frsize) / frsize;
- buf->f_frsize = frsize;
- buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks);
- buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree);
- buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail);
+ if (local->quota_deem_statfs) {
+ switch (action) {
+ case qdstatfs_action_NEGLECT:
+ goto unlock;
+
+ case qdstatfs_action_REPLACE:
+ local->statvfs = *statvfs;
+ goto unlock;
+
+ case qdstatfs_action_COMPARE:
+ new_usage = statvfs->f_blocks - statvfs->f_bfree;
+ cur_usage = local->statvfs.f_blocks -
+ local->statvfs.f_bfree;
+
+ /* Take the max of the usage from subvols */
+ if (new_usage >= cur_usage)
+ local->statvfs = *statvfs;
+ goto unlock;
+ default:
+ break;
+ }
}
-}
-int
-dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct statvfs *statvfs)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- int bsize = 0;
- int frsize = 0;
-
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto unlock;
- }
- local->op_ret = 0;
-
- if (local->statvfs.f_bsize != 0) {
- bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
- frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
- dht_normalize_stats(&local->statvfs, bsize, frsize);
- dht_normalize_stats(statvfs, bsize, frsize);
- } else {
- local->statvfs.f_bsize = statvfs->f_bsize;
- local->statvfs.f_frsize = statvfs->f_frsize;
- }
+ if (local->statvfs.f_bsize != 0) {
+ bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
+ frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
+ dht_normalize_stats(&local->statvfs, bsize, frsize);
+ dht_normalize_stats(statvfs, bsize, frsize);
+ } else {
+ local->statvfs.f_bsize = statvfs->f_bsize;
+ local->statvfs.f_frsize = statvfs->f_frsize;
+ }
- local->statvfs.f_blocks += statvfs->f_blocks;
- local->statvfs.f_bfree += statvfs->f_bfree;
- local->statvfs.f_bavail += statvfs->f_bavail;
- local->statvfs.f_files += statvfs->f_files;
- local->statvfs.f_ffree += statvfs->f_ffree;
- local->statvfs.f_favail += statvfs->f_favail;
- local->statvfs.f_fsid = statvfs->f_fsid;
- local->statvfs.f_flag = statvfs->f_flag;
- local->statvfs.f_namemax = statvfs->f_namemax;
-
- }
+ local->statvfs.f_blocks += statvfs->f_blocks;
+ local->statvfs.f_bfree += statvfs->f_bfree;
+ local->statvfs.f_bavail += statvfs->f_bavail;
+ local->statvfs.f_files += statvfs->f_files;
+ local->statvfs.f_ffree += statvfs->f_ffree;
+ local->statvfs.f_favail += statvfs->f_favail;
+ local->statvfs.f_fsid = statvfs->f_fsid;
+ local->statvfs.f_flag = statvfs->f_flag;
+ local->statvfs.f_namemax = statvfs->f_namemax;
+ }
unlock:
- UNLOCK (&frame->lock);
+ UNLOCK(&frame->lock);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
+ &local->statvfs, xdata);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno,
- &local->statvfs);
-
- return 0;
+ return 0;
}
-
int
-dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
+dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- int i = -1;
-
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+ inode_t *inode = NULL;
+ inode_table_t *itable = NULL;
+ static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+ loc_t newloc = {
+ 0,
+ };
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, NULL, NULL, GF_FOP_STATFS);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (loc->inode && !IA_ISDIR(loc->inode->ia_type)) {
+ itable = loc->inode->table;
+ if (!itable) {
+ op_errno = EINVAL;
+ goto err;
+ }
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (this->private, err);
+ loc = &local->loc2;
- conf = this->private;
+ inode = inode_find(itable, root_gfid);
+ if (!inode) {
+ op_errno = EINVAL;
+ goto err;
+ }
- local = dht_local_init (frame);
- local->call_cnt = conf->subvolume_cnt;
+ dht_build_root_loc(inode, &newloc);
+ loc = &newloc;
+ }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_statfs_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->statfs, loc);
- }
+ local->call_cnt = conf->subvolume_cnt;
- return 0;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND(frame, dht_statfs_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->statfs, loc, xdata);
+ }
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
-
int
-dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
-{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int op_errno = -1;
- int i = -1;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- conf = this->private;
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->fd = fd_ref (fd);
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->call_cnt = conf->subvolume_cnt;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_fd_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->opendir,
- loc, fd);
- }
-
- return 0;
+dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+ int ret = 0;
+ gf_boolean_t new_xdata = _gf_false;
+ xlator_t **subvolumes = NULL;
+ int call_count = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, fd, GF_FOP_OPENDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ local->first_up_subvol = dht_first_up_subvol(this);
+
+ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ new_xdata = _gf_true;
+ }
+
+ ret = dict_set_uint32(xdata, conf->link_xattr_name, 256);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value : key = %s",
+ conf->link_xattr_name);
+
+ /* dht_readdirp will wind to all subvols so open has to be sent to
+ * all subvols whether or not conf->local_subvols is set */
+
+ call_count = local->call_cnt = conf->subvolume_cnt;
+ subvolumes = conf->subvolumes;
+
+ /* In case of parallel-readdir, the readdir-ahead will be loaded
+ * below dht, in this case, if we want to enable or disable SKIP_DIRs
+ * it has to be done in opendir, so that prefetching logic in
+ * readdir-ahead, honors it */
+ for (i = 0; i < call_count; i++) {
+ if (conf->readdir_optimize == _gf_true) {
+ if (subvolumes[i] != local->first_up_subvol) {
+ ret = dict_set_int32(xdata, GF_READDIR_SKIP_DIRS, 1);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary"
+ " value :key = %s, ret:%d",
+ GF_READDIR_SKIP_DIRS, ret);
+ }
+ }
+
+ STACK_WIND_COOKIE(frame, dht_fd_cbk, subvolumes[i], subvolumes[i],
+ subvolumes[i]->fops->opendir, loc, fd, xdata);
+ dict_del(xdata, GF_READDIR_SKIP_DIRS);
+ }
+
+ if (new_xdata)
+ dict_unref(xdata);
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(opendir, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
+/* dht_readdirp_cbk creates a new dentry and dentry->inode is not assigned.
+ This functions assigns an inode if all of the following conditions are
+ true:
-int
-dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, gf_dirent_t *orig_entries)
-{
- dht_local_t *local = NULL;
- gf_dirent_t entries;
- gf_dirent_t *orig_entry = NULL;
- gf_dirent_t *entry = NULL;
- call_frame_t *prev = NULL;
- xlator_t *next_subvol = NULL;
- off_t next_offset = 0;
- int count = 0;
- dht_layout_t *layout = 0;
- dht_conf_t *conf = NULL;
- xlator_t *subvol = 0;
-
- INIT_LIST_HEAD (&entries.list);
- prev = cookie;
- local = frame->local;
- conf = this->private;
-
- if (op_ret < 0)
- goto done;
-
- if (!local->layout)
- local->layout = dht_layout_get (this, local->fd->inode);
-
- layout = local->layout;
+ * DHT has only one child. In this case the entire layout is present on
+ this single child and hence we can set complete layout in inode.
+ * backend has complete layout and there are no anomalies in it and from
+ this information layout can be constructed and set in inode.
+*/
- list_for_each_entry (orig_entry, (&orig_entries->list), list) {
- next_offset = orig_entry->d_off;
+static void
+dht_populate_inode_for_dentry(xlator_t *this, xlator_t *subvol,
+ gf_dirent_t *entry, gf_dirent_t *orig_entry)
+{
+ dht_layout_t *layout = NULL;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+
+ if (gf_uuid_is_null(orig_entry->d_stat.ia_gfid)) {
+ /* this skips the '..' entry for the root of the volume */
+ return;
+ }
+
+ gf_uuid_copy(loc.gfid, orig_entry->d_stat.ia_gfid);
+ loc.inode = inode_ref(orig_entry->inode);
+
+ if (is_revalidate(&loc)) {
+ goto out;
+ }
+
+ layout = dht_layout_new(this, 1);
+ if (!layout)
+ goto out;
+
+ ret = dht_layout_merge(this, layout, subvol, 0, 0, orig_entry->dict);
+ if (!ret) {
+ ret = dht_layout_normalize(this, &loc, layout);
+ if (ret == 0) {
+ dht_layout_set(this, orig_entry->inode, layout);
+ entry->inode = inode_ref(orig_entry->inode);
+ layout = NULL;
+ }
+ }
- if (check_is_linkfile (NULL, (&orig_entry->d_stat), NULL)
- || (check_is_dir (NULL, (&orig_entry->d_stat), NULL)
- && (prev->this != dht_first_up_subvol (this)))) {
- continue;
- }
+ if (layout)
+ dht_layout_unref(this, layout);
- entry = gf_dirent_for_name (orig_entry->d_name);
- if (!entry) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto unwind;
- }
+out:
+ loc_wipe(&loc);
+ return;
+}
- /* Do this if conf->search_unhashed is set to "auto" */
- if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
- subvol = dht_layout_search (this, layout,
- orig_entry->d_name);
- if (!subvol || (subvol != prev->this)) {
- /* TODO: Count the number of entries which need
- linkfile to prove its existance in fs */
- layout->search_unhashed++;
- }
- }
- entry->d_stat = orig_entry->d_stat;
-
- dht_itransform (this, prev->this, orig_entry->d_ino,
- &entry->d_ino);
- dht_itransform (this, prev->this, orig_entry->d_off,
- &entry->d_off);
-
- entry->d_stat.ia_ino = entry->d_ino;
- entry->d_type = orig_entry->d_type;
- entry->d_len = orig_entry->d_len;
-
- list_add_tail (&entry->list, &entries.list);
- count++;
- }
- op_ret = count;
- /* We need to ensure that only the last subvolume's end-of-directory
- * notification is respected so that directory reading does not stop
- * before all subvolumes have been read. That could happen because the
- * posix for each subvolume sends a ENOENT on end-of-directory but in
- * distribute we're not concerned only with a posix's view of the
- * directory but the aggregated namespace' view of the directory.
+/* Posix returns op_errno = ENOENT to indicate that there are no more
+ * entries
+ */
+static int
+dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *orig_entry = NULL;
+ gf_dirent_t *entry = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *next_subvol = NULL;
+ off_t next_offset = 0;
+ int count = 0;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ xlator_t *subvol = 0;
+ xlator_t *hashed_subvol = 0;
+ int ret = 0;
+ int readdir_optimize = 0;
+ inode_table_t *itable = NULL;
+ inode_t *inode = NULL;
+ gf_boolean_t skip_hashed_check = _gf_false;
+
+ INIT_LIST_HEAD(&entries.list);
+
+ prev = cookie;
+ local = frame->local;
+ GF_VALIDATE_OR_GOTO(this->name, local->fd, unwind);
+
+ itable = local->fd->inode->table;
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
+
+ methods = &(conf->methods);
+
+ if (op_ret <= 0) {
+ goto done;
+ }
+
+ /* Why aren't we skipping DHT entirely in case of a single subvol?
+ * Because if this was a larger volume earlier and all but one subvol
+ * was removed, there might be stale linkto files on the subvol.
+ */
+ if (conf->subvolume_cnt == 1) {
+ /* return all directory and file entries except
+ * linkto files for a single child DHT
*/
- if (prev->this != dht_last_up_subvol (this))
- op_errno = 0;
+ skip_hashed_check = _gf_true;
+ }
-done:
- if (count == 0) {
- /* non-zero next_offset means that
- EOF is not yet hit on the current subvol
- */
- if (next_offset == 0) {
- next_subvol = dht_subvol_next (this, prev->this);
- } else {
- next_subvol = prev->this;
+ if (!local->layout)
+ local->layout = dht_layout_get(this, local->fd->inode);
+
+ layout = local->layout;
+
+ /* This will skip the entries on the subvol without a layout,
+ * hence preventing the crash but rmdir might fail with
+ * "directory not empty" errors*/
+
+ if (layout == NULL)
+ goto done;
+
+ if (conf->readdir_optimize == _gf_true)
+ readdir_optimize = 1;
+
+ gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name);
+
+ list_for_each_entry(orig_entry, (&orig_entries->list), list)
+ {
+ next_offset = orig_entry->d_off;
+
+ gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name,
+ orig_entry->d_name, orig_entry->d_type);
+
+ if (IA_ISINVAL(orig_entry->d_stat.ia_type)) {
+ /*stat failed somewhere- display this entry but the data may
+ * be inaccurate.
+ */
+ gf_msg_debug(this->name, EINVAL, "Invalid stat for %s (gfid %s)",
+ orig_entry->d_name,
+ uuid_utoa(orig_entry->d_stat.ia_gfid));
+ }
+
+ if (check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict,
+ conf->link_xattr_name)) {
+ gf_msg_debug(this->name, 0, "%s: %s is a linkto file", prev->name,
+ orig_entry->d_name);
+ continue;
+ }
+
+ if (skip_hashed_check) {
+ goto list;
+ }
+
+ if (check_is_dir(NULL, (&orig_entry->d_stat), NULL)) {
+ /*Directory entries filtering :
+ * a) If rebalance is running, pick from first_up_subvol
+ * b) (rebalance not running)hashed subvolume is NULL or
+ * down then filter in first_up_subvolume. Other wise the
+ * corresponding hashed subvolume will take care of the
+ * directory entry.
+ */
+ if (readdir_optimize) {
+ if (prev == local->first_up_subvol)
+ goto list;
+ else
+ continue;
+ }
+
+ hashed_subvol = methods->layout_search(this, layout,
+ orig_entry->d_name);
+
+ if (prev == hashed_subvol)
+ goto list;
+ if ((hashed_subvol && dht_subvol_status(conf, hashed_subvol)) ||
+ (prev != local->first_up_subvol))
+ continue;
+
+ goto list;
+ }
+
+ list:
+ entry = gf_dirent_for_name(orig_entry->d_name);
+ if (!entry) {
+ goto unwind;
+ }
+
+ /* Do this if conf->search_unhashed is set to "auto" */
+ if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
+ subvol = methods->layout_search(this, layout, orig_entry->d_name);
+ if (!subvol || (subvol != prev)) {
+ /* TODO: Count the number of entries which need
+ linkfile to prove its existence in fs */
+ layout->search_unhashed++;
+ }
+ }
+
+ entry->d_off = orig_entry->d_off;
+ entry->d_stat = orig_entry->d_stat;
+ entry->d_ino = orig_entry->d_ino;
+ entry->d_type = orig_entry->d_type;
+ entry->d_len = orig_entry->d_len;
+
+ if (orig_entry->dict)
+ entry->dict = dict_ref(orig_entry->dict);
+
+ /* making sure we set the inode ctx right with layout,
+ currently possible only for non-directories, so for
+ directories don't set entry inodes */
+ if (IA_ISDIR(entry->d_stat.ia_type)) {
+ entry->d_stat.ia_blocks = DHT_DIR_STAT_BLOCKS;
+ entry->d_stat.ia_size = DHT_DIR_STAT_SIZE;
+ if (orig_entry->inode) {
+ dht_inode_ctx_time_update(orig_entry->inode, this,
+ &entry->d_stat, 1);
+
+ if (conf->subvolume_cnt == 1) {
+ dht_populate_inode_for_dentry(this, prev, entry,
+ orig_entry);
+ }
+ }
+ } else {
+ if (orig_entry->dict &&
+ dict_get(orig_entry->dict, conf->link_xattr_name)) {
+ /* Strip out the S and T flags set by rebalance*/
+ DHT_STRIP_PHASE1_FLAGS(&entry->d_stat);
+ }
+
+ if (orig_entry->inode) {
+ ret = dht_layout_preset(this, prev, orig_entry->inode);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to link the layout "
+ "in inode for %s",
+ orig_entry->d_name);
+
+ entry->inode = inode_ref(orig_entry->inode);
+ } else if (itable) {
+ /*
+ * orig_entry->inode might be null if any upper
+ * layer xlators below client set to null, to
+ * force a lookup on the inode even if the inode
+ * is present in the inode table. In that case
+ * we just update the ctx to make sure we didn't
+ * missed anything.
+ */
+ inode = inode_find(itable, orig_entry->d_stat.ia_gfid);
+ if (inode) {
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to link the layout"
+ " in inode for %s",
+ orig_entry->d_name);
+ inode_unref(inode);
+ inode = NULL;
}
+ }
+ }
- if (!next_subvol) {
- goto unwind;
- }
+ gf_msg_debug(this->name, 0, "%s: Adding entry = %s", prev->name,
+ entry->d_name);
- STACK_WIND (frame, dht_readdirp_cbk,
- next_subvol, next_subvol->fops->readdirp,
- local->fd, local->size, next_offset);
- return 0;
- }
+ list_add_tail(&entry->list, &entries.list);
+ count++;
+ }
-unwind:
- if (op_ret < 0)
- op_ret = 0;
+done:
- DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries);
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends a ENOENT on end-of-directory but in
+ * distribute we're not concerned only with a posix's view of the
+ * directory but the aggregated namespace' view of the directory.
+ * Possible values:
+ * op_ret == 0 and op_errno != 0
+ * if op_errno != ENOENT : Error.Unwind.
+ * if op_errno == ENOENT : There are no more entries on this subvol.
+ * Move to the next one.
+ * op_ret > 0 and count == 0 :
+ * The subvol returned entries to dht but all were stripped out.
+ * For example, if they were linkto files or dirs where
+ * hashed_subvol != prev. Try to get some entries by winding
+ * to the next subvol. This can be dangerous if parallel readdir
+ * is enabled as it grows the stack.
+ *
+ * op_ret > 0 and count > 0:
+ * We found some entries. Unwind even if the buffer is not full.
+ *
+ */
+
+ op_ret = count;
+ if (count == 0) {
+ /* non-zero next_offset means that
+ * EOF is not yet hit on the current subvol
+ */
+ if ((next_offset == 0) || (op_errno == ENOENT)) {
+ next_offset = 0;
+ next_subvol = dht_subvol_next(this, prev);
+ } else {
+ next_subvol = prev;
+ }
+
+ if (!next_subvol) {
+ goto unwind;
+ }
- gf_dirent_free (&entries);
+ if (conf->readdir_optimize == _gf_true) {
+ if (next_subvol != local->first_up_subvol) {
+ ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value"
+ ":key = %s",
+ GF_READDIR_SKIP_DIRS);
+ } else {
+ dict_del(local->xattr, GF_READDIR_SKIP_DIRS);
+ }
+ }
+ STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol,
+ next_subvol->fops->readdirp, local->fd, local->size,
+ next_offset, local->xattr);
return 0;
+ }
+
+unwind:
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends a ENOENT on end-of-directory but in
+ * distribute we're not concerned only with a posix's view of the
+ * directory but the aggregated namespace' view of the directory.
+ */
+ if (op_ret < 0)
+ op_ret = 0;
+
+ if (prev != dht_last_up_subvol(this))
+ op_errno = 0;
+
+ DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
+
+ gf_dirent_free(&entries);
+ return 0;
}
+static int
+dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *orig_entry = NULL;
+ gf_dirent_t *entry = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *next_subvol = NULL;
+ off_t next_offset = 0;
+ int count = 0;
+ dht_layout_t *layout = 0;
+ xlator_t *subvol = 0;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ gf_boolean_t skip_hashed_check = _gf_false;
+ INIT_LIST_HEAD(&entries.list);
-int
-dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *orig_entries)
-{
- dht_local_t *local = NULL;
- gf_dirent_t entries;
- gf_dirent_t *orig_entry = NULL;
- gf_dirent_t *entry = NULL;
- call_frame_t *prev = NULL;
- xlator_t *next_subvol = NULL;
- off_t next_offset = 0;
- int count = 0;
- dht_layout_t *layout = 0;
- dht_conf_t *conf = NULL;
- xlator_t *subvol = 0;
-
- INIT_LIST_HEAD (&entries.list);
- prev = cookie;
- local = frame->local;
- conf = this->private;
-
- if (op_ret < 0)
- goto done;
-
- if (!local->layout)
- local->layout = dht_layout_get (this, local->fd->inode);
+ prev = cookie;
+ local = frame->local;
- layout = local->layout;
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, done);
- list_for_each_entry (orig_entry, (&orig_entries->list), list) {
- next_offset = orig_entry->d_off;
+ methods = &(conf->methods);
- subvol = dht_layout_search (this, layout, orig_entry->d_name);
+ if (op_ret <= 0)
+ goto done;
- if (!subvol || (subvol == prev->this)) {
- entry = gf_dirent_for_name (orig_entry->d_name);
- if (!entry) {
- gf_log (this->name, GF_LOG_ERROR,
- "memory allocation failed :(");
- goto unwind;
- }
+ if (!local->layout)
+ local->layout = dht_layout_get(this, local->fd->inode);
- dht_itransform (this, prev->this, orig_entry->d_ino,
- &entry->d_ino);
- dht_itransform (this, prev->this, orig_entry->d_off,
- &entry->d_off);
+ layout = local->layout;
- entry->d_type = orig_entry->d_type;
- entry->d_len = orig_entry->d_len;
+ gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name);
- list_add_tail (&entry->list, &entries.list);
- count++;
- }
- }
- op_ret = count;
- /* We need to ensure that only the last subvolume's end-of-directory
- * notification is respected so that directory reading does not stop
- * before all subvolumes have been read. That could happen because the
- * posix for each subvolume sends a ENOENT on end-of-directory but in
- * distribute we're not concerned only with a posix's view of the
- * directory but the aggregated namespace' view of the directory.
- */
- if (prev->this != dht_last_up_subvol (this))
- op_errno = 0;
+ if (conf->subvolume_cnt == 1) {
+ /*return everything*/
+ skip_hashed_check = _gf_true;
+ count = op_ret;
+ goto done;
+ }
+
+ list_for_each_entry(orig_entry, (&orig_entries->list), list)
+ {
+ next_offset = orig_entry->d_off;
+
+ gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name,
+ orig_entry->d_name, orig_entry->d_type);
+
+ subvol = methods->layout_search(this, layout, orig_entry->d_name);
+
+ if (!subvol || (subvol == prev)) {
+ entry = gf_dirent_for_name(orig_entry->d_name);
+ if (!entry) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "Memory allocation failed ");
+ goto unwind;
+ }
+
+ entry->d_off = orig_entry->d_off;
+ entry->d_ino = orig_entry->d_ino;
+ entry->d_type = orig_entry->d_type;
+ entry->d_len = orig_entry->d_len;
+ gf_msg_debug(this->name, 0, "%s: Adding = entry %s", prev->name,
+ entry->d_name);
+
+ list_add_tail(&entry->list, &entries.list);
+ count++;
+ }
+ }
done:
- if (count == 0) {
- /* non-zero next_offset means that
- EOF is not yet hit on the current subvol
- */
- if (next_offset == 0) {
- next_subvol = dht_subvol_next (this, prev->this);
- } else {
- next_subvol = prev->this;
- }
+ op_ret = count;
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends a ENOENT on end-of-directory but in
+ * distribute we're not concerned only with a posix's view of the
+ * directory but the aggregated namespace' view of the directory.
+ */
+ if (count == 0) {
+ if ((next_offset == 0) || (op_errno == ENOENT)) {
+ next_offset = 0;
+ next_subvol = dht_subvol_next(this, prev);
+ } else {
+ next_subvol = prev;
+ }
- if (!next_subvol) {
- goto unwind;
- }
+ if (!next_subvol) {
+ goto unwind;
+ }
- STACK_WIND (frame, dht_readdir_cbk,
- next_subvol, next_subvol->fops->readdir,
- local->fd, local->size, next_offset);
- return 0;
- }
+ STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol,
+ next_subvol->fops->readdir, local->fd, local->size,
+ next_offset, NULL);
+ return 0;
+ }
unwind:
- if (op_ret < 0)
- op_ret = 0;
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends a ENOENT on end-of-directory but in
+ * distribute we're not concerned only with a posix's view of the
+ * directory but the aggregated namespace' view of the directory.
+ */
+
+ if (prev != dht_last_up_subvol(this))
+ op_errno = 0;
+
+ if (!skip_hashed_check) {
+ DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
+ gf_dirent_free(&entries);
+
+ } else {
+ DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL);
+ }
+ return 0;
+}
- DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries);
+static int
+dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, int whichop, dict_t *dict)
+{
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ xlator_t *xvol = NULL;
+ int ret = 0;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, NULL, NULL, whichop);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->fd = fd_ref(fd);
+ local->size = size;
+ local->xattr_req = (dict) ? dict_ref(dict) : NULL;
+ local->first_up_subvol = dht_first_up_subvol(this);
+ local->op_ret = -1;
+
+ dht_deitransform(this, yoff, &xvol);
+
+ /* TODO: do proper readdir */
+ if (whichop == GF_FOP_READDIRP) {
+ if (dict)
+ local->xattr = dict_ref(dict);
+ else
+ local->xattr = dict_new();
+
+ if (local->xattr) {
+ ret = dict_set_uint32(local->xattr, conf->link_xattr_name, 256);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value"
+ " : key = %s",
+ conf->link_xattr_name);
+
+ if (conf->readdir_optimize == _gf_true) {
+ if (xvol != local->first_up_subvol) {
+ ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_DICT_SET_FAILED,
+ "Failed to set "
+ "dictionary value: "
+ "key = %s",
+ GF_READDIR_SKIP_DIRS);
+ } else {
+ dict_del(local->xattr, GF_READDIR_SKIP_DIRS);
+ }
+ }
+
+ if (conf->subvolume_cnt == 1) {
+ ret = dict_set_uint32(local->xattr, conf->xattr_name, 4 * 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary "
+ "value:key = %s ",
+ conf->xattr_name);
+ }
+ }
+ }
- gf_dirent_free (&entries);
+ STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol,
+ xvol->fops->readdirp, fd, size, yoff, local->xattr);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol,
+ xvol->fops->readdir, fd, size, yoff, local->xattr);
+ }
- return 0;
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
}
+int
+dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, dict_t *xdata)
+{
+ int op = GF_FOP_READDIR;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ op = GF_FOP_READDIRP;
+ break;
+ }
+ }
+
+ if (conf->use_readdirp)
+ op = GF_FOP_READDIRP;
+
+out:
+ dht_do_readdir(frame, this, fd, size, yoff, op, 0);
+ return 0;
+}
int
-dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff, int whichop)
+dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, dict_t *dict)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- xlator_t *xvol = NULL;
- off_t xoff = 0;
+ dht_do_readdir(frame, this, fd, size, yoff, GF_FOP_READDIRP, dict);
+ return 0;
+}
+static int
+dht_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ local = frame->local;
- conf = this->private;
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1)
+ local->op_errno = op_errno;
+ else if (op_ret == 0)
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno,
+ xdata);
+
+ return 0;
+}
- local = dht_local_init (frame);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
+int
+dht_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
- local->fd = fd_ref (fd);
- local->size = size;
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(this->private, err);
- dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff);
+ conf = this->private;
- /* TODO: do proper readdir */
- if (whichop == GF_FOP_READDIR)
- STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir,
- fd, size, xoff);
- else
- STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp,
- fd, size, xoff);
+ local = dht_local_init(frame, NULL, NULL, GF_FOP_FSYNCDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- return 0;
+ local->fd = fd_ref(fd);
+ local->call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND(frame, dht_fsyncdir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->fsyncdir, fd, datasync, xdata);
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
int
-dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff)
+dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- int op = GF_FOP_READDIR;
- dht_conf_t *conf = NULL;
- int i = 0;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ dht_local_t *local = NULL;
+
+ if (op_ret == -1)
+ goto out;
+
+ local = frame->local;
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ prev = cookie;
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret < 0) {
+ gf_msg_debug(this->name, EINVAL,
+ "could not set pre-set layout for subvolume %s",
+ prev ? prev->name : NULL);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+ if (local->linked == _gf_true)
+ dht_linkfile_attr_heal(frame, this);
+out:
+ /*
+ * FIXME: ia_size and st_blocks of preparent and postparent do not have
+ * correct values. since, preparent and postparent buffers correspond
+ * to a directory these two members should have values equal to sum of
+ * corresponding values from each of the subvolume.
+ * See dht_iatt_merge for reference.
+ */
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(postparent);
+ dht_set_fixed_dir_stat(preparent);
+
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ /* store op_errno for failure case*/
+ local->op_errno = op_errno;
+ local->refresh_layout_unlock(frame, this, op_ret, 1);
- conf = this->private;
- if (!conf)
- goto out;
+ if (op_ret == 0) {
+ DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf,
+ preparent, postparent, xdata);
+ }
+ } else {
+ DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf,
+ preparent, postparent, xdata);
+ }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!conf->subvolume_status[i]) {
- op = GF_FOP_READDIRP;
- break;
- }
+ return 0;
+}
+
+static int
+dht_mknod_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ local = frame->local;
+
+ if (!local || !local->cached_subvol) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ conf = this->private;
+ if (!conf) {
+ local->op_errno = EINVAL;
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ cached_subvol = local->cached_subvol;
+
+ if (local->params) {
+ dict_del(local->params, conf->link_xattr_name);
+ dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
+ }
+
+ STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)cached_subvol,
+ cached_subvol, cached_subvol->fops->mknod, &local->loc,
+ local->mode, local->rdev, local->umask, local->params);
+
+ return 0;
+err:
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ local->refresh_layout_unlock(frame, this, -1, 1);
+ } else {
+ DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
+ }
+ return 0;
+}
+
+static int
+dht_mknod_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
+ xlator_t *subvol, loc_t *loc, dev_t rdev,
+ mode_t mode, mode_t umask, dict_t *params)
+{
+ dht_local_t *local = NULL;
+ xlator_t *avail_subvol = NULL;
+
+ local = frame->local;
+
+ if (!dht_is_subvol_filled(this, subvol)) {
+ gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+ subvol->name);
+
+ STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode, rdev, umask, params);
+ } else {
+ avail_subvol = dht_free_disk_available_subvol(this, subvol, local);
+
+ if (avail_subvol != subvol) {
+ local->params = dict_ref(params);
+ local->rdev = rdev;
+ local->mode = mode;
+ local->umask = umask;
+ local->cached_subvol = avail_subvol;
+ local->hashed_subvol = subvol;
+
+ gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)",
+ loc->path, avail_subvol->name, subvol->name);
+
+ dht_linkfile_create(frame, dht_mknod_linkfile_create_cbk, this,
+ avail_subvol, subvol, loc);
+
+ goto out;
}
- if (conf->use_readdirp)
- op = GF_FOP_READDIRP;
+ gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+ subvol->name);
+ STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode, rdev, umask, params);
+ }
out:
- dht_do_readdir (frame, this, fd, size, yoff, op);
- return 0;
+ return 0;
}
-int
-dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff)
+static int32_t
+dht_mknod_do(call_frame_t *frame)
{
- dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP);
- return 0;
-}
+ dht_local_t *local = NULL;
+ dht_layout_t *refreshed = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ local = frame->local;
+ this = THIS;
-int
-dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
+ conf = this->private;
+
+ GF_VALIDATE_OR_GOTO(this->name, conf, err);
+
+ methods = &(conf->methods);
+ /* We don't need parent_loc anymore */
+ loc_wipe(&local->loc);
- local = frame->local;
+ loc_copy(&local->loc, &local->loc2);
- LOCK (&frame->lock);
- {
- if (op_ret == -1)
- local->op_errno = op_errno;
+ loc_wipe(&local->loc2);
- if (op_ret == 0)
- local->op_ret = 0;
- }
- UNLOCK (&frame->lock);
+ refreshed = local->selfheal.refreshed_layout;
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret, local->op_errno);
+ subvol = methods->layout_search(this, refreshed, local->loc.name);
+ if (!subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "no subvolume in "
+ "layout for path=%s",
+ local->loc.path);
+ local->op_errno = ENOENT;
+ goto err;
+ }
+
+ dht_mknod_wind_to_avail_subvol(frame, this, subvol, &local->loc,
+ local->rdev, local->mode, local->umask,
+ local->params);
+ return 0;
+err:
+ local->refresh_layout_unlock(frame, this, -1, 1);
+
+ return 0;
+}
+
+static int32_t
+dht_mknod_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
+
+static int32_t
+dht_mknod_finish(call_frame_t *frame, xlator_t *this, int op_ret,
+ int invoke_cbk)
+{
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ int lock_count = 0;
+
+ local = frame->local;
+ lock_count = dht_lock_count(local->lock[0].layout.parent_layout.locks,
+ local->lock[0].layout.parent_layout.lk_count);
+ if (lock_count == 0)
+ goto done;
+
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL) {
+ goto done;
+ }
+
+ lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+ lock_frame->root->op);
+ if (lock_local == NULL) {
+ goto done;
+ }
+
+ lock_local->lock[0]
+ .layout.parent_layout.locks = local->lock[0].layout.parent_layout.locks;
+ lock_local->lock[0].layout.parent_layout.lk_count =
+ local->lock[0].layout.parent_layout.lk_count;
+
+ local->lock[0].layout.parent_layout.locks = NULL;
+ local->lock[0].layout.parent_layout.lk_count = 0;
+
+ dht_unlock_inodelk(lock_frame,
+ lock_local->lock[0].layout.parent_layout.locks,
+ lock_local->lock[0].layout.parent_layout.lk_count,
+ dht_mknod_unlock_cbk);
+ lock_frame = NULL;
+
+done:
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
+
+ if (op_ret == 0)
return 0;
+
+ DHT_STACK_UNWIND(mknod, frame, op_ret, local->op_errno, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
}
+static int32_t
+dht_mknod_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
-int
-dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync)
+ if (!local) {
+ goto err;
+ }
+
+ if (op_ret < 0) {
+ gf_msg("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "mknod lock failed for file: %s", local->loc2.name);
+
+ local->op_errno = op_errno;
+
+ goto err;
+ }
+
+ local->refresh_layout_unlock = dht_mknod_finish;
+
+ local->refresh_layout_done = dht_mknod_do;
+
+ dht_refresh_layout(frame);
+
+ return 0;
+err:
+ if (local)
+ dht_mknod_finish(frame, this, -1, 0);
+ else
+ DHT_STACK_UNWIND(mknod, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
+}
+
+static int32_t
+dht_mknod_lock(call_frame_t *frame, xlator_t *subvol)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- int i = -1;
+ dht_local_t *local = NULL;
+ int count = 1, ret = -1;
+ dht_lock_t **lk_array = NULL;
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (this->private, err);
+ local = frame->local;
- conf = this->private;
+ lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
+ if (lk_array == NULL)
+ goto err;
- local->fd = fd_ref (fd);
- local->call_cnt = conf->subvolume_cnt;
+ lk_array[0] = dht_lock_new(frame->this, subvol, &local->loc, F_RDLCK,
+ DHT_LAYOUT_HEAL_DOMAIN, NULL,
+ IGNORE_ENOENT_ESTALE);
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_fsyncdir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->fsyncdir,
- fd, datasync);
- }
+ if (lk_array[0] == NULL)
+ goto err;
- return 0;
+ local->lock[0].layout.parent_layout.locks = lk_array;
+ local->lock[0].layout.parent_layout.lk_count = count;
+ ret = dht_blocking_inodelk(frame, lk_array, count, dht_mknod_lock_cbk);
+
+ if (ret < 0) {
+ local->lock[0].layout.parent_layout.locks = NULL;
+ local->lock[0].layout.parent_layout.lk_count = 0;
+ goto err;
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno);
+ if (lk_array != NULL) {
+ dht_lock_array_free(lk_array, count);
+ GF_FREE(lk_array);
+ }
- return 0;
+ return -1;
}
-
-int
-dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent)
+static int
+dht_refresh_parent_layout_resume(call_frame_t *frame, xlator_t *this, int ret,
+ int invoke_cbk)
{
- call_frame_t *prev = NULL;
- int ret = -1;
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL, *parent_local = NULL;
+ call_stub_t *stub = NULL;
+ call_frame_t *parent_frame = NULL;
+ local = frame->local;
- if (op_ret == -1)
- goto out;
+ stub = local->stub;
+ local->stub = NULL;
- local = frame->local;
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ parent_frame = stub->frame;
+ parent_local = parent_frame->local;
- prev = cookie;
+ if (ret < 0) {
+ parent_local->op_ret = -1;
+ parent_local->op_errno = local->op_errno ? local->op_errno : EIO;
+ } else {
+ parent_local->op_ret = 0;
+ }
- dht_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino);
- if (local->loc.parent) {
- preparent->ia_ino = local->loc.parent->ino;
- postparent->ia_ino = local->loc.parent->ino;
-
- WIPE (preparent);
- WIPE (postparent);
- }
-
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not set pre-set layout for subvolume %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
-out:
- /*
- * FIXME: ia_size and st_blocks of preparent and postparent do not have
- * correct values. since, preparent and postparent buffers correspond
- * to a directory these two members should have values equal to sum of
- * corresponding values from each of the subvolume.
- * See dht_iatt_merge for reference.
- */
+ call_resume(stub);
- DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, preparent,
- postparent);
- return 0;
+ DHT_STACK_DESTROY(frame);
+
+ return 0;
}
-int
-dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent)
+static int
+dht_refresh_parent_layout_done(call_frame_t *frame)
{
- dht_local_t *local = NULL;
- xlator_t *cached_subvol = NULL;
+ dht_local_t *local = NULL;
+ int ret = 0;
- if (op_ret == -1)
- goto err;
+ local = frame->local;
- local = frame->local;
- cached_subvol = local->cached_subvol;
+ if (local->op_ret < 0) {
+ ret = -1;
+ goto resume;
+ }
- STACK_WIND (frame, dht_newfile_cbk,
- cached_subvol, cached_subvol->fops->mknod,
- &local->loc, local->mode, local->rdev,
- local->params);
+ dht_layout_set(frame->this, local->loc.inode,
+ local->selfheal.refreshed_layout);
- return 0;
- err:
- DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
+resume:
+ dht_refresh_parent_layout_resume(frame, frame->this, ret, 1);
+ return 0;
}
-int
-dht_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t rdev, dict_t *params)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- int ret = -1;
- xlator_t *avail_subvol = NULL;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
-
- conf = this->private;
-
- dht_get_du_info (frame, this, loc);
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- subvol = dht_subvol_get_hashed (this, loc);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- loc->path);
- op_errno = ENOENT;
- goto err;
- }
-
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
+static int
+dht_handle_parent_layout_change(xlator_t *this, call_stub_t *stub)
+{
+ call_frame_t *refresh_frame = NULL, *frame = NULL;
+ dht_local_t *refresh_local = NULL, *local = NULL;
- if (!dht_is_subvol_filled (this, subvol)) {
- gf_log (this->name, GF_LOG_TRACE,
- "creating %s on %s", loc->path, subvol->name);
-
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, params);
- } else {
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
- if (avail_subvol != subvol) {
- /* Choose the minimum filled volume, and create the
- files there */
-
- local->params = dict_ref (params);
- local->cached_subvol = avail_subvol;
- local->mode = mode;
- local->rdev = rdev;
-
- dht_linkfile_create (frame,
- dht_mknod_linkfile_create_cbk,
- avail_subvol, subvol, loc);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "creating %s on %s", loc->path, subvol->name);
-
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, params);
- }
- }
+ frame = stub->frame;
+ local = frame->local;
- return 0;
+ refresh_frame = copy_frame(frame);
+ if (!refresh_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "mem allocation failed for refresh_frame");
+ return -1;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (mknod, frame, -1, op_errno,
- NULL, NULL, NULL, NULL);
+ refresh_local = dht_local_init(refresh_frame, NULL, NULL, stub->fop);
+ if (!refresh_local) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "mem allocation failed for refresh_local");
+ return -1;
+ }
+
+ refresh_local->loc.inode = inode_ref(local->loc.parent);
+ gf_uuid_copy(refresh_local->loc.gfid, local->loc.parent->gfid);
- return 0;
+ refresh_local->stub = stub;
+
+ refresh_local->refresh_layout_unlock = dht_refresh_parent_layout_resume;
+ refresh_local->refresh_layout_done = dht_refresh_parent_layout_done;
+
+ dht_refresh_layout(refresh_frame);
+ return 0;
}
+static int32_t
+dht_call_mkdir_stub(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_stub_t *stub = NULL;
-int
-dht_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkname, loc_t *loc, dict_t *params)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- int ret = -1;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- subvol = dht_subvol_get_hashed (this, loc);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- loc->path);
- op_errno = ENOENT;
- goto err;
- }
-
- ret = loc_copy (&local->loc, loc);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_TRACE, "Failed to copy loc");
- op_errno = ENOMEM;
- goto err;
- }
+ local = frame->local;
+ stub = local->stub;
+ local->stub = NULL;
- gf_log (this->name, GF_LOG_TRACE,
- "creating %s on %s", loc->path, subvol->name);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = 0;
+ }
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->symlink,
- linkname, loc, params);
+ call_resume(stub);
- return 0;
+ return 0;
+}
+static int32_t
+dht_guard_parent_layout_and_namespace(xlator_t *subvol, call_stub_t *stub)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ loc_t *loc = NULL;
+ xlator_t *hashed_subvol = NULL, *this = NULL;
+ ;
+ call_frame_t *frame = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int32_t *parent_disk_layout = NULL;
+ dht_layout_t *parent_layout = NULL;
+ dht_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", stub, err);
+
+ frame = stub->frame;
+ this = frame->this;
+
+ conf = this->private;
+
+ local = frame->local;
+
+ local->stub = stub;
+
+ /* TODO: recheck whether we should lock on src or dst if we do similar
+ * stale layout checks for rename.
+ */
+ loc = &stub->args.loc;
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ if (local->params == NULL) {
+ local->params = dict_new();
+ if (local->params == NULL) {
+ local->op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "dict allocation failed",
+ gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (hashed_subvol == NULL) {
+ local->op_errno = EINVAL;
+
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "hashed subvolume not found",
+ gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ parent_layout = dht_layout_get(this, loc->parent);
+
+ ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol,
+ &parent_disk_layout);
+ if (ret == -1) {
+ local->op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "extracting in-memory layout of parent failed. ",
+ gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ memcpy((void *)local->parent_disk_layout, (void *)parent_disk_layout,
+ sizeof(local->parent_disk_layout));
+
+ dht_layout_unref(this, parent_layout);
+ parent_layout = NULL;
+
+ ret = dict_set_str(local->params, GF_PREOP_PARENT_KEY, conf->xattr_name);
+ if (ret < 0) {
+ local->op_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "setting %s key in params dictionary failed. ",
+ gf_fop_list[stub->fop], pgfid, loc->name, loc->path,
+ GF_PREOP_PARENT_KEY);
+ goto err;
+ }
+
+ ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout,
+ 4 * 4);
+ if (ret < 0) {
+ local->op_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "setting parent-layout in params dictionary failed. ",
+ gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ parent_disk_layout = NULL;
+ local->hashed_subvol = hashed_subvol;
+
+ local->current = &local->lock[0];
+ ret = dht_protect_namespace(frame, loc, hashed_subvol, &local->current->ns,
+ dht_call_mkdir_stub);
+ if (ret < 0)
+ goto err;
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (link, frame, -1, op_errno,
- NULL, NULL, NULL, NULL);
- return 0;
-}
+ if (parent_disk_layout != NULL)
+ GF_FREE(parent_disk_layout);
+
+ if (parent_layout != NULL)
+ dht_layout_unref(this, parent_layout);
+ return -1;
+}
int
-dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- xlator_t *cached_subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- int ret = -1;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
-
- if (dht_filter_loc_subvol_key (this, loc, &local->loc,
- &cached_subvol)) {
- gf_log (this->name, GF_LOG_NORMAL,
- "unlinking %s on %s (given path %s)",
- local->loc.path, cached_subvol->name, loc->path);
- STACK_WIND (frame, dht_unlink_cbk,
- cached_subvol, cached_subvol->fops->unlink,
- &local->loc);
+dht_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *params)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ int i = 0;
+ int ret = 0;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ conf = this->private;
+
+ dht_get_du_info(frame, this, loc);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = dht_subvol_get_hashed(this, loc);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = EIO;
+ goto err;
+ }
+
+ /* Post remove-brick, the client layout may not be in sync with
+ * disk layout because of lack of lookup. Hence,a mknod call
+ * may fall on the decommissioned brick. Hence, if the
+ * hashed_subvol is part of decommissioned bricks list, do a
+ * lookup on parent dir. If a fix-layout is already done by the
+ * remove-brick process, the parent directory layout will be in
+ * sync with that of the disk. If fix-layout is still ending
+ * on the parent directory, we can let the file get created on
+ * the decommissioned brick which will be eventually migrated to
+ * non-decommissioned brick based on the new layout.
+ */
+
+ if (conf->decommission_subvols_cnt) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i] &&
+ conf->decommissioned_bricks[i] == subvol) {
+ gf_msg_debug(this->name, 0,
+ "hashed subvol:%s is "
+ "part of decommission brick list for "
+ "file: %s",
+ subvol->name, loc->path);
+
+ /* dht_refresh_layout needs directory info in
+ * local->loc. Hence, storing the parent_loc in
+ * local->loc and storing the create context in
+ * local->loc2. We will restore this information
+ * in dht_creation do */
+
+ ret = loc_copy(&local->loc2, &local->loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "loc_copy failed %s", loc->path);
+
+ goto err;
+ }
+
+ local->params = dict_ref(params);
+ local->rdev = rdev;
+ local->mode = mode;
+ local->umask = umask;
+
+ loc_wipe(&local->loc);
+
+ ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED,
+ "parent loc build failed");
+ goto err;
+ }
+
+ ret = dht_mknod_lock(frame, subvol);
+
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "locking parent failed");
+ goto err;
+ }
+
goto done;
+ }
}
+ }
+
+ dht_mknod_wind_to_avail_subvol(frame, this, subvol, loc, rdev, mode, umask,
+ params);
- cached_subvol = dht_subvol_get_cached (this, loc->inode);
- if (!cached_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- if (!hashed_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- ret = loc_copy (&local->loc, loc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- if (hashed_subvol != cached_subvol) {
- STACK_WIND (frame, dht_unlink_linkfile_cbk,
- hashed_subvol, hashed_subvol->fops->unlink, loc);
- } else {
- STACK_WIND (frame, dht_unlink_cbk,
- cached_subvol, cached_subvol->fops->unlink, loc);
- }
done:
- return 0;
+ return 0;
+
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
+int
+dht_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+ loc_t *loc, mode_t umask, dict_t *params)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_SYMLINK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = dht_subvol_get_hashed(this, loc);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = EIO;
+ goto err;
+ }
+
+ gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name);
+
+ STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->symlink, linkname, loc, umask, params);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
int
-dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent)
+dht_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- dht_local_t *local = NULL;
+ xlator_t *cached_subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_UNLINK);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+
+ cached_subvol = local->cached_subvol;
+ if (!cached_subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->flags = xflag;
+ STACK_WIND_COOKIE(frame, dht_unlink_cbk, cached_subvol, cached_subvol,
+ cached_subvol->fops->unlink, loc, xflag, xdata);
+
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
- prev = cookie;
- local = frame->local;
+ return 0;
+}
- if (op_ret == -1)
- goto out;
+static int
+dht_remove_stale_linkto_cbk(int ret, call_frame_t *sync_frame, void *data)
+{
+ DHT_STACK_DESTROY(sync_frame);
+ return 0;
+}
- layout = dht_layout_for_subvol (this, prev->this);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no pre-set layout for subvolume %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+static int
+dht_remove_stale_linkto(void *data)
+{
+ call_frame_t *frame = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ dict_t *xdata_in = NULL;
+ int ret = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", data, out);
+
+ frame = data;
+ local = frame->local;
+ this = frame->this;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", local, out);
+ GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out);
+
+ xdata_in = dict_new();
+ if (!xdata_in)
+ goto out;
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(xdata_in);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+ "Failed to set keys for stale linkto"
+ "deletion on path %s",
+ local->loc.path);
+ goto out;
+ }
+
+ ret = syncop_unlink(local->link_subvol, &local->loc, xdata_in, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+ "Removal of linkto failed"
+ " on path %s at subvol %s",
+ local->loc.path, local->link_subvol->name);
+ }
+out:
+ if (xdata_in)
+ dict_unref(xdata_in);
+ return ret;
+}
- stbuf->ia_ino = local->loc.inode->ino;
+static int
+dht_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ gf_boolean_t stbuf_merged = _gf_false;
+ xlator_t *subvol = NULL;
+ call_frame_t *cleanup_frame = NULL;
+ dht_local_t *cleanup_local = NULL;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ /* Remove the linkto if exists */
+ if (local->linked) {
+ cleanup_frame = create_frame(this, this->ctx->pool);
+ if (cleanup_frame) {
+ cleanup_local = dht_local_init(cleanup_frame, &local->loc2,
+ NULL, 0);
+ if (!cleanup_local || !local->link_subvol) {
+ DHT_STACK_DESTROY(cleanup_frame);
+ goto out;
+ }
+ cleanup_local->link_subvol = local->link_subvol;
+ FRAME_SU_DO(cleanup_frame, dht_local_t);
+ ret = synctask_new(this->ctx->env, dht_remove_stale_linkto,
+ dht_remove_stale_linkto_cbk, cleanup_frame,
+ cleanup_frame);
+ }
+ }
+ /* No continuation on DHT inode missing errors, as we should
+ * then have a good stbuf that states P2 happened. We would
+ * get inode missing if, the file completed migrated between
+ * the lookup and the link call */
+ goto out;
+ }
+
+ /* Update parent on success, even if P1/2 checks are positive.
+ * The second call on success will further update the parent */
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+
+ /* Update linkto attrs, if this is the first call and non-P2,
+ * if we detect P2 then we need to trust the attrs from the
+ * second call, not the first */
+ if (local->linked == _gf_true &&
+ ((local->call_cnt == 1 && !IS_DHT_MIGRATION_PHASE2(stbuf)) ||
+ (local->call_cnt != 1 && IS_DHT_MIGRATION_PHASE2(&local->stbuf)))) {
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ stbuf_merged = _gf_true;
+ dht_linkfile_attr_heal(frame, this);
+ }
+
+ /* No further P1/2 checks if we are in the second iteration of
+ * the call */
+ if (local->call_cnt != 1) {
+ goto out;
+ } else {
+ /* Preserve the return values, in case the migration decides
+ * to recreate the link on the same subvol that the current
+ * hased for the link was created on. */
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+ if (!stbuf_merged) {
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ stbuf_merged = _gf_true;
+ }
- preparent->ia_ino = local->loc2.parent->ino;
- postparent->ia_ino = local->loc2.parent->ino;
+ local->inode = inode_ref(inode);
+ }
- WIPE (preparent);
- WIPE (postparent);
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ local->rebalance.target_op_fn = dht_link2;
+ dht_set_local_rebalance(this, local, stbuf, preparent, postparent, xdata);
+ /* Check if the rebalance phase2 is true */
+ if (IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ ret = dht_inode_ctx_get_mig_info(this, local->loc.inode, NULL, &subvol);
+ if (!subvol) {
+ /* Phase 2 of migration */
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ } else {
+ dht_link2(this, subvol, frame, 0);
+ return 0;
+ }
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+ ret = dht_inode_ctx_get_mig_info(this, local->loc.inode, NULL, &subvol);
+ if (subvol) {
+ dht_link2(this, subvol, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
out:
- DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent,
- postparent);
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
- return 0;
-}
+ dht_set_fixed_dir_stat(preparent);
+ dht_set_fixed_dir_stat(postparent);
+ DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+ postparent, NULL);
+ return 0;
+}
-int
-dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent)
+static int
+dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- xlator_t *srcvol = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+ local = frame->local;
+ if (!local)
+ goto err;
- if (op_ret == -1)
- goto err;
+ op_errno = local->op_errno;
- local = frame->local;
- srcvol = local->linkfile.srcvol;
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+
+ DHT_STACK_UNWIND(link, frame, local->op_ret, op_errno, local->inode,
+ &local->stbuf, &local->preparent, &local->postparent,
+ NULL);
+ return 0;
+ }
+
+ if (subvol == NULL) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /* Second call to create link file could result in EEXIST as the
+ * first call created the linkto in the currently
+ * migrating subvol, which could be the new hashed subvol */
+ if (local->link_subvol == subvol) {
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(link, frame, 0, 0, local->inode, &local->stbuf,
+ &local->preparent, &local->postparent, NULL);
- STACK_WIND (frame, dht_link_cbk,
- srcvol, srcvol->fops->link,
- &local->loc, &local->loc2);
+ return 0;
+ }
- return 0;
+ local->call_cnt = 2;
+ STACK_WIND(frame, dht_link_cbk, subvol, subvol->fops->link, &local->loc,
+ &local->loc2, local->xattr_req);
+
+ return 0;
err:
- DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent,
- postparent);
+ DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
+static int
+dht_link_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *srcvol = NULL;
-int
-dht_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
-{
- xlator_t *cached_subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- int op_errno = -1;
- int ret = -1;
- dht_local_t *local = NULL;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (oldloc, err);
- VALIDATE_OR_GOTO (newloc, err);
-
- cached_subvol = dht_subvol_get_cached (this, oldloc->inode);
- if (!cached_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", oldloc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- hashed_subvol = dht_subvol_get_hashed (this, newloc);
- if (!hashed_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- newloc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- ret = loc_copy (&local->loc, oldloc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- ret = loc_copy (&local->loc2, newloc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- if (hashed_subvol != cached_subvol) {
- memcpy (local->gfid, oldloc->inode->gfid, 16);
- dht_linkfile_create (frame, dht_link_linkfile_cbk,
- cached_subvol, hashed_subvol, newloc);
- } else {
- STACK_WIND (frame, dht_link_cbk,
- cached_subvol, cached_subvol->fops->link,
- oldloc, newloc);
- }
-
- return 0;
+ if (op_ret == -1)
+ goto err;
+
+ local = frame->local;
+ srcvol = local->linkfile.srcvol;
+
+ STACK_WIND(frame, dht_link_cbk, srcvol, srcvol->fops->link, &local->loc,
+ &local->loc2, local->xattr_req);
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(preparent);
+ dht_set_fixed_dir_stat(postparent);
+ DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+ postparent, xdata);
- return 0;
+ return 0;
}
+int
+dht_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ xlator_t *cached_subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ int op_errno = -1;
+ int ret = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(oldloc, err);
+ VALIDATE_OR_GOTO(newloc, err);
+
+ local = dht_local_init(frame, oldloc, NULL, GF_FOP_LINK);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+ local->call_cnt = 1;
+
+ cached_subvol = local->cached_subvol;
+ if (!cached_subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ oldloc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, newloc);
+ if (!hashed_subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ newloc->path);
+ op_errno = EIO;
+ goto err;
+ }
+
+ ret = loc_copy(&local->loc2, newloc);
+ if (ret == -1) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (hashed_subvol != cached_subvol) {
+ gf_uuid_copy(local->gfid, oldloc->inode->gfid);
+ dht_linkfile_create(frame, dht_link_linkfile_cbk, this, cached_subvol,
+ hashed_subvol, newloc);
+ } else {
+ STACK_WIND(frame, dht_link_cbk, cached_subvol,
+ cached_subvol->fops->link, oldloc, newloc, xdata);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
int
-dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- fd_t *fd, inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent)
+dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- call_frame_t *prev = NULL;
- int ret = -1;
- dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ dht_local_t *local = NULL;
+ gf_boolean_t parent_layout_changed = _gf_false;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ local = frame->local;
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ parent_layout_changed = (xdata &&
+ dict_get(xdata, GF_PREOP_CHECK_FAILED))
+ ? _gf_true
+ : _gf_false;
+
+ if (parent_layout_changed) {
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ /* Returning failure as the layout could not be fixed even under
+ * the lock */
+ goto out;
+ }
- if (op_ret == -1)
- goto out;
+ gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "create (%s/%s) (path: %s): parent layout "
+ "changed. Attempting a layout refresh and then a "
+ "retry",
+ pgfid, local->loc.name, local->loc.path);
+
+ /*
+ dht_refresh_layout needs directory info in local->loc.Hence,
+ storing the parent_loc in local->loc and storing the create
+ context in local->loc2. We will restore this information in
+ dht_creation_do.
+ */
+
+ loc_wipe(&local->loc2);
+
+ ret = loc_copy(&local->loc2, &local->loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "loc_copy failed %s", local->loc.path);
- local = frame->local;
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
goto out;
- }
+ }
- prev = cookie;
+ loc_wipe(&local->loc);
- dht_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino);
- if (local->loc.parent) {
- preparent->ia_ino = local->loc.parent->ino;
- postparent->ia_ino = local->loc.parent->ino;
+ ret = dht_build_parent_loc(this, &local->loc, &local->loc2,
+ &op_errno);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED,
+ "parent loc build failed");
+ goto out;
+ }
+
+ subvol = dht_subvol_get_hashed(this, &local->loc2);
- WIPE (preparent);
- WIPE (postparent);
+ ret = dht_create_lock(frame, subvol);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "locking parent failed");
+ goto out;
+ }
+
+ return 0;
}
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not set preset layout for subvol %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ goto out;
+ }
+
+ prev = cookie;
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
+
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+
+ ret = dht_fd_ctx_set(this, fd, prev);
+ if (ret != 0) {
+ gf_msg_debug(this->name, 0,
+ "Possible fd leak. "
+ "Could not set fd ctx for subvol %s",
+ prev->name);
+ }
+
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret != 0) {
+ gf_msg_debug(this->name, 0, "could not set preset layout for subvol %s",
+ prev->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+ local->op_errno = op_errno;
+
+ if (local->linked == _gf_true) {
+ local->stbuf = *stbuf;
+ dht_linkfile_attr_heal(frame, this);
+ }
out:
- DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent,
- postparent);
- return 0;
+
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(preparent);
+ dht_set_fixed_dir_stat(postparent);
+
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ /* store op_errno for failure case*/
+ local->op_errno = op_errno;
+ local->refresh_layout_unlock(frame, this, op_ret, 1);
+
+ if (op_ret == 0) {
+ DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+ preparent, postparent, xdata);
+ }
+ } else {
+ DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+ preparent, postparent, xdata);
+ }
+ return 0;
}
+static int
+dht_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ local = frame->local;
+ if (!local) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ conf = this->private;
+ if (!conf) {
+ local->op_errno = EINVAL;
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ cached_subvol = local->cached_subvol;
+
+ if (local->params) {
+ dict_del(local->params, conf->link_xattr_name);
+ dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
+ }
+
+ STACK_WIND_COOKIE(frame, dht_create_cbk, cached_subvol, cached_subvol,
+ cached_subvol->fops->create, &local->loc, local->flags,
+ local->mode, local->umask, local->fd, local->params);
+
+ return 0;
+err:
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ local->refresh_layout_unlock(frame, this, -1, 1);
+ } else {
+ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ }
+ return 0;
+}
-int
-dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent)
+static int
+dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
+ xlator_t *subvol, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd,
+ dict_t *params)
{
- dht_local_t *local = NULL;
- xlator_t *cached_subvol = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *avail_subvol = NULL;
- if (op_ret == -1)
- goto err;
+ local = frame->local;
- local = frame->local;
- cached_subvol = local->cached_subvol;
+ if (!dht_is_subvol_filled(this, subvol)) {
+ gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+ subvol->name);
- STACK_WIND (frame, dht_create_cbk,
- cached_subvol, cached_subvol->fops->create,
- &local->loc, local->flags, local->mode,
- local->fd, local->params);
+ dht_set_parent_layout_in_dict(loc, this, local);
- return 0;
- err:
- DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+ subvol->fops->create, loc, flags, mode, umask, fd,
+ params);
+
+ } else {
+ avail_subvol = dht_free_disk_available_subvol(this, subvol, local);
+
+ if (avail_subvol != subvol) {
+ local->cached_subvol = avail_subvol;
+ local->hashed_subvol = subvol;
+
+ gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)",
+ loc->path, avail_subvol->name, subvol->name);
+
+ dht_linkfile_create(frame, dht_create_linkfile_create_cbk, this,
+ avail_subvol, subvol, loc);
+
+ goto out;
+ }
+
+ gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+ subvol->name);
+
+ dht_set_parent_layout_in_dict(loc, this, local);
+
+ STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+ subvol->fops->create, loc, flags, mode, umask, fd,
+ params);
+ }
+out:
+ return 0;
}
int
-dht_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd, dict_t *params)
-{
- int op_errno = -1;
- int ret = -1;
- xlator_t *subvol = NULL;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- xlator_t *avail_subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
-
- conf = this->private;
-
- dht_get_du_info (frame, this, loc);
-
- local = dht_local_init (frame);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- if (dht_filter_loc_subvol_key (this, loc, &local->loc,
- &subvol)) {
- gf_log (this->name, GF_LOG_NORMAL,
- "creating %s on %s (got create on %s)",
- local->loc.path, subvol->name, loc->path);
- STACK_WIND (frame, dht_create_cbk,
- subvol, subvol->fops->create,
- &local->loc, flags, mode, fd, params);
- goto done;
+dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child,
+ int32_t *op_errno)
+{
+ inode_table_t *table = NULL;
+ int ret = -1;
+
+ if (!parent || !child) {
+ if (op_errno)
+ *op_errno = EINVAL;
+ goto out;
+ }
+
+ if (child->parent) {
+ parent->inode = inode_ref(child->parent);
+ if (!parent->inode) {
+ if (op_errno)
+ *op_errno = EINVAL;
+ goto out;
}
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
- subvol = dht_subvol_get_hashed (this, loc);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- loc->path);
- op_errno = ENOENT;
- goto err;
+ gf_uuid_copy(parent->gfid, child->pargfid);
+
+ ret = 0;
+
+ goto out;
+ } else {
+ if (gf_uuid_is_null(child->pargfid)) {
+ if (op_errno)
+ *op_errno = EINVAL;
+ goto out;
}
- if (!dht_is_subvol_filled (this, subvol)) {
- gf_log (this->name, GF_LOG_TRACE,
- "creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_create_cbk,
- subvol, subvol->fops->create,
- loc, flags, mode, fd, params);
- goto done;
+ table = this->itable;
+
+ if (!table) {
+ if (op_errno) {
+ *op_errno = EINVAL;
+ goto out;
+ }
}
- /* Choose the minimum filled volume, and create the
- files there */
- /* TODO */
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
- if (avail_subvol != subvol) {
- local->fd = fd_ref (fd);
- local->params = dict_ref (params);
- local->flags = flags;
- local->mode = mode;
- local->cached_subvol = avail_subvol;
- local->hashed_subvol = subvol;
- gf_log (this->name, GF_LOG_TRACE,
- "creating %s on %s (link at %s)", loc->path,
- avail_subvol->name, subvol->name);
- dht_linkfile_create (frame,
- dht_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
- goto done;
+ parent->inode = inode_find(table, child->pargfid);
+
+ if (!parent->inode) {
+ if (op_errno) {
+ *op_errno = EINVAL;
+ goto out;
+ }
}
- gf_log (this->name, GF_LOG_TRACE,
- "creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_create_cbk,
- subvol, subvol->fops->create,
- loc, flags, mode, fd, params);
-done:
- return 0;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ gf_uuid_copy(parent->gfid, child->pargfid);
- return 0;
-}
+ ret = 0;
+ }
+out:
+ return ret;
+}
-int
-dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static int32_t
+dht_create_do(call_frame_t *frame)
{
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *refreshed = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ local = frame->local;
- local = frame->local;
- layout = local->selfheal.layout;
+ this = THIS;
- if (op_ret == 0) {
- dht_layout_set (this, local->inode, layout);
- local->stbuf.ia_ino = local->ia_ino;
- if (local->loc.parent) {
- local->preparent.ia_ino = local->loc.parent->ino;
- local->postparent.ia_ino = local->loc.parent->ino;
+ conf = this->private;
- WIPE (&local->preparent);
- WIPE (&local->postparent);
- }
- }
+ GF_VALIDATE_OR_GOTO(this->name, conf, err);
- DHT_STACK_UNWIND (mkdir, frame, op_ret, op_errno,
- local->inode, &local->stbuf, &local->preparent,
- &local->postparent);
+ methods = &(conf->methods);
- return 0;
-}
+ /* We don't need parent_loc anymore */
+ loc_wipe(&local->loc);
-int
-dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- int ret = -1;
- int subvol_filled = 0;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- dht_conf_t *conf = NULL;
-
- conf = this->private;
- local = frame->local;
- prev = cookie;
- layout = local->layout;
-
- subvol_filled = dht_is_subvol_filled (this, prev->this);
-
- LOCK (&frame->lock);
- {
- if (subvol_filled && (op_ret != -1)) {
- ret = dht_layout_merge (this, layout, prev->this,
- -1, ENOSPC, NULL);
- } else {
- ret = dht_layout_merge (this, layout, prev->this,
- op_ret, op_errno, NULL);
- }
+ loc_copy(&local->loc, &local->loc2);
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto unlock;
- }
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preparent, preparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
-
- if (prev->this == dht_first_up_subvol (this)) {
- local->ia_ino = local->stbuf.ia_ino;
- }
+ loc_wipe(&local->loc2);
- }
-unlock:
- UNLOCK (&frame->lock);
+ refreshed = local->selfheal.refreshed_layout;
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- dht_selfheal_new_directory (frame, dht_mkdir_selfheal_cbk,
- layout);
- }
+ subvol = methods->layout_search(this, refreshed, local->loc.name);
- return 0;
-}
+ if (!subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "no subvolume in "
+ "layout for path=%s",
+ local->loc.path);
+ local->op_errno = ENOENT;
+ goto err;
+ }
-int
-dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- int ret = -1;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- dht_conf_t *conf = NULL;
- int i = 0;
- xlator_t *hashed_subvol = NULL;
-
- VALIDATE_OR_GOTO (this->private, err);
-
- local = frame->local;
- prev = cookie;
- layout = local->layout;
- conf = this->private;
- hashed_subvol = local->hashed_subvol;
-
- if (uuid_is_null (local->loc.inode->gfid) && !op_ret)
- memcpy (local->loc.inode->gfid, stbuf->ia_gfid, 16);
-
- if (dht_is_subvol_filled (this, hashed_subvol))
- ret = dht_layout_merge (this, layout, prev->this,
- -1, ENOSPC, NULL);
- else
- ret = dht_layout_merge (this, layout, prev->this,
- op_ret, op_errno, NULL);
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto err;
- }
- local->op_ret = 0;
-
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preparent, preparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent, prev->this);
-
- local->ia_ino = local->stbuf.ia_ino;
-
- local->call_cnt = conf->subvolume_cnt - 1;
-
- if (local->call_cnt == 0) {
- dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
- &local->loc, layout);
- }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvolumes[i] == hashed_subvol)
- continue;
- STACK_WIND (frame, dht_mkdir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->mkdir,
- &local->loc, local->mode, local->params);
- }
- return 0;
+ dht_create_wind_to_avail_subvol(frame, this, subvol, &local->loc,
+ local->flags, local->mode, local->umask,
+ local->fd, local->params);
+ return 0;
err:
- DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
+ local->refresh_layout_unlock(frame, this, -1, 1);
+
+ return 0;
}
+static int32_t
+dht_create_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
-int
-dht_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dict_t *params)
-{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- int ret = -1;
- xlator_t *hashed_subvol = NULL;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- conf = this->private;
-
- dht_get_du_info (frame, this, loc);
-
- local = dht_local_init (frame);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- hashed_subvol = dht_subvol_get_hashed (this, loc);
-
- if (hashed_subvol == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "hashed subvol not found for %s",
- loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local->hashed_subvol = hashed_subvol;
- local->inode = inode_ref (loc->inode);
- ret = loc_copy (&local->loc, loc);
- local->mode = mode;
-
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->params = dict_ref (params);
-
- local->layout = dht_layout_new (this, conf->subvolume_cnt);
- if (!local->layout) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- STACK_WIND (frame, dht_mkdir_hashed_cbk,
- hashed_subvol,
- hashed_subvol->fops->mkdir,
- loc, mode, params);
-
- return 0;
+static int32_t
+dht_create_finish(call_frame_t *frame, xlator_t *this, int op_ret,
+ int invoke_cbk)
+{
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ int lock_count = 0;
+
+ local = frame->local;
+ lock_count = dht_lock_count(local->lock[0].layout.parent_layout.locks,
+ local->lock[0].layout.parent_layout.lk_count);
+ if (lock_count == 0)
+ goto done;
+
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL) {
+ goto done;
+ }
+
+ lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+ lock_frame->root->op);
+ if (lock_local == NULL) {
+ goto done;
+ }
+
+ lock_local->lock[0]
+ .layout.parent_layout.locks = local->lock[0].layout.parent_layout.locks;
+ lock_local->lock[0].layout.parent_layout.lk_count =
+ local->lock[0].layout.parent_layout.lk_count;
+
+ local->lock[0].layout.parent_layout.locks = NULL;
+ local->lock[0].layout.parent_layout.lk_count = 0;
+
+ dht_unlock_inodelk(lock_frame,
+ lock_local->lock[0].layout.parent_layout.locks,
+ lock_local->lock[0].layout.parent_layout.lk_count,
+ dht_create_unlock_cbk);
+ lock_frame = NULL;
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+done:
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
- return 0;
-}
+ if (op_ret == 0)
+ return 0;
+ DHT_STACK_UNWIND(create, frame, op_ret, local->op_errno, NULL, NULL, NULL,
+ NULL, NULL, NULL);
+ return 0;
+}
-int
-dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno)
+static int32_t
+dht_create_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (local->loc.parent) {
- local->preparent.ia_ino = local->loc.parent->ino;
- local->postparent.ia_ino = local->loc.parent->ino;
- }
+ if (!local) {
+ goto err;
+ }
- DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent);
+ if (op_ret < 0) {
+ gf_msg("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "Create lock failed for file: %s", local->loc2.name);
- return 0;
-}
+ local->op_errno = op_errno;
+ goto err;
+ }
-int
-dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent)
+ local->refresh_layout_unlock = dht_create_finish;
+
+ local->refresh_layout_done = dht_create_do;
+
+ dht_refresh_layout(frame);
+
+ return 0;
+err:
+ if (local)
+ dht_create_finish(frame, this, -1, 0);
+ else
+ DHT_STACK_UNWIND(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
+}
+
+int32_t
+dht_create_lock(call_frame_t *frame, xlator_t *subvol)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- local->op_ret = -1;
-
- if (op_errno != ENOENT)
- local->need_selfheal = 1;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "rmdir on %s for %s failed (%s)",
- prev->this->name, local->loc.path,
- strerror (op_errno));
- goto unlock;
- }
-
- dht_iatt_merge (this, &local->preparent, preparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
- }
-unlock:
- UNLOCK (&frame->lock);
+ dht_local_t *local = NULL;
+ int count = 1, ret = -1;
+ dht_lock_t **lk_array = NULL;
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- if (local->need_selfheal) {
- local->layout =
- dht_layout_get (this, local->loc.inode);
+ local = frame->local;
- /* TODO: neater interface needed below */
- local->stbuf.ia_type = local->loc.inode->ia_type;
+ lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
- dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk,
- &local->loc, local->layout);
- } else {
- if (local->loc.parent) {
- local->preparent.ia_ino =
- local->loc.parent->ino;
- local->postparent.ia_ino =
- local->loc.parent->ino;
+ if (lk_array == NULL)
+ goto err;
- WIPE (&local->preparent);
- WIPE (&local->postparent);
- }
+ lk_array[0] = dht_lock_new(frame->this, subvol, &local->loc, F_RDLCK,
+ DHT_LAYOUT_HEAL_DOMAIN, NULL,
+ IGNORE_ENOENT_ESTALE);
- DHT_STACK_UNWIND (rmdir, frame, local->op_ret,
- local->op_errno, &local->preparent,
- &local->postparent);
- }
- }
+ if (lk_array[0] == NULL)
+ goto err;
- return 0;
+ local->lock[0].layout.parent_layout.locks = lk_array;
+ local->lock[0].layout.parent_layout.lk_count = count;
+
+ ret = dht_blocking_inodelk(frame, lk_array, count, dht_create_lock_cbk);
+
+ if (ret < 0) {
+ local->lock[0].layout.parent_layout.locks = NULL;
+ local->lock[0].layout.parent_layout.lk_count = 0;
+ goto err;
+ }
+
+ return 0;
+err:
+ if (lk_array != NULL) {
+ dht_lock_array_free(lk_array, count);
+ GF_FREE(lk_array);
+ }
+
+ return -1;
}
+int
+dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local)
+{
+ dht_conf_t *conf = this->private;
+ dht_layout_t *parent_layout = NULL;
+ int *parent_disk_layout = NULL;
+ xlator_t *hashed_subvol = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ parent_layout = dht_layout_get(this, loc->parent);
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+
+ ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol,
+ &parent_disk_layout);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "extracting in-memory layout of parent failed. ",
+ gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ ret = dict_set_str_sizen(local->params, GF_PREOP_PARENT_KEY,
+ conf->xattr_name);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "setting %s key in params dictionary failed. ",
+ gf_fop_list[local->fop], pgfid, loc->name, loc->path,
+ GF_PREOP_PARENT_KEY);
+ goto err;
+ }
+
+ ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout,
+ 4 * 4);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "setting parent-layout in params dictionary failed. ",
+ gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+err:
+ dht_layout_unref(this, parent_layout);
+ return ret;
+}
int
-dht_rmdir_do (call_frame_t *frame, xlator_t *this)
+dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int i = 0;
+ int op_errno = -1;
+ xlator_t *subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ conf = this->private;
+
+ dht_get_du_info(frame, this, loc);
+
+ local = dht_local_init(frame, loc, fd, GF_FOP_CREATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->params = dict_ref(params);
+ local->flags = flags;
+ local->mode = mode;
+ local->umask = umask;
+
+ if (dht_filter_loc_subvol_key(this, loc, &local->loc, &subvol)) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "creating %s on %s (got create on %s)", local->loc.path,
+ subvol->name, loc->path);
+
+ /* Since lookup-optimize is enabled by default, we need
+ * to create the linkto file if required.
+ * Note this does not check for decommisioned bricks
+ * and min-free-disk limits as this is a debugging tool
+ * and not expected to be used in production.
+ */
+ hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
- VALIDATE_OR_GOTO (this->private, err);
+ if (hashed_subvol && (hashed_subvol != subvol)) {
+ /* Create the linkto file and then the data file */
+ local->cached_subvol = subvol;
+ local->hashed_subvol = hashed_subvol;
- conf = this->private;
- local = frame->local;
+ dht_linkfile_create(frame, dht_create_linkfile_create_cbk, this,
+ subvol, hashed_subvol, &local->loc);
+ goto done;
+ }
+ /* We either don't have a hashed subvol or the hashed subvol is
+ * the same as the one specified. No need to create the linkto
+ * file as we expect a lookup everywhere if there are problems
+ * with the parent layout
+ */
- if (local->op_ret == -1)
- goto err;
+ dht_set_parent_layout_in_dict(loc, this, local);
+
+ STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+ subvol->fops->create, &local->loc, flags, mode, umask,
+ fd, params);
+ goto done;
+ }
+
+ subvol = dht_subvol_get_hashed(this, loc);
+ if (!subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "no subvolume in layout for path=%s", loc->path);
+
+ op_errno = EIO;
+ goto err;
+ }
+
+ /* Post remove-brick, the client layout may not be in sync with
+ * disk layout because of lack of lookup. Hence,a create call
+ * may fall on the decommissioned brick. Hence, if the
+ * hashed_subvol is part of decommissioned bricks list, do a
+ * lookup on parent dir. If a fix-layout is already done by the
+ * remove-brick process, the parent directory layout will be in
+ * sync with that of the disk. If fix-layout is still ending
+ * on the parent directory, we can let the file get created on
+ * the decommissioned brick which will be eventually migrated to
+ * non-decommissioned brick based on the new layout.
+ */
+
+ if (conf->decommission_subvols_cnt) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i] &&
+ conf->decommissioned_bricks[i] == subvol) {
+ gf_msg_debug(this->name, 0,
+ "hashed subvol:%s is "
+ "part of decommission brick list for "
+ "file: %s",
+ subvol->name, loc->path);
+
+ /* dht_refresh_layout needs directory info in
+ * local->loc. Hence, storing the parent_loc in
+ * local->loc and storing the create context in
+ * local->loc2. We will restore this information
+ * in dht_creation do */
+
+ ret = loc_copy(&local->loc2, &local->loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "loc_copy failed %s", loc->path);
+
+ goto err;
+ }
+
+ loc_wipe(&local->loc);
+
+ ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED,
+ "parent loc build failed");
+ goto err;
+ }
+
+ ret = dht_create_lock(frame, subvol);
- local->call_cnt = conf->subvolume_cnt;
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "locking parent failed");
+ goto err;
+ }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_rmdir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->rmdir,
- &local->loc, local->flags);
- }
+ goto done;
+ }
+ }
+ }
- return 0;
+ dht_create_wind_to_avail_subvol(frame, this, subvol, loc, flags, mode,
+ umask, fd, params);
+done:
+ return 0;
err:
- DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent);
- return 0;
-}
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
-int
-dht_rmdir_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent)
+ return 0;
+}
+
+static int
+dht_mkdir_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- xlator_t *src = NULL;
- call_frame_t *main_frame = NULL;
- dht_local_t *main_local = NULL;
- int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
- local = frame->local;
- prev = cookie;
- src = prev->this;
+ local = frame->local;
+ layout = local->selfheal.layout;
- main_frame = local->main_frame;
- main_local = main_frame->local;
+ FRAME_SU_UNDO(frame, dht_local_t);
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "unlinked linkfile %s on %s",
- local->loc.path, src->name);
+ if (op_ret == 0) {
+ dht_layout_set(this, local->inode, layout);
+
+ dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->preparent, 0);
+
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+ }
+
+ DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, local->inode,
+ &local->stbuf, &local->preparent, &local->postparent,
+ NULL);
+
+ return 0;
+}
+
+static int
+dht_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int ret = -1;
+ gf_boolean_t subvol_filled = _gf_false;
+ gf_boolean_t dir_exists = _gf_false;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ layout = local->layout;
+
+ subvol_filled = dht_is_subvol_filled(this, prev);
+
+ LOCK(&frame->lock);
+ {
+ if (subvol_filled && (op_ret != -1)) {
+ ret = dht_layout_merge(this, layout, prev, -1, ENOSPC, NULL);
} else {
- main_local->op_ret = -1;
- main_local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "unlink of %s on %s failed (%s)",
- local->loc.path, src->name, strerror (op_errno));
+ if (op_ret == -1 && op_errno == EEXIST) {
+ /* Very likely just a race between mkdir and
+ self-heal (from lookup of a concurrent mkdir
+ attempt).
+ Ignore error for now. layout setting will
+ anyways fail if this was a different (old)
+ pre-existing different directory.
+ */
+ op_ret = 0;
+ dir_exists = _gf_true;
+ }
+ ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, NULL);
}
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ "%s: failed to merge layouts for subvol %s", local->loc.path,
+ prev->name);
- this_call_cnt = dht_frame_return (main_frame);
- if (is_last_call (this_call_cnt))
- dht_rmdir_do (main_frame, this);
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto unlock;
+ }
- DHT_STACK_DESTROY (frame);
- return 0;
+ if (dir_exists)
+ goto unlock;
+
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ /*Unlock entrylk and inodelk once mkdir is done on all subvols*/
+ dht_unlock_namespace(frame, &local->lock[0]);
+ FRAME_SU_DO(frame, dht_local_t);
+ dht_selfheal_new_directory(frame, dht_mkdir_selfheal_cbk, layout);
+ }
+
+ return 0;
}
+static int
+dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
-int
-dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *stbuf, dict_t *xattr, struct iatt *parent)
+static int
+dht_mkdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *params)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- xlator_t *src = NULL;
- call_frame_t *main_frame = NULL;
- dht_local_t *main_local = NULL;
- int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1, ret = -1;
+ xlator_t *hashed_subvol = NULL;
+ int32_t *parent_disk_layout = NULL;
+ dht_layout_t *parent_layout = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ conf = this->private;
+ local = frame->local;
+
+ if (local->op_ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): refreshing parent layout "
+ "failed.",
+ pgfid, loc->name, loc->path);
+
+ op_errno = local->op_errno;
+ goto err;
+ }
+
+ local->op_ret = -1;
+
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (hashed_subvol == NULL) {
+ gf_msg_debug(this->name, 0,
+ "mkdir (%s/%s) (path: %s): hashed subvol not "
+ "found",
+ pgfid, loc->name, loc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ local->hashed_subvol = hashed_subvol;
+
+ parent_layout = dht_layout_get(this, loc->parent);
+
+ ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol,
+ &parent_disk_layout);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, EIO, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): "
+ "extracting in-memory layout of parent failed. ",
+ pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ if (memcmp(local->parent_disk_layout, parent_disk_layout,
+ sizeof(local->parent_disk_layout)) == 0) {
+ gf_msg(this->name, GF_LOG_WARNING, EIO, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): loop detected. "
+ "parent layout didn't change even though "
+ "previous attempt of mkdir failed because of "
+ "in-memory layout not matching with that on disk.",
+ pgfid, loc->name, loc->path);
+ op_errno = EIO;
+ goto err;
+ }
+
+ memcpy((void *)local->parent_disk_layout, (void *)parent_disk_layout,
+ sizeof(local->parent_disk_layout));
+
+ dht_layout_unref(this, parent_layout);
+ parent_layout = NULL;
+
+ ret = dict_set_str(params, GF_PREOP_PARENT_KEY, conf->xattr_name);
+ if (ret < 0) {
+ local->op_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): "
+ "setting %s key in params dictionary failed. ",
+ pgfid, loc->name, loc->path, GF_PREOP_PARENT_KEY);
+ goto err;
+ }
+
+ ret = dict_set_bin(params, conf->xattr_name, parent_disk_layout, 4 * 4);
+ if (ret < 0) {
+ local->op_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "setting parent-layout in params dictionary failed. "
+ "mkdir (%s/%s) (path: %s)",
+ pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ parent_disk_layout = NULL;
+
+ STACK_WIND_COOKIE(frame, dht_mkdir_hashed_cbk, hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->mkdir, loc, mode, umask, params);
+
+ return 0;
- local = frame->local;
- prev = cookie;
- src = prev->this;
+err:
+ dht_unlock_namespace(frame, &local->lock[0]);
- main_frame = local->main_frame;
- main_local = main_frame->local;
+ op_errno = local ? local->op_errno : op_errno;
+ DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- if (op_ret != 0)
- goto err;
+ if (parent_disk_layout != NULL)
+ GF_FREE(parent_disk_layout);
+
+ if (parent_layout != NULL)
+ dht_layout_unref(this, parent_layout);
- if (check_is_linkfile (inode, stbuf, xattr) == 0) {
- main_local->op_ret = -1;
- main_local->op_errno = ENOTEMPTY;
+ return 0;
+}
+
+static int
+dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *hashed_subvol = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ gf_boolean_t parent_layout_changed = _gf_false;
+ call_stub_t *stub = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ layout = local->layout;
+ conf = this->private;
+ hashed_subvol = local->hashed_subvol;
+
+ gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+
+ if (gf_uuid_is_null(local->loc.gfid) && !op_ret)
+ gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid);
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+
+ parent_layout_changed = (xdata &&
+ dict_get(xdata, GF_PREOP_CHECK_FAILED))
+ ? 1
+ : 0;
+ if (parent_layout_changed) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): parent layout "
+ "changed. Attempting a refresh and then a "
+ "retry",
+ pgfid, local->loc.name, local->loc.path);
+
+ stub = fop_mkdir_stub(frame, dht_mkdir_helper, &local->loc,
+ local->mode, local->umask, local->params);
+ if (stub == NULL) {
+ goto err;
+ }
- gf_log (this->name, GF_LOG_WARNING,
- "%s on %s found to be not a linkfile (type=0%o)",
- local->loc.path, src->name, stbuf->ia_type);
+ ret = dht_handle_parent_layout_change(this, stub);
+ if (ret) {
goto err;
+ }
+
+ stub = NULL;
+
+ return 0;
}
- STACK_WIND (frame, dht_rmdir_linkfile_unlink_cbk,
- src, src->fops->unlink, &local->loc);
+ goto err;
+ }
+
+ dict_del(local->params, GF_PREOP_PARENT_KEY);
+ dict_del(local->params, conf->xattr_name);
+
+ if (dht_is_subvol_filled(this, hashed_subvol))
+ ret = dht_layout_merge(this, layout, prev, -1, ENOSPC, NULL);
+ else
+ ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, NULL);
+
+ /* TODO: we may have to return from the function
+ if layout merge fails. For now, lets just log an error */
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ "%s: failed to merge layouts for subvol %s", local->loc.path,
+ prev->name);
+
+ local->op_ret = 0;
+
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+
+ local->call_cnt = conf->subvolume_cnt - 1;
+ /* Delete internal mds xattr from params dict to avoid store
+ internal mds xattr on other subvols
+ */
+ dict_del(local->params, conf->mds_xattr_key);
+
+ if (gf_uuid_is_null(local->loc.gfid))
+ gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid);
+
+ /* Set hashed subvol as a mds subvol on inode ctx */
+ /*if (!local->inode)
+ local->inode = inode_ref (inode);
+ */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, hashed_subvol);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set hashed subvol for %s on inode vol is %s",
+ local->loc.path, hashed_subvol->name);
+ }
+
+ if (local->call_cnt == 0) {
+ /*Unlock namespace lock once mkdir is done on all subvols*/
+ dht_unlock_namespace(frame, &local->lock[0]);
+ FRAME_SU_DO(frame, dht_local_t);
+ dht_selfheal_directory(frame, dht_mkdir_selfheal_cbk, &local->loc,
+ layout);
return 0;
+ }
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == hashed_subvol)
+ continue;
+ STACK_WIND_COOKIE(frame, dht_mkdir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i], conf->subvolumes[i]->fops->mkdir,
+ &local->loc, local->mode, local->umask,
+ local->params);
+ }
+
+ return 0;
err:
+ if (local->op_ret != 0) {
+ dht_unlock_namespace(frame, &local->lock[0]);
+ }
- this_call_cnt = dht_frame_return (main_frame);
- if (is_last_call (this_call_cnt))
- dht_rmdir_do (main_frame, this);
+ DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- DHT_STACK_DESTROY (frame);
- return 0;
+ return 0;
}
+static int
+dht_mkdir_guard_parent_layout_cbk(call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask,
+ dict_t *params)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = 0;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = -1;
+ int32_t zero[1] = {0};
+
+ local = frame->local;
+ conf = this->private;
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ if (local->op_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): "
+ "Acquiring lock on parent to guard against "
+ "layout-change failed.",
+ pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ local->op_ret = -1;
+ /* Add internal MDS xattr on disk for hashed subvol
+ */
+ ret = dht_dict_set_array(params, conf->mds_xattr_key, zero, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->mds_xattr_key, loc->path);
+ }
+
+ STACK_WIND_COOKIE(frame, dht_mkdir_hashed_cbk, local->hashed_subvol,
+ local->hashed_subvol, local->hashed_subvol->fops->mkdir,
+ loc, mode, umask, params);
+
+ return 0;
+err:
+ DHT_STACK_UNWIND(mkdir, frame, -1, local->op_errno, NULL, NULL, NULL, NULL,
+ NULL);
+
+ return 0;
+}
int
-dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
- gf_dirent_t *entries, xlator_t *src)
+dht_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *params)
{
- int ret = 0;
- int build_ret = 0;
- gf_dirent_t *trav = NULL;
- call_frame_t *lookup_frame = NULL;
- dht_local_t *lookup_local = NULL;
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = EINVAL, ret = -1;
+ xlator_t *hashed_subvol = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ call_stub_t *stub = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ conf = this->private;
+
+ if (!params || !dict_get(params, "gfid-req")) {
+ op_errno = EPERM;
+ gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_GFID_NULL,
+ "mkdir: %s is received "
+ "without gfid-req %p",
+ loc->path, params);
+ goto err;
+ }
+
+ dht_get_du_info(frame, this, loc);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (hashed_subvol == NULL) {
+ gf_msg_debug(this->name, 0, "hashed subvol not found for %s",
+ loc->path);
+ local->op_errno = EIO;
+ goto err;
+ }
+
+ local->hashed_subvol = hashed_subvol;
+ local->mode = mode;
+ local->umask = umask;
+ if (params)
+ local->params = dict_ref(params);
+
+ local->inode = inode_ref(loc->inode);
+
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ /* set the newly created directory hash to the commit hash
+ * if the configuration option is set. If configuration option
+ * is not set, the older clients may still be connecting to the
+ * volume and hence we need to preserve the 1 in disk[0] part of the
+ * layout xattr */
+ if (conf->lookup_optimize)
+ local->layout->commit_hash = conf->vol_commit_hash;
+ else
+ local->layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+
+ stub = fop_mkdir_stub(frame, dht_mkdir_guard_parent_layout_cbk, loc, mode,
+ umask, params);
+ if (stub == NULL) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): "
+ "creating stub failed.",
+ pgfid, loc->name, loc->path);
+ local->op_errno = ENOMEM;
+ goto err;
+ }
+
+ ret = dht_guard_parent_layout_and_namespace(this, stub);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s) cannot wind lock request to "
+ "guard parent layout",
+ pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ return 0;
- local = frame->local;
+err:
+ op_errno = local ? local->op_errno : op_errno;
+ DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
+
+static int
+dht_rmdir_selfheal_cbk(call_frame_t *heal_frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_local_t *heal_local = NULL;
+ call_frame_t *main_frame = NULL;
+
+ heal_local = heal_frame->local;
+ main_frame = heal_local->main_frame;
+ local = main_frame->local;
+
+ DHT_STACK_DESTROY(heal_frame);
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+
+ DHT_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
+
+ return 0;
+}
- list_for_each_entry (trav, &entries->list, list) {
- if (strcmp (trav->d_name, ".") == 0)
- continue;
- if (strcmp (trav->d_name, "..") == 0)
- continue;
- if (check_is_linkfile (NULL, (&trav->d_stat), NULL) == 1) {
- ret++;
- continue;
+static int
+dht_rmdir_hashed_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_local_t *heal_local = NULL;
+ call_frame_t *heal_frame = NULL;
+ dht_conf_t *conf = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ if (conf->subvolume_cnt != 1) {
+ if (op_errno != ENOENT && op_errno != EACCES &&
+ op_errno != ESTALE) {
+ local->need_selfheal = 1;
}
+ }
- /* this entry is either a directory which is neither "." nor "..",
- or a non directory which is not a linkfile. the directory is to
- be treated as non-empty
- */
- return 0;
+ gf_msg_debug(this->name, op_errno,
+ "rmdir on %s for %s failed "
+ "(gfid = %s)",
+ prev->name, local->loc.path, gfid);
+ goto unlock;
}
- list_for_each_entry (trav, &entries->list, list) {
- if (strcmp (trav->d_name, ".") == 0)
- continue;
- if (strcmp (trav->d_name, "..") == 0)
- continue;
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+ }
+unlock:
+ UNLOCK(&frame->lock);
- lookup_frame = NULL;
- lookup_local = NULL;
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ if (local->need_selfheal) {
+ dht_rmdir_unlock(frame, this);
+ local->layout = dht_layout_get(this, local->loc.inode);
- lookup_frame = copy_frame (frame);
- if (!lookup_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of Memory");
- /* out of memory, let the rmdir fail
- (as non-empty, unfortunately) */
- goto err;
- }
+ /* TODO: neater interface needed below */
+ local->stbuf.ia_type = local->loc.inode->ia_type;
- lookup_local = GF_CALLOC (sizeof (*local), 1,
- gf_dht_mt_dht_local_t);
- if (!lookup_local) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of Memory");
- goto err;
- }
+ gf_uuid_copy(local->gfid, local->loc.inode->gfid);
- lookup_frame->local = lookup_local;
- lookup_local->main_frame = frame;
+ /* Use a different frame or else the rmdir op_ret is
+ * overwritten by that of the selfheal */
- build_ret = dht_build_child_loc (this, &lookup_local->loc,
- &local->loc, trav->d_name);
- if (build_ret != 0)
- goto err;
+ heal_frame = copy_frame(frame);
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s",
- lookup_local->loc.path, src->name);
+ if (heal_frame == NULL) {
+ goto err;
+ }
- LOCK (&frame->lock);
- {
- local->call_cnt++;
- }
- UNLOCK (&frame->lock);
+ heal_local = dht_local_init(heal_frame, &local->loc, NULL, 0);
+ if (!heal_local) {
+ DHT_STACK_DESTROY(heal_frame);
+ goto err;
+ }
+
+ heal_local->inode = inode_ref(local->loc.inode);
+ heal_local->main_frame = frame;
+ gf_uuid_copy(heal_local->gfid, local->loc.inode->gfid);
+
+ dht_selfheal_restore(heal_frame, dht_rmdir_selfheal_cbk,
+ &heal_local->loc, heal_local->layout);
+ return 0;
+ } else {
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->preparent, 0);
- STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk,
- src, src->fops->lookup,
- &lookup_local->loc, NULL);
- ret++;
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+
+ dht_rmdir_unlock(frame, this);
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
}
+ }
+
+ return 0;
- return ret;
err:
- DHT_STACK_DESTROY (lookup_frame);
- return 0;
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, NULL, NULL,
+ NULL);
+ return 0;
}
+static int
+dht_rmdir_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
-int
-dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *entries)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = -1;
- call_frame_t *prev = NULL;
- xlator_t *src = NULL;
- int ret = 0;
-
- local = frame->local;
- prev = cookie;
- src = prev->this;
-
- if (op_ret > 2) {
- ret = dht_rmdir_is_subvol_empty (frame, this, entries, src);
-
- switch (ret) {
- case 0: /* non linkfiles exist */
- gf_log (this->name, GF_LOG_TRACE,
- "readdir on %s for %s returned %d entries",
- prev->this->name, local->loc.path, op_ret);
- local->op_ret = -1;
- local->op_errno = ENOTEMPTY;
- break;
- default:
- /* @ret number of linkfiles are getting unlinked */
- gf_log (this->name, GF_LOG_TRACE,
- "readdir on %s for %s found %d linkfiles",
- prev->this->name, local->loc.path, ret);
- break;
- }
- }
+static int
+dht_rmdir_unlock(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ int lock_count = 0;
- this_call_cnt = dht_frame_return (frame);
+ local = frame->local;
- if (is_last_call (this_call_cnt)) {
- dht_rmdir_do (frame, this);
- }
+ /* Unlock entrylk */
+ dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns);
- return 0;
-}
+ /* Unlock inodelk */
+ lock_count = dht_lock_count(local->lock[0].ns.parent_layout.locks,
+ local->lock[0].ns.parent_layout.lk_count);
+ if (lock_count == 0)
+ goto done;
-int
-dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd)
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL)
+ goto done;
+
+ lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+ lock_frame->root->op);
+ if (lock_local == NULL)
+ goto done;
+
+ lock_local->lock[0].ns.parent_layout.locks = local->lock[0]
+ .ns.parent_layout.locks;
+ lock_local->lock[0]
+ .ns.parent_layout.lk_count = local->lock[0].ns.parent_layout.lk_count;
+
+ local->lock[0].ns.parent_layout.locks = NULL;
+ local->lock[0].ns.parent_layout.lk_count = 0;
+ dht_unlock_inodelk(lock_frame, lock_local->lock[0].ns.parent_layout.locks,
+ lock_local->lock[0].ns.parent_layout.lk_count,
+ dht_rmdir_unlock_cbk);
+ lock_frame = NULL;
+
+done:
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
+
+ return 0;
+}
+
+static int
+dht_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = -1;
- call_frame_t *prev = NULL;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ int done = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_local_t *heal_local = NULL;
+ call_frame_t *heal_frame = NULL;
+ int ret = -1;
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if ((op_errno != ENOENT) && (op_errno != ESTALE)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ if (op_errno != EACCES)
+ local->need_selfheal = 1;
+ }
- local = frame->local;
- prev = cookie;
+ gf_uuid_unparse(local->loc.gfid, gfid);
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "opendir on %s for %s failed (%s)",
- prev->this->name, local->loc.path,
- strerror (op_errno));
- goto err;
- }
+ gf_msg_debug(this->name, op_errno,
+ "rmdir on %s for %s failed."
+ "(gfid = %s)",
+ prev->name, local->loc.path, gfid);
+ goto unlock;
+ }
- STACK_WIND (frame, dht_rmdir_readdirp_cbk,
- prev->this, prev->this->fops->readdirp,
- local->fd, 4096, 0);
+ /* Track if rmdir succeeded on at least one subvol*/
+ local->fop_succeeded = 1;
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+ }
+unlock:
+ UNLOCK(&frame->lock);
- return 0;
+ this_call_cnt = dht_frame_return(frame);
-err:
- this_call_cnt = dht_frame_return (frame);
+ /* if local->hashed_subvol, we are yet to wind to hashed_subvol. */
+ if (local->hashed_subvol && (this_call_cnt == 1)) {
+ done = 1;
+ } else if (!local->hashed_subvol && !this_call_cnt) {
+ done = 1;
+ }
- if (is_last_call (this_call_cnt)) {
- dht_rmdir_do (frame, this);
- }
+ if (done) {
+ if (local->need_selfheal && local->fop_succeeded) {
+ dht_rmdir_unlock(frame, this);
+ local->layout = dht_layout_get(this, local->loc.inode);
- return 0;
-}
+ /* TODO: neater interface needed below */
+ local->stbuf.ia_type = local->loc.inode->ia_type;
+ gf_uuid_copy(local->gfid, local->loc.inode->gfid);
+ heal_frame = copy_frame(frame);
+ if (heal_frame == NULL) {
+ goto err;
+ }
-int
-dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags)
-{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- int i = -1;
- int ret = -1;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- conf = this->private;
-
- local = dht_local_init (frame);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->call_cnt = conf->subvolume_cnt;
- local->op_ret = 0;
-
- ret = loc_copy (&local->loc, loc);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->flags = flags;
-
- local->fd = fd_create (local->loc.inode, frame->root->pid);
- if (!local->fd) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_rmdir_opendir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->opendir,
- loc, local->fd);
- }
-
- return 0;
+ heal_local = dht_local_init(heal_frame, &local->loc, NULL, 0);
+ if (!heal_local) {
+ DHT_STACK_DESTROY(heal_frame);
+ goto err;
+ }
+
+ heal_local->inode = inode_ref(local->loc.inode);
+ heal_local->main_frame = frame;
+ gf_uuid_copy(heal_local->gfid, local->loc.inode->gfid);
+ ret = dht_selfheal_restore(heal_frame, dht_rmdir_selfheal_cbk,
+ &heal_local->loc, heal_local->layout);
+ if (ret) {
+ DHT_STACK_DESTROY(heal_frame);
+ goto err;
+ }
+
+ } else if (this_call_cnt) {
+ /* If non-hashed subvol's have responded, proceed */
+ if (local->op_ret == 0) {
+ /* Delete the dir from the hashed subvol if:
+ * The fop succeeded on at least one subvol
+ * and did not fail on any
+ * or
+ * The fop failed with ENOENT/ESTALE on
+ * all subvols */
+
+ STACK_WIND_COOKIE(frame, dht_rmdir_hashed_subvol_cbk,
+ local->hashed_subvol, local->hashed_subvol,
+ local->hashed_subvol->fops->rmdir,
+ &local->loc, local->flags, NULL);
+ } else {
+ /* hashed-subvol was non-NULL and rmdir failed on
+ * all non hashed-subvols. Unwind rmdir with
+ * local->op_ret and local->op_errno. */
+ dht_rmdir_unlock(frame, this);
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (rmdir, frame, -1, op_errno,
- NULL, NULL);
+ return 0;
+ }
+ } else if (!this_call_cnt) {
+ /* All subvol's have responded, proceed */
- return 0;
-}
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->preparent, 0);
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
-int
-dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
-{
- DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict);
- return 0;
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+
+ dht_rmdir_unlock(frame, this);
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
+ }
+ }
+
+ return 0;
+
+err:
+ DHT_STACK_UNWIND(rmdir, frame, -1, local->op_errno, NULL, NULL, NULL);
+ return 0;
}
+static int
+dht_rmdir_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *hashed_subvol;
-int
-dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t flags, dict_t *dict)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->inode = inode_ref (loc->inode);
- local->call_cnt = 1;
-
- STACK_WIND (frame,
- dht_xattrop_cbk,
- subvol, subvol->fops->xattrop,
- loc, flags, dict);
-
- return 0;
+ conf = this->private;
+ local = frame->local;
+
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+ "acquiring entrylk after inodelk failed rmdir for %s)",
+ local->loc.path);
+
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ hashed_subvol = local->hashed_subvol;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (hashed_subvol && (hashed_subvol == conf->subvolumes[i]))
+ continue;
+
+ STACK_WIND_COOKIE(frame, dht_rmdir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i], conf->subvolumes[i]->fops->rmdir,
+ &local->loc, local->flags, NULL);
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL);
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
- return 0;
+ return 0;
}
+static int
+dht_rmdir_do(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ xlator_t *hashed_subvol = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame->local, err);
+ local = frame->local;
+ VALIDATE_OR_GOTO(this->private, out);
+ conf = this->private;
+
+ if (local->op_ret == -1)
+ goto out;
+
+ local->call_cnt = conf->subvolume_cnt;
+
+ /* first remove from non-hashed_subvol */
+ hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+
+ if (!hashed_subvol) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for %s (gfid = %s)",
+ local->loc.path, gfid);
+ } else {
+ local->hashed_subvol = hashed_subvol;
+ }
+
+ /* When DHT has only 1 child */
+ if (conf->subvolume_cnt == 1) {
+ STACK_WIND_COOKIE(frame, dht_rmdir_hashed_subvol_cbk,
+ conf->subvolumes[0], conf->subvolumes[0],
+ conf->subvolumes[0]->fops->rmdir, &local->loc,
+ local->flags, NULL);
+ return 0;
+ }
-int
-dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
+ local->current = &local->lock[0];
+ ret = dht_protect_namespace(frame, &local->loc, local->hashed_subvol,
+ &local->current->ns, dht_rmdir_lock_cbk);
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = errno ? errno : EINVAL;
+ goto out;
+ }
+
+ return 0;
+
+out:
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
+ return 0;
+err:
+ DHT_STACK_UNWIND(rmdir, frame, -1, EINVAL, NULL, NULL, NULL);
+ return 0;
+}
+
+static void
+dht_rmdir_readdirp_done(call_frame_t *readdirp_frame, xlator_t *this)
{
- DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict);
- return 0;
+ call_frame_t *main_frame = NULL;
+ dht_local_t *main_local = NULL;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+
+ local = readdirp_frame->local;
+ main_frame = local->main_frame;
+ main_local = main_frame->local;
+
+ /* At least one readdirp failed.
+ * This is a bit hit or miss - if readdirp failed on more than
+ * one subvol, we don't know which error is returned.
+ */
+ if (local->op_ret == -1) {
+ main_local->op_ret = local->op_ret;
+ main_local->op_errno = local->op_errno;
+ }
+
+ this_call_cnt = dht_frame_return(main_frame);
+
+ if (is_last_call(this_call_cnt))
+ dht_rmdir_do(main_frame, this);
+
+ DHT_STACK_DESTROY(readdirp_frame);
}
+/* Keep sending readdirp on the subvol until it returns no more entries
+ * It is possible that not all entries will fit in a single readdirp in
+ * which case the rmdir will keep failing with ENOTEMPTY
+ */
-int
-dht_fxattrop (call_frame_t *frame, xlator_t *this,
- fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict)
+static int
+dht_rmdir_readdirp_do(call_frame_t *readdirp_frame, xlator_t *this)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
+ dht_local_t *local = NULL;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ local = readdirp_frame->local;
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ if (local->op_ret == -1) {
+ /* there is no point doing another readdirp on this
+ * subvol . */
+ dht_rmdir_readdirp_done(readdirp_frame, this);
+ return 0;
+ }
- STACK_WIND (frame,
- dht_fxattrop_cbk,
- subvol, subvol->fops->fxattrop,
- fd, flags, dict);
+ STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk,
+ local->hashed_subvol, local->hashed_subvol,
+ local->hashed_subvol->fops->readdirp, local->fd, 4096, 0,
+ local->xattr);
- return 0;
+ return 0;
+}
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL);
+static int
+dht_rmdir_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *src = NULL;
+ call_frame_t *readdirp_frame = NULL;
+ dht_local_t *readdirp_local = NULL;
+ int this_call_cnt = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ prev = cookie;
+ src = prev;
+
+ readdirp_frame = local->main_frame;
+ readdirp_local = readdirp_frame->local;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ if (op_ret == 0) {
+ gf_msg_trace(this->name, 0, "Unlinked linkfile %s on %s, gfid = %s",
+ local->loc.path, src->name, gfid);
+ } else {
+ if (op_errno != ENOENT) {
+ readdirp_local->op_ret = -1;
+ readdirp_local->op_errno = op_errno;
+ }
+ gf_msg_debug(this->name, op_errno,
+ "Unlink of %s on %s failed. (gfid = %s)", local->loc.path,
+ src->name, gfid);
+ }
+
+ this_call_cnt = dht_frame_return(readdirp_frame);
- return 0;
+ if (is_last_call(this_call_cnt))
+ dht_rmdir_readdirp_do(readdirp_frame, this);
+
+ DHT_STACK_DESTROY(frame);
+ return 0;
}
+static int
+dht_rmdir_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr, struct iatt *parent)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *src = NULL;
+ call_frame_t *readdirp_frame = NULL;
+ dht_local_t *readdirp_local = NULL;
+ int this_call_cnt = 0;
+ dht_conf_t *conf = this->private;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ prev = cookie;
+ src = prev;
+
+ gf_msg_debug(this->name, 0, "dht_rmdir_lookup_cbk %s", local->loc.path);
+
+ readdirp_frame = local->main_frame;
+ readdirp_local = readdirp_frame->local;
+
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_FILE_LOOKUP_FAILED,
+ "lookup failed for %s on %s", local->loc.path, src->name);
+ goto err;
+ }
+
+ if (!check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) {
+ readdirp_local->op_ret = -1;
+ readdirp_local->op_errno = ENOTEMPTY;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR,
+ "%s on %s is not a linkfile (type=0%o, gfid = %s)",
+ local->loc.path, src->name, stbuf->ia_type, gfid);
+ goto err;
+ }
+
+ STACK_WIND_COOKIE(frame, dht_rmdir_linkfile_unlink_cbk, src, src,
+ src->fops->unlink, &local->loc, 0, NULL);
+ return 0;
+err:
-int
-dht_inodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ this_call_cnt = dht_frame_return(readdirp_frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_rmdir_readdirp_do(readdirp_frame, this);
+ }
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
+
+static int
+dht_rmdir_cached_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *parent)
{
- DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno);
- return 0;
+ dht_local_t *local = NULL;
+ xlator_t *src = NULL;
+ call_frame_t *readdirp_frame = NULL;
+ dht_local_t *readdirp_local = NULL;
+ int this_call_cnt = 0;
+ dht_conf_t *conf = this->private;
+ dict_t *xattrs = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ src = local->hashed_subvol;
+
+ /* main_frame here is the readdirp_frame */
+
+ readdirp_frame = local->main_frame;
+ readdirp_local = readdirp_frame->local;
+
+ gf_msg_debug(this->name, 0, "returning for %s ", local->loc.path);
+
+ if (op_ret == 0) {
+ readdirp_local->op_ret = -1;
+ readdirp_local->op_errno = ENOTEMPTY;
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_ERROR,
+ "%s found on cached subvol %s", local->loc.path, src->name);
+ goto err;
+ } else if (op_errno != ENOENT) {
+ readdirp_local->op_ret = -1;
+ readdirp_local->op_errno = op_errno;
+
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_SUBVOL_ERROR,
+ "%s not found on cached subvol %s", local->loc.path, src->name);
+ goto err;
+ }
+
+ xattrs = dict_new();
+ if (!xattrs) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "dict_new failed");
+ goto err;
+ }
+
+ ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s",
+ conf->link_xattr_name);
+ if (xattrs)
+ dict_unref(xattrs);
+ goto err;
+ }
+ STACK_WIND_COOKIE(frame, dht_rmdir_lookup_cbk, src, src, src->fops->lookup,
+ &local->loc, xattrs);
+ if (xattrs)
+ dict_unref(xattrs);
+
+ return 0;
+err:
+
+ this_call_cnt = dht_frame_return(readdirp_frame);
+
+ /* Once all the lookups/unlinks etc have returned, proceed to wind
+ * readdirp on the subvol again until no entries are returned.
+ * This is required if there are more entries than can be returned
+ * in a single readdirp call.
+ */
+
+ if (is_last_call(this_call_cnt))
+ dht_rmdir_readdirp_do(readdirp_frame, this);
+
+ DHT_STACK_DESTROY(frame);
+ return 0;
}
+static int
+dht_rmdir_is_subvol_empty(call_frame_t *frame, xlator_t *this,
+ gf_dirent_t *entries, xlator_t *src)
+{
+ int ret = 0;
+ int build_ret = 0;
+ gf_dirent_t *trav = NULL;
+ call_frame_t *lookup_frame = NULL;
+ dht_local_t *lookup_local = NULL;
+ dht_local_t *local = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = this->private;
+ xlator_t *subvol = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ int count = 0;
+ gf_boolean_t unwind = _gf_false;
+
+ local = frame->local;
+
+ list_for_each_entry(trav, &entries->list, list)
+ {
+ if (strcmp(trav->d_name, ".") == 0)
+ continue;
+ if (strcmp(trav->d_name, "..") == 0)
+ continue;
+ if (check_is_linkfile(NULL, (&trav->d_stat), trav->dict,
+ conf->link_xattr_name)) {
+ count++;
+ continue;
+ }
-int32_t
-dht_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->inode = inode_ref (loc->inode);
- local->call_cnt = 1;
-
- STACK_WIND (frame,
- dht_inodelk_cbk,
- subvol, subvol->fops->inodelk,
- volume, loc, cmd, lock);
-
- return 0;
+ /* this entry is either a directory which is neither "." nor "..",
+ or a non directory which is not a linkfile. the directory is to
+ be treated as non-empty
+ */
+ return 0;
+ }
+
+ xattrs = dict_new();
+ if (!xattrs) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "dict_new failed");
+ return -1;
+ }
+
+ ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s",
+ conf->link_xattr_name);
+
+ if (xattrs)
+ dict_unref(xattrs);
+ return -1;
+ }
+
+ local->call_cnt = count;
+ ret = 0;
+
+ list_for_each_entry(trav, &entries->list, list)
+ {
+ if (strcmp(trav->d_name, ".") == 0)
+ continue;
+ if (strcmp(trav->d_name, "..") == 0)
+ continue;
+
+ lookup_frame = copy_frame(frame);
+
+ if (!lookup_frame) {
+ /* out of memory, let the rmdir fail
+ (as non-empty, unfortunately) */
+ goto err;
+ }
+
+ lookup_local = dht_local_init(lookup_frame, NULL, NULL, GF_FOP_LOOKUP);
+ if (!lookup_local) {
+ goto err;
+ }
+
+ lookup_frame->local = lookup_local;
+ lookup_local->main_frame = frame;
+ lookup_local->hashed_subvol = src;
+
+ build_ret = dht_build_child_loc(this, &lookup_local->loc, &local->loc,
+ trav->d_name);
+ if (build_ret != 0)
+ goto err;
+
+ gf_uuid_copy(lookup_local->loc.gfid, trav->d_stat.ia_gfid);
+
+ gf_uuid_unparse(lookup_local->loc.gfid, gfid);
+
+ gf_msg_trace(this->name, 0, "looking up %s on subvolume %s, gfid = %s",
+ lookup_local->loc.path, src->name, gfid);
+ subvol = dht_linkfile_subvol(this, NULL, &trav->d_stat, trav->dict);
+ if (!subvol || (subvol == src)) {
+ /* we need to delete the linkto file if it does not have a
+ * valid subvol or it points to itself.
+ */
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_INVALID_LINKFILE,
+ "Linkfile does not have link subvolume. "
+ "path = %s, gfid = %s",
+ lookup_local->loc.path, gfid);
+
+ gf_msg_debug(this->name, 0, "looking up %s on subvol %s, gfid = %s",
+ lookup_local->loc.path, src->name, gfid);
+
+ STACK_WIND_COOKIE(lookup_frame, dht_rmdir_lookup_cbk, src, src,
+ src->fops->lookup, &lookup_local->loc, xattrs);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Looking up linkfile target %s on "
+ " subvol %s, gfid = %s",
+ lookup_local->loc.path, subvol->name, gfid);
+
+ STACK_WIND(lookup_frame, dht_rmdir_cached_lookup_cbk, subvol,
+ subvol->fops->lookup, &lookup_local->loc, xattrs);
+ }
+ ret++;
+
+ lookup_frame = NULL;
+ lookup_local = NULL;
+ }
+
+ if (xattrs)
+ dict_unref(xattrs);
+
+ return ret;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (inodelk, frame, -1, op_errno);
+ if (xattrs)
+ dict_unref(xattrs);
- return 0;
-}
+ if (lookup_frame)
+ DHT_STACK_DESTROY(lookup_frame);
+ /* Handle the case where the wound calls have unwound before the
+ * loop processing is done
+ */
-int
-dht_finodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ LOCK(&frame->lock);
+ {
+ local->op_ret = -1;
+ local->op_errno = ENOTEMPTY;
-{
- DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno);
- return 0;
+ local->call_cnt -= (count - ret);
+ if (!local->call_cnt)
+ unwind = _gf_true;
+ }
+ UNLOCK(&frame->lock);
+
+ if (!unwind) {
+ return ret;
+ }
+ return 0;
}
+/*
+ * No more entries on this subvol. Proceed to the actual rmdir operation.
+ */
-int
-dht_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock)
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *src = NULL;
+ int ret = 0;
+ char *path = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ src = prev;
+
+ if (op_ret > 2) {
+ /* dht_rmdir_is_subvol_empty() may free the frame,
+ * copy path for logging.
+ */
+ path = gf_strdup(local->loc.path);
+
+ ret = dht_rmdir_is_subvol_empty(frame, this, entries, src);
+
+ switch (ret) {
+ case 0: /* non linkfiles exist */
+ gf_msg_trace(this->name, 0,
+ "readdir on %s for %s returned %d "
+ "entries",
+ prev->name, local->loc.path, op_ret);
+ local->op_ret = -1;
+ local->op_errno = ENOTEMPTY;
+ break;
+ default:
+ /* @ret number of linkfiles are getting unlinked */
+ gf_msg_trace(this->name, 0,
+ "readdir on %s for %s found %d "
+ "linkfiles",
+ prev->name, path, ret);
+ break;
+ }
+ }
+
+ /* readdirp failed or no linkto files were found on this subvol */
+ if (!ret)
+ dht_rmdir_readdirp_done(frame, this);
+
+ GF_FREE(path);
+ return 0;
+}
+
+static int
+dht_rmdir_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ xlator_t *prev = NULL;
+ int ret = 0;
+ dht_conf_t *conf = this->private;
+ dict_t *dict = NULL;
+ int i = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_local_t *readdirp_local = NULL;
+ call_frame_t *readdirp_frame = NULL;
+ int cnt = 0;
+
+ local = frame->local;
+ prev = cookie;
+
+ this_call_cnt = dht_frame_return(frame);
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ gf_msg_debug(this->name, op_errno,
+ "opendir on %s for %s failed, "
+ "gfid = %s,",
+ prev->name, local->loc.path, gfid);
+ if ((op_errno != ENOENT) && (op_errno != ESTALE)) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ }
+ goto err;
+ }
+
+ if (!is_last_call(this_call_cnt))
+ return 0;
+
+ if (local->op_ret == -1)
+ goto err;
+
+ fd_bind(fd);
+
+ dict = dict_new();
+ if (!dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto err;
+ }
+
+ ret = dict_set_uint32(dict, conf->link_xattr_name, 256);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: Failed to set dictionary value:key = %s", local->loc.path,
+ conf->link_xattr_name);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ cnt = local->call_cnt = conf->subvolume_cnt;
+
+ /* Create a separate frame per subvol as we might need
+ * to resend readdirp multiple times to get all the
+ * entries.
+ */
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ readdirp_frame = copy_frame(frame);
+
+ if (!readdirp_frame) {
+ cnt--;
+ /* Reduce the local->call_cnt as well */
+ (void)dht_frame_return(frame);
+ continue;
+ }
+
+ readdirp_local = dht_local_init(readdirp_frame, &local->loc, local->fd,
+ 0);
+
+ if (!readdirp_local) {
+ DHT_STACK_DESTROY(readdirp_frame);
+ cnt--;
+ /* Reduce the local->call_cnt as well */
+ dht_frame_return(frame);
+ continue;
+ }
+ readdirp_local->main_frame = frame;
+ readdirp_local->op_ret = 0;
+ readdirp_local->xattr = dict_ref(dict);
+ /* overload this field to save the subvol info */
+ readdirp_local->hashed_subvol = conf->subvolumes[i];
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->readdirp,
+ readdirp_local->fd, 4096, 0, readdirp_local->xattr);
+ }
+ if (dict)
+ dict_unref(dict);
- STACK_WIND (frame,
- dht_finodelk_cbk,
- subvol, subvol->fops->finodelk,
- volume, fd, cmd, lock);
+ /* Could not wind readdirp to any subvol */
- return 0;
+ if (!cnt)
+ goto err;
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (finodelk, frame, -1, op_errno);
+ if (is_last_call(this_call_cnt)) {
+ dht_rmdir_do(frame, this);
+ }
- return 0;
+ return 0;
}
-
int
-dht_entrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-
+dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
- DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno);
- return 0;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+ int ret = -1;
+ dict_t *xattr_req = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_RMDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->call_cnt = conf->subvolume_cnt;
+ local->op_ret = 0;
+ local->fop_succeeded = 0;
+
+ local->flags = flags;
+
+ local->fd = fd_create(local->loc.inode, frame->root->pid);
+ if (!local->fd) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (flags) {
+ return dht_rmdir_do(frame, this);
+ }
+ if (xdata) {
+ xattr_req = dict_ref(xdata);
+ } else {
+ xattr_req = dict_new();
+ }
+ if (xattr_req) {
+ ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+ /* If parallel-readdir is enabled, this is required
+ * to handle stale linkto files in the directory
+ * being deleted. If this fails, log an error but
+ * do not prevent the operation.
+ */
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s",
+ loc->path, conf->link_xattr_name);
+ }
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s",
+ loc->path, conf->link_xattr_name);
+ }
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_rmdir_opendir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->opendir, loc, local->fd,
+ xattr_req);
+ }
+
+ if (xattr_req) {
+ dict_unref(xattr_req);
+ }
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
}
+static int
+dht_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ DHT_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+/* TODO
+ * Sending entrylk to cached subvol can result in stale lock
+ * as described in the bug 1311002.
+ */
int
-dht_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->inode = inode_ref (loc->inode);
- local->call_cnt = 1;
-
- STACK_WIND (frame, dht_entrylk_cbk,
- subvol, subvol->fops->entrylk,
- volume, loc, basename, cmd, type);
-
- return 0;
+dht_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_ENTRYLK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_uuid_unparse(loc->gfid, gfid);
+
+ gf_msg_debug(this->name, 0,
+ "no cached subvolume for path=%s, "
+ "gfid = %s",
+ loc->path, gfid);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->call_cnt = 1;
+
+ STACK_WIND(frame, dht_entrylk_cbk, subvol, subvol->fops->entrylk, volume,
+ loc, basename, cmd, type, xdata);
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (entrylk, frame, -1, op_errno);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(entrylk, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
-int
-dht_fentrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+static int
+dht_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno);
- return 0;
+ DHT_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, NULL);
+ return 0;
}
-
int
-dht_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
+dht_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(fd->inode, err);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ gf_uuid_unparse(fd->inode->gfid, gfid);
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ subvol = dht_subvol_get_cached(this, fd->inode);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0,
+ "No cached subvolume for fd=%p,"
+ " gfid = %s",
+ fd, gfid);
+ op_errno = EINVAL;
+ goto err;
+ }
- STACK_WIND (frame, dht_fentrylk_cbk,
- subvol, subvol->fops->fentrylk,
- volume, fd, basename, cmd, type);
+ STACK_WIND(frame, dht_fentrylk_cbk, subvol, subvol->fops->fentrylk, volume,
+ fd, basename, cmd, type, xdata);
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fentrylk, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
+static int32_t
+dht_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
-int
-dht_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *statpre,
- struct iatt *statpost)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
-
- dht_iatt_merge (this, &local->prebuf, statpre, prev->this);
- dht_iatt_merge (this, &local->stbuf, statpost, prev->this);
-
- if (local->inode) {
- local->prebuf.ia_ino = local->inode->ino;
- local->stbuf.ia_ino = local->inode->ino;
- }
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
- local->op_ret = 0;
- }
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret < 0 && op_errno != ENOTCONN) {
+ local->op_errno = op_errno;
+ goto unlock;
+ }
+ local->op_ret = 0;
+ }
unlock:
- UNLOCK (&frame->lock);
+ UNLOCK(&frame->lock);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno,
- &local->prebuf, &local->stbuf);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ DHT_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno, NULL);
+ }
- return 0;
+out:
+ return 0;
}
+int32_t
+dht_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+ dht_conf_t *conf = NULL;
+ int call_cnt = 0;
+ int i = 0;
-int
-dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid)
-{
- dht_layout_t *layout = NULL;
- dht_local_t *local = NULL;
- int op_errno = -1;
- int i = -1;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_DEBUG,
- "memory allocation failed :(");
- goto err;
- }
-
- local->layout = layout = dht_layout_get (this, loc->inode);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no layout for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- if (!layout_is_sane (layout)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "layout is not sane for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- local->inode = inode_ref (loc->inode);
- local->call_cnt = layout->cnt;
-
- for (i = 0; i < layout->cnt; i++) {
- STACK_WIND (frame, dht_setattr_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->setattr,
- loc, stbuf, valid);
- }
-
- return 0;
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL);
+ if (op != GF_IPC_TARGET_UPCALL)
+ goto wind_default;
- return 0;
-}
+ VALIDATE_OR_GOTO(this->private, err);
+ conf = this->private;
+ local = dht_local_init(frame, NULL, NULL, GF_FOP_IPC);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
-int
-dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
- int32_t valid)
-{
- dht_layout_t *layout = NULL;
- dht_local_t *local = NULL;
- int op_errno = -1;
- int i = -1;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->layout = layout = dht_layout_get (this, fd->inode);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no layout for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
- if (!layout_is_sane (layout)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "layout is not sane for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
- local->inode = inode_ref (fd->inode);
- local->call_cnt = layout->cnt;
-
- for (i = 0; i < layout->cnt; i++) {
- STACK_WIND (frame, dht_setattr_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->fsetattr,
- fd, stbuf, valid);
- }
-
- return 0;
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ if (xdata) {
+ if (dict_set_int8(xdata, conf->xattr_name, 0) < 0)
+ goto err;
+ }
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND(frame, dht_ipc_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->ipc, op, xdata);
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL);
+ DHT_STACK_UNWIND(ipc, frame, -1, op_errno, NULL);
- return 0;
-}
+ return 0;
+wind_default:
+ STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ipc, op, xdata);
+ return 0;
+}
int
-dht_forget (xlator_t *this, inode_t *inode)
+dht_forget(xlator_t *this, inode_t *inode)
{
- uint64_t tmp_layout = 0;
- dht_layout_t *layout = NULL;
+ uint64_t ctx_int = 0;
+ dht_inode_ctx_t *ctx = NULL;
+ dht_layout_t *layout = NULL;
+
+ inode_ctx_del(inode, this, &ctx_int);
- inode_ctx_get (inode, this, &tmp_layout);
+ if (!ctx_int)
+ return 0;
- if (!tmp_layout)
- return 0;
+ ctx = (dht_inode_ctx_t *)(long)ctx_int;
- layout = (dht_layout_t *)(long)tmp_layout;
- dht_layout_unref (this, layout);
+ layout = ctx->layout;
+ ctx->layout = NULL;
+ dht_layout_unref(this, layout);
+ GF_FREE(ctx);
- return 0;
+ return 0;
}
+int
+dht_notify(xlator_t *this, int event, void *data, ...)
+{
+ xlator_t *subvol = NULL;
+ int cnt = -1;
+ int i = -1;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int propagate = 0;
+
+ int had_heard_from_all = 0;
+ int have_heard_from_all = 0;
+ gf_defrag_info_t *defrag = NULL;
+ dict_t *dict = NULL;
+ gf_defrag_type cmd = 0;
+ dict_t *output = NULL;
+ va_list ap;
+ struct gf_upcall *up_data = NULL;
+ struct gf_upcall_cache_invalidation *up_ci = NULL;
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+ /* had all subvolumes reported status once till now? */
+ had_heard_from_all = 1;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->last_event[i]) {
+ had_heard_from_all = 0;
+ }
+ }
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ subvol = data;
-int
-dht_init_subvolumes (xlator_t *this, dht_conf_t *conf)
-{
- xlator_list_t *subvols = NULL;
- int cnt = 0;
-
- if (!conf)
- return -1;
-
- for (subvols = this->children; subvols; subvols = subvols->next)
- cnt++;
-
- conf->subvolumes = GF_CALLOC (cnt, sizeof (xlator_t *),
- gf_dht_mt_xlator_t);
- if (!conf->subvolumes) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- return -1;
- }
- conf->subvolume_cnt = cnt;
-
- cnt = 0;
- for (subvols = this->children; subvols; subvols = subvols->next)
- conf->subvolumes[cnt++] = subvols->xlator;
-
- conf->subvolume_status = GF_CALLOC (cnt, sizeof (char),
- gf_dht_mt_char);
- if (!conf->subvolume_status) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- return -1;
- }
-
- conf->last_event = GF_CALLOC (cnt, sizeof (int),
- gf_dht_mt_char);
- if (!conf->last_event) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- return -1;
- }
- return 0;
-}
+ conf->gen++;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ cnt = i;
+ break;
+ }
+ }
-int
-dht_notify (xlator_t *this, int event, void *data, ...)
-{
- xlator_t *subvol = NULL;
- int cnt = -1;
- int i = -1;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int propagate = 0;
+ if (cnt == -1) {
+ gf_msg_debug(this->name, 0,
+ "got GF_EVENT_CHILD_UP bad "
+ "subvolume %s",
+ subvol->name);
+ break;
+ }
- int had_heard_from_all = 0;
- int have_heard_from_all = 0;
+ LOCK(&conf->subvolume_lock);
+ {
+ conf->subvolume_status[cnt] = 1;
+ conf->last_event[cnt] = event;
+ conf->subvol_up_time[cnt] = gf_time();
+ }
+ UNLOCK(&conf->subvolume_lock);
+ /* one of the node came back up, do a stat update */
+ dht_get_du_info_for_subvol(this, cnt);
- conf = this->private;
- if (!conf)
- return ret;
+ break;
- /* had all subvolumes reported status once till now? */
- had_heard_from_all = 1;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!conf->last_event[i]) {
- had_heard_from_all = 0;
+ case GF_EVENT_SOME_DESCENDENT_UP:
+ subvol = data;
+ conf->gen++;
+ propagate = 1;
+
+ break;
+
+ case GF_EVENT_SOME_DESCENDENT_DOWN:
+ subvol = data;
+ propagate = 1;
+
+ break;
+
+ case GF_EVENT_CHILD_DOWN:
+ subvol = data;
+
+ if (conf->assert_no_child_down) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_CHILD_DOWN,
+ "Received CHILD_DOWN. Exiting");
+ if (conf->defrag) {
+ gf_defrag_stop(conf, GF_DEFRAG_STATUS_FAILED, NULL);
+ } else {
+ kill(getpid(), SIGTERM);
}
- }
+ }
- switch (event) {
- case GF_EVENT_CHILD_UP:
- subvol = data;
-
- conf->gen++;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (subvol == conf->subvolumes[i]) {
- cnt = i;
- break;
- }
- }
-
- if (cnt == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "got GF_EVENT_CHILD_UP bad subvolume %s",
- subvol->name);
- break;
- }
-
- LOCK (&conf->subvolume_lock);
- {
- conf->subvolume_status[cnt] = 1;
- conf->last_event[cnt] = event;
- }
- UNLOCK (&conf->subvolume_lock);
-
- /* one of the node came back up, do a stat update */
- dht_get_du_info_for_subvol (this, cnt);
-
- break;
-
- case GF_EVENT_CHILD_DOWN:
- subvol = data;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (subvol == conf->subvolumes[i]) {
- cnt = i;
- break;
- }
- }
-
- if (cnt == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "got GF_EVENT_CHILD_DOWN bad subvolume %s",
- subvol->name);
- break;
- }
-
- LOCK (&conf->subvolume_lock);
- {
- conf->subvolume_status[cnt] = 0;
- conf->last_event[cnt] = event;
- }
- UNLOCK (&conf->subvolume_lock);
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ cnt = i;
+ break;
+ }
+ }
+ if (cnt == -1) {
+ gf_msg_debug(this->name, 0,
+ "got GF_EVENT_CHILD_DOWN bad "
+ "subvolume %s",
+ subvol->name);
break;
+ }
+
+ LOCK(&conf->subvolume_lock);
+ {
+ conf->subvolume_status[cnt] = 0;
+ conf->last_event[cnt] = event;
+ conf->subvol_up_time[cnt] = 0;
+ }
+ UNLOCK(&conf->subvolume_lock);
+
+ for (i = 0; i < conf->subvolume_cnt; i++)
+ if (conf->last_event[i] != event)
+ event = GF_EVENT_SOME_DESCENDENT_DOWN;
+ break;
+
+ case GF_EVENT_CHILD_CONNECTING:
+ subvol = data;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ cnt = i;
+ break;
+ }
+ }
- case GF_EVENT_CHILD_CONNECTING:
- subvol = data;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (subvol == conf->subvolumes[i]) {
- cnt = i;
- break;
- }
- }
-
- if (cnt == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "got GF_EVENT_CHILD_CONNECTING bad subvolume %s",
- subvol->name);
- break;
- }
-
- LOCK (&conf->subvolume_lock);
- {
- conf->last_event[cnt] = event;
- }
- UNLOCK (&conf->subvolume_lock);
-
- break;
- default:
- propagate = 1;
+ if (cnt == -1) {
+ gf_msg_debug(this->name, 0,
+ "got GF_EVENT_CHILD_CONNECTING"
+ " bad subvolume %s",
+ subvol->name);
break;
- }
+ }
+
+ LOCK(&conf->subvolume_lock);
+ {
+ conf->last_event[cnt] = event;
+ }
+ UNLOCK(&conf->subvolume_lock);
+
+ break;
+ case GF_EVENT_VOLUME_DEFRAG: {
+ if (!conf->defrag) {
+ return ret;
+ }
+ defrag = conf->defrag;
+ dict = data;
+ va_start(ap, data);
+ output = va_arg(ap, dict_t *);
+
+ ret = dict_get_int32(dict, "rebalance-command", (int32_t *)&cmd);
+ if (ret) {
+ va_end(ap);
+ return ret;
+ }
+ LOCK(&defrag->lock);
+ {
+ if (defrag->is_exiting)
+ goto unlock;
+ if ((cmd == GF_DEFRAG_CMD_STATUS) ||
+ (cmd == GF_DEFRAG_CMD_DETACH_STATUS))
+ gf_defrag_status_get(conf, output);
+ else if (cmd == GF_DEFRAG_CMD_DETACH_START)
+ defrag->cmd = GF_DEFRAG_CMD_DETACH_START;
+ else if (cmd == GF_DEFRAG_CMD_STOP ||
+ cmd == GF_DEFRAG_CMD_DETACH_STOP)
+ gf_defrag_stop(conf, GF_DEFRAG_STATUS_STOPPED, output);
+ }
+ unlock:
+ UNLOCK(&defrag->lock);
+ va_end(ap);
+ return ret;
+ break;
+ }
+ case GF_EVENT_UPCALL:
+ up_data = (struct gf_upcall *)data;
+ if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
+ break;
+ up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
+
+ /* Since md-cache will be aggressively filtering lookups,
+ * the stale layout issue will be more pronounced. Hence
+ * when a layout xattr is changed by the rebalance process
+ * notify all the md-cache clients to invalidate the existing
+ * stat cache and send the lookup next time*/
+ if (up_ci->dict && dict_get(up_ci->dict, conf->xattr_name))
+ up_ci->flags |= UP_EXPLICIT_LOOKUP;
+
+ /* TODO: Instead of invalidating iatt, update the new
+ * hashed/cached subvolume in dht inode_ctx */
+ if (IS_DHT_LINKFILE_MODE(&up_ci->stat))
+ up_ci->flags |= UP_EXPLICIT_LOOKUP;
+
+ propagate = 1;
+ break;
+ default:
+ propagate = 1;
+ break;
+ }
+
+ /* have all subvolumes reported status once by now? */
+ have_heard_from_all = 1;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->last_event[i])
+ have_heard_from_all = 0;
+ }
+
+ /* if all subvols have reported status, no need to hide anything
+ or wait for anything else. Just propagate blindly */
+ if (have_heard_from_all) {
+ propagate = 1;
+ }
+
+ if (!had_heard_from_all && have_heard_from_all) {
+ static int run_defrag = 0;
+ /* This is the first event which completes aggregation
+ of events from all subvolumes. If at least one subvol
+ had come up, propagate CHILD_UP, but only this time
+ */
+ event = GF_EVENT_CHILD_DOWN;
- /* have all subvolumes reported status once by now? */
- have_heard_from_all = 1;
for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!conf->last_event[i])
- have_heard_from_all = 0;
+ if (conf->last_event[i] == GF_EVENT_CHILD_UP) {
+ event = GF_EVENT_CHILD_UP;
+ break;
+ }
+
+ if (conf->last_event[i] == GF_EVENT_CHILD_CONNECTING) {
+ event = GF_EVENT_CHILD_CONNECTING;
+ /* continue to check other events for CHILD_UP */
+ }
}
- /* if all subvols have reported status, no need to hide anything
- or wait for anything else. Just propagate blindly */
- if (have_heard_from_all)
- propagate = 1;
+ /* Rebalance is started with assert_no_child_down. So we do
+ * not need to handle CHILD_DOWN event here.
+ *
+ * If there is a graph switch, we should not restart the
+ * rebalance daemon. Use 'run_defrag' to indicate if the
+ * thread has already started.
+ */
+ if (conf->defrag && !run_defrag) {
+ run_defrag = 1;
+ ret = gf_thread_create(&conf->defrag->th, NULL, gf_defrag_start,
+ this, "dhtdg");
+ if (ret) {
+ GF_FREE(conf->defrag);
+ conf->defrag = NULL;
+ kill(getpid(), SIGTERM);
+ }
+ }
+ }
- if (!had_heard_from_all && have_heard_from_all) {
- /* This is the first event which completes aggregation
- of events from all subvolumes. If at least one subvol
- had come up, propagate CHILD_UP, but only this time
- */
- event = GF_EVENT_CHILD_DOWN;
+ ret = 0;
+ if (propagate)
+ ret = default_notify(this, event, data);
+out:
+ return ret;
+}
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->last_event[i] == GF_EVENT_CHILD_UP) {
- event = GF_EVENT_CHILD_UP;
- break;
- }
+int
+dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, dht_layout_t **layout)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- if (conf->last_event[i] == GF_EVENT_CHILD_CONNECTING) {
- event = GF_EVENT_CHILD_CONNECTING;
- /* continue to check other events for CHILD_UP */
- }
- }
+ ret = dht_inode_ctx_get(inode, this, &ctx);
+
+ if (!ret && ctx) {
+ if (ctx->layout) {
+ if (layout)
+ *layout = ctx->layout;
+ ret = 0;
+ } else {
+ ret = -1;
}
+ }
+
+ return ret;
+}
+
+void
+dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc,
+ dht_layout_t *layout)
+{
+ char string[2048] = {0};
+ char *output_string = NULL;
+ int len = 0;
+ int off = 0;
+ int i = 0;
+ gf_loglevel_t log_level = gf_log_get_loglevel();
+ int ret = 0;
+
+ if (log_level < GF_LOG_INFO)
+ return;
+
+ if (!layout)
+ return;
+
+ if (!layout->cnt)
+ return;
+
+ if (!loc)
+ return;
+
+ if (!loc->path)
+ return;
+
+ ret = snprintf(string, sizeof(string), "Setting layout of %s with ",
+ loc->path);
+
+ if (ret < 0)
+ return;
+
+ len += ret;
+
+ /* Calculation of total length of the string required to calloc
+ * output_string. Log includes subvolume-name, start-range, end-range
+ * and err value.
+ *
+ * This log will help to debug cases where:
+ * a) Different processes set different layout of a directory.
+ * b) Error captured in lookup, which will be filled in layout->err
+ * (like ENOENT, ESTALE etc)
+ */
+
+ for (i = 0; i < layout->cnt; i++) {
+ ret = snprintf(string, sizeof(string),
+ "[Subvol_name: %s, Err: %d , Start: "
+ "0x%x, Stop: 0x%x, Hash: 0x%x], ",
+ layout->list[i].xlator->name, layout->list[i].err,
+ layout->list[i].start, layout->list[i].stop,
+ layout->list[i].commit_hash);
+
+ if (ret < 0)
+ return;
+
+ len += ret;
+ }
+
+ len++;
+
+ output_string = GF_MALLOC(len + 1, gf_common_mt_char);
+
+ if (!output_string)
+ return;
+
+ ret = snprintf(output_string, len + 1, "Setting layout of %s with ",
+ loc->path);
+
+ if (ret < 0)
+ goto err;
+
+ off += ret;
+
+ for (i = 0; i < layout->cnt; i++) {
+ ret = snprintf(output_string + off, len - off,
+ "[Subvol_name: %s, Err: %d , Start: "
+ "0x%x, Stop: 0x%x, Hash: 0x%x], ",
+ layout->list[i].xlator->name, layout->list[i].err,
+ layout->list[i].start, layout->list[i].stop,
+ layout->list[i].commit_hash);
+
+ if (ret < 0)
+ goto err;
+
+ off += ret;
+ }
+
+ gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_LOG_FIXED_LAYOUT, "%s",
+ output_string);
+
+err:
+ GF_FREE(output_string);
+}
+
+int32_t
+dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local)
+{
+ int ret = -1;
+
+ if (!local)
+ goto out;
+
+ local->rebalance.target_node = dht_subvol_get_hashed(this, &local->loc);
+
+ if (local->rebalance.target_node)
+ ret = 0;
+
+out:
+ return ret;
+}
- if (propagate)
- ret = default_notify (this, event, data);
+/*
+This function should not be called more then once during a FOP
+handling path. It is valid only for for ops on files
+*/
+int32_t
+dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf,
+ struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ if (!local)
+ return -1;
+
+ if (local->rebalance.set) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REBAL_STRUCT_SET,
+ "local->rebalance already set");
+ }
+
+ if (stbuf)
+ memcpy(&local->rebalance.stbuf, stbuf, sizeof(struct iatt));
+
+ if (prebuf)
+ memcpy(&local->rebalance.prebuf, prebuf, sizeof(struct iatt));
+
+ if (postbuf)
+ memcpy(&local->rebalance.postbuf, postbuf, sizeof(struct iatt));
+
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
+
+ local->rebalance.set = 1;
+
+ return 0;
+}
+
+int32_t
+dht_release(xlator_t *this, fd_t *fd)
+{
+ return dht_fd_ctx_destroy(this, fd);
+}
+
+static int
+dht_pt_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (!op_ret) {
+ dht_layout_set(this, inode, local->layout);
+ }
+
+ DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, stbuf, preparent,
+ postparent, NULL);
+
+ return 0;
+}
+
+int32_t
+dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
+{
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ bool free_xdata = false;
+ int ret = 0;
+ int op_errno = 0;
+ int32_t *disk_layout_p = NULL;
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!layout)
+ goto wind;
+
+ local->layout = layout;
+
+ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata)
+ goto wind;
+ free_xdata = true;
+ }
+
+ /*Set the xlator or the following will crash*/
+ layout->list[0].xlator = conf->subvolumes[0];
+
+ dht_selfheal_layout_new_directory(frame, loc, layout);
+
+ dht_disk_layout_extract(this, layout, 0, &disk_layout_p);
+
+ ret = dict_set_bin(xdata, conf->xattr_name, disk_layout_p, 4 * 4);
+ if (ret) {
+ gf_msg("dht", GF_LOG_DEBUG, EINVAL, DHT_MSG_DICT_SET_FAILED,
+ "dht layout dict set failed");
+ }
+wind:
+ STACK_WIND(frame, dht_pt_mkdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+ if (free_xdata)
+ dict_unref(xdata);
+ return 0;
+
+err:
+ op_errno = local ? local->op_errno : op_errno;
+ DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return ret;
+ return 0;
}
+static int
+dht_pt_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ dict_del(xattr, conf->xattr_name);
+ dict_del(xattr, conf->mds_xattr_key);
+ dict_del(xattr, conf->commithash_xattr_name);
+
+ if (frame->root->pid >= 0) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+ }
+
+ DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
+ return 0;
+}
+
+int
+dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata)
+{
+ STACK_WIND(frame, dht_pt_getxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr, loc, key, xdata);
+ return 0;
+}
+
+static int
+dht_pt_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ dict_del(xattr, conf->xattr_name);
+
+ if (frame->root->pid >= 0) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+ }
+
+ DHT_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata);
+ return 0;
+}
+
+int
+dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata)
+{
+ STACK_WIND(frame, dht_pt_fgetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr, fd, key, xdata);
+ return 0;
+}
+
+/* The job of this function is to check if all the xlators have updated
+ * error in the layout. */
+int
+dht_dir_layout_error_check(xlator_t *this, inode_t *inode)
+{
+ dht_layout_t *layout = NULL;
+ int i = 0;
+
+ layout = dht_layout_get(this, inode);
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err == 0) {
+ return 0;
+ }
+ }
+
+ /* Returning the first xlator error as all xlators have errors */
+ return layout->list[0].err;
+}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index f0510f868f3..fe0dc3db34a 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -1,305 +1,1384 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include <regex.h>
#include "dht-mem-types.h"
+#include "dht-messages.h"
+#include <glusterfs/call-stub.h>
+#include "libxlator.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/refcount.h>
+#include <glusterfs/timer.h>
+#include "protocol-common.h"
+#include <glusterfs/glusterfs-acl.h>
#ifndef _DHT_H
#define _DHT_H
-#define GF_XATTR_FIX_LAYOUT_KEY "trusted.distribute.fix.layout"
-#define GF_DHT_LOOKUP_UNHASHED_ON 1
+#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout"
+#define GF_XATTR_FILE_MIGRATE_KEY "trusted.distribute.migrate-data"
+#define DHT_MDS_STR "mds"
+#define GF_DHT_LOOKUP_UNHASHED_OFF 0
+#define GF_DHT_LOOKUP_UNHASHED_ON 1
#define GF_DHT_LOOKUP_UNHASHED_AUTO 2
+#define DHT_PATHINFO_HEADER "DISTRIBUTE:"
+#define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate"
+/* Layout synchronization */
+#define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal"
+/* Namespace synchronization */
+#define DHT_ENTRY_SYNC_DOMAIN "dht.entry.sync"
+#define DHT_LAYOUT_HASH_INVALID 1
+#define MAX_REBAL_THREADS sysconf(_SC_NPROCESSORS_ONLN)
+
+#define DHT_DIR_STAT_BLOCKS 8
+#define DHT_DIR_STAT_SIZE 4096
+
+/* Virtual xattr for subvols status */
+
+#define DHT_SUBVOL_STATUS_KEY "dht.subvol.status"
+
+/* Virtual xattrs for debugging */
+
+#define DHT_DBG_HASHED_SUBVOL_PATTERN "dht.file.hashed-subvol.*"
+#define DHT_DBG_HASHED_SUBVOL_KEY "dht.file.hashed-subvol."
-#include <fnmatch.h>
+/* Rebalance nodeuuid flags */
+#define REBAL_NODEUUID_MINE 0x01
-typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno);
+typedef int (*dht_selfheal_dir_cbk_t)(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata);
+typedef int (*dht_defrag_cbk_fn_t)(xlator_t *this, xlator_t *dst_node,
+ call_frame_t *frame, int ret);
+typedef int (*dht_refresh_layout_unlock)(call_frame_t *frame, xlator_t *this,
+ int op_ret, int invoke_cbk);
+
+typedef int (*dht_refresh_layout_done_handle)(call_frame_t *frame);
struct dht_layout {
- int cnt;
- int preset;
- int gen;
- int type;
- int ref; /* use with dht_conf_t->layout_lock */
- int search_unhashed;
- struct {
- int err; /* 0 = normal
- -1 = dir exists and no xattr
- >0 = dir lookup failed with errno
- */
- uint32_t start;
- uint32_t stop;
- xlator_t *xlator;
- } list[0];
+ int spread_cnt; /* layout spread count per directory,
+ is controlled by 'setxattr()' with
+ special key */
+ int cnt;
+ int preset;
+ /*
+ * The last *configuration* state for which this directory was known
+ * to be in balance. The corresponding vol_commit_hash changes
+ * whenever bricks are added or removed. This value changes when a
+ * (full) rebalance is complete. If they match, it's safe to assume
+ * that every file is where it should be and there's no need to do
+ * lookups for files elsewhere. If they don't, then we have to do a
+ * global lookup to be sure.
+ */
+ uint32_t commit_hash;
+ /*
+ * The *runtime* state of the volume, changes when connections to
+ * bricks are made or lost.
+ */
+ int gen;
+ int type;
+ gf_atomic_t ref; /* use with dht_conf_t->layout_lock */
+ uint32_t search_unhashed;
+ struct {
+ int err; /* 0 = normal
+ -1 = dir exists and no xattr
+ >0 = dir lookup failed with errno
+ */
+ uint32_t start;
+ uint32_t stop;
+ uint32_t commit_hash;
+ xlator_t *xlator;
+ } list[];
};
typedef struct dht_layout dht_layout_t;
+struct dht_stat_time {
+ uint32_t atime;
+ uint32_t atime_nsec;
+ uint32_t ctime;
+ uint32_t ctime_nsec;
+ uint32_t mtime;
+ uint32_t mtime_nsec;
+};
+
+typedef struct dht_stat_time dht_stat_time_t;
+
+struct dht_inode_ctx {
+ dht_layout_t *layout;
+ dht_stat_time_t time;
+ xlator_t *lock_subvol;
+ xlator_t *mds_subvol; /* This is only used for directories */
+};
+
+typedef struct dht_inode_ctx dht_inode_ctx_t;
typedef enum {
- DHT_HASH_TYPE_DM,
+ DHT_HASH_TYPE_DM,
+ DHT_HASH_TYPE_DM_USER,
} dht_hashfn_type_t;
+typedef enum {
+ DHT_INODELK,
+ DHT_ENTRYLK,
+} dht_lock_type_t;
+
+/* rebalance related */
+struct dht_rebalance_ {
+ xlator_t *from_subvol;
+ xlator_t *target_node;
+ off_t offset;
+ size_t size;
+ int32_t flags;
+ int count;
+ struct iobref *iobref;
+ struct iovec *vector;
+ struct iatt stbuf;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ dht_defrag_cbk_fn_t target_op_fn;
+ dict_t *xdata;
+ dict_t *xattr;
+ dict_t *dict;
+ struct gf_flock flock;
+ int32_t set;
+ int lock_cmd;
+};
+
+/**
+ * Enum to store decided action based on the qdstatfs (quota-deem-statfs)
+ * events
+ **/
+typedef enum {
+ qdstatfs_action_OFF = 0,
+ qdstatfs_action_REPLACE,
+ qdstatfs_action_NEGLECT,
+ qdstatfs_action_COMPARE,
+} qdstatfs_action_t;
+
+typedef enum {
+ REACTION_INVALID,
+ FAIL_ON_ANY_ERROR,
+ IGNORE_ENOENT_ESTALE,
+ IGNORE_ENOENT_ESTALE_EIO,
+} dht_reaction_type_t;
+
+struct dht_skip_linkto_unlink {
+ xlator_t *hash_links_to;
+ uuid_t cached_gfid;
+ uuid_t hashed_gfid;
+ int opend_fd_count;
+ gf_boolean_t handle_valid_link;
+};
+
+typedef struct {
+ xlator_t *xl;
+ loc_t loc; /* contains/points to inode to lock on. */
+ char *domain; /* Only locks within a single domain
+ * contend with each other
+ */
+ char *basename; /* Required for entrylk */
+ gf_boolean_t locked;
+ dht_reaction_type_t do_on_failure;
+ short type; /* read/write lock. */
+ gf_lkowner_t lk_owner;
+} dht_lock_t;
+
+/* The lock structure represents inodelk. */
+typedef struct {
+ fop_inodelk_cbk_t inodelk_cbk;
+ dht_lock_t **locks;
+ int lk_count;
+ dht_reaction_type_t reaction;
+
+ /* whether locking failed on _any_ of the "locks" above */
+ int op_ret;
+ int op_errno;
+} dht_ilock_wrap_t;
+
+/* The lock structure represents entrylk. */
+typedef struct {
+ fop_entrylk_cbk_t entrylk_cbk;
+ dht_lock_t **locks;
+ int lk_count;
+ dht_reaction_type_t reaction;
+
+ /* whether locking failed on _any_ of the "locks" above */
+ int op_ret;
+ int op_errno;
+} dht_elock_wrap_t;
+
+/* The first member of dht_dir_transaction_t should be of type dht_ilock_wrap_t.
+ * Otherwise it can result in subtle memory corruption issues as in most of the
+ * places we use lock[0].layout.my_layout or lock[0].layout.parent_layout and
+ * lock[0].ns.parent_layout (like in dht_local_wipe).
+ */
+typedef union {
+ union {
+ dht_ilock_wrap_t my_layout;
+ dht_ilock_wrap_t parent_layout;
+ } layout;
+ struct dht_namespace {
+ dht_ilock_wrap_t parent_layout;
+ dht_elock_wrap_t directory_ns;
+ fop_entrylk_cbk_t ns_cbk;
+ } ns;
+} dht_dir_transaction_t;
+
+typedef int (*dht_selfheal_layout_t)(call_frame_t *frame, loc_t *loc,
+ dht_layout_t *layout);
+
+typedef gf_boolean_t (*dht_need_heal_t)(call_frame_t *frame,
+ dht_layout_t **inmem,
+ dht_layout_t **ondisk);
struct dht_local {
- int call_cnt;
- loc_t loc;
- loc_t loc2;
- int op_ret;
- int op_errno;
- int layout_mismatch;
- /* Use stbuf as the postbuf, when we require both
- * pre and post attrs */
- struct iatt stbuf;
- struct iatt prebuf;
- struct iatt preoldparent;
- struct iatt postoldparent;
- struct iatt preparent;
- struct iatt postparent;
- struct statvfs statvfs;
- fd_t *fd;
- inode_t *inode;
- dict_t *params;
- dict_t *xattr;
- dict_t *xattr_req;
- dht_layout_t *layout;
- size_t size;
- ino_t ia_ino;
- xlator_t *src_hashed, *src_cached;
- xlator_t *dst_hashed, *dst_cached;
- xlator_t *cached_subvol;
- xlator_t *hashed_subvol;
- char need_selfheal;
- int file_count;
- int dir_count;
- call_frame_t *main_frame;
- struct {
- fop_mknod_cbk_t linkfile_cbk;
- struct iatt stbuf;
- loc_t loc;
- inode_t *inode;
- dict_t *xattr;
- xlator_t *srcvol;
- } linkfile;
- struct {
- uint32_t hole_cnt;
- uint32_t overlaps_cnt;
- uint32_t missing;
- uint32_t down;
- uint32_t misc;
- dht_selfheal_dir_cbk_t dir_cbk;
- dht_layout_t *layout;
- } selfheal;
- uint32_t uid;
- uint32_t gid;
-
- /* needed by nufa */
- int32_t flags;
- mode_t mode;
- dev_t rdev;
-
- /* need for file-info */
- char *pathinfo;
- char *key;
-
- char *newpath;
-
- /* gfid related */
- uuid_t gfid;
+ loc_t loc;
+ loc_t loc2;
+ int call_cnt;
+ int op_ret;
+ int op_errno;
+ int layout_mismatch;
+ /* Use stbuf as the postbuf, when we require both
+ * pre and post attrs */
+ struct iatt stbuf;
+ struct iatt mds_stbuf;
+ struct iatt prebuf;
+ struct iatt preoldparent;
+ struct iatt postoldparent;
+ struct iatt preparent;
+ struct iatt postparent;
+ struct statvfs statvfs;
+ fd_t *fd;
+ inode_t *inode;
+ dict_t *params;
+ dict_t *xattr;
+ dict_t *mds_xattr;
+ dict_t *xdata; /* dict used to save xdata response by xattr fop */
+ dict_t *xattr_req;
+ dht_layout_t *layout;
+ size_t size;
+ ino_t ia_ino;
+ xlator_t *src_hashed, *src_cached;
+ xlator_t *dst_hashed, *dst_cached;
+ xlator_t *cached_subvol;
+ xlator_t *hashed_subvol;
+ xlator_t *mds_subvol; /* This is use for dir only */
+ int file_count;
+ int dir_count;
+ call_frame_t *main_frame;
+ int fop_succeeded;
+ struct {
+ fop_mknod_cbk_t linkfile_cbk;
+ struct iatt stbuf;
+ loc_t loc;
+ inode_t *inode;
+ dict_t *xattr;
+ xlator_t *srcvol;
+ } linkfile;
+ struct {
+ uint32_t hole_cnt;
+ uint32_t overlaps_cnt;
+ uint32_t down;
+ uint32_t misc;
+ dht_selfheal_dir_cbk_t dir_cbk;
+ dht_selfheal_layout_t healer;
+ dht_need_heal_t should_heal;
+ dht_layout_t *layout, *refreshed_layout;
+ uint32_t missing_cnt;
+ gf_boolean_t force_mkdir;
+ } selfheal;
+
+ dht_refresh_layout_unlock refresh_layout_unlock;
+ dht_refresh_layout_done_handle refresh_layout_done;
+
+ uint32_t uid;
+ uint32_t gid;
+ pid_t pid;
+
+ glusterfs_fop_t fop;
+
+ /* need for file-info */
+ char *xattr_val;
+ char *key;
+
+ /* needed by nufa */
+ int32_t flags;
+ mode_t mode;
+ dev_t rdev;
+ mode_t umask;
+
+ /* which xattr request? */
+ char xsel[256];
+ int32_t alloc_len;
+
+ /* gfid related */
+ uuid_t gfid;
+ uuid_t gfid_req;
+
+ xlator_t *link_subvol;
+
+ struct dht_rebalance_ rebalance;
+ xlator_t *first_up_subvol;
+
+ struct dht_skip_linkto_unlink skip_unlink;
+
+ dht_dir_transaction_t lock[2], *current;
+
+ /* inodelks during filerename for backward compatibility */
+ dht_lock_t **rename_inodelk_backward_compatible;
+
+ call_stub_t *stub;
+ int32_t parent_disk_layout[4];
+
+ /* rename rollback */
+ int *ret_cache;
+
+ loc_t loc2_copy;
+
+ int rename_inodelk_bc_count;
+ /* This is use only for directory operation */
+ int32_t valid;
+ int32_t mds_heal_fresh_lookup;
+ short lock_type;
+ char need_selfheal;
+ char need_xattr_heal;
+ char need_attrheal;
+ /* flag used to make sure we need to return estale in
+ {lookup,revalidate}_cbk */
+ char return_estale;
+ char need_lookup_everywhere;
+ /* fd open check */
+ gf_boolean_t fd_checked;
+ gf_boolean_t linked;
+ gf_boolean_t added_link;
+ gf_boolean_t is_linkfile;
+ gf_boolean_t quota_deem_statfs;
+ gf_boolean_t heal_layout;
+ gf_boolean_t locked;
+ gf_boolean_t dont_create_linkto;
+ gf_boolean_t gfid_missing;
};
typedef struct dht_local dht_local_t;
/* du - disk-usage */
struct dht_du {
- double avail_percent;
- uint64_t avail_space;
- uint32_t log;
+ double avail_percent;
+ double avail_inodes;
+ uint64_t avail_space;
+ uint32_t log;
+ uint32_t chunks;
+ uint32_t total_blocks;
+ uint32_t avail_blocks;
+ uint32_t frsize; /*fragment size*/
};
typedef struct dht_du dht_du_t;
+enum gf_defrag_type {
+ GF_DEFRAG_CMD_NONE = 0,
+ GF_DEFRAG_CMD_START = 1,
+ GF_DEFRAG_CMD_STOP = 1 + 1,
+ GF_DEFRAG_CMD_STATUS = 1 + 2,
+ GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3,
+ GF_DEFRAG_CMD_START_FORCE = 1 + 4,
+ GF_DEFRAG_CMD_DETACH_STATUS = 1 + 11,
+ GF_DEFRAG_CMD_DETACH_START = 1 + 13,
+ GF_DEFRAG_CMD_DETACH_COMMIT = 1 + 14,
+ GF_DEFRAG_CMD_DETACH_COMMIT_FORCE = 1 + 15,
+ GF_DEFRAG_CMD_DETACH_STOP = 1 + 16,
+ /* new labels are used so it will help
+ * while removing old labels by easily differentiating.
+ * A few labels are added so that the count remains same
+ * between this enum and the ones on the xdr file.
+ * different values for the same enum cause errors and
+ * confusion.
+ */
+};
+typedef enum gf_defrag_type gf_defrag_type;
+
+enum gf_defrag_status_t {
+ GF_DEFRAG_STATUS_NOT_STARTED,
+ GF_DEFRAG_STATUS_STARTED,
+ GF_DEFRAG_STATUS_STOPPED,
+ GF_DEFRAG_STATUS_COMPLETE,
+ GF_DEFRAG_STATUS_FAILED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED,
+};
+typedef enum gf_defrag_status_t gf_defrag_status_t;
+
+typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t;
+
+struct gf_defrag_pattern_list {
+ char path_pattern[256];
+ uint64_t size;
+ gf_defrag_pattern_list_t *next;
+};
+
+struct dht_container {
+ union {
+ struct list_head list;
+ struct {
+ struct _gf_dirent_t *next;
+ struct _gf_dirent_t *prev;
+ };
+ };
+ gf_dirent_t *df_entry;
+ xlator_t *this;
+ loc_t *parent_loc;
+ dict_t *migrate_data;
+ int local_subvol_index;
+};
+
+typedef struct nodeuuid_info {
+ char info; /* Set to 1 is this is my node's uuid*/
+ uuid_t uuid; /* Store the nodeuuid as well for debugging*/
+} nodeuuid_info_t;
+
+typedef struct subvol_nodeuuids_info {
+ nodeuuid_info_t *elements;
+ int count;
+} subvol_nodeuuids_info_t;
+
+struct gf_defrag_info_ {
+ uint64_t total_files;
+ uint64_t total_data;
+ uint64_t num_files_lookedup;
+ uint64_t total_failures;
+ uint64_t skipped;
+ uint64_t num_dirs_processed;
+ uint64_t size_processed;
+ gf_lock_t lock;
+ pthread_t th;
+ struct rpc_clnt *rpc;
+ uint32_t connected;
+ uint32_t is_exiting;
+ pid_t pid;
+ int cmd;
+ inode_t *root_inode;
+ uuid_t node_uuid;
+ time_t start_time;
+ uint32_t new_commit_hash;
+ gf_defrag_status_t defrag_status;
+ gf_defrag_pattern_list_t *defrag_pattern;
+
+ pthread_cond_t parallel_migration_cond;
+ pthread_mutex_t dfq_mutex;
+ pthread_cond_t rebalance_crawler_alarm;
+ int32_t q_entry_count;
+ int32_t global_error;
+ struct dht_container *queue;
+ int32_t crawl_done;
+ int32_t abort;
+ int32_t wakeup_crawler;
+
+ /*Throttle params*/
+ /*stands for reconfigured thread count*/
+ int32_t recon_thread_count;
+ pthread_cond_t df_wakeup_thread;
+
+ /* backpointer to make it easier to write functions for rebalance */
+ xlator_t *this;
+
+ pthread_cond_t fc_wakeup_cond;
+ pthread_mutex_t fc_mutex;
+
+ /*stands for current running thread count*/
+ int32_t current_thread_count;
+
+ gf_boolean_t stats;
+ /* lock migration flag */
+ gf_boolean_t lock_migration_enabled;
+};
+
+typedef struct gf_defrag_info_ gf_defrag_info_t;
+
+struct dht_methods_s {
+ int32_t (*migration_get_dst_subvol)(xlator_t *this, dht_local_t *local);
+ int32_t (*migration_other)(xlator_t *this, gf_defrag_info_t *defrag);
+ xlator_t *(*layout_search)(xlator_t *this, dht_layout_t *layout,
+ const char *name);
+};
+
+typedef struct dht_methods_s dht_methods_t;
+
struct dht_conf {
- gf_lock_t subvolume_lock;
- int subvolume_cnt;
- xlator_t **subvolumes;
- char *subvolume_status;
- int *last_event;
- dht_layout_t **file_layouts;
- dht_layout_t **dir_layouts;
- dht_layout_t *default_dir_layout;
- gf_boolean_t search_unhashed;
- int gen;
- dht_du_t *du_stats;
- uint64_t min_free_disk;
- char disk_unit;
- int32_t refresh_interval;
- gf_boolean_t unhashed_sticky_bit;
- struct timeval last_stat_fetch;
- gf_lock_t layout_lock;
- void *private; /* Can be used by wrapper xlators over
- dht */
- gf_boolean_t use_readdirp;
+ xlator_t **subvolumes;
+ char *subvolume_status;
+ int *last_event;
+ dht_layout_t **file_layouts;
+ dht_layout_t **dir_layouts;
+ unsigned int search_unhashed;
+ int gen;
+ dht_du_t *du_stats;
+ double min_free_disk;
+ double min_free_inodes;
+ int subvolume_cnt;
+ int32_t refresh_interval;
+ gf_lock_t subvolume_lock;
+ time_t last_stat_fetch;
+ gf_lock_t layout_lock;
+ dict_t *leaf_to_subvol;
+ void *private; /* Can be used by wrapper xlators over
+ dht */
+ time_t *subvol_up_time;
+
+ /* to keep track of nodes which are decommissioned */
+ xlator_t **decommissioned_bricks;
+ int decommission_in_progress;
+ int decommission_subvols_cnt;
+
+ /* defrag related */
+ gf_defrag_info_t *defrag;
+
+ /* Support regex-based name reinterpretation. */
+ regex_t rsync_regex;
+ regex_t extra_regex;
+
+ /* Support variable xattr names. */
+ char *xattr_name;
+ char *mds_xattr_key;
+ char *link_xattr_name;
+ char *commithash_xattr_name;
+ char *wild_xattr_name;
+
+ dht_methods_t methods;
+
+ struct mem_pool *lock_pool;
+
+ /*local subvol storage for rebalance*/
+ xlator_t **local_subvols;
+ subvol_nodeuuids_info_t *local_nodeuuids;
+ int32_t local_subvols_cnt;
+
+ int dthrottle;
+
+ /* Hard link handle requirement for migration triggered from client*/
+ synclock_t link_lock;
+
+ /* lock migration */
+ gf_lock_t lock;
+
+ /* This is the count used as the distribute layout for a directory */
+ /* Will be a global flag to control the layout spread count */
+ uint32_t dir_spread_cnt;
+
+ /*
+ * "Commit hash" for this volume topology. Changed whenever bricks
+ * are added or removed.
+ */
+ uint32_t vol_commit_hash;
+
+ char vol_uuid[UUID_SIZE + 1];
+
+ char disk_unit;
+
+ gf_boolean_t lock_migration_enabled;
+
+ gf_boolean_t vch_forced;
+
+ gf_boolean_t use_fallocate;
+
+ gf_boolean_t force_migration;
+
+ gf_boolean_t lookup_optimize;
+
+ gf_boolean_t unhashed_sticky_bit;
+
+ gf_boolean_t assert_no_child_down;
+
+ gf_boolean_t use_readdirp;
+
+ /* Request to filter directory entries in readdir request */
+ gf_boolean_t readdir_optimize;
+
+ gf_boolean_t rsync_regex_valid;
+
+ gf_boolean_t extra_regex_valid;
+
+ /* Support size-weighted rebalancing (heterogeneous bricks). */
+ gf_boolean_t do_weighting;
+
+ gf_boolean_t randomize_by_gfid;
};
typedef struct dht_conf dht_conf_t;
+struct dht_dfoffset_ctx {
+ xlator_t *this;
+ off_t offset;
+ int32_t readdir_done;
+};
+typedef struct dht_dfoffset_ctx dht_dfoffset_ctx_t;
struct dht_disk_layout {
- uint32_t cnt;
- uint32_t type;
- struct {
- uint32_t start;
- uint32_t stop;
- } list[1];
+ uint32_t cnt;
+ uint32_t type;
+ struct {
+ uint32_t start;
+ uint32_t stop;
+ } list[1];
};
typedef struct dht_disk_layout dht_disk_layout_t;
-#define WIPE(statp) do { typeof(*statp) z = {0,}; if (statp) *statp = z; } while (0)
+typedef enum {
+ GF_DHT_MIGRATE_DATA,
+ GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS,
+ GF_DHT_MIGRATE_HARDLINK,
+ GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS
+} gf_dht_migrate_data_type_t;
+
+typedef enum {
+ GF_DHT_EQUAL_DISTRIBUTION,
+ GF_DHT_WEIGHTED_DISTRIBUTION
+} dht_distribution_type_t;
-#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT)
+struct dir_dfmeta {
+ gf_dirent_t *equeue;
+ dht_dfoffset_ctx_t *offset_var;
+ struct list_head **head;
+ struct list_head **iterator;
+ int *fetch_entries;
+ /* fds corresponding to local subvols only */
+ fd_t **lfd;
+};
-#define is_fs_root(loc) (strcmp (loc->path, "/") == 0)
+typedef struct dht_migrate_info {
+ xlator_t *src_subvol;
+ xlator_t *dst_subvol;
+ GF_REF_DECL;
+} dht_migrate_info_t;
-#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0)
+typedef struct dht_fd_ctx {
+ uint64_t opened_on_dst;
+ GF_REF_DECL;
+} dht_fd_ctx_t;
+
+#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT)
+
+#define is_revalidate(loc) \
+ (dht_inode_ctx_layout_get((loc)->inode, this, NULL) == 0)
#define is_last_call(cnt) (cnt == 0)
-#define DHT_LINKFILE_MODE (S_ISVTX)
-#define check_is_linkfile(i,s,x) ( \
- ((st_mode_from_ia (s->ia_prot, s->ia_type) & ~S_IFMT) \
- == DHT_LINKFILE_MODE) && \
- (s->ia_size == 0))
+#define DHT_MIGRATION_IN_PROGRESS 1
+#define DHT_MIGRATION_COMPLETED 2
+
+#define check_is_linkfile(i, s, x, n) \
+ (IS_DHT_LINKFILE_MODE(s) && dict_get(x, n))
+
+#define IS_DHT_MIGRATION_PHASE2(buf) \
+ (IA_ISREG((buf)->ia_type) && \
+ ((st_mode_from_ia((buf)->ia_prot, (buf)->ia_type) & ~S_IFMT) == \
+ DHT_LINKFILE_MODE))
+
+#define IS_DHT_MIGRATION_PHASE1(buf) \
+ (IA_ISREG((buf)->ia_type) && ((buf)->ia_prot.sticky == 1) && \
+ ((buf)->ia_prot.sgid == 1))
-#define check_is_dir(i,s,x) (IA_ISDIR(s->ia_type))
+#define DHT_STRIP_PHASE1_FLAGS(buf) \
+ do { \
+ if ((buf) && IS_DHT_MIGRATION_PHASE1(buf)) { \
+ (buf)->ia_prot.sticky = 0; \
+ (buf)->ia_prot.sgid = 0; \
+ } \
+ } while (0)
+
+#define dht_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE)
+
+#define check_is_dir(i, s, x) (IA_ISDIR(s->ia_type))
#define layout_is_sane(layout) ((layout) && (layout->cnt > 0))
-#define DHT_STACK_UNWIND(fop, frame, params ...) do { \
- dht_local_t *__local = NULL; \
- xlator_t *__xl = NULL; \
- if (frame) { \
- __xl = frame->this; \
- __local = frame->local; \
- frame->local = NULL; \
- } \
- STACK_UNWIND_STRICT (fop, frame, params); \
- dht_local_wipe (__xl, __local); \
- } while (0)
-
-#define DHT_STACK_DESTROY(frame) do { \
- dht_local_t *__local = NULL; \
- xlator_t *__xl = NULL; \
- __xl = frame->this; \
- __local = frame->local; \
- frame->local = NULL; \
- STACK_DESTROY (frame->root); \
- dht_local_wipe (__xl, __local); \
- } while (0)
-
-dht_layout_t *dht_layout_new (xlator_t *this, int cnt);
-dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode);
-dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol);
-xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout,
- const char *name);
-int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout);
-int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
- uint32_t *holes_p, uint32_t *overlaps_p,
- uint32_t *missing_p, uint32_t *down_p,
- uint32_t *misc_p);
-int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout,
- xlator_t *subvol, loc_t *loc, dict_t *xattr);
-
-xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode,
- struct iatt *buf, dict_t *xattr);
-int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
- xlator_t *subvol, loc_t *loc);
-
-int dht_layouts_init (xlator_t *this, dht_conf_t *conf);
-int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
- int op_ret, int op_errno, dict_t *xattr);
-
-int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
- int pos, int32_t **disk_layout_p);
-int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
- int pos, void *disk_layout_raw);
-
-
-int dht_frame_return (call_frame_t *frame);
-
-int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y);
-int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol,
- uint64_t *x);
-
-void dht_local_wipe (xlator_t *this, dht_local_t *local);
-dht_local_t *dht_local_init (call_frame_t *frame);
-int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt *from,
- xlator_t *subvol);
-
-xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
-xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
-xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
-int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
-
-int dht_hash_compute (int type, const char *name, uint32_t *hash_p);
-
-int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
- xlator_t *tovol, xlator_t *fromvol, loc_t *loc);
-int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc);
-int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc);
-int
-dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
- loc_t *loc, dht_layout_t *layout);
-int
-dht_selfheal_new_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
- dht_layout_t *layout);
-int
-dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
- loc_t *loc, dht_layout_t *layout);
-int
-dht_layout_sort_volname (dht_layout_t *layout);
-
-int dht_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc);
-
-int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc);
-
-int dht_is_subvol_filled (xlator_t *this, xlator_t *subvol);
-xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol);
-int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);
-
-int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode);
-int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);
-void dht_layout_unref (xlator_t *this, dht_layout_t *layout);
-dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout);
-xlator_t *dht_first_up_subvol (xlator_t *this);
-xlator_t *dht_last_up_subvol (xlator_t *this);
-int dht_frame_su_do (call_frame_t *frame);
-int dht_frame_su_undo (call_frame_t *frame);
-
-int dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name);
-
-int dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc,
- xlator_t **subvol);
+#define we_are_not_migrating(x) ((x) == 1)
+
+#define DHT_STACK_UNWIND(fop, frame, params...) \
+ do { \
+ dht_local_t *__local = NULL; \
+ xlator_t *__xl = NULL; \
+ if (frame) { \
+ __xl = frame->this; \
+ __local = frame->local; \
+ frame->local = NULL; \
+ } \
+ STACK_UNWIND_STRICT(fop, frame, params); \
+ dht_local_wipe(__xl, __local); \
+ } while (0)
+
+#define DHT_STACK_DESTROY(frame) \
+ do { \
+ dht_local_t *__local = NULL; \
+ xlator_t *__xl = NULL; \
+ __xl = frame->this; \
+ __local = frame->local; \
+ frame->local = NULL; \
+ STACK_DESTROY(frame->root); \
+ dht_local_wipe(__xl, __local); \
+ } while (0)
+
+#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, post) \
+ do { \
+ if (ctx_sec == new_sec) \
+ new_nsec = max(new_nsec, ctx_nsec); \
+ else if (ctx_sec > new_sec) { \
+ new_sec = ctx_sec; \
+ new_nsec = ctx_nsec; \
+ } \
+ if (post) { \
+ ctx_sec = new_sec; \
+ ctx_nsec = new_nsec; \
+ } \
+ } while (0)
+
+#define is_greater_time(a, an, b, bn) \
+ (((a) < (b)) || (((a) == (b)) && ((an) < (bn))))
+
+#define DHT_MARK_FOP_INTERNAL(xattr) \
+ do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new(); \
+ if (!xattr) \
+ break; \
+ } \
+ tmp = dict_set_str(xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \
+ if (tmp) { \
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \
+ "Failed to set dictionary value: key = %s," \
+ " path = %s", \
+ GLUSTERFS_INTERNAL_FOP_KEY, local->loc.path); \
+ } \
+ } while (0)
+
+dht_layout_t *
+dht_layout_new(xlator_t *this, int cnt);
+dht_layout_t *
+dht_layout_get(xlator_t *this, inode_t *inode);
+dht_layout_t *
+dht_layout_for_subvol(xlator_t *this, xlator_t *subvol);
+xlator_t *
+dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name);
+int32_t
+dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local);
+int32_t
+dht_migration_needed(xlator_t *this);
+int
+dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout);
+void
+dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
+ uint32_t *holes_p, uint32_t *overlaps_p,
+ uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
+ uint32_t *no_space_p);
+int
+dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ loc_t *loc, dict_t *xattr);
+xlator_t *
+dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *buf,
+ dict_t *xattr);
+int
+dht_linkfile_unlink(call_frame_t *frame, xlator_t *this, xlator_t *subvol,
+ loc_t *loc);
+
+int
+dht_layouts_init(xlator_t *this, dht_conf_t *conf);
+int
+dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ int op_ret, int op_errno, dict_t *xattr);
+
+int
+dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos,
+ int32_t **disk_layout_p);
+int
+dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout,
+ xlator_t *subvol, int32_t **disk_layout_p);
+
+int
+dht_frame_return(call_frame_t *frame);
+
+int
+dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol);
+void
+dht_local_wipe(xlator_t *this, dht_local_t *local);
+dht_local_t *
+dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop);
+int
+dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from);
+
+xlator_t *
+dht_subvol_get_hashed(xlator_t *this, loc_t *loc);
+xlator_t *
+dht_subvol_get_cached(xlator_t *this, inode_t *inode);
+xlator_t *
+dht_subvol_next(xlator_t *this, xlator_t *prev);
+xlator_t *
+dht_subvol_next_available(xlator_t *this, xlator_t *prev);
+int
+dht_subvol_cnt(xlator_t *this, xlator_t *subvol);
+
+int
+dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p);
+
+int
+dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+ xlator_t *this, xlator_t *tovol, xlator_t *fromvol,
+ loc_t *loc);
+int
+dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc);
+int
+dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+ loc_t *loc, dht_layout_t *layout);
+int
+dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+ dht_layout_t *layout);
+int
+dht_selfheal_restore(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+ loc_t *loc, dht_layout_t *layout);
+void
+dht_layout_sort_volname(dht_layout_t *layout);
+
+int
+dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc);
+
+gf_boolean_t
+dht_is_subvol_filled(xlator_t *this, xlator_t *subvol);
+xlator_t *
+dht_free_disk_available_subvol(xlator_t *this, xlator_t *subvol,
+ dht_local_t *layout);
+int
+dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx);
+int
+dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode);
+int
+dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout);
+;
+void
+dht_layout_unref(xlator_t *this, dht_layout_t *layout);
+dht_layout_t *
+dht_layout_ref(xlator_t *this, dht_layout_t *layout);
+int
+dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol);
+xlator_t *
+dht_first_up_subvol(xlator_t *this);
+xlator_t *
+dht_last_up_subvol(xlator_t *this);
+
+int
+dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name);
+
+int
+dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc,
+ xlator_t **subvol);
+
+int
+dht_rename_cleanup(call_frame_t *frame);
+int
+dht_rename_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+
+int
+dht_update_commit_hash_for_layout(call_frame_t *frame);
+int
+dht_fix_directory_layout(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+ dht_layout_t *layout);
+
+int
+dht_init_subvolumes(xlator_t *this, dht_conf_t *conf);
+
+/* migration/rebalance */
+int
+dht_start_rebalance_task(xlator_t *this, call_frame_t *frame);
+
+int
+dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame);
+int
+dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame);
+
+int
+dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf);
+
+/* FOPS */
+int32_t
+dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req);
+
+int32_t
+dht_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int32_t
+dht_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int32_t
+dht_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata);
+
+int32_t
+dht_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata);
+
+int32_t
+dht_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata);
+
+int32_t
+dht_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata);
+
+int32_t
+dht_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *xdata);
+
+int32_t
+dht_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata);
+
+int32_t
+dht_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata);
+
+int32_t
+dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata);
+
+int32_t
+dht_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata);
+
+int32_t
+dht_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
+
+int32_t
+dht_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
+
+int32_t
+dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *params);
+
+int32_t
+dht_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata);
+
+int32_t
+dht_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
+
+int32_t
+dht_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata);
+
+int32_t
+dht_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int32_t
+dht_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata);
+
+int32_t
+dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata);
+
+int32_t
+dht_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata);
+
+int32_t
+dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int32_t
+dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata);
+
+int32_t
+dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+ dict_t *xdata);
+
+int32_t
+dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata);
+
+int32_t
+dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata);
+
+int32_t
+dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata);
+int32_t
+dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata);
+
+int32_t
+dht_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata);
+
+int32_t
+dht_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct gf_lease *lease, dict_t *xdata);
+
+int32_t
+dht_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata);
+
+int32_t
+dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata);
+
+int32_t
+dht_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata);
+
+int32_t
+dht_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata);
+
+int32_t
+dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata);
+
+int32_t
+dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *dict);
+
+int32_t
+dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int32_t
+dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int32_t
+dht_forget(xlator_t *this, inode_t *inode);
+int32_t
+dht_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata);
+int32_t
+dht_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata);
+int32_t
+dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata);
+int32_t
+dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
+int32_t
+dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata);
+int32_t
+dht_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata);
+
+int
+dht_set_subvol_range(xlator_t *this);
+int32_t
+dht_init(xlator_t *this);
+void
+dht_fini(xlator_t *this);
+int
+dht_reconfigure(xlator_t *this, dict_t *options);
+int32_t
+dht_notify(xlator_t *this, int32_t event, void *data, ...);
+
+/* definitions for nufa/switch */
+int
+dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent);
+int
+dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent);
+int
+dht_lookup_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent);
+int
+dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent);
+int
+dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
+int
+dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
+
+int
+dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata);
+
+int
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+int
+gf_defrag_status_get(dht_conf_t *conf, dict_t *dict);
+
+int
+gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output);
+
+void *
+gf_defrag_start(void *this);
+
+int32_t
+gf_defrag_handle_hardlink(xlator_t *this, loc_t *loc, int *fop_errno);
+int
+dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
+ int flag, int *fop_errno);
+int
+dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this,
+ dht_layout_t **layout_int);
+int
+dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this,
+ dht_layout_t *layout_int);
+int
+dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat,
+ int32_t update_ctx);
+void
+dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat);
+
+int
+dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx);
+int
+dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx);
+int
+dht_dir_attr_heal(void *data);
+int
+dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data);
+xlator_t *
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+ xlator_t *ignore, dht_layout_t *layout,
+ uint64_t filesize);
+xlator_t *
+dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
+int
+dht_dir_has_layout(dict_t *xattr, char *name);
+int
+dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this);
+
+int32_t
+dht_priv_dump(xlator_t *this);
+int32_t
+dht_inodectx_dump(xlator_t *this, inode_t *inode);
+
+gf_boolean_t
+dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator);
+
+int
+dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode,
+ xlator_t **src_subvol, xlator_t **dst_subvol);
+gf_boolean_t
+dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol,
+ xlator_t *dst_subvol);
+
+int
+dht_subvol_status(dht_conf_t *conf, xlator_t *subvol);
+
+void
+dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc,
+ dht_layout_t *layout);
+
+int
+dht_layout_sort(dht_layout_t *layout);
+
+int
+dht_heal_full_path(void *data);
+
+int
+dht_heal_full_path_done(int op_ret, call_frame_t *frame, void *data);
+
+int
+dht_layout_missing_dirs(dht_layout_t *layout);
+
+int
+dht_refresh_layout(call_frame_t *frame);
+
+int
+dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child,
+ int32_t *op_errno);
+
+int32_t
+dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf,
+ struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+void
+dht_build_root_loc(inode_t *inode, loc_t *loc);
+
+gf_boolean_t
+dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst);
+
+int32_t
+dht_fd_ctx_destroy(xlator_t *this, fd_t *fd);
+
+int32_t
+dht_release(xlator_t *this, fd_t *fd);
+
+int32_t
+dht_set_fixed_dir_stat(struct iatt *stat);
+
+xlator_t *
+dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock,
+ dht_local_t *local);
+
+int
+dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret);
+
+int
+dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *subvol);
+
+int
+dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame);
+
+/* FD fop callbacks */
+
+int
+dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+int
+dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata);
+
+int
+dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata);
+
+int
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+int
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+int
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+int
+dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+int
+dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+int
+dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iovec *vector, int count, struct iatt *stbuf,
+ struct iobref *iobref, dict_t *xdata);
+
+int
+dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *stbuf, dict_t *xdata);
+
+int
+dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
+
+int
+dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
+
+/* All custom xattr heal functions */
+int
+dht_dir_heal_xattrs(void *data);
+
+int
+dht_dir_heal_xattrs_done(int ret, call_frame_t *sync_frame, void *data);
+
+int32_t
+dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size);
+
+int
+dht_set_user_xattr(dict_t *dict, char *k, data_t *v, void *data);
+
+void
+dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst,
+ dict_t *src, int *uret, int *uflag);
+
+int
+dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno);
+
+int
+dht_common_mark_mdsxattr(call_frame_t *frame, int *errst, int flag);
+
+int
+dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol);
+
+int
+dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dht_layout_t *layout);
+
+/* Abstract out the DHT-IATT-IN-DICT */
+
+void
+dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc,
+ dht_layout_t *new_layout);
+
+int
+dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata);
+
+int
+dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata);
+
+int32_t
+dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata);
+
+int
+dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
+
+int32_t
+dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno);
+
+int
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int32_t
+dht_create_lock(call_frame_t *frame, xlator_t *subvol);
+
+int
+dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local);
+
+int
+dht_dir_layout_error_check(xlator_t *this, inode_t *inode);
+
+int
+dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol);
#endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 14e278b1263..c0588828fdb 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -1,263 +1,487 @@
/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
/* TODO: add NS locking */
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "defaults.h"
#include <sys/time.h>
+#include <glusterfs/events.h>
-
-int
-dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct statvfs *statvfs)
+int
+dht_du_info_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
{
- dht_conf_t *conf = NULL;
- call_frame_t *prev = NULL;
- int this_call_cnt = 0;
- int i = 0;
- double percent = 0;
- uint64_t bytes = 0;
-
- conf = this->private;
- prev = cookie;
-
- if (op_ret == -1)
- goto out;
-
- if (statvfs && statvfs->f_blocks) {
- percent = (statvfs->f_bfree * 100) / statvfs->f_blocks;
- bytes = (statvfs->f_bfree * statvfs->f_frsize);
- }
-
- LOCK (&conf->subvolume_lock);
- {
- for (i = 0; i < conf->subvolume_cnt; i++)
- if (prev->this == conf->subvolumes[i]) {
- conf->du_stats[i].avail_percent = percent;
- conf->du_stats[i].avail_space = bytes;
- gf_log (this->name, GF_LOG_DEBUG,
- "on subvolume '%s': avail_percent is: "
- "%.2f and avail_space is: %"PRIu64"",
- prev->this->name,
- conf->du_stats[i].avail_percent,
- conf->du_stats[i].avail_space);
- }
- }
- UNLOCK (&conf->subvolume_lock);
-
- out:
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_DESTROY (frame);
-
- return 0;
+ dht_conf_t *conf = NULL;
+ xlator_t *prev = NULL;
+ int this_call_cnt = 0;
+ int i = 0;
+ double percent = 0;
+ double percent_inodes = 0;
+ uint64_t bytes = 0;
+ uint32_t bpc; /* blocks per chunk */
+ uint32_t chunks = 0;
+
+ conf = this->private;
+ prev = cookie;
+
+ if (op_ret == -1 || !statvfs) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_GET_DISK_INFO_ERROR, "failed to get disk info from %s",
+ prev->name);
+ goto out;
+ }
+
+ if (statvfs->f_blocks) {
+ percent = (statvfs->f_bavail * 100) / statvfs->f_blocks;
+ bytes = (statvfs->f_bavail * statvfs->f_frsize);
+ /*
+ * A 32-bit count of 1MB chunks allows a maximum brick size of
+ * ~4PB. It's possible that we could see a single local FS
+ * bigger than that some day, but this code is likely to be
+ * irrelevant by then. Meanwhile, it's more important to keep
+ * the chunk size small so the layout-calculation code that
+ * uses this value can be tested on normal machines.
+ */
+ bpc = (1 << 20) / statvfs->f_bsize;
+ chunks = (statvfs->f_blocks + bpc - 1) / bpc;
+ }
+
+ if (statvfs->f_files) {
+ percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files;
+ } else {
+ /*
+ * Set percent inodes to 100 for dynamically allocated inode
+ * filesystems. The rationale is that distribute need not
+ * worry about total inodes; rather, let the 'create()' be
+ * scheduled on the hashed subvol regardless of the total
+ * inodes.
+ */
+ percent_inodes = 100;
+ }
+
+ LOCK(&conf->subvolume_lock);
+ {
+ for (i = 0; i < conf->subvolume_cnt; i++)
+ if (prev == conf->subvolumes[i]) {
+ conf->du_stats[i].avail_percent = percent;
+ conf->du_stats[i].avail_space = bytes;
+ conf->du_stats[i].avail_inodes = percent_inodes;
+ conf->du_stats[i].chunks = chunks;
+ conf->du_stats[i].total_blocks = statvfs->f_blocks;
+ conf->du_stats[i].avail_blocks = statvfs->f_bavail;
+ conf->du_stats[i].frsize = statvfs->f_frsize;
+
+ gf_msg_debug(this->name, 0,
+ "subvolume '%s': avail_percent "
+ "is: %.2f and avail_space "
+ "is: %" PRIu64
+ " and avail_inodes"
+ " is: %.2f",
+ prev->name, conf->du_stats[i].avail_percent,
+ conf->du_stats[i].avail_space,
+ conf->du_stats[i].avail_inodes);
+ break; /* no point in looping further */
+ }
+ }
+ UNLOCK(&conf->subvolume_lock);
+
+out:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_DESTROY(frame);
+
+ return 0;
}
int
-dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx)
+dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx)
{
- dht_conf_t *conf = NULL;
- call_frame_t *statfs_frame = NULL;
- dht_local_t *statfs_local = NULL;
- call_pool_t *pool = NULL;
+ dht_conf_t *conf = NULL;
+ call_frame_t *statfs_frame = NULL;
+ dht_local_t *statfs_local = NULL;
+ call_pool_t *pool = NULL;
+ loc_t tmp_loc = {
+ 0,
+ };
+
+ conf = this->private;
+ pool = this->ctx->pool;
+
+ statfs_frame = create_frame(this, pool);
+ if (!statfs_frame) {
+ goto err;
+ }
+
+ /* local->fop value is not used in this case */
+ statfs_local = dht_local_init(statfs_frame, NULL, NULL, GF_FOP_MAXVALUE);
+ if (!statfs_local) {
+ goto err;
+ }
+
+ /* make it root gfid, should be enough to get the proper info back */
+ tmp_loc.gfid[15] = 1;
+
+ statfs_local->call_cnt = 1;
+ STACK_WIND_COOKIE(
+ statfs_frame, dht_du_info_cbk, conf->subvolumes[subvol_idx],
+ conf->subvolumes[subvol_idx],
+ conf->subvolumes[subvol_idx]->fops->statfs, &tmp_loc, NULL);
+
+ return 0;
+err:
+ if (statfs_frame)
+ DHT_STACK_DESTROY(statfs_frame);
- conf = this->private;
- pool = this->ctx->pool;
+ return -1;
+}
- statfs_frame = create_frame (this, pool);
+int
+dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ int i = 0;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ call_frame_t *statfs_frame = NULL;
+ dht_local_t *statfs_local = NULL;
+ loc_t tmp_loc = {
+ 0,
+ };
+ time_t now;
+
+ conf = this->private;
+ now = gf_time();
+ /* make it root gfid, should be enough to get the proper
+ info back */
+ tmp_loc.gfid[15] = 1;
+
+ if (now > (conf->refresh_interval + conf->last_stat_fetch)) {
+ statfs_frame = copy_frame(frame);
if (!statfs_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
+ goto err;
}
-
- statfs_local = dht_local_init (statfs_frame);
+
+ /* In this case, 'local->fop' is not used */
+ statfs_local = dht_local_init(statfs_frame, loc, NULL, GF_FOP_MAXVALUE);
if (!statfs_local) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
+ goto err;
+ }
+
+ statfs_local->params = dict_new();
+ if (!statfs_local->params)
+ goto err;
+
+ ret = dict_set_int8(statfs_local->params,
+ GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict");
+ goto err;
}
-
- loc_t tmp_loc = { .inode = NULL,
- .path = "/",
- };
-
- statfs_local->call_cnt = 1;
- STACK_WIND (statfs_frame, dht_du_info_cbk,
- conf->subvolumes[subvol_idx],
- conf->subvolumes[subvol_idx]->fops->statfs,
- &tmp_loc);
-
- return 0;
- err:
- if (statfs_frame)
- DHT_STACK_DESTROY (statfs_frame);
-
- return -1;
+
+ statfs_local->call_cnt = conf->subvolume_cnt;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND_COOKIE(statfs_frame, dht_du_info_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->statfs, &tmp_loc,
+ statfs_local->params);
+ }
+
+ conf->last_stat_fetch = now;
+ }
+ return 0;
+err:
+ if (statfs_frame)
+ DHT_STACK_DESTROY(statfs_frame);
+
+ return -1;
}
-int
-dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc)
+gf_boolean_t
+dht_is_subvol_filled(xlator_t *this, xlator_t *subvol)
{
- int i = 0;
- dht_conf_t *conf = NULL;
- call_frame_t *statfs_frame = NULL;
- dht_local_t *statfs_local = NULL;
- struct timeval tv = {0,};
-
- conf = this->private;
-
- gettimeofday (&tv, NULL);
- if (tv.tv_sec > (conf->refresh_interval
- + conf->last_stat_fetch.tv_sec)) {
-
- statfs_frame = copy_frame (frame);
- if (!statfs_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
+ int i = 0;
+ char vol_name[256];
+ dht_conf_t *conf = NULL;
+ gf_boolean_t subvol_filled_inodes = _gf_false;
+ gf_boolean_t subvol_filled_space = _gf_false;
+ gf_boolean_t is_subvol_filled = _gf_false;
+ double usage = 0;
+
+ conf = this->private;
+
+ /* Check for values above specified percent or free disk */
+ LOCK(&conf->subvolume_lock);
+ {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ if (conf->disk_unit == 'p') {
+ if (conf->du_stats[i].avail_percent < conf->min_free_disk) {
+ subvol_filled_space = _gf_true;
+ break;
+ }
+
+ } else {
+ if (conf->du_stats[i].avail_space < conf->min_free_disk) {
+ subvol_filled_space = _gf_true;
+ break;
+ }
}
-
- statfs_local = dht_local_init (statfs_frame);
- if (!statfs_local) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
+ if (conf->du_stats[i].avail_inodes < conf->min_free_inodes) {
+ subvol_filled_inodes = _gf_true;
+ break;
}
+ }
+ }
+ }
+ UNLOCK(&conf->subvolume_lock);
- loc_copy (&statfs_local->loc, loc);
- loc_t tmp_loc = { .inode = NULL,
- .path = "/",
- };
-
- statfs_local->call_cnt = conf->subvolume_cnt;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (statfs_frame, dht_du_info_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->statfs,
- &tmp_loc);
- }
+ if (subvol_filled_space && conf->subvolume_status[i]) {
+ if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) {
+ usage = 100 - conf->du_stats[i].avail_percent;
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE,
+ "disk space on subvolume '%s' is getting "
+ "full (%.2f %%), consider adding more bricks",
+ subvol->name, usage);
- conf->last_stat_fetch.tv_sec = tv.tv_sec;
+ (void)snprintf(vol_name, sizeof(vol_name), "%s", this->name);
+ vol_name[(strlen(this->name) - 4)] = '\0';
+
+ gf_event(EVENT_DHT_DISK_USAGE, "volume=%s;subvol=%s;usage=%.2f %%",
+ vol_name, subvol->name, usage);
}
- return 0;
-err:
- if (statfs_frame)
- DHT_STACK_DESTROY (statfs_frame);
+ }
+
+ if (subvol_filled_inodes && conf->subvolume_status[i]) {
+ if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) {
+ usage = 100 - conf->du_stats[i].avail_inodes;
+ gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_SUBVOL_INSUFF_INODES,
+ "inodes on subvolume '%s' are at "
+ "(%.2f %%), consider adding more bricks",
+ subvol->name, usage);
+
+ (void)snprintf(vol_name, sizeof(vol_name), "%s", this->name);
+ vol_name[(strlen(this->name) - 4)] = '\0';
+
+ gf_event(EVENT_DHT_INODES_USAGE,
+ "volume=%s;subvol=%s;usage=%.2f %%", vol_name,
+ subvol->name, usage);
+ }
+ }
- return -1;
+ is_subvol_filled = (subvol_filled_space || subvol_filled_inodes);
+
+ return is_subvol_filled;
}
+/*Get the best subvolume to create the file in*/
+xlator_t *
+dht_free_disk_available_subvol(xlator_t *this, xlator_t *subvol,
+ dht_local_t *local)
+{
+ xlator_t *avail_subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ loc_t *loc = NULL;
+
+ conf = this->private;
+ if (!local)
+ goto out;
+ loc = &local->loc;
+ if (!local->layout) {
+ layout = dht_layout_get(this, loc->parent);
+
+ if (!layout) {
+ gf_msg_debug(this->name, 0,
+ "Missing layout. path=%s,"
+ " parent gfid = %s",
+ loc->path, uuid_utoa(loc->parent->gfid));
+ goto out;
+ }
+ } else {
+ layout = dht_layout_ref(this, local->layout);
+ }
+
+ LOCK(&conf->subvolume_lock);
+ {
+ avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, NULL,
+ layout, 0);
+ if (!avail_subvol) {
+ avail_subvol = dht_subvol_maxspace_nonzeroinode(this, subvol,
+ layout);
+ }
+ }
+ UNLOCK(&conf->subvolume_lock);
+out:
+ if (!avail_subvol) {
+ gf_msg_debug(this->name, 0,
+ "No subvolume has enough free space \
+ and/or inodes to create");
+ avail_subvol = subvol;
+ }
+
+ if (layout)
+ dht_layout_unref(this, layout);
+ return avail_subvol;
+}
-int
-dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
+static inline int32_t
+dht_subvol_has_err(dht_conf_t *conf, xlator_t *this, xlator_t *ignore,
+ dht_layout_t *layout)
{
- int i = 0;
- int subvol_filled = 0;
- dht_conf_t *conf = NULL;
-
- conf = this->private;
-
- /* Check for values above specified percent or free disk */
- LOCK (&conf->subvolume_lock);
- {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (subvol == conf->subvolumes[i]) {
- if (conf->disk_unit == 'p') {
- if (conf->du_stats[i].avail_percent <
- conf->min_free_disk) {
- subvol_filled = 1;
- break;
- }
- } else {
- if (conf->du_stats[i].avail_space <
- conf->min_free_disk) {
- subvol_filled = 1;
- break;
- }
- }
- }
- }
+ int ret = -1;
+ int i = 0;
+
+ if (!this || !layout)
+ goto out;
+
+ /* this check is meant for rebalance process. The source of the file
+ * should be ignored for space check */
+ if (this == ignore) {
+ goto out;
+ }
+
+ /* check if subvol has layout errors, before selecting it */
+ for (i = 0; i < layout->cnt; i++) {
+ if (!strcmp(layout->list[i].xlator->name, this->name) &&
+ (layout->list[i].err != 0)) {
+ ret = -1;
+ goto out;
}
- UNLOCK (&conf->subvolume_lock);
-
- if (subvol_filled && conf->subvolume_status[i]) {
- if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) {
- gf_log (this->name, GF_LOG_WARNING,
- "disk space on subvolume '%s' is getting "
- "full (%.2f %%), consider adding more nodes",
- subvol->name,
- (100 - conf->du_stats[i].avail_percent));
- }
+ }
+
+ /* discard decommissioned subvol */
+ if (conf->decommission_subvols_cnt) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i] &&
+ conf->decommissioned_bricks[i] == this) {
+ ret = -1;
+ goto out;
+ }
}
+ }
- return subvol_filled;
+ ret = 0;
+out:
+ return ret;
}
+/*Get subvolume which has both space and inodes more than the min criteria*/
xlator_t *
-dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol)
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+ xlator_t *ignore, dht_layout_t *layout,
+ uint64_t filesize)
{
- int i = 0;
- double max= 0;
- xlator_t *avail_subvol = NULL;
- dht_conf_t *conf = NULL;
-
- conf = this->private;
-
- LOCK (&conf->subvolume_lock);
- {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->disk_unit == 'p') {
- if (conf->du_stats[i].avail_percent > max) {
- max = conf->du_stats[i].avail_percent;
- avail_subvol = conf->subvolumes[i];
- }
- } else {
- if (conf->du_stats[i].avail_space > max) {
- max = conf->du_stats[i].avail_space;
- avail_subvol = conf->subvolumes[i];
- }
- }
- }
+ int i = 0;
+ double max = 0;
+ double max_inodes = 0;
+ int ignore_subvol = 0;
+ uint64_t total_blocks = 0;
+ uint64_t avail_blocks = 0;
+ uint64_t frsize = 0;
+ double post_availspace = 0;
+ double post_percent = 0;
+
+ xlator_t *avail_subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors and also it is not a
+ * decommissioned brick, before selecting it */
+ ignore_subvol = dht_subvol_has_err(conf, conf->subvolumes[i], ignore,
+ layout);
+ if (ignore_subvol)
+ continue;
+
+ if ((conf->disk_unit == 'p') &&
+ (conf->du_stats[i].avail_percent > conf->min_free_disk) &&
+ (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
+ if ((conf->du_stats[i].avail_inodes > max_inodes) ||
+ (conf->du_stats[i].avail_percent > max)) {
+ max = conf->du_stats[i].avail_percent;
+ max_inodes = conf->du_stats[i].avail_inodes;
+ avail_subvol = conf->subvolumes[i];
+ total_blocks = conf->du_stats[i].total_blocks;
+ avail_blocks = conf->du_stats[i].avail_blocks;
+ frsize = conf->du_stats[i].frsize;
+ }
}
- UNLOCK (&conf->subvolume_lock);
- if (!avail_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume has enough free space to create");
+ if ((conf->disk_unit != 'p') &&
+ (conf->du_stats[i].avail_space > conf->min_free_disk) &&
+ (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
+ if ((conf->du_stats[i].avail_inodes > max_inodes) ||
+ (conf->du_stats[i].avail_space > max)) {
+ max = conf->du_stats[i].avail_space;
+ max_inodes = conf->du_stats[i].avail_inodes;
+ avail_subvol = conf->subvolumes[i];
+ }
}
+ }
+
+ if (avail_subvol) {
+ if (conf->disk_unit == 'p') {
+ post_availspace = (avail_blocks * frsize) - filesize;
+ post_percent = (post_availspace * 100) / (total_blocks * frsize);
+ if (post_percent < conf->min_free_disk)
+ avail_subvol = NULL;
+ }
+ if (conf->disk_unit != 'p') {
+ if ((max - filesize) < conf->min_free_disk)
+ avail_subvol = NULL;
+ }
+ }
- if (max < conf->min_free_disk)
- avail_subvol = subvol;
+ return avail_subvol;
+}
- if (!avail_subvol)
- avail_subvol = subvol;
+/* Get subvol which has at least one inode and maximum space */
+xlator_t *
+dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
+{
+ int i = 0;
+ double max = 0;
+ int ignore_subvol = 0;
+
+ xlator_t *avail_subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors and also it is not a
+ * decommissioned brick, before selecting it*/
+
+ ignore_subvol = dht_subvol_has_err(conf, conf->subvolumes[i], NULL,
+ layout);
+ if (ignore_subvol)
+ continue;
+
+ if (conf->disk_unit == 'p') {
+ if ((conf->du_stats[i].avail_percent > max) &&
+ (conf->du_stats[i].avail_inodes > 0)) {
+ max = conf->du_stats[i].avail_percent;
+ avail_subvol = conf->subvolumes[i];
+ }
+ } else {
+ if ((conf->du_stats[i].avail_space > max) &&
+ (conf->du_stats[i].avail_inodes > 0)) {
+ max = conf->du_stats[i].avail_space;
+ avail_subvol = conf->subvolumes[i];
+ }
+ }
+ }
- return avail_subvol;
+ return avail_subvol;
}
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
index 528ac2f904b..acda67c312a 100644
--- a/xlators/cluster/dht/src/dht-hashfn.c
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -1,81 +1,110 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "hashfn.h"
+#include <glusterfs/hashfn.h>
-
-int
-dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p)
+static int
+dht_hash_compute_internal(int type, const char *name, const int len,
+ uint32_t *hash_p)
{
- int ret = 0;
- uint32_t hash = 0;
-
- switch (type) {
- case DHT_HASH_TYPE_DM:
- hash = gf_dm_hashfn (name, strlen (name));
- break;
- default:
- ret = -1;
- break;
- }
-
- if (ret == 0) {
- *hash_p = hash;
- }
-
- return ret;
+ int ret = 0;
+ uint32_t hash = 0;
+
+ switch (type) {
+ case DHT_HASH_TYPE_DM:
+ case DHT_HASH_TYPE_DM_USER:
+ hash = gf_dm_hashfn(name, len);
+ break;
+ default:
+ ret = -1;
+ break;
+ }
+
+ if (ret == 0) {
+ *hash_p = hash;
+ }
+
+ return ret;
}
-
-#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \
- rsync_frndly_name = (char *) name; \
- if (name[0] == '.') { \
- char *dot = 0; \
- int namelen = 0; \
- \
- dot = strrchr (name, '.'); \
- if (dot && dot > (name + 1) && *(dot + 1)) { \
- namelen = (dot - name); \
- rsync_frndly_name = alloca (namelen); \
- strncpy (rsync_frndly_name, name + 1, \
- namelen); \
- rsync_frndly_name[namelen - 1] = 0; \
- } \
- } \
- } while (0);
-
+/* The function returns:
+ * 0 : in case no munge took place
+ * >0 : the length (inc. terminating NULL!) of the newly modified string,
+ * if it was munged.
+ */
+static int
+dht_munge_name(const char *original, char *modified, size_t len, regex_t *re)
+{
+ regmatch_t matches[2] = {
+ {0},
+ };
+ size_t new_len = 0;
+ int ret = 0;
+
+ ret = regexec(re, original, 2, matches, 0);
+
+ if (ret != REG_NOMATCH) {
+ if (matches[1].rm_so != -1) {
+ new_len = matches[1].rm_eo - matches[1].rm_so;
+ /* Equal would fail due to the NUL at the end. */
+ if (new_len < len) {
+ memcpy(modified, original + matches[1].rm_so, new_len);
+ modified[new_len] = '\0';
+ return new_len + 1; /* +1 for the terminating NULL */
+ }
+ }
+ }
+
+ /* This is guaranteed safe because of how the dest was allocated. */
+ strcpy(modified, original);
+ return 0;
+}
int
-dht_hash_compute (int type, const char *name, uint32_t *hash_p)
+dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p)
{
- char *rsync_friendly_name = NULL;
-
- MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name);
-
- return dht_hash_compute_internal (type, rsync_friendly_name, hash_p);
+ char *rsync_friendly_name = NULL;
+ dht_conf_t *priv = NULL;
+ size_t len = 0;
+ int munged = 0;
+
+ priv = this->private;
+
+ if (name == NULL)
+ return -1;
+
+ len = strlen(name) + 1;
+ rsync_friendly_name = alloca(len);
+
+ LOCK(&priv->lock);
+ {
+ if (priv->extra_regex_valid) {
+ munged = dht_munge_name(name, rsync_friendly_name, len,
+ &priv->extra_regex);
+ }
+
+ if (!munged && priv->rsync_regex_valid) {
+ gf_msg_trace(this->name, 0, "trying regex for %s", name);
+ munged = dht_munge_name(name, rsync_friendly_name, len,
+ &priv->rsync_regex);
+ }
+ }
+ UNLOCK(&priv->lock);
+ if (munged) {
+ gf_msg_debug(this->name, 0, "munged down to %s", rsync_friendly_name);
+ len = munged;
+ } else {
+ rsync_friendly_name = (char *)name;
+ }
+
+ return dht_hash_compute_internal(type, rsync_friendly_name, len - 1,
+ hash_p);
}
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index f6ecebf93a7..3f2fe43d5f3 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -1,525 +1,2304 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include "dht-common.h"
+#include "dht-lock.h"
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+static void
+dht_free_fd_ctx(dht_fd_ctx_t *fd_ctx)
+{
+ GF_FREE(fd_ctx);
+}
-#include "glusterfs.h"
-#include "xlator.h"
-#include "dht-common.h"
+int32_t
+dht_fd_ctx_destroy(xlator_t *this, fd_t *fd)
+{
+ dht_fd_ctx_t *fd_ctx = NULL;
+ uint64_t value = 0;
+ int32_t ret = -1;
+
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+ ret = fd_ctx_del(fd, this, &value);
+ if (ret) {
+ goto out;
+ }
+
+ fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value;
+ if (fd_ctx) {
+ GF_REF_PUT(fd_ctx);
+ }
+out:
+ return ret;
+}
+
+static int
+__dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst)
+{
+ dht_fd_ctx_t *fd_ctx = NULL;
+ uint64_t value = 0;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, fd, out);
+ fd_ctx = GF_CALLOC(1, sizeof(*fd_ctx), gf_dht_mt_fd_ctx_t);
+
+ if (!fd_ctx) {
+ goto out;
+ }
+
+ fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst;
+ GF_REF_INIT(fd_ctx, dht_free_fd_ctx);
+
+ value = (uint64_t)(uintptr_t)fd_ctx;
+
+ ret = __fd_ctx_set(fd, this, value);
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FD_CTX_SET_FAILED,
+ "fd=0x%p", fd, NULL);
+ GF_REF_PUT(fd_ctx);
+ }
+out:
+ return ret;
+}
int
-dht_frame_return (call_frame_t *frame)
+dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst)
{
- dht_local_t *local = NULL;
- int this_call_cnt = -1;
+ dht_fd_ctx_t *fd_ctx = NULL;
+ uint64_t value = 0;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+ LOCK(&fd->lock);
+ {
+ ret = __fd_ctx_get(fd, this, &value);
+ if (ret && value) {
+ fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value;
+ if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) {
+ /* This could happen due to racing
+ * check_progress tasks*/
+ goto unlock;
+ } else {
+ /* This would be a big problem*/
+ /* Overwrite and hope for the best*/
+ fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst;
+ UNLOCK(&fd->lock);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_VALUE,
+ NULL);
- if (!frame)
- return -1;
+ goto out;
+ }
+ }
+ ret = __dht_fd_ctx_set(this, fd, dst);
+ }
+unlock:
+ UNLOCK(&fd->lock);
+out:
+ return ret;
+}
- local = frame->local;
+static dht_fd_ctx_t *
+dht_fd_ctx_get(xlator_t *this, fd_t *fd)
+{
+ dht_fd_ctx_t *fd_ctx = NULL;
+ int ret = -1;
+ uint64_t tmp_val = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+ LOCK(&fd->lock);
+ {
+ ret = __fd_ctx_get(fd, this, &tmp_val);
+ if ((ret < 0) || (tmp_val == 0)) {
+ goto unlock;
+ }
- LOCK (&frame->lock);
- {
- this_call_cnt = --local->call_cnt;
- }
- UNLOCK (&frame->lock);
+ fd_ctx = (dht_fd_ctx_t *)(uintptr_t)tmp_val;
+ GF_REF_GET(fd_ctx);
+ }
+unlock:
+ UNLOCK(&fd->lock);
- return this_call_cnt;
+out:
+ return fd_ctx;
}
+gf_boolean_t
+dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst)
+{
+ dht_fd_ctx_t *fd_ctx = NULL;
+ gf_boolean_t opened = _gf_false;
-int
-dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
+ fd_ctx = dht_fd_ctx_get(this, fd);
+
+ if (fd_ctx) {
+ if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) {
+ opened = _gf_true;
+ }
+ GF_REF_PUT(fd_ctx);
+ }
+
+ return opened;
+}
+
+void
+dht_free_mig_info(void *data)
{
- dht_conf_t *conf = NULL;
- int cnt = 0;
- int max = 0;
- uint64_t y = 0;
+ dht_migrate_info_t *miginfo = NULL;
+ miginfo = data;
+ GF_FREE(miginfo);
- if (x == ((uint64_t) -1)) {
- y = (uint64_t) -1;
- goto out;
- }
+ return;
+}
- conf = this->private;
- if (!conf)
- goto out;
+static int
+dht_inode_ctx_set_mig_info(xlator_t *this, inode_t *inode, xlator_t *src_subvol,
+ xlator_t *dst_subvol)
+{
+ dht_migrate_info_t *miginfo = NULL;
+ uint64_t value = 0;
+ int ret = -1;
+
+ miginfo = GF_CALLOC(1, sizeof(*miginfo), gf_dht_mt_miginfo_t);
+ if (miginfo == NULL)
+ goto out;
- max = conf->subvolume_cnt;
- cnt = dht_subvol_cnt (this, subvol);
+ miginfo->src_subvol = src_subvol;
+ miginfo->dst_subvol = dst_subvol;
+ GF_REF_INIT(miginfo, dht_free_mig_info);
- y = ((x * max) + cnt);
+ value = (uint64_t)(uintptr_t)miginfo;
+
+ ret = inode_ctx_set1(inode, this, &value);
+ if (ret < 0) {
+ GF_REF_PUT(miginfo);
+ }
out:
- if (y_p)
- *y_p = y;
+ return ret;
+}
- return 0;
+int
+dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode,
+ xlator_t **src_subvol, xlator_t **dst_subvol)
+{
+ int ret = -1;
+ uint64_t tmp_miginfo = 0;
+ dht_migrate_info_t *miginfo = NULL;
+
+ LOCK(&inode->lock);
+ {
+ ret = __inode_ctx_get1(inode, this, &tmp_miginfo);
+ if ((ret < 0) || (tmp_miginfo == 0)) {
+ UNLOCK(&inode->lock);
+ goto out;
+ }
+
+ miginfo = (dht_migrate_info_t *)(uintptr_t)tmp_miginfo;
+ GF_REF_GET(miginfo);
+ }
+ UNLOCK(&inode->lock);
+
+ if (src_subvol)
+ *src_subvol = miginfo->src_subvol;
+
+ if (dst_subvol)
+ *dst_subvol = miginfo->dst_subvol;
+
+ GF_REF_PUT(miginfo);
+
+out:
+ return ret;
}
+gf_boolean_t
+dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol,
+ xlator_t *dst_subvol)
+{
+ /* Not set
+ */
+ if (!src_subvol || !dst_subvol)
+ return _gf_true;
+
+ /* Invalid scenarios:
+ * The src_subvol does not match the subvol on which the current op was sent
+ * so the cached subvol has changed between the last mig_info_set and now.
+ * src_subvol == dst_subvol. The file was migrated without any FOP detecting
+ * a P2 so the old dst is now the current subvol.
+ *
+ * There is still one scenario where the info could be outdated - if
+ * file has undergone multiple migrations and ends up on the same src_subvol
+ * on which the mig_info was first set.
+ */
+ if ((current == dst_subvol) || (current != src_subvol))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+/* Used to check if fd fops have the fd opened on the cached subvol
+ * This is required when:
+ * 1. an fd is opened on FILE1 on subvol1
+ * 2. the file is migrated to subvol2
+ * 3. a lookup updates the cached subvol in the inode_ctx to subvol2
+ * 4. a write comes on the fd
+ * The write is sent to subvol2 on an fd which has been opened only on fd1
+ * Since the migration phase checks don't kick in, the fop fails with EBADF
+ *
+ */
+
int
-dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc,
- xlator_t **subvol)
+dht_check_and_open_fd_on_subvol_complete(int ret, call_frame_t *frame,
+ void *data)
{
- char *new_name = NULL;
- char *new_path = NULL;
- xlator_list_t *trav = NULL;
- char key[1024] = {0,};
- int ret = 0; /* not found */
+ glusterfs_fop_t fop = 0;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ fd_t *fd = NULL;
+ int op_errno = -1;
+
+ local = frame->local;
+ this = frame->this;
+ fop = local->fop;
+ subvol = local->cached_subvol;
+ fd = local->fd;
+
+ if (ret) {
+ op_errno = local->op_errno;
+ goto handle_err;
+ }
+
+ switch (fop) {
+ case GF_FOP_WRITE:
+ STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol,
+ subvol->fops->writev, fd, local->rebalance.vector,
+ local->rebalance.count, local->rebalance.offset,
+ local->rebalance.flags, local->rebalance.iobref,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FLUSH:
+ STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, fd,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FSETATTR:
+ STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+ subvol->fops->fsetattr, fd,
+ &local->rebalance.stbuf, local->rebalance.flags,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_ZEROFILL:
+ STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol,
+ subvol->fops->zerofill, fd,
+ local->rebalance.offset, local->rebalance.size,
+ local->xattr_req);
+
+ break;
+
+ case GF_FOP_DISCARD:
+ STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol,
+ subvol->fops->discard, local->fd,
+ local->rebalance.offset, local->rebalance.size,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FALLOCATE:
+ STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol,
+ subvol->fops->fallocate, fd,
+ local->rebalance.flags, local->rebalance.offset,
+ local->rebalance.size, local->xattr_req);
+ break;
+
+ case GF_FOP_FTRUNCATE:
+ STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+ subvol->fops->ftruncate, fd,
+ local->rebalance.offset, local->xattr_req);
+ break;
+
+ case GF_FOP_FSYNC:
+ STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol,
+ subvol->fops->fsync, local->fd,
+ local->rebalance.flags, local->xattr_req);
+ break;
+
+ case GF_FOP_READ:
+ STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv,
+ local->fd, local->rebalance.size,
+ local->rebalance.offset, local->rebalance.flags,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FSTAT:
+ STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+ subvol->fops->fstat, fd, local->xattr_req);
+ break;
+
+ case GF_FOP_FSETXATTR:
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->fsetxattr, local->fd,
+ local->rebalance.xattr, local->rebalance.flags,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FREMOVEXATTR:
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->fremovexattr, local->fd, local->key,
+ local->xattr_req);
+
+ break;
+
+ case GF_FOP_FXATTROP:
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol,
+ subvol->fops->fxattrop, local->fd,
+ local->rebalance.flags, local->rebalance.xattr,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FGETXATTR:
+ STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr,
+ local->fd, local->key, NULL);
+ break;
+
+ case GF_FOP_FINODELK:
+ STACK_WIND(frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk,
+ local->key, local->fd, local->rebalance.lock_cmd,
+ &local->rebalance.flock, local->xattr_req);
+ break;
+ default:
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p",
+ fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s",
+ subvol->name, NULL);
+ break;
+ }
+
+ goto out;
+
+ /* Could not open the fd on the dst. Unwind */
+
+handle_err:
+
+ switch (fop) {
+ case GF_FOP_WRITE:
+ DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_FLUSH:
+ DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+ break;
+
+ case GF_FOP_FSETATTR:
+ DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_ZEROFILL:
+ DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_DISCARD:
+ DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_FALLOCATE:
+ DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_FTRUNCATE:
+ DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_FSYNC:
+ DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_READ:
+ DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL,
+ NULL);
+ break;
+
+ case GF_FOP_FSTAT:
+ DHT_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
+ break;
+
+ case GF_FOP_FSETXATTR:
+ DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
+ break;
+
+ case GF_FOP_FREMOVEXATTR:
+ DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
+ break;
+
+ case GF_FOP_FXATTROP:
+ DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+ break;
+
+ case GF_FOP_FGETXATTR:
+ DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
+ break;
+
+ case GF_FOP_FINODELK:
+ DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL);
+ break;
+
+ default:
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p",
+ fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s",
+ subvol->name, NULL);
+ break;
+ }
- /* Why do other tasks if first required 'char' itself is not there */
- if (loc->name && !strchr (loc->name, '@'))
- goto out;
+out:
- trav = this->children;
- while (trav) {
- snprintf (key, 1024, "*@%s:%s", this->name, trav->xlator->name);
- if (fnmatch (key, loc->name, FNM_NOESCAPE) == 0) {
- new_name = GF_CALLOC(strlen (loc->name),
- sizeof (char),
- gf_common_mt_char);
- if (!new_name)
- goto out;
- if (fnmatch (key, loc->path, FNM_NOESCAPE) == 0) {
- new_path = GF_CALLOC(strlen (loc->path),
- sizeof (char),
- gf_common_mt_char);
- if (!new_path)
- goto out;
- strncpy (new_path, loc->path, (strlen (loc->path) -
- strlen (key) + 1));
- }
- strncpy (new_name, loc->name, (strlen (loc->name) -
- strlen (key) + 1));
-
- if (new_loc) {
- new_loc->path = ((new_path) ? new_path:
- gf_strdup (loc->path));
- new_loc->name = new_name;
- new_loc->ino = loc->ino;
- new_loc->inode = inode_ref (loc->inode);
- new_loc->parent = inode_ref (loc->parent);
- }
- *subvol = trav->xlator;
- ret = 1; /* success */
- goto out;
- }
- trav = trav->next;
+ return 0;
+}
+
+/* Check once again if the fd has been opened on the cached subvol.
+ * If not, open and update the fd_ctx.
+ */
+
+int
+dht_check_and_open_fd_on_subvol_task(void *data)
+{
+ loc_t loc = {
+ 0,
+ };
+ int ret = -1;
+ call_frame_t *frame = NULL;
+ dht_local_t *local = NULL;
+ fd_t *fd = NULL;
+ xlator_t *this = NULL;
+ xlator_t *subvol = NULL;
+
+ frame = data;
+ local = frame->local;
+ this = THIS;
+ fd = local->fd;
+ subvol = local->cached_subvol;
+
+ local->fd_checked = _gf_true;
+
+ if (fd_is_anonymous(fd) || dht_fd_open_on_dst(this, fd, subvol)) {
+ ret = 0;
+ goto out;
+ }
+
+ gf_msg_debug(this->name, 0, "Opening fd (%p, flags=0%o) on file %s @ %s",
+ fd, fd->flags, uuid_utoa(fd->inode->gfid), subvol->name);
+
+ loc.inode = inode_ref(fd->inode);
+ gf_uuid_copy(loc.gfid, fd->inode->gfid);
+
+ /* Open this on the dst subvol */
+
+ SYNCTASK_SETID(0, 0);
+
+ ret = syncop_open(subvol, &loc, (fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
+ fd, NULL, NULL);
+
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_OPEN_FD_ON_DST_FAILED,
+ "fd=%p", fd, "flags=0%o", fd->flags, "gfid=%s",
+ uuid_utoa(fd->inode->gfid), "name=%s", subvol->name, NULL);
+ /* This can happen if the cached subvol was updated in the
+ * inode_ctx and the fd was opened on the new cached suvol
+ * after this fop was wound on the old cached subvol.
+ * As we do not close the fd on the old subvol (a leak)
+ * don't treat ENOENT as an error and allow the phase1/phase2
+ * checks to handle it.
+ */
+
+ if ((-ret != ENOENT) && (-ret != ESTALE)) {
+ local->op_errno = -ret;
+ ret = -1;
+ } else {
+ ret = 0;
}
+
+ local->op_errno = -ret;
+ ret = -1;
+
+ } else {
+ dht_fd_ctx_set(this, fd, subvol);
+ }
+
+ SYNCTASK_SETID(frame->root->uid, frame->root->gid);
out:
- if (!ret) {
- /* !success */
- if (new_path)
- GF_FREE (new_path);
- if (new_name)
- GF_FREE (new_name);
- }
- return ret;
+ loc_wipe(&loc);
+
+ return ret;
+}
+
+int
+dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame)
+{
+ int ret = -1;
+ dht_local_t *local = NULL;
+
+ /*
+ if (dht_fd_open_on_dst (this, fd, subvol))
+ goto out;
+ */
+ local = frame->local;
+
+ ret = synctask_new(this->ctx->env, dht_check_and_open_fd_on_subvol_task,
+ dht_check_and_open_fd_on_subvol_complete, frame, frame);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SYNCTASK_CREATE_FAILED,
+ "to-check-and-open fd=%p", local->fd, NULL);
+ }
+
+ return ret;
}
int
-dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
- uint64_t *x_p)
+dht_frame_return(call_frame_t *frame)
{
- dht_conf_t *conf = NULL;
- int cnt = 0;
- int max = 0;
- uint64_t x = 0;
- xlator_t *subvol = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+
+ if (!frame)
+ return -1;
+
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
+ }
+ UNLOCK(&frame->lock);
+
+ return this_call_cnt;
+}
+
+/*
+ * Use this function to specify which subvol you want the file created
+ * on - this need not be the hashed subvol.
+ * Format: <filename>@<this->name>:<subvol-name>
+ * Eg: file-1@vol1-dht:vol1-client-0
+ * where vol1 is a pure distribute volume
+ * will create file-1 on vol1-client-0
+ */
- if (!this->private)
+int
+dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc,
+ xlator_t **subvol)
+{
+ char *new_name = NULL;
+ char *new_path = NULL;
+ xlator_list_t *trav = NULL;
+ char key[1024] = {
+ 0,
+ };
+ int ret = 0; /* not found */
+ int keylen = 0;
+ int name_len = 0;
+ int path_len = 0;
+
+ /* Why do other tasks if first required 'char' itself is not there */
+ if (!new_loc || !loc || !loc->name || !strchr(loc->name, '@')) {
+ /* Skip the GF_FREE checks here */
+ return ret;
+ }
+
+ trav = this->children;
+ while (trav) {
+ keylen = snprintf(key, sizeof(key), "*@%s:%s", this->name,
+ trav->xlator->name);
+ /* Ignore '*' */
+ keylen = keylen - 1;
+ if (fnmatch(key, loc->name, FNM_NOESCAPE) == 0) {
+ name_len = strlen(loc->name) - keylen;
+ new_name = GF_MALLOC(name_len + 1, gf_common_mt_char);
+ if (!new_name)
goto out;
+ if (fnmatch(key, loc->path, FNM_NOESCAPE) == 0) {
+ path_len = strlen(loc->path) - keylen;
+ new_path = GF_MALLOC(path_len + 1, gf_common_mt_char);
+ if (!new_path)
+ goto out;
+ snprintf(new_path, path_len + 1, "%s", loc->path);
+ }
+ snprintf(new_name, name_len + 1, "%s", loc->name);
+
+ if (new_loc) {
+ new_loc->path = ((new_path) ? new_path : gf_strdup(loc->path));
+ new_loc->name = new_name;
+ new_loc->inode = inode_ref(loc->inode);
+ new_loc->parent = inode_ref(loc->parent);
+ }
+ *subvol = trav->xlator;
+ ret = 1; /* success */
+ goto out;
+ }
+ trav = trav->next;
+ }
+out:
+ if (!ret) {
+ /* !success */
+ GF_FREE(new_path);
+ GF_FREE(new_name);
+ }
+ return ret;
+}
- conf = this->private;
- max = conf->subvolume_cnt;
+static xlator_t *
+dht_get_subvol_from_id(xlator_t *this, int client_id)
+{
+ xlator_t *xl = NULL;
+ dht_conf_t *conf = NULL;
+ char *sid = NULL;
+ int32_t ret = -1;
- cnt = y % max;
- x = y / max;
+ conf = this->private;
- subvol = conf->subvolumes[cnt];
+ ret = gf_asprintf(&sid, "%d", client_id);
+ if (ret == -1) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_ASPRINTF_FAILED, NULL);
+ goto out;
+ }
- if (subvol_p)
- *subvol_p = subvol;
+ if (dict_get_ptr(conf->leaf_to_subvol, sid, (void **)&xl))
+ xl = NULL;
- if (x_p)
- *x_p = x;
+ GF_FREE(sid);
out:
- return 0;
+ return xl;
}
+int
+dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol_p)
+{
+ int client_id = 0;
+ xlator_t *subvol = 0;
+ dht_conf_t *conf = NULL;
+
+ if (!this->private)
+ return -1;
+
+ conf = this->private;
+
+ client_id = gf_deitransform(this, y);
+
+ subvol = dht_get_subvol_from_id(this, client_id);
+
+ if (!subvol)
+ subvol = conf->subvolumes[0];
+
+ if (subvol_p)
+ *subvol_p = subvol;
+
+ return 0;
+}
void
-dht_local_wipe (xlator_t *this, dht_local_t *local)
+dht_local_wipe(xlator_t *this, dht_local_t *local)
{
- if (!local)
- return;
+ int i = 0;
- loc_wipe (&local->loc);
- loc_wipe (&local->loc2);
+ if (!local)
+ return;
- if (local->xattr)
- dict_unref (local->xattr);
+ loc_wipe(&local->loc);
+ loc_wipe(&local->loc2);
+ loc_wipe(&local->loc2_copy);
- if (local->inode)
- inode_unref (local->inode);
+ if (local->xattr)
+ dict_unref(local->xattr);
- if (local->layout) {
- dht_layout_unref (this, local->layout);
- local->layout = NULL;
- }
+ if (local->inode)
+ inode_unref(local->inode);
- loc_wipe (&local->linkfile.loc);
+ if (local->layout) {
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
+ }
- if (local->linkfile.xattr)
- dict_unref (local->linkfile.xattr);
+ loc_wipe(&local->linkfile.loc);
- if (local->linkfile.inode)
- inode_unref (local->linkfile.inode);
+ if (local->linkfile.xattr)
+ dict_unref(local->linkfile.xattr);
- if (local->fd) {
- fd_unref (local->fd);
- local->fd = NULL;
- }
+ if (local->linkfile.inode)
+ inode_unref(local->linkfile.inode);
- if (local->params) {
- dict_unref (local->params);
- local->params = NULL;
- }
+ if (local->fd) {
+ fd_unref(local->fd);
+ local->fd = NULL;
+ }
- if (local->xattr_req)
- dict_unref (local->xattr_req);
+ if (local->params) {
+ dict_unref(local->params);
+ local->params = NULL;
+ }
- if (local->selfheal.layout) {
- dht_layout_unref (this, local->selfheal.layout);
- local->selfheal.layout = NULL;
- }
+ if (local->xattr_req)
+ dict_unref(local->xattr_req);
+ if (local->mds_xattr)
+ dict_unref(local->mds_xattr);
+ if (local->xdata)
+ dict_unref(local->xdata);
- if (local->newpath) {
- GF_FREE (local->newpath);
- }
+ if (local->selfheal.layout) {
+ dht_layout_unref(this, local->selfheal.layout);
+ local->selfheal.layout = NULL;
+ }
- GF_FREE (local);
-}
+ if (local->selfheal.refreshed_layout) {
+ dht_layout_unref(this, local->selfheal.refreshed_layout);
+ local->selfheal.refreshed_layout = NULL;
+ }
+ for (i = 0; i < 2; i++) {
+ dht_lock_array_free(local->lock[i].ns.parent_layout.locks,
+ local->lock[i].ns.parent_layout.lk_count);
-dht_local_t *
-dht_local_init (call_frame_t *frame)
-{
- dht_local_t *local = NULL;
+ GF_FREE(local->lock[i].ns.parent_layout.locks);
- /* TODO: use mem-pool */
- local = GF_CALLOC (1, sizeof (*local),
- gf_dht_mt_dht_local_t);
+ dht_lock_array_free(local->lock[i].ns.directory_ns.locks,
+ local->lock[i].ns.directory_ns.lk_count);
+ GF_FREE(local->lock[i].ns.directory_ns.locks);
+ }
- if (!local)
- return NULL;
+ GF_FREE(local->key);
- local->op_ret = -1;
- local->op_errno = EUCLEAN;
+ if (local->rebalance.xdata)
+ dict_unref(local->rebalance.xdata);
- frame->local = local;
+ if (local->rebalance.xattr)
+ dict_unref(local->rebalance.xattr);
- return local;
-}
+ if (local->rebalance.dict)
+ dict_unref(local->rebalance.dict);
+ GF_FREE(local->rebalance.vector);
-char *
-basestr (const char *str)
-{
- char *basestr = NULL;
+ if (local->rebalance.iobref)
+ iobref_unref(local->rebalance.iobref);
- basestr = strrchr (str, '/');
- if (basestr)
- basestr ++;
+ if (local->stub) {
+ call_stub_destroy(local->stub);
+ local->stub = NULL;
+ }
- return basestr;
-}
+ if (local->ret_cache)
+ GF_FREE(local->ret_cache);
+ mem_put(local);
+}
-xlator_t *
-dht_first_up_subvol (xlator_t *this)
+dht_local_t *
+dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop)
{
- dht_conf_t *conf = NULL;
- xlator_t *child = NULL;
- int i = 0;
+ dht_local_t *local = NULL;
+ inode_t *inode = NULL;
+ int ret = 0;
- conf = this->private;
- if (!conf)
- goto out;
+ local = mem_get0(THIS->local_pool);
+ if (!local)
+ goto out;
+
+ if (loc) {
+ ret = loc_copy(&local->loc, loc);
+ if (ret)
+ goto out;
+
+ inode = loc->inode;
+ }
+
+ if (fd) {
+ local->fd = fd_ref(fd);
+ if (!inode)
+ inode = fd->inode;
+ }
+
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+ local->fop = fop;
- LOCK (&conf->subvolume_lock);
- {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvolume_status[i]) {
- child = conf->subvolumes[i];
- break;
- }
- }
- }
- UNLOCK (&conf->subvolume_lock);
+ if (inode) {
+ local->layout = dht_layout_get(frame->this, inode);
+ local->cached_subvol = dht_subvol_get_cached(frame->this, inode);
+ }
+
+ frame->local = local;
out:
- return child;
+ if (ret) {
+ if (local)
+ mem_put(local);
+ local = NULL;
+ }
+ return local;
}
xlator_t *
-dht_last_up_subvol (xlator_t *this)
+dht_first_up_subvol(xlator_t *this)
{
- dht_conf_t *conf = NULL;
- xlator_t *child = NULL;
- int i = 0;
+ dht_conf_t *conf = NULL;
+ xlator_t *child = NULL;
+ int i = 0;
+ time_t time = 0;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ LOCK(&conf->subvolume_lock);
+ {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvol_up_time[i]) {
+ if (!time) {
+ time = conf->subvol_up_time[i];
+ child = conf->subvolumes[i];
+ } else if (time > conf->subvol_up_time[i]) {
+ time = conf->subvol_up_time[i];
+ child = conf->subvolumes[i];
+ }
+ }
+ }
+ }
+ UNLOCK(&conf->subvolume_lock);
- conf = this->private;
- if (!conf)
- goto out;
+out:
+ return child;
+}
- LOCK (&conf->subvolume_lock);
- {
- for (i = conf->subvolume_cnt-1; i >= 0; i--) {
- if (conf->subvolume_status[i]) {
- child = conf->subvolumes[i];
- break;
- }
- }
+xlator_t *
+dht_last_up_subvol(xlator_t *this)
+{
+ dht_conf_t *conf = NULL;
+ xlator_t *child = NULL;
+ int i = 0;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ LOCK(&conf->subvolume_lock);
+ {
+ for (i = conf->subvolume_cnt - 1; i >= 0; i--) {
+ if (conf->subvolume_status[i]) {
+ child = conf->subvolumes[i];
+ break;
+ }
}
- UNLOCK (&conf->subvolume_lock);
+ }
+ UNLOCK(&conf->subvolume_lock);
out:
- return child;
+ return child;
}
xlator_t *
-dht_subvol_get_hashed (xlator_t *this, loc_t *loc)
+dht_subvol_get_hashed(xlator_t *this, loc_t *loc)
{
- dht_layout_t *layout = NULL;
- xlator_t *subvol = NULL;
+ dht_layout_t *layout = NULL;
+ xlator_t *subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
- if (is_fs_root (loc)) {
- subvol = dht_first_up_subvol (this);
- goto out;
- }
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, loc, out);
- layout = dht_layout_get (this, loc->parent);
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, out);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "layout missing path=%s parent=%"PRId64,
- loc->path, loc->parent->ino);
- goto out;
- }
+ methods = &(conf->methods);
- subvol = dht_layout_search (this, layout, loc->name);
+ if (__is_root_gfid(loc->gfid)) {
+ subvol = dht_first_up_subvol(this);
+ goto out;
+ }
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not find subvolume for path=%s",
- loc->path);
- goto out;
- }
+ GF_VALIDATE_OR_GOTO(this->name, loc->parent, out);
+ GF_VALIDATE_OR_GOTO(this->name, loc->name, out);
+
+ layout = dht_layout_get(this, loc->parent);
+
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "Missing layout. path=%s, parent gfid =%s",
+ loc->path, uuid_utoa(loc->parent->gfid));
+ goto out;
+ }
+
+ subvol = methods->layout_search(this, layout, loc->name);
+
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "No hashed subvolume for path=%s",
+ loc->path);
+ goto out;
+ }
out:
- if (layout) {
- dht_layout_unref (this, layout);
- }
+ if (layout) {
+ dht_layout_unref(this, layout);
+ }
- return subvol;
+ return subvol;
}
-
xlator_t *
-dht_subvol_get_cached (xlator_t *this, inode_t *inode)
+dht_subvol_get_cached(xlator_t *this, inode_t *inode)
{
- dht_layout_t *layout = NULL;
- xlator_t *subvol = NULL;
+ dht_layout_t *layout = NULL;
+ xlator_t *subvol = NULL;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- layout = dht_layout_get (this, inode);
+ layout = dht_layout_get(this, inode);
- if (!layout) {
- goto out;
+ if (!layout) {
+ goto out;
+ }
+
+ subvol = layout->list[0].xlator;
+
+out:
+ if (layout) {
+ dht_layout_unref(this, layout);
+ }
+
+ return subvol;
+}
+
+xlator_t *
+dht_subvol_next(xlator_t *this, xlator_t *prev)
+{
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *next = NULL;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev) {
+ if ((i + 1) < conf->subvolume_cnt)
+ next = conf->subvolumes[i + 1];
+ break;
}
+ }
- subvol = layout->list[0].xlator;
+out:
+ return next;
+}
+
+/* This func wraps around, if prev is actually the last subvol.
+ */
+xlator_t *
+dht_subvol_next_available(xlator_t *this, xlator_t *prev)
+{
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *next = NULL;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev) {
+ /* if prev is last in conf->subvolumes, then wrap
+ * around.
+ */
+ if ((i + 1) < conf->subvolume_cnt) {
+ next = conf->subvolumes[i + 1];
+ } else {
+ next = conf->subvolumes[0];
+ }
+ break;
+ }
+ }
out:
- if (layout) {
- dht_layout_unref (this, layout);
+ return next;
+}
+int
+dht_subvol_cnt(xlator_t *this, xlator_t *subvol)
+{
+ int i = 0;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ ret = i;
+ break;
}
+ }
- return subvol;
+out:
+ return ret;
}
+#define set_if_greater(a, b) \
+ do { \
+ if ((a) < (b)) \
+ (a) = (b); \
+ } while (0)
-xlator_t *
-dht_subvol_next (xlator_t *this, xlator_t *prev)
+#define set_if_greater_time(a, an, b, bn) \
+ do { \
+ if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) { \
+ (a) = (b); \
+ (an) = (bn); \
+ } \
+ } while (0)
+
+int
+dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from)
{
- dht_conf_t *conf = NULL;
- int i = 0;
- xlator_t *next = NULL;
+ if (!from || !to)
+ return 0;
- conf = this->private;
- if (!conf)
- goto out;
+ to->ia_dev = from->ia_dev;
+
+ gf_uuid_copy(to->ia_gfid, from->ia_gfid);
+
+ to->ia_ino = from->ia_ino;
+ to->ia_prot = from->ia_prot;
+ to->ia_type = from->ia_type;
+ to->ia_nlink = from->ia_nlink;
+ to->ia_rdev = from->ia_rdev;
+ to->ia_size += from->ia_size;
+ to->ia_blksize = from->ia_blksize;
+ to->ia_blocks += from->ia_blocks;
+
+ if (IA_ISDIR(from->ia_type)) {
+ to->ia_blocks = DHT_DIR_STAT_BLOCKS;
+ to->ia_size = DHT_DIR_STAT_SIZE;
+ }
+ set_if_greater(to->ia_uid, from->ia_uid);
+ set_if_greater(to->ia_gid, from->ia_gid);
+
+ set_if_greater_time(to->ia_atime, to->ia_atime_nsec, from->ia_atime,
+ from->ia_atime_nsec);
+ set_if_greater_time(to->ia_mtime, to->ia_mtime_nsec, from->ia_mtime,
+ from->ia_mtime_nsec);
+ set_if_greater_time(to->ia_ctime, to->ia_ctime_nsec, from->ia_ctime,
+ from->ia_ctime_nsec);
+
+ return 0;
+}
+
+int
+dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name)
+{
+ if (!child) {
+ goto err;
+ }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvolumes[i] == prev) {
- if ((i + 1) < conf->subvolume_cnt)
- next = conf->subvolumes[i + 1];
- break;
- }
- }
+ if (strcmp(parent->path, "/") == 0)
+ gf_asprintf((char **)&child->path, "/%s", name);
+ else
+ gf_asprintf((char **)&child->path, "%s/%s", parent->path, name);
-out:
- return next;
+ if (!child->path) {
+ goto err;
+ }
+
+ child->name = strrchr(child->path, '/');
+ if (child->name)
+ child->name++;
+
+ child->parent = inode_ref(parent->inode);
+ child->inode = inode_new(parent->inode->table);
+
+ if (!child->inode) {
+ goto err;
+ }
+
+ return 0;
+err:
+ if (child) {
+ loc_wipe(child);
+ }
+ return -1;
}
+int
+dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf)
+{
+ xlator_list_t *subvols = NULL;
+ int cnt = 0;
+
+ if (!conf)
+ return -1;
+
+ for (subvols = this->children; subvols; subvols = subvols->next)
+ cnt++;
+
+ conf->local_subvols = GF_CALLOC(cnt, sizeof(xlator_t *),
+ gf_dht_mt_xlator_t);
+
+ /* FIX FIX : do this dynamically*/
+ conf->local_nodeuuids = GF_CALLOC(cnt, sizeof(subvol_nodeuuids_info_t),
+ gf_dht_nodeuuids_t);
+
+ if (!conf->local_subvols || !conf->local_nodeuuids) {
+ return -1;
+ }
+
+ conf->local_subvols_cnt = 0;
+
+ return 0;
+}
int
-dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
+dht_init_subvolumes(xlator_t *this, dht_conf_t *conf)
{
- int i = 0;
- int ret = -1;
- dht_conf_t *conf = NULL;
+ xlator_list_t *subvols = NULL;
+ int cnt = 0;
- conf = this->private;
- if (!conf)
- goto out;
+ if (!conf)
+ return -1;
+
+ for (subvols = this->children; subvols; subvols = subvols->next)
+ cnt++;
+
+ conf->subvolumes = GF_CALLOC(cnt, sizeof(xlator_t *), gf_dht_mt_xlator_t);
+ if (!conf->subvolumes) {
+ return -1;
+ }
+ conf->subvolume_cnt = cnt;
+ /* Doesn't make sense to do any dht layer tasks
+ if the subvol count is 1. Set it as pass_through */
+ if (cnt == 1)
+ this->pass_through = _gf_true;
+
+ conf->local_subvols_cnt = 0;
+
+ dht_set_subvol_range(this);
+
+ cnt = 0;
+ for (subvols = this->children; subvols; subvols = subvols->next)
+ conf->subvolumes[cnt++] = subvols->xlator;
+
+ conf->subvolume_status = GF_CALLOC(cnt, sizeof(char), gf_dht_mt_char);
+ if (!conf->subvolume_status) {
+ return -1;
+ }
+
+ conf->last_event = GF_CALLOC(cnt, sizeof(int), gf_dht_mt_char);
+ if (!conf->last_event) {
+ return -1;
+ }
+
+ conf->subvol_up_time = GF_CALLOC(cnt, sizeof(time_t),
+ gf_dht_mt_subvol_time);
+ if (!conf->subvol_up_time) {
+ return -1;
+ }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (subvol == conf->subvolumes[i]) {
- ret = i;
- break;
- }
- }
+ conf->du_stats = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_du_t),
+ gf_dht_mt_dht_du_t);
+ if (!conf->du_stats) {
+ return -1;
+ }
+
+ conf->decommissioned_bricks = GF_CALLOC(cnt, sizeof(xlator_t *),
+ gf_dht_mt_xlator_t);
+ if (!conf->decommissioned_bricks) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ op_ret values :
+ 0 : Success.
+ -1 : Failure.
+ 1 : File is being migrated but not by this DHT layer.
+*/
+
+static int
+dht_migration_complete_check_done(int op_ret, call_frame_t *frame, void *data)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (local->cached_subvol == NULL) {
+ local->op_errno = EINVAL;
+ goto out;
+ }
+
+ subvol = local->cached_subvol;
out:
- return ret;
+ local->rebalance.target_op_fn(THIS, subvol, frame, op_ret);
+
+ return 0;
}
+int
+dht_migration_complete_check_task(void *data)
+{
+ int ret = -1;
+ xlator_t *src_node = NULL;
+ xlator_t *dst_node = NULL, *linkto_target = NULL;
+ dht_local_t *local = NULL;
+ dict_t *dict = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ xlator_t *this = NULL;
+ call_frame_t *frame = NULL;
+ loc_t tmp_loc = {
+ 0,
+ };
+ char *path = NULL;
+ dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ fd_t *tmp = NULL;
+ uint64_t tmp_miginfo = 0;
+ dht_migrate_info_t *miginfo = NULL;
+ gf_boolean_t skip_open = _gf_false;
+ int open_failed = 0;
+
+ this = THIS;
+ frame = data;
+ local = frame->local;
+ conf = this->private;
+
+ src_node = local->cached_subvol;
+
+ if (!local->loc.inode && !local->fd) {
+ local->op_errno = EINVAL;
+ goto out;
+ }
+
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+ * as root:root. If a fd is already open, access check won't be done*/
+
+ if (!local->loc.inode) {
+ ret = syncop_fgetxattr(src_node, local->fd, &dict,
+ conf->link_xattr_name, NULL, NULL);
+ } else {
+ SYNCTASK_SETID(0, 0);
+ ret = syncop_getxattr(src_node, &local->loc, &dict,
+ conf->link_xattr_name, NULL, NULL);
+ SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+ }
+
+ /*
+ * Each DHT xlator layer has its own name for the linkto xattr.
+ * If the file mode bits indicate the the file is being migrated but
+ * this layer's linkto xattr is not set, it means that another
+ * DHT layer is migrating the file. In this case, return 1 so
+ * the mode bits can be passed on to the higher layer for appropriate
+ * action.
+ */
+ if (-ret == ENODATA) {
+ /* This DHT translator is not migrating this file */
+
+ ret = inode_ctx_reset1(inode, this, &tmp_miginfo);
+ if (tmp_miginfo) {
+ /* This can be a problem if the file was
+ * migrated by two different layers. Raise
+ * a warning here.
+ */
+ gf_smsg(
+ this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL);
+
+ miginfo = (void *)(uintptr_t)tmp_miginfo;
+ GF_REF_PUT(miginfo);
+ }
+ ret = 1;
+ goto out;
+ }
+
+ if (!ret)
+ linkto_target = dht_linkfile_subvol(this, NULL, NULL, dict);
+
+ if (local->loc.inode) {
+ loc_copy(&tmp_loc, &local->loc);
+ } else {
+ tmp_loc.inode = inode_ref(inode);
+ gf_uuid_copy(tmp_loc.gfid, inode->gfid);
+ }
+
+ ret = syncop_lookup(this, &tmp_loc, &stbuf, 0, 0, 0);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED,
+ "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "name=%s", this->name, NULL);
+ local->op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ dst_node = dht_subvol_get_cached(this, tmp_loc.inode);
+ if (linkto_target && dst_node != linkto_target) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_LINKFILE,
+ "linkto_target_name=%s", linkto_target->name, "dst_name=%s",
+ dst_node->name, NULL);
+ }
+
+ if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "dst_name=%s", dst_node->name, NULL);
+ ret = -1;
+ local->op_errno = EIO;
+ goto out;
+ }
+
+ /* update local. A layout is set in inode-ctx in lookup already */
+
+ dht_layout_unref(this, local->layout);
+
+ local->layout = dht_layout_get(frame->this, inode);
+ local->cached_subvol = dst_node;
+
+ ret = 0;
+
+ /* once we detect the migration complete, the inode-ctx2 is no more
+ required.. delete the ctx and also, it means, open() already
+ done on all the fd of inode */
+ ret = inode_ctx_reset1(inode, this, &tmp_miginfo);
+ if (tmp_miginfo) {
+ miginfo = (void *)(uintptr_t)tmp_miginfo;
+ GF_REF_PUT(miginfo);
+ goto out;
+ }
+
+ /* perform 'open()' on all the fd's present on the inode */
+ if (tmp_loc.path == NULL) {
+ inode_path(inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ }
+
+ LOCK(&inode->lock);
+
+ if (list_empty(&inode->fd_list))
+ goto unlock;
+
+ /* perform open as root:root. There is window between linkfile
+ * creation(root:root) and setattr with the correct uid/gid
+ */
+ SYNCTASK_SETID(0, 0);
+
+ /* It's possible that we are the last user of iter_fd after each
+ * iteration. In this case the fd_unref() of iter_fd at the end of
+ * the loop will cause the destruction of the fd. So we need to
+ * iterate the list safely because iter_fd cannot be trusted.
+ */
+ iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list);
+ while (&iter_fd->inode_list != (&inode->fd_list)) {
+ if (fd_is_anonymous(iter_fd) ||
+ (dht_fd_open_on_dst(this, iter_fd, dst_node))) {
+ if (!tmp) {
+ iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd),
+ inode_list);
+ continue;
+ }
+ skip_open = _gf_true;
+ }
+ /* We need to release the inode->lock before calling
+ * syncop_open() to avoid possible deadlocks. However this
+ * can cause the iter_fd to be released by other threads.
+ * To avoid this, we take a reference before releasing the
+ * lock.
+ */
+ fd_ref(iter_fd);
+
+ UNLOCK(&inode->lock);
+
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
+ }
+ if (skip_open)
+ goto next;
+
+ /* flags for open are stripped down to allow following the
+ * new location of the file, otherwise we can get EEXIST or
+ * truncate the file again as rebalance is moving the data */
+ ret = syncop_open(dst_node, &tmp_loc,
+ (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
+ iter_fd, NULL, NULL);
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_OPEN_FD_ON_DST_FAILED, "id=%p", iter_fd,
+ "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s",
+ dst_node->name, NULL);
+
+ open_failed = 1;
+ local->op_errno = -ret;
+ ret = -1;
+ } else {
+ dht_fd_ctx_set(this, iter_fd, dst_node);
+ }
+
+ next:
+ LOCK(&inode->lock);
+ skip_open = _gf_false;
+ tmp = iter_fd;
+ iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list);
+ }
+
+ SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+
+ if (open_failed) {
+ ret = -1;
+ goto unlock;
+ }
+ ret = 0;
+
+unlock:
+ UNLOCK(&inode->lock);
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
+ }
-#define set_if_greater(a, b) do { \
- if ((a) < (b)) \
- (a) = (b); \
- } while (0)
+out:
+ if (dict) {
+ dict_unref(dict);
+ }
+
+ loc_wipe(&tmp_loc);
+
+ return ret;
+}
int
-dht_iatt_merge (xlator_t *this, struct iatt *to,
- struct iatt *from, xlator_t *subvol)
+dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame)
{
- if (!from || !to)
- return 0;
+ int ret = -1;
+
+ ret = synctask_new(this->ctx->env, dht_migration_complete_check_task,
+ dht_migration_complete_check_done, frame, frame);
+ return ret;
+}
- to->ia_dev = from->ia_dev;
+/* During 'in-progress' state, both nodes should have the file */
+/*
+ op_ret values :
+ 0 : Success
+ -1 : Failure.
+ 1 : File is being migrated but not by this DHT layer.
+*/
+static int
+dht_inprogress_check_done(int op_ret, call_frame_t *frame, void *data)
+{
+ dht_local_t *local = NULL;
+ xlator_t *dst_subvol = NULL, *src_subvol = NULL;
+ inode_t *inode = NULL;
- uuid_copy (to->ia_gfid, from->ia_gfid);
+ local = frame->local;
- dht_itransform (this, subvol, from->ia_ino, &to->ia_ino);
+ if (op_ret != 0)
+ goto out;
- to->ia_prot = from->ia_prot;
- to->ia_type = from->ia_type;
- to->ia_nlink = from->ia_nlink;
- to->ia_rdev = from->ia_rdev;
- to->ia_size += from->ia_size;
- to->ia_blksize = from->ia_blksize;
- to->ia_blocks += from->ia_blocks;
+ inode = local->loc.inode ? local->loc.inode : local->fd->inode;
- set_if_greater (to->ia_uid, from->ia_uid);
- set_if_greater (to->ia_gid, from->ia_gid);
+ dht_inode_ctx_get_mig_info(THIS, inode, &src_subvol, &dst_subvol);
+ if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, dst_subvol)) {
+ dst_subvol = dht_subvol_get_cached(THIS, inode);
+ if (!dst_subvol) {
+ local->op_errno = EINVAL;
+ goto out;
+ }
+ }
- set_if_greater (to->ia_atime, from->ia_atime);
- set_if_greater (to->ia_mtime, from->ia_mtime);
- set_if_greater (to->ia_ctime, from->ia_ctime);
+out:
+ local->rebalance.target_op_fn(THIS, dst_subvol, frame, op_ret);
- return 0;
+ return 0;
+}
+
+static int
+dht_rebalance_inprogress_task(void *data)
+{
+ int ret = -1;
+ xlator_t *src_node = NULL;
+ xlator_t *dst_node = NULL;
+ dht_local_t *local = NULL;
+ dict_t *dict = NULL;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ char *path = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ loc_t tmp_loc = {
+ 0,
+ };
+ dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ fd_t *tmp = NULL;
+ int open_failed = 0;
+ uint64_t tmp_miginfo = 0;
+ dht_migrate_info_t *miginfo = NULL;
+ gf_boolean_t skip_open = _gf_false;
+
+ this = THIS;
+ frame = data;
+ local = frame->local;
+ conf = this->private;
+
+ src_node = local->cached_subvol;
+
+ if (!local->loc.inode && !local->fd)
+ goto out;
+
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+ * as root:root. If a fd is already open, access check won't be done*/
+ if (local->loc.inode) {
+ SYNCTASK_SETID(0, 0);
+ ret = syncop_getxattr(src_node, &local->loc, &dict,
+ conf->link_xattr_name, NULL, NULL);
+ SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+ } else {
+ ret = syncop_fgetxattr(src_node, local->fd, &dict,
+ conf->link_xattr_name, NULL, NULL);
+ }
+
+ /*
+ * Each DHT xlator layer has its own name for the linkto xattr.
+ * If the file mode bits indicate the the file is being migrated but
+ * this layer's linkto xattr is not present, it means that another
+ * DHT layer is migrating the file. In this case, return 1 so
+ * the mode bits can be passed on to the higher layer for appropriate
+ * action.
+ */
+
+ if (-ret == ENODATA) {
+ /* This DHT layer is not migrating this file */
+ ret = inode_ctx_reset1(inode, this, &tmp_miginfo);
+ if (tmp_miginfo) {
+ /* This can be a problem if the file was
+ * migrated by two different layers. Raise
+ * a warning here.
+ */
+ gf_smsg(
+ this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL);
+ miginfo = (void *)(uintptr_t)tmp_miginfo;
+ GF_REF_PUT(miginfo);
+ }
+ ret = 1;
+ goto out;
+ }
+
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_GET_XATTR_FAILED,
+ "path=%s", local->loc.path, NULL);
+ ret = -1;
+ goto out;
+ }
+
+ dst_node = dht_linkfile_subvol(this, NULL, NULL, dict);
+ if (!dst_node) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GET_XATTR_FAILED,
+ "path=%s", local->loc.path, NULL);
+ ret = -1;
+ goto out;
+ }
+
+ local->rebalance.target_node = dst_node;
+
+ if (local->loc.inode) {
+ loc_copy(&tmp_loc, &local->loc);
+ } else {
+ tmp_loc.inode = inode_ref(inode);
+ gf_uuid_copy(tmp_loc.gfid, inode->gfid);
+ }
+
+ /* lookup on dst */
+ ret = syncop_lookup(dst_node, &tmp_loc, &stbuf, NULL, NULL, NULL);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED,
+ "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "name=%s", dst_node->name, NULL);
+ ret = -1;
+ goto out;
+ }
+
+ if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "name=%s", dst_node->name, NULL);
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+
+ if (tmp_loc.path == NULL) {
+ inode_path(inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ }
+
+ LOCK(&inode->lock);
+
+ if (list_empty(&inode->fd_list))
+ goto unlock;
+
+ /* perform open as root:root. There is window between linkfile
+ * creation(root:root) and setattr with the correct uid/gid
+ */
+ SYNCTASK_SETID(0, 0);
+
+ /* It's possible that we are the last user of iter_fd after each
+ * iteration. In this case the fd_unref() of iter_fd at the end of
+ * the loop will cause the destruction of the fd. So we need to
+ * iterate the list safely because iter_fd cannot be trusted.
+ */
+ iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list);
+ while (&iter_fd->inode_list != (&inode->fd_list)) {
+ /* We need to release the inode->lock before calling
+ * syncop_open() to avoid possible deadlocks. However this
+ * can cause the iter_fd to be released by other threads.
+ * To avoid this, we take a reference before releasing the
+ * lock.
+ */
+
+ if (fd_is_anonymous(iter_fd) ||
+ (dht_fd_open_on_dst(this, iter_fd, dst_node))) {
+ if (!tmp) {
+ iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd),
+ inode_list);
+ continue;
+ }
+ skip_open = _gf_true;
+ }
+
+ /* Yes, this is ugly but there isn't a cleaner way to do this
+ * the fd_ref is an atomic increment so not too bad. We want to
+ * reduce the number of inode locks and unlocks.
+ */
+
+ fd_ref(iter_fd);
+ UNLOCK(&inode->lock);
+
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
+ }
+ if (skip_open)
+ goto next;
+
+ /* flags for open are stripped down to allow following the
+ * new location of the file, otherwise we can get EEXIST or
+ * truncate the file again as rebalance is moving the data */
+ ret = syncop_open(dst_node, &tmp_loc,
+ (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
+ iter_fd, NULL, NULL);
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_OPEN_FD_ON_DST_FAILED, "fd=%p", iter_fd,
+ "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s",
+ dst_node->name, NULL);
+ ret = -1;
+ open_failed = 1;
+ } else {
+ /* Potential fd leak if this fails here as it will be
+ reopened at the next Phase1/2 check */
+ dht_fd_ctx_set(this, iter_fd, dst_node);
+ }
+
+ next:
+ LOCK(&inode->lock);
+ skip_open = _gf_false;
+ tmp = iter_fd;
+ iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list);
+ }
+
+ SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+
+unlock:
+ UNLOCK(&inode->lock);
+
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
+ }
+ if (open_failed) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dht_inode_ctx_set_mig_info(this, inode, src_node, dst_node);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "path=%s", local->loc.path, "name=%s", dst_node->name, NULL);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (dict) {
+ dict_unref(dict);
+ }
+
+ loc_wipe(&tmp_loc);
+ return ret;
+}
+
+int
+dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame)
+{
+ int ret = -1;
+
+ ret = synctask_new(this->ctx->env, dht_rebalance_inprogress_task,
+ dht_inprogress_check_done, frame, frame);
+ return ret;
}
int
-dht_frame_su_do (call_frame_t *frame)
+dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this,
+ dht_layout_t *layout_int)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+
+ ret = dht_inode_ctx_get(inode, this, &ctx);
+ if (!ret && ctx) {
+ ctx->layout = layout_int;
+ } else {
+ ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
+ if (!ctx)
+ return ret;
+ ctx->layout = layout_int;
+ }
+
+ ret = dht_inode_ctx_set(inode, this, ctx);
+
+ return ret;
+}
+
+void
+dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat)
{
- dht_local_t *local = NULL;
+ dht_inode_ctx_t *ctx = NULL;
+ dht_stat_time_t *time = 0;
+ int ret = -1;
- local = frame->local;
+ ret = dht_inode_ctx_get(inode, this, &ctx);
- local->uid = frame->root->uid;
- local->gid = frame->root->gid;
+ if (ret)
+ return;
- frame->root->uid = 0;
- frame->root->gid = 0;
+ time = &ctx->time;
- return 0;
+ time->mtime = stat->ia_mtime;
+ time->mtime_nsec = stat->ia_mtime_nsec;
+
+ time->ctime = stat->ia_ctime;
+ time->ctime_nsec = stat->ia_ctime_nsec;
+
+ time->atime = stat->ia_atime;
+ time->atime_nsec = stat->ia_atime_nsec;
+
+ return;
}
+int
+dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat,
+ int32_t post)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ dht_stat_time_t *time = 0;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO(this->name, stat, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+ ret = dht_inode_ctx_get(inode, this, &ctx);
+
+ if (ret) {
+ ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
+ if (!ctx)
+ return -1;
+ }
+
+ time = &ctx->time;
+
+ LOCK(&inode->lock);
+ {
+ DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, stat->ia_mtime,
+ stat->ia_mtime_nsec, post);
+ DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, stat->ia_ctime,
+ stat->ia_ctime_nsec, post);
+ DHT_UPDATE_TIME(time->atime, time->atime_nsec, stat->ia_atime,
+ stat->ia_atime_nsec, post);
+ }
+ UNLOCK(&inode->lock);
+
+ ret = dht_inode_ctx_set(inode, this, ctx);
+out:
+ return 0;
+}
int
-dht_frame_su_undo (call_frame_t *frame)
+dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx)
{
- dht_local_t *local = NULL;
+ int ret = -1;
+ uint64_t ctx_int = 0;
- local = frame->local;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- frame->root->uid = local->uid;
- frame->root->gid = local->gid;
+ ret = inode_ctx_get(inode, this, &ctx_int);
- return 0;
+ if (ret)
+ return ret;
+
+ if (ctx)
+ *ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int;
+out:
+ return ret;
}
+int
+dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx)
+{
+ int ret = -1;
+ uint64_t ctx_int = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, ctx, out);
+
+ ctx_int = (long)ctx;
+ ret = inode_ctx_set(inode, this, &ctx_int);
+out:
+ return ret;
+}
int
-dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+dht_subvol_status(dht_conf_t *conf, xlator_t *subvol)
+{
+ int i;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == subvol) {
+ return conf->subvolume_status[i];
+ }
+ }
+ return 0;
+}
+
+inode_t *
+dht_heal_path(xlator_t *this, char *path, inode_table_t *itable)
{
- if (!child) {
- goto err;
+ int ret = -1;
+ struct iatt iatt = {
+ 0,
+ };
+ inode_t *linked_inode = NULL;
+ loc_t loc = {
+ 0,
+ };
+ char *bname = NULL;
+ char *save_ptr = NULL;
+ static uuid_t gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+ char *tmp_path = NULL;
+
+ tmp_path = gf_strdup(path);
+ if (!tmp_path) {
+ goto out;
+ }
+
+ gf_uuid_copy(loc.pargfid, gfid);
+ loc.parent = inode_ref(itable->root);
+
+ bname = strtok_r(tmp_path, "/", &save_ptr);
+
+ /* sending a lookup on parent directory,
+ * Eg: if path is like /a/b/c/d/e/f/g/
+ * then we will send a lookup on a first and then b,c,d,etc
+ */
+
+ while (bname) {
+ linked_inode = NULL;
+ loc.inode = inode_grep(itable, loc.parent, bname);
+ if (loc.inode == NULL) {
+ loc.inode = inode_new(itable);
+ if (loc.inode == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else {
+ /*
+ * Inode is already populated in the inode table.
+ * Which means we already looked up the inode and
+ * linked with a dentry. So that we will skip
+ * lookup on this entry, and proceed to next.
+ */
+ linked_inode = loc.inode;
+ bname = strtok_r(NULL, "/", &save_ptr);
+ if (!bname) {
+ goto out;
+ }
+ inode_unref(loc.parent);
+ loc.parent = loc.inode;
+ gf_uuid_copy(loc.pargfid, loc.inode->gfid);
+ loc.inode = NULL;
+ continue;
}
- if (strcmp (parent->path, "/") == 0)
- gf_asprintf ((char **)&child->path, "/%s", name);
- else
- gf_asprintf ((char **)&child->path, "%s/%s", parent->path, name);
+ loc.name = bname;
+ ret = loc_path(&loc, bname);
- if (!child->path) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
+ ret = syncop_lookup(this, &loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_SELFHEAL_FAILED,
+ "path=%s", path, "subvolume=%s", this->name, "bname=%s",
+ bname, NULL);
+ goto out;
}
- child->name = strrchr (child->path, '/');
- if (child->name)
- child->name++;
+ linked_inode = inode_link(loc.inode, loc.parent, bname, &iatt);
+ if (!linked_inode)
+ goto out;
+
+ loc_wipe(&loc);
+ gf_uuid_copy(loc.pargfid, linked_inode->gfid);
+ loc.inode = NULL;
+
+ bname = strtok_r(NULL, "/", &save_ptr);
+ if (bname)
+ loc.parent = linked_inode;
+ }
+out:
+ inode_ref(linked_inode);
+ loc_wipe(&loc);
+ GF_FREE(tmp_path);
- child->parent = inode_ref (parent->inode);
- child->inode = inode_new (parent->inode->table);
+ return linked_inode;
+}
- if (!child->inode) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
+int
+dht_heal_full_path(void *data)
+{
+ call_frame_t *heal_frame = data;
+ dht_local_t *local = NULL;
+ loc_t loc = {
+ 0,
+ };
+ dict_t *dict = NULL;
+ char *path = NULL;
+ int ret = -1;
+ xlator_t *source = NULL;
+ xlator_t *this = NULL;
+ inode_table_t *itable = NULL;
+ inode_t *inode = NULL;
+ inode_t *tmp_inode = NULL;
+
+ GF_VALIDATE_OR_GOTO("DHT", heal_frame, out);
+
+ local = heal_frame->local;
+ this = heal_frame->this;
+ source = heal_frame->cookie;
+ heal_frame->cookie = NULL;
+ gf_uuid_copy(loc.gfid, local->gfid);
+
+ if (local->loc.inode)
+ loc.inode = inode_ref(local->loc.inode);
+ else
+ goto out;
+
+ itable = loc.inode->table;
+ ret = syncop_getxattr(source, &loc, &dict, GET_ANCESTRY_PATH_KEY, NULL,
+ NULL);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_HEAL_ABORT,
+ "subvol=%s", source->name, NULL);
+ goto out;
+ }
+
+ ret = dict_get_str(dict, GET_ANCESTRY_PATH_KEY, &path);
+ if (path) {
+ inode = dht_heal_path(this, path, itable);
+ if (inode && inode != local->inode) {
+ /*
+ * if inode returned by heal function is different
+ * from what we passed, which means a racing thread
+ * already linked a different inode for dentry.
+ * So we will update our local->inode, so that we can
+ * retrurn proper inode.
+ */
+ tmp_inode = local->inode;
+ local->inode = inode;
+ inode_unref(tmp_inode);
+ tmp_inode = NULL;
+ } else {
+ inode_unref(inode);
}
+ }
- return 0;
-err:
- loc_wipe (child);
+out:
+ loc_wipe(&loc);
+ if (dict)
+ dict_unref(dict);
+ return 0;
+}
+
+int
+dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data)
+{
+ call_frame_t *main_frame = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ int ret = -1;
+ int op_errno = 0;
+
+ local = heal_frame->local;
+ main_frame = local->main_frame;
+ local->main_frame = NULL;
+ this = heal_frame->this;
+
+ dht_set_fixed_dir_stat(&local->postparent);
+ if (local->need_xattr_heal) {
+ local->need_xattr_heal = 0;
+ ret = dht_dir_xattr_heal(this, local, &op_errno);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path,
+ NULL);
+ }
+ }
+
+ DHT_STACK_UNWIND(lookup, main_frame, 0, 0, local->inode, &local->stbuf,
+ local->xattr, &local->postparent);
+
+ DHT_STACK_DESTROY(heal_frame);
+ return 0;
+}
+
+/* This function must be called inside an inode lock */
+int
+__dht_lock_subvol_set(inode_t *inode, xlator_t *this, xlator_t *lock_subvol)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+ uint64_t value = 0;
+
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+ ret = __inode_ctx_get0(inode, this, &value);
+ if (ret || !value) {
return -1;
+ }
+
+ ctx = (dht_inode_ctx_t *)(uintptr_t)value;
+ ctx->lock_subvol = lock_subvol;
+out:
+ return ret;
+}
+
+xlator_t *
+dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock,
+ dht_local_t *local)
+{
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ int32_t ret = -1;
+ uint64_t value = 0;
+ xlator_t *cached_subvol = NULL;
+ dht_inode_ctx_t *ctx = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ GF_VALIDATE_OR_GOTO(this->name, lock, out);
+ GF_VALIDATE_OR_GOTO(this->name, local, out);
+
+ cached_subvol = local->cached_subvol;
+
+ if (local->loc.inode || local->fd) {
+ inode = local->loc.inode ? local->loc.inode : local->fd->inode;
+ }
+
+ if (!inode)
+ goto out;
+
+ if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) {
+ /*
+ * We may get non-linked inode for directories as part
+ * of the selfheal code path. So checking for IA_INVAL
+ * type also. This will only happen for directory.
+ */
+ subvol = local->cached_subvol;
+ goto out;
+ }
+
+ if (lock->l_type != F_UNLCK) {
+ /*
+ * inode purging might happen on NFS between a lk
+ * and unlk. Due to this lk and unlk might be sent
+ * to different subvols.
+ * So during a lock request, taking a ref on inode
+ * to prevent inode purging. inode unref will happen
+ * in unlock cbk code path.
+ */
+ inode_ref(inode);
+ }
+
+ LOCK(&inode->lock);
+ ret = __inode_ctx_get0(inode, this, &value);
+ if (!ret && value) {
+ ctx = (dht_inode_ctx_t *)(uintptr_t)value;
+ subvol = ctx->lock_subvol;
+ }
+ if (!subvol && lock->l_type != F_UNLCK && cached_subvol) {
+ ret = __dht_lock_subvol_set(inode, this, cached_subvol);
+ if (ret) {
+ gf_uuid_unparse(inode->gfid, gfid);
+ UNLOCK(&inode->lock);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "lock_subvol gfid=%s", gfid, NULL);
+ goto post_unlock;
+ }
+ subvol = cached_subvol;
+ }
+ UNLOCK(&inode->lock);
+post_unlock:
+ if (!subvol && inode && lock->l_type != F_UNLCK) {
+ inode_unref(inode);
+ }
+out:
+ return subvol;
+}
+
+int
+dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret)
+{
+ int ret = -1;
+ dht_local_t *local = NULL;
+ inode_t *inode = NULL;
+ xlator_t *this = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ this = frame->this;
+
+ if (local->loc.inode || local->fd) {
+ inode = local->loc.inode ? local->loc.inode : local->fd->inode;
+ }
+ if (!inode) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LOCK_INODE_UNREF_FAILED,
+ NULL);
+ goto out;
+ }
+
+ if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) {
+ ret = 0;
+ goto out;
+ }
+
+ switch (local->lock_type) {
+ case F_RDLCK:
+ case F_WRLCK:
+ if (op_ret) {
+ gf_uuid_unparse(inode->gfid, gfid);
+ gf_msg_debug(this->name, 0, "lock request failed for gfid %s",
+ gfid);
+ inode_unref(inode);
+ goto out;
+ }
+ break;
+
+ case F_UNLCK:
+ if (!op_ret) {
+ inode_unref(inode);
+ } else {
+ gf_uuid_unparse(inode->gfid, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LOCK_INODE_UNREF_FAILED, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+ default:
+ break;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+/* Code to update custom extended attributes from src dict to dst dict
+ */
+void
+dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst,
+ dict_t *src, int *uret, int *uflag)
+{
+ int ret = -1;
+ data_t *keyval = NULL;
+ int luret = -1;
+ int luflag = -1;
+ int i = 0;
+ char **xattrs_to_heal;
+
+ if (!src || !dst) {
+ gf_smsg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DST_NULL_SET_FAILED,
+ "path=%s", local->loc.path, NULL);
+ return;
+ }
+ /* Check if any user xattr present in src dict and set
+ it to dst dict
+ */
+ luret = dict_foreach_fnmatch(src, "user.*", dht_set_user_xattr, dst);
+ /* Check if any other custom xattr present in src dict
+ and set it to dst dict, here index start from 1 because
+ user xattr already checked in previous statement
+ */
+
+ xattrs_to_heal = get_xattrs_to_heal();
+
+ for (i = 1; xattrs_to_heal[i]; i++) {
+ keyval = dict_get(src, xattrs_to_heal[i]);
+ if (keyval) {
+ luflag = 1;
+ ret = dict_set(dst, xattrs_to_heal[i], keyval);
+ if (ret)
+ gf_smsg(this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_DICT_SET_FAILED, "key=%s", xattrs_to_heal[i],
+ "path=%s", local->loc.path, NULL);
+ keyval = NULL;
+ }
+ }
+ if (uret)
+ (*uret) = luret;
+ if (uflag)
+ (*uflag) = luflag;
}
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
new file mode 100644
index 00000000000..dbb8070b0da
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -0,0 +1,1658 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+
+static int
+dht_access2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_readv2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_attr2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_open2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_flush2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_lk2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_fsync2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame,
+ int ret);
+
+static int
+dht_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, fd_t *fd, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ prev = cookie;
+
+ local->op_errno = op_errno;
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ /* Update ctx if the fd has been opened on the target*/
+ if (!op_ret && (local->call_cnt == 1)) {
+ dht_fd_ctx_set(this, fd, prev);
+ goto out;
+ }
+
+ if (!op_ret || (local->call_cnt != 1))
+ goto out;
+
+ /* rebalance would have happened */
+ local->rebalance.target_op_fn = dht_open2;
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(open, frame, op_ret, op_errno, local->fd, xdata);
+
+ return 0;
+}
+
+static int
+dht_open2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ if (!frame || !frame->local)
+ goto out;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This DHT layer is not migrating the file */
+ DHT_STACK_UNWIND(open, frame, -1, local->op_errno, NULL,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2;
+
+ STACK_WIND_COOKIE(frame, dht_open_cbk, subvol, subvol, subvol->fops->open,
+ &local->loc, local->rebalance.flags, local->fd,
+ local->xattr_req);
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(open, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
+
+int
+dht_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, loc, fd, GF_FOP_OPEN);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->rebalance.flags = flags;
+ local->call_cnt = 1;
+
+ STACK_WIND_COOKIE(frame, dht_open_cbk, subvol, subvol, subvol->fops->open,
+ loc, flags, fd, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(open, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *stbuf, dict_t *xdata)
+{
+ xlator_t *subvol1 = 0;
+ xlator_t *subvol2 = 0;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ inode_t *inode = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((local->fop == GF_FOP_FSTAT) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ if (local->call_cnt != 1)
+ goto out;
+
+ local->op_errno = op_errno;
+ local->op_ret = op_ret;
+
+ /* Check if the rebalance phase2 is true */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ local->rebalance.target_op_fn = dht_attr2;
+ dht_set_local_rebalance(this, local, NULL, NULL, stbuf, xdata);
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+ if (dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ /* Phase 2 of migration */
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ } else {
+ /* it is a non-fd op or it is an fd based Fop and
+ opened on the dst.*/
+ if (local->fd && !dht_fd_open_on_dst(this, local->fd, subvol2)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ } else {
+ dht_attr2(this, subvol2, frame, 0);
+ return 0;
+ }
+ }
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ DHT_STACK_UNWIND(stat, frame, op_ret, op_errno, stbuf, xdata);
+err:
+ return 0;
+}
+
+static int
+dht_attr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ local = frame->local;
+ if (!local)
+ goto out;
+
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(stat, frame, local->op_ret, op_errno,
+ &local->rebalance.postbuf, local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2;
+
+ if (local->fop == GF_FOP_FSTAT) {
+ STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+ subvol->fops->fstat, local->fd, local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+ subvol->fops->stat, &local->loc, local->xattr_req);
+ }
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
+
+static int
+dht_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *stbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ local = frame->local;
+ prev = cookie;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+
+ goto post_unlock;
+ }
+
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ DHT_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno,
+ &local->stbuf, xdata);
+ }
+
+ return 0;
+}
+
+int
+dht_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_STAT);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (IA_ISREG(loc->inode->ia_type)) {
+ local->call_cnt = 1;
+
+ subvol = local->cached_subvol;
+
+ STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+ subvol->fops->stat, loc, xdata);
+
+ return 0;
+ }
+
+ local->call_cnt = call_cnt = layout->cnt;
+
+ for (i = 0; i < call_cnt; i++) {
+ subvol = layout->list[i].xlator;
+
+ STACK_WIND_COOKIE(frame, dht_attr_cbk, subvol, subvol,
+ subvol->fops->stat, loc, xdata);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FSTAT);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "no layout for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (IA_ISREG(fd->inode->ia_type)) {
+ local->call_cnt = 1;
+
+ subvol = local->cached_subvol;
+
+ STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+ subvol->fops->fstat, fd, xdata);
+ return 0;
+ }
+
+ local->call_cnt = call_cnt = layout->cnt;
+
+ for (i = 0; i < call_cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND_COOKIE(frame, dht_attr_cbk, subvol, subvol,
+ subvol->fops->fstat, fd, xdata);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iovec *vector, int count, struct iatt *stbuf,
+ struct iobref *iobref, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int ret = 0;
+ xlator_t *src_subvol = 0;
+ xlator_t *dst_subvol = 0;
+
+ local = frame->local;
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ /* This is already second try, no need for re-check */
+ if (local->call_cnt != 1)
+ goto out;
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno))
+ goto out;
+
+ local->op_errno = op_errno;
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ local->op_ret = op_ret;
+ local->rebalance.target_op_fn = dht_readv2;
+ dht_set_local_rebalance(this, local, NULL, NULL, stbuf, xdata);
+ /* File would be migrated to other node */
+ ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol,
+ &dst_subvol);
+
+ if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol) ||
+ !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ } else {
+ /* value is already set in fd_ctx, that means no need
+ to check for whether its complete or not. */
+ dht_readv2(this, dst_subvol, frame, 0);
+ return 0;
+ }
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+
+ DHT_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf,
+ iobref, xdata);
+
+ return 0;
+}
+
+static int
+dht_readv2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ local = frame->local;
+ if (!local)
+ goto out;
+
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(readv, frame, local->op_ret, op_errno, NULL, 0,
+ &local->rebalance.postbuf, NULL,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2;
+
+ STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, local->fd,
+ local->rebalance.size, local->rebalance.offset,
+ local->rebalance.flags, local->xattr_req);
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+dht_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off,
+ uint32_t flags, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_READ);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->rebalance.offset = off;
+ local->rebalance.size = size;
+ local->rebalance.flags = flags;
+ local->call_cnt = 1;
+
+ STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, local->fd,
+ local->rebalance.size, local->rebalance.offset,
+ local->rebalance.flags, local->xattr_req);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
+
+ return 0;
+}
+
+static int
+dht_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
+{
+ int ret = -1;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (!prev)
+ goto out;
+ if (local->call_cnt != 1)
+ goto out;
+ if ((op_ret == -1) &&
+ ((op_errno == ENOTCONN) || dht_inode_missing(op_errno)) &&
+ IA_ISDIR(local->loc.inode->ia_type)) {
+ subvol = dht_subvol_next_available(this, prev);
+ if (!subvol)
+ goto out;
+
+ /* check if we are done with visiting every node */
+ if (subvol == local->cached_subvol) {
+ goto out;
+ }
+
+ STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol,
+ subvol->fops->access, &local->loc,
+ local->rebalance.flags, NULL);
+ return 0;
+ }
+ if ((op_ret == -1) && dht_inode_missing(op_errno) &&
+ !(IA_ISDIR(local->loc.inode->ia_type))) {
+ /* File would be migrated to other node */
+ local->op_errno = op_errno;
+ local->rebalance.target_op_fn = dht_access2;
+ ret = dht_rebalance_complete_check(frame->this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+static int
+dht_access2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ local = frame->local;
+ if (!local)
+ goto out;
+
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+
+ DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2;
+
+ STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol,
+ subvol->fops->access, &local->loc, local->rebalance.flags,
+ local->xattr_req);
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL);
+ return 0;
+}
+
+int
+dht_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_ACCESS);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.flags = mask;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol,
+ subvol->fops->access, loc, mask, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+int
+dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = 0;
+ int ret = 0;
+
+ local = frame->local;
+
+ local->op_errno = op_errno;
+
+ if (local->call_cnt != 1)
+ goto out;
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ local->rebalance.target_op_fn = dht_flush2;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ /* If context is set, then send flush() it to the destination */
+ dht_inode_ctx_get_mig_info(this, local->fd->inode, NULL, &subvol);
+ if (subvol && dht_fd_open_on_dst(this, local->fd, subvol)) {
+ dht_flush2(this, subvol, frame, 0);
+ return 0;
+ }
+
+ if (op_errno == EREMOTE) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret) {
+ return 0;
+ }
+ }
+
+out:
+ DHT_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+static int
+dht_flush2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if ((frame == NULL) || (frame->local == NULL))
+ goto out;
+
+ local = frame->local;
+
+ op_errno = local->op_errno;
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, local->fd,
+ local->xattr_req);
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+ return 0;
+}
+
+int
+dht_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FLUSH);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->call_cnt = 1;
+
+ STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, fd,
+ local->xattr_req);
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+int
+dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ inode_t *inode = NULL;
+ xlator_t *src_subvol = 0;
+ xlator_t *dst_subvol = 0;
+
+ local = frame->local;
+ prev = cookie;
+
+ local->op_errno = op_errno;
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if (op_ret == -1 && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
+ }
+ goto out;
+ }
+
+ local->op_ret = op_ret;
+ inode = local->fd->inode;
+
+ local->rebalance.target_op_fn = dht_fsync2;
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
+
+ dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol);
+
+ if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol) ||
+ !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ } else {
+ dht_fsync2(this, dst_subvol, frame, 0);
+ return 0;
+ }
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+ DHT_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+ return 0;
+}
+
+static int
+dht_fsync2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if ((frame == NULL) || (frame->local == NULL))
+ goto out;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(fsync, frame, local->op_ret, op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, subvol->fops->fsync,
+ local->fd, local->rebalance.flags, local->xattr_req);
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+dht_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FSYNC);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->call_cnt = 1;
+ local->rebalance.flags = datasync;
+
+ subvol = local->cached_subvol;
+
+ STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, subvol->fops->fsync,
+ local->fd, local->rebalance.flags, local->xattr_req);
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+/* TODO: for 'lk()' call, we need some other special error, may be ESTALE to
+ indicate that lock migration happened on the fd, so we can consider it as
+ phase 2 of migration */
+static int
+dht_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct gf_flock *flock, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ if (local->call_cnt != 1)
+ goto out;
+
+ local->rebalance.target_op_fn = dht_lk2;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
+
+ if (op_errno == EREMOTE) {
+ dht_inode_ctx_get_mig_info(this, local->fd->inode, NULL, &subvol);
+ if (subvol && dht_fd_open_on_dst(this, local->fd, subvol)) {
+ dht_lk2(this, subvol, frame, 0);
+ return 0;
+ } else {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret) {
+ return 0;
+ }
+ }
+ }
+
+out:
+ dht_lk_inode_unref(frame, op_ret);
+ DHT_STACK_UNWIND(lk, frame, op_ret, op_errno, flock, xdata);
+
+ return 0;
+}
+
+static int
+dht_lk2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if ((frame == NULL) || (frame->local == NULL))
+ goto out;
+
+ local = frame->local;
+
+ op_errno = local->op_errno;
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_lk_cbk, subvol, subvol->fops->lk, local->fd,
+ local->rebalance.lock_cmd, &local->rebalance.flock,
+ local->xattr_req);
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
+
+int
+dht_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ xlator_t *lock_subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_LK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->lock_type = flock->l_type;
+ lock_subvol = dht_get_lock_subvolume(this, flock, local);
+ if (!lock_subvol) {
+ gf_msg_debug(this->name, 0, "no lock subvolume for path=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /*
+ local->cached_subvol = lock_subvol;
+ ret = dht_check_and_open_fd_on_subvol (this, frame);
+ if (ret)
+ goto err;
+ */
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->rebalance.flock = *flock;
+ local->rebalance.lock_cmd = cmd;
+
+ local->call_cnt = 1;
+
+ STACK_WIND(frame, dht_lk_cbk, lock_subvol, lock_subvol->fops->lk, fd, cmd,
+ flock, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+static int
+dht_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct gf_lease *lease, dict_t *xdata)
+{
+ DHT_STACK_UNWIND(lease, frame, op_ret, op_errno, lease, xdata);
+
+ return 0;
+}
+
+int
+dht_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct gf_lease *lease, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ subvol = dht_subvol_get_cached(this, loc->inode);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /* TODO: for rebalance, we need to preserve the fop arguments */
+ STACK_WIND(frame, dht_lease_cbk, subvol, subvol->fops->lease, loc, lease,
+ xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+/* Symlinks are currently not migrated, so no need for any check here */
+static int
+dht_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, const char *path, struct iatt *stbuf,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+ if (op_ret == -1)
+ goto err;
+
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ }
+
+err:
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ DHT_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, stbuf, xdata);
+
+ return 0;
+}
+
+int
+dht_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_READLINK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND(frame, dht_readlink_cbk, subvol, subvol->fops->readlink, loc,
+ size, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(readlink, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+/* Get both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY
+ * Use DHT_MODE_IN_XDATA_KEY if available, else fall back to
+ * DHT_IATT_IN_XDATA_KEY
+ * This will return a dummy iatt with only the mode and type set
+ */
+static int
+dht_read_iatt_from_xdata(dict_t *xdata, struct iatt *stbuf)
+{
+ int ret = -1;
+ int32_t mode = 0;
+
+ ret = dict_get_int32(xdata, DHT_MODE_IN_XDATA_KEY, &mode);
+
+ if (ret) {
+ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+ } else {
+ stbuf->ia_prot = ia_prot_from_st_mode(mode);
+ stbuf->ia_type = ia_type_from_st_mode(mode);
+ }
+
+ return ret;
+}
+
+int
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *call_frame = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *src_subvol = NULL;
+ xlator_t *dst_subvol = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ int ret = -1;
+ inode_t *inode = NULL;
+
+ local = frame->local;
+ call_frame = cookie;
+ prev = call_frame->this;
+
+ local->op_errno = op_errno;
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.",
+ prev->name);
+ goto out;
+ }
+
+ if (local->call_cnt != 1)
+ goto out;
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ ret = dht_read_iatt_from_xdata(xdata, &stbuf);
+
+ if ((!op_ret) && (ret)) {
+ /* This is a potential problem and can cause corruption
+ * with sharding.
+ * Oh well. We tried.
+ */
+ goto out;
+ }
+
+ local->op_ret = op_ret;
+ local->rebalance.target_op_fn = dht_common_xattrop2;
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
+
+ if (dict)
+ local->rebalance.dict = dict_ref(dict);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(&stbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(&stbuf)) {
+ inode = local->loc.inode ? local->loc.inode : local->fd->inode;
+ dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol);
+
+ if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol) ||
+ !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ } else {
+ dht_common_xattrop2(this, dst_subvol, frame, 0);
+ return 0;
+ }
+ }
+
+out:
+ if (local->fop == GF_FOP_XATTROP) {
+ DHT_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+ } else {
+ DHT_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata);
+ }
+
+ return 0;
+}
+
+static int
+dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame,
+ int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if ((frame == NULL) || (frame->local == NULL))
+ goto out;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ if (local->fop == GF_FOP_XATTROP) {
+ DHT_STACK_UNWIND(xattrop, frame, local->op_ret, op_errno,
+ local->rebalance.dict, local->rebalance.xdata);
+ } else {
+ DHT_STACK_UNWIND(fxattrop, frame, local->op_ret, op_errno,
+ local->rebalance.dict, local->rebalance.xdata);
+ }
+
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ if (local->fop == GF_FOP_XATTROP) {
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol, subvol->fops->xattrop,
+ &local->loc, local->rebalance.flags, local->rebalance.xattr,
+ local->xattr_req);
+ } else {
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol,
+ subvol->fops->fxattrop, local->fd, local->rebalance.flags,
+ local->rebalance.xattr, local->xattr_req);
+ }
+
+ return 0;
+
+out:
+
+ /* If local is unavailable we could be unwinding the wrong
+ * function here */
+
+ if (local && (local->fop == GF_FOP_XATTROP)) {
+ DHT_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+ } else {
+ DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+ }
+ return 0;
+}
+
+static int
+dht_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+ DHT_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+/* Set both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY
+ * Use DHT_MODE_IN_XDATA_KEY if available. Else fall back to
+ * DHT_IATT_IN_XDATA_KEY
+ */
+static int
+dht_request_iatt_in_xdata(dict_t *xattr_req)
+{
+ int ret = -1;
+
+ ret = dict_set_int8(xattr_req, DHT_MODE_IN_XDATA_KEY, 1);
+ ret = dict_set_int8(xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+
+ /* At least one call succeeded */
+ return ret;
+}
+
+int
+dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ int ret = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_XATTROP);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for gfid=%s",
+ uuid_utoa(loc->inode->gfid));
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /* Todo : Handle dirs as well. At the moment the only xlator above dht
+ * that uses xattrop is sharding and that is only for files */
+
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ STACK_WIND(frame, dht_xattrop_cbk, subvol, subvol->fops->xattrop, loc,
+ flags, dict, xdata);
+
+ } else {
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+ local->call_cnt = 1;
+
+ local->rebalance.xattr = dict_ref(dict);
+ local->rebalance.flags = flags;
+
+ ret = dht_request_iatt_in_xdata(local->xattr_req);
+
+ if (ret) {
+ gf_msg_debug(this->name, 0,
+ "Failed to set dictionary key %s file=%s",
+ DHT_IATT_IN_XDATA_KEY, loc->path);
+ }
+
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol, subvol->fops->xattrop,
+ loc, local->rebalance.flags, local->rebalance.xattr,
+ local->xattr_req);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+static int
+dht_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+ DHT_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+int
+dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ int ret = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ subvol = dht_subvol_get_cached(this, fd->inode);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FXATTROP);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ /* Todo : Handle dirs as well. At the moment the only xlator above dht
+ * that uses xattrop is sharding and that is only for files */
+
+ if (IA_ISDIR(fd->inode->ia_type)) {
+ STACK_WIND(frame, dht_fxattrop_cbk, subvol, subvol->fops->fxattrop, fd,
+ flags, dict, xdata);
+
+ } else {
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+ local->call_cnt = 1;
+
+ local->rebalance.xattr = dict_ref(dict);
+ local->rebalance.flags = flags;
+
+ ret = dht_request_iatt_in_xdata(local->xattr_req);
+
+ if (ret) {
+ gf_msg_debug(this->name, 0, "Failed to set dictionary key %s fd=%p",
+ DHT_IATT_IN_XDATA_KEY, fd);
+ }
+
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol,
+ subvol->fops->fxattrop, fd, local->rebalance.flags,
+ local->rebalance.xattr, local->xattr_req);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+/* Currently no translators on top of 'distribute' will be using
+ * below fops, hence not implementing 'migration' related checks
+ */
+
+static int
+dht_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+ dht_lk_inode_unref(frame, op_ret);
+ DHT_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int32_t
+dht_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+ xlator_t *lock_subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_INODELK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->lock_type = lock->l_type;
+ lock_subvol = dht_get_lock_subvolume(this, lock, local);
+ if (!lock_subvol) {
+ gf_msg_debug(this->name, 0, "no lock subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->call_cnt = 1;
+
+ STACK_WIND(frame, dht_inodelk_cbk, lock_subvol, lock_subvol->fops->inodelk,
+ volume, loc, cmd, lock, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(inodelk, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+int
+dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+ dht_local_t *local = NULL;
+ int ret = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+
+ local = frame->local;
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+out:
+ dht_lk_inode_unref(frame, op_ret);
+ DHT_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int
+dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+ xlator_t *lock_subvol = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_INODELK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->call_cnt = 1;
+ local->lock_type = lock->l_type;
+
+ lock_subvol = dht_get_lock_subvolume(this, lock, local);
+ if (!lock_subvol) {
+ gf_msg_debug(this->name, 0, "no lock subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /*
+ local->cached_subvol = lock_subvol;
+ ret = dht_check_and_open_fd_on_subvol (this, frame);
+ if (ret)
+ goto err;
+ */
+ local->rebalance.flock = *lock;
+ local->rebalance.lock_cmd = cmd;
+ local->key = gf_strdup(volume);
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND(frame, dht_finodelk_cbk, lock_subvol,
+ lock_subvol->fops->finodelk, volume, fd, cmd, lock, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL);
+
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
new file mode 100644
index 00000000000..2f23ce90fbd
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -0,0 +1,1404 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+
+static int
+dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+
+int
+dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol1 = NULL;
+ xlator_t *subvol2 = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ /* writev fails with EBADF if dht has not yet opened the fd
+ * on the cached subvol. This could happen if the file was migrated
+ * and a lookup updated the cached subvol in the inode ctx.
+ * We only check once as this could be a valid bad fd error.
+ */
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if (op_ret == -1 && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_msg_debug(this->name, 0, "subvolume %s returned -1 (%s)", prev->name,
+ strerror(op_errno));
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ /* preserve the modes of source */
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
+ }
+ goto out;
+ }
+
+ local->rebalance.target_op_fn = dht_writev2;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ /* We might need to pass the stbuf information to the higher DHT
+ * layer for appropriate handling.
+ */
+
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ if (!local->xattr_req) {
+ local->xattr_req = dict_new();
+ if (!local->xattr_req) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM,
+ "insufficient memory");
+ local->op_errno = ENOMEM;
+ local->op_ret = -1;
+ goto out;
+ }
+ }
+
+ ret = dict_set_uint32(local->xattr_req, GF_PROTECT_FROM_EXTERNAL_WRITES,
+ 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_DICT_SET_FAILED, 0,
+ "Failed to set key %s in dictionary",
+ GF_PROTECT_FROM_EXTERNAL_WRITES);
+ local->op_errno = ENOMEM;
+ local->op_ret = -1;
+ goto out;
+ }
+
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
+
+ ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &subvol1,
+ &subvol2);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ if (dht_fd_open_on_dst(this, local->fd, subvol2)) {
+ dht_writev2(this, subvol2, frame, 0);
+ return 0;
+ }
+ }
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+ DHT_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+ return 0;
+}
+
+static int
+dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if ((frame == NULL) || (frame->local == NULL))
+ goto out;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol,
+ subvol->fops->writev, local->fd, local->rebalance.vector,
+ local->rebalance.count, local->rebalance.offset,
+ local->rebalance.flags, local->rebalance.iobref,
+ local->xattr_req);
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int count, off_t off, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_WRITE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->rebalance.vector = iov_dup(vector, count);
+ local->rebalance.offset = off;
+ local->rebalance.count = count;
+ local->rebalance.flags = flags;
+ local->rebalance.iobref = iobref_ref(iobref);
+ local->call_cnt = 1;
+
+ STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol,
+ subvol->fops->writev, fd, local->rebalance.vector,
+ local->rebalance.count, local->rebalance.offset,
+ local->rebalance.flags, local->rebalance.iobref,
+ local->xattr_req);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ xlator_t *src_subvol = NULL;
+ xlator_t *dst_subvol = NULL;
+ inode_t *inode = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ /* Needs to be checked only for ftruncate.
+ * ftruncate fails with EBADF/EINVAL if dht has not yet opened the fd
+ * on the cached subvol. This could happen if the file was migrated
+ * and a lookup updated the cached subvol in the inode ctx.
+ * We only check once as this could actually be a valid error.
+ */
+
+ if ((local->fop == GF_FOP_FTRUNCATE) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
+ }
+ goto out;
+ }
+
+ local->rebalance.target_op_fn = dht_truncate2;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ /* We might need to pass the stbuf information to the higher DHT
+ * layer for appropriate handling.
+ */
+
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
+
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol)) {
+ if ((!local->fd) ||
+ ((local->fd) &&
+ dht_fd_open_on_dst(this, local->fd, dst_subvol))) {
+ dht_truncate2(this, dst_subvol, frame, 0);
+ return 0;
+ }
+ }
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+ DHT_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+static int
+dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if (!frame || !frame->local)
+ goto out;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ /* This dht xlator is not migrating the file */
+ if (we_are_not_migrating(ret)) {
+ DHT_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ if (local->fop == GF_FOP_TRUNCATE) {
+ STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+ subvol->fops->truncate, &local->loc,
+ local->rebalance.offset, local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+ subvol->fops->ftruncate, local->fd,
+ local->rebalance.offset, local->xattr_req);
+ }
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+dht_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_TRUNCATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for gfid=%s",
+ uuid_utoa(loc->inode->gfid));
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+ subvol->fops->truncate, loc, offset, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FTRUNCATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+ subvol->fops->ftruncate, fd, local->rebalance.offset,
+ local->xattr_req);
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ xlator_t *src_subvol = NULL;
+ xlator_t *dst_subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ /* fallocate fails with EBADF if dht has not yet opened the fd
+ * on the cached subvol. This could happen if the file was migrated
+ * and a lookup updated the cached subvol in the inode ctx.
+ * We only check once as this could actually be a valid error.
+ */
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
+ }
+ goto out;
+ }
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ local->rebalance.target_op_fn = dht_fallocate2;
+
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
+
+ dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol,
+ &dst_subvol);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol)) {
+ if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+ dht_fallocate2(this, dst_subvol, frame, 0);
+ return 0;
+ }
+ }
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+ DHT_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+err:
+ return 0;
+}
+
+static int
+dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if (!frame || !frame->local)
+ goto out;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(fallocate, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol,
+ subvol->fops->fallocate, local->fd,
+ local->rebalance.flags, local->rebalance.offset,
+ local->rebalance.size, local->xattr_req);
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FALLOCATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.flags = mode;
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol,
+ subvol->fops->fallocate, fd, local->rebalance.flags,
+ local->rebalance.offset, local->rebalance.size,
+ local->xattr_req);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ xlator_t *src_subvol = NULL;
+ xlator_t *dst_subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ /* discard fails with EBADF if dht has not yet opened the fd
+ * on the cached subvol. This could happen if the file was migrated
+ * and a lookup updated the cached subvol in the inode ctx.
+ * We only check once as this could actually be a valid error.
+ */
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
+ }
+ goto out;
+ }
+
+ local->rebalance.target_op_fn = dht_discard2;
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
+
+ dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol,
+ &dst_subvol);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol)) {
+ if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+ dht_discard2(this, dst_subvol, frame, 0);
+ return 0;
+ }
+ }
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+ DHT_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+static int
+dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if (!frame || !frame->local)
+ goto out;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(discard, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol,
+ subvol->fops->discard, local->fd, local->rebalance.offset,
+ local->rebalance.size, local->xattr_req);
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_DISCARD);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol,
+ subvol->fops->discard, fd, local->rebalance.offset,
+ local->rebalance.size, local->xattr_req);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol1 = NULL, *subvol2 = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ /* zerofill fails with EBADF if dht has not yet opened the fd
+ * on the cached subvol. This could happen if the file was migrated
+ * and a lookup updated the cached subvol in the inode ctx.
+ * We only check once as this could actually be a valid error.
+ */
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
+ }
+ goto out;
+ }
+
+ local->rebalance.target_op_fn = dht_zerofill2;
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
+
+ ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &subvol1,
+ &subvol2);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ if (dht_fd_open_on_dst(this, local->fd, subvol2)) {
+ dht_zerofill2(this, subvol2, frame, 0);
+ return 0;
+ }
+ }
+
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+ DHT_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+static int
+dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if (!frame || !frame->local)
+ goto out;
+
+ local = frame->local;
+
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(zerofill, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol,
+ subvol->fops->zerofill, local->fd,
+ local->rebalance.offset, local->rebalance.size,
+ local->xattr_req);
+
+ return 0;
+
+out:
+
+ DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_ZEROFILL);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol,
+ subvol->fops->zerofill, fd, local->rebalance.offset,
+ local->rebalance.size, local->xattr_req);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+/* handle cases of migration here for 'setattr()' calls */
+int
+dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+
+ local = frame->local;
+ prev = cookie;
+
+ local->op_errno = op_errno;
+
+ if ((local->fop == GF_FOP_FSETATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ if (local->call_cnt != 1)
+ goto out;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ local->rebalance.target_op_fn = dht_setattr2;
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* At the end of the migration process, whatever 'attr' we
+ have on source file will be migrated to destination file
+ in one shot, hence we don't need to check for in progress
+ state here (ie, PHASE1) */
+out:
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+ DHT_STACK_UNWIND(setattr, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+ return 0;
+}
+
+static int
+dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if (!frame || !frame->local)
+ goto out;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ if (local->fop == GF_FOP_SETATTR) {
+ STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+ subvol->fops->setattr, &local->loc,
+ &local->rebalance.stbuf, local->rebalance.flags,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+ subvol->fops->fsetattr, local->fd,
+ &local->rebalance.stbuf, local->rebalance.flags,
+ local->xattr_req);
+ }
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+/* Keep the existing code same for all the cases other than regular file */
+int
+dht_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *statpre, struct iatt *statpost,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
+ }
+
+ dht_iatt_merge(this, &local->prebuf, statpre);
+ dht_iatt_merge(this, &local->stbuf, statpost);
+
+ local->op_ret = 0;
+ local->op_errno = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ if (local->op_ret == 0)
+ dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf);
+ DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
+ &local->prebuf, &local->stbuf, xdata);
+ }
+
+ return 0;
+}
+
+/* Keep the existing code same for all the cases other than regular file */
+int
+dht_non_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ gf_msg(this->name, op_errno, 0, 0, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
+ }
+
+ LOCK(&frame->lock);
+ {
+ dht_iatt_merge(this, &local->prebuf, statpre);
+ dht_iatt_merge(this, &local->stbuf, statpost);
+
+ local->op_ret = 0;
+ local->op_errno = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf);
+ DHT_STACK_UNWIND(setattr, frame, 0, 0, &local->prebuf, &local->stbuf,
+ xdata);
+ }
+
+ return 0;
+}
+
+int
+dht_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
+
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *mds_subvol = NULL;
+ struct iatt loc_stbuf = {
+ 0,
+ };
+ int i = 0;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+ mds_subvol = local->mds_subvol;
+
+ if (op_ret == -1) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ local->op_ret = 0;
+ loc_stbuf = local->stbuf;
+ dht_iatt_merge(this, &local->prebuf, statpre);
+ dht_iatt_merge(this, &local->stbuf, statpost);
+
+ local->call_cnt = conf->subvolume_cnt - 1;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (mds_subvol == conf->subvolumes[i])
+ continue;
+ STACK_WIND_COOKIE(frame, dht_non_mds_setattr_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->setattr, &local->loc,
+ &loc_stbuf, local->valid, local->xattr_req);
+ }
+
+ return 0;
+out:
+ DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
+ &local->prebuf, &local->stbuf, xdata);
+
+ return 0;
+}
+
+int
+dht_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ xlator_t *mds_subvol = NULL;
+ dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int i = -1;
+ int ret = -1;
+ int call_cnt = 0;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+
+ conf = this->private;
+ local = dht_local_init(frame, loc, NULL, GF_FOP_SETATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!layout_is_sane(layout)) {
+ gf_msg_debug(this->name, 0, "layout is not sane for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (IA_ISREG(loc->inode->ia_type)) {
+ /* in the regular file _cbk(), we need to check for
+ migration possibilities */
+ local->rebalance.stbuf = *stbuf;
+ local->rebalance.flags = valid;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+
+ STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+ subvol->fops->setattr, loc, stbuf, valid, xdata);
+
+ return 0;
+ }
+
+ local->call_cnt = call_cnt = layout->cnt;
+
+ if (IA_ISDIR(loc->inode->ia_type) && !__is_root_gfid(loc->inode->gfid) &&
+ call_cnt != 1) {
+ ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+ if (ret || !mds_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get mds subvol for path %s", local->loc.path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->mds_subvol = mds_subvol;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_WARNING, layout->list[i].err,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS subvol is down for path "
+ " %s Unable to set attr ",
+ local->loc.path);
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ }
+ }
+ local->valid = valid;
+ local->stbuf = *stbuf;
+
+ STACK_WIND_COOKIE(frame, dht_mds_setattr_cbk, local->mds_subvol,
+ local->mds_subvol, local->mds_subvol->fops->setattr,
+ loc, stbuf, valid, xdata);
+ return 0;
+ } else {
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_setattr_cbk, layout->list[i].xlator,
+ layout->list[i].xlator,
+ layout->list[i].xlator->fops->setattr, loc, stbuf,
+ valid, xdata);
+ }
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int i = -1;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FSETATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!layout_is_sane(layout)) {
+ gf_msg_debug(this->name, 0, "layout is not sane for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (IA_ISREG(fd->inode->ia_type)) {
+ /* in the regular file _cbk(), we need to check for
+ migration possibilities */
+ local->rebalance.stbuf = *stbuf;
+ local->rebalance.flags = valid;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+
+ STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+ subvol->fops->fsetattr, fd, &local->rebalance.stbuf,
+ local->rebalance.flags, local->xattr_req);
+ return 0;
+ }
+
+ local->call_cnt = call_cnt = layout->cnt;
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_setattr_cbk, layout->list[i].xlator,
+ layout->list[i].xlator,
+ layout->list[i].xlator->fops->fsetattr, fd, stbuf,
+ valid, xdata);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 8a5f45d884b..fda904c92c9 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -1,728 +1,808 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
+#include "unittest/unittest.h"
-#define layout_base_size (sizeof (dht_layout_t))
+#define layout_base_size (sizeof(dht_layout_t))
-#define layout_entry_size (sizeof ((dht_layout_t *)NULL)->list[0])
+#define layout_entry_size (sizeof((dht_layout_t *)NULL)->list[0])
#define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size))
-
dht_layout_t *
-dht_layout_new (xlator_t *this, int cnt)
+dht_layout_new(xlator_t *this, int cnt)
{
- dht_layout_t *layout = NULL;
- dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ REQUIRE(NULL != this);
+ REQUIRE(cnt >= 0);
- conf = this->private;
+ conf = this->private;
- layout = GF_CALLOC (1, layout_size (cnt),
- gf_dht_mt_dht_layout_t);
- if (!layout) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto out;
- }
+ layout = GF_CALLOC(1, layout_size(cnt), gf_dht_mt_dht_layout_t);
+ if (!layout) {
+ goto out;
+ }
- layout->type = DHT_HASH_TYPE_DM;
- layout->cnt = cnt;
- if (conf)
- layout->gen = conf->gen;
+ layout->type = DHT_HASH_TYPE_DM;
+ layout->cnt = cnt;
- layout->ref = 1;
+ if (conf) {
+ layout->spread_cnt = conf->dir_spread_cnt;
+ layout->gen = conf->gen;
+ }
+
+ GF_ATOMIC_INIT(layout->ref, 1);
+
+ ENSURE(NULL != layout);
+ ENSURE(layout->type == DHT_HASH_TYPE_DM);
+ ENSURE(layout->cnt == cnt);
+ ENSURE(GF_ATOMIC_GET(layout->ref) == 1);
out:
- return layout;
+ return layout;
}
-
dht_layout_t *
-dht_layout_get (xlator_t *this, inode_t *inode)
+dht_layout_get(xlator_t *this, inode_t *inode)
{
- dht_conf_t *conf = NULL;
- uint64_t layout_int = 0;
- dht_layout_t *layout = NULL;
- int ret = -1;
-
- conf = this->private;
- if (!conf)
- goto out;
-
- LOCK (&conf->layout_lock);
- {
- ret = inode_ctx_get (inode, this, &layout_int);
- if (ret == 0) {
- layout = (dht_layout_t *) (unsigned long) layout_int;
- layout->ref++;
- }
- }
- UNLOCK (&conf->layout_lock);
-
-out:
- return layout;
+ dht_layout_t *layout = NULL;
+ int ret = 0;
+
+ ret = dht_inode_ctx_layout_get(inode, this, &layout);
+ if ((!ret) && layout) {
+ GF_ATOMIC_INC(layout->ref);
+ }
+ return layout;
}
-
int
-dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout)
+dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout)
{
- dht_conf_t *conf = NULL;
- int oldret = -1;
- int ret = 0;
- dht_layout_t *old_layout;
- uint64_t old_layout_int;
-
- conf = this->private;
- if (!conf)
- goto out;
-
- LOCK (&conf->layout_lock);
- {
- oldret = inode_ctx_get (inode, this, &old_layout_int);
-
- layout->ref++;
- ret = inode_ctx_put (inode, this, (uint64_t) (unsigned long)
- layout);
- }
- UNLOCK (&conf->layout_lock);
-
- if (oldret == 0) {
- old_layout = (dht_layout_t *) (unsigned long) old_layout_int;
- dht_layout_unref (this, old_layout);
- }
+ dht_conf_t *conf = NULL;
+ int oldret = -1;
+ int ret = -1;
+ dht_layout_t *old_layout;
+
+ conf = this->private;
+ if (!conf || !layout)
+ goto out;
+
+ LOCK(&conf->layout_lock);
+ {
+ oldret = dht_inode_ctx_layout_get(inode, this, &old_layout);
+ if (layout)
+ GF_ATOMIC_INC(layout->ref);
+ ret = dht_inode_ctx_layout_set(inode, this, layout);
+ }
+ UNLOCK(&conf->layout_lock);
+
+ if (!oldret) {
+ dht_layout_unref(this, old_layout);
+ }
+ if (ret)
+ GF_ATOMIC_DEC(layout->ref);
out:
- return ret;
+ return ret;
}
-
void
-dht_layout_unref (xlator_t *this, dht_layout_t *layout)
+dht_layout_unref(xlator_t *this, dht_layout_t *layout)
{
- dht_conf_t *conf = NULL;
- int ref = 0;
+ int ref = 0;
- if (layout->preset || !this->private)
- return;
+ if (!layout || layout->preset || !this->private)
+ return;
- conf = this->private;
-
- LOCK (&conf->layout_lock);
- {
- ref = --layout->ref;
- }
- UNLOCK (&conf->layout_lock);
+ ref = GF_ATOMIC_DEC(layout->ref);
- if (!ref)
- GF_FREE (layout);
+ if (!ref)
+ GF_FREE(layout);
}
-
dht_layout_t *
-dht_layout_ref (xlator_t *this, dht_layout_t *layout)
+dht_layout_ref(xlator_t *this, dht_layout_t *layout)
{
- dht_conf_t *conf = NULL;
-
- if (layout->preset || !this->private)
- return layout;
+ if (layout->preset || !this->private)
+ return layout;
- conf = this->private;
- LOCK (&conf->layout_lock);
- {
- layout->ref++;
- }
- UNLOCK (&conf->layout_lock);
+ GF_ATOMIC_INC(layout->ref);
- return layout;
+ return layout;
}
-
xlator_t *
-dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name)
+dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name)
{
- uint32_t hash = 0;
- xlator_t *subvol = NULL;
- int i = 0;
- int ret = 0;
-
-
- ret = dht_hash_compute (layout->type, name, &hash);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "hash computation failed for type=%d name=%s",
- layout->type, name);
- goto out;
- }
-
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].start <= hash
- && layout->list[i].stop >= hash) {
- subvol = layout->list[i].xlator;
- break;
- }
- }
-
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume for hash (value) = %u", hash);
- }
+ uint32_t hash = 0;
+ xlator_t *subvol = NULL;
+ int i = 0;
+ int ret = 0;
+
+ ret = dht_hash_compute(this, layout->type, name, &hash);
+ if (ret != 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMPUTE_HASH_FAILED,
+ "type=%d", layout->type, "name=%s", name, NULL);
+ goto out;
+ }
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].start <= hash && layout->list[i].stop >= hash) {
+ subvol = layout->list[i].xlator;
+ break;
+ }
+ }
+
+ if (!subvol) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "hash-value=0x%x", hash, NULL);
+ }
out:
- return subvol;
+ return subvol;
}
-
dht_layout_t *
-dht_layout_for_subvol (xlator_t *this, xlator_t *subvol)
+dht_layout_for_subvol(xlator_t *this, xlator_t *subvol)
{
- dht_conf_t *conf = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
-
- conf = this->private;
- if (!conf)
- goto out;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvolumes[i] == subvol) {
- layout = conf->file_layouts[i];
- break;
- }
- }
+ dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == subvol) {
+ layout = conf->file_layouts[i];
+ break;
+ }
+ }
out:
- return layout;
+ return layout;
}
-
int
-dht_layouts_init (xlator_t *this, dht_conf_t *conf)
+dht_layouts_init(xlator_t *this, dht_conf_t *conf)
{
- dht_layout_t *layout = NULL;
- int i = 0;
- int ret = -1;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int ret = -1;
- if (!conf)
- goto out;
+ if (!conf)
+ goto out;
- conf->file_layouts = GF_CALLOC (conf->subvolume_cnt,
- sizeof (dht_layout_t *),
- gf_dht_mt_dht_layout_t);
- if (!conf->file_layouts) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto out;
- }
+ conf->file_layouts = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_layout_t *),
+ gf_dht_mt_dht_layout_t);
+ if (!conf->file_layouts) {
+ goto out;
+ }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- layout = dht_layout_new (this, 1);
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ layout = dht_layout_new(this, 1);
- if (!layout) {
- goto out;
- }
+ if (!layout) {
+ goto out;
+ }
- layout->preset = 1;
+ layout->preset = 1;
- layout->list[0].xlator = conf->subvolumes[i];
+ layout->list[0].xlator = conf->subvolumes[i];
- conf->file_layouts[i] = layout;
- }
+ conf->file_layouts[i] = layout;
+ }
- ret = 0;
+ ret = 0;
out:
- return ret;
+ return ret;
}
-
int
-dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
- int pos, int32_t **disk_layout_p)
+dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos,
+ int32_t **disk_layout_p)
{
- int ret = -1;
- int32_t *disk_layout = NULL;
-
- disk_layout = GF_CALLOC (5, sizeof (int),
- gf_dht_mt_int32_t);
- if (!disk_layout) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto out;
- }
-
- disk_layout[0] = hton32 (1);
- disk_layout[1] = hton32 (layout->type);
- disk_layout[2] = hton32 (layout->list[pos].start);
- disk_layout[3] = hton32 (layout->list[pos].stop);
-
- if (disk_layout_p)
- *disk_layout_p = disk_layout;
- ret = 0;
+ int ret = -1;
+ int32_t *disk_layout = NULL;
+
+ disk_layout = GF_CALLOC(5, sizeof(int), gf_dht_mt_int32_t);
+ if (!disk_layout) {
+ goto out;
+ }
+
+ disk_layout[0] = hton32(layout->list[pos].commit_hash);
+ disk_layout[1] = hton32(layout->type);
+ disk_layout[2] = hton32(layout->list[pos].start);
+ disk_layout[3] = hton32(layout->list[pos].stop);
+
+ if (disk_layout_p)
+ *disk_layout_p = disk_layout;
+ else
+ GF_FREE(disk_layout);
+
+ ret = 0;
out:
- return ret;
+ return ret;
}
-
int
-dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
- int pos, void *disk_layout_raw)
+dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout,
+ xlator_t *subvol, int32_t **disk_layout_p)
{
- int cnt = 0;
- int type = 0;
- int start_off = 0;
- int stop_off = 0;
- int disk_layout[4];
+ int i = 0;
- /* TODO: assert disk_layout_ptr is of required length */
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == subvol)
+ break;
+ }
- memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout));
+ if (i == layout->cnt)
+ return -1;
- cnt = ntoh32 (disk_layout[0]);
- if (cnt != 1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "disk layout has invalid count %d", cnt);
- return -1;
- }
+ return dht_disk_layout_extract(this, layout, i, disk_layout_p);
+}
- /* TODO: assert type is compatible */
- type = ntoh32 (disk_layout[1]);
- start_off = ntoh32 (disk_layout[2]);
- stop_off = ntoh32 (disk_layout[3]);
+static int
+dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos,
+ void *disk_layout_raw, int disk_layout_len)
+{
+ int type = 0;
+ int start_off = 0;
+ int stop_off = 0;
+ int commit_hash = 0;
+ int disk_layout[4];
+
+ if (!disk_layout_raw) {
+ gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ NULL);
+ return -1;
+ }
+
+ GF_ASSERT(disk_layout_len == sizeof(disk_layout));
+
+ memcpy(disk_layout, disk_layout_raw, disk_layout_len);
+
+ type = ntoh32(disk_layout[1]);
+ switch (type) {
+ case DHT_HASH_TYPE_DM_USER:
+ gf_msg_debug(this->name, 0, "found user-set layout");
+ layout->type = type;
+ /* Fall through. */
+ case DHT_HASH_TYPE_DM:
+ break;
+ default:
+ gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_DISK_LAYOUT,
+ "layout=%d", disk_layout[1], NULL);
+ return -1;
+ }
+
+ commit_hash = ntoh32(disk_layout[0]);
+ start_off = ntoh32(disk_layout[2]);
+ stop_off = ntoh32(disk_layout[3]);
+
+ layout->list[pos].commit_hash = commit_hash;
+ layout->list[pos].start = start_off;
+ layout->list[pos].stop = stop_off;
+
+ gf_msg_trace(this->name, 0,
+ "merged to layout: 0x%x - 0x%x (hash 0x%x, type %d) from %s",
+ start_off, stop_off, commit_hash, type,
+ layout->list[pos].xlator->name);
+
+ return 0;
+}
- layout->list[pos].start = start_off;
- layout->list[pos].stop = stop_off;
+int
+dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ int op_ret, int op_errno, dict_t *xattr)
+{
+ int i = 0;
+ int ret = -1;
+ int err = -1;
+ void *disk_layout_raw = NULL;
+ int disk_layout_len = 0;
+ dht_conf_t *conf = this->private;
+
+ if (op_ret != 0) {
+ err = op_errno;
+ }
+
+ if (!layout)
+ goto out;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == NULL) {
+ layout->list[i].err = err;
+ layout->list[i].xlator = subvol;
+ break;
+ }
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "merged to layout: %u - %u (type %d) from %s",
- start_off, stop_off, type,
- layout->list[pos].xlator->name);
+ if (op_ret != 0) {
+ ret = 0;
+ goto out;
+ }
+
+ if (xattr) {
+ /* during lookup and not mkdir */
+ ret = dict_get_ptr_and_len(xattr, conf->xattr_name, &disk_layout_raw,
+ &disk_layout_len);
+ }
+
+ if (ret != 0) {
+ layout->list[i].err = 0;
+ gf_msg_trace(this->name, 0, "Missing disk layout on %s. err = %d",
+ subvol->name, err);
+ ret = 0;
+ goto out;
+ }
- return 0;
-}
+ ret = dht_disk_layout_merge(this, layout, i, disk_layout_raw,
+ disk_layout_len);
+ if (ret != 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ "subvolume=%s", subvol->name, NULL);
+ goto out;
+ }
+ if (layout->commit_hash == 0) {
+ layout->commit_hash = layout->list[i].commit_hash;
+ } else if (layout->commit_hash != layout->list[i].commit_hash) {
+ layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+ }
-int
-dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
- int op_ret, int op_errno, dict_t *xattr)
-{
- int i = 0;
- int ret = -1;
- int err = -1;
- void *disk_layout_raw = NULL;
-
-
- if (op_ret != 0) {
- err = op_errno;
- }
-
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].xlator == NULL) {
- layout->list[i].err = err;
- layout->list[i].xlator = subvol;
- break;
- }
- }
-
- if (op_ret != 0) {
- ret = 0;
- goto out;
- }
-
- if (xattr) {
- /* during lookup and not mkdir */
- ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
- &disk_layout_raw);
- }
-
- if (ret != 0) {
- layout->list[i].err = -1;
- gf_log (this->name, GF_LOG_TRACE,
- "missing disk layout on %s. err = %d",
- subvol->name, err);
- ret = 0;
- goto out;
- }
-
- ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "layout merge from subvolume %s failed",
- subvol->name);
- goto out;
- }
- layout->list[i].err = 0;
+ layout->list[i].err = 0;
out:
- return ret;
+ return ret;
}
+void
+dht_layout_entry_swap(dht_layout_t *layout, int i, int j)
+{
+ uint32_t start_swap = 0;
+ uint32_t stop_swap = 0;
+ uint32_t commit_hash_swap = 0;
+ xlator_t *xlator_swap = 0;
+ int err_swap = 0;
+
+ start_swap = layout->list[i].start;
+ stop_swap = layout->list[i].stop;
+ xlator_swap = layout->list[i].xlator;
+ err_swap = layout->list[i].err;
+ commit_hash_swap = layout->list[i].commit_hash;
+
+ layout->list[i].start = layout->list[j].start;
+ layout->list[i].stop = layout->list[j].stop;
+ layout->list[i].xlator = layout->list[j].xlator;
+ layout->list[i].err = layout->list[j].err;
+ layout->list[i].commit_hash = layout->list[j].commit_hash;
+
+ layout->list[j].start = start_swap;
+ layout->list[j].stop = stop_swap;
+ layout->list[j].xlator = xlator_swap;
+ layout->list[j].err = err_swap;
+ layout->list[j].commit_hash = commit_hash_swap;
+}
void
-dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
+dht_layout_range_swap(dht_layout_t *layout, int i, int j)
{
- uint32_t start_swap = 0;
- uint32_t stop_swap = 0;
- xlator_t *xlator_swap = 0;
- int err_swap = 0;
-
-
- start_swap = layout->list[i].start;
- stop_swap = layout->list[i].stop;
- xlator_swap = layout->list[i].xlator;
- err_swap = layout->list[i].err;
-
- layout->list[i].start = layout->list[j].start;
- layout->list[i].stop = layout->list[j].stop;
- layout->list[i].xlator = layout->list[j].xlator;
- layout->list[i].err = layout->list[j].err;
-
- layout->list[j].start = start_swap;
- layout->list[j].stop = stop_swap;
- layout->list[j].xlator = xlator_swap;
- layout->list[j].err = err_swap;
+ uint32_t start_swap = 0;
+ uint32_t stop_swap = 0;
+
+ start_swap = layout->list[i].start;
+ stop_swap = layout->list[i].stop;
+
+ layout->list[i].start = layout->list[j].start;
+ layout->list[i].stop = layout->list[j].stop;
+
+ layout->list[j].start = start_swap;
+ layout->list[j].stop = stop_swap;
+}
+static int64_t
+dht_layout_entry_cmp_volname(dht_layout_t *layout, int i, int j)
+{
+ return (strcmp(layout->list[i].xlator->name, layout->list[j].xlator->name));
}
-int64_t
-dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j)
+gf_boolean_t
+dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator)
{
- return (strcmp (layout->list[i].xlator->name,
- layout->list[j].xlator->name));
+ int i = 0;
+
+ for (i = 0; i < layout->cnt; i++) {
+ /* Check if xlator is already part of layout, and layout is
+ * non-zero. */
+ if (!strcmp(layout->list[i].xlator->name, xlator->name)) {
+ if (layout->list[i].start != layout->list[i].stop)
+ return _gf_true;
+ break;
+ }
+ }
+ return _gf_false;
}
-int64_t
-dht_layout_entry_cmp (dht_layout_t *layout, int i, int j)
+static int64_t
+dht_layout_entry_cmp(dht_layout_t *layout, int i, int j)
{
- int64_t diff = 0;
+ int64_t diff = 0;
- if (layout->list[i].err || layout->list[j].err)
- diff = layout->list[i].err - layout->list[j].err;
- else
- diff = (int64_t) layout->list[i].start
- - (int64_t) layout->list[j].start;
+ /* swap zero'ed out layouts to front, if needed */
+ if (!layout->list[j].start && !layout->list[j].stop) {
+ diff = (int64_t)layout->list[i].stop - (int64_t)layout->list[j].stop;
+ goto out;
+ }
+ diff = (int64_t)layout->list[i].start - (int64_t)layout->list[j].start;
- return diff;
+out:
+ return diff;
}
-
int
-dht_layout_sort (dht_layout_t *layout)
+dht_layout_sort(dht_layout_t *layout)
{
- int i = 0;
- int j = 0;
- int64_t ret = 0;
+ int i = 0;
+ int j = 0;
+ int64_t ret = 0;
- /* TODO: O(n^2) -- bad bad */
+ /* TODO: O(n^2) -- bad bad */
- for (i = 0; i < layout->cnt - 1; i++) {
- for (j = i + 1; j < layout->cnt; j++) {
- ret = dht_layout_entry_cmp (layout, i, j);
- if (ret > 0)
- dht_layout_entry_swap (layout, i, j);
- }
- }
+ for (i = 0; i < layout->cnt - 1; i++) {
+ for (j = i + 1; j < layout->cnt; j++) {
+ ret = dht_layout_entry_cmp(layout, i, j);
+ if (ret > 0)
+ dht_layout_entry_swap(layout, i, j);
+ }
+ }
- return 0;
+ return 0;
}
-int
-dht_layout_sort_volname (dht_layout_t *layout)
+void
+dht_layout_sort_volname(dht_layout_t *layout)
+{
+ int i = 0;
+ int j = 0;
+ int64_t ret = 0;
+
+ /* TODO: O(n^2) -- bad bad */
+
+ for (i = 0; i < layout->cnt - 1; i++) {
+ for (j = i + 1; j < layout->cnt; j++) {
+ ret = dht_layout_entry_cmp_volname(layout, i, j);
+ if (ret > 0)
+ dht_layout_entry_swap(layout, i, j);
+ }
+ }
+}
+
+void
+dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
+ uint32_t *holes_p, uint32_t *overlaps_p,
+ uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
+ uint32_t *no_space_p)
{
- int i = 0;
- int j = 0;
- int64_t ret = 0;
+ uint32_t overlaps = 0;
+ uint32_t missing = 0;
+ uint32_t down = 0;
+ uint32_t misc = 0;
+ uint32_t hole_cnt = 0;
+ uint32_t overlap_cnt = 0;
+ int i = 0;
+ uint32_t prev_stop = 0;
+ uint32_t last_stop = 0;
+ char is_virgin = 1;
+ uint32_t no_space = 0;
+
+ /* This function scans through the layout spread of a directory to
+ check if there are any anomalies. Prior to calling this function
+ the layout entries should be sorted in the ascending order.
+
+ If the layout entry has err != 0
+ then increment the corresponding anomaly.
+ else
+ if (start of the current layout entry > stop + 1 of previous
+ non erroneous layout entry)
+ then it indicates a hole in the layout
+ if (start of the current layout entry < stop + 1 of previous
+ non erroneous layout entry)
+ then it indicates an overlap in the layout
+ */
+ last_stop = layout->list[0].start - 1;
+ prev_stop = last_stop;
+
+ for (i = 0; i < layout->cnt; i++) {
+ switch (layout->list[i].err) {
+ case -1:
+ case ENOENT:
+ case ESTALE:
+ missing++;
+ continue;
+ case ENOTCONN:
+ down++;
+ continue;
+ case ENOSPC:
+ no_space++;
+ continue;
+ case 0:
+ /* if err == 0 and start == stop, then it is a non misc++;
+ * participating subvolume(spread-cnt). Then, do not
+ * check for anomalies. If start != stop, then treat it
+ * as misc err */
+ if (layout->list[i].start == layout->list[i].stop) {
+ continue;
+ }
+ break;
+ default:
+ misc++;
+ continue;
+ }
+
+ is_virgin = 0;
+
+ if ((prev_stop + 1) < layout->list[i].start) {
+ hole_cnt++;
+ }
+
+ if ((prev_stop + 1) > layout->list[i].start) {
+ overlap_cnt++;
+ overlaps += ((prev_stop + 1) - layout->list[i].start);
+ }
+ prev_stop = layout->list[i].stop;
+ }
- /* TODO: O(n^2) -- bad bad */
+ if ((last_stop - prev_stop) || is_virgin)
+ hole_cnt++;
- for (i = 0; i < layout->cnt - 1; i++) {
- for (j = i + 1; j < layout->cnt; j++) {
- ret = dht_layout_entry_cmp_volname (layout, i, j);
- if (ret > 0)
- dht_layout_entry_swap (layout, i, j);
- }
- }
+ if (holes_p)
+ *holes_p = hole_cnt;
- return 0;
+ if (overlaps_p)
+ *overlaps_p = overlap_cnt;
+
+ if (missing_p)
+ *missing_p = missing;
+
+ if (down_p)
+ *down_p = down;
+
+ if (misc_p)
+ *misc_p = misc;
+
+ if (no_space_p)
+ *no_space_p = no_space;
}
+int
+dht_layout_missing_dirs(dht_layout_t *layout)
+{
+ int i = 0, missing = 0;
+
+ if (layout == NULL)
+ goto out;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if ((layout->list[i].err == ENOENT) ||
+ ((layout->list[i].err == -1) && (layout->list[i].start == 0) &&
+ (layout->list[i].stop == 0))) {
+ missing++;
+ }
+ }
+
+out:
+ return missing;
+}
int
-dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
- uint32_t *holes_p, uint32_t *overlaps_p,
- uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p)
+dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout)
{
- uint32_t overlaps = 0;
- uint32_t missing = 0;
- uint32_t down = 0;
- uint32_t misc = 0;
- uint32_t hole_cnt = 0;
- uint32_t overlap_cnt = 0;
- int i = 0;
- int ret = 0;
- uint32_t prev_stop = 0;
- uint32_t last_stop = 0;
- char is_virgin = 1;
-
- /* TODO: explain WTF is happening */
-
- last_stop = layout->list[0].start - 1;
- prev_stop = last_stop;
-
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].err) {
- switch (layout->list[i].err) {
- case -1:
- case ENOENT:
- missing++;
- break;
- case ENOTCONN:
- down++;
- break;
- case ENOSPC:
- down++;
- break;
- default:
- misc++;
- }
- continue;
- }
-
- is_virgin = 0;
-
- if ((prev_stop + 1) < layout->list[i].start) {
- hole_cnt++;
- }
-
- if ((prev_stop + 1) > layout->list[i].start) {
- overlap_cnt++;
- overlaps += ((prev_stop + 1) - layout->list[i].start);
- }
- prev_stop = layout->list[i].stop;
- }
-
- if ((last_stop - prev_stop) || is_virgin)
- hole_cnt++;
-
- if (holes_p)
- *holes_p = hole_cnt;
-
- if (overlaps_p)
- *overlaps_p = overlap_cnt;
-
- if (missing_p)
- *missing_p = missing;
-
- if (down_p)
- *down_p = down;
-
- if (misc_p)
- *misc_p = misc;
-
- return ret;
+ int ret = 0;
+ uint32_t holes = 0;
+ uint32_t overlaps = 0;
+ uint32_t missing = 0;
+ uint32_t down = 0;
+ uint32_t misc = 0, missing_dirs = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ ret = dht_layout_sort(layout);
+ if (ret == -1) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SORT_FAILED,
+ NULL);
+ goto out;
+ }
+
+ gf_uuid_unparse(loc->gfid, gfid);
+
+ dht_layout_anomalies(this, loc, layout, &holes, &overlaps, &missing, &down,
+ &misc, NULL);
+
+ if (holes || overlaps) {
+ if (missing == layout->cnt) {
+ gf_msg_debug(this->name, 0,
+ "Directory %s looked up first time"
+ " gfid = %s",
+ loc->path, gfid);
+ } else {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_ANOMALIES_INFO,
+ "path=%s", loc->path, "gfid=%s", gfid, "holes=%d", holes,
+ "overlaps=%d", overlaps, NULL);
+ }
+ ret = -1;
+ }
+
+ if (ret >= 0) {
+ missing_dirs = dht_layout_missing_dirs(layout);
+ /* TODO During DHT selfheal rewrite (almost) find a better place
+ * to detect this - probably in dht_layout_anomalies()
+ */
+ if (missing_dirs > 0)
+ ret += missing_dirs;
+ }
+
+out:
+ return ret;
}
+int
+dht_dir_has_layout(dict_t *xattr, char *name)
+{
+ void *disk_layout_raw = NULL;
+
+ return dict_get_ptr(xattr, name, &disk_layout_raw);
+}
int
-dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout)
+dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ loc_t *loc, dict_t *xattr)
{
- int ret = 0;
- int i = 0;
- uint32_t holes = 0;
- uint32_t overlaps = 0;
- uint32_t missing = 0;
- uint32_t down = 0;
- uint32_t misc = 0;
-
-
- ret = dht_layout_sort (layout);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "sort failed?! how the ....");
- goto out;
- }
-
- ret = dht_layout_anomalies (this, loc, layout,
- &holes, &overlaps,
- &missing, &down, &misc);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_WARNING,
- "error while finding anomalies in %s -- not good news",
- loc->path);
- goto out;
- }
-
- if (holes || overlaps) {
- if (missing == layout->cnt) {
- gf_log (this->name, GF_LOG_DEBUG,
- "directory %s looked up first time",
- loc->path);
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "found anomalies in %s. holes=%d overlaps=%d",
- loc->path, holes, overlaps);
- }
- ret = 1;
- }
-
- for (i = 0; i < layout->cnt; i++) {
- /* TODO During DHT selfheal rewrite (almost) find a better place to
- * detect this - probably in dht_layout_anomalies()
- */
- if (layout->list[i].err > 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "path=%s err=%s on subvol=%s",
- loc->path, strerror (layout->list[i].err),
- (layout->list[i].xlator ?
- layout->list[i].xlator->name : "<>"));
- if (layout->list[i].err == ENOENT)
- ret = 1;
- }
- }
+ int idx = 0;
+ int pos = -1;
+ int ret = 0;
+ int err = 0;
+ int dict_ret = 0;
+ int32_t disk_layout[4];
+ void *disk_layout_raw = NULL;
+ uint32_t start_off = -1;
+ uint32_t stop_off = -1;
+ uint32_t commit_hash = -1;
+ dht_conf_t *conf = this->private;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ if (loc && loc->inode)
+ gf_uuid_unparse(loc->inode->gfid, gfid);
+
+ for (idx = 0; idx < layout->cnt; idx++) {
+ if (layout->list[idx].xlator == subvol) {
+ pos = idx;
+ break;
+ }
+ }
+ if (pos == -1) {
+ if (loc) {
+ gf_msg_debug(this->name, 0, "%s - no layout info for subvolume %s",
+ loc ? loc->path : "path not found", subvol->name);
+ }
+ ret = 1;
+ goto out;
+ }
+
+ err = layout->list[pos].err;
+
+ if (!xattr) {
+ if (err == 0) {
+ if (loc) {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL,
+ "path=%s", loc->path, NULL);
+ } else {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL,
+ "path not found", NULL);
+ }
+ ret = -1;
+ }
+ goto out;
+ }
+
+ dict_ret = dict_get_ptr(xattr, conf->xattr_name, &disk_layout_raw);
+
+ if (dict_ret < 0) {
+ if (err == 0 && layout->list[pos].stop) {
+ if (loc) {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
+ } else {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
+ "path not found"
+ "gfid=%s",
+ gfid, NULL);
+ }
+ ret = -1;
+ }
+ goto out;
+ }
+
+ memcpy(disk_layout, disk_layout_raw, sizeof(disk_layout));
+
+ start_off = ntoh32(disk_layout[2]);
+ stop_off = ntoh32(disk_layout[3]);
+ commit_hash = ntoh32(disk_layout[0]);
+
+ if ((layout->list[pos].start != start_off) ||
+ (layout->list[pos].stop != stop_off) ||
+ (layout->list[pos].commit_hash != commit_hash)) {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_INFO, "subvol=%s",
+ layout->list[pos].xlator->name, "inode-layout:start=0x%x",
+ layout->list[pos].start, "inode-layout:stop=0x%x",
+ layout->list[pos].stop, "layout-commit-hash=0x%x; ",
+ layout->list[pos].commit_hash, "disk-layout:start-off=0x%x",
+ start_off, "disk-layout:top-off=0x%x", stop_off,
+ "commit-hash=0x%x", commit_hash, NULL);
+ ret = 1;
+ } else {
+ ret = 0;
+ }
out:
- return ret;
+ return ret;
}
-
int
-dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
- loc_t *loc, dict_t *xattr)
+dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode)
{
- int idx = 0;
- int pos = -1;
- int ret = 0;
- int err = 0;
- int dict_ret = 0;
- int32_t disk_layout[4];
- void *disk_layout_raw = NULL;
- int32_t count = -1;
- uint32_t start_off = -1;
- uint32_t stop_off = -1;
-
-
- for (idx = 0; idx < layout->cnt; idx++) {
- if (layout->list[idx].xlator == subvol) {
- pos = idx;
- break;
- }
- }
-
- if (pos == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s - no layout info for subvolume %s",
- loc->path, subvol->name);
- ret = 1;
- goto out;
- }
-
- err = layout->list[pos].err;
-
- if (!xattr) {
- if (err == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s - xattr dictionary is NULL",
- loc->path);
- ret = -1;
- }
- goto out;
- }
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
- dict_ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
- &disk_layout_raw);
+ conf = this->private;
+ if (!conf)
+ goto out;
- if (dict_ret < 0) {
- if (err == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s - disk layout missing", loc->path);
- ret = -1;
- }
- goto out;
- }
-
- memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout));
-
- count = ntoh32 (disk_layout[0]);
- if (count != 1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s - disk layout has invalid count %d",
- loc->path, count);
- ret = -1;
- goto out;
- }
-
- start_off = ntoh32 (disk_layout[2]);
- stop_off = ntoh32 (disk_layout[3]);
-
- if ((layout->list[pos].start != start_off)
- || (layout->list[pos].stop != stop_off)) {
- gf_log (this->name, GF_LOG_INFO,
- "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; "
- "disk layout - %"PRIu32" - %"PRIu32,
- layout->list[pos].xlator->name,
- layout->list[pos].start, layout->list[pos].stop,
- start_off, stop_off);
- ret = 1;
- } else {
- ret = 0;
- }
+ layout = dht_layout_for_subvol(this, subvol);
+ if (!layout) {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_NO_LAYOUT_INFO,
+ "subvolume=%s", subvol ? subvol->name : "<nil>", NULL);
+ ret = -1;
+ goto out;
+ }
+
+ gf_msg_debug(this->name, 0, "file = %s, subvol = %s",
+ uuid_utoa(inode->gfid), subvol ? subvol->name : "<nil>");
+
+ LOCK(&conf->layout_lock);
+ {
+ dht_inode_ctx_layout_set(inode, this, layout);
+ }
+
+ UNLOCK(&conf->layout_lock);
+
+ ret = 0;
out:
- return ret;
+ return ret;
}
-
int
-dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode)
+dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol)
{
- dht_layout_t *layout = NULL;
- int ret = -1;
- dht_conf_t *conf = NULL;
-
- conf = this->private;
- if (!conf)
- goto out;
-
- layout = dht_layout_for_subvol (this, subvol);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no pre-set layout for subvolume %s",
- subvol ? subvol->name : "<nil>");
- ret = -1;
- goto out;
- }
-
- LOCK (&conf->layout_lock);
- {
- inode_ctx_put (inode, this, (uint64_t)(long)layout);
+ int i = 0, ret = -1;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == subvol) {
+ ret = i;
+ break;
}
- UNLOCK (&conf->layout_lock);
+ }
- ret = 0;
-out:
- return ret;
+ return ret;
}
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
index 6afba25a147..89ec6cca56e 100644
--- a/xlators/cluster/dht/src/dht-linkfile.c
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -1,247 +1,328 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include <glusterfs/compat.h>
+#include "dht-common.h"
+static int
+dht_linkfile_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ char is_linkfile = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (op_ret)
+ goto out;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
+ if (!is_linkfile)
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR,
+ "name=%s", prev->name, "path=%s", local->loc.path, "gfid=%s",
+ gfid, NULL);
+out:
+ local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode,
+ stbuf, postparent, postparent, xattr);
+ return 0;
+}
-#include "glusterfs.h"
-#include "xlator.h"
-#include "compat.h"
-#include "dht-common.h"
+static int
+dht_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+
+ local = frame->local;
+
+ if (!op_ret)
+ local->linked = _gf_true;
+
+ FRAME_SU_UNDO(frame, dht_local_t);
+
+ if (op_ret && (op_errno == EEXIST)) {
+ conf = this->private;
+ subvol = cookie;
+ if (!subvol)
+ goto out;
+ xattrs = dict_new();
+ if (!xattrs)
+ goto out;
+ ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "mame=%s", conf->link_xattr_name, NULL);
+ goto out;
+ }
+
+ STACK_WIND_COOKIE(frame, dht_linkfile_lookup_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->linkfile.loc, xattrs);
+ if (xattrs)
+ dict_unref(xattrs);
+ return 0;
+ }
+out:
+ local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode,
+ stbuf, preparent, postparent, xdata);
+ if (xattrs)
+ dict_unref(xattrs);
+ return 0;
+}
+int
+dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+ xlator_t *this, xlator_t *tovol, xlator_t *fromvol,
+ loc_t *loc)
+{
+ dht_local_t *local = NULL;
+ dict_t *dict = NULL;
+ int need_unref = 0;
+ int ret = 0;
+ dht_conf_t *conf = this->private;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ local->linkfile.linkfile_cbk = linkfile_cbk;
+ local->linkfile.srcvol = tovol;
+ loc_copy(&local->linkfile.loc, loc);
+
+ local->linked = _gf_false;
+
+ dict = local->params;
+ if (!dict) {
+ dict = dict_new();
+ if (!dict)
+ goto out;
+ need_unref = 1;
+ }
+
+ if (!gf_uuid_is_null(local->gfid)) {
+ gf_uuid_unparse(local->gfid, gfid);
+
+ ret = dict_set_gfuuid(dict, "gfid-req", local->gfid, true);
+ if (ret)
+ gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
+ } else {
+ gf_uuid_unparse(loc->gfid, gfid);
+ }
+
+ ret = dict_set_str(dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ if (ret)
+ gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY,
+ "gfid=%s", gfid, NULL);
+
+ ret = dict_set_str(dict, conf->link_xattr_name, tovol->name);
+
+ if (ret < 0) {
+ gf_smsg(frame->this->name, GF_LOG_INFO, 0, DHT_MSG_CREATE_LINK_FAILED,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+
+ local->link_subvol = fromvol;
+ /* Always create as root:root. dht_linkfile_attr_heal fixes the
+ * ownsership */
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND_COOKIE(frame, dht_linkfile_create_cbk, fromvol, fromvol,
+ fromvol->fops->mknod, loc, S_IFREG | DHT_LINKFILE_MODE, 0,
+ 0, dict);
+
+ if (need_unref && dict)
+ dict_unref(dict);
+
+ return 0;
+out:
+ local->linkfile.linkfile_cbk(frame, frame->this, frame->this, -1, ENOMEM,
+ loc->inode, NULL, NULL, NULL, NULL);
+ if (need_unref && dict)
+ dict_unref(dict);
+
+ return 0;
+}
int
-dht_linkfile_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno)
+dht_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ local = frame->local;
+ subvol = cookie;
- local = frame->local;
- local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
- local->linkfile.inode,
- &local->linkfile.stbuf, NULL, NULL);
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ gf_smsg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_UNLINK_FAILED,
+ "path=%s", local->loc.path, "gfid=%s", gfid, "subvolume=%s",
+ subvol->name, NULL);
+ }
- return 0;
-}
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
int
-dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent)
+dht_linkfile_unlink(call_frame_t *frame, xlator_t *this, xlator_t *subvol,
+ loc_t *loc)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- dict_t *xattr = NULL;
- data_t *str_data = NULL;
- int ret = -1;
-
- local = frame->local;
- prev = cookie;
-
- if (op_ret == -1)
- goto err;
-
- xattr = get_new_dict ();
- if (!xattr) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->linkfile.xattr = dict_ref (xattr);
- local->linkfile.inode = inode_ref (inode);
-
- str_data = str_to_data (local->linkfile.srcvol->name);
- if (!str_data) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- ret = dict_set (xattr, "trusted.glusterfs.dht.linkto", str_data);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to initialize linkfile data");
- }
- str_data = NULL;
-
- local->linkfile.stbuf = *stbuf;
-
- STACK_WIND (frame, dht_linkfile_xattr_cbk,
- prev->this, prev->this->fops->setxattr,
- &local->linkfile.loc, local->linkfile.xattr, 0);
-
- return 0;
+ call_frame_t *unlink_frame = NULL;
+ dht_local_t *unlink_local = NULL;
+
+ unlink_frame = copy_frame(frame);
+ if (!unlink_frame) {
+ goto err;
+ }
+
+ /* Using non-fop value here, as anyways, 'local->fop' is not used in
+ this particular case */
+ unlink_local = dht_local_init(unlink_frame, loc, NULL, GF_FOP_MAXVALUE);
+ if (!unlink_local) {
+ goto err;
+ }
+ STACK_WIND_COOKIE(unlink_frame, dht_linkfile_unlink_cbk, subvol, subvol,
+ subvol->fops->unlink, &unlink_local->loc, 0, NULL);
+
+ return 0;
err:
- if (str_data) {
- data_destroy (str_data);
- str_data = NULL;
- }
-
- local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
- inode, stbuf, preparent, postparent);
- return 0;
-}
+ if (unlink_frame)
+ DHT_STACK_DESTROY(unlink_frame);
+ return -1;
+}
-int
-dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
- xlator_t *tovol, xlator_t *fromvol, loc_t *loc)
+xlator_t *
+dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr)
{
- dht_local_t *local = NULL;
- dict_t *dict = NULL;
- int ret = 0;
-
- local = frame->local;
- local->linkfile.linkfile_cbk = linkfile_cbk;
- local->linkfile.srcvol = tovol;
- loc_copy (&local->linkfile.loc, loc);
-
- if (!uuid_is_null (local->gfid)) {
- dict = dict_new ();
- if (!dict)
- goto out;
- ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16);
- if (ret)
- gf_log ("dht-linkfile", GF_LOG_DEBUG, "gfid set failed");
- } else if (local->params) {
- dict = dict_ref (local->params);
- }
- if (!dict)
- gf_log ("", GF_LOG_DEBUG,
- "dict is NULL, need to make sure gfid's are same");
+ dht_conf_t *conf = NULL;
+ xlator_t *subvol = NULL;
+ void *volname = NULL;
+ int i = 0, ret = 0;
- STACK_WIND (frame, dht_linkfile_create_cbk,
- fromvol, fromvol->fops->mknod, loc,
- S_IFREG | DHT_LINKFILE_MODE, 0, dict);
+ conf = this->private;
- if (dict)
- dict_unref (dict);
+ if (!xattr)
+ goto out;
+
+ ret = dict_get_ptr(xattr, conf->link_xattr_name, &volname);
+
+ if ((-1 == ret) || !volname)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (strcmp(conf->subvolumes[i]->name, (char *)volname) == 0) {
+ subvol = conf->subvolumes[i];
+ break;
+ }
+ }
- return 0;
out:
- local->linkfile.linkfile_cbk (frame, NULL, frame->this, -1, ENOMEM,
- loc->inode, NULL, NULL, NULL);
- return 0;
+ return subvol;
}
-
-int
-dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent)
+static int
+dht_linkfile_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
- local = frame->local;
- prev = cookie;
- subvol = prev->this;
+ local = frame->local;
+ loc = &local->loc;
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlinking linkfile %s on %s failed (%s)",
- local->loc.path, subvol->name, strerror (op_errno));
- }
+ if (op_ret)
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_SETATTR_FAILED,
+ "path=%s", (loc->path ? loc->path : "NULL"), "gfid=%s",
+ uuid_utoa(local->gfid), NULL);
- DHT_STACK_DESTROY (frame);
+ DHT_STACK_DESTROY(frame);
- return 0;
+ return 0;
}
-
int
-dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
- xlator_t *subvol, loc_t *loc)
+dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *unlink_frame = NULL;
- dht_local_t *unlink_local = NULL;
-
- unlink_frame = copy_frame (frame);
- if (!unlink_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- unlink_local = dht_local_init (unlink_frame);
- if (!unlink_local) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- loc_copy (&unlink_local->loc, loc);
-
- STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk,
- subvol, subvol->fops->unlink,
- &unlink_local->loc);
-
- return 0;
-err:
- if (unlink_frame)
- DHT_STACK_DESTROY (unlink_frame);
+ int ret = -1;
+ call_frame_t *copy = NULL;
+ dht_local_t *local = NULL;
+ dht_local_t *copy_local = NULL;
+ xlator_t *subvol = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+
+ local = frame->local;
+
+ GF_VALIDATE_OR_GOTO("dht", local, out);
+ GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out);
+
+ if (local->stbuf.ia_type == IA_INVAL)
+ return 0;
- return -1;
-}
+ DHT_MARK_FOP_INTERNAL(xattr);
+ gf_uuid_copy(local->loc.gfid, local->stbuf.ia_gfid);
-xlator_t *
-dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf,
- dict_t *xattr)
-{
- dht_conf_t *conf = NULL;
- xlator_t *subvol = NULL;
- void *volname = NULL;
- int i = 0, ret = 0;
+ copy = copy_frame(frame);
+ if (!copy)
+ goto out;
- conf = this->private;
+ copy_local = dht_local_init(copy, &local->loc, NULL, 0);
- if (!xattr)
- goto out;
+ if (!copy_local)
+ goto out;
- ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname);
+ stbuf = local->stbuf;
+ subvol = local->link_subvol;
- if ((-1 == ret) || !volname)
- goto out;
+ copy->local = copy_local;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (strcmp (conf->subvolumes[i]->name, (char *)volname) == 0) {
- subvol = conf->subvolumes[i];
- break;
- }
- }
+ FRAME_SU_DO(copy, dht_local_t);
+ STACK_WIND(copy, dht_linkfile_setattr_cbk, subvol, subvol->fops->setattr,
+ &copy_local->loc, &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+ xattr);
+ ret = 0;
out:
- return subvol;
-}
+ if ((ret < 0) && (copy))
+ DHT_STACK_DESTROY(copy);
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c
new file mode 100644
index 00000000000..638821ccee5
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-lock.c
@@ -0,0 +1,1392 @@
+/*
+ Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "dht-lock.h"
+
+static char *
+dht_lock_asprintf(dht_lock_t *lock)
+{
+ char *lk_buf = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+
+ if (lock == NULL)
+ goto out;
+
+ uuid_utoa_r(lock->loc.gfid, gfid);
+
+ gf_asprintf(&lk_buf, "%s:%s", lock->xl->name, gfid);
+
+out:
+ return lk_buf;
+}
+
+static void
+dht_log_lk_array(char *name, gf_loglevel_t log_level, dht_lock_t **lk_array,
+ int count)
+{
+ int i = 0;
+ char *lk_buf = NULL;
+
+ if ((lk_array == NULL) || (count == 0))
+ goto out;
+
+ for (i = 0; i < count; i++) {
+ lk_buf = dht_lock_asprintf(lk_array[i]);
+ if (!lk_buf)
+ goto out;
+
+ gf_smsg(name, log_level, 0, DHT_MSG_LK_ARRAY_INFO, "index=%d", i,
+ "lk_buf=%s", lk_buf, NULL);
+ GF_FREE(lk_buf);
+ }
+
+out:
+ return;
+}
+
+static void
+dht_lock_stack_destroy(call_frame_t *lock_frame, dht_lock_type_t lk)
+{
+ dht_local_t *local = NULL;
+
+ local = lock_frame->local;
+
+ if (lk == DHT_INODELK) {
+ local->lock[0].layout.my_layout.locks = NULL;
+ local->lock[0].layout.my_layout.lk_count = 0;
+ } else {
+ local->lock[0].ns.directory_ns.locks = NULL;
+ local->lock[0].ns.directory_ns.lk_count = 0;
+ }
+
+ DHT_STACK_DESTROY(lock_frame);
+ return;
+}
+
+static void
+dht_lock_free(dht_lock_t *lock)
+{
+ if (lock == NULL)
+ goto out;
+
+ loc_wipe(&lock->loc);
+ GF_FREE(lock->domain);
+ GF_FREE(lock->basename);
+ mem_put(lock);
+
+out:
+ return;
+}
+
+static void
+dht_set_lkowner(dht_lock_t **lk_array, int count, gf_lkowner_t *lkowner)
+{
+ int i = 0;
+
+ if (!lk_array || !lkowner)
+ goto out;
+
+ for (i = 0; i < count; i++) {
+ lk_array[i]->lk_owner = *lkowner;
+ }
+
+out:
+ return;
+}
+
+static int
+dht_lock_request_cmp(const void *val1, const void *val2)
+{
+ dht_lock_t *lock1 = NULL;
+ dht_lock_t *lock2 = NULL;
+ int ret = -1;
+
+ lock1 = *(dht_lock_t **)val1;
+ lock2 = *(dht_lock_t **)val2;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", lock1, out);
+ GF_VALIDATE_OR_GOTO("dht-locks", lock2, out);
+
+ ret = strcmp(lock1->xl->name, lock2->xl->name);
+
+ if (ret == 0) {
+ ret = gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid);
+ }
+
+out:
+ return ret;
+}
+
+static int
+dht_lock_order_requests(dht_lock_t **locks, int count)
+{
+ int ret = -1;
+
+ if (!locks || !count)
+ goto out;
+
+ qsort(locks, count, sizeof(*locks), dht_lock_request_cmp);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+void
+dht_lock_array_free(dht_lock_t **lk_array, int count)
+{
+ int i = 0;
+ dht_lock_t *lock = NULL;
+
+ if (lk_array == NULL)
+ goto out;
+
+ for (i = 0; i < count; i++) {
+ lock = lk_array[i];
+ lk_array[i] = NULL;
+ dht_lock_free(lock);
+ }
+
+out:
+ return;
+}
+
+int32_t
+dht_lock_count(dht_lock_t **lk_array, int lk_count)
+{
+ int i = 0, locked = 0;
+
+ if ((lk_array == NULL) || (lk_count == 0))
+ goto out;
+
+ for (i = 0; i < lk_count; i++) {
+ if (lk_array[i]->locked)
+ locked++;
+ }
+out:
+ return locked;
+}
+
+static call_frame_t *
+dht_lock_frame(call_frame_t *parent_frame)
+{
+ call_frame_t *lock_frame = NULL;
+
+ lock_frame = copy_frame(parent_frame);
+ if (lock_frame == NULL)
+ goto out;
+
+ set_lk_owner_from_ptr(&lock_frame->root->lk_owner, parent_frame->root);
+
+out:
+ return lock_frame;
+}
+
+dht_lock_t *
+dht_lock_new(xlator_t *this, xlator_t *xl, loc_t *loc, short type,
+ const char *domain, const char *basename,
+ dht_reaction_type_t do_on_failure)
+{
+ dht_conf_t *conf = NULL;
+ dht_lock_t *lock = NULL;
+
+ conf = this->private;
+
+ lock = mem_get0(conf->lock_pool);
+ if (lock == NULL)
+ goto out;
+
+ lock->xl = xl;
+ lock->type = type;
+ lock->do_on_failure = do_on_failure;
+
+ lock->domain = gf_strdup(domain);
+ if (lock->domain == NULL) {
+ dht_lock_free(lock);
+ lock = NULL;
+ goto out;
+ }
+
+ if (basename) {
+ lock->basename = gf_strdup(basename);
+ if (lock->basename == NULL) {
+ dht_lock_free(lock);
+ lock = NULL;
+ goto out;
+ }
+ }
+
+ /* Fill only inode and gfid.
+ posix and protocol/server give preference to pargfid/basename over
+ gfid/inode for resolution if all the three parameters of loc_t are
+ present. I want to avoid the following hypothetical situation:
+
+ 1. rebalance did a lookup on a dentry and got a gfid.
+ 2. rebalance acquires lock on loc_t which was filled with gfid and
+ path (pargfid/bname) from step 1.
+ 3. somebody deleted and recreated the same file
+ 4. rename on the same path acquires lock on loc_t which now points
+ to a different inode (and hence gets the lock).
+ 5. rebalance continues to migrate file (note that not all fops done
+ by rebalance during migration are inode/gfid based Eg., unlink)
+ 6. rename continues.
+ */
+ lock->loc.inode = inode_ref(loc->inode);
+ loc_gfid(loc, lock->loc.gfid);
+
+out:
+ return lock;
+}
+
+static int
+dht_local_entrylk_init(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_entrylk_cbk_t entrylk_cbk)
+{
+ int ret = -1;
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (local == NULL) {
+ local = dht_local_init(frame, NULL, NULL, 0);
+ }
+
+ if (local == NULL) {
+ goto out;
+ }
+
+ local->lock[0].ns.directory_ns.entrylk_cbk = entrylk_cbk;
+ local->lock[0].ns.directory_ns.locks = lk_array;
+ local->lock[0].ns.directory_ns.lk_count = lk_count;
+
+ ret = dht_lock_order_requests(local->lock[0].ns.directory_ns.locks,
+ local->lock[0].ns.directory_ns.lk_count);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static void
+dht_entrylk_done(call_frame_t *lock_frame)
+{
+ fop_entrylk_cbk_t entrylk_cbk = NULL;
+ call_frame_t *main_frame = NULL;
+ dht_local_t *local = NULL;
+
+ local = lock_frame->local;
+ main_frame = local->main_frame;
+
+ local->lock[0].ns.directory_ns.locks = NULL;
+ local->lock[0].ns.directory_ns.lk_count = 0;
+
+ entrylk_cbk = local->lock[0].ns.directory_ns.entrylk_cbk;
+ local->lock[0].ns.directory_ns.entrylk_cbk = NULL;
+
+ entrylk_cbk(main_frame, NULL, main_frame->this,
+ local->lock[0].ns.directory_ns.op_ret,
+ local->lock[0].ns.directory_ns.op_errno, NULL);
+
+ dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK);
+ return;
+}
+
+static int32_t
+dht_unlock_entrylk_done(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ gf_uuid_unparse(local->lock[0].ns.directory_ns.locks[0]->loc.inode->gfid,
+ gfid);
+
+ if (op_ret < 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_UNLOCK_GFID_FAILED, "gfid=%s", gfid,
+ "DHT_LAYOUT_HEAL_DOMAIN", NULL);
+ }
+
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
+
+static int32_t
+dht_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int lk_index = 0, call_cnt = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ lk_index = (long)cookie;
+
+ local = frame->local;
+
+ uuid_utoa_r(local->lock[0].ns.directory_ns.locks[lk_index]->loc.gfid, gfid);
+
+ if (op_ret < 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED,
+ "name=%s",
+ local->lock[0].ns.directory_ns.locks[lk_index]->xl->name,
+ "gfid=%s", gfid, NULL);
+ } else {
+ local->lock[0].ns.directory_ns.locks[lk_index]->locked = 0;
+ }
+
+ call_cnt = dht_frame_return(frame);
+ if (is_last_call(call_cnt)) {
+ dht_entrylk_done(frame);
+ }
+
+ return 0;
+}
+
+static int32_t
+dht_unlock_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_entrylk_cbk_t entrylk_cbk)
+{
+ dht_local_t *local = NULL;
+ int ret = -1, i = 0;
+ call_frame_t *lock_frame = NULL;
+ int call_cnt = 0;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, done);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, done);
+ GF_VALIDATE_OR_GOTO(frame->this->name, entrylk_cbk, done);
+
+ call_cnt = dht_lock_count(lk_array, lk_count);
+ if (call_cnt == 0) {
+ ret = 0;
+ goto done;
+ }
+
+ lock_frame = dht_lock_frame(frame);
+ if (lock_frame == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+ NULL);
+
+ dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+ goto done;
+ }
+
+ ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk);
+ if (ret < 0) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+ NULL);
+
+ dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+
+ goto done;
+ }
+
+ local = lock_frame->local;
+ local->main_frame = frame;
+ local->call_cnt = call_cnt;
+
+ for (i = 0; i < local->lock[0].ns.directory_ns.lk_count; i++) {
+ if (!local->lock[0].ns.directory_ns.locks[i]->locked)
+ continue;
+
+ lock_frame->root
+ ->lk_owner = local->lock[0].ns.directory_ns.locks[i]->lk_owner;
+ STACK_WIND_COOKIE(
+ lock_frame, dht_unlock_entrylk_cbk, (void *)(long)i,
+ local->lock[0].ns.directory_ns.locks[i]->xl,
+ local->lock[0].ns.directory_ns.locks[i]->xl->fops->entrylk,
+ local->lock[0].ns.directory_ns.locks[i]->domain,
+ &local->lock[0].ns.directory_ns.locks[i]->loc,
+ local->lock[0].ns.directory_ns.locks[i]->basename, ENTRYLK_UNLOCK,
+ ENTRYLK_WRLCK, NULL);
+ if (!--call_cnt)
+ break;
+ }
+
+ return 0;
+
+done:
+ if (lock_frame)
+ dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK);
+
+ /* no locks acquired, invoke entrylk_cbk */
+ if (ret == 0)
+ entrylk_cbk(frame, NULL, frame->this, 0, 0, NULL);
+
+ return ret;
+}
+
+int32_t
+dht_unlock_entrylk_wrapper(call_frame_t *frame, dht_elock_wrap_t *entrylk)
+{
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+
+ local = frame->local;
+
+ if (!entrylk || !entrylk->locks)
+ goto out;
+
+ gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
+ goto done;
+ }
+
+ lock_local = dht_local_init(lock_frame, NULL, NULL, 0);
+ if (lock_local == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_CREATE_FAILED, "local", "pgfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
+ goto done;
+ }
+
+ lock_frame->local = lock_local;
+
+ lock_local->lock[0].ns.directory_ns.locks = entrylk->locks;
+ lock_local->lock[0].ns.directory_ns.lk_count = entrylk->lk_count;
+ entrylk->locks = NULL;
+ entrylk->lk_count = 0;
+
+ ret = dht_unlock_entrylk(
+ lock_frame, lock_local->lock[0].ns.directory_ns.locks,
+ lock_local->lock[0].ns.directory_ns.lk_count, dht_unlock_entrylk_done);
+ if (ret)
+ goto done;
+
+ lock_frame = NULL;
+
+done:
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
+
+out:
+ return 0;
+}
+
+static int
+dht_entrylk_cleanup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_entrylk_done(frame);
+ return 0;
+}
+
+static void
+dht_entrylk_cleanup(call_frame_t *lock_frame)
+{
+ dht_lock_t **lk_array = NULL;
+ int lk_count = 0, lk_acquired = 0;
+ dht_local_t *local = NULL;
+
+ local = lock_frame->local;
+
+ lk_array = local->lock[0].ns.directory_ns.locks;
+ lk_count = local->lock[0].ns.directory_ns.lk_count;
+
+ lk_acquired = dht_lock_count(lk_array, lk_count);
+ if (lk_acquired != 0) {
+ dht_unlock_entrylk(lock_frame, lk_array, lk_count,
+ dht_entrylk_cleanup_cbk);
+ } else {
+ dht_entrylk_done(lock_frame);
+ }
+
+ return;
+}
+
+static int32_t
+dht_blocking_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ int lk_index = 0;
+ int i = 0;
+ dht_local_t *local = NULL;
+
+ lk_index = (long)cookie;
+
+ local = frame->local;
+ if (op_ret == 0) {
+ local->lock[0].ns.directory_ns.locks[lk_index]->locked = _gf_true;
+ } else {
+ switch (op_errno) {
+ case ESTALE:
+ case ENOENT:
+ if (local->lock[0]
+ .ns.directory_ns.locks[lk_index]
+ ->do_on_failure != IGNORE_ENOENT_ESTALE) {
+ local->lock[0].ns.directory_ns.op_ret = -1;
+ local->lock[0].ns.directory_ns.op_errno = op_errno;
+ goto cleanup;
+ }
+ break;
+ default:
+ local->lock[0].ns.directory_ns.op_ret = -1;
+ local->lock[0].ns.directory_ns.op_errno = op_errno;
+ goto cleanup;
+ }
+ }
+
+ if (lk_index == (local->lock[0].ns.directory_ns.lk_count - 1)) {
+ for (i = 0; (i < local->lock[0].ns.directory_ns.lk_count) &&
+ (!local->lock[0].ns.directory_ns.locks[i]->locked);
+ i++)
+ ;
+
+ if (i == local->lock[0].ns.directory_ns.lk_count) {
+ local->lock[0].ns.directory_ns.op_ret = -1;
+ local->lock[0].ns.directory_ns.op_errno = op_errno;
+ }
+
+ dht_entrylk_done(frame);
+ } else {
+ dht_blocking_entrylk_rec(frame, ++lk_index);
+ }
+
+ return 0;
+
+cleanup:
+ dht_entrylk_cleanup(frame);
+
+ return 0;
+}
+
+void
+dht_blocking_entrylk_rec(call_frame_t *frame, int i)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ STACK_WIND_COOKIE(
+ frame, dht_blocking_entrylk_cbk, (void *)(long)i,
+ local->lock[0].ns.directory_ns.locks[i]->xl,
+ local->lock[0].ns.directory_ns.locks[i]->xl->fops->entrylk,
+ local->lock[0].ns.directory_ns.locks[i]->domain,
+ &local->lock[0].ns.directory_ns.locks[i]->loc,
+ local->lock[0].ns.directory_ns.locks[i]->basename, ENTRYLK_LOCK,
+ ENTRYLK_WRLCK, NULL);
+
+ return;
+}
+
+int
+dht_blocking_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_entrylk_cbk_t entrylk_cbk)
+{
+ int ret = -1;
+ call_frame_t *lock_frame = NULL;
+ dht_local_t *local = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, entrylk_cbk, out);
+
+ lock_frame = dht_lock_frame(frame);
+ if (lock_frame == NULL)
+ goto out;
+
+ ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk);
+ if (ret < 0) {
+ goto out;
+ }
+
+ dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner);
+
+ local = lock_frame->local;
+ local->main_frame = frame;
+
+ dht_blocking_entrylk_rec(lock_frame, 0);
+
+ return 0;
+out:
+ if (lock_frame)
+ dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK);
+
+ return -1;
+}
+
+static int
+dht_local_inodelk_init(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t inodelk_cbk)
+{
+ int ret = -1;
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (local == NULL) {
+ local = dht_local_init(frame, NULL, NULL, 0);
+ }
+
+ if (local == NULL) {
+ goto out;
+ }
+
+ local->lock[0].layout.my_layout.inodelk_cbk = inodelk_cbk;
+ local->lock[0].layout.my_layout.locks = lk_array;
+ local->lock[0].layout.my_layout.lk_count = lk_count;
+
+ ret = dht_lock_order_requests(local->lock[0].layout.my_layout.locks,
+ local->lock[0].layout.my_layout.lk_count);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static void
+dht_inodelk_done(call_frame_t *lock_frame)
+{
+ fop_inodelk_cbk_t inodelk_cbk = NULL;
+ call_frame_t *main_frame = NULL;
+ dht_local_t *local = NULL;
+
+ local = lock_frame->local;
+ main_frame = local->main_frame;
+
+ local->lock[0].layout.my_layout.locks = NULL;
+ local->lock[0].layout.my_layout.lk_count = 0;
+
+ inodelk_cbk = local->lock[0].layout.my_layout.inodelk_cbk;
+ local->lock[0].layout.my_layout.inodelk_cbk = NULL;
+
+ inodelk_cbk(main_frame, NULL, main_frame->this,
+ local->lock[0].layout.my_layout.op_ret,
+ local->lock[0].layout.my_layout.op_errno, NULL);
+
+ dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+ return;
+}
+
+static int32_t
+dht_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int lk_index = 0, call_cnt = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ lk_index = (long)cookie;
+
+ local = frame->local;
+ if (op_ret < 0) {
+ uuid_utoa_r(local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid,
+ gfid);
+
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED,
+ "name=%s",
+ local->lock[0].layout.my_layout.locks[lk_index]->xl->name,
+ "gfid=%s", gfid, NULL);
+ } else {
+ local->lock[0].layout.my_layout.locks[lk_index]->locked = 0;
+ }
+
+ call_cnt = dht_frame_return(frame);
+ if (is_last_call(call_cnt)) {
+ dht_inodelk_done(frame);
+ }
+
+ return 0;
+}
+
+static int32_t
+dht_unlock_inodelk_done(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ gf_uuid_unparse(local->lock[0].layout.my_layout.locks[0]->loc.inode->gfid,
+ gfid);
+
+ if (op_ret < 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_UNLOCK_GFID_FAILED, "DHT_LAYOUT_HEAL_DOMAIN gfid=%s",
+ gfid, NULL);
+ }
+
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t inodelk_cbk)
+{
+ dht_local_t *local = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ int ret = -1, i = 0;
+ call_frame_t *lock_frame = NULL;
+ int call_cnt = 0;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, done);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, done);
+ GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, done);
+
+ call_cnt = dht_lock_count(lk_array, lk_count);
+ if (call_cnt == 0) {
+ ret = 0;
+ goto done;
+ }
+
+ lock_frame = dht_lock_frame(frame);
+ if (lock_frame == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+ NULL);
+
+ dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+ goto done;
+ }
+
+ ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
+ if (ret < 0) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+ NULL);
+
+ dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+
+ goto done;
+ }
+
+ local = lock_frame->local;
+ local->main_frame = frame;
+ local->call_cnt = call_cnt;
+
+ flock.l_type = F_UNLCK;
+
+ for (i = 0; i < local->lock[0].layout.my_layout.lk_count; i++) {
+ if (!local->lock[0].layout.my_layout.locks[i]->locked)
+ continue;
+
+ lock_frame->root
+ ->lk_owner = local->lock[0].layout.my_layout.locks[i]->lk_owner;
+ STACK_WIND_COOKIE(
+ lock_frame, dht_unlock_inodelk_cbk, (void *)(long)i,
+ local->lock[0].layout.my_layout.locks[i]->xl,
+ local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk,
+ local->lock[0].layout.my_layout.locks[i]->domain,
+ &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLK, &flock,
+ NULL);
+ if (!--call_cnt)
+ break;
+ }
+
+ return 0;
+
+done:
+ if (lock_frame)
+ dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+
+ /* no locks acquired, invoke inodelk_cbk */
+ if (ret == 0)
+ inodelk_cbk(frame, NULL, frame->this, 0, 0, NULL);
+
+ return ret;
+}
+
+int32_t
+dht_unlock_inodelk_wrapper(call_frame_t *frame, dht_ilock_wrap_t *inodelk)
+{
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+
+ local = frame->local;
+
+ if (!inodelk || !inodelk->locks)
+ goto out;
+
+ gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
+ goto done;
+ }
+
+ lock_local = dht_local_init(lock_frame, NULL, NULL, 0);
+ if (lock_local == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_CREATE_FAILED, "local", "gfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
+ goto done;
+ }
+
+ lock_frame->local = lock_local;
+
+ lock_local->lock[0].layout.my_layout.locks = inodelk->locks;
+ lock_local->lock[0].layout.my_layout.lk_count = inodelk->lk_count;
+ inodelk->locks = NULL;
+ inodelk->lk_count = 0;
+
+ ret = dht_unlock_inodelk(
+ lock_frame, lock_local->lock[0].layout.my_layout.locks,
+ lock_local->lock[0].layout.my_layout.lk_count, dht_unlock_inodelk_done);
+
+ if (ret)
+ goto done;
+
+ lock_frame = NULL;
+
+done:
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
+out:
+ return 0;
+}
+
+static int
+dht_inodelk_cleanup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_inodelk_done(frame);
+ return 0;
+}
+
+static void
+dht_inodelk_cleanup(call_frame_t *lock_frame)
+{
+ dht_lock_t **lk_array = NULL;
+ int lk_count = 0, lk_acquired = 0;
+ dht_local_t *local = NULL;
+
+ local = lock_frame->local;
+
+ lk_array = local->lock[0].layout.my_layout.locks;
+ lk_count = local->lock[0].layout.my_layout.lk_count;
+
+ lk_acquired = dht_lock_count(lk_array, lk_count);
+ if (lk_acquired != 0) {
+ dht_unlock_inodelk(lock_frame, lk_array, lk_count,
+ dht_inodelk_cleanup_cbk);
+ } else {
+ dht_inodelk_done(lock_frame);
+ }
+
+ return;
+}
+
+static int32_t
+dht_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int lk_index = 0, call_cnt = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ lk_index = (long)cookie;
+
+ if (op_ret == -1) {
+ local->lock[0].layout.my_layout.op_ret = -1;
+ local->lock[0].layout.my_layout.op_errno = op_errno;
+
+ if (local && local->lock[0].layout.my_layout.locks[lk_index]) {
+ uuid_utoa_r(local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->loc.inode->gfid,
+ gfid);
+
+ gf_msg_debug(
+ this->name, op_errno,
+ "inodelk failed on gfid: %s "
+ "subvolume: %s",
+ gfid,
+ local->lock[0].layout.my_layout.locks[lk_index]->xl->name);
+ }
+
+ goto out;
+ }
+
+ local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true;
+
+out:
+ call_cnt = dht_frame_return(frame);
+ if (is_last_call(call_cnt)) {
+ if (local->lock[0].layout.my_layout.op_ret < 0) {
+ dht_inodelk_cleanup(frame);
+ return 0;
+ }
+
+ dht_inodelk_done(frame);
+ }
+
+ return 0;
+}
+
+int
+dht_nonblocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array,
+ int lk_count, fop_inodelk_cbk_t inodelk_cbk)
+{
+ struct gf_flock flock = {
+ 0,
+ };
+ int i = 0, ret = 0;
+ dht_local_t *local = NULL;
+ call_frame_t *lock_frame = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, out);
+
+ lock_frame = dht_lock_frame(frame);
+ if (lock_frame == NULL)
+ goto out;
+
+ ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
+ if (ret < 0) {
+ goto out;
+ }
+
+ dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner);
+
+ local = lock_frame->local;
+ local->main_frame = frame;
+
+ local->call_cnt = lk_count;
+
+ for (i = 0; i < lk_count; i++) {
+ flock.l_type = local->lock[0].layout.my_layout.locks[i]->type;
+
+ STACK_WIND_COOKIE(
+ lock_frame, dht_nonblocking_inodelk_cbk, (void *)(long)i,
+ local->lock[0].layout.my_layout.locks[i]->xl,
+ local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk,
+ local->lock[0].layout.my_layout.locks[i]->domain,
+ &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLK, &flock,
+ NULL);
+ }
+
+ return 0;
+
+out:
+ if (lock_frame)
+ dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+
+ return -1;
+}
+
+static int32_t
+dht_blocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ int lk_index = 0;
+ int i = 0;
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+ dht_reaction_type_t reaction = 0;
+
+ lk_index = (long)cookie;
+
+ local = frame->local;
+ if (op_ret == 0) {
+ local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true;
+ } else {
+ switch (op_errno) {
+ case ESTALE:
+ case ENOENT:
+ reaction = local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->do_on_failure;
+ if ((reaction != IGNORE_ENOENT_ESTALE) &&
+ (reaction != IGNORE_ENOENT_ESTALE_EIO)) {
+ gf_uuid_unparse(local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->loc.gfid,
+ gfid);
+ local->lock[0].layout.my_layout.op_ret = -1;
+ local->lock[0].layout.my_layout.op_errno = op_errno;
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_INODELK_FAILED, "subvol=%s",
+ local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->xl->name,
+ "gfid=%s", gfid, NULL);
+ goto cleanup;
+ }
+ break;
+ case EIO:
+ reaction = local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->do_on_failure;
+ if (reaction != IGNORE_ENOENT_ESTALE_EIO) {
+ gf_uuid_unparse(local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->loc.gfid,
+ gfid);
+ local->lock[0].layout.my_layout.op_ret = -1;
+ local->lock[0].layout.my_layout.op_errno = op_errno;
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_INODELK_FAILED, "subvol=%s",
+ local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->xl->name,
+ "gfid=%s", gfid, NULL);
+ goto cleanup;
+ }
+ break;
+
+ default:
+ gf_uuid_unparse(
+ local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid,
+ gfid);
+ local->lock[0].layout.my_layout.op_ret = -1;
+ local->lock[0].layout.my_layout.op_errno = op_errno;
+ gf_smsg(
+ this->name, GF_LOG_ERROR, op_errno, DHT_MSG_INODELK_FAILED,
+ "subvol=%s",
+ local->lock[0].layout.my_layout.locks[lk_index]->xl->name,
+ "gfid=%s", gfid, NULL);
+ goto cleanup;
+ }
+ }
+
+ if (lk_index == (local->lock[0].layout.my_layout.lk_count - 1)) {
+ for (i = 0; (i < local->lock[0].layout.my_layout.lk_count) &&
+ (!local->lock[0].layout.my_layout.locks[i]->locked);
+ i++)
+ ;
+
+ if (i == local->lock[0].layout.my_layout.lk_count) {
+ local->lock[0].layout.my_layout.op_ret = -1;
+ local->lock[0].layout.my_layout.op_errno = op_errno;
+ }
+
+ dht_inodelk_done(frame);
+ } else {
+ dht_blocking_inodelk_rec(frame, ++lk_index);
+ }
+
+ return 0;
+
+cleanup:
+ dht_inodelk_cleanup(frame);
+
+ return 0;
+}
+
+void
+dht_blocking_inodelk_rec(call_frame_t *frame, int i)
+{
+ dht_local_t *local = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ local = frame->local;
+
+ flock.l_type = local->lock[0].layout.my_layout.locks[i]->type;
+
+ STACK_WIND_COOKIE(
+ frame, dht_blocking_inodelk_cbk, (void *)(long)i,
+ local->lock[0].layout.my_layout.locks[i]->xl,
+ local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk,
+ local->lock[0].layout.my_layout.locks[i]->domain,
+ &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLKW, &flock, NULL);
+
+ return;
+}
+
+int
+dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t inodelk_cbk)
+{
+ int ret = -1;
+ call_frame_t *lock_frame = NULL;
+ dht_local_t *local = NULL;
+ dht_local_t *tmp_local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, out);
+
+ tmp_local = frame->local;
+
+ lock_frame = dht_lock_frame(frame);
+ if (lock_frame == NULL) {
+ gf_uuid_unparse(tmp_local->loc.gfid, gfid);
+ gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCK_FRAME_FAILED,
+ "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL);
+ goto out;
+ }
+
+ ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
+ if (ret < 0) {
+ gf_uuid_unparse(tmp_local->loc.gfid, gfid);
+ gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCAL_LOCK_INIT_FAILED,
+ "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL);
+ goto out;
+ }
+
+ dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner);
+
+ local = lock_frame->local;
+ local->main_frame = frame;
+
+ dht_blocking_inodelk_rec(lock_frame, 0);
+
+ return 0;
+out:
+ if (lock_frame)
+ dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+
+ return -1;
+}
+
+void
+dht_unlock_namespace(call_frame_t *frame, dht_dir_transaction_t *lock)
+{
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lock, out);
+
+ dht_unlock_entrylk_wrapper(frame, &lock->ns.directory_ns);
+ dht_unlock_inodelk_wrapper(frame, &lock->ns.parent_layout);
+
+out:
+ return;
+}
+
+static int32_t
+dht_protect_namespace_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+ if (op_ret != 0)
+ dht_unlock_inodelk_wrapper(frame, &local->current->ns.parent_layout);
+
+ local->current->ns.ns_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int32_t
+dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ loc_t *loc = NULL;
+ dht_lock_t **lk_array = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int count = 0;
+ dht_elock_wrap_t *entrylk = NULL;
+
+ local = frame->local;
+ entrylk = &local->current->ns.directory_ns;
+
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ loc = &entrylk->locks[0]->loc;
+ gf_uuid_unparse(loc->gfid, pgfid);
+
+ local->op_ret = 0;
+ lk_array = entrylk->locks;
+ count = entrylk->lk_count;
+
+ ret = dht_blocking_entrylk(frame, lk_array, count,
+ dht_protect_namespace_cbk);
+
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_ENTRYLK_FAILED_AFT_INODELK, "fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "basename=%s",
+ entrylk->locks[0]->basename, NULL);
+ goto err;
+ }
+
+ return 0;
+
+err:
+ if (lk_array != NULL) {
+ dht_lock_array_free(lk_array, count);
+ GF_FREE(lk_array);
+ entrylk->locks = NULL;
+ entrylk->lk_count = 0;
+ }
+
+ /* Unlock inodelk. No harm calling unlock twice */
+ dht_unlock_inodelk_wrapper(frame, &local->current->ns.parent_layout);
+ /* Call ns_cbk. It will take care of unwinding */
+ local->current->ns.ns_cbk(frame, NULL, this, local->op_ret, local->op_errno,
+ NULL);
+ return 0;
+}
+
+/* Given the loc and the subvol, this routine takes the inodelk on
+ * the parent inode and entrylk on (parent, loc->name). This routine
+ * is specific as it supports only one subvol on which it takes inodelk
+ * and then entrylk serially.
+ */
+int
+dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
+ struct dht_namespace *ns, fop_entrylk_cbk_t ns_cbk)
+{
+ dht_ilock_wrap_t *inodelk = NULL;
+ dht_elock_wrap_t *entrylk = NULL;
+ dht_lock_t **lk_array = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ loc_t parent = {
+ 0,
+ };
+ int ret = -1;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int32_t op_errno = 0;
+ int count = 1;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, loc, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, loc->parent, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, subvol, out);
+
+ local = frame->local;
+ this = frame->this;
+
+ inodelk = &ns->parent_layout;
+ entrylk = &ns->directory_ns;
+
+ /* Initialize entrylk_cbk and parent loc */
+ ns->ns_cbk = ns_cbk;
+
+ ret = dht_build_parent_loc(this, &parent, loc, &op_errno);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_LOC_FAILED,
+ "gfid=%s", loc->gfid, "name=%s", loc->name, "path=%s",
+ loc->path, NULL);
+ goto out;
+ }
+ gf_uuid_unparse(parent.gfid, pgfid);
+
+ /* Alloc inodelk */
+ inodelk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
+ if (inodelk->locks == NULL) {
+ local->op_errno = ENOMEM;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_CALLOC_FAILED, "fop=%s", gf_fop_list[local->fop],
+ "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path,
+ NULL);
+ goto out;
+ }
+
+ inodelk->locks[0] = dht_lock_new(this, subvol, &parent, F_RDLCK,
+ DHT_LAYOUT_HEAL_DOMAIN, NULL,
+ FAIL_ON_ANY_ERROR);
+ if (inodelk->locks[0] == NULL) {
+ local->op_errno = ENOMEM;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_LOCK_ALLOC_FAILED, "inodelk-fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+ loc->name, "path=%s", loc->path, NULL);
+ goto err;
+ }
+ inodelk->lk_count = count;
+
+ /* Allock entrylk */
+ entrylk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
+ if (entrylk->locks == NULL) {
+ local->op_errno = ENOMEM;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_CALLOC_FAILED, "entrylk-fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+ loc->name, "path=%s", loc->path, NULL);
+
+ goto err;
+ }
+
+ entrylk->locks[0] = dht_lock_new(this, subvol, &parent, F_WRLCK,
+ DHT_ENTRY_SYNC_DOMAIN, loc->name,
+ FAIL_ON_ANY_ERROR);
+ if (entrylk->locks[0] == NULL) {
+ local->op_errno = ENOMEM;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_LOCK_ALLOC_FAILED, "entrylk-fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+ loc->name, "path=%s", loc->path, NULL);
+
+ goto err;
+ }
+ entrylk->lk_count = count;
+
+ /* Take read inodelk on parent. If it is successful, take write entrylk
+ * on name in cbk.
+ */
+ lk_array = inodelk->locks;
+ ret = dht_blocking_inodelk(frame, lk_array, count,
+ dht_blocking_entrylk_after_inodelk);
+ if (ret < 0) {
+ local->op_errno = EIO;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_BLOCK_INODELK_FAILED, "fop=%s", gf_fop_list[local->fop],
+ "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path,
+ NULL);
+
+ goto err;
+ }
+
+ loc_wipe(&parent);
+
+ return 0;
+err:
+ if (entrylk->locks != NULL) {
+ dht_lock_array_free(entrylk->locks, count);
+ GF_FREE(entrylk->locks);
+ entrylk->locks = NULL;
+ entrylk->lk_count = 0;
+ }
+
+ if (inodelk->locks != NULL) {
+ dht_lock_array_free(inodelk->locks, count);
+ GF_FREE(inodelk->locks);
+ inodelk->locks = NULL;
+ inodelk->lk_count = 0;
+ }
+
+ loc_wipe(&parent);
+out:
+ return -1;
+}
diff --git a/xlators/cluster/dht/src/dht-lock.h b/xlators/cluster/dht/src/dht-lock.h
new file mode 100644
index 00000000000..6485c03fb6e
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-lock.h
@@ -0,0 +1,91 @@
+/*
+ Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _DHT_LOCK_H
+#define _DHT_LOCK_H
+
+#include "dht-common.h"
+
+void
+dht_lock_array_free(dht_lock_t **lk_array, int count);
+
+int32_t
+dht_lock_count(dht_lock_t **lk_array, int lk_count);
+
+dht_lock_t *
+dht_lock_new(xlator_t *this, xlator_t *xl, loc_t *loc, short type,
+ const char *domain, const char *basename,
+ dht_reaction_type_t do_on_failure);
+
+int32_t
+dht_unlock_entrylk_wrapper(call_frame_t *, dht_elock_wrap_t *);
+
+void
+dht_blocking_entrylk_rec(call_frame_t *frame, int i);
+
+int
+dht_blocking_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t entrylk_cbk);
+
+int32_t
+dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t inodelk_cbk);
+
+int32_t
+dht_unlock_inodelk_wrapper(call_frame_t *, dht_ilock_wrap_t *);
+
+/* Acquire non-blocking inodelk on a list of xlators.
+ *
+ * @lk_array: array of lock requests lock on.
+ *
+ * @lk_count: number of locks in @lk_array
+ *
+ * @inodelk_cbk: will be called after inodelk replies are received
+ *
+ * @retval: -1 if stack_winding inodelk fails. 0 otherwise.
+ * inodelk_cbk is called with appropriate error on errors.
+ * On failure to acquire lock on all members of list, successful
+ * locks are unlocked before invoking cbk.
+ */
+
+int
+dht_nonblocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array,
+ int lk_count, fop_inodelk_cbk_t inodelk_cbk);
+
+void
+dht_blocking_inodelk_rec(call_frame_t *frame, int i);
+
+/* same as dht_nonblocking_inodelk, but issues sequential blocking locks on
+ * @lk_array directly. locks are issued on some order which remains same
+ * for a list of xlators (irrespective of order of xlators within list).
+ */
+
+int
+dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t inodelk_cbk);
+
+int32_t
+dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata);
+
+int32_t
+dht_blocking_entrylk_after_inodelk_rename(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata);
+
+void
+dht_unlock_namespace(call_frame_t *, dht_dir_transaction_t *);
+
+int
+dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
+ struct dht_namespace *ns, fop_entrylk_cbk_t ns_cbk);
+
+#endif /* _DHT_LOCK_H */
diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h
index 570f4c24b34..e3c4471334a 100644
--- a/xlators/cluster/dht/src/dht-mem-types.h
+++ b/xlators/cluster/dht/src/dht-mem-types.h
@@ -1,42 +1,38 @@
-
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef __DHT_MEM_TYPES_H__
#define __DHT_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_dht_mem_types_ {
- gf_dht_mt_dht_du_t = gf_common_mt_end + 1,
- gf_dht_mt_dht_conf_t,
- gf_dht_mt_char,
- gf_dht_mt_int32_t,
- gf_dht_mt_dht_local_t,
- gf_dht_mt_xlator_t,
- gf_dht_mt_dht_layout_t,
- gf_switch_mt_dht_conf_t,
- gf_switch_mt_dht_du_t,
- gf_switch_mt_switch_sched_array,
- gf_switch_mt_switch_struct,
- gf_dht_mt_end
+ gf_dht_mt_dht_du_t = gf_common_mt_end + 1,
+ gf_dht_mt_dht_conf_t,
+ gf_dht_mt_char,
+ gf_dht_mt_int32_t,
+ gf_dht_mt_xlator_t,
+ gf_dht_mt_dht_layout_t,
+ gf_switch_mt_switch_sched_array,
+ gf_switch_mt_switch_struct,
+ gf_dht_mt_subvol_time,
+ gf_dht_mt_loc_t,
+ gf_defrag_info_mt,
+ gf_dht_mt_inode_ctx_t,
+ gf_dht_mt_dirent_t,
+ gf_dht_mt_container_t,
+ gf_dht_mt_octx_t,
+ gf_dht_mt_miginfo_t,
+ gf_dht_mt_fd_ctx_t,
+ gf_dht_ret_cache_t,
+ gf_dht_nodeuuids_t,
+ gf_dht_mt_end
};
#endif
-
diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h
new file mode 100644
index 00000000000..601f8dad78b
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-messages.h
@@ -0,0 +1,386 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _DHT_MESSAGES_H_
+#define _DHT_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+ DHT, DHT_MSG_CACHED_SUBVOL_GET_FAILED, DHT_MSG_CREATE_LINK_FAILED,
+ DHT_MSG_DICT_SET_FAILED, DHT_MSG_DIR_ATTR_HEAL_FAILED,
+ DHT_MSG_DIR_SELFHEAL_FAILED, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+ DHT_MSG_FILE_ON_MULT_SUBVOL, DHT_MSG_FILE_TYPE_MISMATCH,
+ DHT_MSG_GFID_MISMATCH, DHT_MSG_GFID_NULL, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ DHT_MSG_INIT_FAILED, DHT_MSG_INVALID_CONFIGURATION,
+ DHT_MSG_INVALID_DISK_LAYOUT, DHT_MSG_INVALID_OPTION,
+ DHT_MSG_LAYOUT_FIX_FAILED, DHT_MSG_LAYOUT_MERGE_FAILED,
+ DHT_MSG_LAYOUT_MISMATCH, DHT_MSG_LAYOUT_NULL, DHT_MSG_MIGRATE_DATA_COMPLETE,
+ DHT_MSG_MIGRATE_DATA_FAILED, DHT_MSG_MIGRATE_FILE_COMPLETE,
+ DHT_MSG_MIGRATE_FILE_FAILED, DHT_MSG_NO_MEMORY, DHT_MSG_OPENDIR_FAILED,
+ DHT_MSG_REBALANCE_FAILED, DHT_MSG_REBALANCE_START_FAILED,
+ DHT_MSG_REBALANCE_STATUS, DHT_MSG_REBALANCE_STOPPED, DHT_MSG_RENAME_FAILED,
+ DHT_MSG_SETATTR_FAILED, DHT_MSG_SUBVOL_INSUFF_INODES,
+ DHT_MSG_SUBVOL_INSUFF_SPACE, DHT_MSG_UNLINK_FAILED,
+ DHT_MSG_LAYOUT_SET_FAILED, DHT_MSG_LOG_FIXED_LAYOUT,
+ DHT_MSG_GET_XATTR_FAILED, DHT_MSG_FILE_LOOKUP_FAILED,
+ DHT_MSG_OPEN_FD_FAILED, DHT_MSG_SET_INODE_CTX_FAILED,
+ DHT_MSG_UNLOCKING_FAILED, DHT_MSG_DISK_LAYOUT_NULL, DHT_MSG_SUBVOL_INFO,
+ DHT_MSG_CHUNK_SIZE_INFO, DHT_MSG_LAYOUT_FORM_FAILED, DHT_MSG_SUBVOL_ERROR,
+ DHT_MSG_LAYOUT_SORT_FAILED, DHT_MSG_REGEX_INFO, DHT_MSG_FOPEN_FAILED,
+ DHT_MSG_SET_HOSTNAME_FAILED, DHT_MSG_BRICK_ERROR, DHT_MSG_SYNCOP_FAILED,
+ DHT_MSG_MIGRATE_INFO, DHT_MSG_SOCKET_ERROR, DHT_MSG_CREATE_FD_FAILED,
+ DHT_MSG_READDIR_ERROR, DHT_MSG_CHILD_LOC_BUILD_FAILED,
+ DHT_MSG_SET_SWITCH_PATTERN_ERROR, DHT_MSG_COMPUTE_HASH_FAILED,
+ DHT_MSG_FIND_LAYOUT_ANOMALIES_ERROR, DHT_MSG_ANOMALIES_INFO,
+ DHT_MSG_LAYOUT_INFO, DHT_MSG_INODE_LK_ERROR, DHT_MSG_RENAME_INFO,
+ DHT_MSG_DATA_NULL, DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED,
+ DHT_MSG_UNLINK_LOOKUP_INFO, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ DHT_MSG_OPERATION_NOT_SUP, DHT_MSG_NOT_LINK_FILE_ERROR, DHT_MSG_CHILD_DOWN,
+ DHT_MSG_UUID_PARSE_ERROR, DHT_MSG_GET_DISK_INFO_ERROR,
+ DHT_MSG_INVALID_VALUE, DHT_MSG_SWITCH_PATTERN_INFO,
+ DHT_MSG_SUBVOL_OP_FAILED, DHT_MSG_LAYOUT_PRESET_FAILED,
+ DHT_MSG_INVALID_LINKFILE, DHT_MSG_FIX_LAYOUT_INFO,
+ DHT_MSG_GET_HOSTNAME_FAILED, DHT_MSG_WRITE_FAILED,
+ DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED, DHT_MSG_FSYNC_FAILED,
+ DHT_MSG_SUBVOL_DECOMMISSION_INFO, DHT_MSG_BRICK_QUERY_FAILED,
+ DHT_MSG_SUBVOL_NO_LAYOUT_INFO, DHT_MSG_OPEN_FD_ON_DST_FAILED,
+ DHT_MSG_SUBVOL_NOT_FOUND, DHT_MSG_FILE_LOOKUP_ON_DST_FAILED,
+ DHT_MSG_DISK_LAYOUT_MISSING, DHT_MSG_DICT_GET_FAILED,
+ DHT_MSG_REVALIDATE_CBK_INFO, DHT_MSG_UPGRADE_BRICKS, DHT_MSG_LK_ARRAY_INFO,
+ DHT_MSG_RENAME_NOT_LOCAL, DHT_MSG_RECONFIGURE_INFO,
+ DHT_MSG_INIT_LOCAL_SUBVOL_FAILED, DHT_MSG_SYS_CALL_GET_TIME_FAILED,
+ DHT_MSG_NO_DISK_USAGE_STATUS, DHT_MSG_SUBVOL_DOWN_ERROR,
+ DHT_MSG_REBAL_THROTTLE_INFO, DHT_MSG_COMMIT_HASH_INFO,
+ DHT_MSG_REBAL_STRUCT_SET, DHT_MSG_HAS_MIGINFO, DHT_MSG_SETTLE_HASH_FAILED,
+ DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, DHT_MSG_FD_CTX_SET_FAILED,
+ DHT_MSG_STALE_LOOKUP, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ DHT_MSG_LOCK_MIGRATION_FAILED, DHT_MSG_LOCK_INODE_UNREF_FAILED,
+ DHT_MSG_ASPRINTF_FAILED, DHT_MSG_DIR_LOOKUP_FAILED, DHT_MSG_INODELK_FAILED,
+ DHT_MSG_LOCK_FRAME_FAILED, DHT_MSG_LOCAL_LOCK_INIT_FAILED,
+ DHT_MSG_ENTRYLK_ERROR, DHT_MSG_INODELK_ERROR, DHT_MSG_LOC_FAILED,
+ DHT_MSG_UNKNOWN_FOP, DHT_MSG_MIGRATE_FILE_SKIPPED,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED, DHT_MSG_HASHED_SUBVOL_DOWN,
+ DHT_MSG_NON_HASHED_SUBVOL_DOWN, DHT_MSG_SYNCTASK_CREATE_FAILED,
+ DHT_MSG_DIR_HEAL_ABORT, DHT_MSG_MIGRATE_SKIP, DHT_MSG_FD_CREATE_FAILED,
+ DHT_MSG_DICT_NEW_FAILED, DHT_MSG_FAILED_TO_OPEN, DHT_MSG_CREATE_FAILED,
+ DHT_MSG_FILE_NOT_EXIST, DHT_MSG_CHOWN_FAILED, DHT_MSG_FALLOCATE_FAILED,
+ DHT_MSG_FTRUNCATE_FAILED, DHT_MSG_STATFS_FAILED, DHT_MSG_WRITE_CROSS,
+ DHT_MSG_NEW_TARGET_FOUND, DHT_MSG_INSUFF_MEMORY, DHT_MSG_SET_XATTR_FAILED,
+ DHT_MSG_SET_MODE_FAILED, DHT_MSG_FILE_EXISTS_IN_DEST,
+ DHT_MSG_SYMLINK_FAILED, DHT_MSG_LINKFILE_DEL_FAILED, DHT_MSG_MKNOD_FAILED,
+ DHT_MSG_MIGRATE_CLEANUP_FAILED, DHT_MSG_LOCK_MIGRATE,
+ DHT_MSG_PARENT_BUILD_FAILED, DHT_MSG_HASHED_SUBVOL_NOT_FOUND,
+ DHT_MSG_ACQUIRE_ENTRYLK_FAILED, DHT_MSG_CREATE_DST_FAILED,
+ DHT_MSG_MIGRATION_EXIT, DHT_MSG_CHANGED_DST, DHT_MSG_TRACE_FAILED,
+ DHT_MSG_WRITE_LOCK_FAILED, DHT_MSG_GETACTIVELK_FAILED, DHT_MSG_STAT_FAILED,
+ DHT_MSG_UNLINK_PERFORM_FAILED, DHT_MSG_CLANUP_SOURCE_FILE_FAILED,
+ DHT_MSG_UNLOCK_FILE_FAILED, DHT_MSG_REMOVE_XATTR_FAILED,
+ DHT_MSG_DATA_MIGRATE_ABORT, DHT_MSG_DEFRAG_NULL, DHT_MSG_PARENT_NULL,
+ DHT_MSG_GFID_NOT_PRESENT, DHT_MSG_CHILD_LOC_FAILED,
+ DHT_MSG_SET_LOOKUP_FAILED, DHT_MSG_DIR_REMOVED, DHT_MSG_FIX_NOT_COMP,
+ DHT_MSG_SUBVOL_DETER_FAILED, DHT_MSG_LOCAL_SUBVOL, DHT_MSG_NODE_UUID,
+ DHT_MSG_SIZE_FILE, DHT_MSG_GET_DATA_SIZE_FAILED,
+ DHT_MSG_PTHREAD_JOIN_FAILED, DHT_MSG_COUNTER_THREAD_CREATE_FAILED,
+ DHT_MSG_MIGRATION_INIT_QUEUE_FAILED, DHT_MSG_PAUSED_TIMEOUT, DHT_MSG_WOKE,
+ DHT_MSG_ABORT_REBALANCE, DHT_MSG_CREATE_TASK_REBAL_FAILED,
+ DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL, DHT_MSG_ADD_CHOICES_ERROR,
+ DHT_MSG_GET_CHOICES_ERROR, DHT_MSG_PREPARE_STATUS_ERROR,
+ DHT_MSG_SET_CHOICE_FAILED, DHT_MSG_SET_HASHED_SUBVOL_FAILED,
+ DHT_MSG_XATTR_HEAL_NOT_POSS, DHT_MSG_LINKTO_FILE_FAILED,
+ DHT_MSG_STALE_LINKFILE_DELETE, DHT_MSG_NO_SUBVOL_FOR_LINKTO,
+ DHT_MSG_SUBVOL_RETURNED, DHT_MSG_UNKNOWN_LOCAL_XSEL, DHT_MSG_GET_XATTR_ERR,
+ DHT_MSG_ALLOC_OR_FILL_FAILED, DHT_MSG_GET_REAL_NAME_FAILED,
+ DHT_MSG_COPY_UUID_FAILED, DHT_MSG_MDS_DETER_FAILED,
+ DHT_MSG_CREATE_REBAL_FAILED, DHT_MSG_LINK_LAYOUT_FAILED,
+ DHT_MSG_NO_SUBVOL_IN_LAYOUT, DHT_MSG_MEM_ALLOC_FAILED,
+ DHT_MSG_SET_IN_PARAMS_DICT_FAILED, DHT_MSG_LOC_COPY_FAILED,
+ DHT_MSG_PARENT_LOC_FAILED, DHT_MSG_CREATE_LOCK_FAILED,
+ DHT_MSG_PREV_ATTEMPT_FAILED, DHT_MSG_REFRESH_ATTEMPT,
+ DHT_MSG_ACQUIRE_LOCK_FAILED, DHT_MSG_CREATE_STUB_FAILED,
+ DHT_MSG_WIND_LOCK_REQ_FAILED, DHT_MSG_REFRESH_FAILED,
+ DHT_MSG_CACHED_SUBVOL_ERROR, DHT_MSG_NO_LINK_SUBVOL, DHT_MSG_SET_KEY_FAILED,
+ DHT_MSG_REMOVE_LINKTO_FAILED, DHT_MSG_LAYOUT_DICT_SET_FAILED,
+ DHT_MSG_XATTR_DICT_NULL, DHT_MSG_DUMMY_ALLOC_FAILED, DHT_MSG_DICT_IS_NULL,
+ DHT_MSG_LINK_INODE_FAILED, DHT_MSG_SELFHEAL_FAILED, DHT_MSG_NO_MDS_SUBVOL,
+ DHT_MSG_LIST_XATTRS_FAILED, DHT_MSG_RESET_INTER_XATTR_FAILED,
+ DHT_MSG_MDS_DOWN_UNABLE_TO_SET, DHT_MSG_WIND_UNLOCK_FAILED,
+ DHT_MSG_COMMIT_HASH_FAILED, DHT_MSG_UNLOCK_GFID_FAILED,
+ DHT_MSG_UNLOCK_FOLLOW_ENTRYLK, DHT_MSG_COPY_FRAME_FAILED,
+ DHT_MSG_UNLOCK_FOLLOW_LOCKS, DHT_MSG_ENTRYLK_FAILED_AFT_INODELK,
+ DHT_MSG_CALLOC_FAILED, DHT_MSG_LOCK_ALLOC_FAILED,
+ DHT_MSG_BLOCK_INODELK_FAILED,
+ DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+ DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+ DHT_MSG_DST_NULL_SET_FAILED);
+
+#define DHT_MSG_FD_CTX_SET_FAILED_STR "Failed to set fd ctx"
+#define DHT_MSG_INVALID_VALUE_STR "Different dst found in the fd ctx"
+#define DHT_MSG_UNKNOWN_FOP_STR "Unknown FOP on file"
+#define DHT_MSG_OPEN_FD_ON_DST_FAILED_STR "Failed to open the fd on file"
+#define DHT_MSG_SYNCTASK_CREATE_FAILED_STR "Failed to create synctask"
+#define DHT_MSG_ASPRINTF_FAILED_STR \
+ "asprintf failed while fetching subvol from the id"
+#define DHT_MSG_HAS_MIGINFO_STR "Found miginfo in the inode ctx"
+#define DHT_MSG_FILE_LOOKUP_FAILED_STR "failed to lookup the file"
+#define DHT_MSG_INVALID_LINKFILE_STR \
+ "linkto target is different from cached-subvol. treating as destination " \
+ "subvol"
+#define DHT_MSG_GFID_MISMATCH_STR "gfid different on the target file"
+#define DHT_MSG_GET_XATTR_FAILED_STR "failed to get 'linkto' xattr"
+#define DHT_MSG_SET_INODE_CTX_FAILED_STR "failed to set inode-ctx target file"
+#define DHT_MSG_DIR_SELFHEAL_FAILED_STR "Healing of path failed"
+#define DHT_MSG_DIR_HEAL_ABORT_STR \
+ "Failed to get path from subvol. Aborting directory healing"
+#define DHT_MSG_DIR_XATTR_HEAL_FAILED_STR "xattr heal failed for directory"
+#define DHT_MSG_LOCK_INODE_UNREF_FAILED_STR \
+ "Found a NULL inode. Failed to unref the inode"
+#define DHT_MSG_DICT_SET_FAILED_STR "Failed to set dictionary value"
+#define DHT_MSG_NOT_LINK_FILE_ERROR_STR "got non-linkfile"
+#define DHT_MSG_CREATE_LINK_FAILED_STR "failed to initialize linkfile data"
+#define DHT_MSG_UNLINK_FAILED_STR "Unlinking linkfile on subvolume failed"
+#define DHT_MSG_MIGRATE_FILE_FAILED_STR "Migrate file failed"
+#define DHT_MSG_NO_MEMORY_STR "could not allocate memory for dict"
+#define DHT_MSG_SUBVOL_ERROR_STR "Failed to get linkto subvol"
+#define DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED_STR "link failed on subvol"
+#define DHT_MSG_MIGRATE_FILE_SKIPPED_STR "Migration skipped"
+#define DHT_MSG_FD_CREATE_FAILED_STR "fd create failed"
+#define DHT_MSG_DICT_NEW_FAILED_STR "dict_new failed"
+#define DHT_MSG_FAILED_TO_OPEN_STR "failed to open"
+#define DHT_MSG_CREATE_FAILED_STR "failed to create"
+#define DHT_MSG_FILE_NOT_EXIST_STR "file does not exist"
+#define DHT_MSG_CHOWN_FAILED_STR "chown failed"
+#define DHT_MSG_FALLOCATE_FAILED_STR "fallocate failed"
+#define DHT_MSG_FTRUNCATE_FAILED_STR "ftruncate failed"
+#define DHT_MSG_STATFS_FAILED_STR "failed to get statfs"
+#define DHT_MSG_WRITE_CROSS_STR \
+ "write will cross min-fre-disk for file on subvol. looking for new subvol"
+#define DHT_MSG_SUBVOL_INSUFF_SPACE_STR \
+ "Could not find any subvol with space accommodating the file. Cosider " \
+ "adding bricks"
+#define DHT_MSG_NEW_TARGET_FOUND_STR "New target found for file"
+#define DHT_MSG_INSUFF_MEMORY_STR "insufficient memory"
+#define DHT_MSG_SET_XATTR_FAILED_STR "failed to set xattr"
+#define DHT_MSG_SET_MODE_FAILED_STR "failed to set mode"
+#define DHT_MSG_FILE_EXISTS_IN_DEST_STR "file exists in destination"
+#define DHT_MSG_LINKFILE_DEL_FAILED_STR "failed to delete the linkfile"
+#define DHT_MSG_SYMLINK_FAILED_STR "symlink failed"
+#define DHT_MSG_MKNOD_FAILED_STR "mknod failed"
+#define DHT_MSG_SETATTR_FAILED_STR "failed to perform setattr"
+#define DHT_MSG_MIGRATE_CLEANUP_FAILED_STR \
+ "Migrate file cleanup failed: failed to fstat file"
+#define DHT_MSG_LOCK_MIGRATE_STR "locks will be migrated for file"
+#define DHT_MSG_PARENT_BUILD_FAILED_STR \
+ "failed to build parent loc, which is needed to acquire entrylk to " \
+ "synchronize with renames on this path. Skipping migration"
+#define DHT_MSG_HASHED_SUBVOL_NOT_FOUND_STR \
+ "cannot find hashed subvol which is needed to synchronize with renames " \
+ "on this path. Skipping migration"
+#define DHT_MSG_ACQUIRE_ENTRYLK_FAILED_STR "failed to acquire entrylk on subvol"
+#define DHT_MSG_CREATE_DST_FAILED_STR "create dst failed for file"
+#define DHT_MSG_MIGRATION_EXIT_STR "Exiting migration"
+#define DHT_MSG_CHANGED_DST_STR "destination changed fo file"
+#define DHT_MSG_TRACE_FAILED_STR "Trace failed"
+#define DHT_MSG_WRITE_LOCK_FAILED_STR "write lock failed"
+#define DHT_MSG_GETACTIVELK_FAILED_STR "getactivelk failed for file"
+#define DHT_MSG_STAT_FAILED_STR "failed to do a stat"
+#define DHT_MSG_UNLINK_PERFORM_FAILED_STR "failed to perform unlink"
+#define DHT_MSG_MIGRATE_FILE_COMPLETE_STR "completed migration"
+#define DHT_MSG_CLANUP_SOURCE_FILE_FAILED_STR "failed to cleanup source file"
+#define DHT_MSG_UNLOCK_FILE_FAILED_STR "failed to unlock file"
+#define DHT_MSG_REMOVE_XATTR_FAILED_STR "remove xattr failed"
+#define DHT_MSG_SOCKET_ERROR_STR "Failed to unlink listener socket"
+#define DHT_MSG_HASHED_SUBVOL_GET_FAILED_STR "Failed to get hashed subvolume"
+#define DHT_MSG_CACHED_SUBVOL_GET_FAILED_STR "Failed to get cached subvolume"
+#define DHT_MSG_MIGRATE_DATA_FAILED_STR "migrate-data failed"
+#define DHT_MSG_DEFRAG_NULL_STR "defrag is NULL"
+#define DHT_MSG_DATA_MIGRATE_ABORT_STR \
+ "Readdirp failed. Aborting data migration for dict"
+#define DHT_MSG_LAYOUT_FIX_FAILED_STR "fix layout failed"
+#define DHT_MSG_PARENT_NULL_STR "parent is NULL"
+#define DHT_MSG_GFID_NOT_PRESENT_STR "gfid not present"
+#define DHT_MSG_CHILD_LOC_FAILED_STR "Child loc build failed"
+#define DHT_MSG_SET_LOOKUP_FAILED_STR "Failed to set lookup"
+#define DHT_MSG_DIR_LOOKUP_FAILED_STR "lookup failed"
+#define DHT_MSG_DIR_REMOVED_STR "Dir renamed or removed. Skipping"
+#define DHT_MSG_READDIR_ERROR_STR "readdir failed, Aborting fix-layout"
+#define DHT_MSG_SETTLE_HASH_FAILED_STR "Settle hash failed"
+#define DHT_MSG_DEFRAG_PROCESS_DIR_FAILED_STR "gf_defrag_process_dir failed"
+#define DHT_MSG_FIX_NOT_COMP_STR \
+ "Unable to retrieve fixlayout xattr. Assume background fix layout not " \
+ "complete"
+#define DHT_MSG_SUBVOL_DETER_FAILED_STR \
+ "local subvolume determination failed with error"
+#define DHT_MSG_LOCAL_SUBVOL_STR "local subvol"
+#define DHT_MSG_NODE_UUID_STR "node uuid"
+#define DHT_MSG_SIZE_FILE_STR "Total size files"
+#define DHT_MSG_GET_DATA_SIZE_FAILED_STR \
+ "Failed to get the total data size. Unable to estimate time to complete " \
+ "rebalance"
+#define DHT_MSG_PTHREAD_JOIN_FAILED_STR \
+ "file_counter_thread: pthread_join failed"
+#define DHT_MSG_COUNTER_THREAD_CREATE_FAILED_STR \
+ "Failed to create the file counter thread"
+#define DHT_MSG_MIGRATION_INIT_QUEUE_FAILED_STR \
+ "Failed to initialise migration queue"
+#define DHT_MSG_REBALANCE_STOPPED_STR "Received stop command on rebalance"
+#define DHT_MSG_PAUSED_TIMEOUT_STR "Request pause timer timeout"
+#define DHT_MSG_WOKE_STR "woken"
+#define DHT_MSG_ABORT_REBALANCE_STR "Aborting rebalance"
+#define DHT_MSG_REBALANCE_START_FAILED_STR \
+ "Failed to start rebalance: look up on / failed"
+#define DHT_MSG_CREATE_TASK_REBAL_FAILED_STR \
+ "Could not create task for rebalance"
+#define DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL_STR \
+ "Rebalance estimates will not be available"
+#define DHT_MSG_REBALANCE_STATUS_STR "Rebalance status"
+#define DHT_MSG_DATA_NULL_STR "data value is NULL"
+#define DHT_MSG_ADD_CHOICES_ERROR_STR "Error to add choices in buffer"
+#define DHT_MSG_GET_CHOICES_ERROR_STR "Error to get choices"
+#define DHT_MSG_PREPARE_STATUS_ERROR_STR "Error to prepare status"
+#define DHT_MSG_SET_CHOICE_FAILED_STR "Failed to set full choice"
+#define DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED_STR \
+ "Failed to aggregate quota xattr"
+#define DHT_MSG_FILE_TYPE_MISMATCH_STR \
+ "path exists as a file on one subvolume and directory on another. Please " \
+ "fix it manually"
+#define DHT_MSG_LAYOUT_SET_FAILED_STR "failed to set layout for subvolume"
+#define DHT_MSG_LAYOUT_MERGE_FAILED_STR "failed to merge layouts for subvolume"
+#define DHT_MSG_SET_HASHED_SUBVOL_FAILED_STR "Failed to set hashed subvolume"
+#define DHT_MSG_XATTR_HEAL_NOT_POSS_STR \
+ "No gfid exists for path. so healing xattr is not possible"
+#define DHT_MSG_REVALIDATE_CBK_INFO_STR "Revalidate: subvolume returned -1"
+#define DHT_MSG_LAYOUT_MISMATCH_STR "Mismatching layouts"
+#define DHT_MSG_UNLINK_LOOKUP_INFO_STR "lookup_unlink retuened"
+#define DHT_MSG_LINKTO_FILE_FAILED_STR \
+ "Could not unlink the linkto file as either fd is open and/or linkto " \
+ "xattr is set"
+#define DHT_MSG_LAYOUT_PRESET_FAILED_STR \
+ "Could not set pre-set layout for subvolume"
+#define DHT_MSG_FILE_ON_MULT_SUBVOL_STR \
+ "multiple subvolumes have file (preferably rename the file in the " \
+ "backend, and do a fresh lookup"
+#define DHT_MSG_STALE_LINKFILE_DELETE_STR \
+ "attempting deletion of stale linkfile"
+#define DHT_MSG_LINK_FILE_LOOKUP_INFO_STR "Lookup on following linkfile"
+#define DHT_MSG_NO_SUBVOL_FOR_LINKTO_STR "No link subvolume for linkto"
+#define DHT_MSG_SUBVOL_RETURNED_STR "Subvolume returned -1"
+#define DHT_MSG_UNKNOWN_LOCAL_XSEL_STR "Unknown local->xsel"
+#define DHT_MSG_DICT_GET_FAILED_STR "Failed to get"
+#define DHT_MSG_UUID_PARSE_ERROR_STR "Failed to parse uuid"
+#define DHT_MSG_GET_XATTR_ERR_STR "getxattr err for dir"
+#define DHT_MSG_ALLOC_OR_FILL_FAILED_STR "alloc or fill failed"
+#define DHT_MSG_UPGRADE_BRICKS_STR \
+ "At least one of the bricks does not support this operation. Please " \
+ "upgrade all bricks"
+#define DHT_MSG_GET_REAL_NAME_FAILED_STR "Failed to get real filename"
+#define DHT_MSG_LAYOUT_NULL_STR "Layout is NULL"
+#define DHT_MSG_COPY_UUID_FAILED_STR "Failed to copy node uuid key"
+#define DHT_MSG_MDS_DETER_FAILED_STR \
+ "Cannot determine MDS, fetching xattr randomly from a subvol"
+#define DHT_MSG_HASHED_SUBVOL_DOWN_STR \
+ "MDS is down for path, so fetching xattr randomly from subvol"
+#define DHT_MSG_CREATE_REBAL_FAILED_STR \
+ "failed to create a new rebalance synctask"
+#define DHT_MSG_FIX_LAYOUT_INFO_STR "fixing the layout"
+#define DHT_MSG_OPERATION_NOT_SUP_STR "wrong directory-spread-count value"
+#define DHT_MSG_LINK_LAYOUT_FAILED_STR "failed to link the layout in inode"
+#define DHT_MSG_NO_SUBVOL_IN_LAYOUT_STR "no subvolume in layout for path"
+#define DHT_MSG_INODE_LK_ERROR_STR "mknod lock failed for file"
+#define DHT_MSG_MEM_ALLOC_FAILED_STR "mem allocation failed"
+#define DHT_MSG_PARENT_LAYOUT_CHANGED_STR \
+ "extracting in-memory layout of parent failed"
+#define DHT_MSG_SET_IN_PARAMS_DICT_FAILED_STR \
+ "setting in params dictionary failed"
+#define DHT_MSG_LOC_COPY_FAILED_STR "loc_copy failed"
+#define DHT_MSG_LOC_FAILED_STR "parent loc build failed"
+#define DHT_MSG_PARENT_LOC_FAILED_STR "locking parent failed"
+#define DHT_MSG_CREATE_LOCK_FAILED_STR "Create lock failed"
+#define DHT_MSG_PREV_ATTEMPT_FAILED_STR \
+ "mkdir loop detected. parent layout didn't change even though previous " \
+ "attempt of mkdir failed because of in-memory layout not matching with " \
+ "that on disk."
+#define DHT_MSG_REFRESH_ATTEMPT_STR \
+ "mkdir parent layout changed. Attempting a refresh and then a retry"
+#define DHT_MSG_ACQUIRE_LOCK_FAILED_STR \
+ "Acquiring lock on parent to guard against layout-change failed"
+#define DHT_MSG_CREATE_STUB_FAILED_STR "creating stub failed"
+#define DHT_MSG_WIND_LOCK_REQ_FAILED_STR \
+ "cannot wind lock request to guard parent layout"
+#define DHT_MSG_REFRESH_FAILED_STR "refreshing parent layout failed."
+#define DHT_MSG_CACHED_SUBVOL_ERROR_STR "On cached subvol"
+#define DHT_MSG_NO_LINK_SUBVOL_STR "Linkfile does not have link subvolume"
+#define DHT_MSG_SET_KEY_FAILED_STR "failed to set key"
+#define DHT_MSG_CHILD_DOWN_STR "Received CHILD_DOWN. Exiting"
+#define DHT_MSG_LOG_FIXED_LAYOUT_STR "log layout fixed"
+#define DHT_MSG_REBAL_STRUCT_SET_STR "local->rebalance already set"
+#define DHT_MSG_REMOVE_LINKTO_FAILED_STR "Removal of linkto failed at subvol"
+#define DHT_MSG_LAYOUT_DICT_SET_FAILED_STR "dht layout dict set failed"
+#define DHT_MSG_SUBVOL_INFO_STR "creating subvolume"
+#define DHT_MSG_COMPUTE_HASH_FAILED_STR "hash computation failed"
+#define DHT_MSG_INVALID_DISK_LAYOUT_STR \
+ "Invalid disk layout: Catastrophic error layout with unknown type found"
+#define DHT_MSG_LAYOUT_SORT_FAILED_STR "layout sort failed"
+#define DHT_MSG_ANOMALIES_INFO_STR "Found anomalies"
+#define DHT_MSG_XATTR_DICT_NULL_STR "xattr dictionary is NULL"
+#define DHT_MSG_DISK_LAYOUT_MISSING_STR "Disk layout missing"
+#define DHT_MSG_LAYOUT_INFO_STR "layout info"
+#define DHT_MSG_SUBVOL_NO_LAYOUT_INFO_STR "no pre-set layout for subvol"
+#define DHT_MSG_SELFHEAL_XATTR_FAILED_STR "layout setxattr failed"
+#define DHT_MSG_DIR_SELFHEAL_XATTR_FAILED_STR "Directory self heal xattr failed"
+#define DHT_MSG_DUMMY_ALLOC_FAILED_STR "failed to allocate dummy layout"
+#define DHT_MSG_DICT_IS_NULL_STR \
+ "dict is NULL, need to make sure gfids are same"
+#define DHT_MSG_ENTRYLK_ERROR_STR "acquiring entrylk after inodelk failed"
+#define DHT_MSG_NO_DISK_USAGE_STATUS_STR "no du stats"
+#define DHT_MSG_LINK_INODE_FAILED_STR "linking inode failed"
+#define DHT_MSG_SELFHEAL_FAILED_STR "Directory selfheal failed"
+#define DHT_MSG_NO_MDS_SUBVOL_STR "No mds subvol"
+#define DHT_MSG_LIST_XATTRS_FAILED_STR "failed to list xattrs"
+#define DHT_MSG_RESET_INTER_XATTR_FAILED_STR "Failed to reset internal xattr"
+#define DHT_MSG_MDS_DOWN_UNABLE_TO_SET_STR \
+ "mds subvol is down, unable to set xattr"
+#define DHT_MSG_DIR_ATTR_HEAL_FAILED_STR \
+ "Directory attr heal failed. Failed to set uid/gid"
+#define DHT_MSG_WIND_UNLOCK_FAILED_STR \
+ "Winding unlock failed: stale locks left on brick"
+#define DHT_MSG_COMMIT_HASH_FAILED_STR "Directory commit hash updaten failed"
+#define DHT_MSG_LK_ARRAY_INFO_STR "lk info"
+#define DHT_MSG_UNLOCK_GFID_FAILED_STR \
+ "unlock failed on gfid: stale lock might be left"
+#define DHT_MSG_UNLOCKING_FAILED_STR "unlocking failed"
+#define DHT_MSG_UNLOCK_FOLLOW_ENTRYLK_STR "not unlocking following entrylks"
+#define DHT_MSG_COPY_FRAME_FAILED_STR "copy frame failed"
+#define DHT_MSG_UNLOCK_FOLLOW_LOCKS_STR "not unlocking following locks"
+#define DHT_MSG_INODELK_FAILED_STR "inodelk failed on subvol"
+#define DHT_MSG_LOCK_FRAME_FAILED_STR "memory allocation failed for lock_frame"
+#define DHT_MSG_LOCAL_LOCK_INIT_FAILED_STR "dht_local_lock_init failed"
+#define DHT_MSG_ENTRYLK_FAILED_AFT_INODELK_STR \
+ "dht_blocking_entrylk failed after taking inodelk"
+#define DHT_MSG_BLOCK_INODELK_FAILED_STR "dht_blocking_inodelk failed"
+#define DHT_MSG_CALLOC_FAILED_STR "calloc failed"
+#define DHT_MSG_LOCK_ALLOC_FAILED_STR "lock allocation failed"
+#define DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS_STR \
+ "cannot allocate a frame, not unlocking following entrylks"
+#define DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK_STR \
+ "storing locks in local failed, not unlocking following entrylks"
+#define DHT_MSG_DST_NULL_SET_FAILED_STR \
+ "src or dst is NULL, Failed to set dictionary value"
+
+#endif /* _DHT_MESSAGES_H_ */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
new file mode 100644
index 00000000000..8ba8082bd86
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -0,0 +1,4702 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+#include <glusterfs/syscall.h>
+#include <fnmatch.h>
+#include <signal.h>
+#include <glusterfs/events.h>
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+
+#define GF_DISK_SECTOR_SIZE 512
+#define DHT_REBALANCE_PID 4242 /* Change it if required */
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
+#define MAX_MIGRATE_QUEUE_COUNT 500
+#define MIN_MIGRATE_QUEUE_COUNT 200
+#define MAX_REBAL_TYPE_SIZE 16
+#define FILE_CNT_INTERVAL 600 /* 10 mins */
+#define ESTIMATE_START_INTERVAL 600 /* 10 mins */
+#define HARDLINK_MIG_INPROGRESS -2
+#define SKIP_MIGRATION_FD_POSITIVE -3
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#define GF_CRAWL_INDEX_MOVE(idx, sv_cnt) \
+ { \
+ idx++; \
+ idx %= sv_cnt; \
+ }
+
+uint64_t g_totalfiles = 0;
+uint64_t g_totalsize = 0;
+
+void
+gf_defrag_free_dir_dfmeta(struct dir_dfmeta *meta, int local_subvols_cnt)
+{
+ int i = 0;
+
+ if (meta) {
+ for (i = 0; i < local_subvols_cnt; i++) {
+ if (meta->equeue)
+ gf_dirent_free(&meta->equeue[i]);
+ if (meta->lfd && meta->lfd[i])
+ fd_unref(meta->lfd[i]);
+ }
+
+ GF_FREE(meta->equeue);
+ GF_FREE(meta->head);
+ GF_FREE(meta->iterator);
+ GF_FREE(meta->offset_var);
+ GF_FREE(meta->fetch_entries);
+ GF_FREE(meta->lfd);
+ GF_FREE(meta);
+ }
+}
+
+void
+gf_defrag_free_container(struct dht_container *container)
+{
+ if (container) {
+ gf_dirent_entry_free(container->df_entry);
+
+ if (container->parent_loc) {
+ loc_wipe(container->parent_loc);
+ }
+
+ GF_FREE(container->parent_loc);
+
+ GF_FREE(container);
+ }
+}
+
+void
+dht_set_global_defrag_error(gf_defrag_info_t *defrag, int ret)
+{
+ LOCK(&defrag->lock);
+ {
+ defrag->global_error = ret;
+ }
+ UNLOCK(&defrag->lock);
+ return;
+}
+
+static int
+dht_send_rebalance_event(xlator_t *this, int cmd, gf_defrag_status_t status)
+{
+ int ret = -1;
+ char *volname = NULL;
+ char *tmpstr = NULL;
+ char *ptr = NULL;
+ char *suffix = "-dht";
+ int len = 0;
+
+ eventtypes_t event = EVENT_LAST;
+
+ switch (status) {
+ case GF_DEFRAG_STATUS_COMPLETE:
+ event = EVENT_VOLUME_REBALANCE_COMPLETE;
+ break;
+ case GF_DEFRAG_STATUS_FAILED:
+ event = EVENT_VOLUME_REBALANCE_FAILED;
+ break;
+ case GF_DEFRAG_STATUS_STOPPED:
+ event = EVENT_VOLUME_REBALANCE_STOP;
+ break;
+ default:
+ break;
+ }
+
+ /* DHT volume */
+ len = strlen(this->name) - strlen(suffix);
+ tmpstr = gf_strdup(this->name);
+ if (tmpstr) {
+ ptr = tmpstr + len;
+ if (!strcmp(ptr, suffix)) {
+ tmpstr[len] = '\0';
+ volname = tmpstr;
+ }
+ }
+
+ if (!volname) {
+ /* Better than nothing */
+ volname = this->name;
+ }
+
+ if (event != EVENT_LAST) {
+ gf_event(event, "volume=%s", volname);
+ }
+
+ GF_FREE(tmpstr);
+ return ret;
+}
+
+static void
+dht_strip_out_acls(dict_t *dict)
+{
+ if (dict) {
+ dict_del(dict, "trusted.SGI_ACL_FILE");
+ dict_del(dict, POSIX_ACL_ACCESS_XATTR);
+ }
+}
+
+/*
+ return values:
+ -1 : failure
+ -2 : success
+
+Hard link migration is carried out in three stages.
+
+(Say there are n hardlinks)
+Stage 1: Setting the new hashed subvol information on the 1st hardlink
+ encountered (linkto setxattr)
+
+Stage 2: Creating hardlinks on new hashed subvol for the 2nd to (n-1)th
+ hardlink
+
+Stage 3: Physical migration of the data file for nth hardlink
+
+Why to deem "-2" as success and not "0":
+
+ dht_migrate_file expects return value "0" from _is_file_migratable if
+the file has to be migrated.
+
+ _is_file_migratable returns zero only when it is called with the
+flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS".
+
+ gf_defrag_handle_hardlink calls dht_migrate_file for physical migration
+of the data file with the flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS"
+
+Hence, gf_defrag_handle_hardlink returning "0" for success will force
+"dht_migrate_file" to migrate each of the hardlink which is not intended.
+
+For each of the three stage mentioned above "-2" will be returned and will
+be converted to "0" in dht_migrate_file.
+
+*/
+
+int32_t
+gf_defrag_handle_hardlink(xlator_t *this, loc_t *loc, int *fop_errno)
+{
+ int32_t ret = -1;
+ xlator_t *cached_subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *linkto_subvol = NULL;
+ data_t *data = NULL;
+ struct iatt iatt = {
+ 0,
+ };
+ int32_t op_errno = 0;
+ dht_conf_t *conf = NULL;
+ gf_loglevel_t loglevel = 0;
+ dict_t *link_xattr = NULL;
+ dict_t *dict = NULL;
+ dict_t *xattr_rsp = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+
+ *fop_errno = EINVAL;
+
+ GF_VALIDATE_OR_GOTO("defrag", loc, out);
+ GF_VALIDATE_OR_GOTO("defrag", loc->name, out);
+ GF_VALIDATE_OR_GOTO("defrag", this, out);
+ GF_VALIDATE_OR_GOTO("defrag", this->private, out);
+
+ conf = this->private;
+
+ if (gf_uuid_is_null(loc->pargfid)) {
+ gf_msg("", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "loc->pargfid is NULL for %s",
+ loc->path);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ if (gf_uuid_is_null(loc->gfid)) {
+ gf_msg("", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "loc->gfid is NULL for %s",
+ loc->path);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ link_xattr = dict_new();
+ if (!link_xattr) {
+ ret = -1;
+ *fop_errno = ENOMEM;
+ goto out;
+ }
+
+ /*
+ Parallel migration can lead to migration of the hard link multiple
+ times which can lead to data loss. Hence, adding a fresh lookup to
+ decide whether migration is required or not.
+
+ Elaborating the scenario for let say 10 hardlinks [link{1..10}]:
+ Let say the first hard link "link1" does the setxattr of the
+ new hashed subvolume info on the cached file. As there are multiple
+ threads working, we might have already all the links created on the
+ new hashed by the time we reach hardlink let say link5. Now the
+ number of links on hashed is equal to that of cached. Hence, file
+ migration will happen for link6.
+
+ Cached Hashed
+ --------T link6 rwxrwxrwx link6
+
+ Now post above state all the link file on the cached will be zero
+ byte linkto files. Hence, if we still do migration for the following
+ files link{7..10}, we will end up migrating 0 data leading to data
+ loss.
+ Hence, a lookup can make sure whether we need to migrate the
+ file or not.
+ */
+
+ dict = dict_new();
+ if (!dict) {
+ ret = -1;
+ *fop_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "could not allocate memory for dict");
+ goto out;
+ }
+
+ ret = dict_set_int32(dict, conf->link_xattr_name, 256);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to set 'linkto' key in dict",
+ loc->path);
+ goto out;
+ }
+
+ ret = syncop_lookup(this, loc, &stbuf, NULL, dict, &xattr_rsp);
+ if (ret) {
+ /*Ignore ENOENT and ESTALE as file might have been
+ migrated already*/
+ if (-ret == ENOENT || -ret == ESTALE) {
+ ret = -2;
+ goto out;
+ }
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:%s lookup failed with ret = %d", loc->path,
+ ret);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ cached_subvol = dht_subvol_get_cached(this, loc->inode);
+ if (!cached_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "Failed to get cached subvol"
+ " for %s on %s",
+ loc->name, this->name);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (!hashed_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "Failed to get hashed subvol"
+ " for %s on %s",
+ loc->name, this->name);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ /* Hardlink migration happens only with remove-brick. So this condition will
+ * be true only when the migration has happened. In case hardlinks are
+ * migrated for rebalance case, remove this check. Having this check here
+ * avoid redundant calls below*/
+ if (hashed_subvol == cached_subvol) {
+ ret = -2;
+ goto out;
+ }
+
+ gf_log(this->name, GF_LOG_INFO,
+ "Attempting to migrate hardlink %s "
+ "with gfid %s from %s -> %s",
+ loc->name, uuid_utoa(loc->gfid), cached_subvol->name,
+ hashed_subvol->name);
+
+ data = dict_get(xattr_rsp, conf->link_xattr_name);
+ /* set linkto on cached -> hashed if not present, else link it */
+ if (!data) {
+ ret = dict_set_str(link_xattr, conf->link_xattr_name,
+ hashed_subvol->name);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "Failed to set dictionary value:"
+ " key = %s for %s",
+ conf->link_xattr_name, loc->name);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_setxattr(cached_subvol, loc, link_xattr, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "Linkto setxattr failed %s -> %s",
+ cached_subvol->name, loc->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ gf_msg_debug(this->name, 0,
+ "hardlink target subvol created on %s "
+ ",cached %s, file %s",
+ hashed_subvol->name, cached_subvol->name, loc->path);
+
+ ret = -2;
+ goto out;
+ } else {
+ linkto_subvol = dht_linkfile_subvol(this, NULL, NULL, xattr_rsp);
+ if (!linkto_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_ERROR,
+ "Failed to get "
+ "linkto subvol for %s",
+ loc->name);
+ } else {
+ hashed_subvol = linkto_subvol;
+ }
+
+ ret = syncop_link(hashed_subvol, loc, loc, &iatt, NULL, NULL);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+
+ loglevel = (op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_ERROR;
+ gf_msg(this->name, loglevel, op_errno,
+ DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED,
+ "link of %s -> %s"
+ " failed on subvol %s",
+ loc->name, uuid_utoa(loc->gfid), hashed_subvol->name);
+ if (op_errno != EEXIST) {
+ *fop_errno = op_errno;
+ goto out;
+ }
+ } else {
+ gf_msg_debug(this->name, 0,
+ "syncop_link successful for"
+ " hardlink %s on subvol %s, cached %s",
+ loc->path, hashed_subvol->name, cached_subvol->name);
+ }
+ }
+
+ ret = syncop_lookup(hashed_subvol, loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :Failed lookup %s on %s ", loc->name,
+ hashed_subvol->name);
+
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* There is a race where on the target subvol for the hardlink
+ * (note: hash subvol for the hardlink might differ from this), some
+ * other client(non-rebalance) would have created a linkto file for that
+ * hardlink as part of lookup. So let say there are 10 hardlinks, on the
+ * 5th hardlink it self the hardlinks might have migrated. Now for
+ * (6..10th) hardlinks the cached and target would be same as the file
+ * has already migrated. Hence this check is needed */
+ if (cached_subvol == hashed_subvol) {
+ gf_msg_debug(this->name, 0,
+ "source %s and destination %s "
+ "for hardlink %s are same",
+ cached_subvol->name, hashed_subvol->name, loc->path);
+ ret = -2;
+ goto out;
+ }
+
+ if (iatt.ia_nlink == stbuf.ia_nlink) {
+ ret = dht_migrate_file(this, loc, cached_subvol, hashed_subvol,
+ GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS, fop_errno);
+ if (ret) {
+ goto out;
+ }
+ }
+ ret = -2;
+out:
+ if (link_xattr)
+ dict_unref(link_xattr);
+
+ if (xattr_rsp)
+ dict_unref(xattr_rsp);
+
+ if (dict)
+ dict_unref(dict);
+
+ return ret;
+}
+
+static int
+__check_file_has_hardlink(xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ dict_t *xattrs, int flags, gf_defrag_info_t *defrag,
+ dht_conf_t *conf, int *fop_errno)
+{
+ int ret = 0;
+
+ if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) {
+ ret = 0;
+ return ret;
+ }
+ if (stbuf->ia_nlink > 1) {
+ /* support for decomission */
+ if (flags == GF_DHT_MIGRATE_HARDLINK) {
+ synclock_lock(&conf->link_lock);
+ ret = gf_defrag_handle_hardlink(this, loc, fop_errno);
+ synclock_unlock(&conf->link_lock);
+ /*
+ Returning zero will force the file to be remigrated.
+ Checkout gf_defrag_handle_hardlink for more information.
+ */
+ if (ret && ret != -2) {
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to migrate file with link",
+ loc->path);
+ }
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migration skipped for:"
+ "%s: file has hardlinks",
+ loc->path);
+ *fop_errno = ENOTSUP;
+ ret = 1;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ return values
+ 0 : File will be migrated
+ -2 : File will not be migrated
+ (This is the return value from gf_defrag_handle_hardlink. Checkout
+ gf_defrag_handle_hardlink for description of "returning -2")
+ -1 : failure
+*/
+static int
+__is_file_migratable(xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ dict_t *xattrs, int flags, gf_defrag_info_t *defrag,
+ dht_conf_t *conf, int *fop_errno)
+{
+ int ret = -1;
+ int lock_count = 0;
+
+ if (IA_ISDIR(stbuf->ia_type)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: migrate-file called on directory",
+ loc->path);
+ *fop_errno = EISDIR;
+ ret = -1;
+ goto out;
+ }
+
+ if (!conf->lock_migration_enabled) {
+ ret = dict_get_int32(xattrs, GLUSTERFS_POSIXLK_COUNT, &lock_count);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: Unable to get lock count for file",
+ loc->path);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ if (lock_count) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: %s: File has locks."
+ " Skipping file migration",
+ loc->path);
+ *fop_errno = ENOTSUP;
+ ret = 1;
+ goto out;
+ }
+ }
+
+ /* Check if file has hardlink*/
+ ret = __check_file_has_hardlink(this, loc, stbuf, xattrs, flags, defrag,
+ conf, fop_errno);
+out:
+ return ret;
+}
+
+static int
+__dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from,
+ loc_t *loc, struct iatt *stbuf, fd_t **dst_fd,
+ int *fop_errno, int file_has_holes)
+{
+ int ret = -1;
+ int ret2 = -1;
+ fd_t *fd = NULL;
+ struct iatt new_stbuf = {
+ 0,
+ };
+ struct iatt check_stbuf = {
+ 0,
+ };
+ dht_conf_t *conf = NULL;
+ dict_t *dict = NULL;
+ dict_t *xdata = NULL;
+
+ conf = this->private;
+
+ dict = dict_new();
+ if (!dict) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "dictionary allocation failed for"
+ "path:%s",
+ loc->path);
+ goto out;
+ }
+ ret = dict_set_gfuuid(dict, "gfid-req", stbuf->ia_gfid, true);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: failed to set dictionary value: key = gfid-req", loc->path);
+ goto out;
+ }
+
+ ret = dict_set_str(dict, conf->link_xattr_name, from->name);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: failed to set dictionary value: key = %s ", loc->path,
+ conf->link_xattr_name);
+ goto out;
+ }
+
+ fd = fd_create(loc->inode, DHT_REBALANCE_PID);
+ if (!fd) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: fd create failed (destination)", loc->path);
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: dict_new failed)", loc->path);
+ goto out;
+ }
+
+ ret = dict_set_int32_sizen(xdata, GF_CLEAN_WRITE_PROTECTION, 1);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: failed to set dictionary value: key = %s ", loc->path,
+ GF_CLEAN_WRITE_PROTECTION);
+ goto out;
+ }
+
+ ret = syncop_lookup(to, loc, &new_stbuf, NULL, xdata, NULL);
+ if (!ret) {
+ /* File exits in the destination, check if gfid matches */
+ if (gf_uuid_compare(stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH,
+ "file %s exists in %s with different gfid", loc->path,
+ to->name);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+ }
+ if ((ret < 0) && (-ret != ENOENT)) {
+ /* File exists in destination, but not accessible */
+ gf_msg(THIS->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to lookup file", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* Create the destination with LINKFILE mode, and linkto xattr,
+ if the linkfile already exists, just open the file */
+ if (!ret) {
+ /*
+ * File already present, just open the file.
+ */
+ ret = syncop_open(to, loc, O_RDWR, fd, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to open %s on %s", loc->path, to->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = syncop_create(to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, &new_stbuf,
+ dict, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to create %s on %s", loc->path, to->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ }
+
+ fd_bind(fd);
+
+ /*Reason of doing lookup after create again:
+ *In the create, there is some time-gap between opening fd at the
+ *server (posix_layer) and binding it in server (incrementing fd count),
+ *so if in that time-gap, if other process sends unlink considering it
+ *as a linkto file, because inode->fd count will be 0, so file will be
+ *unlinked at the backend. And because further operations are performed
+ *on fd, so though migration will be done but will end with no file
+ *at the backend.
+ */
+
+ ret = syncop_lookup(to, loc, &check_stbuf, NULL, NULL, NULL);
+ if (!ret) {
+ if (gf_uuid_compare(stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH,
+ "file %s exists in %s with different gfid,"
+ "found in lookup after create",
+ loc->path, to->name);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+ }
+
+ if (-ret == ENOENT) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: file does not exist"
+ "on %s",
+ loc->path, to->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_fsetattr(to, fd, stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+ NULL, NULL, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "chown failed for %s on %s", loc->path, to->name);
+ }
+
+ /* No need to bother about 0 byte size files */
+ if (stbuf->ia_size > 0) {
+ if (conf->use_fallocate && !file_has_holes) {
+ ret = syncop_fallocate(to, fd, 0, 0, stbuf->ia_size, NULL, NULL);
+ if (ret < 0) {
+ if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -ENOSYS) {
+ conf->use_fallocate = _gf_false;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "fallocate failed for %s on %s", loc->path,
+ to->name);
+
+ *fop_errno = -ret;
+
+ /* fallocate does not release the space
+ * in some cases
+ */
+ ret2 = syncop_ftruncate(to, fd, 0, NULL, NULL, NULL, NULL);
+ if (ret2 < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret2,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "ftruncate failed for "
+ "%s on %s",
+ loc->path, to->name);
+ }
+ goto out;
+ }
+ }
+ } else {
+ ret = syncop_ftruncate(to, fd, stbuf->ia_size, NULL, NULL, NULL,
+ NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "ftruncate failed for %s on %s", loc->path, to->name);
+ }
+ }
+ }
+
+ /* success */
+ ret = 0;
+
+ if (dst_fd)
+ *dst_fd = fd;
+
+out:
+ if (ret) {
+ if (fd) {
+ fd_unref(fd);
+ }
+ }
+ if (dict)
+ dict_unref(dict);
+
+ if (xdata)
+ dict_unref(xdata);
+
+ return ret;
+}
+
+static int
+__dht_check_free_space(xlator_t *this, xlator_t *to, xlator_t *from, loc_t *loc,
+ struct iatt *stbuf, int flag, dht_conf_t *conf,
+ gf_boolean_t *target_changed, xlator_t **new_subvol,
+ int *fop_errno)
+{
+ struct statvfs src_statfs = {
+ 0,
+ };
+ struct statvfs dst_statfs = {
+ 0,
+ };
+ int ret = -1;
+ dict_t *xdata = NULL;
+ dht_layout_t *layout = NULL;
+ uint64_t src_statfs_blocks = 1;
+ uint64_t dst_statfs_blocks = 1;
+ double dst_post_availspacepercent = 0;
+ double src_post_availspacepercent = 0;
+ uint64_t file_blocks = 0;
+ uint64_t src_total_blocks = 0;
+ uint64_t dst_total_blocks = 0;
+
+ xdata = dict_new();
+ if (!xdata) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "failed to allocate dictionary");
+ goto out;
+ }
+
+ ret = dict_set_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict");
+ ret = -1;
+ *fop_errno = ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_statfs(from, loc, &src_statfs, xdata, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to get statfs of %s on %s", loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_statfs(to, loc, &dst_statfs, xdata, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to get statfs of %s on %s", loc->path, to->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ gf_msg_debug(this->name, 0,
+ "min_free_disk - %f , block available - %" PRId64
+ ", block size - %lu",
+ conf->min_free_disk, dst_statfs.f_bavail, dst_statfs.f_bsize);
+
+ dst_statfs_blocks = dst_statfs.f_bavail *
+ (dst_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+ src_statfs_blocks = src_statfs.f_bavail *
+ (src_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+ dst_total_blocks = dst_statfs.f_blocks *
+ (dst_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+ src_total_blocks = src_statfs.f_blocks *
+ (src_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+ /* if force option is given, do not check for space @ dst.
+ * Check only if space is avail for the file */
+ if (flag != GF_DHT_MIGRATE_DATA)
+ goto check_avail_space;
+
+ /* Check:
+ During rebalance `migrate-data` - Destination subvol experiences
+ a `reduction` in 'blocks' of free space, at the same time source
+ subvol gains certain 'blocks' of free space. A valid check is
+ necessary here to avoid erroneous move to destination where
+ the space could be scantily available.
+ With heterogeneous brick support, an actual space comparison could
+ prevent any files being migrated to newly added bricks if they are
+ smaller then the free space available on the existing bricks.
+ */
+ if (!conf->use_fallocate) {
+ file_blocks = stbuf->ia_size + GF_DISK_SECTOR_SIZE - 1;
+ file_blocks /= GF_DISK_SECTOR_SIZE;
+
+ if (file_blocks >= dst_statfs_blocks) {
+ dst_statfs_blocks = 0;
+ } else {
+ dst_statfs_blocks -= file_blocks;
+ }
+ }
+
+ src_post_availspacepercent = ((src_statfs_blocks + file_blocks) * 100) /
+ src_total_blocks;
+
+ dst_post_availspacepercent = (dst_statfs_blocks * 100) / dst_total_blocks;
+
+ if (dst_post_availspacepercent < src_post_availspacepercent) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "data movement of file "
+ "{blocks:%" PRIu64
+ " name:(%s)} would result in "
+ "dst node (%s:%" PRIu64
+ ") having lower disk "
+ "space than the source node (%s:%" PRIu64
+ ")"
+ ".Skipping file.",
+ stbuf->ia_blocks, loc->path, to->name, dst_statfs_blocks,
+ from->name, src_statfs_blocks);
+
+ /* this is not a 'failure', but we don't want to
+ consider this as 'success' too :-/ */
+ *fop_errno = ENOSPC;
+ ret = 1;
+ goto out;
+ }
+
+check_avail_space:
+ if (conf->disk_unit == 'p' && dst_statfs.f_blocks) {
+ dst_post_availspacepercent = (dst_statfs_blocks * 100) /
+ dst_total_blocks;
+
+ gf_msg_debug(this->name, 0,
+ "file : %s, post_availspacepercent"
+ " : %lf f_bavail : %" PRIu64 " min-free-disk: %lf",
+ loc->path, dst_post_availspacepercent, dst_statfs.f_bavail,
+ conf->min_free_disk);
+
+ if (dst_post_availspacepercent < conf->min_free_disk) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+ "Write will cross min-free-disk for "
+ "file - %s on subvol - %s. Looking "
+ "for new subvol",
+ loc->path, to->name);
+
+ goto find_new_subvol;
+ } else {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ if (conf->disk_unit != 'p') {
+ if ((dst_statfs_blocks * GF_DISK_SECTOR_SIZE) < conf->min_free_disk) {
+ gf_msg_debug(this->name, 0,
+ "file : %s, destination frsize: %lu "
+ "f_bavail : %" PRIu64 " min-free-disk: %lf",
+ loc->path, dst_statfs.f_frsize, dst_statfs.f_bavail,
+ conf->min_free_disk);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+ "write will"
+ " cross min-free-disk for file - %s on subvol -"
+ " %s. looking for new subvol",
+ loc->path, to->name);
+
+ goto find_new_subvol;
+
+ } else {
+ ret = 0;
+ goto out;
+ }
+ }
+
+find_new_subvol:
+ layout = dht_layout_get(this, loc->parent);
+ if (!layout) {
+ gf_log(this->name, GF_LOG_ERROR, "Layout is NULL");
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ *new_subvol = dht_subvol_with_free_space_inodes(this, to, from, layout,
+ stbuf->ia_size);
+ if ((!(*new_subvol)) || (*new_subvol == from)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE,
+ "Could not find any subvol"
+ " with space accommodating the file - %s. Consider "
+ "adding bricks",
+ loc->path);
+
+ *target_changed = _gf_false;
+ *fop_errno = ENOSPC;
+ ret = -1;
+ } else {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "new target found - %s"
+ " for file - %s",
+ (*new_subvol)->name, loc->path);
+ *target_changed = _gf_true;
+ ret = 0;
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ return ret;
+}
+
+static int
+__dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
+ xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst,
+ uint64_t ia_size, int hole_exists, int *fop_errno)
+{
+ int ret = 0;
+ int count = 0;
+ off_t offset = 0;
+ off_t data_offset = 0;
+ off_t hole_offset = 0;
+ struct iovec *vector = NULL;
+ struct iobref *iobref = NULL;
+ uint64_t total = 0;
+ size_t read_size = 0;
+ size_t data_block_size = 0;
+ dict_t *xdata = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ /* if file size is '0', no need to enter this loop */
+ while (total < ia_size) {
+ /* This is a regular file - read it sequentially */
+ if (!hole_exists) {
+ read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
+ ? DHT_REBALANCE_BLKSIZE
+ : (ia_size - total));
+ } else {
+ /* This is a sparse file - read only the data segments in the file
+ */
+
+ /* If the previous data block is fully copied, find the next data
+ * segment
+ * starting at the offset of the last read and written byte, */
+ if (data_block_size <= 0) {
+ ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
+ &data_offset);
+ if (ret) {
+ if (ret == -ENXIO)
+ ret = 0; /* No more data segments */
+ else
+ *fop_errno = -ret; /* Error occurred */
+
+ break;
+ }
+
+ /* If the position of the current data segment is greater than
+ * the position of the next hole, find the next hole in order to
+ * calculate the length of the new data segment */
+ if (data_offset > hole_offset) {
+ /* Starting at the offset of the last data segment, find the
+ * next hole */
+ ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
+ NULL, &hole_offset);
+ if (ret) {
+ /* If an error occurred here it's a real error because
+ * if the seek for a data segment was successful then
+ * necessarily another hole must exist (EOF is a hole)
+ */
+ *fop_errno = -ret;
+ break;
+ }
+
+ /* Calculate the total size of the current data block */
+ data_block_size = hole_offset - data_offset;
+ }
+ } else {
+ /* There is still data in the current segment, move the
+ * data_offset to the position of the last written byte */
+ data_offset = offset;
+ }
+
+ /* Calculate how much data needs to be read and written. If the data
+ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
+ * write DHT_REBALANCE_BLKSIZE data length and the rest in the
+ * next iteration(s) */
+ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
+ ? DHT_REBALANCE_BLKSIZE
+ : data_block_size);
+
+ /* Calculate the remaining size of the data block - maybe there's no
+ * need to seek for data in the next iteration */
+ data_block_size -= read_size;
+
+ /* Set offset to the offset of the data segment so read and write
+ * will have the correct position */
+ offset = data_offset;
+ }
+
+ ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
+ &iobref, NULL, NULL, NULL);
+
+ if (!ret || (ret < 0)) {
+ if (!ret) {
+ /* File was probably truncated*/
+ ret = -1;
+ *fop_errno = ENOSPC;
+ } else {
+ *fop_errno = -ret;
+ }
+ break;
+ }
+
+ if (!conf->force_migration) {
+ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata) {
+ gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "insufficient memory");
+ ret = -1;
+ *fop_errno = ENOMEM;
+ break;
+ }
+
+ /* Fail this write and abort rebalance if we
+ * detect a write from client since migration of
+ * this file started. This is done to avoid
+ * potential data corruption due to out of order
+ * writes from rebalance and client to the same
+ * region (as compared between src and dst
+ * files). See
+ * https://github.com/gluster/glusterfs/issues/308
+ * for more details.
+ */
+ ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
+ if (ret) {
+ gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
+ "failed to set dict");
+ ret = -1;
+ *fop_errno = ENOMEM;
+ break;
+ }
+ }
+ }
+
+ ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
+ NULL, xdata, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ break;
+ }
+
+ offset += ret;
+ total += ret;
+
+ GF_FREE(vector);
+ if (iobref)
+ iobref_unref(iobref);
+ iobref = NULL;
+ vector = NULL;
+ }
+ if (iobref)
+ iobref_unref(iobref);
+ GF_FREE(vector);
+
+ if (ret >= 0)
+ ret = 0;
+ else
+ ret = -1;
+
+ if (xdata) {
+ dict_unref(xdata);
+ }
+
+ return ret;
+}
+
+static int
+__dht_rebalance_open_src_file(xlator_t *this, xlator_t *from, xlator_t *to,
+ loc_t *loc, struct iatt *stbuf, fd_t **src_fd,
+ gf_boolean_t *clean_src, int *fop_errno)
+{
+ int ret = 0;
+ fd_t *fd = NULL;
+ dict_t *dict = NULL;
+ struct iatt iatt = {
+ 0,
+ };
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ *clean_src = _gf_false;
+
+ fd = fd_create(loc->inode, DHT_REBALANCE_PID);
+ if (!fd) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: fd create failed (source)", loc->path);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_open(from, loc, O_RDWR, fd, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to open file %s on %s", loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ fd_bind(fd);
+
+ if (src_fd)
+ *src_fd = fd;
+
+ ret = -1;
+ dict = dict_new();
+ if (!dict) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: Could not allocate memory for dict", loc->path);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str(dict, conf->link_xattr_name, to->name);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set xattr in dict for %s (linkto:%s)", loc->path,
+ to->name);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ /* Once the migration starts, the source should have 'linkto' key set
+ to show which is the target, so other clients can work around it */
+ ret = syncop_setxattr(from, loc, dict, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to set xattr on %s in %s", loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* Reset source mode/xattr if migration fails*/
+ *clean_src = _gf_true;
+
+ /* mode should be (+S+T) to indicate migration is in progress */
+ iatt.ia_prot = stbuf->ia_prot;
+ iatt.ia_type = stbuf->ia_type;
+ iatt.ia_prot.sticky = 1;
+ iatt.ia_prot.sgid = 1;
+
+ ret = syncop_setattr(from, loc, &iatt, GF_SET_ATTR_MODE, NULL, NULL, NULL,
+ NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to set mode on %s in %s", loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* success */
+ ret = 0;
+out:
+ if (dict)
+ dict_unref(dict);
+
+ return ret;
+}
+
+int
+migrate_special_files(xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
+ struct iatt *buf, int *fop_errno)
+{
+ int ret = -1;
+ dict_t *rsp_dict = NULL;
+ dict_t *dict = NULL;
+ char *link = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ dht_conf_t *conf = this->private;
+
+ dict = dict_new();
+ if (!dict) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_int32(dict, conf->link_xattr_name, 256);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s: failed to set 'linkto' key in dict", loc->path);
+ goto out;
+ }
+
+ /* check in the destination if the file is link file */
+ ret = syncop_lookup(to, loc, &stbuf, NULL, dict, &rsp_dict);
+ if ((ret < 0) && (-ret != ENOENT)) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: lookup failed", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* we no more require this key */
+ dict_del(dict, conf->link_xattr_name);
+
+ /* file exists in target node, only if it is 'linkfile' its valid,
+ otherwise, error out */
+ if (!ret) {
+ if (!check_is_linkfile(loc->inode, &stbuf, rsp_dict,
+ conf->link_xattr_name)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: file exists in destination", loc->path);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ /* as file is linkfile, delete it */
+ ret = syncop_unlink(to, loc, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to delete the linkfile", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* Set the gfid of the source file in dict */
+ ret = dict_set_gfuuid(dict, "gfid-req", buf->ia_gfid, true);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s: failed to set gfid in dict for create", loc->path);
+ goto out;
+ }
+
+ /* Create the file in target */
+ if (IA_ISLNK(buf->ia_type)) {
+ /* Handle symlinks separately */
+ ret = syncop_readlink(from, loc, &link, buf->ia_size, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: readlink on symlink failed", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_symlink(to, loc, link, 0, dict, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED, "%s: creating symlink failed",
+ loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ goto done;
+ }
+
+ ret = syncop_mknod(to, loc, st_mode_from_ia(buf->ia_prot, buf->ia_type),
+ makedev(ia_major(buf->ia_rdev), ia_minor(buf->ia_rdev)),
+ 0, dict, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: mknod failed", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+done:
+ ret = syncop_setattr(to, loc, buf,
+ (GF_SET_ATTR_MTIME | GF_SET_ATTR_UID |
+ GF_SET_ATTR_GID | GF_SET_ATTR_MODE),
+ NULL, NULL, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to perform setattr on %s", loc->path, to->name);
+ *fop_errno = -ret;
+ }
+
+ ret = syncop_unlink(from, loc, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: unlink failed", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ }
+
+out:
+ GF_FREE(link);
+ if (dict)
+ dict_unref(dict);
+
+ if (rsp_dict)
+ dict_unref(rsp_dict);
+
+ return ret;
+}
+
+static int
+__dht_migration_cleanup_src_file(xlator_t *this, loc_t *loc, fd_t *fd,
+ xlator_t *from, ia_prot_t *src_ia_prot)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ struct iatt new_stbuf = {
+ 0,
+ };
+
+ if (!this || !fd || !from || !src_ia_prot) {
+ goto out;
+ }
+
+ conf = this->private;
+
+ /*Revert source mode and xattr changes*/
+ ret = syncop_fstat(from, fd, &new_stbuf, NULL, NULL);
+ if (ret < 0) {
+ /* Failed to get the stat info */
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file cleanup failed: failed to fstat "
+ "file %s on %s ",
+ loc->path, from->name);
+ ret = -1;
+ goto out;
+ }
+
+ /* Remove the sticky bit and sgid bit set, reset it to 0*/
+ if (!src_ia_prot->sticky)
+ new_stbuf.ia_prot.sticky = 0;
+
+ if (!src_ia_prot->sgid)
+ new_stbuf.ia_prot.sgid = 0;
+
+ ret = syncop_fsetattr(from, fd, &new_stbuf,
+ (GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, NULL,
+ NULL, NULL);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file cleanup failed:"
+ "%s: failed to perform fsetattr on %s ",
+ loc->path, from->name);
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_fremovexattr(from, fd, conf->link_xattr_name, 0, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "%s: failed to remove linkto xattr on %s (%s)", loc->path,
+ from->name, strerror(-ret));
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ return values:
+
+ -1 : failure
+ 0 : successfully migrated data
+ 1 : not a failure, but we can't migrate data as of now
+*/
+int
+dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
+ int flag, int *fop_errno)
+{
+ int ret = -1;
+ struct iatt new_stbuf = {
+ 0,
+ };
+ struct iatt stbuf = {
+ 0,
+ };
+ struct iatt empty_iatt = {
+ 0,
+ };
+ ia_prot_t src_ia_prot = {
+ 0,
+ };
+ fd_t *src_fd = NULL;
+ fd_t *dst_fd = NULL;
+ dict_t *dict = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xattr_rsp = NULL;
+ int file_has_holes = 0;
+ dht_conf_t *conf = this->private;
+ int rcvd_enoent_from_src = 0;
+ struct gf_flock flock = {
+ 0,
+ };
+ struct gf_flock plock = {
+ 0,
+ };
+ loc_t tmp_loc = {
+ 0,
+ };
+ loc_t parent_loc = {
+ 0,
+ };
+ gf_boolean_t inodelk_locked = _gf_false;
+ gf_boolean_t entrylk_locked = _gf_false;
+ gf_boolean_t p_locked = _gf_false;
+ int lk_ret = -1;
+ gf_defrag_info_t *defrag = NULL;
+ gf_boolean_t clean_src = _gf_false;
+ gf_boolean_t clean_dst = _gf_false;
+ int log_level = GF_LOG_INFO;
+ gf_boolean_t delete_src_linkto = _gf_true;
+ lock_migration_info_t locklist;
+ dict_t *meta_dict = NULL;
+ gf_boolean_t meta_locked = _gf_false;
+ gf_boolean_t target_changed = _gf_false;
+ xlator_t *new_target = NULL;
+ xlator_t *old_target = NULL;
+ xlator_t *hashed_subvol = NULL;
+ fd_t *linkto_fd = NULL;
+ dict_t *xdata = NULL;
+
+ if (from == to) {
+ gf_msg_debug(this->name, 0,
+ "destination and source are same. file %s"
+ " might have migrated already",
+ loc->path);
+ ret = 0;
+ goto out;
+ }
+
+ gf_log(this->name, log_level, "%s: attempting to move from %s to %s",
+ loc->path, from->name, to->name);
+
+ dict = dict_new();
+ if (!dict) {
+ ret = -1;
+ *fop_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "Could not allocate memory for dict");
+ goto out;
+ }
+ ret = dict_set_int32(dict, conf->link_xattr_name, 256);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to set 'linkto' key in dict",
+ loc->path);
+ goto out;
+ }
+
+ /* Do not migrate file in case lock migration is not enabled on the
+ * volume*/
+ if (!conf->lock_migration_enabled) {
+ ret = dict_set_int32(dict, GLUSTERFS_POSIXLK_COUNT, sizeof(int32_t));
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: %s: failed to "
+ "set " GLUSTERFS_POSIXLK_COUNT " key in dict",
+ loc->path);
+ goto out;
+ }
+ } else {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "locks will be migrated"
+ " for file: %s",
+ loc->path);
+ }
+
+ /* The file is locked to prevent a rename during a migration. Renames
+ * and migrations on the file at the same time can lead to data loss.
+ */
+
+ ret = dht_build_parent_loc(this, &parent_loc, loc, fop_errno);
+ if (ret < 0) {
+ ret = -1;
+ gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to build parent loc, which is needed to "
+ "acquire entrylk to synchronize with renames on this "
+ "path. Skipping migration",
+ loc->path);
+ goto out;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (hashed_subvol == NULL) {
+ ret = -1;
+ gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: cannot find hashed subvol which is needed to "
+ "synchronize with renames on this path. "
+ "Skipping migration",
+ loc->path);
+ goto out;
+ }
+
+ flock.l_type = F_WRLCK;
+
+ tmp_loc.inode = inode_ref(loc->inode);
+ gf_uuid_copy(tmp_loc.gfid, loc->gfid);
+ tmp_loc.path = gf_strdup(loc->path);
+
+ /* this inodelk happens with flock.owner being zero. But to synchronize
+ * hardlink migration we need to have different lkowner for each migration
+ * Filed a bug here: https://bugzilla.redhat.com/show_bug.cgi?id=1468202 to
+ * track the fix for this. Currently synclock takes care of synchronizing
+ * hardlink migration. Once this bug is fixed we can avoid taking synclock
+ */
+ ret = syncop_inodelk(from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, F_SETLKW,
+ &flock, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "migrate file failed: "
+ "%s: failed to lock file on %s",
+ loc->path, from->name);
+ goto out;
+ }
+
+ inodelk_locked = _gf_true;
+
+ /* dht_rename has changed to use entrylk on hashed subvol for
+ * synchronization. So, rebalance too has to acquire an entrylk on
+ * hashed subvol.
+ */
+ ret = syncop_entrylk(hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, &parent_loc,
+ loc->name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to acquire entrylk on subvol %s", loc->path,
+ hashed_subvol->name);
+ goto out;
+ }
+
+ entrylk_locked = _gf_true;
+
+ /* Phase 1 - Data migration is in progress from now on */
+ ret = syncop_lookup(from, loc, &stbuf, NULL, dict, &xattr_rsp);
+ if (ret) {
+ *fop_errno = -ret;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: lookup failed on %s",
+ loc->path, from->name);
+ goto out;
+ }
+
+ /* preserve source mode, so set the same to the destination */
+ src_ia_prot = stbuf.ia_prot;
+
+ /* Check if file can be migrated */
+ ret = __is_file_migratable(this, loc, &stbuf, xattr_rsp, flag, defrag, conf,
+ fop_errno);
+ if (ret) {
+ if (ret == HARDLINK_MIG_INPROGRESS)
+ ret = 0;
+ goto out;
+ }
+
+ /* Take care of the special files */
+ if (!IA_ISREG(stbuf.ia_type)) {
+ /* Special files */
+ ret = migrate_special_files(this, from, to, loc, &stbuf, fop_errno);
+ goto out;
+ }
+
+ /* Try to preserve 'holes' while migrating data */
+ if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE))
+ file_has_holes = 1;
+
+ /* create the destination, with required modes/xattr */
+ ret = __dht_rebalance_create_dst_file(this, to, from, loc, &stbuf, &dst_fd,
+ fop_errno, file_has_holes);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+ "Create dst failed"
+ " on - %s for file - %s",
+ to->name, loc->path);
+ goto out;
+ }
+
+ clean_dst = _gf_true;
+
+ ret = __dht_check_free_space(this, to, from, loc, &stbuf, flag, conf,
+ &target_changed, &new_target, fop_errno);
+ if (target_changed) {
+ /* Can't handle for hardlinks. Marking this as failure */
+ if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_INSUFF_SPACE,
+ "Exiting migration for"
+ " file - %s. flag - %d, stbuf.ia_nlink - %d",
+ loc->path, flag, stbuf.ia_nlink);
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "%s: failed to perform truncate on %s (%s)", loc->path,
+ to->name, strerror(-ret));
+ }
+
+ syncop_close(dst_fd);
+ dst_fd = NULL;
+
+ old_target = to;
+ to = new_target;
+
+ clean_dst = _gf_false;
+
+ /* if the file migration is successful to this new target, then
+ * update the xattr on the old destination to point the new
+ * destination. We need to do update this only post migration
+ * as in case of failure the linkto needs to point to the source
+ * subvol */
+ ret = __dht_rebalance_create_dst_file(
+ this, to, from, loc, &stbuf, &dst_fd, fop_errno, file_has_holes);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Create dst failed"
+ " on - %s for file - %s",
+ to->name, loc->path);
+ goto out;
+ } else {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "destination for file "
+ "- %s is changed to - %s",
+ loc->path, to->name);
+ clean_dst = _gf_true;
+ }
+ }
+
+ if (ret) {
+ goto out;
+ }
+
+ /* Open the source, and also update mode/xattr */
+ ret = __dht_rebalance_open_src_file(this, from, to, loc, &stbuf, &src_fd,
+ &clean_src, fop_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: failed to open %s on %s", loc->path,
+ from->name);
+ goto out;
+ }
+
+ /* TODO: move all xattr related operations to fd based operations */
+ ret = syncop_listxattr(from, loc, &xattr, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to get xattr from %s",
+ loc->path, from->name);
+ ret = -1;
+ goto out;
+ }
+
+ /* Copying posix acls to the linkto file messes up the permissions*/
+ dht_strip_out_acls(xattr);
+
+ /* Remove the linkto xattr as we don't want to overwrite the value
+ * set on the dst.
+ */
+ dict_del(xattr, conf->link_xattr_name);
+
+ /* We need to error out if this fails as having the wrong shard xattrs
+ * set on the dst could cause data corruption
+ */
+ ret = syncop_fsetxattr(to, dst_fd, xattr, 0, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to set xattr on %s", loc->path, to->name);
+ ret = -1;
+ goto out;
+ }
+
+ if (xattr_rsp) {
+ /* we no more require this key */
+ dict_del(dict, conf->link_xattr_name);
+ dict_unref(xattr_rsp);
+ }
+
+ ret = syncop_fstat(from, src_fd, &stbuf, dict, &xattr_rsp);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:failed to lookup %s on %s ", loc->path,
+ from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* Check again if file has hardlink */
+ ret = __check_file_has_hardlink(this, loc, &stbuf, xattr_rsp, flag, defrag,
+ conf, fop_errno);
+ if (ret) {
+ if (ret == HARDLINK_MIG_INPROGRESS)
+ ret = 0;
+ goto out;
+ }
+
+ ret = __dht_rebalance_migrate_data(this, defrag, from, to, src_fd, dst_fd,
+ stbuf.ia_size, file_has_holes,
+ fop_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: %s: failed to migrate data", loc->path);
+
+ ret = -1;
+ goto out;
+ }
+
+ /* TODO: Sync the locks */
+
+ xdata = dict_new();
+ if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s: failed to set last-fsync flag on "
+ "%s (%s)",
+ loc->path, to->name, strerror(ENOMEM));
+ }
+
+ ret = syncop_fsync(to, dst_fd, 0, NULL, NULL, xdata, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "%s: failed to fsync on %s (%s)",
+ loc->path, to->name, strerror(-ret));
+ *fop_errno = -ret;
+ }
+
+ /* Phase 2 - Data-Migration Complete, Housekeeping updates pending */
+
+ ret = syncop_fstat(from, src_fd, &new_stbuf, NULL, NULL);
+ if (ret < 0) {
+ /* Failed to get the stat info */
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: failed to fstat file %s on %s ", loc->path,
+ from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* Lock the entire source file to prevent clients from taking a
+ lock on it as dht_lk does not handle file migration.
+
+ This still leaves a small window where conflicting locks can
+ be granted to different clients. If client1 requests a blocking
+ lock on the src file, it will be granted after the migrating
+ process releases its lock. If client2 requests a lock on the dst
+ data file, it will also be granted, but all FOPs will be redirected
+ to the dst data file.
+ */
+
+ /* Take meta lock */
+
+ if (conf->lock_migration_enabled) {
+ meta_dict = dict_new();
+ if (!meta_dict) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "dict_new failed");
+
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str(meta_dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s,"
+ " path = %s",
+ GLUSTERFS_INTERNAL_FOP_KEY, loc->path);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_int32(meta_dict, GF_META_LOCK_KEY, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Trace dict_set failed");
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_setxattr(from, loc, meta_dict, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Trace syncop_setxattr metalock failed");
+
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ } else {
+ meta_locked = _gf_true;
+ }
+ }
+
+ if (!conf->lock_migration_enabled) {
+ plock.l_type = F_WRLCK;
+ plock.l_start = 0;
+ plock.l_len = 0;
+ plock.l_whence = SEEK_SET;
+
+ ret = syncop_lk(from, src_fd, F_SETLK, &plock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: Failed to lock on %s",
+ loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ p_locked = _gf_true;
+
+ } else {
+ INIT_LIST_HEAD(&locklist.list);
+
+ ret = syncop_getactivelk(from, loc, &locklist, NULL, NULL);
+ if (ret == 0) {
+ gf_log(this->name, GF_LOG_INFO, "No active locks on:%s", loc->path);
+
+ } else if (ret > 0) {
+ ret = syncop_setactivelk(to, loc, &locklist, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_LOCK_MIGRATION_FAILED, "write lock failed on:%s",
+ loc->path);
+
+ *fop_errno = -ret;
+ ret = -1;
+ goto metaunlock;
+ }
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_LOCK_MIGRATION_FAILED,
+ "getactivelk failed for file: %s", loc->path);
+ *fop_errno = -ret;
+ }
+ }
+
+ /* source would have both sticky bit and sgid bit set, reset it to 0,
+ and set the source permission on destination, if it was not set
+ prior to setting rebalance-modes in source */
+ if (!src_ia_prot.sticky)
+ new_stbuf.ia_prot.sticky = 0;
+
+ if (!src_ia_prot.sgid)
+ new_stbuf.ia_prot.sgid = 0;
+
+ /* TODO: if the source actually had sticky bit, or sgid bit set,
+ we are not handling it */
+
+ ret = syncop_fsetattr(
+ to, dst_fd, &new_stbuf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, NULL,
+ NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to perform setattr on %s ",
+ loc->path, to->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto metaunlock;
+ }
+
+ /* Because 'futimes' is not portable */
+ ret = syncop_setattr(to, loc, &new_stbuf,
+ (GF_SET_ATTR_MTIME | GF_SET_ATTR_ATIME), NULL, NULL,
+ NULL, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "%s: failed to perform setattr on %s ", loc->path, to->name);
+ *fop_errno = -ret;
+ }
+
+ if (target_changed) {
+ dict_del(dict, GLUSTERFS_POSIXLK_COUNT);
+ ret = dict_set_str(dict, conf->link_xattr_name, to->name);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set xattr in dict for %s (linkto:%s)", loc->path,
+ to->name);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_setxattr(old_target, loc, dict, 0, NULL, NULL);
+ if (ret && -ret != ESTALE && -ret != ENOENT) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to set xattr on %s in %s", loc->path,
+ old_target->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ } else if (-ret == ESTALE || -ret == ENOENT) {
+ /* The failure ESTALE indicates that the linkto
+ * file on the hashed subvol might have been deleted.
+ * In this case will create a linkto file with new target
+ * as linkto xattr value*/
+ linkto_fd = fd_create(loc->inode, DHT_REBALANCE_PID);
+ if (!linkto_fd) {
+ gf_msg(this->name, GF_LOG_ERROR, errno,
+ DHT_MSG_MIGRATE_FILE_FAILED, "%s: fd create failed",
+ loc->path);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+ ret = syncop_create(old_target, loc, O_RDWR, DHT_LINKFILE_MODE,
+ linkto_fd, NULL, dict, NULL);
+ if (ret != 0 && -ret != EEXIST && -ret != ESTALE) {
+ *fop_errno = -ret;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to create linkto file on %s in %s", loc->path,
+ old_target->name);
+ goto out;
+ } else if (ret == 0) {
+ ret = syncop_fsetattr(old_target, linkto_fd, &stbuf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL,
+ NULL, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "chown failed for %s on %s", loc->path,
+ old_target->name);
+ }
+ }
+ }
+ }
+
+ clean_dst = _gf_false;
+
+ /* Posix acls are not set on DHT linkto files as part of the initial
+ * initial xattrs set on the dst file, so these need
+ * to be set on the dst file after the linkto attrs are removed.
+ * TODO: Optimize this.
+ */
+ if (xattr) {
+ dict_unref(xattr);
+ xattr = NULL;
+ }
+
+ /* Set only the Posix ACLs this time */
+ ret = syncop_getxattr(from, loc, &xattr, POSIX_ACL_ACCESS_XATTR, NULL,
+ NULL);
+ if (ret < 0) {
+ if ((-ret != ENODATA) && (-ret != ENOATTR)) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to get xattr from %s",
+ loc->path, from->name);
+ *fop_errno = -ret;
+ }
+ } else {
+ ret = syncop_setxattr(to, loc, xattr, 0, NULL, NULL);
+ if (ret < 0) {
+ /* Potential problem here where Posix ACLs will
+ * not be set on the target file */
+
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to set xattr on %s",
+ loc->path, to->name);
+ *fop_errno = -ret;
+ }
+ }
+
+ /* The src file is being unlinked after this so we don't need
+ to clean it up */
+ clean_src = _gf_false;
+
+ /* Make the source as a linkfile first before deleting it */
+ empty_iatt.ia_prot.sticky = 1;
+ ret = syncop_fsetattr(from, src_fd, &empty_iatt, GF_SET_ATTR_MODE, NULL,
+ NULL, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to perform setattr on %s ",
+ loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto metaunlock;
+ }
+
+ /* Free up the data blocks on the source node, as the whole
+ file is migrated */
+ ret = syncop_ftruncate(from, src_fd, 0, NULL, NULL, NULL, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "%s: failed to perform truncate on %s (%s)", loc->path,
+ from->name, strerror(-ret));
+ *fop_errno = -ret;
+ }
+
+ /* remove the 'linkto' xattr from the destination */
+ ret = syncop_fremovexattr(to, dst_fd, conf->link_xattr_name, 0, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "%s: failed to perform removexattr on %s (%s)", loc->path,
+ to->name, strerror(-ret));
+ *fop_errno = -ret;
+ }
+
+ /* Do a stat and check the gfid before unlink */
+
+ /*
+ * Cached file changes its state from non-linkto to linkto file after
+ * migrating data. If lookup from any other mount-point is performed,
+ * converted-linkto-cached file will be treated as a stale and will be
+ * unlinked. But by this time, file is already migrated. So further
+ * failure because of ENOENT should not be treated as error
+ */
+
+ ret = syncop_stat(from, loc, &empty_iatt, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to do a stat on %s", loc->path, from->name);
+
+ if (-ret != ENOENT) {
+ *fop_errno = -ret;
+ ret = -1;
+ goto metaunlock;
+ }
+
+ rcvd_enoent_from_src = 1;
+ }
+
+ if ((gf_uuid_compare(empty_iatt.ia_gfid, loc->gfid) == 0) &&
+ (!rcvd_enoent_from_src) && delete_src_linkto) {
+ /* take out the source from namespace */
+ ret = syncop_unlink(from, loc, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to perform unlink on %s", loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto metaunlock;
+ }
+ }
+
+ ret = syncop_lookup(this, loc, NULL, NULL, NULL, NULL);
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "%s: failed to lookup the file on subvolumes", loc->path);
+ *fop_errno = -ret;
+ }
+
+ gf_msg(this->name, log_level, 0, DHT_MSG_MIGRATE_FILE_COMPLETE,
+ "completed migration of %s from subvolume %s to %s", loc->path,
+ from->name, to->name);
+
+ ret = 0;
+
+metaunlock:
+
+ if (conf->lock_migration_enabled && meta_locked) {
+ dict_del(meta_dict, GF_META_LOCK_KEY);
+
+ ret = dict_set_int32(meta_dict, GF_META_UNLOCK_KEY, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Trace dict_set failed");
+
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ if (clean_dst == _gf_false)
+ ret = dict_set_int32(meta_dict, "status", 1);
+ else
+ ret = dict_set_int32(meta_dict, "status", 0);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Trace dict_set failed");
+
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_setxattr(from, loc, meta_dict, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Trace syncop_setxattr meta unlock failed");
+
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ }
+
+out:
+ if (clean_src) {
+ /* Revert source mode and xattr changes*/
+ lk_ret = __dht_migration_cleanup_src_file(this, loc, src_fd, from,
+ &src_ia_prot);
+ if (lk_ret) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to cleanup source file on %s", loc->path,
+ from->name);
+ }
+ }
+
+ /* reset the destination back to 0 */
+ if (clean_dst) {
+ lk_ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL);
+ if (lk_ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -lk_ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: "
+ "%s: failed to reset target size back to 0",
+ loc->path);
+ }
+ }
+
+ if (inodelk_locked) {
+ flock.l_type = F_UNLCK;
+
+ lk_ret = syncop_inodelk(from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc,
+ F_SETLK, &flock, NULL, NULL);
+ if (lk_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -lk_ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to unlock file on %s", loc->path, from->name);
+ }
+ }
+
+ if (entrylk_locked) {
+ lk_ret = syncop_entrylk(hashed_subvol, DHT_ENTRY_SYNC_DOMAIN,
+ &parent_loc, loc->name, ENTRYLK_UNLOCK,
+ ENTRYLK_UNLOCK, NULL, NULL);
+ if (lk_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -lk_ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to unlock entrylk on %s", loc->path,
+ hashed_subvol->name);
+ }
+ }
+
+ if (p_locked) {
+ plock.l_type = F_UNLCK;
+ lk_ret = syncop_lk(from, src_fd, F_SETLK, &plock, NULL, NULL);
+
+ if (lk_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -lk_ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to unlock file on %s", loc->path, from->name);
+ }
+ }
+
+ lk_ret = syncop_removexattr(to, loc, GF_PROTECT_FROM_EXTERNAL_WRITES, NULL,
+ NULL);
+ if (lk_ret && (lk_ret != -ENODATA) && (lk_ret != -ENOATTR)) {
+ gf_msg(this->name, GF_LOG_WARNING, -lk_ret, 0,
+ "%s: removexattr failed key %s", loc->path,
+ GF_PROTECT_FROM_EXTERNAL_WRITES);
+ }
+
+ if (dict)
+ dict_unref(dict);
+
+ if (xattr)
+ dict_unref(xattr);
+ if (xattr_rsp)
+ dict_unref(xattr_rsp);
+
+ if (dst_fd)
+ syncop_close(dst_fd);
+
+ if (src_fd)
+ syncop_close(src_fd);
+ if (linkto_fd)
+ syncop_close(linkto_fd);
+
+ if (xdata)
+ dict_unref(xdata);
+
+ loc_wipe(&tmp_loc);
+ loc_wipe(&parent_loc);
+
+ return ret;
+}
+
+static int
+rebalance_task(void *data)
+{
+ int ret = -1;
+ dht_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int fop_errno = 0;
+
+ frame = data;
+
+ local = frame->local;
+
+ /* This function is 'synchrounous', hence if it returns,
+ we are done with the task */
+ ret = dht_migrate_file(THIS, &local->loc, local->rebalance.from_subvol,
+ local->rebalance.target_node, local->flags,
+ &fop_errno);
+
+ return ret;
+}
+
+static int
+rebalance_task_completion(int op_ret, call_frame_t *sync_frame, void *data)
+{
+ int32_t op_errno = EINVAL;
+
+ if (op_ret == -1) {
+ /* Failure of migration process, mostly due to write process.
+ as we can't preserve the exact errno, lets say there was
+ no space to migrate-data
+ */
+ op_errno = ENOSPC;
+ } else if (op_ret == 1) {
+ /* migration didn't happen, but is not a failure, let the user
+ understand that he doesn't have permission to migrate the
+ file.
+ */
+ op_ret = -1;
+ op_errno = EPERM;
+ } else if (op_ret != 0) {
+ op_errno = -op_ret;
+ op_ret = -1;
+ }
+
+ DHT_STACK_UNWIND(setxattr, sync_frame, op_ret, op_errno, NULL);
+ return 0;
+}
+
+int
+dht_start_rebalance_task(xlator_t *this, call_frame_t *frame)
+{
+ int ret = -1;
+
+ ret = synctask_new(this->ctx->env, rebalance_task,
+ rebalance_task_completion, frame, frame);
+ return ret;
+}
+
+int
+gf_listener_stop(xlator_t *this)
+{
+ glusterfs_ctx_t *ctx = NULL;
+ cmd_args_t *cmd_args = NULL;
+ int ret = 0;
+
+ ctx = this->ctx;
+ GF_ASSERT(ctx);
+ cmd_args = &ctx->cmd_args;
+ if (cmd_args->sock_file) {
+ ret = sys_unlink(cmd_args->sock_file);
+ if (ret && (ENOENT == errno)) {
+ ret = 0;
+ }
+ }
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, errno, DHT_MSG_SOCKET_ERROR,
+ "Failed to unlink listener "
+ "socket %s",
+ cmd_args->sock_file);
+ }
+ return ret;
+}
+
+void
+dht_build_root_inode(xlator_t *this, inode_t **inode)
+{
+ inode_table_t *itable = NULL;
+ static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+
+ itable = inode_table_new(0, this);
+ if (!itable)
+ return;
+
+ *inode = inode_find(itable, root_gfid);
+}
+
+void
+dht_build_root_loc(inode_t *inode, loc_t *loc)
+{
+ loc->path = "/";
+ loc->inode = inode;
+ loc->inode->ia_type = IA_IFDIR;
+ memset(loc->gfid, 0, 16);
+ loc->gfid[15] = 1;
+}
+
+/* return values: 1 -> error, bug ignore and continue
+ 0 -> proceed
+ -1 -> error, handle it */
+int32_t
+gf_defrag_handle_migrate_error(int32_t op_errno, gf_defrag_info_t *defrag)
+{
+ int ret = 0;
+ /* if errno is not ENOTCONN, we can still continue
+ with rebalance process */
+ if (op_errno != ENOTCONN) {
+ ret = 1;
+ goto out;
+ }
+
+ if (op_errno == ENOTCONN) {
+ /* Most probably mount point went missing (mostly due
+ to a brick down), say rebalance failure to user,
+ let him restart it if everything is fine */
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+static gf_boolean_t
+gf_defrag_pattern_match(gf_defrag_info_t *defrag, char *name, uint64_t size)
+{
+ gf_defrag_pattern_list_t *trav = NULL;
+ gf_boolean_t match = _gf_false;
+ gf_boolean_t ret = _gf_false;
+
+ GF_VALIDATE_OR_GOTO("dht", defrag, out);
+
+ trav = defrag->defrag_pattern;
+ while (trav) {
+ if (!fnmatch(trav->path_pattern, name, FNM_NOESCAPE)) {
+ match = _gf_true;
+ break;
+ }
+ trav = trav->next;
+ }
+
+ if ((match == _gf_true) && (size >= trav->size))
+ ret = _gf_true;
+
+out:
+ return ret;
+}
+
+int
+dht_dfreaddirp_done(dht_dfoffset_ctx_t *offset_var, int cnt)
+{
+ int i;
+ int result = 1;
+
+ for (i = 0; i < cnt; i++) {
+ if (offset_var[i].readdir_done == 0) {
+ result = 0;
+ break;
+ }
+ }
+ return result;
+}
+
+int static gf_defrag_ctx_subvols_init(dht_dfoffset_ctx_t *offset_var,
+ xlator_t *this)
+{
+ int i;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf)
+ return -1;
+
+ for (i = 0; i < conf->local_subvols_cnt; i++) {
+ offset_var[i].this = conf->local_subvols[i];
+ offset_var[i].offset = (off_t)0;
+ offset_var[i].readdir_done = 0;
+ }
+
+ return 0;
+}
+
+static int
+dht_get_first_non_null_index(subvol_nodeuuids_info_t *entry)
+{
+ int i = 0;
+ int index = 0;
+
+ for (i = 0; i < entry->count; i++) {
+ if (!gf_uuid_is_null(entry->elements[i].uuid)) {
+ index = i;
+ goto out;
+ }
+ }
+
+ if (i == entry->count) {
+ index = -1;
+ }
+out:
+ return index;
+}
+
+/* Return value
+ * 0 : this node does not migrate the file
+ * 1 : this node migrates the file
+ *
+ * Use the hash value of the gfid to determine which node will migrate files.
+ * Using the gfid instead of the name also ensures that the same node handles
+ * all hardlinks.
+ */
+
+gf_boolean_t
+gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid)
+{
+ gf_boolean_t ret = _gf_false;
+ int i = local_subvol_index;
+ char *str = NULL;
+ uint32_t hashval = 0;
+ int32_t index = 0;
+ dht_conf_t *conf = NULL;
+ char buf[UUID_CANONICAL_FORM_LEN + 1] = {
+ 0,
+ };
+ subvol_nodeuuids_info_t *entry = NULL;
+
+ conf = this->private;
+
+ /* Pure distribute. A subvol in this case
+ will be handled by only one node */
+
+ entry = &(conf->local_nodeuuids[i]);
+ if (entry->count == 1) {
+ return 1;
+ }
+
+ str = uuid_utoa_r(gfid, buf);
+ if (dht_hash_compute(this, 0, str, &hashval) == 0) {
+ index = (hashval % entry->count);
+ if (entry->elements[index].info == REBAL_NODEUUID_MINE) {
+ /* Index matches this node's nodeuuid.*/
+ ret = _gf_true;
+ goto out;
+ }
+
+ /* Brick down - some other node has to migrate these files*/
+ if (gf_uuid_is_null(entry->elements[index].uuid)) {
+ /* Fall back to the first non-null index */
+ index = dht_get_first_non_null_index(entry);
+
+ if (index == -1) {
+ /* None of the bricks in the subvol are up.
+ * CHILD_DOWN will kill the process soon */
+
+ return _gf_false;
+ }
+
+ if (entry->elements[index].info == REBAL_NODEUUID_MINE) {
+ /* Index matches this node's nodeuuid.*/
+ ret = _gf_true;
+ goto out;
+ }
+ }
+ }
+out:
+ return ret;
+}
+
+int
+gf_defrag_migrate_single_file(void *opaque)
+{
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ int ret = 0;
+ gf_dirent_t *entry = NULL;
+ struct timeval start = {
+ 0,
+ };
+ loc_t entry_loc = {
+ 0,
+ };
+ loc_t *loc = NULL;
+ struct iatt iatt = {
+ 0,
+ };
+ dict_t *migrate_data = NULL;
+ struct timeval end = {
+ 0,
+ };
+ double elapsed = {
+ 0,
+ };
+ struct dht_container *rebal_entry = NULL;
+ inode_t *inode = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+ call_frame_t *statfs_frame = NULL;
+ xlator_t *old_THIS = NULL;
+ data_t *tmp = NULL;
+ int fop_errno = 0;
+ gf_dht_migrate_data_type_t rebal_type = GF_DHT_MIGRATE_DATA;
+ char value[MAX_REBAL_TYPE_SIZE] = {
+ 0,
+ };
+ struct iatt *iatt_ptr = NULL;
+ gf_boolean_t update_skippedcount = _gf_true;
+ int i = 0;
+ gf_boolean_t should_i_migrate = 0;
+
+ rebal_entry = (struct dht_container *)opaque;
+ if (!rebal_entry) {
+ gf_log("DHT", GF_LOG_ERROR, "rebal_entry is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ this = rebal_entry->this;
+
+ conf = this->private;
+
+ defrag = conf->defrag;
+
+ loc = rebal_entry->parent_loc;
+
+ migrate_data = rebal_entry->migrate_data;
+
+ entry = rebal_entry->df_entry;
+ iatt_ptr = &entry->d_stat;
+
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = -1;
+ goto out;
+ }
+
+ if (defrag->stats == _gf_true) {
+ gettimeofday(&start, NULL);
+ }
+
+ if (defrag->defrag_pattern &&
+ (gf_defrag_pattern_match(defrag, entry->d_name,
+ entry->d_stat.ia_size) == _gf_false)) {
+ gf_log(this->name, GF_LOG_ERROR, "pattern_match failed");
+ goto out;
+ }
+
+ memset(&entry_loc, 0, sizeof(entry_loc));
+
+ ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+ if (ret) {
+ LOCK(&defrag->lock);
+ {
+ defrag->total_failures += 1;
+ }
+ UNLOCK(&defrag->lock);
+
+ ret = 0;
+
+ gf_log(this->name, GF_LOG_ERROR, "Child loc build failed");
+
+ goto out;
+ }
+
+ should_i_migrate = gf_defrag_should_i_migrate(
+ this, rebal_entry->local_subvol_index, entry->d_stat.ia_gfid);
+
+ gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid);
+
+ gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+ ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+
+ if (!should_i_migrate) {
+ /* this node isn't supposed to migrate the file. suppressing any
+ * potential error from lookup as this file is under migration by
+ * another node */
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "Ignoring lookup failure: node isn't migrating %s",
+ entry_loc.path);
+ ret = 0;
+ }
+ gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path);
+ goto out;
+ }
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: %s lookup failed", entry_loc.path);
+
+ /* Increase failure count only for remove-brick op, so that
+ * user is warned to check the removed-brick for any files left
+ * unmigrated
+ */
+ if (conf->decommission_subvols_cnt) {
+ LOCK(&defrag->lock);
+ {
+ defrag->total_failures += 1;
+ }
+ UNLOCK(&defrag->lock);
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ iatt_ptr = &iatt;
+
+ hashed_subvol = dht_subvol_get_hashed(this, &entry_loc);
+ if (!hashed_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for %s", entry_loc.path);
+ ret = 0;
+ goto out;
+ }
+
+ cached_subvol = dht_subvol_get_cached(this, entry_loc.inode);
+ if (!cached_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CACHED_SUBVOL_GET_FAILED,
+ "Failed to get cached subvol for %s", entry_loc.path);
+
+ ret = 0;
+ goto out;
+ }
+
+ if (hashed_subvol == cached_subvol) {
+ ret = 0;
+ goto out;
+ }
+
+ inode = inode_link(entry_loc.inode, entry_loc.parent, entry->d_name, &iatt);
+ inode_unref(entry_loc.inode);
+ /* use the inode returned by inode_link */
+ entry_loc.inode = inode;
+
+ old_THIS = THIS;
+ THIS = this;
+ statfs_frame = create_frame(this, this->ctx->pool);
+ if (!statfs_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM,
+ "Insufficient memory. Frame creation failed");
+ ret = -1;
+ goto out;
+ }
+
+ /* async statfs information for honoring min-free-disk */
+ dht_get_du_info(statfs_frame, this, loc);
+ THIS = old_THIS;
+
+ tmp = dict_get(migrate_data, GF_XATTR_FILE_MIGRATE_KEY);
+ if (tmp) {
+ memcpy(value, tmp->data, tmp->len);
+ if (strcmp(value, "force") == 0)
+ rebal_type = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS;
+
+ if (conf->decommission_in_progress)
+ rebal_type = GF_DHT_MIGRATE_HARDLINK;
+ }
+
+ ret = dht_migrate_file(this, &entry_loc, cached_subvol, hashed_subvol,
+ rebal_type, &fop_errno);
+ if (ret == 1) {
+ if (fop_errno == ENOSPC) {
+ gf_msg_debug(this->name, 0,
+ "migrate-data skipped for"
+ " %s due to space constraints",
+ entry_loc.path);
+
+ /* For remove-brick case if the source is not one of the
+ * removed-brick, do not mark the error as failure */
+ if (conf->decommission_subvols_cnt) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i] == cached_subvol) {
+ LOCK(&defrag->lock);
+ {
+ defrag->total_failures += 1;
+ update_skippedcount = _gf_false;
+ }
+ UNLOCK(&defrag->lock);
+
+ break;
+ }
+ }
+ }
+
+ if (update_skippedcount) {
+ LOCK(&defrag->lock);
+ {
+ defrag->skipped += 1;
+ }
+ UNLOCK(&defrag->lock);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_MIGRATE_FILE_SKIPPED,
+ "File migration skipped for %s.", entry_loc.path);
+ }
+
+ } else if (fop_errno == ENOTSUP) {
+ gf_msg_debug(this->name, 0,
+ "migrate-data skipped for"
+ " hardlink %s ",
+ entry_loc.path);
+ LOCK(&defrag->lock);
+ {
+ defrag->skipped += 1;
+ }
+ UNLOCK(&defrag->lock);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_MIGRATE_FILE_SKIPPED,
+ "File migration skipped for %s.", entry_loc.path);
+ }
+
+ ret = 0;
+ goto out;
+ } else if (ret < 0) {
+ if (fop_errno != EEXIST) {
+ gf_msg(this->name, GF_LOG_ERROR, fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED, "migrate-data failed for %s",
+ entry_loc.path);
+
+ LOCK(&defrag->lock);
+ {
+ defrag->total_failures += 1;
+ }
+ UNLOCK(&defrag->lock);
+ }
+
+ ret = gf_defrag_handle_migrate_error(fop_errno, defrag);
+
+ if (!ret) {
+ gf_msg(this->name, GF_LOG_ERROR, fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "migrate-data on %s failed:", entry_loc.path);
+ } else if (ret == 1) {
+ ret = 0;
+ }
+
+ goto out;
+ }
+
+ LOCK(&defrag->lock);
+ {
+ defrag->total_files += 1;
+ defrag->total_data += iatt.ia_size;
+ }
+ UNLOCK(&defrag->lock);
+
+ if (defrag->stats == _gf_true) {
+ gettimeofday(&end, NULL);
+ elapsed = gf_tvdiff(&start, &end);
+ gf_log(this->name, GF_LOG_INFO,
+ "Migration of "
+ "file:%s size:%" PRIu64
+ " bytes took %.2f"
+ "secs and ret: %d",
+ entry_loc.name, iatt.ia_size, elapsed / 1e6, ret);
+ }
+
+out:
+ if (statfs_frame) {
+ STACK_DESTROY(statfs_frame->root);
+ }
+
+ if (iatt_ptr) {
+ LOCK(&defrag->lock);
+ {
+ defrag->size_processed += iatt_ptr->ia_size;
+ }
+ UNLOCK(&defrag->lock);
+ }
+ loc_wipe(&entry_loc);
+
+ return ret;
+}
+
+void *
+gf_defrag_task(void *opaque)
+{
+ struct list_head *q_head = NULL;
+ struct dht_container *iterator = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ int ret = 0;
+ pid_t pid = GF_CLIENT_PID_DEFRAG;
+
+ defrag = (gf_defrag_info_t *)opaque;
+ if (!defrag) {
+ gf_msg("dht", GF_LOG_ERROR, 0, 0, "defrag is NULL");
+ goto out;
+ }
+
+ syncopctx_setfspid(&pid);
+
+ q_head = &(defrag->queue[0].list);
+
+ /* The following while loop will dequeue one entry from the defrag->queue
+ under lock. We will update the defrag->global_error only when there
+ is an error which is critical to stop the rebalance process. The stop
+ message will be intimated to other migrator threads by setting the
+ defrag->defrag_status to GF_DEFRAG_STATUS_FAILED.
+
+ In defrag->queue, a low watermark (MIN_MIGRATE_QUEUE_COUNT) is
+ maintained so that crawler does not starve the file migration
+ workers and a high watermark (MAX_MIGRATE_QUEUE_COUNT) so that
+ crawler does not go far ahead in filling up the queue.
+ */
+
+ while (_gf_true) {
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ pthread_cond_broadcast(&defrag->rebalance_crawler_alarm);
+ pthread_cond_broadcast(&defrag->parallel_migration_cond);
+ goto out;
+ }
+
+ pthread_mutex_lock(&defrag->dfq_mutex);
+ {
+ /*Throttle down:
+ If the reconfigured count is less than current thread
+ count, then the current thread will sleep */
+
+ /*TODO: Need to refactor the following block to work
+ *under defrag->lock. For now access
+ * defrag->current_thread_count and rthcount under
+ * dfq_mutex lock */
+ while (!defrag->crawl_done && (defrag->recon_thread_count <
+ defrag->current_thread_count)) {
+ defrag->current_thread_count--;
+ gf_msg_debug("DHT", 0,
+ "Thread sleeping. "
+ "current thread count: %d",
+ defrag->current_thread_count);
+
+ pthread_cond_wait(&defrag->df_wakeup_thread,
+ &defrag->dfq_mutex);
+
+ defrag->current_thread_count++;
+ gf_msg_debug("DHT", 0,
+ "Thread wokeup. "
+ "current thread count: %d",
+ defrag->current_thread_count);
+ }
+
+ if (defrag->q_entry_count) {
+ iterator = list_entry(q_head->next, typeof(*iterator), list);
+
+ gf_msg_debug("DHT", 0,
+ "picking entry "
+ "%s",
+ iterator->df_entry->d_name);
+
+ list_del_init(&(iterator->list));
+
+ defrag->q_entry_count--;
+
+ if ((defrag->q_entry_count < MIN_MIGRATE_QUEUE_COUNT) &&
+ defrag->wakeup_crawler) {
+ pthread_cond_broadcast(&defrag->rebalance_crawler_alarm);
+ }
+ pthread_mutex_unlock(&defrag->dfq_mutex);
+ ret = gf_defrag_migrate_single_file((void *)iterator);
+
+ /*Critical errors: ENOTCONN and ENOSPACE*/
+ if (ret) {
+ dht_set_global_defrag_error(defrag, ret);
+
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+ pthread_cond_broadcast(&defrag->rebalance_crawler_alarm);
+
+ pthread_cond_broadcast(&defrag->parallel_migration_cond);
+
+ goto out;
+ }
+
+ gf_defrag_free_container(iterator);
+
+ continue;
+ } else {
+ /* defrag->crawl_done flag is set means crawling
+ file system is done and hence a list_empty when
+ the above flag is set indicates there are no more
+ entries to be added to the queue and rebalance is
+ finished */
+
+ if (!defrag->crawl_done) {
+ defrag->current_thread_count--;
+ gf_msg_debug("DHT", 0,
+ "Thread "
+ "sleeping while waiting "
+ "for migration entries. "
+ "current thread count:%d",
+ defrag->current_thread_count);
+
+ pthread_cond_wait(&defrag->parallel_migration_cond,
+ &defrag->dfq_mutex);
+ }
+
+ if (defrag->crawl_done && !defrag->q_entry_count) {
+ defrag->current_thread_count++;
+ gf_msg_debug("DHT", 0, "Exiting thread");
+
+ pthread_cond_broadcast(&defrag->parallel_migration_cond);
+ goto unlock;
+ } else {
+ defrag->current_thread_count++;
+ gf_msg_debug("DHT", 0,
+ "Thread woke up"
+ " as found migrating entries. "
+ "current thread count:%d",
+ defrag->current_thread_count);
+
+ pthread_mutex_unlock(&defrag->dfq_mutex);
+ continue;
+ }
+ }
+ }
+ unlock:
+ pthread_mutex_unlock(&defrag->dfq_mutex);
+ break;
+ }
+out:
+ return NULL;
+}
+
+int static gf_defrag_get_entry(xlator_t *this, int i,
+ struct dht_container **container, loc_t *loc,
+ dht_conf_t *conf, gf_defrag_info_t *defrag,
+ fd_t *fd, dict_t *migrate_data,
+ struct dir_dfmeta *dir_dfmeta, dict_t *xattr_req,
+ int *perrno)
+{
+ int ret = 0;
+ char is_linkfile = 0;
+ gf_dirent_t *df_entry = NULL;
+ struct dht_container *tmp_container = NULL;
+
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = -1;
+ goto out;
+ }
+
+ if (dir_dfmeta->offset_var[i].readdir_done == 1) {
+ ret = 0;
+ goto out;
+ }
+
+ if (dir_dfmeta->fetch_entries[i] == 1) {
+ if (!fd) {
+ dir_dfmeta->fetch_entries[i] = 0;
+ dir_dfmeta->offset_var[i].readdir_done = 1;
+ ret = 0;
+ goto out;
+ }
+
+ ret = syncop_readdirp(conf->local_subvols[i], fd, 131072,
+ dir_dfmeta->offset_var[i].offset,
+ &(dir_dfmeta->equeue[i]), xattr_req, NULL);
+ if (ret == 0) {
+ dir_dfmeta->offset_var[i].readdir_done = 1;
+ ret = 0;
+ goto out;
+ }
+
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_DATA_FAILED,
+ "Readdirp failed. Aborting data migration for "
+ "directory: %s",
+ loc->path);
+ *perrno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ if (list_empty(&(dir_dfmeta->equeue[i].list))) {
+ dir_dfmeta->offset_var[i].readdir_done = 1;
+ ret = 0;
+ goto out;
+ }
+
+ dir_dfmeta->fetch_entries[i] = 0;
+ }
+
+ while (1) {
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = -1;
+ goto out;
+ }
+
+ df_entry = list_entry(dir_dfmeta->iterator[i]->next, typeof(*df_entry),
+ list);
+
+ if (&df_entry->list == dir_dfmeta->head[i]) {
+ gf_dirent_free(&(dir_dfmeta->equeue[i]));
+ INIT_LIST_HEAD(&(dir_dfmeta->equeue[i].list));
+ dir_dfmeta->fetch_entries[i] = 1;
+ dir_dfmeta->iterator[i] = dir_dfmeta->head[i];
+ ret = 0;
+ goto out;
+ }
+
+ dir_dfmeta->iterator[i] = dir_dfmeta->iterator[i]->next;
+
+ dir_dfmeta->offset_var[i].offset = df_entry->d_off;
+ if (!strcmp(df_entry->d_name, ".") || !strcmp(df_entry->d_name, ".."))
+ continue;
+
+ if (IA_ISDIR(df_entry->d_stat.ia_type)) {
+ defrag->size_processed += df_entry->d_stat.ia_size;
+ continue;
+ }
+
+ defrag->num_files_lookedup++;
+
+ if (defrag->defrag_pattern &&
+ (gf_defrag_pattern_match(defrag, df_entry->d_name,
+ df_entry->d_stat.ia_size) == _gf_false)) {
+ defrag->size_processed += df_entry->d_stat.ia_size;
+ continue;
+ }
+
+ is_linkfile = check_is_linkfile(NULL, &df_entry->d_stat, df_entry->dict,
+ conf->link_xattr_name);
+
+ if (is_linkfile) {
+ /* No need to add linkto file to the queue for
+ migration. Only the actual data file need to
+ be checked for migration criteria.
+ */
+
+ gf_msg_debug(this->name, 0,
+ "Skipping linkfile"
+ " %s on subvol: %s",
+ df_entry->d_name, conf->local_subvols[i]->name);
+ continue;
+ }
+
+ /*Build Container Structure */
+
+ tmp_container = GF_CALLOC(1, sizeof(struct dht_container),
+ gf_dht_mt_container_t);
+ if (!tmp_container) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to allocate "
+ "memory for container");
+ ret = -1;
+ goto out;
+ }
+ tmp_container->df_entry = gf_dirent_for_name(df_entry->d_name);
+ if (!tmp_container->df_entry) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to allocate "
+ "memory for df_entry");
+ ret = -1;
+ goto out;
+ }
+
+ tmp_container->local_subvol_index = i;
+
+ tmp_container->df_entry->d_stat = df_entry->d_stat;
+
+ tmp_container->df_entry->d_ino = df_entry->d_ino;
+
+ tmp_container->df_entry->d_type = df_entry->d_type;
+
+ tmp_container->df_entry->d_len = df_entry->d_len;
+
+ tmp_container->parent_loc = GF_CALLOC(1, sizeof(*loc), gf_dht_mt_loc_t);
+ if (!tmp_container->parent_loc) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to allocate "
+ "memory for loc");
+ ret = -1;
+ goto out;
+ }
+
+ ret = loc_copy(tmp_container->parent_loc, loc);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "loc_copy failed");
+ ret = -1;
+ goto out;
+ }
+
+ tmp_container->migrate_data = migrate_data;
+
+ tmp_container->this = this;
+
+ if (df_entry->dict)
+ tmp_container->df_entry->dict = dict_ref(df_entry->dict);
+
+ /*Build Container Structure >> END*/
+
+ ret = 0;
+ goto out;
+ }
+
+out:
+ if (ret == 0) {
+ *container = tmp_container;
+ } else {
+ if (tmp_container) {
+ gf_defrag_free_container(tmp_container);
+ }
+ }
+
+ return ret;
+}
+
+int
+gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
+ dict_t *migrate_data, int *perrno)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ gf_dirent_t entries;
+ dict_t *xattr_req = NULL;
+ struct timeval dir_start = {
+ 0,
+ };
+ struct timeval end = {
+ 0,
+ };
+ double elapsed = {
+ 0,
+ };
+ int local_subvols_cnt = 0;
+ int i = 0;
+ int j = 0;
+ struct dht_container *container = NULL;
+ int ldfq_count = 0;
+ int dfc_index = 0;
+ int throttle_up = 0;
+ struct dir_dfmeta *dir_dfmeta = NULL;
+ xlator_t *old_THIS = NULL;
+
+ gf_log(this->name, GF_LOG_INFO, "migrate data called on %s", loc->path);
+ gettimeofday(&dir_start, NULL);
+
+ conf = this->private;
+ local_subvols_cnt = conf->local_subvols_cnt;
+
+ if (!local_subvols_cnt) {
+ ret = 0;
+ goto out;
+ }
+
+ old_THIS = THIS;
+ THIS = this;
+
+ dir_dfmeta = GF_CALLOC(1, sizeof(*dir_dfmeta), gf_common_mt_pointer);
+ if (!dir_dfmeta) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ dir_dfmeta->lfd = GF_CALLOC(local_subvols_cnt, sizeof(fd_t *),
+ gf_common_mt_pointer);
+ if (!dir_dfmeta->lfd) {
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY,
+ "for dir_dfmeta", NULL);
+ ret = -1;
+ *perrno = ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < local_subvols_cnt; i++) {
+ dir_dfmeta->lfd[i] = fd_create(loc->inode, defrag->pid);
+ if (!dir_dfmeta->lfd[i]) {
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_FD_CREATE_FAILED,
+ NULL);
+ *perrno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_opendir(conf->local_subvols[i], loc, dir_dfmeta->lfd[i],
+ NULL, NULL);
+ if (ret) {
+ fd_unref(dir_dfmeta->lfd[i]);
+ dir_dfmeta->lfd[i] = NULL;
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FAILED_TO_OPEN,
+ "dir: %s", loc->path, "subvol: %s",
+ conf->local_subvols[i]->name, NULL);
+
+ if (conf->decommission_in_progress) {
+ *perrno = -ret;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ fd_bind(dir_dfmeta->lfd[i]);
+ }
+ }
+
+ dir_dfmeta->head = GF_CALLOC(local_subvols_cnt, sizeof(*(dir_dfmeta->head)),
+ gf_common_mt_pointer);
+ if (!dir_dfmeta->head) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->head is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ dir_dfmeta->iterator = GF_CALLOC(local_subvols_cnt,
+ sizeof(*(dir_dfmeta->iterator)),
+ gf_common_mt_pointer);
+ if (!dir_dfmeta->iterator) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->iterator is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ dir_dfmeta->equeue = GF_CALLOC(local_subvols_cnt, sizeof(entries),
+ gf_dht_mt_dirent_t);
+ if (!dir_dfmeta->equeue) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->equeue is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ dir_dfmeta->offset_var = GF_CALLOC(
+ local_subvols_cnt, sizeof(dht_dfoffset_ctx_t), gf_dht_mt_octx_t);
+ if (!dir_dfmeta->offset_var) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->offset_var is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ ret = gf_defrag_ctx_subvols_init(dir_dfmeta->offset_var, this);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "dht_dfoffset_ctx_t"
+ "initialization failed");
+ ret = -1;
+ goto out;
+ }
+
+ dir_dfmeta->fetch_entries = GF_CALLOC(local_subvols_cnt, sizeof(int),
+ gf_common_mt_int);
+ if (!dir_dfmeta->fetch_entries) {
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY,
+ "for dir_dfmeta->fetch_entries", NULL);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < local_subvols_cnt; i++) {
+ INIT_LIST_HEAD(&(dir_dfmeta->equeue[i].list));
+ dir_dfmeta->head[i] = &(dir_dfmeta->equeue[i].list);
+ dir_dfmeta->iterator[i] = dir_dfmeta->head[i];
+ dir_dfmeta->fetch_entries[i] = 1;
+ }
+
+ xattr_req = dict_new();
+ if (!xattr_req) {
+ gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set dict for "
+ "key: %s",
+ conf->link_xattr_name);
+ ret = -1;
+ goto out;
+ }
+
+ /*
+ Job: Read entries from each local subvol and store the entries
+ in equeue array of linked list. Now pick one entry from the
+ equeue array in a round robin basis and add them to defrag Queue.
+ */
+
+ while (!dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) {
+ pthread_mutex_lock(&defrag->dfq_mutex);
+ {
+ /*Throttle up: If reconfigured count is higher than
+ current thread count, wake up the sleeping threads
+ TODO: Need to refactor this. Instead of making the
+ thread sleep and wake, we should terminate and spawn
+ threads on-demand*/
+
+ if (defrag->recon_thread_count > defrag->current_thread_count) {
+ throttle_up = (defrag->recon_thread_count -
+ defrag->current_thread_count);
+ for (j = 0; j < throttle_up; j++) {
+ pthread_cond_signal(&defrag->df_wakeup_thread);
+ }
+ }
+
+ while (defrag->q_entry_count > MAX_MIGRATE_QUEUE_COUNT) {
+ defrag->wakeup_crawler = 1;
+ pthread_cond_wait(&defrag->rebalance_crawler_alarm,
+ &defrag->dfq_mutex);
+ }
+
+ ldfq_count = defrag->q_entry_count;
+
+ if (defrag->wakeup_crawler) {
+ defrag->wakeup_crawler = 0;
+ }
+ }
+ pthread_mutex_unlock(&defrag->dfq_mutex);
+
+ while (
+ ldfq_count <= MAX_MIGRATE_QUEUE_COUNT &&
+ !dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) {
+ ret = gf_defrag_get_entry(this, dfc_index, &container, loc, conf,
+ defrag, dir_dfmeta->lfd[dfc_index],
+ migrate_data, dir_dfmeta, xattr_req,
+ perrno);
+
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
+ goto out;
+ }
+
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Found "
+ "error from gf_defrag_get_entry");
+
+ ret = -1;
+ goto out;
+ }
+
+ /* Check if we got an entry, else we need to move the
+ index to the next subvol */
+ if (!container) {
+ GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt);
+ continue;
+ }
+
+ /* Q this entry in the dfq */
+ pthread_mutex_lock(&defrag->dfq_mutex);
+ {
+ list_add_tail(&container->list, &(defrag->queue[0].list));
+ defrag->q_entry_count++;
+ ldfq_count = defrag->q_entry_count;
+
+ gf_msg_debug(this->name, 0,
+ "added "
+ "file:%s parent:%s to the queue ",
+ container->df_entry->d_name,
+ container->parent_loc->path);
+
+ pthread_cond_signal(&defrag->parallel_migration_cond);
+ }
+ pthread_mutex_unlock(&defrag->dfq_mutex);
+
+ GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt);
+ }
+ }
+
+ gettimeofday(&end, NULL);
+ elapsed = gf_tvdiff(&dir_start, &end);
+ gf_log(this->name, GF_LOG_INFO,
+ "Migration operation on dir %s took "
+ "%.2f secs",
+ loc->path, elapsed / 1e6);
+ ret = 0;
+out:
+ THIS = old_THIS;
+ gf_defrag_free_dir_dfmeta(dir_dfmeta, local_subvols_cnt);
+
+ if (xattr_req)
+ dict_unref(xattr_req);
+
+ /* It does not matter if it errored out - this number is
+ * used to calculate rebalance estimated time to complete.
+ * No locking required as dirs are processed by a single thread.
+ */
+ defrag->num_dirs_processed++;
+ return ret;
+}
+
+int
+gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
+ dict_t *fix_layout)
+{
+ int ret;
+ dht_conf_t *conf = NULL;
+ /*
+ * Now we're ready to update the directory commit hash for the volume
+ * root, so that hash miscompares and broadcast lookups can stop.
+ * However, we want to skip that if fix-layout is all we did. In
+ * that case, we want the miscompares etc. to continue until a real
+ * rebalance is complete.
+ */
+ if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX ||
+ defrag->cmd == GF_DEFRAG_CMD_DETACH_START) {
+ return 0;
+ }
+
+ conf = this->private;
+ if (!conf) {
+ /*Uh oh
+ */
+ return -1;
+ }
+
+ if (conf->local_subvols_cnt == 0 || !conf->lookup_optimize) {
+ /* Commit hash updates are only done on local subvolumes and
+ * only when lookup optimization is needed (for older client
+ * support)
+ */
+ return 0;
+ }
+
+ ret = dict_set_uint32(fix_layout, "new-commit-hash",
+ defrag->new_commit_hash);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to set new-commit-hash");
+ return -1;
+ }
+
+ ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "fix layout on %s failed", loc->path);
+
+ if (-ret == ENOENT || -ret == ESTALE) {
+ /* Dir most likely is deleted */
+ return 0;
+ }
+
+ return -1;
+ }
+
+ /* TBD: find more efficient solution than adding/deleting every time */
+ dict_del(fix_layout, "new-commit-hash");
+
+ return 0;
+}
+
+int
+gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
+ dict_t *fix_layout, dict_t *migrate_data)
+{
+ int ret = -1;
+ loc_t entry_loc = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *tmp = NULL;
+ gf_dirent_t *entry = NULL;
+ gf_boolean_t free_entries = _gf_false;
+ off_t offset = 0;
+ struct iatt iatt = {
+ 0,
+ };
+ inode_t *linked_inode = NULL, *inode = NULL;
+ dht_conf_t *conf = NULL;
+ int perrno = 0;
+
+ conf = this->private;
+ if (!conf) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_lookup(this, loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ if (strcmp(loc->path, "/") == 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+ "lookup failed for:%s", loc->path);
+
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+ "Dir:%s renamed or removed. Skipping", loc->path);
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
+ goto out;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+ "lookup failed for:%s", loc->path);
+
+ defrag->total_failures++;
+ goto out;
+ }
+ }
+
+ fd = fd_create(loc->inode, defrag->pid);
+ if (!fd) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to create fd");
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_opendir(this, loc, fd, NULL, NULL);
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
+ goto out;
+ }
+
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to open dir %s, "
+ "err:%d",
+ loc->path, -ret);
+
+ ret = -1;
+ goto out;
+ }
+
+ fd_bind(fd);
+ INIT_LIST_HEAD(&entries.list);
+
+ while ((ret = syncop_readdirp(this, fd, 131072, offset, &entries, NULL,
+ NULL)) != 0) {
+ if (ret < 0) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
+ goto out;
+ }
+
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_READDIR_ERROR,
+ "readdirp failed for "
+ "path %s. Aborting fix-layout",
+ loc->path);
+
+ ret = -1;
+ goto out;
+ }
+
+ if (list_empty(&entries.list))
+ break;
+
+ free_entries = _gf_true;
+
+ list_for_each_entry_safe(entry, tmp, &entries.list, list)
+ {
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = 1;
+ goto out;
+ }
+
+ offset = entry->d_off;
+
+ if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+ continue;
+ if (!IA_ISDIR(entry->d_stat.ia_type)) {
+ continue;
+ }
+ loc_wipe(&entry_loc);
+
+ ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Child loc"
+ " build failed for entry: %s",
+ entry->d_name);
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+ goto out;
+ } else {
+ continue;
+ }
+ }
+
+ if (gf_uuid_is_null(entry->d_stat.ia_gfid)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s/%s"
+ " gfid not present",
+ loc->path, entry->d_name);
+ continue;
+ }
+
+ gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid);
+
+ /*In case the gfid stored in the inode by inode_link
+ * and the gfid obtained in the lookup differs, then
+ * client3_3_lookup_cbk will return ESTALE and proper
+ * error will be captured
+ */
+
+ linked_inode = inode_link(entry_loc.inode, loc->inode,
+ entry->d_name, &entry->d_stat);
+
+ inode = entry_loc.inode;
+ entry_loc.inode = linked_inode;
+ inode_unref(inode);
+
+ if (gf_uuid_is_null(loc->gfid)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s/%s"
+ " gfid not present",
+ loc->path, entry->d_name);
+ continue;
+ }
+
+ gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+ ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret,
+ DHT_MSG_DIR_LOOKUP_FAILED,
+ "Dir:%s renamed or removed. "
+ "Skipping",
+ loc->path);
+ ret = 0;
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ continue;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s",
+ entry_loc.path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ } else {
+ continue;
+ }
+ }
+ }
+
+ /* A return value of 2 means, either process_dir or
+ * lookup of a dir failed. Hence, don't commit hash
+ * for the current directory*/
+
+ ret = gf_defrag_fix_layout(this, defrag, &entry_loc, fix_layout,
+ migrate_data);
+
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED ||
+ defrag->defrag_status == GF_DEFRAG_STATUS_FAILED) {
+ goto out;
+ }
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Fix layout failed for %s", entry_loc.path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+ goto out;
+ } else {
+ /* Let's not commit-hash if
+ * gf_defrag_fix_layout failed*/
+ continue;
+ }
+ }
+ }
+
+ gf_dirent_free(&entries);
+ free_entries = _gf_false;
+ INIT_LIST_HEAD(&entries.list);
+ }
+
+ /* A directory layout is fixed only after its subdirs are healed to
+ * any newly added bricks. If the layout is fixed before subdirs are
+ * healed, the newly added brick will get a non-null layout.
+ * Any subdirs which hash to that layout will no longer show up
+ * in a directory listing until they are healed.
+ */
+
+ ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+ /* In case of a race where the directory is deleted just before
+ * layout setxattr, the errors are updated in the layout structure.
+ * We can use this information to make a decision whether the directory
+ * is deleted entirely.
+ */
+ if (ret == 0) {
+ ret = dht_dir_layout_error_check(this, loc->inode);
+ ret = -ret;
+ }
+
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Setxattr failed. Dir %s "
+ "renamed or removed",
+ loc->path);
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
+ goto out;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Setxattr failed for %s", loc->path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+ if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+ ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);
+
+ if (ret) {
+ if (perrno == ENOENT || perrno == ESTALE) {
+ ret = 0;
+ goto out;
+ } else {
+ defrag->total_failures++;
+
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
+ "gf_defrag_process_dir failed for "
+ "directory: %s",
+ loc->path);
+
+ if (conf->decommission_in_progress) {
+ goto out;
+ }
+ }
+ }
+ }
+
+ gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);
+
+ if (gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
+ defrag->total_failures++;
+
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
+ "Settle hash failed for %s", loc->path);
+
+ ret = -1;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ if (free_entries)
+ gf_dirent_free(&entries);
+
+ loc_wipe(&entry_loc);
+
+ if (fd)
+ fd_unref(fd);
+
+ return ret;
+}
+
+int
+dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
+ loc_t *loc)
+{
+ dict_t *dict = NULL;
+ uuid_t *uuid_ptr = NULL;
+ int ret = -1;
+ int i = 0;
+ int j = 0;
+
+ /* Find local subvolumes */
+ ret = syncop_getxattr(this, loc, &dict, GF_REBAL_FIND_LOCAL_SUBVOL, NULL,
+ NULL);
+ if (ret && (ret != -ENODATA)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, 0,
+ "local "
+ "subvolume determination failed with error: %d",
+ -ret);
+ ret = -1;
+ goto out;
+ }
+
+ if (!ret)
+ goto out;
+
+ ret = syncop_getxattr(this, loc, &dict, GF_REBAL_OLD_FIND_LOCAL_SUBVOL,
+ NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, 0,
+ "local "
+ "subvolume determination failed with error: %d",
+ -ret);
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ if (ret) {
+ return ret;
+ }
+
+ for (i = 0; i < conf->local_subvols_cnt; i++) {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "local subvol: "
+ "%s",
+ conf->local_subvols[i]->name);
+
+ for (j = 0; j < conf->local_nodeuuids[i].count; j++) {
+ uuid_ptr = &(conf->local_nodeuuids[i].elements[j].uuid);
+ gf_msg(this->name, GF_LOG_INFO, 0, 0, "node uuid : %s",
+ uuid_utoa(*uuid_ptr));
+ }
+ }
+
+ return ret;
+}
+
+/* Functions for the rebalance estimates feature */
+
+uint64_t
+gf_defrag_subvol_file_size(xlator_t *this, loc_t *root_loc)
+{
+ int ret = -1;
+ struct statvfs buf = {
+ 0,
+ };
+
+ ret = syncop_statfs(this, root_loc, &buf, NULL, NULL);
+ if (ret) {
+ /* Aargh! */
+ return 0;
+ }
+ return ((buf.f_blocks - buf.f_bfree) * buf.f_frsize);
+}
+
+uint64_t
+gf_defrag_total_file_size(xlator_t *this, loc_t *root_loc)
+{
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ uint64_t size_files = 0;
+ uint64_t total_size = 0;
+
+ conf = this->private;
+ if (!conf) {
+ return 0;
+ }
+
+ for (i = 0; i < conf->local_subvols_cnt; i++) {
+ size_files = gf_defrag_subvol_file_size(conf->local_subvols[i],
+ root_loc);
+ total_size += size_files;
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "local subvol: %s,"
+ "cnt = %" PRIu64,
+ conf->local_subvols[i]->name, size_files);
+ }
+
+ gf_msg(this->name, GF_LOG_INFO, 0, 0, "Total size files = %" PRIu64,
+ total_size);
+
+ return total_size;
+}
+
+static void *
+dht_file_counter_thread(void *args)
+{
+ gf_defrag_info_t *defrag = NULL;
+ loc_t root_loc = {
+ 0,
+ };
+ struct timespec time_to_wait = {
+ 0,
+ };
+ uint64_t tmp_size = 0;
+
+ if (!args)
+ return NULL;
+
+ defrag = (gf_defrag_info_t *)args;
+ dht_build_root_loc(defrag->root_inode, &root_loc);
+
+ while (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) {
+ timespec_now(&time_to_wait);
+ time_to_wait.tv_sec += 600;
+
+ pthread_mutex_lock(&defrag->fc_mutex);
+ pthread_cond_timedwait(&defrag->fc_wakeup_cond, &defrag->fc_mutex,
+ &time_to_wait);
+
+ pthread_mutex_unlock(&defrag->fc_mutex);
+
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED)
+ break;
+
+ tmp_size = gf_defrag_total_file_size(defrag->this, &root_loc);
+
+ gf_log("dht", GF_LOG_INFO, "tmp data size =%" PRIu64, tmp_size);
+
+ if (!tmp_size) {
+ gf_msg("dht", GF_LOG_ERROR, 0, 0,
+ "Failed to get "
+ "the total data size. Unable to estimate "
+ "time to complete rebalance.");
+ } else {
+ g_totalsize = tmp_size;
+ gf_msg_debug("dht", 0, "total data size =%" PRIu64, g_totalsize);
+ }
+ }
+
+ return NULL;
+}
+
+int
+gf_defrag_estimates_cleanup(xlator_t *this, gf_defrag_info_t *defrag,
+ pthread_t filecnt_thread)
+{
+ int ret = -1;
+
+ /* Wake up the filecounter thread.
+ * By now the defrag status will no longer be
+ * GF_DEFRAG_STATUS_STARTED so the thread will exit the loop.
+ */
+ pthread_mutex_lock(&defrag->fc_mutex);
+ {
+ pthread_cond_broadcast(&defrag->fc_wakeup_cond);
+ }
+ pthread_mutex_unlock(&defrag->fc_mutex);
+
+ ret = pthread_join(filecnt_thread, NULL);
+ if (ret) {
+ gf_msg("dht", GF_LOG_ERROR, ret, 0,
+ "file_counter_thread: pthread_join failed.");
+ ret = -1;
+ }
+ return ret;
+}
+
+int
+gf_defrag_estimates_init(xlator_t *this, loc_t *loc, pthread_t *filecnt_thread)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+
+ conf = this->private;
+ defrag = conf->defrag;
+
+ g_totalsize = gf_defrag_total_file_size(this, loc);
+ if (!g_totalsize) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+ "Failed to get "
+ "the total data size. Unable to estimate "
+ "time to complete rebalance.");
+ goto out;
+ }
+
+ ret = gf_thread_create(filecnt_thread, NULL, dht_file_counter_thread,
+ (void *)defrag, "dhtfcnt");
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ret, 0,
+ "Failed to "
+ "create the file counter thread ");
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+/* Init and cleanup functions for parallel file migration*/
+int
+gf_defrag_parallel_migration_init(xlator_t *this, gf_defrag_info_t *defrag,
+ pthread_t **tid_array, int *thread_index)
+{
+ int ret = -1;
+ int thread_spawn_count = 0;
+ int index = 0;
+ pthread_t *tid = NULL;
+
+ if (!defrag)
+ goto out;
+
+ /* Initialize global entry queue */
+ defrag->queue = GF_CALLOC(1, sizeof(struct dht_container),
+ gf_dht_mt_container_t);
+
+ if (!defrag->queue) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "Failed to initialise migration queue");
+ ret = -1;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&(defrag->queue[0].list));
+
+ thread_spawn_count = MAX(MAX_REBAL_THREADS, 4);
+
+ gf_msg_debug(this->name, 0, "thread_spawn_count: %d", thread_spawn_count);
+
+ tid = GF_CALLOC(thread_spawn_count, sizeof(pthread_t),
+ gf_common_mt_pthread_t);
+ if (!tid) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "Failed to create migration threads");
+ ret = -1;
+ goto out;
+ }
+ defrag->current_thread_count = thread_spawn_count;
+
+ /*Spawn Threads Here*/
+ while (index < thread_spawn_count) {
+ ret = gf_thread_create(&(tid[index]), NULL, gf_defrag_task,
+ (void *)defrag, "dhtmig%d", (index + 1) & 0x3ff);
+ if (ret != 0) {
+ gf_msg("DHT", GF_LOG_ERROR, ret, 0, "Thread[%d] creation failed. ",
+ index);
+ ret = -1;
+ goto out;
+ } else {
+ gf_log("DHT", GF_LOG_INFO,
+ "Thread[%d] "
+ "creation successful",
+ index);
+ }
+ index++;
+ }
+
+ ret = 0;
+out:
+ *thread_index = index;
+ *tid_array = tid;
+
+ return ret;
+}
+
+int
+gf_defrag_parallel_migration_cleanup(gf_defrag_info_t *defrag,
+ pthread_t *tid_array, int thread_index)
+{
+ int ret = -1;
+ int i = 0;
+
+ if (!defrag)
+ goto out;
+
+ /* Wake up all migration threads */
+ pthread_mutex_lock(&defrag->dfq_mutex);
+ {
+ defrag->crawl_done = 1;
+
+ pthread_cond_broadcast(&defrag->parallel_migration_cond);
+ pthread_cond_broadcast(&defrag->df_wakeup_thread);
+ }
+ pthread_mutex_unlock(&defrag->dfq_mutex);
+
+ /*Wait for all the threads to complete their task*/
+ for (i = 0; i < thread_index; i++) {
+ pthread_join(tid_array[i], NULL);
+ }
+
+ GF_FREE(tid_array);
+
+ /* Cleanup the migration queue */
+ if (defrag->queue) {
+ gf_dirent_free(defrag->queue[0].df_entry);
+ INIT_LIST_HEAD(&(defrag->queue[0].list));
+ }
+
+ GF_FREE(defrag->queue);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+gf_defrag_start_crawl(void *data)
+{
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ dict_t *fix_layout = NULL;
+ dict_t *migrate_data = NULL;
+ dict_t *status = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ call_frame_t *statfs_frame = NULL;
+ xlator_t *old_THIS = NULL;
+ int ret = -1;
+ loc_t loc = {
+ 0,
+ };
+ struct iatt iatt = {
+ 0,
+ };
+ struct iatt parent = {
+ 0,
+ };
+ int thread_index = 0;
+ pthread_t *tid = NULL;
+ pthread_t filecnt_thread;
+ gf_boolean_t fc_thread_started = _gf_false;
+
+ this = data;
+ if (!this)
+ goto exit;
+
+ ctx = this->ctx;
+ if (!ctx)
+ goto exit;
+
+ conf = this->private;
+ if (!conf)
+ goto exit;
+
+ defrag = conf->defrag;
+ if (!defrag)
+ goto exit;
+
+ defrag->start_time = gf_time();
+
+ dht_build_root_inode(this, &defrag->root_inode);
+ if (!defrag->root_inode)
+ goto out;
+
+ dht_build_root_loc(defrag->root_inode, &loc);
+
+ /* fix-layout on '/' first */
+
+ ret = syncop_lookup(this, &loc, &iatt, &parent, NULL, NULL);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_START_FAILED,
+ "Failed to start rebalance: look up on / failed");
+ ret = -1;
+ goto out;
+ }
+
+ old_THIS = THIS;
+ THIS = this;
+
+ statfs_frame = create_frame(this, this->ctx->pool);
+ if (!statfs_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM,
+ "Insufficient memory. Frame creation failed");
+ ret = -1;
+ goto out;
+ }
+
+ /* async statfs update for honoring min-free-disk */
+ dht_get_du_info(statfs_frame, this, &loc);
+ THIS = old_THIS;
+
+ fix_layout = dict_new();
+ if (!fix_layout) {
+ ret = -1;
+ goto out;
+ }
+
+ /*
+ * Unfortunately, we can't do special xattrs (like fix.layout) and
+ * real ones in the same call currently, and changing it seems
+ * riskier than just doing two calls.
+ */
+
+ gf_log(this->name, GF_LOG_INFO, "%s using commit hash %u", __func__,
+ conf->vol_commit_hash);
+
+ ret = dict_set_uint32(fix_layout, conf->commithash_xattr_name,
+ conf->vol_commit_hash);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to set %s",
+ conf->commithash_xattr_name);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_setxattr(this, &loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to set commit hash on %s. "
+ "Rebalance cannot proceed.",
+ loc.path);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ /* We now return to our regularly scheduled program. */
+
+ ret = dict_set_str(fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes");
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED,
+ "Failed to start rebalance:"
+ "Failed to set dictionary value: key = %s",
+ GF_XATTR_FIX_LAYOUT_KEY);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ defrag->new_commit_hash = conf->vol_commit_hash;
+
+ ret = syncop_setxattr(this, &loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_FAILED,
+ "fix layout on %s failed", loc.path);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+ /* We need to migrate files */
+
+ migrate_data = dict_new();
+ if (!migrate_data) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_str(
+ migrate_data, GF_XATTR_FILE_MIGRATE_KEY,
+ (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) ? "force" : "non-force");
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ ret = dht_init_local_subvols_and_nodeuuids(this, conf, &loc);
+ if (ret) {
+ ret = -1;
+ goto out;
+ }
+
+ /* Initialise the structures required for parallel migration */
+ ret = gf_defrag_parallel_migration_init(this, defrag, &tid,
+ &thread_index);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "Aborting rebalance.");
+ goto out;
+ }
+
+ ret = gf_defrag_estimates_init(this, &loc, &filecnt_thread);
+ if (ret) {
+ /* Not a fatal error. Allow the rebalance to proceed*/
+ ret = 0;
+ } else {
+ fc_thread_started = _gf_true;
+ }
+ }
+
+ ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ if (gf_defrag_settle_hash(this, defrag, &loc, fix_layout) != 0) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ gf_log("DHT", GF_LOG_INFO, "crawling file-system completed");
+out:
+
+ /* We are here means crawling the entire file system is done
+ or something failed. Set defrag->crawl_done flag to intimate
+ the migrator threads to exhaust the defrag->queue and terminate*/
+
+ if (ret) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ }
+
+ gf_defrag_parallel_migration_cleanup(defrag, tid, thread_index);
+
+ if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) &&
+ (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
+ }
+
+ if (fc_thread_started) {
+ gf_defrag_estimates_cleanup(this, defrag, filecnt_thread);
+ }
+
+ dht_send_rebalance_event(this, defrag->cmd, defrag->defrag_status);
+
+ status = dict_new();
+ LOCK(&defrag->lock);
+ {
+ gf_defrag_status_get(conf, status);
+ if (ctx && ctx->notify)
+ ctx->notify(GF_EN_DEFRAG_STATUS, status);
+ if (status)
+ dict_unref(status);
+ defrag->is_exiting = 1;
+ }
+ UNLOCK(&defrag->lock);
+
+ GF_FREE(defrag);
+ conf->defrag = NULL;
+
+ if (migrate_data)
+ dict_unref(migrate_data);
+
+ if (statfs_frame) {
+ STACK_DESTROY(statfs_frame->root);
+ }
+exit:
+ return ret;
+}
+
+static int
+gf_defrag_done(int ret, call_frame_t *sync_frame, void *data)
+{
+ gf_listener_stop(sync_frame->this);
+
+ STACK_DESTROY(sync_frame->root);
+ kill(getpid(), SIGTERM);
+ return 0;
+}
+
+void *
+gf_defrag_start(void *data)
+{
+ int ret = -1;
+ call_frame_t *frame = NULL;
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ xlator_t *this = NULL;
+ xlator_t *old_THIS = NULL;
+
+ this = data;
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ defrag = conf->defrag;
+ if (!defrag)
+ goto out;
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ frame->root->pid = GF_CLIENT_PID_DEFRAG;
+
+ defrag->pid = frame->root->pid;
+
+ defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;
+
+ old_THIS = THIS;
+ THIS = this;
+ ret = synctask_new(this->ctx->env, gf_defrag_start_crawl, gf_defrag_done,
+ frame, this);
+
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED,
+ "Could not create task for rebalance");
+ THIS = old_THIS;
+out:
+ return NULL;
+}
+
+uint64_t
+gf_defrag_get_estimates_based_on_size(dht_conf_t *conf)
+{
+ gf_defrag_info_t *defrag = NULL;
+ double rate_processed = 0;
+ uint64_t total_processed = 0;
+ uint64_t tmp_count = 0;
+ uint64_t time_to_complete = 0;
+ double elapsed = 0;
+
+ defrag = conf->defrag;
+
+ if (!g_totalsize)
+ goto out;
+
+ elapsed = gf_time() - defrag->start_time;
+
+ /* Don't calculate the estimates for the first 10 minutes.
+ * It is unlikely to be accurate and estimates are not required
+ * if the process finishes in less than 10 mins.
+ */
+
+ if (elapsed < ESTIMATE_START_INTERVAL) {
+ gf_msg(THIS->name, GF_LOG_INFO, 0, 0,
+ "Rebalance estimates will not be available for the "
+ "first %d seconds.",
+ ESTIMATE_START_INTERVAL);
+
+ goto out;
+ }
+
+ total_processed = defrag->size_processed;
+
+ /* rate at which files processed */
+ rate_processed = (total_processed) / elapsed;
+
+ tmp_count = g_totalsize;
+
+ if (rate_processed) {
+ time_to_complete = (tmp_count) / rate_processed;
+
+ } else {
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, 0,
+ "Unable to calculate estimated time for rebalance");
+ }
+
+ gf_log(THIS->name, GF_LOG_INFO,
+ "TIME: (size) total_processed=%" PRIu64 " tmp_cnt = %" PRIu64
+ ","
+ "rate_processed=%f, elapsed = %f",
+ total_processed, tmp_count, rate_processed, elapsed);
+
+out:
+ return time_to_complete;
+}
+
+int
+gf_defrag_status_get(dht_conf_t *conf, dict_t *dict)
+{
+ int ret = 0;
+ uint64_t files = 0;
+ uint64_t size = 0;
+ uint64_t lookup = 0;
+ uint64_t failures = 0;
+ uint64_t skipped = 0;
+ char *status = "";
+ double elapsed = 0;
+ uint64_t time_to_complete = 0;
+ uint64_t time_left = 0;
+ gf_defrag_info_t *defrag = conf->defrag;
+
+ if (!defrag)
+ goto out;
+
+ ret = 0;
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED)
+ goto out;
+
+ files = defrag->total_files;
+ size = defrag->total_data;
+ lookup = defrag->num_files_lookedup;
+ failures = defrag->total_failures;
+ skipped = defrag->skipped;
+
+ elapsed = gf_time() - defrag->start_time;
+
+ /* The rebalance is still in progress */
+
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) {
+ time_to_complete = gf_defrag_get_estimates_based_on_size(conf);
+
+ if (time_to_complete && (time_to_complete > elapsed))
+ time_left = time_to_complete - elapsed;
+
+ gf_log(THIS->name, GF_LOG_INFO,
+ "TIME: Estimated total time to complete (size)= %" PRIu64
+ " seconds, seconds left = %" PRIu64 "",
+ time_to_complete, time_left);
+ }
+
+ if (!dict)
+ goto log;
+
+ ret = dict_set_uint64(dict, "files", files);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set file count");
+
+ ret = dict_set_uint64(dict, "size", size);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set size of xfer");
+
+ ret = dict_set_uint64(dict, "lookups", lookup);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set lookedup file count");
+
+ ret = dict_set_int32(dict, "status", defrag->defrag_status);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set status");
+
+ ret = dict_set_double(dict, "run-time", elapsed);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set run-time");
+
+ ret = dict_set_uint64(dict, "failures", failures);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set failure count");
+
+ ret = dict_set_uint64(dict, "skipped", skipped);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set skipped file count");
+
+ ret = dict_set_uint64(dict, "time-left", time_left);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set time-left");
+
+log:
+ switch (defrag->defrag_status) {
+ case GF_DEFRAG_STATUS_NOT_STARTED:
+ status = "not started";
+ break;
+ case GF_DEFRAG_STATUS_STARTED:
+ status = "in progress";
+ break;
+ case GF_DEFRAG_STATUS_STOPPED:
+ status = "stopped";
+ break;
+ case GF_DEFRAG_STATUS_COMPLETE:
+ status = "completed";
+ break;
+ case GF_DEFRAG_STATUS_FAILED:
+ status = "failed";
+ break;
+ default:
+ break;
+ }
+
+ gf_msg(THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS,
+ "Rebalance is %s. Time taken is %.2f secs", status, elapsed);
+ gf_msg(THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS,
+ "Files migrated: %" PRIu64 ", size: %" PRIu64 ", lookups: %" PRIu64
+ ", failures: %" PRIu64
+ ", skipped: "
+ "%" PRIu64,
+ files, size, lookup, failures, skipped);
+out:
+ return 0;
+}
+
+int
+gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output)
+{
+ /* TODO: set a variable 'stop_defrag' here, it should be checked
+ in defrag loop */
+ int ret = -1;
+ gf_defrag_info_t *defrag = conf->defrag;
+
+ GF_ASSERT(defrag);
+
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) {
+ goto out;
+ }
+
+ gf_msg("", GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STOPPED,
+ "Received stop command on rebalance");
+ defrag->defrag_status = status;
+
+ if (output)
+ gf_defrag_status_get(conf, output);
+ ret = 0;
+out:
+ gf_msg_debug("", 0, "Returning %d", ret);
+ return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index 52185d2adac..d9dbf50492f 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -1,713 +1,1997 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
/* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should
* delete the newpath if it gets EEXISTS from link() call.
*/
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "defaults.h"
+#include "dht-lock.h"
+#include <glusterfs/defaults.h>
+int
+dht_rename_unlock(call_frame_t *frame, xlator_t *this);
+int32_t
+dht_rename_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
int
-dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent)
+dht_rename_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ dht_local_t *local = NULL;
+ local = frame->local;
- local = frame->local;
- prev = cookie;
+ dht_set_fixed_dir_stat(&local->preoldparent);
+ dht_set_fixed_dir_stat(&local->postoldparent);
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
- if (op_ret == -1) {
- /* TODO: undo the damage */
+ if (IA_ISREG(local->stbuf.ia_type))
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
- gf_log (this->name, GF_LOG_DEBUG,
- "rename %s -> %s on %s failed (%s)",
- local->loc.path, local->loc2.path,
- prev->this->name, strerror (op_errno));
+ DHT_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno,
+ &local->stbuf, &local->preoldparent, &local->postoldparent,
+ &local->preparent, &local->postparent, local->xattr);
+ return 0;
+}
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- } else {
- /* TODO: construct proper stbuf for dir */
- /*
- * FIXME: is this the correct way to build stbuf and
- * parent bufs?
- */
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preoldparent, preoldparent,
- prev->this);
- dht_iatt_merge (this, &local->postoldparent, postoldparent,
- prev->this);
- dht_iatt_merge (this, &local->preparent, prenewparent,
- prev->this);
- dht_iatt_merge (this, &local->postparent, postnewparent,
- prev->this);
- }
+static void
+dht_rename_dir_unlock_src(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- local->stbuf.ia_ino = local->loc.inode->ino;
+ local = frame->local;
+ dht_unlock_namespace(frame, &local->lock[0]);
+ return;
+}
- local->preoldparent.ia_ino = local->loc.parent->ino;
- local->postoldparent.ia_ino = local->loc.parent->ino;
+static void
+dht_rename_dir_unlock_dst(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ int op_ret = -1;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+
+ /* Unlock entrylk */
+ dht_unlock_entrylk_wrapper(frame, &local->lock[1].ns.directory_ns);
+
+ /* Unlock inodelk */
+ op_ret = dht_unlock_inodelk(frame, local->lock[1].ns.parent_layout.locks,
+ local->lock[1].ns.parent_layout.lk_count,
+ dht_rename_unlock_cbk);
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ if (IA_ISREG(local->stbuf.ia_type))
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+ "winding unlock inodelk failed "
+ "rename (%s:%s:%s %s:%s:%s), "
+ "stale locks left on bricks",
+ local->loc.path, src_gfid, local->src_cached->name,
+ local->loc2.path, dst_gfid,
+ local->dst_cached ? local->dst_cached->name : NULL);
+ else
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+ "winding unlock inodelk failed "
+ "rename (%s:%s %s:%s), "
+ "stale locks left on bricks",
+ local->loc.path, src_gfid, local->loc2.path, dst_gfid);
+
+ dht_rename_unlock_cbk(frame, NULL, this, 0, 0, NULL);
+ }
+
+ return;
+}
- local->preparent.ia_ino = local->loc2.parent->ino;
- local->postparent.ia_ino = local->loc2.parent->ino;
+static int
+dht_rename_dir_unlock(call_frame_t *frame, xlator_t *this)
+{
+ dht_rename_dir_unlock_src(frame, this);
+ dht_rename_dir_unlock_dst(frame, this);
+ return 0;
+}
+int
+dht_rename_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ int i = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ int subvol_cnt = -1;
+
+ conf = this->private;
+ local = frame->local;
+ prev = cookie;
+ subvol_cnt = dht_subvol_cnt(this, prev);
+ local->ret_cache[subvol_cnt] = op_ret;
+
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.inode->gfid, gfid);
+
+ gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED,
+ "Rename %s -> %s on %s failed, (gfid = %s)", local->loc.path,
+ local->loc2.path, prev->name, gfid);
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ goto unwind;
+ }
+ /* TODO: construct proper stbuf for dir */
+ /*
+ * FIXME: is this the correct way to build stbuf and
+ * parent bufs?
+ */
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->preoldparent, preoldparent);
+ dht_iatt_merge(this, &local->postoldparent, postoldparent);
+ dht_iatt_merge(this, &local->preparent, prenewparent);
+ dht_iatt_merge(this, &local->postparent, postnewparent);
- WIPE (&local->preoldparent);
- WIPE (&local->postoldparent);
- WIPE (&local->preparent);
- WIPE (&local->postparent);
+unwind:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ /* We get here with local->call_cnt == 0. Which means
+ * we are the only one executing this code, there is
+ * no contention. Therefore it's safe to manipulate or
+ * deref local->call_cnt directly (without locking).
+ */
+ if (local->ret_cache[conf->subvolume_cnt] == 0) {
+ /* count errant subvols in last field of ret_cache */
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (local->ret_cache[i] != 0)
+ ++local->ret_cache[conf->subvolume_cnt];
+ }
+ if (local->ret_cache[conf->subvolume_cnt]) {
+ /* undoing the damage:
+ * for all subvolumes, where rename
+ * succeeded, we perform the reverse operation
+ */
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (local->ret_cache[i] == 0)
+ ++local->call_cnt;
+ }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (local->ret_cache[i])
+ continue;
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent,
- &local->preparent, &local->postparent);
- }
+ STACK_WIND(frame, dht_rename_dir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->rename, &local->loc2,
+ &local->loc, NULL);
+ }
- return 0;
-}
+ return 0;
+ }
+ }
+ WIPE(&local->preoldparent);
+ WIPE(&local->postoldparent);
+ WIPE(&local->preparent);
+ WIPE(&local->postparent);
+ dht_rename_dir_unlock(frame, this);
+ }
+
+ return 0;
+}
int
-dht_rename_dir_do (call_frame_t *frame, xlator_t *this)
+dht_rename_hashed_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int i = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int call_cnt = 0;
+ xlator_t *prev = NULL;
+ int i = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ conf = this->private;
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.inode->gfid, gfid);
+
+ gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED,
+ "rename %s -> %s on %s failed, (gfid = %s) ", local->loc.path,
+ local->loc2.path, prev->name, gfid);
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ goto unwind;
+ }
+ /* TODO: construct proper stbuf for dir */
+ /*
+ * FIXME: is this the correct way to build stbuf and
+ * parent bufs?
+ */
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->preoldparent, preoldparent);
+ dht_iatt_merge(this, &local->postoldparent, postoldparent);
+ dht_iatt_merge(this, &local->preparent, prenewparent);
+ dht_iatt_merge(this, &local->postparent, postnewparent);
+
+ call_cnt = local->call_cnt = conf->subvolume_cnt - 1;
+
+ if (!local->call_cnt)
+ goto unwind;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == local->dst_hashed)
+ continue;
+ STACK_WIND_COOKIE(
+ frame, dht_rename_dir_cbk, conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->rename, &local->loc, &local->loc2, NULL);
+ if (!--call_cnt)
+ break;
+ }
+
+ return 0;
+unwind:
+ WIPE(&local->preoldparent);
+ WIPE(&local->postoldparent);
+ WIPE(&local->preparent);
+ WIPE(&local->postparent);
- conf = this->private;
- local = frame->local;
+ dht_rename_dir_unlock(frame, this);
+ return 0;
+}
- if (local->op_ret == -1)
- goto err;
+int
+dht_rename_dir_do(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
- local->call_cnt = conf->subvolume_cnt;
- local->op_ret = 0;
+ if (local->op_ret == -1)
+ goto err;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_rename_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->rename,
- &local->loc, &local->loc2);
- }
+ local->op_ret = 0;
- return 0;
+ STACK_WIND_COOKIE(frame, dht_rename_hashed_dir_cbk, local->dst_hashed,
+ local->dst_hashed, local->dst_hashed->fops->rename,
+ &local->loc, &local->loc2, NULL);
+ return 0;
err:
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, NULL, NULL,
- NULL, NULL, NULL);
- return 0;
+ dht_rename_dir_unlock(frame, this);
+ return 0;
}
-
int
-dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *entries)
+dht_rename_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = -1;
- call_frame_t *prev = NULL;
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ xlator_t *prev = NULL;
- local = frame->local;
- prev = cookie;
+ local = frame->local;
+ prev = cookie;
- if (op_ret > 2) {
- gf_log (this->name, GF_LOG_TRACE,
- "readdir on %s for %s returned %d entries",
- prev->this->name, local->loc.path, op_ret);
- local->op_ret = -1;
- local->op_errno = ENOTEMPTY;
- }
+ if (op_ret > 2) {
+ gf_msg_trace(this->name, 0, "readdir on %s for %s returned %d entries",
+ prev->name, local->loc.path, op_ret);
+ local->op_ret = -1;
+ local->op_errno = ENOTEMPTY;
+ }
- this_call_cnt = dht_frame_return (frame);
+ this_call_cnt = dht_frame_return(frame);
- if (is_last_call (this_call_cnt)) {
- dht_rename_dir_do (frame, this);
- }
+ if (is_last_call(this_call_cnt)) {
+ dht_rename_dir_do(frame, this);
+ }
- return 0;
+ return 0;
}
-
int
-dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd)
+dht_rename_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = -1;
- call_frame_t *prev = NULL;
-
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ xlator_t *prev = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
- local = frame->local;
- prev = cookie;
+ local = frame->local;
+ prev = cookie;
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "opendir on %s for %s failed (%s)",
- prev->this->name, local->loc.path,
- strerror (op_errno));
- goto err;
- }
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.inode->gfid, gfid);
+ gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_OPENDIR_FAILED,
+ "opendir on %s for %s failed,(gfid = %s) ", prev->name,
+ local->loc.path, gfid);
+ goto err;
+ }
- STACK_WIND (frame, dht_rename_readdir_cbk,
- prev->this, prev->this->fops->readdir,
- local->fd, 4096, 0);
+ fd_bind(fd);
+ STACK_WIND_COOKIE(frame, dht_rename_readdir_cbk, prev, prev,
+ prev->fops->readdir, local->fd, 4096, 0, NULL);
- return 0;
+ return 0;
err:
- this_call_cnt = dht_frame_return (frame);
+ this_call_cnt = dht_frame_return(frame);
- if (is_last_call (this_call_cnt)) {
- dht_rename_dir_do (frame, this);
- }
+ if (is_last_call(this_call_cnt)) {
+ dht_rename_dir_do(frame, this);
+ }
- return 0;
+ return 0;
}
-
int
-dht_rename_dir (call_frame_t *frame, xlator_t *this)
+dht_rename_dir_lock2_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int i = 0;
- int op_errno = -1;
+ dht_local_t *local = NULL;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_conf_t *conf = NULL;
+ int i = 0;
+
+ local = frame->local;
+ conf = this->private;
+
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+ "acquiring entrylk after inodelk failed"
+ "rename (%s:%s:%s %s:%s:%s)",
+ local->loc.path, src_gfid, local->src_cached->name,
+ local->loc2.path, dst_gfid,
+ local->dst_cached ? local->dst_cached->name : NULL);
+
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ local->fd = fd_create(local->loc.inode, frame->root->pid);
+ if (!local->fd) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->op_ret = 0;
+
+ if (!local->dst_cached) {
+ dht_rename_dir_do(frame, this);
+ return 0;
+ }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_rename_opendir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->opendir, &local->loc2,
+ local->fd, NULL);
+ }
- conf = frame->this->private;
- local = frame->local;
+ return 0;
- local->call_cnt = conf->subvolume_cnt;
+err:
+ /* No harm in calling an extra unlock */
+ dht_rename_dir_unlock(frame, this);
+ return 0;
+}
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!conf->subvolume_status[i]) {
- op_errno = ENOTCONN;
- goto err;
- }
+int
+dht_rename_dir_lock1_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+ loc_t *loc = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+ "acquiring entrylk after inodelk failed"
+ "rename (%s:%s:%s %s:%s:%s)",
+ local->loc.path, src_gfid, local->src_cached->name,
+ local->loc2.path, dst_gfid,
+ local->dst_cached ? local->dst_cached->name : NULL);
+
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ if (local->current == &local->lock[0]) {
+ loc = &local->loc2;
+ subvol = local->dst_hashed;
+ local->current = &local->lock[1];
+ } else {
+ loc = &local->loc;
+ subvol = local->src_hashed;
+ local->current = &local->lock[0];
+ }
+ ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
+ dht_rename_dir_lock2_cbk);
+ if (ret < 0) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ return 0;
+err:
+ /* No harm in calling an extra unlock */
+ dht_rename_dir_unlock(frame, this);
+ return 0;
+}
+
+/*
+ * If the hashed subvolumes of both source and dst are the different,
+ * lock in dictionary order of hashed subvol->name. This is important
+ * in case the parent directory is the same for both src and dst to
+ * prevent inodelk deadlocks when racing with a fix-layout op on the parent.
+ *
+ * If the hashed subvols are the same, use the gfid/name to determine
+ * the order of taking locks to prevent entrylk deadlocks when the parent
+ * dirs are the same.
+ *
+ */
+static int
+dht_order_rename_lock(call_frame_t *frame, loc_t **loc, xlator_t **subvol)
+{
+ int ret = 0;
+ int op_ret = 0;
+ dht_local_t *local = NULL;
+ char *src = NULL;
+ char *dst = NULL;
+
+ local = frame->local;
+
+ if (local->src_hashed->name == local->dst_hashed->name) {
+ ret = 0;
+ } else {
+ ret = strcmp(local->src_hashed->name, local->dst_hashed->name);
+ }
+
+ if (ret == 0) {
+ /* hashed subvols are the same for src and dst */
+ /* Entrylks need to be ordered*/
+
+ src = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc.name) + 1);
+ if (!src) {
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "Insufficient memory for src");
+ op_ret = -1;
+ goto out;
}
- local->fd = fd_create (local->loc.inode, frame->root->pid);
- if (!local->fd) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
+ if (!gf_uuid_is_null(local->loc.pargfid))
+ uuid_utoa_r(local->loc.pargfid, src);
+ else if (local->loc.parent)
+ uuid_utoa_r(local->loc.parent->gfid, src);
+ else
+ src[0] = '\0';
+
+ strcat(src, local->loc.name);
+
+ dst = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc2.name) + 1);
+ if (!dst) {
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "Insufficient memory for dst");
+ op_ret = -1;
+ goto out;
+ }
- local->op_ret = 0;
+ if (!gf_uuid_is_null(local->loc2.pargfid))
+ uuid_utoa_r(local->loc2.pargfid, dst);
+ else if (local->loc2.parent)
+ uuid_utoa_r(local->loc2.parent->gfid, dst);
+ else
+ dst[0] = '\0';
- if (!local->dst_cached) {
- dht_rename_dir_do (frame, this);
- return 0;
- }
+ strcat(dst, local->loc2.name);
+ ret = strcmp(src, dst);
+ }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_rename_opendir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->opendir,
- &local->loc2, local->fd);
- }
+ if (ret <= 0) {
+ /*inodelk in dictionary order of hashed subvol names*/
+ /*entrylk in dictionary order of gfid/basename */
+ local->current = &local->lock[0];
+ *loc = &local->loc;
+ *subvol = local->src_hashed;
- return 0;
+ } else {
+ local->current = &local->lock[1];
+ *loc = &local->loc2;
+ *subvol = local->dst_hashed;
+ }
+
+ op_ret = 0;
+
+out:
+ return op_ret;
+}
+
+int
+dht_rename_dir(call_frame_t *frame, xlator_t *this)
+{
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+ xlator_t *subvol = NULL;
+ int i = 0;
+ int ret = 0;
+ int op_errno = -1;
+
+ conf = frame->this->private;
+ local = frame->local;
+
+ local->ret_cache = GF_CALLOC(conf->subvolume_cnt + 1, sizeof(int),
+ gf_dht_ret_cache_t);
+
+ if (local->ret_cache == NULL) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED,
+ "Rename dir failed: subvolume down (%s)",
+ conf->subvolumes[i]->name);
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ }
+
+ /* Locks on src and dst needs to ordered which otherwise might cause
+ * deadlocks when rename (src, dst) and rename (dst, src) is done from
+ * two different clients
+ */
+ ret = dht_order_rename_lock(frame, &loc, &subvol);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ /* Rename must take locks on src to avoid lookup selfheal from
+ * recreating src on those subvols where the rename was successful.
+ * The locks can't be issued parallel as two different clients might
+ * attempt same rename command and be in dead lock.
+ */
+ ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
+ dht_rename_dir_lock1_cbk);
+ if (ret < 0) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ DHT_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
+}
+
+static int
+dht_rename_track_for_changelog(xlator_t *this, dict_t *xattr, loc_t *oldloc,
+ loc_t *newloc)
+{
+ int ret = -1;
+ dht_changelog_rename_info_t *info = NULL;
+ char *name = NULL;
+ int len1 = 0;
+ int len2 = 0;
+ int size = 0;
+
+ if (!xattr || !oldloc || !newloc || !this)
+ return ret;
+
+ len1 = strlen(oldloc->name) + 1;
+ len2 = strlen(newloc->name) + 1;
+ size = sizeof(dht_changelog_rename_info_t) + len1 + len2;
+
+ info = GF_CALLOC(size, sizeof(char), gf_common_mt_char);
+ if (!info) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to calloc memory");
+ return ret;
+ }
+
+ gf_uuid_copy(info->old_pargfid, oldloc->pargfid);
+ gf_uuid_copy(info->new_pargfid, newloc->pargfid);
+
+ info->oldname_len = len1;
+ info->newname_len = len2;
+ strncpy(info->buffer, oldloc->name, len1);
+ name = info->buffer + len1;
+ strncpy(name, newloc->name, len2);
+
+ ret = dict_set_bin(xattr, DHT_CHANGELOG_RENAME_OP_KEY, info, size);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s,"
+ " path = %s",
+ DHT_CHANGELOG_RENAME_OP_KEY, oldloc->name);
+ GF_FREE(info);
+ }
+
+ return ret;
}
+#define DHT_MARKER_DONT_ACCOUNT(xattr) \
+ do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new(); \
+ if (!xattr) \
+ break; \
+ } \
+ tmp = dict_set_str(xattr, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, "yes"); \
+ if (tmp) { \
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \
+ "Failed to set dictionary value: key = %s," \
+ " path = %s", \
+ GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, local->loc.path); \
+ } \
+ } while (0)
+
+#define DHT_CHANGELOG_TRACK_AS_RENAME(xattr, oldloc, newloc) \
+ do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new(); \
+ if (!xattr) { \
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \
+ "Failed to create dictionary to " \
+ "track rename"); \
+ break; \
+ } \
+ } \
+ \
+ tmp = dht_rename_track_for_changelog(this, xattr, oldloc, newloc); \
+ \
+ if (tmp) { \
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \
+ "Failed to set dictionary value: key = %s," \
+ " path = %s", \
+ DHT_CHANGELOG_RENAME_OP_KEY, (oldloc)->path); \
+ } \
+ } while (0)
int
-dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent)
+dht_rename_unlock(call_frame_t *frame, xlator_t *this)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ int op_ret = -1;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_ilock_wrap_t inodelk_wrapper = {
+ 0,
+ };
+
+ local = frame->local;
+ inodelk_wrapper.locks = local->rename_inodelk_backward_compatible;
+ inodelk_wrapper.lk_count = local->rename_inodelk_bc_count;
+
+ op_ret = dht_unlock_inodelk_wrapper(frame, &inodelk_wrapper);
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ if (IA_ISREG(local->stbuf.ia_type))
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+ "winding unlock inodelk failed "
+ "rename (%s:%s:%s %s:%s:%s), "
+ "stale locks left on bricks",
+ local->loc.path, src_gfid, local->src_cached->name,
+ local->loc2.path, dst_gfid,
+ local->dst_cached ? local->dst_cached->name : NULL);
+ else
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+ "winding unlock inodelk failed "
+ "rename (%s:%s %s:%s), "
+ "stale locks left on bricks",
+ local->loc.path, src_gfid, local->loc2.path, dst_gfid);
+ }
+
+ dht_unlock_namespace(frame, &local->lock[0]);
+ dht_unlock_namespace(frame, &local->lock[1]);
+
+ dht_rename_unlock_cbk(frame, NULL, this, local->op_ret, local->op_errno,
+ NULL);
+ return 0;
+}
- local = frame->local;
- prev = cookie;
+int
+dht_rename_done(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR,
- "!local, should not happen");
- goto out;
- }
+ local = frame->local;
- this_call_cnt = dht_frame_return (frame);
+ if (local->linked == _gf_true) {
+ local->linked = _gf_false;
+ dht_linkfile_attr_heal(frame, this);
+ }
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlink on %s failed (%s)",
- prev->this->name, strerror (op_errno));
- }
+ dht_rename_unlock(frame, this);
+ return 0;
+}
- WIPE (&local->preoldparent);
- WIPE (&local->postoldparent);
- WIPE (&local->preparent);
- WIPE (&local->postparent);
+int
+dht_rename_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int this_call_cnt = 0;
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent);
- }
+ local = frame->local;
+ prev = cookie;
+
+ FRAME_SU_UNDO(frame, dht_local_t);
+ if (!local) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_VALUE,
+ "!local, should not happen");
+ goto out;
+ }
+
+ this_call_cnt = dht_frame_return(frame);
+
+ if (op_ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLINK_FAILED,
+ "%s: Rename: unlink on %s failed ", local->loc.path, prev->name);
+ }
+
+ WIPE(&local->preoldparent);
+ WIPE(&local->postoldparent);
+ WIPE(&local->preparent);
+ WIPE(&local->postparent);
+
+ if (is_last_call(this_call_cnt)) {
+ dht_rename_done(frame, this);
+ }
out:
- return 0;
+ return 0;
}
-
int
-dht_rename_cleanup (call_frame_t *frame)
+dht_rename_cleanup(call_frame_t *frame)
{
- dht_local_t *local = NULL;
- xlator_t *this = NULL;
- xlator_t *src_hashed = NULL;
- xlator_t *src_cached = NULL;
- xlator_t *dst_hashed = NULL;
- xlator_t *dst_cached = NULL;
- int call_cnt = 0;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ xlator_t *src_hashed = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *dst_cached = NULL;
+ int call_cnt = 0;
+ dict_t *xattr = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ local = frame->local;
+ this = frame->this;
- local = frame->local;
- this = frame->this;
+ src_hashed = local->src_hashed;
+ src_cached = local->src_cached;
+ dst_hashed = local->dst_hashed;
+ dst_cached = local->dst_cached;
- src_hashed = local->src_hashed;
- src_cached = local->src_cached;
- dst_hashed = local->dst_hashed;
- dst_cached = local->dst_cached;
+ if (src_cached == dst_cached)
+ goto nolinks;
- if (src_cached == dst_cached)
- goto nolinks;
+ if (local->linked && (dst_hashed != src_hashed) &&
+ (dst_hashed != src_cached)) {
+ call_cnt++;
+ }
- if (dst_hashed != src_hashed && dst_hashed != src_cached)
- call_cnt++;
+ if (local->added_link && (src_cached != dst_hashed)) {
+ call_cnt++;
+ }
- if (src_cached != dst_hashed)
- call_cnt++;
+ local->call_cnt = call_cnt;
- local->call_cnt = call_cnt;
+ if (!call_cnt)
+ goto nolinks;
- if (!call_cnt)
- goto nolinks;
+ DHT_MARK_FOP_INTERNAL(xattr);
- if (dst_hashed != src_hashed && dst_hashed != src_cached) {
- gf_log (this->name, GF_LOG_TRACE,
- "unlinking linkfile %s @ %s => %s",
- local->loc.path, dst_hashed->name, src_cached->name);
- STACK_WIND (frame, dht_rename_unlink_cbk,
- dst_hashed, dst_hashed->fops->unlink,
- &local->loc);
- }
+ gf_uuid_unparse(local->loc.inode->gfid, gfid);
- if (src_cached != dst_hashed) {
- gf_log (this->name, GF_LOG_TRACE,
- "unlinking link %s => %s (%s)", local->loc.path,
- local->loc2.path, src_cached->name);
- STACK_WIND (frame, dht_rename_unlink_cbk,
- src_cached, src_cached->fops->unlink,
- &local->loc2);
- }
+ if (local->linked && (dst_hashed != src_hashed) &&
+ (dst_hashed != src_cached)) {
+ dict_t *xattr_new = NULL;
- return 0;
+ gf_msg_trace(this->name, 0,
+ "unlinking linkfile %s @ %s => %s, (gfid = %s)",
+ local->loc.path, dst_hashed->name, src_cached->name, gfid);
+
+ xattr_new = dict_copy_with_ref(xattr, NULL);
+
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, dst_hashed, dst_hashed,
+ dst_hashed->fops->unlink, &local->loc, 0, xattr_new);
+
+ dict_unref(xattr_new);
+ xattr_new = NULL;
+ }
+
+ if (local->added_link && (src_cached != dst_hashed)) {
+ dict_t *xattr_new = NULL;
+
+ gf_msg_trace(this->name, 0, "unlinking link %s => %s (%s), (gfid = %s)",
+ local->loc.path, local->loc2.path, src_cached->name, gfid);
+
+ xattr_new = dict_copy_with_ref(xattr, NULL);
+
+ if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) {
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+ }
+ /* *
+ * The link to file is created using root permission.
+ * Hence deletion should happen using root. Otherwise
+ * it will fail.
+ */
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_cached, src_cached,
+ src_cached->fops->unlink, &local->loc2, 0, xattr_new);
+
+ dict_unref(xattr_new);
+ xattr_new = NULL;
+ }
+
+ if (xattr)
+ dict_unref(xattr);
+
+ return 0;
nolinks:
- WIPE (&local->preoldparent);
- WIPE (&local->postoldparent);
- WIPE (&local->preparent);
- WIPE (&local->postparent);
+ WIPE(&local->preoldparent);
+ WIPE(&local->postoldparent);
+ WIPE(&local->preparent);
+ WIPE(&local->postparent);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent);
+ dht_rename_unlock(frame, this);
+ return 0;
+}
- return 0;
+int
+dht_rename_unlink(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ xlator_t *src_hashed = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *dst_cached = NULL;
+ xlator_t *rename_subvol = NULL;
+ dict_t *xattr = NULL;
+
+ local = frame->local;
+
+ src_hashed = local->src_hashed;
+ src_cached = local->src_cached;
+ dst_hashed = local->dst_hashed;
+ dst_cached = local->dst_cached;
+
+ local->call_cnt = 0;
+
+ /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
+ * is called. since rename has already happened on rename_subvol,
+ * unlink shouldn't be sent for oldpath (either linkfile or cached-file)
+ * on rename_subvol. */
+ if (src_cached == dst_cached)
+ rename_subvol = src_cached;
+ else
+ rename_subvol = dst_hashed;
+
+ /* TODO: delete files in background */
+
+ if (src_cached != dst_hashed && src_cached != dst_cached)
+ local->call_cnt++;
+
+ if (src_hashed != rename_subvol && src_hashed != src_cached)
+ local->call_cnt++;
+
+ if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached)
+ local->call_cnt++;
+
+ if (local->call_cnt == 0)
+ goto unwind;
+
+ DHT_MARK_FOP_INTERNAL(xattr);
+
+ if (src_cached != dst_hashed && src_cached != dst_cached) {
+ dict_t *xattr_new = NULL;
+
+ xattr_new = dict_copy_with_ref(xattr, NULL);
+
+ gf_msg_trace(this->name, 0, "deleting old src datafile %s @ %s",
+ local->loc.path, src_cached->name);
+
+ if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) {
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+ }
+
+ DHT_CHANGELOG_TRACK_AS_RENAME(xattr_new, &local->loc, &local->loc2);
+ STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_cached, src_cached,
+ src_cached->fops->unlink, &local->loc, 0, xattr_new);
+
+ dict_unref(xattr_new);
+ xattr_new = NULL;
+ }
+
+ if (src_hashed != rename_subvol && src_hashed != src_cached) {
+ dict_t *xattr_new = NULL;
+
+ xattr_new = dict_copy_with_ref(xattr, NULL);
+
+ gf_msg_trace(this->name, 0, "deleting old src linkfile %s @ %s",
+ local->loc.path, src_hashed->name);
+
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
+ STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_hashed, src_hashed,
+ src_hashed->fops->unlink, &local->loc, 0, xattr_new);
+
+ dict_unref(xattr_new);
+ xattr_new = NULL;
+ }
+
+ if (dst_cached && (dst_cached != dst_hashed) &&
+ (dst_cached != src_cached)) {
+ gf_msg_trace(this->name, 0, "deleting old dst datafile %s @ %s",
+ local->loc2.path, dst_cached->name);
+
+ STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, dst_cached, dst_cached,
+ dst_cached->fops->unlink, &local->loc2, 0, xattr);
+ }
+ if (xattr)
+ dict_unref(xattr);
+ return 0;
+
+unwind:
+ WIPE(&local->preoldparent);
+ WIPE(&local->postoldparent);
+ WIPE(&local->preparent);
+ WIPE(&local->postparent);
+
+ dht_rename_done(frame, this);
+
+ return 0;
}
+int
+dht_rename_links_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ xlator_t *prev = NULL;
+ dht_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+
+ prev = cookie;
+ local = frame->local;
+ main_frame = local->main_frame;
+
+ /* TODO: Handle this case in lookup-optimize */
+ if (op_ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_CREATE_LINK_FAILED,
+ "link/file %s on %s failed", local->loc.path, prev->name);
+ }
+
+ if (local->linked == _gf_true) {
+ local->linked = _gf_false;
+ dht_linkfile_attr_heal(frame, this);
+ }
+
+ dht_rename_unlink(main_frame, this);
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
int
-dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent)
-{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- xlator_t *src_hashed = NULL;
- xlator_t *src_cached = NULL;
- xlator_t *dst_hashed = NULL;
- xlator_t *dst_cached = NULL;
- xlator_t *rename_subvol = NULL;
-
- local = frame->local;
- prev = cookie;
-
- src_hashed = local->src_hashed;
- src_cached = local->src_cached;
- dst_hashed = local->dst_hashed;
- dst_cached = local->dst_cached;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "rename on %s failed (%s)", prev->this->name,
- strerror (op_errno));
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- goto cleanup;
- }
-
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preoldparent, preoldparent, prev->this);
- dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this);
- dht_iatt_merge (this, &local->preparent, prenewparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postnewparent, prev->this);
-
- local->stbuf.ia_ino = local->loc.inode->ino;
-
- local->preoldparent.ia_ino = local->loc.parent->ino;
- local->postoldparent.ia_ino = local->loc.parent->ino;
-
- local->preparent.ia_ino = local->loc2.parent->ino;
- local->postparent.ia_ino = local->loc2.parent->ino;
-
- /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
- * is called. since rename has already happened on rename_subvol,
- * unlink should not be sent for oldpath (either linkfile or cached-file)
- * on rename_subvol. */
- if (src_cached == dst_cached)
- rename_subvol = src_cached;
- else
- rename_subvol = dst_hashed;
-
- /* TODO: delete files in background */
-
- if (src_cached != dst_hashed && src_cached != dst_cached)
- local->call_cnt++;
-
- if (src_hashed != rename_subvol && src_hashed != src_cached)
- local->call_cnt++;
-
- if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached)
- local->call_cnt++;
-
- if (local->call_cnt == 0)
- goto unwind;
-
- if (src_cached != dst_hashed && src_cached != dst_cached) {
- gf_log (this->name, GF_LOG_TRACE,
- "deleting old src datafile %s @ %s",
- local->loc.path, src_cached->name);
-
- STACK_WIND (frame, dht_rename_unlink_cbk,
- src_cached, src_cached->fops->unlink,
- &local->loc);
- }
-
- if (src_hashed != rename_subvol && src_hashed != src_cached) {
- gf_log (this->name, GF_LOG_TRACE,
- "deleting old src linkfile %s @ %s",
- local->loc.path, src_hashed->name);
-
- STACK_WIND (frame, dht_rename_unlink_cbk,
- src_hashed, src_hashed->fops->unlink,
- &local->loc);
- }
-
- if (dst_cached
- && (dst_cached != dst_hashed)
- && (dst_cached != src_cached)) {
- gf_log (this->name, GF_LOG_TRACE,
- "deleting old dst datafile %s @ %s",
- local->loc2.path, dst_cached->name);
-
- STACK_WIND (frame, dht_rename_unlink_cbk,
- dst_cached, dst_cached->fops->unlink,
- &local->loc2);
- }
- return 0;
+dht_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *dst_cached = NULL;
+ call_frame_t *link_frame = NULL;
+ dht_local_t *link_local = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ src_cached = local->src_cached;
+ dst_hashed = local->dst_hashed;
+ dst_cached = local->dst_cached;
+
+ if (local->linked == _gf_true)
+ FRAME_SU_UNDO(frame, dht_local_t);
+
+ /* It is a critical failure iff we fail to rename the cached file
+ * if the rename of the linkto failed, it is not a critical failure,
+ * and we do not want to lose the created hard link for the new
+ * name as that could have been read by other clients.
+ *
+ * NOTE: If another client is attempting the same oldname -> newname
+ * rename, and finds both file names as existing, and are hard links
+ * to each other, then FUSE would send in an unlink for oldname. In
+ * this time duration if we treat the linkto as a critical error and
+ * unlink the newname we created, we would have effectively lost the
+ * file to rename operations.
+ *
+ * Repercussions of treating this as a non-critical error is that
+ * we could leave behind a stale linkto file and/or not create the new
+ * linkto file, the second case would be rectified by a subsequent
+ * lookup, the first case by a rebalance, like for all stale linkto
+ * files */
+
+ if (op_ret == -1) {
+ /* Critical failure: unable to rename the cached file */
+ if (prev == src_cached) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_RENAME_FAILED,
+ "%s: Rename on %s failed, (gfid = %s) ", local->loc.path,
+ prev->name,
+ local->loc.inode ? uuid_utoa(local->loc.inode->gfid) : "");
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ goto cleanup;
+ } else {
+ /* Non-critical failure, unable to rename the linkto
+ * file
+ */
+ gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED,
+ "%s: Rename (linkto file) on %s failed, "
+ "(gfid = %s) ",
+ local->loc.path, prev->name,
+ local->loc.inode ? uuid_utoa(local->loc.inode->gfid) : "");
+ }
+ }
+ if (xdata) {
+ if (!local->xattr)
+ local->xattr = dict_ref(xdata);
+ else
+ local->xattr = dict_copy_with_ref(xdata, local->xattr);
+ }
+
+ /* Merge attrs only from src_cached. In case there of src_cached !=
+ * dst_hashed, this ignores linkfile attrs. */
+ if (prev == src_cached) {
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->preoldparent, preoldparent);
+ dht_iatt_merge(this, &local->postoldparent, postoldparent);
+ dht_iatt_merge(this, &local->preparent, prenewparent);
+ dht_iatt_merge(this, &local->postparent, postnewparent);
+ }
+
+ /* Create the linkto file for the dst file */
+ if ((src_cached == dst_cached) && (dst_hashed != dst_cached)) {
+ link_frame = copy_frame(frame);
+ if (!link_frame) {
+ goto unlink;
+ }
-unwind:
- WIPE (&local->preoldparent);
- WIPE (&local->postoldparent);
- WIPE (&local->preparent);
- WIPE (&local->postparent);
+ /* fop value sent as maxvalue because it is not used
+ * anywhere in this case */
+ link_local = dht_local_init(link_frame, &local->loc2, NULL,
+ GF_FOP_MAXVALUE);
+ if (!link_local) {
+ goto unlink;
+ }
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent);
+ if (link_local->loc.inode)
+ inode_unref(link_local->loc.inode);
+ link_local->loc.inode = inode_ref(local->loc.inode);
+ link_local->main_frame = frame;
+ link_local->stbuf = local->stbuf;
+ gf_uuid_copy(link_local->gfid, local->loc.inode->gfid);
- return 0;
+ dht_linkfile_create(link_frame, dht_rename_links_create_cbk, this,
+ src_cached, dst_hashed, &link_local->loc);
+ return 0;
+ }
+
+unlink:
+
+ if (link_frame) {
+ DHT_STACK_DESTROY(link_frame);
+ }
+ dht_rename_unlink(frame, this);
+ return 0;
cleanup:
- dht_rename_cleanup (frame);
+ dht_rename_cleanup(frame);
- return 0;
+ return 0;
}
+int
+dht_do_rename(call_frame_t *frame)
+{
+ dht_local_t *local = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_cached = NULL;
+ xlator_t *this = NULL;
+ xlator_t *rename_subvol = NULL;
+
+ local = frame->local;
+ this = frame->this;
+
+ dst_hashed = local->dst_hashed;
+ dst_cached = local->dst_cached;
+ src_cached = local->src_cached;
+
+ if (src_cached == dst_cached)
+ rename_subvol = src_cached;
+ else
+ rename_subvol = dst_hashed;
+
+ if ((src_cached != dst_hashed) && (rename_subvol == dst_hashed)) {
+ DHT_MARKER_DONT_ACCOUNT(local->xattr_req);
+ }
+
+ if (rename_subvol == src_cached) {
+ DHT_CHANGELOG_TRACK_AS_RENAME(local->xattr_req, &local->loc,
+ &local->loc2);
+ }
+
+ gf_msg_trace(this->name, 0, "renaming %s => %s (%s)", local->loc.path,
+ local->loc2.path, rename_subvol->name);
+
+ if (local->linked == _gf_true)
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND_COOKIE(frame, dht_rename_cbk, rename_subvol, rename_subvol,
+ rename_subvol->fops->rename, &local->loc, &local->loc2,
+ local->xattr_req);
+ return 0;
+}
int
-dht_do_rename (call_frame_t *frame)
+dht_rename_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- dht_local_t *local = NULL;
- xlator_t *dst_hashed = NULL;
- xlator_t *src_cached = NULL;
- xlator_t *dst_cached = NULL;
- xlator_t *this = NULL;
- xlator_t *rename_subvol = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ local = frame->local;
+ prev = cookie;
- local = frame->local;
- this = frame->this;
+ if (op_ret == -1) {
+ gf_msg_debug(this->name, 0, "link/file on %s failed (%s)", prev->name,
+ strerror(op_errno));
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ local->added_link = _gf_false;
+ } else
+ dht_iatt_merge(this, &local->stbuf, stbuf);
- dst_hashed = local->dst_hashed;
- dst_cached = local->dst_cached;
- src_cached = local->src_cached;
+ if (local->op_ret == -1)
+ goto cleanup;
- if (src_cached == dst_cached)
- rename_subvol = src_cached;
- else
- rename_subvol = dst_hashed;
+ dht_do_rename(frame);
- gf_log (this->name, GF_LOG_TRACE,
- "renaming %s => %s (%s)",
- local->loc.path, local->loc2.path, rename_subvol->name);
+ return 0;
- STACK_WIND (frame, dht_rename_cbk,
- rename_subvol, rename_subvol->fops->rename,
- &local->loc, &local->loc2);
+cleanup:
+ dht_rename_cleanup(frame);
- return 0;
+ return 0;
}
-
int
-dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int this_call_cnt = 0;
-
-
- local = frame->local;
- prev = cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "link/file on %s failed (%s)",
- prev->this->name, strerror (op_errno));
- local->op_ret = -1;
- local->op_errno = op_errno;
- }
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- if (local->op_ret == -1)
- goto cleanup;
-
- dht_do_rename (frame);
- }
-
- return 0;
+dht_rename_linkto_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *src_cached = NULL;
+ dict_t *xattr = NULL;
+
+ local = frame->local;
+ DHT_MARK_FOP_INTERNAL(xattr);
+ prev = cookie;
+ src_cached = local->src_cached;
+
+ if (op_ret == -1) {
+ gf_msg_debug(this->name, 0, "link/file on %s failed (%s)", prev->name,
+ strerror(op_errno));
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ }
+
+ /* If linkto creation failed move to failure cleanup code,
+ * instead of continuing with creating the link file */
+ if (local->op_ret != 0) {
+ goto cleanup;
+ }
+
+ gf_msg_trace(this->name, 0, "link %s => %s (%s)", local->loc.path,
+ local->loc2.path, src_cached->name);
+ if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) {
+ DHT_MARKER_DONT_ACCOUNT(xattr);
+ }
+
+ local->added_link = _gf_true;
+
+ STACK_WIND_COOKIE(frame, dht_rename_link_cbk, src_cached, src_cached,
+ src_cached->fops->link, &local->loc, &local->loc2, xattr);
+
+ if (xattr)
+ dict_unref(xattr);
+
+ return 0;
cleanup:
- dht_rename_cleanup (frame);
+ dht_rename_cleanup(frame);
- return 0;
+ if (xattr)
+ dict_unref(xattr);
+
+ return 0;
}
+int
+dht_rename_unlink_links_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && (op_errno != ENOENT)) {
+ gf_msg_debug(this->name, 0, "unlink of %s on %s failed (%s)",
+ local->loc2.path, prev->name, strerror(op_errno));
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ }
+
+ if (local->op_ret == -1)
+ goto cleanup;
+
+ dht_do_rename(frame);
+
+ return 0;
+
+cleanup:
+ dht_rename_cleanup(frame);
+
+ return 0;
+}
int
-dht_rename_create_links (call_frame_t *frame)
-{
- dht_local_t *local = NULL;
- xlator_t *this = NULL;
- xlator_t *src_hashed = NULL;
- xlator_t *src_cached = NULL;
- xlator_t *dst_hashed = NULL;
- xlator_t *dst_cached = NULL;
- int call_cnt = 0;
-
-
- local = frame->local;
- this = frame->this;
-
- src_hashed = local->src_hashed;
- src_cached = local->src_cached;
- dst_hashed = local->dst_hashed;
- dst_cached = local->dst_cached;
-
- if (src_cached == dst_cached)
- goto nolinks;
-
- if (dst_hashed != src_hashed && dst_hashed != src_cached)
- call_cnt++;
-
- if (src_cached != dst_hashed)
- call_cnt++;
-
- local->call_cnt = call_cnt;
-
- if (dst_hashed != src_hashed && dst_hashed != src_cached) {
- gf_log (this->name, GF_LOG_TRACE,
- "linkfile %s @ %s => %s",
- local->loc.path, dst_hashed->name, src_cached->name);
- memcpy (local->gfid, local->loc.inode->gfid, 16);
- dht_linkfile_create (frame, dht_rename_links_cbk,
- src_cached, dst_hashed, &local->loc);
- }
-
- if (src_cached != dst_hashed) {
- gf_log (this->name, GF_LOG_TRACE,
- "link %s => %s (%s)", local->loc.path,
- local->loc2.path, src_cached->name);
- STACK_WIND (frame, dht_rename_links_cbk,
- src_cached, src_cached->fops->link,
- &local->loc, &local->loc2);
- }
+dht_rename_create_links(call_frame_t *frame)
+{
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ xlator_t *src_hashed = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *dst_cached = NULL;
+ int call_cnt = 0;
+ dict_t *xattr = NULL;
+
+ local = frame->local;
+ this = frame->this;
+
+ src_hashed = local->src_hashed;
+ src_cached = local->src_cached;
+ dst_hashed = local->dst_hashed;
+ dst_cached = local->dst_cached;
+
+ DHT_MARK_FOP_INTERNAL(xattr);
+
+ if (src_cached == dst_cached) {
+ dict_t *xattr_new = NULL;
+
+ if (dst_hashed == dst_cached)
+ goto nolinks;
+
+ xattr_new = dict_copy_with_ref(xattr, NULL);
+
+ gf_msg_trace(this->name, 0, "unlinking dst linkfile %s @ %s",
+ local->loc2.path, dst_hashed->name);
+
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
+ STACK_WIND_COOKIE(frame, dht_rename_unlink_links_cbk, dst_hashed,
+ dst_hashed, dst_hashed->fops->unlink, &local->loc2, 0,
+ xattr_new);
+
+ dict_unref(xattr_new);
+ if (xattr)
+ dict_unref(xattr);
+
+ return 0;
+ }
+
+ if (src_cached != dst_hashed) {
+ /* needed to create the link file */
+ call_cnt++;
+ if (dst_hashed != src_hashed)
+ /* needed to create the linkto file */
+ call_cnt++;
+ }
+
+ /* We should not have any failures post the link creation, as this
+ * introduces the newname into the namespace. Clients could have cached
+ * the existence of the newname and may start taking actions based on
+ * the same. Hence create the linkto first, and then attempt the link.
+ *
+ * NOTE: If another client is attempting the same oldname -> newname
+ * rename, and finds both file names as existing, and are hard links
+ * to each other, then FUSE would send in an unlink for oldname. In
+ * this time duration if we treat the linkto as a critical error and
+ * unlink the newname we created, we would have effectively lost the
+ * file to rename operations. */
+ if (dst_hashed != src_hashed && src_cached != dst_hashed) {
+ gf_msg_trace(this->name, 0, "linkfile %s @ %s => %s", local->loc.path,
+ dst_hashed->name, src_cached->name);
+
+ memcpy(local->gfid, local->loc.inode->gfid, 16);
+ dht_linkfile_create(frame, dht_rename_linkto_cbk, this, src_cached,
+ dst_hashed, &local->loc);
+ } else if (src_cached != dst_hashed) {
+ dict_t *xattr_new = NULL;
+
+ xattr_new = dict_copy_with_ref(xattr, NULL);
+
+ gf_msg_trace(this->name, 0, "link %s => %s (%s)", local->loc.path,
+ local->loc2.path, src_cached->name);
+ if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) {
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+ }
+
+ local->added_link = _gf_true;
+
+ STACK_WIND_COOKIE(frame, dht_rename_link_cbk, src_cached, src_cached,
+ src_cached->fops->link, &local->loc, &local->loc2,
+ xattr_new);
+
+ dict_unref(xattr_new);
+ }
nolinks:
- if (!call_cnt) {
- /* skip to next step */
- dht_do_rename (frame);
- }
+ if (!call_cnt) {
+ /* skip to next step */
+ dht_do_rename(frame);
+ }
+ if (xattr)
+ dict_unref(xattr);
+
+ return 0;
+}
- return 0;
+int
+dht_rename_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int call_cnt = 0;
+ dht_conf_t *conf = NULL;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char gfid_server[GF_UUID_BUF_SIZE] = {0};
+ int child_index = -1;
+ gf_boolean_t is_src = _gf_false;
+ loc_t *loc = NULL;
+
+ child_index = (long)cookie;
+
+ local = frame->local;
+ conf = this->private;
+
+ is_src = (child_index == 0);
+ if (is_src)
+ loc = &local->loc;
+ else
+ loc = &local->loc2;
+
+ if (op_ret >= 0) {
+ if (is_src)
+ local->src_cached = dht_subvol_get_cached(this, local->loc.inode);
+ else {
+ if (loc->inode)
+ gf_uuid_unparse(loc->inode->gfid, gfid_local);
+
+ gf_msg_debug(this->name, 0,
+ "dst_cached before lookup: %s, "
+ "(path:%s)(gfid:%s),",
+ local->loc2.path,
+ local->dst_cached ? local->dst_cached->name : NULL,
+ local->dst_cached ? gfid_local : NULL);
+
+ local->dst_cached = dht_subvol_get_cached(this,
+ local->loc2_copy.inode);
+
+ gf_uuid_unparse(stbuf->ia_gfid, gfid_local);
+
+ gf_msg_debug(this->name, GF_LOG_WARNING,
+ "dst_cached after lookup: %s, "
+ "(path:%s)(gfid:%s)",
+ local->loc2.path,
+ local->dst_cached ? local->dst_cached->name : NULL,
+ local->dst_cached ? gfid_local : NULL);
+
+ if ((local->loc2.inode == NULL) ||
+ gf_uuid_compare(stbuf->ia_gfid, local->loc2.inode->gfid)) {
+ if (local->loc2.inode != NULL) {
+ inode_unlink(local->loc2.inode, local->loc2.parent,
+ local->loc2.name);
+ inode_unref(local->loc2.inode);
+ }
+
+ local->loc2.inode = inode_link(local->loc2_copy.inode,
+ local->loc2_copy.parent,
+ local->loc2_copy.name, stbuf);
+ gf_uuid_copy(local->loc2.gfid, stbuf->ia_gfid);
+ }
+ }
+ }
+
+ if (op_ret < 0) {
+ if (is_src) {
+ /* The meaning of is_linkfile is overloaded here. For locking
+ * to work properly both rebalance and rename should acquire
+ * lock on datafile. The reason for sending this lookup is to
+ * find out whether we've acquired a lock on data file.
+ * Between the lookup before rename and this rename, the
+ * file could be migrated by a rebalance process and now this
+ * file this might be a linkto file. We verify that by sending
+ * this lookup. However, if this lookup fails we cannot really
+ * say whether we've acquired lock on a datafile or linkto file.
+ * So, we act conservatively and _assume_
+ * that this is a linkfile and fail the rename operation.
+ */
+ local->is_linkfile = _gf_true;
+ local->op_errno = op_errno;
+ } else {
+ if (local->dst_cached)
+ gf_msg_debug(this->name, op_errno,
+ "file %s (gfid:%s) was present "
+ "(hashed-subvol=%s, "
+ "cached-subvol=%s) before rename,"
+ " but lookup failed",
+ local->loc2.path,
+ uuid_utoa(local->loc2.inode->gfid),
+ local->dst_hashed->name, local->dst_cached->name);
+ if (dht_inode_missing(op_errno))
+ local->dst_cached = NULL;
+ }
+ } else if (is_src && xattr &&
+ check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) {
+ local->is_linkfile = _gf_true;
+ /* Found linkto file instead of data file, passdown ENOENT
+ * based on the above comment */
+ local->op_errno = ENOENT;
+ }
+
+ if (!local->is_linkfile && (op_ret >= 0) &&
+ gf_uuid_compare(loc->gfid, stbuf->ia_gfid)) {
+ gf_uuid_unparse(loc->gfid, gfid_local);
+ gf_uuid_unparse(stbuf->ia_gfid, gfid_server);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "path:%s, received a different gfid, local_gfid= %s"
+ " server_gfid: %s",
+ local->loc.path, gfid_local, gfid_server);
+
+ /* Will passdown ENOENT anyway since the file we sent on
+ * rename is replaced with a different file */
+ local->op_errno = ENOENT;
+ /* Since local->is_linkfile is used here to detect failure,
+ * marking this to true */
+ local->is_linkfile = _gf_true;
+ }
+
+ call_cnt = dht_frame_return(frame);
+ if (is_last_call(call_cnt)) {
+ if (local->is_linkfile) {
+ local->op_ret = -1;
+ goto fail;
+ }
+
+ dht_rename_create_links(frame);
+ }
+
+ return 0;
+fail:
+ dht_rename_unlock(frame, this);
+ return 0;
}
+int
+dht_rename_file_lock1_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+ loc_t *loc = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+ "protecting namespace of %s failed"
+ "rename (%s:%s:%s %s:%s:%s)",
+ local->current == &local->lock[0] ? local->loc.path
+ : local->loc2.path,
+ local->loc.path, src_gfid, local->src_hashed->name,
+ local->loc2.path, dst_gfid,
+ local->dst_hashed ? local->dst_hashed->name : NULL);
+
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ if (local->current == &local->lock[0]) {
+ loc = &local->loc2;
+ subvol = local->dst_hashed;
+ local->current = &local->lock[1];
+ } else {
+ loc = &local->loc;
+ subvol = local->src_hashed;
+ local->current = &local->lock[0];
+ }
+
+ ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
+ dht_rename_lock_cbk);
+ if (ret < 0) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ return 0;
+err:
+ /* No harm in calling an extra unlock */
+ dht_rename_unlock(frame, this);
+ return 0;
+}
+
+int32_t
+dht_rename_file_protect_namespace(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+ loc_t *loc = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+ "acquiring inodelk failed "
+ "rename (%s:%s:%s %s:%s:%s)",
+ local->loc.path, src_gfid, local->src_cached->name,
+ local->loc2.path, dst_gfid,
+ local->dst_cached ? local->dst_cached->name : NULL);
+
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ goto err;
+ }
+
+ /* Locks on src and dst needs to ordered which otherwise might cause
+ * deadlocks when rename (src, dst) and rename (dst, src) is done from
+ * two different clients
+ */
+ ret = dht_order_rename_lock(frame, &loc, &subvol);
+ if (ret) {
+ local->op_errno = ENOMEM;
+ goto err;
+ }
+
+ ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
+ dht_rename_file_lock1_cbk);
+ if (ret < 0) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ return 0;
+
+err:
+ /* Its fine to call unlock even when no locks are acquired, as we check
+ * for lock->locked before winding a unlock call.
+ */
+ dht_rename_unlock(frame, this);
+
+ return 0;
+}
+
+int32_t
+dht_rename_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+ dict_t *xattr_req = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *subvol = NULL;
+ dht_lock_t *lock = NULL;
+
+ local = frame->local;
+ conf = this->private;
+
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+ "protecting namespace of %s failed. "
+ "rename (%s:%s:%s %s:%s:%s)",
+ local->current == &local->lock[0] ? local->loc.path
+ : local->loc2.path,
+ local->loc.path, src_gfid, local->src_hashed->name,
+ local->loc2.path, dst_gfid,
+ local->dst_hashed ? local->dst_hashed->name : NULL);
+
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ goto done;
+ }
+
+ xattr_req = dict_new();
+ if (xattr_req == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto done;
+ }
+
+ op_ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto done;
+ }
+
+ /* dst_cached might've changed. This normally happens for two reasons:
+ * 1. rebalance migrated dst
+ * 2. Another parallel rename was done overwriting dst
+ *
+ * Doing a lookup on local->loc2 when dst exists, but is associated
+ * with a different gfid will result in an ESTALE error. So, do a fresh
+ * lookup with a new inode on dst-path and handle change of dst-cached
+ * in the cbk. Also, to identify dst-cached changes we do a lookup on
+ * "this" rather than the subvol.
+ */
+ loc_copy(&local->loc2_copy, &local->loc2);
+ inode_unref(local->loc2_copy.inode);
+ local->loc2_copy.inode = inode_new(local->loc.inode->table);
+
+ /* Why not use local->lock.locks[?].loc for lookup post lock phase
+ * ---------------------------------------------------------------
+ * "layout.parent_layout.locks[?].loc" does not have the name and pargfid
+ * populated.
+ * Reason: If we had populated the name and pargfid, server might
+ * resolve to a successful lookup even if there is a file with same name
+ * with a different gfid(unlink & create) as server does name based
+ * resolution on first priority. And this can result in operating on a
+ * different inode entirely.
+ *
+ * Now consider a scenario where source file was renamed by some other
+ * client to a new name just before this lock was granted. So if a
+ * lookup would be done on local->lock[0].layout.parent_layout.locks[?].loc,
+ * server will send success even if the entry was renamed (since server will
+ * do a gfid based resolution). So once a lock is granted, make sure the
+ * file exists with the name that the client requested with.
+ * */
+
+ local->call_cnt = 2;
+ for (i = 0; i < 2; i++) {
+ if (i == 0) {
+ lock = local->rename_inodelk_backward_compatible[0];
+ if (gf_uuid_compare(local->loc.gfid, lock->loc.gfid) == 0)
+ subvol = lock->xl;
+ else {
+ lock = local->rename_inodelk_backward_compatible[1];
+ subvol = lock->xl;
+ }
+ } else {
+ subvol = this;
+ }
+
+ STACK_WIND_COOKIE(frame, dht_rename_lookup_cbk, (void *)(long)i, subvol,
+ subvol->fops->lookup,
+ (i == 0) ? &local->loc : &local->loc2_copy,
+ xattr_req);
+ }
+
+ dict_unref(xattr_req);
+ return 0;
+
+done:
+ /* Its fine to call unlock even when no locks are acquired, as we check
+ * for lock->locked before winding a unlock call.
+ */
+ dht_rename_unlock(frame, this);
+
+ if (xattr_req)
+ dict_unref(xattr_req);
+
+ return 0;
+}
+
+int
+dht_rename_lock(call_frame_t *frame)
+{
+ dht_local_t *local = NULL;
+ int count = 1, ret = -1;
+ dht_lock_t **lk_array = NULL;
+
+ local = frame->local;
+
+ if (local->dst_cached)
+ count++;
+
+ lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
+ if (lk_array == NULL)
+ goto err;
+
+ lk_array[0] = dht_lock_new(frame->this, local->src_cached, &local->loc,
+ F_WRLCK, DHT_FILE_MIGRATE_DOMAIN, NULL,
+ FAIL_ON_ANY_ERROR);
+ if (lk_array[0] == NULL)
+ goto err;
+
+ if (local->dst_cached) {
+ /* dst might be removed by the time inodelk reaches bricks,
+ * which can result in ESTALE errors. POSIX imposes no
+ * restriction for dst to be present for renames to be
+ * successful. So, we'll ignore ESTALE errors. As far as
+ * synchronization on dst goes, we'll achieve the same by
+ * holding entrylk on parent directory of dst in the namespace
+ * of basename(dst). Also, there might not be quorum in cluster
+ * xlators like EC/disperse on errno, in which case they return
+ * EIO. For eg., in a disperse (4 + 2), 3 might return success
+ * and three might return ESTALE. Disperse, having no Quorum
+ * unwinds inodelk with EIO. So, ignore EIO too.
+ */
+ lk_array[1] = dht_lock_new(frame->this, local->dst_cached, &local->loc2,
+ F_WRLCK, DHT_FILE_MIGRATE_DOMAIN, NULL,
+ IGNORE_ENOENT_ESTALE_EIO);
+ if (lk_array[1] == NULL)
+ goto err;
+ }
+
+ local->rename_inodelk_backward_compatible = lk_array;
+ local->rename_inodelk_bc_count = count;
+
+ /* retaining inodelks for the sake of backward compatibility. Please
+ * make sure to remove this inodelk once all of 3.10, 3.12 and 3.13
+ * reach EOL. Better way of getting synchronization would be to acquire
+ * entrylks on src and dst parent directories in the namespace of
+ * basenames of src and dst
+ */
+ ret = dht_blocking_inodelk(frame, lk_array, count,
+ dht_rename_file_protect_namespace);
+ if (ret < 0) {
+ local->rename_inodelk_backward_compatible = NULL;
+ local->rename_inodelk_bc_count = 0;
+ goto err;
+ }
+
+ return 0;
+err:
+ if (lk_array != NULL) {
+ int tmp_count = 0, i = 0;
+
+ for (i = 0; (i < count) && (lk_array[i]); i++, tmp_count++)
+ ;
+
+ dht_lock_array_free(lk_array, tmp_count);
+ GF_FREE(lk_array);
+ }
+
+ return -1;
+}
int
-dht_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
-{
- xlator_t *src_cached = NULL;
- xlator_t *src_hashed = NULL;
- xlator_t *dst_cached = NULL;
- xlator_t *dst_hashed = NULL;
- int op_errno = -1;
- int ret = -1;
- dht_local_t *local = NULL;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (oldloc, err);
- VALIDATE_OR_GOTO (newloc, err);
-
- src_hashed = dht_subvol_get_hashed (this, oldloc);
- if (!src_hashed) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- oldloc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- src_cached = dht_subvol_get_cached (this, oldloc->inode);
- if (!src_cached) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no cached subvolume for path=%s", oldloc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- dst_hashed = dht_subvol_get_hashed (this, newloc);
- if (!dst_hashed) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- newloc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- if (newloc->inode)
- dst_cached = dht_subvol_get_cached (this, newloc->inode);
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- ret = loc_copy (&local->loc, oldloc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- ret = loc_copy (&local->loc2, newloc);
- if (ret == -1) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- local->src_hashed = src_hashed;
- local->src_cached = src_cached;
- local->dst_hashed = dst_hashed;
- local->dst_cached = dst_cached;
-
- gf_log (this->name, GF_LOG_TRACE,
- "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)",
- oldloc->path, src_hashed->name, src_cached->name,
- newloc->path, dst_hashed->name,
- dst_cached ? dst_cached->name : "<nul>");
-
- if (IA_ISDIR (oldloc->inode->ia_type)) {
- dht_rename_dir (frame, this);
- } else {
- local->op_ret = 0;
- dht_rename_create_links (frame);
- }
-
- return 0;
+dht_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ xlator_t *src_cached = NULL;
+ xlator_t *src_hashed = NULL;
+ xlator_t *dst_cached = NULL;
+ xlator_t *dst_hashed = NULL;
+ int op_errno = -1;
+ int ret = -1;
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ char newgfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(oldloc, err);
+ VALIDATE_OR_GOTO(newloc, err);
+
+ gf_uuid_unparse(oldloc->inode->gfid, gfid);
+
+ src_hashed = dht_subvol_get_hashed(this, oldloc);
+ if (!src_hashed) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED,
+ "No hashed subvolume in layout for path=%s,"
+ "(gfid = %s)",
+ oldloc->path, gfid);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ src_cached = dht_subvol_get_cached(this, oldloc->inode);
+ if (!src_cached) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED,
+ "No cached subvolume for path = %s,"
+ "(gfid = %s)",
+ oldloc->path, gfid);
+
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ dst_hashed = dht_subvol_get_hashed(this, newloc);
+ if (!dst_hashed) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED,
+ "No hashed subvolume in layout for path=%s", newloc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (newloc->inode)
+ dst_cached = dht_subvol_get_cached(this, newloc->inode);
+
+ local = dht_local_init(frame, oldloc, NULL, GF_FOP_RENAME);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ /* cached_subvol will be set from dht_local_init, reset it to NULL,
+ as the logic of handling rename is different */
+ local->cached_subvol = NULL;
+
+ ret = loc_copy(&local->loc2, newloc);
+ if (ret == -1) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->src_hashed = src_hashed;
+ local->src_cached = src_cached;
+ local->dst_hashed = dst_hashed;
+ local->dst_cached = dst_cached;
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (newloc->inode)
+ gf_uuid_unparse(newloc->inode->gfid, newgfid);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_INFO,
+ "renaming %s (%s) (hash=%s/cache=%s) => %s (%s) "
+ "(hash=%s/cache=%s) ",
+ oldloc->path, gfid, src_hashed->name, src_cached->name, newloc->path,
+ newloc->inode ? newgfid : NULL, dst_hashed->name,
+ dst_cached ? dst_cached->name : "<nul>");
+
+ if (IA_ISDIR(oldloc->inode->ia_type)) {
+ dht_rename_dir(frame, this);
+ } else {
+ local->op_ret = 0;
+ ret = dht_rename_lock(frame);
+ if (ret < 0) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
- return 0;
+ return 0;
+}
+
+int
+dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ gf_boolean_t free_xdata = _gf_false;
+
+ /* Just a pass through */
+ if (!IA_ISDIR(oldloc->inode->ia_type)) {
+ if (!xdata) {
+ free_xdata = _gf_true;
+ }
+ DHT_CHANGELOG_TRACK_AS_RENAME(xdata, oldloc, newloc);
+ }
+ default_rename(frame, this, oldloc, newloc, xdata);
+ if (free_xdata && xdata) {
+ dict_unref(xdata);
+ xdata = NULL;
+ }
+ return 0;
}
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index 74c89a4fdde..3e24065227c 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -1,558 +1,2600 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include "dht-lock.h"
+
+#define DHT_SET_LAYOUT_RANGE(layout, i, srt, chunk, path) \
+ do { \
+ layout->list[i].start = srt; \
+ layout->list[i].stop = srt + chunk - 1; \
+ layout->list[i].commit_hash = layout->commit_hash; \
+ \
+ gf_msg_trace(this->name, 0, \
+ "gave fix: 0x%x - 0x%x, with commit-hash 0x%x" \
+ " on %s for %s", \
+ layout->list[i].start, layout->list[i].stop, \
+ layout->list[i].commit_hash, \
+ layout->list[i].xlator->name, path); \
+ } while (0)
+
+#define DHT_RESET_LAYOUT_RANGE(layout) \
+ do { \
+ int cnt = 0; \
+ for (cnt = 0; cnt < layout->cnt; cnt++) { \
+ layout->list[cnt].start = 0; \
+ layout->list[cnt].stop = 0; \
+ } \
+ } while (0)
+
+static int
+dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout,
+ gf_boolean_t newdir, dht_selfheal_layout_t healer,
+ dht_need_heal_t should_heal);
+
+static uint32_t
+dht_overlap_calc(dht_layout_t *old, int o, dht_layout_t *new, int n)
+{
+ if (o >= old->cnt || n >= new->cnt)
+ return 0;
-#include "glusterfs.h"
-#include "xlator.h"
-#include "dht-common.h"
+ if (old->list[o].err > 0 || new->list[n].err > 0)
+ return 0;
+ if (old->list[o].start == old->list[o].stop) {
+ return 0;
+ }
-int
-dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret)
-{
- dht_local_t *local = NULL;
+ if (new->list[n].start == new->list[n].stop) {
+ return 0;
+ }
+ if ((old->list[o].start > new->list[n].stop) ||
+ (old->list[o].stop < new->list[n].start))
+ return 0;
- local = frame->local;
- local->selfheal.dir_cbk (frame, NULL, frame->this, ret,
- local->op_errno);
+ return min(old->list[o].stop, new->list[n].stop) -
+ max(old->list[o].start, new->list[n].start) + 1;
+}
- return 0;
+int
+dht_selfheal_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ DHT_STACK_DESTROY(frame);
+ return 0;
}
+int
+dht_selfheal_dir_finish(call_frame_t *frame, xlator_t *this, int ret,
+ int invoke_cbk)
+{
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ int lock_count = 0;
+
+ local = frame->local;
+
+ /* Unlock entrylk */
+ dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns);
+
+ /* Unlock inodelk */
+ lock_count = dht_lock_count(local->lock[0].ns.parent_layout.locks,
+ local->lock[0].ns.parent_layout.lk_count);
+ if (lock_count == 0)
+ goto done;
+
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL) {
+ goto done;
+ }
+
+ lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+ lock_frame->root->op);
+ if (lock_local == NULL) {
+ goto done;
+ }
+
+ lock_local->lock[0].ns.parent_layout.locks = local->lock[0]
+ .ns.parent_layout.locks;
+ lock_local->lock[0]
+ .ns.parent_layout.lk_count = local->lock[0].ns.parent_layout.lk_count;
+
+ local->lock[0].ns.parent_layout.locks = NULL;
+ local->lock[0].ns.parent_layout.lk_count = 0;
+
+ dht_unlock_inodelk(lock_frame, lock_local->lock[0].ns.parent_layout.locks,
+ lock_local->lock[0].ns.parent_layout.lk_count,
+ dht_selfheal_unlock_cbk);
+ lock_frame = NULL;
+
+done:
+ if (invoke_cbk)
+ local->selfheal.dir_cbk(frame, NULL, frame->this, ret, local->op_errno,
+ NULL);
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
+
+ return 0;
+}
int
-dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno)
+dht_refresh_layout_done(call_frame_t *frame)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- xlator_t *subvol = NULL;
- int i = 0;
- dht_layout_t *layout = NULL;
- int err = 0;
- int this_call_cnt = 0;
+ int ret = -1;
+ dht_layout_t *refreshed = NULL, *heal = NULL;
+ dht_local_t *local = NULL;
+ dht_need_heal_t should_heal = NULL;
+ dht_selfheal_layout_t healer = NULL;
- local = frame->local;
- layout = local->selfheal.layout;
- prev = cookie;
- subvol = prev->this;
+ local = frame->local;
- if (op_ret == 0)
- err = 0;
- else
- err = op_errno;
+ refreshed = local->selfheal.refreshed_layout;
+ heal = local->selfheal.layout;
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].xlator == subvol) {
- layout->list[i].err = err;
- break;
- }
- }
+ healer = local->selfheal.healer;
+ should_heal = local->selfheal.should_heal;
- this_call_cnt = dht_frame_return (frame);
+ ret = dht_layout_sort(refreshed);
+ if (ret == -1) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SORT_FAILED, NULL);
+ goto err;
+ }
- if (is_last_call (this_call_cnt)) {
- dht_selfheal_dir_finish (frame, this, 0);
- }
+ if (should_heal(frame, &heal, &refreshed)) {
+ healer(frame, &local->loc, heal);
+ } else {
+ local->selfheal.layout = NULL;
+ local->selfheal.refreshed_layout = NULL;
+ local->selfheal.layout = refreshed;
- return 0;
+ dht_layout_unref(frame->this, heal);
+
+ dht_selfheal_dir_finish(frame, frame->this, 0, 1);
+ }
+
+ return 0;
+
+err:
+ dht_selfheal_dir_finish(frame, frame->this, -1, 1);
+ return 0;
}
+int
+dht_refresh_layout_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, err);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, err);
+ GF_VALIDATE_OR_GOTO("dht", this->private, err);
+
+ local = frame->local;
+ prev = cookie;
+
+ layout = local->selfheal.refreshed_layout;
+
+ LOCK(&frame->lock);
+ {
+ op_ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
+
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ local->op_errno = op_errno;
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_FILE_LOOKUP_FAILED, "path=%s", local->loc.path,
+ "name=%s", prev->name, "gfid=%s", gfid, NULL);
+
+ goto unlock;
+ }
+
+ local->op_ret = 0;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt)) {
+ if (local->op_ret == 0) {
+ local->refresh_layout_done(frame);
+ } else {
+ goto err;
+ }
+ }
+
+ return 0;
+
+err:
+ if (local) {
+ local->refresh_layout_unlock(frame, this, -1, 1);
+ }
+ return 0;
+}
int
-dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
- dht_layout_t *layout, int i)
+dht_refresh_layout(call_frame_t *frame)
{
- xlator_t *subvol = NULL;
- dict_t *xattr = NULL;
- int ret = 0;
- xlator_t *this = NULL;
- int32_t *disk_layout = NULL;
+ int call_cnt = 0;
+ int i = 0, ret = -1;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+
+ this = frame->this;
+ conf = this->private;
+ local = frame->local;
+
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+ local->op_ret = -1;
+
+ if (local->selfheal.refreshed_layout) {
+ dht_layout_unref(this, local->selfheal.refreshed_layout);
+ local->selfheal.refreshed_layout = NULL;
+ }
+
+ local->selfheal.refreshed_layout = dht_layout_new(this,
+ conf->subvolume_cnt);
+ if (!local->selfheal.refreshed_layout) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+ "path=%s", local->loc.path, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+
+ if (local->xattr != NULL) {
+ dict_del(local->xattr, conf->xattr_name);
+ }
+
+ if (local->xattr_req == NULL) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ local->xattr_req = dict_new();
+ if (local->xattr_req == NULL) {
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "path=%s", local->loc.path, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+ }
+ if (dict_get(local->xattr_req, conf->xattr_name) == 0) {
+ ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
+ if (ret)
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", local->loc.path, "key=%s", conf->xattr_name,
+ NULL);
+ }
- subvol = layout->list[i].xlator;
- this = frame->this;
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_refresh_layout_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
+ }
- xattr = get_new_dict ();
- if (!xattr) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
+ return 0;
- ret = dht_disk_layout_extract (this, layout, i, &disk_layout);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to extract disk layout");
- goto err;
- }
+out:
+ if (local) {
+ local->refresh_layout_unlock(frame, this, -1, 1);
+ }
+ return 0;
+}
- ret = dict_set_bin (xattr, "trusted.glusterfs.dht",
- disk_layout, 4 * 4);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set xattr dictionary");
- goto err;
- }
- disk_layout = NULL;
+int32_t
+dht_selfheal_layout_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
- gf_log (this->name, GF_LOG_TRACE,
- "setting hash range %u - %u (type %d) on subvolume %s for %s",
- layout->list[i].start, layout->list[i].stop,
- layout->type, subvol->name, loc->path);
+ local = frame->local;
- dict_ref (xattr);
+ if (!local) {
+ goto err;
+ }
- STACK_WIND (frame, dht_selfheal_dir_xattr_cbk,
- subvol, subvol->fops->setxattr,
- loc, xattr, 0);
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ goto err;
+ }
- dict_unref (xattr);
+ local->refresh_layout_unlock = dht_selfheal_dir_finish;
+ local->refresh_layout_done = dht_refresh_layout_done;
- return 0;
+ dht_refresh_layout(frame);
+ return 0;
err:
- if (xattr)
- dict_destroy (xattr);
+ dht_selfheal_dir_finish(frame, this, -1, 1);
+ return 0;
+}
- if (disk_layout)
- GF_FREE (disk_layout);
+gf_boolean_t
+dht_should_heal_layout(call_frame_t *frame, dht_layout_t **heal,
+ dht_layout_t **ondisk)
+{
+ gf_boolean_t fixit = _gf_true;
+ dht_local_t *local = NULL;
+ int heal_missing_dirs = 0;
+
+ local = frame->local;
+
+ if ((heal == NULL) || (*heal == NULL) || (ondisk == NULL) ||
+ (*ondisk == NULL))
+ goto out;
+
+ dht_layout_anomalies(
+ frame->this, &local->loc, *ondisk, &local->selfheal.hole_cnt,
+ &local->selfheal.overlaps_cnt, &local->selfheal.missing_cnt,
+ &local->selfheal.down, &local->selfheal.misc, NULL);
+
+ /* Directories might've been created as part of this self-heal. We've to
+ * sync non-layout xattrs and set range 0-0 on new directories
+ */
+ heal_missing_dirs = local->selfheal.force_mkdir
+ ? local->selfheal.force_mkdir
+ : dht_layout_missing_dirs(*heal);
+
+ if ((local->selfheal.hole_cnt == 0) &&
+ (local->selfheal.overlaps_cnt == 0) && heal_missing_dirs) {
+ dht_layout_t *tmp = NULL;
+
+ /* Just added a brick and need to set 0-0 range on this brick.
+ * But ondisk layout is well-formed. So, swap layouts "heal" and
+ * "ondisk". Now "ondisk" layout will be used for healing
+ * xattrs. If there are any non-participating subvols in
+ * "ondisk" layout, dht_selfheal_dir_xattr_persubvol will set
+ * 0-0 and non-layout xattrs. This way we won't end up in
+ * "corrupting" already set and well-formed "ondisk" layout.
+ */
+ tmp = *heal;
+ *heal = *ondisk;
+ *ondisk = tmp;
+
+ /* Current selfheal code, heals non-layout xattrs only after
+ * an add-brick. In fact non-layout xattrs are considered as
+ * secondary citizens which are healed only if layout xattrs
+ * need to be healed. This is wrong, since for eg., quota can be
+ * set when layout is well-formed, but a node is down. Also,
+ * just for healing non-layout xattrs, we don't need locking.
+ * This issue is _NOT FIXED_ by this patch.
+ */
+ }
+
+ fixit = (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt ||
+ heal_missing_dirs);
- dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this,
- -1, ENOMEM);
- return 0;
+out:
+ return fixit;
}
+int
+dht_layout_span(dht_layout_t *layout)
+{
+ int i = 0, count = 0;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err)
+ continue;
+
+ if (layout->list[i].start != layout->list[i].stop)
+ count++;
+ }
+
+ return count;
+}
int
-dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
+dht_decommissioned_bricks_in_layout(xlator_t *this, dht_layout_t *layout)
{
- dht_local_t *local = NULL;
- int missing_xattr = 0;
- int i = 0;
- xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ int count = 0, i = 0, j = 0;
- local = frame->local;
- this = frame->this;
+ if ((this == NULL) || (layout == NULL))
+ goto out;
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].err != -1 || !layout->list[i].stop) {
- /* err != -1 would mean xattr present on the directory
- * or the directory is itself non existant.
- * !layout->list[i].stop would mean layout absent
- */
- continue;
- }
- missing_xattr++;
- }
+ conf = this->private;
- gf_log (this->name, GF_LOG_TRACE,
- "%d subvolumes missing xattr for %s",
- missing_xattr, loc->path);
+ for (i = 0; i < layout->cnt; i++) {
+ for (j = 0; j < conf->subvolume_cnt; j++) {
+ if (conf->decommissioned_bricks[j] &&
+ conf->decommissioned_bricks[j] == layout->list[i].xlator) {
+ count++;
+ }
+ }
+ }
- if (missing_xattr == 0) {
- dht_selfheal_dir_finish (frame, this, 0);
- return 0;
- }
+out:
+ return count;
+}
+
+dht_distribution_type_t
+dht_distribution_type(xlator_t *this, dht_layout_t *layout)
+{
+ dht_distribution_type_t type = GF_DHT_EQUAL_DISTRIBUTION;
+ int i = 0;
+ uint32_t start_range = 0, range = 0, diff = 0;
+
+ if ((this == NULL) || (layout == NULL) || (layout->cnt < 1)) {
+ goto out;
+ }
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (start_range == 0) {
+ start_range = layout->list[i].stop - layout->list[i].start;
+ continue;
+ }
+
+ range = layout->list[i].stop - layout->list[i].start;
+ diff = (range >= start_range) ? range - start_range
+ : start_range - range;
+
+ if ((range != 0) && (diff > layout->cnt)) {
+ type = GF_DHT_WEIGHTED_DISTRIBUTION;
+ break;
+ }
+ }
+
+out:
+ return type;
+}
+
+gf_boolean_t
+dht_should_fix_layout(call_frame_t *frame, dht_layout_t **inmem,
+ dht_layout_t **ondisk)
+{
+ gf_boolean_t fixit = _gf_true;
+
+ dht_local_t *local = NULL;
+ int layout_span = 0;
+ int decommissioned_bricks = 0;
+ dht_conf_t *conf = NULL;
+ dht_distribution_type_t inmem_dist_type = 0;
+ dht_distribution_type_t ondisk_dist_type = 0;
+
+ conf = frame->this->private;
+
+ local = frame->local;
+
+ if ((inmem == NULL) || (*inmem == NULL) || (ondisk == NULL) ||
+ (*ondisk == NULL))
+ goto out;
+
+ dht_layout_anomalies(frame->this, &local->loc, *ondisk,
+ &local->selfheal.hole_cnt,
+ &local->selfheal.overlaps_cnt, NULL,
+ &local->selfheal.down, &local->selfheal.misc, NULL);
+
+ if (local->selfheal.down || local->selfheal.misc) {
+ fixit = _gf_false;
+ goto out;
+ }
+
+ if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt)
+ goto out;
+
+ /* If commit hashes are being updated, let it through */
+ if ((*inmem)->commit_hash != (*ondisk)->commit_hash)
+ goto out;
- local->call_cnt = missing_xattr;
+ layout_span = dht_layout_span(*ondisk);
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].err != -1 || !layout->list[i].stop)
- continue;
+ decommissioned_bricks = dht_decommissioned_bricks_in_layout(frame->this,
+ *ondisk);
+ inmem_dist_type = dht_distribution_type(frame->this, *inmem);
+ ondisk_dist_type = dht_distribution_type(frame->this, *ondisk);
- dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i);
+ if ((decommissioned_bricks == 0) &&
+ (layout_span ==
+ (conf->subvolume_cnt - conf->decommission_subvols_cnt)) &&
+ (inmem_dist_type == ondisk_dist_type))
+ fixit = _gf_false;
- if (--missing_xattr == 0)
- break;
- }
- return 0;
+out:
+
+ return fixit;
}
+static int
+dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout,
+ gf_boolean_t newdir, dht_selfheal_layout_t healer,
+ dht_need_heal_t should_heal)
+{
+ dht_local_t *local = NULL;
+ int count = 1, ret = -1, i = 0;
+ dht_lock_t **lk_array = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *tmp = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err);
+
+ local = frame->local;
+
+ conf = frame->this->private;
+
+ local->selfheal.healer = healer;
+ local->selfheal.should_heal = should_heal;
+
+ tmp = local->selfheal.layout;
+ local->selfheal.layout = dht_layout_ref(frame->this, layout);
+ dht_layout_unref(frame->this, tmp);
+
+ if (!newdir) {
+ count = conf->subvolume_cnt;
+
+ lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char);
+ if (lk_array == NULL) {
+ gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+ gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+ "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL);
+ goto err;
+ }
+
+ for (i = 0; i < count; i++) {
+ lk_array[i] = dht_lock_new(
+ frame->this, conf->subvolumes[i], &local->loc, F_WRLCK,
+ DHT_LAYOUT_HEAL_DOMAIN, NULL, FAIL_ON_ANY_ERROR);
+ if (lk_array[i] == NULL) {
+ gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+ gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM,
+ DHT_MSG_MEM_ALLOC_FAILED, "lk_array-gfid=%s", gfid,
+ "path=%s", local->loc.path, NULL);
+ goto err;
+ }
+ }
+ } else {
+ count = 1;
+ lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char);
+ if (lk_array == NULL) {
+ gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+ gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+ "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL);
+ goto err;
+ }
+
+ lk_array[0] = dht_lock_new(frame->this, local->hashed_subvol,
+ &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN,
+ NULL, FAIL_ON_ANY_ERROR);
+ if (lk_array[0] == NULL) {
+ gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+ gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+ "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL);
+ goto err;
+ }
+ }
+
+ local->lock[0].layout.my_layout.locks = lk_array;
+ local->lock[0].layout.my_layout.lk_count = count;
+
+ ret = dht_blocking_inodelk(frame, lk_array, count,
+ dht_selfheal_layout_lock_cbk);
+ if (ret < 0) {
+ local->lock[0].layout.my_layout.locks = NULL;
+ local->lock[0].layout.my_layout.lk_count = 0;
+ goto err;
+ }
+
+ return 0;
+err:
+ if (lk_array != NULL) {
+ dht_lock_array_free(lk_array, count);
+ GF_FREE(lk_array);
+ }
+ return -1;
+}
+
+static int
+dht_selfheal_dir_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ struct iatt *stbuf = NULL;
+ int i = 0;
+ int ret = 0;
+ dht_layout_t *layout = NULL;
+ int err = 0;
+ int this_call_cnt = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ layout = local->selfheal.layout;
+ subvol = cookie;
+
+ if (op_ret == 0) {
+ err = 0;
+ } else {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "name=%s", subvol->name,
+ "path=%s", local->loc.path, "gfid=%s", gfid, NULL);
+ err = op_errno;
+ }
+
+ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+ if (ret < 0) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ gf_msg_debug(this->name, 0,
+ "key = %s not present in dict"
+ ", path:%s gfid:%s",
+ DHT_IATT_IN_XDATA_KEY, local->loc.path, gfid);
+ }
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == subvol) {
+ layout->list[i].err = err;
+ break;
+ }
+ }
+
+ LOCK(&frame->lock);
+ {
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ }
+ UNLOCK(&frame->lock);
+
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt)) {
+ dht_selfheal_dir_finish(frame, this, 0, 1);
+ }
+
+ return 0;
+}
+
+/* Code is required to set user xattr to local->xattr
+ */
int
-dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- call_frame_t *prev = NULL;
- xlator_t *subvol = NULL;
- int i = 0;
- int this_call_cnt = 0;
-
-
- local = frame->local;
- layout = local->selfheal.layout;
- prev = cookie;
- subvol = prev->this;
-
- if ((op_ret == 0) || ((op_ret == -1) && (op_errno == EEXIST))) {
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].xlator == subvol) {
- layout->list[i].err = -1;
- break;
- }
- }
- }
-
- if (op_ret)
- goto out;
+dht_set_user_xattr(dict_t *dict, char *k, data_t *v, void *data)
+{
+ dict_t *set_xattr = data;
+ int ret = -1;
+
+ ret = dict_set(set_xattr, k, v);
+ return ret;
+}
+
+static int
+dht_selfheal_dir_xattr_persubvol(call_frame_t *frame, loc_t *loc,
+ dht_layout_t *layout, int i,
+ xlator_t *req_subvol)
+{
+ xlator_t *subvol = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
+ int ret = 0;
+ xlator_t *this = NULL;
+ int32_t *disk_layout = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ data_t *data = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ if (req_subvol)
+ subvol = req_subvol;
+ else
+ subvol = layout->list[i].xlator;
+ this = frame->this;
+
+ GF_VALIDATE_OR_GOTO("", this, err);
+ GF_VALIDATE_OR_GOTO(this->name, layout, err);
+ GF_VALIDATE_OR_GOTO(this->name, local, err);
+ GF_VALIDATE_OR_GOTO(this->name, subvol, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ xattr = dict_new();
+ if (!xattr) {
+ goto err;
+ }
+
+ xdata = dict_new();
+ if (!xdata)
+ goto err;
+
+ ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY,
+ "gfid=%s", gfid, NULL);
+ goto err;
+ }
+
+ ret = dict_set_int8(xdata, DHT_IATT_IN_XDATA_KEY, 1);
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", DHT_IATT_IN_XDATA_KEY,
+ "gfid=%s", gfid, NULL);
+ goto err;
+ }
+
+ gf_uuid_unparse(loc->inode->gfid, gfid);
+
+ ret = dht_disk_layout_extract(this, layout, i, &disk_layout);
+ if (ret == -1) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+ "extract-disk-layout-failed, path=%s", loc->path, "subvol=%s",
+ subvol->name, "gfid=%s", gfid, NULL);
+ goto err;
+ }
+
+ ret = dict_set_bin(xattr, conf->xattr_name, disk_layout, 4 * 4);
+ if (ret == -1) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "path=%s", loc->path,
+ "subvol=%s", subvol->name,
+ "set-xattr-dictionary-failed"
+ "gfid=%s",
+ gfid, NULL);
+ goto err;
+ }
+ disk_layout = NULL;
+
+ gf_msg_trace(this->name, 0,
+ "setting hash range 0x%x - 0x%x (type %d) on subvolume %s"
+ " for %s",
+ layout->list[i].start, layout->list[i].stop, layout->type,
+ subvol->name, loc->path);
+
+ if (local->xattr) {
+ data = dict_get(local->xattr, QUOTA_LIMIT_KEY);
+ if (data) {
+ ret = dict_add(xattr, QUOTA_LIMIT_KEY, data);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", QUOTA_LIMIT_KEY, NULL);
+ }
+ }
+ data = dict_get(local->xattr, QUOTA_LIMIT_OBJECTS_KEY);
+ if (data) {
+ ret = dict_add(xattr, QUOTA_LIMIT_OBJECTS_KEY, data);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", QUOTA_LIMIT_OBJECTS_KEY,
+ NULL);
+ }
+ }
+ }
+
+ if (!gf_uuid_is_null(local->gfid))
+ gf_uuid_copy(loc->gfid, local->gfid);
+
+ STACK_WIND_COOKIE(frame, dht_selfheal_dir_xattr_cbk, (void *)subvol, subvol,
+ subvol->fops->setxattr, loc, xattr, 0, xdata);
+ dict_unref(xattr);
+ dict_unref(xdata);
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- if (prev->this == local->hashed_subvol)
- local->ia_ino = local->stbuf.ia_ino;
+ return 0;
- dht_iatt_merge (this, &local->preparent, preparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent, prev->this);
+err:
+ if (xattr)
+ dict_unref(xattr);
+ if (xdata)
+ dict_unref(xdata);
+
+ GF_FREE(disk_layout);
+
+ dht_selfheal_dir_xattr_cbk(frame, (void *)subvol, frame->this, -1, ENOMEM,
+ NULL);
+ return 0;
+}
+
+static int
+dht_fix_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ int count = 0;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *dummy = NULL;
+
+ local = frame->local;
+ this = frame->this;
+ conf = this->private;
+
+ gf_msg_debug(this->name, 0, "%s: Writing the new range for all subvolumes",
+ loc->path);
+
+ local->call_cnt = count = conf->subvolume_cnt;
+
+ if (gf_log_get_loglevel() >= GF_LOG_DEBUG)
+ dht_log_new_layout_for_dir_selfheal(this, loc, layout);
+
+ for (i = 0; i < layout->cnt; i++) {
+ dht_selfheal_dir_xattr_persubvol(frame, loc, layout, i, NULL);
+
+ if (--count == 0)
+ goto out;
+ }
+ /* if we are here, subvolcount > layout_count. subvols-per-directory
+ * option might be set here. We need to clear out layout from the
+ * non-participating subvolumes, else it will result in overlaps */
+ dummy = dht_layout_new(this, 1);
+ if (!dummy)
+ goto out;
+ dummy->commit_hash = layout->commit_hash;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) {
+ dht_selfheal_dir_xattr_persubvol(frame, loc, dummy, 0,
+ conf->subvolumes[i]);
+ if (--count == 0)
+ break;
+ }
+ }
+ dht_layout_unref(this, dummy);
out:
- this_call_cnt = dht_frame_return (frame);
+ return 0;
+}
- if (is_last_call (this_call_cnt)) {
- dht_selfheal_dir_xattr (frame, &local->loc, layout);
- }
+static int
+dht_selfheal_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
+{
+ dht_local_t *local = NULL;
+ int missing_xattr = 0;
+ int i = 0;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *dummy = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+
+ local = frame->local;
+ this = frame->this;
+ conf = this->private;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err != -1 || !layout->list[i].stop) {
+ /* err != -1 would mean xattr present on the directory
+ * or the directory is non existent.
+ * !layout->list[i].stop would mean layout absent
+ */
+
+ continue;
+ }
+ missing_xattr++;
+ }
+ /* Also account for subvolumes with no-layout. Used for zero'ing out
+ * the layouts and for setting quota key's if present */
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) {
+ missing_xattr++;
+ }
+ }
+ gf_msg_trace(this->name, 0, "%d subvolumes missing xattr for %s",
+ missing_xattr, loc->path);
+
+ if (missing_xattr == 0) {
+ dht_selfheal_dir_finish(frame, this, 0, 1);
+ return 0;
+ }
+
+ local->call_cnt = missing_xattr;
+
+ if (gf_log_get_loglevel() >= GF_LOG_DEBUG)
+ dht_log_new_layout_for_dir_selfheal(this, loc, layout);
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err != -1 || !layout->list[i].stop)
+ continue;
+
+ dht_selfheal_dir_xattr_persubvol(frame, loc, layout, i, NULL);
+
+ if (--missing_xattr == 0)
+ break;
+ }
+ dummy = dht_layout_new(this, 1);
+ if (!dummy) {
+ gf_uuid_unparse(loc->gfid, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DUMMY_ALLOC_FAILED,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+ for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) {
+ if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) {
+ dht_selfheal_dir_xattr_persubvol(frame, loc, dummy, 0,
+ conf->subvolumes[i]);
+ missing_xattr--;
+ }
+ }
- return 0;
+ dht_layout_unref(this, dummy);
+out:
+ return 0;
}
+int
+dht_selfheal_dir_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int this_call_cnt = 0, ret = -1;
+
+ local = frame->local;
+ layout = local->selfheal.layout;
+
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt)) {
+ if (!local->heal_layout) {
+ gf_msg_trace(this->name, 0, "Skip heal layout for %s gfid = %s ",
+ local->loc.path, uuid_utoa(local->gfid));
+
+ dht_selfheal_dir_finish(frame, this, 0, 1);
+ return 0;
+ }
+ ret = dht_selfheal_layout_lock(frame, layout, _gf_false,
+ dht_selfheal_dir_xattr,
+ dht_should_heal_layout);
+
+ if (ret < 0) {
+ dht_selfheal_dir_finish(frame, this, -1, 1);
+ }
+ }
+
+ return 0;
+}
int
-dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc,
- dht_layout_t *layout, int force)
-{
- int missing_dirs = 0;
- int i = 0;
- int ret = -1;
- dht_local_t *local = NULL;
- xlator_t *this = NULL;
- dict_t *dict = NULL;
-
- local = frame->local;
- this = frame->this;
-
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].err == ENOENT || force)
- missing_dirs++;
- }
-
- if (missing_dirs == 0) {
- dht_selfheal_dir_xattr (frame, loc, layout);
- return 0;
- }
-
- local->call_cnt = missing_dirs;
- if (!uuid_is_null (local->gfid)) {
- dict = dict_new ();
- if (!dict)
- return -1;
-
- ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16);
- if (ret)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set gfid in dict");
- } else if (local->params) {
- /* Send the dictionary from higher layers directly */
- dict = dict_ref (local->params);
+dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dht_layout_t *layout)
+{
+ int missing_attr = 0;
+ int i = 0, ret = -1;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+ int cnt = 0;
+
+ local = frame->local;
+ this = frame->this;
+ conf = this->private;
+
+ /* We need to heal the attrs if:
+ * 1. Any directories were missing - the newly created dirs will need
+ * to have the correct attrs set
+ * 2. An existing dir does not have the correct permissions -they may
+ * have been changed when a brick was down.
+ */
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err == -1)
+ missing_attr++;
+ }
+
+ if ((missing_attr == 0) && (local->need_attrheal == 0)) {
+ if (!local->heal_layout) {
+ gf_msg_trace(this->name, 0, "Skip heal layout for %s gfid = %s ",
+ loc->path, uuid_utoa(loc->gfid));
+ dht_selfheal_dir_finish(frame, this, 0, 1);
+ return 0;
+ }
+ ret = dht_selfheal_layout_lock(frame, layout, _gf_false,
+ dht_selfheal_dir_xattr,
+ dht_should_heal_layout);
+
+ if (ret < 0) {
+ dht_selfheal_dir_finish(frame, this, -1, 1);
+ }
+
+ return 0;
+ }
+
+ cnt = local->call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < cnt; i++) {
+ STACK_WIND(frame, dht_selfheal_dir_setattr_cbk, layout->list[i].xlator,
+ layout->list[i].xlator->fops->setattr, loc, stbuf, valid,
+ NULL);
+ }
+
+ return 0;
+}
+
+static int
+dht_selfheal_dir_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *subvol = NULL;
+ int i = 0, ret = -1;
+ int this_call_cnt = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ layout = local->selfheal.layout;
+ prev = cookie;
+ subvol = prev;
+
+ if ((op_ret == 0) || ((op_ret == -1) && (op_errno == EEXIST))) {
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == subvol) {
+ layout->list[i].err = -1;
+ break;
+ }
}
+ }
+
+ if (op_ret) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ gf_smsg(this->name,
+ ((op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_WARNING),
+ op_errno, DHT_MSG_DIR_SELFHEAL_FAILED, "path=%s",
+ local->loc.path, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+ ret = 0;
+
+out:
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt)) {
+ dht_selfheal_dir_finish(frame, this, ret, 0);
+ dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffff,
+ layout);
+ }
+
+ return 0;
+}
+static int
+dht_selfheal_dir_mkdir_lookup_done(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ dict_t *dict = NULL;
+ dht_layout_t *layout = NULL;
+ loc_t *loc = NULL;
+ int cnt = 0;
+ int ret = -1;
+
+ VALIDATE_OR_GOTO(this->private, err);
+
+ local = frame->local;
+ layout = local->layout;
+ loc = &local->loc;
+
+ if (!gf_uuid_is_null(local->gfid)) {
+ dict = dict_new();
+ if (!dict)
+ return -1;
+
+ ret = dict_set_gfuuid(dict, "gfid-req", local->gfid, true);
+ if (ret)
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=gfid-req", NULL);
+ } else if (local->params) {
+ /* Send the dictionary from higher layers directly */
+
+ dict = dict_ref(local->params);
+ }
+ /* Code to update all extended attributed from local->xattr
+ to dict
+ */
+ dht_dir_set_heal_xattr(this, local, dict, local->xattr, NULL, NULL);
+
+ if (!dict) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_IS_NULL, NULL);
+ dict = dict_new();
if (!dict)
- gf_log (this->name, GF_LOG_DEBUG,
- "dict is NULL, need to make sure gfid's are same");
+ return -1;
+ }
+ ret = dict_set_flag(dict, GF_INTERNAL_CTX_KEY, GF_DHT_HEAL_DIR);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "key=%s",
+ GF_INTERNAL_CTX_KEY, "path=%s", loc->path, NULL);
+ /* We can still continue. As heal can still happen
+ * unless quota limits have reached for the dir.
+ */
+ }
+
+ cnt = layout->cnt;
+ for (i = 0; i < cnt; i++) {
+ if (layout->list[i].err == ESTALE || layout->list[i].err == ENOENT ||
+ local->selfheal.force_mkdir) {
+ gf_msg_debug(this->name, 0, "Creating directory %s on subvol %s",
+ loc->path, layout->list[i].xlator->name);
+
+ STACK_WIND_COOKIE(
+ frame, dht_selfheal_dir_mkdir_cbk, layout->list[i].xlator,
+ layout->list[i].xlator, layout->list[i].xlator->fops->mkdir,
+ loc,
+ st_mode_from_ia(local->stbuf.ia_prot, local->stbuf.ia_type), 0,
+ dict);
+ }
+ }
+
+ if (dict)
+ dict_unref(dict);
+
+ return 0;
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].err == ENOENT || force) {
- gf_log (this->name, GF_LOG_TRACE,
- "creating directory %s on subvol %s",
- loc->path, layout->list[i].xlator->name);
+err:
+ dht_selfheal_dir_finish(frame, this, -1, 1);
+ return 0;
+}
+
+static int
+dht_selfheal_dir_mkdir_lookup_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ int this_call_cnt = 0;
+ int missing_dirs = 0;
+ dht_layout_t *layout = NULL;
+ xlator_t *prev = 0;
+ loc_t *loc = NULL;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ int index = -1;
+
+ VALIDATE_OR_GOTO(this->private, err);
+
+ local = frame->local;
+ layout = local->layout;
+ loc = &local->loc;
+ prev = cookie;
+
+ if (!gf_uuid_is_null(local->gfid))
+ gf_uuid_unparse(local->gfid, gfid_local);
+
+ LOCK(&frame->lock);
+ {
+ index = dht_layout_index_for_subvol(layout, prev);
+ if ((op_ret < 0) && (op_errno == ENOENT || op_errno == ESTALE)) {
+ local->selfheal.hole_cnt = !local->selfheal.hole_cnt
+ ? 1
+ : local->selfheal.hole_cnt + 1;
+ /* the status might have changed. Update the layout with the
+ * new status
+ */
+ if (index >= 0) {
+ layout->list[index].err = op_errno;
+ }
+ }
- STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->mkdir,
- loc,
- st_mode_from_ia (local->stbuf.ia_prot,
- local->stbuf.ia_type),
- dict);
- }
- }
+ if (!op_ret) {
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ if (prev == local->mds_subvol) {
+ dict_unref(local->xattr);
+ local->xattr = dict_ref(xattr);
+ }
+ /* the status might have changed. Update the layout with the
+ * new status
+ */
+ if (index >= 0) {
+ layout->list[index].err = -1;
+ }
+ }
+ }
+ UNLOCK(&frame->lock);
+
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt)) {
+ if (local->selfheal.hole_cnt == layout->cnt) {
+ gf_msg_debug(this->name, op_errno,
+ "Lookup failed, an rmdir could have "
+ "deleted this entry %s",
+ loc->name);
+ local->op_errno = op_errno;
+ goto err;
+ } else {
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err == ENOENT ||
+ layout->list[i].err == ESTALE ||
+ local->selfheal.force_mkdir)
+ missing_dirs++;
+ }
+
+ if (missing_dirs == 0) {
+ dht_selfheal_dir_finish(frame, this, 0, 0);
+ dht_selfheal_dir_setattr(frame, loc, &local->stbuf, 0xffffffff,
+ layout);
+ return 0;
+ }
+
+ local->call_cnt = missing_dirs;
+ dht_selfheal_dir_mkdir_lookup_done(frame, this);
+ }
+ }
- if (dict)
- dict_unref (dict);
+ return 0;
- return 0;
+err:
+ dht_selfheal_dir_finish(frame, this, -1, 1);
+ return 0;
}
+static int
+dht_selfheal_dir_mkdir_lock_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ int ret = -1;
+ xlator_t *mds_subvol = NULL;
-int
-dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc,
- dht_layout_t *layout)
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+ local = frame->local;
+ mds_subvol = local->mds_subvol;
+
+ local->call_cnt = conf->subvolume_cnt;
+
+ if (op_ret < 0) {
+ if (op_errno == EINVAL) {
+ local->call_cnt = 1;
+ dht_selfheal_dir_mkdir_lookup_done(frame, this);
+ return 0;
+ }
+
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_ENTRYLK_ERROR,
+ "path=%s", local->loc.path, NULL);
+
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ /* After getting locks, perform lookup again to ensure that the
+ directory was not deleted by a racing rmdir
+ */
+ if (!local->xattr_req)
+ local->xattr_req = dict_new();
+
+ ret = dict_set_int32(local->xattr_req, "list-xattr", 1);
+ if (ret)
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "path=%s",
+ local->loc.path, NULL);
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (mds_subvol && conf->subvolumes[i] == mds_subvol) {
+ STACK_WIND_COOKIE(frame, dht_selfheal_dir_mkdir_lookup_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_selfheal_dir_mkdir_lookup_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ NULL);
+ }
+ }
+
+ return 0;
+
+err:
+ dht_selfheal_dir_finish(frame, this, -1, 1);
+ return 0;
+}
+
+static int
+dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout,
+ int force)
+{
+ int missing_dirs = 0;
+ int i = 0;
+ int op_errno = 0;
+ int ret = -1;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+
+ local = frame->local;
+ this = frame->this;
+ conf = this->private;
+
+ local->selfheal.force_mkdir = force;
+ local->selfheal.hole_cnt = 0;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err == ENOENT || force)
+ missing_dirs++;
+ }
+
+ if (missing_dirs == 0) {
+ /* We don't need to create any directories. Proceed to heal the
+ * attrs and xattrs
+ */
+ if (!__is_root_gfid(local->stbuf.ia_gfid)) {
+ if (local->need_xattr_heal) {
+ local->need_xattr_heal = 0;
+ ret = dht_dir_xattr_heal(this, local, &op_errno);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s",
+ local->loc.path, "gfid=%s", local->gfid, NULL);
+ }
+ } else {
+ if (!gf_uuid_is_null(local->gfid))
+ gf_uuid_copy(loc->gfid, local->gfid);
+
+ ret = dht_common_mark_mdsxattr(frame, NULL, 0);
+ if (!ret)
+ return 0;
+
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SET_XATTR_FAILED,
+ "path=%s", local->loc.path, "gfid=%s", local->gfid,
+ NULL);
+ }
+ }
+ dht_selfheal_dir_setattr(frame, loc, &local->stbuf, 0xffffffff, layout);
+ return 0;
+ }
+
+ /* MDS xattr is populated only while DHT is having more than one
+ subvol.In case of graph switch while adding more dht subvols need to
+ consider hash subvol as a MDS to avoid MDS check failure at the time
+ of running fop on directory
+ */
+ if (!dict_get(local->xattr, conf->mds_xattr_key) &&
+ (conf->subvolume_cnt > 1)) {
+ if (local->hashed_subvol == NULL) {
+ local->hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (local->hashed_subvol == NULL) {
+ local->op_errno = EINVAL;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s",
+ loc->pargfid, "name=%s", loc->name, "path=%s",
+ loc->path, NULL);
+ goto err;
+ }
+ }
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this,
+ local->hashed_subvol);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set hashed subvol for %s on inode vol is %s",
+ local->loc.path,
+ local->hashed_subvol ? local->hashed_subvol->name : "NULL");
+ goto err;
+ }
+ }
+
+ if (local->hashed_subvol == NULL) {
+ local->hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (local->hashed_subvol == NULL) {
+ local->op_errno = EINVAL;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s", loc->pargfid,
+ "name=%s", loc->name, "path=%s", loc->path, NULL);
+ goto err;
+ }
+ }
+
+ local->current = &local->lock[0];
+ ret = dht_protect_namespace(frame, loc, local->hashed_subvol,
+ &local->current->ns,
+ dht_selfheal_dir_mkdir_lock_cbk);
+
+ if (ret < 0)
+ goto err;
+
+ return 0;
+err:
+ return -1;
+}
+
+static int
+dht_selfheal_layout_alloc_start(xlator_t *this, loc_t *loc,
+ dht_layout_t *layout)
{
- int start = 0;
- uint32_t hashval = 0;
- int ret = 0;
+ int start = 0;
+ uint32_t hashval = 0;
+ int ret = 0;
+ const char *str = NULL;
+ dht_conf_t *conf = NULL;
+ char buf[UUID_CANONICAL_FORM_LEN + 1] = {
+ 0,
+ };
+
+ conf = this->private;
+
+ if (conf->randomize_by_gfid) {
+ str = uuid_utoa_r(loc->gfid, buf);
+ } else {
+ str = loc->path;
+ }
+
+ ret = dht_hash_compute(this, layout->type, str, &hashval);
+ if (ret == 0) {
+ start = (hashval % layout->cnt);
+ }
+
+ return start;
+}
- ret = dht_hash_compute (layout->type, loc->path, &hashval);
- if (ret == 0) {
- start = (hashval % layout->cnt);
+static int
+dht_get_layout_count(xlator_t *this, dht_layout_t *layout, int new_layout)
+{
+ int i = 0;
+ int j = 0;
+ int err = 0;
+ int count = 0;
+ dht_conf_t *conf = NULL;
+
+ /* Gets in use only for replace-brick, remove-brick */
+ conf = this->private;
+ for (i = 0; i < layout->cnt; i++) {
+ for (j = 0; j < conf->subvolume_cnt; j++) {
+ if (conf->decommissioned_bricks[j] &&
+ conf->decommissioned_bricks[j] == layout->list[i].xlator) {
+ layout->list[i].err = EINVAL;
+ break;
+ }
}
+ }
+
+ for (i = 0; i < layout->cnt; i++) {
+ err = layout->list[i].err;
+ if (err == -1 || err == 0 || err == ENOENT) {
+ /* Take this with a pinch of salt. The behaviour seems
+ * to be slightly different when this function is
+ * invoked from mkdir codepath. For eg., err == 0 in
+ * mkdir codepath means directory created but xattr
+ * is not set yet.
+ */
+
+ /* Setting list[i].err = -1 is an indication for
+ dht_selfheal_layout_new_directory() to assign
+ a range. We set it to -1 based on any one of
+ the three criteria:
+
+ - err == -1 already, which means directory
+ existed but layout was not set on it.
+
+ - err == 0, which means directory exists and
+ has an old layout piece which will be
+ overwritten now.
+
+ - err == ENOENT, which means directory does
+ not exist (possibly racing with mkdir or
+ finishing half done mkdir). The missing
+ directory will be attempted to be recreated.
+ */
+ count++;
+ if (!err)
+ layout->list[i].err = -1;
+ }
+ }
+
+ /* no subvolume has enough space, but can't stop directory creation */
+ if (!count || !new_layout) {
+ for (i = 0; i < layout->cnt; i++) {
+ err = layout->list[i].err;
+ if (err == ENOSPC) {
+ layout->list[i].err = -1;
+ count++;
+ }
+ }
+ }
+
+ /* if layout->spread_cnt is set, check if it is <= available
+ * subvolumes (down brick and decommissioned bricks are considered
+ * un-available). Else return count (available up bricks) */
+ count = ((layout->spread_cnt && (layout->spread_cnt <= count))
+ ? layout->spread_cnt
+ : ((count) ? count : 1));
- return start;
+ return count;
}
+void
+dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc,
+ dht_layout_t *new_layout);
void
-dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
- dht_layout_t *layout)
-{
- xlator_t *this = NULL;
- uint32_t chunk = 0;
- int i = 0;
- uint32_t start = 0;
- int cnt = 0;
- int err = 0;
- int start_subvol = 0;
-
- this = frame->this;
-
- for (i = 0; i < layout->cnt; i++) {
- err = layout->list[i].err;
- if (err == -1 || err == 0) {
- layout->list[i].err = -1;
- cnt++;
- }
- }
-
- /* no subvolume has enough space, but can't stop directory creation */
- if (!cnt) {
- for (i = 0; i < layout->cnt; i++) {
- err = layout->list[i].err;
- if (err == ENOSPC) {
- layout->list[i].err = -1;
- cnt++;
- }
+dht_layout_range_swap(dht_layout_t *layout, int i, int j);
+
+/*
+ * It's a bit icky using local variables in a macro, but it makes the rest
+ * of the code a lot clearer.
+ */
+#define OV_ENTRY(x, y) table[x * new->cnt + y]
+
+static void
+dht_selfheal_layout_maximize_overlap(call_frame_t *frame, loc_t *loc,
+ dht_layout_t *new, dht_layout_t *old)
+{
+ int i = 0;
+ int j = 0;
+ uint32_t curr_overlap = 0;
+ uint32_t max_overlap = 0;
+ int max_overlap_idx = -1;
+ uint32_t overlap = 0;
+ uint32_t *table = NULL;
+
+ dht_layout_sort_volname(old);
+ /* Now both old_layout->list[] and new_layout->list[]
+ are match the same xlators/subvolumes. i.e,
+ old_layout->[i] and new_layout->[i] are referring
+ to the same subvolumes
+ */
+
+ /* Build a table of overlaps between new[i] and old[j]. */
+ table = alloca(sizeof(overlap) * old->cnt * new->cnt);
+ if (!table) {
+ return;
+ }
+ memset(table, 0, sizeof(overlap) * old->cnt * new->cnt);
+ for (i = 0; i < new->cnt; ++i) {
+ for (j = 0; j < old->cnt; ++j) {
+ OV_ENTRY(i, j) = dht_overlap_calc(old, j, new, i);
+ }
+ }
+
+ for (i = 0; i < new->cnt; i++) {
+ if (new->list[i].err > 0) {
+ /* Subvol might be marked for decommission
+ with EINVAL, or some other serious error
+ marked with positive errno.
+ */
+ continue;
+ }
+
+ max_overlap = 0;
+ max_overlap_idx = i;
+ for (j = (i + 1); j < new->cnt; ++j) {
+ if (new->list[j].err > 0) {
+ /* Subvol might be marked for decommission
+ with EINVAL, or some other serious error
+ marked with positive errno.
+ */
+ continue;
+ }
+ /* Calculate the overlap now. */
+ curr_overlap = OV_ENTRY(i, i) + OV_ENTRY(j, j);
+ /* Calculate the overlap after the proposed swap. */
+ overlap = OV_ENTRY(i, j) + OV_ENTRY(j, i);
+ /* Are we better than status quo? */
+ if (overlap > curr_overlap) {
+ overlap -= curr_overlap;
+ /* Are we better than the previous choice? */
+ if (overlap > max_overlap) {
+ max_overlap = overlap;
+ max_overlap_idx = j;
}
+ }
}
- chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1);
-
- start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
-
- for (i = start_subvol; i < layout->cnt; i++) {
- err = layout->list[i].err;
- if (err == -1) {
- layout->list[i].start = start;
- layout->list[i].stop = start + chunk - 1;
-
- start = start + chunk;
-
- gf_log (this->name, GF_LOG_TRACE,
- "gave fix: %u - %u on %s for %s",
- layout->list[i].start, layout->list[i].stop,
- layout->list[i].xlator->name, loc->path);
- if (--cnt == 0) {
- layout->list[i].stop = 0xffffffff;
- break;
- }
- }
- }
-
- for (i = 0; i < start_subvol; i++) {
- err = layout->list[i].err;
- if (err == -1) {
- layout->list[i].start = start;
- layout->list[i].stop = start + chunk - 1;
-
- start = start + chunk;
-
- gf_log (this->name, GF_LOG_TRACE,
- "gave fix: %u - %u on %s for %s",
- layout->list[i].start, layout->list[i].stop,
- layout->list[i].xlator->name, loc->path);
- if (--cnt == 0) {
- layout->list[i].stop = 0xffffffff;
- break;
- }
- }
- }
+ if (max_overlap_idx != i) {
+ dht_layout_range_swap(new, i, max_overlap_idx);
+ /* Need to swap the table values too. */
+ for (j = 0; j < old->cnt; ++j) {
+ overlap = OV_ENTRY(i, j);
+ OV_ENTRY(i, j) = OV_ENTRY(max_overlap_idx, j);
+ OV_ENTRY(max_overlap_idx, j) = overlap;
+ }
+ }
+ }
}
+static dht_layout_t *
+dht_fix_layout_of_directory(call_frame_t *frame, loc_t *loc,
+ dht_layout_t *layout)
+{
+ int i = 0;
+ xlator_t *this = NULL;
+ dht_layout_t *new_layout = NULL;
+ dht_conf_t *priv = NULL;
+ dht_local_t *local = NULL;
+ uint32_t subvol_down = 0;
+ gf_boolean_t maximize_overlap = _gf_true;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ this = frame->this;
+ priv = this->private;
+ local = frame->local;
+
+ if (layout->type == DHT_HASH_TYPE_DM_USER) {
+ gf_msg_debug(THIS->name, 0, "leaving %s alone", loc->path);
+ goto done;
+ }
+
+ new_layout = dht_layout_new(this, priv->subvolume_cnt);
+ if (!new_layout) {
+ gf_uuid_unparse(loc->gfid, gfid);
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+ "new_layout, path=%s", loc->path, "gfid=%s", gfid, NULL);
+ goto done;
+ }
+
+ /* If a subvolume is down, do not re-write the layout. */
+ dht_layout_anomalies(this, loc, layout, NULL, NULL, NULL, &subvol_down,
+ NULL, NULL);
+
+ if (subvol_down) {
+ gf_uuid_unparse(loc->gfid, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+ "subvol-down=%u", subvol_down, "Skipping-fix-layout", "path=%s",
+ loc->path, "gfid=%s", gfid, NULL);
+ GF_FREE(new_layout);
+ return NULL;
+ }
+
+ for (i = 0; i < new_layout->cnt; i++) {
+ if (layout->list[i].err != ENOSPC)
+ new_layout->list[i].err = layout->list[i].err;
+ else
+ new_layout->list[i].err = -1;
+
+ new_layout->list[i].xlator = layout->list[i].xlator;
+ }
+
+ new_layout->commit_hash = layout->commit_hash;
+
+ if (priv->du_stats) {
+ for (i = 0; i < priv->subvolume_cnt; ++i) {
+ gf_smsg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_SUBVOL_INFO,
+ "index=%d", i, "name=%s", priv->subvolumes[i]->name,
+ "chunks=%u", priv->du_stats[i].chunks, "path=%s", loc->path,
+ NULL);
+
+ /* Maximize overlap if the bricks are all the same
+ * size.
+ * This is probably not going to be very common on
+ * live setups but will benefit our regression tests
+ */
+ if (i && (priv->du_stats[i].chunks != priv->du_stats[0].chunks)) {
+ maximize_overlap = _gf_false;
+ }
+ }
+ } else {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_DISK_USAGE_STATUS,
+ NULL);
+ }
+
+ /* First give it a layout as though it is a new directory. This
+ ensures rotation to kick in */
+ dht_layout_sort_volname(new_layout);
+ dht_selfheal_layout_new_directory(frame, loc, new_layout);
+
+ /* Maximize overlap if weighted-rebalance is disabled */
+ if (!priv->do_weighting)
+ maximize_overlap = _gf_true;
+
+ /* Now selectively re-assign ranges only when it helps */
+ if (maximize_overlap) {
+ dht_selfheal_layout_maximize_overlap(frame, loc, new_layout, layout);
+ }
+done:
+ if (new_layout) {
+ /* Make sure the extra 'ref' for existing layout is removed */
+ dht_layout_unref(this, local->layout);
+
+ local->layout = new_layout;
+ }
+
+ return local->layout;
+}
-int
-dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc,
- dht_layout_t *layout)
-{
- dht_conf_t *conf = NULL;
- xlator_t *this = NULL;
- dht_local_t *local = NULL;
- int missing = -1;
- int down = -1;
- int holes = -1;
- int ret = -1;
- int i = -1;
- int overlaps = -1;
-
- this = frame->this;
- conf = this->private;
- local = frame->local;
-
- missing = local->selfheal.missing;
- down = local->selfheal.down;
- holes = local->selfheal.hole_cnt;
- overlaps = local->selfheal.overlaps_cnt;
-
- if ((missing + down) == conf->subvolume_cnt) {
- dht_selfheal_layout_new_directory (frame, loc, layout);
- ret = 0;
- }
-
- if (holes <= down) {
- /* the down subvol might fill up the holes */
- ret = 0;
- }
-
- if (holes || overlaps) {
- dht_selfheal_layout_new_directory (frame, loc, layout);
- ret = 0;
- }
-
- for (i = 0; i < layout->cnt; i++) {
- /* directory not present */
- if (layout->list[i].err == ENOENT) {
- ret = 0;
- break;
- }
- }
-
- /* TODO: give a fix to these non-virgins */
-
- return ret;
+/*
+ * Having to call this 2x for each entry in the layout is pretty horrible, but
+ * that's what all of this layout-sorting nonsense gets us.
+ */
+static uint32_t
+dht_get_chunks_from_xl(xlator_t *parent, xlator_t *child)
+{
+ dht_conf_t *priv = parent->private;
+ xlator_list_t *trav;
+ uint32_t index = 0;
+
+ if (!priv->du_stats) {
+ return 0;
+ }
+
+ for (trav = parent->children; trav; trav = trav->next) {
+ if (trav->xlator == child) {
+ return priv->du_stats[index].chunks;
+ }
+ ++index;
+ }
+
+ return 0;
+}
+
+void
+dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc,
+ dht_layout_t *layout)
+{
+ xlator_t *this = NULL;
+ double chunk = 0;
+ int i = 0;
+ uint32_t start = 0;
+ int bricks_to_use = 0;
+ int err = 0;
+ int start_subvol = 0;
+ uint32_t curr_size;
+ uint32_t range_size;
+ uint64_t total_size = 0;
+ int real_i;
+ dht_conf_t *priv;
+ gf_boolean_t weight_by_size;
+ int bricks_used = 0;
+
+ this = frame->this;
+ priv = this->private;
+ weight_by_size = priv->do_weighting;
+
+ bricks_to_use = dht_get_layout_count(this, layout, 1);
+ GF_ASSERT(bricks_to_use > 0);
+
+ bricks_used = 0;
+ for (i = 0; i < layout->cnt; ++i) {
+ err = layout->list[i].err;
+ if ((err != -1) && (err != ENOENT)) {
+ continue;
+ }
+ curr_size = dht_get_chunks_from_xl(this, layout->list[i].xlator);
+ if (!curr_size) {
+ weight_by_size = _gf_false;
+ break;
+ }
+ total_size += curr_size;
+ if (++bricks_used >= bricks_to_use) {
+ break;
+ }
+ }
+
+ if (weight_by_size && total_size) {
+ /* We know total_size is not zero. */
+ chunk = ((double)0xffffffff) / ((double)total_size);
+ gf_msg_debug(this->name, 0,
+ "chunk size = 0xffffffff / %" PRIu64 " = %f", total_size,
+ chunk);
+ } else {
+ weight_by_size = _gf_false;
+ chunk = ((unsigned long)0xffffffff) / bricks_to_use;
+ }
+
+ start_subvol = dht_selfheal_layout_alloc_start(this, loc, layout);
+
+ /* clear out the range, as we are re-computing here */
+ DHT_RESET_LAYOUT_RANGE(layout);
+
+ /*
+ * OK, what's this "real_i" stuff about? This used to be two loops -
+ * from start_subvol to layout->cnt-1, then from 0 to start_subvol-1.
+ * That way is practically an open invitation to bugs when only one
+ * of the loops is updated. Using real_i and modulo operators to make
+ * it one loop avoids this problem. Remember, folks: it's everyone's
+ * responsibility to help stamp out copy/paste abuse.
+ */
+ bricks_used = 0;
+ for (real_i = 0; real_i < layout->cnt; real_i++) {
+ i = (real_i + start_subvol) % layout->cnt;
+ err = layout->list[i].err;
+ if ((err != -1) && (err != ENOENT)) {
+ continue;
+ }
+ if (weight_by_size) {
+ curr_size = dht_get_chunks_from_xl(this, layout->list[i].xlator);
+ if (!curr_size) {
+ continue;
+ }
+ } else {
+ curr_size = 1;
+ }
+ range_size = chunk * curr_size;
+ gf_msg_debug(this->name, 0, "assigning range size 0x%x to %s",
+ range_size, layout->list[i].xlator->name);
+ DHT_SET_LAYOUT_RANGE(layout, i, start, range_size, loc->path);
+ if (++bricks_used >= bricks_to_use) {
+ layout->list[i].stop = 0xffffffff;
+ goto done;
+ }
+ start += range_size;
+ }
+
+done:
+ return;
+}
+
+static int
+dht_selfheal_dir_getafix(call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
+{
+ dht_local_t *local = NULL;
+ uint32_t holes = 0;
+ int ret = -1;
+ int i = -1;
+ uint32_t overlaps = 0;
+
+ local = frame->local;
+
+ holes = local->selfheal.hole_cnt;
+ overlaps = local->selfheal.overlaps_cnt;
+
+ if (holes || overlaps) {
+ /* If the layout has anomalies which would change the hash
+ * ranges, then we need to reset the commit_hash for this
+ * directory, as the layout would change and things may not
+ * be in place as expected */
+ layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+ dht_selfheal_layout_new_directory(frame, loc, layout);
+ ret = 0;
+ }
+
+ for (i = 0; i < layout->cnt; i++) {
+ /* directory not present */
+ if (layout->list[i].err == ENOENT) {
+ ret = 0;
+ break;
+ }
+ }
+
+ /* TODO: give a fix to these non-virgins */
+
+ return ret;
}
int
-dht_selfheal_new_directory (call_frame_t *frame,
- dht_selfheal_dir_cbk_t dir_cbk,
- dht_layout_t *layout)
+dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+ dht_layout_t *layout)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
+ int ret = 0;
+ inode_t *linked_inode = NULL, *inode = NULL;
+ loc_t *loc = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ int32_t op_errno = EIO;
+
+ local = frame->local;
+
+ loc = &local->loc;
- local = frame->local;
+ gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
- local->selfheal.dir_cbk = dir_cbk;
- local->selfheal.layout = dht_layout_ref (frame->this, layout);
+ linked_inode = inode_link(loc->inode, loc->parent, loc->name,
+ &local->stbuf);
+ if (!linked_inode) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_LINK_INODE_FAILED,
+ "pgfid=%s", pgfid, "name=%s", loc->name, "gfid=%s", gfid, NULL);
+ ret = -1;
+ goto out;
+ }
- dht_layout_sort_volname (layout);
- dht_selfheal_layout_new_directory (frame, &local->loc, layout);
- dht_selfheal_dir_xattr (frame, &local->loc, layout);
- return 0;
+ inode = loc->inode;
+ loc->inode = linked_inode;
+ inode_unref(inode);
+
+ local->selfheal.dir_cbk = dir_cbk;
+ local->selfheal.layout = dht_layout_ref(frame->this, layout);
+
+ dht_layout_sort_volname(layout);
+ dht_selfheal_layout_new_directory(frame, &local->loc, layout);
+
+ op_errno = ENOMEM;
+ ret = dht_selfheal_layout_lock(frame, layout, _gf_true,
+ dht_selfheal_dir_xattr,
+ dht_should_heal_layout);
+
+out:
+ if (ret < 0) {
+ dir_cbk(frame, NULL, frame->this, -1, op_errno, NULL);
+ }
+
+ return 0;
}
+int
+dht_fix_directory_layout(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+ dht_layout_t *layout)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *tmp_layout = NULL;
+ int ret = 0;
+
+ local = frame->local;
+
+ local->selfheal.dir_cbk = dir_cbk;
+ local->selfheal.layout = dht_layout_ref(frame->this, layout);
+
+ /* No layout sorting required here */
+ tmp_layout = dht_fix_layout_of_directory(frame, &local->loc, layout);
+ if (!tmp_layout) {
+ return -1;
+ }
+
+ ret = dht_selfheal_layout_lock(frame, tmp_layout, _gf_false,
+ dht_fix_dir_xattr, dht_should_fix_layout);
+
+ return ret;
+}
int
-dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
- loc_t *loc, dht_layout_t *layout)
-{
- dht_local_t *local = NULL;
- uint32_t holes = 0;
- uint32_t down = 0;
- uint32_t misc = 0;
- int ret = 0;
- xlator_t *this = NULL;
-
- local = frame->local;
- this = frame->this;
-
- dht_layout_anomalies (this, loc, layout,
- &local->selfheal.hole_cnt,
- &local->selfheal.overlaps_cnt,
- &local->selfheal.missing,
- &local->selfheal.down,
- &local->selfheal.misc);
-
- holes = local->selfheal.hole_cnt;
- down = local->selfheal.down;
- misc = local->selfheal.misc;
-
- local->selfheal.dir_cbk = dir_cbk;
- local->selfheal.layout = dht_layout_ref (this, layout);
-
- if (down) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%d subvolumes down -- not fixing", down);
- ret = -1;
- local->op_errno = ESTALE;
- goto sorry_no_fix;
- }
-
- if (misc) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%d subvolumes have unrecoverable errors", misc);
- ret = -1;
- local->op_errno = ESTALE;
- goto sorry_no_fix;
- }
-
- dht_layout_sort_volname (layout);
- ret = dht_selfheal_dir_getafix (frame, loc, layout);
-
- if (ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "not able to form layout for the directory");
- ret = -1;
- local->op_errno = ESTALE;
- goto sorry_no_fix;
- }
-
- dht_selfheal_dir_mkdir (frame, loc, layout, 0);
-
- return 0;
+dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+ loc_t *loc, dht_layout_t *layout)
+{
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ uint32_t down = 0;
+ uint32_t misc = 0;
+ int ret = 0;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ inode_t *linked_inode = NULL, *inode = NULL;
+
+ local = frame->local;
+ this = frame->this;
+
+ local->selfheal.dir_cbk = dir_cbk;
+ local->selfheal.layout = dht_layout_ref(this, layout);
+
+ if (local->need_attrheal) {
+ if (__is_root_gfid(local->stbuf.ia_gfid)) {
+ local->stbuf.ia_gid = local->prebuf.ia_gid;
+ local->stbuf.ia_uid = local->prebuf.ia_uid;
+
+ local->stbuf.ia_ctime = local->prebuf.ia_ctime;
+ local->stbuf.ia_ctime_nsec = local->prebuf.ia_ctime_nsec;
+ local->stbuf.ia_prot = local->prebuf.ia_prot;
+
+ } else if (!IA_ISINVAL(local->mds_stbuf.ia_type)) {
+ local->stbuf = local->mds_stbuf;
+ }
+ }
+
+ if (!__is_root_gfid(local->stbuf.ia_gfid)) {
+ gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ linked_inode = inode_link(loc->inode, loc->parent, loc->name,
+ &local->stbuf);
+ if (!linked_inode) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LINK_INODE_FAILED,
+ "pgfid=%s", pgfid, "name=%s", loc->name, "gfid=%s", gfid,
+ NULL);
+ ret = 0;
+ goto sorry_no_fix;
+ }
+
+ inode = loc->inode;
+ loc->inode = linked_inode;
+ inode_unref(inode);
+ }
+
+ if (local->need_xattr_heal && (local->mds_xattr)) {
+ dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr,
+ NULL, NULL);
+ dict_unref(local->mds_xattr);
+ local->mds_xattr = NULL;
+ }
+
+ dht_layout_anomalies(this, loc, layout, &local->selfheal.hole_cnt,
+ &local->selfheal.overlaps_cnt,
+ &local->selfheal.missing_cnt, &local->selfheal.down,
+ &local->selfheal.misc, NULL);
+
+ down = local->selfheal.down;
+ misc = local->selfheal.misc;
+
+ if (down) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SELFHEAL_FAILED,
+ "path=%s", loc->path, "subvol-down=%d", down, "Not-fixing",
+ "gfid=%s", gfid, NULL);
+ ret = 0;
+ goto sorry_no_fix;
+ }
+
+ if (misc) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SELFHEAL_FAILED,
+ "path=%s", loc->path, "misc=%d", misc, "unrecoverable-errors",
+ "gfid=%s", gfid, NULL);
+
+ ret = 0;
+ goto sorry_no_fix;
+ }
+
+ dht_layout_sort_volname(layout);
+ local->heal_layout = _gf_true;
+
+ /* Ignore return value as it can be inferred from result of
+ * dht_layout_anomalies
+ */
+ dht_selfheal_dir_getafix(frame, loc, layout);
+
+ if (!(local->selfheal.hole_cnt || local->selfheal.overlaps_cnt ||
+ local->selfheal.missing_cnt)) {
+ local->heal_layout = _gf_false;
+ }
+
+ ret = dht_selfheal_dir_mkdir(frame, loc, layout, 0);
+ if (ret < 0) {
+ ret = 0;
+ goto sorry_no_fix;
+ }
+
+ return 0;
sorry_no_fix:
- /* TODO: need to put appropriate local->op_errno */
- dht_selfheal_dir_finish (frame, this, ret);
+ /* TODO: need to put appropriate local->op_errno */
+ dht_selfheal_dir_finish(frame, this, ret, 1);
+
+ return 0;
+}
- return 0;
+int
+dht_selfheal_restore(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+ loc_t *loc, dht_layout_t *layout)
+{
+ int ret = 0;
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->selfheal.dir_cbk = dir_cbk;
+ local->selfheal.layout = dht_layout_ref(frame->this, layout);
+
+ ret = dht_selfheal_dir_mkdir(frame, loc, layout, 1);
+
+ return ret;
+}
+
+int
+dht_dir_heal_xattrs(void *data)
+{
+ call_frame_t *frame = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *mds_subvol = NULL;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dict_t *user_xattr = NULL;
+ dict_t *internal_xattr = NULL;
+ dict_t *mds_xattr = NULL;
+ dict_t *xdata = NULL;
+ int call_cnt = 0;
+ int ret = -1;
+ int uret = 0;
+ int uflag = 0;
+ int i = 0;
+ int xattr_hashed = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ int32_t allzero[1] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", data, out);
+
+ frame = data;
+ local = frame->local;
+ this = frame->this;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, local, out);
+ mds_subvol = local->mds_subvol;
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, out);
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ if (!mds_subvol) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_MDS_SUBVOL, "path=%s",
+ local->loc.path, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+
+ if ((local->loc.inode && gf_uuid_is_null(local->loc.inode->gfid)) ||
+ gf_uuid_is_null(local->loc.gfid)) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_NOT_PRESENT,
+ "skip-heal path=%s", local->loc.path, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+
+ internal_xattr = dict_new();
+ if (!internal_xattr) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED,
+ "dictionary", NULL);
+ goto out;
+ }
+ xdata = dict_new();
+ if (!xdata) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED,
+ "dictionary", NULL);
+ goto out;
+ }
+
+ call_cnt = conf->subvolume_cnt;
+
+ user_xattr = dict_new();
+ if (!user_xattr) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED,
+ "dictionary", NULL);
+ goto out;
+ }
+
+ ret = syncop_listxattr(local->mds_subvol, &local->loc, &mds_xattr, NULL,
+ NULL);
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LIST_XATTRS_FAILED,
+ "path=%s", local->loc.path, "name=%s", local->mds_subvol->name,
+ NULL);
+ }
+
+ if (!mds_xattr)
+ goto out;
+
+ dht_dir_set_heal_xattr(this, local, user_xattr, mds_xattr, &uret, &uflag);
+
+ /* To set quota related xattr need to set GLUSTERFS_INTERNAL_FOP_KEY
+ * key value to 1
+ */
+ if (dict_get(user_xattr, QUOTA_LIMIT_KEY) ||
+ dict_get(user_xattr, QUOTA_LIMIT_OBJECTS_KEY)) {
+ ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "key=%s", GLUSTERFS_INTERNAL_FOP_KEY, "path=%s",
+ local->loc.path, NULL);
+ goto out;
+ }
+ }
+ if (uret <= 0 && !uflag)
+ goto out;
+
+ for (i = 0; i < call_cnt; i++) {
+ subvol = conf->subvolumes[i];
+ if (subvol == mds_subvol)
+ continue;
+ if (uret || uflag) {
+ /* Custom xattr heal is required - let posix handle it */
+ ret = dict_set_int8(xdata, "sync_backend_xattrs", _gf_true);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", local->loc.path, "key=%s",
+ "sync_backend_xattrs", NULL);
+ goto out;
+ }
+
+ ret = syncop_setxattr(subvol, &local->loc, user_xattr, 0, xdata,
+ NULL);
+ if (ret) {
+ xattr_hashed = 1;
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "set-user-xattr-failed path=%s", local->loc.path,
+ "subvol=%s", subvol->name, "gfid=%s", gfid, NULL);
+ } else {
+ dict_del(xdata, "sync_backend_xattrs");
+ }
+ }
+ }
+ /* After heal all custom xattr reset internal MDS xattr to 0 */
+ if (!xattr_hashed) {
+ ret = dht_dict_set_array(internal_xattr, conf->mds_xattr_key, allzero,
+ 1);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "key=%s", conf->mds_xattr_key, "path=%s", local->loc.path,
+ NULL);
+ goto out;
+ }
+ ret = syncop_setxattr(mds_subvol, &local->loc, internal_xattr, 0, NULL,
+ NULL);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path,
+ "subvol=%s", mds_subvol->name, "gfid=%s", gfid, NULL);
+ }
+ }
+
+out:
+ if (user_xattr)
+ dict_unref(user_xattr);
+ if (mds_xattr)
+ dict_unref(mds_xattr);
+ if (internal_xattr)
+ dict_unref(internal_xattr);
+ if (xdata)
+ dict_unref(xdata);
+ return 0;
+}
+
+int
+dht_dir_heal_xattrs_done(int ret, call_frame_t *sync_frame, void *data)
+{
+ DHT_STACK_DESTROY(sync_frame);
+ return 0;
}
+int
+dht_dir_attr_heal(void *data)
+{
+ call_frame_t *frame = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *mds_subvol = NULL;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ int call_cnt = 0;
+ int ret = -1;
+ int i = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", data, out);
+
+ frame = data;
+ local = frame->local;
+ this = frame->this;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", local, out);
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO("dht", conf, out);
+
+ mds_subvol = local->mds_subvol;
+ call_cnt = conf->subvolume_cnt;
+
+ if (!__is_root_gfid(local->stbuf.ia_gfid) && (!mds_subvol)) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_MDS_SUBVOL, "path=%s",
+ local->loc.path, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+
+ if (!__is_root_gfid(local->stbuf.ia_gfid)) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_MDS_DOWN_UNABLE_TO_SET, "path=%s",
+ local->loc.path, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < call_cnt; i++) {
+ subvol = conf->subvolumes[i];
+ if (!subvol || subvol == mds_subvol)
+ continue;
+ if (__is_root_gfid(local->stbuf.ia_gfid)) {
+ ret = syncop_setattr(
+ subvol, &local->loc, &local->stbuf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL,
+ NULL, NULL, NULL);
+ } else {
+ ret = syncop_setattr(
+ subvol, &local->loc, &local->mds_stbuf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL,
+ NULL, NULL, NULL);
+ }
+
+ if (ret) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_ATTR_HEAL_FAILED, "path=%s", local->loc.path,
+ "subvol=%s", subvol->name, "gfid=%s", gfid, NULL);
+ }
+ }
+out:
+ return 0;
+}
int
-dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
- loc_t *loc, dht_layout_t *layout)
+dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data)
+{
+ DHT_STACK_DESTROY(sync_frame);
+ return 0;
+}
+
+/* EXIT: dht_update_commit_hash_for_layout */
+static int
+dht_update_commit_hash_for_layout_done(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ /* preserve oldest error */
+ if (op_ret && !local->op_ret) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, NULL);
+
+ return 0;
+}
+
+static int
+dht_update_commit_hash_for_layout_unlock(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ int ret = 0;
+
+ local = frame->local;
+
+ ret = dht_unlock_inodelk(frame, local->lock[0].layout.my_layout.locks,
+ local->lock[0].layout.my_layout.lk_count,
+ dht_update_commit_hash_for_layout_done);
+ if (ret < 0) {
+ /* preserve oldest error, just ... */
+ if (!local->op_ret) {
+ local->op_errno = errno;
+ local->op_ret = -1;
+ }
+
+ gf_smsg(this->name, GF_LOG_WARNING, errno, DHT_MSG_WIND_UNLOCK_FAILED,
+ "path=%s", local->loc.path, NULL);
+
+ dht_update_commit_hash_for_layout_done(frame, NULL, this, 0, 0, NULL);
+ }
+
+ return 0;
+}
+
+static int
+dht_update_commit_hash_for_layout_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ /* store first failure, just because */
+ if (op_ret && !local->op_ret) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+ UNLOCK(&frame->lock);
+
+ this_call_cnt = dht_frame_return(frame);
+
+ if (is_last_call(this_call_cnt)) {
+ dht_update_commit_hash_for_layout_unlock(frame, this);
+ }
+
+ return 0;
+}
+
+static int
+dht_update_commit_hash_for_layout_resume(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
{
- int ret = 0;
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
+ int count = 1, ret = -1, i = 0, j = 0;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ int32_t *disk_layout = NULL;
+ dict_t **xattr = NULL;
+
+ local = frame->local;
+ conf = frame->this->private;
+ count = conf->local_subvols_cnt;
+ layout = local->layout;
+
+ if (op_ret < 0) {
+ goto err_done;
+ }
+
+ /* We precreate the xattr list as we cannot change call count post the
+ * first wind as we may never continue from there. So we finish prep
+ * work before winding the setxattrs */
+ xattr = GF_CALLOC(count, sizeof(*xattr), gf_common_mt_char);
+ if (!xattr) {
+ local->op_errno = errno;
+
+ gf_smsg(this->name, GF_LOG_WARNING, errno, DHT_MSG_COMMIT_HASH_FAILED,
+ "allocation-failed path=%s", local->loc.path, NULL);
+
+ goto err;
+ }
+
+ for (i = 0; i < count; i++) {
+ /* find the layout index for the subvolume */
+ ret = dht_layout_index_for_subvol(layout, conf->local_subvols[i]);
+ if (ret < 0) {
+ local->op_errno = ENOENT;
+
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMMIT_HASH_FAILED,
+ "path=%s", local->loc.path, "subvol=%s",
+ conf->local_subvols[i]->name, "find-disk-layout-failed",
+ NULL);
+
+ goto err;
+ }
+ j = ret;
+
+ /* update the commit hash for the layout */
+ layout->list[j].commit_hash = layout->commit_hash;
+
+ /* extract the current layout */
+ ret = dht_disk_layout_extract(this, layout, j, &disk_layout);
+ if (ret == -1) {
+ local->op_errno = errno;
+
+ gf_smsg(this->name, GF_LOG_WARNING, errno,
+ DHT_MSG_COMMIT_HASH_FAILED, "path=%s", local->loc.path,
+ "subvol=%s", conf->local_subvols[i]->name,
+ "extract-disk-layout-failed", NULL);
+ goto err;
+ }
+
+ xattr[i] = dict_new();
+ if (!xattr[i]) {
+ local->op_errno = errno;
+
+ gf_smsg(this->name, GF_LOG_WARNING, errno,
+ DHT_MSG_COMMIT_HASH_FAILED, "path=%s Allocation-failed",
+ local->loc.path, NULL);
+
+ goto err;
+ }
+
+ ret = dict_set_bin(xattr[i], conf->xattr_name, disk_layout, 4 * 4);
+ if (ret != 0) {
+ local->op_errno = ENOMEM;
+
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "path=%s",
+ local->loc.path, "subvol=%s", conf->local_subvols[i]->name,
+ "set-xattr-failed", NULL);
+
+ goto err;
+ }
+ disk_layout = NULL;
+
+ gf_msg_trace(this->name, 0,
+ "setting commit hash %u on subvolume %s"
+ " for %s",
+ layout->list[j].commit_hash, conf->local_subvols[i]->name,
+ local->loc.path);
+ }
+
+ /* wind the setting of the commit hash across the local subvols */
+ local->call_cnt = count;
+ local->op_ret = 0;
+ local->op_errno = 0;
+ for (i = 0; i < count; i++) {
+ STACK_WIND(frame, dht_update_commit_hash_for_layout_cbk,
+ conf->local_subvols[i],
+ conf->local_subvols[i]->fops->setxattr, &local->loc,
+ xattr[i], 0, NULL);
+ }
+ for (i = 0; i < count; i++)
+ dict_unref(xattr[i]);
+ GF_FREE(xattr);
+
+ return 0;
+err:
+ if (xattr) {
+ for (i = 0; i < count; i++) {
+ if (xattr[i])
+ dict_unref(xattr[i]);
+ }
+
+ GF_FREE(xattr);
+ }
+
+ GF_FREE(disk_layout);
+
+ local->op_ret = -1;
+
+ dht_update_commit_hash_for_layout_unlock(frame, this);
- local = frame->local;
+ return 0;
+err_done:
+ local->op_ret = -1;
- local->selfheal.dir_cbk = dir_cbk;
- local->selfheal.layout = dht_layout_ref (frame->this, layout);
+ dht_update_commit_hash_for_layout_done(frame, NULL, this, 0, 0, NULL);
- ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1);
+ return 0;
+}
+
+/* ENTER: dht_update_commit_hash_for_layout (see EXIT above)
+ * This function is invoked from rebalance only.
+ * As a result, the check here is simple enough to see if defrag is present
+ * in the conf, as other data would be populated appropriately if so.
+ * If ever this was to be used in other code paths, checks would need to
+ * change.
+ *
+ * Functional details:
+ * - Lock the inodes on the subvols that we want the commit hash updated
+ * - Update each layout with the inode layout, modified to take in the new
+ * commit hash.
+ * - Unlock and return.
+ */
+int
+dht_update_commit_hash_for_layout(call_frame_t *frame)
+{
+ dht_local_t *local = NULL;
+ int count = 1, ret = -1, i = 0;
+ dht_lock_t **lk_array = NULL;
+ dht_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err);
+
+ local = frame->local;
+ conf = frame->this->private;
+
+ if (!conf->defrag)
+ goto err;
+
+ count = conf->local_subvols_cnt;
+ lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char);
+ if (lk_array == NULL)
+ goto err;
+
+ for (i = 0; i < count; i++) {
+ lk_array[i] = dht_lock_new(frame->this, conf->local_subvols[i],
+ &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN,
+ NULL, FAIL_ON_ANY_ERROR);
+ if (lk_array[i] == NULL)
+ goto err;
+ }
+
+ local->lock[0].layout.my_layout.locks = lk_array;
+ local->lock[0].layout.my_layout.lk_count = count;
+
+ ret = dht_blocking_inodelk(frame, lk_array, count,
+ dht_update_commit_hash_for_layout_resume);
+ if (ret < 0) {
+ local->lock[0].layout.my_layout.locks = NULL;
+ local->lock[0].layout.my_layout.lk_count = 0;
+ goto err;
+ }
+
+ return 0;
+err:
+ if (lk_array != NULL) {
+ dht_lock_array_free(lk_array, count);
+ GF_FREE(lk_array);
+ }
- return ret;
+ return -1;
}
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
new file mode 100644
index 00000000000..bb72b0ffbb5
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -0,0 +1,1104 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+/* TODO: add NS locking */
+#include <glusterfs/statedump.h>
+#include "dht-common.h"
+#include "dht-messages.h"
+
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+/* TODO:
+ - use volumename in xattr instead of "dht"
+ - use NS locks
+ - handle all cases in self heal layout reconstruction
+ - complete linkfile selfheal
+*/
+
+static void
+dht_layout_dump(dht_layout_t *layout, const char *prefix)
+{
+ char key[GF_DUMP_MAX_BUF_LEN];
+ int i = 0;
+
+ if (!layout)
+ goto out;
+
+ gf_proc_dump_build_key(key, prefix, "cnt");
+ gf_proc_dump_write(key, "%d", layout->cnt);
+ gf_proc_dump_build_key(key, prefix, "preset");
+ gf_proc_dump_write(key, "%d", layout->preset);
+ gf_proc_dump_build_key(key, prefix, "gen");
+ gf_proc_dump_write(key, "%d", layout->gen);
+ if (layout->type != IA_INVAL) {
+ gf_proc_dump_build_key(key, prefix, "inode type");
+ gf_proc_dump_write(key, "%d", layout->type);
+ }
+
+ if (!IA_ISDIR(layout->type))
+ goto out;
+
+ for (i = 0; i < layout->cnt; i++) {
+ gf_proc_dump_build_key(key, prefix, "list[%d].err", i);
+ gf_proc_dump_write(key, "%d", layout->list[i].err);
+ gf_proc_dump_build_key(key, prefix, "list[%d].start", i);
+ gf_proc_dump_write(key, "0x%x", layout->list[i].start);
+ gf_proc_dump_build_key(key, prefix, "list[%d].stop", i);
+ gf_proc_dump_write(key, "0x%x", layout->list[i].stop);
+ if (layout->list[i].xlator) {
+ gf_proc_dump_build_key(key, prefix, "list[%d].xlator.type", i);
+ gf_proc_dump_write(key, "%s", layout->list[i].xlator->type);
+ gf_proc_dump_build_key(key, prefix, "list[%d].xlator.name", i);
+ gf_proc_dump_write(key, "%s", layout->list[i].xlator->name);
+ }
+ }
+
+out:
+ return;
+}
+
+int32_t
+dht_priv_dump(xlator_t *this)
+{
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ char key[GF_DUMP_MAX_BUF_LEN];
+ int i = 0;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+
+ if (!this)
+ goto out;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ ret = TRY_LOCK(&conf->subvolume_lock);
+ if (ret != 0) {
+ return ret;
+ }
+
+ gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
+ gf_proc_dump_build_key(key_prefix, "xlator.cluster.dht", "%s.priv",
+ this->name);
+ gf_proc_dump_write("subvol_cnt", "%d", conf->subvolume_cnt);
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ snprintf(key, sizeof(key), "subvolumes[%d]", i);
+ gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
+ conf->subvolumes[i]->name);
+ if (conf->file_layouts && conf->file_layouts[i]) {
+ snprintf(key, sizeof(key), "file_layouts[%d]", i);
+ dht_layout_dump(conf->file_layouts[i], key);
+ }
+ if (conf->dir_layouts && conf->dir_layouts[i]) {
+ snprintf(key, sizeof(key), "dir_layouts[%d]", i);
+ dht_layout_dump(conf->dir_layouts[i], key);
+ }
+ if (conf->subvolume_status) {
+ snprintf(key, sizeof(key), "subvolume_status[%d]", i);
+ gf_proc_dump_write(key, "%d", (int)conf->subvolume_status[i]);
+ }
+ }
+
+ gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
+ gf_proc_dump_write("gen", "%d", conf->gen);
+ gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk);
+ gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);
+ gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
+ gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
+ gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
+ gf_proc_dump_write("use-readdirp", "%d", conf->use_readdirp);
+
+ if (conf->du_stats && conf->subvolume_status) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i])
+ continue;
+
+ snprintf(key, sizeof(key), "subvolumes[%d]", i);
+ gf_proc_dump_write(key, "%s", conf->subvolumes[i]->name);
+
+ snprintf(key, sizeof(key), "du_stats[%d].avail_percent", i);
+ gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_percent);
+
+ snprintf(key, sizeof(key), "du_stats[%d].avail_space", i);
+ gf_proc_dump_write(key, "%" PRIu64, conf->du_stats[i].avail_space);
+
+ snprintf(key, sizeof(key), "du_stats[%d].avail_inodes", i);
+ gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_inodes);
+
+ snprintf(key, sizeof(key), "du_stats[%d].log", i);
+ gf_proc_dump_write(key, "%" PRIu32, conf->du_stats[i].log);
+ }
+ }
+
+ if (conf->last_stat_fetch)
+ gf_proc_dump_write("last_stat_fetch", "%s",
+ ctime(&conf->last_stat_fetch));
+
+ UNLOCK(&conf->subvolume_lock);
+
+out:
+ return ret;
+}
+
+int32_t
+dht_inodectx_dump(xlator_t *this, inode_t *inode)
+{
+ int ret = -1;
+ dht_layout_t *layout = NULL;
+
+ if (!this)
+ goto out;
+ if (!inode)
+ goto out;
+
+ ret = dht_inode_ctx_layout_get(inode, this, &layout);
+
+ if ((ret != 0) || !layout)
+ return ret;
+
+ gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name);
+ dht_layout_dump(layout, "layout");
+
+out:
+ return ret;
+}
+
+void
+dht_fini(xlator_t *this)
+{
+ int i = 0;
+ dht_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+
+ conf = this->private;
+ this->private = NULL;
+ if (conf) {
+ if (conf->file_layouts) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ GF_FREE(conf->file_layouts[i]);
+ }
+ GF_FREE(conf->file_layouts);
+ }
+
+ dict_unref(conf->leaf_to_subvol);
+
+ /* allocated in dht_init_subvolumes() */
+ GF_FREE(conf->subvolumes);
+ GF_FREE(conf->subvolume_status);
+ GF_FREE(conf->last_event);
+ GF_FREE(conf->subvol_up_time);
+ GF_FREE(conf->du_stats);
+ GF_FREE(conf->decommissioned_bricks);
+
+ /* allocated in dht_init() */
+ GF_FREE(conf->mds_xattr_key);
+ GF_FREE(conf->link_xattr_name);
+ GF_FREE(conf->commithash_xattr_name);
+ GF_FREE(conf->wild_xattr_name);
+
+ /* allocated in dht_init_regex() */
+ if (conf->rsync_regex_valid)
+ regfree(&conf->rsync_regex);
+ if (conf->extra_regex_valid)
+ regfree(&conf->extra_regex);
+
+ synclock_destroy(&conf->link_lock);
+
+ if (conf->lock_pool)
+ mem_pool_destroy(conf->lock_pool);
+
+ GF_FREE(conf);
+ }
+out:
+ return;
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+
+ ret = xlator_mem_acct_init(this, gf_dht_mt_end + 1);
+
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "Memory accounting init failed");
+ return ret;
+ }
+out:
+ return ret;
+}
+
+static int
+dht_parse_decommissioned_bricks(xlator_t *this, dht_conf_t *conf,
+ const char *bricks)
+{
+ int i = 0;
+ int ret = -1;
+ char *tmpstr = NULL;
+ char *dup_brick = NULL;
+ char *node = NULL;
+
+ if (!conf || !bricks)
+ goto out;
+
+ dup_brick = gf_strdup(bricks);
+ if (dup_brick == NULL) {
+ goto out;
+ }
+
+ node = strtok_r(dup_brick, ",", &tmpstr);
+ while (node) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!strcmp(conf->subvolumes[i]->name, node)) {
+ conf->decommissioned_bricks[i] = conf->subvolumes[i];
+ conf->decommission_subvols_cnt++;
+ gf_msg(this->name, GF_LOG_INFO, 0,
+ DHT_MSG_SUBVOL_DECOMMISSION_INFO,
+ "decommissioning subvolume %s",
+ conf->subvolumes[i]->name);
+ break;
+ }
+ }
+ if (i == conf->subvolume_cnt) {
+ /* Wrong node given. */
+ goto out;
+ }
+ node = strtok_r(NULL, ",", &tmpstr);
+ }
+
+ ret = 0;
+ conf->decommission_in_progress = 1;
+out:
+ GF_FREE(dup_brick);
+
+ return ret;
+}
+
+static void
+dht_decommissioned_remove(xlator_t *this, dht_conf_t *conf)
+{
+ int i = 0;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i]) {
+ conf->decommissioned_bricks[i] = NULL;
+ conf->decommission_subvols_cnt--;
+ }
+ }
+}
+
+static void
+dht_init_regex(xlator_t *this, dict_t *odict, char *name, regex_t *re,
+ gf_boolean_t *re_valid, dht_conf_t *conf)
+{
+ char *temp_str = NULL;
+
+ if (dict_get_str(odict, name, &temp_str) != 0) {
+ if (strcmp(name, "rsync-hash-regex")) {
+ return;
+ }
+ temp_str = "^\\.(.+)\\.[^.]+$";
+ }
+
+ LOCK(&conf->lock);
+ {
+ if (*re_valid) {
+ regfree(re);
+ *re_valid = _gf_false;
+ }
+
+ if (!strcmp(temp_str, "none")) {
+ goto unlock;
+ }
+
+ if (regcomp(re, temp_str, REG_EXTENDED) == 0) {
+ gf_msg_debug(this->name, 0, "using regex %s = %s", name, temp_str);
+ *re_valid = _gf_true;
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REGEX_INFO,
+ "compiling regex %s failed", temp_str);
+ }
+ }
+unlock:
+ UNLOCK(&conf->lock);
+}
+
+int
+dht_set_subvol_range(xlator_t *this)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf)
+ goto out;
+
+ conf->leaf_to_subvol = dict_new();
+ if (!conf->leaf_to_subvol)
+ goto out;
+
+ ret = glusterfs_reachable_leaves(this, conf->leaf_to_subvol);
+
+out:
+ return ret;
+}
+
+static int
+dht_configure_throttle(xlator_t *this, dht_conf_t *conf, char *temp_str)
+{
+ int rebal_thread_count = 0;
+ int ret = 0;
+
+ pthread_mutex_lock(&conf->defrag->dfq_mutex);
+ {
+ if (!strcasecmp(temp_str, "lazy")) {
+ conf->defrag->recon_thread_count = 1;
+ } else if (!strcasecmp(temp_str, "normal")) {
+ conf->defrag->recon_thread_count = 2;
+ } else if (!strcasecmp(temp_str, "aggressive")) {
+ conf->defrag->recon_thread_count = MAX(MAX_REBAL_THREADS - 4, 4);
+ } else if ((gf_string2int(temp_str, &rebal_thread_count) == 0)) {
+ if ((rebal_thread_count > 0) &&
+ (rebal_thread_count <= MAX_REBAL_THREADS)) {
+ conf->defrag->recon_thread_count = rebal_thread_count;
+ pthread_mutex_unlock(&conf->defrag->dfq_mutex);
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "rebal thread count configured to %d",
+ rebal_thread_count);
+ goto out;
+ } else {
+ pthread_mutex_unlock(&conf->defrag->dfq_mutex);
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+ "Invalid option: Reconfigure: "
+ "rebal-throttle should be "
+ "within range of 0 and maximum number of"
+ " cores available");
+ ret = -1;
+ goto out;
+ }
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+ "Invalid option: Reconfigure: "
+ "rebal-throttle should be {lazy|normal|aggressive}"
+ " or a number up to the number of cores available,"
+ " not (%s), defaulting to (%d)",
+ temp_str, conf->dthrottle);
+ ret = -1;
+ }
+ }
+ pthread_mutex_unlock(&conf->defrag->dfq_mutex);
+
+out:
+ return ret;
+}
+
+int
+dht_reconfigure(xlator_t *this, dict_t *options)
+{
+ dht_conf_t *conf = NULL;
+ char *temp_str = NULL;
+ gf_boolean_t search_unhashed;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", options, out);
+
+ conf = this->private;
+ if (!conf)
+ return 0;
+
+ if (dict_get_str(options, "lookup-unhashed", &temp_str) == 0) {
+ /* If option is not "auto", other options _should_ be boolean*/
+ if (strcasecmp(temp_str, "auto")) {
+ if (!gf_string2boolean(temp_str, &search_unhashed)) {
+ gf_msg_debug(this->name, 0,
+ "Reconfigure: "
+ "lookup-unhashed reconfigured(%s)",
+ temp_str);
+ conf->search_unhashed = search_unhashed;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+ "Invalid option: Reconfigure: "
+ "lookup-unhashed should be boolean,"
+ " not (%s), defaulting to (%d)",
+ temp_str, conf->search_unhashed);
+ ret = -1;
+ goto out;
+ }
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Reconfigure:"
+ " lookup-unhashed reconfigured auto ");
+ conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
+ }
+ }
+
+ GF_OPTION_RECONF("lookup-optimize", conf->lookup_optimize, options, bool,
+ out);
+
+ GF_OPTION_RECONF("min-free-disk", conf->min_free_disk, options,
+ percent_or_size, out);
+ /* option can be any one of percent or bytes */
+ conf->disk_unit = 0;
+ if (conf->min_free_disk < 100.0)
+ conf->disk_unit = 'p';
+
+ GF_OPTION_RECONF("min-free-inodes", conf->min_free_inodes, options, percent,
+ out);
+
+ GF_OPTION_RECONF("directory-layout-spread", conf->dir_spread_cnt, options,
+ uint32, out);
+
+ GF_OPTION_RECONF("readdir-optimize", conf->readdir_optimize, options, bool,
+ out);
+ GF_OPTION_RECONF("randomize-hash-range-by-gfid", conf->randomize_by_gfid,
+ options, bool, out);
+
+ GF_OPTION_RECONF("lock-migration", conf->lock_migration_enabled, options,
+ bool, out);
+
+ GF_OPTION_RECONF("force-migration", conf->force_migration, options, bool,
+ out);
+
+ if (conf->defrag) {
+ if (dict_get_str(options, "rebal-throttle", &temp_str) == 0) {
+ ret = dht_configure_throttle(this, conf, temp_str);
+ if (ret == -1)
+ goto out;
+ }
+ }
+
+ if (conf->defrag) {
+ conf->defrag->lock_migration_enabled = conf->lock_migration_enabled;
+ }
+
+ if (conf->defrag) {
+ GF_OPTION_RECONF("rebalance-stats", conf->defrag->stats, options, bool,
+ out);
+ }
+
+ if (dict_get_str(options, "decommissioned-bricks", &temp_str) == 0) {
+ ret = dht_parse_decommissioned_bricks(this, conf, temp_str);
+ if (ret == -1)
+ goto out;
+ } else {
+ dht_decommissioned_remove(this, conf);
+ }
+
+ dht_init_regex(this, options, "rsync-hash-regex", &conf->rsync_regex,
+ &conf->rsync_regex_valid, conf);
+ dht_init_regex(this, options, "extra-hash-regex", &conf->extra_regex,
+ &conf->extra_regex_valid, conf);
+
+ GF_OPTION_RECONF("weighted-rebalance", conf->do_weighting, options, bool,
+ out);
+
+ GF_OPTION_RECONF("use-readdirp", conf->use_readdirp, options, bool, out);
+ ret = 0;
+out:
+ return ret;
+}
+
+static int
+gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag,
+ char *data)
+{
+ int ret = -1;
+ char *tmp_str = NULL;
+ char *tmp_str1 = NULL;
+ char *dup_str = NULL;
+ char *num = NULL;
+ char *pattern_str = NULL;
+ char *pattern = NULL;
+ gf_defrag_pattern_list_t *temp_list = NULL;
+ gf_defrag_pattern_list_t *pattern_list = NULL;
+
+ if (!this || !defrag || !data)
+ goto out;
+
+ /* Get the pattern for pattern list. "pattern:<optional-size>"
+ * eg: *avi, *pdf:10MB, *:1TB
+ */
+ pattern_str = strtok_r(data, ",", &tmp_str);
+ while (pattern_str) {
+ dup_str = gf_strdup(pattern_str);
+ if (!dup_str)
+ goto out;
+ pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1);
+ if (!pattern_list) {
+ goto out;
+ }
+ pattern = strtok_r(dup_str, ":", &tmp_str1);
+ num = strtok_r(NULL, ":", &tmp_str1);
+ if (!pattern)
+ goto out;
+ if (!num) {
+ if (gf_string2bytesize_uint64(pattern, &pattern_list->size) == 0) {
+ pattern = "*";
+ }
+ } else if (gf_string2bytesize_uint64(num, &pattern_list->size) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+ "Invalid option. Defrag pattern:"
+ " Invalid number format \"%s\"",
+ num);
+ goto out;
+ }
+ memcpy(pattern_list->path_pattern, pattern, strlen(dup_str));
+
+ if (!defrag->defrag_pattern)
+ temp_list = NULL;
+ else
+ temp_list = defrag->defrag_pattern;
+
+ pattern_list->next = temp_list;
+
+ defrag->defrag_pattern = pattern_list;
+ pattern_list = NULL;
+
+ GF_FREE(dup_str);
+ dup_str = NULL;
+
+ pattern_str = strtok_r(NULL, ",", &tmp_str);
+ }
+
+ ret = 0;
+out:
+ if (ret)
+ GF_FREE(pattern_list);
+ GF_FREE(dup_str);
+
+ return ret;
+}
+
+static int
+dht_init_methods(xlator_t *this)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", this, err);
+
+ conf = this->private;
+ methods = &(conf->methods);
+
+ methods->migration_get_dst_subvol = dht_migration_get_dst_subvol;
+ methods->migration_other = NULL;
+ methods->layout_search = dht_layout_search;
+
+ ret = 0;
+err:
+ return ret;
+}
+
+int
+dht_init(xlator_t *this)
+{
+ dht_conf_t *conf = NULL;
+ char *temp_str = NULL;
+ int ret = -1;
+ int i = 0;
+ gf_defrag_info_t *defrag = NULL;
+ int cmd = 0;
+ char *node_uuid = NULL;
+ uint32_t commit_hash = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", this, err);
+
+ if (!this->children) {
+ gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_CONFIGURATION,
+ "Distribute needs more than one subvolume");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_CONFIGURATION,
+ "dangling volume. check volfile");
+ }
+
+ conf = GF_CALLOC(1, sizeof(*conf), gf_dht_mt_dht_conf_t);
+ if (!conf) {
+ goto err;
+ }
+
+ LOCK_INIT(&conf->subvolume_lock);
+ LOCK_INIT(&conf->layout_lock);
+ LOCK_INIT(&conf->lock);
+ synclock_init(&conf->link_lock, SYNC_LOCK_DEFAULT);
+
+ /* We get the commit-hash to set only for rebalance process */
+ if (dict_get_uint32(this->options, "commit-hash", &commit_hash) == 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_COMMIT_HASH_INFO,
+ "%s using commit hash %u", __func__, commit_hash);
+ conf->vol_commit_hash = commit_hash;
+ conf->vch_forced = _gf_true;
+ }
+
+ ret = dict_get_int32(this->options, "rebalance-cmd", &cmd);
+
+ if (cmd) {
+ defrag = GF_CALLOC(1, sizeof(gf_defrag_info_t), gf_defrag_info_mt);
+
+ GF_VALIDATE_OR_GOTO(this->name, defrag, err);
+
+ LOCK_INIT(&defrag->lock);
+
+ defrag->is_exiting = 0;
+
+ conf->defrag = defrag;
+ defrag->this = this;
+
+ ret = dict_get_str(this->options, "node-uuid", &node_uuid);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_CONFIGURATION,
+ "Invalid volume configuration: "
+ "node-uuid not specified");
+ goto err;
+ }
+
+ if (gf_uuid_parse(node_uuid, defrag->node_uuid)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+ "Invalid option:"
+ " Cannot parse glusterd node uuid");
+ goto err;
+ }
+
+ defrag->cmd = cmd;
+
+ defrag->stats = _gf_false;
+
+ defrag->queue = NULL;
+
+ defrag->crawl_done = 0;
+
+ defrag->global_error = 0;
+
+ defrag->q_entry_count = 0;
+
+ defrag->wakeup_crawler = 0;
+
+ pthread_mutex_init(&defrag->dfq_mutex, 0);
+ pthread_cond_init(&defrag->parallel_migration_cond, 0);
+ pthread_cond_init(&defrag->rebalance_crawler_alarm, 0);
+ pthread_cond_init(&defrag->df_wakeup_thread, 0);
+
+ pthread_mutex_init(&defrag->fc_mutex, 0);
+ pthread_cond_init(&defrag->fc_wakeup_cond, 0);
+
+ defrag->global_error = 0;
+ }
+
+ conf->use_fallocate = 1;
+
+ conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
+ if (dict_get_str(this->options, "lookup-unhashed", &temp_str) == 0) {
+ /* If option is not "auto", other options _should_ be boolean */
+ if (strcasecmp(temp_str, "auto")) {
+ gf_boolean_t search_unhashed_bool;
+ ret = gf_string2boolean(temp_str, &search_unhashed_bool);
+ if (ret == -1) {
+ goto err;
+ }
+ conf->search_unhashed = search_unhashed_bool
+ ? GF_DHT_LOOKUP_UNHASHED_ON
+ : GF_DHT_LOOKUP_UNHASHED_OFF;
+ } else {
+ conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
+ }
+ }
+
+ GF_OPTION_INIT("lookup-optimize", conf->lookup_optimize, bool, err);
+
+ GF_OPTION_INIT("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, err);
+
+ GF_OPTION_INIT("use-readdirp", conf->use_readdirp, bool, err);
+
+ GF_OPTION_INIT("min-free-disk", conf->min_free_disk, percent_or_size, err);
+
+ GF_OPTION_INIT("min-free-inodes", conf->min_free_inodes, percent, err);
+
+ conf->dir_spread_cnt = conf->subvolume_cnt;
+ GF_OPTION_INIT("directory-layout-spread", conf->dir_spread_cnt, uint32,
+ err);
+
+ GF_OPTION_INIT("assert-no-child-down", conf->assert_no_child_down, bool,
+ err);
+
+ GF_OPTION_INIT("readdir-optimize", conf->readdir_optimize, bool, err);
+
+ GF_OPTION_INIT("lock-migration", conf->lock_migration_enabled, bool, err);
+
+ GF_OPTION_INIT("force-migration", conf->force_migration, bool, err);
+
+ if (defrag) {
+ defrag->lock_migration_enabled = conf->lock_migration_enabled;
+
+ GF_OPTION_INIT("rebalance-stats", defrag->stats, bool, err);
+ if (dict_get_str(this->options, "rebalance-filter", &temp_str) == 0) {
+ if (gf_defrag_pattern_list_fill(this, defrag, temp_str) == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+ "Invalid option:"
+ " Cannot parse rebalance-filter (%s)",
+ temp_str);
+
+ goto err;
+ }
+ }
+ }
+
+ /* option can be any one of percent or bytes */
+ conf->disk_unit = 0;
+ if (conf->min_free_disk < 100)
+ conf->disk_unit = 'p';
+
+ ret = dht_init_subvolumes(this, conf);
+ if (ret == -1) {
+ goto err;
+ }
+
+ if (cmd) {
+ ret = dht_init_local_subvolumes(this, conf);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_INIT_LOCAL_SUBVOL_FAILED,
+ "dht_init_local_subvolumes failed");
+ goto err;
+ }
+ }
+
+ if (dict_get_str(this->options, "decommissioned-bricks", &temp_str) == 0) {
+ ret = dht_parse_decommissioned_bricks(this, conf, temp_str);
+ if (ret == -1)
+ goto err;
+ }
+
+ dht_init_regex(this, this->options, "rsync-hash-regex", &conf->rsync_regex,
+ &conf->rsync_regex_valid, conf);
+ dht_init_regex(this, this->options, "extra-hash-regex", &conf->extra_regex,
+ &conf->extra_regex_valid, conf);
+
+ ret = dht_layouts_init(this, conf);
+ if (ret == -1) {
+ goto err;
+ }
+
+ conf->gen = 1;
+
+ this->local_pool = mem_pool_new(dht_local_t, 512);
+ if (!this->local_pool) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ " DHT initialisation failed. "
+ "failed to create local_t's memory pool");
+ goto err;
+ }
+
+ GF_OPTION_INIT("randomize-hash-range-by-gfid", conf->randomize_by_gfid,
+ bool, err);
+
+ if (defrag) {
+ GF_OPTION_INIT("rebal-throttle", temp_str, str, err);
+ if (temp_str) {
+ ret = dht_configure_throttle(this, conf, temp_str);
+ if (ret == -1)
+ goto err;
+ }
+ }
+
+ GF_OPTION_INIT("xattr-name", conf->xattr_name, str, err);
+ gf_asprintf(&conf->mds_xattr_key, "%s." DHT_MDS_STR, conf->xattr_name);
+ gf_asprintf(&conf->link_xattr_name, "%s." DHT_LINKFILE_STR,
+ conf->xattr_name);
+ gf_asprintf(&conf->commithash_xattr_name, "%s." DHT_COMMITHASH_STR,
+ conf->xattr_name);
+ gf_asprintf(&conf->wild_xattr_name, "%s*", conf->xattr_name);
+ if (!conf->link_xattr_name || !conf->wild_xattr_name) {
+ goto err;
+ }
+
+ GF_OPTION_INIT("weighted-rebalance", conf->do_weighting, bool, err);
+
+ conf->lock_pool = mem_pool_new(dht_lock_t, 512);
+ if (!conf->lock_pool) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INIT_FAILED,
+ "failed to create lock mem_pool, failing "
+ "initialization");
+ goto err;
+ }
+
+ this->private = conf;
+
+ if (dht_set_subvol_range(this))
+ goto err;
+
+ if (dht_init_methods(this))
+ goto err;
+
+ return 0;
+
+err:
+ if (conf) {
+ if (conf->file_layouts) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ GF_FREE(conf->file_layouts[i]);
+ }
+ GF_FREE(conf->file_layouts);
+ }
+
+ GF_FREE(conf->subvolumes);
+
+ GF_FREE(conf->subvolume_status);
+
+ GF_FREE(conf->du_stats);
+
+ GF_FREE(conf->defrag);
+
+ GF_FREE(conf->xattr_name);
+ GF_FREE(conf->link_xattr_name);
+ GF_FREE(conf->wild_xattr_name);
+ GF_FREE(conf->mds_xattr_key);
+
+ if (conf->lock_pool)
+ mem_pool_destroy(conf->lock_pool);
+
+ GF_FREE(conf);
+ }
+
+ return -1;
+}
+
+struct volume_options dht_options[] = {
+ {
+ .key = {"lookup-unhashed"},
+ .value = {"auto", "yes", "no", "enable", "disable", "1", "0", "on",
+ "off"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "on",
+ .description =
+ "This option if set to ON, does a lookup through "
+ "all the sub-volumes, in case a lookup didn't return any result "
+ "from the hash subvolume. If set to OFF, it does not do a lookup "
+ "on the remaining subvolumes.",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .level = OPT_STATUS_BASIC,
+ },
+ {.key = {"lookup-optimize"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description =
+ "This option if set to ON enables the optimization "
+ "of -ve lookups, by not doing a lookup on non-hashed subvolumes for "
+ "files, in case the hashed subvolume does not return any result. "
+ "This option disregards the lookup-unhashed setting, when enabled.",
+ .op_version = {GD_OP_VERSION_3_7_2},
+ .level = OPT_STATUS_ADVANCED,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+ {.key = {"min-free-disk"},
+ .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
+ .default_value = "10%",
+ .description =
+ "Percentage/Size of disk space, after which the "
+ "process starts balancing out the cluster, and logs will appear "
+ "in log files",
+ .op_version = {1},
+ .level = OPT_STATUS_BASIC,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+ {.key = {"min-free-inodes"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .default_value = "5%",
+ .description = "after system has only N% of inodes, warnings "
+ "starts to appear in log files",
+ .op_version = {1},
+ .level = OPT_STATUS_BASIC,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+ {
+ .key = {"unhashed-sticky-bit"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
+ {.key = {"use-readdirp"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "This option if set to ON, forces the use of "
+ "readdirp, and hence also displays the stats of the files.",
+ .level = OPT_STATUS_ADVANCED,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+ {.key = {"assert-no-child-down"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option if set to ON, in the event of "
+ "CHILD_DOWN, will call exit."},
+ {
+ .key = {"directory-layout-spread"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Specifies the directory layout spread. Takes number "
+ "of subvolumes as default value.",
+
+ .op_version = {2},
+ },
+ {
+ .key = {"decommissioned-bricks"},
+ .type = GF_OPTION_TYPE_ANY,
+ .description =
+ "This option if set to ON, decommissions "
+ "the brick, so that no new data is allowed to be created "
+ "on that brick.",
+ .level = OPT_STATUS_ADVANCED,
+ },
+ {
+ .key = {"rebalance-cmd"},
+ .type = GF_OPTION_TYPE_INT,
+ },
+ {
+ .key = {"commit-hash"},
+ .type = GF_OPTION_TYPE_INT,
+ },
+ {
+ .key = {"node-uuid"},
+ .type = GF_OPTION_TYPE_STR,
+ },
+ {
+ .key = {"rebalance-stats"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description =
+ "This option if set to ON displays and logs the "
+ " time taken for migration of each file, during the rebalance "
+ "process. If set to OFF, the rebalance logs will only display the "
+ "time spent in each directory.",
+ .op_version = {2},
+ .level = OPT_STATUS_BASIC,
+ },
+ {.key = {"readdir-optimize"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description =
+ "This option if set to ON enables the optimization "
+ "that allows DHT to requests non-first subvolumes to filter out "
+ "directory entries.",
+ .op_version = {1},
+ .level = OPT_STATUS_ADVANCED,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+ {.key = {"rsync-hash-regex"},
+ .type = GF_OPTION_TYPE_STR,
+ /* Setting a default here doesn't work. See dht_init_regex. */
+ .description =
+ "Regular expression for stripping temporary-file "
+ "suffix and prefix used by rsync, to prevent relocation when the "
+ "file is renamed.",
+ .op_version = {3},
+ .level = OPT_STATUS_BASIC,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+ {.key = {"extra-hash-regex"},
+ .type = GF_OPTION_TYPE_STR,
+ /* Setting a default here doesn't work. See dht_init_regex. */
+ .description =
+ "Regular expression for stripping temporary-file "
+ "suffix and prefix used by an application, to prevent relocation when "
+ "the file is renamed.",
+ .op_version = {3},
+ .level = OPT_STATUS_BASIC,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+ {
+ .key = {"rebalance-filter"},
+ .type = GF_OPTION_TYPE_STR,
+ },
+
+ {
+ .key = {"xattr-name"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "trusted.glusterfs.dht",
+ .description =
+ "Base for extended attributes used by this "
+ "translator instance, to avoid conflicts with others above or "
+ "below it.",
+ .op_version = {3},
+ },
+
+ {.key = {"weighted-rebalance"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description =
+ "When enabled, files will be allocated to bricks "
+ "with a probability proportional to their size. Otherwise, all "
+ "bricks will have the same probability (legacy behavior).",
+ .op_version = {GD_OP_VERSION_3_6_0},
+ .level = OPT_STATUS_BASIC,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+
+ /* NUFA option */
+ {.key = {"local-volume-name"}, .type = GF_OPTION_TYPE_XLATOR},
+
+ /* switch option */
+ {.key = {"pattern.switch.case"}, .type = GF_OPTION_TYPE_ANY},
+
+ {
+ .key = {"randomize-hash-range-by-gfid"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description =
+ "Use gfid of directory to determine the subvolume "
+ "from which hash ranges are allocated starting with 0. "
+ "Note that we still use a directory/file's name to determine the "
+ "subvolume to which it hashes",
+ .op_version = {GD_OP_VERSION_3_6_0},
+ },
+
+ {.key = {"rebal-throttle"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "normal",
+ .description = " Sets the maximum number of parallel file migrations "
+ "allowed on a node during the rebalance operation. The"
+ " default value is normal and allows a max of "
+ "[($(processing units) - 4) / 2), 2] files to be "
+ "migrated at a time. Lazy will allow only one file to "
+ "be migrated at a time and aggressive will allow "
+ "max of [($(processing units) - 4) / 2), 4]",
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .level = OPT_STATUS_BASIC,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC
+
+ },
+
+ {.key = {"lock-migration"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = " If enabled this feature will migrate the posix locks"
+ " associated with a file during rebalance",
+ .op_version = {GD_OP_VERSION_3_8_0},
+ .level = OPT_STATUS_ADVANCED,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+
+ {.key = {"force-migration"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "If disabled, rebalance will not migrate files that "
+ "are being written to by an application",
+ .op_version = {GD_OP_VERSION_4_0_0},
+ .level = OPT_STATUS_ADVANCED,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+
+ {.key = {NULL}},
+};
+
+#define NUM_DHT_OPTIONS (sizeof(dht_options) / sizeof(dht_options[0]))
+
+extern struct volume_options options[NUM_DHT_OPTIONS]
+ __attribute__((alias("dht_options")));
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
index d0f35e0ea0b..53de8292704 100644
--- a/xlators/cluster/dht/src/dht.c
+++ b/xlators/cluster/dht/src/dht.c
@@ -1,552 +1,123 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-/* TODO: add NS locking */
-
-#include "statedump.h"
-#include "dht-common.c"
-
-/* TODO:
- - use volumename in xattr instead of "dht"
- - use NS locks
- - handle all cases in self heal layout reconstruction
- - complete linkfile selfheal
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
-void
-dht_layout_dump (dht_layout_t *layout, const char *prefix)
-{
-
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
-
- if (!layout)
- return;
-
- gf_proc_dump_build_key(key, prefix, "cnt");
- gf_proc_dump_write(key, "%d", layout->cnt);
- gf_proc_dump_build_key(key, prefix, "preset");
- gf_proc_dump_write(key, "%d", layout->preset);
- gf_proc_dump_build_key(key, prefix, "gen");
- gf_proc_dump_write(key, "%d", layout->gen);
- gf_proc_dump_build_key(key, prefix, "type");
- gf_proc_dump_write(key, "%d", layout->type);
-
- for (i = 0; i < layout->cnt; i++) {
- gf_proc_dump_build_key(key, prefix,"list[%d].err", i);
- gf_proc_dump_write(key, "%d", layout->list[i].err);
- gf_proc_dump_build_key(key, prefix,"list[%d].start", i);
- gf_proc_dump_write(key, "%u", layout->list[i].start);
- gf_proc_dump_build_key(key, prefix,"list[%d].stop", i);
- gf_proc_dump_write(key, "%u", layout->list[i].stop);
- if (layout->list[i].xlator) {
- gf_proc_dump_build_key(key, prefix,
- "list[%d].xlator.type", i);
- gf_proc_dump_write(key, "%s",
- layout->list[i].xlator->type);
- gf_proc_dump_build_key(key, prefix,
- "list[%d].xlator.name", i);
- gf_proc_dump_write(key, "%s",
- layout->list[i].xlator->name);
- }
- }
-}
-
-
-int32_t
-dht_priv_dump (xlator_t *this)
-{
- char key_prefix[GF_DUMP_MAX_BUF_LEN];
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
- dht_conf_t *conf = NULL;
- int ret = 0;
-
- if (!this)
- return -1;
-
- conf = this->private;
-
- if (!conf)
- return -1;
-
- ret = TRY_LOCK(&conf->subvolume_lock);
-
- if (ret != 0) {
- gf_log("", GF_LOG_WARNING, "Unable to lock dht subvolume %s",
- this->name);
- return ret;
- }
-
- gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
- gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv",
- this->name);
- gf_proc_dump_build_key(key, key_prefix, "subvolume_cnt");
- gf_proc_dump_write(key,"%d", conf->subvolume_cnt);
- for (i = 0; i < conf->subvolume_cnt; i++) {
- gf_proc_dump_build_key(key, key_prefix, "subvolumes[%d]", i);
- gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
- conf->subvolumes[i]->name);
- if (conf->file_layouts && conf->file_layouts[i]){
- gf_proc_dump_build_key(key, key_prefix,
- "file_layouts[%d]",i);
- dht_layout_dump(conf->file_layouts[i], key);
- }
- if (conf->dir_layouts && conf->dir_layouts[i]) {
- gf_proc_dump_build_key(key, key_prefix,
- "dir_layouts[%d]",i);
- dht_layout_dump(conf->dir_layouts[i], key);
- }
- if (conf->subvolume_status) {
- gf_proc_dump_build_key(key, key_prefix,
- "subvolume_status[%d]", i);
- gf_proc_dump_write(key, "%d",
- (int)conf->subvolume_status[i]);
- }
-
- }
-
- gf_proc_dump_build_key(key, key_prefix,"default_dir_layout");
- dht_layout_dump(conf->default_dir_layout, key);
-
- gf_proc_dump_build_key(key, key_prefix, "search_unhashed");
- gf_proc_dump_write(key, "%d", conf->search_unhashed);
- gf_proc_dump_build_key(key, key_prefix, "gen");
- gf_proc_dump_write(key, "%d", conf->gen);
- gf_proc_dump_build_key(key, key_prefix, "min_free_disk");
- gf_proc_dump_write(key, "%lu", conf->min_free_disk);
- gf_proc_dump_build_key(key, key_prefix, "disk_unit");
- gf_proc_dump_write(key, "%c", conf->disk_unit);
- gf_proc_dump_build_key(key, key_prefix, "refresh_interval");
- gf_proc_dump_write(key, "%d", conf->refresh_interval);
- gf_proc_dump_build_key(key, key_prefix, "unhashed_sticky_bit");
- gf_proc_dump_write(key, "%d", conf->unhashed_sticky_bit);
- if (conf ->du_stats) {
- gf_proc_dump_build_key(key, key_prefix,
- "du_stats.avail_percent");
- gf_proc_dump_write(key, "%lf", conf->du_stats->avail_percent);
- gf_proc_dump_build_key(key, key_prefix,
- "du_stats.avail_space");
- gf_proc_dump_write(key, "%lu", conf->du_stats->avail_space);
- gf_proc_dump_build_key(key, key_prefix,
- "du_stats.log");
- gf_proc_dump_write(key, "%lu", conf->du_stats->log);
- }
- gf_proc_dump_build_key(key, key_prefix, "last_stat_fetch");
- gf_proc_dump_write(key, "%s", ctime(&conf->last_stat_fetch.tv_sec));
-
- UNLOCK(&conf->subvolume_lock);
-
- return 0;
-}
-
-int32_t
-dht_inodectx_dump (xlator_t *this, inode_t *inode)
-{
- int ret = -1;
- char key_prefix[GF_DUMP_MAX_BUF_LEN];
- dht_layout_t *layout = NULL;
- uint64_t tmp_layout = 0;
-
- if (!inode)
- return -1;
-
- ret = inode_ctx_get (inode, this, &tmp_layout);
-
- if (ret != 0)
- return ret;
-
- layout = (dht_layout_t *)(long)tmp_layout;
-
- if (!layout)
- return -1;
-
- gf_proc_dump_build_key(key_prefix, "xlator.cluster.dht",
- "%s.inode.%ld", this->name, inode->ino);
- dht_layout_dump(layout, key_prefix);
-
- return 0;
-}
-
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
- int ret = -1;
-
- ret = dht_notify (this, event, data);
-
- return ret;
-}
-
-void
-fini (xlator_t *this)
-{
- int i = 0;
- dht_conf_t *conf = NULL;
-
- conf = this->private;
- this->private = NULL;
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- if (conf->default_dir_layout)
- GF_FREE (conf->default_dir_layout);
-
- if (conf->subvolumes)
- GF_FREE (conf->subvolumes);
-
- if (conf->subvolume_status)
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf);
- }
-
- return;
-}
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-
- return ret;
-}
-int
-validate_options (xlator_t *this, dict_t *options, char **op_errstr)
-{
- char *temp_str = NULL;
- gf_boolean_t search_unhashed;
- int ret = 0;
-
-
-
-
-
- if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) {
- if (strcasecmp (temp_str, "auto")) {
- if (!gf_string2boolean (temp_str, &search_unhashed)) {
- gf_log(this->name, GF_LOG_DEBUG, "Validated"
- " lookup-unahashed (%s)",
- temp_str);
- }
- else {
- gf_log(this->name, GF_LOG_ERROR, "Validation:"
- " lookup-unahashed should be boolean,"
- " not (%s)",
- temp_str);
- *op_errstr = gf_strdup ("Error, lookup-"
- "unhashed be boolean");
- ret = -1;
- goto out;
- }
-
- }
- }
-
-
-
-
-
-out:
- return ret;
-}
-
-int
-reconfigure (xlator_t *this, dict_t *options)
-{
- dht_conf_t *conf = NULL;
- char *temp_str = NULL;
- gf_boolean_t search_unhashed;
- uint32_t temp_free_disk = 0;
- int ret = 0;
-
-
- conf = this->private;
- if (!conf)
- return 0;
-
- if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean*/
- if (strcasecmp (temp_str, "auto")) {
- if (!gf_string2boolean (temp_str, &search_unhashed)) {
- gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
- " lookup-unahashed reconfigured (%s)",
- temp_str);
- conf->search_unhashed = search_unhashed;
- }
- else {
- gf_log(this->name, GF_LOG_ERROR, "Reconfigure:"
- " lookup-unahashed should be boolean,"
- " not (%s), defaulting to (%d)",
- temp_str, conf->search_unhashed);
- //return -1;
- ret = -1;
- goto out;
- }
-
- }
- else {
- gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
- " lookup-unahashed reconfigured auto ");
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
- }
-
- if (dict_get_str (options, "min-free-disk", &temp_str) == 0) {
- if (gf_string2percent (temp_str, &temp_free_disk) == 0) {
- if (temp_free_disk > 100) {
- gf_string2bytesize (temp_str,
- &conf->min_free_disk);
- conf->disk_unit = 'b';
- } else {
- conf->min_free_disk = (uint64_t)temp_free_disk;
- }
- } else {
- gf_string2bytesize (temp_str, &conf->min_free_disk);
- conf->disk_unit = 'b';
- }
-
- gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
- " min-free-disk reconfigured to ",
- temp_str);
- }
-
-out:
- return ret;
-}
-
-int
-init (xlator_t *this)
-{
- dht_conf_t *conf = NULL;
- char *temp_str = NULL;
- int ret = -1;
- int i = 0;
- uint32_t temp_free_disk = 0;
-
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Distribute needs more than one subvolume");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
-
- conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t);
- if (!conf) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
-
- conf->unhashed_sticky_bit = 0;
-
- if (dict_get_str (this->options, "unhashed-sticky-bit",
- &temp_str) == 0) {
- gf_string2boolean (temp_str, &conf->unhashed_sticky_bit);
- }
-
- conf->use_readdirp = 1;
-
- if (dict_get_str (this->options, "use-readdirp",
- &temp_str) == 0) {
- gf_string2boolean (temp_str, &conf->use_readdirp);
- }
-
- conf->disk_unit = 'p';
- conf->min_free_disk = 10;
-
- if (dict_get_str (this->options, "min-free-disk", &temp_str) == 0) {
- if (gf_string2percent (temp_str, &temp_free_disk) == 0) {
- if (temp_free_disk > 100) {
- gf_string2bytesize (temp_str,
- &conf->min_free_disk);
- conf->disk_unit = 'b';
- } else {
- conf->min_free_disk = (uint64_t)temp_free_disk;
- }
- } else {
- gf_string2bytesize (temp_str, &conf->min_free_disk);
- conf->disk_unit = 'b';
- }
- }
-
-
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
- gf_dht_mt_dht_du_t);
- if (!conf->du_stats) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
-
- conf->gen = 1;
-
- this->private = conf;
-
- return 0;
-
-err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- if (conf->default_dir_layout)
- GF_FREE (conf->default_dir_layout);
-
- if (conf->subvolumes)
- GF_FREE (conf->subvolumes);
-
- if (conf->subvolume_status)
- GF_FREE (conf->subvolume_status);
-
- if (conf->du_stats)
- GF_FREE (conf->du_stats);
-
- GF_FREE (conf);
- }
-
- return -1;
-}
-
+#include "dht-common.h"
+
+struct xlator_fops dht_pt_fops = {
+ /* we need to keep mkdir to make sure we
+ have layout on new directory */
+ .mkdir = dht_pt_mkdir,
+ .getxattr = dht_pt_getxattr,
+ .fgetxattr = dht_pt_fgetxattr,
+
+ /* required to trace fop properly in changelog */
+ .rename = dht_pt_rename,
+
+ /* FIXME: commenting the '.lookup()' below made some of
+ the failing tests to pass. I would remove the below
+ line, but keeping it here as a reminder for people
+ to check for issues if they find concerns with DHT
+ pass-through logic */
+ /*
+ .lookup = dht_lookup,
+ .readdir = dht_readdir,
+ .readdirp = dht_readdirp,
+ */
+ /* Keeping above as commented, mainly to support the
+ usecase of a gluster volume getting to 1x(anytype),
+ due to remove-brick (shrinking) exercise. In that case,
+ we would need above fops to be available, so we can
+ handle the case of dangling linkto files (if any) */
+};
struct xlator_fops fops = {
- .lookup = dht_lookup,
- .mknod = dht_mknod,
- .create = dht_create,
-
- .stat = dht_stat,
- .fstat = dht_fstat,
- .truncate = dht_truncate,
- .ftruncate = dht_ftruncate,
- .access = dht_access,
- .readlink = dht_readlink,
- .setxattr = dht_setxattr,
- .getxattr = dht_getxattr,
- .removexattr = dht_removexattr,
- .open = dht_open,
- .readv = dht_readv,
- .writev = dht_writev,
- .flush = dht_flush,
- .fsync = dht_fsync,
- .statfs = dht_statfs,
- .lk = dht_lk,
- .opendir = dht_opendir,
- .readdir = dht_readdir,
- .readdirp = dht_readdirp,
- .fsyncdir = dht_fsyncdir,
- .symlink = dht_symlink,
- .unlink = dht_unlink,
- .link = dht_link,
- .mkdir = dht_mkdir,
- .rmdir = dht_rmdir,
- .rename = dht_rename,
- .inodelk = dht_inodelk,
- .finodelk = dht_finodelk,
- .entrylk = dht_entrylk,
- .fentrylk = dht_fentrylk,
- .xattrop = dht_xattrop,
- .fxattrop = dht_fxattrop,
- .setattr = dht_setattr,
- .fsetattr = dht_fsetattr,
+ .ipc = dht_ipc,
+ .lookup = dht_lookup,
+ .mknod = dht_mknod,
+ .create = dht_create,
+
+ .open = dht_open,
+ .statfs = dht_statfs,
+ .opendir = dht_opendir,
+ .readdir = dht_readdir,
+ .readdirp = dht_readdirp,
+ .fsyncdir = dht_fsyncdir,
+ .symlink = dht_symlink,
+ .unlink = dht_unlink,
+ .link = dht_link,
+ .mkdir = dht_mkdir,
+ .rmdir = dht_rmdir,
+ .rename = dht_rename,
+ .entrylk = dht_entrylk,
+ .fentrylk = dht_fentrylk,
+
+ /* Inode read operations */
+ .stat = dht_stat,
+ .fstat = dht_fstat,
+ .access = dht_access,
+ .readlink = dht_readlink,
+ .getxattr = dht_getxattr,
+ .fgetxattr = dht_fgetxattr,
+ .readv = dht_readv,
+ .flush = dht_flush,
+ .fsync = dht_fsync,
+ .inodelk = dht_inodelk,
+ .finodelk = dht_finodelk,
+ .lk = dht_lk,
+ .lease = dht_lease,
+
+ /* Inode write operations */
+ .fremovexattr = dht_fremovexattr,
+ .removexattr = dht_removexattr,
+ .setxattr = dht_setxattr,
+ .fsetxattr = dht_fsetxattr,
+ .truncate = dht_truncate,
+ .ftruncate = dht_ftruncate,
+ .writev = dht_writev,
+ .xattrop = dht_xattrop,
+ .fxattrop = dht_fxattrop,
+ .setattr = dht_setattr,
+ .fsetattr = dht_fsetattr,
+ .fallocate = dht_fallocate,
+ .discard = dht_discard,
+ .zerofill = dht_zerofill,
};
struct xlator_dumpops dumpops = {
- .priv = dht_priv_dump,
- .inodectx = dht_inodectx_dump,
+ .priv = dht_priv_dump,
+ .inodectx = dht_inodectx_dump,
};
-
struct xlator_cbks cbks = {
-// .release = dht_release,
-// .releasedir = dht_releasedir,
- .forget = dht_forget
+ .release = dht_release,
+ // .releasedir = dht_releasedir,
+ .forget = dht_forget,
};
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- },
- { .key = {"unhashed-sticky-bit"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"use-readdirp"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {NULL} },
+extern int32_t
+mem_acct_init(xlator_t *this);
+
+extern struct volume_options dht_options[];
+
+xlator_api_t xlator_api = {
+ .init = dht_init,
+ .fini = dht_fini,
+ .notify = dht_notify,
+ .reconfigure = dht_reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = dht_options,
+ .identifier = "distribute",
+ .pass_through_fops = &dht_pt_fops,
+ .category = GF_MAINTAINED,
};
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index 09388cbb005..3648a564840 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -1,762 +1,657 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
-#include "dht-common.c"
+#include "dht-common.h"
/* TODO: all 'TODO's in dht.c holds good */
+extern struct volume_options dht_options[];
+
int
-nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
+nufa_local_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
{
- xlator_t *subvol = NULL;
- char is_linkfile = 0;
- char is_dir = 0;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- loc_t *loc = NULL;
- int i = 0;
- call_frame_t *prev = NULL;
- int call_cnt = 0;
- int ret = 0;
-
-
- conf = this->private;
-
- prev = cookie;
- local = frame->local;
- loc = &local->loc;
-
- if (ENTRY_MISSING (op_ret, op_errno)) {
- if (conf->search_unhashed) {
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
- }
-
- if (op_ret == -1)
- goto out;
-
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
- is_dir = check_is_dir (inode, stbuf, xattr);
-
- if (!is_dir && !is_linkfile) {
- /* non-directory and not a linkfile */
-
- dht_itransform (this, prev->this, stbuf->ia_ino,
- &stbuf->ia_ino);
-
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not set pre-set layout for subvol %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto err;
- }
-
- goto out;
+ xlator_t *subvol = NULL;
+ char is_linkfile = 0;
+ char is_dir = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+ int i = 0;
+ xlator_t *prev = NULL;
+ int call_cnt = 0;
+ int ret = 0;
+
+ conf = this->private;
+
+ prev = cookie;
+ local = frame->local;
+ loc = &local->loc;
+
+ if (ENTRY_MISSING(op_ret, op_errno)) {
+ if (conf->search_unhashed) {
+ local->op_errno = ENOENT;
+ dht_lookup_everywhere(frame, this, loc);
+ return 0;
}
-
- if (is_dir) {
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
-
- local->inode = inode_ref (inode);
- local->xattr = dict_ref (xattr);
-
- local->op_ret = 0;
- local->op_errno = 0;
-
- local->layout = dht_layout_new (this, conf->subvolume_cnt);
- if (!local->layout) {
- op_ret = -1;
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_DEBUG,
- "memory allocation failed :(");
- goto err;
- }
-
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
- }
+ }
+
+ if (op_ret == -1)
+ goto out;
+
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
+ is_dir = check_is_dir(inode, stbuf, xattr);
+
+ if (!is_dir && !is_linkfile) {
+ /* non-directory and not a linkfile */
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret < 0) {
+ gf_msg_debug(this->name, 0,
+ "could not set pre-set layout for subvol"
+ " %s",
+ prev->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto err;
}
- if (is_linkfile) {
- subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
+ goto out;
+ }
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "linkfile not having link subvolume. path=%s",
- loc->path);
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
+ if (is_dir) {
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
- STACK_WIND (frame, dht_lookup_linkfile_cbk,
- subvol, subvol->fops->lookup,
- &local->loc, local->xattr_req);
+ local->inode = inode_ref(inode);
+ local->xattr = dict_ref(xattr);
+
+ local->op_ret = 0;
+ local->op_errno = 0;
+
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto err;
}
- return 0;
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
+ }
+ }
-out:
- if (!local->hashed_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- local->loc.path);
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
-
- STACK_WIND (frame, dht_lookup_cbk,
- local->hashed_subvol, local->hashed_subvol->fops->lookup,
- &local->loc, local->xattr_req);
-
- return 0;
-
- err:
- DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno,
- inode, stbuf, xattr, NULL);
- return 0;
-}
+ if (is_linkfile) {
+ subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
-int
-nufa_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
-{
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
- xlator_t *subvol = NULL;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int op_errno = -1;
- dht_layout_t *layout = NULL;
- int i = 0;
- int call_cnt = 0;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- conf = this->private;
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "copying location failed for path=%s",
- loc->path);
- goto err;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0,
+ "linkfile has no link subvolume. path=%s", loc->path);
+ dht_lookup_everywhere(frame, this, loc);
+ return 0;
}
- if (xattr_req) {
- local->xattr_req = dict_ref (xattr_req);
- } else {
- local->xattr_req = dict_new ();
- }
-
- hashed_subvol = dht_subvol_get_hashed (this, &local->loc);
- cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
-
- local->cached_subvol = cached_subvol;
- local->hashed_subvol = hashed_subvol;
-
- if (is_revalidate (loc)) {
- local->layout = layout = dht_layout_get (this, loc->inode);
-
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "revalidate without cache. path=%s",
- loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- if (layout->gen && (layout->gen < conf->gen)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "incomplete layout failure for path=%s",
- loc->path);
- dht_layout_unref (this, local->layout);
- goto do_fresh_lookup;
- }
-
- local->inode = inode_ref (loc->inode);
- local->ia_ino = loc->inode->ino;
-
- local->call_cnt = layout->cnt;
- call_cnt = local->call_cnt;
-
- /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute,
- * revalidates directly go to the cached-subvolume.
- */
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to set dict value.");
- op_errno = -1;
- goto err;
- }
-
- for (i = 0; i < layout->cnt; i++) {
- subvol = layout->list[i].xlator;
-
- STACK_WIND (frame, dht_revalidate_cbk,
- subvol, subvol->fops->lookup,
- loc, local->xattr_req);
-
- if (!--call_cnt)
- break;
- }
- } else {
- do_fresh_lookup:
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to set dict value.");
- op_errno = -1;
- goto err;
- }
-
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to set dict value.");
- op_errno = -1;
- goto err;
- }
-
- /* Send it to only local volume */
- STACK_WIND (frame, nufa_local_lookup_cbk,
- (xlator_t *)conf->private,
- ((xlator_t *)conf->private)->fops->lookup,
- loc, local->xattr_req);
- }
+ STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, local->xattr_req);
+ }
+
+ return 0;
+out:
+ if (!local->hashed_subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ local->loc.path);
+ local->op_errno = ENOENT;
+ dht_lookup_everywhere(frame, this, loc);
return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, dht_lookup_cbk, local->hashed_subvol,
+ local->hashed_subvol, local->hashed_subvol->fops->lookup,
+ &local->loc, local->xattr_req);
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
+ DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+ postparent);
+ return 0;
}
int
-nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent,
- struct iatt *postparent)
+nufa_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
{
- dht_local_t *local = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (xattr_req) {
+ local->xattr_req = dict_ref(xattr_req);
+ } else {
+ local->xattr_req = dict_new();
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+
+ local->hashed_subvol = hashed_subvol;
+
+ if (is_revalidate(loc)) {
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0,
+ "revalidate lookup without cache. "
+ "path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
- local = frame->local;
+ if (layout->gen && (layout->gen < conf->gen)) {
+ gf_msg_debug(this->name, 0, "incomplete layout failure for path=%s",
+ loc->path);
+ dht_layout_unref(this, local->layout);
+ goto do_fresh_lookup;
+ }
- if (op_ret == -1)
- goto err;
+ local->inode = inode_ref(loc->inode);
- STACK_WIND (frame, dht_create_cbk,
- local->cached_subvol, local->cached_subvol->fops->create,
- &local->loc, local->flags, local->mode, local->fd,
- local->params);
+ local->call_cnt = layout->cnt;
+ call_cnt = local->call_cnt;
- return 0;
+ /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute,
+ * revalidates directly go to the cached-subvolume.
+ */
+ ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dict value.");
+ op_errno = -1;
+ goto err;
+ }
- err:
- DHT_STACK_UNWIND (create, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- return 0;
-}
+ for (i = 0; i < layout->cnt; i++) {
+ subvol = layout->list[i].xlator;
-int
-nufa_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd, dict_t *params)
-{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- xlator_t *subvol = NULL;
- xlator_t *avail_subvol = NULL;
- int op_errno = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
-
- conf = this->private;
-
- dht_get_du_info (frame, this, loc);
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- subvol = dht_subvol_get_hashed (this, loc);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- loc->path);
- op_errno = ENOENT;
- goto err;
- }
-
- avail_subvol = conf->private;
- if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
- avail_subvol =
- dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
- }
+ STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol,
+ subvol->fops->lookup, loc, local->xattr_req);
- if (subvol != avail_subvol) {
- /* create a link file instead of actual file */
- ret = loc_copy (&local->loc, loc);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->fd = fd_ref (fd);
- local->params = dict_ref (params);
- local->mode = mode;
- local->flags = flags;
-
- local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame,
- nufa_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
- return 0;
+ if (!--call_cnt)
+ break;
+ }
+ } else {
+ do_fresh_lookup:
+ ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dict value.");
+ op_errno = -1;
+ goto err;
}
- gf_log (this->name, GF_LOG_TRACE,
- "creating %s on %s", loc->path, subvol->name);
+ ret = dict_set_uint32(local->xattr_req, conf->link_xattr_name, 256);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dict value.");
+ op_errno = -1;
+ goto err;
+ }
- STACK_WIND (frame, dht_create_cbk,
- subvol, subvol->fops->create,
- loc, flags, mode, fd, params);
+ /* Send it to only local volume */
+ STACK_WIND_COOKIE(
+ frame, nufa_local_lookup_cbk, ((xlator_t *)conf->private),
+ ((xlator_t *)conf->private),
+ ((xlator_t *)conf->private)->fops->lookup, loc, local->xattr_req);
+ }
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (create, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
-
- return 0;
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
}
int
-nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent)
+nufa_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret >= 0) {
- STACK_WIND (frame, dht_newfile_cbk,
- local->cached_subvol,
- local->cached_subvol->fops->mknod,
- &local->loc, local->mode, local->rdev,
- local->params);
+ if (op_ret == -1)
+ goto err;
- return 0;
- }
+ STACK_WIND_COOKIE(frame, dht_create_cbk, local->cached_subvol,
+ local->cached_subvol, local->cached_subvol->fops->create,
+ &local->loc, local->flags, local->mode, local->umask,
+ local->fd, local->params);
- WIPE (postparent);
- WIPE (preparent);
+ return 0;
- DHT_STACK_UNWIND (link, frame, op_ret, op_errno,
- inode, stbuf, preparent, postparent);
- return 0;
+err:
+ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
-
int
-nufa_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t rdev, dict_t *params)
+nufa_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- xlator_t *subvol = NULL;
- xlator_t *avail_subvol = NULL;
- int op_errno = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
-
- conf = this->private;
-
- dht_get_du_info (frame, this, loc);
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- subvol = dht_subvol_get_hashed (this, loc);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- loc->path);
- op_errno = ENOENT;
- goto err;
- }
-
- /* Consider the disksize in consideration */
- avail_subvol = conf->private;
- if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
- avail_subvol =
- dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
- }
-
- if (avail_subvol != subvol) {
- /* Create linkfile first */
- ret = loc_copy (&local->loc, loc);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->params = dict_ref (params);
- local->mode = mode;
- local->rdev = rdev;
- local->cached_subvol = avail_subvol;
-
- dht_linkfile_create (frame, nufa_mknod_linkfile_cbk,
- avail_subvol, subvol, loc);
- return 0;
- }
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *avail_subvol = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ conf = this->private;
+
+ dht_get_du_info(frame, this, loc);
+
+ local = dht_local_init(frame, loc, fd, GF_FOP_CREATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = dht_subvol_get_hashed(this, loc);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ avail_subvol = conf->private;
+ if (dht_is_subvol_filled(this, (xlator_t *)conf->private)) {
+ avail_subvol = dht_free_disk_available_subvol(
+ this, (xlator_t *)conf->private, local);
+ }
+
+ if (subvol != avail_subvol) {
+ /* create a link file instead of actual file */
+ local->params = dict_ref(params);
+ local->mode = mode;
+ local->flags = flags;
+ local->umask = umask;
+ local->cached_subvol = avail_subvol;
+ dht_linkfile_create(frame, nufa_create_linkfile_create_cbk, this,
+ avail_subvol, subvol, loc);
+ return 0;
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "creating %s on %s", loc->path, subvol->name);
+ gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, params);
+ STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+ subvol->fops->create, loc, flags, mode, umask, fd,
+ params);
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (mknod, frame, -1, op_errno,
- NULL, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
- return 0;
+ return 0;
}
-
int
-notify (xlator_t *this, int event, void *data, ...)
+nufa_mknod_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- int ret = -1;
+ dht_local_t *local = NULL;
- ret = dht_notify (this, event, data);
+ local = frame->local;
+ if (!local || !local->cached_subvol) {
+ op_errno = EINVAL;
+ op_ret = -1;
+ goto err;
+ }
- return ret;
-}
+ if (op_ret >= 0) {
+ STACK_WIND_COOKIE(
+ frame, dht_newfile_cbk, (void *)local->cached_subvol,
+ local->cached_subvol, local->cached_subvol->fops->mknod,
+ &local->loc, local->mode, local->rdev, local->umask, local->params);
-void
-fini (xlator_t *this)
-{
- int i = 0;
- dht_conf_t *conf = NULL;
+ return 0;
+ }
+err:
+ WIPE(postparent);
+ WIPE(preparent);
- conf = this->private;
+ DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+ postparent, xdata);
+ return 0;
+}
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
+int
+nufa_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *params)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *avail_subvol = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ conf = this->private;
+
+ dht_get_du_info(frame, this, loc);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = dht_subvol_get_hashed(this, loc);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ /* Consider the disksize in consideration */
+ avail_subvol = conf->private;
+ if (dht_is_subvol_filled(this, (xlator_t *)conf->private)) {
+ avail_subvol = dht_free_disk_available_subvol(
+ this, (xlator_t *)conf->private, local);
+ }
+
+ if (avail_subvol != subvol) {
+ /* Create linkfile first */
+
+ local->params = dict_ref(params);
+ local->mode = mode;
+ local->umask = umask;
+ local->rdev = rdev;
+ local->cached_subvol = avail_subvol;
+
+ dht_linkfile_create(frame, nufa_mknod_linkfile_cbk, this, avail_subvol,
+ subvol, loc);
+ return 0;
+ }
- if (conf->default_dir_layout)
- GF_FREE (conf->default_dir_layout);
+ gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name);
- if (conf->subvolumes)
- GF_FREE (conf->subvolumes);
+ STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode, rdev, umask, params);
- if (conf->subvolume_status)
- GF_FREE (conf->subvolume_status);
+ return 0;
- GF_FREE (conf);
- }
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return;
+ return 0;
}
-int
-init (xlator_t *this)
+gf_boolean_t
+same_first_part(char *str1, char term1, char *str2, char term2)
{
- dht_conf_t *conf = NULL;
- xlator_list_t *trav = NULL;
- data_t *data = NULL;
- char *local_volname = NULL;
- char *temp_str = NULL;
- int ret = -1;
- int i = 0;
- char my_hostname[256];
- uint32_t temp_free_disk = 0;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "NUFA needs more than one subvolume");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
-
- conf = GF_CALLOC (1, sizeof (*conf),
- gf_dht_mt_dht_conf_t);
- if (!conf) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
+ gf_boolean_t ended1;
+ gf_boolean_t ended2;
+
+ for (;;) {
+ ended1 = ((*str1 == '\0') || (*str1 == term1));
+ ended2 = ((*str2 == '\0') || (*str2 == term2));
+ if (ended1 && ended2) {
+ return _gf_true;
}
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
-
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
+ if (ended1 || ended2 || (*str1 != *str2)) {
+ return _gf_false;
}
+ ++str1;
+ ++str2;
+ }
+}
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
-
- conf->gen = 1;
-
- local_volname = "localhost";
- ret = gethostname (my_hostname, 256);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "could not find hostname (%s)",
- strerror (errno));
- }
-
- if (ret == 0)
- local_volname = my_hostname;
-
- data = dict_get (this->options, "local-volume-name");
- if (data) {
- local_volname = data->data;
- }
-
- trav = this->children;
- while (trav) {
- if (strcmp (trav->xlator->name, local_volname) == 0)
- break;
- trav = trav->next;
- }
-
- if (!trav) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not find subvolume named '%s'. "
- "Please define volume with the name as the hostname "
- "or override it with 'option local-volume-name'",
- local_volname);
- goto err;
- }
- /* The volume specified exists */
- conf->private = trav->xlator;
-
- conf->min_free_disk = 10;
- conf->disk_unit = 'p';
-
- if (dict_get_str (this->options, "min-free-disk",
- &temp_str) == 0) {
- if (gf_string2percent (temp_str,
- &temp_free_disk) == 0) {
- if (temp_free_disk > 100) {
- gf_string2bytesize (temp_str,
- &conf->min_free_disk);
- conf->disk_unit = 'b';
- } else {
- conf->min_free_disk = (uint64_t)temp_free_disk;
- conf->disk_unit = 'p';
- }
- } else {
- gf_string2bytesize (temp_str,
- &conf->min_free_disk);
- conf->disk_unit = 'b';
- }
- }
-
- conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
- gf_dht_mt_dht_du_t);
- if (!conf->du_stats) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- this->private = conf;
-
- return 0;
-
-err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- if (conf->default_dir_layout)
- GF_FREE (conf->default_dir_layout);
+typedef struct nufa_args {
+ xlator_t *this;
+ char *volname;
+ gf_boolean_t addr_match;
+} nufa_args_t;
- if (conf->subvolumes)
- GF_FREE (conf->subvolumes);
+static void
+nufa_find_local_brick(xlator_t *xl, void *data)
+{
+ nufa_args_t *args = data;
+ xlator_t *this = args->this;
+ char *local_volname = args->volname;
+ gf_boolean_t addr_match = args->addr_match;
+ char *brick_host = NULL;
+ dht_conf_t *conf = this->private;
+ int ret = -1;
+
+ /*This means a local subvol was already found. We pick the first brick
+ * that is local*/
+ if (conf->private)
+ return;
+
+ if (strcmp(xl->name, local_volname) == 0) {
+ conf->private = xl;
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "Using specified subvol %s", local_volname);
+ return;
+ }
+
+ if (!addr_match)
+ return;
+
+ ret = dict_get_str(xl->options, "remote-host", &brick_host);
+ if ((ret == 0) && (gf_is_same_address(local_volname, brick_host) ||
+ gf_is_local_addr(brick_host))) {
+ conf->private = xl;
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "Using the first local "
+ "subvol %s",
+ xl->name);
+ return;
+ }
+}
- if (conf->subvolume_status)
- GF_FREE (conf->subvolume_status);
+static void
+nufa_to_dht(xlator_t *this)
+{
+ GF_ASSERT(this);
+ GF_ASSERT(this->fops);
- if (conf->du_stats)
- GF_FREE (conf->du_stats);
+ this->fops->lookup = dht_lookup;
+ this->fops->create = dht_create;
+ this->fops->mknod = dht_mknod;
+}
- GF_FREE (conf);
+int
+nufa_find_local_subvol(xlator_t *this, void (*fn)(xlator_t *each, void *data),
+ void *data)
+{
+ int ret = -1;
+ dht_conf_t *conf = this->private;
+ xlator_list_t *trav = NULL;
+ xlator_t *parent = NULL;
+ xlator_t *candidate = NULL;
+
+ xlator_foreach_depth_first(this, fn, data);
+ if (!conf->private) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_BRICK_ERROR,
+ "Couldn't find a local "
+ "brick");
+ return -1;
+ }
+
+ candidate = conf->private;
+ trav = candidate->parents;
+ while (trav) {
+ parent = trav->xlator;
+ if (strcmp(parent->type, "cluster/nufa") == 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "Found local subvol, "
+ "%s",
+ candidate->name);
+ ret = 0;
+ conf->private = candidate;
+ break;
}
- return -1;
+ candidate = parent;
+ trav = parent->parents;
+ }
+
+ return ret;
}
+int
+nufa_init(xlator_t *this)
+{
+ data_t *data = NULL;
+ char *local_volname = NULL;
+ int ret = -1;
+ char my_hostname[256];
+ gf_boolean_t addr_match = _gf_false;
+ nufa_args_t args = {
+ 0,
+ };
+
+ ret = dht_init(this);
+ if (ret) {
+ return ret;
+ }
+
+ if ((data = dict_get(this->options, "local-volume-name"))) {
+ local_volname = data->data;
+
+ } else {
+ addr_match = _gf_true;
+ local_volname = "localhost";
+ ret = gethostname(my_hostname, 256);
+ if (ret == 0)
+ local_volname = my_hostname;
+
+ else
+ gf_msg(this->name, GF_LOG_WARNING, errno,
+ DHT_MSG_GET_HOSTNAME_FAILED, "could not find hostname");
+ }
+
+ args.this = this;
+ args.volname = local_volname;
+ args.addr_match = addr_match;
+ ret = nufa_find_local_subvol(this, nufa_find_local_brick, &args);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "Unable to find local subvolume, switching "
+ "to dht mode");
+ nufa_to_dht(this);
+ }
+ return 0;
+}
-struct xlator_fops fops = {
- .lookup = nufa_lookup,
- .create = nufa_create,
- .mknod = nufa_mknod,
-
- .stat = dht_stat,
- .fstat = dht_fstat,
- .truncate = dht_truncate,
- .ftruncate = dht_ftruncate,
- .access = dht_access,
- .readlink = dht_readlink,
- .setxattr = dht_setxattr,
- .getxattr = dht_getxattr,
- .removexattr = dht_removexattr,
- .open = dht_open,
- .readv = dht_readv,
- .writev = dht_writev,
- .flush = dht_flush,
- .fsync = dht_fsync,
- .statfs = dht_statfs,
- .lk = dht_lk,
- .opendir = dht_opendir,
- .readdir = dht_readdir,
- .readdirp = dht_readdirp,
- .fsyncdir = dht_fsyncdir,
- .symlink = dht_symlink,
- .unlink = dht_unlink,
- .link = dht_link,
- .mkdir = dht_mkdir,
- .rmdir = dht_rmdir,
- .rename = dht_rename,
- .inodelk = dht_inodelk,
- .finodelk = dht_finodelk,
- .entrylk = dht_entrylk,
- .fentrylk = dht_fentrylk,
- .xattrop = dht_xattrop,
- .fxattrop = dht_fxattrop,
- .setattr = dht_setattr,
+dht_methods_t dht_methods = {
+ .migration_get_dst_subvol = dht_migration_get_dst_subvol,
+ .layout_search = dht_layout_search,
};
-
-struct xlator_cbks cbks = {
- .forget = dht_forget
+struct xlator_fops fops = {
+ .lookup = nufa_lookup,
+ .create = nufa_create,
+ .mknod = nufa_mknod,
+
+ .stat = dht_stat,
+ .fstat = dht_fstat,
+ .truncate = dht_truncate,
+ .ftruncate = dht_ftruncate,
+ .access = dht_access,
+ .readlink = dht_readlink,
+ .setxattr = dht_setxattr,
+ .getxattr = dht_getxattr,
+ .removexattr = dht_removexattr,
+ .open = dht_open,
+ .readv = dht_readv,
+ .writev = dht_writev,
+ .flush = dht_flush,
+ .fsync = dht_fsync,
+ .statfs = dht_statfs,
+ .lk = dht_lk,
+ .opendir = dht_opendir,
+ .readdir = dht_readdir,
+ .readdirp = dht_readdirp,
+ .fsyncdir = dht_fsyncdir,
+ .symlink = dht_symlink,
+ .unlink = dht_unlink,
+ .link = dht_link,
+ .mkdir = dht_mkdir,
+ .rmdir = dht_rmdir,
+ .rename = dht_rename,
+ .inodelk = dht_inodelk,
+ .finodelk = dht_finodelk,
+ .entrylk = dht_entrylk,
+ .fentrylk = dht_fentrylk,
+ .xattrop = dht_xattrop,
+ .fxattrop = dht_fxattrop,
+ .setattr = dht_setattr,
};
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"local-volume-name"},
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- },
- { .key = {NULL} },
+struct xlator_cbks cbks = {.forget = dht_forget};
+extern int32_t
+mem_acct_init(xlator_t *this);
+
+xlator_api_t xlator_api = {
+ .init = nufa_init,
+ .fini = dht_fini,
+ .notify = dht_notify,
+ .reconfigure = dht_reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = dht_options,
+ .identifier = "nufa",
+ .category = GF_TECH_PREVIEW,
};
diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c
index 344a657897c..207d109a025 100644
--- a/xlators/cluster/dht/src/switch.c
+++ b/xlators/cluster/dht/src/switch.c
@@ -1,29 +1,14 @@
/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
-#include "dht-common.c"
+#include "dht-common.h"
#include "dht-mem-types.h"
#include <sys/time.h>
@@ -31,1036 +16,876 @@
#include <fnmatch.h>
#include <string.h>
+extern struct volume_options dht_options[];
+
struct switch_sched_array {
- xlator_t *xl;
- int32_t eligible;
- int32_t considered;
+ xlator_t *xl;
+ int32_t eligible;
+ int32_t considered;
};
/* Select one of this struct based on the path's pattern match */
struct switch_struct {
- struct switch_struct *next;
- struct switch_sched_array *array;
- int32_t node_index; /* Index of the node in
- this pattern. */
- int32_t num_child; /* Total num of child nodes
- with this pattern. */
- char path_pattern[256];
+ struct switch_struct *next;
+ struct switch_sched_array *array;
+ int32_t node_index; /* Index of the node in
+ this pattern. */
+ int32_t num_child; /* Total num of child nodes
+ with this pattern. */
+ char path_pattern[256];
};
/* TODO: all 'TODO's in dht.c holds good */
/* This function should return child node as '*:subvolumes' is inserterd */
static int32_t
-gf_switch_valid_child (xlator_t *this, const char *child)
+gf_switch_valid_child(xlator_t *this, const char *child)
{
- xlator_list_t *children = NULL;
- int32_t ret = 0;
-
- children = this->children;
- while (children) {
- if (!strcmp (child, children->xlator->name)) {
- ret = 1;
- break;
- }
- children = children->next;
+ xlator_list_t *children = NULL;
+ int32_t ret = 0;
+
+ children = this->children;
+ while (children) {
+ if (!strcmp(child, children->xlator->name)) {
+ ret = 1;
+ break;
}
+ children = children->next;
+ }
- return ret;
+ return ret;
}
static xlator_t *
-get_switch_matching_subvol (const char *path, dht_conf_t *conf,
- xlator_t *hashed_subvol)
+get_switch_matching_subvol(const char *path, dht_conf_t *conf,
+ xlator_t *hashed_subvol)
{
- struct switch_struct *cond = NULL;
- struct switch_struct *trav = NULL;
- char *pathname = NULL;
- int idx = 0;
-
- cond = conf->private;
- if (!cond)
- return hashed_subvol;
-
- trav = cond;
- pathname = gf_strdup (path);
- while (trav) {
- if (fnmatch (trav->path_pattern,
- pathname, FNM_NOESCAPE) == 0) {
- for (idx = 0; idx < trav->num_child; idx++) {
- if (trav->array[idx].xl == hashed_subvol)
- return hashed_subvol;
- }
- idx = trav->node_index++;
- trav->node_index %= trav->num_child;
- return trav->array[idx].xl;
- }
- trav = trav->next;
- }
- GF_FREE (pathname);
- return hashed_subvol;
-}
+ struct switch_struct *cond = NULL;
+ struct switch_struct *trav = NULL;
+ char *pathname = NULL;
+ int idx = 0;
+ xlator_t *subvol = NULL;
+
+ cond = conf->private;
+ subvol = hashed_subvol;
+ if (!cond)
+ goto out;
+
+ pathname = gf_strdup(path);
+ if (!pathname)
+ goto out;
+
+ trav = cond;
+ while (trav) {
+ if (fnmatch(trav->path_pattern, pathname, FNM_NOESCAPE) == 0) {
+ for (idx = 0; idx < trav->num_child; idx++) {
+ if (trav->array[idx].xl == hashed_subvol)
+ goto out;
+ }
+ idx = trav->node_index++;
+ trav->node_index %= trav->num_child;
+ subvol = trav->array[idx].xl;
+ goto out;
+ }
+ trav = trav->next;
+ }
+out:
+ GF_FREE(pathname);
+ return subvol;
+}
int
-switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
+switch_local_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
{
- xlator_t *subvol = NULL;
- char is_linkfile = 0;
- char is_dir = 0;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- loc_t *loc = NULL;
- int i = 0;
- call_frame_t *prev = NULL;
- int call_cnt = 0;
- int ret = 0;
-
- conf = this->private;
-
- prev = cookie;
- local = frame->local;
- loc = &local->loc;
-
- if (ENTRY_MISSING (op_ret, op_errno)) {
- if (conf->search_unhashed) {
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
- }
-
- if (op_ret == -1)
- goto out;
-
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
- is_dir = check_is_dir (inode, stbuf, xattr);
-
- if (!is_dir && !is_linkfile) {
- /* non-directory and not a linkfile */
-
- dht_itransform (this, prev->this, stbuf->ia_ino,
- &stbuf->ia_ino);
-
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not set pre-set layout for subvol %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto err;
- }
-
- goto out;
+ xlator_t *subvol = NULL;
+ char is_linkfile = 0;
+ char is_dir = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+ int i = 0;
+ xlator_t *prev = NULL;
+ int call_cnt = 0;
+ int ret = 0;
+
+ conf = this->private;
+
+ prev = cookie;
+ local = frame->local;
+ loc = &local->loc;
+
+ if (ENTRY_MISSING(op_ret, op_errno)) {
+ if (conf->search_unhashed) {
+ local->op_errno = ENOENT;
+ dht_lookup_everywhere(frame, this, loc);
+ return 0;
+ }
+ }
+
+ if (op_ret == -1)
+ goto out;
+
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
+ is_dir = check_is_dir(inode, stbuf, xattr);
+
+ if (!is_dir && !is_linkfile) {
+ /* non-directory and not a linkfile */
+
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret < 0) {
+ gf_msg_debug(this->name, 0,
+ "could not set pre-set layout "
+ "for subvol %s",
+ prev->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto err;
}
- if (is_dir) {
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
-
- local->inode = inode_ref (inode);
- local->xattr = dict_ref (xattr);
-
- local->op_ret = 0;
- local->op_errno = 0;
-
- local->layout = dht_layout_new (this, conf->subvolume_cnt);
- if (!local->layout) {
- op_ret = -1;
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_DEBUG,
- "memory allocation failed :(");
- goto err;
- }
-
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
- }
+ goto out;
+ }
+
+ if (is_dir) {
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ local->inode = inode_ref(inode);
+ local->xattr = dict_ref(xattr);
+
+ local->op_ret = 0;
+ local->op_errno = 0;
+
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_msg_debug(this->name, 0, "memory allocation failed :(");
+ goto err;
}
- if (is_linkfile) {
- subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
+ }
+ }
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "linkfile not having link subvolume. path=%s",
- loc->path);
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
+ if (is_linkfile) {
+ subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
- STACK_WIND (frame, dht_lookup_linkfile_cbk,
- subvol, subvol->fops->lookup,
- &local->loc, local->xattr_req);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0,
+ "linkfile has no link subvolume.path=%s", loc->path);
+ dht_lookup_everywhere(frame, this, loc);
+ return 0;
}
- return 0;
+ STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, local->xattr_req);
+ }
+
+ return 0;
out:
- if (!local->hashed_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- local->loc.path);
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
-
- STACK_WIND (frame, dht_lookup_cbk,
- local->hashed_subvol, local->hashed_subvol->fops->lookup,
- &local->loc, local->xattr_req);
-
- return 0;
-
- err:
- DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno,
- inode, stbuf, xattr, NULL);
+ if (!local->hashed_subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ local->loc.path);
+ local->op_errno = ENOENT;
+ dht_lookup_everywhere(frame, this, loc);
return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, dht_lookup_cbk, local->hashed_subvol,
+ local->hashed_subvol, local->hashed_subvol->fops->lookup,
+ &local->loc, local->xattr_req);
+
+ return 0;
+
+err:
+ DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+ NULL);
+ return 0;
}
int
-switch_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
+switch_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xattr_req)
{
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
- xlator_t *subvol = NULL;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int op_errno = -1;
- dht_layout_t *layout = NULL;
- int i = 0;
- int call_cnt = 0;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- conf = this->private;
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- ret = loc_dup (loc, &local->loc);
- if (ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "copying location failed for path=%s",
- loc->path);
- goto err;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (xattr_req) {
+ local->xattr_req = dict_ref(xattr_req);
+ } else {
+ local->xattr_req = dict_new();
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+ cached_subvol = local->cached_subvol;
+
+ local->hashed_subvol = hashed_subvol;
+
+ if (is_revalidate(loc)) {
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0,
+ "revalidate lookup without cache. path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
}
- if (xattr_req) {
- local->xattr_req = dict_ref (xattr_req);
- } else {
- local->xattr_req = dict_new ();
- }
+ if (layout->gen && (layout->gen < conf->gen)) {
+ gf_msg_debug(this->name, 0, "incomplete layout failure for path=%s",
+ loc->path);
+ dht_layout_unref(this, local->layout);
+ goto do_fresh_lookup;
+ }
- hashed_subvol = dht_subvol_get_hashed (this, &local->loc);
- cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
+ local->inode = inode_ref(loc->inode);
- local->cached_subvol = cached_subvol;
- local->hashed_subvol = hashed_subvol;
+ local->call_cnt = layout->cnt;
+ call_cnt = local->call_cnt;
- if (is_revalidate (loc)) {
- local->layout = layout = dht_layout_get (this, loc->inode);
+ /* NOTE: we don't require 'trusted.glusterfs.dht.linkto'
+ * attribute, revalidates directly go to the cached-subvolume.
+ */
+ ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
+ if (ret < 0)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "failed to set dict value for %s", conf->xattr_name);
- if (!layout) {
- gf_log (this->name, GF_LOG_DEBUG,
- "revalidate without cache. path=%s",
- loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ for (i = 0; i < layout->cnt; i++) {
+ subvol = layout->list[i].xlator;
- if (layout->gen && (layout->gen < conf->gen)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "incomplete layout failure for path=%s",
- loc->path);
- dht_layout_unref (this, local->layout);
- goto do_fresh_lookup;
- }
-
- local->inode = inode_ref (loc->inode);
- local->ia_ino = loc->inode->ino;
-
- local->call_cnt = layout->cnt;
- call_cnt = local->call_cnt;
-
- /* NOTE: we don't require 'trusted.glusterfs.dht.linkto'
- * attribute, revalidates directly go to the cached-subvolume.
- */
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht");
-
- for (i = 0; i < layout->cnt; i++) {
- subvol = layout->list[i].xlator;
-
- STACK_WIND (frame, dht_revalidate_cbk,
- subvol, subvol->fops->lookup,
- loc, local->xattr_req);
-
- if (!--call_cnt)
- break;
- }
- } else {
- do_fresh_lookup:
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht");
-
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht.linkto");
-
- if (!hashed_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s, "
- "checking on all the subvols to see if "
- "it is a directory", loc->path);
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
-
- local->layout = dht_layout_new (this,
- conf->subvolume_cnt);
- if (!local->layout) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
- }
- return 0;
- }
+ STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol,
+ subvol->fops->lookup, loc, local->xattr_req);
- /* */
- cached_subvol = get_switch_matching_subvol (loc->path, conf,
- hashed_subvol);
- if (cached_subvol == hashed_subvol) {
- STACK_WIND (frame, dht_lookup_cbk,
- hashed_subvol,
- hashed_subvol->fops->lookup,
- loc, local->xattr_req);
- } else {
- STACK_WIND (frame, switch_local_lookup_cbk,
- cached_subvol,
- cached_subvol->fops->lookup,
- loc, local->xattr_req);
- }
+ if (!--call_cnt)
+ break;
+ }
+ } else {
+ do_fresh_lookup:
+ ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
+ if (ret < 0)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "failed to set dict value for %s", conf->xattr_name);
+
+ ret = dict_set_uint32(local->xattr_req, conf->link_xattr_name, 256);
+ if (ret < 0)
+ gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED,
+ "failed to set dict value for %s", conf->link_xattr_name);
+
+ if (!hashed_subvol) {
+ gf_msg_debug(this->name, 0,
+ "no subvolume in layout for path=%s, "
+ "checking on all the subvols to see if "
+ "it is a directory",
+ loc->path);
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup,
+ &local->loc, local->xattr_req);
+ }
+ return 0;
}
- return 0;
+ /* */
+ cached_subvol = get_switch_matching_subvol(loc->path, conf,
+ hashed_subvol);
+ if (cached_subvol == hashed_subvol) {
+ STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol,
+ hashed_subvol, hashed_subvol->fops->lookup, loc,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, switch_local_lookup_cbk, cached_subvol,
+ cached_subvol, cached_subvol->fops->lookup, loc,
+ local->xattr_req);
+ }
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
}
int
-switch_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent,
- struct iatt *postparent)
+switch_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret == -1)
- goto err;
+ if (op_ret == -1)
+ goto err;
- STACK_WIND (frame, dht_create_cbk,
- local->cached_subvol, local->cached_subvol->fops->create,
- &local->loc, local->flags, local->mode, local->fd,
- local->params);
+ STACK_WIND_COOKIE(frame, dht_create_cbk, local->cached_subvol,
+ local->cached_subvol, local->cached_subvol->fops->create,
+ &local->loc, local->flags, local->mode, local->umask,
+ local->fd, local->params);
- return 0;
+ return 0;
- err:
- DHT_STACK_UNWIND (create, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- return 0;
+err:
+ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
int
-switch_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd, dict_t *params)
+switch_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- xlator_t *subvol = NULL;
- xlator_t *avail_subvol = NULL;
- int op_errno = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
-
- conf = this->private;
-
- dht_get_du_info (frame, this, loc);
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- subvol = dht_subvol_get_hashed (this, loc);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- loc->path);
- op_errno = ENOENT;
- goto err;
- }
-
- avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
- if (dht_is_subvol_filled (this, avail_subvol)) {
- avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
- }
-
- if (subvol != avail_subvol) {
- /* create a link file instead of actual file */
- ret = loc_copy (&local->loc, loc);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->fd = fd_ref (fd);
- local->mode = mode;
- local->flags = flags;
-
- local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame,
- switch_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
- return 0;
- }
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *avail_subvol = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ conf = this->private;
+
+ dht_get_du_info(frame, this, loc);
+
+ local = dht_local_init(frame, loc, fd, GF_FOP_CREATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = dht_subvol_get_hashed(this, loc);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ avail_subvol = get_switch_matching_subvol(loc->path, conf, subvol);
+ if (dht_is_subvol_filled(this, avail_subvol)) {
+ avail_subvol = dht_free_disk_available_subvol(this, avail_subvol,
+ local);
+ }
+
+ if (subvol != avail_subvol) {
+ /* create a link file instead of actual file */
+ local->mode = mode;
+ local->flags = flags;
+ local->umask = umask;
+ local->cached_subvol = avail_subvol;
+ dht_linkfile_create(frame, switch_create_linkfile_create_cbk, this,
+ avail_subvol, subvol, loc);
+ return 0;
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "creating %s on %s", loc->path, subvol->name);
+ gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_create_cbk,
- subvol, subvol->fops->create,
- loc, flags, mode, fd, params);
+ STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+ subvol->fops->create, loc, flags, mode, umask, fd,
+ params);
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (create, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
- return 0;
+ return 0;
}
int
-switch_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent)
+switch_mknod_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- dht_local_t *local = NULL;
-
- local = frame->local;
+ dht_local_t *local = NULL;
- if (op_ret >= 0) {
- STACK_WIND (frame, dht_newfile_cbk,
- local->cached_subvol,
- local->cached_subvol->fops->mknod,
- &local->loc, local->mode, local->rdev,
- local->params);
+ local = frame->local;
+ if (!local || !local->cached_subvol) {
+ op_errno = EINVAL;
+ op_ret = -1;
+ goto err;
+ }
- return 0;
- }
+ if (op_ret >= 0) {
+ STACK_WIND_COOKIE(
+ frame, dht_newfile_cbk, (void *)local->cached_subvol,
+ local->cached_subvol, local->cached_subvol->fops->mknod,
+ &local->loc, local->mode, local->rdev, local->umask, local->params);
- DHT_STACK_UNWIND (link, frame, op_ret, op_errno,
- inode, stbuf, preparent, postparent);
- return 0;
+ return 0;
+ }
+err:
+ DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+ postparent, xdata);
+ return 0;
}
-
int
-switch_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t rdev, dict_t *params)
+switch_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *params)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- xlator_t *subvol = NULL;
- xlator_t *avail_subvol = NULL;
- int op_errno = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
-
- conf = this->private;
-
- dht_get_du_info (frame, this, loc);
-
- local = dht_local_init (frame);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
- }
-
- subvol = dht_subvol_get_hashed (this, loc);
- if (!subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume in layout for path=%s",
- loc->path);
- op_errno = ENOENT;
- goto err;
- }
-
- /* Consider the disksize in consideration */
- avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
- if (dht_is_subvol_filled (this, avail_subvol)) {
- avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
- }
-
- if (avail_subvol != subvol) {
- /* Create linkfile first */
- ret = loc_copy (&local->loc, loc);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->params = dict_ref (params);
- local->mode = mode;
- local->rdev = rdev;
- local->cached_subvol = avail_subvol;
-
- dht_linkfile_create (frame, switch_mknod_linkfile_cbk,
- avail_subvol, subvol, loc);
- return 0;
- }
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *avail_subvol = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ conf = this->private;
+
+ dht_get_du_info(frame, this, loc);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = dht_subvol_get_hashed(this, loc);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ /* Consider the disksize in consideration */
+ avail_subvol = get_switch_matching_subvol(loc->path, conf, subvol);
+ if (dht_is_subvol_filled(this, avail_subvol)) {
+ avail_subvol = dht_free_disk_available_subvol(this, avail_subvol,
+ local);
+ }
+
+ if (avail_subvol != subvol) {
+ /* Create linkfile first */
+
+ local->params = dict_ref(params);
+ local->mode = mode;
+ local->umask = umask;
+ local->rdev = rdev;
+ local->cached_subvol = avail_subvol;
+
+ dht_linkfile_create(frame, switch_mknod_linkfile_cbk, this,
+ avail_subvol, subvol, loc);
+ return 0;
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "creating %s on %s", loc->path, subvol->name);
+ gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, params);
+ STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode, rdev, umask, params);
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (mknod, frame, -1, op_errno,
- NULL, NULL, NULL, NULL);
-
- return 0;
-}
-
-
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
- int ret = -1;
-
- ret = dht_notify (this, event, data);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return ret;
+ return 0;
}
void
-fini (xlator_t *this)
+switch_fini(xlator_t *this)
{
- int i = 0;
- dht_conf_t *conf = NULL;
- struct switch_struct *trav = NULL;
- struct switch_struct *prev = NULL;
-
- conf = this->private;
-
- if (conf) {
- trav = (struct switch_struct *)conf->private;
- conf->private = NULL;
- while (trav) {
- if (trav->array)
- GF_FREE (trav->array);
- prev = trav;
- trav = trav->next;
- GF_FREE (prev);
- }
-
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- if (conf->default_dir_layout)
- GF_FREE (conf->default_dir_layout);
-
- if (conf->subvolumes)
- GF_FREE (conf->subvolumes);
-
- if (conf->subvolume_status)
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf);
+ dht_conf_t *conf = NULL;
+ struct switch_struct *trav = NULL;
+ struct switch_struct *prev = NULL;
+
+ conf = this->private;
+
+ if (conf) {
+ trav = (struct switch_struct *)conf->private;
+ conf->private = NULL;
+ while (trav) {
+ GF_FREE(trav->array);
+ prev = trav;
+ trav = trav->next;
+ GF_FREE(prev);
}
+ }
- return;
+ dht_fini(this);
}
int
-set_switch_pattern (xlator_t *this, dht_conf_t *conf,
- const char *pattern_str)
+set_switch_pattern(xlator_t *this, dht_conf_t *conf, const char *pattern_str)
{
- int flag = 0;
- int idx = 0;
- int index = 0;
- int child_count = 0;
- char *tmp = NULL;
- char *tmp1 = NULL;
- char *child = NULL;
- char *tmp_str = NULL;
- char *tmp_str1 = NULL;
- char *dup_str = NULL;
- char *dup_childs = NULL;
- char *switch_str = NULL;
- char *pattern = NULL;
- char *childs = NULL;
- char *option_string = NULL;
- struct switch_struct *switch_buf = NULL;
- struct switch_struct *switch_opt = NULL;
- struct switch_struct *trav = NULL;
- struct switch_sched_array *switch_buf_array = NULL;
- xlator_list_t *trav_xl = NULL;
-
- trav_xl = this->children;
- while (trav_xl) {
- index++;
- trav_xl = trav_xl->next;
- }
- child_count = index;
- switch_buf_array = GF_CALLOC ((index + 1),
- sizeof (struct switch_sched_array),
- gf_switch_mt_switch_sched_array);
- if (!switch_buf_array)
- goto err;
+ int flag = 0;
+ int idx = 0;
+ int index = 0;
+ int child_count = 0;
+ char *tmp = NULL;
+ char *tmp1 = NULL;
+ char *child = NULL;
+ char *tmp_str = NULL;
+ char *tmp_str1 = NULL;
+ char *dup_str = NULL;
+ char *dup_childs = NULL;
+ char *switch_str = NULL;
+ char *pattern = NULL;
+ char *childs = NULL;
+ char *option_string = NULL;
+ size_t pattern_length;
+ struct switch_struct *switch_buf = NULL;
+ struct switch_struct *switch_opt = NULL;
+ struct switch_struct *trav = NULL;
+ struct switch_sched_array *switch_buf_array = NULL;
+ xlator_list_t *trav_xl = NULL;
+
+ trav_xl = this->children;
+ while (trav_xl) {
+ index++;
+ trav_xl = trav_xl->next;
+ }
+ child_count = index;
+ switch_buf_array = GF_CALLOC((index + 1), sizeof(struct switch_sched_array),
+ gf_switch_mt_switch_sched_array);
+ if (!switch_buf_array)
+ goto err;
+
+ trav_xl = this->children;
+ index = 0;
+
+ while (trav_xl) {
+ switch_buf_array[index].xl = trav_xl->xlator;
+ switch_buf_array[index].eligible = 1;
+ trav_xl = trav_xl->next;
+ index++;
+ }
+
+ /* *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */
+
+ /* Get the pattern for considering switch case.
+ "option block-size *avi:10MB" etc */
+ option_string = gf_strdup(pattern_str);
+ if (option_string == NULL) {
+ goto err;
+ }
+ switch_str = strtok_r(option_string, ";", &tmp_str);
+ while (switch_str) {
+ dup_str = gf_strdup(switch_str);
+ if (dup_str == NULL) {
+ goto err;
+ }
+ switch_opt = GF_CALLOC(1, sizeof(struct switch_struct),
+ gf_switch_mt_switch_struct);
+ if (!switch_opt) {
+ GF_FREE(dup_str);
+ goto err;
+ }
- trav_xl = this->children;
- index = 0;
-
- while (trav_xl) {
- switch_buf_array[index].xl = trav_xl->xlator;
- switch_buf_array[index].eligible = 1;
- trav_xl = trav_xl->next;
- index++;
- }
-
- /* *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */
-
- /* Get the pattern for considering switch case.
- "option block-size *avi:10MB" etc */
- option_string = gf_strdup (pattern_str);
- switch_str = strtok_r (option_string, ";", &tmp_str);
- while (switch_str) {
- dup_str = gf_strdup (switch_str);
- switch_opt = GF_CALLOC (1, sizeof (struct switch_struct),
- gf_switch_mt_switch_struct);
- if (!switch_opt)
- goto err;
-
- pattern = strtok_r (dup_str, ":", &tmp_str1);
- childs = strtok_r (NULL, ":", &tmp_str1);
- if (strncmp (pattern, "*", 2) == 0) {
- gf_log ("switch", GF_LOG_NORMAL,
- "'*' pattern will be taken by default "
- "for all the unconfigured child nodes,"
- " hence neglecting current option");
- switch_str = strtok_r (NULL, ";", &tmp_str);
- GF_FREE (dup_str);
- continue;
- }
- memcpy (switch_opt->path_pattern, pattern, strlen (pattern));
- if (childs) {
- dup_childs = gf_strdup (childs);
- child = strtok_r (dup_childs, ",", &tmp);
- while (child) {
- if (gf_switch_valid_child (this, child)) {
- idx++;
- child = strtok_r (NULL, ",", &tmp);
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "%s is not a subvolume of %s. "
- "pattern can only be scheduled "
- "only to a subvolume of %s",
- child, this->name, this->name);
- goto err;
- }
- }
- GF_FREE (dup_childs);
- child = strtok_r (childs, ",", &tmp1);
- switch_opt->num_child = idx;
- switch_opt->array = GF_CALLOC (1, (idx *
- sizeof (struct switch_sched_array)),
- gf_switch_mt_switch_sched_array);
- if (!switch_opt->array)
- goto err;
- idx = 0;
- while (child) {
- for (index = 0; index < child_count; index++) {
- if (strcmp (switch_buf_array[index].xl->name,
- child) == 0) {
- gf_log ("switch", GF_LOG_DEBUG,
- "'%s' pattern will be "
- "scheduled to \"%s\"",
- switch_opt->path_pattern, child);
- /*
- if (switch_buf_array[index-1].considered) {
- gf_log ("switch", GF_LOG_DEBUG,
- "ambiguity found, exiting");
- return -1;
- }
- */
- switch_opt->array[idx].xl = switch_buf_array[index].xl;
- switch_buf_array[index].considered = 1;
- idx++;
- break;
- }
- }
- child = strtok_r (NULL, ",", &tmp1);
- }
+ pattern = strtok_r(dup_str, ":", &tmp_str1);
+ childs = strtok_r(NULL, ":", &tmp_str1);
+ if (strncmp(pattern, "*", 2) == 0) {
+ gf_msg("switch", GF_LOG_INFO, 0, DHT_MSG_SWITCH_PATTERN_INFO,
+ "'*' pattern will be taken by default "
+ "for all the unconfigured child nodes,"
+ " hence neglecting current option");
+ switch_str = strtok_r(NULL, ";", &tmp_str);
+ GF_FREE(switch_opt);
+ switch_opt = NULL;
+ GF_FREE(dup_str);
+ continue;
+ }
+ GF_FREE(dup_str);
+
+ pattern_length = strlen(pattern);
+ if (pattern_length >= (sizeof(switch_opt->path_pattern))) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_SET_SWITCH_PATTERN_ERROR, "Pattern (%s) too long",
+ pattern);
+ goto err;
+ }
+ memcpy(switch_opt->path_pattern, pattern, pattern_length);
+ switch_opt->path_pattern[pattern_length] = '\0';
+
+ if (childs) {
+ dup_childs = gf_strdup(childs);
+ if (dup_childs == NULL) {
+ goto err;
+ }
+ child = strtok_r(dup_childs, ",", &tmp);
+ while (child) {
+ if (gf_switch_valid_child(this, child)) {
+ idx++;
+ child = strtok_r(NULL, ",", &tmp);
} else {
- /* error */
- gf_log ("switch", GF_LOG_ERROR,
- "Check \"scheduler.switch.case\" "
- "option in unify volume. Exiting");
- goto err;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_ERROR,
+ "%s is not a subvolume of %s. "
+ "pattern can only be scheduled "
+ "only to a subvolume of %s",
+ child, this->name, this->name);
+ GF_FREE(dup_childs);
+ goto err;
}
- GF_FREE (dup_str);
-
- /* Link it to the main structure */
- if (switch_buf) {
- /* there are already few entries */
- trav = switch_buf;
- while (trav->next)
- trav = trav->next;
- trav->next = switch_opt;
- } else {
- /* First entry */
- switch_buf = switch_opt;
+ }
+ GF_FREE(dup_childs);
+ child = strtok_r(childs, ",", &tmp1);
+ switch_opt->num_child = idx;
+ switch_opt->array = GF_CALLOC(
+ 1, (idx * sizeof(struct switch_sched_array)),
+ gf_switch_mt_switch_sched_array);
+ if (!switch_opt->array)
+ goto err;
+ idx = 0;
+ while (child) {
+ for (index = 0; index < child_count; index++) {
+ if (strcmp(switch_buf_array[index].xl->name, child) == 0) {
+ gf_msg_debug("switch", 0,
+ "'%s' pattern will be "
+ "scheduled to \"%s\"",
+ switch_opt->path_pattern, child);
+ /*
+ if (switch_buf_array[index-1].considered) {
+ gf_msg_debug ("switch", 0,
+ "ambiguity found, exiting");
+ return -1;
+ }
+ */
+ switch_opt->array[idx].xl = switch_buf_array[index].xl;
+ switch_buf_array[index].considered = 1;
+ idx++;
+ break;
+ }
}
- switch_str = strtok_r (NULL, ";", &tmp_str);
+ child = strtok_r(NULL, ",", &tmp1);
+ }
+ } else {
+ /* error */
+ gf_msg("switch", GF_LOG_ERROR, 0, DHT_MSG_SET_SWITCH_PATTERN_ERROR,
+ "Check \"scheduler.switch.case\" "
+ "option in unify volume. Exiting");
+ goto err;
}
- /* Now, all the pattern based considerations done, so for all the
- * remaining pattern, '*' to all the remaining child nodes
- */
- {
- for (index=0; index < child_count; index++) {
- /* check for considered flag */
- if (switch_buf_array[index].considered)
- continue;
- flag++;
- }
- if (!flag) {
- gf_log ("switch", GF_LOG_ERROR,
- "No nodes left for pattern '*'. Exiting");
- goto err;
- }
- switch_opt = GF_CALLOC (1, sizeof (struct switch_struct),
- gf_switch_mt_switch_struct);
- if (!switch_opt)
- goto err;
-
- /* Add the '*' pattern to the array */
- memcpy (switch_opt->path_pattern, "*", 2);
- switch_opt->num_child = flag;
- switch_opt->array =
- GF_CALLOC (1,
- flag * sizeof (struct switch_sched_array),
- gf_switch_mt_switch_sched_array);
- if (!switch_opt->array)
- goto err;
- flag = 0;
- for (index=0; index < child_count; index++) {
- /* check for considered flag */
- if (switch_buf_array[index].considered)
- continue;
- gf_log ("switch", GF_LOG_DEBUG,
- "'%s' pattern will be scheduled to \"%s\"",
- switch_opt->path_pattern,
- switch_buf_array[index].xl->name);
- switch_opt->array[flag].xl =
- switch_buf_array[index].xl;
- switch_buf_array[index].considered = 1;
- flag++;
- }
- if (switch_buf) {
- /* there are already few entries */
- trav = switch_buf;
- while (trav->next)
- trav = trav->next;
- trav->next = switch_opt;
- } else {
- /* First entry */
- switch_buf = switch_opt;
- }
- }
- /* */
- conf->private = switch_buf;
-
- return 0;
-err:
+ /* Link it to the main structure */
if (switch_buf) {
- if (switch_buf_array)
- GF_FREE (switch_buf_array);
- trav = switch_buf;
- while (trav) {
- if (trav->array)
- GF_FREE (trav->array);
- switch_opt = trav;
- trav = trav->next;
- GF_FREE (switch_opt);
- }
+ /* there are already few entries */
+ trav = switch_buf;
+ while (trav->next)
+ trav = trav->next;
+ trav->next = switch_opt;
+ } else {
+ /* First entry */
+ switch_buf = switch_opt;
}
- return -1;
-}
-
-
-int
-init (xlator_t *this)
-{
- dht_conf_t *conf = NULL;
- data_t *data = NULL;
- char *temp_str = NULL;
- int ret = -1;
- int i = 0;
- uint32_t temp_free_disk = 0;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "SWITCH needs more than one subvolume");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
-
- conf = GF_CALLOC (1, sizeof (*conf), gf_switch_mt_dht_conf_t);
- if (!conf) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
+ switch_opt = NULL;
+ switch_str = strtok_r(NULL, ";", &tmp_str);
+ }
+
+ /* Now, all the pattern based considerations done, so for all the
+ * remaining pattern, '*' to all the remaining child nodes
+ */
+ {
+ for (index = 0; index < child_count; index++) {
+ /* check for considered flag */
+ if (switch_buf_array[index].considered)
+ continue;
+ flag++;
}
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
-
- conf->unhashed_sticky_bit = 0;
- if (dict_get_str (this->options, "unhashed-sticky-bit",
- &temp_str) == 0) {
- gf_string2boolean (temp_str, &conf->unhashed_sticky_bit);
- }
-
- conf->min_free_disk = 10;
- conf->disk_unit = 'p';
-
- if (dict_get_str (this->options, "min-free-disk",
- &temp_str) == 0) {
- if (gf_string2percent (temp_str,
- &temp_free_disk) == 0) {
- if (temp_free_disk > 100) {
- gf_string2bytesize (temp_str,
- &conf->min_free_disk);
- conf->disk_unit = 'b';
- } else {
- conf->min_free_disk = (uint64_t)temp_free_disk;
- conf->disk_unit = 'p';
- }
- } else {
- gf_string2bytesize (temp_str,
- &conf->min_free_disk);
- conf->disk_unit = 'b';
- }
+ if (!flag) {
+ gf_msg("switch", GF_LOG_ERROR, 0, DHT_MSG_SET_SWITCH_PATTERN_ERROR,
+ "No nodes left for pattern '*'. Exiting");
+ goto err;
}
-
- data = dict_get (this->options, "pattern.switch.case");
- if (data) {
- /* TODO: */
- ret = set_switch_pattern (this, conf, data->data);
- if (ret) {
- goto err;
- }
- }
-
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
+ switch_opt = GF_CALLOC(1, sizeof(struct switch_struct),
+ gf_switch_mt_switch_struct);
+ if (!switch_opt)
+ goto err;
+
+ /* Add the '*' pattern to the array */
+ memcpy(switch_opt->path_pattern, "*", 2);
+ switch_opt->num_child = flag;
+ switch_opt->array = GF_CALLOC(1,
+ flag * sizeof(struct switch_sched_array),
+ gf_switch_mt_switch_sched_array);
+ if (!switch_opt->array)
+ goto err;
+ flag = 0;
+ for (index = 0; index < child_count; index++) {
+ /* check for considered flag */
+ if (switch_buf_array[index].considered)
+ continue;
+ gf_msg_debug("switch", 0,
+ "'%s'"
+ " pattern will be scheduled to \"%s\"",
+ switch_opt->path_pattern,
+ switch_buf_array[index].xl->name);
+
+ switch_opt->array[flag].xl = switch_buf_array[index].xl;
+ switch_buf_array[index].considered = 1;
+ flag++;
}
-
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
+ if (switch_buf) {
+ /* there are already few entries */
+ trav = switch_buf;
+ while (trav->next)
+ trav = trav->next;
+ trav->next = switch_opt;
+ } else {
+ /* First entry */
+ switch_buf = switch_opt;
}
+ switch_opt = NULL;
+ }
+ /* */
+ conf->private = switch_buf;
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
+ GF_FREE(option_string);
+ return 0;
+err:
+ GF_FREE(switch_buf_array);
+ GF_FREE(switch_opt);
+ GF_FREE(option_string);
+
+ if (switch_buf) {
+ trav = switch_buf;
+ while (trav) {
+ GF_FREE(trav->array);
+ switch_opt = trav;
+ trav = trav->next;
+ GF_FREE(switch_opt);
+ }
+ }
+ return -1;
+}
- conf->gen = 1;
+int32_t
+switch_init(xlator_t *this)
+{
+ dht_conf_t *conf = NULL;
+ data_t *data = NULL;
+ int ret = -1;
- conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
- gf_switch_mt_dht_du_t);
- if (!conf->du_stats) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- goto err;
+ ret = dht_init(this);
+ if (ret) {
+ return ret;
+ }
+ conf = this->private;
+
+ data = dict_get(this->options, "pattern.switch.case");
+ if (data) {
+ /* TODO: */
+ ret = set_switch_pattern(this, conf, data->data);
+ if (ret) {
+ goto err;
}
+ }
- this->private = conf;
-
- return 0;
+ this->private = conf;
+ return 0;
err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- if (conf->default_dir_layout)
- GF_FREE (conf->default_dir_layout);
-
- if (conf->subvolumes)
- GF_FREE (conf->subvolumes);
-
- if (conf->subvolume_status)
- GF_FREE (conf->subvolume_status);
-
- if (conf->du_stats)
- GF_FREE (conf->du_stats);
-
- GF_FREE (conf);
- }
-
- return -1;
+ dht_fini(this);
+ return -1;
}
-
struct xlator_fops fops = {
- .lookup = switch_lookup,
- .create = switch_create,
- .mknod = switch_mknod,
-
- .stat = dht_stat,
- .fstat = dht_fstat,
- .truncate = dht_truncate,
- .ftruncate = dht_ftruncate,
- .access = dht_access,
- .readlink = dht_readlink,
- .setxattr = dht_setxattr,
- .getxattr = dht_getxattr,
- .removexattr = dht_removexattr,
- .open = dht_open,
- .readv = dht_readv,
- .writev = dht_writev,
- .flush = dht_flush,
- .fsync = dht_fsync,
- .statfs = dht_statfs,
- .lk = dht_lk,
- .opendir = dht_opendir,
- .readdir = dht_readdir,
- .readdirp = dht_readdirp,
- .fsyncdir = dht_fsyncdir,
- .symlink = dht_symlink,
- .unlink = dht_unlink,
- .link = dht_link,
- .mkdir = dht_mkdir,
- .rmdir = dht_rmdir,
- .rename = dht_rename,
- .inodelk = dht_inodelk,
- .finodelk = dht_finodelk,
- .entrylk = dht_entrylk,
- .fentrylk = dht_fentrylk,
- .xattrop = dht_xattrop,
- .fxattrop = dht_fxattrop,
- .setattr = dht_setattr,
+ .lookup = switch_lookup,
+ .create = switch_create,
+ .mknod = switch_mknod,
+
+ .stat = dht_stat,
+ .fstat = dht_fstat,
+ .truncate = dht_truncate,
+ .ftruncate = dht_ftruncate,
+ .access = dht_access,
+ .readlink = dht_readlink,
+ .setxattr = dht_setxattr,
+ .getxattr = dht_getxattr,
+ .removexattr = dht_removexattr,
+ .open = dht_open,
+ .readv = dht_readv,
+ .writev = dht_writev,
+ .flush = dht_flush,
+ .fsync = dht_fsync,
+ .statfs = dht_statfs,
+ .lk = dht_lk,
+ .opendir = dht_opendir,
+ .readdir = dht_readdir,
+ .readdirp = dht_readdirp,
+ .fsyncdir = dht_fsyncdir,
+ .symlink = dht_symlink,
+ .unlink = dht_unlink,
+ .link = dht_link,
+ .mkdir = dht_mkdir,
+ .rmdir = dht_rmdir,
+ .rename = dht_rename,
+ .inodelk = dht_inodelk,
+ .finodelk = dht_finodelk,
+ .entrylk = dht_entrylk,
+ .fentrylk = dht_fentrylk,
+ .xattrop = dht_xattrop,
+ .fxattrop = dht_fxattrop,
+ .setattr = dht_setattr,
};
-
-struct xlator_cbks cbks = {
- .forget = dht_forget
-};
-
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"pattern.switch.case"},
- .type = GF_OPTION_TYPE_ANY
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- },
- { .key = {NULL} },
+struct xlator_cbks cbks = {.forget = dht_forget};
+extern int32_t
+mem_acct_init(xlator_t *this);
+
+xlator_api_t xlator_api = {
+ .init = switch_init,
+ .fini = switch_fini,
+ .notify = dht_notify,
+ .reconfigure = dht_reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = dht_options,
+ .identifier = "switch",
+ .category = GF_TECH_PREVIEW,
};
diff --git a/xlators/cluster/dht/src/unittest/dht_layout_mock.c b/xlators/cluster/dht/src/unittest/dht_layout_mock.c
new file mode 100644
index 00000000000..771452963d1
--- /dev/null
+++ b/xlators/cluster/dht/src/unittest/dht_layout_mock.c
@@ -0,0 +1,73 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include "dht-common.h"
+#include <glusterfs/byte-order.h>
+
+int
+dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p)
+{
+ return 0;
+}
+
+int
+dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, dht_layout_t **layout)
+{
+ return 0;
+}
+
+int
+dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this,
+ dht_layout_t *layout_int)
+{
+ return 0;
+}
+
+int
+dict_get_ptr(dict_t *this, char *key, void **ptr)
+{
+ return 0;
+}
+
+int
+dict_get_ptr_and_len(dict_t *this, char *key, void **ptr, int *len)
+{
+ return 0;
+}
+
+int
+_gf_log(const char *domain, const char *file, const char *function,
+ int32_t line, gf_loglevel_t level, const char *fmt, ...)
+{
+ return 0;
+}
+
+int
+_gf_log_callingfn(const char *domain, const char *file, const char *function,
+ int32_t line, gf_loglevel_t level, const char *fmt, ...)
+{
+ return 0;
+}
+
+void
+gf_uuid_unparse(const uuid_t uu, char *out)
+{
+ // could call a will-return function here
+ // to place the correct data in *out
+}
+
+int
+_gf_msg(const char *domain, const char *file, const char *function,
+ int32_t line, gf_loglevel_t level, int errnum, int trace,
+ uint64_t msgid, const char *fmt, ...)
+{
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
new file mode 100644
index 00000000000..c94a1d0a2e1
--- /dev/null
+++ b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
@@ -0,0 +1,127 @@
+/*
+ Copyright (c) 2008-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+#include <glusterfs/logging.h>
+#include <glusterfs/xlator.h>
+
+#include <inttypes.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <cmocka_pbc.h>
+#include <cmocka.h>
+
+/*
+ * Helper functions
+ */
+
+static xlator_t *
+helper_xlator_init(uint32_t num_types)
+{
+ xlator_t *xl;
+ int i, ret;
+
+ REQUIRE(num_types > 0);
+
+ xl = test_calloc(1, sizeof(xlator_t));
+ assert_non_null(xl);
+ xl->mem_acct->num_types = num_types;
+ xl->mem_acct = test_calloc(sizeof(struct mem_acct) +
+ sizeof(struct mem_acct_rec) + num_types);
+ assert_non_null(xl->mem_acct);
+
+ xl->ctx = test_calloc(1, sizeof(glusterfs_ctx_t));
+ assert_non_null(xl->ctx);
+
+ for (i = 0; i < num_types; i++) {
+ ret = LOCK_INIT(&(xl->mem_acct.rec[i].lock));
+ assert_false(ret);
+ }
+
+ ENSURE(num_types == xl->mem_acct.num_types);
+ ENSURE(NULL != xl);
+
+ return xl;
+}
+
+static int
+helper_xlator_destroy(xlator_t *xl)
+{
+ int i, ret;
+
+ for (i = 0; i < xl->mem_acct.num_types; i++) {
+ ret = LOCK_DESTROY(&(xl->mem_acct.rec[i].lock));
+ assert_int_equal(ret, 0);
+ }
+
+ free(xl->mem_acct.rec);
+ free(xl->ctx);
+ free(xl);
+ return 0;
+}
+
+/*
+ * Unit tests
+ */
+static void
+test_dht_layout_new(void **state)
+{
+ xlator_t *xl;
+ dht_layout_t *layout;
+ dht_conf_t *conf;
+ int cnt;
+
+ expect_assert_failure(dht_layout_new(NULL, 0));
+ expect_assert_failure(dht_layout_new((xlator_t *)0x12345, -1));
+ xl = helper_xlator_init(10);
+
+ // xl->private is NULL
+ assert_null(xl->private);
+ cnt = 100;
+ layout = dht_layout_new(xl, cnt);
+ assert_non_null(layout);
+ assert_int_equal(layout->type, DHT_HASH_TYPE_DM);
+ assert_int_equal(layout->cnt, cnt);
+ assert_int_equal(GF_ATOMIC_GET(layout->ref), 1);
+ assert_int_equal(layout->gen, 0);
+ assert_int_equal(layout->spread_cnt, 0);
+ free(layout);
+
+ // xl->private is not NULL
+ cnt = 110;
+ conf = (dht_conf_t *)test_calloc(1, sizeof(dht_conf_t));
+ assert_non_null(conf);
+ conf->dir_spread_cnt = 12345;
+ conf->gen = -123;
+ xl->private = conf;
+
+ layout = dht_layout_new(xl, cnt);
+ assert_non_null(layout);
+ assert_int_equal(layout->type, DHT_HASH_TYPE_DM);
+ assert_int_equal(layout->cnt, cnt);
+ assert_int_equal(GF_ATOMIC_GET(layout->ref), 1);
+ assert_int_equal(layout->gen, conf->gen);
+ assert_int_equal(layout->spread_cnt, conf->dir_spread_cnt);
+ free(layout);
+
+ free(conf);
+ helper_xlator_destroy(xl);
+}
+
+int
+main(void)
+{
+ const struct CMUnitTest xlator_dht_layout_tests[] = {
+ unit_test(test_dht_layout_new),
+ };
+
+ return cmocka_run_group_tests(xlator_dht_layout_tests, NULL, NULL);
+}
diff --git a/xlators/cluster/ha/Makefile.am b/xlators/cluster/ec/Makefile.am
index d471a3f9243..d471a3f9243 100644
--- a/xlators/cluster/ha/Makefile.am
+++ b/xlators/cluster/ec/Makefile.am
diff --git a/xlators/cluster/ec/src/Makefile.am b/xlators/cluster/ec/src/Makefile.am
new file mode 100644
index 00000000000..406a636bbc2
--- /dev/null
+++ b/xlators/cluster/ec/src/Makefile.am
@@ -0,0 +1,83 @@
+xlator_LTLIBRARIES = ec.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+ec_sources := ec.c
+ec_sources += ec-data.c
+ec_sources += ec-helpers.c
+ec_sources += ec-common.c
+ec_sources += ec-generic.c
+ec_sources += ec-locks.c
+ec_sources += ec-dir-read.c
+ec_sources += ec-dir-write.c
+ec_sources += ec-inode-read.c
+ec_sources += ec-inode-write.c
+ec_sources += ec-combine.c
+ec_sources += ec-method.c
+ec_sources += ec-galois.c
+ec_sources += ec-code.c
+ec_sources += ec-code-c.c
+ec_sources += ec-gf8.c
+ec_sources += ec-heal.c
+ec_sources += ec-heald.c
+
+ec_headers := ec.h
+ec_headers += ec-mem-types.h
+ec_headers += ec-helpers.h
+ec_headers += ec-data.h
+ec_headers += ec-fops.h
+ec_headers += ec-common.h
+ec_headers += ec-combine.h
+ec_headers += ec-method.h
+ec_headers += ec-galois.h
+ec_headers += ec-code.h
+ec_headers += ec-code-c.h
+ec_headers += ec-gf8.h
+ec_headers += ec-heald.h
+ec_headers += ec-messages.h
+ec_headers += ec-types.h
+
+if ENABLE_EC_DYNAMIC_INTEL
+ ec_sources += ec-code-intel.c
+ ec_headers += ec-code-intel.h
+endif
+
+if ENABLE_EC_DYNAMIC_X64
+ ec_sources += ec-code-x64.c
+ ec_headers += ec-code-x64.h
+endif
+
+if ENABLE_EC_DYNAMIC_SSE
+ ec_sources += ec-code-sse.c
+ ec_headers += ec-code-sse.h
+endif
+
+if ENABLE_EC_DYNAMIC_AVX
+ ec_sources += ec-code-avx.c
+ ec_headers += ec-code-avx.h
+endif
+
+ec_ext_sources = $(top_builddir)/xlators/lib/src/libxlator.c
+
+ec_ext_headers = $(top_builddir)/xlators/lib/src/libxlator.h
+
+ec_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+ec_la_SOURCES = $(ec_sources) $(ec_headers) $(ec_ext_sources) $(ec_ext_headers)
+ec_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS)
+AM_CPPFLAGS += -I$(top_srcdir)/libglusterfs/src
+AM_CPPFLAGS += -I$(top_srcdir)/xlators/lib/src
+AM_CPPFLAGS += -I$(top_srcdir)/rpc/rpc-lib/src
+AM_CPPFLAGS += -I$(top_srcdir)/rpc/xdr/src
+AM_CPPFLAGS += -I$(top_builddir)/rpc/xdr/src
+AM_CPPFLAGS += -DGLUSTERFS_LIBEXECDIR=\"$(GLUSTERFS_LIBEXECDIR)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
+install-data-hook:
+ ln -sf ec.so $(DESTDIR)$(xlatordir)/disperse.so
+
+uninstall-local:
+ rm -f $(DESTDIR)$(xlatordir)/disperse.so
diff --git a/xlators/cluster/ec/src/ec-code-avx.c b/xlators/cluster/ec/src/ec-code-avx.c
new file mode 100644
index 00000000000..70afaa00f54
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-avx.c
@@ -0,0 +1,109 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+
+#include "ec-code-intel.h"
+
+static void
+ec_code_avx_prolog(ec_code_builder_t *builder)
+{
+ builder->loop = builder->address;
+}
+
+static void
+ec_code_avx_epilog(ec_code_builder_t *builder)
+{
+ ec_code_intel_op_add_i2r(builder, 32, REG_DX);
+ ec_code_intel_op_add_i2r(builder, 32, REG_DI);
+ ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX);
+ ec_code_intel_op_jne(builder, builder->loop);
+
+ ec_code_intel_op_ret(builder, 0);
+}
+
+static void
+ec_code_avx_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+ uint32_t bit)
+{
+ if (builder->linear) {
+ ec_code_intel_op_mov_m2avx(
+ builder, REG_SI, REG_DX, 1,
+ idx * builder->width * builder->bits + bit * builder->width, dst);
+ } else {
+ if (builder->base != idx) {
+ ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+ REG_AX);
+ builder->base = idx;
+ }
+ ec_code_intel_op_mov_m2avx(builder, REG_AX, REG_DX, 1,
+ bit * builder->width, dst);
+ }
+}
+
+static void
+ec_code_avx_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit)
+{
+ ec_code_intel_op_mov_avx2m(builder, src, REG_DI, REG_NULL, 0,
+ bit * builder->width);
+}
+
+static void
+ec_code_avx_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+ ec_code_intel_op_mov_avx2avx(builder, src, dst);
+}
+
+static void
+ec_code_avx_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+ ec_code_intel_op_xor_avx2avx(builder, src, dst);
+}
+
+static void
+ec_code_avx_xor3(ec_code_builder_t *builder, uint32_t dst, uint32_t src1,
+ uint32_t src2)
+{
+ ec_code_intel_op_mov_avx2avx(builder, src1, dst);
+ ec_code_intel_op_xor_avx2avx(builder, src2, dst);
+}
+
+static void
+ec_code_avx_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+ uint32_t bit)
+{
+ if (builder->linear) {
+ ec_code_intel_op_xor_m2avx(
+ builder, REG_SI, REG_DX, 1,
+ idx * builder->width * builder->bits + bit * builder->width, dst);
+ } else {
+ if (builder->base != idx) {
+ ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+ REG_AX);
+ builder->base = idx;
+ }
+ ec_code_intel_op_xor_m2avx(builder, REG_AX, REG_DX, 1,
+ bit * builder->width, dst);
+ }
+}
+
+static char *ec_code_avx_needed_flags[] = {"avx2", NULL};
+
+ec_code_gen_t ec_code_gen_avx = {.name = "avx",
+ .flags = ec_code_avx_needed_flags,
+ .width = 32,
+ .prolog = ec_code_avx_prolog,
+ .epilog = ec_code_avx_epilog,
+ .load = ec_code_avx_load,
+ .store = ec_code_avx_store,
+ .copy = ec_code_avx_copy,
+ .xor2 = ec_code_avx_xor2,
+ .xor3 = ec_code_avx_xor3,
+ .xorm = ec_code_avx_xorm};
diff --git a/xlators/cluster/ec/src/ec-code-avx.h b/xlators/cluster/ec/src/ec-code-avx.h
new file mode 100644
index 00000000000..fdca4ad2c8f
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-avx.h
@@ -0,0 +1,18 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_AVX_H__
+#define __EC_CODE_AVX_H__
+
+#include "ec-code.h"
+
+extern ec_code_gen_t ec_code_gen_avx;
+
+#endif /* __EC_CODE_AVX_H__ */
diff --git a/xlators/cluster/ec/src/ec-code-c.c b/xlators/cluster/ec/src/ec-code-c.c
new file mode 100644
index 00000000000..acdc665c2cf
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-c.c
@@ -0,0 +1,11679 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "ec-method.h"
+#include "ec-code-c.h"
+
+#define WIDTH (EC_METHOD_WORD_SIZE / sizeof(uint64_t))
+
+static void
+gf8_muladd_00(void *out, void *in)
+{
+ memcpy(out, in, EC_METHOD_WORD_SIZE * 8);
+}
+
+static void
+gf8_muladd_01(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ out_ptr[0] ^= in_ptr[0];
+ out_ptr[WIDTH] ^= in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] ^= in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] ^= in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] ^= in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] ^= in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] ^= in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] ^= in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_02(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in7;
+ out1 = in0;
+ out7 = in6;
+ out5 = in4;
+ out6 = in5;
+ out3 = in2 ^ in7;
+ out4 = in3 ^ in7;
+ out2 = in1 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_03(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in0 ^ in7;
+ tmp0 = in2 ^ in7;
+ out1 = in0 ^ in1;
+ out7 = in6 ^ in7;
+ out5 = in4 ^ in5;
+ out6 = in5 ^ in6;
+ out4 = in3 ^ in4 ^ in7;
+ out2 = tmp0 ^ in1;
+ out3 = tmp0 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_04(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in6;
+ out1 = in7;
+ out7 = in5;
+ out6 = in4;
+ tmp0 = in6 ^ in7;
+ out2 = in0 ^ in6;
+ out5 = in3 ^ in7;
+ out3 = tmp0 ^ in1;
+ out4 = tmp0 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_05(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in0 ^ in6;
+ out1 = in1 ^ in7;
+ out7 = in5 ^ in7;
+ out6 = in4 ^ in6;
+ out2 = out0 ^ in2;
+ out3 = out1 ^ in3 ^ in6;
+ out5 = out7 ^ in3;
+ out4 = out6 ^ in2 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_06(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in6 ^ in7;
+ tmp0 = in1 ^ in6;
+ out1 = in0 ^ in7;
+ out7 = in5 ^ in6;
+ out6 = in4 ^ in5;
+ out4 = in2 ^ in3 ^ in6;
+ out5 = in3 ^ in4 ^ in7;
+ out3 = tmp0 ^ in2;
+ out2 = tmp0 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_07(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in6;
+ tmp1 = in5 ^ in6;
+ tmp2 = in0 ^ in7;
+ tmp3 = tmp0 ^ in3;
+ out6 = tmp1 ^ in4;
+ out7 = tmp1 ^ in7;
+ out0 = tmp2 ^ in6;
+ out1 = tmp2 ^ in1;
+ out3 = tmp3 ^ in1;
+ out4 = tmp3 ^ in4;
+ out5 = out4 ^ out7 ^ in2;
+ out2 = tmp0 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_08(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in5;
+ out1 = in6;
+ out7 = in4;
+ out6 = in3 ^ in7;
+ out3 = in0 ^ in5 ^ in6;
+ out5 = in2 ^ in6 ^ in7;
+ out2 = in5 ^ in7;
+ out4 = out2 ^ in1 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_09(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in0 ^ in5;
+ tmp0 = in3 ^ in6;
+ out1 = in1 ^ in6;
+ out7 = in4 ^ in7;
+ out2 = in2 ^ in5 ^ in7;
+ out3 = tmp0 ^ out0;
+ out6 = tmp0 ^ in7;
+ out4 = out1 ^ out7 ^ in5;
+ out5 = out2 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_0A(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in5 ^ in7;
+ out1 = in0 ^ in6;
+ out7 = in4 ^ in6;
+ out2 = in1 ^ in5;
+ out6 = out0 ^ in3;
+ out3 = out0 ^ out1 ^ in2;
+ out5 = out7 ^ in2 ^ in7;
+ out4 = out2 ^ in3 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_0B(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = in0 ^ in6;
+ tmp2 = in4 ^ in7;
+ out0 = in0 ^ in5 ^ in7;
+ out2 = tmp0 ^ in1;
+ out1 = tmp1 ^ in1;
+ out6 = tmp1 ^ out0 ^ in3;
+ out7 = tmp2 ^ in6;
+ out4 = tmp2 ^ out6 ^ in1;
+ out3 = out6 ^ in0 ^ in2;
+ out5 = tmp0 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_0C(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in5 ^ in6;
+ out1 = in6 ^ in7;
+ out7 = in4 ^ in5;
+ tmp0 = in1 ^ in5;
+ tmp1 = in0 ^ in7;
+ out5 = in2 ^ in3 ^ in6;
+ out6 = in3 ^ in4 ^ in7;
+ out2 = tmp1 ^ out0;
+ out4 = tmp0 ^ in2;
+ out3 = tmp0 ^ tmp1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_0D(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in4 ^ in5;
+ tmp1 = in5 ^ in6;
+ out1 = in1 ^ in6 ^ in7;
+ out7 = tmp0 ^ in7;
+ out4 = tmp0 ^ in1 ^ in2;
+ out0 = tmp1 ^ in0;
+ tmp2 = tmp1 ^ in3;
+ out6 = tmp2 ^ out7;
+ out2 = out0 ^ in2 ^ in7;
+ out3 = out0 ^ out1 ^ in3;
+ out5 = tmp2 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_0E(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in2 ^ in5;
+ tmp2 = in5 ^ in6;
+ out1 = in0 ^ in6 ^ in7;
+ out3 = tmp0 ^ tmp1;
+ out2 = tmp0 ^ tmp2;
+ tmp3 = tmp1 ^ in3;
+ out7 = tmp2 ^ in4;
+ out0 = tmp2 ^ in7;
+ out4 = tmp3 ^ in1 ^ in7;
+ out5 = tmp3 ^ out7;
+ out6 = out0 ^ out5 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_0F(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in6 ^ in7;
+ tmp1 = tmp0 ^ in1;
+ tmp2 = tmp0 ^ in5;
+ out1 = tmp1 ^ in0;
+ out7 = tmp2 ^ in4;
+ out0 = tmp2 ^ in0;
+ out6 = out7 ^ in3;
+ out5 = out6 ^ in2 ^ in7;
+ tmp3 = tmp1 ^ out0 ^ in2;
+ out4 = tmp1 ^ out5;
+ out2 = tmp3 ^ in6;
+ out3 = tmp3 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_10(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in4;
+ out1 = in5;
+ out7 = in3 ^ in7;
+ tmp0 = in6 ^ in7;
+ out2 = in4 ^ in6;
+ tmp1 = out2 ^ in5;
+ out6 = tmp0 ^ in2;
+ out3 = tmp0 ^ tmp1;
+ out5 = out2 ^ out3 ^ in1;
+ out4 = tmp1 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_11(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out7 = in3;
+ out0 = in0 ^ in4;
+ out1 = in1 ^ in5;
+ out6 = in2 ^ in7;
+ out4 = in0 ^ in5 ^ in6;
+ out5 = in1 ^ in6 ^ in7;
+ out2 = in2 ^ in4 ^ in6;
+ out3 = in3 ^ in4 ^ in5 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_12(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in4 ^ in7;
+ out1 = in0 ^ in5;
+ out3 = in2 ^ in4 ^ in5;
+ tmp0 = out0 ^ in6;
+ out2 = tmp0 ^ in1;
+ tmp1 = tmp0 ^ in3;
+ out6 = tmp0 ^ out3;
+ out5 = out2 ^ in5;
+ out7 = tmp1 ^ in4;
+ out4 = tmp1 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_13(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out7 = in3 ^ in6;
+ tmp0 = in0 ^ in5;
+ tmp1 = in4 ^ in7;
+ out6 = in2 ^ in5 ^ in7;
+ out4 = tmp0 ^ out7 ^ in7;
+ out1 = tmp0 ^ in1;
+ out0 = tmp1 ^ in0;
+ out5 = tmp1 ^ in1 ^ in6;
+ out3 = tmp1 ^ out6 ^ in3;
+ out2 = out5 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_14(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in4 ^ in6;
+ out1 = in5 ^ in7;
+ out2 = in0 ^ in4;
+ tmp0 = out0 ^ in5;
+ out7 = out1 ^ in3;
+ tmp1 = out1 ^ in2;
+ out3 = tmp0 ^ in1;
+ out6 = tmp0 ^ tmp1;
+ out4 = tmp1 ^ out2;
+ out5 = out3 ^ in3 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_15(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out7 = in3 ^ in5;
+ tmp0 = in0 ^ in4;
+ out1 = in1 ^ in5 ^ in7;
+ out5 = in1 ^ in3 ^ in6;
+ out0 = tmp0 ^ in6;
+ out2 = tmp0 ^ in2;
+ out3 = out5 ^ in4 ^ in5;
+ out6 = out2 ^ in0 ^ in7;
+ out4 = tmp0 ^ out6 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_16(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in5;
+ tmp1 = in4 ^ in7;
+ tmp2 = in2 ^ in3 ^ in4;
+ out1 = tmp0 ^ in7;
+ out4 = tmp0 ^ tmp2;
+ out0 = tmp1 ^ in6;
+ tmp3 = tmp1 ^ in1;
+ out6 = out0 ^ in2 ^ in5;
+ out2 = tmp3 ^ in0;
+ out3 = out6 ^ in1;
+ out7 = tmp2 ^ out6;
+ out5 = tmp3 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_17(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = in3 ^ in6;
+ tmp2 = tmp0 ^ in4;
+ out4 = tmp0 ^ in0 ^ in3;
+ out7 = tmp1 ^ in5;
+ tmp3 = tmp1 ^ in1;
+ out6 = tmp2 ^ in7;
+ out5 = tmp3 ^ in4;
+ out3 = tmp3 ^ out6;
+ out0 = out3 ^ out4 ^ in1;
+ out2 = out3 ^ out7 ^ in0;
+ out1 = tmp2 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_18(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in4 ^ in5;
+ out1 = in5 ^ in6;
+ tmp0 = in4 ^ in7;
+ out5 = in1 ^ in2 ^ in5;
+ out6 = in2 ^ in3 ^ in6;
+ out2 = tmp0 ^ out1;
+ out7 = tmp0 ^ in3;
+ tmp1 = tmp0 ^ in0;
+ out3 = tmp1 ^ in6;
+ out4 = tmp1 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_19(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out5 = in1 ^ in2;
+ out7 = in3 ^ in4;
+ tmp0 = in0 ^ in7;
+ out6 = in2 ^ in3;
+ out1 = in1 ^ in5 ^ in6;
+ out0 = in0 ^ in4 ^ in5;
+ out4 = tmp0 ^ in1;
+ tmp1 = tmp0 ^ in6;
+ out2 = tmp1 ^ out0 ^ in2;
+ out3 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_1A(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in4 ^ in5;
+ tmp1 = in5 ^ in6;
+ tmp2 = tmp0 ^ in1;
+ out0 = tmp0 ^ in7;
+ out1 = tmp1 ^ in0;
+ tmp3 = tmp1 ^ in3;
+ out5 = tmp2 ^ in2;
+ out2 = tmp2 ^ in6;
+ out7 = tmp3 ^ out0;
+ out6 = tmp3 ^ in2;
+ out4 = tmp3 ^ out2 ^ in0;
+ out3 = tmp0 ^ out1 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_1B(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in4;
+ tmp1 = in2 ^ in5;
+ tmp2 = in3 ^ in6;
+ out5 = tmp0 ^ in1;
+ tmp3 = tmp0 ^ in0;
+ out6 = tmp1 ^ in3;
+ out0 = tmp1 ^ tmp3 ^ in7;
+ out7 = tmp2 ^ in4;
+ tmp4 = out5 ^ in6;
+ out3 = tmp2 ^ tmp3;
+ out2 = tmp4 ^ in5;
+ out4 = tmp4 ^ out3;
+ out1 = tmp3 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_1C(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in3;
+ tmp1 = in4 ^ in6;
+ tmp2 = in5 ^ in7;
+ out6 = tmp0 ^ tmp1;
+ out0 = tmp1 ^ in5;
+ out1 = tmp2 ^ in6;
+ tmp3 = tmp2 ^ in1;
+ tmp4 = tmp2 ^ in4;
+ out2 = tmp4 ^ in0;
+ out7 = tmp4 ^ in3;
+ out5 = tmp0 ^ tmp3;
+ out3 = tmp3 ^ out2;
+ out4 = out3 ^ in2 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_1D(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in3;
+ tmp1 = in0 ^ in4;
+ tmp2 = in3 ^ in4;
+ tmp3 = in2 ^ in7;
+ out3 = tmp0 ^ tmp1;
+ out5 = tmp0 ^ tmp3;
+ tmp4 = tmp1 ^ in5;
+ out6 = tmp2 ^ in2;
+ out7 = tmp2 ^ in5;
+ out2 = tmp3 ^ tmp4;
+ out4 = out3 ^ out6 ^ in6;
+ out0 = tmp4 ^ in6;
+ out1 = out2 ^ out4 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_1E(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in4;
+ tmp1 = in2 ^ in7;
+ tmp2 = tmp0 ^ in1;
+ out3 = tmp1 ^ tmp2;
+ out2 = tmp2 ^ in5;
+ out4 = out3 ^ in3 ^ in6;
+ tmp3 = out4 ^ in7;
+ out6 = tmp3 ^ out2 ^ in4;
+ out7 = tmp1 ^ out6;
+ out0 = out7 ^ in3;
+ out1 = tmp0 ^ out0;
+ out5 = tmp3 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_1F(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in4 ^ in6;
+ tmp1 = tmp0 ^ in5;
+ out7 = tmp1 ^ in3;
+ out0 = tmp1 ^ in0 ^ in7;
+ out6 = out7 ^ in2 ^ in6;
+ out1 = out0 ^ in1 ^ in4;
+ out4 = out0 ^ out6 ^ in1;
+ out3 = tmp0 ^ out4;
+ out2 = out4 ^ out7 ^ in7;
+ out5 = out3 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_20(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in4;
+ out0 = in3 ^ in7;
+ tmp0 = in3 ^ in4;
+ tmp1 = in6 ^ in7;
+ out2 = out0 ^ in5;
+ out4 = tmp0 ^ in5;
+ out3 = tmp0 ^ tmp1;
+ out7 = tmp1 ^ in2;
+ out6 = tmp1 ^ in1 ^ in5;
+ out5 = out2 ^ out3 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_21(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in1 ^ in4;
+ tmp0 = in4 ^ in6;
+ out4 = in3 ^ in5;
+ out7 = in2 ^ in6;
+ out0 = in0 ^ in3 ^ in7;
+ out6 = in1 ^ in5 ^ in7;
+ out3 = tmp0 ^ in7;
+ out5 = tmp0 ^ in0;
+ out2 = out4 ^ in2 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_22(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in3;
+ out1 = in0 ^ in4;
+ out7 = in2 ^ in7;
+ out4 = in4 ^ in5 ^ in7;
+ out5 = in0 ^ in5 ^ in6;
+ out6 = in1 ^ in6 ^ in7;
+ out3 = in2 ^ in3 ^ in4 ^ in6;
+ out2 = in1 ^ in3 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_23(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out7 = in2;
+ out0 = in0 ^ in3;
+ out4 = in5 ^ in7;
+ out5 = in0 ^ in6;
+ out6 = in1 ^ in7;
+ out3 = in2 ^ in4 ^ in6;
+ out1 = in0 ^ in1 ^ in4;
+ out2 = out4 ^ out6 ^ in2 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_24(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in4 ^ in7;
+ tmp0 = in3 ^ in4;
+ out0 = in3 ^ in6 ^ in7;
+ out3 = tmp0 ^ in1;
+ tmp1 = out0 ^ in5;
+ out6 = tmp1 ^ out3;
+ out2 = tmp1 ^ in0;
+ out7 = tmp1 ^ in2 ^ in3;
+ out5 = out2 ^ in4;
+ out4 = tmp0 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_25(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in1 ^ in4;
+ tmp0 = in2 ^ in5;
+ out1 = out3 ^ in7;
+ out7 = tmp0 ^ in6;
+ out6 = out1 ^ in5;
+ out4 = out7 ^ in3 ^ in7;
+ out2 = out4 ^ in0;
+ out0 = tmp0 ^ out2;
+ out5 = out0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_26(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in3 ^ in6;
+ tmp0 = in4 ^ in7;
+ out7 = in2 ^ in5 ^ in7;
+ tmp1 = out0 ^ in0 ^ in5;
+ out1 = tmp0 ^ in0;
+ tmp2 = tmp0 ^ in6;
+ out2 = tmp1 ^ in1;
+ out5 = tmp1 ^ in7;
+ out6 = tmp2 ^ in1;
+ out4 = tmp2 ^ out7;
+ out3 = out0 ^ out6 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_27(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out7 = in2 ^ in5;
+ out0 = in0 ^ in3 ^ in6;
+ out6 = in1 ^ in4 ^ in7;
+ out4 = out7 ^ in6;
+ out2 = out0 ^ out7 ^ in1;
+ out5 = out0 ^ in7;
+ out1 = out6 ^ in0;
+ out3 = out6 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_28(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in3;
+ out1 = in4 ^ in6;
+ out0 = in3 ^ in5 ^ in7;
+ tmp0 = out1 ^ in7;
+ tmp1 = out0 ^ in4;
+ out7 = tmp0 ^ in2;
+ tmp2 = tmp0 ^ in1;
+ out3 = tmp1 ^ in0;
+ out6 = tmp1 ^ tmp2;
+ out4 = tmp2 ^ in3;
+ out5 = out3 ^ in2 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_29(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in2 ^ in3;
+ tmp0 = in1 ^ in3;
+ tmp1 = in4 ^ in6;
+ tmp2 = in0 ^ in4 ^ in7;
+ out6 = tmp0 ^ in5;
+ out4 = tmp0 ^ in6 ^ in7;
+ out1 = tmp1 ^ in1;
+ out7 = tmp1 ^ in2;
+ out3 = tmp2 ^ in5;
+ out5 = tmp2 ^ in2;
+ out0 = out3 ^ in3 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_2A(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in3 ^ in5;
+ tmp0 = in1 ^ in3;
+ tmp1 = in0 ^ in4;
+ out7 = in2 ^ in4 ^ in7;
+ out3 = tmp1 ^ out0 ^ in2;
+ out2 = tmp0 ^ in7;
+ out6 = tmp0 ^ in6;
+ out1 = tmp1 ^ in6;
+ out5 = tmp1 ^ out7 ^ in5;
+ out4 = out1 ^ in0 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_2B(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in1 ^ in6;
+ out7 = in2 ^ in4;
+ tmp0 = in0 ^ in5;
+ tmp1 = in2 ^ in7;
+ out6 = in1 ^ in3;
+ out1 = out4 ^ in0 ^ in4;
+ out3 = tmp0 ^ out7;
+ out0 = tmp0 ^ in3;
+ out5 = tmp1 ^ in0;
+ out2 = tmp1 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_2C(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = in2 ^ in3 ^ in4;
+ tmp2 = tmp0 ^ in6;
+ out4 = tmp1 ^ in1;
+ out5 = tmp1 ^ in0 ^ in5;
+ tmp3 = tmp2 ^ in4;
+ out6 = tmp2 ^ out4;
+ out7 = tmp3 ^ in7;
+ out2 = tmp3 ^ out5;
+ out3 = out6 ^ in0;
+ out0 = tmp1 ^ out7;
+ out1 = tmp0 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_2D(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in3;
+ out4 = tmp0 ^ in1;
+ tmp1 = tmp0 ^ in0;
+ out2 = tmp1 ^ in6;
+ out5 = tmp1 ^ in4;
+ tmp2 = out2 ^ in2;
+ tmp3 = tmp2 ^ in5;
+ out0 = tmp3 ^ in7;
+ out7 = tmp3 ^ out5;
+ out6 = out4 ^ out7 ^ in6;
+ out3 = tmp2 ^ out6;
+ out1 = out0 ^ out6 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_2E(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in4 ^ in7;
+ out0 = in3 ^ in5 ^ in6;
+ tmp1 = tmp0 ^ in0;
+ tmp2 = tmp0 ^ in2;
+ out1 = tmp1 ^ in6;
+ out4 = tmp2 ^ in1;
+ out7 = tmp2 ^ in5;
+ out3 = out0 ^ out4 ^ in0;
+ out2 = out3 ^ out7 ^ in7;
+ out6 = tmp1 ^ out2;
+ out5 = tmp1 ^ out7 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_2F(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in2 ^ in5;
+ out4 = in1 ^ in2 ^ in7;
+ out6 = in1 ^ in3 ^ in4;
+ out5 = tmp0 ^ in2;
+ tmp2 = tmp0 ^ in6;
+ out7 = tmp1 ^ in4;
+ out0 = tmp2 ^ in5;
+ out2 = tmp2 ^ out4;
+ out1 = tmp2 ^ out6 ^ in7;
+ out3 = tmp1 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_30(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in4 ^ in5;
+ tmp0 = in3 ^ in6;
+ tmp1 = in4 ^ in7;
+ out6 = in1 ^ in2 ^ in5;
+ out3 = tmp0 ^ in5;
+ out4 = tmp0 ^ in0;
+ out7 = tmp0 ^ in2;
+ out0 = tmp1 ^ in3;
+ out2 = tmp1 ^ out3;
+ out5 = tmp1 ^ in0 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_31(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in5 ^ in6;
+ tmp0 = in4 ^ in5;
+ tmp1 = in0 ^ in3 ^ in4;
+ tmp2 = out3 ^ in2;
+ out1 = tmp0 ^ in1;
+ out0 = tmp1 ^ in7;
+ out4 = tmp1 ^ in6;
+ out6 = tmp2 ^ in1;
+ out2 = tmp2 ^ out0 ^ in0;
+ out5 = out1 ^ in0 ^ in7;
+ out7 = tmp0 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_32(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in3 ^ in4;
+ out7 = in2 ^ in3;
+ tmp0 = in5 ^ in6;
+ tmp1 = in0 ^ in7;
+ out6 = in1 ^ in2;
+ out1 = in0 ^ in4 ^ in5;
+ out2 = tmp0 ^ out0 ^ in1;
+ out3 = tmp0 ^ out7 ^ in7;
+ out4 = tmp1 ^ in6;
+ out5 = tmp1 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_33(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in3;
+ tmp1 = in0 ^ in4;
+ tmp2 = in1 ^ in5;
+ out6 = in1 ^ in2 ^ in6;
+ out7 = tmp0 ^ in7;
+ out0 = tmp1 ^ in3;
+ out1 = tmp1 ^ tmp2;
+ tmp3 = tmp2 ^ in7;
+ tmp4 = tmp2 ^ in4 ^ in6;
+ out5 = tmp3 ^ in0;
+ out3 = tmp3 ^ out6;
+ out4 = tmp4 ^ out5;
+ out2 = tmp0 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_34(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in4;
+ tmp1 = in4 ^ in5;
+ tmp2 = tmp0 ^ in1;
+ tmp3 = tmp0 ^ in6;
+ out1 = tmp1 ^ in7;
+ tmp4 = tmp1 ^ in2;
+ out5 = tmp2 ^ in0;
+ out3 = tmp2 ^ out1;
+ out0 = tmp3 ^ in7;
+ out7 = tmp3 ^ tmp4;
+ out6 = tmp4 ^ in1;
+ out2 = out3 ^ out5 ^ in3;
+ out4 = tmp4 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_35(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in6;
+ tmp1 = in5 ^ in7;
+ out7 = tmp0 ^ tmp1 ^ in3;
+ out3 = tmp1 ^ in1;
+ out1 = out3 ^ in4;
+ tmp2 = out1 ^ in7;
+ out5 = tmp2 ^ in0 ^ in3;
+ out6 = tmp0 ^ tmp2;
+ out0 = out3 ^ out5 ^ in6;
+ out4 = tmp0 ^ out0;
+ out2 = out4 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_36(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in0 ^ in2;
+ tmp0 = in1 ^ in3;
+ out0 = in3 ^ in4 ^ in6;
+ out6 = in1 ^ in2 ^ in4;
+ out5 = tmp0 ^ in0;
+ tmp1 = out5 ^ in5;
+ out2 = tmp1 ^ in4;
+ out3 = tmp1 ^ out4;
+ out1 = tmp0 ^ out2 ^ in7;
+ out7 = out3 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_37(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in2 ^ in4;
+ tmp2 = tmp0 ^ in6;
+ out3 = tmp0 ^ in5;
+ out4 = tmp1 ^ in0;
+ out6 = tmp2 ^ in4;
+ out1 = out3 ^ out4 ^ in7;
+ tmp3 = out4 ^ in1 ^ in3;
+ out7 = tmp3 ^ out1;
+ out2 = tmp3 ^ in5;
+ out5 = tmp1 ^ out2;
+ out0 = tmp2 ^ tmp3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_38(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in0 ^ in3;
+ tmp0 = in3 ^ in4;
+ tmp1 = in5 ^ in7;
+ tmp2 = out3 ^ in1;
+ out2 = tmp0 ^ in6;
+ out0 = tmp0 ^ tmp1;
+ out4 = tmp1 ^ tmp2;
+ out7 = out2 ^ in2;
+ out1 = out2 ^ in3 ^ in5;
+ out6 = out4 ^ in0 ^ in2;
+ out5 = tmp2 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_39(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in0;
+ tmp0 = in1 ^ in5;
+ tmp1 = tmp0 ^ in4;
+ out1 = tmp1 ^ in6;
+ out5 = out1 ^ in0 ^ in2;
+ tmp2 = tmp0 ^ out5;
+ out2 = tmp2 ^ in0 ^ in3;
+ out7 = out2 ^ in7;
+ out6 = tmp1 ^ out7;
+ out4 = tmp2 ^ out6;
+ out0 = out4 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_3A(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in0 ^ in2;
+ tmp2 = in3 ^ in4;
+ tmp3 = in1 ^ in6;
+ tmp4 = in3 ^ in7;
+ out4 = tmp0 ^ in5;
+ out5 = tmp1 ^ tmp3;
+ out3 = tmp1 ^ tmp4;
+ out0 = tmp2 ^ in5;
+ out7 = tmp2 ^ in2;
+ tmp5 = tmp3 ^ in4;
+ out2 = tmp4 ^ tmp5;
+ out1 = tmp5 ^ out4;
+ out6 = tmp0 ^ out3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_3B(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in6;
+ tmp1 = in2 ^ in7;
+ tmp2 = tmp0 ^ in3;
+ out3 = tmp1 ^ in0;
+ out6 = tmp1 ^ tmp2;
+ out2 = out6 ^ in4;
+ out7 = tmp0 ^ out2;
+ out0 = out3 ^ out7 ^ in5;
+ out5 = out0 ^ out2 ^ in7;
+ out1 = tmp2 ^ out0;
+ out4 = out1 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_3C(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in2 ^ in7;
+ tmp2 = in1 ^ in6 ^ in7;
+ out2 = tmp0 ^ in4;
+ out3 = tmp0 ^ tmp2;
+ out4 = tmp1 ^ out3 ^ in5;
+ out5 = tmp2 ^ out2 ^ in2;
+ out1 = out4 ^ out5 ^ in6;
+ out0 = out1 ^ in3;
+ out7 = tmp1 ^ out0;
+ out6 = tmp2 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_3D(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in2;
+ tmp1 = tmp0 ^ in3;
+ out2 = tmp1 ^ in4;
+ tmp2 = out2 ^ in5;
+ out4 = tmp2 ^ in1 ^ in6;
+ out5 = out4 ^ in7;
+ out6 = out5 ^ in0;
+ out7 = out6 ^ in1;
+ out0 = tmp0 ^ out7;
+ out1 = tmp1 ^ out5;
+ out3 = tmp2 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_3E(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in5;
+ tmp1 = tmp0 ^ in4;
+ out0 = tmp1 ^ in6;
+ out7 = tmp1 ^ in2;
+ out6 = out7 ^ in1 ^ in5 ^ in7;
+ out2 = out6 ^ in0 ^ in2;
+ out4 = out0 ^ out6 ^ in0;
+ out5 = tmp0 ^ out4;
+ out3 = out5 ^ in7;
+ out1 = out3 ^ out6 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_3F(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in1;
+ out3 = tmp0 ^ in2 ^ in6;
+ tmp1 = out3 ^ in5 ^ in7;
+ out4 = tmp1 ^ in4;
+ out5 = tmp1 ^ in3;
+ out1 = out4 ^ in2;
+ out7 = out1 ^ out3 ^ in3;
+ out2 = tmp0 ^ out7 ^ in5;
+ tmp2 = out2 ^ in0;
+ out6 = tmp2 ^ in6;
+ out0 = tmp1 ^ tmp2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_40(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in3 ^ in7;
+ tmp0 = in3 ^ in4;
+ tmp1 = in6 ^ in7;
+ out4 = tmp0 ^ in2;
+ out5 = tmp0 ^ in5;
+ out0 = tmp1 ^ in2;
+ out7 = tmp1 ^ in1 ^ in5;
+ out2 = out0 ^ in4;
+ out3 = out2 ^ out5 ^ in7;
+ out6 = out3 ^ out4 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_41(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in2 ^ in3;
+ tmp0 = in5 ^ in6;
+ tmp1 = in6 ^ in7;
+ out5 = in3 ^ in4;
+ out1 = in1 ^ in3 ^ in7;
+ out6 = in0 ^ in4 ^ in5;
+ out3 = tmp0 ^ in2;
+ out7 = tmp0 ^ in1;
+ out2 = tmp1 ^ in4;
+ out0 = tmp1 ^ in0 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_42(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in2 ^ in6;
+ out5 = in3 ^ in5;
+ out1 = in0 ^ in3 ^ in7;
+ out7 = in1 ^ in5 ^ in7;
+ out4 = in2 ^ in4 ^ in7;
+ out6 = in0 ^ in4 ^ in6;
+ out2 = out0 ^ in1 ^ in4;
+ out3 = out5 ^ in6 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_43(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out5 = in3;
+ out7 = in1 ^ in5;
+ out4 = in2 ^ in7;
+ out6 = in0 ^ in4;
+ out0 = in0 ^ in2 ^ in6;
+ out3 = in5 ^ in6 ^ in7;
+ out2 = in1 ^ in4 ^ in6;
+ out1 = in0 ^ in1 ^ in3 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_44(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in3;
+ out0 = in2 ^ in7;
+ tmp0 = in4 ^ in7;
+ out7 = in1 ^ in6 ^ in7;
+ out6 = in0 ^ in5 ^ in6;
+ out4 = tmp0 ^ in3 ^ in6;
+ out3 = out0 ^ in1 ^ in3 ^ in5;
+ out2 = out0 ^ in0 ^ in4;
+ out5 = tmp0 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_45(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in1 ^ in3;
+ out7 = in1 ^ in6;
+ out5 = in4 ^ in7;
+ out6 = in0 ^ in5;
+ out0 = in0 ^ in2 ^ in7;
+ out4 = in3 ^ in6 ^ in7;
+ out2 = out5 ^ in0;
+ out3 = out0 ^ out6 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_46(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in2;
+ out1 = in0 ^ in3;
+ out7 = in1 ^ in7;
+ out4 = in4 ^ in6;
+ out5 = in5 ^ in7;
+ out6 = in0 ^ in6;
+ out3 = in1 ^ in3 ^ in5;
+ out2 = out4 ^ out6 ^ in1 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_47(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in6;
+ out7 = in1;
+ out5 = in7;
+ out6 = in0;
+ tmp0 = in0 ^ in1;
+ out3 = in1 ^ in5;
+ out0 = in0 ^ in2;
+ out1 = tmp0 ^ in3;
+ out2 = tmp0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_48(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in3;
+ out1 = in3 ^ in6 ^ in7;
+ out3 = tmp0 ^ in0;
+ out0 = tmp0 ^ out1 ^ in5;
+ tmp1 = out0 ^ in4;
+ out2 = tmp1 ^ in7;
+ out5 = tmp1 ^ in3;
+ out4 = out5 ^ in1;
+ out7 = tmp0 ^ out4;
+ out6 = tmp1 ^ out3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_49(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in0 ^ in2;
+ tmp0 = in2 ^ in5;
+ out2 = in4 ^ in5 ^ in6;
+ tmp1 = tmp0 ^ out2 ^ in3;
+ out7 = out2 ^ in1;
+ out5 = tmp1 ^ in7;
+ out4 = out5 ^ out7 ^ in6;
+ out1 = tmp0 ^ out4;
+ out6 = out1 ^ out7 ^ in0;
+ out0 = tmp1 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_4A(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in6;
+ tmp1 = in3 ^ in7;
+ out0 = tmp0 ^ in5;
+ out3 = tmp1 ^ in0;
+ out5 = tmp1 ^ out0;
+ out4 = out0 ^ in1 ^ in4;
+ out1 = out3 ^ in6;
+ out2 = out4 ^ in7;
+ out6 = out1 ^ in4;
+ out7 = tmp0 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_4B(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in0 ^ in7;
+ tmp0 = in1 ^ in5;
+ tmp1 = in2 ^ in6;
+ tmp2 = out3 ^ in3;
+ out7 = tmp0 ^ in4;
+ out4 = tmp0 ^ tmp1;
+ tmp3 = tmp1 ^ in0;
+ out6 = tmp2 ^ in4;
+ out5 = tmp2 ^ tmp3;
+ out1 = tmp2 ^ in1 ^ in6;
+ out2 = out7 ^ in6 ^ in7;
+ out0 = tmp3 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_4C(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in3 ^ in6;
+ tmp0 = in2 ^ in5;
+ tmp1 = out1 ^ in5 ^ in7;
+ out0 = tmp0 ^ in7;
+ tmp2 = tmp0 ^ in4;
+ out6 = tmp1 ^ in0;
+ out2 = tmp2 ^ in0;
+ out5 = tmp2 ^ in6;
+ out3 = tmp0 ^ out6 ^ in1;
+ out7 = out0 ^ out5 ^ in1;
+ out4 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_4D(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in5;
+ tmp1 = in1 ^ in6;
+ out4 = in1 ^ in3 ^ in5;
+ tmp2 = tmp0 ^ in7;
+ out2 = tmp0 ^ in4;
+ out1 = tmp1 ^ in3;
+ out7 = tmp1 ^ in4;
+ out0 = tmp2 ^ in2;
+ out6 = tmp2 ^ in3;
+ out5 = out7 ^ in1 ^ in2;
+ out3 = tmp1 ^ out0 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_4E(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in2 ^ in5;
+ out7 = in1 ^ in4 ^ in7;
+ out1 = in0 ^ in3 ^ in6;
+ out5 = out0 ^ in6;
+ out4 = out7 ^ in5;
+ out3 = out1 ^ in1;
+ out6 = out1 ^ in7;
+ out2 = out4 ^ in0 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_4F(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out5 = in2 ^ in6;
+ out7 = in1 ^ in4;
+ out3 = in0 ^ in1 ^ in6;
+ out4 = in1 ^ in5 ^ in7;
+ out0 = in0 ^ in2 ^ in5;
+ out6 = in0 ^ in3 ^ in7;
+ out1 = out3 ^ in3;
+ out2 = out4 ^ in0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_50(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in2 ^ in7;
+ tmp0 = in3 ^ in5;
+ out0 = out2 ^ in4 ^ in6;
+ out1 = tmp0 ^ in7;
+ tmp1 = tmp0 ^ in6;
+ out3 = out0 ^ in3;
+ out7 = tmp1 ^ in1;
+ tmp2 = tmp1 ^ in0;
+ out5 = out3 ^ in1 ^ in2;
+ out4 = tmp2 ^ in2;
+ out6 = tmp2 ^ out3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_51(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in7;
+ out3 = in2 ^ in4 ^ in6 ^ in7;
+ out0 = out3 ^ in0;
+ out6 = out0 ^ in5;
+ out4 = out6 ^ in3 ^ in7;
+ out1 = out0 ^ out4 ^ in1;
+ out7 = out1 ^ in6;
+ out5 = out7 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_52(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in1 ^ in2;
+ tmp0 = in2 ^ in4;
+ tmp1 = in3 ^ in5;
+ tmp2 = in3 ^ in6;
+ tmp3 = in0 ^ in7;
+ out0 = tmp0 ^ in6;
+ out6 = tmp0 ^ tmp3;
+ out7 = tmp1 ^ in1;
+ out1 = tmp1 ^ tmp3;
+ out3 = tmp2 ^ in4;
+ out5 = tmp2 ^ in1 ^ in7;
+ out4 = tmp2 ^ out1 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_53(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in1;
+ out3 = in4 ^ in6;
+ out0 = out3 ^ in0 ^ in2;
+ out6 = out0 ^ in7;
+ out4 = out6 ^ in5;
+ out7 = out0 ^ out4 ^ in1 ^ in3;
+ out1 = out7 ^ in0;
+ out5 = out7 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_54(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in3 ^ in5;
+ tmp0 = in1 ^ in3;
+ tmp1 = in2 ^ in4;
+ tmp2 = in0 ^ in7;
+ out5 = in1 ^ in4 ^ in6;
+ out4 = tmp2 ^ out1;
+ out7 = tmp0 ^ in6;
+ out3 = tmp0 ^ tmp1;
+ out0 = tmp1 ^ in7;
+ tmp3 = tmp2 ^ in2;
+ out2 = tmp3 ^ in6;
+ out6 = tmp3 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_55(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in3;
+ tmp1 = in1 ^ in4;
+ tmp2 = in6 ^ in7;
+ out7 = tmp0 ^ tmp2;
+ out1 = tmp0 ^ in5;
+ out3 = tmp1 ^ in2;
+ out5 = tmp1 ^ in5 ^ in6;
+ out2 = tmp2 ^ in0;
+ out4 = out5 ^ out7 ^ in0;
+ out6 = out2 ^ in2 ^ in5;
+ out0 = out5 ^ out6 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_56(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in2 ^ in4;
+ tmp0 = in0 ^ in2;
+ out4 = in0 ^ in5;
+ out7 = in1 ^ in3;
+ out5 = in1 ^ in6;
+ out6 = tmp0 ^ in7;
+ out2 = tmp0 ^ out5;
+ out1 = out4 ^ in3;
+ out3 = out7 ^ in4 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_57(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in5;
+ tmp1 = in1 ^ in7;
+ out0 = in0 ^ in2 ^ in4;
+ out5 = in1 ^ in5 ^ in6;
+ out4 = tmp0 ^ in4;
+ out1 = tmp0 ^ in1 ^ in3;
+ out2 = tmp0 ^ out5;
+ out3 = tmp1 ^ in4;
+ out7 = tmp1 ^ in3;
+ out6 = tmp1 ^ out2 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_58(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in2 ^ in5;
+ tmp0 = in2 ^ in3 ^ in4;
+ out5 = tmp0 ^ in1;
+ out6 = tmp0 ^ in0 ^ in5;
+ out3 = out6 ^ in7;
+ tmp1 = out2 ^ out5;
+ out7 = tmp1 ^ in6;
+ out4 = tmp1 ^ out3 ^ in3;
+ out0 = out4 ^ out7 ^ in0;
+ out1 = tmp0 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_59(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in5;
+ tmp0 = in0 ^ in5 ^ in7;
+ out3 = tmp0 ^ in2 ^ in4;
+ out0 = out3 ^ in6;
+ tmp1 = out0 ^ in7;
+ out6 = tmp1 ^ in3;
+ out5 = out6 ^ in0 ^ in1 ^ in6;
+ out4 = tmp0 ^ out5;
+ out1 = tmp1 ^ out4;
+ out7 = out1 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_5A(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in2 ^ in5;
+ out5 = tmp0 ^ in3;
+ out4 = tmp0 ^ in0;
+ tmp2 = tmp1 ^ in4;
+ out2 = tmp1 ^ in1 ^ in7;
+ out7 = tmp2 ^ out5;
+ out6 = out4 ^ out7 ^ in5;
+ out0 = tmp2 ^ in6;
+ out1 = out0 ^ out6 ^ in7;
+ out3 = tmp1 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_5B(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in3;
+ tmp1 = in0 ^ in4;
+ tmp2 = in1 ^ in5;
+ out5 = tmp0 ^ tmp2;
+ tmp3 = tmp1 ^ in6;
+ out3 = tmp1 ^ in5;
+ out2 = tmp2 ^ in7;
+ tmp4 = out3 ^ in2;
+ out7 = out2 ^ in3 ^ in4;
+ out0 = tmp4 ^ in6;
+ out6 = tmp0 ^ tmp3;
+ out4 = tmp2 ^ tmp4;
+ out1 = tmp3 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_5C(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in6;
+ tmp1 = in0 ^ in2 ^ in5;
+ out1 = tmp0 ^ in5;
+ tmp2 = tmp0 ^ in1;
+ out2 = tmp1 ^ in6;
+ out6 = tmp1 ^ in3;
+ out4 = tmp2 ^ in0;
+ out7 = tmp2 ^ in4;
+ out3 = tmp1 ^ out7;
+ out0 = out3 ^ out4 ^ in7;
+ out5 = out0 ^ in1 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_5D(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in0 ^ in6;
+ out2 = tmp1 ^ in5;
+ tmp2 = out2 ^ in3;
+ out6 = tmp2 ^ in2;
+ out1 = tmp0 ^ tmp2;
+ tmp3 = out1 ^ in4 ^ in5;
+ out4 = tmp3 ^ in0;
+ out7 = tmp3 ^ in7;
+ tmp4 = out4 ^ out6;
+ out5 = tmp4 ^ in7;
+ out0 = tmp0 ^ out5;
+ out3 = tmp1 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_5E(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = in3 ^ in5;
+ tmp2 = in1 ^ in7;
+ out7 = in1 ^ in3 ^ in4;
+ out0 = tmp0 ^ in4;
+ tmp3 = tmp1 ^ in0;
+ out5 = tmp2 ^ in2;
+ out1 = tmp3 ^ in6;
+ out6 = tmp0 ^ tmp3;
+ tmp4 = tmp2 ^ out1;
+ out3 = tmp4 ^ in4;
+ out4 = tmp1 ^ tmp4;
+ out2 = tmp0 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_5F(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in5;
+ tmp1 = in0 ^ in6;
+ tmp2 = tmp0 ^ in7;
+ tmp3 = tmp1 ^ in3;
+ out2 = tmp1 ^ tmp2;
+ out5 = tmp2 ^ in2;
+ out6 = tmp3 ^ in2;
+ out3 = out2 ^ in4;
+ out4 = out3 ^ in5;
+ out1 = tmp0 ^ tmp3;
+ out7 = tmp3 ^ out4;
+ out0 = out4 ^ out5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_60(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in2 ^ in5;
+ tmp0 = in3 ^ in6;
+ out1 = in3 ^ in4 ^ in7;
+ out7 = out4 ^ in1;
+ tmp1 = out4 ^ in4;
+ out0 = tmp0 ^ in2;
+ out5 = tmp0 ^ in0;
+ out2 = tmp0 ^ tmp1;
+ out3 = tmp1 ^ in7;
+ out6 = out3 ^ out7 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_61(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in5;
+ out4 = tmp0 ^ in4;
+ tmp1 = out4 ^ in3;
+ out3 = tmp1 ^ in7;
+ out2 = tmp1 ^ in2 ^ in6;
+ out1 = tmp0 ^ out3 ^ in1;
+ out0 = out2 ^ out4 ^ in0;
+ out7 = tmp1 ^ out1;
+ out6 = out0 ^ out1 ^ in2;
+ out5 = tmp0 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_62(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in4 ^ in5;
+ tmp0 = in0 ^ in3 ^ in4;
+ out1 = tmp0 ^ in7;
+ out5 = tmp0 ^ in6;
+ tmp1 = out1 ^ in0;
+ tmp2 = tmp1 ^ out3;
+ out4 = tmp2 ^ in2;
+ tmp3 = tmp2 ^ in1;
+ out0 = out4 ^ in5 ^ in6;
+ out7 = tmp3 ^ out0;
+ out6 = tmp0 ^ tmp3;
+ out2 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_63(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in4;
+ tmp1 = in1 ^ in7;
+ out3 = tmp0 ^ in5;
+ tmp2 = out3 ^ in6;
+ out4 = out3 ^ in2 ^ in7;
+ out5 = tmp2 ^ in0;
+ tmp3 = out5 ^ in3;
+ out0 = tmp3 ^ out4;
+ out2 = tmp1 ^ tmp2;
+ out6 = tmp1 ^ tmp3;
+ tmp4 = tmp0 ^ out2;
+ out1 = tmp4 ^ out5;
+ out7 = tmp4 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_64(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in2 ^ in3;
+ out1 = in3 ^ in4;
+ out7 = in1 ^ in2;
+ tmp0 = in4 ^ in5;
+ tmp1 = in0 ^ in7;
+ out4 = in5 ^ in6 ^ in7;
+ out2 = tmp0 ^ out0 ^ in0;
+ out3 = tmp0 ^ out7 ^ in6;
+ out5 = tmp1 ^ in6;
+ out6 = tmp1 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_65(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in4 ^ in5;
+ tmp2 = in6 ^ in7;
+ out7 = in1 ^ in2 ^ in7;
+ out1 = in1 ^ in3 ^ in4;
+ out0 = tmp0 ^ in2;
+ out2 = tmp0 ^ tmp1;
+ out4 = tmp1 ^ tmp2;
+ tmp3 = tmp2 ^ in0;
+ out3 = out4 ^ out7 ^ in3;
+ out5 = tmp3 ^ in5;
+ out6 = tmp3 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_66(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in2 ^ in3;
+ tmp2 = in0 ^ in4;
+ out7 = tmp0 ^ in6;
+ out0 = tmp1 ^ in7;
+ out1 = tmp2 ^ in3;
+ tmp3 = tmp2 ^ in6;
+ tmp4 = out1 ^ in5;
+ out5 = tmp3 ^ in7;
+ out4 = tmp3 ^ tmp4;
+ out2 = tmp0 ^ tmp4 ^ in7;
+ out6 = tmp1 ^ out2 ^ in4;
+ out3 = tmp3 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_67(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = tmp0 ^ in1;
+ tmp2 = tmp0 ^ in7;
+ out1 = tmp1 ^ in4;
+ out0 = tmp2 ^ in2;
+ tmp3 = out1 ^ in7;
+ out2 = tmp3 ^ in5;
+ out3 = out2 ^ in0 ^ in6;
+ out7 = tmp1 ^ out0 ^ in6;
+ out5 = tmp1 ^ out3;
+ out4 = tmp2 ^ out5;
+ out6 = tmp3 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_68(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in4;
+ tmp1 = in2 ^ in3 ^ in5;
+ tmp2 = tmp0 ^ in1;
+ tmp3 = tmp0 ^ in6;
+ out0 = tmp1 ^ in6;
+ out6 = tmp2 ^ in0;
+ out7 = tmp1 ^ tmp2;
+ out1 = tmp3 ^ in7;
+ out2 = out1 ^ in2;
+ out4 = tmp2 ^ out2;
+ out3 = out4 ^ out6 ^ in3;
+ out5 = tmp3 ^ out3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_69(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in6 ^ in7;
+ out2 = tmp0 ^ in3 ^ in4;
+ out1 = out2 ^ in1;
+ out3 = out2 ^ in0 ^ in2;
+ out4 = out1 ^ in2 ^ in3;
+ out6 = out1 ^ in0 ^ in7;
+ out7 = out4 ^ in5 ^ in6;
+ out5 = out4 ^ out6 ^ in5;
+ out0 = tmp0 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_6A(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in6;
+ out3 = in0 ^ in4 ^ in6;
+ tmp1 = tmp0 ^ in3;
+ out4 = tmp1 ^ in1;
+ tmp2 = tmp1 ^ in7;
+ out2 = out4 ^ in4;
+ out0 = tmp2 ^ in5;
+ out5 = tmp2 ^ out3;
+ out7 = out2 ^ in3 ^ in5;
+ out1 = tmp0 ^ out5;
+ out6 = tmp1 ^ out7 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_6B(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in4 ^ in6;
+ out2 = tmp0 ^ in1 ^ in3;
+ out4 = out2 ^ in2;
+ tmp1 = out2 ^ in0;
+ out7 = out4 ^ in3 ^ in5 ^ in7;
+ out1 = tmp1 ^ in7;
+ out3 = tmp1 ^ in1;
+ out6 = tmp1 ^ in5;
+ out0 = tmp1 ^ out7 ^ in6;
+ out5 = tmp0 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_6C(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in1;
+ tmp0 = in2 ^ in3;
+ out5 = in0 ^ in2;
+ out1 = in3 ^ in4 ^ in6;
+ tmp1 = out5 ^ in1;
+ out0 = tmp0 ^ in5;
+ out6 = tmp0 ^ tmp1;
+ out3 = tmp1 ^ in4;
+ out7 = out3 ^ in0;
+ out2 = out6 ^ out7 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_6D(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in1 ^ in4;
+ tmp0 = in0 ^ in2;
+ tmp1 = out4 ^ in3;
+ out7 = out4 ^ in2 ^ in7;
+ out5 = tmp0 ^ in5;
+ out3 = tmp0 ^ tmp1;
+ out1 = tmp1 ^ in6;
+ out0 = out5 ^ in3;
+ out2 = out3 ^ out7 ^ in4;
+ out6 = out1 ^ in0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_6E(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in3;
+ tmp1 = in0 ^ in4;
+ out4 = tmp0 ^ in7;
+ out6 = tmp0 ^ in0 ^ in5;
+ out5 = tmp1 ^ in2;
+ tmp2 = tmp1 ^ in3;
+ out3 = tmp2 ^ out4;
+ out1 = tmp2 ^ in6;
+ out2 = tmp0 ^ out5;
+ out0 = out2 ^ out3 ^ in5;
+ out7 = out1 ^ out2 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_6F(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in7;
+ tmp1 = tmp0 ^ in4;
+ tmp2 = tmp0 ^ in0 ^ in2;
+ out4 = tmp1 ^ in1;
+ out0 = tmp2 ^ in5;
+ out3 = out4 ^ in0;
+ out2 = out3 ^ in7;
+ out1 = out2 ^ in6;
+ out6 = out1 ^ in4 ^ in5;
+ out7 = tmp2 ^ out1;
+ out5 = tmp1 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_70(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in2;
+ tmp0 = in2 ^ in4;
+ out2 = in2 ^ in3 ^ in5;
+ tmp1 = tmp0 ^ in6;
+ tmp2 = out2 ^ in7;
+ out0 = tmp1 ^ in3;
+ out4 = tmp1 ^ in0;
+ out7 = tmp2 ^ in1;
+ out6 = out4 ^ in1;
+ out5 = out7 ^ in0 ^ in2;
+ out1 = tmp0 ^ tmp2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_71(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in3 ^ in5;
+ out3 = in2 ^ in3;
+ tmp0 = in0 ^ in2;
+ tmp1 = out2 ^ in1;
+ out4 = tmp0 ^ in6;
+ tmp2 = tmp0 ^ in1;
+ out7 = tmp1 ^ in2;
+ out1 = tmp1 ^ in4 ^ in7;
+ out0 = out4 ^ in3 ^ in4;
+ out6 = tmp2 ^ in4;
+ out5 = tmp2 ^ out3 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_72(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in7;
+ tmp0 = in0 ^ in4;
+ tmp1 = tmp0 ^ in3 ^ in7;
+ out1 = tmp1 ^ in5;
+ out5 = out1 ^ in1;
+ tmp2 = tmp0 ^ out5;
+ out2 = tmp2 ^ in2;
+ out7 = out2 ^ in6;
+ out6 = tmp1 ^ out7;
+ out4 = tmp2 ^ out6;
+ out0 = out4 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_73(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in3 ^ in7;
+ out2 = out3 ^ in1 ^ in5;
+ out1 = out2 ^ in0 ^ in4;
+ out5 = out1 ^ in5;
+ out6 = out1 ^ out3 ^ in2;
+ out0 = out2 ^ out6 ^ in6;
+ out7 = out0 ^ out1 ^ in3;
+ out4 = out0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_74(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in4;
+ tmp1 = in1 ^ in2 ^ in6;
+ out4 = in0 ^ in4 ^ in7;
+ out5 = in0 ^ in1 ^ in5;
+ out0 = tmp0 ^ in2;
+ out1 = tmp0 ^ in5;
+ out3 = tmp1 ^ in7;
+ out6 = tmp1 ^ in0;
+ out2 = tmp1 ^ out5 ^ in3;
+ out7 = out3 ^ in3 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_75(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in0 ^ in7;
+ tmp0 = in1 ^ in3;
+ out5 = in0 ^ in1;
+ out7 = tmp0 ^ in2;
+ tmp1 = tmp0 ^ in4;
+ out6 = out5 ^ in2;
+ tmp2 = out7 ^ in6;
+ out1 = tmp1 ^ in5;
+ out0 = tmp1 ^ out6;
+ out3 = tmp2 ^ in7;
+ out2 = tmp2 ^ out6 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_76(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in1 ^ in6;
+ tmp0 = in0 ^ in5;
+ tmp1 = in3 ^ in7;
+ tmp2 = tmp0 ^ in4;
+ tmp3 = tmp1 ^ in2;
+ out5 = tmp2 ^ in1;
+ out1 = tmp2 ^ in3;
+ out0 = tmp3 ^ in4;
+ out4 = out1 ^ in5;
+ out7 = tmp3 ^ out3;
+ out2 = tmp0 ^ out7;
+ out6 = tmp1 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_77(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in0 ^ in3;
+ tmp0 = in1 ^ in4;
+ tmp1 = in1 ^ in6;
+ tmp2 = out4 ^ in5;
+ out5 = tmp0 ^ in0;
+ out1 = tmp0 ^ tmp2;
+ out3 = tmp1 ^ in3;
+ out2 = tmp1 ^ tmp2 ^ in7;
+ out7 = out3 ^ in2;
+ tmp3 = out7 ^ in6;
+ out6 = tmp2 ^ tmp3;
+ out0 = tmp3 ^ out5 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_78(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in2 ^ in7;
+ tmp2 = in0 ^ in5 ^ in6;
+ out2 = tmp1 ^ in3;
+ out3 = tmp2 ^ in2;
+ out5 = out3 ^ in1 ^ in3;
+ out0 = tmp0 ^ out3 ^ in4;
+ out1 = tmp1 ^ out0;
+ out4 = out1 ^ out5 ^ in5;
+ out7 = tmp0 ^ out4;
+ out6 = tmp2 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_79(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in3 ^ in7;
+ tmp0 = in3 ^ in4;
+ tmp1 = in1 ^ in5;
+ tmp2 = tmp1 ^ in2;
+ out4 = tmp2 ^ in0 ^ in7;
+ tmp3 = out4 ^ in5;
+ out5 = tmp3 ^ out2 ^ in6;
+ out7 = tmp0 ^ tmp2;
+ out6 = tmp0 ^ tmp3;
+ out3 = tmp1 ^ out5;
+ out0 = out3 ^ in4;
+ out1 = tmp3 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_7A(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in2;
+ out2 = tmp0 ^ in3;
+ tmp1 = out2 ^ in4;
+ out4 = tmp1 ^ in0 ^ in5;
+ out5 = out4 ^ in6;
+ out6 = out5 ^ in7;
+ out7 = out6 ^ in0;
+ out0 = out7 ^ in1;
+ out1 = tmp0 ^ out6;
+ out3 = tmp1 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_7B(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in1 ^ in3;
+ tmp0 = in0 ^ in5;
+ out4 = tmp0 ^ out2 ^ in2;
+ tmp1 = out4 ^ in4;
+ out6 = tmp1 ^ in7;
+ out5 = tmp1 ^ in5 ^ in6;
+ out0 = out6 ^ in1 ^ in6;
+ tmp2 = out0 ^ in2;
+ out1 = tmp2 ^ in1;
+ out3 = tmp2 ^ in4;
+ out7 = tmp0 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_7C(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in5;
+ tmp1 = tmp0 ^ in4;
+ out0 = tmp1 ^ in2;
+ out1 = tmp1 ^ in6;
+ out7 = out0 ^ in1 ^ in5 ^ in7;
+ out5 = out1 ^ out7 ^ in0;
+ out3 = out5 ^ in6;
+ out6 = tmp0 ^ out5;
+ out2 = out6 ^ in1;
+ out4 = out2 ^ out7 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_7D(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = tmp0 ^ in3;
+ tmp2 = tmp0 ^ in6;
+ out7 = tmp1 ^ in4;
+ tmp3 = tmp2 ^ in0;
+ out5 = tmp3 ^ in7;
+ out4 = tmp3 ^ in2 ^ in5;
+ out2 = tmp1 ^ out5;
+ out6 = tmp2 ^ out2;
+ out0 = out4 ^ out7 ^ in6;
+ out1 = tmp3 ^ out0;
+ out3 = out6 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_7E(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in4;
+ tmp1 = in0 ^ in5;
+ out1 = tmp0 ^ tmp1 ^ in6;
+ out3 = tmp1 ^ in1;
+ out4 = out1 ^ in1 ^ in7;
+ tmp2 = out4 ^ in3;
+ out5 = tmp2 ^ in2;
+ out6 = tmp0 ^ out5;
+ out7 = tmp1 ^ out4 ^ in2;
+ out2 = out6 ^ in5 ^ in7;
+ out0 = tmp2 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_7F(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in7;
+ tmp1 = tmp0 ^ in3 ^ in5;
+ tmp2 = tmp1 ^ in0;
+ out0 = tmp2 ^ in4;
+ out6 = tmp2 ^ in1;
+ out3 = tmp0 ^ out6;
+ tmp3 = out3 ^ in6;
+ out1 = tmp3 ^ in4;
+ out2 = tmp3 ^ in5;
+ out4 = tmp3 ^ in7;
+ out5 = tmp1 ^ out1;
+ out7 = out0 ^ out4 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_80(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in3;
+ tmp1 = in4 ^ in5;
+ out1 = in2 ^ in6 ^ in7;
+ out5 = tmp0 ^ in4;
+ tmp2 = tmp0 ^ in1;
+ out6 = tmp1 ^ in3;
+ out7 = tmp1 ^ in0 ^ in6;
+ out4 = tmp2 ^ in7;
+ out3 = tmp2 ^ out6;
+ out2 = out3 ^ out5 ^ in6;
+ out0 = out2 ^ in3 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_81(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in4 ^ in6;
+ tmp1 = tmp0 ^ in3;
+ out6 = tmp1 ^ in5;
+ out5 = out6 ^ in2 ^ in6;
+ out3 = out5 ^ in1;
+ out2 = tmp0 ^ out3;
+ out1 = out3 ^ out6 ^ in7;
+ out4 = tmp1 ^ out1;
+ out7 = out2 ^ out4 ^ in0;
+ out0 = out7 ^ in1 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_82(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in1 ^ in2;
+ tmp0 = in6 ^ in7;
+ out5 = in2 ^ in3;
+ out6 = in3 ^ in4;
+ out7 = in0 ^ in4 ^ in5;
+ out0 = in1 ^ in5 ^ in6;
+ out1 = tmp0 ^ in0 ^ in2;
+ out2 = tmp0 ^ in3 ^ in5;
+ out3 = tmp0 ^ out0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_83(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in2 ^ in5;
+ tmp2 = in3 ^ in6;
+ out4 = in1 ^ in2 ^ in4;
+ out0 = tmp0 ^ in5 ^ in6;
+ out5 = tmp1 ^ in3;
+ tmp3 = tmp1 ^ in7;
+ out6 = tmp2 ^ in4;
+ out2 = tmp2 ^ tmp3;
+ tmp4 = tmp3 ^ out4;
+ out1 = tmp3 ^ out0;
+ out3 = tmp4 ^ in3;
+ out7 = tmp0 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_84(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in2 ^ in6;
+ out6 = in3 ^ in5;
+ out0 = in1 ^ in5 ^ in7;
+ out7 = in0 ^ in4 ^ in6;
+ out4 = in1 ^ in3 ^ in6;
+ out5 = in2 ^ in4 ^ in7;
+ out2 = out6 ^ in0 ^ in1;
+ out3 = out5 ^ in5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_85(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in6;
+ tmp1 = in3 ^ in6;
+ tmp2 = tmp0 ^ in4;
+ out1 = tmp0 ^ in2;
+ out6 = tmp1 ^ in5;
+ out4 = tmp2 ^ in3;
+ tmp3 = out1 ^ out6;
+ out2 = tmp3 ^ in0;
+ out3 = tmp2 ^ tmp3 ^ in7;
+ out7 = out2 ^ out3 ^ in1;
+ out5 = tmp1 ^ out3;
+ out0 = tmp2 ^ out7 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_86(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out6 = in3;
+ out7 = in0 ^ in4;
+ out0 = in1 ^ in5;
+ out5 = in2 ^ in7;
+ out3 = in4 ^ in5 ^ in6;
+ out1 = in0 ^ in2 ^ in6;
+ out4 = in1 ^ in6 ^ in7;
+ out2 = in0 ^ in3 ^ in5 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_87(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out6 = in3 ^ in6;
+ tmp0 = in0 ^ in1;
+ out7 = in0 ^ in4 ^ in7;
+ out5 = in2 ^ in5 ^ in7;
+ out3 = out6 ^ in4 ^ in5;
+ out0 = tmp0 ^ in5;
+ tmp1 = tmp0 ^ in6;
+ out2 = out5 ^ in0 ^ in3;
+ out1 = tmp1 ^ in2;
+ out4 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_88(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in2 ^ in7;
+ tmp0 = in5 ^ in6;
+ out0 = in1 ^ in6 ^ in7;
+ out6 = in4 ^ in5 ^ in7;
+ out3 = out0 ^ out1 ^ in0 ^ in4;
+ out7 = tmp0 ^ in0;
+ tmp1 = tmp0 ^ in3;
+ out2 = out0 ^ in3;
+ out4 = tmp1 ^ in2;
+ out5 = tmp1 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_89(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in7;
+ tmp1 = in2 ^ in7;
+ tmp2 = tmp0 ^ in6;
+ out1 = tmp1 ^ in1;
+ out7 = tmp2 ^ in5;
+ out0 = tmp2 ^ in1;
+ out2 = out1 ^ in3 ^ in6;
+ out6 = out7 ^ in0 ^ in4;
+ out5 = out6 ^ in3;
+ out3 = tmp0 ^ out2 ^ in4;
+ out4 = tmp1 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_8A(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in1 ^ in6;
+ out7 = in0 ^ in5;
+ out2 = in3 ^ in6;
+ out6 = in4 ^ in7;
+ out1 = in0 ^ in2 ^ in7;
+ out3 = out0 ^ out6 ^ in0;
+ out4 = out1 ^ out7 ^ in6;
+ out5 = out2 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_8B(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in3 ^ in6;
+ tmp2 = in5 ^ in7;
+ tmp3 = tmp0 ^ in7;
+ out0 = tmp0 ^ in6;
+ out2 = tmp1 ^ in2;
+ out5 = tmp1 ^ tmp2;
+ out7 = tmp2 ^ in0;
+ tmp4 = tmp3 ^ in4;
+ out1 = tmp3 ^ in2;
+ out6 = tmp4 ^ out0;
+ out4 = out6 ^ in2 ^ in5;
+ out3 = tmp1 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_8C(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in2;
+ out0 = in1 ^ in7;
+ out7 = in0 ^ in6;
+ out5 = in4 ^ in6;
+ out6 = in5 ^ in7;
+ out2 = out0 ^ in0 ^ in3;
+ out3 = out5 ^ out7 ^ in2 ^ in7;
+ out4 = out6 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_8D(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in1 ^ in2;
+ tmp0 = in6 ^ in7;
+ out0 = in0 ^ in1 ^ in7;
+ out5 = in4 ^ in5 ^ in6;
+ out6 = tmp0 ^ in5;
+ out7 = tmp0 ^ in0;
+ out4 = tmp0 ^ out5 ^ in3;
+ out2 = out0 ^ in2 ^ in3;
+ out3 = out2 ^ in1 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_8E(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in1;
+ out4 = in5;
+ out7 = in0;
+ out5 = in6;
+ out6 = in7;
+ out3 = in0 ^ in4;
+ out1 = in0 ^ in2;
+ out2 = in0 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_8F(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in0 ^ in1;
+ tmp0 = in0 ^ in3;
+ out4 = in4 ^ in5;
+ out7 = in0 ^ in7;
+ out5 = in5 ^ in6;
+ out6 = in6 ^ in7;
+ out1 = out0 ^ in2;
+ out2 = tmp0 ^ in2;
+ out3 = tmp0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_90(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in2 ^ in6 ^ in7;
+ out3 = tmp0 ^ in7;
+ out1 = tmp1 ^ in5;
+ tmp2 = out1 ^ in4;
+ out6 = tmp2 ^ in3;
+ out5 = out6 ^ in1;
+ out4 = out5 ^ in0;
+ out0 = tmp0 ^ tmp2;
+ out7 = tmp0 ^ out4;
+ out2 = tmp1 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_91(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in4;
+ tmp1 = tmp0 ^ in3 ^ in5;
+ out2 = tmp1 ^ in1;
+ out6 = tmp1 ^ in7;
+ tmp2 = out2 ^ in5 ^ in7;
+ out3 = tmp2 ^ in4;
+ out5 = tmp2 ^ in6;
+ out1 = tmp1 ^ out5 ^ in2;
+ tmp3 = out1 ^ in0;
+ out4 = tmp3 ^ in3;
+ out0 = tmp0 ^ tmp3;
+ out7 = tmp2 ^ tmp3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_92(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in1;
+ tmp0 = in4 ^ in5;
+ tmp1 = tmp0 ^ in1;
+ out2 = tmp0 ^ in3 ^ in7;
+ out0 = tmp1 ^ in6;
+ out7 = out2 ^ in0;
+ out4 = out0 ^ in0 ^ in2;
+ out5 = out4 ^ out7 ^ in5;
+ out6 = tmp1 ^ out5;
+ out1 = out6 ^ out7 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_93(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in1 ^ in3;
+ tmp0 = in2 ^ in7;
+ tmp1 = out3 ^ in6;
+ tmp2 = tmp0 ^ in4;
+ out5 = tmp0 ^ tmp1;
+ out6 = tmp2 ^ in3;
+ out2 = out6 ^ in5;
+ out0 = out2 ^ out5 ^ in0;
+ out7 = tmp1 ^ out0;
+ out1 = tmp2 ^ out0;
+ out4 = out1 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_94(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in2 ^ in6;
+ tmp0 = in1 ^ in4 ^ in5;
+ out1 = out3 ^ in5;
+ out5 = tmp0 ^ out3;
+ out0 = tmp0 ^ in7;
+ out4 = tmp0 ^ in0 ^ in3;
+ out6 = out1 ^ in3 ^ in7;
+ out2 = out4 ^ in6;
+ out7 = out0 ^ out2 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_95(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in3;
+ out3 = tmp0 ^ in6;
+ tmp1 = tmp0 ^ in7;
+ tmp2 = out3 ^ in0;
+ out6 = tmp1 ^ in5;
+ tmp3 = tmp2 ^ in4;
+ out7 = tmp3 ^ in2;
+ tmp4 = tmp3 ^ in5;
+ out2 = tmp4 ^ in1;
+ tmp5 = out2 ^ in6;
+ out0 = tmp1 ^ tmp5;
+ out1 = tmp5 ^ out7;
+ out4 = tmp2 ^ out1;
+ out5 = tmp4 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_96(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in6 ^ in7;
+ tmp0 = in1 ^ in5;
+ tmp1 = in5 ^ in6;
+ out6 = out3 ^ in2 ^ in3;
+ out0 = tmp0 ^ in4;
+ tmp2 = tmp1 ^ in2;
+ out4 = out0 ^ in0 ^ in7;
+ out1 = tmp2 ^ in0;
+ out5 = tmp2 ^ in1;
+ out7 = tmp0 ^ out4 ^ in3;
+ out2 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_97(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in4;
+ tmp1 = in2 ^ in6;
+ out3 = in3 ^ in6 ^ in7;
+ out7 = tmp0 ^ in3;
+ tmp2 = tmp0 ^ in5;
+ out5 = tmp1 ^ in1;
+ out6 = tmp1 ^ out3;
+ out0 = tmp2 ^ in1;
+ out2 = tmp2 ^ out3 ^ in2;
+ tmp3 = out0 ^ in4;
+ out4 = tmp3 ^ in7;
+ out1 = tmp1 ^ tmp3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_98(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in5 ^ in7;
+ tmp1 = in1 ^ in4 ^ in7;
+ out1 = tmp0 ^ in2;
+ out0 = tmp1 ^ in6;
+ out2 = tmp1 ^ in3;
+ out6 = out0 ^ out1 ^ in1;
+ out5 = tmp0 ^ out2;
+ out3 = tmp1 ^ out6 ^ in0;
+ out7 = out0 ^ out5 ^ in0;
+ out4 = out6 ^ out7 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_99(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in3;
+ out5 = in1 ^ in3 ^ in4;
+ out6 = in2 ^ in4 ^ in5;
+ out4 = tmp0 ^ in2;
+ tmp1 = tmp0 ^ in6;
+ tmp2 = out5 ^ in7;
+ out7 = tmp1 ^ in5;
+ out0 = tmp1 ^ tmp2;
+ out2 = tmp2 ^ in2;
+ out3 = out0 ^ out6 ^ in3;
+ out1 = tmp1 ^ out3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_9A(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in3 ^ in4;
+ tmp0 = in0 ^ in5;
+ tmp1 = in1 ^ in6;
+ out5 = in1 ^ in3 ^ in5;
+ tmp2 = tmp0 ^ in7;
+ out3 = tmp0 ^ tmp1;
+ out0 = tmp1 ^ in4;
+ out7 = tmp2 ^ in3;
+ out1 = tmp2 ^ in2;
+ out6 = out0 ^ in1 ^ in2;
+ out4 = out1 ^ in4 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_9B(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out5 = in1 ^ in3;
+ tmp0 = in3 ^ in5;
+ out6 = in2 ^ in4;
+ out4 = in0 ^ in2 ^ in7;
+ out7 = tmp0 ^ in0;
+ out2 = out6 ^ in3;
+ out1 = out4 ^ in1 ^ in5;
+ out3 = out7 ^ in1 ^ in6;
+ out0 = tmp0 ^ out3 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_9C(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out1 = in2 ^ in5;
+ tmp0 = in0 ^ in3 ^ in6;
+ out3 = out1 ^ in0;
+ out6 = out1 ^ in6;
+ out7 = tmp0 ^ in7;
+ out4 = out7 ^ in4;
+ out2 = out4 ^ in1;
+ out0 = tmp0 ^ out2;
+ out5 = out0 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_9D(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out6 = in2 ^ in5;
+ tmp0 = in0 ^ in3;
+ out5 = in1 ^ in4 ^ in7;
+ out1 = out6 ^ in1;
+ out3 = tmp0 ^ out6;
+ out7 = tmp0 ^ in6;
+ out0 = out5 ^ in0;
+ out4 = out7 ^ in7;
+ out2 = out5 ^ out7 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_9E(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in1 ^ in4;
+ tmp0 = in0 ^ in5;
+ out6 = in2 ^ in6;
+ out7 = in0 ^ in3 ^ in7;
+ out4 = in0 ^ in4 ^ in6;
+ out5 = in1 ^ in5 ^ in7;
+ out1 = tmp0 ^ in2;
+ out3 = tmp0 ^ in7;
+ out2 = out4 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_9F(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out6 = in2;
+ out7 = in0 ^ in3;
+ tmp0 = in0 ^ in1;
+ out4 = in0 ^ in6;
+ out5 = in1 ^ in7;
+ out1 = tmp0 ^ in2 ^ in5;
+ out2 = out7 ^ in2 ^ in4 ^ in6;
+ out3 = out7 ^ in5 ^ in7;
+ out0 = tmp0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_A0(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in6;
+ out2 = tmp0 ^ in7;
+ tmp1 = tmp0 ^ in5;
+ out6 = out2 ^ in3 ^ in4;
+ out0 = tmp1 ^ in3;
+ tmp2 = out0 ^ in2;
+ out3 = tmp2 ^ in7;
+ tmp3 = tmp2 ^ in1;
+ out5 = tmp3 ^ in0;
+ out4 = tmp3 ^ out6;
+ out7 = out5 ^ out6 ^ in1;
+ out1 = tmp1 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_A1(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = tmp0 ^ in1;
+ tmp2 = tmp0 ^ in4;
+ out4 = tmp1 ^ in7;
+ out7 = tmp2 ^ in0;
+ out6 = tmp2 ^ out4 ^ in3;
+ out3 = out4 ^ in6;
+ out2 = out3 ^ in5;
+ out1 = out2 ^ in4;
+ out5 = out1 ^ out6 ^ in0;
+ out0 = tmp1 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_A2(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in6;
+ tmp0 = in1 ^ in3 ^ in5;
+ out3 = tmp0 ^ in6;
+ out4 = tmp0 ^ in2 ^ in4;
+ out0 = out3 ^ in7;
+ out6 = out0 ^ in4;
+ out1 = out0 ^ out4 ^ in0;
+ out7 = out1 ^ in5;
+ out5 = out7 ^ in3 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_A3(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in2 ^ in6;
+ out3 = in1 ^ in5 ^ in6;
+ tmp0 = out2 ^ in0;
+ out4 = out2 ^ out3 ^ in3;
+ tmp1 = tmp0 ^ in4;
+ out0 = tmp0 ^ out4 ^ in7;
+ out5 = tmp1 ^ in3;
+ out7 = tmp1 ^ in5;
+ out1 = tmp1 ^ in1 ^ in7;
+ out6 = tmp1 ^ out0 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_A4(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in3;
+ tmp1 = in2 ^ in4;
+ tmp2 = in2 ^ in5;
+ tmp3 = in0 ^ in7;
+ out0 = tmp0 ^ in5;
+ out6 = tmp0 ^ in6 ^ in7;
+ out1 = tmp1 ^ in6;
+ out7 = tmp1 ^ tmp3;
+ out3 = tmp2 ^ in3;
+ tmp4 = tmp2 ^ out1;
+ out2 = tmp3 ^ in1;
+ out5 = tmp4 ^ out7;
+ out4 = tmp4 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_A5(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in2 ^ in5;
+ tmp0 = in1 ^ in6;
+ tmp1 = in0 ^ in1;
+ tmp2 = in2 ^ in4;
+ out6 = in1 ^ in3 ^ in7;
+ out4 = tmp0 ^ in5;
+ out1 = tmp0 ^ tmp2;
+ out0 = tmp1 ^ in3 ^ in5;
+ out2 = tmp1 ^ in2 ^ in7;
+ out7 = tmp2 ^ in0;
+ out5 = tmp0 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_A6(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in0;
+ out3 = in3 ^ in5 ^ in7;
+ out1 = in0 ^ in2 ^ in4 ^ in6;
+ out0 = out3 ^ in1;
+ out7 = out1 ^ in7;
+ out6 = out0 ^ in6;
+ out5 = out7 ^ in5;
+ out4 = out6 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_A7(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in0 ^ in2;
+ out3 = in5 ^ in7;
+ out7 = out2 ^ in4 ^ in6;
+ out6 = out3 ^ in1 ^ in3;
+ out1 = out7 ^ in1;
+ out5 = out7 ^ in7;
+ out0 = out6 ^ in0;
+ out4 = out6 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_A8(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in4;
+ tmp1 = in1 ^ in6;
+ tmp2 = in0 ^ in2 ^ in7;
+ out1 = tmp0 ^ in7;
+ out4 = tmp0 ^ in6;
+ out0 = tmp1 ^ in3;
+ out2 = tmp1 ^ in5;
+ out6 = tmp1 ^ in4;
+ out7 = tmp2 ^ in5;
+ out3 = tmp2 ^ out0 ^ in6;
+ out5 = out7 ^ in2 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_A9(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in2 ^ in6;
+ out6 = in1 ^ in4;
+ out7 = in0 ^ in2 ^ in5;
+ out5 = in0 ^ in3 ^ in7;
+ out2 = out4 ^ in1 ^ in5;
+ out1 = out6 ^ in2 ^ in7;
+ out0 = out2 ^ out7 ^ in3;
+ out3 = out1 ^ in0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_AA(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in2;
+ tmp1 = in1 ^ in3;
+ tmp2 = in6 ^ in7;
+ out1 = tmp0 ^ in4 ^ in7;
+ out3 = tmp1 ^ in0;
+ out0 = tmp1 ^ tmp2;
+ out2 = tmp2 ^ in5;
+ out7 = tmp0 ^ out2;
+ out6 = out1 ^ out7 ^ in1;
+ out5 = out0 ^ out6 ^ in0;
+ out4 = out5 ^ out7 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_AB(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in0 ^ in1;
+ tmp0 = in1 ^ in4;
+ tmp1 = in0 ^ in7;
+ out6 = tmp0 ^ in5;
+ out1 = tmp0 ^ tmp1 ^ in2;
+ out5 = tmp1 ^ in3 ^ in4;
+ out0 = tmp0 ^ out5 ^ in6;
+ out4 = out0 ^ out3 ^ in2;
+ out2 = out4 ^ in3 ^ in5;
+ out7 = tmp1 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_AC(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in1 ^ in3;
+ out1 = in2 ^ in4;
+ tmp0 = in0 ^ in2;
+ out4 = in4 ^ in7;
+ out5 = in0 ^ in5;
+ out6 = in1 ^ in6;
+ out7 = tmp0 ^ in7;
+ out3 = tmp0 ^ in3 ^ in6;
+ out2 = out5 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_AD(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in7;
+ out5 = in0;
+ out6 = in1;
+ out7 = in0 ^ in2;
+ out0 = in0 ^ in1 ^ in3;
+ out2 = out7 ^ in1 ^ in5;
+ out1 = in1 ^ in2 ^ in4;
+ out3 = out7 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_AE(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in3 ^ in4;
+ tmp0 = in0 ^ in4;
+ tmp1 = in0 ^ in7;
+ out0 = in1 ^ in3 ^ in7;
+ out1 = tmp0 ^ in2;
+ out5 = tmp0 ^ in5;
+ tmp2 = tmp1 ^ in6;
+ out2 = tmp1 ^ in5;
+ out3 = tmp2 ^ in3;
+ out7 = tmp2 ^ in2;
+ out6 = tmp2 ^ out2 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_AF(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in3;
+ tmp0 = in0 ^ in7;
+ out5 = in0 ^ in4;
+ out6 = in1 ^ in5;
+ out7 = in0 ^ in2 ^ in6;
+ out0 = tmp0 ^ in1 ^ in3;
+ out3 = tmp0 ^ in6;
+ out2 = tmp0 ^ in2 ^ in5;
+ out1 = out5 ^ in1 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_B0(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in4;
+ tmp1 = in3 ^ in6;
+ out2 = tmp0 ^ in7;
+ tmp2 = tmp0 ^ tmp1;
+ out0 = tmp2 ^ in5;
+ out3 = tmp2 ^ in2;
+ out6 = out3 ^ in6;
+ tmp3 = out6 ^ in0 ^ in1;
+ out7 = tmp3 ^ in5;
+ out5 = tmp3 ^ out2;
+ out1 = out0 ^ out5 ^ in0;
+ out4 = tmp1 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_B1(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in4;
+ out2 = tmp0 ^ in2 ^ in7;
+ tmp1 = out2 ^ in6;
+ out1 = tmp1 ^ in5;
+ out3 = tmp1 ^ in7;
+ out4 = tmp1 ^ in0;
+ out6 = out3 ^ in3;
+ out0 = out6 ^ in0 ^ in2 ^ in5;
+ out5 = tmp1 ^ out0 ^ in1;
+ out7 = tmp0 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_B2(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in4;
+ tmp0 = in4 ^ in7;
+ tmp1 = in1 ^ in3 ^ in6;
+ out3 = tmp0 ^ tmp1;
+ tmp2 = tmp1 ^ in0;
+ out0 = out3 ^ in5;
+ out4 = tmp2 ^ in2;
+ tmp3 = out4 ^ in6;
+ out5 = tmp0 ^ tmp3;
+ out1 = tmp3 ^ out0;
+ tmp4 = out1 ^ in7;
+ out7 = tmp4 ^ in3;
+ out6 = tmp2 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_B3(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in2 ^ in4;
+ tmp0 = in0 ^ in5;
+ tmp1 = in1 ^ in6;
+ out3 = tmp1 ^ in4 ^ in7;
+ tmp2 = tmp0 ^ out3;
+ out0 = tmp2 ^ in3;
+ out1 = tmp2 ^ in2;
+ out5 = out0 ^ in2 ^ in6;
+ out7 = tmp1 ^ out5;
+ out4 = out7 ^ in1 ^ in5 ^ in7;
+ out6 = tmp0 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_B4(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in0 ^ in1;
+ out5 = out4 ^ in2;
+ tmp0 = out4 ^ in4;
+ out6 = out5 ^ in0 ^ in3;
+ out7 = tmp0 ^ out6;
+ out2 = tmp0 ^ in6 ^ in7;
+ out3 = out7 ^ in0 ^ in7;
+ out0 = out5 ^ out7 ^ in5;
+ out1 = out0 ^ out6 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_B5(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in2 ^ in4;
+ out4 = tmp0 ^ in4;
+ out3 = tmp1 ^ in7;
+ tmp2 = out4 ^ in5;
+ out7 = out3 ^ in0 ^ in3;
+ out0 = tmp2 ^ in3;
+ out2 = tmp0 ^ out3 ^ in6;
+ out5 = tmp1 ^ tmp2;
+ out6 = out2 ^ out7 ^ in2;
+ out1 = tmp0 ^ out0 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_B6(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in3 ^ in4;
+ tmp0 = in1 ^ in2;
+ tmp1 = in0 ^ in4;
+ tmp2 = in3 ^ in5;
+ tmp3 = out3 ^ in1 ^ in7;
+ out5 = tmp0 ^ tmp1;
+ out6 = tmp0 ^ tmp2;
+ out2 = tmp1 ^ in6;
+ out4 = tmp1 ^ tmp3;
+ out0 = tmp3 ^ in5;
+ out1 = out2 ^ in2 ^ in5;
+ out7 = tmp2 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_B7(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in4;
+ tmp0 = in0 ^ in4;
+ out2 = tmp0 ^ in2 ^ in6;
+ tmp1 = out2 ^ in7;
+ out1 = out2 ^ in1 ^ in5;
+ out7 = tmp1 ^ in3;
+ out5 = out1 ^ in6;
+ out6 = tmp0 ^ out1 ^ in3;
+ out0 = tmp1 ^ out6;
+ out4 = out0 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_B8(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in4;
+ tmp1 = in2 ^ in5;
+ out2 = tmp0 ^ in5;
+ out4 = tmp1 ^ in0;
+ tmp2 = tmp1 ^ in7;
+ out6 = tmp2 ^ out2;
+ out7 = out4 ^ in3;
+ out1 = tmp2 ^ in4;
+ out3 = tmp0 ^ out7;
+ out0 = out3 ^ out4 ^ in6;
+ out5 = out0 ^ in0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_B9(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in2;
+ tmp1 = in4 ^ in5;
+ out4 = tmp0 ^ tmp1;
+ tmp2 = tmp0 ^ in3 ^ in7;
+ out3 = out4 ^ in1;
+ out7 = tmp2 ^ in5;
+ out2 = out3 ^ in0;
+ out1 = out2 ^ in7;
+ out6 = out1 ^ in5 ^ in6;
+ out0 = tmp2 ^ out6;
+ out5 = tmp1 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_BA(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in5 ^ in7;
+ out2 = tmp0 ^ in4;
+ tmp1 = out2 ^ in2;
+ out1 = tmp1 ^ in0;
+ out6 = tmp1 ^ in1;
+ out4 = out1 ^ in3 ^ in4;
+ tmp2 = out4 ^ out6;
+ out7 = out4 ^ in6 ^ in7;
+ out5 = tmp2 ^ in6;
+ out3 = tmp0 ^ tmp2;
+ out0 = out6 ^ out7 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_BB(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in2 ^ in4 ^ in5 ^ in7;
+ tmp0 = out2 ^ in1;
+ out4 = out2 ^ in0 ^ in3;
+ out1 = tmp0 ^ in0;
+ out6 = tmp0 ^ in6;
+ out3 = out1 ^ in2;
+ tmp1 = out4 ^ out6 ^ in4;
+ out0 = tmp1 ^ in7;
+ out5 = tmp1 ^ in5;
+ out7 = tmp0 ^ tmp1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_BC(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in2;
+ tmp1 = in2 ^ in4;
+ out0 = in1 ^ in3 ^ in4;
+ out6 = in1 ^ in2 ^ in7;
+ out7 = tmp0 ^ in3;
+ out5 = tmp0 ^ out6 ^ in6;
+ out1 = tmp1 ^ in5;
+ tmp2 = out1 ^ out5 ^ in1;
+ out3 = tmp2 ^ in3;
+ out4 = tmp1 ^ tmp2;
+ out2 = tmp2 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_BD(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in1 ^ in4;
+ out0 = tmp0 ^ tmp1;
+ out7 = tmp0 ^ in2 ^ in7;
+ out1 = tmp1 ^ in2 ^ in5;
+ tmp2 = out1 ^ in0;
+ out2 = tmp2 ^ in6;
+ out3 = out2 ^ in1 ^ in7;
+ out4 = out3 ^ in2;
+ out5 = tmp1 ^ out4;
+ out6 = tmp2 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_BE(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in3 ^ in6;
+ out4 = tmp0 ^ in5;
+ out7 = tmp0 ^ in2;
+ out3 = out4 ^ in4;
+ out1 = out3 ^ out7 ^ in0;
+ out2 = out3 ^ in3 ^ in7;
+ out0 = out2 ^ out4 ^ in1;
+ out5 = tmp0 ^ out0;
+ out6 = out1 ^ out5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_BF(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in4;
+ out3 = tmp0 ^ in5 ^ in6;
+ out4 = out3 ^ in3;
+ tmp1 = out3 ^ in7;
+ out2 = tmp1 ^ in2;
+ out5 = tmp1 ^ in1;
+ tmp2 = out2 ^ in5;
+ out7 = tmp2 ^ in3 ^ in4;
+ tmp3 = tmp0 ^ out5;
+ out0 = tmp3 ^ out4;
+ out1 = tmp2 ^ tmp3;
+ out6 = tmp3 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_C0(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out5 = in2 ^ in5;
+ tmp0 = in1 ^ in4;
+ tmp1 = in3 ^ in6;
+ out0 = out5 ^ in1;
+ out4 = tmp0 ^ in7;
+ out3 = tmp0 ^ tmp1;
+ out1 = tmp1 ^ in2;
+ out6 = tmp1 ^ in0;
+ out7 = out4 ^ in0;
+ out2 = out4 ^ out5 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_C1(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out5 = in2;
+ tmp0 = in0 ^ in1;
+ out4 = in1 ^ in7;
+ out6 = in0 ^ in3;
+ out3 = in1 ^ in4 ^ in6;
+ tmp1 = tmp0 ^ in2;
+ out7 = tmp0 ^ in4;
+ out0 = tmp1 ^ in5;
+ out1 = tmp1 ^ out6 ^ in6;
+ out2 = out6 ^ out7 ^ in5 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_C2(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in1 ^ in3 ^ in4;
+ tmp0 = in0 ^ in3 ^ in6;
+ out5 = in2 ^ in4 ^ in5;
+ tmp1 = out4 ^ in7;
+ out1 = tmp0 ^ in2;
+ out6 = tmp0 ^ in5;
+ out2 = out5 ^ in3;
+ out7 = tmp0 ^ tmp1;
+ out3 = tmp1 ^ in2 ^ in6;
+ out0 = tmp1 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_C3(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in1 ^ in3;
+ tmp0 = in0 ^ in2;
+ tmp1 = in3 ^ in5;
+ out5 = in2 ^ in4;
+ tmp2 = tmp0 ^ out4;
+ out2 = tmp1 ^ in4;
+ out6 = tmp1 ^ in0;
+ out0 = tmp1 ^ tmp2 ^ in7;
+ out1 = tmp2 ^ in6;
+ out7 = out1 ^ out5 ^ in3;
+ out3 = tmp0 ^ out7 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_C4(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in7;
+ out3 = tmp0 ^ in4;
+ tmp1 = tmp0 ^ in2;
+ out1 = tmp1 ^ in6;
+ out5 = tmp1 ^ in5;
+ out4 = out1 ^ out3 ^ in1;
+ out0 = out4 ^ in4 ^ in5;
+ out2 = out0 ^ out3 ^ in0;
+ out7 = out1 ^ out2 ^ in7;
+ out6 = tmp1 ^ out0 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_C5(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in4 ^ in7;
+ tmp0 = in3 ^ in7;
+ out4 = in1 ^ in2 ^ in6;
+ out6 = in0 ^ in3 ^ in4;
+ out5 = tmp0 ^ in2;
+ out1 = tmp0 ^ out4;
+ out0 = out4 ^ in0 ^ in5;
+ out2 = out0 ^ out5 ^ in4;
+ out7 = tmp0 ^ out2 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_C6(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in5 ^ in6;
+ tmp1 = in1 ^ in7;
+ tmp2 = tmp0 ^ in0;
+ tmp3 = tmp0 ^ tmp1;
+ tmp4 = tmp2 ^ in4;
+ out0 = tmp3 ^ in2;
+ out6 = tmp4 ^ in3;
+ out2 = out6 ^ in2;
+ out7 = tmp1 ^ tmp4;
+ out3 = tmp2 ^ out2;
+ tmp5 = out3 ^ in5;
+ out5 = tmp5 ^ in7;
+ out4 = tmp3 ^ tmp5;
+ out1 = tmp4 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_C7(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in2 ^ in4;
+ tmp0 = in3 ^ in5;
+ tmp1 = out3 ^ in7;
+ out6 = tmp0 ^ in0 ^ in4;
+ out5 = tmp1 ^ in3;
+ out2 = out6 ^ in6;
+ out7 = out2 ^ in1 ^ in3;
+ out0 = tmp1 ^ out7;
+ out1 = tmp0 ^ out0;
+ out4 = out1 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_C8(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out0 = in1 ^ in2;
+ out1 = in2 ^ in3;
+ tmp0 = in5 ^ in6;
+ tmp1 = in0 ^ in7;
+ out2 = out1 ^ in1 ^ in4;
+ out4 = tmp0 ^ in4;
+ out5 = tmp0 ^ in7;
+ out6 = tmp1 ^ in6;
+ out7 = tmp1 ^ in1;
+ out3 = out2 ^ in0 ^ in2 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_C9(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in5 ^ in6;
+ out7 = in0 ^ in1;
+ tmp0 = in1 ^ in3;
+ out5 = in6 ^ in7;
+ out6 = in0 ^ in7;
+ out0 = out7 ^ in2;
+ out3 = out7 ^ in4 ^ in5;
+ out1 = tmp0 ^ in2;
+ out2 = tmp0 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_CA(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in7;
+ tmp1 = in2 ^ in7;
+ tmp2 = tmp0 ^ in6;
+ out0 = tmp1 ^ in1;
+ tmp3 = tmp1 ^ in3;
+ out6 = tmp2 ^ in5;
+ out7 = tmp2 ^ in1;
+ out2 = tmp3 ^ in4;
+ out5 = out6 ^ in0 ^ in4;
+ out4 = out5 ^ in3;
+ out1 = tmp0 ^ tmp3;
+ out3 = tmp3 ^ out5 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_CB(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in4 ^ in7;
+ tmp1 = in5 ^ in7;
+ out7 = in0 ^ in1 ^ in6;
+ out5 = tmp0 ^ in6;
+ out2 = tmp0 ^ in3;
+ out6 = tmp1 ^ in0;
+ out4 = tmp1 ^ in3 ^ in6;
+ tmp2 = out5 ^ out7 ^ in2;
+ out1 = tmp2 ^ out2;
+ out0 = tmp2 ^ in4;
+ out3 = tmp2 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_CC(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in5;
+ tmp1 = in1 ^ in6;
+ out1 = in2 ^ in3 ^ in7;
+ out5 = tmp0 ^ in6;
+ out0 = tmp1 ^ in2;
+ tmp2 = out5 ^ in0 ^ in7;
+ out3 = tmp2 ^ in4;
+ out6 = tmp0 ^ out3;
+ out7 = tmp1 ^ tmp2 ^ in3;
+ tmp3 = out1 ^ out6;
+ out4 = tmp2 ^ tmp3;
+ out2 = tmp3 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_CD(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out5 = in3 ^ in6;
+ tmp0 = in0 ^ in1;
+ tmp1 = in2 ^ in7;
+ out6 = in0 ^ in4 ^ in7;
+ out2 = tmp0 ^ out5 ^ in4;
+ out7 = tmp0 ^ in5;
+ out0 = tmp0 ^ in2 ^ in6;
+ out4 = tmp1 ^ in5;
+ out1 = tmp1 ^ in1 ^ in3;
+ out3 = out6 ^ in5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_CE(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in5;
+ tmp1 = tmp0 ^ in3;
+ out4 = tmp1 ^ in4;
+ tmp2 = out4 ^ in6;
+ out3 = tmp2 ^ in0;
+ out5 = tmp2 ^ in2;
+ out2 = out3 ^ in5 ^ in7;
+ out6 = tmp1 ^ out2;
+ out7 = out2 ^ out4 ^ in1;
+ out1 = tmp2 ^ out6;
+ out0 = tmp0 ^ out7 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_CF(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in6;
+ tmp1 = in0 ^ in1 ^ in5;
+ out4 = in2 ^ in3 ^ in5;
+ out5 = tmp0 ^ in4;
+ out7 = tmp1 ^ in6;
+ out1 = tmp1 ^ out4 ^ in7;
+ tmp2 = out5 ^ in0;
+ out2 = tmp2 ^ in7;
+ out3 = tmp2 ^ out4;
+ out6 = tmp0 ^ out2 ^ in5;
+ out0 = tmp0 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_D0(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in3;
+ tmp1 = in1 ^ in4;
+ tmp2 = in2 ^ in5;
+ out7 = tmp0 ^ tmp1;
+ out0 = tmp1 ^ tmp2;
+ tmp3 = tmp2 ^ in3;
+ out1 = tmp3 ^ in6;
+ tmp4 = out1 ^ in1;
+ out2 = tmp4 ^ in7;
+ out3 = out2 ^ in2;
+ out4 = tmp0 ^ out3;
+ out5 = tmp3 ^ out3;
+ out6 = tmp4 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_D1(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in5 ^ in6;
+ tmp1 = tmp0 ^ in1;
+ out1 = tmp1 ^ in2;
+ out2 = tmp1 ^ in7;
+ out3 = out2 ^ in3;
+ out5 = out3 ^ in2;
+ tmp2 = out3 ^ in0;
+ out4 = tmp2 ^ in4;
+ out7 = tmp0 ^ out4;
+ out6 = tmp2 ^ out1 ^ in6;
+ out0 = out2 ^ out6 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_D2(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in5 ^ in6;
+ out2 = tmp0 ^ in2 ^ in3;
+ out1 = out2 ^ in0;
+ out3 = out2 ^ in1;
+ out4 = out1 ^ in1 ^ in2;
+ out6 = out1 ^ in6 ^ in7;
+ out7 = out4 ^ in4 ^ in5;
+ out5 = out4 ^ out6 ^ in4;
+ out0 = tmp0 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_D3(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in3 ^ in5 ^ in6;
+ tmp0 = out2 ^ in2;
+ tmp1 = tmp0 ^ in1;
+ out1 = tmp1 ^ in0;
+ out3 = tmp1 ^ in3;
+ out4 = out1 ^ in2 ^ in4;
+ tmp2 = out4 ^ in5;
+ out7 = tmp2 ^ in7;
+ out0 = tmp0 ^ out7;
+ tmp3 = out0 ^ in0;
+ out5 = tmp3 ^ in6;
+ out6 = tmp2 ^ tmp3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_D4(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in3 ^ in5;
+ tmp0 = in1 ^ in5;
+ tmp1 = tmp0 ^ in2;
+ out4 = tmp1 ^ in0;
+ tmp2 = tmp1 ^ in6;
+ out2 = out4 ^ in3 ^ in7;
+ out0 = tmp2 ^ in4;
+ out5 = tmp2 ^ out3;
+ out1 = tmp0 ^ out5 ^ in7;
+ out6 = tmp0 ^ out2 ^ in4;
+ out7 = tmp1 ^ out6 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_D5(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in5;
+ tmp0 = in0 ^ in4;
+ tmp1 = tmp0 ^ in1 ^ in5;
+ out4 = tmp1 ^ in2;
+ out0 = out4 ^ in6;
+ tmp2 = tmp0 ^ out0;
+ out5 = tmp2 ^ in3;
+ out1 = out5 ^ in7;
+ out6 = tmp1 ^ out1;
+ out7 = tmp2 ^ out6;
+ out2 = out7 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_D6(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in2 ^ in4 ^ in6;
+ out5 = tmp0 ^ in3;
+ out0 = tmp0 ^ in5 ^ in7;
+ out3 = out0 ^ out5 ^ in2;
+ tmp1 = out3 ^ in0;
+ out1 = tmp1 ^ in6;
+ out2 = tmp1 ^ in7;
+ out4 = tmp1 ^ in1;
+ out6 = tmp1 ^ in4;
+ out7 = tmp0 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_D7(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in3;
+ out3 = in2 ^ in5 ^ in7;
+ out2 = tmp0 ^ in5;
+ tmp1 = tmp0 ^ out3 ^ in1;
+ out1 = tmp1 ^ in6;
+ out4 = tmp1 ^ in4;
+ tmp2 = out1 ^ in4;
+ out6 = tmp2 ^ in1;
+ out7 = tmp2 ^ in2;
+ out0 = tmp2 ^ in3;
+ out5 = tmp2 ^ in0 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_D8(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in0;
+ out5 = in1;
+ tmp0 = in1 ^ in2;
+ out6 = in0 ^ in2;
+ out0 = tmp0 ^ in4;
+ tmp1 = tmp0 ^ in3;
+ out7 = tmp1 ^ out6;
+ out2 = tmp1 ^ in6;
+ out3 = out7 ^ in7;
+ out1 = tmp1 ^ in1 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_D9(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in0 ^ in4;
+ out5 = in1 ^ in5;
+ out2 = in1 ^ in3 ^ in6;
+ out3 = in0 ^ in1 ^ in7;
+ out6 = in0 ^ in2 ^ in6;
+ out0 = out4 ^ in1 ^ in2;
+ out1 = out5 ^ in2 ^ in3;
+ out7 = out3 ^ in3;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_DA(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out5 = in1 ^ in4;
+ tmp0 = in2 ^ in7;
+ tmp1 = in0 ^ in2 ^ in3;
+ out0 = tmp0 ^ out5;
+ out4 = tmp0 ^ tmp1;
+ out2 = tmp0 ^ in3 ^ in6;
+ out1 = tmp1 ^ in5;
+ out3 = tmp1 ^ in1;
+ out6 = out1 ^ in3;
+ out7 = out3 ^ in2 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_DB(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in1 ^ in5;
+ tmp2 = in3 ^ in7;
+ out3 = tmp0 ^ in2;
+ out5 = tmp1 ^ in4;
+ out6 = tmp1 ^ out3 ^ in6;
+ out2 = tmp2 ^ in6;
+ tmp3 = tmp2 ^ in4;
+ tmp4 = out3 ^ in3;
+ out4 = tmp3 ^ in0;
+ out1 = tmp4 ^ in5;
+ out0 = tmp3 ^ tmp4;
+ out7 = tmp0 ^ out2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_DC(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in2;
+ tmp1 = in0 ^ in3;
+ out6 = tmp0 ^ in4;
+ tmp2 = tmp0 ^ in7;
+ out3 = tmp1 ^ in6;
+ tmp3 = tmp1 ^ in1;
+ out1 = tmp1 ^ tmp2 ^ in5;
+ out4 = tmp2 ^ in6;
+ out2 = tmp3 ^ in2;
+ out7 = tmp3 ^ in5;
+ out5 = tmp2 ^ out2;
+ out0 = out2 ^ out3 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_DD(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in0 ^ in6;
+ out2 = in0 ^ in1 ^ in3;
+ out6 = out3 ^ in2 ^ in4;
+ out7 = out2 ^ in5 ^ in7;
+ out0 = out6 ^ in1;
+ out4 = out6 ^ in7;
+ out5 = out7 ^ in0;
+ out1 = out5 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_DE(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in3 ^ in6;
+ tmp1 = in3 ^ in4 ^ in7;
+ out4 = tmp0 ^ in0;
+ out5 = tmp1 ^ in1;
+ out3 = out4 ^ in7;
+ out2 = out3 ^ in6;
+ out1 = out2 ^ in5;
+ out6 = tmp1 ^ out1;
+ out0 = tmp0 ^ out5;
+ out7 = out0 ^ out1 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_DF(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in0 ^ in3 ^ in7;
+ tmp0 = out2 ^ in1 ^ in5;
+ out1 = tmp0 ^ in2;
+ out7 = tmp0 ^ in6;
+ out5 = tmp0 ^ in0 ^ in4;
+ tmp1 = out1 ^ out5 ^ in6;
+ out4 = tmp1 ^ in3;
+ out6 = tmp1 ^ in5;
+ tmp2 = tmp1 ^ in7;
+ out0 = tmp2 ^ in1;
+ out3 = tmp2 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_E0(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in1 ^ in7;
+ tmp0 = in2 ^ in4;
+ out4 = out3 ^ in3 ^ in5;
+ out2 = tmp0 ^ in1;
+ tmp1 = tmp0 ^ in6;
+ out0 = out4 ^ in2;
+ out6 = out4 ^ in0;
+ out1 = tmp1 ^ in3;
+ out5 = tmp1 ^ in0;
+ out7 = out5 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_E1(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in1 ^ in4;
+ tmp0 = in1 ^ in7;
+ out3 = tmp0 ^ in3;
+ tmp1 = out3 ^ in5;
+ out4 = tmp1 ^ in4;
+ tmp2 = tmp1 ^ in0;
+ out0 = tmp2 ^ in2;
+ out6 = tmp2 ^ in6;
+ tmp3 = out0 ^ out4 ^ in6;
+ out5 = tmp3 ^ in5;
+ out7 = tmp0 ^ tmp3;
+ out1 = tmp2 ^ out5 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_E2(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in1 ^ in2;
+ out4 = in1 ^ in5;
+ out2 = in2 ^ in4 ^ in7;
+ out5 = in0 ^ in2 ^ in6;
+ out0 = out3 ^ in3 ^ in5;
+ out7 = out3 ^ in0 ^ in4;
+ out6 = out2 ^ out7 ^ in3;
+ out1 = out5 ^ in3 ^ in4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_E3(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in4 ^ in7;
+ tmp0 = in1 ^ in3;
+ out3 = tmp0 ^ in2;
+ tmp1 = out3 ^ in0;
+ out0 = tmp1 ^ in5;
+ tmp2 = tmp1 ^ in4;
+ out1 = tmp2 ^ in6;
+ tmp3 = tmp2 ^ in3;
+ out7 = tmp3 ^ in7;
+ out6 = out1 ^ out2 ^ in2;
+ tmp4 = tmp0 ^ out0;
+ out5 = tmp4 ^ in6;
+ out4 = tmp3 ^ tmp4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_E4(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in6;
+ tmp0 = in0 ^ in4;
+ tmp1 = tmp0 ^ in2 ^ in6;
+ out2 = tmp1 ^ in1;
+ out7 = out2 ^ in5;
+ tmp2 = tmp0 ^ out7;
+ out4 = tmp2 ^ in3;
+ out0 = out4 ^ in7;
+ out6 = tmp1 ^ out0;
+ out5 = tmp2 ^ out6;
+ out1 = out5 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_E5(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in3 ^ in6;
+ tmp0 = in0 ^ in1;
+ tmp1 = in5 ^ in7;
+ out2 = tmp0 ^ in4 ^ in6;
+ tmp2 = tmp1 ^ out2;
+ out6 = tmp2 ^ in3;
+ out7 = tmp2 ^ in2;
+ out0 = out6 ^ in2 ^ in4;
+ out5 = out6 ^ in1 ^ in2;
+ out1 = tmp0 ^ out5 ^ in5;
+ out4 = tmp1 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_E6(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in2 ^ in6 ^ in7;
+ out2 = out3 ^ in0 ^ in4;
+ out4 = out3 ^ in1 ^ in5;
+ out1 = out2 ^ in3;
+ out7 = out2 ^ out4 ^ in2;
+ out0 = out4 ^ in3 ^ in7;
+ out5 = out1 ^ in4;
+ out6 = out0 ^ out2 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_E7(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in3;
+ out3 = tmp0 ^ in6 ^ in7;
+ tmp1 = out3 ^ in0;
+ out5 = tmp1 ^ in5;
+ tmp2 = tmp1 ^ in4;
+ tmp3 = out5 ^ in7;
+ out1 = tmp2 ^ in1;
+ out0 = tmp3 ^ in1;
+ out6 = out1 ^ in2;
+ out2 = tmp0 ^ tmp2;
+ tmp4 = tmp3 ^ out6;
+ out4 = tmp4 ^ in6;
+ out7 = tmp4 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_E8(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in3 ^ in6;
+ tmp0 = in4 ^ in7;
+ out1 = in2 ^ in3 ^ in4;
+ out5 = tmp0 ^ in0;
+ tmp1 = tmp0 ^ in1;
+ tmp2 = tmp1 ^ in5;
+ out0 = tmp1 ^ out1;
+ out2 = tmp2 ^ in2;
+ out6 = tmp2 ^ out5;
+ tmp3 = out6 ^ in6;
+ out3 = tmp3 ^ in7;
+ out7 = tmp3 ^ in2 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_E9(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = in3 ^ in6;
+ tmp2 = tmp0 ^ in6;
+ out4 = tmp1 ^ in4;
+ out6 = tmp2 ^ in5;
+ out7 = tmp2 ^ in2 ^ in7;
+ out3 = out6 ^ in3 ^ in7;
+ out0 = tmp1 ^ out7;
+ out2 = out3 ^ out4 ^ in0;
+ out5 = tmp0 ^ out2;
+ out1 = out0 ^ out5 ^ in5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_EA(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in6 ^ in7;
+ out5 = in0 ^ in7;
+ out6 = in0 ^ in1;
+ out0 = in1 ^ in2 ^ in3;
+ out2 = in2 ^ in4 ^ in5;
+ out7 = out6 ^ in2;
+ out1 = out0 ^ out6 ^ in4;
+ out3 = out7 ^ in5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_EB(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in4 ^ in5;
+ tmp0 = in0 ^ in1;
+ out4 = in4 ^ in6 ^ in7;
+ out5 = in0 ^ in5 ^ in7;
+ out6 = tmp0 ^ in6;
+ tmp1 = tmp0 ^ in2;
+ out0 = tmp1 ^ in3;
+ out7 = tmp1 ^ in7;
+ out1 = out0 ^ in4;
+ out3 = out0 ^ in5 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_EC(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out3 = in0 ^ in5;
+ out4 = in2 ^ in3 ^ in7;
+ out5 = in0 ^ in3 ^ in4;
+ out6 = out3 ^ in1 ^ in4;
+ out1 = out4 ^ in4;
+ out0 = out4 ^ in1 ^ in6;
+ out2 = out0 ^ out5 ^ in5;
+ out7 = out2 ^ in4 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_ED(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in2 ^ in4;
+ tmp1 = in3 ^ in5;
+ out4 = tmp0 ^ in3 ^ in7;
+ out3 = tmp1 ^ in0;
+ out1 = out4 ^ in1;
+ out5 = out3 ^ in4;
+ out7 = out1 ^ out5 ^ in6;
+ out2 = tmp0 ^ out7;
+ out0 = tmp1 ^ out7;
+ out6 = out2 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_EE(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in2;
+ tmp0 = in0 ^ in1;
+ out5 = in0 ^ in3;
+ tmp1 = tmp0 ^ in2;
+ out6 = tmp0 ^ in4;
+ tmp2 = tmp1 ^ out5;
+ out7 = tmp1 ^ in5;
+ out1 = tmp2 ^ out6 ^ in7;
+ out0 = tmp2 ^ in6;
+ tmp3 = out7 ^ in1;
+ out3 = tmp3 ^ in7;
+ out2 = tmp3 ^ in4 ^ in6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_EF(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out4 = in2 ^ in4;
+ tmp0 = in0 ^ in5;
+ tmp1 = in4 ^ in6;
+ out5 = tmp0 ^ in3;
+ out2 = tmp0 ^ tmp1;
+ out6 = tmp1 ^ in0 ^ in1;
+ out3 = out5 ^ in2 ^ in7;
+ out7 = out3 ^ in1 ^ in3;
+ out0 = out4 ^ out6 ^ in3;
+ out1 = tmp1 ^ out0 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_F0(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in4 ^ in5;
+ out2 = tmp0 ^ in6;
+ out3 = tmp1 ^ in1;
+ tmp2 = tmp1 ^ in7;
+ out1 = out2 ^ out3 ^ in3;
+ tmp3 = tmp0 ^ tmp2;
+ out0 = tmp3 ^ in3;
+ out5 = tmp3 ^ in0;
+ out4 = out1 ^ out5 ^ in4;
+ out7 = out4 ^ in2;
+ out6 = tmp2 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_F1(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in1 ^ in6;
+ tmp0 = in3 ^ in5;
+ out3 = tmp0 ^ in1 ^ in4;
+ tmp1 = out3 ^ in2;
+ out1 = tmp1 ^ in6;
+ tmp2 = tmp1 ^ in0;
+ tmp3 = out1 ^ in5;
+ out0 = tmp2 ^ in7;
+ out6 = tmp2 ^ in4;
+ out7 = tmp3 ^ in0;
+ out5 = tmp0 ^ out0;
+ out4 = tmp3 ^ out5 ^ in1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_F2(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in4 ^ in5;
+ out2 = in2 ^ in6 ^ in7;
+ tmp1 = tmp0 ^ in1;
+ tmp2 = tmp1 ^ in2;
+ out0 = tmp2 ^ in3;
+ out3 = tmp2 ^ in7;
+ out5 = out3 ^ in0 ^ in4;
+ tmp3 = tmp0 ^ out5;
+ out7 = tmp3 ^ in3;
+ out4 = tmp3 ^ out2;
+ out1 = out0 ^ out4 ^ in4;
+ out6 = tmp1 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_F3(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in6 ^ in7;
+ tmp0 = in0 ^ in1;
+ out4 = tmp0 ^ in6;
+ tmp1 = tmp0 ^ in2;
+ out5 = tmp1 ^ in7;
+ out6 = tmp1 ^ in3;
+ out7 = out6 ^ in4;
+ out0 = out7 ^ in5;
+ out1 = out0 ^ in6;
+ out3 = out0 ^ in0 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_F4(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in0 ^ in1 ^ in2;
+ tmp0 = out2 ^ in3;
+ out4 = tmp0 ^ in4;
+ out5 = out4 ^ in5;
+ out6 = out5 ^ in6;
+ out7 = out6 ^ in7;
+ out0 = out7 ^ in0;
+ out1 = out0 ^ in1;
+ out3 = tmp0 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_F5(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in0 ^ in1;
+ tmp0 = out2 ^ in2;
+ out4 = tmp0 ^ in3;
+ out5 = out4 ^ in4;
+ out6 = out5 ^ in5;
+ out7 = out6 ^ in6;
+ out0 = out7 ^ in7;
+ out1 = out0 ^ in0;
+ out3 = tmp0 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_F6(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in7;
+ out2 = tmp0 ^ in2;
+ out4 = out2 ^ in1 ^ in4;
+ out7 = out4 ^ in3 ^ in5;
+ out5 = out7 ^ in4 ^ in7;
+ out0 = tmp0 ^ out7 ^ in6;
+ tmp1 = out0 ^ in1;
+ out6 = out0 ^ in0 ^ in5;
+ out3 = tmp1 ^ in3;
+ out1 = tmp0 ^ tmp1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_F7(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in0 ^ in7;
+ tmp0 = out2 ^ in1;
+ out4 = tmp0 ^ in2;
+ out5 = out4 ^ in3 ^ in7;
+ out6 = out5 ^ in4;
+ out7 = out6 ^ in5;
+ out0 = out7 ^ in6;
+ out1 = out0 ^ in7;
+ out3 = tmp0 ^ out1;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_F8(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in4;
+ tmp1 = in3 ^ in5;
+ tmp2 = tmp0 ^ in6;
+ out4 = tmp0 ^ tmp1;
+ out1 = tmp1 ^ in2 ^ in4;
+ out3 = tmp2 ^ in1;
+ out5 = out3 ^ in5;
+ out7 = out1 ^ out5 ^ in7;
+ out6 = tmp1 ^ out7;
+ out0 = tmp2 ^ out7;
+ out2 = out6 ^ in0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_F9(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in3 ^ in5;
+ tmp1 = in0 ^ in6;
+ out4 = tmp0 ^ in0;
+ tmp2 = tmp1 ^ in4;
+ tmp3 = tmp1 ^ in2;
+ out5 = tmp2 ^ in1;
+ out3 = out5 ^ in3;
+ tmp4 = tmp3 ^ out3;
+ out1 = tmp4 ^ in5;
+ out0 = tmp4 ^ in0 ^ in7;
+ out6 = tmp0 ^ out0 ^ in4;
+ out7 = tmp2 ^ tmp4;
+ out2 = tmp3 ^ out6;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_FA(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in1;
+ tmp1 = tmp0 ^ in2;
+ tmp2 = tmp0 ^ in5;
+ tmp3 = tmp1 ^ in7;
+ out5 = tmp2 ^ in6;
+ out6 = tmp3 ^ in6;
+ out7 = tmp3 ^ in3;
+ out3 = out6 ^ in4;
+ out2 = tmp1 ^ out5;
+ out4 = out2 ^ out3 ^ in1;
+ out0 = out4 ^ out7 ^ in5;
+ out1 = tmp2 ^ out0;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_FB(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in5 ^ in6;
+ tmp0 = in0 ^ in1;
+ out4 = in0 ^ in5 ^ in7;
+ out5 = tmp0 ^ in6;
+ tmp1 = tmp0 ^ in2;
+ out6 = tmp1 ^ in7;
+ out7 = tmp1 ^ in3;
+ out0 = out7 ^ in4;
+ out1 = out0 ^ in5;
+ out3 = out0 ^ in6 ^ in7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_FC(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in1 ^ in2;
+ tmp1 = in0 ^ in7;
+ out2 = tmp0 ^ tmp1 ^ in5;
+ out3 = tmp1 ^ in4;
+ tmp2 = out2 ^ in6;
+ out6 = tmp2 ^ in4;
+ out7 = tmp2 ^ in3;
+ out4 = out6 ^ in1 ^ in3;
+ tmp3 = out4 ^ in0;
+ out1 = tmp3 ^ in6;
+ out0 = tmp3 ^ in1 ^ in5;
+ out5 = tmp0 ^ out4;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_FD(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in5;
+ tmp1 = in1 ^ in7;
+ out2 = tmp0 ^ tmp1;
+ out6 = out2 ^ in2 ^ in4;
+ tmp2 = out6 ^ in0;
+ out1 = tmp2 ^ in3;
+ out0 = tmp0 ^ out1 ^ in6;
+ out5 = out0 ^ in2;
+ tmp3 = out5 ^ in1;
+ out3 = tmp3 ^ in6;
+ out7 = tmp2 ^ tmp3;
+ out4 = tmp1 ^ out7;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_FE(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ tmp0 = in0 ^ in2;
+ out2 = tmp0 ^ in5;
+ out3 = tmp0 ^ in4;
+ tmp1 = out3 ^ in6;
+ out4 = tmp1 ^ in5;
+ tmp2 = tmp1 ^ in1;
+ out6 = tmp2 ^ in7;
+ tmp3 = tmp2 ^ in0;
+ out0 = tmp3 ^ in3;
+ tmp4 = out0 ^ out4 ^ in7;
+ out5 = tmp4 ^ in6;
+ out7 = tmp4 ^ in2;
+ out1 = tmp3 ^ out5;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void
+gf8_muladd_FF(void *out, void *in)
+{
+ unsigned int i;
+ uint64_t *in_ptr = (uint64_t *)in;
+ uint64_t *out_ptr = (uint64_t *)out;
+
+ for (i = 0; i < WIDTH; i++) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ uint64_t in0 = out_ptr[0];
+ uint64_t in1 = out_ptr[WIDTH];
+ uint64_t in2 = out_ptr[WIDTH * 2];
+ uint64_t in3 = out_ptr[WIDTH * 3];
+ uint64_t in4 = out_ptr[WIDTH * 4];
+ uint64_t in5 = out_ptr[WIDTH * 5];
+ uint64_t in6 = out_ptr[WIDTH * 6];
+ uint64_t in7 = out_ptr[WIDTH * 7];
+
+ out2 = in0 ^ in5;
+ tmp0 = in4 ^ in7;
+ tmp1 = out2 ^ in2;
+ out4 = tmp1 ^ in6;
+ out7 = tmp1 ^ in1 ^ in3;
+ out1 = tmp0 ^ out7;
+ tmp2 = out1 ^ in5;
+ out6 = tmp2 ^ in3;
+ tmp3 = tmp2 ^ in7;
+ out0 = tmp3 ^ in6;
+ out3 = tmp3 ^ in1;
+ out5 = tmp0 ^ out0 ^ in2;
+
+ out_ptr[0] = out0 ^ in_ptr[0];
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+ out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+ out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+ out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+ out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+ out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+ in_ptr++;
+ out_ptr++;
+ }
+}
+
+static void (*gf8_muladd[])(void *out, void *in) = {
+ gf8_muladd_00, gf8_muladd_01, gf8_muladd_02, gf8_muladd_03, gf8_muladd_04,
+ gf8_muladd_05, gf8_muladd_06, gf8_muladd_07, gf8_muladd_08, gf8_muladd_09,
+ gf8_muladd_0A, gf8_muladd_0B, gf8_muladd_0C, gf8_muladd_0D, gf8_muladd_0E,
+ gf8_muladd_0F, gf8_muladd_10, gf8_muladd_11, gf8_muladd_12, gf8_muladd_13,
+ gf8_muladd_14, gf8_muladd_15, gf8_muladd_16, gf8_muladd_17, gf8_muladd_18,
+ gf8_muladd_19, gf8_muladd_1A, gf8_muladd_1B, gf8_muladd_1C, gf8_muladd_1D,
+ gf8_muladd_1E, gf8_muladd_1F, gf8_muladd_20, gf8_muladd_21, gf8_muladd_22,
+ gf8_muladd_23, gf8_muladd_24, gf8_muladd_25, gf8_muladd_26, gf8_muladd_27,
+ gf8_muladd_28, gf8_muladd_29, gf8_muladd_2A, gf8_muladd_2B, gf8_muladd_2C,
+ gf8_muladd_2D, gf8_muladd_2E, gf8_muladd_2F, gf8_muladd_30, gf8_muladd_31,
+ gf8_muladd_32, gf8_muladd_33, gf8_muladd_34, gf8_muladd_35, gf8_muladd_36,
+ gf8_muladd_37, gf8_muladd_38, gf8_muladd_39, gf8_muladd_3A, gf8_muladd_3B,
+ gf8_muladd_3C, gf8_muladd_3D, gf8_muladd_3E, gf8_muladd_3F, gf8_muladd_40,
+ gf8_muladd_41, gf8_muladd_42, gf8_muladd_43, gf8_muladd_44, gf8_muladd_45,
+ gf8_muladd_46, gf8_muladd_47, gf8_muladd_48, gf8_muladd_49, gf8_muladd_4A,
+ gf8_muladd_4B, gf8_muladd_4C, gf8_muladd_4D, gf8_muladd_4E, gf8_muladd_4F,
+ gf8_muladd_50, gf8_muladd_51, gf8_muladd_52, gf8_muladd_53, gf8_muladd_54,
+ gf8_muladd_55, gf8_muladd_56, gf8_muladd_57, gf8_muladd_58, gf8_muladd_59,
+ gf8_muladd_5A, gf8_muladd_5B, gf8_muladd_5C, gf8_muladd_5D, gf8_muladd_5E,
+ gf8_muladd_5F, gf8_muladd_60, gf8_muladd_61, gf8_muladd_62, gf8_muladd_63,
+ gf8_muladd_64, gf8_muladd_65, gf8_muladd_66, gf8_muladd_67, gf8_muladd_68,
+ gf8_muladd_69, gf8_muladd_6A, gf8_muladd_6B, gf8_muladd_6C, gf8_muladd_6D,
+ gf8_muladd_6E, gf8_muladd_6F, gf8_muladd_70, gf8_muladd_71, gf8_muladd_72,
+ gf8_muladd_73, gf8_muladd_74, gf8_muladd_75, gf8_muladd_76, gf8_muladd_77,
+ gf8_muladd_78, gf8_muladd_79, gf8_muladd_7A, gf8_muladd_7B, gf8_muladd_7C,
+ gf8_muladd_7D, gf8_muladd_7E, gf8_muladd_7F, gf8_muladd_80, gf8_muladd_81,
+ gf8_muladd_82, gf8_muladd_83, gf8_muladd_84, gf8_muladd_85, gf8_muladd_86,
+ gf8_muladd_87, gf8_muladd_88, gf8_muladd_89, gf8_muladd_8A, gf8_muladd_8B,
+ gf8_muladd_8C, gf8_muladd_8D, gf8_muladd_8E, gf8_muladd_8F, gf8_muladd_90,
+ gf8_muladd_91, gf8_muladd_92, gf8_muladd_93, gf8_muladd_94, gf8_muladd_95,
+ gf8_muladd_96, gf8_muladd_97, gf8_muladd_98, gf8_muladd_99, gf8_muladd_9A,
+ gf8_muladd_9B, gf8_muladd_9C, gf8_muladd_9D, gf8_muladd_9E, gf8_muladd_9F,
+ gf8_muladd_A0, gf8_muladd_A1, gf8_muladd_A2, gf8_muladd_A3, gf8_muladd_A4,
+ gf8_muladd_A5, gf8_muladd_A6, gf8_muladd_A7, gf8_muladd_A8, gf8_muladd_A9,
+ gf8_muladd_AA, gf8_muladd_AB, gf8_muladd_AC, gf8_muladd_AD, gf8_muladd_AE,
+ gf8_muladd_AF, gf8_muladd_B0, gf8_muladd_B1, gf8_muladd_B2, gf8_muladd_B3,
+ gf8_muladd_B4, gf8_muladd_B5, gf8_muladd_B6, gf8_muladd_B7, gf8_muladd_B8,
+ gf8_muladd_B9, gf8_muladd_BA, gf8_muladd_BB, gf8_muladd_BC, gf8_muladd_BD,
+ gf8_muladd_BE, gf8_muladd_BF, gf8_muladd_C0, gf8_muladd_C1, gf8_muladd_C2,
+ gf8_muladd_C3, gf8_muladd_C4, gf8_muladd_C5, gf8_muladd_C6, gf8_muladd_C7,
+ gf8_muladd_C8, gf8_muladd_C9, gf8_muladd_CA, gf8_muladd_CB, gf8_muladd_CC,
+ gf8_muladd_CD, gf8_muladd_CE, gf8_muladd_CF, gf8_muladd_D0, gf8_muladd_D1,
+ gf8_muladd_D2, gf8_muladd_D3, gf8_muladd_D4, gf8_muladd_D5, gf8_muladd_D6,
+ gf8_muladd_D7, gf8_muladd_D8, gf8_muladd_D9, gf8_muladd_DA, gf8_muladd_DB,
+ gf8_muladd_DC, gf8_muladd_DD, gf8_muladd_DE, gf8_muladd_DF, gf8_muladd_E0,
+ gf8_muladd_E1, gf8_muladd_E2, gf8_muladd_E3, gf8_muladd_E4, gf8_muladd_E5,
+ gf8_muladd_E6, gf8_muladd_E7, gf8_muladd_E8, gf8_muladd_E9, gf8_muladd_EA,
+ gf8_muladd_EB, gf8_muladd_EC, gf8_muladd_ED, gf8_muladd_EE, gf8_muladd_EF,
+ gf8_muladd_F0, gf8_muladd_F1, gf8_muladd_F2, gf8_muladd_F3, gf8_muladd_F4,
+ gf8_muladd_F5, gf8_muladd_F6, gf8_muladd_F7, gf8_muladd_F8, gf8_muladd_F9,
+ gf8_muladd_FA, gf8_muladd_FB, gf8_muladd_FC, gf8_muladd_FD, gf8_muladd_FE,
+ gf8_muladd_FF};
+
+static uint64_t zero[EC_METHOD_WORD_SIZE * 8] = {
+ 0,
+};
+
+void
+ec_code_c_prepare(ec_gf_t *gf, uint32_t *values, uint32_t count)
+{
+ uint32_t i, last, tmp;
+
+ last = 1;
+ for (i = count; i > 0; i--) {
+ if (values[i - 1] != 0) {
+ tmp = values[i - 1];
+ values[i - 1] = ec_gf_div(gf, tmp, last);
+ last = tmp;
+ }
+ }
+}
+
+void
+ec_code_c_linear(void *dst, void *src, uint64_t offset, uint32_t *values,
+ uint32_t count)
+{
+ src += offset;
+ gf8_muladd_00(dst, src);
+ while (--count > 0) {
+ src += EC_METHOD_CHUNK_SIZE;
+ gf8_muladd[*values](dst, src);
+ values++;
+ }
+}
+
+void
+ec_code_c_interleaved(void *dst, void **src, uint64_t offset, uint32_t *values,
+ uint32_t count)
+{
+ uint32_t i, last, tmp;
+
+ i = 0;
+ while ((last = *values++) == 0) {
+ i++;
+ }
+ gf8_muladd_00(dst, src[i++] + offset);
+ while (i < count) {
+ tmp = *values++;
+ if (tmp != 0) {
+ gf8_muladd[last](dst, src[i] + offset);
+ last = tmp;
+ }
+ i++;
+ }
+ gf8_muladd[last](dst, zero);
+}
diff --git a/xlators/cluster/ec/src/ec-code-c.h b/xlators/cluster/ec/src/ec-code-c.h
new file mode 100644
index 00000000000..42b5a064eb8
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-c.h
@@ -0,0 +1,27 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_C_H__
+#define __EC_CODE_C_H__
+
+#include "ec-types.h"
+
+void
+ec_code_c_prepare(ec_gf_t *gf, uint32_t *values, uint32_t count);
+
+void
+ec_code_c_linear(void *dst, void *src, uint64_t offset, uint32_t *values,
+ uint32_t count);
+
+void
+ec_code_c_interleaved(void *dst, void **src, uint64_t offset, uint32_t *values,
+ uint32_t count);
+
+#endif /* __EC_CODE_C_H__ */
diff --git a/xlators/cluster/ec/src/ec-code-intel.c b/xlators/cluster/ec/src/ec-code-intel.c
new file mode 100644
index 00000000000..f1c4e13e321
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-intel.c
@@ -0,0 +1,594 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <inttypes.h>
+#include <string.h>
+#include <errno.h>
+
+#include "ec-code-intel.h"
+
+static void
+ec_code_intel_init(ec_code_intel_t *intel)
+{
+ memset(intel, 0, sizeof(ec_code_intel_t));
+}
+
+static void
+ec_code_intel_prefix(ec_code_intel_t *intel, uint8_t prefix)
+{
+ intel->prefix.data[intel->prefix.bytes++] = prefix;
+}
+
+static void
+ec_code_intel_rex(ec_code_intel_t *intel, gf_boolean_t w)
+{
+ gf_boolean_t present = _gf_false;
+
+ if (w) {
+ intel->rex.w = 1;
+ present = _gf_true;
+ }
+ if (intel->modrm.present) {
+ if (intel->modrm.reg > 7) {
+ intel->modrm.reg &= 7;
+ intel->rex.r = 1;
+ present = _gf_true;
+ }
+ if (intel->sib.present) {
+ if (intel->sib.index > 7) {
+ intel->sib.index &= 7;
+ intel->rex.x = 1;
+ present = _gf_true;
+ }
+ if (intel->sib.base > 7) {
+ intel->sib.base &= 7;
+ intel->rex.b = 1;
+ present = _gf_true;
+ }
+ } else if (intel->modrm.rm > 7) {
+ intel->modrm.rm &= 7;
+ intel->rex.b = 1;
+ present = _gf_true;
+ }
+ } else if (intel->reg > 7) {
+ intel->reg &= 7;
+ intel->rex.b = 1;
+ present = _gf_true;
+ }
+ intel->rex.present = present;
+}
+
+static void
+ec_code_intel_vex(ec_code_intel_t *intel, gf_boolean_t w, gf_boolean_t l,
+ ec_code_vex_opcode_t opcode, ec_code_vex_prefix_t prefix,
+ uint32_t reg)
+{
+ ec_code_intel_rex(intel, w);
+ if (((intel->rex.w == 1) || (intel->rex.x == 0) || (intel->rex.b == 0)) ||
+ ((opcode != VEX_OPCODE_NONE) && (opcode != VEX_OPCODE_0F))) {
+ intel->rex.present = _gf_false;
+
+ intel->vex.bytes = 3;
+ intel->vex.data[0] = 0xC4;
+ intel->vex.data[1] = ((intel->rex.r << 7) | (intel->rex.x << 6) |
+ (intel->rex.b << 5) | opcode) ^
+ 0xE0;
+ intel->vex.data[2] = (intel->rex.w << 7) | ((~reg & 0x0F) << 3) |
+ (l ? 0x04 : 0x00) | prefix;
+ } else {
+ intel->vex.bytes = 2;
+ intel->vex.data[0] = 0xC5;
+ intel->vex.data[1] = (intel->rex.r << 7) | ((~reg & 0x0F) << 3) |
+ (l ? 0x04 : 0x00) | prefix;
+ }
+}
+
+static void
+ec_code_intel_modrm_reg(ec_code_intel_t *intel, uint32_t rm, uint32_t reg)
+{
+ intel->modrm.present = _gf_true;
+ intel->modrm.mod = 3;
+ intel->modrm.rm = rm;
+ intel->modrm.reg = reg;
+}
+
+static void
+ec_code_intel_modrm_mem(ec_code_intel_t *intel, uint32_t reg,
+ ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+ uint32_t scale, int32_t offset)
+{
+ if (index == REG_SP) {
+ intel->invalid = _gf_true;
+ return;
+ }
+ if ((index != REG_NULL) && (scale != 1) && (scale != 2) && (scale != 4) &&
+ (scale != 8)) {
+ intel->invalid = _gf_true;
+ return;
+ }
+ scale >>= 1;
+ if (scale == 4) {
+ scale = 3;
+ }
+
+ intel->modrm.present = _gf_true;
+ intel->modrm.reg = reg;
+
+ intel->offset.value = offset;
+ if ((offset == 0) && (base != REG_BP)) {
+ intel->modrm.mod = 0;
+ intel->offset.bytes = 0;
+ } else if ((offset >= -128) && (offset <= 127)) {
+ intel->modrm.mod = 1;
+ intel->offset.bytes = 1;
+ } else {
+ intel->modrm.mod = 2;
+ intel->offset.bytes = 4;
+ }
+
+ intel->modrm.rm = base;
+ if ((index != REG_NULL) || (base == REG_SP)) {
+ intel->modrm.rm = 4;
+ intel->sib.present = _gf_true;
+ intel->sib.index = index;
+ if (index == REG_NULL) {
+ intel->sib.index = 4;
+ }
+ intel->sib.scale = scale;
+ intel->sib.base = base;
+ if (base == REG_NULL) {
+ intel->sib.base = 5;
+ intel->modrm.mod = 0;
+ intel->offset.bytes = 4;
+ }
+ } else if (base == REG_NULL) {
+ intel->modrm.mod = 0;
+ intel->modrm.rm = 5;
+ intel->offset.bytes = 4;
+ }
+}
+
+static void
+ec_code_intel_op_1(ec_code_intel_t *intel, uint8_t opcode, uint32_t reg)
+{
+ intel->reg = reg;
+ intel->opcode.bytes = 1;
+ intel->opcode.data[0] = opcode;
+}
+
+static void
+ec_code_intel_op_2(ec_code_intel_t *intel, uint8_t opcode1, uint8_t opcode2,
+ uint32_t reg)
+{
+ intel->reg = reg;
+ intel->opcode.bytes = 2;
+ intel->opcode.data[0] = opcode1;
+ intel->opcode.data[1] = opcode2;
+}
+
+static void
+ec_code_intel_immediate_1(ec_code_intel_t *intel, uint32_t value)
+{
+ intel->immediate.bytes = 1;
+ intel->immediate.value = value;
+}
+
+static void
+ec_code_intel_immediate_2(ec_code_intel_t *intel, uint32_t value)
+{
+ intel->immediate.bytes = 2;
+ intel->immediate.value = value;
+}
+
+static void
+ec_code_intel_immediate_4(ec_code_intel_t *intel, uint32_t value)
+{
+ intel->immediate.bytes = 4;
+ intel->immediate.value = value;
+}
+
+static void
+ec_code_intel_emit(ec_code_builder_t *builder, ec_code_intel_t *intel)
+{
+ uint8_t insn[15];
+ uint32_t i, count;
+
+ if (intel->invalid) {
+ ec_code_error(builder, EINVAL);
+ return;
+ }
+
+ count = 0;
+ for (i = 0; i < intel->prefix.bytes; i++) {
+ insn[count++] = intel->prefix.data[i];
+ }
+ for (i = 0; i < intel->vex.bytes; i++) {
+ insn[count++] = intel->vex.data[i];
+ }
+ if (intel->rex.present) {
+ insn[count++] = 0x40 | (intel->rex.w << 3) | (intel->rex.r << 2) |
+ (intel->rex.x << 1) | (intel->rex.b << 0);
+ }
+ for (i = 0; i < intel->opcode.bytes; i++) {
+ insn[count++] = intel->opcode.data[i];
+ }
+ if (intel->modrm.present) {
+ insn[count++] = (intel->modrm.mod << 6) | (intel->modrm.reg << 3) |
+ (intel->modrm.rm << 0);
+ if (intel->sib.present) {
+ insn[count++] = (intel->sib.scale << 6) | (intel->sib.index << 3) |
+ (intel->sib.base << 0);
+ }
+ }
+ for (i = 0; i < intel->offset.bytes; i++) {
+ insn[count++] = intel->offset.data[i];
+ }
+ for (i = 0; i < intel->immediate.bytes; i++) {
+ insn[count++] = intel->immediate.data[i];
+ }
+
+ ec_code_emit(builder, insn, count);
+}
+
+void
+ec_code_intel_op_push_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_op_1(&intel, 0x50 | (reg & 7), reg);
+ ec_code_intel_rex(&intel, _gf_false);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_pop_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_op_1(&intel, 0x58 | (reg & 7), reg);
+ ec_code_intel_rex(&intel, _gf_false);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_ret(ec_code_builder_t *builder, uint32_t size)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ if (size == 0) {
+ ec_code_intel_op_1(&intel, 0xC3, 0);
+ } else {
+ ec_code_intel_immediate_2(&intel, size);
+ ec_code_intel_op_1(&intel, 0xC2, 0);
+ }
+ ec_code_intel_rex(&intel, _gf_false);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_mov_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+ ec_code_intel_reg_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_modrm_reg(&intel, dst, src);
+ ec_code_intel_op_1(&intel, 0x89, 0);
+ ec_code_intel_rex(&intel, _gf_true);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_mov_r2m(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+ ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+ uint32_t scale, int32_t offset)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset);
+ ec_code_intel_op_1(&intel, 0x89, 0);
+ ec_code_intel_rex(&intel, _gf_true);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_mov_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, ec_code_intel_reg_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset);
+ ec_code_intel_op_1(&intel, 0x8B, 0);
+ ec_code_intel_rex(&intel, _gf_true);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_xor_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+ ec_code_intel_reg_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_modrm_reg(&intel, dst, src);
+ ec_code_intel_op_1(&intel, 0x31, 0);
+ ec_code_intel_rex(&intel, _gf_true);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_xor_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, ec_code_intel_reg_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset);
+ ec_code_intel_op_1(&intel, 0x33, 0);
+ ec_code_intel_rex(&intel, _gf_true);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_add_i2r(ec_code_builder_t *builder, int32_t value,
+ ec_code_intel_reg_t reg)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ if ((value >= -128) && (value < 128)) {
+ ec_code_intel_modrm_reg(&intel, reg, 0);
+ ec_code_intel_op_1(&intel, 0x83, 0);
+ ec_code_intel_immediate_1(&intel, value);
+ } else {
+ if (reg == REG_AX) {
+ ec_code_intel_op_1(&intel, 0x05, reg);
+ } else {
+ ec_code_intel_modrm_reg(&intel, reg, 0);
+ ec_code_intel_op_1(&intel, 0x81, 0);
+ }
+ ec_code_intel_immediate_4(&intel, value);
+ }
+ ec_code_intel_rex(&intel, _gf_true);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_test_i2r(ec_code_builder_t *builder, uint32_t value,
+ ec_code_intel_reg_t reg)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ if (reg == REG_AX) {
+ ec_code_intel_op_1(&intel, 0xA9, reg);
+ } else {
+ ec_code_intel_modrm_reg(&intel, reg, 0);
+ ec_code_intel_op_1(&intel, 0xF7, 0);
+ }
+ ec_code_intel_immediate_4(&intel, value);
+ ec_code_intel_rex(&intel, _gf_true);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_jne(ec_code_builder_t *builder, uint32_t address)
+{
+ ec_code_intel_t intel;
+ int32_t rel;
+
+ ec_code_intel_init(&intel);
+
+ rel = address - builder->address - 2;
+ if ((rel >= -128) && (rel < 128)) {
+ ec_code_intel_op_1(&intel, 0x75, 0);
+ ec_code_intel_immediate_1(&intel, rel);
+ } else {
+ rel -= 4;
+ ec_code_intel_op_2(&intel, 0x0F, 0x85, 0);
+ ec_code_intel_immediate_4(&intel, rel);
+ }
+ ec_code_intel_rex(&intel, _gf_false);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_mov_sse2sse(ec_code_builder_t *builder, uint32_t src,
+ uint32_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_prefix(&intel, 0x66);
+ ec_code_intel_modrm_reg(&intel, src, dst);
+ ec_code_intel_op_2(&intel, 0x0F, 0x6F, 0);
+ ec_code_intel_rex(&intel, _gf_false);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_mov_sse2m(ec_code_builder_t *builder, uint32_t src,
+ ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+ uint32_t scale, int32_t offset)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_prefix(&intel, 0x66);
+ ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset);
+ ec_code_intel_op_2(&intel, 0x0F, 0x7F, 0);
+ ec_code_intel_rex(&intel, _gf_false);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_mov_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, uint32_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_prefix(&intel, 0x66);
+ ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset);
+ ec_code_intel_op_2(&intel, 0x0F, 0x6F, 0);
+ ec_code_intel_rex(&intel, _gf_false);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_xor_sse2sse(ec_code_builder_t *builder, uint32_t src,
+ uint32_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_prefix(&intel, 0x66);
+ ec_code_intel_modrm_reg(&intel, src, dst);
+ ec_code_intel_op_2(&intel, 0x0F, 0xEF, 0);
+ ec_code_intel_rex(&intel, _gf_false);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_xor_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, uint32_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_prefix(&intel, 0x66);
+ ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset);
+ ec_code_intel_op_2(&intel, 0x0F, 0xEF, 0);
+ ec_code_intel_rex(&intel, _gf_false);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_mov_avx2avx(ec_code_builder_t *builder, uint32_t src,
+ uint32_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_modrm_reg(&intel, src, dst);
+ ec_code_intel_op_1(&intel, 0x6F, 0);
+ ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66,
+ VEX_REG_NONE);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_mov_avx2m(ec_code_builder_t *builder, uint32_t src,
+ ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+ uint32_t scale, int32_t offset)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset);
+ ec_code_intel_op_1(&intel, 0x7F, 0);
+ ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66,
+ VEX_REG_NONE);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_mov_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, uint32_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset);
+ ec_code_intel_op_1(&intel, 0x6F, 0);
+ ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66,
+ VEX_REG_NONE);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_xor_avx2avx(ec_code_builder_t *builder, uint32_t src,
+ uint32_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_modrm_reg(&intel, src, dst);
+ ec_code_intel_op_1(&intel, 0xEF, 0);
+ ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66,
+ dst);
+
+ ec_code_intel_emit(builder, &intel);
+}
+
+void
+ec_code_intel_op_xor_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, uint32_t dst)
+{
+ ec_code_intel_t intel;
+
+ ec_code_intel_init(&intel);
+
+ ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset);
+ ec_code_intel_op_1(&intel, 0xEF, 0);
+ ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66,
+ dst);
+
+ ec_code_intel_emit(builder, &intel);
+}
diff --git a/xlators/cluster/ec/src/ec-code-intel.h b/xlators/cluster/ec/src/ec-code-intel.h
new file mode 100644
index 00000000000..3fa4a174765
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-intel.h
@@ -0,0 +1,191 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_INTEL_H__
+#define __EC_CODE_INTEL_H__
+
+#include "ec-code.h"
+
+#define VEX_REG_NONE 0
+
+enum _ec_code_intel_reg;
+typedef enum _ec_code_intel_reg ec_code_intel_reg_t;
+
+enum _ec_code_vex_prefix;
+typedef enum _ec_code_vex_prefix ec_code_vex_prefix_t;
+
+enum _ec_code_vex_opcode;
+typedef enum _ec_code_vex_opcode ec_code_vex_opcode_t;
+
+struct _ec_code_intel_buffer;
+typedef struct _ec_code_intel_buffer ec_code_intel_buffer_t;
+
+struct _ec_code_intel_sib;
+typedef struct _ec_code_intel_sib ec_code_intel_sib_t;
+
+struct _ec_code_intel_modrm;
+typedef struct _ec_code_intel_modrm ec_code_intel_modrm_t;
+
+struct _ec_code_intel_rex;
+typedef struct _ec_code_intel_rex ec_code_intel_rex_t;
+
+struct _ec_code_intel;
+typedef struct _ec_code_intel ec_code_intel_t;
+
+enum _ec_code_intel_reg {
+ REG_NULL = -1,
+ REG_AX,
+ REG_CX,
+ REG_DX,
+ REG_BX,
+ REG_SP,
+ REG_BP,
+ REG_SI,
+ REG_DI,
+ REG_8,
+ REG_9,
+ REG_10,
+ REG_11,
+ REG_12,
+ REG_13,
+ REG_14,
+ REG_15
+};
+
+enum _ec_code_vex_prefix {
+ VEX_PREFIX_NONE = 0,
+ VEX_PREFIX_66,
+ VEX_PREFIX_F3,
+ VEX_PREFIX_F2
+};
+
+enum _ec_code_vex_opcode {
+ VEX_OPCODE_NONE = 0,
+ VEX_OPCODE_0F,
+ VEX_OPCODE_0F_38,
+ VEX_OPCODE_0F_3A
+};
+
+struct _ec_code_intel_buffer {
+ uint32_t bytes;
+ union {
+ uint8_t data[4];
+ uint32_t value;
+ };
+};
+
+struct _ec_code_intel_sib {
+ gf_boolean_t present;
+ uint32_t base;
+ uint32_t index;
+ uint32_t scale;
+};
+
+struct _ec_code_intel_modrm {
+ gf_boolean_t present;
+ uint32_t mod;
+ uint32_t rm;
+ uint32_t reg;
+};
+
+struct _ec_code_intel_rex {
+ gf_boolean_t present;
+ uint32_t w;
+ uint32_t r;
+ uint32_t x;
+ uint32_t b;
+};
+
+struct _ec_code_intel {
+ gf_boolean_t invalid;
+ ec_code_intel_buffer_t prefix;
+ ec_code_intel_buffer_t opcode;
+ ec_code_intel_buffer_t offset;
+ ec_code_intel_buffer_t immediate;
+ ec_code_intel_buffer_t vex;
+ ec_code_intel_rex_t rex;
+ ec_code_intel_modrm_t modrm;
+ ec_code_intel_sib_t sib;
+ uint32_t reg;
+};
+
+void
+ec_code_intel_op_push_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg);
+void
+ec_code_intel_op_pop_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg);
+void
+ec_code_intel_op_ret(ec_code_builder_t *builder, uint32_t size);
+
+void
+ec_code_intel_op_mov_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+ ec_code_intel_reg_t dst);
+void
+ec_code_intel_op_mov_r2m(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+ ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+ uint32_t scale, int32_t offset);
+void
+ec_code_intel_op_mov_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, ec_code_intel_reg_t dst);
+void
+ec_code_intel_op_xor_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+ ec_code_intel_reg_t dst);
+void
+ec_code_intel_op_xor_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, ec_code_intel_reg_t dst);
+void
+ec_code_intel_op_add_i2r(ec_code_builder_t *builder, int32_t value,
+ ec_code_intel_reg_t reg);
+void
+ec_code_intel_op_test_i2r(ec_code_builder_t *builder, uint32_t value,
+ ec_code_intel_reg_t reg);
+void
+ec_code_intel_op_jne(ec_code_builder_t *builder, uint32_t address);
+
+void
+ec_code_intel_op_mov_sse2sse(ec_code_builder_t *builder, uint32_t src,
+ uint32_t dst);
+void
+ec_code_intel_op_mov_sse2m(ec_code_builder_t *builder, uint32_t src,
+ ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+ uint32_t scale, int32_t offset);
+void
+ec_code_intel_op_mov_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, uint32_t dst);
+void
+ec_code_intel_op_xor_sse2sse(ec_code_builder_t *builder, uint32_t src,
+ uint32_t dst);
+void
+ec_code_intel_op_xor_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, uint32_t dst);
+
+void
+ec_code_intel_op_mov_avx2avx(ec_code_builder_t *builder, uint32_t src,
+ uint32_t dst);
+void
+ec_code_intel_op_mov_avx2m(ec_code_builder_t *builder, uint32_t src,
+ ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+ uint32_t scale, int32_t offset);
+void
+ec_code_intel_op_mov_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, uint32_t dst);
+void
+ec_code_intel_op_xor_avx2avx(ec_code_builder_t *builder, uint32_t src,
+ uint32_t dst);
+void
+ec_code_intel_op_xor_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+ ec_code_intel_reg_t index, uint32_t scale,
+ int32_t offset, uint32_t dst);
+
+#endif /* __EC_CODE_INTEL_H__ */
diff --git a/xlators/cluster/ec/src/ec-code-sse.c b/xlators/cluster/ec/src/ec-code-sse.c
new file mode 100644
index 00000000000..e11e7ff8400
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-sse.c
@@ -0,0 +1,101 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+
+#include "ec-code-intel.h"
+
+static void
+ec_code_sse_prolog(ec_code_builder_t *builder)
+{
+ builder->loop = builder->address;
+}
+
+static void
+ec_code_sse_epilog(ec_code_builder_t *builder)
+{
+ ec_code_intel_op_add_i2r(builder, 16, REG_DX);
+ ec_code_intel_op_add_i2r(builder, 16, REG_DI);
+ ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX);
+ ec_code_intel_op_jne(builder, builder->loop);
+
+ ec_code_intel_op_ret(builder, 0);
+}
+
+static void
+ec_code_sse_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+ uint32_t bit)
+{
+ if (builder->linear) {
+ ec_code_intel_op_mov_m2sse(
+ builder, REG_SI, REG_DX, 1,
+ idx * builder->width * builder->bits + bit * builder->width, dst);
+ } else {
+ if (builder->base != idx) {
+ ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+ REG_AX);
+ builder->base = idx;
+ }
+ ec_code_intel_op_mov_m2sse(builder, REG_AX, REG_DX, 1,
+ bit * builder->width, dst);
+ }
+}
+
+static void
+ec_code_sse_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit)
+{
+ ec_code_intel_op_mov_sse2m(builder, src, REG_DI, REG_NULL, 0,
+ bit * builder->width);
+}
+
+static void
+ec_code_sse_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+ ec_code_intel_op_mov_sse2sse(builder, src, dst);
+}
+
+static void
+ec_code_sse_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+ ec_code_intel_op_xor_sse2sse(builder, src, dst);
+}
+
+static void
+ec_code_sse_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+ uint32_t bit)
+{
+ if (builder->linear) {
+ ec_code_intel_op_xor_m2sse(
+ builder, REG_SI, REG_DX, 1,
+ idx * builder->width * builder->bits + bit * builder->width, dst);
+ } else {
+ if (builder->base != idx) {
+ ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+ REG_AX);
+ builder->base = idx;
+ }
+ ec_code_intel_op_xor_m2sse(builder, REG_AX, REG_DX, 1,
+ bit * builder->width, dst);
+ }
+}
+
+static char *ec_code_sse_needed_flags[] = {"sse2", NULL};
+
+ec_code_gen_t ec_code_gen_sse = {.name = "sse",
+ .flags = ec_code_sse_needed_flags,
+ .width = 16,
+ .prolog = ec_code_sse_prolog,
+ .epilog = ec_code_sse_epilog,
+ .load = ec_code_sse_load,
+ .store = ec_code_sse_store,
+ .copy = ec_code_sse_copy,
+ .xor2 = ec_code_sse_xor2,
+ .xor3 = NULL,
+ .xorm = ec_code_sse_xorm};
diff --git a/xlators/cluster/ec/src/ec-code-sse.h b/xlators/cluster/ec/src/ec-code-sse.h
new file mode 100644
index 00000000000..f1acbcf894b
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-sse.h
@@ -0,0 +1,18 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_SSE_H__
+#define __EC_CODE_SSE_H__
+
+#include "ec-code.h"
+
+extern ec_code_gen_t ec_code_gen_sse;
+
+#endif /* __EC_CODE_SSE_H__ */
diff --git a/xlators/cluster/ec/src/ec-code-x64.c b/xlators/cluster/ec/src/ec-code-x64.c
new file mode 100644
index 00000000000..26565b4493f
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-x64.c
@@ -0,0 +1,144 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+
+#include "ec-code-intel.h"
+
+static ec_code_intel_reg_t ec_code_x64_regmap[] = {
+ REG_AX, REG_CX, REG_BP, REG_8, REG_9, REG_10,
+ REG_11, REG_12, REG_13, REG_14, REG_15};
+
+static void
+ec_code_x64_prolog(ec_code_builder_t *builder)
+{
+ uint32_t i;
+
+ ec_code_intel_op_push_r(builder, REG_BP);
+ if (!builder->linear) {
+ ec_code_intel_op_push_r(builder, REG_BX);
+ }
+ if (builder->regs > 11) {
+ ec_code_error(builder, EINVAL);
+ return;
+ }
+ for (i = 7; i < builder->regs; i++) {
+ ec_code_intel_op_push_r(builder, ec_code_x64_regmap[i]);
+ }
+
+ builder->loop = builder->address;
+}
+
+static void
+ec_code_x64_epilog(ec_code_builder_t *builder)
+{
+ uint32_t i;
+
+ ec_code_intel_op_add_i2r(builder, 8, REG_DX);
+ ec_code_intel_op_add_i2r(builder, 8, REG_DI);
+ ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX);
+ ec_code_intel_op_jne(builder, builder->loop);
+
+ if (builder->regs > 11) {
+ ec_code_error(builder, EINVAL);
+ return;
+ }
+ for (i = builder->regs; i > 7; i--) {
+ ec_code_intel_op_pop_r(builder, ec_code_x64_regmap[i - 1]);
+ }
+ if (!builder->linear) {
+ ec_code_intel_op_pop_r(builder, REG_BX);
+ }
+ ec_code_intel_op_pop_r(builder, REG_BP);
+ ec_code_intel_op_ret(builder, 0);
+}
+
+static void
+ec_code_x64_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+ uint32_t bit)
+{
+ dst = ec_code_x64_regmap[dst];
+
+ if (builder->linear) {
+ ec_code_intel_op_mov_m2r(
+ builder, REG_SI, REG_DX, 1,
+ idx * builder->width * builder->bits + bit * builder->width, dst);
+ } else {
+ if (builder->base != idx) {
+ ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+ REG_BX);
+ builder->base = idx;
+ }
+ ec_code_intel_op_mov_m2r(builder, REG_BX, REG_DX, 1,
+ bit * builder->width, dst);
+ }
+}
+
+static void
+ec_code_x64_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit)
+{
+ src = ec_code_x64_regmap[src];
+
+ ec_code_intel_op_mov_r2m(builder, src, REG_DI, REG_NULL, 0,
+ bit * builder->width);
+}
+
+static void
+ec_code_x64_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+ dst = ec_code_x64_regmap[dst];
+ src = ec_code_x64_regmap[src];
+
+ ec_code_intel_op_mov_r2r(builder, src, dst);
+}
+
+static void
+ec_code_x64_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+ dst = ec_code_x64_regmap[dst];
+ src = ec_code_x64_regmap[src];
+
+ ec_code_intel_op_xor_r2r(builder, src, dst);
+}
+
+static void
+ec_code_x64_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+ uint32_t bit)
+{
+ dst = ec_code_x64_regmap[dst];
+
+ if (builder->linear) {
+ ec_code_intel_op_xor_m2r(
+ builder, REG_SI, REG_DX, 1,
+ idx * builder->width * builder->bits + bit * builder->width, dst);
+ } else {
+ if (builder->base != idx) {
+ ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+ REG_BX);
+ builder->base = idx;
+ }
+ ec_code_intel_op_xor_m2r(builder, REG_BX, REG_DX, 1,
+ bit * builder->width, dst);
+ }
+}
+
+static char *ec_code_x64_needed_flags[] = {NULL};
+
+ec_code_gen_t ec_code_gen_x64 = {.name = "x64",
+ .flags = ec_code_x64_needed_flags,
+ .width = sizeof(uint64_t),
+ .prolog = ec_code_x64_prolog,
+ .epilog = ec_code_x64_epilog,
+ .load = ec_code_x64_load,
+ .store = ec_code_x64_store,
+ .copy = ec_code_x64_copy,
+ .xor2 = ec_code_x64_xor2,
+ .xor3 = NULL,
+ .xorm = ec_code_x64_xorm};
diff --git a/xlators/cluster/ec/src/ec-code-x64.h b/xlators/cluster/ec/src/ec-code-x64.h
new file mode 100644
index 00000000000..bd8174e4bf5
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-x64.h
@@ -0,0 +1,18 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_X64_H__
+#define __EC_CODE_X64_H__
+
+#include "ec-code.h"
+
+extern ec_code_gen_t ec_code_gen_x64;
+
+#endif /* __EC_CODE_X64_H__ */
diff --git a/xlators/cluster/ec/src/ec-code.c b/xlators/cluster/ec/src/ec-code.c
new file mode 100644
index 00000000000..03162ae05a9
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code.c
@@ -0,0 +1,1060 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <ctype.h>
+
+#include <glusterfs/syscall.h>
+
+#include "ec-mem-types.h"
+#include "ec-code.h"
+#include "ec-messages.h"
+#include "ec-code-c.h"
+#include "ec-helpers.h"
+
+#ifdef USE_EC_DYNAMIC_X64
+#include "ec-code-x64.h"
+#endif
+
+#ifdef USE_EC_DYNAMIC_SSE
+#include "ec-code-sse.h"
+#endif
+
+#ifdef USE_EC_DYNAMIC_AVX
+#include "ec-code-avx.h"
+#endif
+
+#define EC_CODE_SIZE (1024 * 64)
+#define EC_CODE_ALIGN 4096
+
+#define EC_CODE_CHUNK_MIN_SIZE 512
+
+#define EC_PROC_BUFFER_SIZE 4096
+
+#define PROC_CPUINFO "/proc/cpuinfo"
+
+struct _ec_code_proc;
+typedef struct _ec_code_proc ec_code_proc_t;
+
+struct _ec_code_proc {
+ int32_t fd;
+ gf_boolean_t eof;
+ gf_boolean_t error;
+ gf_boolean_t skip;
+ ssize_t size;
+ ssize_t pos;
+ char buffer[EC_PROC_BUFFER_SIZE];
+};
+
+static ec_code_gen_t *ec_code_gen_table[] = {
+#ifdef USE_EC_DYNAMIC_AVX
+ &ec_code_gen_avx,
+#endif
+#ifdef USE_EC_DYNAMIC_SSE
+ &ec_code_gen_sse,
+#endif
+#ifdef USE_EC_DYNAMIC_X64
+ &ec_code_gen_x64,
+#endif
+ NULL};
+
+static void
+ec_code_arg_set(ec_code_arg_t *arg, uint32_t value)
+{
+ arg->value = value;
+}
+
+static void
+ec_code_arg_assign(ec_code_builder_t *builder, ec_code_op_t *op,
+ ec_code_arg_t *arg, uint32_t reg)
+{
+ arg->value = reg;
+
+ if (builder->regs <= reg) {
+ builder->regs = reg + 1;
+ }
+}
+
+static void
+ec_code_arg_use(ec_code_builder_t *builder, ec_code_op_t *op,
+ ec_code_arg_t *arg, uint32_t reg)
+{
+ arg->value = reg;
+}
+
+static void
+ec_code_arg_update(ec_code_builder_t *builder, ec_code_op_t *op,
+ ec_code_arg_t *arg, uint32_t reg)
+{
+ arg->value = reg;
+}
+
+static ec_code_op_t *
+ec_code_op_next(ec_code_builder_t *builder)
+{
+ ec_code_op_t *op;
+
+ op = &builder->ops[builder->count++];
+ memset(op, 0, sizeof(ec_code_op_t));
+
+ return op;
+}
+
+static void
+ec_code_load(ec_code_builder_t *builder, uint32_t bit, uint32_t offset)
+{
+ ec_code_op_t *op;
+
+ op = ec_code_op_next(builder);
+
+ op->op = EC_GF_OP_LOAD;
+ ec_code_arg_assign(builder, op, &op->arg1, builder->map[bit]);
+ ec_code_arg_set(&op->arg2, offset);
+ ec_code_arg_set(&op->arg3, bit);
+}
+
+static void
+ec_code_store(ec_code_builder_t *builder, uint32_t reg, uint32_t bit)
+{
+ ec_code_op_t *op;
+
+ op = ec_code_op_next(builder);
+
+ op->op = EC_GF_OP_STORE;
+ ec_code_arg_use(builder, op, &op->arg1, builder->map[reg]);
+ ec_code_arg_set(&op->arg2, 0);
+ ec_code_arg_set(&op->arg3, bit);
+}
+
+static void
+ec_code_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+ ec_code_op_t *op;
+
+ op = ec_code_op_next(builder);
+
+ op->op = EC_GF_OP_COPY;
+ ec_code_arg_assign(builder, op, &op->arg1, builder->map[dst]);
+ ec_code_arg_use(builder, op, &op->arg2, builder->map[src]);
+ ec_code_arg_set(&op->arg3, 0);
+}
+
+static void
+ec_code_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+ ec_code_op_t *op;
+
+ op = ec_code_op_next(builder);
+
+ op->op = EC_GF_OP_XOR2;
+ ec_code_arg_update(builder, op, &op->arg1, builder->map[dst]);
+ ec_code_arg_use(builder, op, &op->arg2, builder->map[src]);
+ ec_code_arg_set(&op->arg3, 0);
+}
+
+static void
+ec_code_xor3(ec_code_builder_t *builder, uint32_t dst, uint32_t src1,
+ uint32_t src2)
+{
+ ec_code_op_t *op;
+
+ if (builder->code->gen->xor3 == NULL) {
+ ec_code_copy(builder, dst, src1);
+ ec_code_xor2(builder, dst, src2);
+
+ return;
+ }
+
+ op = ec_code_op_next(builder);
+
+ op->op = EC_GF_OP_XOR3;
+ ec_code_arg_assign(builder, op, &op->arg1, builder->map[dst]);
+ ec_code_arg_use(builder, op, &op->arg2, builder->map[src1]);
+ ec_code_arg_use(builder, op, &op->arg3, builder->map[src2]);
+}
+
+static void
+ec_code_xorm(ec_code_builder_t *builder, uint32_t bit, uint32_t offset)
+{
+ ec_code_op_t *op;
+
+ op = ec_code_op_next(builder);
+
+ op->op = EC_GF_OP_XORM;
+ ec_code_arg_update(builder, op, &op->arg1, builder->map[bit]);
+ ec_code_arg_set(&op->arg2, offset);
+ ec_code_arg_set(&op->arg3, bit);
+}
+
+static void
+ec_code_dup(ec_code_builder_t *builder, ec_gf_op_t *op)
+{
+ switch (op->op) {
+ case EC_GF_OP_COPY:
+ ec_code_copy(builder, op->arg1, op->arg2);
+ break;
+ case EC_GF_OP_XOR2:
+ ec_code_xor2(builder, op->arg1, op->arg2);
+ break;
+ case EC_GF_OP_XOR3:
+ ec_code_xor3(builder, op->arg1, op->arg2, op->arg3);
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+ec_code_gf_load(ec_code_builder_t *builder, uint32_t offset)
+{
+ uint32_t i;
+
+ for (i = 0; i < builder->code->gf->bits; i++) {
+ ec_code_load(builder, i, offset);
+ }
+}
+
+static void
+ec_code_gf_load_xor(ec_code_builder_t *builder, uint32_t offset)
+{
+ uint32_t i;
+
+ for (i = 0; i < builder->code->gf->bits; i++) {
+ ec_code_xorm(builder, i, offset);
+ }
+}
+
+static void
+ec_code_gf_store(ec_code_builder_t *builder)
+{
+ uint32_t i;
+
+ for (i = 0; i < builder->code->gf->bits; i++) {
+ ec_code_store(builder, i, i);
+ }
+}
+
+static void
+ec_code_gf_clear(ec_code_builder_t *builder)
+{
+ uint32_t i;
+
+ ec_code_xor2(builder, 0, 0);
+ for (i = 0; i < builder->code->gf->bits; i++) {
+ ec_code_store(builder, 0, i);
+ }
+}
+
+static void
+ec_code_gf_mul(ec_code_builder_t *builder, uint32_t value)
+{
+ ec_gf_mul_t *mul;
+ ec_gf_op_t *op;
+ uint32_t map[EC_GF_MAX_REGS];
+ int32_t i;
+
+ mul = builder->code->gf->table[value];
+ for (op = mul->ops; op->op != EC_GF_OP_END; op++) {
+ ec_code_dup(builder, op);
+ }
+
+ for (i = 0; i < mul->regs; i++) {
+ map[i] = builder->map[mul->map[i]];
+ }
+ memcpy(builder->map, map, sizeof(uint32_t) * mul->regs);
+}
+
+static ec_code_builder_t *
+ec_code_prepare(ec_code_t *code, uint32_t count, uint32_t width,
+ gf_boolean_t linear)
+{
+ ec_code_builder_t *builder;
+ uint32_t i;
+
+ count *= code->gf->bits + code->gf->max_ops;
+ count += code->gf->bits;
+ builder = GF_MALLOC(
+ sizeof(ec_code_builder_t) + sizeof(ec_code_op_t) * count,
+ ec_mt_ec_code_builder_t);
+ if (builder == NULL) {
+ return EC_ERR(ENOMEM);
+ }
+
+ builder->address = 0;
+ builder->code = code;
+ builder->size = 0;
+ builder->count = 0;
+ builder->regs = 0;
+ builder->error = 0;
+ builder->bits = code->gf->bits;
+ builder->width = width;
+ builder->data = NULL;
+ builder->linear = linear;
+ builder->base = -1;
+
+ for (i = 0; i < EC_GF_MAX_REGS; i++) {
+ builder->map[i] = i;
+ }
+
+ return builder;
+}
+
+static size_t
+ec_code_space_size(void)
+{
+ return (sizeof(ec_code_space_t) + 15) & ~15;
+}
+
+static size_t
+ec_code_chunk_size(void)
+{
+ return (sizeof(ec_code_chunk_t) + 15) & ~15;
+}
+
+static ec_code_chunk_t *
+ec_code_chunk_from_space(ec_code_space_t *space)
+{
+ return (ec_code_chunk_t *)((uintptr_t)space + ec_code_space_size());
+}
+
+static void *
+ec_code_to_executable(ec_code_space_t *space, void *addr)
+{
+ return (void *)((uintptr_t)addr - (uintptr_t)space +
+ (uintptr_t)space->exec);
+}
+
+static void *
+ec_code_from_executable(ec_code_space_t *space, void *addr)
+{
+ return (void *)((uintptr_t)addr - (uintptr_t)space->exec +
+ (uintptr_t)space);
+}
+
+static void *
+ec_code_func_from_chunk(ec_code_chunk_t *chunk, void **exec)
+{
+ void *addr;
+
+ addr = (void *)((uintptr_t)chunk + ec_code_chunk_size());
+
+ *exec = ec_code_to_executable(chunk->space, addr);
+
+ return addr;
+}
+
+static ec_code_chunk_t *
+ec_code_chunk_from_func(ec_code_func_linear_t func)
+{
+ ec_code_chunk_t *chunk;
+
+ chunk = (ec_code_chunk_t *)((uintptr_t)func - ec_code_chunk_size());
+
+ return ec_code_from_executable(chunk->space, chunk);
+}
+
+static ec_code_chunk_t *
+ec_code_chunk_split(ec_code_chunk_t *chunk, size_t size)
+{
+ ec_code_chunk_t *extra;
+ ssize_t avail;
+
+ avail = chunk->size - size - ec_code_chunk_size();
+ if (avail > 0) {
+ extra = (ec_code_chunk_t *)((uintptr_t)chunk + chunk->size - avail);
+ extra->space = chunk->space;
+ extra->size = avail;
+ list_add(&extra->list, &chunk->list);
+ chunk->size = size;
+ }
+ list_del_init(&chunk->list);
+
+ return chunk;
+}
+
+static gf_boolean_t
+ec_code_chunk_touch(ec_code_chunk_t *prev, ec_code_chunk_t *next)
+{
+ uintptr_t end;
+
+ end = (uintptr_t)prev + ec_code_chunk_size() + prev->size;
+ return (end == (uintptr_t)next);
+}
+
+static ec_code_space_t *
+ec_code_space_create(ec_code_t *code, size_t size)
+{
+ char path[] = GLUSTERFS_LIBEXECDIR "/ec-code-dynamic.XXXXXX";
+ ec_code_space_t *space;
+ void *exec;
+ int32_t fd, err;
+
+ /* We need to create memory areas to store the generated dynamic code.
+ * Obviously these areas need to be written to be able to create the
+ * code and they also need to be executable to execute it.
+ *
+ * However it's a bad practice to have a memory region that is both
+ * writable *and* executable. In fact, selinux forbids this and causes
+ * attempts to do so to fail (unless specifically configured).
+ *
+ * To solve the problem we'll use two distinct memory areas mapped to
+ * the same physical storage. One of the memory areas will have write
+ * permission, and the other will have execute permission. Both areas
+ * will have the same contents. The physical storage will be a regular
+ * file that will be mmapped to both areas.
+ */
+
+ /* We need to create a temporary file as the backend storage for the
+ * memory mapped areas. */
+ /* coverity[secure_temp] mkstemp uses 0600 as the mode and is safe */
+ fd = mkstemp(path);
+ if (fd < 0) {
+ err = errno;
+ gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED,
+ "Unable to create a temporary file for the ec dynamic "
+ "code");
+ space = EC_ERR(err);
+ goto done;
+ }
+ /* Once created we don't need to keep it in the file system. It will
+ * still exist until we close the last file descriptor or unmap the
+ * memory areas bound to the file. */
+ sys_unlink(path);
+
+ size = (size + EC_CODE_ALIGN - 1) & ~(EC_CODE_ALIGN - 1);
+ if (sys_ftruncate(fd, size) < 0) {
+ err = errno;
+ gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED,
+ "Unable to resize the file for the ec dynamic code");
+ space = EC_ERR(err);
+ goto done_close;
+ }
+
+ /* This creates an executable memory area to be able to run the
+ * generated fragments of code. */
+ exec = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
+ if (exec == MAP_FAILED) {
+ err = errno;
+ gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED,
+ "Unable to map the executable area for the ec dynamic "
+ "code");
+ space = EC_ERR(err);
+ goto done_close;
+ }
+ /* It's not important to check the return value of mlock(). If it fails
+ * everything will continue to work normally. */
+ mlock(exec, size);
+
+ /* This maps a read/write memory area to be able to create the dynamici
+ * code. */
+ space = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (space == MAP_FAILED) {
+ err = errno;
+ gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED,
+ "Unable to map the writable area for the ec dynamic "
+ "code");
+ space = EC_ERR(err);
+
+ munmap(exec, size);
+
+ goto done_close;
+ }
+
+ space->exec = exec;
+ space->size = size;
+ space->code = code;
+ list_add_tail(&space->list, &code->spaces);
+ INIT_LIST_HEAD(&space->chunks);
+
+done_close:
+ /* If everything has succeeded, we already have the memory areas
+ * mapped. We don't need the file descriptor anymore because the
+ * backend storage will be there until the mmap()'d regions are
+ * unmapped. */
+ sys_close(fd);
+done:
+ return space;
+}
+
+static void
+ec_code_space_destroy(ec_code_space_t *space)
+{
+ list_del_init(&space->list);
+
+ munmap(space->exec, space->size);
+ munmap(space, space->size);
+}
+
+static void
+ec_code_chunk_merge(ec_code_chunk_t *chunk)
+{
+ ec_code_chunk_t *item, *tmp;
+
+ list_for_each_entry_safe(item, tmp, &chunk->space->chunks, list)
+ {
+ if ((uintptr_t)item > (uintptr_t)chunk) {
+ list_add_tail(&chunk->list, &item->list);
+ if (ec_code_chunk_touch(chunk, item)) {
+ chunk->size += item->size + ec_code_chunk_size();
+ list_del_init(&item->list);
+ }
+
+ goto check;
+ }
+ if (ec_code_chunk_touch(item, chunk)) {
+ item->size += chunk->size + ec_code_chunk_size();
+ list_del_init(&item->list);
+ chunk = item;
+ }
+ }
+ list_add_tail(&chunk->list, &chunk->space->chunks);
+
+check:
+ if (chunk->size ==
+ chunk->space->size - ec_code_space_size() - ec_code_chunk_size()) {
+ ec_code_space_destroy(chunk->space);
+ }
+}
+
+static ec_code_chunk_t *
+ec_code_space_alloc(ec_code_t *code, size_t size)
+{
+ ec_code_space_t *space;
+ ec_code_chunk_t *chunk;
+ size_t map_size;
+
+ /* To minimize fragmentation, we only allocate chunks of sizes multiples
+ * of EC_CODE_CHUNK_MIN_SIZE. */
+ size = ((size + ec_code_chunk_size() + EC_CODE_CHUNK_MIN_SIZE - 1) &
+ ~(EC_CODE_CHUNK_MIN_SIZE - 1)) -
+ ec_code_chunk_size();
+ list_for_each_entry(space, &code->spaces, list)
+ {
+ list_for_each_entry(chunk, &space->chunks, list)
+ {
+ if (chunk->size >= size) {
+ goto out;
+ }
+ }
+ }
+
+ map_size = EC_CODE_SIZE - ec_code_space_size() - ec_code_chunk_size();
+ if (map_size < size) {
+ map_size = size;
+ }
+ space = ec_code_space_create(code, map_size);
+ if (EC_IS_ERR(space)) {
+ return (ec_code_chunk_t *)space;
+ }
+
+ chunk = ec_code_chunk_from_space(space);
+ chunk->size = map_size - ec_code_space_size() - ec_code_chunk_size();
+ list_add(&chunk->list, &space->chunks);
+
+out:
+ chunk->space = space;
+
+ return ec_code_chunk_split(chunk, size);
+}
+
+static ec_code_chunk_t *
+ec_code_alloc(ec_code_t *code, uint32_t size)
+{
+ ec_code_chunk_t *chunk;
+
+ LOCK(&code->lock);
+
+ chunk = ec_code_space_alloc(code, size);
+
+ UNLOCK(&code->lock);
+
+ return chunk;
+}
+
+static void
+ec_code_free(ec_code_chunk_t *chunk)
+{
+ gf_lock_t *lock;
+
+ lock = &chunk->space->code->lock;
+ LOCK(lock);
+
+ ec_code_chunk_merge(chunk);
+
+ UNLOCK(lock);
+}
+
+static int32_t
+ec_code_write(ec_code_builder_t *builder)
+{
+ ec_code_gen_t *gen;
+ ec_code_op_t *op;
+ uint32_t i;
+
+ builder->error = 0;
+ builder->size = 0;
+ builder->address = 0;
+ builder->base = -1;
+
+ gen = builder->code->gen;
+ gen->prolog(builder);
+ for (i = 0; i < builder->count; i++) {
+ op = &builder->ops[i];
+ switch (op->op) {
+ case EC_GF_OP_LOAD:
+ gen->load(builder, op->arg1.value, op->arg2.value,
+ op->arg3.value);
+ break;
+ case EC_GF_OP_STORE:
+ gen->store(builder, op->arg1.value, op->arg3.value);
+ break;
+ case EC_GF_OP_COPY:
+ gen->copy(builder, op->arg1.value, op->arg2.value);
+ break;
+ case EC_GF_OP_XOR2:
+ gen->xor2(builder, op->arg1.value, op->arg2.value);
+ break;
+ case EC_GF_OP_XOR3:
+ gen->xor3(builder, op->arg1.value, op->arg2.value,
+ op->arg3.value);
+ break;
+ case EC_GF_OP_XORM:
+ gen->xorm(builder, op->arg1.value, op->arg2.value,
+ op->arg3.value);
+ break;
+ default:
+ break;
+ }
+ }
+ gen->epilog(builder);
+
+ return builder->error;
+}
+
+static void *
+ec_code_compile(ec_code_builder_t *builder)
+{
+ ec_code_chunk_t *chunk;
+ void *func;
+ int32_t err;
+
+ err = ec_code_write(builder);
+ if (err != 0) {
+ return EC_ERR(err);
+ }
+
+ chunk = ec_code_alloc(builder->code, builder->size);
+ if (EC_IS_ERR(chunk)) {
+ return chunk;
+ }
+ builder->data = ec_code_func_from_chunk(chunk, &func);
+
+ err = ec_code_write(builder);
+ if (err != 0) {
+ ec_code_free(chunk);
+
+ return EC_ERR(err);
+ }
+
+ GF_FREE(builder);
+
+ return func;
+}
+
+ec_code_t *
+ec_code_create(ec_gf_t *gf, ec_code_gen_t *gen)
+{
+ ec_code_t *code;
+
+ code = GF_MALLOC(sizeof(ec_code_t), ec_mt_ec_code_t);
+ if (code == NULL) {
+ return EC_ERR(ENOMEM);
+ }
+ memset(code, 0, sizeof(ec_code_t));
+ INIT_LIST_HEAD(&code->spaces);
+ LOCK_INIT(&code->lock);
+
+ code->gf = gf;
+ code->gen = gen;
+
+ return code;
+}
+
+void
+ec_code_destroy(ec_code_t *code)
+{
+ if (!list_empty(&code->spaces)) {
+ }
+
+ LOCK_DESTROY(&code->lock);
+
+ GF_FREE(code);
+}
+
+static uint32_t
+ec_code_value_next(uint32_t *values, uint32_t count, uint32_t *offset)
+{
+ uint32_t i, next;
+
+ next = 0;
+ for (i = *offset + 1; i < count; i++) {
+ next = values[i];
+ if (next != 0) {
+ break;
+ }
+ }
+ *offset = i;
+
+ return next;
+}
+
+static void *
+ec_code_build_dynamic(ec_code_t *code, uint32_t width, uint32_t *values,
+ uint32_t count, gf_boolean_t linear)
+{
+ ec_code_builder_t *builder;
+ uint32_t offset, val, next;
+
+ builder = ec_code_prepare(code, count, width, linear);
+ if (EC_IS_ERR(builder)) {
+ return builder;
+ }
+
+ offset = -1;
+ next = ec_code_value_next(values, count, &offset);
+ if (next != 0) {
+ ec_code_gf_load(builder, offset);
+ do {
+ val = next;
+ next = ec_code_value_next(values, count, &offset);
+ if (next != 0) {
+ ec_code_gf_mul(builder, ec_gf_div(code->gf, val, next));
+ ec_code_gf_load_xor(builder, offset);
+ }
+ } while (next != 0);
+ ec_code_gf_mul(builder, val);
+ ec_code_gf_store(builder);
+ } else {
+ ec_code_gf_clear(builder);
+ }
+
+ return ec_code_compile(builder);
+}
+
+static void *
+ec_code_build(ec_code_t *code, uint32_t width, uint32_t *values, uint32_t count,
+ gf_boolean_t linear)
+{
+ void *func;
+
+ if (code->gen != NULL) {
+ func = ec_code_build_dynamic(code, width, values, count, linear);
+ if (!EC_IS_ERR(func)) {
+ return func;
+ }
+
+ gf_msg_debug(THIS->name, GF_LOG_DEBUG,
+ "Unable to generate dynamic code. Falling back "
+ "to precompiled code");
+
+ /* The dynamic code generation shouldn't fail in normal
+ * conditions, but if it fails at some point, it's very
+ * probable that it will fail again, so we completely disable
+ * dynamic code generation. */
+ code->gen = NULL;
+ }
+
+ ec_code_c_prepare(code->gf, values, count);
+
+ if (linear) {
+ return ec_code_c_linear;
+ }
+
+ return ec_code_c_interleaved;
+}
+
+ec_code_func_linear_t
+ec_code_build_linear(ec_code_t *code, uint32_t width, uint32_t *values,
+ uint32_t count)
+{
+ return (ec_code_func_linear_t)ec_code_build(code, width, values, count,
+ _gf_true);
+}
+
+ec_code_func_interleaved_t
+ec_code_build_interleaved(ec_code_t *code, uint32_t width, uint32_t *values,
+ uint32_t count)
+{
+ return (ec_code_func_interleaved_t)ec_code_build(code, width, values, count,
+ _gf_false);
+}
+
+void
+ec_code_release(ec_code_t *code, ec_code_func_t *func)
+{
+ if ((func->linear != ec_code_c_linear) &&
+ (func->interleaved != ec_code_c_interleaved)) {
+ ec_code_free(ec_code_chunk_from_func(func->linear));
+ }
+}
+
+void
+ec_code_error(ec_code_builder_t *builder, int32_t error)
+{
+ if (builder->error == 0) {
+ gf_msg(THIS->name, GF_LOG_ERROR, error, EC_MSG_DYN_CODEGEN_FAILED,
+ "Failed to generate dynamic code");
+ builder->error = error;
+ }
+}
+
+void
+ec_code_emit(ec_code_builder_t *builder, uint8_t *bytes, uint32_t count)
+{
+ if (builder->error != 0) {
+ return;
+ }
+
+ if (builder->data != NULL) {
+ memcpy(builder->data + builder->size, bytes, count);
+ }
+
+ builder->size += count;
+ builder->address += count;
+}
+
+static char *
+ec_code_proc_trim_left(char *text, ssize_t *length)
+{
+ ssize_t len;
+
+ for (len = *length; (len > 0) && isspace(*text); len--) {
+ text++;
+ }
+ *length = len;
+
+ return text;
+}
+
+static char *
+ec_code_proc_trim_right(char *text, ssize_t *length, char sep)
+{
+ char *last;
+ ssize_t len;
+
+ len = *length;
+
+ last = text;
+ for (len = *length; (len > 0) && (*text != sep); len--) {
+ if (!isspace(*text)) {
+ last = text + 1;
+ }
+ text++;
+ }
+ *last = 0;
+ *length = len;
+
+ return text;
+}
+
+static char *
+ec_code_proc_line_parse(ec_code_proc_t *file, ssize_t *length)
+{
+ char *text, *end;
+ ssize_t len;
+
+ len = file->size - file->pos;
+ text = ec_code_proc_trim_left(file->buffer + file->pos, &len);
+ end = ec_code_proc_trim_right(text, &len, '\n');
+ if (len == 0) {
+ if (!file->eof) {
+ if (text == file->buffer) {
+ file->size = file->pos = 0;
+ file->skip = _gf_true;
+ } else {
+ file->size = file->pos = end - text;
+ memmove(file->buffer, text, file->pos + 1);
+ }
+ len = sys_read(file->fd, file->buffer + file->pos,
+ sizeof(file->buffer) - file->pos - 1);
+ if (len > 0) {
+ file->size += len;
+ }
+ file->error = len < 0;
+ file->eof = len <= 0;
+
+ return NULL;
+ }
+ file->size = file->pos = 0;
+ } else {
+ file->pos = end - file->buffer + 1;
+ }
+
+ *length = end - text;
+
+ if (file->skip) {
+ file->skip = _gf_false;
+ text = NULL;
+ }
+
+ return text;
+}
+
+static char *
+ec_code_proc_line(ec_code_proc_t *file, ssize_t *length)
+{
+ char *text;
+
+ text = NULL;
+ while (!file->eof) {
+ text = ec_code_proc_line_parse(file, length);
+ if (text != NULL) {
+ break;
+ }
+ }
+
+ return text;
+}
+
+static char *
+ec_code_proc_split(char *text, ssize_t *length, char sep)
+{
+ text = ec_code_proc_trim_right(text, length, sep);
+ if (*length == 0) {
+ return NULL;
+ }
+ (*length)--;
+ text++;
+
+ return ec_code_proc_trim_left(text, length);
+}
+
+static uint32_t
+ec_code_cpu_check(uint32_t idx, char *list, uint32_t count)
+{
+ ec_code_gen_t *gen;
+ char **ptr;
+ char *table[count + 1];
+ uint32_t i;
+
+ for (i = 0; i < count; i++) {
+ table[i] = list;
+ list += strlen(list) + 1;
+ }
+
+ gen = ec_code_gen_table[idx];
+ while (gen != NULL) {
+ for (ptr = gen->flags; *ptr != NULL; ptr++) {
+ for (i = 0; i < count; i++) {
+ if (strcmp(*ptr, table[i]) == 0) {
+ break;
+ }
+ }
+ if (i >= count) {
+ gen = ec_code_gen_table[++idx];
+ break;
+ }
+ }
+ if (*ptr == NULL) {
+ break;
+ }
+ }
+
+ return idx;
+}
+
+ec_code_gen_t *
+ec_code_detect(xlator_t *xl, const char *def)
+{
+ ec_code_proc_t file;
+ ec_code_gen_t *gen = NULL;
+ char *line, *data, *list;
+ ssize_t length;
+ uint32_t count, base, select;
+
+ if (strcmp(def, "none") == 0) {
+ gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION_NONE,
+ "Not using any cpu extensions");
+
+ return NULL;
+ }
+
+ file.fd = sys_open(PROC_CPUINFO, O_RDONLY, 0);
+ if (file.fd < 0) {
+ goto out;
+ }
+ file.size = file.pos = 0;
+ file.eof = file.error = file.skip = _gf_false;
+
+ select = 0;
+ if (strcmp(def, "auto") != 0) {
+ while (ec_code_gen_table[select] != NULL) {
+ if (strcmp(ec_code_gen_table[select]->name, def) == 0) {
+ break;
+ }
+ select++;
+ }
+ if (ec_code_gen_table[select] == NULL) {
+ gf_msg(xl->name, GF_LOG_WARNING, EINVAL, EC_MSG_EXTENSION_UNKNOWN,
+ "CPU extension '%s' is not known. Not using any cpu "
+ "extensions",
+ def);
+
+ return NULL;
+ }
+ } else {
+ def = NULL;
+ }
+
+ while ((line = ec_code_proc_line(&file, &length)) != NULL) {
+ data = ec_code_proc_split(line, &length, ':');
+ if ((data != NULL) && (strcmp(line, "flags") == 0)) {
+ list = data;
+ count = 0;
+ while ((data != NULL) && (*data != 0)) {
+ count++;
+ data = ec_code_proc_split(data, &length, ' ');
+ }
+ base = select;
+ select = ec_code_cpu_check(select, list, count);
+ if ((base != select) && (def != NULL)) {
+ gf_msg(xl->name, GF_LOG_WARNING, ENOTSUP,
+ EC_MSG_EXTENSION_UNSUPPORTED,
+ "CPU extension '%s' is not supported", def);
+ def = NULL;
+ }
+ }
+ }
+
+ if (file.error) {
+ gf_msg(xl->name, GF_LOG_WARNING, 0, EC_MSG_EXTENSION_FAILED,
+ "Unable to determine supported CPU extensions. Not using any "
+ "cpu extensions");
+
+ gen = NULL;
+ } else {
+ gen = ec_code_gen_table[select];
+ if (gen == NULL) {
+ gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION_NONE,
+ "Not using any cpu extensions");
+ } else {
+ gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION,
+ "Using '%s' CPU extensions", gen->name);
+ }
+ }
+
+ sys_close(file.fd);
+
+out:
+ return gen;
+}
diff --git a/xlators/cluster/ec/src/ec-code.h b/xlators/cluster/ec/src/ec-code.h
new file mode 100644
index 00000000000..75fb35d93e3
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code.h
@@ -0,0 +1,44 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_H__
+#define __EC_CODE_H__
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/list.h>
+
+#include "ec-types.h"
+#include "ec-galois.h"
+
+ec_code_gen_t *
+ec_code_detect(xlator_t *xl, const char *def);
+
+ec_code_t *
+ec_code_create(ec_gf_t *gf, ec_code_gen_t *gen);
+
+void
+ec_code_destroy(ec_code_t *code);
+
+ec_code_func_linear_t
+ec_code_build_linear(ec_code_t *code, uint32_t width, uint32_t *values,
+ uint32_t count);
+ec_code_func_interleaved_t
+ec_code_build_interleaved(ec_code_t *code, uint32_t width, uint32_t *values,
+ uint32_t count);
+void
+ec_code_release(ec_code_t *code, ec_code_func_t *func);
+
+void
+ec_code_error(ec_code_builder_t *builder, int32_t error);
+
+void
+ec_code_emit(ec_code_builder_t *builder, uint8_t *bytes, uint32_t count);
+
+#endif /* __EC_CODE_H__ */
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c
new file mode 100644
index 00000000000..703a30e2485
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-combine.c
@@ -0,0 +1,995 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <fnmatch.h>
+
+#include "libxlator.h"
+#include <glusterfs/byte-order.h>
+
+#include "ec-types.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-messages.h"
+#include <glusterfs/quota-common-utils.h>
+
+#define EC_QUOTA_PREFIX "trusted.glusterfs.quota."
+
+#define EC_MISSING_DATA ((data_t *)1ULL)
+
+struct _ec_dict_info;
+typedef struct _ec_dict_info ec_dict_info_t;
+
+struct _ec_dict_combine;
+typedef struct _ec_dict_combine ec_dict_combine_t;
+
+struct _ec_dict_info {
+ dict_t *dict;
+ int32_t count;
+};
+
+struct _ec_dict_combine {
+ ec_cbk_data_t *cbk;
+ int32_t which;
+};
+
+int32_t
+ec_combine_write(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ int valid = 0;
+
+ if (!fop || !dst || !src)
+ return 0;
+
+ switch (fop->id) {
+ case GF_FOP_REMOVEXATTR:
+ case GF_FOP_FREMOVEXATTR:
+ case GF_FOP_SETXATTR:
+ case GF_FOP_FSETXATTR:
+ return 1;
+
+ case GF_FOP_SYMLINK:
+ case GF_FOP_LINK:
+ case GF_FOP_CREATE:
+ case GF_FOP_MKNOD:
+ case GF_FOP_MKDIR:
+ valid = 3;
+ break;
+ case GF_FOP_UNLINK:
+ case GF_FOP_RMDIR:
+ case GF_FOP_SETATTR:
+ case GF_FOP_FSETATTR:
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_FTRUNCATE:
+ case GF_FOP_WRITE:
+ case GF_FOP_FALLOCATE:
+ case GF_FOP_DISCARD:
+ case GF_FOP_ZEROFILL:
+ valid = 2;
+ break;
+ case GF_FOP_RENAME:
+ valid = 5;
+ break;
+ default:
+ gf_msg_callingfn(fop->xl->name, GF_LOG_WARNING, EINVAL,
+ EC_MSG_INVALID_FOP, "Invalid fop %d", fop->id);
+ return 0;
+ break;
+ }
+
+ if (!ec_iatt_combine(fop, dst->iatt, src->iatt, valid)) {
+ gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+ "Mismatching iatt in "
+ "answers of '%s'",
+ gf_fop_list[fop->id]);
+ return 0;
+ }
+ return 1;
+}
+
+void
+ec_iatt_time_merge(int64_t *dst_sec, uint32_t *dst_nsec, int64_t src_sec,
+ uint32_t src_nsec)
+{
+ if ((*dst_sec < src_sec) ||
+ ((*dst_sec == src_sec) && (*dst_nsec < src_nsec))) {
+ *dst_sec = src_sec;
+ *dst_nsec = src_nsec;
+ }
+}
+
+static gf_boolean_t
+ec_iatt_is_trusted(ec_fop_data_t *fop, struct iatt *iatt)
+{
+ uint64_t ino;
+ int32_t i;
+
+ /* Only the top level fop will have fop->locks filled. */
+ while (fop->parent != NULL) {
+ fop = fop->parent;
+ }
+
+ /* Lookups are special requests always done without locks taken but they
+ * require to be able to identify differences between bricks. Special
+ * handling of these differences is already done in lookup specific code
+ * so we shouldn't ignore any difference here and consider all iatt
+ * structures as trusted. */
+ if (fop->id == GF_FOP_LOOKUP) {
+ return _gf_true;
+ }
+
+ /* Check if the iatt references an inode locked by the current fop */
+ for (i = 0; i < fop->lock_count; i++) {
+ ino = gfid_to_ino(fop->locks[i].lock->loc.inode->gfid);
+ if (iatt->ia_ino == ino) {
+ return _gf_true;
+ }
+ }
+
+ return _gf_false;
+}
+
+int32_t
+ec_iatt_combine(ec_fop_data_t *fop, struct iatt *dst, struct iatt *src,
+ int32_t count)
+{
+ int32_t i;
+ gf_boolean_t failed = _gf_false;
+
+ for (i = 0; i < count; i++) {
+ /* Check for basic fields. These fields must be equal always, even if
+ * the inode is not locked because in these cases the parent inode
+ * will be locked and differences in these fields require changes in
+ * the parent directory. */
+ if ((dst[i].ia_ino != src[i].ia_ino) ||
+ (((dst[i].ia_type == IA_IFBLK) || (dst[i].ia_type == IA_IFCHR)) &&
+ (dst[i].ia_rdev != src[i].ia_rdev)) ||
+ (gf_uuid_compare(dst[i].ia_gfid, src[i].ia_gfid) != 0)) {
+ failed = _gf_true;
+ }
+ /* Check for not so stable fields. These fields can change if the
+ * inode is not locked. */
+ if (!failed && ((dst[i].ia_uid != src[i].ia_uid) ||
+ (dst[i].ia_gid != src[i].ia_gid) ||
+ (st_mode_from_ia(dst[i].ia_prot, dst[i].ia_type) !=
+ st_mode_from_ia(src[i].ia_prot, src[i].ia_type)))) {
+ if (ec_iatt_is_trusted(fop, dst)) {
+ /* If the iatt contains information from an inode that is
+ * locked, these differences are real problems, so we need to
+ * report them. Otherwise we ignore them and don't care which
+ * data is returned. */
+ failed = _gf_true;
+ } else {
+ gf_msg_debug(fop->xl->name, 0,
+ "Ignoring iatt differences because inode is not "
+ "locked");
+ }
+ }
+ if (failed) {
+ gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_IATT_COMBINE_FAIL,
+ "Failed to combine iatt (inode: %" PRIu64 "-%" PRIu64
+ ", "
+ "links: %u-%u, uid: %u-%u, gid: %u-%u, "
+ "rdev: %" PRIu64 "-%" PRIu64 ", size: %" PRIu64 "-%" PRIu64
+ ", "
+ "mode: %o-%o), %s",
+ dst[i].ia_ino, src[i].ia_ino, dst[i].ia_nlink,
+ src[i].ia_nlink, dst[i].ia_uid, src[i].ia_uid, dst[i].ia_gid,
+ src[i].ia_gid, dst[i].ia_rdev, src[i].ia_rdev,
+ dst[i].ia_size, src[i].ia_size,
+ st_mode_from_ia(dst[i].ia_prot, dst[i].ia_type),
+ st_mode_from_ia(src[i].ia_prot, dst[i].ia_type),
+ ec_msg_str(fop));
+
+ return 0;
+ }
+ }
+
+ while (count-- > 0) {
+ dst[count].ia_blocks += src[count].ia_blocks;
+ if (dst[count].ia_blksize < src[count].ia_blksize) {
+ dst[count].ia_blksize = src[count].ia_blksize;
+ }
+
+ ec_iatt_time_merge(&dst[count].ia_atime, &dst[count].ia_atime_nsec,
+ src[count].ia_atime, src[count].ia_atime_nsec);
+ ec_iatt_time_merge(&dst[count].ia_mtime, &dst[count].ia_mtime_nsec,
+ src[count].ia_mtime, src[count].ia_mtime_nsec);
+ ec_iatt_time_merge(&dst[count].ia_ctime, &dst[count].ia_ctime_nsec,
+ src[count].ia_ctime, src[count].ia_ctime_nsec);
+ }
+
+ return 1;
+}
+
+void
+ec_iatt_rebuild(ec_t *ec, struct iatt *iatt, int32_t count, int32_t answers)
+{
+ uint64_t blocks;
+
+ while (count-- > 0) {
+ blocks = iatt[count].ia_blocks * ec->fragments + answers - 1;
+ blocks /= answers;
+ iatt[count].ia_blocks = blocks;
+ }
+}
+
+gf_boolean_t
+ec_xattr_match(dict_t *dict, char *key, data_t *value, void *arg)
+{
+ if ((fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0) ||
+ (strcmp(key, GET_LINK_COUNT) == 0) ||
+ (strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) ||
+ (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0) ||
+ (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0)) {
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+gf_boolean_t
+ec_value_ignore(char *key)
+{
+ if ((strcmp(key, GF_CONTENT_KEY) == 0) ||
+ (strcmp(key, GF_XATTR_PATHINFO_KEY) == 0) ||
+ (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0) ||
+ (strcmp(key, GF_XATTR_LOCKINFO_KEY) == 0) ||
+ (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) ||
+ (strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) ||
+ (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0) ||
+ (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) ||
+ (strcmp(key, DHT_IATT_IN_XDATA_KEY) == 0) ||
+ (strncmp(key, EC_QUOTA_PREFIX, SLEN(EC_QUOTA_PREFIX)) == 0) ||
+ (fnmatch(MARKER_XATTR_PREFIX ".*." XTIME, key, 0) == 0) ||
+ (fnmatch(GF_XATTR_MARKER_KEY ".*", key, 0) == 0) ||
+ (XATTR_IS_NODE_UUID(key))) {
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+int32_t
+ec_dict_compare(dict_t *dict1, dict_t *dict2)
+{
+ if (are_dicts_equal(dict1, dict2, ec_xattr_match, ec_value_ignore))
+ return 1;
+ return 0;
+}
+
+static uint32_t
+ec_dict_list(data_t **list, ec_cbk_data_t *cbk, int32_t which, char *key,
+ gf_boolean_t global)
+{
+ ec_t *ec = cbk->fop->xl->private;
+ ec_cbk_data_t *ans = NULL;
+ dict_t *dict = NULL;
+ data_t *data;
+ uint32_t count;
+ int32_t i;
+
+ for (i = 0; i < ec->nodes; i++) {
+ /* We initialize the list with EC_MISSING_DATA if we are
+ * returning a global list or the current subvolume belongs
+ * to the group of the accepted answer. Note that if some
+ * subvolume is known to be down before issuing the request,
+ * we won't have any answer from it, so we set here the
+ * appropriate default value. */
+ if (global || ((cbk->mask & (1ULL << i)) != 0)) {
+ list[i] = EC_MISSING_DATA;
+ } else {
+ list[i] = NULL;
+ }
+ }
+
+ count = 0;
+ list_for_each_entry(ans, &cbk->fop->answer_list, answer_list)
+ {
+ if (global || ((cbk->mask & ans->mask) != 0)) {
+ dict = (which == EC_COMBINE_XDATA) ? ans->xdata : ans->dict;
+ data = dict_get(dict, key);
+ if (data != NULL) {
+ list[ans->idx] = data;
+ count++;
+ }
+ }
+ }
+
+ return count;
+}
+
+int32_t
+ec_concat_prepare(xlator_t *xl, char **str, char **sep, char **post,
+ const char *fmt, va_list args)
+{
+ char *tmp;
+ int32_t len;
+
+ len = gf_vasprintf(str, fmt, args);
+ if (len < 0) {
+ return -ENOMEM;
+ }
+
+ tmp = strchr(*str, '{');
+ if (tmp == NULL) {
+ goto out;
+ }
+ *tmp++ = 0;
+ *sep = tmp;
+ tmp = strchr(tmp, '}');
+ if (tmp == NULL) {
+ goto out;
+ }
+ *tmp++ = 0;
+ *post = tmp;
+
+ return 0;
+
+out:
+ gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_FORMAT,
+ "Invalid concat format");
+
+ GF_FREE(*str);
+
+ return -EINVAL;
+}
+
+static int32_t
+ec_dict_data_concat(ec_cbk_data_t *cbk, int32_t which, char *key, char *new_key,
+ const char *def, gf_boolean_t global, const char *fmt, ...)
+{
+ ec_t *ec = cbk->fop->xl->private;
+ data_t *data[ec->nodes];
+ char *str = NULL, *pre = NULL, *sep, *post;
+ dict_t *dict;
+ va_list args;
+ int32_t i, num, len, deflen, prelen, postlen, seplen, tmp;
+ int32_t err;
+
+ ec_dict_list(data, cbk, which, key, global);
+
+ va_start(args, fmt);
+ err = ec_concat_prepare(cbk->fop->xl, &pre, &sep, &post, fmt, args);
+ va_end(args);
+
+ if (err != 0) {
+ return err;
+ }
+
+ prelen = strlen(pre);
+ seplen = strlen(sep);
+ postlen = strlen(post);
+
+ deflen = 0;
+ if (def != NULL) {
+ deflen = strlen(def);
+ }
+
+ len = prelen + postlen + 1;
+ num = -1;
+ for (i = 0; i < ec->nodes; i++) {
+ if (data[i] == NULL) {
+ continue;
+ }
+ if (data[i] == EC_MISSING_DATA) {
+ if (def == NULL) {
+ continue;
+ }
+ len += deflen;
+ } else {
+ len += data[i]->len - 1;
+ }
+ if (num >= 0) {
+ len += seplen;
+ }
+ num++;
+ }
+
+ err = -ENOMEM;
+
+ str = GF_MALLOC(len, gf_common_mt_char);
+ if (str == NULL) {
+ goto out;
+ }
+
+ memcpy(str, pre, prelen);
+ len = prelen;
+ for (i = 0; i < ec->nodes; i++) {
+ if (data[i] == NULL) {
+ continue;
+ }
+ if (data[i] == EC_MISSING_DATA) {
+ if (deflen == 0) {
+ continue;
+ }
+ tmp = deflen;
+ memcpy(str + len, def, tmp);
+ } else {
+ tmp = data[i]->len - 1;
+ memcpy(str + len, data[i]->data, tmp);
+ }
+ len += tmp;
+ if (i < num) {
+ memcpy(str + len, sep, seplen);
+ len += seplen;
+ }
+ }
+ memcpy(str + len, post, postlen + 1);
+
+ dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+ if (new_key) {
+ key = new_key;
+ }
+ err = dict_set_dynstr(dict, key, str);
+ if (err != 0) {
+ goto out;
+ }
+
+ str = NULL;
+
+out:
+ GF_FREE(str);
+ GF_FREE(pre);
+
+ return err;
+}
+
+int32_t
+ec_dict_data_merge(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+ ec_t *ec = cbk->fop->xl->private;
+ data_t *data[ec->nodes];
+ dict_t *dict, *lockinfo, *tmp = NULL;
+ char *ptr = NULL;
+ int32_t i, len;
+ int32_t err;
+
+ ec_dict_list(data, cbk, which, key, _gf_false);
+
+ lockinfo = dict_new();
+ if (lockinfo == NULL) {
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
+ continue;
+ }
+
+ tmp = dict_new();
+ if (tmp == NULL) {
+ err = -ENOMEM;
+
+ goto out;
+ }
+ err = dict_unserialize(data[i]->data, data[i]->len, &tmp);
+ if (err != 0) {
+ goto out;
+ }
+ if (dict_copy(tmp, lockinfo) == NULL) {
+ err = -ENOMEM;
+
+ goto out;
+ }
+
+ dict_unref(tmp);
+ }
+
+ tmp = NULL;
+
+ err = dict_allocate_and_serialize(lockinfo, (char **)&ptr,
+ (unsigned int *)&len);
+ if (err != 0) {
+ goto out;
+ }
+
+ dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+ err = dict_set_dynptr(dict, key, ptr, len);
+ if (err != 0) {
+ goto out;
+ }
+
+ ptr = NULL;
+
+out:
+ GF_FREE(ptr);
+ dict_unref(lockinfo);
+ if (tmp != NULL) {
+ dict_unref(tmp);
+ }
+
+ return err;
+}
+
+int32_t
+ec_dict_data_uuid(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+ ec_cbk_data_t *ans, *min;
+ dict_t *src, *dst;
+ data_t *data;
+
+ min = cbk;
+ for (ans = cbk->next; ans != NULL; ans = ans->next) {
+ if (ans->idx < min->idx) {
+ min = ans;
+ }
+ }
+
+ if (min != cbk) {
+ src = (which == EC_COMBINE_XDATA) ? min->xdata : min->dict;
+ dst = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+
+ data = dict_get(src, key);
+ if (data == NULL) {
+ return -ENOENT;
+ }
+ if (dict_set(dst, key, data) != 0) {
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+int32_t
+ec_dict_data_iatt(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+ ec_t *ec = cbk->fop->xl->private;
+ data_t *data[ec->nodes];
+ dict_t *dict;
+ struct iatt *stbuf, *tmp;
+ int32_t i, ret;
+
+ ec_dict_list(data, cbk, which, key, _gf_false);
+
+ stbuf = NULL;
+ for (i = 0; i < ec->nodes; i++) {
+ if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
+ continue;
+ }
+ tmp = data_to_iatt(data[i], key);
+ if (tmp == NULL) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (stbuf == NULL) {
+ stbuf = GF_MALLOC(sizeof(struct iatt), gf_common_mt_char);
+ if (stbuf == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ *stbuf = *tmp;
+ } else {
+ if (!ec_iatt_combine(cbk->fop, stbuf, tmp, 1)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ }
+
+ if ((stbuf != NULL) && (stbuf->ia_type == IA_IFREG)) {
+ ec_iatt_rebuild(ec, stbuf, 1, cbk->count);
+ /* TODO: not sure if an iatt could come in xdata from a fop that takes
+ * no locks. */
+ if (!ec_get_inode_size(cbk->fop, cbk->fop->locks[0].lock->loc.inode,
+ &stbuf->ia_size)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+ ret = dict_set_iatt(dict, key, stbuf, false);
+ if (ret >= 0) {
+ stbuf = NULL;
+ }
+
+out:
+ GF_FREE(stbuf);
+
+ return ret;
+}
+
+int32_t
+ec_dict_data_max32(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+ ec_t *ec = cbk->fop->xl->private;
+ data_t *data[ec->nodes];
+ dict_t *dict;
+ int32_t i;
+ uint32_t max, tmp;
+
+ ec_dict_list(data, cbk, which, key, _gf_false);
+
+ max = 0;
+ for (i = 0; i < ec->nodes; i++) {
+ if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
+ continue;
+ }
+
+ tmp = data_to_uint32(data[i]);
+ if (max < tmp) {
+ max = tmp;
+ }
+ }
+
+ dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+ return dict_set_uint32(dict, key, max);
+}
+
+int32_t
+ec_dict_data_max64(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+ ec_t *ec = cbk->fop->xl->private;
+ data_t *data[ec->nodes];
+ dict_t *dict;
+ int32_t i;
+ uint64_t max, tmp;
+
+ ec_dict_list(data, cbk, which, key, _gf_false);
+
+ max = 0;
+ for (i = 0; i < ec->nodes; i++) {
+ if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
+ continue;
+ }
+
+ tmp = data_to_uint64(data[i]);
+ if (max < tmp) {
+ max = tmp;
+ }
+ }
+
+ dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+ return dict_set_uint64(dict, key, max);
+}
+
+int32_t
+ec_dict_data_quota(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+ ec_t *ec = cbk->fop->xl->private;
+ data_t *data[ec->nodes];
+ dict_t *dict = NULL;
+ int32_t i = 0;
+ quota_meta_t size = {
+ 0,
+ };
+ quota_meta_t max_size = {
+ 0,
+ };
+
+ if (ec_dict_list(data, cbk, which, key, _gf_false) == 0) {
+ return 0;
+ }
+
+ /* Quota size xattr is managed outside of the control of the ec xlator.
+ * This means that it might not be updated at the same time on all
+ * bricks and we can receive slightly different values. If that's the
+ * case, we take the maximum of all received values.
+ */
+ for (i = 0; i < ec->nodes; i++) {
+ if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA) ||
+ (quota_data_to_meta(data[i], &size) < 0)) {
+ continue;
+ }
+
+ if (size.size > max_size.size)
+ max_size.size = size.size;
+ if (size.file_count > max_size.file_count)
+ max_size.file_count = size.file_count;
+ if (size.dir_count > max_size.dir_count)
+ max_size.dir_count = size.dir_count;
+ }
+
+ max_size.size *= ec->fragments;
+
+ dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+ return quota_dict_set_meta(dict, key, &max_size, IA_IFDIR);
+}
+
+int32_t
+ec_dict_data_stime(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+ ec_t *ec = cbk->fop->xl->private;
+ data_t *data[ec->nodes];
+ dict_t *dict;
+ int32_t i, err;
+
+ ec_dict_list(data, cbk, which, key, _gf_false);
+
+ dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+ for (i = 0; i < ec->nodes; i++) {
+ if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
+ continue;
+ }
+ err = gf_get_max_stime(cbk->fop->xl, dict, key, data[i]);
+ if (err != 0) {
+ gf_msg(cbk->fop->xl->name, GF_LOG_ERROR, -err,
+ EC_MSG_STIME_COMBINE_FAIL, "STIME combination failed");
+
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+int32_t
+ec_dict_data_combine(dict_t *dict, char *key, data_t *value, void *arg)
+{
+ ec_dict_combine_t *data = arg;
+
+ if ((strcmp(key, GF_XATTR_PATHINFO_KEY) == 0) ||
+ (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0)) {
+ return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL,
+ _gf_false, _gf_false, "(<EC:%s> { })",
+ data->cbk->fop->xl->name);
+ }
+
+ if (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) {
+ return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL,
+ _gf_false, "{\n}");
+ }
+
+ if (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) == 0) {
+ return ec_dict_data_merge(data->cbk, data->which, key);
+ }
+
+ if (strcmp(key, GET_LINK_COUNT) == 0) {
+ return ec_dict_data_max32(data->cbk, data->which, key);
+ }
+
+ if (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) {
+ return ec_dict_data_max32(data->cbk, data->which, key);
+ }
+ if ((strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) ||
+ (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0)) {
+ return ec_dict_data_max32(data->cbk, data->which, key);
+ }
+
+ if (strcmp(key, QUOTA_SIZE_KEY) == 0) {
+ return ec_dict_data_quota(data->cbk, data->which, key);
+ }
+ /* Ignore all other quota attributes */
+ if (strncmp(key, EC_QUOTA_PREFIX, SLEN(EC_QUOTA_PREFIX)) == 0) {
+ return 0;
+ }
+
+ if (XATTR_IS_NODE_UUID(key)) {
+ if (data->cbk->fop->int32) {
+ /* List of node uuid is requested */
+ return ec_dict_data_concat(data->cbk, data->which, key,
+ GF_XATTR_LIST_NODE_UUIDS_KEY, UUID0_STR,
+ _gf_true, "{ }");
+ } else {
+ return ec_dict_data_uuid(data->cbk, data->which, key);
+ }
+ }
+
+ if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+ return ec_dict_data_stime(data->cbk, data->which, key);
+ }
+
+ if (fnmatch(MARKER_XATTR_PREFIX ".*." XTIME, key, FNM_NOESCAPE) == 0) {
+ return ec_dict_data_max64(data->cbk, data->which, key);
+ }
+
+ if (strcmp(key, GF_PRESTAT) == 0 || strcmp(key, GF_POSTSTAT) == 0) {
+ return ec_dict_data_iatt(data->cbk, data->which, key);
+ }
+
+ return 0;
+}
+
+int32_t
+ec_dict_combine(ec_cbk_data_t *cbk, int32_t which)
+{
+ dict_t *dict = NULL;
+ ec_dict_combine_t data;
+ int32_t err = 0;
+
+ data.cbk = cbk;
+ data.which = which;
+
+ dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+ if (dict != NULL) {
+ err = dict_foreach(dict, ec_dict_data_combine, &data);
+ if (err != 0) {
+ gf_msg(cbk->fop->xl->name, GF_LOG_ERROR, -err,
+ EC_MSG_DICT_COMBINE_FAIL, "Dictionary combination failed");
+
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+int32_t
+ec_vector_compare(struct iovec *dst_vector, int32_t dst_count,
+ struct iovec *src_vector, int32_t src_count)
+{
+ int32_t dst_size = 0, src_size = 0;
+
+ if (dst_count > 0) {
+ dst_size = iov_length(dst_vector, dst_count);
+ }
+ if (src_count > 0) {
+ src_size = iov_length(src_vector, src_count);
+ }
+
+ return (dst_size == src_size);
+}
+
+int32_t
+ec_flock_compare(struct gf_flock *dst, struct gf_flock *src)
+{
+ if ((dst->l_type != src->l_type) || (dst->l_whence != src->l_whence) ||
+ (dst->l_start != src->l_start) || (dst->l_len != src->l_len) ||
+ (dst->l_pid != src->l_pid) ||
+ !is_same_lkowner(&dst->l_owner, &src->l_owner)) {
+ return 0;
+ }
+
+ return 1;
+}
+
+void
+ec_statvfs_combine(struct statvfs *dst, struct statvfs *src)
+{
+ if (dst->f_bsize < src->f_bsize) {
+ dst->f_bsize = src->f_bsize;
+ }
+
+ if (dst->f_frsize < src->f_frsize) {
+ dst->f_blocks *= dst->f_frsize;
+ dst->f_blocks /= src->f_frsize;
+
+ dst->f_bfree *= dst->f_frsize;
+ dst->f_bfree /= src->f_frsize;
+
+ dst->f_bavail *= dst->f_frsize;
+ dst->f_bavail /= src->f_frsize;
+
+ dst->f_frsize = src->f_frsize;
+ } else if (dst->f_frsize > src->f_frsize) {
+ src->f_blocks *= src->f_frsize;
+ src->f_blocks /= dst->f_frsize;
+
+ src->f_bfree *= src->f_frsize;
+ src->f_bfree /= dst->f_frsize;
+
+ src->f_bavail *= src->f_frsize;
+ src->f_bavail /= dst->f_frsize;
+ }
+ if (dst->f_blocks > src->f_blocks) {
+ dst->f_blocks = src->f_blocks;
+ }
+ if (dst->f_bfree > src->f_bfree) {
+ dst->f_bfree = src->f_bfree;
+ }
+ if (dst->f_bavail > src->f_bavail) {
+ dst->f_bavail = src->f_bavail;
+ }
+
+ if (dst->f_files < src->f_files) {
+ dst->f_files = src->f_files;
+ }
+ if (dst->f_ffree > src->f_ffree) {
+ dst->f_ffree = src->f_ffree;
+ }
+ if (dst->f_favail > src->f_favail) {
+ dst->f_favail = src->f_favail;
+ }
+ if (dst->f_namemax > src->f_namemax) {
+ dst->f_namemax = src->f_namemax;
+ }
+
+ if (dst->f_flag != src->f_flag) {
+ gf_msg_debug(THIS->name, 0,
+ "Mismatching file system flags "
+ "(%lX, %lX)",
+ dst->f_flag, src->f_flag);
+ }
+ dst->f_flag &= src->f_flag;
+}
+
+int32_t
+ec_combine_check(ec_cbk_data_t *dst, ec_cbk_data_t *src, ec_combine_f combine)
+{
+ ec_fop_data_t *fop = dst->fop;
+
+ if (dst->op_ret != src->op_ret) {
+ gf_msg_debug(fop->xl->name, 0,
+ "Mismatching return code in "
+ "answers of '%s': %d <-> %d",
+ ec_fop_name(fop->id), dst->op_ret, src->op_ret);
+
+ return 0;
+ }
+ if (dst->op_ret < 0) {
+ if (dst->op_errno != src->op_errno) {
+ gf_msg_debug(fop->xl->name, 0,
+ "Mismatching errno code in "
+ "answers of '%s': %d <-> %d",
+ ec_fop_name(fop->id), dst->op_errno, src->op_errno);
+
+ return 0;
+ }
+ }
+
+ if (!ec_dict_compare(dst->xdata, src->xdata)) {
+ gf_msg(fop->xl->name, GF_LOG_DEBUG, 0, EC_MSG_XDATA_MISMATCH,
+ "Mismatching xdata in answers "
+ "of '%s'",
+ ec_fop_name(fop->id));
+
+ return 0;
+ }
+
+ if ((dst->op_ret >= 0) && (combine != NULL)) {
+ return combine(fop, dst, src);
+ }
+
+ return 1;
+}
+
+void
+ec_combine(ec_cbk_data_t *newcbk, ec_combine_f combine)
+{
+ ec_fop_data_t *fop = newcbk->fop;
+ ec_cbk_data_t *cbk = NULL, *tmp = NULL;
+ struct list_head *item = NULL;
+ int32_t needed = 0;
+ char str[32];
+
+ LOCK(&fop->lock);
+
+ fop->received |= newcbk->mask;
+
+ item = fop->cbk_list.prev;
+ list_for_each_entry(cbk, &fop->cbk_list, list)
+ {
+ if (ec_combine_check(newcbk, cbk, combine)) {
+ newcbk->count += cbk->count;
+ newcbk->mask |= cbk->mask;
+
+ item = cbk->list.prev;
+ while (item != &fop->cbk_list) {
+ tmp = list_entry(item, ec_cbk_data_t, list);
+ if (tmp->count >= newcbk->count) {
+ break;
+ }
+ item = item->prev;
+ }
+ list_del(&cbk->list);
+
+ newcbk->next = cbk;
+
+ break;
+ }
+ }
+ list_add(&newcbk->list, item);
+
+ ec_trace("ANSWER", fop, "combine=%s[%d]",
+ ec_bin(str, sizeof(str), newcbk->mask, 0), newcbk->count);
+
+ cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list);
+ if ((fop->mask ^ fop->remaining) == fop->received) {
+ needed = fop->minimum - cbk->count;
+ }
+
+ UNLOCK(&fop->lock);
+
+ if (needed > 0) {
+ ec_dispatch_next(fop, newcbk->idx);
+ }
+}
diff --git a/xlators/cluster/ec/src/ec-combine.h b/xlators/cluster/ec/src/ec-combine.h
new file mode 100644
index 00000000000..1010cc3be26
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-combine.h
@@ -0,0 +1,44 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_COMBINE_H__
+#define __EC_COMBINE_H__
+
+#define EC_COMBINE_DICT 0
+#define EC_COMBINE_XDATA 1
+
+typedef int32_t (*ec_combine_f)(ec_fop_data_t *fop, ec_cbk_data_t *dst,
+ ec_cbk_data_t *src);
+
+void
+ec_iatt_rebuild(ec_t *ec, struct iatt *iatt, int32_t count, int32_t answers);
+
+int32_t
+ec_iatt_combine(ec_fop_data_t *fop, struct iatt *dst, struct iatt *src,
+ int32_t count);
+int32_t
+ec_dict_compare(dict_t *dict1, dict_t *dict2);
+int32_t
+ec_vector_compare(struct iovec *dst_vector, int32_t dst_count,
+ struct iovec *src_vector, int32_t src_count);
+int32_t
+ec_flock_compare(struct gf_flock *dst, struct gf_flock *src);
+void
+ec_statvfs_combine(struct statvfs *dst, struct statvfs *src);
+
+int32_t
+ec_dict_combine(ec_cbk_data_t *cbk, int32_t which);
+
+void
+ec_combine(ec_cbk_data_t *cbk, ec_combine_f combine);
+
+int32_t
+ec_combine_write(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src);
+#endif /* __EC_COMBINE_H__ */
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
new file mode 100644
index 00000000000..b955efd8c2d
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -0,0 +1,3042 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/byte-order.h>
+#include <glusterfs/hashfn.h>
+
+#include "ec-mem-types.h"
+#include "ec-types.h"
+#include "ec-helpers.h"
+#include "ec-combine.h"
+#include "ec-common.h"
+#include "ec-fops.h"
+#include "ec-method.h"
+#include "ec.h"
+#include "ec-messages.h"
+
+#define EC_INVALID_INDEX UINT32_MAX
+
+void
+ec_update_fd_status(fd_t *fd, xlator_t *xl, int idx, int32_t ret_status)
+{
+ ec_fd_t *fd_ctx;
+
+ if (fd == NULL)
+ return;
+
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __ec_fd_get(fd, xl);
+ if (fd_ctx) {
+ if (ret_status >= 0)
+ fd_ctx->fd_status[idx] = EC_FD_OPENED;
+ else
+ fd_ctx->fd_status[idx] = EC_FD_NOT_OPENED;
+ }
+ }
+ UNLOCK(&fd->lock);
+}
+
+static uintptr_t
+ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t mask)
+{
+ int i = 0;
+ int count = 0;
+ ec_t *ec = NULL;
+ ec_fd_t *fd_ctx = NULL;
+ uintptr_t need_open = 0;
+
+ ec = this->private;
+
+ fd_ctx = ec_fd_get(fd, this);
+ if (!fd_ctx)
+ return count;
+
+ LOCK(&fd->lock);
+ {
+ for (i = 0; i < ec->nodes; i++) {
+ if ((fd_ctx->fd_status[i] == EC_FD_NOT_OPENED) &&
+ ((ec->xl_up & (1 << i)) != 0) && ((mask & (1 << i)) != 0)) {
+ fd_ctx->fd_status[i] = EC_FD_OPENING;
+ need_open |= (1 << i);
+ count++;
+ }
+ }
+ }
+ UNLOCK(&fd->lock);
+
+ /* If fd needs to open on minimum number of nodes
+ * then ignore fixing the fd as it has been
+ * requested from heal operation.
+ */
+ if (count >= ec->fragments) {
+ need_open = 0;
+ }
+
+ return need_open;
+}
+
+static gf_boolean_t
+ec_is_fd_fixable(fd_t *fd)
+{
+ if (!fd || !fd->inode)
+ return _gf_false;
+ else if (fd_is_anonymous(fd))
+ return _gf_false;
+ else if (gf_uuid_is_null(fd->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
+}
+
+static void
+ec_fix_open(ec_fop_data_t *fop, uintptr_t mask)
+{
+ uintptr_t need_open = 0;
+ int ret = 0;
+ int32_t flags = 0;
+ loc_t loc = {
+ 0,
+ };
+
+ if (!ec_is_fd_fixable(fop->fd))
+ goto out;
+
+ /* Evaluate how many remote fd's to be opened */
+ need_open = ec_fd_ctx_need_open(fop->fd, fop->xl, mask);
+ if (need_open == 0) {
+ goto out;
+ }
+
+ loc.inode = inode_ref(fop->fd->inode);
+ gf_uuid_copy(loc.gfid, fop->fd->inode->gfid);
+ ret = loc_path(&loc, NULL);
+ if (ret < 0) {
+ goto out;
+ }
+
+ flags = fop->fd->flags & (~(O_TRUNC | O_APPEND | O_CREAT | O_EXCL));
+ if (IA_IFDIR == fop->fd->inode->ia_type) {
+ ec_opendir(fop->frame, fop->xl, need_open,
+ EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL,
+ &fop->loc[0], fop->fd, NULL);
+ } else {
+ ec_open(fop->frame, fop->xl, need_open,
+ EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL, &loc,
+ flags, fop->fd, NULL);
+ }
+
+out:
+ loc_wipe(&loc);
+}
+
+static off_t
+ec_range_end_get(off_t fl_start, uint64_t fl_size)
+{
+ if (fl_size > 0) {
+ if (fl_size >= EC_RANGE_FULL) {
+ /* Infinity */
+ fl_start = LLONG_MAX;
+ } else {
+ fl_start += fl_size - 1;
+ if (fl_start < 0) {
+ /* Overflow */
+ fl_start = LLONG_MAX;
+ }
+ }
+ }
+
+ return fl_start;
+}
+
+static gf_boolean_t
+ec_is_range_conflict(ec_lock_link_t *l1, ec_lock_link_t *l2)
+{
+ return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start));
+}
+
+static gf_boolean_t
+ec_lock_conflict(ec_lock_link_t *l1, ec_lock_link_t *l2)
+{
+ ec_t *ec = l1->fop->xl->private;
+
+ /* Fops like access/stat won't have to worry what the other fops are
+ * modifying as the fop is wound only to one brick. So it can be
+ * executed in parallel*/
+ if (l1->fop->minimum == EC_MINIMUM_ONE ||
+ l2->fop->minimum == EC_MINIMUM_ONE)
+ return _gf_false;
+
+ if ((l1->fop->flags & EC_FLAG_LOCK_SHARED) &&
+ (l2->fop->flags & EC_FLAG_LOCK_SHARED))
+ return _gf_false;
+
+ if (!ec->parallel_writes) {
+ return _gf_true;
+ }
+
+ return ec_is_range_conflict(l1, l2);
+}
+
+uint32_t
+ec_select_first_by_read_policy(ec_t *ec, ec_fop_data_t *fop)
+{
+ if (ec->read_policy == EC_ROUND_ROBIN) {
+ return ec->idx;
+ } else if (ec->read_policy == EC_GFID_HASH) {
+ if (fop->use_fd) {
+ return SuperFastHash((char *)fop->fd->inode->gfid,
+ sizeof(fop->fd->inode->gfid)) %
+ ec->nodes;
+ } else {
+ if (gf_uuid_is_null(fop->loc[0].gfid))
+ loc_gfid(&fop->loc[0], fop->loc[0].gfid);
+ return SuperFastHash((char *)fop->loc[0].gfid,
+ sizeof(fop->loc[0].gfid)) %
+ ec->nodes;
+ }
+ }
+ return 0;
+}
+
+static gf_boolean_t
+ec_child_valid(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
+{
+ return (idx < ec->nodes) && (((fop->remaining >> idx) & 1) == 1);
+}
+
+static uint32_t
+ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
+{
+ while (!ec_child_valid(ec, fop, idx)) {
+ if (++idx >= ec->nodes) {
+ idx = 0;
+ }
+ if (idx == fop->first) {
+ return EC_INVALID_INDEX;
+ }
+ }
+
+ return idx;
+}
+
+int32_t
+ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
+ uintptr_t bad, uint32_t pending, dict_t *xdata)
+{
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
+ "Heal failed");
+ } else {
+ if ((mask & ~good) != 0) {
+ gf_msg(this->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_SUCCESS,
+ "Heal succeeded on %d/%d "
+ "subvolumes",
+ gf_bits_count(mask & ~(good | bad)),
+ gf_bits_count(mask & ~good));
+ }
+ }
+
+ return 0;
+}
+
+static uintptr_t
+ec_fop_needs_name_heal(ec_fop_data_t *fop)
+{
+ ec_t *ec = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ ec_cbk_data_t *enoent_cbk = NULL;
+
+ ec = fop->xl->private;
+ if (fop->id != GF_FOP_LOOKUP)
+ return 0;
+
+ if (!fop->loc[0].name || strlen(fop->loc[0].name) == 0)
+ return 0;
+
+ list_for_each_entry(cbk, &fop->cbk_list, list)
+ {
+ if (cbk->op_ret < 0 && cbk->op_errno == ENOENT) {
+ enoent_cbk = cbk;
+ break;
+ }
+ }
+
+ if (!enoent_cbk)
+ return 0;
+
+ return ec->xl_up & ~enoent_cbk->mask;
+}
+
+int32_t
+ec_fop_needs_heal(ec_fop_data_t *fop)
+{
+ ec_t *ec = fop->xl->private;
+
+ if (fop->lock_count == 0) {
+ /*
+ * if fop->lock_count is zero that means it saw version mismatch
+ * without any locks so it can't be trusted. If we launch a heal
+ * based on this it will lead to INODELKs which will affect I/O
+ * performance. Considering self-heal-daemon and operations on
+ * the inode from client which take locks can still trigger the
+ * heal we can choose to not attempt a heal when fop->lock_count
+ * is zero.
+ */
+ return 0;
+ }
+ return (ec->xl_up & ~(fop->remaining | fop->good)) != 0;
+}
+
+void
+ec_check_status(ec_fop_data_t *fop)
+{
+ ec_t *ec = fop->xl->private;
+ int32_t partial = 0;
+ char str1[32], str2[32], str3[32], str4[32], str5[32];
+
+ if (!ec_fop_needs_name_heal(fop) && !ec_fop_needs_heal(fop)) {
+ return;
+ }
+
+ if (fop->answer && fop->answer->op_ret >= 0) {
+ if ((fop->id == GF_FOP_LOOKUP) || (fop->id == GF_FOP_STAT) ||
+ (fop->id == GF_FOP_FSTAT)) {
+ partial = fop->answer->iatt[0].ia_type == IA_IFDIR;
+ } else if (fop->id == GF_FOP_OPENDIR) {
+ partial = 1;
+ }
+ }
+
+ gf_msg(
+ fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
+ "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
+ "remaining=%s, good=%s, bad=%s,"
+ "(Least significant bit represents first client/brick of subvol), %s)",
+ gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
+ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+ ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+ ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
+ ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
+ ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
+ ec->nodes),
+ ec_msg_str(fop));
+ if (fop->use_fd) {
+ if (fop->fd != NULL) {
+ ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
+ fop->fd, partial, NULL);
+ }
+ } else {
+ ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
+ &fop->loc[0], partial, NULL);
+
+ if (fop->loc[1].inode != NULL) {
+ ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
+ &fop->loc[1], partial, NULL);
+ }
+ }
+}
+
+void
+ec_update_good(ec_fop_data_t *fop, uintptr_t good)
+{
+ fop->good = good;
+
+ /* Fops that are executed only on one brick do not have enough information
+ * to decide if healing is needed or not. */
+ if ((fop->expected != 1) && (fop->parent == NULL)) {
+ ec_check_status(fop);
+ }
+}
+
+void
+ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop)
+{
+ /* Fops that are executed only on one brick do not have enough information
+ * to update the global mask of good bricks. */
+ if (fop->expected == 1) {
+ return;
+ }
+
+ /* When updating the good mask of the lock, we only take into consideration
+ * those bits corresponding to the bricks where the fop has been executed.
+ * Bad bricks are removed from good_mask, but once marked as bad it's never
+ * set to good until the lock is released and reacquired */
+
+ lock->good_mask &= fop->good | fop->remaining;
+}
+
+void
+__ec_fop_set_error(ec_fop_data_t *fop, int32_t error)
+{
+ if ((error != 0) && (fop->error == 0)) {
+ fop->error = error;
+ }
+}
+
+void
+ec_fop_set_error(ec_fop_data_t *fop, int32_t error)
+{
+ LOCK(&fop->lock);
+
+ __ec_fop_set_error(fop, error);
+
+ UNLOCK(&fop->lock);
+}
+
+gf_boolean_t
+ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro)
+{
+ if ((error != 0) && (cbk->op_ret >= 0)) {
+ /* If cbk->op_errno was 0, it means that the fop succeeded and this
+ * error has happened while processing the answer. If the operation was
+ * read-only, there's no problem (i.e. we simply return the generated
+ * error code). However if it caused a modification, we must return EIO
+ * to indicate that the operation has been partially executed. */
+ cbk->op_errno = ro ? error : EIO;
+ cbk->op_ret = -1;
+
+ ec_fop_set_error(cbk->fop, cbk->op_errno);
+ }
+
+ return (cbk->op_ret < 0);
+}
+
+ec_cbk_data_t *
+ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro)
+{
+ ec_cbk_data_t *cbk;
+ int32_t err;
+
+ cbk = fop->answer;
+ if (cbk == NULL) {
+ ec_fop_set_error(fop, EIO);
+
+ return NULL;
+ }
+
+ if (cbk->op_ret < 0) {
+ ec_fop_set_error(fop, cbk->op_errno);
+ }
+
+ err = ec_dict_combine(cbk, EC_COMBINE_XDATA);
+ if (ec_cbk_set_error(cbk, -err, ro)) {
+ return NULL;
+ }
+
+ return cbk;
+}
+
+void
+ec_sleep(ec_fop_data_t *fop)
+{
+ LOCK(&fop->lock);
+
+ GF_ASSERT(fop->refs > 0);
+ fop->refs++;
+ fop->jobs++;
+
+ UNLOCK(&fop->lock);
+}
+
+int32_t
+ec_check_complete(ec_fop_data_t *fop, ec_resume_f resume)
+{
+ int32_t error = -1;
+
+ LOCK(&fop->lock);
+
+ GF_ASSERT(fop->resume == NULL);
+
+ if (--fop->jobs != 0) {
+ ec_trace("WAIT", fop, "resume=%p", resume);
+
+ fop->resume = resume;
+ } else {
+ error = fop->error;
+ fop->error = 0;
+ }
+
+ UNLOCK(&fop->lock);
+
+ return error;
+}
+
+void
+ec_resume(ec_fop_data_t *fop, int32_t error)
+{
+ ec_resume_f resume = NULL;
+
+ LOCK(&fop->lock);
+
+ __ec_fop_set_error(fop, error);
+
+ if (--fop->jobs == 0) {
+ resume = fop->resume;
+ fop->resume = NULL;
+ if (resume != NULL) {
+ ec_trace("RESUME", fop, "error=%d", error);
+
+ if (fop->error != 0) {
+ error = fop->error;
+ }
+ fop->error = 0;
+ }
+ }
+
+ UNLOCK(&fop->lock);
+
+ if (resume != NULL) {
+ resume(fop, error);
+ }
+
+ ec_fop_data_release(fop);
+}
+
+void
+ec_resume_parent(ec_fop_data_t *fop)
+{
+ ec_fop_data_t *parent;
+ int32_t error = 0;
+
+ parent = fop->parent;
+ if (parent != NULL) {
+ if ((fop->fop_flags & EC_FOP_NO_PROPAGATE_ERROR) == 0) {
+ error = fop->error;
+ }
+ ec_trace("RESUME_PARENT", fop, "error=%u", error);
+ fop->parent = NULL;
+ ec_resume(parent, error);
+ }
+}
+
+gf_boolean_t
+ec_is_recoverable_error(int32_t op_errno)
+{
+ switch (op_errno) {
+ case ENOTCONN:
+ case ESTALE:
+ case ENOENT:
+ case EBADFD: /*Opened fd but brick is disconnected*/
+ case EIO: /*Backend-fs crash like XFS/ext4 etc*/
+ return _gf_true;
+ }
+ return _gf_false;
+}
+
+void
+ec_complete(ec_fop_data_t *fop)
+{
+ ec_cbk_data_t *cbk = NULL;
+ int32_t resume = 0, update = 0;
+ int healing_count = 0;
+
+ LOCK(&fop->lock);
+
+ ec_trace("COMPLETE", fop, "");
+
+ if (--fop->winds == 0) {
+ if (fop->answer == NULL) {
+ if (!list_empty(&fop->cbk_list)) {
+ cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list);
+ healing_count = gf_bits_count(cbk->mask & fop->healing);
+ /* fop shouldn't be treated as success if it is not
+ * successful on at least fop->minimum good copies*/
+ if ((cbk->count - healing_count) >= fop->minimum) {
+ fop->answer = cbk;
+
+ update = 1;
+ }
+ }
+
+ resume = 1;
+ }
+ }
+
+ UNLOCK(&fop->lock);
+
+ /* ec_update_good() locks inode->lock. This may cause deadlocks with
+ fop->lock when used in another order. Since ec_update_good() will not
+ be called more than once for each fop, it can be called from outside
+ the fop->lock locked region. */
+ if (update) {
+ ec_update_good(fop, cbk->mask);
+ }
+
+ if (resume) {
+ ec_resume(fop, 0);
+ }
+
+ ec_fop_data_release(fop);
+}
+
+/* There could be already granted locks sitting on the bricks, unlock for which
+ * must be wound at all costs*/
+static gf_boolean_t
+ec_must_wind(ec_fop_data_t *fop)
+{
+ if ((fop->id == GF_FOP_INODELK) || (fop->id == GF_FOP_FINODELK) ||
+ (fop->id == GF_FOP_LK)) {
+ if (fop->flock.l_type == F_UNLCK)
+ return _gf_true;
+ } else if ((fop->id == GF_FOP_ENTRYLK) || (fop->id == GF_FOP_FENTRYLK)) {
+ if (fop->entrylk_cmd == ENTRYLK_UNLOCK)
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+static gf_boolean_t
+ec_internal_op(ec_fop_data_t *fop)
+{
+ if (ec_must_wind(fop))
+ return _gf_true;
+ if (fop->id == GF_FOP_XATTROP)
+ return _gf_true;
+ if (fop->id == GF_FOP_FXATTROP)
+ return _gf_true;
+ if (fop->id == GF_FOP_OPEN)
+ return _gf_true;
+ return _gf_false;
+}
+
+char *
+ec_msg_str(ec_fop_data_t *fop)
+{
+ loc_t *loc1 = NULL;
+ loc_t *loc2 = NULL;
+ char gfid1[64] = {0};
+ char gfid2[64] = {0};
+ ec_fop_data_t *parent = fop->parent;
+
+ if (fop->errstr)
+ return fop->errstr;
+ if (!fop->use_fd) {
+ loc1 = &fop->loc[0];
+ loc2 = &fop->loc[1];
+
+ if (fop->id == GF_FOP_RENAME) {
+ gf_asprintf(&fop->errstr,
+ "FOP : '%s' failed on '%s' and '%s' with gfids "
+ "%s and %s respectively. Parent FOP: %s",
+ ec_fop_name(fop->id), loc1->path, loc2->path,
+ uuid_utoa_r(loc1->gfid, gfid1),
+ uuid_utoa_r(loc2->gfid, gfid2),
+ parent ? ec_fop_name(parent->id) : "No Parent");
+ } else {
+ gf_asprintf(
+ &fop->errstr,
+ "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s",
+ ec_fop_name(fop->id), loc1->path,
+ uuid_utoa_r(loc1->gfid, gfid1),
+ parent ? ec_fop_name(parent->id) : "No Parent");
+ }
+ } else {
+ gf_asprintf(
+ &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s",
+ ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1),
+ parent ? ec_fop_name(parent->id) : "No Parent");
+ }
+ return fop->errstr;
+}
+
+static void
+ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need,
+ int32_t loglevel)
+{
+ ec_t *ec = fop->xl->private;
+ char str1[32], str2[32], str3[32];
+
+ gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT,
+ "Insufficient available children for this request: "
+ "Have : %d, Need : %u : Child UP : %s "
+ "Mask: %s, Healing : %s : %s ",
+ have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+ ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+ ec_bin(str3, sizeof(str3), fop->healing, ec->nodes),
+ ec_msg_str(fop));
+}
+
+static int32_t
+ec_child_select(ec_fop_data_t *fop)
+{
+ ec_t *ec = fop->xl->private;
+ int32_t first = 0, num = 0;
+
+ ec_fop_cleanup(fop);
+
+ fop->mask &= ec->node_mask;
+ /* Wind the fop on same subvols as parent for any internal extra fops like
+ * head/tail read in case of writev fop. Unlocks shouldn't do this because
+ * unlock should go on all subvols where lock is performed*/
+ if (fop->parent && !ec_internal_op(fop)) {
+ fop->mask &= (fop->parent->mask & ~fop->parent->healing);
+ if (ec_is_data_fop(fop->id)) {
+ fop->healing |= fop->parent->healing;
+ }
+ }
+
+ if ((fop->mask & ~ec->xl_up) != 0) {
+ gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_EXEC_UNAVAIL,
+ "Executing operation with "
+ "some subvolumes unavailable. (%" PRIXPTR "). %s ",
+ fop->mask & ~ec->xl_up, ec_msg_str(fop));
+ fop->mask &= ec->xl_up;
+ }
+
+ switch (fop->minimum) {
+ case EC_MINIMUM_ALL:
+ fop->minimum = gf_bits_count(fop->mask);
+ if (fop->minimum >= ec->fragments) {
+ break;
+ }
+ case EC_MINIMUM_MIN:
+ fop->minimum = ec->fragments;
+ break;
+ case EC_MINIMUM_ONE:
+ fop->minimum = 1;
+ }
+
+ if (ec->read_policy == EC_ROUND_ROBIN) {
+ first = ec->idx;
+ if (++first >= ec->nodes) {
+ first = 0;
+ }
+ ec->idx = first;
+ }
+
+ num = gf_bits_count(fop->mask);
+ /*Unconditionally wind on healing subvolumes*/
+ fop->mask |= fop->healing;
+ fop->remaining = fop->mask;
+ fop->received = 0;
+
+ ec_trace("SELECT", fop, "");
+
+ if ((num < fop->minimum) && (num < ec->fragments)) {
+ ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR);
+ return 0;
+ }
+
+ if (!fop->parent && fop->lock_count &&
+ (fop->locks[0].update[EC_DATA_TXN] ||
+ fop->locks[0].update[EC_METADATA_TXN])) {
+ if (ec->quorum_count && (num < ec->quorum_count)) {
+ ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+void
+ec_dispatch_next(ec_fop_data_t *fop, uint32_t idx)
+{
+ uint32_t i = EC_INVALID_INDEX;
+ ec_t *ec = fop->xl->private;
+
+ LOCK(&fop->lock);
+
+ i = ec_child_next(ec, fop, idx);
+ if (i < EC_MAX_NODES) {
+ idx = i;
+
+ fop->remaining ^= 1ULL << idx;
+
+ ec_trace("EXECUTE", fop, "idx=%d", idx);
+
+ fop->winds++;
+ fop->refs++;
+ }
+
+ UNLOCK(&fop->lock);
+
+ if (i < EC_MAX_NODES) {
+ fop->wind(ec, fop, idx);
+ }
+}
+
+void
+ec_dispatch_mask(ec_fop_data_t *fop, uintptr_t mask)
+{
+ ec_t *ec = fop->xl->private;
+ int32_t count, idx;
+
+ count = gf_bits_count(mask);
+
+ LOCK(&fop->lock);
+
+ ec_trace("EXECUTE", fop, "mask=%lX", mask);
+
+ fop->remaining ^= mask;
+
+ fop->winds += count;
+ fop->refs += count;
+
+ UNLOCK(&fop->lock);
+
+ idx = 0;
+ while (mask != 0) {
+ if ((mask & 1) != 0) {
+ fop->wind(ec, fop, idx);
+ }
+ idx++;
+ mask >>= 1;
+ }
+}
+
+void
+ec_dispatch_start(ec_fop_data_t *fop)
+{
+ fop->answer = NULL;
+ fop->good = 0;
+
+ INIT_LIST_HEAD(&fop->cbk_list);
+
+ if (fop->lock_count > 0) {
+ ec_owner_copy(fop->frame, &fop->req_frame->root->lk_owner);
+ }
+}
+
+void
+ec_dispatch_one(ec_fop_data_t *fop)
+{
+ ec_dispatch_start(fop);
+
+ if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
+ fop->expected = 1;
+ fop->first = ec_select_first_by_read_policy(fop->xl->private, fop);
+
+ ec_dispatch_next(fop, fop->first);
+ }
+}
+
+gf_boolean_t
+ec_dispatch_one_retry(ec_fop_data_t *fop, ec_cbk_data_t **cbk)
+{
+ ec_cbk_data_t *tmp;
+
+ tmp = ec_fop_prepare_answer(fop, _gf_true);
+ if (cbk != NULL) {
+ *cbk = tmp;
+ }
+ if ((tmp != NULL) && (tmp->op_ret < 0) &&
+ ec_is_recoverable_error(tmp->op_errno)) {
+ GF_ASSERT(fop->mask & (1ULL << tmp->idx));
+ fop->mask ^= (1ULL << tmp->idx);
+ if (fop->mask) {
+ return _gf_true;
+ }
+ }
+
+ return _gf_false;
+}
+
+void
+ec_dispatch_inc(ec_fop_data_t *fop)
+{
+ ec_dispatch_start(fop);
+
+ if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
+ fop->expected = gf_bits_count(fop->remaining);
+ fop->first = 0;
+
+ ec_dispatch_next(fop, 0);
+ }
+}
+
+void
+ec_dispatch_all(ec_fop_data_t *fop)
+{
+ ec_dispatch_start(fop);
+
+ if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
+ fop->expected = gf_bits_count(fop->remaining);
+ fop->first = 0;
+
+ ec_dispatch_mask(fop, fop->remaining);
+ }
+}
+
+void
+ec_dispatch_min(ec_fop_data_t *fop)
+{
+ ec_t *ec = fop->xl->private;
+ uintptr_t mask;
+ uint32_t idx;
+ int32_t count;
+
+ ec_dispatch_start(fop);
+
+ if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
+ fop->expected = count = ec->fragments;
+ fop->first = ec_select_first_by_read_policy(fop->xl->private, fop);
+ idx = fop->first - 1;
+ mask = 0;
+ while (count-- > 0) {
+ idx = ec_child_next(ec, fop, idx + 1);
+ if (idx < EC_MAX_NODES)
+ mask |= 1ULL << idx;
+ }
+
+ ec_dispatch_mask(fop, mask);
+ }
+}
+
+void
+ec_succeed_all(ec_fop_data_t *fop)
+{
+ ec_dispatch_start(fop);
+
+ if (ec_child_select(fop)) {
+ fop->expected = gf_bits_count(fop->remaining);
+ fop->first = 0;
+
+ /* Simulate a successful execution on all bricks */
+ ec_trace("SUCCEED", fop, "");
+
+ fop->good = fop->remaining;
+ fop->remaining = 0;
+ }
+}
+
+ec_lock_t *
+ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc)
+{
+ ec_t *ec = fop->xl->private;
+ ec_lock_t *lock;
+ int32_t err;
+
+ if ((loc->inode == NULL) ||
+ (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid))) {
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_INODE,
+ "Trying to lock based on an invalid "
+ "inode");
+
+ __ec_fop_set_error(fop, EINVAL);
+
+ return NULL;
+ }
+
+ lock = mem_get0(ec->lock_pool);
+ if (lock != NULL) {
+ lock->good_mask = UINTPTR_MAX;
+ INIT_LIST_HEAD(&lock->owners);
+ INIT_LIST_HEAD(&lock->waiting);
+ INIT_LIST_HEAD(&lock->frozen);
+ err = ec_loc_from_loc(fop->xl, &lock->loc, loc);
+ if (err != 0) {
+ mem_put(lock);
+ lock = NULL;
+
+ __ec_fop_set_error(fop, -err);
+ }
+ }
+
+ return lock;
+}
+
+void
+ec_lock_destroy(ec_lock_t *lock)
+{
+ loc_wipe(&lock->loc);
+ if (lock->fd != NULL) {
+ fd_unref(lock->fd);
+ }
+
+ mem_put(lock);
+}
+
+int32_t
+ec_lock_compare(ec_lock_t *lock1, ec_lock_t *lock2)
+{
+ return gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid);
+}
+
+static void
+ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, loc_t *base,
+ off_t fl_start, uint64_t fl_size)
+{
+ ec_lock_link_t *link;
+
+ /* This check is only prepared for up to 2 locks per fop. If more locks
+ * are needed this must be changed. */
+ if ((fop->lock_count > 0) &&
+ (ec_lock_compare(fop->locks[0].lock, lock) < 0)) {
+ fop->first_lock = fop->lock_count;
+ } else {
+ /* When the first lock is added to the current fop, request lock
+ * counts from locks xlator to be able to determine if there is
+ * contention and release the lock sooner. */
+ if (fop->xdata == NULL) {
+ fop->xdata = dict_new();
+ if (fop->xdata == NULL) {
+ ec_fop_set_error(fop, ENOMEM);
+ return;
+ }
+ }
+ if (dict_set_str(fop->xdata, GLUSTERFS_INODELK_DOM_COUNT,
+ fop->xl->name) != 0) {
+ ec_fop_set_error(fop, ENOMEM);
+ return;
+ }
+ }
+
+ link = &fop->locks[fop->lock_count++];
+
+ link->lock = lock;
+ link->fop = fop;
+ link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0;
+ link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
+ link->base = base;
+ link->fl_start = fl_start;
+ link->fl_end = ec_range_end_get(fl_start, fl_size);
+
+ lock->refs_pending++;
+}
+
+static void
+ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
+ loc_t *base, off_t fl_start, uint64_t fl_size)
+{
+ ec_lock_t *lock = NULL;
+ ec_inode_t *ctx;
+
+ if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL)) {
+ return;
+ }
+
+ LOCK(&loc->inode->lock);
+
+ ctx = __ec_inode_get(loc->inode, fop->xl);
+ if (ctx == NULL) {
+ __ec_fop_set_error(fop, ENOMEM);
+
+ goto unlock;
+ }
+
+ if (ctx->inode_lock != NULL) {
+ lock = ctx->inode_lock;
+
+ /* If there's another lock, make sure that it's not the same. Otherwise
+ * do not insert it.
+ *
+ * This can only happen on renames where source and target names are
+ * in the same directory. */
+ if ((fop->lock_count > 0) && (fop->locks[0].lock == lock)) {
+ /* Combine data/meta updates */
+ fop->locks[0].update[EC_DATA_TXN] |= (flags & EC_UPDATE_DATA) != 0;
+ fop->locks[0].update[EC_METADATA_TXN] |= (flags & EC_UPDATE_META) !=
+ 0;
+
+ /* Only one base inode is allowed per fop, so there shouldn't be
+ * overwrites here. */
+ if (base != NULL) {
+ fop->locks[0].base = base;
+ }
+
+ goto update_query;
+ }
+
+ ec_trace("LOCK_INODELK", fop,
+ "lock=%p, inode=%p. Lock already "
+ "acquired",
+ lock, loc->inode);
+
+ goto insert;
+ }
+
+ lock = ec_lock_allocate(fop, loc);
+ if (lock == NULL) {
+ goto unlock;
+ }
+
+ ec_trace("LOCK_CREATE", fop, "lock=%p", lock);
+
+ lock->flock.l_type = F_WRLCK;
+ lock->flock.l_whence = SEEK_SET;
+
+ lock->ctx = ctx;
+ ctx->inode_lock = lock;
+
+insert:
+ ec_lock_insert(fop, lock, flags, base, fl_start, fl_size);
+update_query:
+ lock->query |= (flags & EC_QUERY_INFO) != 0;
+unlock:
+ UNLOCK(&loc->inode->lock);
+}
+
+void
+ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
+ off_t fl_start, uint64_t fl_size)
+{
+ ec_lock_prepare_inode_internal(fop, loc, flags, NULL, fl_start, fl_size);
+}
+
+void
+ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
+ uint32_t flags)
+{
+ loc_t tmp;
+ int32_t err;
+
+ if (fop->error != 0) {
+ return;
+ }
+
+ err = ec_loc_parent(fop->xl, loc, &tmp);
+ if (err != 0) {
+ ec_fop_set_error(fop, -err);
+
+ return;
+ }
+
+ if ((flags & EC_INODE_SIZE) != 0) {
+ flags ^= EC_INODE_SIZE;
+ } else {
+ base = NULL;
+ }
+
+ ec_lock_prepare_inode_internal(fop, &tmp, flags, base, 0, EC_RANGE_FULL);
+
+ loc_wipe(&tmp);
+}
+
+void
+ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, off_t fl_start,
+ uint64_t fl_size)
+{
+ loc_t loc;
+ int32_t err;
+
+ if (fop->error != 0) {
+ return;
+ }
+
+ err = ec_loc_from_fd(fop->xl, &loc, fd);
+ if (err != 0) {
+ ec_fop_set_error(fop, -err);
+
+ return;
+ }
+
+ ec_lock_prepare_inode_internal(fop, &loc, flags, NULL, fl_start, fl_size);
+
+ loc_wipe(&loc);
+}
+
+gf_boolean_t
+ec_config_check(xlator_t *xl, ec_config_t *config)
+{
+ ec_t *ec;
+
+ ec = xl->private;
+ if ((config->version != EC_CONFIG_VERSION) ||
+ (config->algorithm != EC_CONFIG_ALGORITHM) ||
+ (config->gf_word_size != EC_GF_BITS) || (config->bricks != ec->nodes) ||
+ (config->redundancy != ec->redundancy) ||
+ (config->chunk_size != EC_METHOD_CHUNK_SIZE)) {
+ uint32_t data_bricks;
+
+ /* This combination of version/algorithm requires the following
+ values. Incorrect values for these fields are a sign of
+ corruption:
+
+ redundancy > 0
+ redundancy * 2 < bricks
+ gf_word_size must be a power of 2
+ chunk_size (in bits) must be a multiple of gf_word_size *
+ (bricks - redundancy) */
+
+ data_bricks = config->bricks - config->redundancy;
+ if ((config->redundancy < 1) ||
+ (config->redundancy * 2 >= config->bricks) ||
+ !ec_is_power_of_2(config->gf_word_size) ||
+ ((config->chunk_size * 8) % (config->gf_word_size * data_bricks) !=
+ 0)) {
+ gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_CONFIG,
+ "Invalid or corrupted config");
+ } else {
+ gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_CONFIG,
+ "Unsupported config "
+ "(V=%u, A=%u, W=%u, "
+ "N=%u, R=%u, S=%u)",
+ config->version, config->algorithm, config->gf_word_size,
+ config->bricks, config->redundancy, config->chunk_size);
+ }
+
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+gf_boolean_t
+ec_set_dirty_flag(ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty)
+{
+ gf_boolean_t set_dirty = _gf_false;
+
+ if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) {
+ if (!link->optimistic_changelog)
+ dirty[EC_DATA_TXN] = 1;
+ }
+
+ if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) {
+ if (!link->optimistic_changelog)
+ dirty[EC_METADATA_TXN] = 1;
+ }
+
+ if (dirty[EC_METADATA_TXN] || dirty[EC_DATA_TXN]) {
+ set_dirty = _gf_true;
+ }
+
+ return set_dirty;
+}
+
+int32_t
+ec_prepare_update_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ struct list_head list;
+ ec_fop_data_t *fop = cookie, *parent, *tmp;
+ ec_lock_link_t *parent_link = fop->data;
+ ec_lock_link_t *link = NULL;
+ ec_lock_t *lock = NULL;
+ ec_inode_t *ctx;
+ gf_boolean_t release = _gf_false;
+ uint64_t provided_flags = 0;
+ uint64_t dirty[EC_VERSION_SIZE] = {0, 0};
+ lock = parent_link->lock;
+ parent = parent_link->fop;
+ ctx = lock->ctx;
+
+ INIT_LIST_HEAD(&list);
+ provided_flags = EC_PROVIDED_FLAGS(parent_link->waiting_flags);
+
+ LOCK(&lock->loc.inode->lock);
+
+ list_for_each_entry(link, &lock->owners, owner_list)
+ {
+ if ((link->waiting_flags & provided_flags) != 0) {
+ link->waiting_flags ^= (link->waiting_flags & provided_flags);
+ if (EC_NEEDED_FLAGS(link->waiting_flags) == 0)
+ list_add_tail(&link->fop->cbk_list, &list);
+ }
+ }
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_SIZE_VERS_GET_FAIL,
+ "Failed to get size and version : %s", ec_msg_str(fop));
+
+ goto unlock;
+ }
+
+ if (EC_FLAGS_HAVE(provided_flags, EC_FLAG_XATTROP)) {
+ op_errno = -ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version,
+ EC_VERSION_SIZE);
+ if (op_errno != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ EC_MSG_VER_XATTR_GET_FAIL, "Unable to get version xattr. %s",
+ ec_msg_str(fop));
+ goto unlock;
+ }
+ ctx->post_version[0] += ctx->pre_version[0];
+ ctx->post_version[1] += ctx->pre_version[1];
+
+ ctx->have_version = _gf_true;
+
+ if (lock->loc.inode->ia_type == IA_IFREG ||
+ lock->loc.inode->ia_type == IA_INVAL) {
+ op_errno = -ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size);
+ if (op_errno != 0) {
+ if (lock->loc.inode->ia_type == IA_IFREG) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ EC_MSG_SIZE_XATTR_GET_FAIL,
+ "Unable to get size xattr. %s", ec_msg_str(fop));
+ goto unlock;
+ }
+ } else {
+ ctx->post_size = ctx->pre_size;
+
+ ctx->have_size = _gf_true;
+ }
+
+ op_errno = -ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config);
+ if (op_errno != 0) {
+ if ((lock->loc.inode->ia_type == IA_IFREG) ||
+ (op_errno != ENODATA)) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ EC_MSG_CONFIG_XATTR_GET_FAIL,
+ "Unable to get config xattr. %s", ec_msg_str(fop));
+
+ goto unlock;
+ }
+ } else {
+ if (!ec_config_check(parent->xl, &ctx->config)) {
+ gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+ EC_MSG_CONFIG_XATTR_INVALID, "Invalid config xattr");
+
+ op_errno = EINVAL;
+
+ goto unlock;
+ }
+ ctx->have_config = _gf_true;
+ }
+ }
+ ctx->have_info = _gf_true;
+ }
+
+ ec_set_dirty_flag(fop->data, ctx, dirty);
+ if (dirty[EC_METADATA_TXN] &&
+ (EC_FLAGS_HAVE(provided_flags, EC_FLAG_METADATA_DIRTY))) {
+ GF_ASSERT(!ctx->dirty[EC_METADATA_TXN]);
+ ctx->dirty[EC_METADATA_TXN] = 1;
+ }
+
+ if (dirty[EC_DATA_TXN] &&
+ (EC_FLAGS_HAVE(provided_flags, EC_FLAG_DATA_DIRTY))) {
+ GF_ASSERT(!ctx->dirty[EC_DATA_TXN]);
+ ctx->dirty[EC_DATA_TXN] = 1;
+ }
+ op_errno = 0;
+unlock:
+
+ lock->waiting_flags ^= provided_flags;
+
+ if (op_errno == 0) {
+ /* If the fop fails on any of the good bricks, it is important to mark
+ * it dirty and update versions right away if dirty was not set before.
+ */
+ if (lock->good_mask & ~(fop->good | fop->remaining)) {
+ release = _gf_true;
+ }
+
+ if (parent_link->update[0] && !parent_link->dirty[0]) {
+ lock->release |= release;
+ }
+
+ if (parent_link->update[1] && !parent_link->dirty[1]) {
+ lock->release |= release;
+ }
+
+ /* We don't allow the main fop to be executed on bricks that have not
+ * succeeded the initial xattrop. */
+ ec_lock_update_good(lock, fop);
+
+ /*As of now only data healing marks bricks as healing*/
+ lock->healing |= fop->healing;
+ }
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ while (!list_empty(&list)) {
+ tmp = list_entry(list.next, ec_fop_data_t, cbk_list);
+ list_del_init(&tmp->cbk_list);
+
+ if (op_errno == 0) {
+ tmp->mask &= fop->good;
+
+ /*As of now only data healing marks bricks as healing*/
+ if (ec_is_data_fop(tmp->id)) {
+ tmp->healing |= fop->healing;
+ }
+ }
+
+ ec_resume(tmp, op_errno);
+ }
+
+ return 0;
+}
+
+static gf_boolean_t
+ec_set_needed_flag(ec_lock_t *lock, ec_lock_link_t *link, uint64_t flag)
+{
+ uint64_t current;
+
+ link->waiting_flags |= EC_FLAG_NEEDS(flag);
+
+ current = EC_NEEDED_FLAGS(lock->waiting_flags);
+ if (!EC_FLAGS_HAVE(current, flag)) {
+ lock->waiting_flags |= EC_FLAG_NEEDS(flag);
+ link->waiting_flags |= EC_FLAG_PROVIDES(flag);
+
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+static uint64_t
+ec_set_xattrop_flags_and_params(ec_lock_t *lock, ec_lock_link_t *link,
+ uint64_t *dirty)
+{
+ uint64_t oldflags = 0;
+ uint64_t newflags = 0;
+ ec_inode_t *ctx = lock->ctx;
+
+ oldflags = EC_NEEDED_FLAGS(lock->waiting_flags);
+
+ if (lock->query && !ctx->have_info) {
+ ec_set_needed_flag(lock, link, EC_FLAG_XATTROP);
+ }
+
+ if (dirty[EC_DATA_TXN]) {
+ if (!ec_set_needed_flag(lock, link, EC_FLAG_DATA_DIRTY)) {
+ dirty[EC_DATA_TXN] = 0;
+ }
+ }
+
+ if (dirty[EC_METADATA_TXN]) {
+ if (!ec_set_needed_flag(lock, link, EC_FLAG_METADATA_DIRTY)) {
+ dirty[EC_METADATA_TXN] = 0;
+ }
+ }
+ newflags = EC_NEEDED_FLAGS(lock->waiting_flags);
+
+ return oldflags ^ newflags;
+}
+
+void
+ec_get_size_version(ec_lock_link_t *link)
+{
+ loc_t loc;
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
+ ec_fop_data_t *fop;
+ dict_t *dict = NULL;
+ dict_t *xdata = NULL;
+ ec_t *ec = NULL;
+ int32_t error = 0;
+ gf_boolean_t set_dirty = _gf_false;
+ uint64_t allzero[EC_VERSION_SIZE] = {0, 0};
+ uint64_t dirty[EC_VERSION_SIZE] = {0, 0};
+ lock = link->lock;
+ ctx = lock->ctx;
+ fop = link->fop;
+ ec = fop->xl->private;
+ uint64_t changed_flags = 0;
+
+ if (ec->optimistic_changelog && !(ec->node_mask & ~link->lock->good_mask) &&
+ !ec_is_data_fop(fop->id))
+ link->optimistic_changelog = _gf_true;
+
+ memset(&loc, 0, sizeof(loc));
+
+ LOCK(&lock->loc.inode->lock);
+
+ set_dirty = ec_set_dirty_flag(link, ctx, dirty);
+
+ /* If ec metadata has already been retrieved, do not try again. */
+ if (ctx->have_info) {
+ if (ec_is_data_fop(fop->id)) {
+ fop->healing |= lock->healing;
+ }
+ if (!set_dirty)
+ goto unlock;
+ }
+
+ /* Determine if there's something we need to retrieve for the current
+ * operation. */
+ if (!set_dirty && !lock->query && (lock->loc.inode->ia_type != IA_IFREG) &&
+ (lock->loc.inode->ia_type != IA_INVAL)) {
+ goto unlock;
+ }
+
+ changed_flags = ec_set_xattrop_flags_and_params(lock, link, dirty);
+ if (link->waiting_flags) {
+ /* This fop needs to wait until all its flags are cleared which
+ * potentially can be cleared by other xattrops that are already
+ * wound*/
+ ec_sleep(fop);
+ } else {
+ GF_ASSERT(!changed_flags);
+ }
+
+unlock:
+ UNLOCK(&lock->loc.inode->lock);
+
+ if (!changed_flags)
+ goto out;
+
+ dict = dict_new();
+ if (dict == NULL) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ if (EC_FLAGS_HAVE(changed_flags, EC_FLAG_XATTROP)) {
+ /* Once we know that an xattrop will be needed,
+ * we try to get all available information in a
+ * single call. */
+ error = ec_dict_set_array(dict, EC_XATTR_VERSION, allzero,
+ EC_VERSION_SIZE);
+ if (error != 0) {
+ goto out;
+ }
+
+ if (lock->loc.inode->ia_type == IA_IFREG ||
+ lock->loc.inode->ia_type == IA_INVAL) {
+ error = ec_dict_set_number(dict, EC_XATTR_SIZE, 0);
+ if (error == 0) {
+ error = ec_dict_set_number(dict, EC_XATTR_CONFIG, 0);
+ }
+ if (error != 0) {
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (xdata == NULL || dict_set_int32(xdata, GF_GET_SIZE, 1)) {
+ error = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
+ if (memcmp(allzero, dirty, sizeof(allzero))) {
+ error = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
+ if (error != 0) {
+ goto out;
+ }
+ }
+
+ fop->frame->root->uid = 0;
+ fop->frame->root->gid = 0;
+
+ /* For normal fops, ec_[f]xattrop() must succeed on at least
+ * EC_MINIMUM_MIN bricks, however when this is called as part of a
+ * self-heal operation the mask of target bricks (fop->mask) could
+ * contain less than EC_MINIMUM_MIN bricks, causing the xattrop to
+ * always fail. Thus we always use the same minimum used for the main
+ * fop.
+ */
+ if (lock->fd == NULL) {
+ error = ec_loc_from_loc(fop->xl, &loc, &lock->loc);
+ if (error != 0) {
+ goto out;
+ }
+ if (gf_uuid_is_null(loc.pargfid)) {
+ if (loc.parent != NULL) {
+ inode_unref(loc.parent);
+ loc.parent = NULL;
+ }
+ GF_FREE((char *)loc.path);
+ loc.path = NULL;
+ loc.name = NULL;
+ }
+
+ ec_xattrop(fop->frame, fop->xl, fop->mask, fop->minimum,
+ ec_prepare_update_cbk, link, &loc, GF_XATTROP_ADD_ARRAY64,
+ dict, xdata);
+ } else {
+ ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum,
+ ec_prepare_update_cbk, link, lock->fd,
+ GF_XATTROP_ADD_ARRAY64, dict, xdata);
+ }
+
+ error = 0;
+
+out:
+ fop->frame->root->uid = fop->uid;
+ fop->frame->root->gid = fop->gid;
+
+ loc_wipe(&loc);
+
+ if (dict != NULL) {
+ dict_unref(dict);
+ }
+
+ if (xdata != NULL) {
+ dict_unref(xdata);
+ }
+
+ if (error != 0) {
+ ec_fop_set_error(fop, -error);
+ }
+}
+
+gf_boolean_t
+__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size)
+{
+ ec_inode_t *ctx;
+ gf_boolean_t found = _gf_false;
+
+ ctx = __ec_inode_get(inode, fop->xl);
+ if (ctx == NULL) {
+ goto out;
+ }
+
+ if (ctx->have_size) {
+ *size = ctx->post_size;
+ found = _gf_true;
+ }
+
+out:
+ return found;
+}
+
+gf_boolean_t
+ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size)
+{
+ gf_boolean_t found = _gf_false;
+
+ LOCK(&inode->lock);
+ {
+ found = __ec_get_inode_size(fop, inode, size);
+ }
+ UNLOCK(&inode->lock);
+
+ return found;
+}
+
+gf_boolean_t
+__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size)
+{
+ ec_inode_t *ctx;
+ gf_boolean_t found = _gf_false;
+
+ ctx = __ec_inode_get(inode, fop->xl);
+ if (ctx == NULL) {
+ goto out;
+ }
+
+ /* Normal fops always have ctx->have_size set. However self-heal calls this
+ * to prepare the inode, so ctx->have_size will be false. In this case we
+ * prepare both pre_size and post_size, and set have_size and have_info to
+ * true. */
+ if (!ctx->have_size) {
+ ctx->pre_size = size;
+ ctx->have_size = ctx->have_info = _gf_true;
+ }
+ ctx->post_size = size;
+
+ found = _gf_true;
+
+out:
+ return found;
+}
+
+gf_boolean_t
+ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size)
+{
+ gf_boolean_t found = _gf_false;
+
+ LOCK(&inode->lock);
+ {
+ found = __ec_set_inode_size(fop, inode, size);
+ }
+ UNLOCK(&inode->lock);
+
+ return found;
+}
+
+static void
+ec_release_stripe_cache(ec_inode_t *ctx)
+{
+ ec_stripe_list_t *stripe_cache = NULL;
+ ec_stripe_t *stripe = NULL;
+
+ stripe_cache = &ctx->stripe_cache;
+ while (!list_empty(&stripe_cache->lru)) {
+ stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru);
+ list_del(&stripe->lru);
+ GF_FREE(stripe);
+ }
+ stripe_cache->count = 0;
+ stripe_cache->max = 0;
+}
+
+void
+ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode)
+{
+ ec_inode_t *ctx;
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get(inode, fop->xl);
+ if (ctx == NULL) {
+ goto unlock;
+ }
+
+ ec_release_stripe_cache(ctx);
+ ctx->have_info = _gf_false;
+ ctx->have_config = _gf_false;
+ ctx->have_version = _gf_false;
+ ctx->have_size = _gf_false;
+
+ memset(&ctx->config, 0, sizeof(ctx->config));
+ memset(ctx->pre_version, 0, sizeof(ctx->pre_version));
+ memset(ctx->post_version, 0, sizeof(ctx->post_version));
+ ctx->pre_size = ctx->post_size = 0;
+ memset(ctx->dirty, 0, sizeof(ctx->dirty));
+
+unlock:
+ UNLOCK(&inode->lock);
+}
+
+int32_t
+ec_get_real_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_lock_link_t *link;
+
+ if (op_ret >= 0) {
+ link = fop->data;
+ link->size = buf->ia_size;
+ } else {
+ /* Prevent failure of parent fop. */
+ fop->error = 0;
+ }
+
+ return 0;
+}
+
+/* This function is used to get the trusted.ec.size xattr from a file when
+ * no lock is needed on the inode. This is only required to maintain iatt
+ * structs on fops that manipulate directory entries but do not operate
+ * directly on the inode, like link, rename, ...
+ *
+ * Any error processing this request is ignored. In the worst case, an invalid
+ * or not up to date value in the iatt could cause some cache invalidation.
+ */
+void
+ec_get_real_size(ec_lock_link_t *link)
+{
+ ec_fop_data_t *fop;
+ dict_t *xdata;
+
+ if (link->base == NULL || link->base->inode == NULL) {
+ return;
+ }
+
+ if (link->base->inode->ia_type != IA_IFREG) {
+ return;
+ }
+
+ fop = link->fop;
+
+ if (ec_get_inode_size(fop, link->base->inode, &link->size)) {
+ return;
+ }
+
+ xdata = dict_new();
+ if (xdata == NULL) {
+ return;
+ }
+ if (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) {
+ goto out;
+ }
+
+ /* Send a simple lookup. A single answer is considered ok since this value
+ * is only used to return an iatt struct related to an inode that is not
+ * locked and have not suffered any operation. */
+ ec_lookup(fop->frame, fop->xl, fop->mask, 1, ec_get_real_size_cbk, link,
+ link->base, xdata);
+
+out:
+ if (xdata != NULL) {
+ dict_unref(xdata);
+ }
+}
+
+static void
+ec_lock_update_fd(ec_lock_t *lock, ec_fop_data_t *fop)
+{
+ /* If the fop has an fd available, attach it to the lock structure to be
+ * able to do fxattrop calls instead of xattrop. */
+ if (fop->use_fd && (lock->fd == NULL)) {
+ lock->fd = __fd_ref(fop->fd);
+ }
+}
+
+static gf_boolean_t
+ec_link_has_lock_conflict(ec_lock_link_t *link, gf_boolean_t waitlist_check)
+{
+ ec_lock_link_t *trav_link = NULL;
+
+ list_for_each_entry(trav_link, &link->lock->owners, owner_list)
+ {
+ if (ec_lock_conflict(trav_link, link))
+ return _gf_true;
+ }
+
+ if (!waitlist_check)
+ return _gf_false;
+
+ list_for_each_entry(trav_link, &link->lock->waiting, wait_list)
+ {
+ if (ec_lock_conflict(trav_link, link))
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+static void
+ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)
+{
+ ec_fop_data_t *fop;
+ ec_lock_link_t *link;
+ gf_boolean_t conflict = _gf_false;
+
+ while (!conflict && !list_empty(&lock->waiting)) {
+ link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list);
+ fop = link->fop;
+
+ /* If lock is not acquired, at most one fop can be assigned as owner.
+ * The following fops will need to wait in the lock->waiting queue
+ * until the lock has been fully acquired. */
+ conflict = !lock->acquired;
+
+ /* If the fop is not shareable, only this fop can be assigned as owner.
+ * Other fops will need to wait until this one finishes. */
+ if (ec_link_has_lock_conflict(link, _gf_false)) {
+ conflict = _gf_true;
+ }
+
+ /* If only one fop is allowed, it can be assigned as the owner of the
+ * lock only if there weren't any other owner. */
+ if (conflict && !list_empty(&lock->owners)) {
+ break;
+ }
+
+ list_move_tail(&link->wait_list, list);
+
+ list_add_tail(&link->owner_list, &lock->owners);
+ lock->refs_owners++;
+
+ ec_lock_update_fd(lock, fop);
+ }
+}
+
+static void
+ec_lock_apply(ec_lock_link_t *link)
+{
+ ec_fop_data_t *fop = link->fop;
+
+ fop->mask &= link->lock->good_mask;
+ fop->locked++;
+
+ ec_get_size_version(link);
+ ec_get_real_size(link);
+}
+
+gf_boolean_t
+ec_lock_acquire(ec_lock_link_t *link);
+
+static void
+ec_lock_resume_shared(struct list_head *list)
+{
+ ec_lock_link_t *link;
+
+ while (!list_empty(list)) {
+ link = list_entry(list->next, ec_lock_link_t, wait_list);
+ list_del_init(&link->wait_list);
+
+ if (link->lock->acquired) {
+ ec_lock_apply(link);
+ ec_lock(link->fop);
+ } else {
+ GF_ASSERT(list_empty(list));
+
+ ec_lock_acquire(link);
+ }
+
+ ec_resume(link->fop, 0);
+ }
+}
+
+void
+ec_lock_acquired(ec_lock_link_t *link)
+{
+ struct list_head list;
+ ec_lock_t *lock;
+ ec_fop_data_t *fop;
+
+ lock = link->lock;
+ fop = link->fop;
+
+ ec_trace("LOCKED", fop, "lock=%p", lock);
+
+ INIT_LIST_HEAD(&list);
+
+ LOCK(&lock->loc.inode->lock);
+
+ lock->acquired = _gf_true;
+ if (lock->contention) {
+ lock->release = _gf_true;
+ lock->contention = _gf_false;
+ }
+
+ ec_lock_update_fd(lock, fop);
+ ec_lock_wake_shared(lock, &list);
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ ec_lock_apply(link);
+
+ if (fop->use_fd &&
+ (link->update[EC_DATA_TXN] || link->update[EC_METADATA_TXN])) {
+ /* Try to reopen closed fd's only if lock has succeeded. */
+ ec_fix_open(fop, lock->mask);
+ }
+
+ ec_lock_resume_shared(&list);
+}
+
+int32_t
+ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_lock_link_t *link = NULL;
+ ec_lock_t *lock = NULL;
+
+ link = fop->data;
+ lock = link->lock;
+ if (op_ret >= 0) {
+ lock->mask = lock->good_mask = fop->good;
+ lock->healing = 0;
+
+ ec_lock_acquired(link);
+ ec_lock(fop->parent);
+ } else {
+ LOCK(&lock->loc.inode->lock);
+ {
+ lock->contention = _gf_false;
+ }
+ UNLOCK(&lock->loc.inode->lock);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_PREOP_LOCK_FAILED,
+ "Failed to complete preop lock");
+ }
+
+ return 0;
+}
+
+gf_boolean_t
+ec_lock_acquire(ec_lock_link_t *link)
+{
+ ec_lock_t *lock;
+ ec_fop_data_t *fop;
+ gf_lkowner_t lk_owner;
+
+ lock = link->lock;
+ fop = link->fop;
+
+ if (!lock->acquired) {
+ set_lk_owner_from_ptr(&lk_owner, lock);
+
+ ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock,
+ lock->loc.inode);
+
+ lock->flock.l_type = F_WRLCK;
+ ec_inodelk(fop->frame, fop->xl, &lk_owner, -1, EC_MINIMUM_ALL,
+ ec_locked, link, fop->xl->name, &lock->loc, F_SETLKW,
+ &lock->flock, NULL);
+
+ return _gf_false;
+ }
+
+ ec_trace("LOCK_REUSE", fop, "lock=%p", lock);
+
+ ec_lock_acquired(link);
+
+ return _gf_true;
+}
+
+static ec_lock_link_t *
+ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock)
+{
+ ec_lock_link_t *timer_link;
+
+ /* If we don't have any timer, there's nothing to cancel. */
+ if (lock->timer == NULL) {
+ return NULL;
+ }
+
+ /* We are trying to access a lock that has an unlock timer active.
+ * This means that the lock must be idle, i.e. no fop can be in the
+ * owner, waiting or frozen lists. It also means that the lock cannot
+ * have been marked as being released (this is done without timers).
+ * There should only be one owner reference, but it's possible that
+ * some fops are being prepared to use this lock. */
+ GF_ASSERT((lock->refs_owners == 1) && list_empty(&lock->owners) &&
+ list_empty(&lock->waiting));
+
+ /* We take the timer_link before cancelling the timer, since a
+ * successful cancellation will destroy it. It must not be NULL
+ * because it references the fop responsible for the delayed unlock
+ * that we are currently trying to cancel. */
+ timer_link = lock->timer->data;
+ GF_ASSERT(timer_link != NULL);
+
+ if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) {
+ /* It's too late to avoid the execution of the timer callback.
+ * Since we need to be sure that the callback has access to all
+ * needed resources, we cannot resume the execution of the
+ * timer fop now. This will be done in the callback. */
+ timer_link = NULL;
+ } else {
+ /* The timer has been cancelled. The fop referenced by
+ * timer_link holds the last reference. The caller is
+ * responsible to release it when not needed anymore. */
+ ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+ }
+
+ /* We have two options here:
+ *
+ * 1. The timer has been successfully cancelled.
+ *
+ * This is the easiest case and we can continue with the currently
+ * acquired lock.
+ *
+ * 2. The timer callback has already been fired.
+ *
+ * In this case we have not been able to cancel the timer before
+ * the timer callback has been fired, but we also know that
+ * lock->timer != NULL. This means that the timer callback is still
+ * trying to acquire the inode mutex that we currently own. We are
+ * safe until we release it. In this case we can safely clear
+ * lock->timer. This will cause that the timer callback does nothing
+ * once it acquires the mutex.
+ */
+ lock->timer = NULL;
+
+ return timer_link;
+}
+
+static gf_boolean_t
+ec_lock_assign_owner(ec_lock_link_t *link)
+{
+ ec_fop_data_t *fop;
+ ec_lock_t *lock;
+ ec_lock_link_t *timer_link = NULL;
+ gf_boolean_t assigned = _gf_false;
+
+ /* The link cannot be in any list because we have just finished preparing
+ * it. */
+ GF_ASSERT(list_empty(&link->wait_list));
+
+ fop = link->fop;
+ lock = link->lock;
+
+ LOCK(&lock->loc.inode->lock);
+
+ /* Since the link has just been prepared but it's not active yet, the
+ * refs_pending must be one at least (the ref owned by this link). */
+ GF_ASSERT(lock->refs_pending > 0);
+ /* The link is not pending any more. It will be assigned to the owner,
+ * waiting or frozen list. */
+ lock->refs_pending--;
+
+ if (lock->release) {
+ ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock);
+
+ /* When lock->release is set, we'll unlock the lock as soon as
+ * possible, meaning that we won't use a timer. */
+ GF_ASSERT(lock->timer == NULL);
+
+ /* The lock is marked to be released. We can still have owners and fops
+ * in the waiting ilist f they have been added before the lock has been
+ * marked to be released. However new fops are put into the frozen list
+ * to wait for the next unlock/lock cycle. */
+ list_add_tail(&link->wait_list, &lock->frozen);
+
+ goto unlock;
+ }
+
+ /* The lock is not marked to be released, so the frozen list should be
+ * empty. */
+ GF_ASSERT(list_empty(&lock->frozen));
+
+ timer_link = ec_lock_timer_cancel(fop->xl, lock);
+
+ if (!list_empty(&lock->owners)) {
+ /* There are other owners of this lock. We can only take ownership if
+ * the lock is already acquired and doesn't have conflict with existing
+ * owners, or waiters(to prevent starvation).
+ * Otherwise we need to wait.
+ */
+ if (!lock->acquired || ec_link_has_lock_conflict(link, _gf_true)) {
+ ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
+
+ list_add_tail(&link->wait_list, &lock->waiting);
+
+ goto unlock;
+ }
+ }
+
+ list_add_tail(&link->owner_list, &lock->owners);
+
+ /* If timer_link is not NULL, it means that we have inherited the owner
+ * reference assigned to the timer fop. In this case we simply reuse it.
+ * Otherwise we need to increase the number of owners. */
+ if (timer_link == NULL) {
+ lock->refs_owners++;
+ }
+
+ assigned = _gf_true;
+
+unlock:
+ if (!assigned) {
+ /* We have not been able to take ownership of this lock. The fop must
+ * be put to sleep. */
+ ec_sleep(fop);
+ }
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ /* If we have cancelled the timer, we need to resume the fop that was
+ * waiting for it. */
+ if (timer_link != NULL) {
+ ec_resume(timer_link->fop, 0);
+ }
+
+ return assigned;
+}
+
+static void
+ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
+ gf_boolean_t release)
+{
+ struct list_head list;
+ ec_lock_t *lock = link->lock;
+ ec_fop_data_t *fop = link->fop;
+ ec_inode_t *ctx = lock->ctx;
+
+ INIT_LIST_HEAD(&list);
+
+ LOCK(&lock->loc.inode->lock);
+
+ ec_trace("LOCK_DONE", fop, "lock=%p", lock);
+
+ /* Current link must belong to the owner list of the lock. We don't
+ * decrement lock->refs_owners here because the inode mutex is released
+ * before ec_unlock() is called and we need to know when the last owner
+ * unlocks the lock to do proper cleanup. lock->refs_owners is used for
+ * this task. */
+ GF_ASSERT((lock->refs_owners > 0) && !list_empty(&link->owner_list));
+ list_del_init(&link->owner_list);
+
+ lock->release |= release;
+
+ if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) {
+ if (link->update[0]) {
+ ctx->post_version[0]++;
+ }
+ if (link->update[1]) {
+ ctx->post_version[1]++;
+ }
+ /* If the fop fails on any of the good bricks, it is important to mark
+ * it dirty and update versions right away. */
+ if (link->update[0] || link->update[1]) {
+ if (lock->good_mask & ~(fop->good | fop->remaining)) {
+ lock->release = _gf_true;
+ }
+ }
+ }
+
+ if (fop->healing) {
+ lock->healing = fop->healing & (fop->good | fop->remaining);
+ }
+ ec_lock_update_good(lock, fop);
+
+ ec_lock_wake_shared(lock, &list);
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ ec_lock_resume_shared(&list);
+}
+
+void
+ec_lock(ec_fop_data_t *fop)
+{
+ ec_lock_link_t *link;
+
+ /* There is a chance that ec_resume is called on fop even before ec_sleep.
+ * Which can result in refs == 0 for fop leading to use after free in this
+ * function when it calls ec_sleep so do ec_sleep at start and ec_resume at
+ * the end of this function.*/
+ ec_sleep(fop);
+
+ while (fop->locked < fop->lock_count) {
+ /* Since there are only up to 2 locks per fop, this xor will change
+ * the order of the locks if fop->first_lock is 1. */
+ link = &fop->locks[fop->locked ^ fop->first_lock];
+
+ if (!ec_lock_assign_owner(link) || !ec_lock_acquire(link)) {
+ break;
+ }
+ }
+
+ ec_resume(fop, 0);
+}
+
+void
+ec_lock_unfreeze(ec_lock_link_t *link)
+{
+ struct list_head list;
+ ec_lock_t *lock;
+ gf_boolean_t destroy = _gf_false;
+
+ lock = link->lock;
+
+ INIT_LIST_HEAD(&list);
+
+ LOCK(&lock->loc.inode->lock);
+
+ /* The lock must be marked to be released here, since we have just released
+ * it and any attempt to assign it to more fops must have added them to the
+ * frozen list. We can only have one active reference here: the one that
+ * is processing this unfreeze. */
+ GF_ASSERT(lock->release && (lock->refs_owners == 1));
+ lock->release = _gf_false;
+ lock->refs_owners = 0;
+
+ lock->acquired = _gf_false;
+
+ /* We are unfreezing a lock. This means that the lock has already been
+ * released. In this state it shouldn't have a pending timer nor have any
+ * owner, and the waiting list should be empty. Only the frozen list can
+ * contain some fop. */
+ GF_ASSERT((lock->timer == NULL) && list_empty(&lock->waiting) &&
+ list_empty(&lock->owners));
+
+ /* We move all frozen fops to the waiting list. */
+ list_splice_init(&lock->frozen, &lock->waiting);
+
+ /* If we don't have any fop waiting nor there are any prepared fops using
+ * this lock, we can finally dispose it. */
+ destroy = list_empty(&lock->waiting) && (lock->refs_pending == 0);
+ if (destroy) {
+ ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock);
+
+ lock->ctx->inode_lock = NULL;
+ } else {
+ ec_trace("LOCK_UNFREEZE", link->fop, "lock=%p", lock);
+
+ ec_lock_wake_shared(lock, &list);
+ }
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ ec_lock_resume_shared(&list);
+
+ if (destroy) {
+ ec_lock_destroy(lock);
+ }
+}
+
+int32_t
+ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_lock_link_t *link = fop->data;
+
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_UNLOCK_FAILED,
+ "entry/inode unlocking failed :(%s)", ec_msg_str(link->fop));
+ } else {
+ ec_trace("UNLOCKED", link->fop, "lock=%p", link->lock);
+ }
+
+ ec_lock_unfreeze(link);
+
+ return 0;
+}
+
+void
+ec_unlock_lock(ec_lock_link_t *link)
+{
+ ec_lock_t *lock;
+ ec_fop_data_t *fop;
+ gf_lkowner_t lk_owner;
+
+ lock = link->lock;
+ fop = link->fop;
+
+ lock->unlock_now = _gf_false;
+ ec_clear_inode_info(fop, lock->loc.inode);
+
+ if ((lock->mask != 0) && lock->acquired) {
+ set_lk_owner_from_ptr(&lk_owner, lock);
+ lock->flock.l_type = F_UNLCK;
+ ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock,
+ lock->loc.inode);
+
+ ec_inodelk(fop->frame, fop->xl, &lk_owner, lock->mask, EC_MINIMUM_ONE,
+ ec_unlocked, link, fop->xl->name, &lock->loc, F_SETLK,
+ &lock->flock, NULL);
+ } else {
+ ec_lock_unfreeze(link);
+ }
+}
+
+void
+ec_inode_bad_inc(inode_t *inode, xlator_t *xl)
+{
+ ec_inode_t *ctx = NULL;
+
+ LOCK(&inode->lock);
+ {
+ ctx = __ec_inode_get(inode, xl);
+ if (ctx == NULL) {
+ goto unlock;
+ }
+ ctx->bad_version++;
+ }
+unlock:
+ UNLOCK(&inode->lock);
+}
+
+int32_t
+ec_update_size_version_done(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_lock_link_t *link;
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
+
+ link = fop->data;
+ lock = link->lock;
+ ctx = lock->ctx;
+
+ if (op_ret < 0) {
+ if (link->lock->fd == NULL) {
+ ec_inode_bad_inc(link->lock->loc.inode, this);
+ } else {
+ ec_inode_bad_inc(link->lock->fd->inode, this);
+ }
+
+ gf_msg(fop->xl->name, fop_log_level(fop->id, op_errno), op_errno,
+ EC_MSG_SIZE_VERS_UPDATE_FAIL,
+ "Failed to update version and size. %s", ec_msg_str(fop));
+ } else {
+ fop->parent->good &= fop->good;
+
+ ec_lock_update_good(lock, fop);
+
+ if (ec_dict_del_array(xattr, EC_XATTR_VERSION, ctx->post_version,
+ EC_VERSION_SIZE) == 0) {
+ ctx->pre_version[0] = ctx->post_version[0];
+ ctx->pre_version[1] = ctx->post_version[1];
+
+ ctx->have_version = _gf_true;
+ }
+ if (ec_dict_del_number(xattr, EC_XATTR_SIZE, &ctx->post_size) == 0) {
+ ctx->pre_size = ctx->post_size;
+
+ ctx->have_size = _gf_true;
+ }
+ if ((ec_dict_del_config(xdata, EC_XATTR_CONFIG, &ctx->config) == 0) &&
+ ec_config_check(fop->xl, &ctx->config)) {
+ ctx->have_config = _gf_true;
+ }
+
+ ctx->have_info = _gf_true;
+ }
+ /* If we are here because of fop's and other than unlock request,
+ * that means we are still holding a lock. That make sure
+ * lock->unlock_now can not be modified.
+ */
+ if (lock->unlock_now) {
+ ec_unlock_lock(fop->data);
+ }
+
+ return 0;
+}
+
+void
+ec_update_size_version(ec_lock_link_t *link, uint64_t *version, uint64_t size,
+ uint64_t *dirty)
+{
+ ec_fop_data_t *fop;
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
+ dict_t *dict = NULL;
+ uintptr_t update_on = 0;
+ int32_t err = -ENOMEM;
+
+ fop = link->fop;
+ lock = link->lock;
+ ctx = lock->ctx;
+
+ ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld",
+ version[0], version[1], size, dirty[0], dirty[1]);
+
+ dict = dict_new();
+ if (dict == NULL) {
+ goto out;
+ }
+
+ /* If we don't have version information or it has been modified, we
+ * update it. */
+ if (!ctx->have_version || (version[0] != 0) || (version[1] != 0)) {
+ err = ec_dict_set_array(dict, EC_XATTR_VERSION, version,
+ EC_VERSION_SIZE);
+ if (err != 0) {
+ goto out;
+ }
+ }
+
+ if (size != 0) {
+ /* If size has been changed, we should already
+ * know the previous size of the file. */
+ GF_ASSERT(ctx->have_size);
+
+ err = ec_dict_set_number(dict, EC_XATTR_SIZE, size);
+ if (err != 0) {
+ goto out;
+ }
+ }
+
+ if (dirty[0] || dirty[1]) {
+ err = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
+ if (err != 0) {
+ goto out;
+ }
+ }
+
+ /* If config information is not known, we request it now. */
+ if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) {
+ /* A failure requesting this xattr is ignored because it's not
+ * absolutely required right now. */
+ (void)ec_dict_set_number(dict, EC_XATTR_CONFIG, 0);
+ }
+
+ fop->frame->root->uid = 0;
+ fop->frame->root->gid = 0;
+
+ update_on = lock->good_mask | lock->healing;
+
+ if (link->lock->fd == NULL) {
+ ec_xattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
+ ec_update_size_version_done, link, &link->lock->loc,
+ GF_XATTROP_ADD_ARRAY64, dict, NULL);
+ } else {
+ ec_fxattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
+ ec_update_size_version_done, link, link->lock->fd,
+ GF_XATTROP_ADD_ARRAY64, dict, NULL);
+ }
+
+ fop->frame->root->uid = fop->uid;
+ fop->frame->root->gid = fop->gid;
+
+ dict_unref(dict);
+
+ return;
+
+out:
+ if (dict != NULL) {
+ dict_unref(dict);
+ }
+
+ ec_fop_set_error(fop, -err);
+
+ gf_msg(fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL,
+ "Unable to update version and size. %s", ec_msg_str(fop));
+
+ if (lock->unlock_now) {
+ ec_unlock_lock(fop->data);
+ }
+}
+
+gf_boolean_t
+ec_update_info(ec_lock_link_t *link)
+{
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
+ uint64_t version[2] = {0, 0};
+ uint64_t dirty[2] = {0, 0};
+ uint64_t size;
+ ec_t *ec = NULL;
+ uintptr_t mask;
+
+ lock = link->lock;
+ ctx = lock->ctx;
+ ec = link->fop->xl->private;
+
+ /* pre_version[*] will be 0 if have_version is false */
+ version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] -
+ ctx->pre_version[EC_DATA_TXN];
+ version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] -
+ ctx->pre_version[EC_METADATA_TXN];
+
+ size = ctx->post_size - ctx->pre_size;
+ /* If we set the dirty flag for update fop, we have to unset it.
+ * If fop has failed on some bricks, leave the dirty as marked. */
+
+ if (lock->unlock_now) {
+ if (version[EC_DATA_TXN]) {
+ /*A data fop will have difference in post and pre version
+ *and for data fop we send writes on healing bricks also */
+ mask = lock->good_mask | lock->healing;
+ } else {
+ mask = lock->good_mask;
+ }
+ /* Ensure that nodes are up while doing final
+ * metadata update.*/
+ if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) {
+ if (ctx->dirty[EC_DATA_TXN] != 0) {
+ dirty[EC_DATA_TXN] = -1;
+ }
+ if (ctx->dirty[EC_METADATA_TXN] != 0) {
+ dirty[EC_METADATA_TXN] = -1;
+ }
+ /*If everything is fine and we already
+ *have version xattr set on entry, there
+ *is no need to update version again*/
+ if (ctx->pre_version[EC_DATA_TXN]) {
+ version[EC_DATA_TXN] = 0;
+ }
+ if (ctx->pre_version[EC_METADATA_TXN]) {
+ version[EC_METADATA_TXN] = 0;
+ }
+ } else {
+ link->optimistic_changelog = _gf_false;
+ ec_set_dirty_flag(link, ctx, dirty);
+ }
+ memset(ctx->dirty, 0, sizeof(ctx->dirty));
+ }
+
+ if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) ||
+ (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) {
+ ec_update_size_version(link, version, size, dirty);
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+void
+ec_unlock_now(ec_lock_link_t *link)
+{
+ ec_lock_t *lock;
+ lock = link->lock;
+
+ ec_trace("UNLOCK_NOW", link->fop, "lock=%p", link->lock);
+ /*At this point, lock is not being used by any fop and
+ *can not be reused by any fop as it is going to be released.
+ *lock->unlock_now can not be modified at any other place.
+ */
+ lock->unlock_now = _gf_true;
+
+ if (!ec_update_info(link)) {
+ ec_unlock_lock(link);
+ }
+
+ ec_resume(link->fop, 0);
+}
+
+void
+ec_lock_release(ec_t *ec, inode_t *inode)
+{
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
+ ec_lock_link_t *timer_link = NULL;
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get(inode, ec->xl);
+ if (ctx == NULL) {
+ goto done;
+ }
+ lock = ctx->inode_lock;
+ if ((lock == NULL) || lock->release) {
+ goto done;
+ }
+
+ gf_msg_debug(ec->xl->name, 0, "Releasing inode %p due to lock contention",
+ inode);
+
+ if (!lock->acquired) {
+ /* This happens if some bricks already got the lock while inodelk is in
+ * progress. Set release to true after lock is acquired*/
+ lock->contention = _gf_true;
+ goto done;
+ }
+
+ /* The lock is not marked to be released, so the frozen list should be
+ * empty. */
+ GF_ASSERT(list_empty(&lock->frozen));
+
+ timer_link = ec_lock_timer_cancel(ec->xl, lock);
+
+ /* We mark the lock to be released as soon as possible. */
+ lock->release = _gf_true;
+
+done:
+ UNLOCK(&inode->lock);
+
+ /* If we have cancelled the timer, we need to start the unlock of the
+ * inode. If there was a timer but we have been unable to cancel it
+ * because it was just triggered, the timer callback will take care
+ * of releasing the inode. */
+ if (timer_link != NULL) {
+ ec_unlock_now(timer_link);
+ }
+}
+
+void
+ec_unlock_timer_add(ec_lock_link_t *link);
+
+void
+ec_unlock_timer_del(ec_lock_link_t *link)
+{
+ ec_lock_t *lock;
+ inode_t *inode;
+ gf_boolean_t now = _gf_false;
+
+ /* If we are here, it means that the timer has expired before having
+ * been cancelled. This guarantees that 'link' is still valid because
+ * the fop that contains it must be pending (if timer cancellation in
+ * ec_lock_assign_owner() fails, the fop is left sleeping).
+ *
+ * At the same time, the fop still has a reference to the lock, so
+ * it must also be valid.
+ */
+ lock = link->lock;
+
+ /* 'lock' must have a valid inode since it can only be destroyed
+ * when the lock itself is destroyed, but we have a reference to the
+ * lock to avoid this.
+ */
+ inode = lock->loc.inode;
+
+ LOCK(&inode->lock);
+
+ if (lock->timer != NULL) {
+ ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
+
+ /* The unlock timer has expired without anyone cancelling it.
+ * This means that it shouldn't have any owner, and the waiting
+ * and frozen lists should be empty. It must have only one
+ * owner reference, but there can be fops being prepared
+ * though.
+ * */
+ GF_ASSERT(!lock->release && (lock->refs_owners == 1) &&
+ list_empty(&lock->owners) && list_empty(&lock->waiting) &&
+ list_empty(&lock->frozen));
+
+ gf_timer_call_cancel(link->fop->xl->ctx, lock->timer);
+ lock->timer = NULL;
+
+ /* Any fop being processed from now on, will need to wait
+ * until the next unlock/lock cycle. */
+ lock->release = now = _gf_true;
+ }
+
+ UNLOCK(&inode->lock);
+
+ if (now) {
+ ec_unlock_now(link);
+ } else {
+ /* The timer has been cancelled just after firing it but before
+ * getting here. This means that another fop has used the lock
+ * and everything should be handled as if this callback were
+ * have not been executed. However we still have an owner
+ * reference.
+ *
+ * We need to release our reference. If this is not the last
+ * reference (the most common case because another fop has
+ * taken another ref) we only need to decrement the counter.
+ * Otherwise we have been delayed enough so that the other fop
+ * has had time to acquire the reference, do its operation and
+ * release it. At the time of releasing it, the fop did found
+ * that the ref counter was > 1 (our reference), so the delayed
+ * unlock timer wasn't started. We need to start it again if we
+ * are the last reference.
+ *
+ * ec_unlock_timer_add() handles both cases.
+ */
+ ec_unlock_timer_add(link);
+
+ /* We need to resume the fop that was waiting for the delayed
+ * unlock.
+ */
+ ec_resume(link->fop, 0);
+ }
+}
+
+void
+ec_unlock_timer_cbk(void *data)
+{
+ ec_unlock_timer_del(data);
+}
+
+static gf_boolean_t
+ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop)
+{
+ /* Fops with no locks at this point mean that they are sent as sub-fops
+ * of other higher level fops. In this case we simply assume that the
+ * parent fop will take correct care of the eager lock. */
+ if (fop->lock_count == 0) {
+ return _gf_true;
+ }
+
+ /* We may have more than one lock, but this only happens in the rename
+ * fop, and both locks will reference an inode of the same type (a
+ * directory in this case), so we only need to check the first lock. */
+ if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
+ return ec->eager_lock;
+ }
+
+ return ec->other_eager_lock;
+}
+
+static uint32_t
+ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock)
+{
+ if (lock->loc.inode->ia_type == IA_IFREG) {
+ return ec->eager_lock_timeout;
+ }
+
+ return ec->other_eager_lock_timeout;
+}
+
+static gf_boolean_t
+ec_lock_delay_create(ec_lock_link_t *link)
+{
+ struct timespec delay;
+ ec_fop_data_t *fop = link->fop;
+ ec_lock_t *lock = link->lock;
+
+ delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock);
+ delay.tv_nsec = 0;
+ lock->timer = gf_timer_call_after(fop->xl->ctx, delay, ec_unlock_timer_cbk,
+ link);
+ if (lock->timer == NULL) {
+ gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
+ EC_MSG_UNLOCK_DELAY_FAILED, "Unable to delay an unlock");
+
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+void
+ec_unlock_timer_add(ec_lock_link_t *link)
+{
+ ec_fop_data_t *fop = link->fop;
+ ec_lock_t *lock = link->lock;
+ gf_boolean_t now = _gf_false;
+
+ LOCK(&lock->loc.inode->lock);
+
+ /* We are trying to unlock the lock. We can have multiple scenarios here,
+ * but all of them need to have lock->timer == NULL:
+ *
+ * 1. There are other owners currently running that can call ec_unlock().
+ *
+ * None of them can have started the timer until the last one. But this
+ * call should be the consequence of this lastest one.
+ *
+ * 2. There are fops in the waiting or frozen lists.
+ *
+ * These fops cannot call ec_unlock(). So we should be here.
+ *
+ * We must reach here with at least one owner reference.
+ */
+ GF_ASSERT((lock->timer == NULL) && (lock->refs_owners > 0));
+
+ /* If the fop detects that a heal is needed, we mark the lock to be
+ * released as soon as possible. */
+ lock->release |= ec_fop_needs_heal(fop);
+
+ if (lock->refs_owners > 1) {
+ ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock);
+
+ /* If there are other owners we cannot do anything else with the lock.
+ * Note that the current fop has already been removed from the owners
+ * list in ec_lock_reuse(). */
+ lock->refs_owners--;
+
+ UNLOCK(&lock->loc.inode->lock);
+ } else if (lock->acquired) {
+ /* There are no other owners and the lock is acquired. If there were
+ * fops waiting, at least one of them should have been promoted to an
+ * owner, so the waiting list should be empty. */
+ GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+ ec_t *ec = fop->xl->private;
+
+ /* If everything goes as expected this fop will be put to sleep until
+ * the timer callback is executed. */
+ ec_sleep(fop);
+
+ /* If the lock needs to be released, or ec is shutting down, do not
+ * delay lock release. */
+ if (!lock->release && !ec->shutdown) {
+ ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
+ lock->release);
+
+ if (!ec_lock_delay_create(link)) {
+ /* We are unable to create a new timer. We immediately release
+ * the lock. */
+ lock->release = now = _gf_true;
+ }
+
+ } else {
+ ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock,
+ lock->release);
+ lock->release = now = _gf_true;
+ }
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ if (now) {
+ ec_unlock_now(link);
+ }
+ } else {
+ /* There are no owners and the lock is not acquired. This can only
+ * happen if a lock attempt has failed and we get to the unlock step
+ * of the fop. As in the previous case, the waiting list must be
+ * empty. */
+ GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+ /* We need to mark the lock to be released to correctly handle fops
+ * that may get in after we release the inode mutex but before
+ * ec_lock_unfreeze() is processed. */
+ lock->release = _gf_true;
+
+ UNLOCK(&lock->loc.inode->lock);
+
+ ec_lock_unfreeze(link);
+ }
+}
+
+void
+ec_unlock(ec_fop_data_t *fop)
+{
+ int32_t i;
+
+ for (i = 0; i < fop->lock_count; i++) {
+ ec_unlock_timer_add(&fop->locks[i]);
+ }
+}
+
+void
+ec_flush_size_version(ec_fop_data_t *fop)
+{
+ GF_ASSERT(fop->lock_count == 1);
+ ec_update_info(&fop->locks[0]);
+}
+
+static void
+ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe,
+ ec_fop_data_t *fop)
+{
+ off_t base;
+
+ /* On write fops, we only update existing fragments if the write has
+ * succeeded. Otherwise, we remove them from the cache. */
+ if ((fop->id == GF_FOP_WRITE) && (fop->answer != NULL) &&
+ (fop->answer->op_ret >= 0)) {
+ base = stripe->frag_offset - fop->frag_range.first;
+ base *= ec->fragments;
+
+ /* We check if the stripe offset falls inside the real region
+ * modified by the write fop (a write request is allowed,
+ * though uncommon, to write less bytes than requested). The
+ * current write fop implementation doesn't allow partial
+ * writes of fragments, so if there's no error, we are sure
+ * that a full stripe has been completely modified or not
+ * touched at all. The value of op_ret may not be a multiple
+ * of the stripe size because it depends on the requested
+ * size by the user, so we update the stripe if the write has
+ * modified at least one byte (meaning ec has written the full
+ * stripe). */
+ if (base < fop->answer->op_ret + fop->head) {
+ memcpy(stripe->data, fop->vector[0].iov_base + base,
+ ec->stripe_size);
+ list_move_tail(&stripe->lru, &stripe_cache->lru);
+
+ GF_ATOMIC_INC(ec->stats.stripe_cache.updates);
+ }
+ } else {
+ stripe->frag_offset = -1;
+ list_move(&stripe->lru, &stripe_cache->lru);
+
+ GF_ATOMIC_INC(ec->stats.stripe_cache.invals);
+ }
+}
+
+static void
+ec_update_cached_stripes(ec_fop_data_t *fop)
+{
+ uint64_t first;
+ uint64_t last;
+ ec_stripe_t *stripe = NULL;
+ ec_inode_t *ctx = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+ inode_t *inode = NULL;
+ struct list_head *temp;
+ struct list_head sentinel;
+
+ first = fop->frag_range.first;
+ /* 'last' represents the first stripe not touched by the operation */
+ last = fop->frag_range.last;
+
+ /* If there are no modified stripes, we don't need to do anything
+ * else. */
+ if (last <= first) {
+ return;
+ }
+
+ if (!fop->use_fd) {
+ inode = fop->loc[0].inode;
+ } else {
+ inode = fop->fd->inode;
+ }
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get(inode, fop->xl);
+ if (ctx == NULL) {
+ goto out;
+ }
+ stripe_cache = &ctx->stripe_cache;
+
+ /* Since we'll be moving elements of the list to the tail, we might
+ * end in an infinite loop. To avoid it, we insert a sentinel element
+ * into the list, so that it will be used to detect when we have
+ * traversed all existing elements once. */
+ list_add_tail(&sentinel, &stripe_cache->lru);
+ temp = stripe_cache->lru.next;
+ while (temp != &sentinel) {
+ stripe = list_entry(temp, ec_stripe_t, lru);
+ temp = temp->next;
+ if ((first <= stripe->frag_offset) && (stripe->frag_offset < last)) {
+ ec_update_stripe(fop->xl->private, stripe_cache, stripe, fop);
+ }
+ }
+ list_del(&sentinel);
+
+out:
+ UNLOCK(&inode->lock);
+}
+
+void
+ec_lock_reuse(ec_fop_data_t *fop)
+{
+ ec_cbk_data_t *cbk;
+ ec_t *ec = NULL;
+ int32_t i, count;
+ gf_boolean_t release = _gf_false;
+ ec = fop->xl->private;
+ cbk = fop->answer;
+
+ if (ec_eager_lock_used(ec, fop) && cbk != NULL) {
+ if (cbk->xdata != NULL) {
+ if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT, &count) ==
+ 0) &&
+ (count > 1)) {
+ release = _gf_true;
+ }
+ if (release) {
+ gf_msg_debug(fop->xl->name, 0, "Lock contention detected");
+ }
+ }
+ } else {
+ /* If eager lock is disabled or if we haven't get
+ * an answer with enough quorum, we always release
+ * the lock. */
+ release = _gf_true;
+ }
+ ec_update_cached_stripes(fop);
+
+ for (i = 0; i < fop->lock_count; i++) {
+ ec_lock_next_owner(&fop->locks[i], cbk, release);
+ }
+}
+
+void
+__ec_manager(ec_fop_data_t *fop, int32_t error)
+{
+ ec_t *ec = fop->xl->private;
+
+ do {
+ ec_trace("MANAGER", fop, "error=%d", error);
+
+ if (!ec_must_wind(fop)) {
+ if (ec->xl_up_count < ec->fragments) {
+ error = ENOTCONN;
+ }
+ }
+
+ if (error != 0) {
+ fop->error = error;
+ fop->state = -fop->state;
+ }
+
+ if ((fop->state == EC_STATE_END) || (fop->state == -EC_STATE_END)) {
+ ec_fop_data_release(fop);
+
+ break;
+ }
+
+ /* At each state, fop must not be used anywhere else and there
+ * shouldn't be any pending subfop going on. */
+ GF_ASSERT(fop->jobs == 0);
+
+ /* While the manager is running we need to avoid that subfops launched
+ * from it could finish and call ec_resume() before the fop->handler
+ * has completed. This could lead to the same manager being executed
+ * by two threads concurrently. ec_check_complete() will take care of
+ * this reference. */
+ fop->jobs = 1;
+
+ fop->state = fop->handler(fop, fop->state);
+ GF_ASSERT(fop->state >= 0);
+
+ error = ec_check_complete(fop, __ec_manager);
+ } while (error >= 0);
+}
+
+void
+ec_manager(ec_fop_data_t *fop, int32_t error)
+{
+ GF_ASSERT(fop->jobs == 0);
+ GF_ASSERT(fop->winds == 0);
+ GF_ASSERT(fop->error == 0);
+
+ if (fop->state == EC_STATE_START) {
+ fop->state = EC_STATE_INIT;
+ }
+
+ __ec_manager(fop, error);
+}
+
+gf_boolean_t
+__ec_is_last_fop(ec_t *ec)
+{
+ if ((list_empty(&ec->pending_fops)) &&
+ (GF_ATOMIC_GET(ec->async_fop_count) == 0)) {
+ return _gf_true;
+ }
+ return _gf_false;
+}
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
new file mode 100644
index 00000000000..51493612ac6
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -0,0 +1,234 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_COMMON_H__
+#define __EC_COMMON_H__
+
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include "ec-data.h"
+
+typedef enum { EC_DATA_TXN, EC_METADATA_TXN } ec_txn_t;
+
+#define EC_FOP_HEAL -1
+#define EC_FOP_FHEAL -2
+
+#define EC_CONFIG_VERSION 0
+
+#define EC_CONFIG_ALGORITHM 0
+
+#define EC_FLAG_LOCK_SHARED 0x0001
+
+#define QUORUM_CBK(fn, fop, frame, cookie, this, op_ret, op_errno, params...) \
+ do { \
+ ec_t *__ec = fop->xl->private; \
+ int32_t __op_ret = 0; \
+ int32_t __op_errno = 0; \
+ int32_t __success_count = gf_bits_count(fop->good); \
+ \
+ __op_ret = op_ret; \
+ __op_errno = op_errno; \
+ if (!fop->parent && frame && \
+ (GF_CLIENT_PID_SELF_HEALD != frame->root->pid) && \
+ __ec->quorum_count && (__success_count < __ec->quorum_count) && \
+ op_ret >= 0) { \
+ __op_ret = -1; \
+ __op_errno = EIO; \
+ gf_msg(__ec->xl->name, GF_LOG_ERROR, 0, \
+ EC_MSG_CHILDS_INSUFFICIENT, \
+ "Insufficient available children for this request " \
+ "(have %d, need %d). %s", \
+ __success_count, __ec->quorum_count, ec_msg_str(fop)); \
+ } \
+ fn(frame, cookie, this, __op_ret, __op_errno, params); \
+ } while (0)
+
+enum _ec_xattrop_flags {
+ EC_FLAG_XATTROP,
+ EC_FLAG_DATA_DIRTY,
+ EC_FLAG_METADATA_DIRTY,
+
+ /* Add any new flag here, before EC_FLAG_MAX. The maximum number of
+ * flags that can be defined is 16. */
+
+ EC_FLAG_MAX
+};
+
+/* We keep two sets of flags. One to determine what's really providing the
+ * current xattrop and the other to know what the parent fop of the xattrop
+ * needs to proceed. It might happen that a fop needs some information that
+ * is being already requested by a previous fop. The two sets are stored
+ * contiguously. */
+
+#define EC_FLAG_NEEDS(_flag) (1 << (_flag))
+#define EC_FLAG_PROVIDES(_flag) (1 << ((_flag) + EC_FLAG_MAX))
+
+#define EC_NEEDED_FLAGS(_flags) ((_flags) & ((1 << EC_FLAG_MAX) - 1))
+
+#define EC_PROVIDED_FLAGS(_flags) EC_NEEDED_FLAGS((_flags) >> EC_FLAG_MAX)
+
+#define EC_FLAGS_HAVE(_flags, _flag) (((_flags) & (1 << (_flag))) != 0)
+
+#define EC_SELFHEAL_BIT 62
+
+#define EC_MINIMUM_ONE (1 << 6)
+#define EC_MINIMUM_MIN (2 << 6)
+#define EC_MINIMUM_ALL (3 << 6)
+#define EC_FOP_NO_PROPAGATE_ERROR (1 << 8)
+#define EC_FOP_MINIMUM(_flags) ((_flags)&255)
+#define EC_FOP_FLAGS(_flags) ((_flags) & ~255)
+
+#define EC_UPDATE_DATA 1
+#define EC_UPDATE_META 2
+#define EC_QUERY_INFO 4
+#define EC_INODE_SIZE 8
+
+#define EC_STATE_START 0
+#define EC_STATE_END 0
+#define EC_STATE_INIT 1
+#define EC_STATE_LOCK 2
+#define EC_STATE_DISPATCH 3
+#define EC_STATE_PREPARE_ANSWER 4
+#define EC_STATE_REPORT 5
+#define EC_STATE_LOCK_REUSE 6
+#define EC_STATE_UNLOCK 7
+
+#define EC_STATE_DELAYED_START 100
+
+#define EC_STATE_HEAL_ENTRY_LOOKUP 200
+#define EC_STATE_HEAL_ENTRY_PREPARE 201
+#define EC_STATE_HEAL_PRE_INODELK_LOCK 202
+#define EC_STATE_HEAL_PRE_INODE_LOOKUP 203
+#define EC_STATE_HEAL_XATTRIBUTES_REMOVE 204
+#define EC_STATE_HEAL_XATTRIBUTES_SET 205
+#define EC_STATE_HEAL_ATTRIBUTES 206
+#define EC_STATE_HEAL_OPEN 207
+#define EC_STATE_HEAL_REOPEN_FD 208
+#define EC_STATE_HEAL_UNLOCK 209
+#define EC_STATE_HEAL_UNLOCK_ENTRY 210
+#define EC_STATE_HEAL_DATA_LOCK 211
+#define EC_STATE_HEAL_DATA_COPY 212
+#define EC_STATE_HEAL_DATA_UNLOCK 213
+#define EC_STATE_HEAL_POST_INODELK_LOCK 214
+#define EC_STATE_HEAL_POST_INODE_LOOKUP 215
+#define EC_STATE_HEAL_SETATTR 216
+#define EC_STATE_HEAL_POST_INODELK_UNLOCK 217
+#define EC_STATE_HEAL_DISPATCH 218
+
+/* Value to cover the full range of a file */
+#define EC_RANGE_FULL ((uint64_t)LLONG_MAX + 1)
+
+gf_boolean_t
+ec_dispatch_one_retry(ec_fop_data_t *fop, ec_cbk_data_t **cbk);
+void
+ec_dispatch_next(ec_fop_data_t *fop, uint32_t idx);
+
+void
+ec_complete(ec_fop_data_t *fop);
+
+void
+ec_update_good(ec_fop_data_t *fop, uintptr_t good);
+
+void
+ec_fop_set_error(ec_fop_data_t *fop, int32_t error);
+
+void
+__ec_fop_set_error(ec_fop_data_t *fop, int32_t error);
+
+ec_cbk_data_t *
+ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro);
+
+gf_boolean_t
+ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro);
+
+void
+ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
+ off_t fl_start, uint64_t fl_size);
+void
+ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
+ uint32_t flags);
+void
+ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, off_t fl_start,
+ uint64_t fl_size);
+void
+ec_lock(ec_fop_data_t *fop);
+void
+ec_lock_reuse(ec_fop_data_t *fop);
+void
+ec_unlock(ec_fop_data_t *fop);
+void
+ec_lock_release(ec_t *ec, inode_t *inode);
+
+gf_boolean_t
+ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size);
+gf_boolean_t
+__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size);
+gf_boolean_t
+ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size);
+gf_boolean_t
+__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size);
+void
+ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode);
+
+void
+ec_flush_size_version(ec_fop_data_t *fop);
+
+void
+ec_dispatch_all(ec_fop_data_t *fop);
+void
+ec_dispatch_inc(ec_fop_data_t *fop);
+void
+ec_dispatch_min(ec_fop_data_t *fop);
+void
+ec_dispatch_one(ec_fop_data_t *fop);
+
+void
+ec_succeed_all(ec_fop_data_t *fop);
+
+void
+ec_sleep(ec_fop_data_t *fop);
+void
+ec_resume(ec_fop_data_t *fop, int32_t error);
+void
+ec_resume_parent(ec_fop_data_t *fop);
+
+void
+ec_manager(ec_fop_data_t *fop, int32_t error);
+gf_boolean_t
+ec_is_recoverable_error(int32_t op_errno);
+void
+ec_handle_healers_done(ec_fop_data_t *fop);
+
+int32_t
+ec_heal_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *locked_on, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal);
+int32_t
+ec_get_heal_info(xlator_t *this, loc_t *loc, dict_t **dict);
+
+int32_t
+ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+void
+ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index,
+ int32_t ret_status);
+gf_boolean_t
+ec_is_entry_healing(ec_fop_data_t *fop);
+void
+ec_set_entry_healing(ec_fop_data_t *fop);
+void
+ec_reset_entry_healing(ec_fop_data_t *fop);
+char *
+ec_msg_str(ec_fop_data_t *fop);
+gf_boolean_t
+__ec_is_last_fop(ec_t *ec);
+void
+ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop);
+#endif /* __EC_COMMON_H__ */
diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c
new file mode 100644
index 00000000000..06388833546
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-data.c
@@ -0,0 +1,288 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-data.h"
+#include "ec-messages.h"
+
+ec_cbk_data_t *
+ec_cbk_data_allocate(call_frame_t *frame, xlator_t *this, ec_fop_data_t *fop,
+ int32_t id, int32_t idx, int32_t op_ret, int32_t op_errno)
+{
+ ec_cbk_data_t *cbk;
+ ec_t *ec = this->private;
+
+ if (fop->xl != this) {
+ gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_XLATOR_MISMATCH,
+ "Mismatching xlators between request "
+ "and answer (req=%s, ans=%s).",
+ fop->xl->name, this->name);
+
+ return NULL;
+ }
+ if (fop->frame != frame) {
+ gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_FRAME_MISMATCH,
+ "Mismatching frames between request "
+ "and answer (req=%p, ans=%p).",
+ fop->frame, frame);
+
+ return NULL;
+ }
+ if (fop->id != id) {
+ gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_FOP_MISMATCH,
+ "Mismatching fops between request "
+ "and answer (req=%d, ans=%d).",
+ fop->id, id);
+
+ return NULL;
+ }
+
+ cbk = mem_get0(ec->cbk_pool);
+ if (cbk == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to allocate memory for an "
+ "answer.");
+ return NULL;
+ }
+
+ cbk->fop = fop;
+ cbk->idx = idx;
+ cbk->mask = 1ULL << idx;
+ cbk->count = 1;
+ cbk->op_ret = op_ret;
+ cbk->op_errno = op_errno;
+ INIT_LIST_HEAD(&cbk->entries.list);
+
+ LOCK(&fop->lock);
+
+ list_add_tail(&cbk->answer_list, &fop->answer_list);
+
+ UNLOCK(&fop->lock);
+
+ return cbk;
+}
+
+void
+ec_cbk_data_destroy(ec_cbk_data_t *cbk)
+{
+ if (cbk->xdata != NULL) {
+ dict_unref(cbk->xdata);
+ }
+ if (cbk->dict != NULL) {
+ dict_unref(cbk->dict);
+ }
+ if (cbk->inode != NULL) {
+ inode_unref(cbk->inode);
+ }
+ if (cbk->fd != NULL) {
+ fd_unref(cbk->fd);
+ }
+ if (cbk->buffers != NULL) {
+ iobref_unref(cbk->buffers);
+ }
+ GF_FREE(cbk->vector);
+ gf_dirent_free(&cbk->entries);
+ GF_FREE(cbk->str);
+
+ mem_put(cbk);
+}
+
+ec_fop_data_t *
+ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id,
+ uint32_t flags, uintptr_t target, uint32_t fop_flags,
+ ec_wind_f wind, ec_handler_f handler, ec_cbk_t cbks,
+ void *data)
+{
+ ec_fop_data_t *fop, *parent;
+ ec_t *ec = this->private;
+
+ fop = mem_get0(ec->fop_pool);
+ if (fop == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to allocate memory for a "
+ "request.");
+
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&fop->cbk_list);
+ INIT_LIST_HEAD(&fop->healer);
+ INIT_LIST_HEAD(&fop->answer_list);
+ INIT_LIST_HEAD(&fop->pending_list);
+ INIT_LIST_HEAD(&fop->locks[0].owner_list);
+ INIT_LIST_HEAD(&fop->locks[0].wait_list);
+ INIT_LIST_HEAD(&fop->locks[1].owner_list);
+ INIT_LIST_HEAD(&fop->locks[1].wait_list);
+
+ fop->xl = this;
+ fop->req_frame = frame;
+
+ /* fops need a private frame to be able to execute some postop operations
+ * even if the original fop has completed and reported back to the upper
+ * xlator and it has destroyed the base frame.
+ *
+ * TODO: minimize usage of private frames. Reuse req_frame as much as
+ * possible.
+ */
+ if (frame != NULL) {
+ fop->frame = copy_frame(frame);
+ } else {
+ fop->frame = create_frame(this, this->ctx->pool);
+ }
+ if (fop->frame == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to create a private frame "
+ "for a request");
+
+ mem_put(fop);
+
+ return NULL;
+ }
+ fop->id = id;
+ fop->refs = 1;
+
+ fop->flags = flags;
+ fop->minimum = EC_FOP_MINIMUM(fop_flags);
+ fop->fop_flags = EC_FOP_FLAGS(fop_flags);
+ fop->mask = target;
+
+ fop->wind = wind;
+ fop->handler = handler;
+ fop->cbks = cbks;
+ fop->data = data;
+
+ fop->uid = fop->frame->root->uid;
+ fop->gid = fop->frame->root->gid;
+
+ LOCK_INIT(&fop->lock);
+
+ fop->frame->local = fop;
+
+ if (frame != NULL) {
+ parent = frame->local;
+ if (parent != NULL) {
+ ec_sleep(parent);
+ }
+
+ fop->parent = parent;
+ }
+
+ LOCK(&ec->lock);
+
+ list_add_tail(&fop->pending_list, &ec->pending_fops);
+
+ UNLOCK(&ec->lock);
+
+ return fop;
+}
+
+void
+ec_fop_data_acquire(ec_fop_data_t *fop)
+{
+ LOCK(&fop->lock);
+
+ ec_trace("ACQUIRE", fop, "");
+
+ fop->refs++;
+
+ UNLOCK(&fop->lock);
+}
+
+static void
+ec_handle_last_pending_fop_completion(ec_fop_data_t *fop, gf_boolean_t *notify)
+{
+ ec_t *ec = fop->xl->private;
+
+ *notify = _gf_false;
+
+ if (!list_empty(&fop->pending_list)) {
+ LOCK(&ec->lock);
+ {
+ list_del_init(&fop->pending_list);
+ *notify = __ec_is_last_fop(ec);
+ }
+ UNLOCK(&ec->lock);
+ }
+}
+
+void
+ec_fop_cleanup(ec_fop_data_t *fop)
+{
+ ec_cbk_data_t *cbk, *tmp;
+
+ list_for_each_entry_safe(cbk, tmp, &fop->answer_list, answer_list)
+ {
+ list_del_init(&cbk->answer_list);
+
+ ec_cbk_data_destroy(cbk);
+ }
+ INIT_LIST_HEAD(&fop->cbk_list);
+
+ fop->answer = NULL;
+}
+
+void
+ec_fop_data_release(ec_fop_data_t *fop)
+{
+ ec_t *ec = NULL;
+ int32_t refs;
+ gf_boolean_t notify = _gf_false;
+
+ LOCK(&fop->lock);
+
+ ec_trace("RELEASE", fop, "");
+
+ GF_ASSERT(fop->refs > 0);
+ refs = --fop->refs;
+
+ UNLOCK(&fop->lock);
+
+ if (refs == 0) {
+ fop->frame->local = NULL;
+ STACK_DESTROY(fop->frame->root);
+
+ LOCK_DESTROY(&fop->lock);
+
+ if (fop->xdata != NULL) {
+ dict_unref(fop->xdata);
+ }
+ if (fop->dict != NULL) {
+ dict_unref(fop->dict);
+ }
+ if (fop->inode != NULL) {
+ inode_unref(fop->inode);
+ }
+ if (fop->fd != NULL) {
+ fd_unref(fop->fd);
+ }
+ if (fop->buffers != NULL) {
+ iobref_unref(fop->buffers);
+ }
+ GF_FREE(fop->vector);
+ GF_FREE(fop->str[0]);
+ GF_FREE(fop->str[1]);
+ loc_wipe(&fop->loc[0]);
+ loc_wipe(&fop->loc[1]);
+ GF_FREE(fop->errstr);
+
+ ec_resume_parent(fop);
+
+ ec_fop_cleanup(fop);
+
+ ec = fop->xl->private;
+ ec_handle_last_pending_fop_completion(fop, &notify);
+ ec_handle_healers_done(fop);
+ mem_put(fop);
+ if (notify) {
+ ec_pending_fops_completed(ec);
+ }
+ }
+}
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
new file mode 100644
index 00000000000..c8a74ffe1ed
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -0,0 +1,35 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_DATA_H__
+#define __EC_DATA_H__
+
+#include "ec-types.h"
+
+ec_cbk_data_t *
+ec_cbk_data_allocate(call_frame_t *frame, xlator_t *this, ec_fop_data_t *fop,
+ int32_t id, int32_t idx, int32_t op_ret, int32_t op_errno);
+ec_fop_data_t *
+ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id,
+ uint32_t flags, uintptr_t target, uint32_t fop_flags,
+ ec_wind_f wind, ec_handler_f handler, ec_cbk_t cbks,
+ void *data);
+void
+ec_fop_data_acquire(ec_fop_data_t *fop);
+void
+ec_fop_data_release(ec_fop_data_t *fop);
+
+void
+ec_fop_cleanup(ec_fop_data_t *fop);
+
+void
+ec_pending_fops_completed(ec_t *ec);
+
+#endif /* __EC_DATA_H__ */
diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c
new file mode 100644
index 00000000000..f71dcfac293
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-dir-read.c
@@ -0,0 +1,647 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-fops.h"
+
+/****************************************************************
+ *
+ * File Operation: opendir
+ *
+ ***************************************************************/
+
+int32_t
+ec_combine_opendir(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ if (dst->fd != src->fd) {
+ gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_FD_MISMATCH,
+ "Mismatching fd in answers "
+ "of 'GF_FOP_OPENDIR': %p <-> %p",
+ dst->fd, src->fd);
+
+ return 0;
+ }
+
+ return 1;
+}
+
+int32_t
+ec_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_OPENDIR, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ if (fd != NULL) {
+ cbk->fd = fd_ref(fd);
+ if (cbk->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, ec_combine_opendir);
+
+ ec_update_fd_status(fd, this, idx, op_ret);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_opendir(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_opendir_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->opendir,
+ &fop->loc[0], fop->fd, fop->xdata);
+}
+
+int32_t
+ec_manager_opendir(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+ ec_fd_t *ctx;
+ int32_t err;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ LOCK(&fop->fd->lock);
+
+ ctx = __ec_fd_get(fop->fd, fop->xl);
+ if (ctx == NULL) {
+ UNLOCK(&fop->fd->lock);
+
+ fop->error = ENOMEM;
+
+ return EC_STATE_REPORT;
+ }
+ if (!ctx->loc.inode) {
+ err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]);
+ if (err != 0) {
+ UNLOCK(&fop->fd->lock);
+
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+ }
+
+ UNLOCK(&fop->fd->lock);
+
+ /* Fall through */
+
+ case EC_STATE_LOCK:
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_true);
+ if (cbk != NULL) {
+ /* Save which subvolumes successfully opened the directory.
+ * If ctx->open is 0, it means that readdir cannot be
+ * processed in this directory.
+ */
+ LOCK(&fop->fd->lock);
+
+ ctx = __ec_fd_get(fop->fd, fop->xl);
+ if (ctx != NULL) {
+ ctx->open |= cbk->mask;
+ }
+
+ UNLOCK(&fop->fd->lock);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.opendir != NULL) {
+ fop->cbks.opendir(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->fd, cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.opendir != NULL) {
+ fop->cbks.opendir(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_opendir_cbk_t func, void *data, loc_t *loc,
+ fd_t *fd, dict_t *xdata)
+{
+ ec_cbk_t callback = {.opendir = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(OPENDIR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_OPENDIR, EC_FLAG_LOCK_SHARED,
+ target, fop_flags, ec_wind_opendir,
+ ec_manager_opendir, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
+
+/* Returns -1 if client_id is invalid else index of child subvol in xl_list */
+int
+ec_deitransform(xlator_t *this, off_t offset)
+{
+ int idx = -1;
+ int client_id = -1;
+ ec_t *ec = this->private;
+ char id[32] = {0};
+ int err;
+
+ client_id = gf_deitransform(this, offset);
+ sprintf(id, "%d", client_id);
+ err = dict_get_int32(ec->leaf_to_subvolid, id, &idx);
+ if (err < 0) {
+ idx = err;
+ goto out;
+ }
+
+out:
+ if (idx < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_REQUEST,
+ "Invalid index %d in readdirp request", client_id);
+ idx = -EINVAL;
+ }
+ return idx;
+}
+
+/* FOP: readdir */
+
+void
+ec_adjust_readdirp(ec_t *ec, int32_t idx, gf_dirent_t *entries)
+{
+ gf_dirent_t *entry;
+
+ list_for_each_entry(entry, &entries->list, list)
+ {
+ if (!entry->inode)
+ continue;
+
+ if (entry->d_stat.ia_type == IA_IFREG) {
+ if ((entry->dict == NULL) ||
+ (ec_dict_del_number(entry->dict, EC_XATTR_SIZE,
+ &entry->d_stat.ia_size) != 0)) {
+ inode_unref(entry->inode);
+ entry->inode = NULL;
+ } else {
+ ec_iatt_rebuild(ec, &entry->d_stat, 1, 1);
+ }
+ }
+ }
+}
+
+int32_t
+ec_common_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret,
+ op_errno);
+ if (cbk) {
+ if (xdata)
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->op_ret >= 0)
+ list_splice_init(&entries->list, &cbk->entries.list);
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_readdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_common_readdir_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->readdir,
+ fop->fd, fop->size, fop->offset, fop->xdata);
+}
+
+int32_t
+ec_manager_readdir(ec_fop_data_t *fop, int32_t state)
+{
+ ec_fd_t *ctx = NULL;
+ ec_cbk_data_t *cbk = NULL;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ /* Return error if opendir has not been successfully called on
+ * any subvolume. */
+ ctx = ec_fd_get(fop->fd, fop->xl);
+ if (ctx == NULL) {
+ fop->error = ENOMEM;
+ } else if (ctx->open == 0) {
+ fop->error = EBADFD;
+ }
+
+ if (fop->error) {
+ gf_msg(fop->xl->name, GF_LOG_ERROR, fop->error,
+ EC_MSG_INVALID_REQUEST, "EC is not winding readdir: %s",
+ ec_msg_str(fop));
+ return EC_STATE_REPORT;
+ }
+
+ if (fop->id == GF_FOP_READDIRP) {
+ int32_t err;
+
+ if (fop->xdata == NULL) {
+ fop->xdata = dict_new();
+ if (fop->xdata == NULL) {
+ fop->error = ENOMEM;
+
+ return EC_STATE_REPORT;
+ }
+ }
+
+ err = dict_set_uint64(fop->xdata, EC_XATTR_SIZE, 0);
+ if (err != 0) {
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+ }
+
+ if (fop->offset != 0) {
+ /* Non-zero offset is irrecoverable error as the offset may not
+ * be valid on other bricks*/
+ int32_t idx = -1;
+
+ idx = ec_deitransform(fop->xl, fop->offset);
+
+ if (idx < 0) {
+ fop->error = -idx;
+ return EC_STATE_REPORT;
+ }
+ fop->mask &= 1ULL << idx;
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
+ ec_lock(fop);
+ }
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_one(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ if (ec_dispatch_one_retry(fop, &cbk)) {
+ return EC_STATE_DISPATCH;
+ }
+
+ if ((cbk != NULL) && (cbk->op_ret > 0) &&
+ (fop->id == GF_FOP_READDIRP)) {
+ ec_adjust_readdirp(fop->xl->private, cbk->idx, &cbk->entries);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+ GF_ASSERT(cbk);
+ if (fop->id == GF_FOP_READDIR) {
+ if (fop->cbks.readdir != NULL) {
+ fop->cbks.readdir(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, &cbk->entries, cbk->xdata);
+ }
+ } else {
+ if (fop->cbks.readdirp != NULL) {
+ fop->cbks.readdirp(fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno,
+ &cbk->entries, cbk->xdata);
+ }
+ }
+ if (fop->offset == 0)
+ return EC_STATE_LOCK_REUSE;
+ else
+ return EC_STATE_END;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ if (fop->id == GF_FOP_READDIR) {
+ if (fop->cbks.readdir != NULL) {
+ fop->cbks.readdir(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL, NULL);
+ }
+ } else {
+ if (fop->cbks.readdirp != NULL) {
+ fop->cbks.readdirp(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL, NULL);
+ }
+ }
+ if (fop->offset == 0)
+ return EC_STATE_LOCK_REUSE;
+ else
+ return EC_STATE_END;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ GF_ASSERT(fop->offset == 0);
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ GF_ASSERT(fop->offset == 0);
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_readdir_cbk_t func, void *data, fd_t *fd,
+ size_t size, off_t offset, dict_t *xdata)
+{
+ ec_cbk_t callback = {.readdir = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(READDIR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_READDIR, EC_FLAG_LOCK_SHARED,
+ target, fop_flags, ec_wind_readdir,
+ ec_manager_readdir, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->size = size;
+ fop->offset = offset;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
+
+/* FOP: readdirp */
+
+void
+ec_wind_readdirp(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_common_readdir_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->readdirp,
+ fop->fd, fop->size, fop->offset, fop->xdata);
+}
+
+void
+ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_readdirp_cbk_t func, void *data, fd_t *fd,
+ size_t size, off_t offset, dict_t *xdata)
+{
+ ec_cbk_t callback = {.readdirp = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(READDIRP) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(
+ frame, this, GF_FOP_READDIRP, EC_FLAG_LOCK_SHARED, target, fop_flags,
+ ec_wind_readdirp, ec_manager_readdir, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->size = size;
+ fop->offset = offset;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c
new file mode 100644
index 00000000000..53d27d895c3
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-dir-write.c
@@ -0,0 +1,1487 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-method.h"
+#include "ec-fops.h"
+
+int
+ec_dir_write_cbk(call_frame_t *frame, xlator_t *this, void *cookie, int op_ret,
+ int op_errno, struct iatt *poststat, struct iatt *preparent,
+ struct iatt *postparent, struct iatt *preparent2,
+ struct iatt *postparent2, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int i = 0;
+ int idx = 0;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+ idx = (long)cookie;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret,
+ op_errno);
+ if (!cbk)
+ goto out;
+
+ if (xdata)
+ cbk->xdata = dict_ref(xdata);
+
+ if (op_ret < 0)
+ goto out;
+
+ if (poststat)
+ cbk->iatt[i++] = *poststat;
+
+ if (preparent)
+ cbk->iatt[i++] = *preparent;
+
+ if (postparent)
+ cbk->iatt[i++] = *postparent;
+
+ if (preparent2)
+ cbk->iatt[i++] = *preparent2;
+
+ if (postparent2)
+ cbk->iatt[i++] = *postparent2;
+
+out:
+ if (cbk)
+ ec_combine(cbk, ec_combine_write);
+
+ if (fop)
+ ec_complete(fop);
+ return 0;
+}
+
+/* FOP: create */
+
+int32_t
+ec_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
+}
+
+void
+ec_wind_create(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_create_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->create,
+ &fop->loc[0], fop->int32, fop->mode[0], fop->mode[1],
+ fop->fd, fop->xdata);
+}
+
+int32_t
+ec_manager_create(ec_fop_data_t *fop, int32_t state)
+{
+ ec_config_t config;
+ ec_t *ec;
+ ec_cbk_data_t *cbk;
+ ec_fd_t *ctx;
+ uint64_t version[2] = {0, 0};
+ int32_t err;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ LOCK(&fop->fd->lock);
+
+ ctx = __ec_fd_get(fop->fd, fop->xl);
+ if (ctx == NULL) {
+ UNLOCK(&fop->fd->lock);
+
+ fop->error = ENOMEM;
+
+ return EC_STATE_REPORT;
+ }
+ err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]);
+ if (err != 0) {
+ UNLOCK(&fop->fd->lock);
+
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+
+ ctx->flags = fop->int32;
+
+ UNLOCK(&fop->fd->lock);
+
+ if (fop->xdata == NULL) {
+ fop->xdata = dict_new();
+ if (fop->xdata == NULL) {
+ fop->error = ENOMEM;
+
+ return EC_STATE_REPORT;
+ }
+ }
+
+ ec = fop->xl->private;
+
+ config.version = EC_CONFIG_VERSION;
+ config.algorithm = EC_CONFIG_ALGORITHM;
+ config.gf_word_size = EC_GF_BITS;
+ config.bricks = ec->nodes;
+ config.redundancy = ec->redundancy;
+ config.chunk_size = EC_METHOD_CHUNK_SIZE;
+
+ err = ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, &config);
+ if (err != 0) {
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+ err = ec_dict_set_array(fop->xdata, EC_XATTR_VERSION, version,
+ EC_VERSION_SIZE);
+ if (err != 0) {
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+ err = ec_dict_set_number(fop->xdata, EC_XATTR_SIZE, 0);
+ if (err != 0) {
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+
+ /* We need to write to specific offsets on the bricks, so we
+ * need to remove O_APPEND from flags (if present) */
+ fop->int32 &= ~O_APPEND;
+
+ /* Fall through */
+
+ case EC_STATE_LOCK:
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+ EC_UPDATE_DATA | EC_UPDATE_META);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ int32_t err;
+
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count);
+
+ err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
+ &cbk->iatt[0]);
+ if (!ec_cbk_set_error(cbk, -err, _gf_false)) {
+ LOCK(&fop->fd->lock);
+
+ ctx = __ec_fd_get(fop->fd, fop->xl);
+ if (ctx != NULL) {
+ ctx->open |= cbk->mask;
+ }
+
+ UNLOCK(&fop->fd->lock);
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.create != NULL) {
+ QUORUM_CBK(fop->cbks.create, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, fop->fd,
+ fop->loc[0].inode, &cbk->iatt[0], &cbk->iatt[1],
+ &cbk->iatt[2], cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.create != NULL) {
+ fop->cbks.create(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL, NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_create_cbk_t func, void *data, loc_t *loc,
+ int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ ec_cbk_t callback = {.create = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(CREATE) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_CREATE, 0, target, fop_flags,
+ ec_wind_create, ec_manager_create, callback,
+ data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->int32 = flags;
+ fop->mode[0] = mode;
+ fop->mode[1] = umask;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: link */
+
+int32_t
+ec_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
+}
+
+void
+ec_wind_link(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_link_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->link,
+ &fop->loc[0], &fop->loc[1], fop->xdata);
+}
+
+int32_t
+ec_manager_link(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ec_lock_prepare_parent_inode(
+ fop, &fop->loc[1], &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META | EC_INODE_SIZE);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ int32_t err;
+
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count);
+
+ if (cbk->iatt[0].ia_type == IA_IFREG) {
+ cbk->iatt[0].ia_size = fop->locks[0].size;
+ }
+
+ err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
+ &cbk->iatt[0]);
+ ec_cbk_set_error(cbk, -err, _gf_false);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.link != NULL) {
+ QUORUM_CBK(fop->cbks.link, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+ &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+ cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.link != NULL) {
+ fop->cbks.link(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_link_cbk_t func, void *data, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ ec_cbk_t callback = {.link = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(LINK) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_LINK, 0, target, fop_flags,
+ ec_wind_link, ec_manager_link, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ if (oldloc != NULL) {
+ if (loc_copy(&fop->loc[0], oldloc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (newloc != NULL) {
+ if (loc_copy(&fop->loc[1], newloc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: mkdir */
+
+int32_t
+ec_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
+}
+
+void
+ec_wind_mkdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_mkdir_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->mkdir,
+ &fop->loc[0], fop->mode[0], fop->mode[1], fop->xdata);
+}
+
+int32_t
+ec_manager_mkdir(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+ uint64_t version[2] = {0, 0};
+ int32_t err;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ if (fop->xdata == NULL) {
+ fop->xdata = dict_new();
+ if (fop->xdata == NULL) {
+ fop->error = ENOMEM;
+
+ return EC_STATE_REPORT;
+ }
+ }
+
+ err = ec_dict_set_array(fop->xdata, EC_XATTR_VERSION, version,
+ EC_VERSION_SIZE);
+ if (err != 0) {
+ fop->error = -err;
+ return EC_STATE_REPORT;
+ }
+
+ /* Fall through */
+
+ case EC_STATE_LOCK:
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+ EC_UPDATE_DATA | EC_UPDATE_META);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ int32_t err;
+
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count);
+
+ err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
+ &cbk->iatt[0]);
+ ec_cbk_set_error(cbk, -err, _gf_false);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.mkdir != NULL) {
+ QUORUM_CBK(fop->cbks.mkdir, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+ &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+ cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ cbk = fop->answer;
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.mkdir != NULL) {
+ fop->cbks.mkdir(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL, NULL,
+ ((cbk) ? cbk->xdata : NULL));
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_mkdir_cbk_t func, void *data, loc_t *loc,
+ mode_t mode, mode_t umask, dict_t *xdata)
+{
+ ec_cbk_t callback = {.mkdir = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(MKDIR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_MKDIR, 0, target, fop_flags,
+ ec_wind_mkdir, ec_manager_mkdir, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->mode[0] = mode;
+ fop->mode[1] = umask;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: mknod */
+
+int32_t
+ec_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
+}
+
+void
+ec_wind_mknod(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_mknod_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->mknod,
+ &fop->loc[0], fop->mode[0], fop->dev, fop->mode[1],
+ fop->xdata);
+}
+
+int32_t
+ec_manager_mknod(ec_fop_data_t *fop, int32_t state)
+{
+ ec_config_t config;
+ ec_t *ec;
+ ec_cbk_data_t *cbk;
+ uint64_t version[2] = {0, 0};
+
+ switch (state) {
+ case EC_STATE_INIT:
+ if (S_ISREG(fop->mode[0])) {
+ int32_t err;
+
+ if (fop->xdata == NULL) {
+ fop->xdata = dict_new();
+ if (fop->xdata == NULL) {
+ fop->error = ENOMEM;
+
+ return EC_STATE_REPORT;
+ }
+ }
+
+ ec = fop->xl->private;
+
+ config.version = EC_CONFIG_VERSION;
+ config.algorithm = EC_CONFIG_ALGORITHM;
+ config.gf_word_size = EC_GF_BITS;
+ config.bricks = ec->nodes;
+ config.redundancy = ec->redundancy;
+ config.chunk_size = EC_METHOD_CHUNK_SIZE;
+
+ err = ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, &config);
+ if (err != 0) {
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+ err = ec_dict_set_array(fop->xdata, EC_XATTR_VERSION, version,
+ EC_VERSION_SIZE);
+ if (err != 0) {
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+ err = ec_dict_set_number(fop->xdata, EC_XATTR_SIZE, 0);
+ if (err != 0) {
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+ }
+
+ /* Fall through */
+
+ case EC_STATE_LOCK:
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+ EC_UPDATE_DATA | EC_UPDATE_META);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ int32_t err;
+
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count);
+
+ err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
+ &cbk->iatt[0]);
+ ec_cbk_set_error(cbk, -err, _gf_false);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.mknod != NULL) {
+ QUORUM_CBK(fop->cbks.mknod, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+ &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+ cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.mknod != NULL) {
+ fop->cbks.mknod(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_mknod_cbk_t func, void *data, loc_t *loc,
+ mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata)
+{
+ ec_cbk_t callback = {.mknod = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(MKNOD) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_MKNOD, 0, target, fop_flags,
+ ec_wind_mknod, ec_manager_mknod, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->mode[0] = mode;
+ fop->dev = rdev;
+ fop->mode[1] = umask;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: rename */
+
+int32_t
+ec_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf, struct iatt *preoldparent,
+ struct iatt *postoldparent, struct iatt *prenewparent,
+ struct iatt *postnewparent, dict_t *xdata)
+{
+ return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+ preoldparent, postoldparent, prenewparent,
+ postnewparent, xdata);
+}
+
+void
+ec_wind_rename(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_rename_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->rename,
+ &fop->loc[0], &fop->loc[1], fop->xdata);
+}
+
+int32_t
+ec_manager_rename(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ec_lock_prepare_parent_inode(
+ fop, &fop->loc[0], &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META | EC_INODE_SIZE);
+ ec_lock_prepare_parent_inode(fop, &fop->loc[1], NULL,
+ EC_UPDATE_DATA | EC_UPDATE_META);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 5, cbk->count);
+
+ if (cbk->iatt[0].ia_type == IA_IFREG) {
+ cbk->iatt[0].ia_size = fop->locks[0].size;
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.rename != NULL) {
+ QUORUM_CBK(fop->cbks.rename, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], &cbk->iatt[2], &cbk->iatt[3],
+ &cbk->iatt[4], cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.rename != NULL) {
+ fop->cbks.rename(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL, NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_rename_cbk_t func, void *data, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ ec_cbk_t callback = {.rename = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(RENAME) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_RENAME, 0, target, fop_flags,
+ ec_wind_rename, ec_manager_rename, callback,
+ data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ if (oldloc != NULL) {
+ if (loc_copy(&fop->loc[0], oldloc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (newloc != NULL) {
+ if (loc_copy(&fop->loc[1], newloc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: rmdir */
+
+int32_t
+ec_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
+}
+
+void
+ec_wind_rmdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_rmdir_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->rmdir,
+ &fop->loc[0], fop->int32, fop->xdata);
+}
+
+int32_t
+ec_manager_rmdir(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+ EC_UPDATE_DATA | EC_UPDATE_META);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ ec_fop_prepare_answer(fop, _gf_false);
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.rmdir != NULL) {
+ QUORUM_CBK(fop->cbks.rmdir, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.rmdir != NULL) {
+ fop->cbks.rmdir(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_rmdir_cbk_t func, void *data, loc_t *loc,
+ int xflags, dict_t *xdata)
+{
+ ec_cbk_t callback = {.rmdir = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(RMDIR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_RMDIR, 0, target, fop_flags,
+ ec_wind_rmdir, ec_manager_rmdir, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->int32 = xflags;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: symlink */
+
+int32_t
+ec_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
+}
+
+void
+ec_wind_symlink(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_symlink_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->symlink,
+ fop->str[0], &fop->loc[0], fop->mode[0], fop->xdata);
+}
+
+int32_t
+ec_manager_symlink(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+ EC_UPDATE_DATA | EC_UPDATE_META);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ int32_t err;
+
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count);
+
+ err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
+ &cbk->iatt[0]);
+ ec_cbk_set_error(cbk, -err, _gf_false);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.symlink != NULL) {
+ QUORUM_CBK(fop->cbks.symlink, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+ &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+ cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.symlink != NULL) {
+ fop->cbks.symlink(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_symlink_cbk_t func, void *data,
+ const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata)
+{
+ ec_cbk_t callback = {.symlink = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(SYMLINK) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_SYMLINK, 0, target,
+ fop_flags, ec_wind_symlink, ec_manager_symlink,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->mode[0] = umask;
+
+ if (linkname != NULL) {
+ fop->str[0] = gf_strdup(linkname);
+ if (fop->str[0] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: unlink */
+
+int32_t
+ec_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
+}
+
+void
+ec_wind_unlink(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_unlink_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->unlink,
+ &fop->loc[0], fop->int32, fop->xdata);
+}
+
+int32_t
+ec_manager_unlink(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+ EC_UPDATE_DATA | EC_UPDATE_META);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ ec_fop_prepare_answer(fop, _gf_false);
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.unlink != NULL) {
+ QUORUM_CBK(fop->cbks.unlink, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.unlink != NULL) {
+ fop->cbks.unlink(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_unlink_cbk_t func, void *data, loc_t *loc,
+ int xflags, dict_t *xdata)
+{
+ ec_cbk_t callback = {.unlink = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(UNLINK) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_UNLINK, 0, target, fop_flags,
+ ec_wind_unlink, ec_manager_unlink, callback,
+ data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->int32 = xflags;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h
new file mode 100644
index 00000000000..07edf8a7fec
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-fops.h
@@ -0,0 +1,254 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_FOPS_H__
+#define __EC_FOPS_H__
+
+#include <glusterfs/xlator.h>
+
+#include "ec-types.h"
+#include "ec-common.h"
+
+void
+ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_access_cbk_t func, void *data, loc_t *loc,
+ int32_t mask, dict_t *xdata);
+
+void
+ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_create_cbk_t func, void *data, loc_t *loc,
+ int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
+
+void
+ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_entrylk_cbk_t func, void *data,
+ const char *volume, loc_t *loc, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata);
+
+void
+ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fentrylk_cbk_t func, void *data,
+ const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata);
+
+void
+ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd,
+ dict_t *xdata);
+
+void
+ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fsync_cbk_t func, void *data, fd_t *fd,
+ int32_t datasync, dict_t *xdata);
+
+void
+ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fsyncdir_cbk_t func, void *data, fd_t *fd,
+ int32_t datasync, dict_t *xdata);
+
+void
+ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_getxattr_cbk_t func, void *data, loc_t *loc,
+ const char *name, dict_t *xdata);
+
+void
+ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fgetxattr_cbk_t func, void *data, fd_t *fd,
+ const char *name, dict_t *xdata);
+
+void
+ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_heal_cbk_t func, void *data, loc_t *loc,
+ int32_t partial, dict_t *xdata);
+
+void
+ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd,
+ int32_t partial, dict_t *xdata);
+
+void
+ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
+ uintptr_t target, uint32_t fop_flags, fop_inodelk_cbk_t func,
+ void *data, const char *volume, loc_t *loc, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata);
+
+void
+ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
+ uintptr_t target, uint32_t fop_flags, fop_finodelk_cbk_t func,
+ void *data, const char *volume, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata);
+
+void
+ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_link_cbk_t func, void *data, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata);
+
+void
+ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags,
+ fop_lk_cbk_t func, void *data, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata);
+
+void
+ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_lookup_cbk_t func, void *data, loc_t *loc,
+ dict_t *xdata);
+
+void
+ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_mkdir_cbk_t func, void *data, loc_t *loc,
+ mode_t mode, mode_t umask, dict_t *xdata);
+
+void
+ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_mknod_cbk_t func, void *data, loc_t *loc,
+ mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata);
+
+void
+ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_open_cbk_t func, void *data, loc_t *loc,
+ int32_t flags, fd_t *fd, dict_t *xdata);
+
+void
+ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_opendir_cbk_t func, void *data, loc_t *loc,
+ fd_t *fd, dict_t *xdata);
+
+void
+ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_readdir_cbk_t func, void *data, fd_t *fd,
+ size_t size, off_t offset, dict_t *xdata);
+
+void
+ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_readdirp_cbk_t func, void *data, fd_t *fd,
+ size_t size, off_t offset, dict_t *xdata);
+
+void
+ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_readlink_cbk_t func, void *data, loc_t *loc,
+ size_t size, dict_t *xdata);
+
+void
+ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_readv_cbk_t func, void *data, fd_t *fd,
+ size_t size, off_t offset, uint32_t flags, dict_t *xdata);
+
+void
+ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_removexattr_cbk_t func, void *data,
+ loc_t *loc, const char *name, dict_t *xdata);
+
+void
+ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data,
+ fd_t *fd, const char *name, dict_t *xdata);
+
+void
+ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_rename_cbk_t func, void *data, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata);
+
+void
+ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_rmdir_cbk_t func, void *data, loc_t *loc,
+ int xflags, dict_t *xdata);
+
+void
+ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
+void
+ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
+void
+ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_setxattr_cbk_t func, void *data, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata);
+
+void
+ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd,
+ dict_t *dict, int32_t flags, dict_t *xdata);
+
+void
+ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_stat_cbk_t func, void *data, loc_t *loc,
+ dict_t *xdata);
+
+void
+ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fstat_cbk_t func, void *data, fd_t *fd,
+ dict_t *xdata);
+
+void
+ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_statfs_cbk_t func, void *data, loc_t *loc,
+ dict_t *xdata);
+
+void
+ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_symlink_cbk_t func, void *data,
+ const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata);
+
+void
+ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd,
+ int32_t mode, off_t offset, size_t len, dict_t *xdata);
+
+void
+ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata);
+
+void
+ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc,
+ off_t offset, dict_t *xdata);
+
+void
+ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd,
+ off_t offset, dict_t *xdata);
+
+void
+ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_unlink_cbk_t func, void *data, loc_t *loc,
+ int xflags, dict_t *xdata);
+
+void
+ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata);
+
+void
+ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_xattrop_cbk_t func, void *data, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
+
+void
+ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fxattrop_cbk_t func, void *data, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
+
+void
+ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_seek_cbk_t func, void *data, fd_t *fd,
+ off_t offset, gf_seek_what_t what, dict_t *xdata);
+
+void
+ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_ipc_cbk_t func, void *data, int32_t op,
+ dict_t *xdata);
+
+#endif /* __EC_FOPS_H__ */
diff --git a/xlators/cluster/ec/src/ec-galois.c b/xlators/cluster/ec/src/ec-galois.c
new file mode 100644
index 00000000000..6e4990c71f5
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-galois.c
@@ -0,0 +1,183 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <string.h>
+
+#include "ec-mem-types.h"
+#include "ec-gf8.h"
+#include "ec-helpers.h"
+
+static ec_gf_t *
+ec_gf_alloc(uint32_t bits, uint32_t mod)
+{
+ ec_gf_t *gf;
+
+ gf = GF_MALLOC(sizeof(ec_gf_t), ec_mt_ec_gf_t);
+ if (gf == NULL) {
+ goto failed;
+ }
+
+ gf->bits = bits;
+ gf->size = 1 << bits;
+ gf->mod = mod;
+
+ gf->log = GF_MALLOC(sizeof(uint32_t) * (gf->size * 2 - 1),
+ gf_common_mt_int);
+ if (gf->log == NULL) {
+ goto failed_gf;
+ }
+ gf->pow = GF_MALLOC(sizeof(uint32_t) * (gf->size * 2 - 1),
+ gf_common_mt_int);
+ if (gf->pow == NULL) {
+ goto failed_log;
+ }
+
+ return gf;
+
+failed_log:
+ GF_FREE(gf->log);
+failed_gf:
+ GF_FREE(gf);
+failed:
+ return EC_ERR(ENOMEM);
+}
+
+static void
+ec_gf_init_tables(ec_gf_t *gf)
+{
+ uint32_t i, tmp;
+
+ memset(gf->log, -1, sizeof(uint32_t) * gf->size);
+
+ gf->pow[0] = 1;
+ gf->log[0] = gf->size;
+ gf->log[1] = 0;
+ for (i = 1; i < gf->size; i++) {
+ tmp = gf->pow[i - 1] << 1;
+ if (tmp >= gf->size) {
+ tmp ^= gf->mod;
+ }
+ gf->pow[i + gf->size - 1] = gf->pow[i] = tmp;
+ gf->log[tmp + gf->size - 1] = gf->log[tmp] = i;
+ }
+}
+
+ec_gf_t *
+ec_gf_prepare(uint32_t bits, uint32_t mod)
+{
+ ec_gf_mul_t **tbl;
+ ec_gf_t *gf;
+ uint32_t i, j;
+
+ if (bits != 8) {
+ return EC_ERR(EINVAL);
+ }
+
+ tbl = ec_gf8_mul;
+ if (mod == 0) {
+ mod = 0x11d;
+ }
+
+ gf = ec_gf_alloc(bits, mod);
+ if (EC_IS_ERR(gf)) {
+ return gf;
+ }
+ ec_gf_init_tables(gf);
+
+ gf->table = tbl;
+ gf->min_ops = bits * bits;
+ gf->max_ops = 0;
+ gf->avg_ops = 0;
+ for (i = 1; i < gf->size; i++) {
+ for (j = 0; tbl[i]->ops[j].op != EC_GF_OP_END; j++) {
+ }
+ if (gf->max_ops < j) {
+ gf->max_ops = j;
+ }
+ if (gf->min_ops > j) {
+ gf->min_ops = j;
+ }
+ gf->avg_ops += j;
+ }
+ gf->avg_ops /= gf->size;
+
+ return gf;
+}
+
+void
+ec_gf_destroy(ec_gf_t *gf)
+{
+ GF_FREE(gf->pow);
+ GF_FREE(gf->log);
+ GF_FREE(gf);
+}
+
+uint32_t
+ec_gf_add(ec_gf_t *gf, uint32_t a, uint32_t b)
+{
+ if ((a >= gf->size) || (b >= gf->size)) {
+ return gf->size;
+ }
+
+ return a ^ b;
+}
+
+uint32_t
+ec_gf_mul(ec_gf_t *gf, uint32_t a, uint32_t b)
+{
+ if ((a >= gf->size) || (b >= gf->size)) {
+ return gf->size;
+ }
+
+ if ((a != 0) && (b != 0)) {
+ return gf->pow[gf->log[a] + gf->log[b]];
+ }
+
+ return 0;
+}
+
+uint32_t
+ec_gf_div(ec_gf_t *gf, uint32_t a, uint32_t b)
+{
+ if ((a >= gf->size) || (b >= gf->size)) {
+ return gf->size;
+ }
+
+ if (b != 0) {
+ if (a != 0) {
+ return gf->pow[gf->size - 1 + gf->log[a] - gf->log[b]];
+ }
+
+ return 0;
+ }
+
+ return gf->size;
+}
+
+uint32_t
+ec_gf_exp(ec_gf_t *gf, uint32_t a, uint32_t b)
+{
+ uint32_t r;
+
+ if ((a >= gf->size) || ((a == 0) && (b == 0))) {
+ return gf->size;
+ }
+
+ r = 1;
+ while (b != 0) {
+ if ((b & 1) != 0) {
+ r = ec_gf_mul(gf, r, a);
+ }
+ a = ec_gf_mul(gf, a, a);
+ b >>= 1;
+ }
+
+ return r;
+}
diff --git a/xlators/cluster/ec/src/ec-galois.h b/xlators/cluster/ec/src/ec-galois.h
new file mode 100644
index 00000000000..ed55d53e419
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-galois.h
@@ -0,0 +1,32 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_GALOIS_H__
+#define __EC_GALOIS_H__
+
+#include <inttypes.h>
+
+#include "ec-types.h"
+
+ec_gf_t *
+ec_gf_prepare(uint32_t bits, uint32_t mod);
+void
+ec_gf_destroy(ec_gf_t *gf);
+
+uint32_t
+ec_gf_add(ec_gf_t *gf, uint32_t a, uint32_t b);
+uint32_t
+ec_gf_mul(ec_gf_t *gf, uint32_t a, uint32_t b);
+uint32_t
+ec_gf_div(ec_gf_t *gf, uint32_t a, uint32_t b);
+uint32_t
+ec_gf_exp(ec_gf_t *gf, uint32_t a, uint32_t b);
+
+#endif /* __EC_GALOIS_H__ */
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
new file mode 100644
index 00000000000..884deb93669
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -0,0 +1,1591 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/byte-order.h>
+
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-fops.h"
+
+/* FOP: flush */
+
+int32_t
+ec_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FLUSH, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_flush(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_flush_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->flush, fop->fd,
+ fop->xdata);
+}
+
+int32_t
+ec_manager_flush(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ec_lock_prepare_fd(fop, fop->fd, 0, 0, EC_RANGE_FULL);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_flush_size_version(fop);
+
+ return EC_STATE_DELAYED_START;
+
+ case EC_STATE_DELAYED_START:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ ec_fop_prepare_answer(fop, _gf_false);
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.flush != NULL) {
+ fop->cbks.flush(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DELAYED_START:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.flush != NULL) {
+ fop->cbks.flush(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+static int32_t
+ec_validate_fd(fd_t *fd, xlator_t *xl)
+{
+ uint64_t iversion = 0;
+ uint64_t fversion = 0;
+ ec_inode_t *inode_ctx = NULL;
+ ec_fd_t *fd_ctx = NULL;
+
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __ec_fd_get(fd, xl);
+ if (fd_ctx) {
+ fversion = fd_ctx->bad_version;
+ }
+ }
+ UNLOCK(&fd->lock);
+
+ LOCK(&fd->inode->lock);
+ {
+ inode_ctx = __ec_inode_get(fd->inode, xl);
+ if (inode_ctx) {
+ iversion = inode_ctx->bad_version;
+ }
+ }
+ UNLOCK(&fd->inode->lock);
+ if (fversion < iversion) {
+ return EBADF;
+ }
+ return 0;
+}
+
+void
+ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd,
+ dict_t *xdata)
+{
+ ec_cbk_t callback = {.flush = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FLUSH) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ if (fd) {
+ error = ec_validate_fd(fd, this);
+ if (error) {
+ gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD,
+ "Failing %s on %s", gf_fop_list[GF_FOP_FLUSH],
+ fd->inode ? uuid_utoa(fd->inode->gfid) : "");
+ goto out;
+ }
+ }
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FLUSH, 0, target, fop_flags,
+ ec_wind_flush, ec_manager_flush, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/* FOP: fsync */
+
+int32_t
+ec_combine_fsync(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 2)) {
+ gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+ "Mismatching iatt in "
+ "answers of 'GF_FOP_FSYNC'");
+
+ return 0;
+ }
+
+ return 1;
+}
+
+int32_t
+ec_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSYNC, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ if (prebuf != NULL) {
+ cbk->iatt[0] = *prebuf;
+ }
+ if (postbuf != NULL) {
+ cbk->iatt[1] = *postbuf;
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, ec_combine_fsync);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_fsync(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_fsync_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->fsync, fop->fd,
+ fop->int32, fop->xdata);
+}
+
+int32_t
+ec_manager_fsync(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, EC_RANGE_FULL);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_flush_size_version(fop);
+
+ return EC_STATE_DELAYED_START;
+
+ case EC_STATE_DELAYED_START:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode,
+ &cbk->iatt[0].ia_size));
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.fsync != NULL) {
+ fop->cbks.fsync(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1],
+ cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ case -EC_STATE_DELAYED_START:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.fsync != NULL) {
+ fop->cbks.fsync(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fsync_cbk_t func, void *data, fd_t *fd,
+ int32_t datasync, dict_t *xdata)
+{
+ ec_cbk_t callback = {.fsync = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FSYNC) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ if (fd) {
+ error = ec_validate_fd(fd, this);
+ if (error) {
+ gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD,
+ "Failing %s on %s", gf_fop_list[GF_FOP_FSYNC],
+ fd->inode ? uuid_utoa(fd->inode->gfid) : "");
+ goto out;
+ }
+ }
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNC, 0, target, fop_flags,
+ ec_wind_fsync, ec_manager_fsync, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->int32 = datasync;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: fsyncdir */
+
+int32_t
+ec_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSYNCDIR, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_fsyncdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_fsyncdir_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->fsyncdir,
+ fop->fd, fop->int32, fop->xdata);
+}
+
+int32_t
+ec_manager_fsyncdir(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ec_lock_prepare_fd(fop, fop->fd, 0, 0, EC_RANGE_FULL);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_flush_size_version(fop);
+
+ return EC_STATE_DELAYED_START;
+
+ case EC_STATE_DELAYED_START:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ ec_fop_prepare_answer(fop, _gf_false);
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.fsyncdir != NULL) {
+ fop->cbks.fsyncdir(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ case -EC_STATE_DELAYED_START:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.fsyncdir != NULL) {
+ fop->cbks.fsyncdir(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fsyncdir_cbk_t func, void *data, fd_t *fd,
+ int32_t datasync, dict_t *xdata)
+{
+ ec_cbk_t callback = {.fsyncdir = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FSYNCDIR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNCDIR, 0, target,
+ fop_flags, ec_wind_fsyncdir, ec_manager_fsyncdir,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->int32 = datasync;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/* FOP: lookup */
+
+void
+ec_lookup_rebuild(ec_t *ec, ec_fop_data_t *fop, ec_cbk_data_t *cbk)
+{
+ ec_inode_t *ctx = NULL;
+ uint64_t size = 0;
+ int32_t have_size = 0, err;
+
+ if (cbk->op_ret < 0) {
+ return;
+ }
+
+ ec_dict_del_array(cbk->xdata, EC_XATTR_VERSION, cbk->version,
+ EC_VERSION_SIZE);
+
+ err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, &cbk->iatt[0]);
+ if (ec_cbk_set_error(cbk, -err, _gf_true)) {
+ return;
+ }
+
+ LOCK(&cbk->inode->lock);
+
+ ctx = __ec_inode_get(cbk->inode, fop->xl);
+ if (ctx != NULL) {
+ if (ctx->have_version) {
+ cbk->version[0] = ctx->post_version[0];
+ cbk->version[1] = ctx->post_version[1];
+ }
+ if (ctx->have_size) {
+ size = ctx->post_size;
+ have_size = 1;
+ }
+ }
+
+ UNLOCK(&cbk->inode->lock);
+
+ if (cbk->iatt[0].ia_type == IA_IFREG) {
+ cbk->size = cbk->iatt[0].ia_size;
+ ec_dict_del_number(cbk->xdata, EC_XATTR_SIZE, &cbk->iatt[0].ia_size);
+ if (have_size) {
+ cbk->iatt[0].ia_size = size;
+ }
+ }
+}
+
+int32_t
+ec_combine_lookup(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 2)) {
+ gf_msg(fop->xl->name, GF_LOG_DEBUG, 0, EC_MSG_IATT_MISMATCH,
+ "Mismatching iatt in "
+ "answers of 'GF_FOP_LOOKUP'");
+
+ return 0;
+ }
+
+ return 1;
+}
+
+int32_t
+ec_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+ uint64_t dirty[2] = {0};
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_LOOKUP, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ if (inode != NULL) {
+ cbk->inode = inode_ref(inode);
+ if (cbk->inode == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_INODE_REF_FAIL,
+ "Failed to reference an inode.");
+
+ goto out;
+ }
+ }
+ if (buf != NULL) {
+ cbk->iatt[0] = *buf;
+ }
+ if (postparent != NULL) {
+ cbk->iatt[1] = *postparent;
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ ec_dict_del_array(xdata, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
+ }
+
+ ec_combine(cbk, ec_combine_lookup);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_lookup(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_lookup_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->lookup,
+ &fop->loc[0], fop->xdata);
+}
+
+int32_t
+ec_manager_lookup(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+ int32_t err;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ if (fop->xdata == NULL) {
+ fop->xdata = dict_new();
+ if (fop->xdata == NULL) {
+ gf_msg(fop->xl->name, GF_LOG_ERROR, ENOMEM,
+ EC_MSG_LOOKUP_REQ_PREP_FAIL,
+ "Unable to prepare "
+ "lookup request");
+
+ fop->error = ENOMEM;
+
+ return EC_STATE_REPORT;
+ }
+ } else {
+ /*TODO: To be handled once we have 'syndromes' */
+ dict_del(fop->xdata, GF_CONTENT_KEY);
+ }
+ err = dict_set_uint64(fop->xdata, EC_XATTR_SIZE, 0);
+ if (err == 0) {
+ err = dict_set_uint64(fop->xdata, EC_XATTR_VERSION, 0);
+ }
+ if (err == 0) {
+ err = dict_set_uint64(fop->xdata, EC_XATTR_DIRTY, 0);
+ }
+ if (err != 0) {
+ gf_msg(fop->xl->name, GF_LOG_ERROR, -err,
+ EC_MSG_LOOKUP_REQ_PREP_FAIL,
+ "Unable to prepare lookup "
+ "request");
+
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+
+ /* Fall through */
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ /*
+ * Lookup happens without any lock, so there is a chance that it
+ * will have answers before modification happened and after
+ * modification happened in the same response. So choose the next
+ * best answer when the answers don't match for EC_MINIMUM_MIN
+ */
+
+ if (!fop->answer && !list_empty(&fop->cbk_list)) {
+ fop->answer = list_entry(fop->cbk_list.next, ec_cbk_data_t,
+ list);
+ }
+
+ cbk = ec_fop_prepare_answer(fop, _gf_true);
+ if (cbk != NULL) {
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+ ec_lookup_rebuild(fop->xl->private, fop, cbk);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.lookup != NULL) {
+ fop->cbks.lookup(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->inode, &cbk->iatt[0],
+ cbk->xdata, &cbk->iatt[1]);
+ }
+
+ return EC_STATE_END;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.lookup != NULL) {
+ fop->cbks.lookup(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL, NULL);
+ }
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_lookup_cbk_t func, void *data, loc_t *loc,
+ dict_t *xdata)
+{
+ ec_cbk_t callback = {.lookup = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(LOOKUP) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_LOOKUP, EC_FLAG_LOCK_SHARED,
+ target, fop_flags, ec_wind_lookup,
+ ec_manager_lookup, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ /* Do not log failures here as a memory problem would have already
+ * been logged by the corresponding alloc functions */
+ if (fop->xdata == NULL)
+ goto out;
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: statfs */
+
+int32_t
+ec_combine_statfs(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ ec_statvfs_combine(&dst->statvfs, &src->statvfs);
+
+ return 1;
+}
+
+int32_t
+ec_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct statvfs *buf, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_STATFS, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ if (buf != NULL) {
+ cbk->statvfs = *buf;
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, ec_combine_statfs);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_statfs(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_statfs_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->statfs,
+ &fop->loc[0], fop->xdata);
+}
+
+int32_t
+ec_manager_statfs(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk = NULL;
+ gf_boolean_t deem_statfs_enabled = _gf_false;
+ int32_t err = 0;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_true);
+ if (cbk != NULL) {
+ ec_t *ec = fop->xl->private;
+
+ if (cbk->xdata) {
+ err = dict_get_int8(cbk->xdata, "quota-deem-statfs",
+ (int8_t *)&deem_statfs_enabled);
+ if (err != -ENOENT) {
+ ec_cbk_set_error(cbk, -err, _gf_true);
+ }
+ }
+
+ if (err != 0 || deem_statfs_enabled == _gf_false) {
+ cbk->statvfs.f_blocks *= ec->fragments;
+ cbk->statvfs.f_bfree *= ec->fragments;
+ cbk->statvfs.f_bavail *= ec->fragments;
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.statfs != NULL) {
+ fop->cbks.statfs(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, &cbk->statvfs, cbk->xdata);
+ }
+
+ return EC_STATE_END;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.statfs != NULL) {
+ fop->cbks.statfs(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL);
+ }
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_statfs_cbk_t func, void *data, loc_t *loc,
+ dict_t *xdata)
+{
+ ec_cbk_t callback = {.statfs = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(STATFS) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_STATFS, EC_FLAG_LOCK_SHARED,
+ target, fop_flags, ec_wind_statfs,
+ ec_manager_statfs, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
+
+/* FOP: xattrop */
+
+int32_t
+ec_combine_xattrop(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ if (!ec_dict_compare(dst->dict, src->dict)) {
+ gf_msg(fop->xl->name, GF_LOG_DEBUG, 0, EC_MSG_DICT_MISMATCH,
+ "Mismatching dictionary in "
+ "answers of 'GF_FOP_XATTROP'");
+
+ return 0;
+ }
+
+ return 1;
+}
+
+int32_t
+ec_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_lock_link_t *link = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ uint64_t dirty[2] = {0};
+ data_t *data;
+ uint64_t *version;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret,
+ op_errno);
+ if (!cbk)
+ goto out;
+
+ if (op_ret >= 0) {
+ cbk->dict = dict_ref(xattr);
+
+ data = dict_get(cbk->dict, EC_XATTR_VERSION);
+ if ((data != NULL) && (data->len >= sizeof(uint64_t))) {
+ version = (uint64_t *)data->data;
+
+ if (((ntoh64(version[0]) >> EC_SELFHEAL_BIT) & 1) != 0) {
+ LOCK(&fop->lock);
+
+ fop->healing |= 1ULL << idx;
+
+ UNLOCK(&fop->lock);
+ }
+ }
+
+ ec_dict_del_array(xattr, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
+ link = fop->data;
+ if (link) {
+ /*Keep a note of if the dirty is already set or not*/
+ link->dirty[0] |= (dirty[0] != 0);
+ link->dirty[1] |= (dirty[1] != 0);
+ }
+ }
+
+ if (xdata)
+ cbk->xdata = dict_ref(xdata);
+
+ ec_combine(cbk, ec_combine_xattrop);
+
+out:
+ if (fop)
+ ec_complete(fop);
+
+ return 0;
+}
+
+void
+ec_wind_xattrop(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_xattrop_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->xattrop,
+ &fop->loc[0], fop->xattrop_flags, fop->dict, fop->xdata);
+}
+
+int32_t
+ec_manager_xattrop(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ if (fop->fd == NULL) {
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META, 0,
+ EC_RANGE_FULL);
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META, 0,
+ EC_RANGE_FULL);
+ }
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ int32_t err;
+
+ err = ec_dict_combine(cbk, EC_COMBINE_DICT);
+ ec_cbk_set_error(cbk, -err, _gf_false);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->id == GF_FOP_XATTROP) {
+ if (fop->cbks.xattrop != NULL) {
+ fop->cbks.xattrop(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->dict, cbk->xdata);
+ }
+ } else {
+ if (fop->cbks.fxattrop != NULL) {
+ fop->cbks.fxattrop(fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, cbk->dict,
+ cbk->xdata);
+ }
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->id == GF_FOP_XATTROP) {
+ if (fop->cbks.xattrop != NULL) {
+ fop->cbks.xattrop(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL, NULL);
+ }
+ } else {
+ if (fop->cbks.fxattrop != NULL) {
+ fop->cbks.fxattrop(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL, NULL);
+ }
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_xattrop_cbk_t func, void *data, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ ec_cbk_t callback = {.xattrop = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(XATTROP) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_XATTROP, 0, target,
+ fop_flags, ec_wind_xattrop, ec_manager_xattrop,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->xattrop_flags = optype;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xattr != NULL) {
+ fop->dict = dict_ref(xattr);
+ if (fop->dict == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
+
+void
+ec_wind_fxattrop(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_xattrop_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->fxattrop,
+ fop->fd, fop->xattrop_flags, fop->dict, fop->xdata);
+}
+
+void
+ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fxattrop_cbk_t func, void *data, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ ec_cbk_t callback = {.fxattrop = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FXATTROP) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FXATTROP, 0, target,
+ fop_flags, ec_wind_fxattrop, ec_manager_xattrop,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->xattrop_flags = optype;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xattr != NULL) {
+ fop->dict = dict_ref(xattr);
+ if (fop->dict == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
+
+/* FOP: IPC */
+
+int32_t
+ec_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_IPC, idx, op_ret,
+ op_errno);
+
+ if (cbk != NULL) {
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ }
+
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_ipc(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_ipc_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->ipc, fop->int32,
+ fop->xdata);
+}
+
+int32_t
+ec_manager_ipc(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ ec_fop_prepare_answer(fop, _gf_true);
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+ if (fop->cbks.ipc != NULL) {
+ fop->cbks.ipc(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->xdata);
+ }
+
+ return EC_STATE_END;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.ipc != NULL) {
+ fop->cbks.ipc(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL);
+ }
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_ipc_cbk_t func, void *data, int32_t op,
+ dict_t *xdata)
+{
+ ec_cbk_t callback = {.ipc = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(IPC) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_IPC, 0, target, fop_flags,
+ ec_wind_ipc, ec_manager_ipc, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ }
+ fop->int32 = op;
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
diff --git a/xlators/cluster/ec/src/ec-gf8.c b/xlators/cluster/ec/src/ec-gf8.c
new file mode 100644
index 00000000000..039adae5929
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-gf8.c
@@ -0,0 +1,5882 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "ec-gf8.h"
+
+static ec_gf_op_t ec_gf8_mul_00_ops[] = {{EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_00 = {0,
+ {
+ 0,
+ },
+ ec_gf8_mul_00_ops};
+
+static ec_gf_op_t ec_gf8_mul_01_ops[] = {{EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_01 = {8,
+ {
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_01_ops};
+
+static ec_gf_op_t ec_gf8_mul_02_ops[] = {{EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_02 = {8,
+ {
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_02_ops};
+
+static ec_gf_op_t ec_gf8_mul_03_ops[] = {
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_COPY, 8, 3, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_03 = {9,
+ {
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ },
+ ec_gf8_mul_03_ops};
+
+static ec_gf_op_t ec_gf8_mul_04_ops[] = {
+ {EC_GF_OP_XOR3, 8, 6, 7}, {EC_GF_OP_XOR2, 2, 8, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_04 = {9,
+ {
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 8,
+ },
+ ec_gf8_mul_04_ops};
+
+static ec_gf_op_t ec_gf8_mul_05_ops[] = {
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_05 = {8,
+ {
+ 0,
+ 1,
+ 2,
+ 6,
+ 7,
+ 3,
+ 4,
+ 5,
+ },
+ ec_gf8_mul_05_ops};
+
+static ec_gf_op_t ec_gf8_mul_06_ops[] = {
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_COPY, 8, 2, 0},
+ {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_06 = {9,
+ {
+ 7,
+ 0,
+ 1,
+ 2,
+ 8,
+ 3,
+ 4,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_06_ops};
+
+static ec_gf_op_t ec_gf8_mul_07_ops[] = {
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_07 = {8,
+ {
+ 6,
+ 0,
+ 1,
+ 3,
+ 2,
+ 4,
+ 5,
+ 7,
+ },
+ ec_gf8_mul_07_ops};
+
+static ec_gf_op_t ec_gf8_mul_08_ops[] = {
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR3, 8, 6, 7},
+ {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_08 = {9,
+ {
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 8,
+ },
+ ec_gf8_mul_08_ops};
+
+static ec_gf_op_t ec_gf8_mul_09_ops[] = {
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_09 = {8,
+ {
+ 0,
+ 1,
+ 2,
+ 3,
+ 5,
+ 6,
+ 7,
+ 4,
+ },
+ ec_gf8_mul_09_ops};
+
+static ec_gf_op_t ec_gf8_mul_0A_ops[] = {
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_0A = {8,
+ {
+ 5,
+ 0,
+ 1,
+ 2,
+ 6,
+ 7,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_0A_ops};
+
+static ec_gf_op_t ec_gf8_mul_0B_ops[] = {
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_COPY, 9, 3, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_COPY, 8, 5, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR3, 3, 8, 6}, {EC_GF_OP_XOR2, 1, 9, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_0B = {10,
+ {
+ 7,
+ 1,
+ 5,
+ 2,
+ 4,
+ 3,
+ 0,
+ 6,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_0B_ops};
+
+static ec_gf_op_t ec_gf8_mul_0C_ops[] = {
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_COPY, 8, 1, 0},
+ {EC_GF_OP_XOR2, 8, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_0C = {9,
+ {
+ 5,
+ 7,
+ 0,
+ 1,
+ 8,
+ 2,
+ 3,
+ 4,
+ 6,
+ },
+ ec_gf8_mul_0C_ops};
+
+static ec_gf_op_t ec_gf8_mul_0D_ops[] = {
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR3, 8, 2, 4}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR3, 2, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_0D = {9,
+ {
+ 5,
+ 6,
+ 7,
+ 3,
+ 1,
+ 0,
+ 2,
+ 4,
+ 8,
+ },
+ ec_gf8_mul_0D_ops};
+
+static ec_gf_op_t ec_gf8_mul_0E_ops[] = {
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_0E = {8,
+ {
+ 7,
+ 0,
+ 6,
+ 1,
+ 3,
+ 2,
+ 4,
+ 5,
+ },
+ ec_gf8_mul_0E_ops};
+
+static ec_gf_op_t ec_gf8_mul_0F_ops[] = {
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_0F = {8,
+ {
+ 1,
+ 0,
+ 5,
+ 6,
+ 7,
+ 2,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_0F_ops};
+
+static ec_gf_op_t ec_gf8_mul_10_ops[] = {
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_10 = {8,
+ {
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ },
+ ec_gf8_mul_10_ops};
+
+static ec_gf_op_t ec_gf8_mul_11_ops[] = {
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_11 = {8,
+ {
+ 4,
+ 1,
+ 2,
+ 6,
+ 0,
+ 5,
+ 7,
+ 3,
+ },
+ ec_gf8_mul_11_ops};
+
+static ec_gf_op_t ec_gf8_mul_12_ops[] = {
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_12 = {8,
+ {
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 5,
+ 6,
+ 4,
+ },
+ ec_gf8_mul_12_ops};
+
+static ec_gf_op_t ec_gf8_mul_13_ops[] = {
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR3, 8, 3, 7},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 8, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 0, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_13 = {9,
+ {
+ 4,
+ 5,
+ 2,
+ 6,
+ 0,
+ 1,
+ 7,
+ 3,
+ 8,
+ },
+ ec_gf8_mul_13_ops};
+
+static ec_gf_op_t ec_gf8_mul_14_ops[] = {
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_14 = {8,
+ {
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 4,
+ 5,
+ 3,
+ },
+ ec_gf8_mul_14_ops};
+
+static ec_gf_op_t ec_gf8_mul_15_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR3, 5, 8, 7},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_15 = {9,
+ {
+ 0,
+ 1,
+ 2,
+ 4,
+ 7,
+ 6,
+ 5,
+ 3,
+ 8,
+ },
+ ec_gf8_mul_15_ops};
+
+static ec_gf_op_t ec_gf8_mul_16_ops[] = {
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_16 = {8,
+ {
+ 6,
+ 7,
+ 4,
+ 1,
+ 2,
+ 3,
+ 5,
+ 0,
+ },
+ ec_gf8_mul_16_ops};
+
+static ec_gf_op_t ec_gf8_mul_17_ops[] = {
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_17 = {8,
+ {
+ 5,
+ 7,
+ 0,
+ 1,
+ 3,
+ 2,
+ 4,
+ 6,
+ },
+ ec_gf8_mul_17_ops};
+
+static ec_gf_op_t ec_gf8_mul_18_ops[] = {
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_18 = {9,
+ {
+ 4,
+ 5,
+ 7,
+ 6,
+ 0,
+ 1,
+ 2,
+ 3,
+ 8,
+ },
+ ec_gf8_mul_18_ops};
+
+static ec_gf_op_t ec_gf8_mul_19_ops[] = {
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_19 = {8,
+ {
+ 0,
+ 5,
+ 2,
+ 6,
+ 7,
+ 1,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_19_ops};
+
+static ec_gf_op_t ec_gf8_mul_1A_ops[] = {
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1A = {8,
+ {
+ 7,
+ 0,
+ 4,
+ 5,
+ 3,
+ 1,
+ 2,
+ 6,
+ },
+ ec_gf8_mul_1A_ops};
+
+static ec_gf_op_t ec_gf8_mul_1B_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1B = {8,
+ {
+ 7,
+ 4,
+ 5,
+ 6,
+ 3,
+ 1,
+ 2,
+ 0,
+ },
+ ec_gf8_mul_1B_ops};
+
+static ec_gf_op_t ec_gf8_mul_1C_ops[] = {
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1C = {8,
+ {
+ 5,
+ 4,
+ 3,
+ 0,
+ 1,
+ 7,
+ 2,
+ 6,
+ },
+ ec_gf8_mul_1C_ops};
+
+static ec_gf_op_t ec_gf8_mul_1D_ops[] = {
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR3, 8, 4, 2},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1D = {9,
+ {
+ 0,
+ 7,
+ 5,
+ 8,
+ 2,
+ 3,
+ 4,
+ 1,
+ 6,
+ },
+ ec_gf8_mul_1D_ops};
+
+static ec_gf_op_t ec_gf8_mul_1E_ops[] = {
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1E = {8,
+ {
+ 4,
+ 7,
+ 5,
+ 1,
+ 6,
+ 0,
+ 2,
+ 3,
+ },
+ ec_gf8_mul_1E_ops};
+
+static ec_gf_op_t ec_gf8_mul_1F_ops[] = {
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR3, 8, 3, 7},
+ {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1F = {9,
+ {
+ 1,
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 3,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_1F_ops};
+
+static ec_gf_op_t ec_gf8_mul_20_ops[] = {
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_20 = {8,
+ {
+ 7,
+ 4,
+ 5,
+ 6,
+ 3,
+ 0,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_20_ops};
+
+static ec_gf_op_t ec_gf8_mul_21_ops[] = {
+ {EC_GF_OP_COPY, 9, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR3, 8, 7, 5}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+ {EC_GF_OP_XOR2, 4, 9, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_21 = {10,
+ {
+ 0,
+ 1,
+ 2,
+ 7,
+ 5,
+ 4,
+ 3,
+ 6,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_21_ops};
+
+static ec_gf_op_t ec_gf8_mul_22_ops[] = {
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_22 = {8,
+ {
+ 3,
+ 0,
+ 5,
+ 2,
+ 6,
+ 4,
+ 1,
+ 7,
+ },
+ ec_gf8_mul_22_ops};
+
+static ec_gf_op_t ec_gf8_mul_23_ops[] = {
+ {EC_GF_OP_COPY, 8, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_23 = {9,
+ {
+ 0,
+ 4,
+ 3,
+ 2,
+ 5,
+ 6,
+ 1,
+ 8,
+ 7,
+ },
+ ec_gf8_mul_23_ops};
+
+static ec_gf_op_t ec_gf8_mul_24_ops[] = {
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_24 = {8,
+ {
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 4,
+ 5,
+ 3,
+ },
+ ec_gf8_mul_24_ops};
+
+static ec_gf_op_t ec_gf8_mul_25_ops[] = {
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_25 = {8,
+ {
+ 2,
+ 7,
+ 0,
+ 1,
+ 3,
+ 4,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_25_ops};
+
+static ec_gf_op_t ec_gf8_mul_26_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_26 = {8,
+ {
+ 3,
+ 4,
+ 1,
+ 2,
+ 0,
+ 5,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_26_ops};
+
+static ec_gf_op_t ec_gf8_mul_27_ops[] = {
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_27 = {8,
+ {
+ 3,
+ 0,
+ 1,
+ 2,
+ 6,
+ 7,
+ 4,
+ 5,
+ },
+ ec_gf8_mul_27_ops};
+
+static ec_gf_op_t ec_gf8_mul_28_ops[] = {
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_28 = {8,
+ {
+ 5,
+ 6,
+ 3,
+ 0,
+ 1,
+ 2,
+ 4,
+ 7,
+ },
+ ec_gf8_mul_28_ops};
+
+static ec_gf_op_t ec_gf8_mul_29_ops[] = {
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_29 = {8,
+ {
+ 4,
+ 6,
+ 3,
+ 5,
+ 7,
+ 0,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_29_ops};
+
+static ec_gf_op_t ec_gf8_mul_2A_ops[] = {
+ {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 0, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR3, 6, 8, 4}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2A = {9,
+ {
+ 3,
+ 4,
+ 7,
+ 2,
+ 6,
+ 5,
+ 1,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_2A_ops};
+
+static ec_gf_op_t ec_gf8_mul_2B_ops[] = {
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2B = {8,
+ {
+ 3,
+ 4,
+ 7,
+ 5,
+ 6,
+ 0,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_2B_ops};
+
+static ec_gf_op_t ec_gf8_mul_2C_ops[] = {
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2C = {8,
+ {
+ 5,
+ 6,
+ 7,
+ 0,
+ 2,
+ 3,
+ 4,
+ 1,
+ },
+ ec_gf8_mul_2C_ops};
+
+static ec_gf_op_t ec_gf8_mul_2D_ops[] = {
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR3, 8, 4, 6},
+ {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2D = {9,
+ {
+ 7,
+ 0,
+ 3,
+ 5,
+ 1,
+ 4,
+ 2,
+ 6,
+ 8,
+ },
+ ec_gf8_mul_2D_ops};
+
+static ec_gf_op_t ec_gf8_mul_2E_ops[] = {
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_COPY, 8, 4, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2E = {9,
+ {
+ 5,
+ 0,
+ 7,
+ 3,
+ 2,
+ 6,
+ 4,
+ 1,
+ 8,
+ },
+ ec_gf8_mul_2E_ops};
+
+static ec_gf_op_t ec_gf8_mul_2F_ops[] = {
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR3, 8, 7, 6}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+ {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2F = {9,
+ {
+ 6,
+ 3,
+ 2,
+ 5,
+ 7,
+ 0,
+ 1,
+ 4,
+ 8,
+ },
+ ec_gf8_mul_2F_ops};
+
+static ec_gf_op_t ec_gf8_mul_30_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 8, 1, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR3, 6, 8, 7},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_30 = {9,
+ {
+ 3,
+ 4,
+ 7,
+ 5,
+ 0,
+ 6,
+ 1,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_30_ops};
+
+static ec_gf_op_t ec_gf8_mul_31_ops[] = {
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_31 = {8,
+ {
+ 7,
+ 1,
+ 4,
+ 5,
+ 6,
+ 0,
+ 2,
+ 3,
+ },
+ ec_gf8_mul_31_ops};
+
+static ec_gf_op_t ec_gf8_mul_32_ops[] = {
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_32 = {8,
+ {
+ 3,
+ 4,
+ 6,
+ 7,
+ 5,
+ 0,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_32_ops};
+
+static ec_gf_op_t ec_gf8_mul_33_ops[] = {
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_33 = {8,
+ {
+ 5,
+ 4,
+ 3,
+ 0,
+ 2,
+ 1,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_33_ops};
+
+static ec_gf_op_t ec_gf8_mul_34_ops[] = {
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_34 = {8,
+ {
+ 7,
+ 5,
+ 3,
+ 0,
+ 2,
+ 4,
+ 1,
+ 6,
+ },
+ ec_gf8_mul_34_ops};
+
+static ec_gf_op_t ec_gf8_mul_35_ops[] = {
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_35 = {8,
+ {
+ 6,
+ 7,
+ 5,
+ 4,
+ 2,
+ 0,
+ 1,
+ 3,
+ },
+ ec_gf8_mul_35_ops};
+
+static ec_gf_op_t ec_gf8_mul_36_ops[] = {
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_36 = {8,
+ {
+ 6,
+ 7,
+ 4,
+ 1,
+ 2,
+ 3,
+ 0,
+ 5,
+ },
+ ec_gf8_mul_36_ops};
+
+static ec_gf_op_t ec_gf8_mul_37_ops[] = {
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR3, 8, 0, 1},
+ {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_37 = {9,
+ {
+ 6,
+ 7,
+ 2,
+ 1,
+ 0,
+ 3,
+ 4,
+ 5,
+ 8,
+ },
+ ec_gf8_mul_37_ops};
+
+static ec_gf_op_t ec_gf8_mul_38_ops[] = {
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR3, 8, 6, 7},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_38 = {9,
+ {
+ 4,
+ 5,
+ 6,
+ 3,
+ 0,
+ 1,
+ 7,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_38_ops};
+
+static ec_gf_op_t ec_gf8_mul_39_ops[] = {
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_39 = {8,
+ {
+ 1,
+ 6,
+ 3,
+ 0,
+ 5,
+ 2,
+ 4,
+ 7,
+ },
+ ec_gf8_mul_39_ops};
+
+static ec_gf_op_t ec_gf8_mul_3A_ops[] = {
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3A = {8,
+ {
+ 3,
+ 4,
+ 7,
+ 0,
+ 5,
+ 6,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_3A_ops};
+
+static ec_gf_op_t ec_gf8_mul_3B_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR3, 8, 7, 3}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3B = {9,
+ {
+ 3,
+ 0,
+ 1,
+ 7,
+ 6,
+ 2,
+ 4,
+ 8,
+ 5,
+ },
+ ec_gf8_mul_3B_ops};
+
+static ec_gf_op_t ec_gf8_mul_3C_ops[] = {
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3C = {8,
+ {
+ 3,
+ 6,
+ 4,
+ 1,
+ 7,
+ 2,
+ 0,
+ 5,
+ },
+ ec_gf8_mul_3C_ops};
+
+static ec_gf_op_t ec_gf8_mul_3D_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3D = {8,
+ {
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ },
+ ec_gf8_mul_3D_ops};
+
+static ec_gf_op_t ec_gf8_mul_3E_ops[] = {
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3E = {8,
+ {
+ 6,
+ 1,
+ 2,
+ 7,
+ 0,
+ 3,
+ 5,
+ 4,
+ },
+ ec_gf8_mul_3E_ops};
+
+static ec_gf_op_t ec_gf8_mul_3F_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_COPY, 10, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_COPY, 9, 2, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR3, 4, 9, 7},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 3, 10, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3F = {11,
+ {
+ 1,
+ 7,
+ 6,
+ 2,
+ 4,
+ 3,
+ 5,
+ 0,
+ 8,
+ 9,
+ 10,
+ },
+ ec_gf8_mul_3F_ops};
+
+static ec_gf_op_t ec_gf8_mul_40_ops[] = {
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR3, 8, 7, 6},
+ {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+ {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_40 = {9,
+ {
+ 5,
+ 7,
+ 4,
+ 6,
+ 2,
+ 3,
+ 0,
+ 1,
+ 8,
+ },
+ ec_gf8_mul_40_ops};
+
+static ec_gf_op_t ec_gf8_mul_41_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 8, 4, 0},
+ {EC_GF_OP_XOR2, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_41 = {9,
+ {
+ 0,
+ 7,
+ 6,
+ 5,
+ 3,
+ 4,
+ 8,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_41_ops};
+
+static ec_gf_op_t ec_gf8_mul_42_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_42 = {9,
+ {
+ 2,
+ 7,
+ 1,
+ 6,
+ 4,
+ 3,
+ 0,
+ 5,
+ 8,
+ },
+ ec_gf8_mul_42_ops};
+
+static ec_gf_op_t ec_gf8_mul_43_ops[] = {
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_43 = {8,
+ {
+ 2,
+ 6,
+ 4,
+ 1,
+ 7,
+ 3,
+ 0,
+ 5,
+ },
+ ec_gf8_mul_43_ops};
+
+static ec_gf_op_t ec_gf8_mul_44_ops[] = {
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_44 = {8,
+ {
+ 2,
+ 3,
+ 4,
+ 1,
+ 6,
+ 5,
+ 0,
+ 7,
+ },
+ ec_gf8_mul_44_ops};
+
+static ec_gf_op_t ec_gf8_mul_45_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_45 = {8,
+ {
+ 2,
+ 3,
+ 0,
+ 1,
+ 7,
+ 4,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_45_ops};
+
+static ec_gf_op_t ec_gf8_mul_46_ops[] = {
+ {EC_GF_OP_XOR3, 8, 2, 4}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 8, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_46 = {9,
+ {
+ 2,
+ 0,
+ 1,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ },
+ ec_gf8_mul_46_ops};
+
+static ec_gf_op_t ec_gf8_mul_47_ops[] = {
+ {EC_GF_OP_XOR3, 8, 0, 1}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_47 = {9,
+ {
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 8,
+ },
+ ec_gf8_mul_47_ops};
+
+static ec_gf_op_t ec_gf8_mul_48_ops[] = {
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_48 = {8,
+ {
+ 4,
+ 5,
+ 6,
+ 0,
+ 1,
+ 3,
+ 7,
+ 2,
+ },
+ ec_gf8_mul_48_ops};
+
+static ec_gf_op_t ec_gf8_mul_49_ops[] = {
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR3, 8, 0, 6},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR3, 1, 8, 5},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_49 = {9,
+ {
+ 7,
+ 2,
+ 4,
+ 0,
+ 3,
+ 5,
+ 1,
+ 6,
+ 8,
+ },
+ ec_gf8_mul_49_ops};
+
+static ec_gf_op_t ec_gf8_mul_4A_ops[] = {
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4A = {8,
+ {
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 3,
+ 4,
+ 2,
+ },
+ ec_gf8_mul_4A_ops};
+
+static ec_gf_op_t ec_gf8_mul_4B_ops[] = {
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR3, 8, 3, 7}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4B = {9,
+ {
+ 5,
+ 3,
+ 6,
+ 7,
+ 0,
+ 2,
+ 4,
+ 1,
+ 8,
+ },
+ ec_gf8_mul_4B_ops};
+
+static ec_gf_op_t ec_gf8_mul_4C_ops[] = {
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4C = {8,
+ {
+ 5,
+ 3,
+ 4,
+ 7,
+ 0,
+ 6,
+ 2,
+ 1,
+ },
+ ec_gf8_mul_4C_ops};
+
+static ec_gf_op_t ec_gf8_mul_4D_ops[] = {
+ {EC_GF_OP_COPY, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR3, 9, 3, 1},
+ {EC_GF_OP_XOR2, 5, 9, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR3, 0, 8, 2},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4D = {10,
+ {
+ 0,
+ 9,
+ 3,
+ 5,
+ 6,
+ 4,
+ 7,
+ 1,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_4D_ops};
+
+static ec_gf_op_t ec_gf8_mul_4E_ops[] = {
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4E = {8,
+ {
+ 2,
+ 3,
+ 0,
+ 1,
+ 5,
+ 6,
+ 7,
+ 4,
+ },
+ ec_gf8_mul_4E_ops};
+
+static ec_gf_op_t ec_gf8_mul_4F_ops[] = {
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4F = {8,
+ {
+ 0,
+ 3,
+ 5,
+ 6,
+ 1,
+ 2,
+ 7,
+ 4,
+ },
+ ec_gf8_mul_4F_ops};
+
+static ec_gf_op_t ec_gf8_mul_50_ops[] = {
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_50 = {8,
+ {
+ 4,
+ 5,
+ 7,
+ 3,
+ 0,
+ 1,
+ 2,
+ 6,
+ },
+ ec_gf8_mul_50_ops};
+
+static ec_gf_op_t ec_gf8_mul_51_ops[] = {
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_51 = {8,
+ {
+ 0,
+ 1,
+ 7,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_51_ops};
+
+static ec_gf_op_t ec_gf8_mul_52_ops[] = {
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_COPY, 9, 4, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR3, 3, 5, 8},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 2, 9, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_52 = {10,
+ {
+ 2,
+ 3,
+ 1,
+ 4,
+ 6,
+ 7,
+ 0,
+ 5,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_52_ops};
+
+static ec_gf_op_t ec_gf8_mul_53_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_53 = {8,
+ {
+ 2,
+ 0,
+ 1,
+ 4,
+ 5,
+ 6,
+ 7,
+ 3,
+ },
+ ec_gf8_mul_53_ops};
+
+static ec_gf_op_t ec_gf8_mul_54_ops[] = {
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_54 = {8,
+ {
+ 7,
+ 3,
+ 0,
+ 4,
+ 2,
+ 6,
+ 5,
+ 1,
+ },
+ ec_gf8_mul_54_ops};
+
+static ec_gf_op_t ec_gf8_mul_55_ops[] = {
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_55 = {8,
+ {
+ 1,
+ 5,
+ 6,
+ 4,
+ 3,
+ 7,
+ 2,
+ 0,
+ },
+ ec_gf8_mul_55_ops};
+
+static ec_gf_op_t ec_gf8_mul_56_ops[] = {
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_56 = {8,
+ {
+ 2,
+ 3,
+ 0,
+ 4,
+ 5,
+ 6,
+ 7,
+ 1,
+ },
+ ec_gf8_mul_56_ops};
+
+static ec_gf_op_t ec_gf8_mul_57_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_57 = {8,
+ {
+ 2,
+ 3,
+ 0,
+ 1,
+ 4,
+ 5,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_57_ops};
+
+static ec_gf_op_t ec_gf8_mul_58_ops[] = {
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_58 = {8,
+ {
+ 4,
+ 3,
+ 2,
+ 7,
+ 0,
+ 1,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_58_ops};
+
+static ec_gf_op_t ec_gf8_mul_59_ops[] = {
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_59 = {8,
+ {
+ 7,
+ 3,
+ 5,
+ 6,
+ 1,
+ 2,
+ 0,
+ 4,
+ },
+ ec_gf8_mul_59_ops};
+
+static ec_gf_op_t ec_gf8_mul_5A_ops[] = {
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5A = {8,
+ {
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 5,
+ 4,
+ },
+ ec_gf8_mul_5A_ops};
+
+static ec_gf_op_t ec_gf8_mul_5B_ops[] = {
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5B = {8,
+ {
+ 6,
+ 0,
+ 7,
+ 5,
+ 2,
+ 1,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_5B_ops};
+
+static ec_gf_op_t ec_gf8_mul_5C_ops[] = {
+ {EC_GF_OP_COPY, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5C = {9,
+ {
+ 7,
+ 5,
+ 2,
+ 4,
+ 1,
+ 0,
+ 6,
+ 3,
+ 8,
+ },
+ ec_gf8_mul_5C_ops};
+
+static ec_gf_op_t ec_gf8_mul_5D_ops[] = {
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5D = {8,
+ {
+ 1,
+ 3,
+ 5,
+ 4,
+ 6,
+ 7,
+ 2,
+ 0,
+ },
+ ec_gf8_mul_5D_ops};
+
+static ec_gf_op_t ec_gf8_mul_5E_ops[] = {
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5E = {8,
+ {
+ 4,
+ 3,
+ 6,
+ 2,
+ 5,
+ 7,
+ 0,
+ 1,
+ },
+ ec_gf8_mul_5E_ops};
+
+static ec_gf_op_t ec_gf8_mul_5F_ops[] = {
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5F = {8,
+ {
+ 6,
+ 1,
+ 3,
+ 4,
+ 5,
+ 7,
+ 2,
+ 0,
+ },
+ ec_gf8_mul_5F_ops};
+
+static ec_gf_op_t ec_gf8_mul_60_ops[] = {
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_60 = {8,
+ {
+ 2,
+ 3,
+ 4,
+ 7,
+ 5,
+ 6,
+ 0,
+ 1,
+ },
+ ec_gf8_mul_60_ops};
+
+static ec_gf_op_t ec_gf8_mul_61_ops[] = {
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_61 = {8,
+ {
+ 0,
+ 5,
+ 6,
+ 7,
+ 4,
+ 2,
+ 1,
+ 3,
+ },
+ ec_gf8_mul_61_ops};
+
+static ec_gf_op_t ec_gf8_mul_62_ops[] = {
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_62 = {8,
+ {
+ 2,
+ 0,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 1,
+ },
+ ec_gf8_mul_62_ops};
+
+static ec_gf_op_t ec_gf8_mul_63_ops[] = {
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_63 = {8,
+ {
+ 3,
+ 4,
+ 6,
+ 5,
+ 7,
+ 0,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_63_ops};
+
+static ec_gf_op_t ec_gf8_mul_64_ops[] = {
+ {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 0, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 8, 7, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_64 = {9,
+ {
+ 2,
+ 3,
+ 4,
+ 6,
+ 5,
+ 7,
+ 8,
+ 1,
+ 0,
+ },
+ ec_gf8_mul_64_ops};
+
+static ec_gf_op_t ec_gf8_mul_65_ops[] = {
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_65 = {8,
+ {
+ 2,
+ 5,
+ 1,
+ 3,
+ 4,
+ 0,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_65_ops};
+
+static ec_gf_op_t ec_gf8_mul_66_ops[] = {
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_66 = {8,
+ {
+ 2,
+ 3,
+ 1,
+ 4,
+ 5,
+ 7,
+ 0,
+ 6,
+ },
+ ec_gf8_mul_66_ops};
+
+static ec_gf_op_t ec_gf8_mul_67_ops[] = {
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_67 = {8,
+ {
+ 2,
+ 4,
+ 5,
+ 6,
+ 7,
+ 3,
+ 1,
+ 0,
+ },
+ ec_gf8_mul_67_ops};
+
+static ec_gf_op_t ec_gf8_mul_68_ops[] = {
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_68 = {8,
+ {
+ 5,
+ 7,
+ 2,
+ 3,
+ 0,
+ 6,
+ 4,
+ 1,
+ },
+ ec_gf8_mul_68_ops};
+
+static ec_gf_op_t ec_gf8_mul_69_ops[] = {
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_69 = {8,
+ {
+ 0,
+ 1,
+ 3,
+ 2,
+ 4,
+ 5,
+ 7,
+ 6,
+ },
+ ec_gf8_mul_69_ops};
+
+static ec_gf_op_t ec_gf8_mul_6A_ops[] = {
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6A = {8,
+ {
+ 5,
+ 7,
+ 4,
+ 6,
+ 1,
+ 2,
+ 0,
+ 3,
+ },
+ ec_gf8_mul_6A_ops};
+
+static ec_gf_op_t ec_gf8_mul_6B_ops[] = {
+ {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6B = {9,
+ {
+ 6,
+ 7,
+ 2,
+ 0,
+ 3,
+ 1,
+ 5,
+ 4,
+ 8,
+ },
+ ec_gf8_mul_6B_ops};
+
+static ec_gf_op_t ec_gf8_mul_6C_ops[] = {
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6C = {8,
+ {
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_6C_ops};
+
+static ec_gf_op_t ec_gf8_mul_6D_ops[] = {
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR3, 8, 3, 4}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6D = {9,
+ {
+ 3,
+ 6,
+ 7,
+ 0,
+ 4,
+ 5,
+ 1,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_6D_ops};
+
+static ec_gf_op_t ec_gf8_mul_6E_ops[] = {
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6E = {8,
+ {
+ 5,
+ 6,
+ 3,
+ 1,
+ 7,
+ 2,
+ 0,
+ 4,
+ },
+ ec_gf8_mul_6E_ops};
+
+static ec_gf_op_t ec_gf8_mul_6F_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR3, 0, 8, 7}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6F = {9,
+ {
+ 2,
+ 6,
+ 3,
+ 7,
+ 0,
+ 1,
+ 4,
+ 5,
+ 8,
+ },
+ ec_gf8_mul_6F_ops};
+
+static ec_gf_op_t ec_gf8_mul_70_ops[] = {
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_70 = {8,
+ {
+ 3,
+ 4,
+ 5,
+ 2,
+ 6,
+ 0,
+ 1,
+ 7,
+ },
+ ec_gf8_mul_70_ops};
+
+static ec_gf_op_t ec_gf8_mul_71_ops[] = {
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_71 = {8,
+ {
+ 4,
+ 7,
+ 5,
+ 3,
+ 6,
+ 0,
+ 2,
+ 1,
+ },
+ ec_gf8_mul_71_ops};
+
+static ec_gf_op_t ec_gf8_mul_72_ops[] = {
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_72 = {8,
+ {
+ 0,
+ 5,
+ 2,
+ 7,
+ 4,
+ 1,
+ 3,
+ 6,
+ },
+ ec_gf8_mul_72_ops};
+
+static ec_gf_op_t ec_gf8_mul_73_ops[] = {
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_73 = {8,
+ {
+ 6,
+ 0,
+ 1,
+ 7,
+ 4,
+ 5,
+ 2,
+ 3,
+ },
+ ec_gf8_mul_73_ops};
+
+static ec_gf_op_t ec_gf8_mul_74_ops[] = {
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_74 = {8,
+ {
+ 3,
+ 2,
+ 1,
+ 0,
+ 4,
+ 5,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_74_ops};
+
+static ec_gf_op_t ec_gf8_mul_75_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_75 = {8,
+ {
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ },
+ ec_gf8_mul_75_ops};
+
+static ec_gf_op_t ec_gf8_mul_76_ops[] = {
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR3, 8, 6, 2},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 8, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_76 = {9,
+ {
+ 2,
+ 3,
+ 0,
+ 6,
+ 5,
+ 1,
+ 7,
+ 8,
+ 4,
+ },
+ ec_gf8_mul_76_ops};
+
+static ec_gf_op_t ec_gf8_mul_77_ops[] = {
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_77 = {8,
+ {
+ 7,
+ 4,
+ 3,
+ 6,
+ 0,
+ 1,
+ 5,
+ 2,
+ },
+ ec_gf8_mul_77_ops};
+
+static ec_gf_op_t ec_gf8_mul_78_ops[] = {
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR3, 8, 0, 2},
+ {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_78 = {9,
+ {
+ 4,
+ 7,
+ 3,
+ 2,
+ 5,
+ 1,
+ 6,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_78_ops};
+
+static ec_gf_op_t ec_gf8_mul_79_ops[] = {
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR3, 8, 4, 7},
+ {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_79 = {9,
+ {
+ 4,
+ 5,
+ 7,
+ 3,
+ 1,
+ 6,
+ 2,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_79_ops};
+
+static ec_gf_op_t ec_gf8_mul_7A_ops[] = {
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7A = {8,
+ {
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ },
+ ec_gf8_mul_7A_ops};
+
+static ec_gf_op_t ec_gf8_mul_7B_ops[] = {
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR3, 8, 5, 3},
+ {EC_GF_OP_XOR2, 8, 0, 0}, {EC_GF_OP_COPY, 9, 4, 0},
+ {EC_GF_OP_XOR2, 8, 2, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR3, 4, 1, 9},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7B = {10,
+ {
+ 1,
+ 2,
+ 3,
+ 4,
+ 8,
+ 5,
+ 6,
+ 0,
+ 7,
+ 9,
+ },
+ ec_gf8_mul_7B_ops};
+
+static ec_gf_op_t ec_gf8_mul_7C_ops[] = {
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7C = {8,
+ {
+ 2,
+ 4,
+ 1,
+ 6,
+ 3,
+ 5,
+ 7,
+ 0,
+ },
+ ec_gf8_mul_7C_ops};
+
+static ec_gf_op_t ec_gf8_mul_7D_ops[] = {
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7D = {8,
+ {
+ 1,
+ 0,
+ 3,
+ 5,
+ 6,
+ 7,
+ 2,
+ 4,
+ },
+ ec_gf8_mul_7D_ops};
+
+static ec_gf_op_t ec_gf8_mul_7E_ops[] = {
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_COPY, 8, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR3, 6, 2, 7},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7E = {9,
+ {
+ 5,
+ 1,
+ 2,
+ 0,
+ 7,
+ 3,
+ 4,
+ 6,
+ 8,
+ },
+ ec_gf8_mul_7E_ops};
+
+static ec_gf_op_t ec_gf8_mul_7F_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR3, 9, 7, 5}, {EC_GF_OP_XOR2, 2, 9, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 9, 0},
+ {EC_GF_OP_XOR3, 9, 6, 4}, {EC_GF_OP_XOR2, 7, 9, 0},
+ {EC_GF_OP_XOR2, 3, 9, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7F = {10,
+ {
+ 4,
+ 1,
+ 0,
+ 5,
+ 6,
+ 7,
+ 2,
+ 3,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_7F_ops};
+
+static ec_gf_op_t ec_gf8_mul_80_ops[] = {
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_80 = {8,
+ {
+ 7,
+ 5,
+ 6,
+ 4,
+ 1,
+ 2,
+ 3,
+ 0,
+ },
+ ec_gf8_mul_80_ops};
+
+static ec_gf_op_t ec_gf8_mul_81_ops[] = {
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_81 = {8,
+ {
+ 2,
+ 7,
+ 4,
+ 1,
+ 5,
+ 6,
+ 3,
+ 0,
+ },
+ ec_gf8_mul_81_ops};
+
+static ec_gf_op_t ec_gf8_mul_82_ops[] = {
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_COPY, 8, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR3, 5, 8, 7}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_82 = {9,
+ {
+ 6,
+ 2,
+ 7,
+ 5,
+ 1,
+ 3,
+ 4,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_82_ops};
+
+static ec_gf_op_t ec_gf8_mul_83_ops[] = {
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_83 = {8,
+ {
+ 3,
+ 5,
+ 6,
+ 7,
+ 1,
+ 2,
+ 4,
+ 0,
+ },
+ ec_gf8_mul_83_ops};
+
+static ec_gf_op_t ec_gf8_mul_84_ops[] = {
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_84 = {8,
+ {
+ 7,
+ 6,
+ 0,
+ 4,
+ 1,
+ 5,
+ 3,
+ 2,
+ },
+ ec_gf8_mul_84_ops};
+
+static ec_gf_op_t ec_gf8_mul_85_ops[] = {
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_85 = {8,
+ {
+ 7,
+ 6,
+ 0,
+ 3,
+ 2,
+ 4,
+ 5,
+ 1,
+ },
+ ec_gf8_mul_85_ops};
+
+static ec_gf_op_t ec_gf8_mul_86_ops[] = {
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_86 = {8,
+ {
+ 1,
+ 2,
+ 6,
+ 4,
+ 5,
+ 7,
+ 3,
+ 0,
+ },
+ ec_gf8_mul_86_ops};
+
+static ec_gf_op_t ec_gf8_mul_87_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_COPY, 8, 1, 0},
+ {EC_GF_OP_XOR2, 8, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR3, 5, 8, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_87 = {9,
+ {
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 7,
+ 6,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_87_ops};
+
+static ec_gf_op_t ec_gf8_mul_88_ops[] = {
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_88 = {8,
+ {
+ 6,
+ 7,
+ 3,
+ 1,
+ 2,
+ 4,
+ 5,
+ 0,
+ },
+ ec_gf8_mul_88_ops};
+
+static ec_gf_op_t ec_gf8_mul_89_ops[] = {
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR3, 8, 5, 2},
+ {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_89 = {9,
+ {
+ 2,
+ 1,
+ 6,
+ 5,
+ 7,
+ 3,
+ 4,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_89_ops};
+
+static ec_gf_op_t ec_gf8_mul_8A_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8A = {8,
+ {
+ 1,
+ 2,
+ 3,
+ 0,
+ 6,
+ 7,
+ 4,
+ 5,
+ },
+ ec_gf8_mul_8A_ops};
+
+static ec_gf_op_t ec_gf8_mul_8B_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8B = {8,
+ {
+ 6,
+ 1,
+ 2,
+ 3,
+ 5,
+ 7,
+ 4,
+ 0,
+ },
+ ec_gf8_mul_8B_ops};
+
+static ec_gf_op_t ec_gf8_mul_8C_ops[] = {
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8C = {8,
+ {
+ 1,
+ 2,
+ 0,
+ 7,
+ 3,
+ 4,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_8C_ops};
+
+static ec_gf_op_t ec_gf8_mul_8D_ops[] = {
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8D = {8,
+ {
+ 7,
+ 1,
+ 3,
+ 2,
+ 4,
+ 5,
+ 0,
+ 6,
+ },
+ ec_gf8_mul_8D_ops};
+
+static ec_gf_op_t ec_gf8_mul_8E_ops[] = {{EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8E = {8,
+ {
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ },
+ ec_gf8_mul_8E_ops};
+
+static ec_gf_op_t ec_gf8_mul_8F_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8F = {8,
+ {
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ },
+ ec_gf8_mul_8F_ops};
+
+static ec_gf_op_t ec_gf8_mul_90_ops[] = {
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_90 = {8,
+ {
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 3,
+ 2,
+ },
+ ec_gf8_mul_90_ops};
+
+static ec_gf_op_t ec_gf8_mul_91_ops[] = {
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_COPY, 9, 1, 0}, {EC_GF_OP_COPY, 8, 3, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 9, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR3, 5, 8, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_91 = {10,
+ {
+ 2,
+ 3,
+ 1,
+ 4,
+ 0,
+ 6,
+ 7,
+ 5,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_91_ops};
+
+static ec_gf_op_t ec_gf8_mul_92_ops[] = {
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_92 = {8,
+ {
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 5,
+ 4,
+ },
+ ec_gf8_mul_92_ops};
+
+static ec_gf_op_t ec_gf8_mul_93_ops[] = {
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_93 = {8,
+ {
+ 6,
+ 4,
+ 5,
+ 1,
+ 7,
+ 2,
+ 3,
+ 0,
+ },
+ ec_gf8_mul_93_ops};
+
+static ec_gf_op_t ec_gf8_mul_94_ops[] = {
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_94 = {8,
+ {
+ 7,
+ 5,
+ 0,
+ 2,
+ 6,
+ 1,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_94_ops};
+
+static ec_gf_op_t ec_gf8_mul_95_ops[] = {
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_95 = {8,
+ {
+ 7,
+ 6,
+ 1,
+ 3,
+ 0,
+ 4,
+ 5,
+ 2,
+ },
+ ec_gf8_mul_95_ops};
+
+static ec_gf_op_t ec_gf8_mul_96_ops[] = {
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR3, 8, 0, 4}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_96 = {9,
+ {
+ 4,
+ 0,
+ 1,
+ 6,
+ 7,
+ 2,
+ 3,
+ 5,
+ 8,
+ },
+ ec_gf8_mul_96_ops};
+
+static ec_gf_op_t ec_gf8_mul_97_ops[] = {
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_COPY, 8, 2, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 8, 6, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_97 = {9,
+ {
+ 4,
+ 5,
+ 3,
+ 6,
+ 7,
+ 1,
+ 2,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_97_ops};
+
+static ec_gf_op_t ec_gf8_mul_98_ops[] = {
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_98 = {8,
+ {
+ 4,
+ 2,
+ 3,
+ 6,
+ 7,
+ 5,
+ 1,
+ 0,
+ },
+ ec_gf8_mul_98_ops};
+
+static ec_gf_op_t ec_gf8_mul_99_ops[] = {
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_99 = {8,
+ {
+ 6,
+ 5,
+ 3,
+ 7,
+ 0,
+ 1,
+ 4,
+ 2,
+ },
+ ec_gf8_mul_99_ops};
+
+static ec_gf_op_t ec_gf8_mul_9A_ops[] = {
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR3, 8, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9A = {9,
+ {
+ 6,
+ 3,
+ 4,
+ 0,
+ 5,
+ 1,
+ 2,
+ 7,
+ 8,
+ },
+ ec_gf8_mul_9A_ops};
+
+static ec_gf_op_t ec_gf8_mul_9B_ops[] = {
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_COPY, 9, 5, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR3, 8, 3, 2}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 3, 9, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9B = {10,
+ {
+ 4,
+ 5,
+ 8,
+ 6,
+ 7,
+ 1,
+ 2,
+ 0,
+ 3,
+ 9,
+ },
+ ec_gf8_mul_9B_ops};
+
+static ec_gf_op_t ec_gf8_mul_9C_ops[] = {
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9C = {8,
+ {
+ 3,
+ 2,
+ 1,
+ 0,
+ 4,
+ 5,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_9C_ops};
+
+static ec_gf_op_t ec_gf8_mul_9D_ops[] = {
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9D = {8,
+ {
+ 0,
+ 1,
+ 2,
+ 3,
+ 7,
+ 4,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_9D_ops};
+
+static ec_gf_op_t ec_gf8_mul_9E_ops[] = {
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_COPY, 8, 7, 0},
+ {EC_GF_OP_XOR2, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9E = {9,
+ {
+ 4,
+ 5,
+ 3,
+ 8,
+ 6,
+ 0,
+ 2,
+ 7,
+ 1,
+ },
+ ec_gf8_mul_9E_ops};
+
+static ec_gf_op_t ec_gf8_mul_9F_ops[] = {
+ {EC_GF_OP_XOR3, 8, 1, 2}, {EC_GF_OP_XOR2, 8, 3, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9F = {9,
+ {
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 8,
+ },
+ ec_gf8_mul_9F_ops};
+
+static ec_gf_op_t ec_gf8_mul_A0_ops[] = {
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A0 = {8,
+ {
+ 3,
+ 1,
+ 6,
+ 7,
+ 5,
+ 2,
+ 4,
+ 0,
+ },
+ ec_gf8_mul_A0_ops};
+
+static ec_gf_op_t ec_gf8_mul_A1_ops[] = {
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR3, 8, 0, 6},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+ {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A1 = {9,
+ {
+ 7,
+ 4,
+ 1,
+ 5,
+ 6,
+ 0,
+ 2,
+ 3,
+ 8,
+ },
+ ec_gf8_mul_A1_ops};
+
+static ec_gf_op_t ec_gf8_mul_A2_ops[] = {
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A2 = {8,
+ {
+ 7,
+ 0,
+ 6,
+ 3,
+ 2,
+ 1,
+ 4,
+ 5,
+ },
+ ec_gf8_mul_A2_ops};
+
+static ec_gf_op_t ec_gf8_mul_A3_ops[] = {
+ {EC_GF_OP_COPY, 8, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A3 = {9,
+ {
+ 3,
+ 7,
+ 2,
+ 6,
+ 1,
+ 4,
+ 0,
+ 5,
+ 8,
+ },
+ ec_gf8_mul_A3_ops};
+
+static ec_gf_op_t ec_gf8_mul_A4_ops[] = {
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A4 = {8,
+ {
+ 5,
+ 6,
+ 7,
+ 2,
+ 4,
+ 3,
+ 0,
+ 1,
+ },
+ ec_gf8_mul_A4_ops};
+
+static ec_gf_op_t ec_gf8_mul_A5_ops[] = {
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR3, 8, 5, 6}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A5 = {9,
+ {
+ 1,
+ 4,
+ 2,
+ 5,
+ 6,
+ 7,
+ 3,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_A5_ops};
+
+static ec_gf_op_t ec_gf8_mul_A6_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A6 = {8,
+ {
+ 1,
+ 2,
+ 0,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_A6_ops};
+
+static ec_gf_op_t ec_gf8_mul_A7_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A7 = {8,
+ {
+ 0,
+ 1,
+ 2,
+ 5,
+ 6,
+ 7,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_A7_ops};
+
+static ec_gf_op_t ec_gf8_mul_A8_ops[] = {
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 8, 1, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_COPY, 9, 4, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 8, 3, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 2, 9, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A8 = {10,
+ {
+ 1,
+ 7,
+ 5,
+ 8,
+ 6,
+ 3,
+ 4,
+ 0,
+ 2,
+ 9,
+ },
+ ec_gf8_mul_A8_ops};
+
+static ec_gf_op_t ec_gf8_mul_A9_ops[] = {
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A9 = {8,
+ {
+ 3,
+ 7,
+ 6,
+ 1,
+ 2,
+ 0,
+ 4,
+ 5,
+ },
+ ec_gf8_mul_A9_ops};
+
+static ec_gf_op_t ec_gf8_mul_AA_ops[] = {
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AA = {8,
+ {
+ 0,
+ 4,
+ 5,
+ 3,
+ 6,
+ 7,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_AA_ops};
+
+static ec_gf_op_t ec_gf8_mul_AB_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_COPY, 9, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR3, 3, 9, 7},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AB = {10,
+ {
+ 2,
+ 3,
+ 8,
+ 0,
+ 5,
+ 6,
+ 1,
+ 4,
+ 7,
+ 9,
+ },
+ ec_gf8_mul_AB_ops};
+
+static ec_gf_op_t ec_gf8_mul_AC_ops[] = {
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AC = {8,
+ {
+ 3,
+ 2,
+ 1,
+ 0,
+ 4,
+ 5,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_AC_ops};
+
+static ec_gf_op_t ec_gf8_mul_AD_ops[] = {
+ {EC_GF_OP_XOR3, 8, 1, 2}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+ {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AD = {9,
+ {
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_AD_ops};
+
+static ec_gf_op_t ec_gf8_mul_AE_ops[] = {
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_COPY, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AE = {9,
+ {
+ 7,
+ 0,
+ 5,
+ 6,
+ 3,
+ 4,
+ 1,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_AE_ops};
+
+static ec_gf_op_t ec_gf8_mul_AF_ops[] = {
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AF = {8,
+ {
+ 0,
+ 1,
+ 2,
+ 7,
+ 3,
+ 4,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_AF_ops};
+
+static ec_gf_op_t ec_gf8_mul_B0_ops[] = {
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B0 = {8,
+ {
+ 4,
+ 0,
+ 7,
+ 2,
+ 3,
+ 1,
+ 6,
+ 5,
+ },
+ ec_gf8_mul_B0_ops};
+
+static ec_gf_op_t ec_gf8_mul_B1_ops[] = {
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_COPY, 8, 4, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR3, 5, 8, 1}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B1 = {9,
+ {
+ 2,
+ 6,
+ 4,
+ 7,
+ 0,
+ 1,
+ 3,
+ 5,
+ 8,
+ },
+ ec_gf8_mul_B1_ops};
+
+static ec_gf_op_t ec_gf8_mul_B2_ops[] = {
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR3, 8, 4, 5},
+ {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 8, 1, 0},
+ {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B2 = {9,
+ {
+ 0,
+ 7,
+ 4,
+ 5,
+ 6,
+ 1,
+ 2,
+ 3,
+ 8,
+ },
+ ec_gf8_mul_B2_ops};
+
+static ec_gf_op_t ec_gf8_mul_B3_ops[] = {
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_COPY, 9, 5, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR3, 8, 6, 4},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 8, 5, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+ {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR3, 1, 9, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B3 = {10,
+ {
+ 2,
+ 3,
+ 4,
+ 5,
+ 1,
+ 6,
+ 0,
+ 7,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_B3_ops};
+
+static ec_gf_op_t ec_gf8_mul_B4_ops[] = {
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B4 = {8,
+ {
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_B4_ops};
+
+static ec_gf_op_t ec_gf8_mul_B5_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_COPY, 8, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR3, 4, 8, 3}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B5 = {9,
+ {
+ 3,
+ 4,
+ 0,
+ 7,
+ 1,
+ 5,
+ 6,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_B5_ops};
+
+static ec_gf_op_t ec_gf8_mul_B6_ops[] = {
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B6 = {8,
+ {
+ 5,
+ 3,
+ 6,
+ 4,
+ 7,
+ 0,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_B6_ops};
+
+static ec_gf_op_t ec_gf8_mul_B7_ops[] = {
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B7 = {8,
+ {
+ 5,
+ 0,
+ 1,
+ 4,
+ 2,
+ 6,
+ 7,
+ 3,
+ },
+ ec_gf8_mul_B7_ops};
+
+static ec_gf_op_t ec_gf8_mul_B8_ops[] = {
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B8 = {8,
+ {
+ 6,
+ 4,
+ 5,
+ 1,
+ 2,
+ 0,
+ 7,
+ 3,
+ },
+ ec_gf8_mul_B8_ops};
+
+static ec_gf_op_t ec_gf8_mul_B9_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR3, 0, 8, 2}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B9 = {9,
+ {
+ 6,
+ 7,
+ 0,
+ 2,
+ 1,
+ 4,
+ 5,
+ 3,
+ 8,
+ },
+ ec_gf8_mul_B9_ops};
+
+static ec_gf_op_t ec_gf8_mul_BA_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BA = {8,
+ {
+ 1,
+ 2,
+ 4,
+ 3,
+ 5,
+ 6,
+ 0,
+ 7,
+ },
+ ec_gf8_mul_BA_ops};
+
+static ec_gf_op_t ec_gf8_mul_BB_ops[] = {
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_COPY, 8, 3, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 8, 5, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BB = {9,
+ {
+ 7,
+ 2,
+ 1,
+ 8,
+ 3,
+ 5,
+ 6,
+ 4,
+ 0,
+ },
+ ec_gf8_mul_BB_ops};
+
+static ec_gf_op_t ec_gf8_mul_BC_ops[] = {
+ {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 2, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR3, 2, 8, 4},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BC = {9,
+ {
+ 2,
+ 6,
+ 3,
+ 4,
+ 5,
+ 1,
+ 7,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_BC_ops};
+
+static ec_gf_op_t ec_gf8_mul_BD_ops[] = {
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BD = {8,
+ {
+ 4,
+ 5,
+ 0,
+ 2,
+ 7,
+ 1,
+ 6,
+ 3,
+ },
+ ec_gf8_mul_BD_ops};
+
+static ec_gf_op_t ec_gf8_mul_BE_ops[] = {
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BE = {8,
+ {
+ 0,
+ 6,
+ 7,
+ 4,
+ 5,
+ 1,
+ 3,
+ 2,
+ },
+ ec_gf8_mul_BE_ops};
+
+static ec_gf_op_t ec_gf8_mul_BF_ops[] = {
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BF = {8,
+ {
+ 5,
+ 6,
+ 1,
+ 7,
+ 3,
+ 0,
+ 2,
+ 4,
+ },
+ ec_gf8_mul_BF_ops};
+
+static ec_gf_op_t ec_gf8_mul_C0_ops[] = {
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C0 = {8,
+ {
+ 1,
+ 2,
+ 3,
+ 4,
+ 7,
+ 5,
+ 6,
+ 0,
+ },
+ ec_gf8_mul_C0_ops};
+
+static ec_gf_op_t ec_gf8_mul_C1_ops[] = {
+ {EC_GF_OP_XOR3, 8, 1, 2}, {EC_GF_OP_XOR2, 8, 3, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C1 = {9,
+ {
+ 5,
+ 6,
+ 7,
+ 4,
+ 1,
+ 2,
+ 3,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_C1_ops};
+
+static ec_gf_op_t ec_gf8_mul_C2_ops[] = {
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C2 = {8,
+ {
+ 7,
+ 6,
+ 3,
+ 0,
+ 1,
+ 4,
+ 5,
+ 2,
+ },
+ ec_gf8_mul_C2_ops};
+
+static ec_gf_op_t ec_gf8_mul_C3_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR3, 0, 2, 6}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR3, 9, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 7, 9, 0},
+ {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C3 = {10,
+ {
+ 5,
+ 6,
+ 4,
+ 7,
+ 1,
+ 2,
+ 3,
+ 0,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_C3_ops};
+
+static ec_gf_op_t ec_gf8_mul_C4_ops[] = {
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C4 = {8,
+ {
+ 0,
+ 2,
+ 1,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_C4_ops};
+
+static ec_gf_op_t ec_gf8_mul_C5_ops[] = {
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C5 = {8,
+ {
+ 4,
+ 3,
+ 5,
+ 7,
+ 6,
+ 2,
+ 0,
+ 1,
+ },
+ ec_gf8_mul_C5_ops};
+
+static ec_gf_op_t ec_gf8_mul_C6_ops[] = {
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_COPY, 8, 4, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR3, 9, 5, 4},
+ {EC_GF_OP_XOR2, 6, 9, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 7, 9, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 8, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C6 = {10,
+ {
+ 6,
+ 3,
+ 0,
+ 4,
+ 5,
+ 7,
+ 2,
+ 1,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_C6_ops};
+
+static ec_gf_op_t ec_gf8_mul_C7_ops[] = {
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C7 = {8,
+ {
+ 7,
+ 0,
+ 6,
+ 2,
+ 5,
+ 3,
+ 4,
+ 1,
+ },
+ ec_gf8_mul_C7_ops};
+
+static ec_gf_op_t ec_gf8_mul_C8_ops[] = {
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C8 = {8,
+ {
+ 1,
+ 3,
+ 2,
+ 4,
+ 6,
+ 7,
+ 5,
+ 0,
+ },
+ ec_gf8_mul_C8_ops};
+
+static ec_gf_op_t ec_gf8_mul_C9_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C9 = {8,
+ {
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ },
+ ec_gf8_mul_C9_ops};
+
+static ec_gf_op_t ec_gf8_mul_CA_ops[] = {
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CA = {8,
+ {
+ 1,
+ 2,
+ 5,
+ 7,
+ 3,
+ 4,
+ 0,
+ 6,
+ },
+ ec_gf8_mul_CA_ops};
+
+static ec_gf_op_t ec_gf8_mul_CB_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CB = {8,
+ {
+ 2,
+ 3,
+ 4,
+ 5,
+ 7,
+ 6,
+ 0,
+ 1,
+ },
+ ec_gf8_mul_CB_ops};
+
+static ec_gf_op_t ec_gf8_mul_CC_ops[] = {
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CC = {8,
+ {
+ 2,
+ 7,
+ 1,
+ 0,
+ 5,
+ 6,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_CC_ops};
+
+static ec_gf_op_t ec_gf8_mul_CD_ops[] = {
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CD = {8,
+ {
+ 0,
+ 6,
+ 1,
+ 2,
+ 7,
+ 3,
+ 4,
+ 5,
+ },
+ ec_gf8_mul_CD_ops};
+
+static ec_gf_op_t ec_gf8_mul_CE_ops[] = {
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_COPY, 8, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR3, 3, 6, 8},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR3, 8, 2, 3},
+ {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CE = {9,
+ {
+ 5,
+ 7,
+ 3,
+ 0,
+ 2,
+ 6,
+ 4,
+ 1,
+ 8,
+ },
+ ec_gf8_mul_CE_ops};
+
+static ec_gf_op_t ec_gf8_mul_CF_ops[] = {
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CF = {8,
+ {
+ 3,
+ 6,
+ 7,
+ 0,
+ 2,
+ 4,
+ 5,
+ 1,
+ },
+ ec_gf8_mul_CF_ops};
+
+static ec_gf_op_t ec_gf8_mul_D0_ops[] = {
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D0 = {8,
+ {
+ 5,
+ 6,
+ 7,
+ 2,
+ 0,
+ 3,
+ 1,
+ 4,
+ },
+ ec_gf8_mul_D0_ops};
+
+static ec_gf_op_t ec_gf8_mul_D1_ops[] = {
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR3, 8, 6, 0},
+ {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D1 = {9,
+ {
+ 5,
+ 6,
+ 3,
+ 2,
+ 0,
+ 7,
+ 4,
+ 1,
+ 8,
+ },
+ ec_gf8_mul_D1_ops};
+
+static ec_gf_op_t ec_gf8_mul_D2_ops[] = {
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D2 = {8,
+ {
+ 7,
+ 0,
+ 2,
+ 1,
+ 3,
+ 4,
+ 6,
+ 5,
+ },
+ ec_gf8_mul_D2_ops};
+
+static ec_gf_op_t ec_gf8_mul_D3_ops[] = {
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_COPY, 8, 4, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 8, 6, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D3 = {9,
+ {
+ 0,
+ 3,
+ 2,
+ 8,
+ 4,
+ 6,
+ 7,
+ 1,
+ 5,
+ },
+ ec_gf8_mul_D3_ops};
+
+static ec_gf_op_t ec_gf8_mul_D4_ops[] = {
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_COPY, 8, 1, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR3, 1, 7, 8},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D4 = {9,
+ {
+ 4,
+ 1,
+ 7,
+ 5,
+ 0,
+ 6,
+ 3,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_D4_ops};
+
+static ec_gf_op_t ec_gf8_mul_D5_ops[] = {
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D5 = {8,
+ {
+ 6,
+ 7,
+ 4,
+ 5,
+ 2,
+ 3,
+ 1,
+ 0,
+ },
+ ec_gf8_mul_D5_ops};
+
+static ec_gf_op_t ec_gf8_mul_D6_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D6 = {9,
+ {
+ 0,
+ 6,
+ 2,
+ 7,
+ 1,
+ 3,
+ 4,
+ 5,
+ 8,
+ },
+ ec_gf8_mul_D6_ops};
+
+static ec_gf_op_t ec_gf8_mul_D7_ops[] = {
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR3, 8, 3, 5}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR3, 6, 7, 8}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D7 = {9,
+ {
+ 3,
+ 4,
+ 6,
+ 5,
+ 0,
+ 7,
+ 1,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_D7_ops};
+
+static ec_gf_op_t ec_gf8_mul_D8_ops[] = {
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D8 = {8,
+ {
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ },
+ ec_gf8_mul_D8_ops};
+
+static ec_gf_op_t ec_gf8_mul_D9_ops[] = {
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D9 = {8,
+ {
+ 1,
+ 2,
+ 6,
+ 7,
+ 4,
+ 5,
+ 0,
+ 3,
+ },
+ ec_gf8_mul_D9_ops};
+
+static ec_gf_op_t ec_gf8_mul_DA_ops[] = {
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR3, 8, 2, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+ {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DA = {9,
+ {
+ 2,
+ 5,
+ 7,
+ 1,
+ 0,
+ 4,
+ 3,
+ 6,
+ 8,
+ },
+ ec_gf8_mul_DA_ops};
+
+static ec_gf_op_t ec_gf8_mul_DB_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 8, 4, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DB = {9,
+ {
+ 7,
+ 5,
+ 6,
+ 2,
+ 3,
+ 4,
+ 1,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_DB_ops};
+
+static ec_gf_op_t ec_gf8_mul_DC_ops[] = {
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DC = {8,
+ {
+ 4,
+ 5,
+ 2,
+ 6,
+ 7,
+ 1,
+ 0,
+ 3,
+ },
+ ec_gf8_mul_DC_ops};
+
+static ec_gf_op_t ec_gf8_mul_DD_ops[] = {
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DD = {8,
+ {
+ 1,
+ 2,
+ 3,
+ 6,
+ 7,
+ 0,
+ 4,
+ 5,
+ },
+ ec_gf8_mul_DD_ops};
+
+static ec_gf_op_t ec_gf8_mul_DE_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DE = {8,
+ {
+ 0,
+ 5,
+ 2,
+ 6,
+ 7,
+ 1,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_DE_ops};
+
+static ec_gf_op_t ec_gf8_mul_DF_ops[] = {
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 8, 3, 0},
+ {EC_GF_OP_COPY, 9, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR3, 1, 9, 2}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DF = {10,
+ {
+ 7,
+ 2,
+ 8,
+ 4,
+ 3,
+ 1,
+ 0,
+ 6,
+ 5,
+ 9,
+ },
+ ec_gf8_mul_DF_ops};
+
+static ec_gf_op_t ec_gf8_mul_E0_ops[] = {
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+ {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E0 = {8,
+ {
+ 2,
+ 3,
+ 4,
+ 7,
+ 5,
+ 6,
+ 0,
+ 1,
+ },
+ ec_gf8_mul_E0_ops};
+
+static ec_gf_op_t ec_gf8_mul_E1_ops[] = {
+ {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 7, 0},
+ {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR3, 9, 5, 3},
+ {EC_GF_OP_XOR2, 0, 9, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 4, 9, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E1 = {10,
+ {
+ 0,
+ 7,
+ 1,
+ 3,
+ 4,
+ 5,
+ 6,
+ 2,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_E1_ops};
+
+static ec_gf_op_t ec_gf8_mul_E2_ops[] = {
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E2 = {8,
+ {
+ 2,
+ 3,
+ 7,
+ 1,
+ 5,
+ 6,
+ 0,
+ 4,
+ },
+ ec_gf8_mul_E2_ops};
+
+static ec_gf_op_t ec_gf8_mul_E3_ops[] = {
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+ {EC_GF_OP_XOR3, 8, 2, 7}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR3, 6, 8, 4},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E3 = {9,
+ {
+ 5,
+ 4,
+ 7,
+ 2,
+ 1,
+ 3,
+ 6,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_E3_ops};
+
+static ec_gf_op_t ec_gf8_mul_E4_ops[] = {
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+ {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E4 = {8,
+ {
+ 7,
+ 0,
+ 1,
+ 6,
+ 3,
+ 4,
+ 2,
+ 5,
+ },
+ ec_gf8_mul_E4_ops};
+
+static ec_gf_op_t ec_gf8_mul_E5_ops[] = {
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E5 = {9,
+ {
+ 4,
+ 5,
+ 3,
+ 6,
+ 7,
+ 1,
+ 0,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_E5_ops};
+
+static ec_gf_op_t ec_gf8_mul_E6_ops[] = {
+ {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E6 = {8,
+ {
+ 5,
+ 4,
+ 3,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_E6_ops};
+
+static ec_gf_op_t ec_gf8_mul_E7_ops[] = {
+ {EC_GF_OP_COPY, 8, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR3, 9, 0, 6}, {EC_GF_OP_XOR2, 4, 9, 0},
+ {EC_GF_OP_XOR2, 5, 9, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E7 = {10,
+ {
+ 1,
+ 4,
+ 3,
+ 6,
+ 7,
+ 5,
+ 2,
+ 0,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_E7_ops};
+
+static ec_gf_op_t ec_gf8_mul_E8_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E8 = {8,
+ {
+ 1,
+ 4,
+ 2,
+ 7,
+ 3,
+ 0,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_E8_ops};
+
+static ec_gf_op_t ec_gf8_mul_E9_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_COPY, 8, 1, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+ {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR3, 1, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E9 = {9,
+ {
+ 6,
+ 2,
+ 0,
+ 3,
+ 4,
+ 1,
+ 5,
+ 7,
+ 8,
+ },
+ ec_gf8_mul_E9_ops};
+
+static ec_gf_op_t ec_gf8_mul_EA_ops[] = {
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_EA = {8,
+ {
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_EA_ops};
+
+static ec_gf_op_t ec_gf8_mul_EB_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_EB = {8,
+ {
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ },
+ ec_gf8_mul_EB_ops};
+
+static ec_gf_op_t ec_gf8_mul_EC_ops[] = {
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR3, 8, 4, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+ {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_EC = {9,
+ {
+ 7,
+ 4,
+ 3,
+ 0,
+ 2,
+ 5,
+ 1,
+ 6,
+ 8,
+ },
+ ec_gf8_mul_EC_ops};
+
+static ec_gf_op_t ec_gf8_mul_ED_ops[] = {
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_ED = {8,
+ {
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 4,
+ 3,
+ 2,
+ },
+ ec_gf8_mul_ED_ops};
+
+static ec_gf_op_t ec_gf8_mul_EE_ops[] = {
+ {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR3, 8, 2, 3},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 8, 5, 0},
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+ {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_EE = {9,
+ {
+ 6,
+ 4,
+ 5,
+ 7,
+ 2,
+ 3,
+ 0,
+ 1,
+ 8,
+ },
+ ec_gf8_mul_EE_ops};
+
+static ec_gf_op_t ec_gf8_mul_EF_ops[] = {
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_COPY, 8, 0, 0},
+ {EC_GF_OP_XOR2, 8, 2, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 6, 8, 0},
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_EF = {9,
+ {
+ 6,
+ 4,
+ 5,
+ 7,
+ 2,
+ 0,
+ 3,
+ 1,
+ 8,
+ },
+ ec_gf8_mul_EF_ops};
+
+static ec_gf_op_t ec_gf8_mul_F0_ops[] = {
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR3, 8, 3, 6},
+ {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 8, 4, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+ {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F0 = {9,
+ {
+ 3,
+ 4,
+ 6,
+ 1,
+ 2,
+ 0,
+ 5,
+ 7,
+ 8,
+ },
+ ec_gf8_mul_F0_ops};
+
+static ec_gf_op_t ec_gf8_mul_F1_ops[] = {
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_COPY, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_COPY, 9, 2, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 9, 0, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+ {EC_GF_OP_XOR2, 7, 9, 0}, {EC_GF_OP_XOR2, 4, 9, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR3, 9, 8, 7},
+ {EC_GF_OP_XOR2, 1, 9, 0}, {EC_GF_OP_XOR2, 5, 9, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F1 = {10,
+ {
+ 7,
+ 2,
+ 6,
+ 3,
+ 5,
+ 1,
+ 4,
+ 0,
+ 8,
+ 9,
+ },
+ ec_gf8_mul_F1_ops};
+
+static ec_gf_op_t ec_gf8_mul_F2_ops[] = {
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+ {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR3, 8, 6, 4},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F2 = {9,
+ {
+ 1,
+ 0,
+ 6,
+ 7,
+ 4,
+ 5,
+ 2,
+ 3,
+ 8,
+ },
+ ec_gf8_mul_F2_ops};
+
+static ec_gf_op_t ec_gf8_mul_F3_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F3 = {8,
+ {
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ },
+ ec_gf8_mul_F3_ops};
+
+static ec_gf_op_t ec_gf8_mul_F4_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F4 = {8,
+ {
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ },
+ ec_gf8_mul_F4_ops};
+
+static ec_gf_op_t ec_gf8_mul_F5_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F5 = {8,
+ {
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ },
+ ec_gf8_mul_F5_ops};
+
+static ec_gf_op_t ec_gf8_mul_F6_ops[] = {
+ {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_COPY, 8, 3, 0},
+ {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_COPY, 9, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 9, 4, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+ {EC_GF_OP_XOR2, 6, 9, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR3, 7, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F6 = {10,
+ {
+ 0,
+ 6,
+ 2,
+ 7,
+ 4,
+ 3,
+ 5,
+ 9,
+ 1,
+ 8,
+ },
+ ec_gf8_mul_F6_ops};
+
+static ec_gf_op_t ec_gf8_mul_F7_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+ {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F7 = {8,
+ {
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ },
+ ec_gf8_mul_F7_ops};
+
+static ec_gf_op_t ec_gf8_mul_F8_ops[] = {
+ {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+ {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+ {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F8 = {8,
+ {
+ 6,
+ 2,
+ 0,
+ 1,
+ 4,
+ 5,
+ 3,
+ 7,
+ },
+ ec_gf8_mul_F8_ops};
+
+static ec_gf_op_t ec_gf8_mul_F9_ops[] = {
+ {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR3, 8, 7, 1}, {EC_GF_OP_XOR2, 1, 3, 0},
+ {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F9 = {9,
+ {
+ 4,
+ 1,
+ 7,
+ 6,
+ 0,
+ 3,
+ 5,
+ 2,
+ 8,
+ },
+ ec_gf8_mul_F9_ops};
+
+static ec_gf_op_t ec_gf8_mul_FA_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+ {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+ {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+ {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FA = {8,
+ {
+ 0,
+ 1,
+ 2,
+ 4,
+ 5,
+ 6,
+ 7,
+ 3,
+ },
+ ec_gf8_mul_FA_ops};
+
+static ec_gf_op_t ec_gf8_mul_FB_ops[] = {
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+ {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+ {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+ {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+ {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FB = {8,
+ {
+ 4,
+ 5,
+ 6,
+ 7,
+ 0,
+ 1,
+ 2,
+ 3,
+ },
+ ec_gf8_mul_FB_ops};
+
+static ec_gf_op_t ec_gf8_mul_FC_ops[] = {
+ {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+ {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_COPY, 9, 3, 0},
+ {EC_GF_OP_XOR3, 8, 5, 7}, {EC_GF_OP_XOR2, 3, 6, 0},
+ {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+ {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+ {EC_GF_OP_XOR3, 0, 9, 2}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FC = {10,
+ {
+ 5,
+ 6,
+ 3,
+ 7,
+ 1,
+ 8,
+ 0,
+ 4,
+ 2,
+ 9,
+ },
+ ec_gf8_mul_FC_ops};
+
+static ec_gf_op_t ec_gf8_mul_FD_ops[] = {
+ {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_COPY, 8, 7, 0},
+ {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+ {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+ {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+ {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+ {EC_GF_OP_XOR3, 1, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FD = {9,
+ {
+ 5,
+ 3,
+ 7,
+ 6,
+ 1,
+ 2,
+ 4,
+ 0,
+ 8,
+ },
+ ec_gf8_mul_FD_ops};
+
+static ec_gf_op_t ec_gf8_mul_FE_ops[] = {
+ {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_COPY, 8, 2, 0},
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+ {EC_GF_OP_XOR2, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+ {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FE = {9,
+ {
+ 3,
+ 4,
+ 8,
+ 2,
+ 5,
+ 0,
+ 6,
+ 1,
+ 7,
+ },
+ ec_gf8_mul_FE_ops};
+
+static ec_gf_op_t ec_gf8_mul_FF_ops[] = {
+ {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_COPY, 9, 0, 0},
+ {EC_GF_OP_COPY, 8, 4, 0}, {EC_GF_OP_XOR2, 9, 1, 0},
+ {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 9, 4, 0},
+ {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+ {EC_GF_OP_XOR2, 3, 9, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+ {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+ {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+ {EC_GF_OP_XOR3, 3, 8, 5}, {EC_GF_OP_XOR2, 4, 6, 0},
+ {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FF = {10,
+ {
+ 6,
+ 5,
+ 0,
+ 1,
+ 2,
+ 4,
+ 9,
+ 3,
+ 7,
+ 8,
+ },
+ ec_gf8_mul_FF_ops};
+
+ec_gf_mul_t *ec_gf8_mul[] = {
+ &ec_gf8_mul_00, &ec_gf8_mul_01, &ec_gf8_mul_02, &ec_gf8_mul_03,
+ &ec_gf8_mul_04, &ec_gf8_mul_05, &ec_gf8_mul_06, &ec_gf8_mul_07,
+ &ec_gf8_mul_08, &ec_gf8_mul_09, &ec_gf8_mul_0A, &ec_gf8_mul_0B,
+ &ec_gf8_mul_0C, &ec_gf8_mul_0D, &ec_gf8_mul_0E, &ec_gf8_mul_0F,
+ &ec_gf8_mul_10, &ec_gf8_mul_11, &ec_gf8_mul_12, &ec_gf8_mul_13,
+ &ec_gf8_mul_14, &ec_gf8_mul_15, &ec_gf8_mul_16, &ec_gf8_mul_17,
+ &ec_gf8_mul_18, &ec_gf8_mul_19, &ec_gf8_mul_1A, &ec_gf8_mul_1B,
+ &ec_gf8_mul_1C, &ec_gf8_mul_1D, &ec_gf8_mul_1E, &ec_gf8_mul_1F,
+ &ec_gf8_mul_20, &ec_gf8_mul_21, &ec_gf8_mul_22, &ec_gf8_mul_23,
+ &ec_gf8_mul_24, &ec_gf8_mul_25, &ec_gf8_mul_26, &ec_gf8_mul_27,
+ &ec_gf8_mul_28, &ec_gf8_mul_29, &ec_gf8_mul_2A, &ec_gf8_mul_2B,
+ &ec_gf8_mul_2C, &ec_gf8_mul_2D, &ec_gf8_mul_2E, &ec_gf8_mul_2F,
+ &ec_gf8_mul_30, &ec_gf8_mul_31, &ec_gf8_mul_32, &ec_gf8_mul_33,
+ &ec_gf8_mul_34, &ec_gf8_mul_35, &ec_gf8_mul_36, &ec_gf8_mul_37,
+ &ec_gf8_mul_38, &ec_gf8_mul_39, &ec_gf8_mul_3A, &ec_gf8_mul_3B,
+ &ec_gf8_mul_3C, &ec_gf8_mul_3D, &ec_gf8_mul_3E, &ec_gf8_mul_3F,
+ &ec_gf8_mul_40, &ec_gf8_mul_41, &ec_gf8_mul_42, &ec_gf8_mul_43,
+ &ec_gf8_mul_44, &ec_gf8_mul_45, &ec_gf8_mul_46, &ec_gf8_mul_47,
+ &ec_gf8_mul_48, &ec_gf8_mul_49, &ec_gf8_mul_4A, &ec_gf8_mul_4B,
+ &ec_gf8_mul_4C, &ec_gf8_mul_4D, &ec_gf8_mul_4E, &ec_gf8_mul_4F,
+ &ec_gf8_mul_50, &ec_gf8_mul_51, &ec_gf8_mul_52, &ec_gf8_mul_53,
+ &ec_gf8_mul_54, &ec_gf8_mul_55, &ec_gf8_mul_56, &ec_gf8_mul_57,
+ &ec_gf8_mul_58, &ec_gf8_mul_59, &ec_gf8_mul_5A, &ec_gf8_mul_5B,
+ &ec_gf8_mul_5C, &ec_gf8_mul_5D, &ec_gf8_mul_5E, &ec_gf8_mul_5F,
+ &ec_gf8_mul_60, &ec_gf8_mul_61, &ec_gf8_mul_62, &ec_gf8_mul_63,
+ &ec_gf8_mul_64, &ec_gf8_mul_65, &ec_gf8_mul_66, &ec_gf8_mul_67,
+ &ec_gf8_mul_68, &ec_gf8_mul_69, &ec_gf8_mul_6A, &ec_gf8_mul_6B,
+ &ec_gf8_mul_6C, &ec_gf8_mul_6D, &ec_gf8_mul_6E, &ec_gf8_mul_6F,
+ &ec_gf8_mul_70, &ec_gf8_mul_71, &ec_gf8_mul_72, &ec_gf8_mul_73,
+ &ec_gf8_mul_74, &ec_gf8_mul_75, &ec_gf8_mul_76, &ec_gf8_mul_77,
+ &ec_gf8_mul_78, &ec_gf8_mul_79, &ec_gf8_mul_7A, &ec_gf8_mul_7B,
+ &ec_gf8_mul_7C, &ec_gf8_mul_7D, &ec_gf8_mul_7E, &ec_gf8_mul_7F,
+ &ec_gf8_mul_80, &ec_gf8_mul_81, &ec_gf8_mul_82, &ec_gf8_mul_83,
+ &ec_gf8_mul_84, &ec_gf8_mul_85, &ec_gf8_mul_86, &ec_gf8_mul_87,
+ &ec_gf8_mul_88, &ec_gf8_mul_89, &ec_gf8_mul_8A, &ec_gf8_mul_8B,
+ &ec_gf8_mul_8C, &ec_gf8_mul_8D, &ec_gf8_mul_8E, &ec_gf8_mul_8F,
+ &ec_gf8_mul_90, &ec_gf8_mul_91, &ec_gf8_mul_92, &ec_gf8_mul_93,
+ &ec_gf8_mul_94, &ec_gf8_mul_95, &ec_gf8_mul_96, &ec_gf8_mul_97,
+ &ec_gf8_mul_98, &ec_gf8_mul_99, &ec_gf8_mul_9A, &ec_gf8_mul_9B,
+ &ec_gf8_mul_9C, &ec_gf8_mul_9D, &ec_gf8_mul_9E, &ec_gf8_mul_9F,
+ &ec_gf8_mul_A0, &ec_gf8_mul_A1, &ec_gf8_mul_A2, &ec_gf8_mul_A3,
+ &ec_gf8_mul_A4, &ec_gf8_mul_A5, &ec_gf8_mul_A6, &ec_gf8_mul_A7,
+ &ec_gf8_mul_A8, &ec_gf8_mul_A9, &ec_gf8_mul_AA, &ec_gf8_mul_AB,
+ &ec_gf8_mul_AC, &ec_gf8_mul_AD, &ec_gf8_mul_AE, &ec_gf8_mul_AF,
+ &ec_gf8_mul_B0, &ec_gf8_mul_B1, &ec_gf8_mul_B2, &ec_gf8_mul_B3,
+ &ec_gf8_mul_B4, &ec_gf8_mul_B5, &ec_gf8_mul_B6, &ec_gf8_mul_B7,
+ &ec_gf8_mul_B8, &ec_gf8_mul_B9, &ec_gf8_mul_BA, &ec_gf8_mul_BB,
+ &ec_gf8_mul_BC, &ec_gf8_mul_BD, &ec_gf8_mul_BE, &ec_gf8_mul_BF,
+ &ec_gf8_mul_C0, &ec_gf8_mul_C1, &ec_gf8_mul_C2, &ec_gf8_mul_C3,
+ &ec_gf8_mul_C4, &ec_gf8_mul_C5, &ec_gf8_mul_C6, &ec_gf8_mul_C7,
+ &ec_gf8_mul_C8, &ec_gf8_mul_C9, &ec_gf8_mul_CA, &ec_gf8_mul_CB,
+ &ec_gf8_mul_CC, &ec_gf8_mul_CD, &ec_gf8_mul_CE, &ec_gf8_mul_CF,
+ &ec_gf8_mul_D0, &ec_gf8_mul_D1, &ec_gf8_mul_D2, &ec_gf8_mul_D3,
+ &ec_gf8_mul_D4, &ec_gf8_mul_D5, &ec_gf8_mul_D6, &ec_gf8_mul_D7,
+ &ec_gf8_mul_D8, &ec_gf8_mul_D9, &ec_gf8_mul_DA, &ec_gf8_mul_DB,
+ &ec_gf8_mul_DC, &ec_gf8_mul_DD, &ec_gf8_mul_DE, &ec_gf8_mul_DF,
+ &ec_gf8_mul_E0, &ec_gf8_mul_E1, &ec_gf8_mul_E2, &ec_gf8_mul_E3,
+ &ec_gf8_mul_E4, &ec_gf8_mul_E5, &ec_gf8_mul_E6, &ec_gf8_mul_E7,
+ &ec_gf8_mul_E8, &ec_gf8_mul_E9, &ec_gf8_mul_EA, &ec_gf8_mul_EB,
+ &ec_gf8_mul_EC, &ec_gf8_mul_ED, &ec_gf8_mul_EE, &ec_gf8_mul_EF,
+ &ec_gf8_mul_F0, &ec_gf8_mul_F1, &ec_gf8_mul_F2, &ec_gf8_mul_F3,
+ &ec_gf8_mul_F4, &ec_gf8_mul_F5, &ec_gf8_mul_F6, &ec_gf8_mul_F7,
+ &ec_gf8_mul_F8, &ec_gf8_mul_F9, &ec_gf8_mul_FA, &ec_gf8_mul_FB,
+ &ec_gf8_mul_FC, &ec_gf8_mul_FD, &ec_gf8_mul_FE, &ec_gf8_mul_FF};
diff --git a/xlators/cluster/ec/src/ec-gf8.h b/xlators/cluster/ec/src/ec-gf8.h
new file mode 100644
index 00000000000..4aca91127fc
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-gf8.h
@@ -0,0 +1,18 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_GF8_H__
+#define __EC_GF8_H__
+
+#include "ec-galois.h"
+
+extern ec_gf_mul_t *ec_gf8_mul[];
+
+#endif /* __EC_GF8_H__ */
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
new file mode 100644
index 00000000000..7d991f04aac
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -0,0 +1,3367 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
+#include <glusterfs/cluster-syncop.h>
+
+#include "ec.h"
+#include "ec-types.h"
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-method.h"
+#include "ec-fops.h"
+#include "ec-heald.h"
+
+#define EC_COUNT(array, max) \
+ ({ \
+ int __i; \
+ int __res = 0; \
+ for (__i = 0; __i < max; __i++) \
+ if (array[__i]) \
+ __res++; \
+ __res; \
+ })
+#define EC_INTERSECT(dst, src1, src2, max) \
+ ({ \
+ int __i; \
+ for (__i = 0; __i < max; __i++) \
+ dst[__i] = src1[__i] && src2[__i]; \
+ })
+#define EC_ADJUST_SOURCE(source, sources, max) \
+ ({ \
+ int __i; \
+ if (sources[source] == 0) { \
+ source = -1; \
+ for (__i = 0; __i < max; __i++) \
+ if (sources[__i]) \
+ source = __i; \
+ } \
+ })
+#define IA_EQUAL(f, s, field) \
+ (memcmp(&(f.ia_##field), &(s.ia_##field), sizeof(s.ia_##field)) == 0)
+#define EC_REPLIES_ALLOC(replies, numsubvols) \
+ do { \
+ int __i = 0; \
+ replies = alloca0(numsubvols * sizeof(*replies)); \
+ for (__i = 0; __i < numsubvols; __i++) \
+ INIT_LIST_HEAD(&replies[__i].entries.list); \
+ } while (0)
+
+struct ec_name_data {
+ call_frame_t *frame;
+ unsigned char *participants;
+ unsigned char *failed_on;
+ unsigned char *gfidless;
+ unsigned char *enoent;
+ unsigned char *same;
+ char *name;
+ inode_t *parent;
+ default_args_cbk_t *replies;
+ uint32_t heal_pending;
+};
+
+static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
+
+static gf_boolean_t
+ec_ignorable_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
+{
+ int i = 0;
+
+ if (!key)
+ goto out;
+
+ if (strncmp(key, EC_XATTR_PREFIX, SLEN(EC_XATTR_PREFIX)) == 0)
+ return _gf_true;
+
+ for (i = 0; ec_ignore_xattrs[i]; i++) {
+ if (!strcmp(key, ec_ignore_xattrs[i]))
+ return _gf_true;
+ }
+
+out:
+ return _gf_false;
+}
+
+static gf_boolean_t
+ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
+{
+ return !ec_ignorable_key_match(dict, key, val, mdata);
+}
+/* FOP: heal */
+
+void
+ec_set_entry_healing(ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ loc_t *loc = NULL;
+
+ if (!fop)
+ return;
+
+ loc = &fop->loc[0];
+ LOCK(&loc->inode->lock);
+ {
+ ctx = __ec_inode_get(loc->inode, fop->xl);
+ if (ctx) {
+ ctx->heal_count += 1;
+ }
+ }
+ UNLOCK(&loc->inode->lock);
+}
+
+void
+ec_reset_entry_healing(ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ loc_t *loc = NULL;
+ int32_t heal_count = 0;
+ if (!fop)
+ return;
+
+ loc = &fop->loc[0];
+ LOCK(&loc->inode->lock);
+ {
+ ctx = __ec_inode_get(loc->inode, fop->xl);
+ if (ctx) {
+ ctx->heal_count += -1;
+ heal_count = ctx->heal_count;
+ }
+ }
+ UNLOCK(&loc->inode->lock);
+ GF_ASSERT(heal_count >= 0);
+}
+
+uintptr_t
+ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
+{
+ ec_cbk_data_t *cbk;
+ uintptr_t mask[2] = {0, 0};
+
+ list_for_each_entry(cbk, &fop->cbk_list, list)
+ {
+ mask[cbk->op_ret >= 0] |= cbk->mask;
+ }
+
+ if (pgood != NULL) {
+ *pgood = mask[1];
+ }
+
+ return mask[0];
+}
+
+void
+ec_heal_update(ec_fop_data_t *fop, int32_t is_open)
+{
+ ec_heal_t *heal = fop->data;
+ uintptr_t good, bad;
+
+ bad = ec_heal_check(fop, &good);
+
+ LOCK(&heal->lock);
+
+ heal->bad &= ~bad;
+ if (is_open) {
+ heal->open |= good;
+ }
+
+ UNLOCK(&heal->lock);
+
+ fop->error = 0;
+}
+
+void
+ec_heal_avoid(ec_fop_data_t *fop)
+{
+ ec_heal_t *heal = fop->data;
+ uintptr_t bad;
+
+ bad = ec_heal_check(fop, NULL);
+
+ LOCK(&heal->lock);
+
+ heal->good &= ~bad;
+
+ UNLOCK(&heal->lock);
+}
+
+int32_t
+ec_heal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_heal_t *heal = fop->data;
+
+ if (op_ret >= 0) {
+ GF_ASSERT(
+ ec_set_inode_size(heal->fop, heal->fd->inode, heal->total_size));
+ }
+
+ return 0;
+}
+
+void
+ec_heal_lock(ec_heal_t *heal, int32_t type, fd_t *fd, loc_t *loc, off_t offset,
+ size_t size)
+{
+ struct gf_flock flock;
+ fop_inodelk_cbk_t cbk = NULL;
+
+ flock.l_type = type;
+ flock.l_whence = SEEK_SET;
+ flock.l_start = offset;
+ flock.l_len = size;
+ flock.l_pid = 0;
+ flock.l_owner.len = 0;
+
+ if (type == F_UNLCK) {
+ /* Remove inode size information before unlocking it. */
+ if (fd == NULL) {
+ ec_clear_inode_info(heal->fop, heal->loc.inode);
+ } else {
+ ec_clear_inode_info(heal->fop, heal->fd->inode);
+ }
+ cbk = ec_lock_unlocked;
+ } else {
+ /* Otherwise use the callback to update size information. */
+ cbk = ec_heal_lock_cbk;
+ }
+
+ if (fd != NULL) {
+ ec_finodelk(heal->fop->frame, heal->xl,
+ &heal->fop->frame->root->lk_owner, heal->fop->mask,
+ EC_MINIMUM_ALL, cbk, heal, heal->xl->name, fd, F_SETLKW,
+ &flock, NULL);
+ } else {
+ ec_inodelk(heal->fop->frame, heal->xl,
+ &heal->fop->frame->root->lk_owner, heal->fop->mask,
+ EC_MINIMUM_ALL, cbk, heal, heal->xl->name, loc, F_SETLKW,
+ &flock, NULL);
+ }
+}
+
+void
+ec_heal_inodelk(ec_heal_t *heal, int32_t type, int32_t use_fd, off_t offset,
+ size_t size)
+{
+ ec_heal_lock(heal, type, use_fd ? heal->fd : NULL, &heal->loc, offset,
+ size);
+}
+
+int32_t
+ec_heal_xattr_clean(dict_t *dict, char *key, data_t *data, void *arg)
+{
+ dict_t *base = arg;
+
+ if (ec_ignorable_key_match(NULL, key, NULL, NULL)) {
+ dict_del(dict, key);
+ return 0;
+ }
+
+ if (dict_get(base, key) != NULL)
+ dict_del(dict, key);
+
+ return 0;
+}
+
+/********************************************************************
+ * ec_wind_xattrop_parallel:
+ * Helper function to update the extended attributes
+ * in parallel.
+ *
+ *******************************************************************/
+void
+ec_wind_xattrop_parallel(call_frame_t *frame, xlator_t *subvol, int child_index,
+ loc_t *loc, gf_xattrop_flags_t flags, dict_t **dict,
+ dict_t *xdata)
+{
+ gf_msg_debug("EC", 0, "WIND: on child %d ", child_index);
+ STACK_WIND_COOKIE(
+ frame, cluster_xattrop_cbk, (void *)(uintptr_t)child_index, subvol,
+ subvol->fops->xattrop, loc, flags, dict[child_index], xdata);
+}
+
+int32_t
+ec_heal_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_heal_t *heal = fop->data;
+
+ ec_trace("WRITE_CBK", cookie, "ret=%d, errno=%d", op_ret, op_errno);
+
+ gf_msg_debug(fop->xl->name, 0,
+ "%s: write op_ret %d, op_errno %s"
+ " at %" PRIu64,
+ uuid_utoa(heal->fd->inode->gfid), op_ret, strerror(op_errno),
+ heal->offset);
+
+ ec_heal_update(cookie, 0);
+
+ return 0;
+}
+
+int32_t
+ec_heal_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iovec *vector,
+ int32_t count, struct iatt *stbuf, struct iobref *iobref,
+ dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_heal_t *heal = fop->data;
+
+ ec_trace("READ_CBK", fop, "ret=%d, errno=%d", op_ret, op_errno);
+
+ ec_heal_avoid(fop);
+
+ if (op_ret > 0) {
+ gf_msg_debug(fop->xl->name, 0,
+ "%s: read succeeded, proceeding "
+ "to write at %" PRIu64,
+ uuid_utoa(heal->fd->inode->gfid), heal->offset);
+ ec_writev(heal->fop->frame, heal->xl, heal->bad, EC_MINIMUM_ONE,
+ ec_heal_writev_cbk, heal, heal->fd, vector, count,
+ heal->offset, 0, iobref, NULL);
+ } else {
+ if (op_ret < 0) {
+ gf_msg_debug(fop->xl->name, 0,
+ "%s: read failed %s, failing "
+ "to heal block at %" PRIu64,
+ uuid_utoa(heal->fd->inode->gfid), strerror(op_errno),
+ heal->offset);
+ heal->bad = 0;
+ }
+ heal->done = 1;
+ }
+
+ return 0;
+}
+
+void
+ec_heal_data_block(ec_heal_t *heal)
+{
+ ec_trace("DATA", heal->fop, "good=%lX, bad=%lX", heal->good, heal->bad);
+
+ if ((heal->good != 0) && (heal->bad != 0) &&
+ (heal->iatt.ia_type == IA_IFREG)) {
+ ec_readv(heal->fop->frame, heal->xl, heal->good, EC_MINIMUM_MIN,
+ ec_heal_readv_cbk, heal, heal->fd, heal->size, heal->offset, 0,
+ NULL);
+ }
+}
+
+/* FOP: fheal */
+
+void
+ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd,
+ int32_t partial, dict_t *xdata)
+{
+ ec_fd_t *ctx = ec_fd_get(fd, this);
+
+ if (ctx != NULL) {
+ gf_msg_trace("ec", 0, "FHEAL ctx: flags=%X, open=%" PRIXPTR, ctx->flags,
+ ctx->open);
+ ec_heal(frame, this, target, fop_flags, func, data, &ctx->loc, partial,
+ xdata);
+ }
+}
+
+/* Common heal code */
+void
+ec_mask_to_char_array(uintptr_t mask, unsigned char *array, int numsubvols)
+{
+ int i = 0;
+
+ for (i = 0; i < numsubvols; i++)
+ array[i] = ((mask >> i) & 1);
+}
+
+uintptr_t
+ec_char_array_to_mask(unsigned char *array, int numsubvols)
+{
+ int i = 0;
+ uintptr_t mask = 0;
+
+ if (array == NULL)
+ goto out;
+
+ for (i = 0; i < numsubvols; i++)
+ if (array[i])
+ mask |= (1ULL << i);
+out:
+ return mask;
+}
+
+int
+ec_heal_entry_find_direction(ec_t *ec, default_args_cbk_t *replies,
+ uint64_t *versions, uint64_t *dirty,
+ unsigned char *sources,
+ unsigned char *healed_sinks)
+{
+ uint64_t xattr[EC_VERSION_SIZE] = {0};
+ int source = -1;
+ uint64_t max_version = 0;
+ int ret = 0;
+ int i = 0;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret == -1)
+ continue;
+
+ if (source == -1)
+ source = i;
+
+ ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_VERSION, xattr,
+ EC_VERSION_SIZE);
+ if (ret == 0) {
+ versions[i] = xattr[EC_DATA_TXN];
+ if (max_version < versions[i]) {
+ max_version = versions[i];
+ source = i;
+ }
+ }
+
+ memset(xattr, 0, sizeof(xattr));
+ ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_DIRTY, xattr,
+ EC_VERSION_SIZE);
+ if (ret == 0) {
+ dirty[i] = xattr[EC_DATA_TXN];
+ }
+ }
+
+ if (source < 0)
+ goto out;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret == -1)
+ continue;
+
+ if (versions[i] == versions[source])
+ sources[i] = 1;
+ else
+ healed_sinks[i] = 1;
+ }
+
+out:
+ return source;
+}
+
+int
+ec_adjust_versions(call_frame_t *frame, ec_t *ec, ec_txn_t type, inode_t *inode,
+ int source, unsigned char *sources,
+ unsigned char *healed_sinks, uint64_t *versions,
+ uint64_t *dirty)
+{
+ int i = 0;
+ int ret = 0;
+ int call_count = 0;
+ dict_t **xattr = NULL;
+ int op_ret = 0;
+ loc_t loc = {0};
+ gf_boolean_t erase_dirty = _gf_false;
+ uint64_t *versions_xattr = NULL;
+ uint64_t *dirty_xattr = NULL;
+ uint64_t allzero[2] = {0};
+ unsigned char *on = NULL;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+
+ /* Allocate the required memory */
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer);
+ if (!xattr) {
+ op_ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < ec->nodes; i++) {
+ xattr[i] = dict_new();
+ if (!xattr[i]) {
+ op_ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* dirty xattr represents if the file/dir needs heal. Unless all the
+ * copies are healed, don't erase it */
+ if (EC_COUNT(sources, ec->nodes) + EC_COUNT(healed_sinks, ec->nodes) ==
+ ec->nodes)
+ erase_dirty = _gf_true;
+ else
+ op_ret = -ENOTCONN;
+
+ /* Populate the xattr array */
+ for (i = 0; i < ec->nodes; i++) {
+ if (!sources[i] && !healed_sinks[i])
+ continue;
+ versions_xattr = GF_CALLOC(EC_VERSION_SIZE, sizeof(*versions_xattr),
+ gf_common_mt_pointer);
+ if (!versions_xattr) {
+ op_ret = -ENOMEM;
+ continue;
+ }
+
+ versions_xattr[type] = hton64(versions[source] - versions[i]);
+ ret = dict_set_bin(xattr[i], EC_XATTR_VERSION, versions_xattr,
+ (sizeof(*versions_xattr) * EC_VERSION_SIZE));
+ if (ret < 0) {
+ op_ret = -ENOMEM;
+ continue;
+ }
+
+ if (erase_dirty) {
+ dirty_xattr = GF_CALLOC(EC_VERSION_SIZE, sizeof(*dirty_xattr),
+ gf_common_mt_pointer);
+ if (!dirty_xattr) {
+ op_ret = -ENOMEM;
+ continue;
+ }
+
+ dirty_xattr[type] = hton64(-dirty[i]);
+ ret = dict_set_bin(xattr[i], EC_XATTR_DIRTY, dirty_xattr,
+ (sizeof(*dirty_xattr) * EC_VERSION_SIZE));
+ if (ret < 0) {
+ op_ret = -ENOMEM;
+ continue;
+ }
+ }
+
+ if (memcmp(versions_xattr, allzero,
+ (sizeof(*versions_xattr) * EC_VERSION_SIZE)) == 0) {
+ if (!erase_dirty) {
+ continue;
+ }
+
+ if (memcmp(dirty_xattr, allzero,
+ (sizeof(*dirty_xattr) * EC_VERSION_SIZE)) == 0) {
+ continue;
+ }
+ }
+
+ on[i] = 1;
+ call_count++;
+ }
+
+ /* Update the bricks with xattr */
+ if (call_count) {
+ PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame,
+ ec_wind_xattrop_parallel, &loc,
+ GF_XATTROP_ADD_ARRAY64, xattr, NULL);
+ ret = cluster_fop_success_fill(replies, ec->nodes, output);
+ }
+
+ if (ret < call_count) {
+ op_ret = -ENOTCONN;
+ goto out;
+ }
+
+out:
+ /* Cleanup */
+ if (xattr) {
+ for (i = 0; i < ec->nodes; i++) {
+ if (xattr[i])
+ dict_unref(xattr[i]);
+ }
+ GF_FREE(xattr);
+ }
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ return op_ret;
+}
+
+int
+ec_heal_metadata_find_direction(ec_t *ec, default_args_cbk_t *replies,
+ uint64_t *versions, uint64_t *dirty,
+ unsigned char *sources,
+ unsigned char *healed_sinks)
+{
+ uint64_t xattr[EC_VERSION_SIZE] = {0};
+ uint64_t max_version = 0;
+ int same_count = 0;
+ int max_same_count = 0;
+ int same_source = -1;
+ int ret = 0;
+ int i = 0;
+ int j = 0;
+ int *groups = NULL;
+ struct iatt source_ia = {0};
+ struct iatt child_ia = {0};
+
+ groups = alloca0(ec->nodes * sizeof(*groups));
+ for (i = 0; i < ec->nodes; i++)
+ groups[i] = -1;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret < 0)
+ continue;
+ ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_VERSION, xattr,
+ EC_VERSION_SIZE);
+ if (ret == 0) {
+ versions[i] = xattr[EC_METADATA_TXN];
+ }
+
+ memset(xattr, 0, sizeof(xattr));
+ ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_DIRTY, xattr,
+ EC_VERSION_SIZE);
+ if (ret == 0) {
+ dirty[i] = xattr[EC_METADATA_TXN];
+ }
+ if (groups[i] >= 0) /*Already part of group*/
+ continue;
+ groups[i] = i;
+ same_count = 1;
+ source_ia = replies[i].stat;
+ for (j = i + 1; j < ec->nodes; j++) {
+ if (!replies[j].valid || replies[j].op_ret < 0)
+ continue;
+ child_ia = replies[j].stat;
+ if (!IA_EQUAL(source_ia, child_ia, gfid) ||
+ !IA_EQUAL(source_ia, child_ia, type) ||
+ !IA_EQUAL(source_ia, child_ia, prot) ||
+ !IA_EQUAL(source_ia, child_ia, uid) ||
+ !IA_EQUAL(source_ia, child_ia, gid))
+ continue;
+ if (!are_dicts_equal(replies[i].xdata, replies[j].xdata,
+ ec_sh_key_match, NULL))
+ continue;
+ groups[j] = i;
+ same_count++;
+ }
+
+ if (max_same_count < same_count) {
+ max_same_count = same_count;
+ same_source = i;
+ }
+ }
+
+ if (max_same_count < ec->fragments) {
+ ret = -EIO;
+ goto out;
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (groups[i] == groups[same_source])
+ sources[i] = 1;
+ else if (replies[i].valid && replies[i].op_ret >= 0)
+ healed_sinks[i] = 1;
+ }
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i] && (versions[i] > max_version)) {
+ same_source = i;
+ max_version = versions[i];
+ }
+ }
+ ret = same_source;
+out:
+ return ret;
+}
+
+int
+__ec_heal_metadata_prepare(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *locked_on,
+ default_args_cbk_t *replies, uint64_t *versions,
+ uint64_t *dirty, unsigned char *sources,
+ unsigned char *healed_sinks)
+{
+ loc_t loc = {0};
+ unsigned char *output = NULL;
+ unsigned char *lookup_on = NULL;
+ int ret = 0;
+ int source = 0;
+ default_args_cbk_t *greplies = NULL;
+ int i = 0;
+ EC_REPLIES_ALLOC(greplies, ec->nodes);
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ output = alloca0(ec->nodes);
+ lookup_on = alloca0(ec->nodes);
+ ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, &loc, NULL);
+ if (ret <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ memcpy(lookup_on, output, ec->nodes);
+ /*Use getxattr to get the filtered xattrs which filter internal xattrs*/
+ ret = cluster_getxattr(ec->xl_list, lookup_on, ec->nodes, greplies, output,
+ frame, ec->xl, &loc, NULL, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (lookup_on[i] && !output[i]) {
+ replies[i].valid = 0;
+ continue;
+ }
+ if (replies[i].xdata) {
+ dict_unref(replies[i].xdata);
+ replies[i].xdata = NULL;
+ if (greplies[i].xattr)
+ replies[i].xdata = dict_ref(greplies[i].xattr);
+ }
+ }
+
+ source = ec_heal_metadata_find_direction(ec, replies, versions, dirty,
+ sources, healed_sinks);
+ if (source < 0) {
+ ret = -EIO;
+ goto out;
+ }
+ ret = source;
+out:
+ cluster_replies_wipe(greplies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
+}
+
+/* Metadata heal */
+int
+__ec_removexattr_sinks(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ int source, unsigned char *sources,
+ unsigned char *healed_sinks, default_args_cbk_t *replies)
+{
+ int i = 0;
+ int ret = 0;
+ loc_t loc = {0};
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (i == source)
+ continue;
+ if (!sources[i] && !healed_sinks[i])
+ continue;
+ ret = dict_foreach(replies[i].xdata, ec_heal_xattr_clean,
+ replies[source].xdata);
+ if (ret < 0) {
+ sources[i] = 0;
+ healed_sinks[i] = 0;
+ continue;
+ }
+
+ if (replies[i].xdata->count == 0) {
+ continue;
+ } else if (sources[i]) {
+ /* This can happen if setxattr/removexattr succeeds on
+ * the bricks but fails to update the version. This
+ * will make sure that the xattrs are made equal after
+ * heal*/
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+
+ ret = syncop_removexattr(ec->xl_list[i], &loc, "", replies[i].xdata,
+ NULL);
+ if (ret < 0)
+ healed_sinks[i] = 0;
+ }
+
+ loc_wipe(&loc);
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0)
+ return -ENOTCONN;
+ return 0;
+}
+
+int
+__ec_heal_metadata(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *healed_sinks)
+{
+ loc_t loc = {0};
+ int ret = 0;
+ int source = 0;
+ default_args_cbk_t *replies = NULL;
+ default_args_cbk_t *sreplies = NULL;
+ uint64_t *versions = NULL;
+ uint64_t *dirty = NULL;
+ unsigned char *output = NULL;
+ dict_t *source_dict = NULL;
+ struct iatt source_buf = {0};
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ EC_REPLIES_ALLOC(sreplies, ec->nodes);
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ output = alloca0(ec->nodes);
+ versions = alloca0(ec->nodes * sizeof(*versions));
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+ source = __ec_heal_metadata_prepare(frame, ec, inode, locked_on, replies,
+ versions, dirty, sources, healed_sinks);
+ if (source < 0) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if ((EC_COUNT(sources, ec->nodes) == ec->nodes) ||
+ (EC_COUNT(healed_sinks, ec->nodes) == 0)) {
+ ret = 0;
+ goto erase_dirty;
+ }
+
+ source_buf = replies[source].stat;
+ ret = cluster_setattr(ec->xl_list, healed_sinks, ec->nodes, sreplies,
+ output, frame, ec->xl, &loc, &source_buf,
+ GF_SET_ATTR_MODE | GF_SET_ATTR_UID | GF_SET_ATTR_GID,
+ NULL);
+ /*In case the operation fails on some of the subvols*/
+ memcpy(healed_sinks, output, ec->nodes);
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ ret = __ec_removexattr_sinks(frame, ec, inode, source, sources,
+ healed_sinks, replies);
+ if (ret < 0)
+ goto out;
+
+ source_dict = dict_ref(replies[source].xdata);
+ if (dict_foreach_match(source_dict, ec_ignorable_key_match, NULL,
+ dict_remove_foreach_fn, NULL) == -1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = cluster_setxattr(ec->xl_list, healed_sinks, ec->nodes, replies,
+ output, frame, ec->xl, &loc, source_dict, 0, NULL);
+
+ EC_INTERSECT(healed_sinks, healed_sinks, output, ec->nodes);
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+erase_dirty:
+ ret = ec_adjust_versions(frame, ec, EC_METADATA_TXN, inode, source, sources,
+ healed_sinks, versions, dirty);
+out:
+ if (source_dict)
+ dict_unref(source_dict);
+
+ loc_wipe(&loc);
+ cluster_replies_wipe(replies, ec->nodes);
+ cluster_replies_wipe(sreplies, ec->nodes);
+ return ret;
+}
+
+int
+ec_heal_metadata(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+ unsigned char *output = NULL;
+ int ret = 0;
+ default_args_cbk_t *replies = NULL;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ up_subvols = alloca0(ec->nodes);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+ ret = cluster_inodelk(ec->xl_list, up_subvols, ec->nodes, replies,
+ locked_on, frame, ec->xl, ec->xl->name, inode, 0, 0);
+ {
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_heal_metadata(frame, ec, inode, locked_on, sources,
+ healed_sinks);
+ }
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, inode, 0, 0);
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
+}
+
+/*entry heal*/
+int
+__ec_heal_entry_prepare(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *locked_on, uint64_t *versions,
+ uint64_t *dirty, unsigned char *sources,
+ unsigned char *healed_sinks)
+{
+ loc_t loc = {0};
+ int source = 0;
+ int ret = 0;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ dict_t *xdata = NULL;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ xdata = dict_new();
+ if (!xdata) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (dict_set_uint64(xdata, EC_XATTR_VERSION, 0) ||
+ dict_set_uint64(xdata, EC_XATTR_DIRTY, 0)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ output = alloca0(ec->nodes);
+ ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, &loc, xdata);
+ if (ret <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ source = ec_heal_entry_find_direction(ec, replies, versions, dirty, sources,
+ healed_sinks);
+ if (source < 0) {
+ ret = -EIO;
+ goto out;
+ }
+ ret = source;
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
+}
+int32_t
+ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
+ call_frame_t *frame, xlator_t *this, unsigned char *on)
+{
+ dict_t *xattr = NULL;
+ int32_t ret = -1;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ uint64_t dirty[EC_VERSION_SIZE] = {1, 1};
+ loc_t newloc = {0};
+
+ /*Symlinks don't have any data to be healed*/
+ if (ia->ia_type == IA_IFLNK)
+ dirty[EC_DATA_TXN] = 0;
+
+ newloc.inode = inode_ref(loc->inode);
+ gf_uuid_copy(newloc.gfid, ia->ia_gfid);
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ output = alloca0(ec->nodes);
+ xattr = dict_new();
+ if (!xattr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = ec_dict_set_array(xattr, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
+ if (ret)
+ goto out;
+
+ ret = cluster_xattrop(ec->xl_list, on, ec->nodes, replies, output, frame,
+ ec->xl, &newloc, GF_XATTROP_ADD_ARRAY64, xattr, NULL);
+
+ if (ret < ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&newloc);
+ return ret;
+}
+
+/*Name heal*/
+int
+ec_delete_stale_name(dict_t *gfid_db, char *key, data_t *d, void *data)
+{
+ struct ec_name_data *name_data = data;
+ struct iatt *ia = NULL;
+ ec_t *ec = NULL;
+ loc_t loc = {0};
+ unsigned char *same = data_to_bin(d);
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ int ret = 0;
+ int estale_count = 0;
+ int i = 0;
+ call_frame_t *frame = name_data->frame;
+ uuid_t gfid;
+
+ ec = name_data->frame->this->private;
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ if (EC_COUNT(same, ec->nodes) >= ec->fragments) {
+ ret = 0;
+ goto out;
+ }
+
+ loc.parent = inode_ref(name_data->parent);
+ loc.inode = inode_new(name_data->parent->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ gf_uuid_parse(key, gfid);
+ gf_uuid_copy(loc.pargfid, name_data->parent->gfid);
+ loc.name = name_data->name;
+ output = alloca0(ec->nodes);
+ ret = cluster_lookup(ec->xl_list, name_data->participants, ec->nodes,
+ replies, output, name_data->frame, ec->xl, &loc, NULL);
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == -1) {
+ if (replies[i].op_errno == ESTALE || replies[i].op_errno == ENOENT)
+ estale_count++;
+ else
+ name_data->participants[i] = 0;
+ } else if (gf_uuid_compare(gfid, replies[i].stat.ia_gfid)) {
+ estale_count++;
+ gf_msg_debug(ec->xl->name, 0, "%s/%s: different gfid as %s",
+ uuid_utoa(name_data->parent->gfid), name_data->name,
+ key);
+ }
+ }
+
+ if (estale_count <= ec->redundancy) {
+ /* We have at least ec->fragments number of fragments, so the
+ * file is recoverable, so don't delete it*/
+
+ /* Please note that the lookup call above could fail with
+ * ENOTCONN on all subvoumes and still this branch will be
+ * true, but in those cases conservatively we decide to not
+ * delete the file until we are sure*/
+ ret = 0;
+ goto out;
+ }
+
+ /*Noway to recover, delete the name*/
+ loc_wipe(&loc);
+ loc.parent = inode_ref(name_data->parent);
+ gf_uuid_copy(loc.pargfid, loc.parent->gfid);
+ loc.name = name_data->name;
+ for (i = 0; i < ec->nodes; i++) {
+ if (same[i] && replies[i].valid && (replies[i].op_ret == 0)) {
+ ia = &replies[i].stat;
+ break;
+ }
+ }
+
+ if (!ia) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ if (IA_ISDIR(ia->ia_type)) {
+ ret = cluster_rmdir(ec->xl_list, same, ec->nodes, replies, output,
+ frame, ec->xl, &loc, 1, NULL);
+ gf_msg_debug(ec->xl->name, 0,
+ "cluster rmdir succeeded on %d "
+ "nodes",
+ ret);
+ } else {
+ ret = cluster_unlink(ec->xl_list, same, ec->nodes, replies, output,
+ frame, ec->xl, &loc, 0, NULL);
+ gf_msg_debug(ec->xl->name, 0,
+ "cluster unlink succeeded on %d "
+ "nodes",
+ ret);
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (output[i]) {
+ same[i] = 0;
+ name_data->enoent[i] = 1;
+ } else {
+ /*op failed*/
+ if (same[i])
+ name_data->participants[i] = 0;
+ }
+ }
+ ret = 0;
+ /*This will help in making decisions about creating names*/
+ dict_del(gfid_db, key);
+out:
+ if (ret < 0) {
+ gf_msg_debug(ec->xl->name, 0, "%s/%s: heal failed %s",
+ uuid_utoa(name_data->parent->gfid), name_data->name,
+ strerror(-ret));
+ }
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
+}
+
+int
+ec_delete_stale_names(call_frame_t *frame, ec_t *ec, inode_t *parent,
+ char *name, default_args_cbk_t *replies, dict_t *gfid_db,
+ unsigned char *enoent, unsigned char *gfidless,
+ unsigned char *participants)
+{
+ struct ec_name_data name_data = {0};
+
+ name_data.enoent = enoent;
+ name_data.gfidless = gfidless;
+ name_data.participants = participants;
+ name_data.name = name;
+ name_data.parent = parent;
+ name_data.frame = frame;
+ name_data.replies = replies;
+ return dict_foreach(gfid_db, ec_delete_stale_name, &name_data);
+}
+
+int
+_assign_same(dict_t *dict, char *key, data_t *value, void *data)
+{
+ struct ec_name_data *name_data = data;
+
+ name_data->same = data_to_bin(value);
+ return 0;
+}
+
+int
+ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+ default_args_cbk_t *lookup_replies, dict_t *gfid_db,
+ unsigned char *enoent, unsigned char *participants)
+{
+ int ret = 0;
+ int i = 0;
+ struct ec_name_data name_data = {0};
+ struct iatt *ia = NULL;
+ unsigned char *output = 0;
+ unsigned char *output1 = 0;
+ unsigned char *on = NULL;
+ default_args_cbk_t *replies = NULL;
+ loc_t loc = {0};
+ loc_t srcloc = {0};
+ unsigned char *link = NULL;
+ unsigned char *create = NULL;
+ dict_t *xdata = NULL;
+ char *linkname = NULL;
+ ec_config_t config;
+
+ /* There should be just one gfid key */
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ if (gfid_db->count != 1) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = dict_foreach(gfid_db, _assign_same, &name_data);
+ if (ret < 0)
+ goto out;
+ /*There should at least be one valid success reply with gfid*/
+ for (i = 0; i < ec->nodes; i++)
+ if (name_data.same[i])
+ break;
+
+ if (i == ec->nodes) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ia = &lookup_replies[i].stat;
+ xdata = dict_new();
+ loc.parent = inode_ref(parent);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.inode = inode_new(parent->table);
+ if (loc.inode)
+ srcloc.inode = inode_ref(loc.inode);
+ gf_uuid_copy(srcloc.gfid, ia->ia_gfid);
+ if (!loc.inode || !xdata ||
+ dict_set_static_bin(xdata, "gfid-req", ia->ia_gfid,
+ sizeof(ia->ia_gfid))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ loc.name = name;
+ link = alloca0(ec->nodes);
+ create = alloca0(ec->nodes);
+ on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ output1 = alloca0(ec->nodes);
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!lookup_replies[i].valid)
+ continue;
+ if (lookup_replies[i].op_ret)
+ continue;
+ on[i] = 1;
+ }
+ switch (ia->ia_type) {
+ case IA_IFDIR:
+ ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on);
+ (void)cluster_mkdir(
+ ec->xl_list, enoent, ec->nodes, replies, output, frame, ec->xl,
+ &loc, st_mode_from_ia(ia->ia_prot, ia->ia_type), 0, xdata);
+ break;
+
+ case IA_IFLNK:
+ /*Check for hard links and create/link*/
+ ret = cluster_lookup(ec->xl_list, enoent, ec->nodes, replies,
+ output, frame, ec->xl, &srcloc, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (output[i]) {
+ link[i] = 1;
+ } else {
+ if (replies[i].op_errno == ENOENT ||
+ replies[i].op_errno == ESTALE) {
+ create[i] = 1;
+ }
+ }
+ }
+
+ if (EC_COUNT(link, ec->nodes)) {
+ cluster_link(ec->xl_list, link, ec->nodes, replies, output1,
+ frame, ec->xl, &srcloc, &loc, NULL);
+ }
+
+ if (EC_COUNT(create, ec->nodes)) {
+ cluster_readlink(ec->xl_list, name_data.same, ec->nodes,
+ replies, output, frame, ec->xl, &srcloc, 4096,
+ NULL);
+ if (EC_COUNT(output, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (output[i])
+ break;
+ }
+ linkname = alloca0(strlen(replies[i].buf) + 1);
+ strcpy(linkname, replies[i].buf);
+ ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on);
+ cluster_symlink(ec->xl_list, create, ec->nodes, replies, output,
+ frame, ec->xl, linkname, &loc, 0, xdata);
+ }
+ for (i = 0; i < ec->nodes; i++)
+ if (output1[i])
+ output[i] = 1;
+ break;
+ case IA_IFREG:
+ ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on);
+ config.version = EC_CONFIG_VERSION;
+ config.algorithm = EC_CONFIG_ALGORITHM;
+ config.gf_word_size = EC_GF_BITS;
+ config.bricks = ec->nodes;
+ config.redundancy = ec->redundancy;
+ config.chunk_size = EC_METHOD_CHUNK_SIZE;
+
+ ret = ec_dict_set_config(xdata, EC_XATTR_CONFIG, &config);
+ if (ret != 0) {
+ goto out;
+ }
+
+ /* Fall through */
+
+ default:
+ ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+ if (ret)
+ goto out;
+ ret = cluster_mknod(
+ ec->xl_list, enoent, ec->nodes, replies, output, frame, ec->xl,
+ &loc, st_mode_from_ia(ia->ia_prot, ia->ia_type),
+ makedev(ia_major(ia->ia_rdev), ia_minor(ia->ia_rdev)), 0,
+ xdata);
+ break;
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (enoent[i] && !output[i])
+ participants[i] = 0;
+ }
+
+ ret = 0;
+out:
+ if (ret < 0)
+ gf_msg_debug(ec->xl->name, 0, "%s/%s: heal failed %s",
+ uuid_utoa(parent->gfid), name, strerror(-ret));
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ loc_wipe(&srcloc);
+ if (xdata)
+ dict_unref(xdata);
+ return ret;
+}
+
+int
+__ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+ unsigned char *participants)
+{
+ unsigned char *output = NULL;
+ unsigned char *enoent = NULL;
+ default_args_cbk_t *replies = NULL;
+ dict_t *xdata = NULL;
+ dict_t *gfid_db = NULL;
+ int ret = 0;
+ loc_t loc = {0};
+ int i = 0;
+ struct iatt *ia = NULL;
+ char gfid[64] = {0};
+ unsigned char *same = NULL;
+ unsigned char *gfidless = NULL;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ loc.parent = inode_ref(parent);
+ loc.inode = inode_new(parent->table);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.name = name;
+ xdata = dict_new();
+ gfid_db = dict_new();
+ if (!xdata || !gfid_db || !loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_int32(xdata, GF_GFIDLESS_LOOKUP, 1);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ output = alloca0(ec->nodes);
+ gfidless = alloca0(ec->nodes);
+ enoent = alloca0(ec->nodes);
+ ret = cluster_lookup(ec->xl_list, participants, ec->nodes, replies, output,
+ frame, ec->xl, &loc, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret == -1) {
+ /*If ESTALE comes here, that means parent dir is not
+ * present, nothing to do there, so reset participants
+ * for that brick*/
+ if (replies[i].op_errno == ENOENT)
+ enoent[i] = 1;
+ else
+ participants[i] = 0;
+ continue;
+ }
+ ia = &replies[i].stat;
+ if (gf_uuid_is_null(ia->ia_gfid)) {
+ if (IA_ISDIR(ia->ia_type) || ia->ia_size == 0)
+ gfidless[i] = 1;
+ else
+ participants[i] = 0;
+ } else {
+ uuid_utoa_r(ia->ia_gfid, gfid);
+ ret = dict_get_bin(gfid_db, gfid, (void **)&same);
+ if (ret < 0) {
+ same = alloca0(ec->nodes);
+ }
+ same[i] = 1;
+ if (ret < 0) {
+ ret = dict_set_static_bin(gfid_db, gfid, same, ec->nodes);
+ }
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ ret = ec_delete_stale_names(frame, ec, parent, name, replies, gfid_db,
+ enoent, gfidless, participants);
+
+ if (gfid_db->count == 0) {
+ /* All entries seem to be stale entries and deleted,
+ * nothing more to do.*/
+ goto out;
+ }
+
+ if (gfid_db->count > 1) {
+ gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL,
+ "%s/%s: Not able to heal", uuid_utoa(parent->gfid), name);
+ memset(participants, 0, ec->nodes);
+ goto out;
+ }
+
+ EC_INTERSECT(enoent, enoent, participants, ec->nodes);
+ if (EC_COUNT(enoent, ec->nodes) == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
+ participants);
+ if (ret >= 0) {
+ /* If ec_create_name() succeeded we return 1 to indicate that a new
+ * file has been created and it will need to be healed. */
+ ret = 1;
+ }
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ if (xdata)
+ dict_unref(xdata);
+ if (gfid_db)
+ dict_unref(gfid_db);
+ return ret;
+}
+
+int
+ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+ unsigned char *participants)
+{
+ int ret = 0;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ unsigned char *locked_on = NULL;
+ loc_t loc = {0};
+
+ loc.parent = inode_ref(parent);
+ loc.name = name;
+ loc.inode = inode_new(parent->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ output = alloca0(ec->nodes);
+ locked_on = alloca0(ec->nodes);
+ ret = cluster_inodelk(ec->xl_list, participants, ec->nodes, replies,
+ locked_on, frame, ec->xl, ec->xl->name, parent, 0, 0);
+ {
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s/%s: Skipping "
+ "heal as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(parent->gfid), name, ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ EC_INTERSECT(participants, participants, locked_on, ec->nodes);
+ ret = __ec_heal_name(frame, ec, parent, name, participants);
+ }
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, parent, 0, 0);
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
+}
+
+int
+ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct ec_name_data *name_data = data;
+ xlator_t *this = THIS;
+ ec_t *ec = this->private;
+ unsigned char *name_on = alloca0(ec->nodes);
+ int i = 0;
+ int ret = 0;
+
+ if (ec->shutdown) {
+ gf_msg_debug(this->name, 0,
+ "Cancelling directory heal "
+ "because EC is stopping.");
+ return -ENOTCONN;
+ }
+
+ memcpy(name_on, name_data->participants, ec->nodes);
+ ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
+ name_on);
+
+ if (ret < 0) {
+ memset(name_on, 0, ec->nodes);
+ } else {
+ name_data->heal_pending += ret;
+ }
+
+ for (i = 0; i < ec->nodes; i++)
+ if (name_data->participants[i] && !name_on[i])
+ name_data->failed_on[i] = 1;
+
+ return 0;
+}
+
+int
+ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *participants, uint32_t *pending)
+{
+ int i = 0;
+ int j = 0;
+ loc_t loc = {0};
+ struct ec_name_data name_data = {0};
+ int ret = 0;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ name_data.frame = frame;
+ name_data.participants = participants;
+ name_data.failed_on = alloca0(ec->nodes);
+ name_data.heal_pending = 0;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!participants[i])
+ continue;
+ ret = syncop_dir_scan(ec->xl_list[i], &loc, GF_CLIENT_PID_SELF_HEALD,
+ &name_data, ec_name_heal_handler);
+ if (ret < 0) {
+ break;
+ }
+ for (j = 0; j < ec->nodes; j++)
+ if (name_data.failed_on[j])
+ participants[j] = 0;
+
+ if (EC_COUNT(participants, ec->nodes) <= ec->fragments) {
+ ret = -ENOTCONN;
+ break;
+ }
+ }
+ *pending += name_data.heal_pending;
+
+ loc_wipe(&loc);
+ return ret;
+}
+
+int
+__ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *heal_on, unsigned char *sources,
+ unsigned char *healed_sinks, uint32_t *pending)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *output = NULL;
+ uint64_t *versions = NULL;
+ uint64_t *dirty = NULL;
+ unsigned char *participants = NULL;
+ default_args_cbk_t *replies = NULL;
+ int ret = 0;
+ int source = 0;
+ int i = 0;
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ versions = alloca0(ec->nodes * sizeof(*versions));
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ ret = cluster_inodelk(ec->xl_list, heal_on, ec->nodes, replies, locked_on,
+ frame, ec->xl, ec->xl->name, inode, 0, 0);
+ {
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_heal_entry_prepare(frame, ec, inode, locked_on, versions,
+ dirty, sources, healed_sinks);
+ source = ret;
+ }
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, inode, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ participants = alloca0(ec->nodes);
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i] || healed_sinks[i])
+ participants[i] = 1;
+ }
+ ret = ec_heal_names(frame, ec, inode, participants, pending);
+
+ if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
+ goto out;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!participants[i]) {
+ sources[i] = 0;
+ healed_sinks[i] = 0;
+ }
+ }
+
+ ec_adjust_versions(frame, ec, EC_DATA_TXN, inode, source, sources,
+ healed_sinks, versions, dirty);
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
+}
+
+int
+ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *sources, unsigned char *healed_sinks,
+ uint32_t *pending)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+ unsigned char *output = NULL;
+ char selfheal_domain[1024] = {0};
+ int ret = 0;
+ default_args_cbk_t *replies = NULL;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ up_subvols = alloca0(ec->nodes);
+
+ sprintf(selfheal_domain, "%s:self-heal", ec->xl->name);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+ /*If other processes are already doing the heal, don't block*/
+ ret = cluster_tiebreaker_inodelk(ec->xl_list, up_subvols, ec->nodes,
+ replies, locked_on, frame, ec->xl,
+ selfheal_domain, inode, 0, 0);
+ {
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
+ healed_sinks, pending);
+ }
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, selfheal_domain, inode, 0, 0);
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
+}
+
+/*Find direction for data heal and heal info*/
+int
+ec_heal_data_find_direction(ec_t *ec, default_args_cbk_t *replies,
+ uint64_t *data_versions, uint64_t *dirty,
+ uint64_t *size, unsigned char *sources,
+ unsigned char *healed_sinks,
+ gf_boolean_t check_ondisksize, int which)
+{
+ uint64_t xattr[EC_VERSION_SIZE] = {0};
+ char version_size[128] = {0};
+ dict_t *version_size_db = NULL;
+ unsigned char *same = NULL;
+ int max_same_count = 0;
+ int source = 0;
+ int i = 0;
+ int ret = 0;
+ dict_t *dict = NULL;
+ uint64_t source_size = 0;
+
+ version_size_db = dict_new();
+ if (!version_size_db) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret < 0)
+ continue;
+ dict = (which == EC_COMBINE_XDATA) ? replies[i].xdata
+ : replies[i].xattr;
+
+ ret = ec_dict_get_array(dict, EC_XATTR_VERSION, xattr, EC_VERSION_SIZE);
+ if (ret == 0) {
+ data_versions[i] = xattr[EC_DATA_TXN];
+ }
+
+ memset(xattr, 0, sizeof(xattr));
+ ret = ec_dict_get_array(dict, EC_XATTR_DIRTY, xattr, EC_VERSION_SIZE);
+ if (ret == 0) {
+ dirty[i] = xattr[EC_DATA_TXN];
+ }
+ ret = ec_dict_del_number(dict, EC_XATTR_SIZE, &size[i]);
+ /*Build a db of same metadata and data version and size*/
+ snprintf(version_size, sizeof(version_size), "%" PRIu64 "-%" PRIu64,
+ data_versions[i], size[i]);
+
+ ret = dict_get_bin(version_size_db, version_size, (void **)&same);
+ if (ret < 0) {
+ same = alloca0(ec->nodes);
+ }
+
+ same[i] = 1;
+ if (max_same_count < EC_COUNT(same, ec->nodes)) {
+ max_same_count = EC_COUNT(same, ec->nodes);
+ source = i;
+ }
+
+ if (ret < 0) {
+ ret = dict_set_static_bin(version_size_db, version_size, same,
+ ec->nodes);
+ }
+
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ /* If we don't have ec->fragments number of same version,size it is not
+ * recoverable*/
+ if (max_same_count < ec->fragments) {
+ ret = -EIO;
+ goto out;
+ } else {
+ snprintf(version_size, sizeof(version_size), "%" PRIu64 "-%" PRIu64,
+ data_versions[source], size[source]);
+
+ ret = dict_get_bin(version_size_db, version_size, (void **)&same);
+ if (ret < 0)
+ goto out;
+ memcpy(sources, same, ec->nodes);
+ for (i = 0; i < ec->nodes; i++) {
+ if (replies[i].valid && (replies[i].op_ret == 0) && !sources[i])
+ healed_sinks[i] = 1;
+ }
+ }
+
+ /* There could be files with versions, size same but on disk ia_size
+ * could be different because of disk crashes, mark them as sinks as
+ * well*/
+
+ if (check_ondisksize) {
+ source_size = size[source];
+ ec_adjust_size_up(ec, &source_size, _gf_true);
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i]) {
+ if (replies[i].stat.ia_size != source_size) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ max_same_count--;
+ } else {
+ source = i;
+ }
+ }
+ }
+ if (max_same_count < ec->fragments) {
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+ ret = source;
+out:
+ if (version_size_db)
+ dict_unref(version_size_db);
+ return ret;
+}
+
+int
+__ec_heal_data_prepare(call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *locked_on, uint64_t *versions,
+ uint64_t *dirty, uint64_t *size, unsigned char *sources,
+ unsigned char *healed_sinks, unsigned char *trim,
+ struct iatt *stbuf)
+{
+ default_args_cbk_t *replies = NULL;
+ default_args_cbk_t *fstat_replies = NULL;
+ unsigned char *output = NULL;
+ unsigned char *fstat_output = NULL;
+ dict_t *xattrs = NULL;
+ uint64_t zero_array[2] = {0};
+ int source = 0;
+ int ret = 0;
+ uint64_t zero_value = 0;
+ int i = 0;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ EC_REPLIES_ALLOC(fstat_replies, ec->nodes);
+ output = alloca0(ec->nodes);
+ fstat_output = alloca0(ec->nodes);
+ xattrs = dict_new();
+ if (!xattrs ||
+ dict_set_static_bin(xattrs, EC_XATTR_VERSION, zero_array,
+ sizeof(zero_array)) ||
+ dict_set_static_bin(xattrs, EC_XATTR_DIRTY, zero_array,
+ sizeof(zero_array)) ||
+ dict_set_static_bin(xattrs, EC_XATTR_SIZE, &zero_value,
+ sizeof(zero_value))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = cluster_fxattrop(ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, fd, GF_XATTROP_ADD_ARRAY64, xattrs,
+ NULL);
+
+ ret = cluster_fstat(ec->xl_list, locked_on, ec->nodes, fstat_replies,
+ fstat_output, frame, ec->xl, fd, NULL);
+
+ for (i = 0; i < ec->nodes; i++) {
+ output[i] = output[i] && fstat_output[i];
+ replies[i].valid = output[i];
+ if (output[i])
+ replies[i].stat = fstat_replies[i].stat;
+ }
+
+ if (EC_COUNT(output, ec->nodes) <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ source = ec_heal_data_find_direction(ec, replies, versions, dirty, size,
+ sources, healed_sinks, _gf_true,
+ EC_COMBINE_DICT);
+ ret = source;
+ if (ret < 0)
+ goto out;
+
+ if (stbuf)
+ *stbuf = replies[source].stat;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (healed_sinks[i]) {
+ if (replies[i].stat.ia_size)
+ trim[i] = 1;
+ }
+ }
+
+ if (EC_COUNT(sources, ec->nodes) < ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ ret = source;
+out:
+ if (xattrs)
+ dict_unref(xattrs);
+ cluster_replies_wipe(replies, ec->nodes);
+ cluster_replies_wipe(fstat_replies, ec->nodes);
+ if (ret < 0) {
+ gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+ uuid_utoa(fd->inode->gfid), strerror(-ret));
+ } else {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: sources: %d, sinks: "
+ "%d",
+ uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
+ EC_COUNT(healed_sinks, ec->nodes));
+ }
+ return ret;
+}
+
+int
+__ec_heal_mark_sinks(call_frame_t *frame, ec_t *ec, fd_t *fd,
+ uint64_t *versions, unsigned char *healed_sinks)
+{
+ int i = 0;
+ int ret = 0;
+ unsigned char *mark = NULL;
+ dict_t *xattrs = NULL;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ uint64_t versions_xattr[2] = {0};
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ xattrs = dict_new();
+ if (!xattrs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mark = alloca0(ec->nodes);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!healed_sinks[i])
+ continue;
+ if ((versions[i] >> EC_SELFHEAL_BIT) & 1)
+ continue;
+ mark[i] = 1;
+ }
+
+ if (EC_COUNT(mark, ec->nodes) == 0)
+ return 0;
+
+ versions_xattr[EC_DATA_TXN] = hton64(1ULL << EC_SELFHEAL_BIT);
+ if (dict_set_static_bin(xattrs, EC_XATTR_VERSION, versions_xattr,
+ sizeof(versions_xattr))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ output = alloca0(ec->nodes);
+ ret = cluster_fxattrop(ec->xl_list, mark, ec->nodes, replies, output, frame,
+ ec->xl, fd, GF_XATTROP_ADD_ARRAY64, xattrs, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!output[i]) {
+ if (mark[i])
+ healed_sinks[i] = 0;
+ continue;
+ }
+ versions[i] |= (1ULL << EC_SELFHEAL_BIT);
+ }
+
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ if (xattrs)
+ dict_unref(xattrs);
+ if (ret < 0)
+ gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+ uuid_utoa(fd->inode->gfid), strerror(-ret));
+ return ret;
+}
+
+int32_t
+ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
+{
+ ec_heal_t *heal = fop->data;
+ heal->fop = fop;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ ec_owner_set(fop->frame, fop->frame->root);
+
+ ec_heal_inodelk(heal, F_WRLCK, 1, 0, 0);
+
+ return EC_STATE_HEAL_DATA_COPY;
+
+ case EC_STATE_HEAL_DATA_COPY:
+ gf_msg_debug(fop->xl->name, 0, "%s: read/write starting",
+ uuid_utoa(heal->fd->inode->gfid));
+ ec_heal_data_block(heal);
+
+ return EC_STATE_HEAL_DATA_UNLOCK;
+
+ case -EC_STATE_HEAL_DATA_COPY:
+ case -EC_STATE_HEAL_DATA_UNLOCK:
+ case EC_STATE_HEAL_DATA_UNLOCK:
+ ec_heal_inodelk(heal, F_UNLCK, 1, 0, 0);
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, 0, 0,
+ (heal->good | heal->bad), heal->good, heal->bad,
+ 0, NULL);
+ }
+
+ return EC_STATE_END;
+ case -EC_STATE_REPORT:
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
+ fop->error, 0, 0, 0, 0, NULL);
+ }
+
+ return EC_STATE_END;
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, 0, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+/*Takes lock */
+void
+ec_heal_block(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_heal_cbk_t func, ec_heal_t *heal)
+{
+ ec_cbk_t callback = {.heal = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(HEAL) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags,
+ NULL, ec_manager_heal_block, callback, heal);
+ if (fop == NULL)
+ goto out;
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
+ }
+}
+
+int32_t
+ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask,
+ uintptr_t good, uintptr_t bad, uint32_t pending,
+ dict_t *xdata)
+{
+ ec_heal_t *heal = cookie;
+
+ if (heal->fop) {
+ heal->fop->heal = NULL;
+ }
+ heal->fop = NULL;
+ heal->error = op_ret < 0 ? op_errno : 0;
+ syncbarrier_wake(heal->data);
+ return 0;
+}
+
+int
+ec_sync_heal_block(call_frame_t *frame, xlator_t *this, ec_heal_t *heal)
+{
+ ec_heal_block(frame, this, heal->bad | heal->good, EC_MINIMUM_ONE,
+ ec_heal_block_done, heal);
+ syncbarrier_wait(heal->data, 1);
+ if (heal->error != 0) {
+ return -heal->error;
+ }
+ if (heal->bad == 0)
+ return -ENOTCONN;
+ return 0;
+}
+
+int
+ec_rebuild_data(call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ ec_heal_t *heal = NULL;
+ int ret = 0;
+ syncbarrier_t barrier;
+
+ if (syncbarrier_init(&barrier))
+ return -ENOMEM;
+
+ heal = alloca0(sizeof(*heal));
+ heal->fd = fd_ref(fd);
+ heal->xl = ec->xl;
+ heal->data = &barrier;
+ ec_adjust_size_up(ec, &size, _gf_false);
+ heal->total_size = size;
+ heal->size = (128 * GF_UNIT_KB * (ec->self_heal_window_size));
+ /* We need to adjust the size to a multiple of the stripe size of the
+ * volume. Otherwise writes would need to fill gaps (head and/or tail)
+ * with existent data from the bad bricks. This could be garbage on a
+ * damaged file or it could fail if there aren't enough bricks. */
+ heal->size -= heal->size % ec->stripe_size;
+ heal->bad = ec_char_array_to_mask(healed_sinks, ec->nodes);
+ heal->good = ec_char_array_to_mask(sources, ec->nodes);
+ heal->iatt.ia_type = IA_IFREG;
+ LOCK_INIT(&heal->lock);
+
+ for (heal->offset = 0; (heal->offset < size) && !heal->done;
+ heal->offset += heal->size) {
+ /* We immediately abort any heal if a shutdown request has been
+ * received to avoid delays. The healing of this file will be
+ * restarted by another SHD or other client that accesses the
+ * file. */
+ if (ec->shutdown) {
+ gf_msg_debug(ec->xl->name, 0,
+ "Cancelling heal because "
+ "EC is stopping.");
+ ret = -ENOTCONN;
+ break;
+ }
+
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: sources: %d, sinks: "
+ "%d, offset: %" PRIu64 " bsize: %" PRIu64,
+ uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
+ EC_COUNT(healed_sinks, ec->nodes), heal->offset,
+ heal->size);
+ ret = ec_sync_heal_block(frame, ec->xl, heal);
+ if (ret < 0)
+ break;
+ }
+ memset(healed_sinks, 0, ec->nodes);
+ ec_mask_to_char_array(heal->bad, healed_sinks, ec->nodes);
+ fd_unref(heal->fd);
+ LOCK_DESTROY(&heal->lock);
+ syncbarrier_destroy(heal->data);
+ if (ret < 0)
+ gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+ uuid_utoa(fd->inode->gfid), strerror(-ret));
+ return ret;
+}
+
+int
+__ec_heal_trim_sinks(call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *healed_sinks, unsigned char *trim,
+ uint64_t size)
+{
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ int ret = 0;
+ int i = 0;
+ off_t trim_offset = 0;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ output = alloca0(ec->nodes);
+
+ if (EC_COUNT(trim, ec->nodes) == 0) {
+ ret = 0;
+ goto out;
+ }
+ trim_offset = size;
+ ec_adjust_offset_up(ec, &trim_offset, _gf_true);
+ ret = cluster_ftruncate(ec->xl_list, trim, ec->nodes, replies, output,
+ frame, ec->xl, fd, trim_offset, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!output[i] && trim[i])
+ healed_sinks[i] = 0;
+ }
+
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ if (ret < 0)
+ gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+ uuid_utoa(fd->inode->gfid), strerror(-ret));
+ return ret;
+}
+
+int
+ec_data_undo_pending(call_frame_t *frame, ec_t *ec, fd_t *fd, dict_t *xattr,
+ uint64_t *versions, uint64_t *dirty, uint64_t *size,
+ int source, gf_boolean_t erase_dirty, int idx)
+{
+ uint64_t versions_xattr[2] = {0};
+ uint64_t dirty_xattr[2] = {0};
+ uint64_t allzero[2] = {0};
+ uint64_t size_xattr = 0;
+ int ret = 0;
+
+ versions_xattr[EC_DATA_TXN] = hton64(versions[source] - versions[idx]);
+ ret = dict_set_static_bin(xattr, EC_XATTR_VERSION, versions_xattr,
+ sizeof(versions_xattr));
+ if (ret < 0)
+ goto out;
+
+ size_xattr = hton64(size[source] - size[idx]);
+ ret = dict_set_static_bin(xattr, EC_XATTR_SIZE, &size_xattr,
+ sizeof(size_xattr));
+ if (ret < 0)
+ goto out;
+
+ if (erase_dirty) {
+ dirty_xattr[EC_DATA_TXN] = hton64(-dirty[idx]);
+ ret = dict_set_static_bin(xattr, EC_XATTR_DIRTY, dirty_xattr,
+ sizeof(dirty_xattr));
+ if (ret < 0)
+ goto out;
+ }
+
+ if ((memcmp(versions_xattr, allzero, sizeof(allzero)) == 0) &&
+ (memcmp(dirty_xattr, allzero, sizeof(allzero)) == 0) &&
+ (size_xattr == 0)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = syncop_fxattrop(ec->xl_list[idx], fd, GF_XATTROP_ADD_ARRAY64, xattr,
+ NULL, NULL, NULL);
+out:
+ return ret;
+}
+
+int
+__ec_fd_data_adjust_versions(call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *sources,
+ unsigned char *healed_sinks, uint64_t *versions,
+ uint64_t *dirty, uint64_t *size)
+{
+ dict_t *xattr = NULL;
+ int i = 0;
+ int ret = 0;
+ int op_ret = 0;
+ int source = -1;
+ gf_boolean_t erase_dirty = _gf_false;
+
+ xattr = dict_new();
+ if (!xattr) {
+ op_ret = -ENOMEM;
+ goto out;
+ }
+
+ /* dirty xattr represents if the file needs heal. Unless all the
+ * copies are healed, don't erase it */
+ if (EC_COUNT(sources, ec->nodes) + EC_COUNT(healed_sinks, ec->nodes) ==
+ ec->nodes)
+ erase_dirty = _gf_true;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+
+ if (source == -1) {
+ op_ret = -ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (healed_sinks[i]) {
+ ret = ec_data_undo_pending(frame, ec, fd, xattr, versions, dirty,
+ size, source, erase_dirty, i);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ if (!erase_dirty)
+ goto out;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i]) {
+ ret = ec_data_undo_pending(frame, ec, fd, xattr, versions, dirty,
+ size, source, erase_dirty, i);
+ if (ret < 0)
+ continue;
+ }
+ }
+out:
+ if (xattr)
+ dict_unref(xattr);
+ return op_ret;
+}
+
+int
+ec_restore_time_and_adjust_versions(call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *sources,
+ unsigned char *healed_sinks,
+ uint64_t *versions, uint64_t *dirty,
+ uint64_t *size)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *participants = NULL;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *postsh_sources = NULL;
+ unsigned char *postsh_healed_sinks = NULL;
+ unsigned char *postsh_trim = NULL;
+ uint64_t *postsh_versions = NULL;
+ uint64_t *postsh_dirty = NULL;
+ uint64_t *postsh_size = NULL;
+ int ret = 0;
+ int i = 0;
+ struct iatt source_buf = {0};
+ loc_t loc = {0};
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ participants = alloca0(ec->nodes);
+ postsh_sources = alloca0(ec->nodes);
+ postsh_healed_sinks = alloca0(ec->nodes);
+ postsh_trim = alloca0(ec->nodes);
+ postsh_versions = alloca0(ec->nodes * sizeof(*postsh_versions));
+ postsh_dirty = alloca0(ec->nodes * sizeof(*postsh_dirty));
+ postsh_size = alloca0(ec->nodes * sizeof(*postsh_size));
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (healed_sinks[i] || sources[i])
+ participants[i] = 1;
+ }
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ ret = cluster_inodelk(ec->xl_list, participants, ec->nodes, replies,
+ locked_on, frame, ec->xl, ec->xl->name, fd->inode, 0,
+ 0);
+ {
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(fd->inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __ec_heal_data_prepare(frame, ec, fd, locked_on, postsh_versions,
+ postsh_dirty, postsh_size, postsh_sources,
+ postsh_healed_sinks, postsh_trim,
+ &source_buf);
+ if (ret < 0)
+ goto unlock;
+
+ loc.inode = inode_ref(fd->inode);
+ gf_uuid_copy(loc.gfid, fd->inode->gfid);
+ ret = cluster_setattr(
+ ec->xl_list, healed_sinks, ec->nodes, replies, output, frame,
+ ec->xl, &loc, &source_buf,
+ GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME, NULL);
+ EC_INTERSECT(healed_sinks, healed_sinks, output, ec->nodes);
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_fd_data_adjust_versions(frame, ec, fd, sources, healed_sinks,
+ versions, dirty, size);
+ }
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, fd->inode, 0, 0);
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
+}
+
+int
+__ec_heal_data(call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *output = NULL;
+ uint64_t *versions = NULL;
+ uint64_t *dirty = NULL;
+ uint64_t *size = NULL;
+ unsigned char *trim = NULL;
+ default_args_cbk_t *replies = NULL;
+ int ret = 0;
+ int source = 0;
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ trim = alloca0(ec->nodes);
+ versions = alloca0(ec->nodes * sizeof(*versions));
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+ size = alloca0(ec->nodes * sizeof(*size));
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ ret = cluster_inodelk(ec->xl_list, heal_on, ec->nodes, replies, locked_on,
+ frame, ec->xl, ec->xl->name, fd->inode, 0, 0);
+ {
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(fd->inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __ec_heal_data_prepare(frame, ec, fd, locked_on, versions, dirty,
+ size, sources, healed_sinks, trim, NULL);
+ if (ret < 0)
+ goto unlock;
+
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = __ec_fd_data_adjust_versions(
+ frame, ec, fd, sources, healed_sinks, versions, dirty, size);
+ goto unlock;
+ }
+
+ source = ret;
+ ret = __ec_heal_mark_sinks(frame, ec, fd, versions, healed_sinks);
+ if (ret < 0)
+ goto unlock;
+
+ ret = __ec_heal_trim_sinks(frame, ec, fd, healed_sinks, trim,
+ size[source]);
+ }
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, fd->inode, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0)
+ goto out;
+
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: sources: %d, sinks: "
+ "%d",
+ uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
+ EC_COUNT(healed_sinks, ec->nodes));
+
+ ret = ec_rebuild_data(frame, ec, fd, size[source], sources, healed_sinks);
+ if (ret < 0)
+ goto out;
+
+ ret = ec_restore_time_and_adjust_versions(
+ frame, ec, fd, sources, healed_sinks, versions, dirty, size);
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
+}
+
+int
+ec_heal_data(call_frame_t *frame, ec_t *ec, gf_boolean_t block, inode_t *inode,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+ fd_t *fd = NULL;
+ loc_t loc = {0};
+ char selfheal_domain[1024] = {0};
+ int ret = 0;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ up_subvols = alloca0(ec->nodes);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ fd = fd_create(inode, 0);
+ if (!fd) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+
+ ret = cluster_open(ec->xl_list, up_subvols, ec->nodes, replies, output,
+ frame, ec->xl, &loc, O_RDWR | O_LARGEFILE, fd, NULL);
+ if (ret <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ fd_bind(fd);
+ sprintf(selfheal_domain, "%s:self-heal", ec->xl->name);
+ /*If other processes are already doing the heal, don't block*/
+ if (block) {
+ ret = cluster_inodelk(ec->xl_list, output, ec->nodes, replies,
+ locked_on, frame, ec->xl, selfheal_domain, inode,
+ 0, 0);
+ } else {
+ ret = cluster_tiebreaker_inodelk(ec->xl_list, output, ec->nodes,
+ replies, locked_on, frame, ec->xl,
+ selfheal_domain, inode, 0, 0);
+ }
+ {
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_heal_data(frame, ec, fd, locked_on, sources, healed_sinks);
+ }
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, selfheal_domain, inode, 0, 0);
+out:
+ if (fd)
+ fd_unref(fd);
+ loc_wipe(&loc);
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
+}
+
+int
+ec_heal_purge_stale_index(call_frame_t *frame, ec_t *ec, inode_t *inode)
+{
+ int i = 0;
+ int ret = 0;
+ dict_t **xattr = NULL;
+ loc_t loc = {0};
+ uint64_t dirty_xattr[EC_VERSION_SIZE] = {0};
+ unsigned char *on = NULL;
+ default_args_cbk_t *replies = NULL;
+ dict_t *dict = NULL;
+
+ /* Allocate the required memory */
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ on = alloca0(ec->nodes);
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer);
+ if (!xattr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dict = dict_new();
+ if (!dict) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < ec->nodes; i++) {
+ xattr[i] = dict;
+ on[i] = 1;
+ }
+ ret = dict_set_static_bin(dict, EC_XATTR_DIRTY, dirty_xattr,
+ (sizeof(*dirty_xattr) * EC_VERSION_SIZE));
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame,
+ ec_wind_xattrop_parallel, &loc, GF_XATTROP_ADD_ARRAY64,
+ xattr, NULL);
+out:
+ if (dict) {
+ dict_unref(dict);
+ }
+ if (xattr) {
+ GF_FREE(xattr);
+ }
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
+}
+
+void
+ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+{
+ call_frame_t *frame = NULL;
+ unsigned char *participants = NULL;
+ unsigned char *msources = NULL;
+ unsigned char *mhealed_sinks = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ ec_t *ec = NULL;
+ int ret = 0;
+ int op_ret = 0;
+ int op_errno = 0;
+ intptr_t mgood = 0;
+ intptr_t mbad = 0;
+ intptr_t good = 0;
+ intptr_t bad = 0;
+ uint32_t pending = 0;
+ ec_fop_data_t *fop = data;
+ gf_boolean_t blocking = _gf_false;
+ ec_heal_need_t need_heal = EC_HEAL_NONEED;
+ unsigned char *up_subvols = NULL;
+ char up_bricks[32];
+
+ ec = this->private;
+
+ /* If it is heal request from getxattr, complete the heal and then
+ * unwind, if it is ec_heal with NULL as frame then no need to block
+ * the heal as the caller doesn't care about its completion. In case
+ * of heald whichever gets tiebreaking inodelk will take care of the
+ * heal, so no need to block*/
+ if (fop->req_frame && !ec->shd.iamshd)
+ blocking = _gf_true;
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ ec_owner_set(frame, frame->root);
+ /*Do heal as root*/
+ frame->root->uid = 0;
+ frame->root->gid = 0;
+ /*Mark the fops as internal*/
+ frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+ participants = alloca0(ec->nodes);
+ ec_mask_to_char_array(ec->xl_up, participants, ec->nodes);
+
+ up_subvols = alloca0(ec->nodes);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+
+ if (loc->name && strlen(loc->name)) {
+ ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
+ participants);
+ if (ret >= 0) {
+ gf_msg_debug(this->name, 0,
+ "%s: name heal "
+ "successful on %" PRIXPTR,
+ loc->path,
+ ec_char_array_to_mask(participants, ec->nodes));
+ } else {
+ gf_msg_debug(
+ this->name, 0,
+ "%s: name heal "
+ "failed. ret = %d, subvolumes up = %s",
+ loc->path, ret,
+ ec_bin(up_bricks, sizeof(up_bricks), ec->xl_up, ec->nodes));
+ }
+ }
+
+ /* Mount triggers heal only when it detects that it must need heal, shd
+ * triggers heals periodically which need not be thorough*/
+ if (ec->shd.iamshd && (ret <= 0)) {
+ ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
+ &need_heal);
+
+ if (need_heal == EC_HEAL_PURGE_INDEX) {
+ gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL,
+ "Index entry needs to be purged for: %s ",
+ uuid_utoa(loc->gfid));
+ /* We need to send zero-xattrop so that stale index entry could be
+ * removed. We need not take lock on this entry to do so as
+ * xattrop on a brick is atomic. */
+ ec_heal_purge_stale_index(frame, ec, loc->inode);
+ goto out;
+ } else if (need_heal == EC_HEAL_NONEED) {
+ gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL,
+ "Heal is not required for : %s ", uuid_utoa(loc->gfid));
+ goto out;
+ }
+ }
+
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ if (IA_ISREG(loc->inode->ia_type)) {
+ ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
+ healed_sinks);
+ } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
+ ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
+ &pending);
+ } else {
+ ret = 0;
+ memcpy(sources, participants, ec->nodes);
+ memcpy(healed_sinks, participants, ec->nodes);
+ }
+
+ if (ret == 0) {
+ good = ec_char_array_to_mask(sources, ec->nodes);
+ bad = ec_char_array_to_mask(healed_sinks, ec->nodes);
+ } else {
+ op_ret = -1;
+ op_errno = -ret;
+ }
+ msources = alloca0(ec->nodes);
+ mhealed_sinks = alloca0(ec->nodes);
+ ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
+ if (ret == 0) {
+ mgood = ec_char_array_to_mask(msources, ec->nodes);
+ mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
+ } else {
+ op_ret = -1;
+ op_errno = -ret;
+ }
+
+out:
+ ec_reset_entry_healing(fop);
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, op_ret, op_errno,
+ ec_char_array_to_mask(participants, ec->nodes),
+ mgood & good, mbad & bad, pending, NULL);
+ }
+ if (frame)
+ STACK_DESTROY(frame->root);
+
+ return;
+}
+
+int
+ec_synctask_heal_wrap(void *opaque)
+{
+ ec_fop_data_t *fop = opaque;
+ ec_heal_do(fop->xl, fop, &fop->loc[0], fop->int32);
+ return 0;
+}
+
+int
+ec_heal_done(int ret, call_frame_t *heal, void *opaque)
+{
+ if (opaque)
+ ec_fop_data_release(opaque);
+ return 0;
+}
+
+ec_fop_data_t *
+__ec_dequeue_heals(ec_t *ec)
+{
+ ec_fop_data_t *fop = NULL;
+
+ if (list_empty(&ec->heal_waiting))
+ goto none;
+
+ if ((ec->background_heals > 0) && (ec->healers >= ec->background_heals))
+ goto none;
+
+ fop = list_entry(ec->heal_waiting.next, ec_fop_data_t, healer);
+ ec->heal_waiters--;
+ list_del_init(&fop->healer);
+ list_add(&fop->healer, &ec->healing);
+ ec->healers++;
+ return fop;
+none:
+ gf_msg_debug(ec->xl->name, 0, "Num healers: %d, Num Waiters: %d",
+ ec->healers, ec->heal_waiters);
+ return NULL;
+}
+
+void
+ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
+{
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
+ 0, 0, NULL);
+ }
+ ec_fop_data_release(fop);
+}
+
+void
+ec_launch_heal(ec_t *ec, ec_fop_data_t *fop)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+
+ frame = create_frame(ec->xl, ec->xl->ctx->pool);
+ if (!frame) {
+ ret = -1;
+ goto out;
+ }
+
+ ec_owner_set(frame, frame->root);
+ /*Do heal as root*/
+ frame->root->uid = 0;
+ frame->root->gid = 0;
+ /*Mark the fops as internal*/
+ frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+
+ ret = synctask_new(ec->xl->ctx->env, ec_synctask_heal_wrap, ec_heal_done,
+ frame, fop);
+out:
+ if (ret < 0) {
+ ec_fop_set_error(fop, ENOMEM);
+ ec_heal_fail(ec, fop);
+ }
+
+ if (frame)
+ STACK_DESTROY(frame->root);
+}
+
+void
+ec_handle_healers_done(ec_fop_data_t *fop)
+{
+ ec_t *ec = fop->xl->private;
+ ec_fop_data_t *heal_fop = NULL;
+
+ if (list_empty(&fop->healer))
+ return;
+
+ LOCK(&ec->lock);
+
+ list_del_init(&fop->healer);
+
+ do {
+ ec->healers--;
+ heal_fop = __ec_dequeue_heals(ec);
+
+ if ((heal_fop != NULL) && ec->shutdown) {
+ /* This will prevent ec_handle_healers_done() to be
+ * called recursively. That would be problematic if
+ * the queue is too big. */
+ list_del_init(&heal_fop->healer);
+
+ UNLOCK(&ec->lock);
+
+ ec_fop_set_error(fop, ENOTCONN);
+ ec_heal_fail(ec, heal_fop);
+
+ LOCK(&ec->lock);
+ }
+ } while ((heal_fop != NULL) && ec->shutdown);
+
+ UNLOCK(&ec->lock);
+
+ if (heal_fop)
+ ec_launch_heal(ec, heal_fop);
+}
+
+gf_boolean_t
+ec_is_entry_healing(ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ int32_t heal_count = 0;
+ loc_t *loc = NULL;
+
+ loc = &fop->loc[0];
+
+ LOCK(&loc->inode->lock);
+ {
+ ctx = __ec_inode_get(loc->inode, fop->xl);
+ if (ctx) {
+ heal_count = ctx->heal_count;
+ }
+ }
+ UNLOCK(&loc->inode->lock);
+ GF_ASSERT(heal_count >= 0);
+ return heal_count;
+}
+
+void
+ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
+{
+ gf_boolean_t can_heal = _gf_true;
+ ec_t *ec = this->private;
+ ec_fop_data_t *fop_rel = NULL;
+
+ if (fop->req_frame == NULL) {
+ LOCK(&ec->lock);
+ {
+ if ((ec->background_heals > 0) &&
+ (ec->heal_wait_qlen + ec->background_heals) >
+ (ec->heal_waiters + ec->healers)) {
+ if (!ec_is_entry_healing(fop)) {
+ list_add_tail(&fop->healer, &ec->heal_waiting);
+ ec->heal_waiters++;
+ ec_set_entry_healing(fop);
+ } else {
+ fop_rel = fop;
+ }
+ fop = __ec_dequeue_heals(ec);
+ } else {
+ can_heal = _gf_false;
+ }
+ }
+ UNLOCK(&ec->lock);
+ }
+
+ if (can_heal) {
+ if (fop) {
+ if (fop->req_frame != NULL) {
+ ec_set_entry_healing(fop);
+ }
+ ec_launch_heal(ec, fop);
+ }
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Max number of heals are "
+ "pending, background self-heal rejected");
+ ec_fop_set_error(fop, EBUSY);
+ ec_heal_fail(ec, fop);
+ }
+ if (fop_rel) {
+ ec_heal_done(0, NULL, fop_rel);
+ }
+}
+
+void
+ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_heal_cbk_t func, void *data, loc_t *loc,
+ int32_t partial, dict_t *xdata)
+{
+ ec_cbk_t callback = {.heal = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t err = EINVAL;
+
+ gf_msg_trace("ec", 0, "EC(HEAL) %p", frame);
+
+ VALIDATE_OR_GOTO(this, fail);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, fail);
+
+ if (!loc || !loc->inode || gf_uuid_is_null(loc->inode->gfid))
+ goto fail;
+
+ if (frame && frame->local)
+ goto fail;
+ fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags,
+ NULL, NULL, callback, data);
+
+ err = ENOMEM;
+
+ if (fop == NULL)
+ goto fail;
+
+ fop->int32 = partial;
+
+ if (loc) {
+ if (loc_copy(&fop->loc[0], loc) != 0)
+ goto fail;
+ }
+
+ if (xdata)
+ fop->xdata = dict_ref(xdata);
+
+ ec_heal_throttle(this, fop);
+
+ return;
+
+fail:
+ if (fop)
+ ec_fop_data_release(fop);
+ if (func)
+ func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
+}
+
+int
+ec_replace_heal_done(int ret, call_frame_t *heal, void *opaque)
+{
+ ec_t *ec = opaque;
+ gf_boolean_t last_fop = _gf_false;
+
+ if (GF_ATOMIC_DEC(ec->async_fop_count) == 0) {
+ LOCK(&ec->lock);
+ {
+ last_fop = __ec_is_last_fop(ec);
+ }
+ UNLOCK(&ec->lock);
+ }
+ gf_msg_debug(ec->xl->name, 0, "getxattr on bricks is done ret %d", ret);
+
+ if (last_fop)
+ ec_pending_fops_completed(ec);
+
+ return 0;
+}
+
+int32_t
+ec_replace_heal(ec_t *ec, inode_t *inode)
+{
+ loc_t loc = {0};
+ int ret = 0;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ ret = syncop_getxattr(ec->xl, &loc, NULL, EC_XATTR_HEAL, NULL, NULL);
+ if (ret < 0)
+ gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d",
+ ret);
+
+ /* Once the root inode has been checked, it might have triggered a
+ * self-heal on it after a replace brick command or for some other
+ * reason. It can also happen that the volume already had damaged
+ * files in the index, even if the heal on the root directory failed.
+ * In both cases we need to wake all index healers to continue
+ * healing remaining entries that are marked as dirty. */
+ ec_shd_index_healer_wake(ec);
+
+ loc_wipe(&loc);
+ return ret;
+}
+
+int32_t
+ec_replace_brick_heal_wrap(void *opaque)
+{
+ ec_t *ec = opaque;
+ inode_table_t *itable = NULL;
+ int32_t ret = -1;
+
+ if (ec->xl->itable)
+ itable = ec->xl->itable;
+ else
+ goto out;
+
+ if (xlator_is_cleanup_starting(ec->xl))
+ goto out;
+
+ ret = ec_replace_heal(ec, itable->root);
+out:
+ return ret;
+}
+
+int32_t
+ec_launch_replace_heal(ec_t *ec)
+{
+ int ret = -1;
+
+ ret = synctask_new(ec->xl->ctx->env, ec_replace_brick_heal_wrap,
+ ec_replace_heal_done, NULL, ec);
+
+ if (ret < 0) {
+ gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d",
+ ret);
+ ec_replace_heal_done(-1, NULL, ec);
+ }
+
+ return ret;
+}
+
+int32_t
+ec_set_heal_info(dict_t **dict_rsp, char *status)
+{
+ dict_t *dict = NULL;
+ int ret = 0;
+
+ dict = dict_new();
+ if (!dict) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = dict_set_str(dict, "heal-info", status);
+ if (ret) {
+ gf_msg(THIS->name, GF_LOG_WARNING, -ret, EC_MSG_HEAL_FAIL,
+ "Failed to set heal-info key to "
+ "%s",
+ status);
+ dict_unref(dict);
+ dict = NULL;
+ }
+ *dict_rsp = dict;
+out:
+ return ret;
+}
+
+static int32_t
+_need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources,
+ gf_boolean_t self_locked, int32_t lock_count,
+ ec_heal_need_t *need_heal, uint64_t *versions)
+{
+ int i = 0;
+ int source_count = 0;
+
+ source_count = EC_COUNT(sources, ec->nodes);
+ if (source_count == ec->nodes) {
+ *need_heal = EC_HEAL_NONEED;
+ if (self_locked || lock_count == 0) {
+ for (i = 0; i < ec->nodes; i++) {
+ if (dirty[i] || (versions[i] != versions[0])) {
+ *need_heal = EC_HEAL_MUST;
+ goto out;
+ }
+ }
+ /* If lock count is 0, all dirty flags are 0 and all the
+ * versions are macthing then why are we here. It looks
+ * like something went wrong while removing the index entries
+ * after completing a successful heal or fop. In this case
+ * we need to remove this index entry to avoid triggering heal
+ * in a loop and causing lookups again and again*/
+ *need_heal = EC_HEAL_PURGE_INDEX;
+ } else {
+ for (i = 0; i < ec->nodes; i++) {
+ /* Since each lock can only increment the dirty
+ * count once, if dirty is > 1 it means that
+ * another operation has left the dirty count
+ * set and this indicates a problem in the
+ * inode.*/
+ if (dirty[i] > 1) {
+ *need_heal = EC_HEAL_MUST;
+ goto out;
+ }
+ if (dirty[i] != dirty[0] || (versions[i] != versions[0])) {
+ *need_heal = EC_HEAL_MAYBE;
+ }
+ }
+ }
+ } else {
+ *need_heal = EC_HEAL_MUST;
+ }
+
+out:
+ return source_count;
+}
+
+static int32_t
+ec_need_metadata_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+ uint64_t *dirty = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ uint64_t *meta_versions = NULL;
+ int ret = 0;
+
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+ meta_versions = alloca0(ec->nodes * sizeof(*meta_versions));
+ ret = ec_heal_metadata_find_direction(ec, replies, meta_versions, dirty,
+ sources, healed_sinks);
+ if (ret < 0 && ret != -EIO) {
+ goto out;
+ }
+
+ ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
+ need_heal, meta_versions);
+out:
+ return ret;
+}
+
+static int32_t
+ec_need_data_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+ uint64_t *dirty = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ uint64_t *data_versions = NULL;
+ uint64_t *size = NULL;
+ int ret = 0;
+
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+ data_versions = alloca0(ec->nodes * sizeof(*data_versions));
+ size = alloca0(ec->nodes * sizeof(*size));
+
+ /* When dd is going on and heal info is called there is a very good
+ * chance for on disk sizes to mismatch even though nothing is wrong
+ * we don't need ondisk size check there. But if the file is either
+ * self-locked or the caller wants a thorough check then make sure to
+ * perform on disk check also. */
+ ret = ec_heal_data_find_direction(
+ ec, replies, data_versions, dirty, size, sources, healed_sinks,
+ self_locked || thorough, EC_COMBINE_XDATA);
+ if (ret < 0 && ret != -EIO) {
+ goto out;
+ }
+
+ ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
+ need_heal, data_versions);
+out:
+ return ret;
+}
+
+static int32_t
+ec_need_entry_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+ uint64_t *dirty = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ uint64_t *data_versions = NULL;
+ int ret = 0;
+
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+ data_versions = alloca0(ec->nodes * sizeof(*data_versions));
+
+ ret = ec_heal_entry_find_direction(ec, replies, data_versions, dirty,
+ sources, healed_sinks);
+ if (ret < 0 && ret != -EIO) {
+ goto out;
+ }
+
+ ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
+ need_heal, data_versions);
+out:
+ return ret;
+}
+
+static int32_t
+ec_need_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+ int ret = 0;
+
+ ret = ec_need_metadata_heal(ec, inode, replies, lock_count, self_locked,
+ thorough, need_heal);
+ if (ret < 0)
+ goto out;
+
+ if (*need_heal == EC_HEAL_MUST)
+ goto out;
+
+ if (inode->ia_type == IA_IFREG) {
+ ret = ec_need_data_heal(ec, inode, replies, lock_count, self_locked,
+ thorough, need_heal);
+ } else if (inode->ia_type == IA_IFDIR) {
+ ret = ec_need_entry_heal(ec, inode, replies, lock_count, self_locked,
+ thorough, need_heal);
+ }
+
+out:
+ return ret;
+}
+
+int32_t
+ec_heal_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *locked_on, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+ loc_t loc = {0};
+ int i = 0;
+ int ret = 0;
+ dict_t *xdata = NULL;
+ uint64_t zero_array[2] = {0};
+ uint64_t zero_value = 0;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+ int32_t lock_count = 0;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ output = alloca0(ec->nodes);
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ xdata = dict_new();
+ if (!xdata ||
+ dict_set_static_bin(xdata, EC_XATTR_VERSION, zero_array,
+ sizeof(zero_array)) ||
+ dict_set_static_bin(xdata, EC_XATTR_DIRTY, zero_array,
+ sizeof(zero_array)) ||
+ dict_set_static_bin(xdata, EC_XATTR_SIZE, &zero_value,
+ sizeof(zero_value))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (!self_locked) {
+ ret = dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT, ec->xl->name);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, &loc, xdata);
+
+ if (ret != ec->nodes) {
+ ret = ec->nodes;
+ *need_heal = EC_HEAL_MUST;
+ goto out;
+ }
+
+ if (self_locked)
+ goto need_heal;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!output[i] || !replies[i].xdata) {
+ continue;
+ }
+ if ((dict_get_int32(replies[i].xdata, GLUSTERFS_INODELK_COUNT,
+ &lock_count) == 0) &&
+ lock_count > 0) {
+ break;
+ }
+ }
+need_heal:
+ ret = ec_need_heal(ec, inode, replies, lock_count, self_locked, thorough,
+ need_heal);
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ if (xdata) {
+ dict_unref(xdata);
+ }
+ return ret;
+}
+
+int32_t
+ec_heal_locked_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ ec_heal_need_t *need_heal)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+ int ret = 0;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ up_subvols = alloca0(ec->nodes);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+
+ ret = cluster_inodelk(ec->xl_list, up_subvols, ec->nodes, replies,
+ locked_on, frame, ec->xl, ec->xl->name, inode, 0, 0);
+ if (ret != ec->nodes) {
+ *need_heal = EC_HEAL_MUST;
+ goto unlock;
+ }
+ ret = ec_heal_inspect(frame, ec, inode, locked_on, _gf_true, _gf_true,
+ need_heal);
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, inode, 0, 0);
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
+}
+
+int32_t
+ec_get_heal_info(xlator_t *this, loc_t *entry_loc, dict_t **dict_rsp)
+{
+ int ret = -ENOMEM;
+ ec_heal_need_t need_heal = EC_HEAL_NONEED;
+ call_frame_t *frame = NULL;
+ ec_t *ec = NULL;
+ unsigned char *up_subvols = NULL;
+ loc_t loc = {
+ 0,
+ };
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, entry_loc, out);
+
+ ec = this->private;
+ up_subvols = alloca0(ec->nodes);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+
+ if (EC_COUNT(up_subvols, ec->nodes) != ec->nodes) {
+ need_heal = EC_HEAL_MUST;
+ goto set_heal;
+ }
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame) {
+ goto out;
+ }
+ ec_owner_set(frame, frame->root);
+ frame->root->uid = 0;
+ frame->root->gid = 0;
+ frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+
+ if (loc_copy(&loc, entry_loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+ goto out;
+ }
+ if (!loc.inode) {
+ ret = syncop_inode_find(this, this, loc.gfid, &loc.inode, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = ec_heal_inspect(frame, ec, loc.inode, up_subvols, _gf_false,
+ _gf_false, &need_heal);
+ if (ret == ec->nodes && need_heal != EC_HEAL_MAYBE) {
+ goto set_heal;
+ }
+ need_heal = EC_HEAL_NONEED;
+ ret = ec_heal_locked_inspect(frame, ec, loc.inode, &need_heal);
+ if (ret < 0)
+ goto out;
+set_heal:
+ if (need_heal == EC_HEAL_MUST) {
+ ret = ec_set_heal_info(dict_rsp, "heal");
+ } else {
+ ret = ec_set_heal_info(dict_rsp, "no-heal");
+ }
+out:
+ if (frame) {
+ STACK_DESTROY(frame->root);
+ }
+ loc_wipe(&loc);
+ return ret;
+}
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
new file mode 100644
index 00000000000..5c1586bc9c5
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -0,0 +1,681 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat-errno.h>
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-heald.h"
+#include "ec-mem-types.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
+#include "protocol-common.h"
+
+#define NTH_INDEX_HEALER(this, n) \
+ (&((((ec_t *)this->private))->shd.index_healers[n]))
+#define NTH_FULL_HEALER(this, n) \
+ (&((((ec_t *)this->private))->shd.full_healers[n]))
+
+gf_boolean_t
+ec_shd_is_subvol_local(xlator_t *this, int subvol)
+{
+ ec_t *ec = NULL;
+ gf_boolean_t is_local = _gf_false;
+ loc_t loc = {
+ 0,
+ };
+
+ ec = this->private;
+ loc.inode = this->itable->root;
+ syncop_is_subvol_local(ec->xl_list[subvol], &loc, &is_local);
+ return is_local;
+}
+
+char *
+ec_subvol_name(xlator_t *this, int subvol)
+{
+ ec_t *ec = NULL;
+
+ ec = this->private;
+ if (subvol < 0 || subvol > ec->nodes)
+ return NULL;
+
+ return ec->xl_list[subvol]->name;
+}
+
+int
+__ec_shd_healer_wait(struct subvol_healer *healer)
+{
+ ec_t *ec = NULL;
+ struct timespec wait_till = {
+ 0,
+ };
+ int ret = 0;
+
+ ec = healer->this->private;
+
+disabled_loop:
+ wait_till.tv_sec = gf_time() + ec->shd.timeout;
+
+ while (!healer->rerun) {
+ ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till);
+ if (ret == ETIMEDOUT)
+ break;
+ }
+
+ if (ec->shutdown) {
+ healer->running = _gf_false;
+ return -1;
+ }
+
+ ret = healer->rerun;
+ healer->rerun = 0;
+
+ if (!ec->shd.enabled || !ec->up)
+ goto disabled_loop;
+
+ return ret;
+}
+
+int
+ec_shd_healer_wait(struct subvol_healer *healer)
+{
+ int ret = 0;
+
+ pthread_mutex_lock(&healer->mutex);
+ {
+ ret = __ec_shd_healer_wait(healer);
+ }
+ pthread_mutex_unlock(&healer->mutex);
+
+ return ret;
+}
+
+int
+ec_shd_index_inode(xlator_t *this, xlator_t *subvol, inode_t **inode)
+{
+ loc_t rootloc = {
+ 0,
+ };
+ int ret = 0;
+ dict_t *xattr = NULL;
+ void *index_gfid = NULL;
+
+ *inode = NULL;
+ rootloc.inode = inode_ref(this->itable->root);
+ gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr(subvol, &rootloc, &xattr, GF_XATTROP_INDEX_GFID, NULL,
+ NULL);
+ if (ret < 0)
+ goto out;
+ if (!xattr) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = dict_get_ptr(xattr, GF_XATTROP_INDEX_GFID, &index_gfid);
+ if (ret)
+ goto out;
+
+ gf_msg_debug(this->name, 0, "index-dir gfid for %s: %s", subvol->name,
+ uuid_utoa(index_gfid));
+
+ ret = syncop_inode_find(this, subvol, index_gfid, inode, NULL, NULL);
+
+out:
+ loc_wipe(&rootloc);
+
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
+}
+
+int
+ec_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name)
+{
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
+
+ loc.parent = inode_ref(inode);
+ loc.name = name;
+
+ ret = syncop_unlink(subvol, &loc, NULL, NULL);
+
+ loc_wipe(&loc);
+ return ret;
+}
+
+static gf_boolean_t
+ec_is_heal_completed(char *status)
+{
+ char *bad_pos = NULL;
+ char *zero_pos = NULL;
+
+ if (!status) {
+ return _gf_false;
+ }
+
+ /*Logic:
+ * Status will be of the form Good: <binary>, Bad: <binary>
+ * If heal completes, if we do strchr for '0' it should be present after
+ * 'Bad:' i.e. strRchr for ':'
+ * */
+
+ zero_pos = strchr(status, '0');
+ bad_pos = strrchr(status, ':');
+ if (!zero_pos || !bad_pos) {
+ /*malformed status*/
+ return _gf_false;
+ }
+
+ if (zero_pos > bad_pos) {
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+int
+ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
+ gf_boolean_t full)
+{
+ dict_t *xdata = NULL;
+ dict_t *dict = NULL;
+ uint32_t count;
+ int32_t ret;
+ char *heal_status = NULL;
+ ec_t *ec = healer->this->private;
+
+ GF_ATOMIC_INC(ec->stats.shd.attempted);
+ ret = syncop_getxattr(healer->this, loc, &dict, EC_XATTR_HEAL, NULL,
+ &xdata);
+ if (ret == 0) {
+ if (dict && (dict_get_str(dict, EC_XATTR_HEAL, &heal_status) == 0)) {
+ if (ec_is_heal_completed(heal_status)) {
+ GF_ATOMIC_INC(ec->stats.shd.completed);
+ }
+ }
+ }
+
+ if (!full && (loc->inode->ia_type == IA_IFDIR)) {
+ /* If we have just healed a directory, it's possible that
+ * other index entries have appeared to be healed. */
+ if ((xdata != NULL) &&
+ (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
+ (count > 0)) {
+ /* Force a rerun of the index healer. */
+ gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
+ count);
+
+ healer->rerun = _gf_true;
+ }
+ }
+
+ if (xdata != NULL) {
+ dict_unref(xdata);
+ }
+
+ if (dict) {
+ dict_unref(dict);
+ }
+
+ return ret;
+}
+
+int
+ec_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ ec_t *ec = NULL;
+ loc_t loc = {0};
+ int ret = 0;
+
+ ec = healer->this->private;
+ if (ec->xl_up_count <= ec->fragments) {
+ return -ENOTCONN;
+ }
+ if (!ec->shd.enabled)
+ return -EBUSY;
+
+ gf_msg_debug(healer->this->name, 0, "got entry: %s", entry->d_name);
+
+ ret = gf_uuid_parse(entry->d_name, loc.gfid);
+ if (ret)
+ return 0;
+
+ /* If this fails with ENOENT/ESTALE index is stale */
+ ret = syncop_gfid_to_path(healer->this->itable, subvol, loc.gfid,
+ (char **)&loc.path);
+ if (ret < 0)
+ goto out;
+
+ ret = syncop_inode_find(healer->this, healer->this, loc.gfid, &loc.inode,
+ NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ ec_shd_selfheal(healer, healer->subvol, &loc, _gf_false);
+out:
+ if (ret == -ENOENT || ret == -ESTALE) {
+ gf_msg(healer->this->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL,
+ "Purging index for gfid %s:", uuid_utoa(loc.gfid));
+ ec_shd_index_purge(subvol, parent->inode, entry->d_name);
+ }
+ loc_wipe(&loc);
+
+ return 0;
+}
+
+int
+ec_shd_index_sweep(struct subvol_healer *healer)
+{
+ loc_t loc = {0};
+ ec_t *ec = NULL;
+ int ret = 0;
+ xlator_t *subvol = NULL;
+ dict_t *xdata = NULL;
+
+ ec = healer->this->private;
+ subvol = ec->xl_list[healer->subvol];
+
+ ret = ec_shd_index_inode(healer->this, subvol, &loc.inode);
+ if (ret < 0) {
+ gf_msg(healer->this->name, GF_LOG_WARNING, errno,
+ EC_MSG_INDEX_DIR_GET_FAIL, "unable to get index-dir on %s",
+ subvol->name);
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata || dict_set_int32(xdata, "get-gfid-type", 1)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ _mask_cancellation();
+ ret = syncop_mt_dir_scan(NULL, subvol, &loc, GF_CLIENT_PID_SELF_HEALD,
+ healer, ec_shd_index_heal, xdata,
+ ec->shd.max_threads, ec->shd.wait_qlength);
+ _unmask_cancellation();
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+
+ return ret;
+}
+
+int
+ec_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ xlator_t *this = healer->this;
+ ec_t *ec = NULL;
+ loc_t loc = {0};
+ int ret = 0;
+
+ ec = this->private;
+
+ if (this->cleanup_starting) {
+ return -ENOTCONN;
+ }
+
+ if (ec->xl_up_count <= ec->fragments) {
+ return -ENOTCONN;
+ }
+ if (!ec->shd.enabled)
+ return -EBUSY;
+
+ if (gf_uuid_is_null(entry->d_stat.ia_gfid)) {
+ /* It's possible that an entry has been removed just after
+ * being seen in a directory but before getting its stat info.
+ * In this case we'll receive a NULL gfid here. Since the file
+ * doesn't exist anymore, we can safely ignore it. */
+ return 0;
+ }
+
+ loc.parent = inode_ref(parent->inode);
+ loc.name = entry->d_name;
+ gf_uuid_copy(loc.gfid, entry->d_stat.ia_gfid);
+
+ /* If this fails with ENOENT/ESTALE index is stale */
+ ret = syncop_gfid_to_path(this->itable, subvol, loc.gfid,
+ (char **)&loc.path);
+ if (ret < 0)
+ goto out;
+
+ ret = syncop_inode_find(this, this, loc.gfid, &loc.inode, NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ ec_shd_selfheal(healer, healer->subvol, &loc, _gf_true);
+
+ ret = 0;
+
+out:
+ loc_wipe(&loc);
+ return ret;
+}
+
+int
+ec_shd_full_sweep(struct subvol_healer *healer, inode_t *inode)
+{
+ ec_t *ec = NULL;
+ loc_t loc = {0};
+ int ret = -1;
+
+ ec = healer->this->private;
+ loc.inode = inode;
+ _mask_cancellation();
+ ret = syncop_ftw(ec->xl_list[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer, ec_shd_full_heal);
+ _unmask_cancellation();
+ return ret;
+}
+
+void *
+ec_shd_index_healer(void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int run = 0;
+
+ healer = data;
+ THIS = this = healer->this;
+ ec_t *ec = this->private;
+
+ for (;;) {
+ run = ec_shd_healer_wait(healer);
+ if (run == -1)
+ break;
+
+ if (ec->xl_up_count > ec->fragments) {
+ gf_msg_debug(this->name, 0, "starting index sweep on subvol %s",
+ ec_subvol_name(this, healer->subvol));
+ ec_shd_index_sweep(healer);
+ }
+ gf_msg_debug(this->name, 0, "finished index sweep on subvol %s",
+ ec_subvol_name(this, healer->subvol));
+ }
+
+ return NULL;
+}
+
+void *
+ec_shd_full_healer(void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ loc_t rootloc = {0};
+
+ int run = 0;
+
+ healer = data;
+ THIS = this = healer->this;
+ ec_t *ec = this->private;
+
+ rootloc.inode = this->itable->root;
+ for (;;) {
+ run = ec_shd_healer_wait(healer);
+ if (run < 0) {
+ break;
+ } else if (run == 0) {
+ continue;
+ }
+
+ if (ec->xl_up_count > ec->fragments) {
+ gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_FULL_SWEEP_START,
+ "starting full sweep on subvol %s",
+ ec_subvol_name(this, healer->subvol));
+
+ ec_shd_selfheal(healer, healer->subvol, &rootloc, _gf_true);
+ ec_shd_full_sweep(healer, this->itable->root);
+ }
+
+ gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_FULL_SWEEP_STOP,
+ "finished full sweep on subvol %s",
+ ec_subvol_name(this, healer->subvol));
+ }
+
+ return NULL;
+}
+
+int
+ec_shd_healer_init(xlator_t *this, struct subvol_healer *healer)
+{
+ int ret = 0;
+
+ ret = pthread_mutex_init(&healer->mutex, NULL);
+ if (ret)
+ goto out;
+
+ ret = pthread_cond_init(&healer->cond, NULL);
+ if (ret)
+ goto out;
+
+ healer->this = this;
+ healer->running = _gf_false;
+ healer->rerun = _gf_false;
+out:
+ return ret;
+}
+
+int
+ec_shd_healer_spawn(xlator_t *this, struct subvol_healer *healer,
+ void *(threadfn)(void *))
+{
+ int ret = 0;
+
+ pthread_mutex_lock(&healer->mutex);
+ {
+ if (healer->running) {
+ pthread_cond_signal(&healer->cond);
+ } else {
+ ret = gf_thread_create(&healer->thread, NULL, threadfn, healer,
+ "ecshd");
+ if (ret)
+ goto unlock;
+ healer->running = 1;
+ }
+
+ healer->rerun = 1;
+ }
+unlock:
+ pthread_mutex_unlock(&healer->mutex);
+
+ return ret;
+}
+
+int
+ec_shd_full_healer_spawn(xlator_t *this, int subvol)
+{
+ if (xlator_is_cleanup_starting(this))
+ return -1;
+
+ return ec_shd_healer_spawn(this, NTH_FULL_HEALER(this, subvol),
+ ec_shd_full_healer);
+}
+
+int
+ec_shd_index_healer_spawn(xlator_t *this, int subvol)
+{
+ if (xlator_is_cleanup_starting(this))
+ return -1;
+
+ return ec_shd_healer_spawn(this, NTH_INDEX_HEALER(this, subvol),
+ ec_shd_index_healer);
+}
+
+void
+ec_shd_index_healer_wake(ec_t *ec)
+{
+ int32_t i;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (((ec->xl_up >> i) & 1) != 0) {
+ ec_shd_index_healer_spawn(ec->xl, i);
+ }
+ }
+}
+
+int
+ec_selfheal_daemon_init(xlator_t *this)
+{
+ ec_t *ec = NULL;
+ ec_self_heald_t *shd = NULL;
+ int ret = -1;
+ int i = 0;
+
+ ec = this->private;
+ shd = &ec->shd;
+
+ shd->index_healers = GF_CALLOC(sizeof(*shd->index_healers), ec->nodes,
+ ec_mt_subvol_healer_t);
+ if (!shd->index_healers)
+ goto out;
+
+ for (i = 0; i < ec->nodes; i++) {
+ shd->index_healers[i].subvol = i;
+ ret = ec_shd_healer_init(this, &shd->index_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->full_healers = GF_CALLOC(sizeof(*shd->full_healers), ec->nodes,
+ ec_mt_subvol_healer_t);
+ if (!shd->full_healers)
+ goto out;
+
+ for (i = 0; i < ec->nodes; i++) {
+ shd->full_healers[i].subvol = i;
+ ret = ec_shd_healer_init(this, &shd->full_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+ec_heal_op(xlator_t *this, dict_t *output, gf_xl_afr_op_t op, int xl_id)
+{
+ char key[64] = {0};
+ int op_ret = 0;
+ ec_t *ec = NULL;
+ int i = 0;
+ GF_UNUSED int ret = 0;
+
+ ec = this->private;
+
+ op_ret = -1;
+ for (i = 0; i < ec->nodes; i++) {
+ snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+
+ if (((ec->xl_up >> i) & 1) == 0) {
+ ret = dict_set_str(output, key, "Brick is not connected");
+ } else if (!ec->up) {
+ ret = dict_set_str(output, key, "Disperse subvolume is not up");
+ } else if (!ec_shd_is_subvol_local(this, i)) {
+ ret = dict_set_str(output, key, "Brick is remote");
+ } else {
+ ret = dict_set_str(output, key, "Started self-heal");
+ if (op == GF_SHD_OP_HEAL_FULL) {
+ ec_shd_full_healer_spawn(this, i);
+ } else if (op == GF_SHD_OP_HEAL_INDEX) {
+ ec_shd_index_healer_spawn(this, i);
+ }
+ op_ret = 0;
+ }
+ }
+ return op_ret;
+}
+
+int
+ec_xl_op(xlator_t *this, dict_t *input, dict_t *output)
+{
+ gf_xl_afr_op_t op = GF_SHD_OP_INVALID;
+ int ret = 0;
+ int xl_id = 0;
+
+ ret = dict_get_int32(input, "xl-op", (int32_t *)&op);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32(input, this->name, &xl_id);
+ if (ret)
+ goto out;
+
+ ret = dict_set_int32(output, this->name, xl_id);
+ if (ret)
+ goto out;
+
+ switch (op) {
+ case GF_SHD_OP_HEAL_FULL:
+ ret = ec_heal_op(this, output, op, xl_id);
+ break;
+
+ case GF_SHD_OP_HEAL_INDEX:
+ ret = ec_heal_op(this, output, op, xl_id);
+ break;
+
+ default:
+ ret = -1;
+ break;
+ }
+out:
+ dict_del(output, this->name);
+ return ret;
+}
+
+void
+ec_destroy_healer_object(xlator_t *this, struct subvol_healer *healer)
+{
+ if (!healer)
+ return;
+
+ pthread_cond_destroy(&healer->cond);
+ pthread_mutex_destroy(&healer->mutex);
+}
+
+void
+ec_selfheal_daemon_fini(xlator_t *this)
+{
+ struct subvol_healer *healer = NULL;
+ ec_self_heald_t *shd = NULL;
+ ec_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+ if (!priv)
+ return;
+
+ shd = &priv->shd;
+ if (!shd->iamshd)
+ return;
+
+ for (i = 0; i < priv->nodes; i++) {
+ healer = &shd->index_healers[i];
+ ec_destroy_healer_object(this, healer);
+
+ healer = &shd->full_healers[i];
+ ec_destroy_healer_object(this, healer);
+ }
+
+ GF_FREE(shd->index_healers);
+ GF_FREE(shd->full_healers);
+}
diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h
new file mode 100644
index 00000000000..6c7da4edc10
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-heald.h
@@ -0,0 +1,30 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_HEALD_H__
+#define __EC_HEALD_H__
+
+#include "ec-types.h" // for ec_t
+#include "glusterfs/dict.h" // for dict_t
+#include "glusterfs/globals.h" // for xlator_t
+
+int
+ec_xl_op(xlator_t *this, dict_t *input, dict_t *output);
+
+int
+ec_selfheal_daemon_init(xlator_t *this);
+
+void
+ec_shd_index_healer_wake(ec_t *ec);
+
+void
+ec_selfheal_daemon_fini(xlator_t *this);
+
+#endif /* __EC_HEALD_H__ */
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
new file mode 100644
index 00000000000..48f54475e01
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-helpers.c
@@ -0,0 +1,867 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <libgen.h>
+
+#include <glusterfs/byte-order.h>
+
+#include "ec.h"
+#include "ec-mem-types.h"
+#include "ec-messages.h"
+#include "ec-fops.h"
+#include "ec-method.h"
+#include "ec-helpers.h"
+
+static const char *ec_fop_list[] = {[-EC_FOP_HEAL] = "HEAL"};
+
+const char *
+ec_bin(char *str, size_t size, uint64_t value, int32_t digits)
+{
+ str += size;
+
+ if (size-- < 1) {
+ goto failed;
+ }
+ *--str = 0;
+
+ while ((value != 0) || (digits > 0)) {
+ if (size-- < 1) {
+ goto failed;
+ }
+ *--str = '0' + (value & 1);
+ digits--;
+ value >>= 1;
+ }
+
+ return str;
+
+failed:
+ return "<buffer too small>";
+}
+
+const char *
+ec_fop_name(int32_t id)
+{
+ if (id >= 0) {
+ return gf_fop_list[id];
+ }
+
+ return ec_fop_list[-id];
+}
+
+void
+ec_trace(const char *event, ec_fop_data_t *fop, const char *fmt, ...)
+{
+ char str1[32], str2[32], str3[32];
+ char *msg;
+ ec_t *ec = fop->xl->private;
+ va_list args;
+ int32_t ret;
+
+ va_start(args, fmt);
+ ret = vasprintf(&msg, fmt, args);
+ va_end(args);
+
+ if (ret < 0) {
+ msg = "<memory allocation error>";
+ }
+
+ gf_msg_trace("ec", 0,
+ "%s(%s) %p(%p) [refs=%d, winds=%d, jobs=%d] "
+ "frame=%p/%p, min/exp=%d/%d, err=%d state=%d "
+ "{%s:%s:%s} %s",
+ event, ec_fop_name(fop->id), fop, fop->parent, fop->refs,
+ fop->winds, fop->jobs, fop->req_frame, fop->frame,
+ fop->minimum, fop->expected, fop->error, fop->state,
+ ec_bin(str1, sizeof(str1), fop->mask, ec->nodes),
+ ec_bin(str2, sizeof(str2), fop->remaining, ec->nodes),
+ ec_bin(str3, sizeof(str3), fop->good, ec->nodes), msg);
+
+ if (ret >= 0) {
+ free(msg);
+ }
+}
+
+int32_t
+ec_bits_consume(uint64_t *n)
+{
+ uint64_t tmp;
+
+ tmp = *n;
+ tmp &= -tmp;
+ *n ^= tmp;
+
+ return gf_bits_index(tmp);
+}
+
+size_t
+ec_iov_copy_to(void *dst, struct iovec *vector, int32_t count, off_t offset,
+ size_t size)
+{
+ int32_t i = 0;
+ size_t total = 0, len = 0;
+
+ while (i < count) {
+ if (offset < vector[i].iov_len) {
+ while ((i < count) && (size > 0)) {
+ len = size;
+ if (len > vector[i].iov_len - offset) {
+ len = vector[i].iov_len - offset;
+ }
+ memcpy(dst, vector[i++].iov_base + offset, len);
+ offset = 0;
+ dst += len;
+ total += len;
+ size -= len;
+ }
+
+ break;
+ }
+
+ offset -= vector[i].iov_len;
+ i++;
+ }
+
+ return total;
+}
+
+int32_t
+ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref, void **ptr)
+{
+ struct iobref *iobref = NULL;
+ struct iobuf *iobuf = NULL;
+ int32_t ret = -ENOMEM;
+
+ iobuf = iobuf_get_page_aligned(xl->ctx->iobuf_pool, size,
+ EC_METHOD_WORD_SIZE);
+ if (iobuf == NULL) {
+ goto out;
+ }
+
+ iobref = *piobref;
+ if (iobref == NULL) {
+ iobref = iobref_new();
+ if (iobref == NULL) {
+ goto out;
+ }
+ }
+
+ ret = iobref_add(iobref, iobuf);
+ if (ret != 0) {
+ if (iobref != *piobref) {
+ iobref_unref(iobref);
+ }
+ iobref = NULL;
+
+ goto out;
+ }
+
+ GF_ASSERT(EC_ALIGN_CHECK(iobuf->ptr, EC_METHOD_WORD_SIZE));
+
+ *ptr = iobuf->ptr;
+
+out:
+ if (iobuf != NULL) {
+ iobuf_unref(iobuf);
+ }
+
+ if (iobref != NULL) {
+ *piobref = iobref;
+ }
+
+ return ret;
+}
+
+int32_t
+ec_dict_set_array(dict_t *dict, char *key, uint64_t value[], int32_t size)
+{
+ int ret = -1;
+ uint64_t *ptr = NULL;
+ int32_t vindex;
+
+ if (value == NULL) {
+ return -EINVAL;
+ }
+
+ ptr = GF_MALLOC(sizeof(uint64_t) * size, gf_common_mt_char);
+ if (ptr == NULL) {
+ return -ENOMEM;
+ }
+ for (vindex = 0; vindex < size; vindex++) {
+ ptr[vindex] = hton64(value[vindex]);
+ }
+ ret = dict_set_bin(dict, key, ptr, sizeof(uint64_t) * size);
+ if (ret)
+ GF_FREE(ptr);
+ return ret;
+}
+
+int32_t
+ec_dict_get_array(dict_t *dict, char *key, uint64_t value[], int32_t size)
+{
+ void *ptr;
+ int32_t len;
+ int32_t vindex;
+ int32_t old_size = 0;
+ int32_t err;
+
+ if (dict == NULL) {
+ return -EINVAL;
+ }
+ err = dict_get_ptr_and_len(dict, key, &ptr, &len);
+ if (err != 0) {
+ return err;
+ }
+
+ if (len > (size * sizeof(uint64_t)) || (len % sizeof(uint64_t))) {
+ return -EINVAL;
+ }
+
+ /* 3.6 version ec would have stored version in 64 bit. In that case treat
+ * metadata versions same as data*/
+ old_size = min(size, len / sizeof(uint64_t));
+ for (vindex = 0; vindex < old_size; vindex++) {
+ value[vindex] = ntoh64(*((uint64_t *)ptr + vindex));
+ }
+
+ if (old_size < size) {
+ for (vindex = old_size; vindex < size; vindex++) {
+ value[vindex] = value[old_size - 1];
+ }
+ }
+
+ return 0;
+}
+
+int32_t
+ec_dict_del_array(dict_t *dict, char *key, uint64_t value[], int32_t size)
+{
+ int ret = 0;
+
+ ret = ec_dict_get_array(dict, key, value, size);
+ if (ret == 0)
+ dict_del(dict, key);
+
+ return ret;
+}
+
+int32_t
+ec_dict_set_number(dict_t *dict, char *key, uint64_t value)
+{
+ int ret = -1;
+ uint64_t *ptr;
+
+ ptr = GF_MALLOC(sizeof(value), gf_common_mt_char);
+ if (ptr == NULL) {
+ return -ENOMEM;
+ }
+
+ *ptr = hton64(value);
+
+ ret = dict_set_bin(dict, key, ptr, sizeof(value));
+ if (ret)
+ GF_FREE(ptr);
+
+ return ret;
+}
+
+int32_t
+ec_dict_del_number(dict_t *dict, char *key, uint64_t *value)
+{
+ void *ptr;
+ int32_t len, err;
+
+ if (dict == NULL) {
+ return -EINVAL;
+ }
+ err = dict_get_ptr_and_len(dict, key, &ptr, &len);
+ if (err != 0) {
+ return err;
+ }
+ if (len != sizeof(uint64_t)) {
+ return -EINVAL;
+ }
+
+ *value = ntoh64(*(uint64_t *)ptr);
+
+ dict_del(dict, key);
+
+ return 0;
+}
+
+int32_t
+ec_dict_set_config(dict_t *dict, char *key, ec_config_t *config)
+{
+ int ret = -1;
+ uint64_t *ptr, data;
+
+ if (config->version > EC_CONFIG_VERSION) {
+ gf_msg("ec", GF_LOG_ERROR, EINVAL, EC_MSG_UNSUPPORTED_VERSION,
+ "Trying to store an unsupported config "
+ "version (%u)",
+ config->version);
+
+ return -EINVAL;
+ }
+
+ ptr = GF_MALLOC(sizeof(uint64_t), gf_common_mt_char);
+ if (ptr == NULL) {
+ return -ENOMEM;
+ }
+
+ data = ((uint64_t)config->version) << 56;
+ data |= ((uint64_t)config->algorithm) << 48;
+ data |= ((uint64_t)config->gf_word_size) << 40;
+ data |= ((uint64_t)config->bricks) << 32;
+ data |= ((uint64_t)config->redundancy) << 24;
+ data |= config->chunk_size;
+
+ *ptr = hton64(data);
+
+ ret = dict_set_bin(dict, key, ptr, sizeof(uint64_t));
+ if (ret)
+ GF_FREE(ptr);
+
+ return ret;
+}
+
+int32_t
+ec_dict_del_config(dict_t *dict, char *key, ec_config_t *config)
+{
+ void *ptr;
+ uint64_t data;
+ int32_t len, err;
+
+ if (dict == NULL) {
+ return -EINVAL;
+ }
+ err = dict_get_ptr_and_len(dict, key, &ptr, &len);
+ if (err != 0) {
+ return err;
+ }
+ if (len != sizeof(uint64_t)) {
+ return -EINVAL;
+ }
+
+ data = ntoh64(*(uint64_t *)ptr);
+ /* Currently we need to get the config xattr for entries of type IA_INVAL.
+ * These entries can later become IA_DIR entries (after inode_link()),
+ * which don't have a config xattr. However, since the xattr is requested
+ * using an xattrop() fop, it will always return a config full of 0's
+ * instead of saying that it doesn't exist.
+ *
+ * We need to filter out this case and consider that a config xattr == 0 is
+ * the same as a non-existent xattr. Otherwise ec_config_check() will fail.
+ */
+ if (data == 0) {
+ return -ENODATA;
+ }
+
+ config->version = (data >> 56) & 0xff;
+ if (config->version > EC_CONFIG_VERSION) {
+ gf_msg("ec", GF_LOG_ERROR, EINVAL, EC_MSG_UNSUPPORTED_VERSION,
+ "Found an unsupported config version (%u)", config->version);
+
+ return -EINVAL;
+ }
+
+ config->algorithm = (data >> 48) & 0xff;
+ config->gf_word_size = (data >> 40) & 0xff;
+ config->bricks = (data >> 32) & 0xff;
+ config->redundancy = (data >> 24) & 0xff;
+ config->chunk_size = data & 0xffffff;
+
+ dict_del(dict, key);
+
+ return 0;
+}
+
+gf_boolean_t
+ec_loc_gfid_check(xlator_t *xl, uuid_t dst, uuid_t src)
+{
+ if (gf_uuid_is_null(src)) {
+ return _gf_true;
+ }
+
+ if (gf_uuid_is_null(dst)) {
+ gf_uuid_copy(dst, src);
+
+ return _gf_true;
+ }
+
+ if (gf_uuid_compare(dst, src) != 0) {
+ gf_msg(xl->name, GF_LOG_WARNING, 0, EC_MSG_GFID_MISMATCH,
+ "Mismatching GFID's in loc");
+
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+int32_t
+ec_loc_setup_inode(xlator_t *xl, inode_table_t *table, loc_t *loc)
+{
+ int32_t ret = -EINVAL;
+
+ if (loc->inode != NULL) {
+ if (!ec_loc_gfid_check(xl, loc->gfid, loc->inode->gfid)) {
+ goto out;
+ }
+ } else if (table != NULL) {
+ if (!gf_uuid_is_null(loc->gfid)) {
+ loc->inode = inode_find(table, loc->gfid);
+ } else if (loc->path && strchr(loc->path, '/')) {
+ loc->inode = inode_resolve(table, (char *)loc->path);
+ }
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int32_t
+ec_loc_setup_parent(xlator_t *xl, inode_table_t *table, loc_t *loc)
+{
+ char *path, *parent;
+ int32_t ret = -EINVAL;
+
+ if (loc->parent != NULL) {
+ if (!ec_loc_gfid_check(xl, loc->pargfid, loc->parent->gfid)) {
+ goto out;
+ }
+ } else if (table != NULL) {
+ if (!gf_uuid_is_null(loc->pargfid)) {
+ loc->parent = inode_find(table, loc->pargfid);
+ } else if (loc->path && strchr(loc->path, '/')) {
+ path = gf_strdup(loc->path);
+ if (path == NULL) {
+ gf_msg(xl->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Unable to duplicate path '%s'", loc->path);
+
+ ret = -ENOMEM;
+
+ goto out;
+ }
+ parent = dirname(path);
+ loc->parent = inode_resolve(table, parent);
+ if (loc->parent != NULL) {
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+ }
+ GF_FREE(path);
+ }
+ }
+
+ /* If 'pargfid' has not been determined, clear 'name' to avoid resolutions
+ based on <gfid:pargfid>/name. */
+ if (gf_uuid_is_null(loc->pargfid)) {
+ loc->name = NULL;
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int32_t
+ec_loc_setup_path(xlator_t *xl, loc_t *loc)
+{
+ static uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+ char *name;
+ int32_t ret = -EINVAL;
+
+ if (loc->path != NULL) {
+ name = strrchr(loc->path, '/');
+ if (name == NULL) {
+ /* Allow gfid paths: <gfid:...> */
+ if (strncmp(loc->path, "<gfid:", 6) == 0) {
+ ret = 0;
+ }
+ goto out;
+ }
+ if (name == loc->path) {
+ if (name[1] == 0) {
+ if (!ec_loc_gfid_check(xl, loc->gfid, root)) {
+ goto out;
+ }
+ } else {
+ if (!ec_loc_gfid_check(xl, loc->pargfid, root)) {
+ goto out;
+ }
+ }
+ }
+ name++;
+
+ if (loc->name != NULL) {
+ if (strcmp(loc->name, name) != 0) {
+ gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_LOC_NAME,
+ "Invalid name '%s' in loc", loc->name);
+
+ goto out;
+ }
+ } else {
+ loc->name = name;
+ }
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int32_t
+ec_loc_parent(xlator_t *xl, loc_t *loc, loc_t *parent)
+{
+ inode_table_t *table = NULL;
+ char *str = NULL;
+ int32_t ret = -ENOMEM;
+
+ memset(parent, 0, sizeof(loc_t));
+
+ if (loc->parent != NULL) {
+ table = loc->parent->table;
+ parent->inode = inode_ref(loc->parent);
+ } else if (loc->inode != NULL) {
+ table = loc->inode->table;
+ }
+ if (!gf_uuid_is_null(loc->pargfid)) {
+ gf_uuid_copy(parent->gfid, loc->pargfid);
+ }
+ if (loc->path && strchr(loc->path, '/')) {
+ str = gf_strdup(loc->path);
+ if (str == NULL) {
+ gf_msg(xl->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Unable to duplicate path '%s'", loc->path);
+
+ goto out;
+ }
+ parent->path = gf_strdup(dirname(str));
+ if (parent->path == NULL) {
+ gf_msg(xl->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Unable to duplicate path '%s'", dirname(str));
+
+ goto out;
+ }
+ }
+
+ ret = ec_loc_setup_path(xl, parent);
+ if (ret == 0) {
+ ret = ec_loc_setup_inode(xl, table, parent);
+ }
+ if (ret == 0) {
+ ret = ec_loc_setup_parent(xl, table, parent);
+ }
+ if (ret != 0) {
+ goto out;
+ }
+
+ if ((parent->inode == NULL) && (parent->path == NULL) &&
+ gf_uuid_is_null(parent->gfid)) {
+ gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_LOC_PARENT_INODE_MISSING,
+ "Parent inode missing for loc_t");
+
+ ret = -EINVAL;
+
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ GF_FREE(str);
+
+ if (ret != 0) {
+ loc_wipe(parent);
+ }
+
+ return ret;
+}
+
+int32_t
+ec_loc_update(xlator_t *xl, loc_t *loc, inode_t *inode, struct iatt *iatt)
+{
+ inode_table_t *table = NULL;
+ int32_t ret = -EINVAL;
+
+ if (inode != NULL) {
+ table = inode->table;
+ if (loc->inode != inode) {
+ if (loc->inode != NULL) {
+ inode_unref(loc->inode);
+ }
+ loc->inode = inode_ref(inode);
+ gf_uuid_copy(loc->gfid, inode->gfid);
+ }
+ } else if (loc->inode != NULL) {
+ table = loc->inode->table;
+ } else if (loc->parent != NULL) {
+ table = loc->parent->table;
+ }
+
+ if (iatt != NULL) {
+ if (!ec_loc_gfid_check(xl, loc->gfid, iatt->ia_gfid)) {
+ goto out;
+ }
+ }
+
+ ret = ec_loc_setup_path(xl, loc);
+ if (ret == 0) {
+ ret = ec_loc_setup_inode(xl, table, loc);
+ }
+ if (ret == 0) {
+ ret = ec_loc_setup_parent(xl, table, loc);
+ }
+ if (ret != 0) {
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+int32_t
+ec_loc_from_fd(xlator_t *xl, loc_t *loc, fd_t *fd)
+{
+ ec_fd_t *ctx;
+ int32_t ret = -ENOMEM;
+
+ memset(loc, 0, sizeof(*loc));
+
+ ctx = ec_fd_get(fd, xl);
+ if (ctx != NULL) {
+ if (loc_copy(loc, &ctx->loc) != 0) {
+ goto out;
+ }
+ }
+
+ ret = ec_loc_update(xl, loc, fd->inode, NULL);
+ if (ret != 0) {
+ goto out;
+ }
+
+out:
+ if (ret != 0) {
+ loc_wipe(loc);
+ }
+
+ return ret;
+}
+
+int32_t
+ec_loc_from_loc(xlator_t *xl, loc_t *dst, loc_t *src)
+{
+ int32_t ret = -ENOMEM;
+
+ memset(dst, 0, sizeof(*dst));
+
+ if (loc_copy(dst, src) != 0) {
+ goto out;
+ }
+
+ ret = ec_loc_update(xl, dst, NULL, NULL);
+ if (ret != 0) {
+ goto out;
+ }
+
+out:
+ if (ret != 0) {
+ loc_wipe(dst);
+ }
+
+ return ret;
+}
+
+void
+ec_owner_set(call_frame_t *frame, void *owner)
+{
+ set_lk_owner_from_ptr(&frame->root->lk_owner, owner);
+}
+
+void
+ec_owner_copy(call_frame_t *frame, gf_lkowner_t *owner)
+{
+ lk_owner_copy(&frame->root->lk_owner, owner);
+}
+
+static void
+ec_stripe_cache_init(ec_t *ec, ec_inode_t *ctx)
+{
+ ec_stripe_list_t *stripe_cache = NULL;
+
+ stripe_cache = &(ctx->stripe_cache);
+ if (stripe_cache->max == 0) {
+ stripe_cache->max = ec->stripe_cache;
+ }
+}
+
+ec_inode_t *
+__ec_inode_get(inode_t *inode, xlator_t *xl)
+{
+ ec_inode_t *ctx = NULL;
+ uint64_t value = 0;
+
+ if ((__inode_ctx_get(inode, xl, &value) != 0) || (value == 0)) {
+ ctx = GF_MALLOC(sizeof(*ctx), ec_mt_ec_inode_t);
+ if (ctx != NULL) {
+ memset(ctx, 0, sizeof(*ctx));
+ INIT_LIST_HEAD(&ctx->heal);
+ INIT_LIST_HEAD(&ctx->stripe_cache.lru);
+ ctx->heal_count = 0;
+ value = (uint64_t)(uintptr_t)ctx;
+ if (__inode_ctx_set(inode, xl, &value) != 0) {
+ GF_FREE(ctx);
+
+ return NULL;
+ }
+ }
+ } else {
+ ctx = (ec_inode_t *)(uintptr_t)value;
+ }
+ if (ctx)
+ ec_stripe_cache_init(xl->private, ctx);
+
+ return ctx;
+}
+
+ec_inode_t *
+ec_inode_get(inode_t *inode, xlator_t *xl)
+{
+ ec_inode_t *ctx = NULL;
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get(inode, xl);
+
+ UNLOCK(&inode->lock);
+
+ return ctx;
+}
+
+ec_fd_t *
+__ec_fd_get(fd_t *fd, xlator_t *xl)
+{
+ int i = 0;
+ ec_fd_t *ctx = NULL;
+ ec_inode_t *ictx = NULL;
+ uint64_t value = 0;
+ ec_t *ec = xl->private;
+
+ if ((__fd_ctx_get(fd, xl, &value) != 0) || (value == 0)) {
+ ctx = GF_MALLOC(sizeof(*ctx) + (sizeof(ec_fd_status_t) * ec->nodes),
+ ec_mt_ec_fd_t);
+ if (ctx != NULL) {
+ memset(ctx, 0, sizeof(*ctx));
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (fd_is_anonymous(fd)) {
+ ctx->fd_status[i] = EC_FD_OPENED;
+ } else {
+ ctx->fd_status[i] = EC_FD_NOT_OPENED;
+ }
+ }
+
+ value = (uint64_t)(uintptr_t)ctx;
+ if (__fd_ctx_set(fd, xl, value) != 0) {
+ GF_FREE(ctx);
+ return NULL;
+ }
+ /* Only refering bad-version so no need for lock
+ * */
+ ictx = __ec_inode_get(fd->inode, xl);
+ if (ictx) {
+ ctx->bad_version = ictx->bad_version;
+ }
+ }
+ } else {
+ ctx = (ec_fd_t *)(uintptr_t)value;
+ }
+
+ /* Treat anonymous fd specially */
+ if (fd->anonymous && ctx) {
+ /* Mark the fd open for all subvolumes. */
+ ctx->open = -1;
+ /* Try to populate ctx->loc with fd->inode information. */
+ ec_loc_update(xl, &ctx->loc, fd->inode, NULL);
+ }
+
+ return ctx;
+}
+
+ec_fd_t *
+ec_fd_get(fd_t *fd, xlator_t *xl)
+{
+ ec_fd_t *ctx = NULL;
+
+ LOCK(&fd->lock);
+
+ ctx = __ec_fd_get(fd, xl);
+
+ UNLOCK(&fd->lock);
+
+ return ctx;
+}
+
+gf_boolean_t
+ec_is_internal_xattr(dict_t *dict, char *key, data_t *value, void *data)
+{
+ if (key && (strncmp(key, EC_XATTR_PREFIX, SLEN(EC_XATTR_PREFIX)) == 0))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+void
+ec_filter_internal_xattrs(dict_t *xattr)
+{
+ dict_foreach_match(xattr, ec_is_internal_xattr, NULL,
+ dict_remove_foreach_fn, NULL);
+}
+
+gf_boolean_t
+ec_is_data_fop(glusterfs_fop_t fop)
+{
+ switch (fop) {
+ case GF_FOP_WRITE:
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_FTRUNCATE:
+ case GF_FOP_FALLOCATE:
+ case GF_FOP_DISCARD:
+ case GF_FOP_ZEROFILL:
+ return _gf_true;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}
+/*
+gf_boolean_t
+ec_is_metadata_fop (int32_t lock_kind, glusterfs_fop_t fop)
+{
+ if (lock_kind == EC_LOCK_ENTRY) {
+ return _gf_false;
+ }
+
+ switch (fop) {
+ case GF_FOP_SETATTR:
+ case GF_FOP_FSETATTR:
+ case GF_FOP_SETXATTR:
+ case GF_FOP_FSETXATTR:
+ case GF_FOP_REMOVEXATTR:
+ case GF_FOP_FREMOVEXATTR:
+ return _gf_true;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}*/
diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h
new file mode 100644
index 00000000000..015db514e05
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-helpers.h
@@ -0,0 +1,200 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_HELPERS_H__
+#define __EC_HELPERS_H__
+
+#include "ec-types.h"
+
+#define EC_ERR(_x) ((void *)-(intptr_t)(_x))
+#define EC_IS_ERR(_x) (((uintptr_t)(_x) & ~0xfffULL) == ~0xfffULL)
+#define EC_GET_ERR(_x) ((int32_t)(intptr_t)(_x))
+
+#define EC_ALIGN_CHECK(_ptr, _align) ((((uintptr_t)(_ptr)) & ((_align)-1)) == 0)
+
+const char *
+ec_bin(char *str, size_t size, uint64_t value, int32_t digits);
+const char *
+ec_fop_name(int32_t id);
+void
+ec_trace(const char *event, ec_fop_data_t *fop, const char *fmt, ...);
+int32_t
+ec_bits_consume(uint64_t *n);
+size_t
+ec_iov_copy_to(void *dst, struct iovec *vector, int32_t count, off_t offset,
+ size_t size);
+int32_t
+ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref, void **ptr);
+int32_t
+ec_dict_set_array(dict_t *dict, char *key, uint64_t *value, int32_t size);
+int32_t
+ec_dict_get_array(dict_t *dict, char *key, uint64_t value[], int32_t size);
+
+int32_t
+ec_dict_del_array(dict_t *dict, char *key, uint64_t *value, int32_t size);
+int32_t
+ec_dict_set_number(dict_t *dict, char *key, uint64_t value);
+int32_t
+ec_dict_del_number(dict_t *dict, char *key, uint64_t *value);
+int32_t
+ec_dict_set_config(dict_t *dict, char *key, ec_config_t *config);
+int32_t
+ec_dict_del_config(dict_t *dict, char *key, ec_config_t *config);
+
+int32_t
+ec_loc_parent(xlator_t *xl, loc_t *loc, loc_t *parent);
+int32_t
+ec_loc_update(xlator_t *xl, loc_t *loc, inode_t *inode, struct iatt *iatt);
+
+int32_t
+ec_loc_from_fd(xlator_t *xl, loc_t *loc, fd_t *fd);
+int32_t
+ec_loc_from_loc(xlator_t *xl, loc_t *dst, loc_t *src);
+
+void
+ec_owner_set(call_frame_t *frame, void *owner);
+void
+ec_owner_copy(call_frame_t *frame, gf_lkowner_t *owner);
+
+ec_inode_t *
+__ec_inode_get(inode_t *inode, xlator_t *xl);
+ec_inode_t *
+ec_inode_get(inode_t *inode, xlator_t *xl);
+ec_fd_t *
+__ec_fd_get(fd_t *fd, xlator_t *xl);
+ec_fd_t *
+ec_fd_get(fd_t *fd, xlator_t *xl);
+
+static inline uint32_t
+ec_adjust_size_down(ec_t *ec, uint64_t *value, gf_boolean_t scale)
+{
+ uint64_t head, tmp;
+
+ tmp = *value;
+ head = tmp % ec->stripe_size;
+ tmp -= head;
+
+ if (scale) {
+ tmp /= ec->fragments;
+ }
+
+ *value = tmp;
+
+ return (uint32_t)head;
+}
+
+/* This function can cause an overflow if the passed value is too near to the
+ * uint64_t limit. If this happens, it returns the tail in negative form and
+ * the value is set to UINT64_MAX. */
+static inline int32_t
+ec_adjust_size_up(ec_t *ec, uint64_t *value, gf_boolean_t scale)
+{
+ uint64_t tmp;
+ int32_t tail;
+
+ tmp = *value;
+ /* We first adjust the value down. This never causes overflow. */
+ tail = ec_adjust_size_down(ec, &tmp, scale);
+
+ /* If the value was already aligned, tail will be 0 and nothing else
+ * needs to be done. */
+ if (tail != 0) {
+ /* Otherwise, we need to compute the real tail and adjust the
+ * returned value to the next stripe. */
+ tail = ec->stripe_size - tail;
+ if (scale) {
+ tmp += ec->fragment_size;
+ } else {
+ tmp += ec->stripe_size;
+ /* If no scaling is requested there's a possibility of
+ * overflow. */
+ if (tmp < ec->stripe_size) {
+ tmp = UINT64_MAX;
+ tail = -tail;
+ }
+ }
+ }
+
+ *value = tmp;
+
+ return tail;
+}
+
+/* This function is equivalent to ec_adjust_size_down() but with a potentially
+ * different parameter size (off_t vs uint64_t). */
+static inline uint32_t
+ec_adjust_offset_down(ec_t *ec, off_t *value, gf_boolean_t scale)
+{
+ off_t head, tmp;
+
+ tmp = *value;
+ head = tmp % ec->stripe_size;
+ tmp -= head;
+
+ if (scale) {
+ tmp /= ec->fragments;
+ }
+
+ *value = tmp;
+
+ return (uint32_t)head;
+}
+
+/* This function is equivalent to ec_adjust_size_up() but with a potentially
+ * different parameter size (off_t vs uint64_t). */
+static inline int32_t
+ec_adjust_offset_up(ec_t *ec, off_t *value, gf_boolean_t scale)
+{
+ uint64_t tail, tmp;
+
+ /* An offset is a signed type that can only have positive values, so
+ * we take advantage of this to avoid overflows. We simply convert it
+ * to an unsigned integer and operate normally. This won't cause an
+ * overflow. Overflow is only checked when converting back to an
+ * off_t. */
+ tmp = *value;
+ tail = ec->stripe_size;
+ tail -= (tmp + tail - 1) % tail + 1;
+ tmp += tail;
+ if (scale) {
+ /* If we are scaling, we'll never get an overflow. */
+ tmp /= ec->fragments;
+ } else {
+ /* Check if there has been an overflow. */
+ if ((off_t)tmp < 0) {
+ tmp = GF_OFF_MAX;
+ tail = -tail;
+ }
+ }
+
+ *value = (off_t)tmp;
+
+ return (int32_t)tail;
+}
+
+static inline int32_t
+ec_is_power_of_2(uint32_t value)
+{
+ return (value != 0) && ((value & (value - 1)) == 0);
+}
+
+gf_boolean_t
+ec_is_internal_xattr(dict_t *dict, char *key, data_t *value, void *data);
+
+void
+ec_filter_internal_xattrs(dict_t *xattr);
+
+gf_boolean_t
+ec_is_data_fop(glusterfs_fop_t fop);
+
+int32_t
+ec_launch_replace_heal(ec_t *ec);
+
+#endif /* __EC_HELPERS_H__ */
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
new file mode 100644
index 00000000000..dad5f4d7018
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -0,0 +1,2046 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-method.h"
+#include "ec-fops.h"
+
+/* FOP: access */
+
+int32_t
+ec_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_ACCESS, idx, op_ret,
+ op_errno);
+ if (cbk) {
+ if (xdata)
+ cbk->xdata = dict_ref(xdata);
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_access(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_access_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->access,
+ &fop->loc[0], fop->int32, fop->xdata);
+}
+
+int32_t
+ec_manager_access(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk = NULL;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_one(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ if (ec_dispatch_one_retry(fop, NULL)) {
+ return EC_STATE_DISPATCH;
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+ GF_ASSERT(cbk);
+ if (fop->cbks.access != NULL) {
+ if (cbk) {
+ fop->cbks.access(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->xdata);
+ }
+ }
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ if (fop->cbks.access != NULL) {
+ fop->cbks.access(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL);
+ }
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_access_cbk_t func, void *data, loc_t *loc,
+ int32_t mask, dict_t *xdata)
+{
+ ec_cbk_t callback = {.access = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(ACCESS) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_ACCESS, EC_FLAG_LOCK_SHARED,
+ target, fop_flags, ec_wind_access,
+ ec_manager_access, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->int32 = mask;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/* FOP: getxattr */
+
+int32_t
+ec_combine_getxattr(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ if (!ec_dict_compare(dst->dict, src->dict)) {
+ gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_DICT_MISMATCH,
+ "Mismatching dictionary in "
+ "answers of 'GF_FOP_GETXATTR'");
+
+ return 0;
+ }
+
+ return 1;
+}
+
+int32_t
+ec_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_GETXATTR, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ if (dict != NULL) {
+ cbk->dict = dict_ref(dict);
+ if (cbk->dict == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, ec_combine_getxattr);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_getxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_getxattr_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->getxattr,
+ &fop->loc[0], fop->str[0], fop->xdata);
+}
+
+void
+ec_handle_special_xattrs(ec_fop_data_t *fop)
+{
+ ec_cbk_data_t *cbk = NULL;
+ /* Stime may not be available on all the bricks, so even if some of the
+ * subvols succeed the operation, treat it as answer.*/
+ if (fop->str[0] && fnmatch(GF_XATTR_STIME_PATTERN, fop->str[0], 0) == 0) {
+ if (!fop->answer || (fop->answer->op_ret < 0)) {
+ list_for_each_entry(cbk, &fop->cbk_list, list)
+ {
+ if (cbk->op_ret >= 0) {
+ fop->answer = cbk;
+ break;
+ }
+ }
+ }
+ }
+}
+
+int32_t
+ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ /* clear-locks commands must be done without any locks acquired
+ to avoid interferences. */
+ if ((fop->str[0] == NULL) ||
+ (strncmp(fop->str[0], GF_XATTR_CLRLK_CMD,
+ SLEN(GF_XATTR_CLRLK_CMD)) != 0)) {
+ if (fop->fd == NULL) {
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
+ }
+ ec_lock(fop);
+ }
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ if (fop->minimum == EC_MINIMUM_ALL) {
+ ec_dispatch_all(fop);
+ } else {
+ ec_dispatch_one(fop);
+ }
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ ec_handle_special_xattrs(fop);
+ if (fop->minimum == EC_MINIMUM_ALL) {
+ cbk = ec_fop_prepare_answer(fop, _gf_true);
+ } else {
+ if (ec_dispatch_one_retry(fop, &cbk)) {
+ return EC_STATE_DISPATCH;
+ }
+ }
+ if (cbk != NULL) {
+ int32_t err;
+
+ err = ec_dict_combine(cbk, EC_COMBINE_DICT);
+ if (!ec_cbk_set_error(cbk, -err, _gf_true)) {
+ if (cbk->xdata != NULL)
+ ec_filter_internal_xattrs(cbk->xdata);
+
+ if (cbk->dict != NULL)
+ ec_filter_internal_xattrs(cbk->dict);
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.getxattr != NULL) {
+ fop->cbks.getxattr(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->dict, cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.getxattr != NULL) {
+ fop->cbks.getxattr(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+int32_t
+ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask,
+ uintptr_t good, uintptr_t bad, uint32_t pending,
+ dict_t *xdata)
+{
+ fop_getxattr_cbk_t func = cookie;
+ ec_t *ec = xl->private;
+ dict_t *dict = NULL;
+ char *str;
+ char bin1[65], bin2[65];
+
+ /* We try to return the 'pending' information in xdata, but if this cannot
+ * be set, we will ignore it silently. We prefer to report the success or
+ * failure of the heal itself. */
+ if (xdata == NULL) {
+ xdata = dict_new();
+ } else {
+ dict_ref(xdata);
+ }
+ if (xdata != NULL) {
+ if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
+ /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
+ * enforces to check the result in this case. However we don't
+ * really care if it succeeded or not. We'll just do the same.
+ *
+ * This empty 'if' avoids the warning, and it will be removed by
+ * the optimizer. */
+ }
+ }
+
+ if (op_ret >= 0) {
+ dict = dict_new();
+ if (dict == NULL) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ } else {
+ if (gf_asprintf(&str, "Good: %s, Bad: %s",
+ ec_bin(bin1, sizeof(bin1), good, ec->nodes),
+ ec_bin(bin2, sizeof(bin2), mask & ~(good | bad),
+ ec->nodes)) < 0) {
+ dict_unref(dict);
+ dict = NULL;
+
+ op_ret = -1;
+ op_errno = ENOMEM;
+
+ goto out;
+ }
+
+ if (dict_set_dynstr(dict, EC_XATTR_HEAL, str) != 0) {
+ GF_FREE(str);
+ dict_unref(dict);
+ dict = NULL;
+
+ op_ret = -1;
+ op_errno = ENOMEM;
+
+ goto out;
+ }
+ }
+ }
+
+out:
+ func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
+
+ if (dict != NULL) {
+ dict_unref(dict);
+ }
+ if (xdata != NULL) {
+ dict_unref(xdata);
+ }
+
+ return 0;
+}
+
+void
+ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_getxattr_cbk_t func, void *data, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ ec_cbk_t callback = {.getxattr = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(GETXATTR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ /* Special handling of an explicit self-heal request */
+ if ((name != NULL) && (strcmp(name, EC_XATTR_HEAL) == 0)) {
+ ec_heal(frame, this, target, EC_MINIMUM_ONE, ec_getxattr_heal_cbk, func,
+ loc, 0, NULL);
+
+ return;
+ }
+
+ fop = ec_fop_data_allocate(
+ frame, this, GF_FOP_GETXATTR, EC_FLAG_LOCK_SHARED, target, fop_flags,
+ ec_wind_getxattr, ec_manager_getxattr, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (name != NULL) {
+ /* In case of list-node-uuids xattr, set flag to indicate
+ * the same and use node-uuid xattr for winding fop */
+ if (XATTR_IS_NODE_UUID_LIST(name)) {
+ fop->int32 = 1;
+ fop->str[0] = gf_strdup(GF_XATTR_NODE_UUID_KEY);
+ } else {
+ fop->str[0] = gf_strdup(name);
+ }
+ if (fop->str[0] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
+
+/* FOP: fgetxattr */
+
+int32_t
+ec_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FGETXATTR, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ if (dict != NULL) {
+ cbk->dict = dict_ref(dict);
+ if (cbk->dict == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, ec_combine_getxattr);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_fgetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_fgetxattr_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->fgetxattr,
+ fop->fd, fop->str[0], fop->xdata);
+}
+
+void
+ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fgetxattr_cbk_t func, void *data, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ ec_cbk_t callback = {.fgetxattr = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FGETXATTR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(
+ frame, this, GF_FOP_FGETXATTR, EC_FLAG_LOCK_SHARED, target, fop_flags,
+ ec_wind_fgetxattr, ec_manager_getxattr, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (name != NULL) {
+ fop->str[0] = gf_strdup(name);
+ if (fop->str[0] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
+
+/* FOP: open */
+
+int32_t
+ec_combine_open(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ if (dst->fd != src->fd) {
+ gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_FD_MISMATCH,
+ "Mismatching fd in answers "
+ "of 'GF_FOP_OPEN': %p <-> %p",
+ dst->fd, src->fd);
+
+ return 0;
+ }
+
+ return 1;
+}
+
+int32_t
+ec_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_OPEN, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ if (fd != NULL) {
+ cbk->fd = fd_ref(fd);
+ if (cbk->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, ec_combine_open);
+
+ ec_update_fd_status(fd, this, idx, op_ret);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_open(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_open_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->open,
+ &fop->loc[0], fop->int32, fop->fd, fop->xdata);
+}
+
+int32_t
+ec_open_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ int32_t error = 0;
+
+ fop = fop->data;
+ if (op_ret >= 0) {
+ fop->answer->iatt[0] = *postbuf;
+ } else {
+ error = op_errno;
+ }
+
+ ec_resume(fop, error);
+
+ return 0;
+}
+
+int32_t
+ec_manager_open(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+ ec_fd_t *ctx;
+ int32_t err;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ LOCK(&fop->fd->lock);
+
+ ctx = __ec_fd_get(fop->fd, fop->xl);
+ if (ctx == NULL) {
+ UNLOCK(&fop->fd->lock);
+
+ fop->error = ENOMEM;
+
+ return EC_STATE_REPORT;
+ }
+ if (!ctx->loc.inode) {
+ err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]);
+ if (err != 0) {
+ UNLOCK(&fop->fd->lock);
+
+ fop->error = -err;
+
+ return EC_STATE_REPORT;
+ }
+ }
+
+ ctx->flags = fop->int32;
+
+ UNLOCK(&fop->fd->lock);
+
+ /* We need to write to specific offsets on the bricks, so we
+ need to remove O_APPEND from flags (if present).
+ If O_TRUNC is specified, we remove it from open and an
+ ftruncate will be executed later, which will correctly update
+ the file size taking appropriate locks. O_TRUNC flag is saved
+ into fop->uint32 to use it later.*/
+ fop->uint32 = fop->int32 & O_TRUNC;
+ fop->int32 &= ~(O_APPEND | O_TRUNC);
+
+ /* Fall through */
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_true);
+ if (cbk != NULL) {
+ int32_t err;
+
+ err = ec_loc_update(fop->xl, &fop->loc[0], cbk->fd->inode,
+ NULL);
+ if (!ec_cbk_set_error(cbk, -err, _gf_true)) {
+ LOCK(&fop->fd->lock);
+
+ ctx = __ec_fd_get(fop->fd, fop->xl);
+ if (ctx != NULL) {
+ ctx->open |= cbk->mask;
+ }
+
+ UNLOCK(&fop->fd->lock);
+
+ /* If O_TRUNC was specified, call ftruncate to
+ effectively trunc the file with appropriate locks
+ acquired. We don't use ctx->flags because self-heal
+ can use the same fd with different flags. */
+ if (fop->uint32 != 0) {
+ ec_sleep(fop);
+ ec_ftruncate(fop->req_frame, fop->xl, cbk->mask,
+ fop->minimum, ec_open_truncate_cbk, fop,
+ cbk->fd, 0, NULL);
+ }
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.open != NULL) {
+ fop->cbks.open(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->fd, cbk->xdata);
+ }
+
+ return EC_STATE_END;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.open != NULL) {
+ fop->cbks.open(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL);
+ }
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_open_cbk_t func, void *data, loc_t *loc,
+ int32_t flags, fd_t *fd, dict_t *xdata)
+{
+ ec_cbk_t callback = {.open = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(OPEN) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_OPEN, EC_FLAG_LOCK_SHARED,
+ target, fop_flags, ec_wind_open, ec_manager_open,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->int32 = flags;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
+
+/* FOP: readlink */
+
+int32_t
+ec_combine_readlink(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 1)) {
+ gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+ "Mismatching iatt in "
+ "answers of 'GF_FOP_READLINK'");
+
+ return 0;
+ }
+
+ return 1;
+}
+
+int32_t
+ec_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, const char *path,
+ struct iatt *buf, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret,
+ op_errno);
+ if (cbk) {
+ if (xdata)
+ cbk->xdata = dict_ref(xdata);
+
+ if (cbk->op_ret >= 0) {
+ cbk->iatt[0] = *buf;
+ cbk->str = gf_strdup(path);
+ if (!cbk->str) {
+ ec_cbk_set_error(cbk, ENOMEM, _gf_true);
+ }
+ }
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL)
+ ec_complete(fop);
+
+ return 0;
+}
+
+void
+ec_wind_readlink(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_readlink_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->readlink,
+ &fop->loc[0], fop->size, fop->xdata);
+}
+
+int32_t
+ec_manager_readlink(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk = NULL;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
+ ec_lock(fop);
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_one(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ if (ec_dispatch_one_retry(fop, &cbk)) {
+ return EC_STATE_DISPATCH;
+ }
+
+ if ((cbk != NULL) && (cbk->op_ret >= 0)) {
+ ec_iatt_rebuild(fop->xl->private, &cbk->iatt[0], 1, 1);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+ GF_ASSERT(cbk);
+ if (fop->cbks.readlink != NULL) {
+ fop->cbks.readlink(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->str, &cbk->iatt[0],
+ cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ if (fop->cbks.readlink != NULL) {
+ fop->cbks.readlink(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL);
+ }
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_readlink_cbk_t func, void *data, loc_t *loc,
+ size_t size, dict_t *xdata)
+{
+ ec_cbk_t callback = {.readlink = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(READLINK) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(
+ frame, this, GF_FOP_READLINK, EC_FLAG_LOCK_SHARED, target, fop_flags,
+ ec_wind_readlink, ec_manager_readlink, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->size = size;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: readv */
+
+int32_t
+ec_readv_rebuild(ec_t *ec, ec_fop_data_t *fop, ec_cbk_data_t *cbk)
+{
+ struct iovec vector[1];
+ ec_cbk_data_t *ans = NULL;
+ struct iobref *iobref = NULL;
+ void *ptr;
+ uint64_t fsize = 0, size = 0, max = 0;
+ int32_t pos, err = -ENOMEM;
+
+ if (cbk->op_ret < 0) {
+ err = -cbk->op_errno;
+
+ goto out;
+ }
+
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &cbk->iatt[0].ia_size));
+
+ if (cbk->op_ret > 0) {
+ void *blocks[cbk->count];
+ uint32_t values[cbk->count];
+
+ fsize = cbk->op_ret;
+ size = fsize * ec->fragments;
+ for (ans = cbk; ans != NULL; ans = ans->next) {
+ pos = gf_bits_count(cbk->mask & ((1 << ans->idx) - 1));
+ values[pos] = ans->idx + 1;
+ blocks[pos] = ans->vector[0].iov_base;
+ if ((ans->int32 != 1) ||
+ !EC_ALIGN_CHECK(blocks[pos], EC_METHOD_WORD_SIZE)) {
+ if (iobref == NULL) {
+ err = ec_buffer_alloc(ec->xl, size, &iobref, &ptr);
+ if (err != 0) {
+ goto out;
+ }
+ }
+ ec_iov_copy_to(ptr, ans->vector, ans->int32, 0, fsize);
+ blocks[pos] = ptr;
+ ptr += fsize;
+ }
+ }
+
+ err = ec_buffer_alloc(ec->xl, size, &iobref, &ptr);
+ if (err != 0) {
+ goto out;
+ }
+
+ err = ec_method_decode(&ec->matrix, fsize, cbk->mask, values, blocks,
+ ptr);
+ if (err != 0) {
+ goto out;
+ }
+
+ vector[0].iov_base = ptr + fop->head;
+ vector[0].iov_len = size - fop->head;
+
+ max = fop->offset * ec->fragments + size;
+ if (max > cbk->iatt[0].ia_size) {
+ max = cbk->iatt[0].ia_size;
+ }
+ max -= fop->offset * ec->fragments + fop->head;
+ if (max > fop->user_size) {
+ max = fop->user_size;
+ }
+ size -= fop->head;
+ if (size > max) {
+ vector[0].iov_len -= size - max;
+ size = max;
+ }
+
+ cbk->op_ret = size;
+ cbk->int32 = 1;
+
+ iobref_unref(cbk->buffers);
+ cbk->buffers = iobref;
+
+ GF_FREE(cbk->vector);
+ cbk->vector = iov_dup(vector, 1);
+ if (cbk->vector == NULL) {
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+
+out:
+ if (iobref != NULL) {
+ iobref_unref(iobref);
+ }
+
+ return err;
+}
+
+int32_t
+ec_combine_readv(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ if (!ec_vector_compare(dst->vector, dst->int32, src->vector, src->int32)) {
+ gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_VECTOR_MISMATCH,
+ "Mismatching vector in "
+ "answers of 'GF_FOP_READ'");
+
+ return 0;
+ }
+
+ if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 1)) {
+ gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+ "Mismatching iatt in "
+ "answers of 'GF_FOP_READ'");
+
+ return 0;
+ }
+
+ return 1;
+}
+
+int32_t
+ec_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iovec *vector, int32_t count,
+ struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ ec_t *ec = this->private;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_READ, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ cbk->int32 = count;
+
+ if (count > 0) {
+ cbk->vector = iov_dup(vector, count);
+ if (cbk->vector == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a "
+ "vector list.");
+
+ goto out;
+ }
+ cbk->int32 = count;
+ }
+ if (stbuf != NULL) {
+ cbk->iatt[0] = *stbuf;
+ }
+ if (iobref != NULL) {
+ cbk->buffers = iobref_ref(iobref);
+ if (cbk->buffers == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_BUF_REF_FAIL,
+ "Failed to reference a "
+ "buffer.");
+
+ goto out;
+ }
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ if ((op_ret > 0) && ((op_ret % ec->fragment_size) != 0)) {
+ ec_cbk_set_error(cbk, EIO, _gf_true);
+ }
+
+ ec_combine(cbk, ec_combine_readv);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_readv(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_readv_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->readv, fop->fd,
+ fop->size, fop->offset, fop->uint32, fop->xdata);
+}
+
+int32_t
+ec_manager_readv(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+ ec_t *ec = fop->xl->private;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ fop->user_size = fop->size;
+ fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
+ _gf_true);
+ fop->size += fop->head;
+ ec_adjust_size_up(fop->xl->private, &fop->size, _gf_true);
+
+ /* Fall through */
+
+ case EC_STATE_LOCK:
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset,
+ fop->size);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ if (ec->read_mask) {
+ fop->mask &= ec->read_mask;
+ }
+ ec_dispatch_min(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_true);
+ if (cbk != NULL) {
+ int32_t err;
+
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 1, cbk->count);
+
+ err = ec_readv_rebuild(fop->xl->private, fop, cbk);
+ if (err != 0) {
+ ec_cbk_set_error(cbk, -err, _gf_true);
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.readv != NULL) {
+ fop->cbks.readv(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->vector, cbk->int32,
+ &cbk->iatt[0], cbk->buffers, cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.readv != NULL) {
+ fop->cbks.readv(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, 0, NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_readv_cbk_t func, void *data, fd_t *fd,
+ size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+ ec_cbk_t callback = {.readv = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(READ) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_READ, EC_FLAG_LOCK_SHARED,
+ target, fop_flags, ec_wind_readv,
+ ec_manager_readv, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->size = size;
+ fop->offset = offset;
+ fop->uint32 = flags;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, 0, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: seek */
+
+int32_t
+ec_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, off_t offset, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ ec_t *ec = this->private;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_SEEK, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ cbk->offset = offset;
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ }
+
+ if ((op_ret > 0) && ((cbk->offset % ec->fragment_size) != 0)) {
+ cbk->op_ret = -1;
+ cbk->op_errno = EIO;
+ }
+
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_seek(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_seek_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->seek, fop->fd,
+ fop->offset, fop->seek, fop->xdata);
+}
+
+int32_t
+ec_manager_seek(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+ uint64_t size;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ fop->user_size = fop->offset;
+ fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
+ _gf_true);
+
+ /* Fall through */
+
+ case EC_STATE_LOCK:
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset,
+ EC_RANGE_FULL);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(
+ ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, &size));
+
+ if (fop->user_size >= size) {
+ ec_fop_set_error(fop, ENXIO);
+
+ return EC_STATE_REPORT;
+ }
+
+ ec_dispatch_one(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ if (ec_dispatch_one_retry(fop, &cbk)) {
+ return EC_STATE_DISPATCH;
+ }
+ if ((cbk != NULL) && (cbk->op_ret >= 0)) {
+ ec_t *ec = fop->xl->private;
+
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
+ &size));
+
+ cbk->offset *= ec->fragments;
+ if (cbk->offset < fop->user_size) {
+ cbk->offset = fop->user_size;
+ }
+ if (cbk->offset > size) {
+ cbk->offset = size;
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.seek != NULL) {
+ fop->cbks.seek(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->offset, cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.seek != NULL) {
+ fop->cbks.seek(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
+ NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, 0, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_seek_cbk_t func, void *data, fd_t *fd,
+ off_t offset, gf_seek_what_t what, dict_t *xdata)
+{
+ ec_cbk_t callback = {.seek = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = EIO;
+
+ gf_msg_trace("ec", 0, "EC(SEEK) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_SEEK, EC_FLAG_LOCK_SHARED,
+ target, fop_flags, ec_wind_seek, ec_manager_seek,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->offset = offset;
+ fop->seek = what;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, EIO, 0, NULL);
+ }
+}
+
+/* FOP: stat */
+
+int32_t
+ec_combine_stat(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 1)) {
+ gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+ "Mismatching iatt in "
+ "answers of 'GF_FOP_STAT'");
+
+ return 0;
+ }
+
+ return 1;
+}
+
+int32_t
+ec_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_STAT, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ if (buf != NULL) {
+ cbk->iatt[0] = *buf;
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, ec_combine_stat);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_stat(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_stat_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->stat,
+ &fop->loc[0], fop->xdata);
+}
+
+int32_t
+ec_manager_stat(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ if (fop->fd == NULL) {
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
+ }
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_true);
+
+ if (cbk != NULL) {
+ if (cbk->iatt[0].ia_type == IA_IFREG) {
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 1, cbk->count);
+
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop,
+ fop->locks[0].lock->loc.inode,
+ &cbk->iatt[0].ia_size));
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->id == GF_FOP_STAT) {
+ if (fop->cbks.stat != NULL) {
+ fop->cbks.stat(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, &cbk->iatt[0], cbk->xdata);
+ }
+ } else {
+ if (fop->cbks.fstat != NULL) {
+ fop->cbks.fstat(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, &cbk->iatt[0], cbk->xdata);
+ }
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->id == GF_FOP_STAT) {
+ if (fop->cbks.stat != NULL) {
+ fop->cbks.stat(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL);
+ }
+ } else {
+ if (fop->cbks.fstat != NULL) {
+ fop->cbks.fstat(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL, NULL);
+ }
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_stat_cbk_t func, void *data, loc_t *loc,
+ dict_t *xdata)
+{
+ ec_cbk_t callback = {.stat = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(STAT) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_STAT, EC_FLAG_LOCK_SHARED,
+ target, fop_flags, ec_wind_stat, ec_manager_stat,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
+
+/* FOP: fstat */
+
+int32_t
+ec_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSTAT, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ if (buf != NULL) {
+ cbk->iatt[0] = *buf;
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, ec_combine_stat);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_fstat(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_fstat_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->fstat, fop->fd,
+ fop->xdata);
+}
+
+void
+ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fstat_cbk_t func, void *data, fd_t *fd,
+ dict_t *xdata)
+{
+ ec_cbk_t callback = {.fstat = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FSTAT) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FSTAT, EC_FLAG_LOCK_SHARED,
+ target, fop_flags, ec_wind_fstat,
+ ec_manager_stat, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
new file mode 100644
index 00000000000..9b5fe2a7fdc
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-inode-write.c
@@ -0,0 +1,2369 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-method.h"
+#include "ec-fops.h"
+#include "ec-mem-types.h"
+
+int32_t
+ec_update_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ ec_cbk_data_t *cbk = NULL;
+ ec_fop_data_t *parent = fop->parent;
+ int i = 0;
+
+ ec_trace("UPDATE_WRITEV_CBK", cookie, "ret=%d, errno=%d, parent-fop=%s",
+ op_ret, op_errno, ec_fop_name(parent->id));
+
+ if (op_ret < 0) {
+ ec_fop_set_error(parent, op_errno);
+ goto out;
+ }
+ cbk = ec_cbk_data_allocate(parent->frame, this, parent, parent->id, 0,
+ op_ret, op_errno);
+ if (!cbk) {
+ ec_fop_set_error(parent, ENOMEM);
+ goto out;
+ }
+
+ if (xdata)
+ cbk->xdata = dict_ref(xdata);
+
+ if (prebuf)
+ cbk->iatt[i++] = *prebuf;
+
+ if (postbuf)
+ cbk->iatt[i++] = *postbuf;
+
+ LOCK(&parent->lock);
+ {
+ parent->good &= fop->good;
+
+ if (gf_bits_count(parent->good) < parent->minimum) {
+ __ec_fop_set_error(parent, EIO);
+ } else if (fop->error == 0 && parent->answer == NULL) {
+ parent->answer = cbk;
+ }
+ }
+ UNLOCK(&parent->lock);
+out:
+ return 0;
+}
+
+static int32_t
+ec_update_write(ec_fop_data_t *fop, uintptr_t mask, off_t offset, uint64_t size)
+{
+ struct iobref *iobref = NULL;
+ struct iobuf *iobuf = NULL;
+ struct iovec vector;
+ int32_t err = -ENOMEM;
+
+ iobref = iobref_new();
+ if (iobref == NULL) {
+ goto out;
+ }
+ iobuf = iobuf_get(fop->xl->ctx->iobuf_pool);
+ if (iobuf == NULL) {
+ goto out;
+ }
+ err = iobref_add(iobref, iobuf);
+ if (err != 0) {
+ goto out;
+ }
+
+ if (fop->locks[0].lock)
+ ec_lock_update_good(fop->locks[0].lock, fop);
+ vector.iov_base = iobuf->ptr;
+ vector.iov_len = size;
+ memset(vector.iov_base, 0, vector.iov_len);
+
+ ec_writev(fop->frame, fop->xl, mask, fop->minimum, ec_update_writev_cbk,
+ NULL, fop->fd, &vector, 1, offset, 0, iobref, NULL);
+
+ err = 0;
+
+out:
+ if (iobuf != NULL) {
+ iobuf_unref(iobuf);
+ }
+ if (iobref != NULL) {
+ iobref_unref(iobref);
+ }
+
+ return err;
+}
+
+int
+ec_inode_write_cbk(call_frame_t *frame, xlator_t *this, void *cookie,
+ int op_ret, int op_errno, struct iatt *prestat,
+ struct iatt *poststat, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int i = 0;
+ int idx = 0;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+ idx = (int32_t)(uintptr_t)cookie;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret,
+ op_errno);
+ if (!cbk)
+ goto out;
+
+ if (op_ret < 0)
+ goto out;
+
+ if (xdata)
+ cbk->xdata = dict_ref(xdata);
+
+ if (prestat)
+ cbk->iatt[i++] = *prestat;
+
+ if (poststat)
+ cbk->iatt[i++] = *poststat;
+
+out:
+ if (cbk)
+ ec_combine(cbk, ec_combine_write);
+
+ if (fop)
+ ec_complete(fop);
+ return 0;
+}
+/* FOP: removexattr */
+
+int32_t
+ec_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL,
+ xdata);
+}
+
+void
+ec_wind_removexattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_removexattr_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->removexattr,
+ &fop->loc[0], fop->str[0], fop->xdata);
+}
+
+void
+ec_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ switch (fop->id) {
+ case GF_FOP_SETXATTR:
+ if (fop->cbks.setxattr) {
+ QUORUM_CBK(fop->cbks.setxattr, fop, frame, cookie, this, op_ret,
+ op_errno, xdata);
+ }
+ break;
+ case GF_FOP_REMOVEXATTR:
+ if (fop->cbks.removexattr) {
+ QUORUM_CBK(fop->cbks.removexattr, fop, frame, cookie, this,
+ op_ret, op_errno, xdata);
+ }
+ break;
+ case GF_FOP_FSETXATTR:
+ if (fop->cbks.fsetxattr) {
+ QUORUM_CBK(fop->cbks.fsetxattr, fop, frame, cookie, this,
+ op_ret, op_errno, xdata);
+ }
+ break;
+ case GF_FOP_FREMOVEXATTR:
+ if (fop->cbks.fremovexattr) {
+ QUORUM_CBK(fop->cbks.fremovexattr, fop, frame, cookie, this,
+ op_ret, op_errno, xdata);
+ }
+ break;
+ }
+}
+
+int32_t
+ec_manager_xattr(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ if (fop->fd == NULL) {
+ ec_lock_prepare_inode(fop, &fop->loc[0],
+ EC_UPDATE_META | EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO,
+ 0, EC_RANGE_FULL);
+ }
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ ec_fop_prepare_answer(fop, _gf_false);
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ ec_xattr_cbk(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->xdata);
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ ec_xattr_cbk(fop->req_frame, fop, fop->xl, -1, fop->error, NULL);
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_removexattr_cbk_t func, void *data,
+ loc_t *loc, const char *name, dict_t *xdata)
+{
+ ec_cbk_t callback = {.removexattr = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(REMOVEXATTR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_REMOVEXATTR, 0, target,
+ fop_flags, ec_wind_removexattr, ec_manager_xattr,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (name != NULL) {
+ fop->str[0] = gf_strdup(name);
+ if (fop->str[0] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/* FOP: fremovexattr */
+
+int32_t
+ec_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL,
+ xdata);
+}
+
+void
+ec_wind_fremovexattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_fremovexattr_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->fremovexattr,
+ fop->fd, fop->str[0], fop->xdata);
+}
+
+void
+ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ ec_cbk_t callback = {.fremovexattr = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FREMOVEXATTR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FREMOVEXATTR, 0, target,
+ fop_flags, ec_wind_fremovexattr,
+ ec_manager_xattr, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (name != NULL) {
+ fop->str[0] = gf_strdup(name);
+ if (fop->str[0] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/* FOP: setattr */
+
+int32_t
+ec_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prestat,
+ struct iatt *poststat, dict_t *xdata)
+{
+ return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat,
+ poststat, xdata);
+}
+
+void
+ec_wind_setattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_setattr_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->setattr,
+ &fop->loc[0], &fop->iatt, fop->int32, fop->xdata);
+}
+
+int32_t
+ec_manager_setattr(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ if (fop->fd == NULL) {
+ ec_lock_prepare_inode(fop, &fop->loc[0],
+ EC_UPDATE_META | EC_QUERY_INFO, 0,
+ EC_RANGE_FULL);
+ } else {
+ ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO,
+ 0, EC_RANGE_FULL);
+ }
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ if (cbk->iatt[0].ia_type == IA_IFREG) {
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop,
+ fop->locks[0].lock->loc.inode,
+ &cbk->iatt[0].ia_size));
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->id == GF_FOP_SETATTR) {
+ if (fop->cbks.setattr != NULL) {
+ QUORUM_CBK(fop->cbks.setattr, fop, fop->req_frame, fop,
+ fop->xl, cbk->op_ret, cbk->op_errno,
+ &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
+ }
+ } else {
+ if (fop->cbks.fsetattr != NULL) {
+ QUORUM_CBK(fop->cbks.fsetattr, fop, fop->req_frame, fop,
+ fop->xl, cbk->op_ret, cbk->op_errno,
+ &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
+ }
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->id == GF_FOP_SETATTR) {
+ if (fop->cbks.setattr != NULL) {
+ fop->cbks.setattr(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL, NULL, NULL);
+ }
+ } else {
+ if (fop->cbks.fsetattr != NULL) {
+ fop->cbks.fsetattr(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL, NULL, NULL);
+ }
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ ec_cbk_t callback = {.setattr = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(SETATTR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_SETATTR, 0, target,
+ fop_flags, ec_wind_setattr, ec_manager_setattr,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->int32 = valid;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (stbuf != NULL) {
+ fop->iatt = *stbuf;
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: fsetattr */
+
+int32_t
+ec_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prestat,
+ struct iatt *poststat, dict_t *xdata)
+{
+ return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat,
+ poststat, xdata);
+}
+
+void
+ec_wind_fsetattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_fsetattr_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->fsetattr,
+ fop->fd, &fop->iatt, fop->int32, fop->xdata);
+}
+
+void
+ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ ec_cbk_t callback = {.fsetattr = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FSETATTR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETATTR, 0, target,
+ fop_flags, ec_wind_fsetattr, ec_manager_setattr,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->int32 = valid;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (stbuf != NULL) {
+ fop->iatt = *stbuf;
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: setxattr */
+
+int32_t
+ec_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL,
+ xdata);
+}
+
+void
+ec_wind_setxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_setxattr_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->setxattr,
+ &fop->loc[0], fop->dict, fop->int32, fop->xdata);
+}
+
+void
+ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_setxattr_cbk_t func, void *data, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ ec_cbk_t callback = {.setxattr = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(SETXATTR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_SETXATTR, 0, target,
+ fop_flags, ec_wind_setxattr, ec_manager_xattr,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->int32 = flags;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (dict != NULL) {
+ fop->dict = dict_copy_with_ref(dict, NULL);
+ if (fop->dict == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/* FOP: fsetxattr */
+
+int32_t
+ec_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSETXATTR, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_fsetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_fsetxattr_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->fsetxattr,
+ fop->fd, fop->dict, fop->int32, fop->xdata);
+}
+
+void
+ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ ec_cbk_t callback = {.fsetxattr = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FSETXATTR) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETXATTR, 0, target,
+ fop_flags, ec_wind_fsetxattr, ec_manager_xattr,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->int32 = flags;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (dict != NULL) {
+ fop->dict = dict_copy_with_ref(dict, NULL);
+ if (fop->dict == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/*********************************************************************
+ *
+ * File Operation : fallocate
+ *
+ *********************************************************************/
+
+int32_t
+ec_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+}
+
+void
+ec_wind_fallocate(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_fallocate_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->fallocate,
+ fop->fd, fop->int32, fop->offset, fop->size, fop->xdata);
+}
+
+int32_t
+ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk = NULL;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ if (fop->size == 0) {
+ ec_fop_set_error(fop, EINVAL);
+ return EC_STATE_REPORT;
+ }
+ if (fop->int32 &
+ (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_PUNCH_HOLE)) {
+ ec_fop_set_error(fop, ENOTSUP);
+ return EC_STATE_REPORT;
+ }
+ fop->user_size = fop->offset + fop->size;
+ fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
+ _gf_true);
+ fop->size += fop->head;
+ ec_adjust_size_up(fop->xl->private, &fop->size, _gf_true);
+
+ /* Fall through */
+
+ case EC_STATE_LOCK:
+ ec_lock_prepare_fd(fop, fop->fd,
+ EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+ fop->offset, fop->size);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+ /* This shouldn't fail because we have the inode locked. */
+ LOCK(&fop->locks[0].lock->loc.inode->lock);
+ {
+ GF_ASSERT(__ec_get_inode_size(fop,
+ fop->locks[0].lock->loc.inode,
+ &cbk->iatt[0].ia_size));
+
+ /*If mode has FALLOC_FL_KEEP_SIZE keep the size */
+ if (fop->int32 & FALLOC_FL_KEEP_SIZE) {
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+ } else if (fop->user_size > cbk->iatt[0].ia_size) {
+ cbk->iatt[1].ia_size = fop->user_size;
+
+ /* This shouldn't fail because we have the inode
+ * locked. */
+ GF_ASSERT(__ec_set_inode_size(
+ fop, fop->locks[0].lock->loc.inode,
+ cbk->iatt[1].ia_size));
+ } else {
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+ }
+ }
+ UNLOCK(&fop->locks[0].lock->loc.inode->lock);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.fallocate != NULL) {
+ QUORUM_CBK(fop->cbks.fallocate, fop, fop->req_frame, fop,
+ fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.fallocate != NULL) {
+ fop->cbks.fallocate(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd,
+ int32_t mode, off_t offset, size_t len, dict_t *xdata)
+{
+ ec_cbk_t callback = {.fallocate = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FALLOCATE) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FALLOCATE, 0, target,
+ fop_flags, ec_wind_fallocate,
+ ec_manager_fallocate, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+ fop->int32 = mode;
+ fop->offset = offset;
+ fop->size = len;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+ goto out;
+ }
+ }
+
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
+
+/*********************************************************************
+ *
+ * File Operation : Discard
+ *
+ *********************************************************************/
+void
+ec_update_discard_write(ec_fop_data_t *fop, uintptr_t mask)
+{
+ ec_t *ec = fop->xl->private;
+ off_t off_head = 0;
+ off_t off_tail = 0;
+ uint64_t size_head = 0;
+ uint64_t size_tail = 0;
+ int error = 0;
+
+ off_head = fop->offset * ec->fragments - fop->int32;
+ if (fop->size == 0) {
+ error = ec_update_write(fop, mask, off_head, fop->user_size);
+ } else {
+ size_head = fop->int32;
+ size_tail = (off_head + fop->user_size) % ec->stripe_size;
+ off_tail = off_head + fop->user_size - size_tail;
+ if (size_head) {
+ error = ec_update_write(fop, mask, off_head, size_head);
+ if (error) {
+ goto out;
+ }
+ }
+ if (size_tail) {
+ error = ec_update_write(fop, mask, off_tail, size_tail);
+ }
+ }
+out:
+ if (error)
+ ec_fop_set_error(fop, -error);
+}
+
+void
+ec_discard_adjust_offset_size(ec_fop_data_t *fop)
+{
+ ec_t *ec = fop->xl->private;
+
+ fop->user_size = fop->size;
+ /* If discard length covers at least a fragment on brick, we will
+ * perform discard operation(when fop->size is non-zero) else we just
+ * write zeros.
+ */
+ fop->int32 = ec_adjust_offset_up(ec, &fop->offset, _gf_true);
+ fop->frag_range.first = fop->offset;
+ if (fop->size < fop->int32) {
+ fop->size = 0;
+ } else {
+ fop->size -= fop->int32;
+ ec_adjust_size_down(ec, &fop->size, _gf_true);
+ }
+ fop->frag_range.last = fop->offset + fop->size;
+}
+
+int32_t
+ec_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+}
+
+void
+ec_wind_discard(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_discard_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->discard,
+ fop->fd, fop->offset, fop->size, fop->xdata);
+}
+
+int32_t
+ec_manager_discard(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk = NULL;
+ off_t fl_start = 0;
+ uint64_t fl_size = 0;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ if ((fop->size <= 0) || (fop->offset < 0)) {
+ ec_fop_set_error(fop, EINVAL);
+ return EC_STATE_REPORT;
+ }
+ /* Because of the head/tail writes, "discard" happens on the
+ * remaining regions, but we need to compute region including
+ * head/tail writes so compute them separately*/
+ fl_start = fop->offset;
+ fl_size = fop->size;
+ fl_size += ec_adjust_offset_down(fop->xl->private, &fl_start,
+ _gf_true);
+ ec_adjust_size_up(fop->xl->private, &fl_size, _gf_true);
+
+ ec_discard_adjust_offset_size(fop);
+
+ /* Fall through */
+
+ case EC_STATE_LOCK:
+ ec_lock_prepare_fd(fop, fop->fd,
+ EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+ fl_start, fl_size);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+
+ /* Dispatch discard fop only if we have whole fragment
+ * to deallocate */
+ if (fop->size) {
+ ec_dispatch_all(fop);
+ return EC_STATE_DELAYED_START;
+ } else {
+ /* Assume discard to have succeeded on all bricks */
+ ec_succeed_all(fop);
+ }
+
+ /* Fall through */
+
+ case EC_STATE_DELAYED_START:
+
+ if (fop->size) {
+ if (fop->answer && fop->answer->op_ret == 0)
+ ec_update_discard_write(fop, fop->answer->mask);
+ } else {
+ ec_update_discard_write(fop, fop->mask);
+ }
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
+ &cbk->iatt[0].ia_size));
+
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+ }
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.discard != NULL) {
+ QUORUM_CBK(fop->cbks.discard, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_DELAYED_START:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.discard != NULL) {
+ fop->cbks.discard(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ ec_cbk_t callback = {.discard = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(DISCARD) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_DISCARD, 0, target,
+ fop_flags, ec_wind_discard, ec_manager_discard,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+ fop->offset = offset;
+ fop->size = len;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ }
+
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
+
+/*********************************************************************
+ *
+ * File Operation : truncate
+ *
+ *********************************************************************/
+
+int32_t
+ec_update_truncate_write(ec_fop_data_t *fop, uintptr_t mask)
+{
+ ec_t *ec = fop->xl->private;
+ uint64_t size = fop->offset * ec->fragments - fop->user_size;
+ return ec_update_write(fop, mask, fop->user_size, size);
+}
+
+int32_t
+ec_truncate_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ ec_fop_data_t *fop = cookie;
+ int32_t err;
+
+ fop->parent->good &= fop->good;
+ if (op_ret >= 0) {
+ fd_bind(fd);
+ err = ec_update_truncate_write(fop->parent, fop->answer->mask);
+ if (err != 0) {
+ ec_fop_set_error(fop->parent, -err);
+ }
+ }
+
+ return 0;
+}
+
+int32_t
+ec_truncate_clean(ec_fop_data_t *fop)
+{
+ if (fop->fd == NULL) {
+ fop->fd = fd_create(fop->loc[0].inode, fop->frame->root->pid);
+ if (fop->fd == NULL) {
+ return -ENOMEM;
+ }
+
+ ec_open(fop->frame, fop->xl, fop->answer->mask, fop->minimum,
+ ec_truncate_open_cbk, fop, &fop->loc[0], O_RDWR, fop->fd, NULL);
+
+ return 0;
+ } else {
+ return ec_update_truncate_write(fop, fop->answer->mask);
+ }
+}
+
+int32_t
+ec_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prestat,
+ struct iatt *poststat, dict_t *xdata)
+{
+ return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat,
+ poststat, xdata);
+}
+
+void
+ec_wind_truncate(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_truncate_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->truncate,
+ &fop->loc[0], fop->offset, fop->xdata);
+}
+
+int32_t
+ec_manager_truncate(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+ off_t offset_down;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ fop->user_size = fop->offset;
+ ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true);
+ fop->frag_range.first = fop->offset;
+ fop->frag_range.last = UINT64_MAX;
+
+ /* Fall through */
+
+ case EC_STATE_LOCK:
+ offset_down = fop->user_size;
+ ec_adjust_offset_down(fop->xl->private, &offset_down, _gf_true);
+
+ if (fop->id == GF_FOP_TRUNCATE) {
+ ec_lock_prepare_inode(
+ fop, &fop->loc[0],
+ EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+ offset_down, EC_RANGE_FULL);
+ } else {
+ ec_lock_prepare_fd(
+ fop, fop->fd,
+ EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+ offset_down, EC_RANGE_FULL);
+ }
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ int32_t err;
+
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+ /* This shouldn't fail because we have the inode locked. */
+ /* Inode size doesn't need to be updated under locks, because
+ * conflicting operations won't be in-flight
+ */
+ GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
+ &cbk->iatt[0].ia_size));
+ cbk->iatt[1].ia_size = fop->user_size;
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_set_inode_size(fop, fop->locks[0].lock->loc.inode,
+ fop->user_size));
+ if ((cbk->iatt[0].ia_size > cbk->iatt[1].ia_size) &&
+ (fop->user_size != fop->offset)) {
+ err = ec_truncate_clean(fop);
+ if (err != 0) {
+ ec_cbk_set_error(cbk, -err, _gf_false);
+ }
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->id == GF_FOP_TRUNCATE) {
+ if (fop->cbks.truncate != NULL) {
+ QUORUM_CBK(fop->cbks.truncate, fop, fop->req_frame, fop,
+ fop->xl, cbk->op_ret, cbk->op_errno,
+ &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
+ }
+ } else {
+ if (fop->cbks.ftruncate != NULL) {
+ QUORUM_CBK(fop->cbks.ftruncate, fop, fop->req_frame, fop,
+ fop->xl, cbk->op_ret, cbk->op_errno,
+ &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
+ }
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->id == GF_FOP_TRUNCATE) {
+ if (fop->cbks.truncate != NULL) {
+ fop->cbks.truncate(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL, NULL, NULL);
+ }
+ } else {
+ if (fop->cbks.ftruncate != NULL) {
+ fop->cbks.ftruncate(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL, NULL, NULL);
+ }
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ ec_cbk_t callback = {.truncate = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(TRUNCATE) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_TRUNCATE, 0, target,
+ fop_flags, ec_wind_truncate, ec_manager_truncate,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->offset = offset;
+
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: ftruncate */
+
+int32_t
+ec_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prestat,
+ struct iatt *poststat, dict_t *xdata)
+{
+ return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat,
+ poststat, xdata);
+}
+
+void
+ec_wind_ftruncate(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_ftruncate_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->ftruncate,
+ fop->fd, fop->offset, fop->xdata);
+}
+
+void
+ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ ec_cbk_t callback = {.ftruncate = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FTRUNCATE) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FTRUNCATE, 0, target,
+ fop_flags, ec_wind_ftruncate,
+ ec_manager_truncate, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->offset = offset;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
+
+/* FOP: writev */
+static ec_stripe_t *
+ec_allocate_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache)
+{
+ ec_stripe_t *stripe = NULL;
+
+ if (stripe_cache->count >= stripe_cache->max) {
+ GF_ASSERT(!list_empty(&stripe_cache->lru));
+ stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru);
+ list_move_tail(&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.evicts);
+ } else {
+ stripe = GF_MALLOC(sizeof(ec_stripe_t) + ec->stripe_size,
+ ec_mt_ec_stripe_t);
+ if (stripe != NULL) {
+ stripe_cache->count++;
+ list_add_tail(&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.allocs);
+ } else {
+ GF_ATOMIC_INC(ec->stats.stripe_cache.errors);
+ }
+ }
+
+ return stripe;
+}
+
+static void
+ec_write_stripe_data(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+ off_t base;
+
+ base = fop->size - ec->stripe_size;
+ memcpy(stripe->data, fop->vector[0].iov_base + base, ec->stripe_size);
+ stripe->frag_offset = fop->frag_range.last - ec->fragment_size;
+}
+
+static void
+ec_add_stripe_in_cache(ec_t *ec, ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ ec_stripe_t *stripe = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+ gf_boolean_t failed = _gf_true;
+
+ LOCK(&fop->fd->inode->lock);
+
+ ctx = __ec_inode_get(fop->fd->inode, fop->xl);
+ if (ctx == NULL) {
+ goto out;
+ }
+
+ stripe_cache = &ctx->stripe_cache;
+ if (stripe_cache->max > 0) {
+ stripe = ec_allocate_stripe(ec, stripe_cache);
+ if (stripe == NULL) {
+ goto out;
+ }
+
+ ec_write_stripe_data(ec, fop, stripe);
+ }
+
+ failed = _gf_false;
+
+out:
+ UNLOCK(&fop->fd->inode->lock);
+
+ if (failed) {
+ gf_msg(ec->xl->name, GF_LOG_DEBUG, ENOMEM, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to create and add stripe in cache");
+ }
+}
+
+int32_t
+ec_writev_merge_tail(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iovec *vector,
+ int32_t count, struct iatt *stbuf, struct iobref *iobref,
+ dict_t *xdata)
+{
+ ec_t *ec = this->private;
+ ec_fop_data_t *fop = frame->local;
+ uint64_t size, base, tmp;
+
+ if (op_ret >= 0) {
+ tmp = 0;
+ size = fop->size - fop->user_size - fop->head;
+ base = ec->stripe_size - size;
+ if (op_ret > base) {
+ tmp = min(op_ret - base, size);
+ ec_iov_copy_to(fop->vector[0].iov_base + fop->size - size, vector,
+ count, base, tmp);
+
+ size -= tmp;
+ }
+
+ if (size > 0) {
+ memset(fop->vector[0].iov_base + fop->size - size, 0, size);
+ }
+
+ if (ec->stripe_cache) {
+ ec_add_stripe_in_cache(ec, fop);
+ }
+ }
+ return 0;
+}
+
+int32_t
+ec_writev_merge_head(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iovec *vector,
+ int32_t count, struct iatt *stbuf, struct iobref *iobref,
+ dict_t *xdata)
+{
+ ec_t *ec = this->private;
+ ec_fop_data_t *fop = frame->local;
+ uint64_t size, base;
+
+ if (op_ret >= 0) {
+ size = fop->head;
+ base = 0;
+
+ if (op_ret > 0) {
+ base = min(op_ret, size);
+ ec_iov_copy_to(fop->vector[0].iov_base, vector, count, 0, base);
+
+ size -= base;
+ }
+
+ if (size > 0) {
+ memset(fop->vector[0].iov_base + base, 0, size);
+ }
+
+ size = fop->size - fop->user_size - fop->head;
+ if ((size > 0) && (fop->size == ec->stripe_size)) {
+ ec_writev_merge_tail(frame, cookie, this, op_ret, op_errno, vector,
+ count, stbuf, iobref, xdata);
+ }
+ }
+
+ return 0;
+}
+
+static int
+ec_make_internal_fop_xdata(dict_t **xdata)
+{
+ dict_t *dict = NULL;
+
+ if (*xdata)
+ return 0;
+
+ dict = dict_new();
+ if (!dict)
+ goto out;
+
+ if (dict_set_str(dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"))
+ goto out;
+
+ *xdata = dict;
+ return 0;
+out:
+ if (dict)
+ dict_unref(dict);
+ return -1;
+}
+
+static int32_t
+ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop)
+{
+ struct iobref *iobref = NULL;
+ struct iovec *iov;
+ void *ptr;
+ int32_t err;
+
+ fop->user_size = iov_length(fop->vector, fop->int32);
+ fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false);
+ fop->frag_range.first = fop->offset / ec->fragments;
+ fop->size = fop->user_size + fop->head;
+ ec_adjust_size_up(ec, &fop->size, _gf_false);
+ fop->frag_range.last = fop->frag_range.first + fop->size / ec->fragments;
+
+ if ((fop->int32 != 1) || (fop->head != 0) || (fop->size > fop->user_size) ||
+ !EC_ALIGN_CHECK(fop->vector[0].iov_base, EC_METHOD_WORD_SIZE)) {
+ err = ec_buffer_alloc(ec->xl, fop->size, &iobref, &ptr);
+ if (err != 0) {
+ goto out;
+ }
+
+ ec_iov_copy_to(ptr + fop->head, fop->vector, fop->int32, 0,
+ fop->user_size);
+
+ fop->vector[0].iov_base = ptr;
+ fop->vector[0].iov_len = fop->size;
+
+ iobref_unref(fop->buffers);
+ fop->buffers = iobref;
+ }
+
+ if (fop->int32 != 2) {
+ iov = GF_MALLOC(VECTORSIZE(2), gf_common_mt_iovec);
+ if (iov == NULL) {
+ err = -ENOMEM;
+
+ goto out;
+ }
+ iov[0].iov_base = fop->vector[0].iov_base;
+ iov[0].iov_len = fop->vector[0].iov_len;
+
+ GF_FREE(fop->vector);
+ fop->vector = iov;
+ }
+
+ fop->vector[1].iov_len = fop->size / ec->fragments;
+ err = ec_buffer_alloc(ec->xl, fop->vector[1].iov_len * ec->nodes,
+ &fop->buffers, &fop->vector[1].iov_base);
+ if (err != 0) {
+ goto out;
+ }
+
+ err = 0;
+
+out:
+ return err;
+}
+
+static void
+ec_merge_stripe_head_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+ uint32_t head, size;
+
+ head = fop->head;
+ memcpy(fop->vector[0].iov_base, stripe->data, head);
+
+ size = ec->stripe_size - head;
+ if (size > fop->user_size) {
+ head += fop->user_size;
+ size = ec->stripe_size - head;
+ memcpy(fop->vector[0].iov_base + head, stripe->data + head, size);
+ }
+}
+
+static void
+ec_merge_stripe_tail_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+ uint32_t head, tail;
+ off_t offset;
+
+ offset = fop->user_size + fop->head;
+ tail = fop->size - offset;
+ head = ec->stripe_size - tail;
+
+ memcpy(fop->vector[0].iov_base + offset, stripe->data + head, tail);
+}
+
+static ec_stripe_t *
+ec_get_stripe_from_cache_locked(ec_t *ec, ec_fop_data_t *fop,
+ uint64_t frag_offset)
+{
+ ec_inode_t *ctx = NULL;
+ ec_stripe_t *stripe = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+
+ ctx = __ec_inode_get(fop->fd->inode, fop->xl);
+ if (ctx == NULL) {
+ GF_ATOMIC_INC(ec->stats.stripe_cache.errors);
+ return NULL;
+ }
+
+ stripe_cache = &ctx->stripe_cache;
+ list_for_each_entry(stripe, &stripe_cache->lru, lru)
+ {
+ if (stripe->frag_offset == frag_offset) {
+ list_move_tail(&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.hits);
+ return stripe;
+ }
+ }
+
+ GF_ATOMIC_INC(ec->stats.stripe_cache.misses);
+
+ return NULL;
+}
+
+static gf_boolean_t
+ec_get_and_merge_stripe(ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which)
+{
+ uint64_t frag_offset;
+ ec_stripe_t *stripe = NULL;
+ gf_boolean_t found = _gf_false;
+
+ if (!ec->stripe_cache) {
+ return found;
+ }
+
+ LOCK(&fop->fd->inode->lock);
+ if (which == EC_STRIPE_HEAD) {
+ frag_offset = fop->frag_range.first;
+ stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset);
+ if (stripe) {
+ ec_merge_stripe_head_locked(ec, fop, stripe);
+ found = _gf_true;
+ }
+ }
+
+ if (which == EC_STRIPE_TAIL) {
+ frag_offset = fop->frag_range.last - ec->fragment_size;
+ stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset);
+ if (stripe) {
+ ec_merge_stripe_tail_locked(ec, fop, stripe);
+ found = _gf_true;
+ }
+ }
+ UNLOCK(&fop->fd->inode->lock);
+
+ return found;
+}
+
+static uintptr_t
+ec_get_lock_good_mask(inode_t *inode, xlator_t *xl)
+{
+ ec_lock_t *lock = NULL;
+ ec_inode_t *ictx = NULL;
+ LOCK(&inode->lock);
+ {
+ ictx = __ec_inode_get(inode, xl);
+ if (ictx)
+ lock = ictx->inode_lock;
+ }
+ UNLOCK(&inode->lock);
+ if (lock)
+ return lock->good_mask;
+ return 0;
+}
+
+void
+ec_writev_start(ec_fop_data_t *fop)
+{
+ ec_t *ec = fop->xl->private;
+ ec_fd_t *ctx;
+ fd_t *fd;
+ dict_t *xdata = NULL;
+ uint64_t tail, current;
+ int32_t err = -ENOMEM;
+ gf_boolean_t found_stripe = _gf_false;
+
+ /* This shouldn't fail because we have the inode locked. */
+ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &current));
+
+ fd = fd_anonymous(fop->fd->inode);
+ if (fd == NULL) {
+ goto failed;
+ }
+
+ fop->frame->root->uid = 0;
+ fop->frame->root->gid = 0;
+
+ ctx = ec_fd_get(fop->fd, fop->xl);
+ if (ctx != NULL) {
+ if ((ctx->flags & O_APPEND) != 0) {
+ /* Appending writes take full locks so size won't change because
+ * of any parallel operations
+ */
+ fop->offset = current;
+ }
+ }
+
+ err = ec_writev_prepare_buffers(ec, fop);
+ if (err != 0) {
+ goto failed_fd;
+ }
+ tail = fop->size - fop->user_size - fop->head;
+ if (fop->head > 0) {
+ if (current > fop->offset) {
+ found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
+ if (!found_stripe) {
+ if (ec_make_internal_fop_xdata(&xdata)) {
+ err = -ENOMEM;
+ goto failed_xdata;
+ }
+ ec_readv(fop->frame, fop->xl,
+ ec_get_lock_good_mask(fop->fd->inode, fop->xl),
+ EC_MINIMUM_MIN, ec_writev_merge_head, NULL, fd,
+ ec->stripe_size, fop->offset, 0, xdata);
+ }
+ } else {
+ memset(fop->vector[0].iov_base, 0, fop->head);
+ memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
+ if (ec->stripe_cache && (fop->size <= ec->stripe_size)) {
+ ec_add_stripe_in_cache(ec, fop);
+ }
+ }
+ }
+
+ if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
+ /* Current locking scheme will make sure the 'current' below will
+ * never decrease while the fop is in progress, so the checks will
+ * work as expected
+ */
+ if (current > fop->offset + fop->head + fop->user_size) {
+ found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_TAIL);
+ if (!found_stripe) {
+ if (ec_make_internal_fop_xdata(&xdata)) {
+ err = -ENOMEM;
+ goto failed_xdata;
+ }
+ ec_readv(fop->frame, fop->xl,
+ ec_get_lock_good_mask(fop->fd->inode, fop->xl),
+ EC_MINIMUM_MIN, ec_writev_merge_tail, NULL, fd,
+ ec->stripe_size,
+ fop->offset + fop->size - ec->stripe_size, 0, xdata);
+ }
+ } else {
+ memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
+ if (ec->stripe_cache) {
+ ec_add_stripe_in_cache(ec, fop);
+ }
+ }
+ }
+
+ err = 0;
+
+failed_xdata:
+ if (xdata) {
+ dict_unref(xdata);
+ }
+failed_fd:
+ fd_unref(fd);
+failed:
+ ec_fop_set_error(fop, -err);
+}
+
+int32_t
+ec_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *prestat, struct iatt *poststat,
+ dict_t *xdata)
+{
+ ec_t *ec = NULL;
+ if (this && this->private) {
+ ec = this->private;
+ if ((op_ret > 0) && ((op_ret % ec->fragment_size) != 0)) {
+ op_ret = -1;
+ op_errno = EIO;
+ }
+ }
+ return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat,
+ poststat, xdata);
+}
+
+void
+ec_wind_writev(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ struct iovec vector[1];
+ size_t size;
+
+ size = fop->vector[1].iov_len;
+
+ vector[0].iov_base = fop->vector[1].iov_base + idx * size;
+ vector[0].iov_len = size;
+
+ STACK_WIND_COOKIE(fop->frame, ec_writev_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->writev, fop->fd,
+ vector, 1, fop->offset / ec->fragments, fop->uint32,
+ fop->buffers, fop->xdata);
+}
+
+static void
+ec_writev_encode(ec_fop_data_t *fop)
+{
+ ec_t *ec = fop->xl->private;
+ void *blocks[ec->nodes];
+ uint32_t i;
+
+ blocks[0] = fop->vector[1].iov_base;
+ for (i = 1; i < ec->nodes; i++) {
+ blocks[i] = blocks[i - 1] + fop->vector[1].iov_len;
+ }
+ ec_method_encode(&ec->matrix, fop->vector[0].iov_len,
+ fop->vector[0].iov_base, blocks);
+}
+
+int32_t
+ec_manager_writev(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+ ec_fd_t *ctx = NULL;
+ ec_t *ec = fop->xl->private;
+ off_t fl_start = 0;
+ uint64_t fl_size = LONG_MAX;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ case EC_STATE_LOCK:
+ ctx = ec_fd_get(fop->fd, fop->xl);
+ if (ctx != NULL) {
+ if ((ctx->flags & O_APPEND) == 0) {
+ off_t user_size = 0;
+ off_t head = 0;
+
+ fl_start = fop->offset;
+ user_size = iov_length(fop->vector, fop->int32);
+ head = ec_adjust_offset_down(ec, &fl_start, _gf_true);
+ fl_size = user_size + head;
+ ec_adjust_size_up(ec, &fl_size, _gf_true);
+ }
+ }
+ ec_lock_prepare_fd(fop, fop->fd,
+ EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+ fl_start, fl_size);
+ ec_lock(fop);
+
+ return EC_STATE_DISPATCH;
+
+ case EC_STATE_DISPATCH:
+ ec_writev_start(fop);
+
+ return EC_STATE_DELAYED_START;
+
+ case EC_STATE_DELAYED_START:
+ /* Restore uid, gid if they were changed to do some partial
+ * reads. */
+ fop->frame->root->uid = fop->uid;
+ fop->frame->root->gid = fop->gid;
+
+ ec_writev_encode(fop);
+
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ cbk = ec_fop_prepare_answer(fop, _gf_false);
+ if (cbk != NULL) {
+ ec_t *ec = fop->xl->private;
+ uint64_t size;
+
+ ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+ /* This shouldn't fail because we have the inode locked. */
+ LOCK(&fop->fd->inode->lock);
+ {
+ GF_ASSERT(__ec_get_inode_size(fop, fop->fd->inode,
+ &cbk->iatt[0].ia_size));
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+ size = fop->offset + fop->head + fop->user_size;
+ if (size > cbk->iatt[0].ia_size) {
+ /* Only update inode size if this is a top level fop.
+ * Otherwise this is an internal write and the top
+ * level fop should take care of the real inode size.
+ */
+ if (fop->parent == NULL) {
+ /* This shouldn't fail because we have the inode
+ * locked. */
+ GF_ASSERT(
+ __ec_set_inode_size(fop, fop->fd->inode, size));
+ }
+ cbk->iatt[1].ia_size = size;
+ }
+ }
+ UNLOCK(&fop->fd->inode->lock);
+
+ if (fop->error == 0) {
+ cbk->op_ret *= ec->fragments;
+ if (cbk->op_ret < fop->head) {
+ cbk->op_ret = 0;
+ } else {
+ cbk->op_ret -= fop->head;
+ }
+ if (cbk->op_ret > fop->user_size) {
+ cbk->op_ret = fop->user_size;
+ }
+ }
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.writev != NULL) {
+ QUORUM_CBK(fop->cbks.writev, fop, fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+ &cbk->iatt[1], cbk->xdata);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_DELAYED_START:
+ /* We have failed while doing partial reads. We need to restore
+ * original uid, gid. */
+ fop->frame->root->uid = fop->uid;
+ fop->frame->root->gid = fop->gid;
+
+ /* Fall through */
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_LOCK:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.writev != NULL) {
+ fop->cbks.writev(fop->req_frame, fop, fop->xl, -1, fop->error,
+ NULL, NULL, NULL);
+ }
+
+ return EC_STATE_LOCK_REUSE;
+
+ case -EC_STATE_LOCK_REUSE:
+ case EC_STATE_LOCK_REUSE:
+ ec_lock_reuse(fop);
+
+ return EC_STATE_UNLOCK;
+
+ case -EC_STATE_UNLOCK:
+ case EC_STATE_UNLOCK:
+ ec_unlock(fop);
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ ec_cbk_t callback = {.writev = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(WRITE) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_WRITE, 0, target, fop_flags,
+ ec_wind_writev, ec_manager_writev, callback,
+ data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->int32 = count;
+ fop->offset = offset;
+ fop->uint32 = flags;
+
+ fop->use_fd = 1;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (count > 0) {
+ fop->vector = iov_dup(vector, count);
+ if (fop->vector == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a "
+ "vector list.");
+
+ goto out;
+ }
+ fop->int32 = count;
+ }
+ if (iobref != NULL) {
+ fop->buffers = iobref_ref(iobref);
+ if (fop->buffers == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_BUF_REF_FAIL,
+ "Failed to reference a "
+ "buffer.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_copy_with_ref(xdata, NULL);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+ }
+}
diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
new file mode 100644
index 00000000000..601960d6154
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-locks.c
@@ -0,0 +1,1128 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-fops.h"
+#include "ec-messages.h"
+
+#define EC_LOCK_MODE_NONE 0
+#define EC_LOCK_MODE_INC 1
+#define EC_LOCK_MODE_ALL 2
+
+int32_t
+ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
+{
+ ec_t *ec = fop->xl->private;
+ ec_cbk_data_t *ans = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ uintptr_t locked = 0;
+ int32_t good = 0;
+ int32_t eagain = 0;
+ int32_t estale = 0;
+ int32_t error = -1;
+
+ /* There are some errors that we'll handle in an special way while trying
+ * to acquire a lock.
+ *
+ * EAGAIN: If it's found during a parallel non-blocking lock request, we
+ * consider that there's contention on the inode, so we consider
+ * the acquisition a failure and try again with a sequential
+ * blocking lock request. This will ensure that we get a lock on
+ * as many bricks as possible (ignoring EAGAIN here would cause
+ * unnecessary triggers of self-healing).
+ *
+ * If it's found during a sequential blocking lock request, it's
+ * considered an error. Lock will only succeed if there are
+ * enough other bricks locked.
+ *
+ * ESTALE: This can appear during parallel or sequential lock request if
+ * the inode has just been unlinked. We consider this error is
+ * not recoverable, but we also don't consider it as fatal. So,
+ * if it happens during parallel lock, we won't attempt a
+ * sequential one unless there are EAGAIN errors on other
+ * bricks (and are enough to form a quorum), but if we reach
+ * quorum counting the ESTALE bricks, we consider the whole
+ * result of the operation is ESTALE instead of EIO.
+ */
+
+ list_for_each_entry(ans, &fop->cbk_list, list)
+ {
+ if (ans->op_ret >= 0) {
+ if (locked != 0) {
+ error = EIO;
+ }
+ locked |= ans->mask;
+ good = ans->count;
+ cbk = ans;
+ } else if (ans->op_errno == ESTALE) {
+ estale += ans->count;
+ } else if ((ans->op_errno == EAGAIN) &&
+ (fop->uint32 != EC_LOCK_MODE_INC)) {
+ eagain += ans->count;
+ }
+ }
+
+ if (error == -1) {
+ /* If we have enough quorum with succeeded and EAGAIN answers, we
+ * ignore for now any ESTALE answer. If there are EAGAIN answers,
+ * we retry with a sequential blocking lock request if needed.
+ * Otherwise we succeed. */
+ if ((good + eagain) >= ec->fragments) {
+ if (eagain == 0) {
+ if (fop->answer == NULL) {
+ fop->answer = cbk;
+ }
+
+ ec_update_good(fop, locked);
+
+ error = 0;
+ } else {
+ switch (fop->uint32) {
+ case EC_LOCK_MODE_NONE:
+ error = EAGAIN;
+ break;
+ case EC_LOCK_MODE_ALL:
+ fop->uint32 = EC_LOCK_MODE_INC;
+ break;
+ default:
+ /* This shouldn't happen because eagain cannot be > 0
+ * when fop->uint32 is EC_LOCK_MODE_INC. */
+ error = EIO;
+ break;
+ }
+ }
+ } else {
+ /* We have been unable to find enough candidates that will be able
+ * to take the lock. If we have quorum on some answer, we return
+ * it. Otherwise we check if ESTALE answers allow us to reach
+ * quorum. If so, we return ESTALE. */
+ if (fop->answer && fop->answer->op_ret < 0) {
+ error = fop->answer->op_errno;
+ } else if ((good + eagain + estale) >= ec->fragments) {
+ error = ESTALE;
+ } else {
+ error = EIO;
+ }
+ }
+ }
+
+ *mask = locked;
+
+ return error;
+}
+
+int32_t
+ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_UNLOCK_FAILED,
+ "Failed to unlock an entry/inode");
+ }
+
+ return 0;
+}
+
+int32_t
+ec_lock_lk_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *flock,
+ dict_t *xdata)
+{
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_LK_UNLOCK_FAILED,
+ "Failed to unlock an lk");
+ }
+
+ return 0;
+}
+
+/* FOP: entrylk */
+
+int32_t
+ec_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_ENTRYLK, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_entrylk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_entrylk_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->entrylk,
+ fop->str[0], &fop->loc[0], fop->str[1], fop->entrylk_cmd,
+ fop->entrylk_type, fop->xdata);
+}
+
+int32_t
+ec_manager_entrylk(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ if (fop->entrylk_cmd == ENTRYLK_LOCK) {
+ fop->uint32 = EC_LOCK_MODE_ALL;
+ fop->entrylk_cmd = ENTRYLK_LOCK_NB;
+ }
+
+ /* Fall through */
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_PREPARE_ANSWER:
+ if (fop->entrylk_cmd != ENTRYLK_UNLOCK) {
+ uintptr_t mask;
+
+ ec_fop_set_error(fop, ec_lock_check(fop, &mask));
+ if (fop->error != 0) {
+ if (mask != 0) {
+ if (fop->id == GF_FOP_ENTRYLK) {
+ ec_entrylk(
+ fop->frame, fop->xl, mask, 1, ec_lock_unlocked,
+ NULL, fop->str[0], &fop->loc[0], fop->str[1],
+ ENTRYLK_UNLOCK, fop->entrylk_type, fop->xdata);
+ } else {
+ ec_fentrylk(fop->frame, fop->xl, mask, 1,
+ ec_lock_unlocked, NULL, fop->str[0],
+ fop->fd, fop->str[1], ENTRYLK_UNLOCK,
+ fop->entrylk_type, fop->xdata);
+ }
+ }
+ if (fop->error < 0) {
+ fop->error = 0;
+
+ fop->entrylk_cmd = ENTRYLK_LOCK;
+
+ ec_dispatch_inc(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+ }
+ }
+ } else {
+ ec_fop_prepare_answer(fop, _gf_true);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->id == GF_FOP_ENTRYLK) {
+ if (fop->cbks.entrylk != NULL) {
+ fop->cbks.entrylk(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->xdata);
+ }
+ } else {
+ if (fop->cbks.fentrylk != NULL) {
+ fop->cbks.fentrylk(fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, cbk->xdata);
+ }
+ }
+
+ return EC_STATE_END;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->id == GF_FOP_ENTRYLK) {
+ if (fop->cbks.entrylk != NULL) {
+ fop->cbks.entrylk(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL);
+ }
+ } else {
+ if (fop->cbks.fentrylk != NULL) {
+ fop->cbks.fentrylk(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL);
+ }
+ }
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_entrylk_cbk_t func, void *data,
+ const char *volume, loc_t *loc, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
+{
+ ec_cbk_t callback = {.entrylk = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(ENTRYLK) %p", frame);
+
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_ENTRYLK, 0, target,
+ fop_flags, ec_wind_entrylk, ec_manager_entrylk,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->entrylk_cmd = cmd;
+ fop->entrylk_type = type;
+
+ if (volume != NULL) {
+ fop->str[0] = gf_strdup(volume);
+ if (fop->str[0] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (basename != NULL) {
+ fop->str[1] = gf_strdup(basename);
+ if (fop->str[1] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/* FOP: fentrylk */
+
+int32_t
+ec_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FENTRYLK, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_fentrylk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_fentrylk_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->fentrylk,
+ fop->str[0], fop->fd, fop->str[1], fop->entrylk_cmd,
+ fop->entrylk_type, fop->xdata);
+}
+
+void
+ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fentrylk_cbk_t func, void *data,
+ const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
+{
+ ec_cbk_t callback = {.fentrylk = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FENTRYLK) %p", frame);
+
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FENTRYLK, 0, target,
+ fop_flags, ec_wind_fentrylk, ec_manager_entrylk,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->entrylk_cmd = cmd;
+ fop->entrylk_type = type;
+
+ if (volume != NULL) {
+ fop->str[0] = gf_strdup(volume);
+ if (fop->str[0] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (basename != NULL) {
+ fop->str[1] = gf_strdup(basename);
+ if (fop->str[1] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/* FOP: inodelk */
+
+int32_t
+ec_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_INODELK, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_inodelk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_inodelk_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->inodelk,
+ fop->str[0], &fop->loc[0], fop->int32, &fop->flock,
+ fop->xdata);
+}
+
+int32_t
+ec_manager_inodelk(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ fop->flock.l_len += ec_adjust_offset_down(
+ fop->xl->private, &fop->flock.l_start, _gf_true);
+ ec_adjust_offset_up(fop->xl->private, &fop->flock.l_len, _gf_true);
+ if ((fop->int32 == F_SETLKW) && (fop->flock.l_type != F_UNLCK)) {
+ fop->uint32 = EC_LOCK_MODE_ALL;
+ fop->int32 = F_SETLK;
+ }
+
+ /* Fall through */
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_PREPARE_ANSWER:
+ if (fop->flock.l_type != F_UNLCK) {
+ uintptr_t mask;
+
+ ec_fop_set_error(fop, ec_lock_check(fop, &mask));
+ if (fop->error != 0) {
+ if (mask != 0) {
+ ec_t *ec = fop->xl->private;
+ struct gf_flock flock;
+
+ flock.l_type = F_UNLCK;
+ flock.l_whence = fop->flock.l_whence;
+ flock.l_start = fop->flock.l_start * ec->fragments;
+ flock.l_len = fop->flock.l_len * ec->fragments;
+ flock.l_pid = 0;
+ flock.l_owner.len = 0;
+
+ if (fop->id == GF_FOP_INODELK) {
+ ec_inodelk(fop->frame, fop->xl,
+ &fop->frame->root->lk_owner, mask, 1,
+ ec_lock_unlocked, NULL, fop->str[0],
+ &fop->loc[0], F_SETLK, &flock,
+ fop->xdata);
+ } else {
+ ec_finodelk(fop->frame, fop->xl,
+ &fop->frame->root->lk_owner, mask, 1,
+ ec_lock_unlocked, NULL, fop->str[0],
+ fop->fd, F_SETLK, &flock, fop->xdata);
+ }
+ }
+ if (fop->error < 0) {
+ fop->error = 0;
+
+ fop->int32 = F_SETLKW;
+
+ ec_dispatch_inc(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+ }
+ }
+ } else {
+ ec_fop_prepare_answer(fop, _gf_true);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->id == GF_FOP_INODELK) {
+ if (fop->cbks.inodelk != NULL) {
+ fop->cbks.inodelk(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, cbk->xdata);
+ }
+ } else {
+ if (fop->cbks.finodelk != NULL) {
+ fop->cbks.finodelk(fop->req_frame, fop, fop->xl,
+ cbk->op_ret, cbk->op_errno, cbk->xdata);
+ }
+ }
+
+ return EC_STATE_END;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->id == GF_FOP_INODELK) {
+ if (fop->cbks.inodelk != NULL) {
+ fop->cbks.inodelk(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL);
+ }
+ } else {
+ if (fop->cbks.finodelk != NULL) {
+ fop->cbks.finodelk(fop->req_frame, fop, fop->xl, -1,
+ fop->error, NULL);
+ }
+ }
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
+ uintptr_t target, uint32_t fop_flags, fop_inodelk_cbk_t func,
+ void *data, const char *volume, loc_t *loc, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ ec_cbk_t callback = {.inodelk = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(INODELK) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_INODELK, 0, target,
+ fop_flags, ec_wind_inodelk, ec_manager_inodelk,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->int32 = cmd;
+ ec_owner_copy(fop->frame, owner);
+
+ if (volume != NULL) {
+ fop->str[0] = gf_strdup(volume);
+ if (fop->str[0] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (loc != NULL) {
+ if (loc_copy(&fop->loc[0], loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+
+ goto out;
+ }
+ }
+ if (flock != NULL) {
+ fop->flock.l_type = flock->l_type;
+ fop->flock.l_whence = flock->l_whence;
+ fop->flock.l_start = flock->l_start;
+ fop->flock.l_len = flock->l_len;
+ fop->flock.l_pid = flock->l_pid;
+ fop->flock.l_owner.len = flock->l_owner.len;
+ if (flock->l_owner.len > 0) {
+ memcpy(fop->flock.l_owner.data, flock->l_owner.data,
+ flock->l_owner.len);
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/* FOP: finodelk */
+
+int32_t
+ec_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FINODELK, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, NULL);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_finodelk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_finodelk_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->finodelk,
+ fop->str[0], fop->fd, fop->int32, &fop->flock,
+ fop->xdata);
+}
+
+void
+ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
+ uintptr_t target, uint32_t fop_flags, fop_finodelk_cbk_t func,
+ void *data, const char *volume, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ ec_cbk_t callback = {.finodelk = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(FINODELK) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_FINODELK, 0, target,
+ fop_flags, ec_wind_finodelk, ec_manager_inodelk,
+ callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->int32 = cmd;
+ ec_owner_copy(fop->frame, owner);
+
+ if (volume != NULL) {
+ fop->str[0] = gf_strdup(volume);
+ if (fop->str[0] == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to duplicate a string.");
+
+ goto out;
+ }
+ }
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (flock != NULL) {
+ fop->flock.l_type = flock->l_type;
+ fop->flock.l_whence = flock->l_whence;
+ fop->flock.l_start = flock->l_start;
+ fop->flock.l_len = flock->l_len;
+ fop->flock.l_pid = flock->l_pid;
+ fop->flock.l_owner.len = flock->l_owner.len;
+ if (flock->l_owner.len > 0) {
+ memcpy(fop->flock.l_owner.data, flock->l_owner.data,
+ flock->l_owner.len);
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL);
+ }
+}
+
+/* FOP: lk */
+
+int32_t
+ec_combine_lk(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+ if (!ec_flock_compare(&dst->flock, &src->flock)) {
+ gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_LOCK_MISMATCH,
+ "Mismatching lock in "
+ "answers of 'GF_FOP_LK'");
+
+ return 0;
+ }
+
+ return 1;
+}
+
+int32_t
+ec_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *flock, dict_t *xdata)
+{
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = frame->local;
+
+ ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+ op_ret, op_errno);
+
+ cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_LK, idx, op_ret,
+ op_errno);
+ if (cbk != NULL) {
+ if (op_ret >= 0) {
+ if (flock != NULL) {
+ cbk->flock.l_type = flock->l_type;
+ cbk->flock.l_whence = flock->l_whence;
+ cbk->flock.l_start = flock->l_start;
+ cbk->flock.l_len = flock->l_len;
+ cbk->flock.l_pid = flock->l_pid;
+ cbk->flock.l_owner.len = flock->l_owner.len;
+ if (flock->l_owner.len > 0) {
+ memcpy(cbk->flock.l_owner.data, flock->l_owner.data,
+ flock->l_owner.len);
+ }
+ }
+ }
+ if (xdata != NULL) {
+ cbk->xdata = dict_ref(xdata);
+ if (cbk->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ ec_combine(cbk, ec_combine_lk);
+ }
+
+out:
+ if (fop != NULL) {
+ ec_complete(fop);
+ }
+
+ return 0;
+}
+
+void
+ec_wind_lk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+ ec_trace("WIND", fop, "idx=%d", idx);
+
+ STACK_WIND_COOKIE(fop->frame, ec_lk_cbk, (void *)(uintptr_t)idx,
+ ec->xl_list[idx], ec->xl_list[idx]->fops->lk, fop->fd,
+ fop->int32, &fop->flock, fop->xdata);
+}
+
+int32_t
+ec_manager_lk(ec_fop_data_t *fop, int32_t state)
+{
+ ec_cbk_data_t *cbk;
+
+ switch (state) {
+ case EC_STATE_INIT:
+ if ((fop->int32 == F_SETLKW) && (fop->flock.l_type != F_UNLCK)) {
+ fop->uint32 = EC_LOCK_MODE_ALL;
+ fop->int32 = F_SETLK;
+ }
+
+ /* Fall through */
+
+ case EC_STATE_DISPATCH:
+ ec_dispatch_all(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+
+ case EC_STATE_PREPARE_ANSWER:
+ case -EC_STATE_PREPARE_ANSWER:
+ if (fop->flock.l_type != F_UNLCK) {
+ uintptr_t mask;
+
+ ec_fop_set_error(fop, ec_lock_check(fop, &mask));
+ if (fop->error != 0) {
+ if (mask != 0) {
+ struct gf_flock flock = {0};
+
+ flock.l_type = F_UNLCK;
+ flock.l_whence = fop->flock.l_whence;
+ flock.l_start = fop->flock.l_start;
+ flock.l_len = fop->flock.l_len;
+ flock.l_pid = fop->flock.l_pid;
+ lk_owner_copy(&flock.l_owner, &fop->flock.l_owner);
+
+ ec_lk(fop->frame, fop->xl, mask, 1, ec_lock_lk_unlocked,
+ NULL, fop->fd, F_SETLK, &flock, fop->xdata);
+ }
+
+ if (fop->error < 0) {
+ fop->error = 0;
+
+ fop->int32 = F_SETLKW;
+
+ ec_dispatch_inc(fop);
+
+ return EC_STATE_PREPARE_ANSWER;
+ }
+ }
+ } else {
+ ec_fop_prepare_answer(fop, _gf_true);
+ }
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ cbk = fop->answer;
+
+ GF_ASSERT(cbk != NULL);
+
+ if (fop->cbks.lk != NULL) {
+ fop->cbks.lk(fop->req_frame, fop, fop->xl, cbk->op_ret,
+ cbk->op_errno, &cbk->flock, cbk->xdata);
+ }
+
+ return EC_STATE_END;
+
+ case -EC_STATE_INIT:
+ case -EC_STATE_DISPATCH:
+ case -EC_STATE_REPORT:
+ GF_ASSERT(fop->error != 0);
+
+ if (fop->cbks.lk != NULL) {
+ fop->cbks.lk(fop->req_frame, fop, fop->xl, -1, fop->error, NULL,
+ NULL);
+ }
+
+ return EC_STATE_END;
+
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
+ }
+}
+
+void
+ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags,
+ fop_lk_cbk_t func, void *data, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ ec_cbk_t callback = {.lk = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(LK) %p", frame);
+
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, GF_FOP_LK, 0, target, fop_flags,
+ ec_wind_lk, ec_manager_lk, callback, data);
+ if (fop == NULL) {
+ goto out;
+ }
+
+ fop->use_fd = 1;
+
+ fop->int32 = cmd;
+
+ if (fd != NULL) {
+ fop->fd = fd_ref(fd);
+ if (fop->fd == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to reference a "
+ "file descriptor.");
+
+ goto out;
+ }
+ }
+ if (flock != NULL) {
+ fop->flock.l_type = flock->l_type;
+ fop->flock.l_whence = flock->l_whence;
+ fop->flock.l_start = flock->l_start;
+ fop->flock.l_len = flock->l_len;
+ fop->flock.l_pid = flock->l_pid;
+ fop->flock.l_owner.len = flock->l_owner.len;
+ if (flock->l_owner.len > 0) {
+ memcpy(fop->flock.l_owner.data, flock->l_owner.data,
+ flock->l_owner.len);
+ }
+ }
+ if (xdata != NULL) {
+ fop->xdata = dict_ref(xdata);
+ if (fop->xdata == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+ "Failed to reference a "
+ "dictionary.");
+
+ goto out;
+ }
+ }
+
+ error = 0;
+
+out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, NULL, this, -1, error, NULL, NULL);
+ }
+}
diff --git a/xlators/cluster/ec/src/ec-mem-types.h b/xlators/cluster/ec/src/ec-mem-types.h
new file mode 100644
index 00000000000..3252c4c1c58
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-mem-types.h
@@ -0,0 +1,30 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_MEM_TYPES_H__
+#define __EC_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_ec_mem_types_ {
+ ec_mt_ec_t = gf_common_mt_end + 1,
+ ec_mt_xlator_t,
+ ec_mt_ec_inode_t,
+ ec_mt_ec_fd_t,
+ ec_mt_subvol_healer_t,
+ ec_mt_ec_gf_t,
+ ec_mt_ec_code_t,
+ ec_mt_ec_code_builder_t,
+ ec_mt_ec_matrix_t,
+ ec_mt_ec_stripe_t,
+ ec_mt_end
+};
+
+#endif /* __EC_MEM_TYPES_H__ */
diff --git a/xlators/cluster/ec/src/ec-messages.h b/xlators/cluster/ec/src/ec-messages.h
new file mode 100644
index 00000000000..72e98f11286
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-messages.h
@@ -0,0 +1,61 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _EC_MESSAGES_H_
+#define _EC_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(EC, EC_MSG_INVALID_CONFIG, EC_MSG_HEAL_FAIL,
+ EC_MSG_DICT_COMBINE_FAIL, EC_MSG_STIME_COMBINE_FAIL,
+ EC_MSG_INVALID_DICT_NUMS, EC_MSG_IATT_COMBINE_FAIL,
+ EC_MSG_INVALID_FORMAT, EC_MSG_DICT_GET_FAILED,
+ EC_MSG_UNHANDLED_STATE, EC_MSG_FILE_DESC_REF_FAIL,
+ EC_MSG_LOC_COPY_FAIL, EC_MSG_BUF_REF_FAIL, EC_MSG_DICT_REF_FAIL,
+ EC_MSG_LK_UNLOCK_FAILED, EC_MSG_UNLOCK_FAILED,
+ EC_MSG_LOC_PARENT_INODE_MISSING, EC_MSG_INVALID_LOC_NAME,
+ EC_MSG_NO_MEMORY, EC_MSG_GFID_MISMATCH, EC_MSG_UNSUPPORTED_VERSION,
+ EC_MSG_FD_CREATE_FAIL, EC_MSG_READDIRP_REQ_PREP_FAIL,
+ EC_MSG_LOOKUP_REQ_PREP_FAIL, EC_MSG_INODE_REF_FAIL,
+ EC_MSG_LOOKUP_READAHEAD_FAIL, EC_MSG_FRAME_MISMATCH,
+ EC_MSG_XLATOR_MISMATCH, EC_MSG_VECTOR_MISMATCH, EC_MSG_IATT_MISMATCH,
+ EC_MSG_FD_MISMATCH, EC_MSG_DICT_MISMATCH, EC_MSG_INDEX_DIR_GET_FAIL,
+ EC_MSG_PREOP_LOCK_FAILED, EC_MSG_CHILDS_INSUFFICIENT,
+ EC_MSG_OP_EXEC_UNAVAIL, EC_MSG_UNLOCK_DELAY_FAILED,
+ EC_MSG_SIZE_VERS_UPDATE_FAIL, EC_MSG_INVALID_REQUEST,
+ EC_MSG_INVALID_LOCK_TYPE, EC_MSG_SIZE_VERS_GET_FAIL,
+ EC_MSG_FILE_SIZE_GET_FAIL, EC_MSG_FOP_MISMATCH,
+ EC_MSG_SUBVOL_ID_DICT_SET_FAIL, EC_MSG_SUBVOL_BUILD_FAIL,
+ EC_MSG_XLATOR_INIT_FAIL, EC_MSG_NO_PARENTS, EC_MSG_TIMER_CREATE_FAIL,
+ EC_MSG_TOO_MANY_SUBVOLS, EC_MSG_DATA_UNAVAILABLE,
+ EC_MSG_INODE_REMOVE_FAIL, EC_MSG_INVALID_REDUNDANCY,
+ EC_MSG_XLATOR_PARSE_OPT_FAIL, EC_MSG_OP_FAIL_ON_SUBVOLS,
+ EC_MSG_INVALID_INODE, EC_MSG_LOCK_MISMATCH, EC_MSG_XDATA_MISMATCH,
+ EC_MSG_HEALING_INFO, EC_MSG_HEAL_SUCCESS, EC_MSG_FULL_SWEEP_START,
+ EC_MSG_FULL_SWEEP_STOP, EC_MSG_INVALID_FOP, EC_MSG_EC_UP,
+ EC_MSG_EC_DOWN, EC_MSG_SIZE_XATTR_GET_FAIL,
+ EC_MSG_VER_XATTR_GET_FAIL, EC_MSG_CONFIG_XATTR_GET_FAIL,
+ EC_MSG_CONFIG_XATTR_INVALID, EC_MSG_EXTENSION, EC_MSG_EXTENSION_NONE,
+ EC_MSG_EXTENSION_UNKNOWN, EC_MSG_EXTENSION_UNSUPPORTED,
+ EC_MSG_EXTENSION_FAILED, EC_MSG_NO_GF, EC_MSG_MATRIX_FAILED,
+ EC_MSG_DYN_CREATE_FAILED, EC_MSG_DYN_CODEGEN_FAILED,
+ EC_MSG_THREAD_CLEANUP_FAILED, EC_MSG_FD_BAD);
+
+#endif /* !_EC_MESSAGES_H_ */
diff --git a/xlators/cluster/ec/src/ec-method.c b/xlators/cluster/ec/src/ec-method.c
new file mode 100644
index 00000000000..55faed0b193
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-method.c
@@ -0,0 +1,433 @@
+/*
+ Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <string.h>
+#include <inttypes.h>
+
+#include "ec-types.h"
+#include "ec-mem-types.h"
+#include "ec-galois.h"
+#include "ec-code.h"
+#include "ec-method.h"
+#include "ec-helpers.h"
+
+static void
+ec_method_matrix_normal(ec_gf_t *gf, uint32_t *matrix, uint32_t columns,
+ uint32_t *values, uint32_t count)
+{
+ uint32_t i, j, v, tmp;
+
+ columns--;
+ for (i = 0; i < count; i++) {
+ v = *values++;
+ *matrix++ = tmp = ec_gf_exp(gf, v, columns);
+ for (j = 0; j < columns; j++) {
+ *matrix++ = tmp = ec_gf_div(gf, tmp, v);
+ }
+ }
+}
+
+static void
+ec_method_matrix_inverse(ec_gf_t *gf, uint32_t *matrix, uint32_t *values,
+ uint32_t count)
+{
+ uint32_t a[count];
+ uint32_t i, j, p, last, tmp;
+
+ last = count - 1;
+ for (i = 0; i < last; i++) {
+ a[i] = 1;
+ }
+ a[i] = values[0];
+ for (i = last; i > 0; i--) {
+ for (j = i - 1; j < last; j++) {
+ a[j] = a[j + 1] ^ ec_gf_mul(gf, values[i], a[j]);
+ }
+ a[j] = ec_gf_mul(gf, values[i], a[j]);
+ }
+ for (i = 0; i < count; i++) {
+ p = a[0];
+ matrix += count;
+ *matrix = tmp = p ^ values[i];
+ for (j = 1; j < last; j++) {
+ matrix += count;
+ *matrix = tmp = a[j] ^ ec_gf_mul(gf, values[i], tmp);
+ p = tmp ^ ec_gf_mul(gf, values[i], p);
+ }
+ for (j = 0; j < last; j++) {
+ *matrix = ec_gf_div(gf, *matrix, p);
+ matrix -= count;
+ }
+ *matrix = ec_gf_div(gf, 1, p);
+ matrix++;
+ }
+}
+
+static void
+ec_method_matrix_init(ec_matrix_list_t *list, ec_matrix_t *matrix,
+ uintptr_t mask, uint32_t *rows, gf_boolean_t inverse)
+{
+ uint32_t i;
+
+ matrix->refs = 1;
+ matrix->mask = mask;
+ matrix->code = list->code;
+ matrix->columns = list->columns;
+ INIT_LIST_HEAD(&matrix->lru);
+
+ if (inverse) {
+ matrix->rows = list->columns;
+ ec_method_matrix_inverse(matrix->code->gf, matrix->values, rows,
+ matrix->rows);
+ for (i = 0; i < matrix->rows; i++) {
+ matrix->row_data[i].values = matrix->values + i * matrix->columns;
+ matrix->row_data[i].func.interleaved = ec_code_build_interleaved(
+ matrix->code, EC_METHOD_WORD_SIZE, matrix->row_data[i].values,
+ matrix->columns);
+ }
+ } else {
+ matrix->rows = list->rows;
+ ec_method_matrix_normal(matrix->code->gf, matrix->values,
+ matrix->columns, rows, matrix->rows);
+ for (i = 0; i < matrix->rows; i++) {
+ matrix->row_data[i].values = matrix->values + i * matrix->columns;
+ matrix->row_data[i].func.linear = ec_code_build_linear(
+ matrix->code, EC_METHOD_WORD_SIZE, matrix->row_data[i].values,
+ matrix->columns);
+ }
+ }
+}
+
+static void
+ec_method_matrix_release(ec_matrix_t *matrix)
+{
+ uint32_t i;
+
+ for (i = 0; i < matrix->rows; i++) {
+ if (matrix->row_data[i].func.linear != NULL) {
+ ec_code_release(matrix->code, &matrix->row_data[i].func);
+ matrix->row_data[i].func.linear = NULL;
+ }
+ }
+}
+
+static void
+ec_method_matrix_destroy(ec_matrix_list_t *list, ec_matrix_t *matrix)
+{
+ list_del_init(&matrix->lru);
+
+ ec_method_matrix_release(matrix);
+
+ mem_put(matrix);
+
+ list->count--;
+}
+
+static void
+ec_method_matrix_unref(ec_matrix_list_t *list, ec_matrix_t *matrix)
+{
+ if (--matrix->refs == 0) {
+ list_add_tail(&matrix->lru, &list->lru);
+ if (list->count > list->max) {
+ matrix = list_first_entry(&list->lru, ec_matrix_t, lru);
+ ec_method_matrix_destroy(list, matrix);
+ }
+ }
+}
+
+static ec_matrix_t *
+ec_method_matrix_lookup(ec_matrix_list_t *list, uintptr_t mask, uint32_t *pos)
+{
+ ec_matrix_t *matrix;
+ uint32_t i, j, k;
+
+ i = 0;
+ j = list->count;
+ while (i < j) {
+ k = (i + j) >> 1;
+ matrix = list->objects[k];
+ if (matrix->mask == mask) {
+ *pos = k;
+ return matrix;
+ }
+ if (matrix->mask < mask) {
+ i = k + 1;
+ } else {
+ j = k;
+ }
+ }
+ *pos = i;
+
+ return NULL;
+}
+
+static void
+ec_method_matrix_remove(ec_matrix_list_t *list, uintptr_t mask)
+{
+ uint32_t pos;
+
+ if (ec_method_matrix_lookup(list, mask, &pos) != NULL) {
+ list->count--;
+ if (pos < list->count) {
+ memmove(list->objects + pos, list->objects + pos + 1,
+ sizeof(ec_matrix_t *) * (list->count - pos));
+ }
+ }
+}
+
+static void
+ec_method_matrix_insert(ec_matrix_list_t *list, ec_matrix_t *matrix)
+{
+ uint32_t pos;
+
+ GF_ASSERT(ec_method_matrix_lookup(list, matrix->mask, &pos) == NULL);
+
+ if (pos < list->count) {
+ memmove(list->objects + pos + 1, list->objects + pos,
+ sizeof(ec_matrix_t *) * (list->count - pos));
+ }
+ list->objects[pos] = matrix;
+ list->count++;
+}
+
+static ec_matrix_t *
+ec_method_matrix_get(ec_matrix_list_t *list, uintptr_t mask, uint32_t *rows)
+{
+ ec_matrix_t *matrix;
+ uint32_t pos;
+
+ LOCK(&list->lock);
+
+ matrix = ec_method_matrix_lookup(list, mask, &pos);
+ if (matrix != NULL) {
+ list_del_init(&matrix->lru);
+ matrix->refs++;
+
+ goto out;
+ }
+
+ if ((list->count >= list->max) && !list_empty(&list->lru)) {
+ matrix = list_first_entry(&list->lru, ec_matrix_t, lru);
+ list_del_init(&matrix->lru);
+
+ ec_method_matrix_remove(list, matrix->mask);
+
+ ec_method_matrix_release(matrix);
+ } else {
+ matrix = mem_get0(list->pool);
+ if (matrix == NULL) {
+ matrix = EC_ERR(ENOMEM);
+ goto out;
+ }
+ matrix->values = (uint32_t *)((uintptr_t)matrix + sizeof(ec_matrix_t) +
+ sizeof(ec_matrix_row_t) * list->columns);
+ }
+
+ ec_method_matrix_init(list, matrix, mask, rows, _gf_true);
+
+ if (list->count < list->max) {
+ ec_method_matrix_insert(list, matrix);
+ } else {
+ matrix->mask = 0;
+ }
+
+out:
+ UNLOCK(&list->lock);
+
+ return matrix;
+}
+
+static void
+ec_method_matrix_put(ec_matrix_list_t *list, ec_matrix_t *matrix)
+{
+ LOCK(&list->lock);
+
+ ec_method_matrix_unref(list, matrix);
+
+ UNLOCK(&list->lock);
+}
+
+static int32_t
+ec_method_setup(xlator_t *xl, ec_matrix_list_t *list, const char *gen)
+{
+ ec_matrix_t *matrix;
+ uint32_t values[list->rows];
+ uint32_t i;
+ int32_t err;
+
+ matrix = GF_MALLOC(sizeof(ec_matrix_t) +
+ sizeof(ec_matrix_row_t) * list->rows +
+ sizeof(uint32_t) * list->columns * list->rows,
+ ec_mt_ec_matrix_t);
+ if (matrix == NULL) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ memset(matrix, 0, sizeof(ec_matrix_t));
+ matrix->values = (uint32_t *)((uintptr_t)matrix + sizeof(ec_matrix_t) +
+ sizeof(ec_matrix_row_t) * list->rows);
+
+ list->code = ec_code_create(list->gf, ec_code_detect(xl, gen));
+ if (EC_IS_ERR(list->code)) {
+ err = EC_GET_ERR(list->code);
+ list->code = NULL;
+ goto failed_matrix;
+ }
+
+ for (i = 0; i < list->rows; i++) {
+ values[i] = i + 1;
+ }
+ ec_method_matrix_init(list, matrix, 0, values, _gf_false);
+
+ list->encode = matrix;
+
+ return 0;
+
+failed_matrix:
+ GF_FREE(matrix);
+failed:
+ return err;
+}
+
+int32_t
+ec_method_init(xlator_t *xl, ec_matrix_list_t *list, uint32_t columns,
+ uint32_t rows, uint32_t max, const char *gen)
+{
+ list->columns = columns;
+ list->rows = rows;
+ list->max = max;
+ list->stripe = EC_METHOD_CHUNK_SIZE * list->columns;
+ INIT_LIST_HEAD(&list->lru);
+ int32_t err;
+
+ list->pool = mem_pool_new_fn(xl->ctx,
+ sizeof(ec_matrix_t) +
+ sizeof(ec_matrix_row_t) * columns +
+ sizeof(uint32_t) * columns * columns,
+ 128, "ec_matrix_t");
+ if (list->pool == NULL) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ list->objects = GF_MALLOC(sizeof(ec_matrix_t *) * max, ec_mt_ec_matrix_t);
+ if (list->objects == NULL) {
+ err = -ENOMEM;
+ goto failed_pool;
+ }
+
+ list->gf = ec_gf_prepare(EC_GF_BITS, EC_GF_MOD);
+ if (EC_IS_ERR(list->gf)) {
+ err = EC_GET_ERR(list->gf);
+ goto failed_objects;
+ }
+
+ err = ec_method_setup(xl, list, gen);
+ if (err != 0) {
+ goto failed_gf;
+ }
+
+ LOCK_INIT(&list->lock);
+
+ return 0;
+
+failed_gf:
+ ec_gf_destroy(list->gf);
+failed_objects:
+ GF_FREE(list->objects);
+failed_pool:
+ mem_pool_destroy(list->pool);
+failed:
+ list->pool = NULL;
+ list->objects = NULL;
+ list->gf = NULL;
+
+ return err;
+}
+
+void
+ec_method_fini(ec_matrix_list_t *list)
+{
+ ec_matrix_t *matrix;
+
+ if (list->encode == NULL) {
+ return;
+ }
+
+ while (!list_empty(&list->lru)) {
+ matrix = list_first_entry(&list->lru, ec_matrix_t, lru);
+ ec_method_matrix_destroy(list, matrix);
+ }
+
+ GF_ASSERT(list->count == 0);
+
+ if (list->pool) /*Init was successful*/
+ LOCK_DESTROY(&list->lock);
+
+ ec_method_matrix_release(list->encode);
+ GF_FREE(list->encode);
+
+ ec_code_destroy(list->code);
+ ec_gf_destroy(list->gf);
+ GF_FREE(list->objects);
+
+ if (list->pool)
+ mem_pool_destroy(list->pool);
+}
+
+int32_t
+ec_method_update(xlator_t *xl, ec_matrix_list_t *list, const char *gen)
+{
+ /* TODO: Allow changing code generator */
+
+ return 0;
+}
+
+void
+ec_method_encode(ec_matrix_list_t *list, uint64_t size, void *in, void **out)
+{
+ ec_matrix_t *matrix;
+ uint64_t pos;
+ uint32_t i;
+
+ matrix = list->encode;
+ for (pos = 0; pos < size; pos += list->stripe) {
+ for (i = 0; i < matrix->rows; i++) {
+ matrix->row_data[i].func.linear(
+ out[i], in, pos, matrix->row_data[i].values, list->columns);
+ out[i] += EC_METHOD_CHUNK_SIZE;
+ }
+ }
+}
+
+int32_t
+ec_method_decode(ec_matrix_list_t *list, uint64_t size, uintptr_t mask,
+ uint32_t *rows, void **in, void *out)
+{
+ ec_matrix_t *matrix;
+ uint64_t pos;
+ uint32_t i;
+
+ matrix = ec_method_matrix_get(list, mask, rows);
+ if (EC_IS_ERR(matrix)) {
+ return EC_GET_ERR(matrix);
+ }
+ for (pos = 0; pos < size; pos += EC_METHOD_CHUNK_SIZE) {
+ for (i = 0; i < matrix->rows; i++) {
+ matrix->row_data[i].func.interleaved(
+ out, in, pos, matrix->row_data[i].values, list->columns);
+ out += EC_METHOD_CHUNK_SIZE;
+ }
+ }
+
+ ec_method_matrix_put(list, matrix);
+
+ return 0;
+}
diff --git a/xlators/cluster/ec/src/ec-method.h b/xlators/cluster/ec/src/ec-method.h
new file mode 100644
index 00000000000..f91233b2f88
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-method.h
@@ -0,0 +1,48 @@
+/*
+ Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_METHOD_H__
+#define __EC_METHOD_H__
+
+#include "ec-types.h"
+#include "ec-galois.h"
+
+#define EC_GF_BITS 8
+#define EC_GF_MOD 0x11D
+
+#define EC_GF_SIZE (1 << EC_GF_BITS)
+
+/* Determines the maximum size of the matrix used to encode/decode data */
+#define EC_METHOD_MAX_FRAGMENTS 16
+/* Determines the maximum number of usable elements in the Galois Field */
+#define EC_METHOD_MAX_NODES (EC_GF_SIZE - 1)
+
+#define EC_METHOD_WORD_SIZE 64
+
+#define EC_METHOD_CHUNK_SIZE (EC_METHOD_WORD_SIZE * EC_GF_BITS)
+
+int32_t
+ec_method_init(xlator_t *xl, ec_matrix_list_t *list, uint32_t columns,
+ uint32_t rows, uint32_t max, const char *gen);
+
+void
+ec_method_fini(ec_matrix_list_t *list);
+
+int32_t
+ec_method_update(xlator_t *xl, ec_matrix_list_t *list, const char *gen);
+
+void
+ec_method_encode(ec_matrix_list_t *list, uint64_t size, void *in, void **out);
+
+int32_t
+ec_method_decode(ec_matrix_list_t *list, uint64_t size, uintptr_t mask,
+ uint32_t *rows, void **in, void *out);
+
+#endif /* __EC_METHOD_H__ */
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
new file mode 100644
index 00000000000..de9b89bb2c9
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -0,0 +1,690 @@
+/*
+ Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_TYPES_H__
+#define __EC_TYPES_H__
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/timer.h>
+#include "libxlator.h"
+#include <glusterfs/atomic.h>
+
+#define EC_GF_MAX_REGS 16
+
+enum _ec_heal_need;
+typedef enum _ec_heal_need ec_heal_need_t;
+
+enum _ec_stripe_part;
+typedef enum _ec_stripe_part ec_stripe_part_t;
+
+enum _ec_read_policy;
+typedef enum _ec_read_policy ec_read_policy_t;
+
+struct _ec_config;
+typedef struct _ec_config ec_config_t;
+
+struct _ec_fd;
+typedef struct _ec_fd ec_fd_t;
+
+struct _ec_fragment_range;
+typedef struct _ec_fragment_range ec_fragment_range_t;
+
+struct _ec_inode;
+typedef struct _ec_inode ec_inode_t;
+
+union _ec_cbk;
+typedef union _ec_cbk ec_cbk_t;
+
+struct _ec_lock;
+typedef struct _ec_lock ec_lock_t;
+
+struct _ec_lock_link;
+typedef struct _ec_lock_link ec_lock_link_t;
+
+struct _ec_fop_data;
+typedef struct _ec_fop_data ec_fop_data_t;
+
+struct _ec_cbk_data;
+typedef struct _ec_cbk_data ec_cbk_data_t;
+
+enum _ec_gf_opcode;
+typedef enum _ec_gf_opcode ec_gf_opcode_t;
+
+struct _ec_gf_op;
+typedef struct _ec_gf_op ec_gf_op_t;
+
+struct _ec_gf_mul;
+typedef struct _ec_gf_mul ec_gf_mul_t;
+
+struct _ec_gf;
+typedef struct _ec_gf ec_gf_t;
+
+struct _ec_code_gen;
+typedef struct _ec_code_gen ec_code_gen_t;
+
+struct _ec_code;
+typedef struct _ec_code ec_code_t;
+
+struct _ec_code_arg;
+typedef struct _ec_code_arg ec_code_arg_t;
+
+struct _ec_code_op;
+typedef struct _ec_code_op ec_code_op_t;
+
+struct _ec_code_builder;
+typedef struct _ec_code_builder ec_code_builder_t;
+
+struct _ec_code_chunk;
+typedef struct _ec_code_chunk ec_code_chunk_t;
+
+struct _ec_stripe;
+typedef struct _ec_stripe ec_stripe_t;
+
+struct _ec_stripe_list;
+typedef struct _ec_stripe_list ec_stripe_list_t;
+
+struct _ec_code_space;
+typedef struct _ec_code_space ec_code_space_t;
+
+typedef void (*ec_code_func_linear_t)(void *dst, void *src, uint64_t offset,
+ uint32_t *values, uint32_t count);
+
+typedef void (*ec_code_func_interleaved_t)(void *dst, void **src,
+ uint64_t offset, uint32_t *values,
+ uint32_t count);
+
+union _ec_code_func;
+typedef union _ec_code_func ec_code_func_t;
+
+struct _ec_matrix_row;
+typedef struct _ec_matrix_row ec_matrix_row_t;
+
+struct _ec_matrix;
+typedef struct _ec_matrix ec_matrix_t;
+
+struct _ec_matrix_list;
+typedef struct _ec_matrix_list ec_matrix_list_t;
+
+struct _ec_heal;
+typedef struct _ec_heal ec_heal_t;
+
+struct _ec_self_heald;
+typedef struct _ec_self_heald ec_self_heald_t;
+
+struct _ec_statistics;
+typedef struct _ec_statistics ec_statistics_t;
+
+struct _ec;
+typedef struct _ec ec_t;
+
+typedef void (*ec_wind_f)(ec_t *, ec_fop_data_t *, int32_t);
+typedef int32_t (*ec_handler_f)(ec_fop_data_t *, int32_t);
+typedef void (*ec_resume_f)(ec_fop_data_t *, int32_t);
+
+enum _ec_read_policy { EC_ROUND_ROBIN, EC_GFID_HASH, EC_READ_POLICY_MAX };
+
+enum _ec_heal_need {
+ EC_HEAL_NONEED,
+ EC_HEAL_MAYBE,
+ EC_HEAL_MUST,
+ EC_HEAL_PURGE_INDEX
+};
+
+enum _ec_stripe_part { EC_STRIPE_HEAD, EC_STRIPE_TAIL };
+
+/* Enumartions to indicate FD status. */
+typedef enum { EC_FD_NOT_OPENED, EC_FD_OPENED, EC_FD_OPENING } ec_fd_status_t;
+
+struct _ec_config {
+ uint32_t version;
+ uint8_t algorithm;
+ uint8_t gf_word_size;
+ uint8_t bricks;
+ uint8_t redundancy;
+ uint32_t chunk_size;
+};
+
+struct _ec_fd {
+ loc_t loc;
+ uintptr_t open;
+ int32_t flags;
+ uint64_t bad_version;
+ ec_fd_status_t fd_status[0];
+};
+
+struct _ec_stripe {
+ struct list_head lru; /* LRU list member */
+ uint64_t frag_offset; /* Fragment offset of this stripe */
+ char data[]; /* Contents of the stripe */
+};
+
+struct _ec_stripe_list {
+ struct list_head lru;
+ uint32_t count;
+ uint32_t max;
+};
+
+struct _ec_inode {
+ ec_lock_t *inode_lock;
+ gf_boolean_t have_info;
+ gf_boolean_t have_config;
+ gf_boolean_t have_version;
+ gf_boolean_t have_size;
+ int32_t heal_count;
+ ec_config_t config;
+ uint64_t pre_version[2];
+ uint64_t post_version[2];
+ uint64_t pre_size;
+ uint64_t post_size;
+ uint64_t dirty[2];
+ struct list_head heal;
+ ec_stripe_list_t stripe_cache;
+ uint64_t bad_version;
+};
+
+typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
+ int32_t, uintptr_t, uintptr_t, uintptr_t,
+ uint32_t, dict_t *);
+typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
+ int32_t, uintptr_t, uintptr_t, uintptr_t,
+ uint32_t, dict_t *);
+
+union _ec_cbk {
+ fop_access_cbk_t access;
+ fop_create_cbk_t create;
+ fop_discard_cbk_t discard;
+ fop_entrylk_cbk_t entrylk;
+ fop_fentrylk_cbk_t fentrylk;
+ fop_fallocate_cbk_t fallocate;
+ fop_flush_cbk_t flush;
+ fop_fsync_cbk_t fsync;
+ fop_fsyncdir_cbk_t fsyncdir;
+ fop_getxattr_cbk_t getxattr;
+ fop_fgetxattr_cbk_t fgetxattr;
+ fop_heal_cbk_t heal;
+ fop_fheal_cbk_t fheal;
+ fop_inodelk_cbk_t inodelk;
+ fop_finodelk_cbk_t finodelk;
+ fop_link_cbk_t link;
+ fop_lk_cbk_t lk;
+ fop_lookup_cbk_t lookup;
+ fop_mkdir_cbk_t mkdir;
+ fop_mknod_cbk_t mknod;
+ fop_open_cbk_t open;
+ fop_opendir_cbk_t opendir;
+ fop_readdir_cbk_t readdir;
+ fop_readdirp_cbk_t readdirp;
+ fop_readlink_cbk_t readlink;
+ fop_readv_cbk_t readv;
+ fop_removexattr_cbk_t removexattr;
+ fop_fremovexattr_cbk_t fremovexattr;
+ fop_rename_cbk_t rename;
+ fop_rmdir_cbk_t rmdir;
+ fop_setattr_cbk_t setattr;
+ fop_fsetattr_cbk_t fsetattr;
+ fop_setxattr_cbk_t setxattr;
+ fop_fsetxattr_cbk_t fsetxattr;
+ fop_stat_cbk_t stat;
+ fop_fstat_cbk_t fstat;
+ fop_statfs_cbk_t statfs;
+ fop_symlink_cbk_t symlink;
+ fop_truncate_cbk_t truncate;
+ fop_ftruncate_cbk_t ftruncate;
+ fop_unlink_cbk_t unlink;
+ fop_writev_cbk_t writev;
+ fop_xattrop_cbk_t xattrop;
+ fop_fxattrop_cbk_t fxattrop;
+ fop_zerofill_cbk_t zerofill;
+ fop_seek_cbk_t seek;
+ fop_ipc_cbk_t ipc;
+};
+
+struct _ec_lock {
+ ec_inode_t *ctx;
+ gf_timer_t *timer;
+
+ /* List of owners of this lock. All fops added to this list are running
+ * concurrently. */
+ struct list_head owners;
+
+ /* List of fops waiting to be an owner of the lock. Fops are added to this
+ * list when the current owner has an incompatible access (conflicting lock)
+ * or the lock is not acquired yet. */
+ struct list_head waiting;
+
+ /* List of fops that will wait until the next unlock/lock cycle. This
+ * happens when the currently acquired lock is decided to be released as
+ * soon as possible. In this case, all frozen fops will be continued only
+ * after the lock is reacquired. */
+ struct list_head frozen;
+
+ uintptr_t mask;
+ uintptr_t good_mask;
+ uintptr_t healing;
+ uint32_t refs_owners; /* Refs for fops owning the lock */
+ uint32_t refs_pending; /* Refs assigned to fops being prepared */
+ uint32_t waiting_flags; /*Track xattrop/dirty marking*/
+ gf_boolean_t acquired;
+ gf_boolean_t contention;
+ gf_boolean_t unlock_now;
+ gf_boolean_t release;
+ gf_boolean_t query;
+ fd_t *fd;
+ loc_t loc;
+ union {
+ entrylk_type type;
+ struct gf_flock flock;
+ };
+};
+
+struct _ec_lock_link {
+ ec_lock_t *lock;
+ ec_fop_data_t *fop;
+ struct list_head owner_list;
+ struct list_head wait_list;
+ gf_boolean_t update[2];
+ gf_boolean_t dirty[2];
+ gf_boolean_t optimistic_changelog;
+ loc_t *base;
+ uint64_t size;
+ uint32_t waiting_flags;
+ off_t fl_start;
+ off_t fl_end;
+};
+
+/* This structure keeps a range of fragment offsets affected by a fop. Since
+ * real file offsets can be difficult to handle correctly because of overflows,
+ * we use the 'scaled' offset, which corresponds to the offset of the fragment
+ * seen by the bricks, which is always smaller and cannot overflow. */
+struct _ec_fragment_range {
+ uint64_t first; /* Address of the first affected fragment as seen by the
+ bricks (offset on brick) */
+ uint64_t last; /* Address of the first non affected fragment as seen by
+ the bricks (offset on brick) */
+};
+
+/* EC xlator data structure to collect all the data required to perform
+ * the file operation.*/
+struct _ec_fop_data {
+ int32_t id; /* ID of the file operation */
+ int32_t refs;
+ int32_t state;
+ uint32_t minimum; /* Minimum number of successful
+ operation required to conclude a
+ fop as successful */
+ int32_t expected;
+ int32_t winds;
+ int32_t jobs;
+ int32_t error;
+ ec_fop_data_t *parent;
+ xlator_t *xl; /* points to EC xlator */
+ call_frame_t *req_frame; /* frame of the calling xlator */
+ call_frame_t *frame; /* frame used by this fop */
+ struct list_head cbk_list; /* sorted list of groups of answers */
+ struct list_head answer_list; /* list of answers */
+ struct list_head pending_list; /* member of ec_t.pending_fops */
+ ec_cbk_data_t *answer; /* accepted answer */
+ int32_t lock_count;
+ int32_t locked;
+ gf_lock_t lock;
+ ec_lock_link_t locks[2];
+ int32_t first_lock;
+
+ uint32_t fop_flags; /* Flags passed by the caller. */
+ uint32_t flags; /* Internal flags. */
+ uint32_t first;
+ uintptr_t mask;
+ uintptr_t healing; /*Dispatch is done but call is successful only
+ if fop->minimum number of subvolumes succeed
+ which are not healing*/
+ uintptr_t remaining;
+ uintptr_t received; /* Mask of responses */
+ uintptr_t good;
+
+ uid_t uid;
+ gid_t gid;
+
+ ec_wind_f wind; /* Function to wind to */
+ ec_handler_f handler; /* FOP manager function */
+ ec_resume_f resume;
+ ec_cbk_t cbks; /* Callback function for this FOP */
+ void *data;
+ ec_heal_t *heal;
+ struct list_head healer;
+
+ uint64_t user_size;
+ uint32_t head;
+
+ int32_t use_fd; /* Indicates whether this FOP uses FD or
+ not */
+
+ dict_t *xdata;
+ dict_t *dict;
+ int32_t int32;
+ uint32_t uint32;
+ uint64_t size;
+ off_t offset;
+ mode_t mode[2];
+ entrylk_cmd entrylk_cmd;
+ entrylk_type entrylk_type;
+ gf_xattrop_flags_t xattrop_flags;
+ dev_t dev;
+ inode_t *inode;
+ fd_t *fd; /* FD of the file on which FOP is
+ being carried upon */
+ struct iatt iatt;
+ char *str[2];
+ loc_t loc[2]; /* Holds the location details for
+ the file */
+ struct gf_flock flock;
+ struct iovec *vector;
+ struct iobref *buffers;
+ gf_seek_what_t seek;
+ ec_fragment_range_t frag_range; /* This will hold the range of stripes
+ affected by the fop. */
+ char *errstr; /*String of fop name, path and gfid
+ to be used in gf_msg. */
+};
+
+struct _ec_cbk_data {
+ struct list_head list; /* item in the sorted list of groups */
+ struct list_head answer_list; /* item in the list of answers */
+ ec_fop_data_t *fop;
+ ec_cbk_data_t *next; /* next answer in the same group */
+ uint32_t idx;
+ int32_t op_ret;
+ int32_t op_errno;
+ int32_t count;
+ uintptr_t mask;
+
+ dict_t *xdata;
+ dict_t *dict;
+ int32_t int32;
+ uintptr_t uintptr[3];
+ uint64_t size;
+ uint64_t version[2];
+ inode_t *inode;
+ fd_t *fd;
+ struct statvfs statvfs;
+ struct iatt iatt[5];
+ struct gf_flock flock;
+ struct iovec *vector;
+ struct iobref *buffers;
+ char *str;
+ gf_dirent_t entries;
+ off_t offset;
+ gf_seek_what_t what;
+};
+
+enum _ec_gf_opcode {
+ EC_GF_OP_LOAD,
+ EC_GF_OP_STORE,
+ EC_GF_OP_COPY,
+ EC_GF_OP_XOR2,
+ EC_GF_OP_XOR3,
+ EC_GF_OP_XORM,
+ EC_GF_OP_END
+};
+
+struct _ec_gf_op {
+ ec_gf_opcode_t op;
+ uint32_t arg1;
+ uint32_t arg2;
+ uint32_t arg3;
+};
+
+struct _ec_gf_mul {
+ uint32_t regs;
+ uint32_t map[EC_GF_MAX_REGS];
+ ec_gf_op_t *ops;
+};
+
+struct _ec_gf {
+ uint32_t bits;
+ uint32_t size;
+ uint32_t mod;
+ uint32_t min_ops;
+ uint32_t max_ops;
+ uint32_t avg_ops;
+ uint32_t *log;
+ uint32_t *pow;
+ ec_gf_mul_t **table;
+};
+
+struct _ec_code_gen {
+ char *name;
+ char **flags;
+ uint32_t width;
+
+ void (*prolog)(ec_code_builder_t *builder);
+ void (*epilog)(ec_code_builder_t *builder);
+ void (*load)(ec_code_builder_t *builder, uint32_t reg, uint32_t offset,
+ uint32_t bit);
+ void (*store)(ec_code_builder_t *builder, uint32_t reg, uint32_t bit);
+ void (*copy)(ec_code_builder_t *builder, uint32_t dst, uint32_t src);
+ void (*xor2)(ec_code_builder_t *builder, uint32_t dst, uint32_t src);
+ void (*xor3)(ec_code_builder_t *builder, uint32_t dst, uint32_t src1,
+ uint32_t src2);
+ void (*xorm)(ec_code_builder_t *builder, uint32_t dst, uint32_t offset,
+ uint32_t bit);
+};
+
+struct _ec_code {
+ gf_lock_t lock;
+ struct list_head spaces;
+ ec_gf_t *gf;
+ ec_code_gen_t *gen;
+};
+
+struct _ec_code_arg {
+ uint32_t value;
+};
+
+struct _ec_code_op {
+ ec_gf_opcode_t op;
+ ec_code_arg_t arg1;
+ ec_code_arg_t arg2;
+ ec_code_arg_t arg3;
+};
+
+struct _ec_code_builder {
+ ec_code_t *code;
+ uint64_t address;
+ uint8_t *data;
+ uint32_t size;
+ int32_t error;
+ uint32_t regs;
+ uint32_t bits;
+ uint32_t width;
+ uint32_t count;
+ uint32_t base;
+ uint32_t map[EC_GF_MAX_REGS];
+ gf_boolean_t linear;
+ uint64_t loop;
+ ec_code_op_t ops[0];
+};
+
+struct _ec_code_chunk {
+ struct list_head list;
+ size_t size;
+ ec_code_space_t *space;
+};
+
+struct _ec_code_space {
+ struct list_head list;
+ struct list_head chunks;
+ ec_code_t *code;
+ void *exec;
+ size_t size;
+};
+
+union _ec_code_func {
+ ec_code_func_linear_t linear;
+ ec_code_func_interleaved_t interleaved;
+};
+
+struct _ec_matrix_row {
+ ec_code_func_t func;
+ uint32_t *values;
+};
+
+struct _ec_matrix {
+ struct list_head lru;
+ uint32_t refs;
+ uint32_t columns;
+ uint32_t rows;
+ uintptr_t mask;
+ ec_code_t *code;
+ uint32_t *values;
+ ec_matrix_row_t row_data[0];
+};
+
+struct _ec_matrix_list {
+ struct list_head lru;
+ gf_lock_t lock;
+ uint32_t columns;
+ uint32_t rows;
+ uint32_t max;
+ uint32_t count;
+ uint32_t stripe;
+ struct mem_pool *pool;
+ ec_gf_t *gf;
+ ec_code_t *code;
+ ec_matrix_t *encode;
+ ec_matrix_t **objects;
+};
+
+struct _ec_heal {
+ struct list_head list;
+ gf_lock_t lock;
+ xlator_t *xl;
+ ec_fop_data_t *fop;
+ void *data;
+ ec_fop_data_t *lookup;
+ loc_t loc;
+ struct iatt iatt;
+ char *symlink;
+ fd_t *fd;
+ int32_t partial;
+ int32_t done;
+ int32_t error;
+ gf_boolean_t nameheal;
+ uintptr_t available;
+ uintptr_t good;
+ uintptr_t bad;
+ uintptr_t open;
+ uintptr_t fixed;
+ uint64_t offset;
+ uint64_t size;
+ uint64_t total_size;
+ uint64_t version[2];
+ uint64_t raw_size;
+};
+
+struct subvol_healer {
+ xlator_t *this;
+ int subvol;
+ gf_boolean_t running;
+ gf_boolean_t rerun;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ pthread_t thread;
+};
+
+struct _ec_self_heald {
+ gf_boolean_t iamshd;
+ gf_boolean_t enabled;
+ int timeout;
+ uint32_t max_threads;
+ uint32_t wait_qlength;
+ struct subvol_healer *index_healers;
+ struct subvol_healer *full_healers;
+};
+
+struct _ec_statistics {
+ struct {
+ gf_atomic_t hits; /* Cache hits. */
+ gf_atomic_t misses; /* Cache misses. */
+ gf_atomic_t updates; /* Number of times an existing stripe has
+ been updated with new content. */
+ gf_atomic_t invals; /* Number of times an existing stripe has
+ been invalidated because of truncates
+ or discards. */
+ gf_atomic_t evicts; /* Number of times that an existing entry
+ has been evicted to make room for newer
+ entries. */
+ gf_atomic_t allocs; /* Number of memory allocations made to
+ store stripes. */
+ gf_atomic_t errors; /* Number of errors that have caused extra
+ requests. (Basically memory allocation
+ errors). */
+ } stripe_cache;
+ struct {
+ gf_atomic_t attempted; /*Number of heals attempted on
+ files/directories*/
+ gf_atomic_t completed; /*Number of heals complted on files/directories*/
+ } shd;
+};
+
+struct _ec {
+ xlator_t *xl;
+ int32_t healers;
+ int32_t heal_waiters;
+ int32_t nodes; /* Total number of bricks(n) */
+ int32_t bits_for_nodes;
+ int32_t fragments; /* Data bricks(k) */
+ int32_t redundancy; /* Redundant bricks(m) */
+ uint32_t fragment_size; /* Size of fragment/chunk on a
+ brick. */
+ uint32_t stripe_size; /* (fragment_size * fragments)
+ maximum size of user data
+ stored in one stripe. */
+ int32_t up; /* Represents whether EC volume is
+ up or not. */
+ uint32_t idx;
+ uint32_t xl_up_count; /* Number of UP bricks. */
+ uintptr_t xl_up; /* Bit flag representing UP
+ bricks */
+ uint32_t xl_notify_count; /* Number of notifications. */
+ uintptr_t xl_notify; /* Bit flag representing
+ notification for bricks. */
+ uintptr_t node_mask;
+ uintptr_t read_mask; /*Stores user defined read-mask*/
+ gf_atomic_t async_fop_count; /* Number of on going asynchronous fops. */
+ xlator_t **xl_list;
+ gf_lock_t lock;
+ gf_timer_t *timer;
+ gf_boolean_t shutdown;
+ gf_boolean_t eager_lock;
+ gf_boolean_t other_eager_lock;
+ gf_boolean_t optimistic_changelog;
+ gf_boolean_t parallel_writes;
+ uint32_t stripe_cache;
+ uint32_t quorum_count;
+ uint32_t background_heals;
+ uint32_t heal_wait_qlen;
+ uint32_t self_heal_window_size; /* max size of read/writes */
+ uint32_t eager_lock_timeout;
+ uint32_t other_eager_lock_timeout;
+ struct list_head pending_fops;
+ struct list_head heal_waiting;
+ struct list_head healing;
+ struct mem_pool *fop_pool;
+ struct mem_pool *cbk_pool;
+ struct mem_pool *lock_pool;
+ ec_self_heald_t shd;
+ char vol_uuid[UUID_SIZE + 1];
+ dict_t *leaf_to_subvolid;
+ ec_read_policy_t read_policy;
+ ec_matrix_list_t matrix;
+ ec_statistics_t stats;
+};
+
+#endif /* __EC_TYPES_H__ */
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
new file mode 100644
index 00000000000..7344be4968d
--- /dev/null
+++ b/xlators/cluster/ec/src/ec.c
@@ -0,0 +1,1873 @@
+/*
+ Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/defaults.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/upcall-utils.h>
+
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-mem-types.h"
+#include "ec-types.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-fops.h"
+#include "ec-method.h"
+#include "ec-code.h"
+#include "ec-heald.h"
+#include <glusterfs/events.h>
+
+static char *ec_read_policies[EC_READ_POLICY_MAX + 1] = {
+ [EC_ROUND_ROBIN] = "round-robin",
+ [EC_GFID_HASH] = "gfid-hash",
+ [EC_READ_POLICY_MAX] = NULL};
+
+#define EC_INTERNAL_XATTR_OR_GOTO(name, xattr, op_errno, label) \
+ do { \
+ if (ec_is_internal_xattr(NULL, (char *)name, NULL, NULL)) { \
+ op_errno = EPERM; \
+ goto label; \
+ } \
+ if (name && (strlen(name) == 0) && xattr) { \
+ /* Bulk [f]removexattr/[f]setxattr */ \
+ GF_IF_INTERNAL_XATTR_GOTO(EC_XATTR_PREFIX "*", xattr, op_errno, \
+ label); \
+ } \
+ } while (0)
+
+int32_t
+ec_parse_options(xlator_t *this)
+{
+ ec_t *ec = this->private;
+ int32_t error = EINVAL;
+ uintptr_t mask;
+
+ GF_OPTION_INIT("redundancy", ec->redundancy, int32, out);
+ ec->fragments = ec->nodes - ec->redundancy;
+ if ((ec->redundancy < 1) || (ec->redundancy >= ec->fragments) ||
+ (ec->fragments > EC_MAX_FRAGMENTS)) {
+ gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_REDUNDANCY,
+ "Invalid redundancy (must be between "
+ "1 and %d)",
+ (ec->nodes - 1) / 2);
+
+ goto out;
+ }
+
+ ec->bits_for_nodes = 1;
+ mask = 2;
+ while (ec->nodes > mask) {
+ ec->bits_for_nodes++;
+ mask <<= 1;
+ }
+ ec->node_mask = (1ULL << ec->nodes) - 1ULL;
+ ec->fragment_size = EC_METHOD_CHUNK_SIZE;
+ ec->stripe_size = ec->fragment_size * ec->fragments;
+
+ gf_msg_debug("ec", 0,
+ "Initialized with: nodes=%u, fragments=%u, "
+ "stripe_size=%u, node_mask=%" PRIxFAST32,
+ ec->nodes, ec->fragments, ec->stripe_size, ec->node_mask);
+
+ error = 0;
+
+out:
+ return error;
+}
+
+int32_t
+ec_prepare_childs(xlator_t *this)
+{
+ ec_t *ec = this->private;
+ xlator_list_t *child = NULL;
+ int32_t count = 0;
+
+ for (child = this->children; child != NULL; child = child->next) {
+ count++;
+ }
+ if (count > EC_MAX_NODES) {
+ gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_TOO_MANY_SUBVOLS,
+ "Too many subvolumes");
+
+ return EINVAL;
+ }
+ ec->nodes = count;
+
+ ec->xl_list = GF_CALLOC(count, sizeof(ec->xl_list[0]), ec_mt_xlator_t);
+ if (ec->xl_list == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Allocation of xlator list failed");
+
+ return ENOMEM;
+ }
+ ec->xl_up = 0;
+ ec->xl_up_count = 0;
+
+ count = 0;
+ for (child = this->children; child != NULL; child = child->next) {
+ ec->xl_list[count++] = child->xlator;
+ }
+
+ return 0;
+}
+
+/* This function transforms the subvol to subvol-id*/
+static int
+_subvol_to_subvolid(dict_t *this, char *key, data_t *value, void *data)
+{
+ ec_t *ec = data;
+ xlator_t *subvol = NULL;
+ int i = 0;
+ int ret = -1;
+
+ subvol = data_to_ptr(value);
+ for (i = 0; i < ec->nodes; i++) {
+ if (ec->xl_list[i] == subvol) {
+ ret = dict_set_int32(this, key, i);
+ /* -1 stops dict_foreach and returns -1*/
+ if (ret < 0)
+ ret = -1;
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
+
+int
+ec_subvol_to_subvol_id_transform(ec_t *ec, dict_t *leaf_to_subvolid)
+{
+ return dict_foreach(leaf_to_subvolid, _subvol_to_subvolid, ec);
+}
+
+void
+__ec_destroy_private(xlator_t *this)
+{
+ ec_t *ec = this->private;
+
+ if (ec != NULL) {
+ LOCK(&ec->lock);
+
+ if (ec->timer != NULL) {
+ gf_timer_call_cancel(this->ctx, ec->timer);
+ ec->timer = NULL;
+ }
+
+ UNLOCK(&ec->lock);
+
+ /* There is a race with timer because there is no way to know if
+ * timer callback has really been cancelled or it has been scheduled
+ * for execution. If it has been scheduled, it will crash if we
+ * destroy ec too fast.
+ *
+ * Not sure how this can be solved without using global variables or
+ * having support from gf_timer_call_cancel()
+ */
+ sleep(2);
+
+ this->private = NULL;
+ if (ec->xl_list != NULL) {
+ GF_FREE(ec->xl_list);
+ ec->xl_list = NULL;
+ }
+
+ if (ec->fop_pool != NULL) {
+ mem_pool_destroy(ec->fop_pool);
+ }
+
+ if (ec->cbk_pool != NULL) {
+ mem_pool_destroy(ec->cbk_pool);
+ }
+
+ if (ec->lock_pool != NULL) {
+ mem_pool_destroy(ec->lock_pool);
+ }
+
+ LOCK_DESTROY(&ec->lock);
+
+ if (ec->leaf_to_subvolid)
+ dict_unref(ec->leaf_to_subvolid);
+
+ ec_method_fini(&ec->matrix);
+
+ GF_FREE(ec);
+ }
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+ if (xlator_mem_acct_init(this, ec_mt_end + 1) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Memory accounting initialization "
+ "failed.");
+
+ return -1;
+ }
+
+ return 0;
+}
+
+void
+ec_configure_background_heal_opts(ec_t *ec, int background_heals,
+ int heal_wait_qlen)
+{
+ if (background_heals == 0) {
+ ec->heal_wait_qlen = 0;
+ } else {
+ ec->heal_wait_qlen = heal_wait_qlen;
+ }
+ ec->background_heals = background_heals;
+}
+
+int
+ec_assign_read_policy(ec_t *ec, char *read_policy)
+{
+ int read_policy_idx = -1;
+
+ read_policy_idx = gf_get_index_by_elem(ec_read_policies, read_policy);
+ if (read_policy_idx < 0 || read_policy_idx >= EC_READ_POLICY_MAX)
+ return -1;
+
+ ec->read_policy = read_policy_idx;
+ return 0;
+}
+
+int32_t
+reconfigure(xlator_t *this, dict_t *options)
+{
+ ec_t *ec = this->private;
+ char *read_policy = NULL;
+ char *extensions = NULL;
+ uint32_t heal_wait_qlen = 0;
+ uint32_t background_heals = 0;
+ int32_t ret = -1;
+ int32_t err;
+
+ GF_OPTION_RECONF("cpu-extensions", extensions, options, str, failed);
+
+ GF_OPTION_RECONF("self-heal-daemon", ec->shd.enabled, options, bool,
+ failed);
+ GF_OPTION_RECONF("iam-self-heal-daemon", ec->shd.iamshd, options, bool,
+ failed);
+ GF_OPTION_RECONF("eager-lock", ec->eager_lock, options, bool, failed);
+ GF_OPTION_RECONF("other-eager-lock", ec->other_eager_lock, options, bool,
+ failed);
+ GF_OPTION_RECONF("eager-lock-timeout", ec->eager_lock_timeout, options,
+ uint32, failed);
+ GF_OPTION_RECONF("other-eager-lock-timeout", ec->other_eager_lock_timeout,
+ options, uint32, failed);
+ GF_OPTION_RECONF("background-heals", background_heals, options, uint32,
+ failed);
+ GF_OPTION_RECONF("heal-wait-qlength", heal_wait_qlen, options, uint32,
+ failed);
+ GF_OPTION_RECONF("self-heal-window-size", ec->self_heal_window_size,
+ options, uint32, failed);
+ GF_OPTION_RECONF("heal-timeout", ec->shd.timeout, options, int32, failed);
+ ec_configure_background_heal_opts(ec, background_heals, heal_wait_qlen);
+ GF_OPTION_RECONF("shd-max-threads", ec->shd.max_threads, options, uint32,
+ failed);
+ GF_OPTION_RECONF("shd-wait-qlength", ec->shd.wait_qlength, options, uint32,
+ failed);
+
+ GF_OPTION_RECONF("read-policy", read_policy, options, str, failed);
+
+ GF_OPTION_RECONF("optimistic-change-log", ec->optimistic_changelog, options,
+ bool, failed);
+ GF_OPTION_RECONF("parallel-writes", ec->parallel_writes, options, bool,
+ failed);
+ GF_OPTION_RECONF("stripe-cache", ec->stripe_cache, options, uint32, failed);
+ GF_OPTION_RECONF("quorum-count", ec->quorum_count, options, uint32, failed);
+ ret = 0;
+ if (ec_assign_read_policy(ec, read_policy)) {
+ ret = -1;
+ }
+
+ err = ec_method_update(this, &ec->matrix, extensions);
+ if (err != 0) {
+ ret = -1;
+ }
+
+failed:
+ return ret;
+}
+
+glusterfs_event_t
+ec_get_event_from_state(ec_t *ec)
+{
+ int down_count = 0;
+
+ if (ec->xl_up_count >= ec->fragments) {
+ /* If ec is up but some subvolumes are yet to notify, give
+ * grace time for other subvols to notify to prevent start of
+ * I/O which may result in self-heals */
+ if (ec->xl_notify_count < ec->nodes)
+ return GF_EVENT_MAXVAL;
+
+ return GF_EVENT_CHILD_UP;
+ } else {
+ down_count = ec->xl_notify_count - ec->xl_up_count;
+ if (down_count > ec->redundancy)
+ return GF_EVENT_CHILD_DOWN;
+ }
+
+ return GF_EVENT_MAXVAL;
+}
+
+void
+ec_up(xlator_t *this, ec_t *ec)
+{
+ char str1[32], str2[32];
+
+ if (ec->timer != NULL) {
+ gf_timer_call_cancel(this->ctx, ec->timer);
+ ec->timer = NULL;
+ }
+
+ ec->up = 1;
+ gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP,
+ "Going UP : Child UP = %s Child Notify = %s",
+ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+ ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes));
+
+ gf_event(EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
+}
+
+void
+ec_down(xlator_t *this, ec_t *ec)
+{
+ char str1[32], str2[32];
+
+ if (ec->timer != NULL) {
+ gf_timer_call_cancel(this->ctx, ec->timer);
+ ec->timer = NULL;
+ }
+
+ ec->up = 0;
+ gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN,
+ "Going DOWN : Child UP = %s Child Notify = %s",
+ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+ ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes));
+
+ gf_event(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
+}
+
+void
+ec_notify_cbk(void *data)
+{
+ ec_t *ec = data;
+ glusterfs_event_t event = GF_EVENT_MAXVAL;
+ gf_boolean_t propagate = _gf_false;
+ gf_boolean_t launch_heal = _gf_false;
+
+ LOCK(&ec->lock);
+ {
+ if (!ec->timer) {
+ /*
+ * Either child_up/child_down is already sent to parent
+ * This is a spurious wake up.
+ */
+ goto unlock;
+ }
+
+ gf_timer_call_cancel(ec->xl->ctx, ec->timer);
+ ec->timer = NULL;
+
+ /* The timeout has expired, so any subvolume that has not
+ * already reported its state, will be considered to be down.
+ * We mark as if all bricks had reported. */
+ ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
+ ec->xl_notify_count = ec->nodes;
+
+ /* Since we have marked all subvolumes as notified, it's
+ * guaranteed that ec_get_event_from_state() will return
+ * CHILD_UP or CHILD_DOWN, but not MAXVAL. */
+ event = ec_get_event_from_state(ec);
+ if (event == GF_EVENT_CHILD_UP) {
+ /* We are ready to bring the volume up. If there are
+ * still bricks DOWN, they will be healed when they
+ * come up. */
+ ec_up(ec->xl, ec);
+
+ if (ec->shd.iamshd && !ec->shutdown) {
+ launch_heal = _gf_true;
+ GF_ATOMIC_INC(ec->async_fop_count);
+ }
+ }
+
+ propagate = _gf_true;
+ }
+unlock:
+ UNLOCK(&ec->lock);
+
+ if (launch_heal) {
+ /* We have just brought the volume UP, so we trigger
+ * a self-heal check on the root directory. */
+ ec_launch_replace_heal(ec);
+ }
+ if (propagate) {
+ default_notify(ec->xl, event, NULL);
+ }
+}
+
+void
+ec_launch_notify_timer(xlator_t *this, ec_t *ec)
+{
+ struct timespec delay = {
+ 0,
+ };
+
+ gf_msg_debug(this->name, 0, "Initiating child-down timer");
+ delay.tv_sec = 10;
+ delay.tv_nsec = 0;
+ ec->timer = gf_timer_call_after(this->ctx, delay, ec_notify_cbk, ec);
+ if (ec->timer == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_TIMER_CREATE_FAIL,
+ "Cannot create timer "
+ "for delayed initialization");
+ }
+}
+
+gf_boolean_t
+ec_disable_delays(ec_t *ec)
+{
+ ec->shutdown = _gf_true;
+
+ return __ec_is_last_fop(ec);
+}
+
+void
+ec_cleanup_healer_object(ec_t *ec)
+{
+ struct subvol_healer *healer = NULL;
+ ec_self_heald_t *shd = NULL;
+ void *res = NULL;
+ int i = 0;
+ gf_boolean_t is_join = _gf_false;
+
+ shd = &ec->shd;
+ if (!shd->iamshd)
+ return;
+
+ for (i = 0; i < ec->nodes; i++) {
+ healer = &shd->index_healers[i];
+ pthread_mutex_lock(&healer->mutex);
+ {
+ healer->rerun = 1;
+ if (healer->running) {
+ pthread_cond_signal(&healer->cond);
+ is_join = _gf_true;
+ }
+ }
+ pthread_mutex_unlock(&healer->mutex);
+ if (is_join) {
+ pthread_join(healer->thread, &res);
+ is_join = _gf_false;
+ }
+
+ healer = &shd->full_healers[i];
+ pthread_mutex_lock(&healer->mutex);
+ {
+ healer->rerun = 1;
+ if (healer->running) {
+ pthread_cond_signal(&healer->cond);
+ is_join = _gf_true;
+ }
+ }
+ pthread_mutex_unlock(&healer->mutex);
+ if (is_join) {
+ pthread_join(healer->thread, &res);
+ is_join = _gf_false;
+ }
+ }
+}
+void
+ec_pending_fops_completed(ec_t *ec)
+{
+ if (ec->shutdown) {
+ default_notify(ec->xl, GF_EVENT_PARENT_DOWN, NULL);
+ }
+}
+
+static gf_boolean_t
+ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
+{
+ uintptr_t current_state = 0;
+
+ if (xlator_is_cleanup_starting(ec->xl))
+ return _gf_false;
+
+ if ((ec->xl_notify & index_mask) == 0) {
+ ec->xl_notify |= index_mask;
+ ec->xl_notify_count++;
+ }
+ current_state = ec->xl_up & index_mask;
+ if (current_state != new_state) {
+ ec->xl_up ^= index_mask;
+ ec->xl_up_count += (current_state ? -1 : 1);
+
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+static gf_boolean_t
+ec_upcall(ec_t *ec, struct gf_upcall *upcall)
+{
+ struct gf_upcall_cache_invalidation *ci = NULL;
+ struct gf_upcall_inodelk_contention *lc = NULL;
+ inode_t *inode;
+ inode_table_t *table;
+
+ switch (upcall->event_type) {
+ case GF_UPCALL_CACHE_INVALIDATION:
+ ci = upcall->data;
+ ci->flags |= UP_INVAL_ATTR;
+ return _gf_true;
+
+ case GF_UPCALL_INODELK_CONTENTION:
+ lc = upcall->data;
+ if (strcmp(lc->domain, ec->xl->name) != 0) {
+ /* The lock is not owned by EC, ignore it. */
+ return _gf_true;
+ }
+ table = ((xlator_t *)ec->xl->graph->top)->itable;
+ if (table == NULL) {
+ /* Self-heal daemon doesn't have an inode table on the top
+ * xlator because it doesn't need it. In this case we should
+ * use the inode table managed by EC itself where all inodes
+ * being healed should be present. However self-heal doesn't
+ * use eager-locking and inodelk's are already released as
+ * soon as possible. In this case we can safely ignore these
+ * notifications. */
+ return _gf_false;
+ }
+ inode = inode_find(table, upcall->gfid);
+ /* If inode is not found, it means that it's already released,
+ * so we can ignore it. Probably it has been released and
+ * destroyed while the contention notification was being sent.
+ */
+ if (inode != NULL) {
+ ec_lock_release(ec, inode);
+ inode_unref(inode);
+ }
+
+ return _gf_false;
+
+ default:
+ return _gf_true;
+ }
+}
+
+int32_t
+ec_notify(xlator_t *this, int32_t event, void *data, void *data2)
+{
+ ec_t *ec = this->private;
+ int32_t idx = 0;
+ int32_t error = 0;
+ glusterfs_event_t old_event = GF_EVENT_MAXVAL;
+ dict_t *input = NULL;
+ dict_t *output = NULL;
+ gf_boolean_t propagate = _gf_true;
+ gf_boolean_t needs_shd_check = _gf_false;
+ int32_t orig_event = event;
+ uintptr_t mask = 0;
+
+ gf_msg_trace(this->name, 0, "NOTIFY(%d): %p, %p", event, data, data2);
+
+ if (event == GF_EVENT_UPCALL) {
+ propagate = ec_upcall(ec, data);
+ goto done;
+ }
+
+ if (event == GF_EVENT_TRANSLATOR_OP) {
+ if (!ec->up) {
+ error = -1;
+ } else {
+ input = data;
+ output = data2;
+ error = ec_xl_op(this, input, output);
+ }
+ goto out;
+ }
+
+ for (idx = 0; idx < ec->nodes; idx++) {
+ if (ec->xl_list[idx] == data) {
+ break;
+ }
+ }
+
+ LOCK(&ec->lock);
+
+ if (event == GF_EVENT_PARENT_UP) {
+ /*
+ * Start a timer which sends appropriate event to parent
+ * xlator to prevent the 'mount' syscall from hanging.
+ */
+ ec_launch_notify_timer(this, ec);
+ goto unlock;
+ } else if (event == GF_EVENT_PARENT_DOWN) {
+ /* If there aren't pending fops running after we have waken up
+ * them, we immediately propagate the notification. */
+ propagate = ec_disable_delays(ec);
+ ec_cleanup_healer_object(ec);
+ goto unlock;
+ }
+
+ if (idx < ec->nodes) { /* CHILD_* events */
+ old_event = ec_get_event_from_state(ec);
+
+ mask = 1ULL << idx;
+ if (event == GF_EVENT_CHILD_UP) {
+ /* We need to trigger a selfheal if a brick changes
+ * to UP state. */
+ if (ec_set_up_state(ec, mask, mask) && ec->shd.iamshd &&
+ !ec->shutdown) {
+ needs_shd_check = _gf_true;
+ }
+ } else if (event == GF_EVENT_CHILD_DOWN) {
+ ec_set_up_state(ec, mask, 0);
+ }
+
+ event = ec_get_event_from_state(ec);
+
+ if (event == GF_EVENT_CHILD_UP) {
+ if (!ec->up) {
+ ec_up(this, ec);
+ }
+ } else {
+ /* If the volume is not UP, it's irrelevant if one
+ * brick has come up. We cannot heal anything. */
+ needs_shd_check = _gf_false;
+
+ if ((event == GF_EVENT_CHILD_DOWN) && ec->up) {
+ ec_down(this, ec);
+ }
+ }
+
+ if (event != GF_EVENT_MAXVAL) {
+ if (event == old_event) {
+ if (orig_event == GF_EVENT_CHILD_UP)
+ event = GF_EVENT_SOME_DESCENDENT_UP;
+ else /* orig_event has to be GF_EVENT_CHILD_DOWN */
+ event = GF_EVENT_SOME_DESCENDENT_DOWN;
+ }
+ } else {
+ propagate = _gf_false;
+ needs_shd_check = _gf_false;
+ }
+
+ if (needs_shd_check) {
+ GF_ATOMIC_INC(ec->async_fop_count);
+ }
+ }
+unlock:
+ UNLOCK(&ec->lock);
+
+done:
+ if (needs_shd_check) {
+ ec_launch_replace_heal(ec);
+ }
+ if (propagate) {
+ error = default_notify(this, event, data);
+ }
+
+out:
+ return error;
+}
+
+int32_t
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+ int ret = -1;
+ va_list ap;
+ void *data2 = NULL;
+
+ va_start(ap, data);
+ data2 = va_arg(ap, dict_t *);
+ va_end(ap);
+ ret = ec_notify(this, event, data, data2);
+
+ return ret;
+}
+
+static void
+ec_statistics_init(ec_t *ec)
+{
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.hits, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.misses, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.updates, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.invals, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0);
+ GF_ATOMIC_INIT(ec->stats.shd.attempted, 0);
+ GF_ATOMIC_INIT(ec->stats.shd.completed, 0);
+}
+
+static int
+ec_assign_read_mask(ec_t *ec, char *read_mask_str)
+{
+ char *mask = NULL;
+ char *maskptr = NULL;
+ char *saveptr = NULL;
+ char *id_str = NULL;
+ int id = 0;
+ int ret = 0;
+ uintptr_t read_mask = 0;
+
+ if (!read_mask_str) {
+ ec->read_mask = 0;
+ ret = 0;
+ goto out;
+ }
+
+ mask = gf_strdup(read_mask_str);
+ if (!mask) {
+ ret = -1;
+ goto out;
+ }
+ maskptr = mask;
+
+ for (;;) {
+ id_str = strtok_r(maskptr, ":", &saveptr);
+ if (id_str == NULL)
+ break;
+ if (gf_string2int(id_str, &id)) {
+ gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+ "In read-mask \"%s\" id %s is not a valid integer",
+ read_mask_str, id_str);
+ ret = -1;
+ goto out;
+ }
+
+ if ((id < 0) || (id >= ec->nodes)) {
+ gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+ "In read-mask \"%s\" id %d is not in range [0 - %d]",
+ read_mask_str, id, ec->nodes - 1);
+ ret = -1;
+ goto out;
+ }
+ read_mask |= (1UL << id);
+ maskptr = NULL;
+ }
+
+ if (gf_bits_count(read_mask) < ec->fragments) {
+ gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+ "read-mask \"%s\" should contain at least %d ids", read_mask_str,
+ ec->fragments);
+ ret = -1;
+ goto out;
+ }
+ ec->read_mask = read_mask;
+ ret = 0;
+out:
+ GF_FREE(mask);
+ return ret;
+}
+
+int32_t
+init(xlator_t *this)
+{
+ ec_t *ec = NULL;
+ char *read_policy = NULL;
+ char *extensions = NULL;
+ int32_t err;
+ char *read_mask_str = NULL;
+
+ if (this->parents == NULL) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, EC_MSG_NO_PARENTS,
+ "Volume does not have parents.");
+ }
+
+ ec = GF_MALLOC(sizeof(*ec), ec_mt_ec_t);
+ if (ec == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to allocate private memory.");
+
+ return -1;
+ }
+ memset(ec, 0, sizeof(*ec));
+
+ this->private = ec;
+
+ ec->xl = this;
+ LOCK_INIT(&ec->lock);
+
+ GF_ATOMIC_INIT(ec->async_fop_count, 0);
+ INIT_LIST_HEAD(&ec->pending_fops);
+ INIT_LIST_HEAD(&ec->heal_waiting);
+ INIT_LIST_HEAD(&ec->healing);
+
+ ec->fop_pool = mem_pool_new(ec_fop_data_t, 1024);
+ ec->cbk_pool = mem_pool_new(ec_cbk_data_t, 4096);
+ ec->lock_pool = mem_pool_new(ec_lock_t, 1024);
+ if ((ec->fop_pool == NULL) || (ec->cbk_pool == NULL) ||
+ (ec->lock_pool == NULL)) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+ "Failed to create memory pools.");
+
+ goto failed;
+ }
+
+ if (ec_prepare_childs(this) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+ "Failed to initialize xlator");
+
+ goto failed;
+ }
+
+ if (ec_parse_options(this) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_XLATOR_PARSE_OPT_FAIL,
+ "Failed to parse xlator options");
+
+ goto failed;
+ }
+
+ GF_OPTION_INIT("cpu-extensions", extensions, str, failed);
+
+ err = ec_method_init(this, &ec->matrix, ec->fragments, ec->nodes,
+ ec->nodes * 2, extensions);
+ if (err != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -err, EC_MSG_MATRIX_FAILED,
+ "Failed to initialize matrix management");
+
+ goto failed;
+ }
+
+ GF_OPTION_INIT("self-heal-daemon", ec->shd.enabled, bool, failed);
+ GF_OPTION_INIT("iam-self-heal-daemon", ec->shd.iamshd, bool, failed);
+ GF_OPTION_INIT("eager-lock", ec->eager_lock, bool, failed);
+ GF_OPTION_INIT("other-eager-lock", ec->other_eager_lock, bool, failed);
+ GF_OPTION_INIT("eager-lock-timeout", ec->eager_lock_timeout, uint32,
+ failed);
+ GF_OPTION_INIT("other-eager-lock-timeout", ec->other_eager_lock_timeout,
+ uint32, failed);
+ GF_OPTION_INIT("background-heals", ec->background_heals, uint32, failed);
+ GF_OPTION_INIT("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed);
+ GF_OPTION_INIT("self-heal-window-size", ec->self_heal_window_size, uint32,
+ failed);
+ ec_configure_background_heal_opts(ec, ec->background_heals,
+ ec->heal_wait_qlen);
+ GF_OPTION_INIT("read-policy", read_policy, str, failed);
+ if (ec_assign_read_policy(ec, read_policy))
+ goto failed;
+
+ GF_OPTION_INIT("heal-timeout", ec->shd.timeout, int32, failed);
+ GF_OPTION_INIT("shd-max-threads", ec->shd.max_threads, uint32, failed);
+ GF_OPTION_INIT("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
+ GF_OPTION_INIT("optimistic-change-log", ec->optimistic_changelog, bool,
+ failed);
+ GF_OPTION_INIT("parallel-writes", ec->parallel_writes, bool, failed);
+ GF_OPTION_INIT("stripe-cache", ec->stripe_cache, uint32, failed);
+ GF_OPTION_INIT("quorum-count", ec->quorum_count, uint32, failed);
+ GF_OPTION_INIT("ec-read-mask", read_mask_str, str, failed);
+
+ if (ec_assign_read_mask(ec, read_mask_str))
+ goto failed;
+
+ this->itable = inode_table_new(EC_SHD_INODE_LRU_LIMIT, this);
+ if (!this->itable)
+ goto failed;
+
+ if (ec->shd.iamshd)
+ ec_selfheal_daemon_init(this);
+ gf_msg_debug(this->name, 0, "Disperse translator initialized.");
+
+ ec->leaf_to_subvolid = dict_new();
+ if (!ec->leaf_to_subvolid)
+ goto failed;
+ if (glusterfs_reachable_leaves(this, ec->leaf_to_subvolid)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_SUBVOL_BUILD_FAIL,
+ "Failed to build subvol "
+ "dictionary");
+ goto failed;
+ }
+
+ if (ec_subvol_to_subvol_id_transform(ec, ec->leaf_to_subvolid) < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_SUBVOL_ID_DICT_SET_FAIL,
+ "Failed to build subvol-id "
+ "dictionary");
+ goto failed;
+ }
+
+ ec_statistics_init(ec);
+
+ return 0;
+
+failed:
+ __ec_destroy_private(this);
+
+ return -1;
+}
+
+void
+fini(xlator_t *this)
+{
+ ec_selfheal_daemon_fini(this);
+ __ec_destroy_private(this);
+}
+
+int32_t
+ec_gf_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata)
+{
+ ec_access(frame, this, -1, EC_MINIMUM_ONE, default_access_cbk, NULL, loc,
+ mask, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ ec_create(frame, this, -1, EC_MINIMUM_MIN, default_create_cbk, NULL, loc,
+ flags, mode, umask, fd, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ ec_discard(frame, this, -1, EC_MINIMUM_MIN, default_discard_cbk, NULL, fd,
+ offset, len, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
+{
+ uint32_t fop_flags = EC_MINIMUM_ALL;
+
+ if (cmd == ENTRYLK_UNLOCK)
+ fop_flags = EC_MINIMUM_ONE;
+ ec_entrylk(frame, this, -1, fop_flags, default_entrylk_cbk, NULL, volume,
+ loc, basename, cmd, type, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+ fd_t *fd, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
+{
+ uint32_t fop_flags = EC_MINIMUM_ALL;
+
+ if (cmd == ENTRYLK_UNLOCK)
+ fop_flags = EC_MINIMUM_ONE;
+ ec_fentrylk(frame, this, -1, fop_flags, default_fentrylk_cbk, NULL, volume,
+ fd, basename, cmd, type, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ ec_fallocate(frame, this, -1, EC_MINIMUM_MIN, default_fallocate_cbk, NULL,
+ fd, mode, offset, len, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ ec_flush(frame, this, -1, EC_MINIMUM_MIN, default_flush_cbk, NULL, fd,
+ xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ ec_fsync(frame, this, -1, EC_MINIMUM_MIN, default_fsync_cbk, NULL, fd,
+ datasync, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ ec_fsyncdir(frame, this, -1, EC_MINIMUM_MIN, default_fsyncdir_cbk, NULL, fd,
+ datasync, xdata);
+
+ return 0;
+}
+
+int
+ec_marker_populate_args(call_frame_t *frame, int type, int *gauge,
+ xlator_t **subvols)
+{
+ xlator_t *this = frame->this;
+ ec_t *ec = this->private;
+
+ memcpy(subvols, ec->xl_list, sizeof(*subvols) * ec->nodes);
+
+ if (type == MARKER_XTIME_TYPE) {
+ /*Don't error out on ENOENT/ENOTCONN */
+ gauge[MCNT_NOTFOUND] = 0;
+ gauge[MCNT_ENOTCONN] = 0;
+ }
+
+ return ec->nodes;
+}
+
+int32_t
+ec_handle_heal_commands(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ dict_t *dict_rsp = NULL;
+ int op_ret = -1;
+ int op_errno = ENOMEM;
+
+ if (!name || strcmp(name, GF_HEAL_INFO))
+ return -1;
+
+ op_errno = -ec_get_heal_info(this, loc, &dict_rsp);
+ if (op_errno <= 0) {
+ op_errno = op_ret = 0;
+ }
+
+ STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict_rsp, NULL);
+ if (dict_rsp)
+ dict_unref(dict_rsp);
+ return 0;
+}
+
+int32_t
+ec_gf_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ int error = 0;
+ ec_t *ec = this->private;
+ int32_t fop_flags = EC_MINIMUM_ONE;
+
+ if (name && strcmp(name, EC_XATTR_HEAL) != 0) {
+ EC_INTERNAL_XATTR_OR_GOTO(name, NULL, error, out);
+ }
+
+ if (ec_handle_heal_commands(frame, this, loc, name, xdata) == 0)
+ return 0;
+
+ if (cluster_handle_marker_getxattr(frame, loc, name, ec->vol_uuid, NULL,
+ ec_marker_populate_args) == 0)
+ return 0;
+
+ if (name && ((fnmatch(GF_XATTR_STIME_PATTERN, name, 0) == 0) ||
+ XATTR_IS_NODE_UUID(name) || XATTR_IS_NODE_UUID_LIST(name))) {
+ fop_flags = EC_MINIMUM_ALL;
+ }
+
+ ec_getxattr(frame, this, -1, fop_flags, default_getxattr_cbk, NULL, loc,
+ name, xdata);
+
+ return 0;
+out:
+ error = ENODATA;
+ STACK_UNWIND_STRICT(getxattr, frame, -1, error, NULL, NULL);
+ return 0;
+}
+
+int32_t
+ec_gf_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata)
+{
+ int error = 0;
+
+ EC_INTERNAL_XATTR_OR_GOTO(name, NULL, error, out);
+
+ ec_fgetxattr(frame, this, -1, EC_MINIMUM_ONE, default_fgetxattr_cbk, NULL,
+ fd, name, xdata);
+ return 0;
+out:
+ error = ENODATA;
+ STACK_UNWIND_STRICT(fgetxattr, frame, -1, error, NULL, NULL);
+ return 0;
+}
+
+int32_t
+ec_gf_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ int32_t fop_flags = EC_MINIMUM_ALL;
+
+ if (flock->l_type == F_UNLCK)
+ fop_flags = EC_MINIMUM_ONE;
+
+ ec_inodelk(frame, this, &frame->root->lk_owner, -1, fop_flags,
+ default_inodelk_cbk, NULL, volume, loc, cmd, flock, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+ fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ int32_t fop_flags = EC_MINIMUM_ALL;
+
+ if (flock->l_type == F_UNLCK)
+ fop_flags = EC_MINIMUM_ONE;
+ ec_finodelk(frame, this, &frame->root->lk_owner, -1, fop_flags,
+ default_finodelk_cbk, NULL, volume, fd, cmd, flock, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ ec_link(frame, this, -1, EC_MINIMUM_MIN, default_link_cbk, NULL, oldloc,
+ newloc, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ int32_t fop_flags = EC_MINIMUM_ALL;
+
+ if (flock->l_type == F_UNLCK)
+ fop_flags = EC_MINIMUM_ONE;
+ ec_lk(frame, this, -1, fop_flags, default_lk_cbk, NULL, fd, cmd, flock,
+ xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ ec_lookup(frame, this, -1, EC_MINIMUM_MIN, default_lookup_cbk, NULL, loc,
+ xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
+{
+ ec_mkdir(frame, this, -1, EC_MINIMUM_MIN, default_mkdir_cbk, NULL, loc,
+ mode, umask, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *xdata)
+{
+ ec_mknod(frame, this, -1, EC_MINIMUM_MIN, default_mknod_cbk, NULL, loc,
+ mode, rdev, umask, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ ec_open(frame, this, -1, EC_MINIMUM_MIN, default_open_cbk, NULL, loc, flags,
+ fd, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
+{
+ ec_opendir(frame, this, -1, EC_MINIMUM_MIN, default_opendir_cbk, NULL, loc,
+ fd, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
+{
+ ec_readdir(frame, this, -1, EC_MINIMUM_ONE, default_readdir_cbk, NULL, fd,
+ size, offset, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
+{
+ ec_readdirp(frame, this, -1, EC_MINIMUM_ONE, default_readdirp_cbk, NULL, fd,
+ size, offset, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
+{
+ ec_readlink(frame, this, -1, EC_MINIMUM_ONE, default_readlink_cbk, NULL,
+ loc, size, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ ec_readv(frame, this, -1, EC_MINIMUM_MIN, default_readv_cbk, NULL, fd, size,
+ offset, flags, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ int error = 0;
+
+ EC_INTERNAL_XATTR_OR_GOTO(name, xdata, error, out);
+
+ ec_removexattr(frame, this, -1, EC_MINIMUM_MIN, default_removexattr_cbk,
+ NULL, loc, name, xdata);
+
+ return 0;
+out:
+ STACK_UNWIND_STRICT(removexattr, frame, -1, error, NULL);
+ return 0;
+}
+
+int32_t
+ec_gf_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ int error = 0;
+
+ EC_INTERNAL_XATTR_OR_GOTO(name, xdata, error, out);
+
+ ec_fremovexattr(frame, this, -1, EC_MINIMUM_MIN, default_fremovexattr_cbk,
+ NULL, fd, name, xdata);
+
+ return 0;
+out:
+ STACK_UNWIND_STRICT(fremovexattr, frame, -1, error, NULL);
+ return 0;
+}
+
+int32_t
+ec_gf_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ ec_rename(frame, this, -1, EC_MINIMUM_MIN, default_rename_cbk, NULL, oldloc,
+ newloc, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+ dict_t *xdata)
+{
+ ec_rmdir(frame, this, -1, EC_MINIMUM_MIN, default_rmdir_cbk, NULL, loc,
+ xflags, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ ec_setattr(frame, this, -1, EC_MINIMUM_MIN, default_setattr_cbk, NULL, loc,
+ stbuf, valid, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ ec_fsetattr(frame, this, -1, EC_MINIMUM_MIN, default_fsetattr_cbk, NULL, fd,
+ stbuf, valid, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ int error = 0;
+
+ EC_INTERNAL_XATTR_OR_GOTO("", dict, error, out);
+
+ ec_setxattr(frame, this, -1, EC_MINIMUM_MIN, default_setxattr_cbk, NULL,
+ loc, dict, flags, xdata);
+
+ return 0;
+out:
+ STACK_UNWIND_STRICT(setxattr, frame, -1, error, NULL);
+ return 0;
+}
+
+int32_t
+ec_gf_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ int error = 0;
+
+ EC_INTERNAL_XATTR_OR_GOTO("", dict, error, out);
+
+ ec_fsetxattr(frame, this, -1, EC_MINIMUM_MIN, default_fsetxattr_cbk, NULL,
+ fd, dict, flags, xdata);
+
+ return 0;
+out:
+ STACK_UNWIND_STRICT(fsetxattr, frame, -1, error, NULL);
+ return 0;
+}
+
+int32_t
+ec_gf_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ ec_stat(frame, this, -1, EC_MINIMUM_MIN, default_stat_cbk, NULL, loc,
+ xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ ec_fstat(frame, this, -1, EC_MINIMUM_MIN, default_fstat_cbk, NULL, fd,
+ xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ ec_statfs(frame, this, -1, EC_MINIMUM_MIN, default_statfs_cbk, NULL, loc,
+ xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+ loc_t *loc, mode_t umask, dict_t *xdata)
+{
+ ec_symlink(frame, this, -1, EC_MINIMUM_MIN, default_symlink_cbk, NULL,
+ linkname, loc, umask, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ ec_truncate(frame, this, -1, EC_MINIMUM_MIN, default_truncate_cbk, NULL,
+ loc, offset, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ ec_ftruncate(frame, this, -1, EC_MINIMUM_MIN, default_ftruncate_cbk, NULL,
+ fd, offset, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+ dict_t *xdata)
+{
+ ec_unlink(frame, this, -1, EC_MINIMUM_MIN, default_unlink_cbk, NULL, loc,
+ xflags, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ ec_writev(frame, this, -1, EC_MINIMUM_MIN, default_writev_cbk, NULL, fd,
+ vector, count, offset, flags, iobref, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ ec_xattrop(frame, this, -1, EC_MINIMUM_MIN, default_xattrop_cbk, NULL, loc,
+ optype, xattr, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ ec_fxattrop(frame, this, -1, EC_MINIMUM_MIN, default_fxattrop_cbk, NULL, fd,
+ optype, xattr, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ default_zerofill_failure_cbk(frame, ENOTSUP);
+
+ return 0;
+}
+
+int32_t
+ec_gf_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata)
+{
+ ec_seek(frame, this, -1, EC_MINIMUM_ONE, default_seek_cbk, NULL, fd, offset,
+ what, xdata);
+
+ return 0;
+}
+
+int32_t
+ec_gf_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+ ec_ipc(frame, this, -1, EC_MINIMUM_MIN, default_ipc_cbk, NULL, op, xdata);
+ return 0;
+}
+
+int32_t
+ec_gf_forget(xlator_t *this, inode_t *inode)
+{
+ uint64_t value = 0;
+ ec_inode_t *ctx = NULL;
+
+ if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0)) {
+ ctx = (ec_inode_t *)(uintptr_t)value;
+ /* We can only forget an inode if it has been unlocked, so the stripe
+ * cache should also be empty. */
+ GF_ASSERT(list_empty(&ctx->stripe_cache.lru));
+ GF_FREE(ctx);
+ }
+
+ return 0;
+}
+
+void
+ec_gf_release_fd(xlator_t *this, fd_t *fd)
+{
+ uint64_t value = 0;
+ ec_fd_t *ctx = NULL;
+
+ if ((fd_ctx_del(fd, this, &value) == 0) && (value != 0)) {
+ ctx = (ec_fd_t *)(uintptr_t)value;
+ loc_wipe(&ctx->loc);
+ GF_FREE(ctx);
+ }
+}
+
+int32_t
+ec_gf_release(xlator_t *this, fd_t *fd)
+{
+ ec_gf_release_fd(this, fd);
+
+ return 0;
+}
+
+int32_t
+ec_gf_releasedir(xlator_t *this, fd_t *fd)
+{
+ ec_gf_release_fd(this, fd);
+
+ return 0;
+}
+
+int32_t
+ec_dump_private(xlator_t *this)
+{
+ ec_t *ec = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ char tmp[65];
+
+ GF_ASSERT(this);
+
+ ec = this->private;
+ GF_ASSERT(ec);
+
+ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+ gf_proc_dump_add_section("%s", key_prefix);
+ gf_proc_dump_write("up", "%u", ec->up);
+ gf_proc_dump_write("nodes", "%u", ec->nodes);
+ gf_proc_dump_write("redundancy", "%u", ec->redundancy);
+ gf_proc_dump_write("fragment_size", "%u", ec->fragment_size);
+ gf_proc_dump_write("stripe_size", "%u", ec->stripe_size);
+ gf_proc_dump_write("childs_up", "%u", ec->xl_up_count);
+ gf_proc_dump_write("childs_up_mask", "%s",
+ ec_bin(tmp, sizeof(tmp), ec->xl_up, ec->nodes));
+ if (ec->read_mask) {
+ gf_proc_dump_write("read-mask", "%s",
+ ec_bin(tmp, sizeof(tmp), ec->read_mask, ec->nodes));
+ }
+ gf_proc_dump_write("background-heals", "%d", ec->background_heals);
+ gf_proc_dump_write("heal-wait-qlength", "%d", ec->heal_wait_qlen);
+ gf_proc_dump_write("self-heal-window-size", "%" PRIu32,
+ ec->self_heal_window_size);
+ gf_proc_dump_write("healers", "%d", ec->healers);
+ gf_proc_dump_write("heal-waiters", "%d", ec->heal_waiters);
+ gf_proc_dump_write("read-policy", "%s", ec_read_policies[ec->read_policy]);
+ gf_proc_dump_write("parallel-writes", "%d", ec->parallel_writes);
+ gf_proc_dump_write("quorum-count", "%u", ec->quorum_count);
+
+ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s.stats.stripe_cache",
+ this->type, this->name);
+ gf_proc_dump_add_section("%s", key_prefix);
+
+ gf_proc_dump_write("hits", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.stripe_cache.hits));
+ gf_proc_dump_write("misses", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.stripe_cache.misses));
+ gf_proc_dump_write("updates", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.stripe_cache.updates));
+ gf_proc_dump_write("invalidations", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.stripe_cache.invals));
+ gf_proc_dump_write("evicts", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.stripe_cache.evicts));
+ gf_proc_dump_write("allocations", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.stripe_cache.allocs));
+ gf_proc_dump_write("errors", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.stripe_cache.errors));
+ gf_proc_dump_write("heals-attempted", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.shd.attempted));
+ gf_proc_dump_write("heals-completed", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(ec->stats.shd.completed));
+
+ return 0;
+}
+
+struct xlator_fops fops = {.lookup = ec_gf_lookup,
+ .stat = ec_gf_stat,
+ .fstat = ec_gf_fstat,
+ .truncate = ec_gf_truncate,
+ .ftruncate = ec_gf_ftruncate,
+ .access = ec_gf_access,
+ .readlink = ec_gf_readlink,
+ .mknod = ec_gf_mknod,
+ .mkdir = ec_gf_mkdir,
+ .unlink = ec_gf_unlink,
+ .rmdir = ec_gf_rmdir,
+ .symlink = ec_gf_symlink,
+ .rename = ec_gf_rename,
+ .link = ec_gf_link,
+ .create = ec_gf_create,
+ .open = ec_gf_open,
+ .readv = ec_gf_readv,
+ .writev = ec_gf_writev,
+ .flush = ec_gf_flush,
+ .fsync = ec_gf_fsync,
+ .opendir = ec_gf_opendir,
+ .readdir = ec_gf_readdir,
+ .readdirp = ec_gf_readdirp,
+ .fsyncdir = ec_gf_fsyncdir,
+ .statfs = ec_gf_statfs,
+ .setxattr = ec_gf_setxattr,
+ .getxattr = ec_gf_getxattr,
+ .fsetxattr = ec_gf_fsetxattr,
+ .fgetxattr = ec_gf_fgetxattr,
+ .removexattr = ec_gf_removexattr,
+ .fremovexattr = ec_gf_fremovexattr,
+ .lk = ec_gf_lk,
+ .inodelk = ec_gf_inodelk,
+ .finodelk = ec_gf_finodelk,
+ .entrylk = ec_gf_entrylk,
+ .fentrylk = ec_gf_fentrylk,
+ .xattrop = ec_gf_xattrop,
+ .fxattrop = ec_gf_fxattrop,
+ .setattr = ec_gf_setattr,
+ .fsetattr = ec_gf_fsetattr,
+ .fallocate = ec_gf_fallocate,
+ .discard = ec_gf_discard,
+ .zerofill = ec_gf_zerofill,
+ .seek = ec_gf_seek,
+ .ipc = ec_gf_ipc};
+
+struct xlator_cbks cbks = {.forget = ec_gf_forget,
+ .release = ec_gf_release,
+ .releasedir = ec_gf_releasedir};
+
+struct xlator_dumpops dumpops = {.priv = ec_dump_private};
+
+struct volume_options options[] = {
+ {.key = {"redundancy"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "{{ volume.redundancy }}",
+ .description = "Maximum number of bricks that can fail "
+ "simultaneously without losing data."},
+ {
+ .key = {"self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .description = "self-heal daemon enable/disable",
+ .default_value = "enable",
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"disperse"},
+ },
+ {.key = {"iam-self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the disperse "
+ "translator is running as part of self-heal-daemon "
+ "or not."},
+ {.key = {"eager-lock"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {GD_OP_VERSION_3_7_10},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"disperse"},
+ .description = "Enable/Disable eager lock for regular files on a "
+ "disperse volume. If a fop takes a lock and completes "
+ "its operation, it waits for next 1 second before "
+ "releasing the lock, to see if the lock can be reused "
+ "for next fop from the same client. If ec finds any lock "
+ "contention within 1 second it releases the lock "
+ "immediately before time expires. This improves the "
+ "performance of file operations. However, as it takes "
+ "lock on first brick, for few operations like read, "
+ "discovery of lock contention might take long time and "
+ "can actually degrade the performance. If eager lock is "
+ "disabled, lock will be released as soon as fop "
+ "completes."},
+ {.key = {"other-eager-lock"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {GD_OP_VERSION_3_13_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"disperse"},
+ .description = "It's equivalent to the eager-lock option but for non "
+ "regular files."},
+ {.key = {"eager-lock-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 60,
+ .default_value = "1",
+ .op_version = {GD_OP_VERSION_4_0_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"disperse", "locks", "timeout"},
+ .description = "Maximum time (in seconds) that a lock on an inode is "
+ "kept held if no new operations on the inode are "
+ "received."},
+ {.key = {"other-eager-lock-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 60,
+ .default_value = "1",
+ .op_version = {GD_OP_VERSION_4_0_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"disperse", "locks", "timeout"},
+ .description = "It's equivalent to eager-lock-timeout option but for "
+ "non regular files."},
+ {
+ .key = {"background-heals"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0, /*Disabling background heals*/
+ .max = 256,
+ .default_value = "8",
+ .op_version = {GD_OP_VERSION_3_7_3},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"disperse"},
+ .description = "This option can be used to control number of parallel"
+ " heals",
+ },
+ {
+ .key = {"heal-wait-qlength"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max =
+ 65536, /*Around 100MB as of now with sizeof(ec_fop_data_t) at 1800*/
+ .default_value = "128",
+ .op_version = {GD_OP_VERSION_3_7_3},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"disperse"},
+ .description = "This option can be used to control number of heals"
+ " that can wait",
+ },
+ {.key = {"heal-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 60,
+ .max = INT_MAX,
+ .default_value = "600",
+ .op_version = {GD_OP_VERSION_3_7_3},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {"disperse"},
+ .description = "time interval for checking the need to self-heal "
+ "in self-heal-daemon"},
+ {
+ .key = {"read-policy"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"round-robin", "gfid-hash"},
+ .default_value = "gfid-hash",
+ .op_version = {GD_OP_VERSION_3_7_6},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"disperse"},
+ .description =
+ "inode-read fops happen only on 'k' number of bricks in"
+ " n=k+m disperse subvolume. 'round-robin' selects the read"
+ " subvolume using round-robin algo. 'gfid-hash' selects read"
+ " subvolume based on hash of the gfid of that file/directory.",
+ },
+ {.key = {"shd-max-threads"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 64,
+ .default_value = "1",
+ .op_version = {GD_OP_VERSION_3_9_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"disperse"},
+ .description = "Maximum number of parallel heals SHD can do per local "
+ "brick. This can substantially lower heal times, "
+ "but can also crush your bricks if you don't have "
+ "the storage hardware to support this."},
+ {.key = {"shd-wait-qlength"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 65536,
+ .default_value = "1024",
+ .op_version = {GD_OP_VERSION_3_9_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"disperse"},
+ .description = "This option can be used to control number of heals"
+ " that can wait in SHD per subvolume"},
+ {.key = {"cpu-extensions"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"none", "auto", "x64", "sse", "avx"},
+ .default_value = "auto",
+ .op_version = {GD_OP_VERSION_3_9_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"disperse"},
+ .description = "force the cpu extensions to be used to accelerate the "
+ "galois field computations."},
+ {.key = {"self-heal-window-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 1024,
+ .default_value = "1",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"disperse"},
+ .description = "Maximum number blocks(128KB) per file for which "
+ "self-heal process would be applied simultaneously."},
+ {.key = {"optimistic-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {GD_OP_VERSION_3_10_1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT,
+ .tags = {"disperse"},
+ .description = "Set/Unset dirty flag for every update fop at the start"
+ "of the fop. If OFF, this option impacts performance of"
+ "entry operations or metadata operations as it will"
+ "set dirty flag at the start and unset it at the end of"
+ "ALL update fop. If ON and all the bricks are good,"
+ "dirty flag will be set at the start only for file fops"
+ "For metadata and entry fops dirty flag will not be set"
+ "at the start, if all the bricks are good. This does"
+ "not impact performance for metadata operations and"
+ "entry operation but has a very small window to miss"
+ "marking entry as dirty in case it is required to be"
+ "healed"},
+ {.key = {"parallel-writes"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "This controls if writes can be wound in parallel as long"
+ "as it doesn't modify same stripes"},
+ {.key = {"stripe-cache"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0, /*Disabling stripe_cache*/
+ .max = EC_STRIPE_CACHE_MAX_SIZE,
+ .default_value = "4",
+ .description = "This option will keep the last stripe of write fop"
+ "in memory. If next write falls in this stripe, we need"
+ "not to read it again from backend and we can save READ"
+ "fop going over the network. This will improve performance,"
+ "specially for sequential writes. However, this will also"
+ "lead to extra memory consumption, maximum "
+ "(cache size * stripe size) Bytes per open file."},
+ {
+ .key = {"quorum-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "0",
+ .description =
+ "This option can be used to define how many successes on"
+ "the bricks constitute a success to the application. This"
+ " count should be in the range"
+ "[disperse-data-count, disperse-count] (inclusive)",
+ },
+ {
+ .key = {"ec-read-mask"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = NULL,
+ .description = "This option can be used to choose which bricks can be"
+ " used for reading data/metadata of a file/directory",
+ },
+ {
+ .key = {NULL},
+ },
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1},
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "disperse",
+ .category = GF_MAINTAINED,
+};
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
new file mode 100644
index 00000000000..6f6de6d5981
--- /dev/null
+++ b/xlators/cluster/ec/src/ec.h
@@ -0,0 +1,34 @@
+/*
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_H__
+#define __EC_H__
+
+#include "ec-method.h"
+
+#define EC_XATTR_PREFIX "trusted.ec."
+#define EC_XATTR_CONFIG EC_XATTR_PREFIX "config"
+#define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
+#define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
+#define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
+#define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
+#define EC_STRIPE_CACHE_MAX_SIZE 10
+#define EC_VERSION_SIZE 2
+#define EC_SHD_INODE_LRU_LIMIT 10
+
+#define EC_MAX_FRAGMENTS EC_METHOD_MAX_FRAGMENTS
+/* The maximum number of nodes is derived from the maximum allowed fragments
+ * using the rule that redundancy cannot be equal or greater than the number
+ * of fragments.
+ */
+#define EC_MAX_NODES min(EC_MAX_FRAGMENTS * 2 - 1, EC_METHOD_MAX_NODES)
+
+#endif /* __EC_H__ */
diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am
deleted file mode 100644
index 5f78a296533..00000000000
--- a/xlators/cluster/ha/src/Makefile.am
+++ /dev/null
@@ -1,15 +0,0 @@
-xlator_LTLIBRARIES = ha.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster
-
-ha_la_LDFLAGS = -module -avoidversion
-
-ha_la_SOURCES = ha-helpers.c ha.c
-ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = ha.h
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES =
-
diff --git a/xlators/cluster/ha/src/ha-helpers.c b/xlators/cluster/ha/src/ha-helpers.c
deleted file mode 100644
index 7c361547af8..00000000000
--- a/xlators/cluster/ha/src/ha-helpers.c
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include "xlator.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "dict.h"
-#include "compat-errno.h"
-#include "ha.h"
-
-#define HA_TRANSPORT_NOTCONN(_ret, _errno, _fd) \
- ((_ret == -1) && (_fd ? (_errno == EBADFD):(_errno == ENOTCONN)))
-
-int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd)
-{
- ha_local_t *local = NULL;
- int i = -1;
- ha_private_t *pvt = NULL;
- int child_count = 0;
- int ret = -1;
- hafd_t *hafdp = NULL;
- xlator_t *this = NULL;
- uint64_t tmp_hafdp = 0;
-
- this = frame->this;
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
-
- if (local == NULL) {
- ret = fd_ctx_get (fd, this, &tmp_hafdp);
- if (ret < 0) {
- goto out;
- }
- hafdp = (hafd_t *)(long)tmp_hafdp;
- local = frame->local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (local == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- local->state = GF_CALLOC (1, child_count,
- gf_ha_mt_child_count);
- if (local->state == NULL) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* take care of the preferred subvolume */
- if (pvt->pref_subvol == -1)
- local->active = hafdp->active;
- else
- local->active = pvt->pref_subvol;
-
- LOCK (&hafdp->lock);
- memcpy (local->state, hafdp->fdstate, child_count);
- UNLOCK (&hafdp->lock);
-
- /* in case the preferred subvolume is down */
- if ((local->active != -1) && (local->state[local->active] == 0))
- local->active = -1;
-
- for (i = 0; i < child_count; i++) {
- if (local->state[i]) {
- if (local->active == -1)
- local->active = i;
- local->tries++;
- }
- }
- if (local->active == -1) {
- ret = -ENOTCONN;
- goto out;
- }
- local->fd = fd_ref (fd);
- }
- ret = 0;
-out:
- return ret;
-}
-
-int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno)
-{
- xlator_t *xl = NULL;
- ha_private_t *pvt = NULL;
- xlator_t **children = NULL;
- int prev_child = -1;
- hafd_t *hafdp = NULL;
- int ret = -1;
- call_stub_t *stub = NULL;
- ha_local_t *local = NULL;
- uint64_t tmp_hafdp = 0;
-
- xl = frame->this;
- pvt = xl->private;
- children = pvt->children;
- prev_child = (long) cookie;
- local = frame->local;
-
- if (op_ret == -1) {
- gf_log (xl->name, GF_LOG_ERROR ,"(child=%s) (op_ret=%d op_errno=%s)",
- children[prev_child]->name, op_ret, strerror (op_errno));
- }
-
- if (HA_TRANSPORT_NOTCONN (op_ret, op_errno, (local->fd))) {
- ret = 0;
- if (local->fd) {
- ret = fd_ctx_get (local->fd, xl, &tmp_hafdp);
- }
- hafdp = (hafd_t *)(long)tmp_hafdp;
- if (ret == 0) {
- if (local->fd) {
- LOCK(&hafdp->lock);
- hafdp->fdstate[prev_child] = 0;
- UNLOCK(&hafdp->lock);
- }
- local->tries--;
- if (local->tries != 0) {
- while (1) {
- local->active = (local->active + 1) % pvt->child_count;
- if (local->state[local->active])
- break;
- }
- stub = local->stub;
- local->stub = NULL;
- call_resume (stub);
- return -1;
- }
- }
- }
- if (local->stub) {
- call_stub_destroy (local->stub);
- local->stub = NULL;
- }
-
- if (local->fd) {
- GF_FREE (local->state);
- local->state = NULL;
-
- fd_unref (local->fd);
- local->fd = NULL;
- }
- return 0;
-}
-
-int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode)
-{
- int i = -1;
- ha_private_t *pvt = NULL;
- xlator_t *xl = NULL;
- int ret = -1;
- ha_local_t *local = NULL;
- uint64_t tmp_state = 0;
-
- xl = frame->this;
- pvt = xl->private;
- local = frame->local;
-
- if (local == NULL) {
- local = frame->local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (local == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- local->active = pvt->pref_subvol;
- ret = inode_ctx_get (inode, xl, &tmp_state);
- if (ret < 0) {
- goto out;
- }
- local->state = (char *)(long)tmp_state;
- if (local->active != -1 && local->state[local->active] == 0)
- local->active = -1;
- for (i = 0; i < pvt->child_count; i++) {
- if (local->state[i]) {
- if (local->active == -1)
- local->active = i;
- local->tries++;
- }
- }
- if (local->active == -1) {
- ret = -ENOTCONN;
- goto out;
- }
- }
- ret = 0;
-out:
- return ret;
-}
diff --git a/xlators/cluster/ha/src/ha-mem-types.h b/xlators/cluster/ha/src/ha-mem-types.h
deleted file mode 100644
index b460588aa03..00000000000
--- a/xlators/cluster/ha/src/ha-mem-types.h
+++ /dev/null
@@ -1,37 +0,0 @@
-
-/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-#ifndef __HA_MEM_TYPES_H__
-#define __HA_MEM_TYPES_H__
-
-#include "mem-types.h"
-
-enum gf_ha_mem_types_ {
- gf_ha_mt_ha_local_t = gf_common_mt_end + 1,
- gf_ha_mt_hafd_t,
- gf_ha_mt_char,
- gf_ha_mt_child_count,
- gf_ha_mt_xlator_t,
- gf_ha_mt_ha_private_t,
- gf_ha_mt_end
-};
-#endif
-
diff --git a/xlators/cluster/ha/src/ha.c b/xlators/cluster/ha/src/ha.c
deleted file mode 100644
index 7bb1824a138..00000000000
--- a/xlators/cluster/ha/src/ha.c
+++ /dev/null
@@ -1,4023 +0,0 @@
-/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-/* generate errors randomly, code is simple now, better alogorithm
- * can be written to decide what error to be returned and when
- */
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "dict.h"
-#include "compat-errno.h"
-#include "ha.h"
-
-/*
- * TODO:
- * - dbench fails if ha over server side afr
- * - lock calls - lock on all subvols.
- * - support preferred-subvolume option. code already there.
- * - do not alloc the call-stub in case only one subvol is up.
- */
-
-void
-ha_local_wipe (ha_local_t *local)
-{
- if (local->stub) {
- call_stub_destroy (local->stub);
- local->stub = NULL;
- }
-
- if (local->state) {
- GF_FREE (local->state);
- local->state = NULL;
- }
-
- if (local->dict) {
- dict_unref (local->dict);
- local->dict = NULL;
- }
-
- loc_wipe (&local->loc);
-
- if (local->fd) {
- fd_unref (local->fd);
- local->fd = NULL;
- }
-
- if (local->inode) {
- inode_unref (local->inode);
- local->inode = NULL;
- }
-
- GF_FREE (local);
- return;
-}
-
-
-int
-ha_forget (xlator_t *this,
- inode_t *inode)
-{
- uint64_t stateino = 0;
- char *state = NULL;
- if (!inode_ctx_del (inode, this, &stateino)) {
- state = ((char *)(long)stateino);
- GF_FREE (state);
- }
-
- return 0;
-
-}
-
-int32_t
-ha_lookup_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- dict_t *dict,
- struct iatt *postparent)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- int child_count = 0, i = 0, callcnt = 0;
- char *state = NULL;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- uint64_t tmp_state = 0;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- prev_frame = cookie;
- children = pvt->children;
-
- for (i = 0; i < child_count; i++) {
- if (pvt->children[i] == prev_frame->this)
- break;
- }
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- gf_log (this->name, GF_LOG_ERROR, "(child=%s) (op_ret=%d op_errno=%s)",
- children[i]->name, op_ret, strerror (op_errno));
- }
- inode_ctx_get (local->inode, this, &tmp_state);
- state = (char *)(long)tmp_state;
-
- LOCK (&frame->lock);
- if (local->revalidate == 1) {
- if ((!op_ret) != state[i]) {
- local->revalidate_error = 1;
- gf_log (this->name, GF_LOG_DEBUG, "revalidate error on %s",
- pvt->children[i]->name);
- }
- } else {
- if (op_ret == 0) {
- state[i] = 1;
- }
- }
- if (local->op_ret == -1 && op_ret == 0) {
- local->op_ret = 0;
- local->buf = *buf;
- local->postparent = *postparent;
- if (dict)
- local->dict = dict_ref (dict);
- }
- if (op_ret == -1 && op_ret != ENOTCONN)
- local->op_errno = op_errno;
- callcnt = --local->call_count;
- UNLOCK (&frame->lock);
-
- if (callcnt == 0) {
- dict_t *ctx = local->dict;
- inode_t *inode = local->inode;
- if (local->revalidate_error == 1) {
- local->op_ret = -1;
- local->op_errno = EIO;
- gf_log (this->name, GF_LOG_DEBUG, "revalidate error, returning EIO");
- }
- STACK_UNWIND (frame,
- local->op_ret,
- local->op_errno,
- inode,
- &local->buf,
- ctx,
- &local->postparent);
- if (inode)
- inode_unref (inode);
- if (ctx)
- dict_unref (ctx);
- }
- return 0;
-}
-
-int32_t
-ha_lookup (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *xattr_req)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- int child_count = 0, i = 0;
- char *state = NULL;
- xlator_t **children = NULL;
- int ret = -1;
- int32_t op_errno = EINVAL;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- children = pvt->children;
-
- frame->local = local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto unwind;
- }
-
- child_count = pvt->child_count;
- local->inode = inode_ref (loc->inode);
-
- ret = inode_ctx_get (loc->inode, this, NULL);
- if (ret) {
- state = GF_CALLOC (1, child_count, gf_ha_mt_child_count);
- if (state == NULL) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto unwind;
- }
-
- inode_ctx_put (loc->inode, this, (uint64_t)(long)state);
- } else
- local->revalidate = 1;
-
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->call_count = child_count;
-
- for (i = 0; i < child_count; i++) {
- STACK_WIND (frame,
- ha_lookup_cbk,
- children[i],
- children[i]->fops->lookup,
- loc,
- xattr_req);
- }
- return 0;
-
-unwind:
- local = frame->local;
- frame->local = NULL;
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL);
-
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_stat_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *buf)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- buf);
- }
- return 0;
-}
-
-int32_t
-ha_stat (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- ha_local_t *local = NULL;
- int op_errno = ENOTCONN;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_stat_stub (frame, ha_stat, loc);
-
- STACK_WIND_COOKIE (frame,
- ha_stat_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->stat,
- loc);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno, NULL);
- return 0;
-}
-
-int32_t
-ha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *statpre,
- struct iatt *statpost)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame, op_ret, op_errno, statpre, statpost);
- }
- return 0;
-}
-
-
-int32_t
-ha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
- int32_t valid)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_setattr_stub (frame, ha_setattr, loc, stbuf, valid);
-
- STACK_WIND_COOKIE (frame,
- ha_setattr_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->setattr,
- loc, stbuf, valid);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-ha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
- int32_t valid)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_fsetattr_stub (frame, ha_fsetattr, fd, stbuf, valid);
-
- STACK_WIND_COOKIE (frame,
- ha_setattr_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->fsetattr,
- fd, stbuf, valid);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-ha_truncate_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- prebuf,
- postbuf);
- }
- return 0;
-}
-
-int32_t
-ha_truncate (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- off_t offset)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_truncate_stub (frame, ha_truncate, loc, offset);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_truncate_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->truncate,
- loc,
- offset);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
- int32_t
-ha_ftruncate_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- prebuf, postbuf);
- }
- return 0;
-}
-
-int32_t
-ha_ftruncate (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- off_t offset)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_ftruncate_stub (frame, ha_ftruncate, fd, offset);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_ftruncate_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->ftruncate,
- fd,
- offset);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- ha_local_wipe (local);
- return 0;
-}
-
-int32_t
-ha_access_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno);
- }
- return 0;
-}
-
-int32_t
-ha_access (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t mask)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_access_stub (frame, ha_access, loc, mask);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_access_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->access,
- loc,
- mask);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno);
-
- ha_local_wipe (local);
- return 0;
-}
-
-
- int32_t
-ha_readlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- const char *path,
- struct iatt *sbuf)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- path,
- sbuf);
- }
- return 0;
-}
-
-int32_t
-ha_readlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- size_t size)
-{
- ha_local_t *local = frame->local;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_readlink_stub (frame, ha_readlink, loc, size);
-
- STACK_WIND_COOKIE (frame,
- ha_readlink_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->readlink,
- loc,
- size);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-int
-ha_mknod_lookup_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- dict_t *dict,
- struct iatt *postparent)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- char *stateino = NULL;
- int child_count = 0, i = 0, cnt = 0, ret = 0;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- uint64_t tmp_stateino = 0;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- prev_frame = cookie;
- children = pvt->children;
-
- for (i = 0; i < child_count; i++)
- if (prev_frame->this == children[i])
- break;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "(path=%s) (op_ret=%d op_errno=%d)",
- local->stub->args.mknod.loc.path, op_ret, op_errno);
- }
- ret = inode_ctx_get (local->stub->args.mknod.loc.inode,
- this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "unwind(-1), inode_ctx_get() error");
- /* It is difficult to handle this error at this stage
- * as we still expect more cbks, we can't return as
- * of now
- */
- } else if (op_ret == 0) {
- stateino[i] = 1;
- }
- LOCK (&frame->lock);
- cnt = --local->call_count;
- UNLOCK (&frame->lock);
-
- if (cnt == 0) {
- call_stub_t *stub = local->stub;
- GF_FREE (local->state);
- STACK_UNWIND (frame,
- local->op_ret,
- local->op_errno,
- local->stub->args.mknod.loc.inode,
- &local->buf, &local->preparent,
- &local->postparent);
- call_stub_destroy (stub);
- }
- return 0;
-}
-
-int32_t
-ha_mknod_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- char *stateino = NULL;
- int child_count = 0, i = 0, cnt = 0, ret = 0;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- uint64_t tmp_stateino = 0;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- prev_frame = cookie;
- children = pvt->children;
-
- for (i = 0; i < child_count; i++)
- if (prev_frame->this == children[i])
- break;
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mknod.loc.path, op_ret, op_errno);
- }
-
- ret = inode_ctx_get (local->stub->args.mknod.loc.inode,
- this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
- /* FIXME: handle the case */
- }
- if (op_ret == 0) {
- stateino[i] = 1;
- local->op_ret = 0;
- local->first_success = 1;
- local->buf = *buf;
- local->preparent = *preparent;
- local->postparent = *postparent;
- }
- cnt = --local->call_count;
- for (i = local->active + 1; i < child_count; i++) {
- if (local->state[i])
- break;
- }
-
- if (cnt == 0 || i == child_count) {
- call_stub_t *stub = local->stub;
- GF_FREE (local->state);
- stub = local->stub;
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- local->stub->args.mknod.loc.inode, &local->buf,
- &local->preparent, &local->postparent);
- call_stub_destroy (stub);
- return 0;
- }
-
- local->active = i;
-
- if (local->first_success == 0) {
- STACK_WIND (frame,
- ha_mknod_cbk,
- children[i],
- children[i]->fops->mknod,
- &local->stub->args.mknod.loc,
- local->stub->args.mknod.mode,
- local->stub->args.mknod.rdev);
- return 0;
- }
- cnt = local->call_count;
-
- for (; i < child_count; i++) {
- if (local->state[i]) {
- STACK_WIND (frame,
- ha_mknod_lookup_cbk,
- children[i],
- children[i]->fops->lookup,
- &local->stub->args.mknod.loc,
- 0);
- if (--cnt == 0)
- break;
- }
- }
- return 0;
-}
-
-int32_t
-ha_mknod (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- mode_t mode,
- dev_t rdev)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- int child_count = 0, i = 0;
- char *stateino = NULL;
- int32_t op_errno = EINVAL;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
-
- frame->local = local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->stub = fop_mknod_stub (frame, ha_mknod, loc, mode, rdev);
- if (!local->stub) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->state = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!local->state) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- memcpy (local->state, pvt->state, child_count);
- local->active = -1;
-
- stateino = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!stateino) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
-
- for (i = 0; i < child_count; i++) {
- if (local->state[i]) {
- local->call_count++;
- if (local->active == -1)
- local->active = i;
- }
- }
-
- STACK_WIND (frame,
- ha_mknod_cbk,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->mknod,
- loc, mode, rdev);
- return 0;
-
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL);
- ha_local_wipe (local);
- return 0;
-}
-
-
-int
-ha_mkdir_lookup_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- dict_t *dict,
- struct iatt *postparent)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- char *stateino = NULL;
- int child_count = 0, i = 0, cnt = 0;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- uint64_t tmp_stateino = 0;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- prev_frame = cookie;
- children = pvt->children;
-
- for (i = 0; i < child_count; i++)
- if (prev_frame->this == children[i])
- break;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno);
- }
- inode_ctx_get (local->stub->args.mkdir.loc.inode,
- this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
-
- if (op_ret == 0)
- stateino[i] = 1;
-
- LOCK (&frame->lock);
- cnt = --local->call_count;
- UNLOCK (&frame->lock);
-
- if (cnt == 0) {
- call_stub_t *stub = local->stub;
- GF_FREE (local->state);
- STACK_UNWIND (frame,
- local->op_ret,
- local->op_errno,
- local->stub->args.mkdir.loc.inode, &local->buf,
- &local->preparent, &local->postparent);
- call_stub_destroy (stub);
- }
- return 0;
-}
-
-int32_t
-ha_mkdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- char *stateino = NULL;
- int child_count = 0, i = 0, cnt = 0;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- uint64_t tmp_stateino = 0;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- prev_frame = cookie;
- children = pvt->children;
-
- for (i = 0; i < child_count; i++)
- if (prev_frame->this == children[i])
- break;
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno);
- }
-
- inode_ctx_get (local->stub->args.mkdir.loc.inode,
- this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
-
- if (op_ret == 0) {
- stateino[i] = 1;
- local->op_ret = 0;
- local->first_success = 1;
- local->buf = *buf;
- local->preparent = *preparent;
- local->postparent = *postparent;
- }
- cnt = --local->call_count;
- for (i = local->active + 1; i < child_count; i++) {
- if (local->state[i])
- break;
- }
-
- if (cnt == 0 || i == child_count) {
- call_stub_t *stub = local->stub;
- GF_FREE (local->state);
- stub = local->stub;
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- local->stub->args.mkdir.loc.inode, &local->buf,
- &local->preparent, &local->postparent);
- call_stub_destroy (stub);
- return 0;
- }
-
- local->active = i;
-
- if (local->first_success == 0) {
- STACK_WIND (frame,
- ha_mkdir_cbk,
- children[i],
- children[i]->fops->mkdir,
- &local->stub->args.mkdir.loc,
- local->stub->args.mkdir.mode);
- return 0;
- }
- cnt = local->call_count;
-
- for (; i < child_count; i++) {
- if (local->state[i]) {
- STACK_WIND (frame,
- ha_mkdir_lookup_cbk,
- children[i],
- children[i]->fops->lookup,
- &local->stub->args.mkdir.loc,
- 0);
- if (--cnt == 0)
- break;
- }
- }
- return 0;
-}
-
-int32_t
-ha_mkdir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- mode_t mode)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- int child_count = 0, i = 0;
- char *stateino = NULL;
- int32_t op_errno = EINVAL;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
-
- frame->local = local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!frame->local) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->stub = fop_mkdir_stub (frame, ha_mkdir, loc, mode);
- if (!local->stub) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->state = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!local->state) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- memcpy (local->state, pvt->state, child_count);
- local->active = -1;
-
- stateino = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!stateino) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
- for (i = 0; i < child_count; i++) {
- if (local->state[i]) {
- local->call_count++;
- if (local->active == -1)
- local->active = i;
- }
- }
-
- STACK_WIND (frame,
- ha_mkdir_cbk,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->mkdir,
- loc, mode);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL);
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_unlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
- if (ret == 0) {
- STACK_UNWIND (frame, op_ret, op_errno, preparent, postparent);
- }
- return 0;
-}
-
-int32_t
-ha_unlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
-
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_unlink_stub (frame, ha_unlink, loc);
- if (!local->stub) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_unlink_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->unlink,
- loc);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
- int32_t
-ha_rmdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- preparent,
- postparent);
- }
- return 0;
-}
-
-int32_t
-ha_rmdir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- ha_local_t *local = frame->local;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_rmdir_stub (frame, ha_rmdir, loc);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_rmdir_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->rmdir,
- loc);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-
-int
-ha_symlink_lookup_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- dict_t *dict,
- struct iatt *postparent)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- char *stateino = NULL;
- int child_count = 0, i = 0, cnt = 0;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- uint64_t tmp_stateino = 0;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- prev_frame = cookie;
- children = pvt->children;
-
- for (i = 0; i < child_count; i++)
- if (prev_frame->this == children[i])
- break;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno);
- }
- inode_ctx_get (local->stub->args.symlink.loc.inode,
- this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
-
- if (op_ret == 0)
- stateino[i] = 1;
-
- LOCK (&frame->lock);
- cnt = --local->call_count;
- UNLOCK (&frame->lock);
-
- if (cnt == 0) {
- call_stub_t *stub = local->stub;
- GF_FREE (local->state);
- STACK_UNWIND (frame,
- local->op_ret,
- local->op_errno,
- local->stub->args.symlink.loc.inode, &local->buf,
- &local->preparent, &local->postparent);
- call_stub_destroy (stub);
- }
- return 0;
-}
-
-int32_t
-ha_symlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- char *stateino = NULL;
- int child_count = 0, i = 0, cnt = 0;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- uint64_t tmp_stateino = 0;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- prev_frame = cookie;
- children = pvt->children;
-
- for (i = 0; i < child_count; i++)
- if (prev_frame->this == children[i])
- break;
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno);
- }
- inode_ctx_get (local->stub->args.symlink.loc.inode,
- this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
-
- if (op_ret == 0) {
- stateino[i] = 1;
- local->op_ret = 0;
- local->first_success = 1;
- local->buf = *buf;
- local->preparent = *preparent;
- local->postparent = *postparent;
- }
- cnt = --local->call_count;
- for (i = local->active + 1; i < child_count; i++) {
- if (local->state[i])
- break;
- }
-
- if (cnt == 0 || i == child_count) {
- call_stub_t *stub = local->stub;
- GF_FREE (local->state);
- stub = local->stub;
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- local->stub->args.symlink.loc.inode, &local->buf,
- &local->preparent, &local->postparent);
- call_stub_destroy (stub);
- return 0;
- }
-
- local->active = i;
-
- if (local->first_success == 0) {
- STACK_WIND (frame,
- ha_symlink_cbk,
- children[i],
- children[i]->fops->symlink,
- local->stub->args.symlink.linkname,
- &local->stub->args.symlink.loc);
- return 0;
- }
- cnt = local->call_count;
-
- for (; i < child_count; i++) {
- if (local->state[i]) {
- STACK_WIND (frame,
- ha_symlink_lookup_cbk,
- children[i],
- children[i]->fops->lookup,
- &local->stub->args.symlink.loc,
- 0);
- if (--cnt == 0)
- break;
- }
- }
- return 0;
-}
-
-int32_t
-ha_symlink (call_frame_t *frame,
- xlator_t *this,
- const char *linkname,
- loc_t *loc)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- int child_count = 0, i = 0;
- char *stateino = NULL;
- int32_t op_errno = EINVAL;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
-
- frame->local = local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- local->stub = fop_symlink_stub (frame, ha_symlink, linkname, loc);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->state = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!local->state) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- memcpy (local->state, pvt->state, child_count);
- local->active = -1;
-
- stateino = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!stateino) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
-
- for (i = 0; i < child_count; i++) {
- if (local->state[i]) {
- local->call_count++;
- if (local->active == -1) {
- local->active = i;
- }
- }
- }
-
- STACK_WIND (frame,
- ha_symlink_cbk,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->symlink,
- linkname, loc);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL);
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_rename_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *buf,
- struct iatt *preoldparent,
- struct iatt *postoldparent,
- struct iatt *prenewparent,
- struct iatt *postnewparent)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame, op_ret, op_errno, buf, preoldparent,
- postoldparent, prenewparent, postnewparent);
- }
- return 0;
-}
-
-int32_t
-ha_rename (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, oldloc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_rename_stub (frame, ha_rename, oldloc, newloc);
- if (!local->stub) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_rename_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->rename,
- oldloc, newloc);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
-}
-
-int
-ha_link_lookup_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- dict_t *dict,
- struct iatt *postparent)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- char *stateino = NULL;
- int child_count = 0, i = 0, cnt = 0;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- uint64_t tmp_stateino = 0;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- prev_frame = cookie;
- children = pvt->children;
-
- for (i = 0; i < child_count; i++)
- if (prev_frame->this == children[i])
- break;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno);
- }
- inode_ctx_get (local->stub->args.link.newloc.inode,
- this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
-
- if (op_ret == 0)
- stateino[i] = 1;
-
- LOCK (&frame->lock);
- cnt = --local->call_count;
- UNLOCK (&frame->lock);
-
- if (cnt == 0) {
- call_stub_t *stub = local->stub;
- GF_FREE (local->state);
- STACK_UNWIND (frame,
- local->op_ret,
- local->op_errno,
- local->stub->args.link.oldloc.inode, &local->buf,
- &local->preparent, &local->postparent);
- call_stub_destroy (stub);
- }
- return 0;
-}
-
-int32_t
-ha_link_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- char *stateino = NULL;
- int child_count = 0, i = 0, cnt = 0;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- uint64_t tmp_stateino = 0;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- prev_frame = cookie;
- children = pvt->children;
-
- for (i = 0; i < child_count; i++)
- if (prev_frame->this == children[i])
- break;
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno);
- }
- inode_ctx_get (local->stub->args.link.newloc.inode,
- this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
-
- if (op_ret == 0) {
- stateino[i] = 1;
- local->op_ret = 0;
- local->first_success = 1;
- local->buf = *buf;
- local->preparent = *preparent;
- local->postparent = *postparent;
- }
- cnt = --local->call_count;
- for (i = local->active + 1; i < child_count; i++) {
- if (local->state[i])
- break;
- }
-
- if (cnt == 0 || i == child_count) {
- call_stub_t *stub = local->stub;
- GF_FREE (local->state);
- stub = local->stub;
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- local->stub->args.link.oldloc.inode, &local->buf,
- &local->preparent, &local->postparent);
- call_stub_destroy (stub);
- return 0;
- }
-
- local->active = i;
-
- if (local->first_success == 0) {
- STACK_WIND (frame,
- ha_link_cbk,
- children[i],
- children[i]->fops->link,
- &local->stub->args.link.oldloc,
- &local->stub->args.link.newloc);
- return 0;
- }
- cnt = local->call_count;
-
- for (; i < child_count; i++) {
- if (local->state[i]) {
- STACK_WIND (frame,
- ha_link_lookup_cbk,
- children[i],
- children[i]->fops->lookup,
- &local->stub->args.link.newloc,
- 0);
- if (--cnt == 0)
- break;
- }
- }
- return 0;
-}
-
-int32_t
-ha_link (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- int child_count = 0, i = 0;
- char *stateino = NULL;
- int32_t ret = 0, op_errno = 0;
- uint64_t tmp_stateino = 0;
-
- ret = inode_ctx_get (newloc->inode, this, &tmp_stateino);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()");
- }
- stateino = (char *)(long)tmp_stateino;
-
- if (stateino == NULL) {
- gf_log (this->name, GF_LOG_ERROR,
- "newloc->inode's ctx is NULL, returning EINVAL");
- STACK_UNWIND (frame, -1, EINVAL, oldloc->inode, NULL, NULL,
- NULL);
- return 0;
- }
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
-
- frame->local = local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!frame->local) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->stub = fop_link_stub (frame, ha_link, oldloc, newloc);
- if (!local->stub) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->state = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!local->state) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- memcpy (local->state, pvt->state, child_count);
- local->active = -1;
-
- for (i = 0; i < child_count; i++) {
- if (local->state[i]) {
- local->call_count++;
- if (local->active == -1)
- local->active = i;
- }
- }
-
- STACK_WIND (frame,
- ha_link_cbk,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->link,
- oldloc,
- newloc);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-ha_create_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- int i, child_count = 0, cnt = 0, ret = 0;
- char *stateino = NULL;
- hafd_t *hafdp = NULL;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- uint64_t tmp_stateino = 0;
- uint64_t tmp_hafdp = 0;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- prev_frame = cookie;
- children = pvt->children;
-
- ret = inode_ctx_get (local->stub->args.create.loc.inode,
- this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error");
- /* FIXME: handle */
- }
- ret = fd_ctx_get (local->stub->args.create.fd, this, &tmp_hafdp);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error");
- /* FIXME: handle */
- }
- hafdp = (hafd_t *)(long)tmp_hafdp;
-
- for (i = 0; i < child_count; i++) {
- if (prev_frame->this == children[i])
- break;
- }
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.create.loc.path, op_ret, op_errno);
- }
- if (op_ret != -1) {
- stateino[i] = 1;
- hafdp->fdstate[i] = 1;
- if (local->op_ret == -1) {
- local->op_ret = 0;
- local->buf = *buf;
- local->preparent = *preparent;
- local->postparent = *postparent;
- local->first_success = 1;
- }
- local->stub->args.create.flags &= (~O_EXCL);
- }
- LOCK (&frame->lock);
- cnt = --local->call_count;
- UNLOCK (&frame->lock);
-
- for (i = local->active + 1; i < child_count; i++) {
- if (local->state[i])
- break;
- }
-
- if (cnt == 0 || i == child_count) {
- char *state = local->state;
- call_stub_t *stub = local->stub;
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- stub->args.create.fd,
- stub->args.create.loc.inode, &local->buf,
- &local->preparent, &local->postparent);
- GF_FREE (state);
- call_stub_destroy (stub);
- return 0;
- }
- local->active = i;
- cnt = local->call_count;
- for (; i < child_count; i++) {
- if (local->state[i]) {
- STACK_WIND (frame,
- ha_create_cbk,
- children[i],
- children[i]->fops->create,
- &local->stub->args.create.loc,
- local->stub->args.create.flags,
- local->stub->args.create.mode,
- local->stub->args.create.fd);
- if ((local->first_success == 0) || (cnt == 0))
- break;
- }
- }
- return 0;
-}
-
-int32_t
-ha_create (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags,
- mode_t mode, fd_t *fd)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- int i, child_count = 0;
- char *stateino = NULL;
- xlator_t **children = NULL;
- hafd_t *hafdp = NULL;
- int32_t op_errno = EINVAL;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- children = pvt->children;
-
- if (local == NULL) {
- frame->local = local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- local->stub = fop_create_stub (frame, ha_create, loc, flags, mode, fd);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- local->state = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!local->state) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- local->active = -1;
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- memcpy (local->state, pvt->state, child_count);
-
- for (i = 0; i < pvt->child_count; i++) {
- if (local->state[i]) {
- local->call_count++;
- if (local->active == -1)
- local->active = i;
- }
- }
- /* FIXME handle active -1 */
- stateino = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!stateino) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- hafdp = GF_CALLOC (1, sizeof (*hafdp), gf_ha_mt_hafd_t);
- if (!hafdp) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- hafdp->fdstate = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!hafdp->fdstate) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- hafdp->path = gf_strdup(loc->path);
- if (!hafdp->path) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- LOCK_INIT (&hafdp->lock);
- fd_ctx_set (fd, this, (uint64_t)(long)hafdp);
- inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
- }
-
- STACK_WIND (frame,
- ha_create_cbk,
- children[local->active],
- children[local->active]->fops->create,
- loc, flags, mode, fd);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- ha_local_wipe (local);
-
- if (stateino) {
- GF_FREE (stateino);
- stateino = NULL;
- }
-
- if (hafdp) {
- if (hafdp->fdstate) {
- GF_FREE (hafdp->fdstate);
- }
-
- if (hafdp->path) {
- GF_FREE (hafdp->path);
- }
-
- GF_FREE (hafdp);
- }
-
- return 0;
-}
-
- int32_t
-ha_open_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- xlator_t **children = NULL;
- int i = 0, child_count = 0, callcnt = 0, ret = 0;
- call_frame_t *prev_frame = NULL;
- hafd_t *hafdp = NULL;
- uint64_t tmp_hafdp = 0;
-
- local = frame->local;
- pvt = this->private;
- children = pvt->children;
- child_count = pvt->child_count;
- prev_frame = cookie;
-
- ret = fd_ctx_get (local->fd, this, &tmp_hafdp);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()");
- }
- hafdp = (hafd_t *)(long)tmp_hafdp;
-
- for (i = 0; i < child_count; i++)
- if (children[i] == prev_frame->this)
- break;
- LOCK (&frame->lock);
- if (op_ret != -1) {
- hafdp->fdstate[i] = 1;
- local->op_ret = 0;
- }
- if (op_ret == -1 && op_errno != ENOTCONN)
- local->op_errno = op_errno;
- callcnt = --local->call_count;
- UNLOCK (&frame->lock);
-
- if (callcnt == 0) {
- STACK_UNWIND (frame,
- local->op_ret,
- local->op_errno,
- local->fd);
- }
- return 0;
-}
-
-int32_t
-ha_open (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags, fd_t *fd, int wbflags)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- char *stateino = NULL;
- xlator_t **children = NULL;
- int cnt = 0, i, child_count = 0, ret = 0;
- hafd_t *hafdp = NULL;
- uint64_t tmp_stateino = 0;
- int32_t op_errno = ENOMEM;
-
- local = frame->local;
- pvt = this->private;
- children = pvt->children;
- child_count = pvt->child_count;
-
-
- frame->local = local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->fd = fd;
-
- hafdp = GF_CALLOC (1, sizeof (*hafdp), gf_ha_mt_hafd_t);
- if (!hafdp) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- hafdp->fdstate = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!hafdp->fdstate) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- hafdp->path = gf_strdup (loc->path);
- if (!hafdp->path) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- hafdp->active = -1;
- if (pvt->pref_subvol == -1) {
- hafdp->active = fd->inode->ino % child_count;
- }
-
- LOCK_INIT (&hafdp->lock);
- fd_ctx_set (fd, this, (uint64_t)(long)hafdp);
- ret = inode_ctx_get (loc->inode, this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
-
- for (i = 0; i < child_count; i++)
- if (stateino[i])
- cnt++;
- local->call_count = cnt;
- for (i = 0; i < child_count; i++) {
- if (stateino[i]) {
- STACK_WIND (frame,
- ha_open_cbk,
- children[i],
- children[i]->fops->open,
- loc, flags, fd, wbflags);
- if (--cnt == 0)
- break;
- }
- }
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, fd);
- if (hafdp) {
- if (hafdp->fdstate) {
- GF_FREE (hafdp->fdstate);
- hafdp->fdstate = NULL;
- }
-
- if (hafdp->path) {
- GF_FREE (hafdp->path);
- hafdp->path = NULL;
- }
-
- GF_FREE (hafdp);
- }
-
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_readv_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iovec *vector,
- int32_t count,
- struct iatt *stbuf,
- struct iobref *iobref)
-{
- int ret = 0;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- vector,
- count,
- stbuf,
- iobref);
- }
- return 0;
-}
-
-int32_t
-ha_readv (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t offset)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_readv_stub (frame, ha_readv, fd, size, offset);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_readv_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->readv,
- fd,
- size,
- offset);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL, NULL);
-
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_writev_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- int ret = 0;
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- prebuf,
- postbuf);
- }
- return 0;
-}
-
-int32_t
-ha_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t off,
- struct iobref *iobref)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_writev_stub (frame, ha_writev, fd, vector, count, off,
- iobref);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_writev_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->writev,
- fd,
- vector,
- count,
- off,
- iobref);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_flush_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int ret = 0;
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno);
- }
- return 0;
-}
-
-int32_t
-ha_flush (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_flush_stub (frame, ha_flush, fd);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_flush_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->flush,
- fd);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno);
-
- ha_local_wipe (local);
- return 0;
-}
-
-
- int32_t
-ha_fsync_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- int ret = 0;
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno);
- }
- return 0;
-}
-
-int32_t
-ha_fsync (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_fsync_stub (frame, ha_fsync, fd, flags);
- if (!local->stub) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- op_errno = ENOMEM;
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_fsync_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->fsync,
- fd,
- flags);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno);
-
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_fstat_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *buf)
-{
- int ret = 0;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- buf);
- }
- return 0;
-}
-
-int32_t
-ha_fstat (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
-
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_fstat_stub (frame, ha_fstat, fd);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_fstat_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->fstat,
- fd);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL);
-
- ha_local_wipe (local);
- return 0;
-}
-
-int32_t
-ha_opendir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- xlator_t **children = NULL;
- int i = 0, child_count = 0, callcnt = 0, ret = 0;
- call_frame_t *prev_frame = NULL;
- hafd_t *hafdp = NULL;
- uint64_t tmp_hafdp = 0;
-
- local = frame->local;
- pvt = this->private;
- children = pvt->children;
- child_count = pvt->child_count;
- prev_frame = cookie;
-
- ret = fd_ctx_get (local->fd, this, &tmp_hafdp);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()");
- }
- hafdp = (hafd_t *)(long)tmp_hafdp;
-
- for (i = 0; i < child_count; i++)
- if (children[i] == prev_frame->this)
- break;
- LOCK (&frame->lock);
- if (op_ret != -1) {
- hafdp->fdstate[i] = 1;
- local->op_ret = 0;
- }
- if (op_ret == -1 && op_errno != ENOTCONN)
- local->op_errno = op_errno;
- callcnt = --local->call_count;
- UNLOCK (&frame->lock);
-
- if (callcnt == 0) {
- STACK_UNWIND (frame,
- local->op_ret,
- local->op_errno,
- local->fd);
- }
- return 0;
-}
-
-int32_t
-ha_opendir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc, fd_t *fd)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- char *stateino = NULL;
- xlator_t **children = NULL;
- int cnt = 0, i, child_count = 0, ret = 0;
- hafd_t *hafdp = NULL;
- uint64_t tmp_stateino = 0;
- int32_t op_errno = EINVAL;
-
- local = frame->local;
- pvt = this->private;
- children = pvt->children;
- child_count = pvt->child_count;
-
- frame->local = local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->fd = fd;
-
- hafdp = GF_CALLOC (1, sizeof (*hafdp), gf_ha_mt_hafd_t);
- if (!hafdp) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- hafdp->fdstate = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!hafdp->fdstate) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- hafdp->path = gf_strdup (loc->path);
- if (!hafdp->path) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- LOCK_INIT (&hafdp->lock);
- fd_ctx_set (fd, this, (uint64_t)(long)hafdp);
- ret = inode_ctx_get (loc->inode, this, &tmp_stateino);
- stateino = (char *)(long)tmp_stateino;
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
- }
- for (i = 0; i < child_count; i++)
- if (stateino[i])
- cnt++;
- local->call_count = cnt;
- for (i = 0; i < child_count; i++) {
- if (stateino[i]) {
- STACK_WIND (frame,
- ha_opendir_cbk,
- children[i],
- children[i]->fops->opendir,
- loc, fd);
- if (--cnt == 0)
- break;
- }
- }
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL);
- ha_local_wipe (local);
- if (hafdp) {
- if (hafdp->fdstate) {
- GF_FREE (hafdp->fdstate);
- hafdp->fdstate = NULL;
- }
-
- if (hafdp->path) {
- GF_FREE (hafdp->path);
- hafdp->path = NULL;
- }
-
- GF_FREE (hafdp);
- }
- return 0;
-}
-
- int32_t
-ha_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entries,
- int32_t count)
-{
- int ret = 0;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- entries,
- count);
- }
- return 0;
-}
-
-int32_t
-ha_getdents (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t offset,
- int32_t flag)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_getdents_stub (frame, ha_getdents, fd, size, offset,
- flag);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_getdents_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->getdents,
- fd,
- size,
- offset,
- flag);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL, 0);
-
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_setdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int ret = 0;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno);
- }
- return 0;
-}
-
-int32_t
-ha_setdents (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags,
- dir_entry_t *entries,
- int32_t count)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
-
- local->stub = fop_setdents_stub (frame, ha_setdents, fd, flags, entries,
- count);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_setdents_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->setdents,
- fd,
- flags,
- entries,
- count);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno);
-
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_fsyncdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int ret = 0;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno);
- }
- return 0;
-}
-
-int32_t
-ha_fsyncdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_fsyncdir_stub (frame, ha_fsyncdir, fd, flags);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_fsyncdir_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->fsyncdir,
- fd,
- flags);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- ha_local_wipe (local);
- return 0;
-}
-
-
- int32_t
-ha_statfs_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct statvfs *buf)
-{
- ha_local_t *local = NULL;
- ha_private_t *priv = NULL;
-
- local = frame->local;
- if (-1 == op_ret) {
- local->active = (local->active + 1) % priv->child_count;
- local->tries--;
- if (!local->tries)
- goto out;
-
- STACK_WIND (frame, ha_statfs_cbk,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->statfs,
- &local->loc);
- return 0;
- }
-
- out:
- loc_wipe (&local->loc);
- STACK_UNWIND (frame, op_ret, op_errno, buf);
-
- return 0;
-}
-
-int32_t
-ha_statfs (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- ha_private_t *priv = NULL;
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- /* The normal way of handling failover doesn't work here
- * as loc->inode may be null in this case.
- */
- local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- priv = this->private;
- frame->local = local;
- local->active = priv->pref_subvol;
- if (-1 == local->active)
- local->active = 0;
- local->tries = priv->child_count;
- loc_copy (&local->loc, loc);
-
- STACK_WIND (frame, ha_statfs_cbk, HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->statfs, loc);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno, NULL);
- return 0;
-}
-
- int32_t
-ha_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno);
- }
- return 0;
-}
-
-int32_t
-ha_setxattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *dict,
- int32_t flags)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_setxattr_stub (frame, ha_setxattr, loc, dict, flags);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_setxattr_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->setxattr,
- loc,
- dict,
- flags);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno);
-
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_getxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *dict)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- dict);
- }
- return 0;
-}
-
-int32_t
-ha_getxattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_getxattr_stub (frame, ha_getxattr, loc, name);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_getxattr_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->getxattr,
- loc,
- name);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL);
-
- ha_local_wipe (local);
- return 0;
-}
-
-int32_t
-ha_xattrop_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *dict)
-{
- int ret = -1;
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
- if (ret == 0) {
- STACK_UNWIND (frame, op_ret, op_errno, dict);
- }
- return 0;
-}
-
-
-int32_t
-ha_xattrop (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- gf_xattrop_flags_t flags,
- dict_t *dict)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
-
- local->stub = fop_xattrop_stub (frame, ha_xattrop, loc, flags, dict);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_xattrop_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->xattrop,
- loc,
- flags,
- dict);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, dict);
-
- ha_local_wipe (local);
- return 0;
-}
-
-int32_t
-ha_fxattrop_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *dict)
-{
- int ret = -1;
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
- if (ret == 0)
- STACK_UNWIND (frame, op_ret, op_errno, dict);
- return 0;
-}
-
-int32_t
-ha_fxattrop (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- gf_xattrop_flags_t flags,
- dict_t *dict)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_fxattrop_stub (frame, ha_fxattrop, fd, flags, dict);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_fxattrop_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->fxattrop,
- fd,
- flags,
- dict);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, dict);
-
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_removexattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int ret = -1;
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno);
- }
- return 0;
-}
-
-int32_t
-ha_removexattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
-
- local->stub = fop_removexattr_stub (frame, ha_removexattr, loc, name);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_removexattr_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->removexattr,
- loc,
- name);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno);
-
- ha_local_wipe (local);
- return 0;
-}
-
-int32_t
-ha_lk_setlk_unlck_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct gf_flock *lock)
-{
- ha_local_t *local = NULL;
- int cnt = 0;
- call_stub_t *stub = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- cnt = --local->call_count;
- if (op_ret == 0)
- local->op_ret = 0;
- UNLOCK (&frame->lock);
-
- if (cnt == 0) {
- stub = local->stub;
- GF_FREE (local->state);
- if (stub->args.lk.lock.l_type == F_UNLCK) {
- STACK_UNWIND (frame, local->op_ret, local->op_errno, &stub->args.lk.lock);
- } else {
- STACK_UNWIND (frame, -1, EIO, NULL);
- }
- call_stub_destroy (stub);
- }
- return 0;
-}
-
-int32_t
-ha_lk_setlk_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct gf_flock *lock)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- xlator_t **children = NULL;
- int i = 0, cnt = 0, j = 0;
- int child_count = 0;
- call_frame_t *prev_frame = NULL;
- char *state = NULL;
-
- local = frame->local;
- pvt = this->private;
- children = pvt->children;
- child_count = pvt->child_count;
- prev_frame = cookie;
- state = local->state;
-
- if (op_ret == 0)
- local->op_ret = 0;
-
- if ((op_ret == 0) || (op_ret == -1 && op_errno == ENOTCONN)) {
- for (i = 0; i < child_count; i++) {
- if (prev_frame->this == cookie)
- break;
- }
- i++;
- for (; i < child_count; i++) {
- if (local->state[i])
- break;
- }
- if (i == child_count) {
- call_stub_t *stub = local->stub;
- GF_FREE (local->state);
- STACK_UNWIND (frame, 0, op_errno, &stub->args.lk.lock);
- call_stub_destroy (stub);
- return 0;
- }
- STACK_WIND (frame,
- ha_lk_setlk_cbk,
- children[i],
- children[i]->fops->lk,
- local->stub->args.lk.fd,
- local->stub->args.lk.cmd,
- &local->stub->args.lk.lock);
- return 0;
- } else {
- for (i = 0; i < child_count; i++) {
- if (prev_frame->this == cookie)
- break;
- }
- cnt = 0;
- for (j = 0; j < i; j++) {
- if (state[i])
- cnt++;
- }
- if (cnt) {
- struct gf_flock lock;
- lock = local->stub->args.lk.lock;
- for (i = 0; i < child_count; i++) {
- if (state[i]) {
- STACK_WIND (frame,
- ha_lk_setlk_unlck_cbk,
- children[i],
- children[i]->fops->lk,
- local->stub->args.lk.fd,
- local->stub->args.lk.cmd,
- &lock);
- if (--cnt == 0)
- break;
- }
- }
- return 0;
- } else {
- GF_FREE (local->state);
- call_stub_destroy (local->stub);
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- lock);
- return 0;
- }
- }
-}
-
-int32_t
-ha_lk_getlk_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct gf_flock *lock)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- fd_t *fd = NULL;
- int child_count = 0, i = 0;
- xlator_t **children = NULL;
- call_frame_t *prev_frame = NULL;
-
- local = frame->local;
- pvt = this->private;
- fd = local->stub->args.lk.fd;
- child_count = pvt->child_count;
- children = pvt->children;
- prev_frame = cookie;
-
- if (op_ret == 0) {
- GF_FREE (local->state);
- call_stub_destroy (local->stub);
- STACK_UNWIND (frame, 0, 0, lock);
- return 0;
- }
-
- for (i = 0; i < child_count; i++) {
- if (prev_frame->this == children[i])
- break;
- }
-
- for (; i < child_count; i++) {
- if (local->state[i])
- break;
- }
-
- if (i == child_count) {
- GF_FREE (local->state);
- call_stub_destroy (local->stub);
- STACK_UNWIND (frame, op_ret, op_errno, lock);
- return 0;
- }
-
- STACK_WIND (frame,
- ha_lk_getlk_cbk,
- children[i],
- children[i]->fops->lk,
- fd,
- local->stub->args.lk.cmd,
- &local->stub->args.lk.lock);
- return 0;
-}
-
-int32_t
-ha_lk (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t cmd,
- struct gf_flock *lock)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- hafd_t *hafdp = NULL;
- char *state = NULL;
- int child_count = 0, i = 0, cnt = 0, ret = 0;
- xlator_t **children = NULL;
- uint64_t tmp_hafdp = 0;
- int32_t op_errno = EINVAL;
-
- local = frame->local;
- pvt = this->private;
- child_count = pvt->child_count;
- children = pvt->children;
- ret = fd_ctx_get (fd, this, &tmp_hafdp);
- if (ret < 0)
- gf_log (this->name, GF_LOG_ERROR, "fd_ctx_get failed");
-
- if (local == NULL) {
- local = frame->local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- local->active = -1;
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- }
- hafdp = (hafd_t *)(long)tmp_hafdp;
-
- if (local->active == -1) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- local->stub = fop_lk_stub (frame, ha_lk, fd, cmd, lock);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- local->state = GF_CALLOC (1, child_count, gf_ha_mt_char);
- if (!local->state) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- state = hafdp->fdstate;
- LOCK (&hafdp->lock);
- memcpy (local->state, state, child_count);
- UNLOCK (&hafdp->lock);
- if (cmd == F_GETLK) {
- for (i = 0; i < child_count; i++) {
- if (local->state[i])
- break;
- }
- STACK_WIND (frame,
- ha_lk_getlk_cbk,
- children[i],
- children[i]->fops->lk,
- fd,
- cmd,
- lock);
- } else if (cmd == F_SETLK && lock->l_type == F_UNLCK) {
- for (i = 0; i < child_count; i++) {
- if (local->state[i])
- local->call_count++;
- }
- cnt = local->call_count;
- for (i = 0; i < child_count; i++) {
- if (local->state[i]) {
- STACK_WIND (frame,
- ha_lk_setlk_unlck_cbk,
- children[i],
- children[i]->fops->lk,
- fd, cmd, lock);
- if (--cnt == 0)
- break;
- }
- }
- } else {
- for (i = 0; i < child_count; i++) {
- if (local->state[i])
- break;
- }
- STACK_WIND (frame,
- ha_lk_setlk_cbk,
- children[i],
- children[i]->fops->lk,
- fd,
- cmd,
- lock);
- }
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL);
-
- ha_local_wipe (local);
- return 0;
-}
-
- int32_t
-ha_inode_entry_lk_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno);
- }
- return 0;
-}
-
-int32_t
-ha_inodelk (call_frame_t *frame,
- xlator_t *this,
- const char *volume,
- loc_t *loc,
- int32_t cmd,
- struct gf_flock *lock)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_inodelk_stub (frame, ha_inodelk, volume,
- loc, cmd, lock);
- STACK_WIND_COOKIE (frame,
- ha_inode_entry_lk_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->inodelk,
- volume,
- loc,
- cmd,
- lock);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno);
- return 0;
-}
-
-int32_t
-ha_entrylk (call_frame_t *frame,
- xlator_t *this,
- const char *volume,
- loc_t *loc,
- const char *basename,
- entrylk_cmd cmd,
- entrylk_type type)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_entrylk_stub (frame, ha_entrylk, volume,
- loc, basename, cmd, type);
- STACK_WIND_COOKIE (frame,
- ha_inode_entry_lk_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->entrylk,
- volume, loc, basename, cmd, type);
- return 0;
-err:
- STACK_UNWIND (frame, -1, op_errno);
- return 0;
-}
-
- int32_t
-ha_checksum_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- uint8_t *file_checksum,
- uint8_t *dir_checksum)
-{
- int ret = -1;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
- if (ret == 0) {
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- file_checksum,
- dir_checksum);
- }
- return 0;
-}
-
-int32_t
-ha_checksum (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flag)
-{
- int op_errno = 0;
- ha_local_t *local = NULL;
-
- op_errno = ha_alloc_init_inode (frame, loc->inode);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- local->stub = fop_checksum_stub (frame, ha_checksum, loc, flag);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- STACK_WIND_COOKIE (frame,
- ha_checksum_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->checksum,
- loc,
- flag);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- ha_local_wipe (local);
- return 0;
-}
-
-int32_t
-ha_common_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries)
-{
- int ret = 0;
-
- ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
- if (ret == 0)
- STACK_UNWIND (frame, op_ret, op_errno, entries);
- return 0;
-}
-
-int32_t
-ha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t off);
-
-int32_t
-ha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t off);
-int32_t
-ha_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t off, int whichop)
-{
- ha_local_t *local = NULL;
- int op_errno = 0;
-
- op_errno = ha_alloc_init_fd (frame, fd);
- if (op_errno < 0) {
- op_errno = -op_errno;
- goto err;
- }
- local = frame->local;
- if (whichop == GF_FOP_READDIR)
- local->stub = fop_readdir_stub (frame, ha_readdir, fd, size,
- off);
- else
- local->stub = fop_readdirp_stub (frame, ha_readdirp, fd, size,
- off);
- if (!local->stub) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- if (whichop == GF_FOP_READDIR)
- STACK_WIND_COOKIE (frame, ha_common_readdir_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->readdir,
- fd, size, off);
- else
- STACK_WIND_COOKIE (frame, ha_common_readdir_cbk,
- (void *)(long)local->active,
- HA_ACTIVE_CHILD(this, local),
- HA_ACTIVE_CHILD(this, local)->fops->readdirp,
- fd, size, off);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-
- ha_local_wipe (local);
- return 0;
-}
-
-
-int32_t
-ha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t off)
-{
- ha_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR);
- return 0;
-}
-
-int32_t
-ha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t off)
-{
- ha_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP);
- return 0;
-}
-
-/* Management operations */
-
- int32_t
-ha_stats_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct xlator_stats *stats)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- int i = 0;
-
- local = frame->local;
- pvt = this->private;
- prev_frame = cookie;
- children = pvt->children;
-
- if (op_ret == -1 && op_errno == ENOTCONN) {
- for (i = 0; i < pvt->child_count; i++) {
- if (prev_frame->this == children[i])
- break;
- }
- i++;
- for (; i < pvt->child_count; i++) {
- if (pvt->state[i])
- break;
- }
-
- if (i == pvt->child_count) {
- STACK_UNWIND (frame, -1, ENOTCONN, NULL);
- return 0;
- }
- STACK_WIND (frame,
- ha_stats_cbk,
- children[i],
- children[i]->mops->stats,
- local->flags);
- return 0;
- }
-
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- stats);
- return 0;
-}
-
-int32_t
-ha_stats (call_frame_t *frame,
- xlator_t *this,
- int32_t flags)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- xlator_t **children = NULL;
- int i = 0;
- int32_t op_errno = EINVAL;
-
- local = frame->local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- pvt = this->private;
- children = pvt->children;
- for (i = 0; i < pvt->child_count; i++) {
- if (pvt->state[i])
- break;
- }
-
- if (i == pvt->child_count) {
- op_errno = ENOTCONN;
- goto err;
- }
- local->flags = flags;
-
- STACK_WIND (frame,
- ha_stats_cbk,
- children[i],
- children[i]->mops->stats,
- flags);
- return 0;
-
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-
- ha_local_wipe (local);
- return 0;
-
-}
-
-
-int32_t
-ha_getspec_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- char *spec_data)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- call_frame_t *prev_frame = NULL;
- xlator_t **children = NULL;
- int i = 0;
-
- local = frame->local;
- pvt = this->private;
- prev_frame = cookie;
- children = pvt->children;
-
- if (op_ret == -1 && op_errno == ENOTCONN) {
- for (i = 0; i < pvt->child_count; i++) {
- if (prev_frame->this == children[i])
- break;
- }
- i++;
- for (; i < pvt->child_count; i++) {
- if (pvt->state[i])
- break;
- }
-
- if (i == pvt->child_count) {
- STACK_UNWIND (frame, -1, ENOTCONN, NULL);
- return 0;
- }
- STACK_WIND (frame,
- ha_getspec_cbk,
- children[i],
- children[i]->mops->getspec,
- local->pattern,
- local->flags);
- return 0;
- }
-
- STACK_UNWIND (frame,
- op_ret,
- op_errno,
- spec_data);
- return 0;
-}
-
-int32_t
-ha_getspec (call_frame_t *frame,
- xlator_t *this,
- const char *key,
- int32_t flags)
-{
- ha_local_t *local = NULL;
- ha_private_t *pvt = NULL;
- xlator_t **children = NULL;
- int i = 0;
- int32_t op_errno = EINVAL;
-
- local = frame->local = GF_CALLOC (1, sizeof (*local),
- gf_ha_mt_ha_local_t);
- if (!local) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto err;
- }
-
- pvt = this->private;
- children = pvt->children;
-
- for (i = 0; i < pvt->child_count; i++) {
- if (pvt->state[i])
- break;
- }
-
- if (i == pvt->child_count) {
- op_errno = ENOTCONN;
- goto err;
- }
- local->flags = flags;
- local->pattern = (char *)key;
-
- STACK_WIND (frame,
- ha_getspec_cbk,
- children[i],
- children[i]->mops->getspec,
- key, flags);
- return 0;
-err:
- local = frame->local;
- frame->local = NULL;
-
- STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-
- ha_local_wipe (local);
- return 0;
-
-}
-
-int32_t
-ha_closedir (xlator_t *this,
- fd_t *fd)
-{
- hafd_t *hafdp = NULL;
- int op_errno = 0;
- uint64_t tmp_hafdp = 0;
-
- op_errno = fd_ctx_del (fd, this, &tmp_hafdp);
- if (op_errno != 0) {
- gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error");
- return 0;
- }
- hafdp = (hafd_t *)(long)tmp_hafdp;
-
- GF_FREE (hafdp->fdstate);
- GF_FREE (hafdp->path);
- LOCK_DESTROY (&hafdp->lock);
- return 0;
-}
-
-int32_t
-ha_close (xlator_t *this,
- fd_t *fd)
-{
- hafd_t *hafdp = NULL;
- int op_errno = 0;
- uint64_t tmp_hafdp = 0;
-
- op_errno = fd_ctx_del (fd, this, &tmp_hafdp);
- if (op_errno != 0) {
- gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error");
- return 0;
- }
- hafdp = (hafd_t *)(long)tmp_hafdp;
-
- GF_FREE (hafdp->fdstate);
- GF_FREE (hafdp->path);
- LOCK_DESTROY (&hafdp->lock);
- return 0;
-}
-
-/* notify */
-int32_t
-notify (xlator_t *this,
- int32_t event,
- void *data,
- ...)
-{
- ha_private_t *pvt = NULL;
- int32_t i = 0, upcnt = 0;
-
- pvt = this->private;
- if (pvt == NULL) {
- gf_log (this->name, GF_LOG_DEBUG, "got notify before init()");
- return 0;
- }
-
- switch (event)
- {
- case GF_EVENT_CHILD_DOWN:
- {
- for (i = 0; i < pvt->child_count; i++) {
- if (data == pvt->children[i])
- break;
- }
- gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_DOWN from %s", pvt->children[i]->name);
- pvt->state[i] = 0;
- for (i = 0; i < pvt->child_count; i++) {
- if (pvt->state[i])
- break;
- }
- if (i == pvt->child_count) {
- default_notify (this, event, data);
- }
- }
- break;
- case GF_EVENT_CHILD_UP:
- {
- for (i = 0; i < pvt->child_count; i++) {
- if (data == pvt->children[i])
- break;
- }
-
- gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_UP from %s", pvt->children[i]->name);
-
- pvt->state[i] = 1;
-
- for (i = 0; i < pvt->child_count; i++) {
- if (pvt->state[i])
- upcnt++;
- }
-
- if (upcnt == 1) {
- default_notify (this, event, data);
- }
- }
- break;
-
- default:
- {
- default_notify (this, event, data);
- }
- }
-
- return 0;
-}
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_ha_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-
- return ret;
-}
-
-int
-init (xlator_t *this)
-{
- ha_private_t *pvt = NULL;
- xlator_list_t *trav = NULL;
- int count = 0, ret = 0;
-
-
- if (!this->children) {
- gf_log (this->name,GF_LOG_ERROR,
- "FATAL: ha should have one or more child defined");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile ");
- }
-
- trav = this->children;
- pvt = GF_CALLOC (1, sizeof (ha_private_t), gf_ha_mt_ha_private_t);
-
- ret = dict_get_int32 (this->options, "preferred-subvolume",
- &pvt->pref_subvol);
- if (ret < 0) {
- pvt->pref_subvol = -1;
- }
-
- trav = this->children;
- while (trav) {
- count++;
- trav = trav->next;
- }
-
- pvt->child_count = count;
- pvt->children = GF_CALLOC (count, sizeof (xlator_t*),
- gf_ha_mt_xlator_t);
-
- trav = this->children;
- count = 0;
- while (trav) {
- pvt->children[count] = trav->xlator;
- count++;
- trav = trav->next;
- }
-
- pvt->state = GF_CALLOC (1, count, gf_ha_mt_char);
- this->private = pvt;
- return 0;
-}
-
-void
-fini (xlator_t *this)
-{
- ha_private_t *priv = NULL;
- priv = this->private;
- GF_FREE (priv);
- return;
-}
-
-
-struct xlator_fops fops = {
- .lookup = ha_lookup,
- .stat = ha_stat,
- .readlink = ha_readlink,
- .mknod = ha_mknod,
- .mkdir = ha_mkdir,
- .unlink = ha_unlink,
- .rmdir = ha_rmdir,
- .symlink = ha_symlink,
- .rename = ha_rename,
- .link = ha_link,
- .truncate = ha_truncate,
- .create = ha_create,
- .open = ha_open,
- .readv = ha_readv,
- .writev = ha_writev,
- .statfs = ha_statfs,
- .flush = ha_flush,
- .fsync = ha_fsync,
- .setxattr = ha_setxattr,
- .getxattr = ha_getxattr,
- .removexattr = ha_removexattr,
- .opendir = ha_opendir,
- .readdir = ha_readdir,
- .readdirp = ha_readdirp,
- .getdents = ha_getdents,
- .fsyncdir = ha_fsyncdir,
- .access = ha_access,
- .ftruncate = ha_ftruncate,
- .fstat = ha_fstat,
- .lk = ha_lk,
- .setdents = ha_setdents,
- .lookup_cbk = ha_lookup_cbk,
- .checksum = ha_checksum,
- .xattrop = ha_xattrop,
- .fxattrop = ha_fxattrop,
- .setattr = ha_setattr,
- .fsetattr = ha_fsetattr,
-};
-
-struct xlator_cbks cbks = {
- .release = ha_close,
- .releasedir = ha_closedir,
- .forget = ha_forget,
-};
diff --git a/xlators/cluster/ha/src/ha.h b/xlators/cluster/ha/src/ha.h
deleted file mode 100644
index d6a519fa8b6..00000000000
--- a/xlators/cluster/ha/src/ha.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __HA_H_
-#define __HA_H_
-
-#include "ha-mem-types.h"
-
-typedef struct {
- call_stub_t *stub;
- int32_t op_ret, op_errno;
- int32_t active, tries, revalidate, revalidate_error;
- int32_t call_count;
- char *state, *pattern;
- dict_t *dict;
- loc_t loc;
- struct iatt buf;
- struct iatt postparent;
- struct iatt preparent;
- fd_t *fd;
- inode_t *inode;
- int32_t flags;
- int32_t first_success;
-} ha_local_t;
-
-typedef struct {
- char *state;
- xlator_t **children;
- int child_count, pref_subvol;
-} ha_private_t;
-
-typedef struct {
- char *fdstate;
- char *path;
- gf_lock_t lock;
- int active;
-} hafd_t;
-
-#define HA_ACTIVE_CHILD(this, local) (((ha_private_t *)this->private)->children[local->active])
-
-extern int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd);
-
-extern int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) ;
-
-extern int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode);
-
-#endif
diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am
deleted file mode 100644
index 26e19137a8b..00000000000
--- a/xlators/cluster/map/src/Makefile.am
+++ /dev/null
@@ -1,15 +0,0 @@
-xlator_LTLIBRARIES = map.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster
-
-map_la_LDFLAGS = -module -avoidversion
-
-map_la_SOURCES = map.c map-helper.c
-map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = map.h
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES =
-
diff --git a/xlators/cluster/map/src/map-helper.c b/xlators/cluster/map/src/map-helper.c
deleted file mode 100644
index 24ab6eb5060..00000000000
--- a/xlators/cluster/map/src/map-helper.c
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- Copyright (c) 2009-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "map.h"
-
-
-xlator_t *
-map_subvol_next (xlator_t *this, xlator_t *prev)
-{
- map_private_t *priv = NULL;
- xlator_t *next = NULL;
- int i = 0;
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (priv->xlarray[i].xl == prev) {
- if ((i + 1) < priv->child_count)
- next = priv->xlarray[i + 1].xl;
- break;
- }
- }
-
- return next;
-}
-
-int
-map_subvol_cnt (xlator_t *this, xlator_t *subvol)
-{
- int i = 0;
- int ret = -1;
- map_private_t *priv = NULL;
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (subvol == priv->xlarray[i].xl) {
- ret = i;
- break;
- }
- }
-
- return ret;
-}
-
-int
-map_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
-{
- map_private_t *priv = NULL;
- int cnt = 0;
- int max = 0;
- uint64_t y = 0;
-
- if (x == ((uint64_t) -1)) {
- y = (uint64_t) -1;
- goto out;
- }
-
- priv = this->private;
-
- max = priv->child_count;
- cnt = map_subvol_cnt (this, subvol);
-
- y = ((x * max) + cnt);
-
-out:
- if (y_p)
- *y_p = y;
-
- return 0;
-}
-
-
-int
-map_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
- uint64_t *x_p)
-{
- int cnt = 0;
- int max = 0;
- uint64_t x = 0;
- xlator_t *subvol = 0;
- map_private_t *priv = NULL;
-
- priv = this->private;
- max = priv->child_count;
-
- cnt = y % max;
- x = y / max;
-
- subvol = priv->xlarray[cnt].xl;
-
- if (subvol_p)
- *subvol_p = subvol;
-
- if (x_p)
- *x_p = x;
-
- return 0;
-}
-
-
-xlator_t *
-get_mapping_subvol_from_path (xlator_t *this, const char *path)
-{
- map_private_t *priv = NULL;
- struct map_pattern *map = NULL;
-
- /* To make sure we handle '/' properly */
- if (!strcmp (path, "/"))
- return NULL;
-
- priv = this->private;
-
- map = priv->map;
- while (map) {
- if (!strncmp (map->directory, path, map->dir_len)) {
- if ((path[map->dir_len] == '/') ||
- (path[map->dir_len] == '\0')) {
- return map->xl;
- }
- }
-
- map = map->next;
- }
-
- return priv->default_xl;
-}
-
-xlator_t *
-get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode)
-{
- uint64_t subvol = 0;
- int ret = -1;
-
- ret = inode_ctx_get (inode, this, &subvol);
- if (ret != 0)
- return NULL;
-
- return (xlator_t *)(long)subvol;
-}
-
-int
-check_multiple_volume_entry (xlator_t *this,
- xlator_t *subvol)
-{
- int ret = -1;
- int idx = 0;
- map_private_t *priv = NULL;
-
- priv = this->private;
-
- for (idx = 0; idx < priv->child_count; idx++) {
- if (priv->xlarray[idx].xl == subvol) {
- if (priv->xlarray[idx].mapped) {
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume '%s' is already mapped",
- subvol->name);
- goto out;
- }
- priv->xlarray[idx].mapped = 1;
- ret = 0;
- goto out;
- }
- }
-
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume '%s' is not found",
- subvol->name);
-
- out:
- return ret;
-}
-
-int
-verify_dir_and_assign_subvol (xlator_t *this,
- const char *directory,
- const char *subvol)
-{
- int default_flag = 0;
- int ret = -1;
- int idx = 0;
- map_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- struct map_pattern *tmp_map = NULL;
-
- priv = this->private;
-
- /* check if directory is valid, ie, its a top level dir, and
- * not includes a '*' in it.
- */
- if (!strcmp ("*", directory)) {
- default_flag = 1;
- } else {
- if (directory[0] != '/') {
- gf_log (this->name, GF_LOG_ERROR,
- "map takes absolute path, starting with '/'. "
- "not '%s'", directory);
- goto out;
- }
- for (idx = 1; idx < (strlen (directory) - 1); idx++) {
- if (directory[idx] == '/') {
- gf_log (this->name, GF_LOG_ERROR,
- "map takes only top level directory, "
- "not '%s'", directory);
- goto out;
- }
- }
- }
-
- /* Assign proper subvolume */
- trav = this->children;
- while (trav) {
- if (!strcmp (trav->xlator->name, subvol)) {
-
- /* Check if there is another directory for
- * same volume, if yes, return error.
- */
- ret = check_multiple_volume_entry (this,
- trav->xlator);
- if (ret != 0) {
- goto out;
- }
-
- ret = 0;
- if (default_flag) {
- if (priv->default_xl) {
- ret = -1;
- gf_log (this->name, GF_LOG_ERROR,
- "'*' specified more than "
- "once. don't confuse me!!!");
- }
-
- priv->default_xl = trav->xlator;
- goto out;
- }
-
- tmp_map = GF_CALLOC (1, sizeof (struct map_pattern),
- gf_map_mt_map_pattern);
- tmp_map->xl = trav->xlator;
- tmp_map->dir_len = strlen (directory);
-
- /* make sure that the top level directory starts
- * with '/' and ends without '/'
- */
- tmp_map->directory = gf_strdup (directory);
- if (directory[tmp_map->dir_len - 1] == '/') {
- tmp_map->dir_len--;
- }
-
- if (!priv->map)
- priv->map = tmp_map;
- else {
- struct map_pattern *trav_map = NULL;
- trav_map = priv->map;
- while (trav_map->next)
- trav_map = trav_map->next;
- trav_map->next = tmp_map;
- }
-
- goto out;
- }
-
- trav = trav->next;
- }
-
- gf_log (this->name, GF_LOG_ERROR,
- "map volume '%s' is not proper subvolume", subvol);
-
- out:
- return ret;
-}
-
-int
-assign_default_subvol (xlator_t *this, const char *default_xl)
-{
- int ret = -1;
- map_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
-
- priv = this->private;
- trav = this->children;
-
- while (trav) {
- if (!strcmp (trav->xlator->name, default_xl)) {
- ret = check_multiple_volume_entry (this,
- trav->xlator);
- if (ret != 0) {
- goto out;
- }
- if (priv->default_xl)
- gf_log (this->name, GF_LOG_WARNING,
- "default-volume option provided, "
- "overriding earlier '*' option");
- priv->default_xl = trav->xlator;
- return 0;
- }
- trav = trav->next;
- }
-
- gf_log (this->name, GF_LOG_ERROR,
- "default-volume value is not an valid subvolume. check again");
- out:
- return -1;
-}
-
-void
-verify_if_all_subvolumes_got_used (xlator_t *this)
-{
- int idx = 0;
- map_private_t *priv = NULL;
-
- priv = this->private;
-
- for (idx = 0; idx < priv->child_count; idx++) {
- if (!priv->xlarray[idx].mapped) {
- if (!priv->default_xl) {
- priv->default_xl = priv->xlarray[idx].xl;
- priv->xlarray[idx].mapped = 1;
- } else {
- gf_log (this->name, GF_LOG_WARNING,
- "subvolume '%s' is not mapped to "
- "any directory",
- priv->xlarray[idx].xl->name);
- }
- }
- }
-
- if (!priv->default_xl) {
- gf_log (this->name, GF_LOG_WARNING,
- "default subvolume not specified, filesystem "
- "may not work properly. Check 'map' translator "
- "documentation for more info");
- }
-
- return ;
-}
diff --git a/xlators/cluster/map/src/map-mem-types.h b/xlators/cluster/map/src/map-mem-types.h
deleted file mode 100644
index 23c798deaf3..00000000000
--- a/xlators/cluster/map/src/map-mem-types.h
+++ /dev/null
@@ -1,35 +0,0 @@
-
-/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-#ifndef __MAP_MEM_TYPES_H__
-#define __MAP_MEM_TYPES_H__
-
-#include "mem-types.h"
-
-enum gf_map_mem_types_ {
- gf_map_mt_map_private_t = gf_common_mt_end + 1,
- gf_map_mt_map_local_t,
- gf_map_mt_map_xlator_array,
- gf_map_mt_map_pattern,
- gf_map_mt_end
-};
-#endif
-
diff --git a/xlators/cluster/map/src/map.c b/xlators/cluster/map/src/map.c
deleted file mode 100644
index 73186e8cb3a..00000000000
--- a/xlators/cluster/map/src/map.c
+++ /dev/null
@@ -1,2577 +0,0 @@
-/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "map.h"
-
-/* TODO :
- * -> support for 'get' 'put' API in through xattrs.
- * -> define the behavior of notify()
- */
-
-static int32_t
-map_stat_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *buf)
-
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, buf);
- return 0;
-}
-
-static int32_t
-map_setattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *statpre,
- struct iatt *statpost)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, statpre->ia_ino, &statpre->ia_ino);
- map_itransform (this, prev->this, statpost->ia_ino, &statpost->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, statpre, statpost);
- return 0;
-}
-
-static int32_t
-map_fsetattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *statpre,
- struct iatt *statpost)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, statpre->ia_ino, &statpre->ia_ino);
- map_itransform (this, prev->this, statpost->ia_ino, &statpost->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, statpre, statpost);
- return 0;
-}
-
-static int32_t
-map_truncate_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, postbuf->ia_ino, &postbuf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
-
-static int32_t
-map_ftruncate_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, postbuf->ia_ino, &postbuf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
-
-
-static int32_t
-map_access_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-static int32_t
-map_readlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- const char *path,
- struct iatt *sbuf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, path, sbuf);
- return 0;
-}
-
-static int32_t
-map_unlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- STACK_UNWIND (frame, op_ret, op_errno, preparent, postparent);
- return 0;
-}
-
-static int32_t
-map_rmdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- STACK_UNWIND (frame, op_ret, op_errno, preparent, postparent);
- return 0;
-}
-
-
-static int32_t
-map_rename_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *buf,
- struct iatt *preoldparent,
- struct iatt *postoldparent,
- struct iatt *prenewparent,
- struct iatt *postnewparent)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, buf);
- return 0;
-}
-
-static int32_t
-map_link_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
- return 0;
-}
-
-static int32_t
-map_open_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd)
-{
- STACK_UNWIND (frame, op_ret, op_errno, fd);
- return 0;
-}
-
-static int32_t
-map_readv_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iovec *vector,
- int32_t count,
- struct iatt *stbuf,
- struct iobref *iobref)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref);
- return 0;
-}
-
-static int32_t
-map_writev_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, postbuf->ia_ino, &postbuf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
-
-static int32_t
-map_flush_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-
-static int32_t
-map_fsync_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
-
-
-static int32_t
-map_fstat_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *buf)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, buf);
- return 0;
-}
-
-
-static int32_t
-map_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entries,
- int32_t count)
-{
- STACK_UNWIND (frame, op_ret, op_errno, entries, count);
- return 0;
-}
-
-
-static int32_t
-map_setdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-static int32_t
-map_fsyncdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-
-static int32_t
-map_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-
-static int32_t
-map_fsetxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-
-static int32_t
-map_fgetxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *dict)
-{
- STACK_UNWIND (frame, op_ret, op_errno, dict);
- return 0;
-}
-
-
-
-static int32_t
-map_getxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *dict)
-{
- STACK_UNWIND (frame, op_ret, op_errno, dict);
- return 0;
-}
-
-int32_t
-map_xattrop_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *dict)
-{
- STACK_UNWIND (frame, op_ret, op_errno, dict);
- return 0;
-}
-
-int32_t
-map_fxattrop_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *dict)
-{
- STACK_UNWIND (frame, op_ret, op_errno, dict);
- return 0;
-}
-
-static int32_t
-map_removexattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-
-static int32_t
-map_lk_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct gf_flock *lock)
-{
- STACK_UNWIND (frame, op_ret, op_errno, lock);
- return 0;
-}
-
-
-static int32_t
-map_inodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-
-
-static int32_t
-map_finodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-
-static int32_t
-map_entrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-
-static int32_t
-map_fentrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-static int32_t
-map_newentry_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
- return 0;
-
-}
-
-
-static int32_t
-map_create_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
- return 0;
-}
-
-
-/*
- * map_normalize_stats -
- */
-void
-map_normalize_stats (struct statvfs *buf,
- unsigned long bsize,
- unsigned long frsize)
-{
- double factor;
-
- if (buf->f_bsize != bsize) {
- factor = ((double) buf->f_bsize) / bsize;
- buf->f_bsize = bsize;
- buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree);
- buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail);
- }
-
- if (buf->f_frsize != frsize) {
- factor = ((double) buf->f_frsize) / frsize;
- buf->f_frsize = frsize;
- buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks);
- }
-}
-
-
-int32_t
-map_statfs_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct statvfs *stbuf)
-{
- struct statvfs *dict_buf = NULL;
- map_local_t *local = NULL;
- int this_call_cnt = 0;
- unsigned long bsize;
- unsigned long frsize;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- this_call_cnt = --local->call_count;
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto unlock;
- }
- local->op_ret = 0;
-
- /* when a call is successfull, add it to local->dict */
- dict_buf = &local->statvfs;
-
- if (dict_buf->f_bsize != 0) {
- bsize = max (dict_buf->f_bsize,
- stbuf->f_bsize);
-
- frsize = max (dict_buf->f_frsize,
- stbuf->f_frsize);
- map_normalize_stats(dict_buf, bsize, frsize);
- map_normalize_stats(stbuf, bsize, frsize);
- } else {
- dict_buf->f_bsize = stbuf->f_bsize;
- dict_buf->f_frsize = stbuf->f_frsize;
- }
-
- dict_buf->f_blocks += stbuf->f_blocks;
- dict_buf->f_bfree += stbuf->f_bfree;
- dict_buf->f_bavail += stbuf->f_bavail;
- dict_buf->f_files += stbuf->f_files;
- dict_buf->f_ffree += stbuf->f_ffree;
- dict_buf->f_favail += stbuf->f_favail;
- dict_buf->f_fsid = stbuf->f_fsid;
- dict_buf->f_flag = stbuf->f_flag;
- dict_buf->f_namemax = stbuf->f_namemax;
- }
-unlock:
- UNLOCK (&frame->lock);
-
- if (!this_call_cnt) {
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->statvfs);
- }
-
- return 0;
-}
-
-int32_t
-map_single_lookup_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- dict_t *dict,
- struct iatt *postparent)
-{
- call_frame_t *prev = NULL;
- prev = cookie;
-
- map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino);
-
- STACK_UNWIND (frame, op_ret, op_errno, inode, buf, dict);
-
- return 0;
-}
-
-int32_t
-map_root_lookup_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- dict_t *dict,
- struct iatt *postparent)
-{
- int callcnt = 0;
- map_local_t *local = NULL;
- inode_t *tmp_inode = NULL;
- dict_t *tmp_dict = NULL;
-
- local = frame->local;
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if ((op_ret == 0) && (local->op_ret == -1)) {
- local->op_ret = 0;
- local->stbuf = *buf;
- if (dict)
- local->dict = dict_ref (dict);
- local->inode = inode_ref (inode);
- }
- if (op_ret == -1)
- local->op_errno = op_errno;
-
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- tmp_dict = local->dict;
- tmp_inode = local->inode;
-
- STACK_UNWIND (frame, local->op_ret,
- local->op_errno, local->inode,
- &local->stbuf, local->dict);
-
- inode_unref (local->inode);
- if (tmp_dict)
- dict_unref (tmp_dict);
- }
-
- return 0;
-}
-
-
-int32_t
-map_opendir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd)
-{
- int callcnt = 0;
- map_local_t *local = NULL;
- fd_t *local_fd = NULL;
-
- local = frame->local;
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto unlock;
- }
-
- local->op_ret = 0;
- }
- unlock:
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- local_fd = local->fd;
- local->fd = NULL;
-
- STACK_UNWIND (frame, local->op_ret,
- local->op_errno, local_fd);
-
- fd_unref (local_fd);
- }
- return 0;
-}
-
-int32_t
-map_single_readdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- gf_dirent_t *entries)
-{
- call_frame_t *prev = NULL;
- gf_dirent_t *orig_entry = NULL;
-
- prev = cookie;
-
- list_for_each_entry (orig_entry, &entries->list, list) {
- map_itransform (this, prev->this, orig_entry->d_ino,
- &orig_entry->d_ino);
- }
- STACK_UNWIND (frame, op_ret, op_errno, entries);
-
- return 0;
-}
-
-
-int32_t
-map_single_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries)
-{
- call_frame_t *prev = NULL;
- gf_dirent_t *orig_entry = NULL;
-
- prev = cookie;
-
- list_for_each_entry (orig_entry, &entries->list, list) {
- map_itransform (this, prev->this, orig_entry->d_ino,
- &orig_entry->d_ino);
- orig_entry->d_stat.ia_ino = orig_entry->d_ino;
- }
- STACK_UNWIND (frame, op_ret, op_errno, entries);
- return 0;
-}
-
-int
-map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *orig_entries);
-
-int
-map_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *orig_entries);
-
-int
-map_generic_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *orig_entries,
- int whichop)
-{
- map_local_t *local = NULL;
- gf_dirent_t entries;
- gf_dirent_t *orig_entry = NULL;
- gf_dirent_t *entry = NULL;
- call_frame_t *prev = NULL;
- xlator_t *subvol = NULL;
- xlator_t *next_subvol = NULL;
- off_t next_offset = 0;
- int count = 0;
- fd_t *local_fd = NULL;
-
- INIT_LIST_HEAD (&entries.list);
- prev = cookie;
- local = frame->local;
-
- if (op_ret < 0)
- goto done;
-
- list_for_each_entry (orig_entry, &orig_entries->list, list) {
- subvol = prev->this;
-
- entry = gf_dirent_for_name (orig_entry->d_name);
- if (!entry) {
- gf_log (this->name, GF_LOG_ERROR,
- "memory allocation failed :(");
- goto unwind;
- }
-
- map_itransform (this, subvol, orig_entry->d_ino,
- &entry->d_ino);
- map_itransform (this, subvol, orig_entry->d_off,
- &entry->d_off);
-
- if (whichop == GF_FOP_READDIRP)
- entry->d_stat.ia_ino = entry->d_ino;
- entry->d_type = orig_entry->d_type;
- entry->d_len = orig_entry->d_len;
-
- list_add_tail (&entry->list, &entries.list);
- count++;
- next_offset = orig_entry->d_off;
- }
-
- op_ret = count;
-
-done:
- if (count == 0) {
- /* non-zero next_offset means that
- EOF is not yet hit on the current subvol
- */
- if (next_offset == 0) {
- next_subvol = map_subvol_next (this, prev->this);
- } else {
- next_subvol = prev->this;
- }
-
- if (!next_subvol) {
- goto unwind;
- }
-
- if (whichop == GF_FOP_READDIR)
- STACK_WIND (frame, map_readdir_cbk, next_subvol,
- next_subvol->fops->readdir, local->fd,
- local->size, 0);
- else
- STACK_WIND (frame, map_readdirp_cbk, next_subvol,
- next_subvol->fops->readdirp, local->fd,
- local->size, 0);
- return 0;
- }
-
-unwind:
- if (op_ret < 0)
- op_ret = 0;
-
- local_fd = local->fd;
- local->fd = NULL;
-
- STACK_UNWIND (frame, op_ret, op_errno, &entries);
-
- fd_unref (local_fd);
-
- gf_dirent_free (&entries);
-
- return 0;
-}
-
-
-int
-map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *orig_entries)
-{
- map_generic_readdir_cbk (frame, cookie, this, op_ret, op_errno,
- orig_entries, GF_FOP_READDIR);
- return 0;
-}
-
-
-int
-map_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *orig_entries)
-{
- map_generic_readdir_cbk (frame, cookie, this, op_ret, op_errno,
- orig_entries, GF_FOP_READDIRP);
- return 0;
-}
-
-
-/* Management operations */
-
-static int32_t
-map_checksum_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- uint8_t *file_checksum,
- uint8_t *dir_checksum)
-{
- STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum);
- return 0;
-}
-
-
-/* Fops starts here */
-
-int32_t
-map_stat (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_stat_cbk, subvol, subvol->fops->stat, loc);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_setattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- struct iatt *stbuf,
- int32_t valid)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- GF_VALIDATE_OR_GOTO ("map", this, err);
- GF_VALIDATE_OR_GOTO (this->name, frame, err);
- GF_VALIDATE_OR_GOTO (this->name, loc, err);
- GF_VALIDATE_OR_GOTO (this->name, loc->inode, err);
- GF_VALIDATE_OR_GOTO (this->name, loc->path, err);
- GF_VALIDATE_OR_GOTO (this->name, stbuf, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_setattr_cbk, subvol,
- subvol->fops->setattr, loc, stbuf, valid);
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_fsetattr (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iatt *stbuf,
- int32_t valid)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- GF_VALIDATE_OR_GOTO ("map", this, err);
- GF_VALIDATE_OR_GOTO (this->name, frame, err);
- GF_VALIDATE_OR_GOTO (this->name, fd, err);
- GF_VALIDATE_OR_GOTO (this->name, stbuf, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_fsetattr_cbk, subvol,
- subvol->fops->fsetattr, fd, stbuf, valid);
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_truncate (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- off_t offset)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_truncate_cbk, subvol,
- subvol->fops->truncate, loc, offset);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_ftruncate (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- off_t offset)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_ftruncate_cbk, subvol,
- subvol->fops->ftruncate, fd, offset);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_access (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t mask)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_access_cbk, subvol,
- subvol->fops->access, loc, mask);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_readlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- size_t size)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_readlink_cbk, subvol,
- subvol->fops->readlink, loc, size);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_unlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_unlink_cbk, subvol, subvol->fops->unlink, loc);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_rmdir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_rmdir_cbk, subvol, subvol->fops->rmdir, loc);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_rename (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- int32_t op_errno = 1;
- xlator_t *old_subvol = NULL;
- xlator_t *new_subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (oldloc, err);
- VALIDATE_OR_GOTO (oldloc->inode, err);
- VALIDATE_OR_GOTO (oldloc->path, err);
- VALIDATE_OR_GOTO (newloc, err);
-
- old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode);
- if (!old_subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- if (newloc->path) {
- new_subvol = get_mapping_subvol_from_path (this, newloc->path);
- if (new_subvol && (new_subvol != old_subvol)) {
- op_errno = EXDEV;
- goto err;
- }
- }
-
- STACK_WIND (frame, map_rename_cbk, old_subvol,
- old_subvol->fops->rename, oldloc, newloc);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_link (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- int32_t op_errno = 1;
- xlator_t *old_subvol = NULL;
- xlator_t *new_subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (oldloc, err);
- VALIDATE_OR_GOTO (oldloc->inode, err);
- VALIDATE_OR_GOTO (oldloc->path, err);
- VALIDATE_OR_GOTO (newloc, err);
-
- old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode);
- if (!old_subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- if (newloc->path) {
- new_subvol = get_mapping_subvol_from_path (this, newloc->path);
- if (new_subvol && (new_subvol != old_subvol)) {
- op_errno = EXDEV;
- goto err;
- }
- }
-
- STACK_WIND (frame, map_link_cbk, old_subvol,
- old_subvol->fops->link, oldloc, newloc);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_open (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags, fd_t *fd, int wbflags)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_open_cbk, subvol,
- subvol->fops->open, loc, flags, fd, wbflags);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_readv (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t offset)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_readv_cbk, subvol,
- subvol->fops->readv, fd, size, offset);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t off,
- struct iobref *iobref)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_writev_cbk, subvol,
- subvol->fops->writev, fd, vector, count, off, iobref);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_flush (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_flush_cbk, subvol, subvol->fops->flush, fd);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_fsync (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_fsync_cbk, subvol,
- subvol->fops->fsync, fd, flags);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_fstat (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_fstat_cbk, subvol, subvol->fops->fstat, fd);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_getdents (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t offset,
- int32_t flag)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_getdents_cbk, subvol,
- subvol->fops->getdents, fd, size, offset, flag);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_setdents (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags,
- dir_entry_t *entries,
- int32_t count)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_setdents_cbk, subvol,
- subvol->fops->setdents, fd, flags, entries, count);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_fsyncdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_fsyncdir_cbk, subvol,
- subvol->fops->fsyncdir, fd, flags);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-
-
-int32_t
-map_setxattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *dict,
- int32_t flags)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_setxattr_cbk, subvol,
- subvol->fops->setxattr, loc, dict, flags);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_getxattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_getxattr_cbk, subvol,
- subvol->fops->getxattr, loc, name);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_fsetxattr (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- dict_t *dict,
- int32_t flags)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_fsetxattr_cbk, subvol,
- subvol->fops->fsetxattr, fd, dict, flags);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_fgetxattr (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- const char *name)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_fgetxattr_cbk, subvol,
- subvol->fops->fgetxattr, fd, name);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_xattrop (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- gf_xattrop_flags_t flags,
- dict_t *dict)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_xattrop_cbk, subvol,
- subvol->fops->xattrop, loc, flags, dict);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_fxattrop (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- gf_xattrop_flags_t flags,
- dict_t *dict)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_fxattrop_cbk, subvol,
- subvol->fops->fxattrop, fd, flags, dict);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_removexattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_removexattr_cbk, subvol,
- subvol->fops->removexattr, loc, name);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_lk (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t cmd,
- struct gf_flock *lock)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_lk_cbk, subvol,
- subvol->fops->lk, fd, cmd, lock);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_inodelk_cbk, subvol,
- subvol->fops->inodelk, volume, loc, cmd, lock);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_finodelk_cbk, subvol,
- subvol->fops->finodelk, volume, fd, cmd, lock);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_entrylk_cbk, subvol,
- subvol->fops->entrylk, volume, loc, basename, cmd, type);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_fentrylk_cbk, subvol,
- subvol->fops->fentrylk, volume, fd, basename, cmd, type);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_checksum (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flag)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, map_checksum_cbk, subvol,
- subvol->fops->checksum, loc, flag);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_mknod (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- mode_t mode,
- dev_t rdev)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- subvol = get_mapping_subvol_from_path (this, loc->path);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol);
- if (op_errno != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set subvolume ptr in inode ctx",
- loc->path);
- }
-
- STACK_WIND (frame, map_newentry_cbk, subvol,
- subvol->fops->mknod, loc, mode, rdev);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_mkdir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- mode_t mode)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- subvol = get_mapping_subvol_from_path (this, loc->path);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol);
- if (op_errno != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set subvolume ptr in inode ctx",
- loc->path);
- }
-
- STACK_WIND (frame, map_newentry_cbk, subvol,
- subvol->fops->mkdir, loc, mode);
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_symlink (call_frame_t *frame,
- xlator_t *this,
- const char *linkpath,
- loc_t *loc)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- subvol = get_mapping_subvol_from_path (this, loc->path);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol);
- if (op_errno != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set subvolume ptr in inode ctx",
- loc->path);
- }
-
- STACK_WIND (frame, map_newentry_cbk, subvol,
- subvol->fops->symlink, linkpath, loc);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-map_create (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags,
- mode_t mode, fd_t *fd)
-{
- int32_t op_errno = 1;
- xlator_t *subvol = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- subvol = get_mapping_subvol_from_path (this, loc->path);
- if (!subvol) {
- op_errno = EINVAL;
- goto err;
- }
-
- op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol);
- if (op_errno != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set subvolume ptr in inode ctx",
- loc->path);
- }
-
- STACK_WIND (frame, map_create_cbk, subvol,
- subvol->fops->create, loc, flags, mode, fd);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_lookup (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *xattr_req)
-{
- int32_t op_errno = EINVAL;
- xlator_t *subvol = NULL;
- map_local_t *local = NULL;
- map_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
-
- if (loc->inode->ino == 1)
- goto root_inode;
-
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- subvol = get_mapping_subvol_from_path (this, loc->path);
- if (!subvol) {
- goto err;
- }
-
- op_errno = inode_ctx_put (loc->inode, this,
- (uint64_t)(long)subvol);
- if (op_errno != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set subvolume in inode ctx",
- loc->path);
- }
- }
-
- /* Just one callback */
- STACK_WIND (frame, map_single_lookup_cbk, subvol,
- subvol->fops->lookup, loc, xattr_req);
-
- return 0;
-
- root_inode:
- local = GF_CALLOC (1, sizeof (map_local_t),
- gf_map_mt_map_local_t);
-
- frame->local = local;
- local->call_count = priv->child_count;
- local->op_ret = -1;
-
- trav = this->children;
- while (trav) {
- STACK_WIND (frame, map_root_lookup_cbk, trav->xlator,
- trav->xlator->fops->lookup, loc, xattr_req);
- trav = trav->next;
- }
-
- return 0;
-
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_statfs (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- int32_t op_errno = EINVAL;
- xlator_t *subvol = NULL;
- map_local_t *local = NULL;
- map_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- if (loc->inode->ino == 1)
- goto root_inode;
- subvol = get_mapping_subvol_from_ctx (this, loc->inode);
- if (!subvol) {
- goto err;
- }
-
- /* Just one callback */
- STACK_WIND (frame, map_statfs_cbk, subvol, subvol->fops->statfs, loc);
-
- return 0;
-
- root_inode:
- local = GF_CALLOC (1, sizeof (map_local_t),
- gf_map_mt_map_local_t);
-
- priv = this->private;
- frame->local = local;
- local->call_count = priv->child_count;
- local->op_ret = -1;
-
- trav = this->children;
- while (trav) {
- STACK_WIND (frame, map_statfs_cbk, trav->xlator,
- trav->xlator->fops->statfs, loc);
- trav = trav->next;
- }
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL);
-
- return 0;
-}
-
-int32_t
-map_opendir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc, fd_t *fd)
-{
- int32_t op_errno = EINVAL;
- xlator_t *subvol = NULL;
- map_local_t *local = NULL;
- map_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- if (loc->inode->ino == 1)
- goto root_inode;
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- goto err;
- }
-
- /* Just one callback */
- STACK_WIND (frame, map_opendir_cbk, subvol,
- subvol->fops->opendir, loc, fd);
-
- return 0;
-
- root_inode:
- local = GF_CALLOC (1, sizeof (map_local_t),
- gf_map_mt_map_local_t);
-
- priv = this->private;
- frame->local = local;
- local->call_count = priv->child_count;
- local->op_ret = -1;
- local->fd = fd_ref (fd);
-
- trav = this->children;
- while (trav) {
- STACK_WIND (frame, map_opendir_cbk, trav->xlator,
- trav->xlator->fops->opendir, loc, fd);
- trav = trav->next;
- }
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL);
-
- return 0;
-}
-
-
-int32_t
-map_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff, int whichop)
-{
- int32_t op_errno = EINVAL;
- xlator_t *subvol = NULL;
- map_local_t *local = NULL;
- map_private_t *priv = NULL;
- xlator_t *xvol = NULL;
- off_t xoff = 0;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- if (fd->inode->ino == 1)
- goto root_inode;
-
- subvol = get_mapping_subvol_from_ctx (this, fd->inode);
- if (!subvol) {
- goto err;
- }
-
- /* Just one callback */
- if (whichop == GF_FOP_READDIR)
- STACK_WIND (frame, map_single_readdir_cbk, subvol,
- subvol->fops->readdir, fd, size, yoff);
- else
- STACK_WIND (frame, map_single_readdirp_cbk, subvol,
- subvol->fops->readdirp, fd, size, yoff);
-
- return 0;
-
- root_inode:
- /* readdir on '/' */
- local = GF_CALLOC (1, sizeof (map_local_t),
- gf_map_mt_map_local_t);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR,
- "memory allocation failed :(");
- op_errno = ENOMEM;
- goto err;
- }
-
- priv = this->private;
- frame->local = local;
- local->op_errno = ENOENT;
- local->op_ret = -1;
-
- local->fd = fd_ref (fd);
- local->size = size;
-
- map_deitransform (this, yoff, &xvol, (uint64_t *)&xoff);
-
- if (whichop == GF_FOP_READDIR)
- STACK_WIND (frame, map_readdir_cbk, xvol, xvol->fops->readdir,
- fd, size, xoff);
- else
- STACK_WIND (frame, map_readdirp_cbk, xvol, xvol->fops->readdirp,
- fd, size, xoff);
-
- return 0;
- err:
- STACK_UNWIND (frame, -1, op_errno, NULL);
-
- return 0;
-}
-
-
-
-int32_t
-map_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff)
-{
- map_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIR);
- return 0;
-}
-
-int32_t
-map_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff)
-{
- map_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP);
- return 0;
-}
-
-
-void
-fini (xlator_t *this)
-{
- map_private_t *priv = NULL;
- struct map_pattern *trav_map = NULL;
- struct map_pattern *tmp_map = NULL;
-
- priv = this->private;
-
- if (priv) {
- if (priv->xlarray)
- GF_FREE (priv->xlarray);
-
- trav_map = priv->map;
- while (trav_map) {
- tmp_map = trav_map;
- trav_map = trav_map->next;
- GF_FREE (tmp_map);
- }
-
- GF_FREE(priv);
- }
-
- return;
-}
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_map_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-
- return ret;
-}
-
-int
-init (xlator_t *this)
-{
- map_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int count = 0;
- int ret = -1;
- char *pattern_string = NULL;
- char *map_pair_str = NULL;
- char *tmp_str = NULL;
- char *tmp_str1 = NULL;
- char *dup_map_pair = NULL;
- char *dir_str = NULL;
- char *subvol_str = NULL;
- char *map_xl = NULL;
-
-
- if (!this->children) {
- gf_log (this->name,GF_LOG_ERROR,
- "FATAL: map should have one or more child defined");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile ");
- }
-
- priv = GF_CALLOC (1, sizeof (map_private_t),
- gf_map_mt_map_private_t);
- this->private = priv;
-
- /* allocate xlator array */
- trav = this->children;
- while (trav) {
- count++;
- trav = trav->next;
- }
- priv->xlarray = GF_CALLOC (1, sizeof (struct map_xlator_array) * count,
- gf_map_mt_map_xlator_array);
- priv->child_count = count;
-
- /* build xlator array */
- count = 0;
- trav = this->children;
- while (trav) {
- priv->xlarray[count++].xl = trav->xlator;
- trav = trav->next;
- }
-
- /* map dir1:brick1;dir2:brick2;dir3:brick3;*:brick4 */
- ret = dict_get_str (this->options, "map-directory", &pattern_string);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "map.pattern not given, can't continue");
- goto err;
- }
- map_pair_str = strtok_r (pattern_string, ";", &tmp_str);
- while (map_pair_str) {
- dup_map_pair = gf_strdup (map_pair_str);
- dir_str = strtok_r (dup_map_pair, ":", &tmp_str1);
- if (!dir_str) {
- gf_log (this->name, GF_LOG_ERROR,
- "directory string invalid");
- goto err;
- }
- subvol_str = strtok_r (NULL, ":", &tmp_str1);
- if (!subvol_str) {
- gf_log (this->name, GF_LOG_ERROR,
- "mapping subvolume string invalid");
- goto err;
- }
- ret = verify_dir_and_assign_subvol (this,
- dir_str,
- subvol_str);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "verification failed");
- goto err;
- }
-
- GF_FREE (dup_map_pair);
-
- map_pair_str = strtok_r (NULL, ";", &tmp_str);
- }
-
- /* default-volume brick4 */
- ret = dict_get_str (this->options, "default-volume", &map_xl);
- if (ret == 0) {
- ret = assign_default_subvol (this, map_xl);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "assigning default failed");
- goto err;
- }
- }
-
- verify_if_all_subvolumes_got_used (this);
-
- return 0;
- err:
- fini (this);
- return -1;
-}
-
-
-struct xlator_fops fops = {
- .lookup = map_lookup,
- .mknod = map_mknod,
- .create = map_create,
-
- .stat = map_stat,
- .fstat = map_fstat,
- .truncate = map_truncate,
- .ftruncate = map_ftruncate,
- .access = map_access,
- .readlink = map_readlink,
- .setxattr = map_setxattr,
- .getxattr = map_getxattr,
- .fsetxattr = map_fsetxattr,
- .fgetxattr = map_fgetxattr,
- .removexattr = map_removexattr,
- .open = map_open,
- .readv = map_readv,
- .writev = map_writev,
- .flush = map_flush,
- .fsync = map_fsync,
- .statfs = map_statfs,
- .lk = map_lk,
- .opendir = map_opendir,
- .readdir = map_readdir,
- .readdirp = map_readdirp,
- .fsyncdir = map_fsyncdir,
- .symlink = map_symlink,
- .unlink = map_unlink,
- .link = map_link,
- .mkdir = map_mkdir,
- .rmdir = map_rmdir,
- .rename = map_rename,
- .inodelk = map_inodelk,
- .finodelk = map_finodelk,
- .entrylk = map_entrylk,
- .fentrylk = map_fentrylk,
- .xattrop = map_xattrop,
- .fxattrop = map_fxattrop,
- .setdents = map_setdents,
- .getdents = map_getdents,
- .checksum = map_checksum,
- .setattr = map_setattr,
- .fsetattr = map_fsetattr,
-};
-
-struct xlator_cbks cbks = {
-};
-
-struct volume_options options[] = {
- { .key = {"map-directory"},
- .type = GF_OPTION_TYPE_ANY
- },
- { .key = {"default-volume"},
- .type = GF_OPTION_TYPE_XLATOR
- },
-
- { .key = {NULL} }
-};
diff --git a/xlators/cluster/map/src/map.h b/xlators/cluster/map/src/map.h
deleted file mode 100644
index 44ba3ee125f..00000000000
--- a/xlators/cluster/map/src/map.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __MAP_H__
-#define __MAP_H__
-
-#include "xlator.h"
-#include "map-mem-types.h"
-
-struct map_pattern {
- struct map_pattern *next;
- xlator_t *xl;
- char *directory;
- int dir_len;
-};
-
-struct map_xlator_array {
- xlator_t *xl;
- int mapped; /* yes/no */
-};
-
-typedef struct {
- struct map_pattern *map;
- xlator_t *default_xl;
- struct map_xlator_array *xlarray;
- int child_count;
-} map_private_t;
-
-typedef struct {
- int32_t op_ret;
- int32_t op_errno;
- int call_count;
- struct statvfs statvfs;
- struct iatt stbuf;
- inode_t *inode;
- dict_t *dict;
- fd_t *fd;
-
- size_t size;
-} map_local_t;
-
-xlator_t *map_subvol_next (xlator_t *this, xlator_t *prev);
-int map_subvol_cnt (xlator_t *this, xlator_t *subvol);
-
-int map_itransform (xlator_t *this, xlator_t *subvol,
- uint64_t x, uint64_t *y_p);
-int map_deitransform (xlator_t *this, uint64_t y,
- xlator_t **subvol_p, uint64_t *x_p);
-
-
-xlator_t *get_mapping_subvol_from_path (xlator_t *this, const char *path);
-xlator_t *get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode);
-
-int check_multiple_volume_entry (xlator_t *this, xlator_t *subvol);
-int verify_dir_and_assign_subvol (xlator_t *this,
- const char *directory, const char *subvol);
-int assign_default_subvol (xlator_t *this, const char *default_xl);
-void verify_if_all_subvolumes_got_used (xlator_t *this);
-
-
-#endif /* __MAP_H__ */
diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am
deleted file mode 100644
index 6d4fae4d268..00000000000
--- a/xlators/cluster/stripe/src/Makefile.am
+++ /dev/null
@@ -1,16 +0,0 @@
-
-xlator_LTLIBRARIES = stripe.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-
-stripe_la_LDFLAGS = -module -avoidversion
-
-stripe_la_SOURCES = stripe.c
-stripe_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = stripe.h stripe-mem-types.h
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES =
-
diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h
deleted file mode 100644
index 2d2757e86a8..00000000000
--- a/xlators/cluster/stripe/src/stripe-mem-types.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-#ifndef __STRIPE_MEM_TYPES_H__
-#define __STRIPE_MEM_TYPES_H__
-
-#include "mem-types.h"
-
-enum gf_stripe_mem_types_ {
- gf_stripe_mt_stripe_local_t = gf_common_mt_end + 1,
- gf_stripe_mt_iovec,
- gf_stripe_mt_readv_replies,
- gf_stripe_mt_stripe_fd_ctx_t,
- gf_stripe_mt_char,
- gf_stripe_mt_int8_t,
- gf_stripe_mt_xlator_t,
- gf_stripe_mt_stripe_private_t,
- gf_stripe_mt_stripe_options,
- gf_stripe_mt_end
-};
-#endif
-
diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c
deleted file mode 100644
index b2c70cfe59d..00000000000
--- a/xlators/cluster/stripe/src/stripe.c
+++ /dev/null
@@ -1,4002 +0,0 @@
-/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * xlators/cluster/stripe:
- * Stripe translator, stripes the data accross its child nodes,
- * as per the options given in the volfile. The striping works
- * fairly simple. It writes files at different offset as per
- * calculation. So, 'ls -l' output at the real posix level will
- * show file size bigger than the actual size. But when one does
- * 'df' or 'du <file>', real size of the file on the server is shown.
- *
- * WARNING:
- * Stripe translator can't regenerate data if a child node gets disconnected.
- * So, no 'self-heal' for stripe. Hence the advice, use stripe only when its
- * very much necessary, or else, use it in combination with AFR, to have a
- * backup copy.
- */
-
-#include "stripe.h"
-
-void
-stripe_local_wipe (stripe_local_t *local)
-{
- if (!local)
- goto out;
-
- if (local->loc.path)
- loc_wipe (&local->loc);
- if (local->loc2.path)
- loc_wipe (&local->loc2);
-out:
- return;
-}
-
-/**
- * stripe_get_matching_bs - Get the matching block size for the given path.
- */
-int32_t
-stripe_get_matching_bs (const char *path, struct stripe_options *opts,
- uint64_t default_bs)
-{
- struct stripe_options *trav = NULL;
- char *pathname = NULL;
- uint64_t block_size = 0;
-
- block_size = default_bs;
-
- if (!path || !opts)
- goto out;
-
- /* FIXME: is a strdup really necessary? */
- pathname = gf_strdup (path);
- if (!pathname)
- goto out;
-
- trav = opts;
- while (trav) {
- if (!fnmatch (trav->path_pattern, pathname, FNM_NOESCAPE)) {
- block_size = trav->block_size;
- break;
- }
- trav = trav->next;
- }
-
- GF_FREE (pathname);
-
-out:
- return block_size;
-}
-
-
-
-int32_t
-stripe_sh_chown_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
-{
- int callcnt = -1;
- stripe_local_t *local = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- stripe_local_wipe (local);
- STACK_DESTROY (frame->root);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_sh_make_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!frame || !frame->local || !cookie || !this) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- STACK_WIND (frame, stripe_sh_chown_cbk, prev->this,
- prev->this->fops->setattr, &local->loc,
- &local->stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID));
-
-out:
- return 0;
-}
-
-int32_t
-stripe_entry_self_heal (call_frame_t *frame, xlator_t *this,
- stripe_local_t *local)
-{
- xlator_list_t *trav = NULL;
- call_frame_t *rframe = NULL;
- stripe_local_t *rlocal = NULL;
- stripe_private_t *priv = NULL;
- dict_t *dict = NULL;
- int ret = 0;
-
- if (!local || !this || !frame) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- if (!(IA_ISREG (local->stbuf.ia_type) ||
- IA_ISDIR (local->stbuf.ia_type)))
- return 0;
-
- priv = this->private;
- trav = this->children;
- rframe = copy_frame (frame);
- if (!rframe) {
- goto out;
- }
- rlocal = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!rlocal) {
- goto out;
- }
- rframe->local = rlocal;
- rlocal->call_count = priv->child_count;
- loc_copy (&rlocal->loc, &local->loc);
- memcpy (&rlocal->stbuf, &local->stbuf, sizeof (struct iatt));
-
- dict = dict_new ();
- if (!dict)
- goto out;
-
- ret = dict_set_static_bin (dict, "gfid-req", local->stbuf.ia_gfid, 16);
- if (ret)
- gf_log (this->name, GF_LOG_WARNING,
- "failed to set gfid-req");
-
- while (trav) {
- if (IA_ISREG (local->stbuf.ia_type)) {
- STACK_WIND (rframe, stripe_sh_make_entry_cbk,
- trav->xlator, trav->xlator->fops->mknod,
- &local->loc,
- st_mode_from_ia (local->stbuf.ia_prot,
- local->stbuf.ia_type), 0,
- dict);
- }
- if (IA_ISDIR (local->stbuf.ia_type)) {
- STACK_WIND (rframe, stripe_sh_make_entry_cbk,
- trav->xlator, trav->xlator->fops->mkdir,
- &local->loc, st_mode_from_ia (local->stbuf.ia_prot,
- local->stbuf.ia_type),
- dict);
- }
- trav = trav->next;
- }
-
-out:
- if (dict)
- dict_unref (dict);
-
- return 0;
-}
-
-int32_t
-stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *dict, struct iatt *postparent)
-{
- int32_t callcnt = 0;
- dict_t *tmp_dict = NULL;
- inode_t *tmp_inode = NULL;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- if (op_errno != ENOENT)
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name,
- strerror (op_errno));
- if (local->op_errno != ESTALE)
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- if (op_errno == ENOENT)
- local->entry_self_heal_needed = 1;
- }
-
- if (op_ret >= 0) {
- local->op_ret = 0;
-
- if (FIRST_CHILD(this) == prev->this) {
- local->stbuf = *buf;
- local->postparent = *postparent;
- local->inode = inode_ref (inode);
- local->dict = dict_ref (dict);
- }
- local->stbuf_blocks += buf->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->op_ret == 0 && local->entry_self_heal_needed)
- stripe_entry_self_heal (frame, this, local);
-
- if (local->failed)
- local->op_ret = -1;
-
- tmp_dict = local->dict;
- tmp_inode = local->inode;
-
- if (local->op_ret != -1) {
- local->stbuf.ia_blocks = local->stbuf_blocks;
- local->stbuf.ia_size = local->stbuf_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- }
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (lookup, frame, local->op_ret,
- local->op_errno, local->inode,
- &local->stbuf, local->dict,
- &local->postparent);
-
- if (tmp_inode)
- inode_unref (tmp_inode);
- if (tmp_dict)
- dict_unref (tmp_dict);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
- dict_t *xattr_req)
-{
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- loc_copy (&local->loc, loc);
-
- /* Everytime in stripe lookup, all child nodes
- should be looked up */
- local->call_count = priv->child_count;
- while (trav) {
- STACK_WIND (frame, stripe_lookup_cbk, trav->xlator,
- trav->xlator->fops->lookup,
- loc, xattr_req);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
-
- if (FIRST_CHILD(this) == prev->this) {
- local->stbuf = *buf;
- }
-
- local->stbuf_blocks += buf->ia_blocks;
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (stat, frame, local->op_ret,
- local->op_errno, &local->stbuf);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_stat_cbk, trav->xlator,
- trav->xlator->fops->stat, loc);
- trav = trav->next;
- }
-
- return 0;
-
-err:
- STACK_UNWIND_STRICT (stat, frame, -1, op_errno, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct statvfs *stbuf)
-{
- stripe_local_t *local = NULL;
- int32_t callcnt = 0;
-
- if (!this || !frame || !frame->local) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret && (op_errno != ENOTCONN)) {
- local->op_errno = op_errno;
- }
- if (op_ret == 0) {
- struct statvfs *dict_buf = &local->statvfs_buf;
- dict_buf->f_bsize = stbuf->f_bsize;
- dict_buf->f_frsize = stbuf->f_frsize;
- dict_buf->f_blocks += stbuf->f_blocks;
- dict_buf->f_bfree += stbuf->f_bfree;
- dict_buf->f_bavail += stbuf->f_bavail;
- dict_buf->f_files += stbuf->f_files;
- dict_buf->f_ffree += stbuf->f_ffree;
- dict_buf->f_favail += stbuf->f_favail;
- dict_buf->f_fsid = stbuf->f_fsid;
- dict_buf->f_flag = stbuf->f_flag;
- dict_buf->f_namemax = stbuf->f_namemax;
- local->op_ret = 0;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- STACK_UNWIND_STRICT (statfs, frame, local->op_ret,
- local->op_errno, &local->statvfs_buf);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
-
- trav = this->children;
- priv = this->private;
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- frame->local = local;
-
- local->call_count = priv->child_count;
- while (trav) {
- STACK_WIND (frame, stripe_statfs_cbk, trav->xlator,
- trav->xlator->fops->statfs, loc);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (statfs, frame, -1, op_errno, NULL);
- return 0;
-}
-
-
-
-int32_t
-stripe_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
- if (FIRST_CHILD(this) == prev->this) {
- local->pre_buf = *prebuf;
- local->post_buf = *postbuf;
- }
-
- local->prebuf_blocks += prebuf->ia_blocks;
- local->postbuf_blocks += postbuf->ia_blocks;
-
- if (local->prebuf_size < prebuf->ia_size)
- local->prebuf_size = prebuf->ia_size;
-
- if (local->postbuf_size < postbuf->ia_size)
- local->postbuf_size = postbuf->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->pre_buf.ia_blocks = local->prebuf_blocks;
- local->pre_buf.ia_size = local->prebuf_size;
- local->post_buf.ia_blocks = local->postbuf_blocks;
- local->post_buf.ia_size = local->postbuf_size;
- }
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (truncate, frame, local->op_ret,
- local->op_errno, &local->pre_buf,
- &local->post_buf);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_truncate_cbk, trav->xlator,
- trav->xlator->fops->truncate, loc, offset);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
-
- if (FIRST_CHILD(this) == prev->this) {
- local->pre_buf = *preop;
- local->post_buf = *postop;
- }
-
- local->prebuf_blocks += preop->ia_blocks;
- local->postbuf_blocks += postop->ia_blocks;
-
- if (local->prebuf_size < preop->ia_size)
- local->prebuf_size = preop->ia_size;
- if (local->postbuf_size < postop->ia_size)
- local->postbuf_size = postop->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->pre_buf.ia_blocks = local->prebuf_blocks;
- local->pre_buf.ia_size = local->prebuf_size;
- local->post_buf.ia_blocks = local->postbuf_blocks;
- local->post_buf.ia_size = local->postbuf_size;
- }
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (setattr, frame, local->op_ret,
- local->op_errno, &local->pre_buf,
- &local->post_buf);
- }
-out:
- return 0;
-}
-
-
-int32_t
-stripe_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_setattr_cbk,
- trav->xlator, trav->xlator->fops->setattr,
- loc, stbuf, valid);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (setattr, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *stbuf, int32_t valid)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_setattr_cbk, trav->xlator,
- trav->xlator->fops->fsetattr, fd, stbuf, valid);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (fsetattr, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_stack_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
-
- local->stbuf.ia_blocks += buf->ia_blocks;
- local->preparent.ia_blocks += preoldparent->ia_blocks;
- local->postparent.ia_blocks += postoldparent->ia_blocks;
- local->pre_buf.ia_blocks += prenewparent->ia_blocks;
- local->post_buf.ia_blocks += postnewparent->ia_blocks;
-
- if (local->stbuf.ia_size < buf->ia_size)
- local->stbuf.ia_size = buf->ia_size;
-
- if (local->preparent.ia_size < preoldparent->ia_size)
- local->preparent.ia_size = preoldparent->ia_size;
-
- if (local->postparent.ia_size < postoldparent->ia_size)
- local->postparent.ia_size = postoldparent->ia_size;
-
- if (local->pre_buf.ia_size < prenewparent->ia_size)
- local->pre_buf.ia_size = prenewparent->ia_size;
-
- if (local->post_buf.ia_size < postnewparent->ia_size)
- local->post_buf.ia_size = postnewparent->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preparent,
- &local->postparent, &local->pre_buf,
- &local->post_buf);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_first_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent)
-{
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- op_errno = EINVAL;
- goto unwind;
- }
-
- if (op_ret == -1) {
- goto unwind;
- }
-
- local = frame->local;
- trav = this->children;
-
- local->stbuf = *buf;
- local->preparent = *preoldparent;
- local->postparent = *postoldparent;
- local->pre_buf = *prenewparent;
- local->post_buf = *postnewparent;
-
- local->op_ret = 0;
- local->call_count--;
-
- trav = trav->next; /* Skip first child */
- while (trav) {
- STACK_WIND (frame, stripe_stack_rename_cbk,
- trav->xlator, trav->xlator->fops->rename,
- &local->loc, &local->loc2);
- trav = trav->next;
- }
- return 0;
-
-unwind:
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (rename, frame, -1, op_errno, buf, preoldparent,
- postoldparent, prenewparent, postnewparent);
- return 0;
-}
-
-int32_t
-stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
- loc_t *newloc)
-{
- stripe_private_t *priv = NULL;
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (oldloc, err);
- VALIDATE_OR_GOTO (oldloc->path, err);
- VALIDATE_OR_GOTO (oldloc->inode, err);
- VALIDATE_OR_GOTO (newloc, err);
-
- priv = this->private;
- trav = this->children;
-
- /* If any one node is down, don't allow rename */
- if (priv->nodes_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- loc_copy (&local->loc, oldloc);
- loc_copy (&local->loc2, newloc);
-
- local->call_count = priv->child_count;
-
- frame->local = local;
-
- STACK_WIND (frame, stripe_first_rename_cbk, trav->xlator,
- trav->xlator->fops->rename, oldloc, newloc);
-
- return 0;
-err:
- STACK_UNWIND_STRICT (rename, frame, -1, op_errno, NULL, NULL, NULL,
- NULL, NULL);
- return 0;
-}
-
-
-
-int32_t
-stripe_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG, "%s returned %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- if (FIRST_CHILD(this) == prev->this) {
- local->preparent = *preparent;
- local->postparent = *postparent;
- }
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- if (local->preparent_size < preparent->ia_size)
- local->preparent_size = preparent->ia_size;
-
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- }
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (unlink, frame, local->op_ret,
- local->op_errno, &local->preparent,
- &local->postparent);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Don't unlink a file if a node is down */
- if (priv->nodes_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_unlink_cbk,
- trav->xlator, trav->xlator->fops->unlink,
- loc);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (unlink, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_first_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,struct iatt *preparent,
- struct iatt *postparent)
-
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- op_errno = EINVAL;
- goto err;
- }
-
- if (op_ret == -1) {
- goto err;
- }
-
- trav = this->children;
- local = frame->local;
-
- local->call_count--; /* First child successful */
- trav = trav->next; /* Skip first child */
-
- local->preparent = *preparent;
- local->postparent = *postparent;
- local->preparent_size = preparent->ia_size;
- local->postparent_size = postparent->ia_size;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- while (trav) {
- STACK_WIND (frame, stripe_unlink_cbk, trav->xlator,
- trav->xlator->fops->rmdir, &local->loc,
- local->flags);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, NULL, NULL);
- return 0;
-
-}
-
-int32_t
-stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* don't delete a directory if any of the subvolume is down */
- if (priv->nodes_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- loc_copy (&local->loc, loc);
- local->flags = flags;
- local->call_count = priv->child_count;
-
- STACK_WIND (frame, stripe_first_rmdir_cbk, trav->xlator,
- trav->xlator->fops->rmdir, loc, flags);
-
- return 0;
-err:
- STACK_UNWIND_STRICT (rmdir, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (mknod, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf,
- &local->preparent, &local->postparent);
- }
-out:
- return 0;
-}
-
-
-/**
- */
-int32_t
-stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- priv = this->private;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_ret = -1;
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->op_ret == -1) {
- local->call_count = priv->child_count;
- while (trav) {
- STACK_WIND (frame,
- stripe_mknod_ifreg_fail_unlink_cbk,
- trav->xlator,
- trav->xlator->fops->unlink,
- &local->loc);
- trav = trav->next;
- }
- return 0;
- }
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (mknod, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf,
- &local->preparent, &local->postparent);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- int ret = 0;
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- priv = this->private;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0) {
- local->op_ret = op_ret;
-
- if (FIRST_CHILD(this) == prev->this) {
- local->stbuf = *buf;
- local->preparent = *preparent;
- local->postparent = *postparent;
- }
-
- local->stbuf_blocks += buf->ia_blocks;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->preparent_size < preparent->ia_size)
- local->preparent_size = preparent->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
-
- if ((local->op_ret != -1) && priv->xattr_supported) {
- /* Send a setxattr request to nodes where the
- files are created */
- int32_t i = 0;
- char size_key[256] = {0,};
- char index_key[256] = {0,};
- char count_key[256] = {0,};
- dict_t *dict = NULL;
-
- sprintf (size_key,
- "trusted.%s.stripe-size", this->name);
- sprintf (count_key,
- "trusted.%s.stripe-count", this->name);
- sprintf (index_key,
- "trusted.%s.stripe-index", this->name);
-
- local->call_count = priv->child_count;
- memcpy (local->loc.inode->gfid, local->stbuf.ia_gfid, 16);
- for (i = 0; i < priv->child_count; i++) {
- dict = get_new_dict ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to allocate dict");
- }
-
- dict_ref (dict);
- /* TODO: check return value */
- ret = dict_set_int64 (dict, size_key,
- local->stripe_size);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR,
- "%s: set stripe-size failed",
- local->loc.path);
- ret = dict_set_int32 (dict, count_key,
- priv->child_count);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR,
- "%s: set child_count failed",
- local->loc.path);
- ret = dict_set_int32 (dict, index_key, i);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR,
- "%s: set stripe-index failed",
- local->loc.path);
-
- STACK_WIND (frame,
- stripe_mknod_ifreg_setxattr_cbk,
- priv->xl_array[i],
- priv->xl_array[i]->fops->setxattr,
- &local->loc, dict, 0);
-
- dict_unref (dict);
- }
- return 0;
- }
-
- /* Create itself has failed.. so return
- without setxattring */
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (mknod, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf,
- &local->preparent, &local->postparent);
- }
-out:
- return 0;
-}
-
-
-int32_t
-stripe_single_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
-}
-
-
-int
-stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t rdev, dict_t *params)
-{
- stripe_private_t *priv = NULL;
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- if (S_ISREG(mode)) {
- /* NOTE: on older kernels (older than 2.6.9),
- creat() fops is sent as mknod() + open(). Hence handling
- S_IFREG files is necessary */
- if (priv->nodes_down) {
- gf_log (this->name, GF_LOG_WARNING,
- "Some node down, returning EIO");
- op_errno = EIO;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->stripe_size = stripe_get_matching_bs (loc->path,
- priv->pattern,
- priv->block_size);
- frame->local = local;
- local->inode = loc->inode;
- loc_copy (&local->loc, loc);
-
- /* Everytime in stripe lookup, all child nodes should
- be looked up */
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_mknod_ifreg_cbk,
- trav->xlator, trav->xlator->fops->mknod,
- loc, mode, rdev, params);
- trav = trav->next;
- }
-
- /* This case is handled, no need to continue further. */
- return 0;
- }
-
- STACK_WIND (frame, stripe_single_mknod_cbk,
- FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod,
- loc, mode, rdev, params);
-
- return 0;
-err:
- STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- inode_t *local_inode = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
-
- if (op_ret >= 0) {
- local->op_ret = 0;
-
- if (FIRST_CHILD(this) == prev->this) {
- local->inode = inode_ref (inode);
- local->stbuf = *buf;
- local->postparent = *postparent;
- local->preparent = *preparent;
- }
- local->stbuf_blocks += buf->ia_blocks;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->preparent_size < preparent->ia_size)
- local->preparent_size = preparent->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- local_inode = local->inode;
-
- if (local->op_ret != -1) {
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
- STACK_UNWIND_STRICT (mkdir, frame, local->op_ret,
- local->op_errno, local->inode,
- &local->stbuf, &local->preparent,
- &local->postparent);
-
- if (local_inode)
- inode_unref (local_inode);
- }
-out:
- return 0;
-}
-
-
-int
-stripe_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dict_t *params)
-{
- stripe_private_t *priv = NULL;
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- local->call_count = priv->child_count;
- frame->local = local;
-
- /* Everytime in stripe lookup, all child nodes should be looked up */
- while (trav) {
- STACK_WIND (frame, stripe_mkdir_cbk,
- trav->xlator, trav->xlator->fops->mkdir,
- loc, mode, params);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- inode_t *local_inode = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
-
- if (op_ret >= 0) {
- local->op_ret = 0;
-
- if (FIRST_CHILD(this) == prev->this) {
- local->inode = inode_ref (inode);
- local->stbuf = *buf;
- local->postparent = *postparent;
- local->preparent = *preparent;
- }
- local->stbuf_blocks += buf->ia_blocks;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->preparent_size < preparent->ia_size)
- local->preparent_size = preparent->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- local_inode = local->inode;
-
- if (local->op_ret != -1) {
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
- STACK_UNWIND_STRICT (link, frame, local->op_ret,
- local->op_errno, local->inode,
- &local->stbuf, &local->preparent,
- &local->postparent);
-
- if (local_inode)
- inode_unref (local_inode);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (oldloc, err);
- VALIDATE_OR_GOTO (oldloc->path, err);
- VALIDATE_OR_GOTO (oldloc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* If any one node is down, don't allow link operation */
- if (priv->nodes_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- /* Everytime in stripe lookup, all child
- nodes should be looked up */
- while (trav) {
- STACK_WIND (frame, stripe_link_cbk,
- trav->xlator, trav->xlator->fops->link,
- oldloc, newloc);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (link, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_create_fail_unlink_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- fd_t *lfd = NULL;
- stripe_local_t *local = NULL;
- inode_t *local_inode = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- local_inode = local->inode;
- lfd = local->fd;
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (create, frame, local->op_ret, local->op_errno,
- local->fd, local->inode, &local->stbuf,
- &local->preparent, &local->postparent);
-
- if (local_inode)
- inode_unref (local_inode);
- if (lfd)
- fd_unref (lfd);
- }
-out:
- return 0;
-}
-
-
-int32_t
-stripe_create_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- inode_t *local_inode = NULL;
- fd_t *lfd = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t callcnt = 0;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- priv = this->private;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_ret = -1;
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->op_ret == -1) {
- local->call_count = priv->child_count;
- trav = this->children;
- while (trav) {
- STACK_WIND (frame,
- stripe_create_fail_unlink_cbk,
- trav->xlator,
- trav->xlator->fops->unlink,
- &local->loc);
- trav = trav->next;
- }
-
- return 0;
- }
-
- lfd = local->fd;
- local_inode = local->inode;
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (create, frame, local->op_ret, local->op_errno,
- local->fd, local->inode, &local->stbuf,
- &local->preparent, &local->postparent);
-
- if (local_inode)
- inode_unref (local_inode);
- if (lfd)
- fd_unref (lfd);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd,
- inode_t *inode, struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- fd_t *lfd = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- inode_t *local_inode = NULL;
- call_frame_t *prev = NULL;
- int ret = 0;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- priv = this->private;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- /* Get the mapping in inode private */
- /* Get the stat buf right */
- if (FIRST_CHILD(this) == prev->this) {
- local->stbuf = *buf;
- local->preparent = *preparent;
- local->postparent = *postparent;
- }
-
- local->stbuf_blocks += buf->ia_blocks;
- local->preparent_blocks += preparent->ia_blocks;
- local->postparent_blocks += postparent->ia_blocks;
-
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- if (local->preparent_size < preparent->ia_size)
- local->preparent_size = preparent->ia_size;
- if (local->postparent_size < postparent->ia_size)
- local->postparent_size = postparent->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->preparent.ia_blocks = local->preparent_blocks;
- local->preparent.ia_size = local->preparent_size;
- local->postparent.ia_blocks = local->postparent_blocks;
- local->postparent.ia_size = local->postparent_size;
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
-
- /* */
- if (local->op_ret >= 0) {
- fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t),
- gf_stripe_mt_stripe_fd_ctx_t);
- if (!fctx) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto unwind;
- }
-
- fctx->stripe_size = local->stripe_size;
- fctx->stripe_count = priv->child_count;
- fctx->static_array = 1;
- fctx->xl_array = priv->xl_array;
- fd_ctx_set (local->fd, this,
- (uint64_t)(long)fctx);
- }
-
- if ((local->op_ret != -1) &&
- local->stripe_size && priv->xattr_supported) {
- /* Send a setxattr request to nodes where
- the files are created */
- int32_t i = 0;
- char size_key[256] = {0,};
- char index_key[256] = {0,};
- char count_key[256] = {0,};
- dict_t *dict = NULL;
-
- sprintf (size_key,
- "trusted.%s.stripe-size", this->name);
- sprintf (count_key,
- "trusted.%s.stripe-count", this->name);
- sprintf (index_key,
- "trusted.%s.stripe-index", this->name);
-
- local->call_count = priv->child_count;
- memcpy (local->loc.inode->gfid, local->stbuf.ia_gfid, 16);
- for (i = 0; i < priv->child_count; i++) {
- dict = get_new_dict ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "error allocating dict");
- }
- dict_ref (dict);
-
- /* TODO: check return values */
- ret = dict_set_int64 (dict, size_key,
- local->stripe_size);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR,
- "%s: set stripe-size failed",
- local->loc.path);
-
- ret = dict_set_int32 (dict, count_key,
- priv->child_count);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR,
- "%s: set stripe-size failed",
- local->loc.path);
-
- ret = dict_set_int32 (dict, index_key, i);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR,
- "%s: set stripe-size failed",
- local->loc.path);
-
- STACK_WIND (frame, stripe_create_setxattr_cbk,
- priv->xl_array[i],
- priv->xl_array[i]->fops->setxattr,
- &local->loc, dict, 0);
-
- dict_unref (dict);
- }
- return 0;
- }
-
-unwind:
- /* Create itself has failed.. so return
- without setxattring */
- lfd = local->fd;
- local_inode = local->inode;
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (create, frame, local->op_ret,
- local->op_errno, local->fd,
- local->inode, &local->stbuf,
- &local->preparent, &local->postparent);
-
- if (local_inode)
- inode_unref (local_inode);
- if (lfd)
- fd_unref (lfd);
- }
-
-out:
- return 0;
-}
-
-
-/**
- * stripe_create - If a block-size is specified for the 'name', create the
- * file in all the child nodes. If not, create it in only first child.
- *
- * @name- complete path of the file to be created.
- */
-int32_t
-stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t flags, mode_t mode, fd_t *fd, dict_t *params)
-{
- stripe_private_t *priv = NULL;
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
-
- /* files created in O_APPEND mode does not allow lseek() on fd */
- flags &= ~O_APPEND;
-
- if (priv->first_child_down || priv->nodes_down) {
- gf_log (this->name, GF_LOG_DEBUG,
- "First node down, returning EIO");
- op_errno = EIO;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- local->stripe_size = stripe_get_matching_bs (loc->path,
- priv->pattern,
- priv->block_size);
- frame->local = local;
- local->inode = inode_ref (loc->inode);
- loc_copy (&local->loc, loc);
- local->fd = fd_ref (fd);
-
- local->call_count = priv->child_count;
-
- trav = this->children;
- while (trav) {
- STACK_WIND (frame, stripe_create_cbk, trav->xlator,
- trav->xlator->fops->create, loc, flags,
- mode, fd, params);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (create, frame, -1, op_errno, NULL, NULL, NULL,
- NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- fd_t *lfd = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0)
- local->op_ret = op_ret;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret == -1) {
- if (local->fctx) {
- if (!local->fctx->static_array)
- GF_FREE (local->fctx->xl_array);
- GF_FREE (local->fctx);
- }
- } else {
- fd_ctx_set (local->fd, this,
- (uint64_t)(long)local->fctx);
- }
-
- lfd = local->fd;
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (open, frame, local->op_ret,
- local->op_errno, local->fd);
- if (lfd)
- fd_unref (lfd);
-
- }
-out:
- return 0;
-}
-
-
-int32_t
-stripe_open_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
-{
- int32_t index = 0;
- int32_t callcnt = 0;
- char key[256] = {0,};
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- stripe_private_t *priv = NULL;
- data_t *data = NULL;
- call_frame_t *prev = NULL;
- fd_t *lfd = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = (call_frame_t *)cookie;
- priv = this->private;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_ret = -1;
- if (local->op_errno != EIO)
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- goto unlock;
- }
-
- if (!dict)
- goto unlock;
-
- if (!local->fctx) {
- local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t),
- gf_stripe_mt_stripe_fd_ctx_t);
- if (!local->fctx) {
- local->op_errno = ENOMEM;
- local->op_ret = -1;
- goto unlock;
- }
-
- local->fctx->static_array = 0;
- }
- /* Stripe block size */
- sprintf (key, "trusted.%s.stripe-size", this->name);
- data = dict_get (dict, key);
- if (!data) {
- local->xattr_self_heal_needed = 1;
- } else {
- if (!local->fctx->stripe_size) {
- local->fctx->stripe_size =
- data_to_int64 (data);
- }
-
- if (local->fctx->stripe_size != data_to_int64 (data)) {
- gf_log (this->name, GF_LOG_WARNING,
- "stripe-size mismatch in blocks");
- local->xattr_self_heal_needed = 1;
- }
- }
- /* Stripe count */
- sprintf (key, "trusted.%s.stripe-count", this->name);
- data = dict_get (dict, key);
- if (!data) {
- local->xattr_self_heal_needed = 1;
- goto unlock;
- }
- if (!local->fctx->xl_array) {
- local->fctx->stripe_count = data_to_int32 (data);
- if (!local->fctx->stripe_count) {
- gf_log (this->name, GF_LOG_ERROR,
- "error with stripe-count xattr");
- local->op_ret = -1;
- local->op_errno = EIO;
- goto unlock;
- }
-
- local->fctx->xl_array =
- GF_CALLOC (local->fctx->stripe_count,
- sizeof (xlator_t *),
- gf_stripe_mt_xlator_t);
- if (!local->fctx->xl_array) {
- local->op_errno = ENOMEM;
- local->op_ret = -1;
- goto unlock;
- }
- }
- if (local->fctx->stripe_count != data_to_int32 (data)) {
- gf_log (this->name, GF_LOG_ERROR,
- "error with stripe-count xattr (%d != %d)",
- local->fctx->stripe_count, data_to_int32 (data));
- local->op_ret = -1;
- local->op_errno = EIO;
- goto unlock;
- }
-
- /* index */
- sprintf (key, "trusted.%s.stripe-index", this->name);
- data = dict_get (dict, key);
- if (!data) {
- local->xattr_self_heal_needed = 1;
- goto unlock;
- }
- index = data_to_int32 (data);
- if (index > priv->child_count) {
- gf_log (this->name, GF_LOG_ERROR,
- "error with stripe-index xattr (%d)", index);
- local->op_ret = -1;
- local->op_errno = EIO;
- goto unlock;
- }
- if (local->fctx->xl_array) {
- if (local->fctx->xl_array[index]) {
- gf_log (this->name, GF_LOG_ERROR,
- "duplicate entry @ index (%d)", index);
- local->op_ret = -1;
- local->op_errno = EIO;
- goto unlock;
- }
- local->fctx->xl_array[index] = prev->this;
- }
- local->entry_count++;
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- /* TODO: if self-heal flag is set, do it */
- if (local->xattr_self_heal_needed) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: stripe info need to be healed",
- local->loc.path);
- }
-
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret)
- goto err;
-
- if (local->entry_count != local->fctx->stripe_count) {
- gf_log (this->name, GF_LOG_ERROR,
- "entry-count (%d) != stripe-count (%d)",
- local->entry_count, local->fctx->stripe_count);
- local->op_ret = -1;
- local->op_errno = EIO;
- goto err;
- }
- if (!local->fctx->stripe_size) {
- gf_log (this->name, GF_LOG_ERROR, "stripe size not set");
- local->op_ret = -1;
- local->op_errno = EIO;
- goto err;
- }
-
- local->call_count = local->fctx->stripe_count;
-
- trav = this->children;
- while (trav) {
- STACK_WIND (frame, stripe_open_cbk, trav->xlator,
- trav->xlator->fops->open, &local->loc,
- local->flags, local->fd, 0);
- trav = trav->next;
- }
- }
-
- return 0;
-err:
- lfd = local->fd;
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (open, frame, local->op_ret, local->op_errno,
- local->fd);
- if (lfd)
- fd_unref (lfd);
-out:
- return 0;
-}
-
-/**
- * stripe_open -
- */
-int32_t
-stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t flags, fd_t *fd, int32_t wbflags)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- /* files opened in O_APPEND mode does not allow lseek() on fd */
- flags &= ~O_APPEND;
-
- local->fd = fd_ref (fd);
- frame->local = local;
- loc_copy (&local->loc, loc);
-
- /* Striped files */
- local->flags = flags;
- local->call_count = priv->child_count;
- local->stripe_size = stripe_get_matching_bs (loc->path,
- priv->pattern,
- priv->block_size);
-
- if (priv->xattr_supported) {
- while (trav) {
- STACK_WIND (frame, stripe_open_getxattr_cbk,
- trav->xlator, trav->xlator->fops->getxattr,
- loc, NULL);
- trav = trav->next;
- }
- return 0;
- }
- local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t),
- gf_stripe_mt_stripe_fd_ctx_t);
- if (!local->fctx) {
- op_errno = ENOMEM;
- goto err;
- }
-
- local->fctx->static_array = 1;
- local->fctx->stripe_size = local->stripe_size;
- local->fctx->stripe_count = priv->child_count;
- local->fctx->xl_array = priv->xl_array;
-
- while (trav) {
- STACK_WIND (frame, stripe_open_cbk, trav->xlator,
- trav->xlator->fops->open,
- &local->loc, local->flags, local->fd,
- wbflags);
- trav = trav->next;
- }
- return 0;
-err:
- STACK_UNWIND_STRICT (open, frame, -1, op_errno, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- fd_t *local_fd = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_ret = -1;
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0)
- local->op_ret = op_ret;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- local_fd = local->fd;
- STACK_UNWIND_STRICT (opendir, frame, local->op_ret,
- local->op_errno, local->fd);
- if (local_fd)
- fd_unref (local_fd);
- }
-out:
- return 0;
-}
-
-
-int32_t
-stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
-{
- xlator_list_t *trav = NULL;
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- frame->local = local;
- local->call_count = priv->child_count;
- local->fd = fd_ref (fd);
-
- while (trav) {
- STACK_WIND (frame, stripe_opendir_cbk, trav->xlator,
- trav->xlator->fops->opendir, loc, fd);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (opendir, frame, -1, op_errno, NULL);
- return 0;
-}
-
-int32_t
-stripe_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
- if (op_ret >= 0) {
- if (FIRST_CHILD(this) == prev->this) {
- /* First successful call, copy the *lock */
- local->op_ret = op_ret;
- local->lock = *lock;
- }
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
- STACK_UNWIND_STRICT (lk, frame, local->op_ret,
- local->op_errno, &local->lock);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
- struct gf_flock *lock)
-{
- stripe_local_t *local = NULL;
- xlator_list_t *trav = NULL;
- stripe_private_t *priv = NULL;
- int32_t op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- trav = this->children;
- priv = this->private;
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_lk_cbk, trav->xlator,
- trav->xlator->fops->lk, fd, cmd, lock);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (lk, frame, -1, op_errno, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
- if (op_ret >= 0)
- local->op_ret = op_ret;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (flush, frame, local->op_ret,
- local->op_errno);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- if (priv->first_child_down) {
- op_errno = ENOTCONN;
- goto err;
- }
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_flush_cbk, trav->xlator,
- trav->xlator->fops->flush, fd);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (flush, frame, -1, op_errno);
- return 0;
-}
-
-
-
-int32_t
-stripe_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- if (FIRST_CHILD(this) == prev->this) {
- local->pre_buf = *prebuf;
- local->post_buf = *postbuf;
- }
- local->prebuf_blocks += prebuf->ia_blocks;
- local->postbuf_blocks += postbuf->ia_blocks;
-
- if (local->prebuf_size < prebuf->ia_size)
- local->prebuf_size = prebuf->ia_size;
-
- if (local->postbuf_size < postbuf->ia_size)
- local->postbuf_size = postbuf->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->pre_buf.ia_blocks = local->prebuf_blocks;
- local->pre_buf.ia_size = local->prebuf_size;
- local->post_buf.ia_blocks = local->postbuf_blocks;
- local->post_buf.ia_size = local->postbuf_size;
- }
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (fsync, frame, local->op_ret,
- local->op_errno, &local->pre_buf,
- &local->post_buf);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_fsync_cbk, trav->xlator,
- trav->xlator->fops->fsync, fd, flags);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-int32_t
-stripe_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
-
- if (FIRST_CHILD(this) == prev->this)
- local->stbuf = *buf;
-
- local->stbuf_blocks += buf->ia_blocks;
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- if (local->op_ret != -1) {
- local->stbuf.ia_size = local->stbuf_size;
- local->stbuf.ia_blocks = local->stbuf_blocks;
- }
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (fstat, frame, local->op_ret,
- local->op_errno, &local->stbuf);
- }
-
-out:
- return 0;
-}
-
-int32_t
-stripe_fstat (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_fstat_cbk, trav->xlator,
- trav->xlator->fops->fstat, fd);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (fstat, frame, -1, op_errno, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_truncate_cbk, trav->xlator,
- trav->xlator->fops->ftruncate, fd, offset);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- if ((op_errno != ENOENT) ||
- (prev->this == FIRST_CHILD (this)))
- local->failed = 1;
- }
- if (op_ret >= 0)
- local->op_ret = op_ret;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed)
- local->op_ret = -1;
-
- stripe_local_wipe (local);
- STACK_UNWIND_STRICT (fsyncdir, frame, local->op_ret,
- local->op_errno);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags)
-{
- stripe_local_t *local = NULL;
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- int32_t op_errno = 1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- priv = this->private;
- trav = this->children;
-
- /* Initialization */
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- local->op_ret = -1;
- frame->local = local;
- local->call_count = priv->child_count;
-
- while (trav) {
- STACK_WIND (frame, stripe_fsyncdir_cbk, trav->xlator,
- trav->xlator->fops->fsyncdir, fd, flags);
- trav = trav->next;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (fsyncdir, frame, -1, op_errno);
- return 0;
-}
-
-
-int32_t
-stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- int32_t i = 0;
- int32_t callcnt = 0;
- int32_t count = 0;
- stripe_local_t *local = NULL;
- struct iovec *vec = NULL;
- struct iatt tmp_stbuf = {0,};
- struct iobref *tmp_iobref = NULL;
- struct iobuf *iobuf = NULL;
-
- if (!this || !frame || !frame->local) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret != -1)
- if (local->stbuf_size < buf->ia_size)
- local->stbuf_size = buf->ia_size;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- op_ret = 0;
-
- /* Keep extra space for filling in '\0's */
- vec = GF_CALLOC ((local->count * 2), sizeof (struct iovec),
- gf_stripe_mt_iovec);
- if (!vec) {
- op_ret = -1;
- goto done;
- }
-
- for (i = 0; i < local->wind_count; i++) {
- if (local->replies[i].op_ret) {
- memcpy ((vec + count), local->replies[i].vector,
- (local->replies[i].count * sizeof (struct iovec)));
- count += local->replies[i].count;
- op_ret += local->replies[i].op_ret;
- }
- if ((local->replies[i].op_ret <
- local->replies[i].requested_size) &&
- (local->stbuf_size > (local->offset + op_ret))) {
- /* Fill in 0s here */
- vec[count].iov_len =
- (local->replies[i].requested_size -
- local->replies[i].op_ret);
- iobuf = iobuf_get (this->ctx->iobuf_pool);
- if (!iobuf) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- op_ret = -1;
- op_errno = ENOMEM;
- goto done;
- }
- memset (iobuf->ptr, 0, vec[count].iov_len);
- iobref_add (local->iobref, iobuf);
- vec[count].iov_base = iobuf->ptr;
-
- op_ret += vec[count].iov_len;
- count++;
- }
- GF_FREE (local->replies[i].vector);
- }
-
- /* FIXME: notice that st_ino, and st_dev (gen) will be
- * different than what inode will have. Make sure this doesn't
- * cause any bugs at higher levels */
- memcpy (&tmp_stbuf, &local->replies[0].stbuf,
- sizeof (struct iatt));
- tmp_stbuf.ia_size = local->stbuf_size;
-
- done:
- GF_FREE (local->replies);
- tmp_iobref = local->iobref;
- fd_unref (local->fd);
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vec,
- count, &tmp_stbuf, tmp_iobref);
-
- iobref_unref (tmp_iobref);
- if (vec)
- GF_FREE (vec);
- }
-out:
- return 0;
-}
-
-/**
- * stripe_readv_cbk - get all the striped reads, and order it properly, send it
- * to above layer after putting it in a single vector.
- */
-int32_t
-stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iovec *vector,
- int32_t count, struct iatt *stbuf, struct iobref *iobref)
-{
- int32_t index = 0;
- int32_t callcnt = 0;
- int32_t final_count = 0;
- int32_t need_to_check_proper_size = 0;
- call_frame_t *mframe = NULL;
- stripe_local_t *mlocal = NULL;
- stripe_local_t *local = NULL;
- struct iovec *final_vec = NULL;
- struct iatt tmp_stbuf = {0,};
- struct iobref *tmp_iobref = NULL;
- stripe_fd_ctx_t *fctx = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto end;
- }
-
- local = frame->local;
- index = local->node_index;
- mframe = local->orig_frame;
- if (!mframe)
- goto out;
-
- mlocal = mframe->local;
- if (!mlocal)
- goto out;
-
- fctx = mlocal->fctx;
-
- LOCK (&mframe->lock);
- {
- mlocal->replies[index].op_ret = op_ret;
- mlocal->replies[index].op_errno = op_errno;
- mlocal->replies[index].requested_size = local->readv_size;
- if (op_ret >= 0) {
- mlocal->replies[index].stbuf = *stbuf;
- mlocal->replies[index].count = count;
- mlocal->replies[index].vector = iov_dup (vector, count);
-
- if (!mlocal->iobref)
- mlocal->iobref = iobref_new ();
- iobref_merge (mlocal->iobref, iobref);
- }
- callcnt = ++mlocal->call_count;
- }
- UNLOCK(&mframe->lock);
-
- if (callcnt == mlocal->wind_count) {
- op_ret = 0;
-
- for (index=0; index < mlocal->wind_count; index++) {
- /* check whether each stripe returned
- * 'expected' number of bytes */
- if (mlocal->replies[index].op_ret == -1) {
- op_ret = -1;
- op_errno = mlocal->replies[index].op_errno;
- break;
- }
- /* TODO: handle the 'holes' within the read range
- properly */
- if (mlocal->replies[index].op_ret <
- mlocal->replies[index].requested_size) {
- need_to_check_proper_size = 1;
- }
-
- op_ret += mlocal->replies[index].op_ret;
- mlocal->count += mlocal->replies[index].count;
- }
- if (op_ret == -1)
- goto done;
- if (need_to_check_proper_size)
- goto check_size;
-
- final_vec = GF_CALLOC (mlocal->count, sizeof (struct iovec),
- gf_stripe_mt_iovec);
-
- if (!final_vec) {
- op_ret = -1;
- goto done;
- }
-
- for (index = 0; index < mlocal->wind_count; index++) {
- memcpy ((final_vec + final_count),
- mlocal->replies[index].vector,
- (mlocal->replies[index].count *
- sizeof (struct iovec)));
- final_count += mlocal->replies[index].count;
- GF_FREE (mlocal->replies[index].vector);
- }
-
- /* FIXME: notice that st_ino, and st_dev (gen) will be
- * different than what inode will have. Make sure this doesn't
- * cause any bugs at higher levels */
- memcpy (&tmp_stbuf, &mlocal->replies[0].stbuf,
- sizeof (struct iatt));
-
- done:
- /* */
- GF_FREE (mlocal->replies);
- tmp_iobref = mlocal->iobref;
- fd_unref (mlocal->fd);
- STACK_UNWIND_STRICT (readv, mframe, op_ret, op_errno, final_vec,
- final_count, &tmp_stbuf, tmp_iobref);
-
- iobref_unref (tmp_iobref);
- if (final_vec)
- GF_FREE (final_vec);
- }
-
- goto out;
-
-check_size:
- mlocal->call_count = fctx->stripe_count;
-
- for (index = 0; index < fctx->stripe_count; index++) {
- STACK_WIND (mframe, stripe_readv_fstat_cbk,
- (fctx->xl_array[index]),
- (fctx->xl_array[index])->fops->fstat,
- mlocal->fd);
- }
-
-out:
- STACK_DESTROY (frame->root);
-end:
- return 0;
-}
-
-
-int32_t
-stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
- size_t size, off_t offset)
-{
- int32_t op_errno = EINVAL;
- int32_t idx = 0;
- int32_t index = 0;
- int32_t num_stripe = 0;
- int32_t off_index = 0;
- size_t frame_size = 0;
- off_t rounded_end = 0;
- uint64_t tmp_fctx = 0;
- uint64_t stripe_size = 0;
- off_t rounded_start = 0;
- off_t frame_offset = offset;
- stripe_local_t *local = NULL;
- call_frame_t *rframe = NULL;
- stripe_local_t *rlocal = NULL;
- stripe_fd_ctx_t *fctx = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- fd_ctx_get (fd, this, &tmp_fctx);
- if (!tmp_fctx) {
- op_errno = EBADFD;
- goto err;
- }
- fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
- stripe_size = fctx->stripe_size;
-
- if (!stripe_size) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Wrong stripe size for the file");
- goto err;
- }
- /* The file is stripe across the child nodes. Send the read request
- * to the child nodes appropriately after checking which region of
- * the file is in which child node. Always '0-<stripe_size>' part of
- * the file resides in the first child.
- */
- rounded_start = floor (offset, stripe_size);
- rounded_end = roof (offset+size, stripe_size);
- num_stripe = (rounded_end- rounded_start)/stripe_size;
-
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- frame->local = local;
-
- /* This is where all the vectors should be copied. */
- local->replies = GF_CALLOC (num_stripe, sizeof (struct readv_replies),
- gf_stripe_mt_readv_replies);
- if (!local->replies) {
- op_errno = ENOMEM;
- goto err;
- }
-
- off_index = (offset / stripe_size) % fctx->stripe_count;
- local->wind_count = num_stripe;
- local->readv_size = size;
- local->offset = offset;
- local->fd = fd_ref (fd);
- local->fctx = fctx;
-
- for (index = off_index; index < (num_stripe + off_index); index++) {
- rframe = copy_frame (frame);
- rlocal = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!rlocal) {
- op_errno = ENOMEM;
- goto err;
- }
-
- frame_size = min (roof (frame_offset+1, stripe_size),
- (offset + size)) - frame_offset;
-
- rlocal->node_index = index - off_index;
- rlocal->orig_frame = frame;
- rlocal->readv_size = frame_size;
- rframe->local = rlocal;
- idx = (index % fctx->stripe_count);
- STACK_WIND (rframe, stripe_readv_cbk, fctx->xl_array[idx],
- fctx->xl_array[idx]->fops->readv,
- fd, frame_size, frame_offset);
-
- frame_offset += frame_size;
- }
-
- return 0;
-err:
- if (local && local->fd)
- fd_unref (local->fd);
-
- STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- int32_t callcnt = 0;
- stripe_local_t *local = NULL;
- call_frame_t *prev = NULL;
-
- if (!this || !frame || !frame->local || !cookie) {
- gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
- goto out;
- }
-
- prev = cookie;
- local = frame->local;
-
- LOCK(&frame->lock);
- {
- callcnt = ++local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- local->op_ret = -1;
- }
- if (op_ret >= 0) {
- local->op_ret += op_ret;
- local->post_buf = *postbuf;
- local->pre_buf = *prebuf;
- }
- }
- UNLOCK (&frame->lock);
-
- if ((callcnt == local->wind_count) && local->unwind) {
- STACK_UNWIND_STRICT (writev, frame, local->op_ret,
- local->op_errno, &local->pre_buf,
- &local->post_buf);
- }
-out:
- return 0;
-}
-
-int32_t
-stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- struct iobref *iobref)
-{
- struct iovec *tmp_vec = NULL;
- stripe_local_t *local = NULL;
- stripe_fd_ctx_t *fctx = NULL;
- int32_t op_errno = 1;
- int32_t idx = 0;
- int32_t total_size = 0;
- int32_t offset_offset = 0;
- int32_t remaining_size = 0;
- int32_t tmp_count = count;
- off_t fill_size = 0;
- uint64_t stripe_size = 0;
- uint64_t tmp_fctx = 0;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
-
- fd_ctx_get (fd, this, &tmp_fctx);
- if (!tmp_fctx) {
- op_errno = EINVAL;
- goto err;
- }
- fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
- stripe_size = fctx->stripe_size;
-
- /* File has to be stripped across the child nodes */
- for (idx = 0; idx< count; idx ++) {
- total_size += vector[idx].iov_len;
- }
- remaining_size = total_size;
-
- local = GF_CALLOC (1, sizeof (stripe_local_t),
- gf_stripe_mt_stripe_local_t);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
- frame->local = local;
- local->stripe_size = stripe_size;
-
- while (1) {
- /* Send striped chunk of the vector to child
- nodes appropriately. */
- idx = (((offset + offset_offset) /
- local->stripe_size) % fctx->stripe_count);
-
- fill_size = (local->stripe_size -
- ((offset + offset_offset) % local->stripe_size));
- if (fill_size > remaining_size)
- fill_size = remaining_size;
-
- remaining_size -= fill_size;
-
- tmp_count = iov_subset (vector, count, offset_offset,
- offset_offset + fill_size, NULL);
- tmp_vec = GF_CALLOC (tmp_count, sizeof (struct iovec),
- gf_stripe_mt_iovec);
- if (!tmp_vec) {
- op_errno = ENOMEM;
- goto err;
- }
- tmp_count = iov_subset (vector, count, offset_offset,
- offset_offset + fill_size, tmp_vec);
-
- local->wind_count++;
- if (remaining_size == 0)
- local->unwind = 1;
-
- STACK_WIND (frame, stripe_writev_cbk, fctx->xl_array[idx],
- fctx->xl_array[idx]->fops->writev, fd, tmp_vec,
- tmp_count, offset + offset_offset, iobref);
- GF_FREE (tmp_vec);
- offset_offset += fill_size;
- if (remaining_size == 0)
- break;
- }
-
- return 0;
-err:
- STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-stripe_release (xlator_t *this, fd_t *fd)
-{
- uint64_t tmp_fctx = 0;
- stripe_fd_ctx_t *fctx = NULL;
-
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- fd_ctx_del (fd, this, &tmp_fctx);
- if (!tmp_fctx) {
- goto err;
- }
-
- fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
-
- if (!fctx->static_array)
- GF_FREE (fctx->xl_array);
-
- GF_FREE (fctx);
-
-err:
- return 0;
-}
-
-
-int32_t
-notify (xlator_t *this, int32_t event, void *data, ...)
-{
- stripe_private_t *priv = NULL;
- int down_client = 0;
- int i = 0;
-
- if (!this)
- return 0;
-
- priv = this->private;
- if (!priv)
- return 0;
-
- switch (event)
- {
- case GF_EVENT_CHILD_UP:
- {
- /* get an index number to set */
- for (i = 0; i < priv->child_count; i++) {
- if (data == priv->xl_array[i])
- break;
- }
- priv->state[i] = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->state[i])
- down_client++;
- }
-
- LOCK (&priv->lock);
- {
- priv->nodes_down = down_client;
- if (data == FIRST_CHILD (this))
- priv->first_child_down = 0;
- if (!priv->nodes_down)
- default_notify (this, event, data);
- }
- UNLOCK (&priv->lock);
- }
- break;
- case GF_EVENT_CHILD_DOWN:
- {
- /* get an index number to set */
- for (i = 0; i < priv->child_count; i++) {
- if (data == priv->xl_array[i])
- break;
- }
- priv->state[i] = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->state[i])
- down_client++;
- }
-
- LOCK (&priv->lock);
- {
- priv->nodes_down = down_client;
-
- if (data == FIRST_CHILD (this))
- priv->first_child_down = 1;
- if (priv->nodes_down)
- default_notify (this, event, data);
- }
- UNLOCK (&priv->lock);
- }
- break;
-
- default:
- {
- /* */
- default_notify (this, event, data);
- }
- break;
- }
-
- return 0;
-}
-
-int
-set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data)
-{
- int ret = -1;
- char *tmp_str = NULL;
- char *tmp_str1 = NULL;
- char *dup_str = NULL;
- char *stripe_str = NULL;
- char *pattern = NULL;
- char *num = NULL;
- struct stripe_options *temp_stripeopt = NULL;
- struct stripe_options *stripe_opt = NULL;
-
- if (!this || !priv || !data)
- goto out;
-
- /* Get the pattern for striping.
- "option block-size *avi:10MB" etc */
- stripe_str = strtok_r (data, ",", &tmp_str);
- while (stripe_str) {
- dup_str = gf_strdup (stripe_str);
- stripe_opt = CALLOC (1, sizeof (struct stripe_options));
- if (!stripe_opt) {
- GF_FREE (dup_str);
- goto out;
- }
-
- pattern = strtok_r (dup_str, ":", &tmp_str1);
- num = strtok_r (NULL, ":", &tmp_str1);
- if (!num) {
- num = pattern;
- pattern = "*";
- }
- if (gf_string2bytesize (num, &stripe_opt->block_size) != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "invalid number format \"%s\"", num);
- goto out;
- }
- memcpy (stripe_opt->path_pattern, pattern, strlen (pattern));
-
- gf_log (this->name, GF_LOG_DEBUG,
- "block-size : pattern %s : size %"PRId64,
- stripe_opt->path_pattern, stripe_opt->block_size);
-
- if (!priv->pattern) {
- priv->pattern = stripe_opt;
- } else {
- temp_stripeopt = priv->pattern;
- while (temp_stripeopt->next)
- temp_stripeopt = temp_stripeopt->next;
- temp_stripeopt->next = stripe_opt;
- }
- stripe_str = strtok_r (NULL, ",", &tmp_str);
- GF_FREE (dup_str);
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- goto out;
-
- ret = xlator_mem_acct_init (this, gf_stripe_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- goto out;
- }
-
-out:
- return ret;
-}
-int
-validate_options (xlator_t *this, dict_t *options, char **op_errstr)
-{
-
-
- data_t *data = NULL;
- int ret = 0;
- stripe_private_t *priv = NULL;
-
- data = dict_get (options, "block-size");
- if (data) {
- gf_log (this->name, GF_LOG_TRACE,"Reconfiguring Stripe"
- " Block-size");
- priv = GF_CALLOC (1, sizeof (stripe_private_t),
- gf_stripe_mt_stripe_private_t);
- if (!priv) {
- gf_log ("",GF_LOG_ERROR, "Unable to allocate memory");
- ret = -1;
- goto out;
- }
-
- ret = set_stripe_block_size (this, priv, data->data);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Reconfigue: Block-Size reconfiguration failed");
- *op_errstr = gf_strdup ("Error, could not parse list");
- ret = -1;
- goto out;
- }
- gf_log (this->name, GF_LOG_TRACE,
- "Reconfigue: Block-Size reconfigured Successfully");
- }
-
-out:
- if (priv)
- GF_FREE (priv);
- return ret;
-
-}
-
-int
-reconfigure (xlator_t *this, dict_t *options)
-{
-
- stripe_private_t *priv = NULL;
- data_t *data = NULL;
- int ret = 0;
-
-
- priv = this->private;
-
- data = dict_get (options, "block-size");
- if (data) {
- gf_log (this->name, GF_LOG_TRACE,"Reconfiguring Stripe"
- " Block-size");
- ret = set_stripe_block_size (this, priv, data->data);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Reconfigue: Block-Size reconfiguration failed");
- ret = -1;
- goto out;
- }
- gf_log (this->name, GF_LOG_TRACE,
- "Reconfigue: Block-Size reconfigured Successfully");
- }
- else {
- priv->block_size = (128 * GF_UNIT_KB);
- }
-
-
-out:
- return ret;
-
-}
-
-/**
- * init - This function is called when xlator-graph gets initialized.
- * The option given in volfiles are parsed here.
- * @this -
- */
-int32_t
-init (xlator_t *this)
-{
- stripe_private_t *priv = NULL;
- xlator_list_t *trav = NULL;
- data_t *data = NULL;
- int32_t count = 0;
- int ret = -1;
-
- if (!this)
- goto out;
-
- trav = this->children;
- while (trav) {
- count++;
- trav = trav->next;
- }
-
- if (!count) {
- gf_log (this->name, GF_LOG_ERROR,
- "stripe configured without \"subvolumes\" option. "
- "exiting");
- goto out;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile ");
- }
-
- if (count == 1) {
- gf_log (this->name, GF_LOG_ERROR,
- "stripe configured with only one \"subvolumes\" option."
- " please check the volume. exiting");
- goto out;
- }
-
- priv = GF_CALLOC (1, sizeof (stripe_private_t),
- gf_stripe_mt_stripe_private_t);
-
- if (!priv)
- goto out;
- priv->xl_array = GF_CALLOC (count, sizeof (xlator_t *),
- gf_stripe_mt_xlator_t);
- if (!priv->xl_array)
- goto out;
-
- priv->state = GF_CALLOC (count, sizeof (int8_t),
- gf_stripe_mt_int8_t);
- if (!priv->state)
- goto out;
-
- priv->child_count = count;
- LOCK_INIT (&priv->lock);
-
- trav = this->children;
- count = 0;
- while (trav) {
- priv->xl_array[count++] = trav->xlator;
- trav = trav->next;
- }
-
- if (count > 256) {
- gf_log (this->name, GF_LOG_ERROR,
- "maximum number of stripe subvolumes supported "
- "is 256");
- goto out;
- }
-
- priv->block_size = (128 * GF_UNIT_KB);
- /* option stripe-pattern *avi:1GB,*pdf:4096 */
- data = dict_get (this->options, "block-size");
- if (!data) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No \"option block-size <x>\" given, defaulting "
- "to 128KB");
- } else {
- ret = set_stripe_block_size (this, priv, data->data);
- if (ret)
- goto out;
- }
-
- priv->xattr_supported = 1;
- data = dict_get (this->options, "use-xattr");
- if (data) {
- if (gf_string2boolean (data->data,
- &priv->xattr_supported) == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "error setting hard check for extended "
- "attribute");
- //return -1;
- }
- }
-
- /* notify related */
- priv->nodes_down = priv->child_count;
- this->private = priv;
-
- ret = 0;
-out:
- if (ret) {
- if (priv) {
- if (priv->xl_array)
- GF_FREE (priv->xl_array);
- GF_FREE (priv);
- }
- }
- return ret;
-}
-
-/**
- * fini - Free all the private variables
- * @this -
- */
-void
-fini (xlator_t *this)
-{
- stripe_private_t *priv = NULL;
- struct stripe_options *prev = NULL;
- struct stripe_options *trav = NULL;
-
- if (!this)
- goto out;
-
- priv = this->private;
- if (priv) {
- this->private = NULL;
- if (priv->xl_array)
- GF_FREE (priv->xl_array);
-
- trav = priv->pattern;
- while (trav) {
- prev = trav;
- trav = trav->next;
- FREE (prev);
- }
- LOCK_DESTROY (&priv->lock);
- GF_FREE (priv);
- }
-
-out:
- return;
-}
-
-
-struct xlator_fops fops = {
- .stat = stripe_stat,
- .unlink = stripe_unlink,
- .rename = stripe_rename,
- .link = stripe_link,
- .truncate = stripe_truncate,
- .create = stripe_create,
- .open = stripe_open,
- .readv = stripe_readv,
- .writev = stripe_writev,
- .statfs = stripe_statfs,
- .flush = stripe_flush,
- .fsync = stripe_fsync,
- .ftruncate = stripe_ftruncate,
- .fstat = stripe_fstat,
- .mkdir = stripe_mkdir,
- .rmdir = stripe_rmdir,
- .lk = stripe_lk,
- .opendir = stripe_opendir,
- .fsyncdir = stripe_fsyncdir,
- .setattr = stripe_setattr,
- .fsetattr = stripe_fsetattr,
- .lookup = stripe_lookup,
- .mknod = stripe_mknod,
-};
-
-struct xlator_cbks cbks = {
- .release = stripe_release,
-};
-
-
-struct volume_options options[] = {
- { .key = {"block-size"},
- .type = GF_OPTION_TYPE_ANY
- },
- { .key = {"use-xattr"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {NULL} },
-};
diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h
deleted file mode 100644
index d94d82d6dc9..00000000000
--- a/xlators/cluster/stripe/src/stripe.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-#ifndef _STRIPE_H_
-#define _STRIPE_H_
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "logging.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat.h"
-#include "compat-errno.h"
-#include "stripe-mem-types.h"
-#include <fnmatch.h>
-#include <signal.h>
-
-
-/**
- * struct stripe_options : This keeps the pattern and the block-size
- * information, which is used for striping on a file.
- */
-struct stripe_options {
- struct stripe_options *next;
- char path_pattern[256];
- uint64_t block_size;
-};
-
-/**
- * Private structure for stripe translator
- */
-struct stripe_private {
- struct stripe_options *pattern;
- xlator_t **xl_array;
- uint64_t block_size;
- gf_lock_t lock;
- uint8_t nodes_down;
- int8_t first_child_down;
- int8_t child_count;
- int8_t *state; /* Current state of child node */
- gf_boolean_t xattr_supported; /* default yes */
-};
-
-/**
- * Used to keep info about the replies received from fops->readv calls
- */
-struct readv_replies {
- struct iovec *vector;
- int32_t count; //count of vector
- int32_t op_ret; //op_ret of readv
- int32_t op_errno;
- int32_t requested_size;
- struct iatt stbuf; /* 'stbuf' is also a part of reply */
-};
-
-typedef struct _stripe_fd_ctx {
- off_t stripe_size;
- int stripe_count;
- int static_array;
- xlator_t **xl_array;
-} stripe_fd_ctx_t;
-
-
-/**
- * Local structure to be passed with all the frames in case of STACK_WIND
- */
-struct stripe_local; /* this itself is used inside the structure; */
-
-struct stripe_local {
- struct stripe_local *next;
- call_frame_t *orig_frame;
-
- stripe_fd_ctx_t *fctx;
-
- /* Used by _cbk functions */
- struct iatt stbuf;
- struct iatt pre_buf;
- struct iatt post_buf;
- struct iatt preparent;
- struct iatt postparent;
-
- off_t stbuf_size;
- off_t prebuf_size;
- off_t postbuf_size;
- off_t preparent_size;
- off_t postparent_size;
-
- blkcnt_t stbuf_blocks;
- blkcnt_t prebuf_blocks;
- blkcnt_t postbuf_blocks;
- blkcnt_t preparent_blocks;
- blkcnt_t postparent_blocks;
-
- struct readv_replies *replies;
- struct statvfs statvfs_buf;
- dir_entry_t *entry;
-
- int8_t revalidate;
- int8_t failed;
- int8_t unwind;
-
- size_t readv_size;
- int32_t entry_count;
- int32_t node_index;
- int32_t call_count;
- int32_t wind_count; /* used instead of child_cound
- in case of read and write */
- int32_t op_ret;
- int32_t op_errno;
- int32_t count;
- int32_t flags;
- char *name;
- inode_t *inode;
-
- loc_t loc;
- loc_t loc2;
-
- /* For File I/O fops */
- dict_t *dict;
-
- /* General usage */
- off_t offset;
- off_t stripe_size;
-
- int xattr_self_heal_needed;
- int entry_self_heal_needed;
-
- int8_t *list;
- struct gf_flock lock;
- fd_t *fd;
- void *value;
- struct iobref *iobref;
-};
-
-typedef struct stripe_local stripe_local_t;
-typedef struct stripe_private stripe_private_t;
-
-#endif /* _STRIPE_H_ */
diff --git a/xlators/cluster/unify/src/Makefile.am b/xlators/cluster/unify/src/Makefile.am
deleted file mode 100644
index 2a1fe837263..00000000000
--- a/xlators/cluster/unify/src/Makefile.am
+++ /dev/null
@@ -1,16 +0,0 @@
-
-xlator_LTLIBRARIES = unify.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/legacy/cluster
-
-unify_la_LDFLAGS = -module -avoidversion
-
-unify_la_SOURCES = unify.c unify-self-heal.c
-unify_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = unify.h
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES =
-
diff --git a/xlators/cluster/unify/src/unify-mem-types.h b/xlators/cluster/unify/src/unify-mem-types.h
deleted file mode 100644
index dcf96477935..00000000000
--- a/xlators/cluster/unify/src/unify-mem-types.h
+++ /dev/null
@@ -1,41 +0,0 @@
-
-/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-#ifndef __UNIFY_MEM_TYPES_H__
-#define __UNIFY_MEM_TYPES_H__
-
-#include "mem-types.h"
-
-enum gf_unify_mem_types_ {
- gf_unify_mt_char = gf_common_mt_end + 1,
- gf_unify_mt_int16_t,
- gf_unify_mt_xlator_t,
- gf_unify_mt_unify_private_t,
- gf_unify_mt_xlator_list_t,
- gf_unify_mt_dir_entry_t,
- gf_unify_mt_off_t,
- gf_unify_mt_int,
- gf_unify_mt_unify_self_heal_struct,
- gf_unify_mt_unify_local_t,
- gf_unify_mt_end
-};
-#endif
-
diff --git a/xlators/cluster/unify/src/unify-self-heal.c b/xlators/cluster/unify/src/unify-self-heal.c
deleted file mode 100644
index 725523a2e42..00000000000
--- a/xlators/cluster/unify/src/unify-self-heal.c
+++ /dev/null
@@ -1,1239 +0,0 @@
-/*
- Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * unify-self-heal.c :
- * This file implements few functions which enables 'unify' translator
- * to be consistent in its behaviour when
- * > a node fails,
- * > a node gets added,
- * > a failed node comes back
- * > a new namespace server is added (ie, an fresh namespace server).
- *
- * This functionality of 'unify' will enable glusterfs to support storage
- * system failure, and maintain consistancy. This works both ways, ie, when
- * an entry (either file or directory) is found on namespace server, and not
- * on storage nodes, its created in storage nodes and vica-versa.
- *
- * The two fops, where it can be implemented are 'getdents ()' and 'lookup ()'
- *
- */
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "unify.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "common-utils.h"
-
-int32_t
-unify_sh_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entry,
- int32_t count);
-
-int32_t
-unify_sh_ns_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entry,
- int32_t count);
-
-int32_t
-unify_bgsh_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entry,
- int32_t count);
-
-int32_t
-unify_bgsh_ns_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entry,
- int32_t count);
-
-/**
- * unify_local_wipe - free all the extra allocation of local->* here.
- */
-static void
-unify_local_wipe (unify_local_t *local)
-{
- /* Free the strdup'd variables in the local structure */
- if (local->name) {
- GF_FREE (local->name);
- }
-
- if (local->sh_struct) {
- if (local->sh_struct->offset_list)
- GF_FREE (local->sh_struct->offset_list);
-
- if (local->sh_struct->entry_list)
- GF_FREE (local->sh_struct->entry_list);
-
- if (local->sh_struct->count_list)
- GF_FREE (local->sh_struct->count_list);
-
- GF_FREE (local->sh_struct);
- }
-
- loc_wipe (&local->loc1);
- loc_wipe (&local->loc2);
-}
-
-int32_t
-unify_sh_setdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int32_t callcnt = -1;
- unify_local_t *local = frame->local;
- inode_t *inode = NULL;
- dict_t *tmp_dict = NULL;
- dir_entry_t *prev, *entry, *trav;
-
- LOCK (&frame->lock);
- {
- /* if local->call_count == 0, that means, setdents on
- * storagenodes is still pending.
- */
- if (local->call_count)
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
- if (callcnt == 0) {
- if (local->sh_struct->entry_list[0]) {
- prev = entry = local->sh_struct->entry_list[0];
- if (!entry)
- return 0;
- trav = entry->next;
- while (trav) {
- prev->next = trav->next;
- GF_FREE (trav->name);
- if (IA_ISLNK (trav->buf.ia_type))
- GF_FREE (trav->link);
- GF_FREE (trav);
- trav = prev->next;
- }
- GF_FREE (entry);
- }
-
- if (!local->flags) {
- if (local->sh_struct->count_list[0] >=
- UNIFY_SELF_HEAL_GETDENTS_COUNT) {
- /* count == size, that means, there are more entries
- to read from */
- //local->call_count = 0;
- local->sh_struct->offset_list[0] +=
- UNIFY_SELF_HEAL_GETDENTS_COUNT;
- STACK_WIND (frame,
- unify_sh_ns_getdents_cbk,
- NS(this),
- NS(this)->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- local->sh_struct->offset_list[0],
- GF_GET_DIR_ONLY);
- }
- } else {
- inode = local->loc1.inode;
- fd_unref (local->fd);
- tmp_dict = local->dict;
-
- unify_local_wipe (local);
-
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- inode, &local->stbuf, local->dict,
- &local->oldpostparent);
- if (tmp_dict)
- dict_unref (tmp_dict);
- }
- }
-
- return 0;
-}
-
-
-int32_t
-unify_sh_ns_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entry,
- int32_t count)
-{
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- long index = 0;
- unsigned long final = 0;
- dir_entry_t *tmp = GF_CALLOC (1, sizeof (dir_entry_t),
- gf_unify_mt_dir_entry_t);
-
- local->sh_struct->entry_list[0] = tmp;
- local->sh_struct->count_list[0] = count;
- if (entry) {
- tmp->next = entry->next;
- entry->next = NULL;
- }
-
- if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) {
- final = 1;
- }
-
- LOCK (&frame->lock);
- {
- /* local->call_count will be '0' till now. make it 1 so, it
- can be UNWIND'ed for the last call. */
- local->call_count = priv->child_count;
- if (final)
- local->flags = 1;
- }
- UNLOCK (&frame->lock);
-
- for (index = 0; index < priv->child_count; index++)
- {
- STACK_WIND_COOKIE (frame,
- unify_sh_setdents_cbk,
- (void *)index,
- priv->xl_array[index],
- priv->xl_array[index]->fops->setdents,
- local->fd, GF_SET_DIR_ONLY,
- local->sh_struct->entry_list[0], count);
- }
-
- return 0;
-}
-
-int32_t
-unify_sh_ns_setdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int32_t callcnt = -1;
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- long index = (long)cookie;
- dir_entry_t *prev, *entry, *trav;
-
- LOCK (&frame->lock);
- {
- if (local->sh_struct->entry_list[index]) {
- prev = entry = local->sh_struct->entry_list[index];
- trav = entry->next;
- while (trav) {
- prev->next = trav->next;
- GF_FREE (trav->name);
- if (IA_ISLNK (trav->buf.ia_type))
- GF_FREE (trav->link);
- GF_FREE (trav);
- trav = prev->next;
- }
- GF_FREE (entry);
- }
- }
- UNLOCK (&frame->lock);
-
- if (local->sh_struct->count_list[index] <
- UNIFY_SELF_HEAL_GETDENTS_COUNT) {
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
- } else {
- /* count == size, that means, there are more entries
- to read from */
- local->sh_struct->offset_list[index] +=
- UNIFY_SELF_HEAL_GETDENTS_COUNT;
- STACK_WIND_COOKIE (frame,
- unify_sh_getdents_cbk,
- cookie,
- priv->xl_array[index],
- priv->xl_array[index]->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- local->sh_struct->offset_list[index],
- GF_GET_ALL);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir on (%s) with offset %"PRId64"",
- priv->xl_array[index]->name,
- local->sh_struct->offset_list[index]);
- }
-
- if (!callcnt) {
- /* All storage nodes have done unified setdents on NS node.
- * Now, do getdents from NS and do setdents on storage nodes.
- */
-
- /* sh_struct->offset_list is no longer required for
- storage nodes now */
- local->sh_struct->offset_list[0] = 0; /* reset */
-
- STACK_WIND (frame,
- unify_sh_ns_getdents_cbk,
- NS(this),
- NS(this)->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- 0, /* In this call, do send '0' as offset */
- GF_GET_DIR_ONLY);
- }
-
- return 0;
-}
-
-
-/**
- * unify_sh_getdents_cbk -
- */
-int32_t
-unify_sh_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entry,
- int32_t count)
-{
- int32_t callcnt = -1;
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- long index = (long)cookie;
- dir_entry_t *tmp = NULL;
-
- if (op_ret >= 0 && count > 0) {
- /* There is some dentry found, just send the dentry to NS */
- tmp = GF_CALLOC (1, sizeof (dir_entry_t),
- gf_unify_mt_dir_entry_t);
- local->sh_struct->entry_list[index] = tmp;
- local->sh_struct->count_list[index] = count;
- if (entry) {
- tmp->next = entry->next;
- entry->next = NULL;
- }
- STACK_WIND_COOKIE (frame,
- unify_sh_ns_setdents_cbk,
- cookie,
- NS(this),
- NS(this)->fops->setdents,
- local->fd,
- GF_SET_IF_NOT_PRESENT,
- local->sh_struct->entry_list[index],
- count);
- return 0;
- }
-
- if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) {
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
- } else {
- /* count == size, that means, there are more entries
- to read from */
- local->sh_struct->offset_list[index] +=
- UNIFY_SELF_HEAL_GETDENTS_COUNT;
- STACK_WIND_COOKIE (frame,
- unify_sh_getdents_cbk,
- cookie,
- priv->xl_array[index],
- priv->xl_array[index]->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- local->sh_struct->offset_list[index],
- GF_GET_ALL);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir on (%s) with offset %"PRId64"",
- priv->xl_array[index]->name,
- local->sh_struct->offset_list[index]);
- }
-
- if (!callcnt) {
- /* All storage nodes have done unified setdents on NS node.
- * Now, do getdents from NS and do setdents on storage nodes.
- */
-
- /* sh_struct->offset_list is no longer required for
- storage nodes now */
- local->sh_struct->offset_list[0] = 0; /* reset */
-
- STACK_WIND (frame,
- unify_sh_ns_getdents_cbk,
- NS(this),
- NS(this)->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- 0, /* In this call, do send '0' as offset */
- GF_GET_DIR_ONLY);
- }
-
- return 0;
-}
-
-/**
- * unify_sh_opendir_cbk -
- *
- * @cookie:
- */
-int32_t
-unify_sh_opendir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd)
-{
- int32_t callcnt = 0;
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- int16_t index = 0;
- inode_t *inode = NULL;
- dict_t *tmp_dict = NULL;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- } else {
- gf_log (this->name, GF_LOG_WARNING, "failed");
- local->failed = 1;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- local->call_count = priv->child_count + 1;
-
- if (!local->failed) {
- /* send getdents() namespace after finishing
- storage nodes */
- local->call_count--;
-
- fd_bind (fd);
-
- if (local->call_count) {
- /* Used as the offset index. This list keeps
- * track of offset sent to each node during
- * STACK_WIND.
- */
- local->sh_struct->offset_list =
- GF_CALLOC (priv->child_count,
- sizeof (off_t),
- gf_unify_mt_off_t);
- ERR_ABORT (local->sh_struct->offset_list);
-
- local->sh_struct->entry_list =
- GF_CALLOC (priv->child_count,
- sizeof (dir_entry_t *),
- gf_unify_mt_dir_entry_t);
- ERR_ABORT (local->sh_struct->entry_list);
-
- local->sh_struct->count_list =
- GF_CALLOC (priv->child_count,
- sizeof (int),
- gf_unify_mt_int);
- ERR_ABORT (local->sh_struct->count_list);
-
- /* Send getdents on all the fds */
- for (index = 0;
- index < priv->child_count; index++) {
- STACK_WIND_COOKIE (frame,
- unify_sh_getdents_cbk,
- (void *)(long)index,
- priv->xl_array[index],
- priv->xl_array[index]->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- 0, /* In this call, do send '0' as offset */
- GF_GET_ALL);
- }
-
- /* did stack wind, so no need to unwind here */
- return 0;
- } /* (local->call_count) */
- } /* (!local->failed) */
-
- /* Opendir failed on one node. */
- inode = local->loc1.inode;
- fd_unref (local->fd);
- tmp_dict = local->dict;
-
- unify_local_wipe (local);
- /* Only 'self-heal' failed, lookup() was successful. */
- local->op_ret = 0;
-
- /* This is lookup_cbk ()'s UNWIND. */
- STACK_UNWIND (frame, local->op_ret, local->op_errno, inode,
- &local->stbuf, local->dict, &local->oldpostparent);
- if (tmp_dict)
- dict_unref (tmp_dict);
- }
-
- return 0;
-}
-
-/**
- * gf_sh_checksum_cbk -
- *
- * @frame: frame used in lookup. get a copy of it, and use that copy.
- * @this: pointer to unify xlator.
- * @inode: pointer to inode, for which the consistency check is required.
- *
- */
-int32_t
-unify_sh_checksum_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- uint8_t *file_checksum,
- uint8_t *dir_checksum)
-{
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- int16_t index = 0;
- int32_t callcnt = 0;
- inode_t *inode = NULL;
- dict_t *tmp_dict = NULL;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret >= 0) {
- if (NS(this) == (xlator_t *)cookie) {
- memcpy (local->sh_struct->ns_file_checksum,
- file_checksum, NAME_MAX);
- memcpy (local->sh_struct->ns_dir_checksum,
- dir_checksum, NAME_MAX);
- } else {
- if (local->entry_count == 0) {
- /* Initialize the dir_checksum to be
- * used for comparision with other
- * storage nodes. Should be done for
- * the first successful call *only*.
- */
- /* Using 'entry_count' as a flag */
- local->entry_count = 1;
- memcpy (local->sh_struct->dir_checksum,
- dir_checksum, NAME_MAX);
- }
-
- /* Reply from the storage nodes */
- for (index = 0;
- index < NAME_MAX; index++) {
- /* Files should be present in
- only one node */
- local->sh_struct->file_checksum[index] ^= file_checksum[index];
-
- /* directory structure should be
- same accross */
- if (local->sh_struct->dir_checksum[index] != dir_checksum[index])
- local->failed = 1;
- }
- }
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- for (index = 0; index < NAME_MAX ; index++) {
- if (local->sh_struct->file_checksum[index] !=
- local->sh_struct->ns_file_checksum[index]) {
- local->failed = 1;
- break;
- }
- if (local->sh_struct->dir_checksum[index] !=
- local->sh_struct->ns_dir_checksum[index]) {
- local->failed = 1;
- break;
- }
- }
-
- if (local->failed) {
- /* Log it, it should be a rare event */
- gf_log (this->name, GF_LOG_WARNING,
- "Self-heal triggered on directory %s",
- local->loc1.path);
-
- /* Any self heal will be done at directory level */
- local->call_count = 0;
- local->op_ret = -1;
- local->failed = 0;
-
- local->fd = fd_create (local->loc1.inode,
- frame->root->pid);
-
- local->call_count = priv->child_count + 1;
-
- for (index = 0;
- index < (priv->child_count + 1); index++) {
- STACK_WIND_COOKIE (frame,
- unify_sh_opendir_cbk,
- priv->xl_array[index]->name,
- priv->xl_array[index],
- priv->xl_array[index]->fops->opendir,
- &local->loc1,
- local->fd);
- }
- /* opendir can be done on the directory */
- return 0;
- }
-
- /* no mismatch */
- inode = local->loc1.inode;
- tmp_dict = local->dict;
-
- unify_local_wipe (local);
-
- /* This is lookup_cbk ()'s UNWIND. */
- STACK_UNWIND (frame,
- local->op_ret,
- local->op_errno,
- inode,
- &local->stbuf,
- local->dict, &local->oldpostparent);
- if (tmp_dict)
- dict_unref (tmp_dict);
- }
-
- return 0;
-}
-
-/* Foreground self-heal part over */
-
-/* Background self-heal part */
-
-int32_t
-unify_bgsh_setdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int32_t callcnt = -1;
- unify_local_t *local = frame->local;
- dir_entry_t *prev, *entry, *trav;
-
- LOCK (&frame->lock);
- {
- /* if local->call_count == 0, that means, setdents
- on storagenodes is still pending. */
- if (local->call_count)
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
-
- if (callcnt == 0) {
- if (local->sh_struct->entry_list[0]) {
- prev = entry = local->sh_struct->entry_list[0];
- trav = entry->next;
- while (trav) {
- prev->next = trav->next;
- GF_FREE (trav->name);
- if (IA_ISLNK (trav->buf.ia_type))
- GF_FREE (trav->link);
- GF_FREE (trav);
- trav = prev->next;
- }
- GF_FREE (entry);
- }
-
- if (!local->flags) {
- if (local->sh_struct->count_list[0] >=
- UNIFY_SELF_HEAL_GETDENTS_COUNT) {
- /* count == size, that means, there are more
- entries to read from */
- //local->call_count = 0;
- local->sh_struct->offset_list[0] +=
- UNIFY_SELF_HEAL_GETDENTS_COUNT;
- STACK_WIND (frame,
- unify_bgsh_ns_getdents_cbk,
- NS(this),
- NS(this)->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- local->sh_struct->offset_list[0],
- GF_GET_DIR_ONLY);
- }
- } else {
- fd_unref (local->fd);
- unify_local_wipe (local);
- STACK_DESTROY (frame->root);
- }
- }
-
- return 0;
-}
-
-
-int32_t
-unify_bgsh_ns_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entry,
- int32_t count)
-{
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- long index = 0;
- unsigned long final = 0;
- dir_entry_t *tmp = GF_CALLOC (1, sizeof (dir_entry_t),
- gf_unify_mt_dir_entry_t);
-
- local->sh_struct->entry_list[0] = tmp;
- local->sh_struct->count_list[0] = count;
- if (entry) {
- tmp->next = entry->next;
- entry->next = NULL;
- }
-
- if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) {
- final = 1;
- }
-
- LOCK (&frame->lock);
- {
- /* local->call_count will be '0' till now. make it 1 so,
- it can be UNWIND'ed for the last call. */
- local->call_count = priv->child_count;
- if (final)
- local->flags = 1;
- }
- UNLOCK (&frame->lock);
-
- for (index = 0; index < priv->child_count; index++)
- {
- STACK_WIND_COOKIE (frame,
- unify_bgsh_setdents_cbk,
- (void *)index,
- priv->xl_array[index],
- priv->xl_array[index]->fops->setdents,
- local->fd, GF_SET_DIR_ONLY,
- local->sh_struct->entry_list[0], count);
- }
-
- return 0;
-}
-
-int32_t
-unify_bgsh_ns_setdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int32_t callcnt = -1;
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- long index = (long)cookie;
- dir_entry_t *prev, *entry, *trav;
-
- if (local->sh_struct->entry_list[index]) {
- prev = entry = local->sh_struct->entry_list[index];
- if (!entry)
- return 0;
- trav = entry->next;
- while (trav) {
- prev->next = trav->next;
- GF_FREE (trav->name);
- if (IA_ISLNK (trav->buf.ia_type))
- GF_FREE (trav->link);
- GF_FREE (trav);
- trav = prev->next;
- }
- GF_FREE (entry);
- }
-
- if (local->sh_struct->count_list[index] <
- UNIFY_SELF_HEAL_GETDENTS_COUNT) {
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
- } else {
- /* count == size, that means, there are more entries
- to read from */
- local->sh_struct->offset_list[index] +=
- UNIFY_SELF_HEAL_GETDENTS_COUNT;
- STACK_WIND_COOKIE (frame,
- unify_bgsh_getdents_cbk,
- cookie,
- priv->xl_array[index],
- priv->xl_array[index]->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- local->sh_struct->offset_list[index],
- GF_GET_ALL);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir on (%s) with offset %"PRId64"",
- priv->xl_array[index]->name,
- local->sh_struct->offset_list[index]);
- }
-
- if (!callcnt) {
- /* All storage nodes have done unified setdents on NS node.
- * Now, do getdents from NS and do setdents on storage nodes.
- */
-
- /* sh_struct->offset_list is no longer required for
- storage nodes now */
- local->sh_struct->offset_list[0] = 0; /* reset */
-
- STACK_WIND (frame,
- unify_bgsh_ns_getdents_cbk,
- NS(this),
- NS(this)->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- 0, /* In this call, do send '0' as offset */
- GF_GET_DIR_ONLY);
- }
-
- return 0;
-}
-
-
-/**
- * unify_bgsh_getdents_cbk -
- */
-int32_t
-unify_bgsh_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entry,
- int32_t count)
-{
- int32_t callcnt = -1;
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- long index = (long)cookie;
- dir_entry_t *tmp = NULL;
-
- if (op_ret >= 0 && count > 0) {
- /* There is some dentry found, just send the dentry to NS */
- tmp = GF_CALLOC (1, sizeof (dir_entry_t),
- gf_unify_mt_dir_entry_t);
- local->sh_struct->entry_list[index] = tmp;
- local->sh_struct->count_list[index] = count;
- if (entry) {
- tmp->next = entry->next;
- entry->next = NULL;
- }
- STACK_WIND_COOKIE (frame,
- unify_bgsh_ns_setdents_cbk,
- cookie,
- NS(this),
- NS(this)->fops->setdents,
- local->fd,
- GF_SET_IF_NOT_PRESENT,
- local->sh_struct->entry_list[index],
- count);
- return 0;
- }
-
- if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) {
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
- } else {
- /* count == size, that means, there are more entries to read from */
- local->sh_struct->offset_list[index] +=
- UNIFY_SELF_HEAL_GETDENTS_COUNT;
-
- STACK_WIND_COOKIE (frame,
- unify_bgsh_getdents_cbk,
- cookie,
- priv->xl_array[index],
- priv->xl_array[index]->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- local->sh_struct->offset_list[index],
- GF_GET_ALL);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir on (%s) with offset %"PRId64"",
- priv->xl_array[index]->name,
- local->sh_struct->offset_list[index]);
- }
-
- if (!callcnt) {
- /* All storage nodes have done unified setdents on NS node.
- * Now, do getdents from NS and do setdents on storage nodes.
- */
-
- /* sh_struct->offset_list is no longer required for
- storage nodes now */
- local->sh_struct->offset_list[0] = 0; /* reset */
-
- STACK_WIND (frame,
- unify_bgsh_ns_getdents_cbk,
- NS(this),
- NS(this)->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- 0, /* In this call, do send '0' as offset */
- GF_GET_DIR_ONLY);
- }
-
- return 0;
-}
-
-/**
- * unify_bgsh_opendir_cbk -
- *
- * @cookie:
- */
-int32_t
-unify_bgsh_opendir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd)
-{
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- int32_t callcnt = 0;
- int16_t index = 0;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- } else {
- local->failed = 1;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- local->call_count = priv->child_count + 1;
-
- if (!local->failed) {
- /* send getdents() namespace after finishing
- storage nodes */
- local->call_count--;
- callcnt = local->call_count;
-
- fd_bind (fd);
-
- if (local->call_count) {
- /* Used as the offset index. This list keeps
- track of offset sent to each node during
- STACK_WIND. */
- local->sh_struct->offset_list =
- GF_CALLOC (priv->child_count,
- sizeof (off_t),
- gf_unify_mt_off_t);
- ERR_ABORT (local->sh_struct->offset_list);
-
- local->sh_struct->entry_list =
- GF_CALLOC (priv->child_count,
- sizeof (dir_entry_t *),
- gf_unify_mt_dir_entry_t);
- ERR_ABORT (local->sh_struct->entry_list);
-
- local->sh_struct->count_list =
- GF_CALLOC (priv->child_count,
- sizeof (int),
- gf_unify_mt_int);
- ERR_ABORT (local->sh_struct->count_list);
-
- /* Send getdents on all the fds */
- for (index = 0;
- index < priv->child_count; index++) {
- STACK_WIND_COOKIE (frame,
- unify_bgsh_getdents_cbk,
- (void *)(long)index,
- priv->xl_array[index],
- priv->xl_array[index]->fops->getdents,
- local->fd,
- UNIFY_SELF_HEAL_GETDENTS_COUNT,
- 0, /* In this call, do send '0' as offset */
- GF_GET_ALL);
- }
- /* did a stack wind, so no need to unwind here */
- return 0;
- } /* (local->call_count) */
- } /* (!local->failed) */
-
- /* Opendir failed on one node. */
- fd_unref (local->fd);
-
- unify_local_wipe (local);
- STACK_DESTROY (frame->root);
- }
-
- return 0;
-}
-
-/**
- * gf_bgsh_checksum_cbk -
- *
- * @frame: frame used in lookup. get a copy of it, and use that copy.
- * @this: pointer to unify xlator.
- * @inode: pointer to inode, for which the consistency check is required.
- *
- */
-int32_t
-unify_bgsh_checksum_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- uint8_t *file_checksum,
- uint8_t *dir_checksum)
-{
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- int16_t index = 0;
- int32_t callcnt = 0;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret >= 0) {
- if (NS(this) == (xlator_t *)cookie) {
- memcpy (local->sh_struct->ns_file_checksum,
- file_checksum, NAME_MAX);
- memcpy (local->sh_struct->ns_dir_checksum,
- dir_checksum, NAME_MAX);
- } else {
- if (local->entry_count == 0) {
- /* Initialize the dir_checksum to be
- * used for comparision with other
- * storage nodes. Should be done for
- * the first successful call *only*.
- */
- /* Using 'entry_count' as a flag */
- local->entry_count = 1;
- memcpy (local->sh_struct->dir_checksum,
- dir_checksum, NAME_MAX);
- }
-
- /* Reply from the storage nodes */
- for (index = 0;
- index < NAME_MAX; index++) {
- /* Files should be present in only
- one node */
- local->sh_struct->file_checksum[index] ^= file_checksum[index];
-
- /* directory structure should be same
- accross */
- if (local->sh_struct->dir_checksum[index] != dir_checksum[index])
- local->failed = 1;
- }
- }
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- for (index = 0; index < NAME_MAX ; index++) {
- if (local->sh_struct->file_checksum[index] !=
- local->sh_struct->ns_file_checksum[index]) {
- local->failed = 1;
- break;
- }
- if (local->sh_struct->dir_checksum[index] !=
- local->sh_struct->ns_dir_checksum[index]) {
- local->failed = 1;
- break;
- }
- }
-
- if (local->failed) {
- /* Log it, it should be a rare event */
- gf_log (this->name, GF_LOG_WARNING,
- "Self-heal triggered on directory %s",
- local->loc1.path);
-
- /* Any self heal will be done at the directory level */
- local->op_ret = -1;
- local->failed = 0;
-
- local->fd = fd_create (local->loc1.inode,
- frame->root->pid);
- local->call_count = priv->child_count + 1;
-
- for (index = 0;
- index < (priv->child_count + 1); index++) {
- STACK_WIND_COOKIE (frame,
- unify_bgsh_opendir_cbk,
- priv->xl_array[index]->name,
- priv->xl_array[index],
- priv->xl_array[index]->fops->opendir,
- &local->loc1,
- local->fd);
- }
-
- /* opendir can be done on the directory */
- return 0;
- }
-
- /* no mismatch */
- unify_local_wipe (local);
- STACK_DESTROY (frame->root);
- }
-
- return 0;
-}
-
-/* Background self-heal part over */
-
-
-
-
-/**
- * zr_unify_self_heal -
- *
- * @frame: frame used in lookup. get a copy of it, and use that copy.
- * @this: pointer to unify xlator.
- * @inode: pointer to inode, for which the consistency check is required.
- *
- */
-int32_t
-zr_unify_self_heal (call_frame_t *frame,
- xlator_t *this,
- unify_local_t *local)
-{
- unify_private_t *priv = this->private;
- call_frame_t *bg_frame = NULL;
- unify_local_t *bg_local = NULL;
- inode_t *tmp_inode = NULL;
- dict_t *tmp_dict = NULL;
- int16_t index = 0;
-
- if (local->inode_generation < priv->inode_generation) {
- /* Any self heal will be done at the directory level */
- /* Update the inode's generation to the current generation
- value. */
- local->inode_generation = priv->inode_generation;
- inode_ctx_put (local->loc1.inode, this,
- (uint64_t)(long)local->inode_generation);
-
- if (priv->self_heal == ZR_UNIFY_FG_SELF_HEAL) {
- local->op_ret = 0;
- local->failed = 0;
- local->call_count = priv->child_count + 1;
- local->sh_struct =
- GF_CALLOC (1, sizeof (struct unify_self_heal_struct),
- gf_unify_mt_unify_self_heal_struct);
-
- /* +1 is for NS */
- for (index = 0;
- index < (priv->child_count + 1); index++) {
- STACK_WIND_COOKIE (frame,
- unify_sh_checksum_cbk,
- priv->xl_array[index],
- priv->xl_array[index],
- priv->xl_array[index]->fops->checksum,
- &local->loc1,
- 0);
- }
-
- /* Self-heal in foreground, hence no need
- to UNWIND here */
- return 0;
- }
-
- /* Self Heal done in background */
- bg_frame = copy_frame (frame);
- INIT_LOCAL (bg_frame, bg_local);
- loc_copy (&bg_local->loc1, &local->loc1);
- bg_local->op_ret = 0;
- bg_local->failed = 0;
- bg_local->call_count = priv->child_count + 1;
- bg_local->sh_struct =
- GF_CALLOC (1, sizeof (struct unify_self_heal_struct),
- gf_unify_mt_unify_self_heal_struct);
-
- /* +1 is for NS */
- for (index = 0; index < (priv->child_count + 1); index++) {
- STACK_WIND_COOKIE (bg_frame,
- unify_bgsh_checksum_cbk,
- priv->xl_array[index],
- priv->xl_array[index],
- priv->xl_array[index]->fops->checksum,
- &bg_local->loc1,
- 0);
- }
- }
-
- /* generation number matches, self heal already done or
- * self heal done in background: just do STACK_UNWIND
- */
- tmp_inode = local->loc1.inode;
- tmp_dict = local->dict;
-
- unify_local_wipe (local);
-
- /* This is lookup_cbk ()'s UNWIND. */
- STACK_UNWIND (frame,
- local->op_ret,
- local->op_errno,
- tmp_inode,
- &local->stbuf,
- local->dict,
- &local->oldpostparent);
-
- if (tmp_dict)
- dict_unref (tmp_dict);
-
- return 0;
-}
-
diff --git a/xlators/cluster/unify/src/unify.c b/xlators/cluster/unify/src/unify.c
deleted file mode 100644
index 422c8d6d8e2..00000000000
--- a/xlators/cluster/unify/src/unify.c
+++ /dev/null
@@ -1,4589 +0,0 @@
-/*
- Copyright (c) 2006-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * xlators/cluster/unify:
- * - This xlator is one of the main translator in GlusterFS, which
- * actually does the clustering work of the file system. One need to
- * understand that, unify assumes file to be existing in only one of
- * the child node, and directories to be present on all the nodes.
- *
- * NOTE:
- * Now, unify has support for global namespace, which is used to keep a
- * global view of fs's namespace tree. The stat for directories are taken
- * just from the namespace, where as for files, just 'ia_ino' is taken from
- * Namespace node, and other stat info is taken from the actual storage node.
- * Also Namespace node helps to keep consistant inode for files across
- * glusterfs (re-)mounts.
- */
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "unify.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include <signal.h>
-#include <libgen.h>
-#include "compat-errno.h"
-#include "compat.h"
-
-#define UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \
- if (!(_loc && _loc->inode)) { \
- STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \
- return 0; \
- } \
-} while(0)
-
-
-#define UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(_fd) do { \
- if (!(_fd && !fd_ctx_get (_fd, this, NULL))) { \
- STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \
- return 0; \
- } \
-} while(0)
-
-#define UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(_fd) do { \
- if (!_fd) { \
- STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \
- return 0; \
- } \
-} while(0)
-
-/**
- * unify_local_wipe - free all the extra allocation of local->* here.
- */
-static void
-unify_local_wipe (unify_local_t *local)
-{
- /* Free the strdup'd variables in the local structure */
- if (local->name) {
- GF_FREE (local->name);
- }
- loc_wipe (&local->loc1);
- loc_wipe (&local->loc2);
-}
-
-
-
-/*
- * unify_normalize_stats -
- */
-void
-unify_normalize_stats (struct statvfs *buf,
- unsigned long bsize,
- unsigned long frsize)
-{
- double factor;
-
- if (buf->f_bsize != bsize) {
- factor = ((double) buf->f_bsize) / bsize;
- buf->f_bsize = bsize;
- buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree);
- buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail);
- }
-
- if (buf->f_frsize != frsize) {
- factor = ((double) buf->f_frsize) / frsize;
- buf->f_frsize = frsize;
- buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks);
- }
-}
-
-
-xlator_t *
-unify_loc_subvol (loc_t *loc, xlator_t *this)
-{
- unify_private_t *priv = NULL;
- xlator_t *subvol = NULL;
- int16_t *list = NULL;
- long index = 0;
- xlator_t *subvol_i = NULL;
- int ret = 0;
- uint64_t tmp_list = 0;
-
- priv = this->private;
- subvol = NS (this);
-
- if (!IA_ISDIR (loc->inode->ia_type)) {
- ret = inode_ctx_get (loc->inode, this, &tmp_list);
- list = (int16_t *)(long)tmp_list;
- if (!list)
- goto out;
-
- for (index = 0; list[index] != -1; index++) {
- subvol_i = priv->xl_array[list[index]];
- if (subvol_i != NS (this)) {
- subvol = subvol_i;
- break;
- }
- }
- }
-out:
- return subvol;
-}
-
-
-
-/**
- * unify_statfs_cbk -
- */
-int32_t
-unify_statfs_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct statvfs *stbuf)
-{
- int32_t callcnt = 0;
- struct statvfs *dict_buf = NULL;
- unsigned long bsize;
- unsigned long frsize;
- unify_local_t *local = (unify_local_t *)frame->local;
- call_frame_t *prev_frame = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret >= 0) {
- /* when a call is successfull, add it to local->dict */
- dict_buf = &local->statvfs_buf;
-
- if (dict_buf->f_bsize != 0) {
- bsize = max (dict_buf->f_bsize,
- stbuf->f_bsize);
-
- frsize = max (dict_buf->f_frsize,
- stbuf->f_frsize);
- unify_normalize_stats(dict_buf, bsize, frsize);
- unify_normalize_stats(stbuf, bsize, frsize);
- } else {
- dict_buf->f_bsize = stbuf->f_bsize;
- dict_buf->f_frsize = stbuf->f_frsize;
- }
-
- dict_buf->f_blocks += stbuf->f_blocks;
- dict_buf->f_bfree += stbuf->f_bfree;
- dict_buf->f_bavail += stbuf->f_bavail;
- dict_buf->f_files += stbuf->f_files;
- dict_buf->f_ffree += stbuf->f_ffree;
- dict_buf->f_favail += stbuf->f_favail;
- dict_buf->f_fsid = stbuf->f_fsid;
- dict_buf->f_flag = stbuf->f_flag;
- dict_buf->f_namemax = stbuf->f_namemax;
- local->op_ret = op_ret;
- } else {
- /* fop on storage node has failed due to some error */
- if (op_errno != ENOTCONN) {
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): %s",
- prev_frame->this->name,
- strerror (op_errno));
- }
- local->op_errno = op_errno;
- }
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->statvfs_buf);
- }
-
- return 0;
-}
-
-/**
- * unify_statfs -
- */
-int32_t
-unify_statfs (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- unify_local_t *local = NULL;
- xlator_list_t *trav = this->children;
-
- INIT_LOCAL (frame, local);
- local->call_count = ((unify_private_t *)this->private)->child_count;
-
- while(trav) {
- STACK_WIND (frame,
- unify_statfs_cbk,
- trav->xlator,
- trav->xlator->fops->statfs,
- loc);
- trav = trav->next;
- }
-
- return 0;
-}
-
-/**
- * unify_buf_cbk -
- */
-int32_t
-unify_buf_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *buf)
-{
- int32_t callcnt = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
- call_frame_t *prev_frame = cookie;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s(): child(%s): path(%s): %s",
- gf_fop_list[frame->root->op],
- prev_frame->this->name,
- (local->loc1.path)?local->loc1.path:"",
- strerror (op_errno));
-
- local->op_errno = op_errno;
- if ((op_errno == ENOENT) && priv->optimist)
- local->op_ret = 0;
- }
-
- if (op_ret >= 0) {
- local->op_ret = 0;
-
- if (NS (this) == prev_frame->this) {
- local->ia_ino = buf->ia_ino;
- /* If the entry is directory, get the stat
- from NS node */
- if (IA_ISDIR (buf->ia_type) ||
- !local->stbuf.ia_blksize) {
- local->stbuf = *buf;
- }
- }
-
- if ((!IA_ISDIR (buf->ia_type)) &&
- (NS (this) != prev_frame->this)) {
- /* If file, take the stat info from Storage
- node. */
- local->stbuf = *buf;
- }
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- /* If the inode number is not filled, operation should
- fail */
- if (!local->ia_ino)
- local->op_ret = -1;
-
- local->stbuf.ia_ino = local->ia_ino;
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->stbuf);
- }
-
- return 0;
-}
-
-#define check_if_dht_linkfile(s) \
- ((st_mode_from_ia (s->ia_prot, s->ia_type) & ~S_IFMT) == S_ISVTX)
-
-/**
- * unify_lookup_cbk -
- */
-int32_t
-unify_lookup_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- dict_t *dict,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
- inode_t *tmp_inode = NULL;
- dict_t *local_dict = NULL;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- if (local->revalidate &&
- (op_errno == ESTALE)) {
- /* ESTALE takes priority */
- local->op_errno = op_errno;
- local->failed = 1;
- }
-
- if ((op_errno != ENOTCONN)
- && (op_errno != ENOENT)
- && (local->op_errno != ESTALE)) {
- /* if local->op_errno is already ESTALE, then
- * ESTALE has to propogated to the parent first.
- * do not enter here.
- */
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): path(%s): %s",
- priv->xl_array[(long)cookie]->name,
- local->loc1.path, strerror (op_errno));
- local->op_errno = op_errno;
- local->failed = 1;
-
- } else if (local->revalidate &&
- (local->op_errno != ESTALE) &&
- !(priv->optimist && (op_errno == ENOENT))) {
-
- gf_log (this->name,
- (op_errno == ENOTCONN) ?
- GF_LOG_DEBUG:GF_LOG_ERROR,
- "child(%s): path(%s): %s",
- priv->xl_array[(long)cookie]->name,
- local->loc1.path, strerror (op_errno));
- local->op_errno = op_errno;
- local->failed = 1;
- }
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
-
- if (check_if_dht_linkfile(buf)) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "file %s may be DHT link file on %s, "
- "make sure the backend is not shared "
- "between unify and DHT",
- local->loc1.path,
- priv->xl_array[(long)cookie]->name);
- }
-
- if (local->stbuf.ia_type && local->stbuf.ia_blksize) {
- /* make sure we already have a stbuf
- stored in local->stbuf */
- if (IA_ISDIR (local->stbuf.ia_type) &&
- !IA_ISDIR (buf->ia_type)) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "[CRITICAL] '%s' is directory "
- "on namespace, non-directory "
- "on node '%s', returning EIO",
- local->loc1.path,
- priv->xl_array[(long)cookie]->name);
- local->return_eio = 1;
- }
- if (!IA_ISDIR (local->stbuf.ia_type) &&
- IA_ISDIR (buf->ia_type)) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "[CRITICAL] '%s' is directory "
- "on node '%s', non-directory "
- "on namespace, returning EIO",
- local->loc1.path,
- priv->xl_array[(long)cookie]->name);
- local->return_eio = 1;
- }
- }
-
- if (!local->revalidate && !IA_ISDIR (buf->ia_type)) {
- /* This is the first time lookup on file*/
- if (!local->list) {
- /* list is not allocated, allocate
- the max possible range */
- local->list = GF_CALLOC (1, 2 * (priv->child_count + 2),
- gf_unify_mt_int16_t);
- if (!local->list) {
- gf_log (this->name,
- GF_LOG_CRITICAL,
- "Not enough memory");
- STACK_UNWIND (frame, -1,
- ENOMEM, inode,
- NULL, NULL, NULL);
- return 0;
- }
- }
- /* update the index of the list */
- local->list [local->index++] =
- (int16_t)(long)cookie;
- }
-
- if (!local->revalidate && IA_ISDIR (buf->ia_type)) {
- /* fresh lookup of a directory */
- inode_ctx_put (local->loc1.inode, this,
- priv->inode_generation);
- }
-
- if ((!local->dict) && dict &&
- (priv->xl_array[(long)cookie] != NS(this))) {
- local->dict = dict_ref (dict);
- }
-
- /* index of NS node is == total child count */
- if (priv->child_count == (int16_t)(long)cookie) {
- /* Take the inode number from namespace */
- local->ia_ino = buf->ia_ino;
- if (IA_ISDIR (buf->ia_type) ||
- !(local->stbuf.ia_blksize)) {
- local->stbuf = *buf;
- local->oldpostparent = *postparent;
- }
- } else if (!IA_ISDIR (buf->ia_type)) {
- /* If file, then get the stat from
- storage node */
- local->stbuf = *buf;
- }
-
- if (local->ia_nlink < buf->ia_nlink) {
- local->ia_nlink = buf->ia_nlink;
- }
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- local_dict = local->dict;
- if (local->return_eio) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "[CRITICAL] Unable to fix the path (%s) with "
- "self-heal, try manual verification. "
- "returning EIO.", local->loc1.path);
- unify_local_wipe (local);
- STACK_UNWIND (frame, -1, EIO, inode, NULL, NULL);
- if (local_dict) {
- dict_unref (local_dict);
- }
- return 0;
- }
-
- if (!local->stbuf.ia_blksize) {
- /* Inode not present */
- local->op_ret = -1;
- } else {
- if (!local->revalidate &&
- !IA_ISDIR (local->stbuf.ia_type)) {
- /* If its a file, big array is useless,
- allocate the smaller one */
- int16_t *list = NULL;
- list = GF_CALLOC (1, 2 * (local->index + 1),
- gf_unify_mt_int16_t);
- ERR_ABORT (list);
- memcpy (list, local->list, 2 * local->index);
- /* Make the end of the list as -1 */
- GF_FREE (local->list);
- local->list = list;
- local->list [local->index] = -1;
- /* Update the inode's ctx with proper array */
- /* TODO: log on failure */
- inode_ctx_put (local->loc1.inode, this,
- (uint64_t)(long)local->list);
- }
-
- if (IA_ISDIR(local->loc1.inode->ia_type)) {
- /* lookup is done for directory */
- if (local->failed && priv->self_heal) {
- /* Triggering self-heal */
- /* means, self-heal required for this
- inode */
- local->inode_generation = 0;
- priv->inode_generation++;
- }
- } else {
- local->stbuf.ia_ino = local->ia_ino;
- }
-
- local->stbuf.ia_nlink = local->ia_nlink;
- }
- if (local->op_ret == -1) {
- if (!local->revalidate && local->list)
- GF_FREE (local->list);
- }
-
- if ((local->op_ret >= 0) && local->failed &&
- local->revalidate) {
- /* Done revalidate, but it failed */
- if ((op_errno != ENOTCONN)
- && (local->op_errno != ESTALE)) {
- gf_log (this->name, GF_LOG_ERROR,
- "Revalidate failed for path(%s): %s",
- local->loc1.path, strerror (op_errno));
- }
- local->op_ret = -1;
- }
-
- if ((priv->self_heal && !priv->optimist) &&
- (!local->revalidate && (local->op_ret == 0) &&
- IA_ISDIR(local->stbuf.ia_type))) {
- /* Let the self heal be done here */
- zr_unify_self_heal (frame, this, local);
- local_dict = NULL;
- } else {
- if (local->failed) {
- /* NOTE: directory lookup is sent to all
- * subvolumes and success from a subvolume
- * might set local->op_ret to 0 (zero) */
- local->op_ret = -1;
- }
-
- /* either no self heal, or op_ret == -1 (failure) */
- tmp_inode = local->loc1.inode;
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- tmp_inode, &local->stbuf, local->dict,
- &local->oldpostparent);
- }
- if (local_dict) {
- dict_unref (local_dict);
- }
- }
-
- return 0;
-}
-
-/**
- * unify_lookup -
- */
-int32_t
-unify_lookup (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *xattr_req)
-{
- unify_local_t *local = NULL;
- unify_private_t *priv = this->private;
- int16_t *list = NULL;
- long index = 0;
-
- if (!(loc && loc->inode)) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: Argument not right", loc?loc->path:"(null)");
- STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL, NULL);
- return 0;
- }
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- loc_copy (&local->loc1, loc);
- if (local->loc1.path == NULL) {
- gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
- STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL, NULL, NULL);
- return 0;
- }
-
- if (inode_ctx_get (loc->inode, this, NULL)
- && IA_ISDIR (loc->inode->ia_type)) {
- local->revalidate = 1;
- }
-
- if (!inode_ctx_get (loc->inode, this, NULL) &&
- loc->inode->ia_type &&
- !IA_ISDIR (loc->inode->ia_type)) {
- uint64_t tmp_list = 0;
- /* check if revalidate or fresh lookup */
- inode_ctx_get (loc->inode, this, &tmp_list);
- local->list = (int16_t *)(long)tmp_list;
- }
-
- if (local->list) {
- list = local->list;
- for (index = 0; list[index] != -1; index++);
- if (index != 2) {
- if (index < 2) {
- gf_log (this->name, GF_LOG_ERROR,
- "returning ESTALE for %s: file "
- "count is %ld", loc->path, index);
- /* Print where all the file is present */
- for (index = 0;
- local->list[index] != -1; index++) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: found on %s", loc->path,
- priv->xl_array[list[index]]->name);
- }
- unify_local_wipe (local);
- STACK_UNWIND (frame, -1, ESTALE,
- NULL, NULL, NULL, NULL);
- return 0;
- } else {
- /* There are more than 2 presences */
- /* Just log and continue */
- gf_log (this->name, GF_LOG_ERROR,
- "%s: file count is %ld",
- loc->path, index);
- /* Print where all the file is present */
- for (index = 0;
- local->list[index] != -1; index++) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: found on %s", loc->path,
- priv->xl_array[list[index]]->name);
- }
- }
- }
-
- /* is revalidate */
- local->revalidate = 1;
-
- for (index = 0; list[index] != -1; index++)
- local->call_count++;
-
- for (index = 0; list[index] != -1; index++) {
- char need_break = (list[index+1] == -1);
- STACK_WIND_COOKIE (frame,
- unify_lookup_cbk,
- (void *)(long)list[index], //cookie
- priv->xl_array [list[index]],
- priv->xl_array [list[index]]->fops->lookup,
- loc,
- xattr_req);
- if (need_break)
- break;
- }
- } else {
- if (loc->inode->ia_type) {
- if (inode_ctx_get (loc->inode, this, NULL)) {
- inode_ctx_get (loc->inode, this,
- &local->inode_generation);
- }
- }
- /* This is first call, there is no list */
- /* call count should be all child + 1 namespace */
- local->call_count = priv->child_count + 1;
-
- for (index = 0; index <= priv->child_count; index++) {
- STACK_WIND_COOKIE (frame,
- unify_lookup_cbk,
- (void *)index, //cookie
- priv->xl_array[index],
- priv->xl_array[index]->fops->lookup,
- loc,
- xattr_req);
- }
- }
-
- return 0;
-}
-
-/**
- * unify_stat - if directory, get the stat directly from NameSpace child.
- * if file, check for a hint and send it only there (also to NS).
- * if its a fresh stat, then do it on all the nodes.
- *
- * NOTE: for all the call, sending cookie as xlator pointer, which will be
- * used in cbk.
- */
-int32_t
-unify_stat (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- unify_local_t *local = NULL;
- unify_private_t *priv = this->private;
- int16_t index = 0;
- int16_t *list = NULL;
- uint64_t tmp_list = 0;
-
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- loc_copy (&local->loc1, loc);
- if (local->loc1.path == NULL) {
- gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
- STACK_UNWIND (frame, -1, ENOMEM, NULL);
- return 0;
- }
- local->ia_ino = loc->inode->ino;
- if (IA_ISDIR (loc->inode->ia_type)) {
- /* Directory */
- local->call_count = 1;
- STACK_WIND (frame, unify_buf_cbk, NS(this),
- NS(this)->fops->stat, loc);
- } else {
- /* File */
- inode_ctx_get (loc->inode, this, &tmp_list);
- list = (int16_t *)(long)tmp_list;
-
- for (index = 0; list[index] != -1; index++)
- local->call_count++;
-
- for (index = 0; list[index] != -1; index++) {
- char need_break = (list[index+1] == -1);
- STACK_WIND (frame,
- unify_buf_cbk,
- priv->xl_array[list[index]],
- priv->xl_array[list[index]]->fops->stat,
- loc);
- if (need_break)
- break;
- }
- }
-
- return 0;
-}
-
-/**
- * unify_access_cbk -
- */
-int32_t
-unify_access_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-
-/**
- * unify_access - Send request to only namespace, which has all the
- * attributes set for the file.
- */
-int32_t
-unify_access (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t mask)
-{
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
- STACK_WIND (frame,
- unify_access_cbk,
- NS(this),
- NS(this)->fops->access,
- loc,
- mask);
-
- return 0;
-}
-
-int32_t
-unify_mkdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
- inode_t *tmp_inode = NULL;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if ((op_ret == -1) && !(priv->optimist &&
- (op_errno == ENOENT ||
- op_errno == EEXIST))) {
- /* TODO: Decrement the inode_generation of
- * this->inode's parent inode, hence the missing
- * directory is created properly by self-heal.
- * Currently, there is no way to get the parent
- * inode directly.
- */
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): path(%s): %s",
- priv->xl_array[(long)cookie]->name,
- local->loc1.path, strerror (op_errno));
- if (op_errno != EEXIST)
- local->failed = 1;
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0)
- local->op_ret = 0;
-
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (!local->failed) {
- inode_ctx_put (local->loc1.inode, this,
- priv->inode_generation);
- }
-
- tmp_inode = local->loc1.inode;
- unify_local_wipe (local);
-
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- tmp_inode, &local->stbuf,
- &local->oldpreparent, &local->oldpostparent);
- }
-
- return 0;
-}
-
-/**
- * unify_ns_mkdir_cbk -
- */
-int32_t
-unify_ns_mkdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
- long index = 0;
-
- if (op_ret == -1) {
- /* No need to send mkdir request to other servers,
- * as namespace action failed
- */
- gf_log (this->name, GF_LOG_ERROR,
- "namespace: path(%s): %s",
- local->name, strerror (op_errno));
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno, inode, NULL,
- NULL, NULL);
- return 0;
- }
-
- /* Create one inode for this entry */
- local->op_ret = 0;
- local->stbuf = *buf;
-
- local->oldpreparent = *preparent;
- local->oldpostparent = *postparent;
-
- local->call_count = priv->child_count;
-
- /* Send mkdir request to all the nodes now */
- for (index = 0; index < priv->child_count; index++) {
- STACK_WIND_COOKIE (frame,
- unify_mkdir_cbk,
- (void *)index, //cookie
- priv->xl_array[index],
- priv->xl_array[index]->fops->mkdir,
- &local->loc1,
- local->mode);
- }
-
- return 0;
-}
-
-
-/**
- * unify_mkdir -
- */
-int32_t
-unify_mkdir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- mode_t mode)
-{
- unify_local_t *local = NULL;
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- local->mode = mode;
-
- loc_copy (&local->loc1, loc);
-
- if (local->loc1.path == NULL) {
- gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
- STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame,
- unify_ns_mkdir_cbk,
- NS(this),
- NS(this)->fops->mkdir,
- loc,
- mode);
- return 0;
-}
-
-/**
- * unify_rmdir_cbk -
- */
-int32_t
-unify_rmdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret == 0 || (priv->optimist && (op_errno == ENOENT)))
- local->op_ret = 0;
- if (op_ret == -1)
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->oldpreparent, &local->oldpostparent);
- }
-
- return 0;
-}
-
-/**
- * unify_ns_rmdir_cbk -
- */
-int32_t
-unify_ns_rmdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- int16_t index = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
-
- if (op_ret == -1) {
- /* No need to send rmdir request to other servers,
- * as namespace action failed
- */
- gf_log (this->name,
- ((op_errno != ENOTEMPTY) ?
- GF_LOG_ERROR : GF_LOG_DEBUG),
- "namespace: path(%s): %s",
- local->loc1.path, strerror (op_errno));
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
- return 0;
- }
-
- local->call_count = priv->child_count;
-
- local->oldpreparent = *preparent;
- local->oldpostparent = *postparent;
-
- for (index = 0; index < priv->child_count; index++) {
- STACK_WIND (frame,
- unify_rmdir_cbk,
- priv->xl_array[index],
- priv->xl_array[index]->fops->rmdir,
- &local->loc1);
- }
-
- return 0;
-}
-
-/**
- * unify_rmdir -
- */
-int32_t
-unify_rmdir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- unify_local_t *local = NULL;
-
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
- /* Initialization */
- INIT_LOCAL (frame, local);
-
- loc_copy (&local->loc1, loc);
- if (local->loc1.path == NULL) {
- gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
- STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame,
- unify_ns_rmdir_cbk,
- NS(this),
- NS(this)->fops->rmdir,
- loc);
-
- return 0;
-}
-
-/**
- * unify_open_cbk -
- */
-int32_t
-unify_open_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd)
-{
- int32_t callcnt = 0;
- unify_local_t *local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- if (NS(this) != (xlator_t *)cookie) {
- /* Store child node's ptr, used in
- all the f*** / FileIO calls */
- fd_ctx_set (fd, this, (uint64_t)(long)cookie);
- }
- }
- if (op_ret == -1) {
- local->op_errno = op_errno;
- local->failed = 1;
- }
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if ((local->failed == 1) && (local->op_ret >= 0)) {
- local->call_count = 1;
- /* return -1 to user */
- local->op_ret = -1;
- //local->op_errno = EIO;
-
- if (!fd_ctx_get (local->fd, this, NULL)) {
- gf_log (this->name, GF_LOG_ERROR,
- "Open success on child node, "
- "failed on namespace");
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "Open success on namespace, "
- "failed on child node");
- }
- }
-
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret,
- local->op_errno, local->fd);
- }
-
- return 0;
-}
-
-#ifdef GF_DARWIN_HOST_OS
-/**
- * unify_create_lookup_cbk -
- */
-int32_t
-unify_open_lookup_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- dict_t *dict,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- int16_t index = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): path(%s): %s",
- priv->xl_array[(long)cookie]->name,
- local->loc1.path, strerror (op_errno));
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- local->index++;
- if (NS(this) == priv->xl_array[(long)cookie]) {
- local->list[0] = (int16_t)(long)cookie;
- } else {
- local->list[1] = (int16_t)(long)cookie;
- }
- if (IA_ISDIR (buf->ia_type))
- local->failed = 1;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- int16_t file_list[3] = {0,};
- local->op_ret = -1;
-
- file_list[0] = local->list[0];
- file_list[1] = local->list[1];
- file_list[2] = -1;
-
- if (local->index != 2) {
- /* Lookup failed, can't do open */
- gf_log (this->name, GF_LOG_ERROR,
- "%s: present on %d nodes",
- local->name, local->index);
-
- if (local->index < 2) {
- unify_local_wipe (local);
- gf_log (this->name, GF_LOG_ERROR,
- "returning as file found on less "
- "than 2 nodes");
- STACK_UNWIND (frame, local->op_ret,
- local->op_errno, local->fd);
- return 0;
- }
- }
-
- if (local->failed) {
- /* Open on directory, return EISDIR */
- unify_local_wipe (local);
- STACK_UNWIND (frame, -1, EISDIR, local->fd);
- return 0;
- }
-
- /* Everything is perfect :) */
- local->call_count = 2;
-
- for (index = 0; file_list[index] != -1; index++) {
- char need_break = (file_list[index+1] == -1);
- STACK_WIND_COOKIE (frame,
- unify_open_cbk,
- priv->xl_array[file_list[index]],
- priv->xl_array[file_list[index]],
- priv->xl_array[file_list[index]]->fops->open,
- &local->loc1,
- local->flags,
- local->fd, local->wbflags);
- if (need_break)
- break;
- }
- }
-
- return 0;
-}
-
-
-int32_t
-unify_open_readlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- const char *path,
- struct iatt *sbuf)
-{
- int16_t index = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
-
- if (op_ret == -1) {
- STACK_UNWIND (frame, -1, ENOENT);
- return 0;
- }
-
- if (path[0] == '/') {
- local->name = gf_strdup (path);
- ERR_ABORT (local->name);
- } else {
- char *tmp_str = gf_strdup (local->loc1.path);
- char *tmp_base = dirname (tmp_str);
- local->name = GF_CALLOC (1, ZR_PATH_MAX, gf_unify_mt_char);
- strcpy (local->name, tmp_base);
- strncat (local->name, "/", 1);
- strcat (local->name, path);
- GF_FREE (tmp_str);
- }
-
- local->list = GF_CALLOC (1, sizeof (int16_t) * 3,
- gf_unify_mt_int16_t);
- ERR_ABORT (local->list);
- local->call_count = priv->child_count + 1;
- local->op_ret = -1;
- for (index = 0; index <= priv->child_count; index++) {
- /* Send the lookup to all the nodes including namespace */
- STACK_WIND_COOKIE (frame,
- unify_open_lookup_cbk,
- (void *)(long)index,
- priv->xl_array[index],
- priv->xl_array[index]->fops->lookup,
- &local->loc1,
- NULL);
- }
-
- return 0;
-}
-#endif /* GF_DARWIN_HOST_OS */
-
-/**
- * unify_open -
- */
-int32_t
-unify_open (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags,
- fd_t *fd,
- int32_t wbflags)
-{
- unify_private_t *priv = this->private;
- unify_local_t *local = NULL;
- int16_t *list = NULL;
- int16_t index = 0;
- int16_t file_list[3] = {0,};
- uint64_t tmp_list = 0;
-
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
- /* Init */
- INIT_LOCAL (frame, local);
- loc_copy (&local->loc1, loc);
- local->fd = fd;
- local->flags = flags;
- local->wbflags = wbflags;
- inode_ctx_get (loc->inode, this, &tmp_list);
- list = (int16_t *)(long)tmp_list;
-
- local->list = list;
- file_list[0] = priv->child_count; /* Thats namespace */
- file_list[2] = -1;
- for (index = 0; list[index] != -1; index++) {
- local->call_count++;
- if (list[index] != priv->child_count)
- file_list[1] = list[index];
- }
-
- if (local->call_count != 2) {
- /* If the lookup was done for file */
- gf_log (this->name, GF_LOG_ERROR,
- "%s: entry_count is %d",
- loc->path, local->call_count);
- for (index = 0; local->list[index] != -1; index++)
- gf_log (this->name, GF_LOG_ERROR, "%s: found on %s",
- loc->path, priv->xl_array[list[index]]->name);
-
- if (local->call_count < 2) {
- gf_log (this->name, GF_LOG_ERROR,
- "returning EIO as file found on onlyone node");
- STACK_UNWIND (frame, -1, EIO, fd);
- return 0;
- }
- }
-
-#ifdef GF_DARWIN_HOST_OS
- /* Handle symlink here */
- if (IA_ISLNK (loc->inode->ia_type)) {
- /* Callcount doesn't matter here */
- STACK_WIND (frame,
- unify_open_readlink_cbk,
- NS(this),
- NS(this)->fops->readlink,
- loc, ZR_PATH_MAX);
- return 0;
- }
-#endif /* GF_DARWIN_HOST_OS */
-
- local->call_count = 2;
- for (index = 0; file_list[index] != -1; index++) {
- char need_break = (file_list[index+1] == -1);
- STACK_WIND_COOKIE (frame,
- unify_open_cbk,
- priv->xl_array[file_list[index]], //cookie
- priv->xl_array[file_list[index]],
- priv->xl_array[file_list[index]]->fops->open,
- loc,
- flags,
- fd, wbflags);
- if (need_break)
- break;
- }
-
- return 0;
-}
-
-
-int32_t
-unify_create_unlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- unify_local_t *local = frame->local;
- inode_t *inode = local->loc1.inode;
-
- unify_local_wipe (local);
-
- STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd,
- inode, &local->stbuf,
- &local->oldpreparent, &local->oldpostparent);
-
- return 0;
-}
-
-/**
- * unify_create_open_cbk -
- */
-int32_t
-unify_create_open_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd)
-{
- int ret = 0;
- int32_t callcnt = 0;
- unify_local_t *local = frame->local;
- inode_t *inode = NULL;
- xlator_t *child = NULL;
- uint64_t tmp_value = 0;
-
- LOCK (&frame->lock);
- {
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- if (NS(this) != (xlator_t *)cookie) {
- /* Store child node's ptr, used in all
- the f*** / FileIO calls */
- /* TODO: log on failure */
- ret = fd_ctx_get (fd, this, &tmp_value);
- cookie = (void *)(long)tmp_value;
- } else {
- /* NOTE: open successful on namespace.
- * fd's ctx can be used to identify open
- * failure on storage subvolume. cool
- * ide ;) */
- local->failed = 0;
- }
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): path(%s): %s",
- ((xlator_t *)cookie)->name,
- local->loc1.path, strerror (op_errno));
- local->op_errno = op_errno;
- local->failed = 1;
- }
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed == 1 && (local->op_ret >= 0)) {
- local->call_count = 1;
- /* return -1 to user */
- local->op_ret = -1;
- local->op_errno = EIO;
- local->fd = fd;
- local->call_count = 1;
-
- if (!fd_ctx_get (local->fd, this, &tmp_value)) {
- child = (xlator_t *)(long)tmp_value;
-
- gf_log (this->name, GF_LOG_ERROR,
- "Create success on child node, "
- "failed on namespace");
-
- STACK_WIND (frame,
- unify_create_unlink_cbk,
- child,
- child->fops->unlink,
- &local->loc1);
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "Create success on namespace, "
- "failed on child node");
-
- STACK_WIND (frame,
- unify_create_unlink_cbk,
- NS(this),
- NS(this)->fops->unlink,
- &local->loc1);
- }
- return 0;
- }
- inode = local->loc1.inode;
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno, fd,
- inode, &local->stbuf,
- &local->oldpreparent, &local->oldpostparent);
- }
- return 0;
-}
-
-/**
- * unify_create_lookup_cbk -
- */
-int32_t
-unify_create_lookup_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- dict_t *dict,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- int16_t index = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): path(%s): %s",
- priv->xl_array[(long)cookie]->name,
- local->loc1.path, strerror (op_errno));
- local->op_errno = op_errno;
- local->failed = 1;
- }
-
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- local->list[local->index++] = (int16_t)(long)cookie;
- if (NS(this) == priv->xl_array[(long)cookie]) {
- local->ia_ino = buf->ia_ino;
- } else {
- local->stbuf = *buf;
- }
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- int16_t *list = local->list;
- int16_t file_list[3] = {0,};
- local->op_ret = -1;
-
- local->list [local->index] = -1;
- file_list[0] = list[0];
- file_list[1] = list[1];
- file_list[2] = -1;
-
- local->stbuf.ia_ino = local->ia_ino;
- /* TODO: log on failure */
- inode_ctx_put (local->loc1.inode, this,
- (uint64_t)(long)local->list);
-
- if (local->index != 2) {
- /* Lookup failed, can't do open */
- gf_log (this->name, GF_LOG_ERROR,
- "%s: present on %d nodes",
- local->loc1.path, local->index);
- file_list[0] = priv->child_count;
- for (index = 0; list[index] != -1; index++) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: found on %s", local->loc1.path,
- priv->xl_array[list[index]]->name);
- if (list[index] != priv->child_count)
- file_list[1] = list[index];
- }
-
- if (local->index < 2) {
- unify_local_wipe (local);
- gf_log (this->name, GF_LOG_ERROR,
- "returning EIO as file found on "
- "only one node");
- STACK_UNWIND (frame, -1, EIO,
- local->fd, inode, NULL,
- NULL, NULL);
- return 0;
- }
- }
- /* Everything is perfect :) */
- local->call_count = 2;
-
- for (index = 0; file_list[index] != -1; index++) {
- char need_break = (file_list[index+1] == -1);
- STACK_WIND_COOKIE (frame,
- unify_create_open_cbk,
- priv->xl_array[file_list[index]],
- priv->xl_array[file_list[index]],
- priv->xl_array[file_list[index]]->fops->open,
- &local->loc1,
- local->flags,
- local->fd, 0);
- if (need_break)
- break;
- }
- }
-
- return 0;
-}
-
-
-/**
- * unify_create_cbk -
- */
-int32_t
-unify_create_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- int ret = 0;
- unify_local_t *local = frame->local;
- call_frame_t *prev_frame = cookie;
- inode_t *tmp_inode = NULL;
-
- if (op_ret == -1) {
- /* send unlink () on Namespace */
- local->op_errno = op_errno;
- local->op_ret = -1;
- local->call_count = 1;
- gf_log (this->name, GF_LOG_ERROR,
- "create failed on %s (file %s, error %s), "
- "sending unlink to namespace",
- prev_frame->this->name,
- local->loc1.path, strerror (op_errno));
-
- STACK_WIND (frame,
- unify_create_unlink_cbk,
- NS(this),
- NS(this)->fops->unlink,
- &local->loc1);
-
- return 0;
- }
-
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- local->stbuf = *buf;
- /* Just inode number should be from NS node */
- local->stbuf.ia_ino = local->ia_ino;
-
- /* TODO: log on failure */
- ret = fd_ctx_set (fd, this, (uint64_t)(long)prev_frame->this);
- }
-
- tmp_inode = local->loc1.inode;
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd,
- tmp_inode, &local->stbuf,
- &local->oldpreparent, &local->oldpostparent);
-
- return 0;
-}
-
-/**
- * unify_ns_create_cbk -
- *
- */
-int32_t
-unify_ns_create_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- struct sched_ops *sched_ops = NULL;
- xlator_t *sched_xl = NULL;
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- int16_t *list = NULL;
- int16_t index = 0;
-
- if (op_ret == -1) {
- /* No need to send create request to other servers, as
- namespace action failed. Handle exclusive create here. */
- if ((op_errno != EEXIST) ||
- ((op_errno == EEXIST) &&
- ((local->flags & O_EXCL) == O_EXCL))) {
- /* If its just a create call without O_EXCL,
- don't do this */
- gf_log (this->name, GF_LOG_ERROR,
- "namespace: path(%s): %s",
- local->loc1.path, strerror (op_errno));
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf,
- preparent, postparent);
- return 0;
- }
- }
-
- if (op_ret >= 0) {
- /* Get the inode number from the NS node */
- local->ia_ino = buf->ia_ino;
-
- local->oldpreparent = *preparent;
- local->oldpostparent = *postparent;
-
- local->op_ret = -1;
-
- /* Start the mapping list */
- list = GF_CALLOC (1, sizeof (int16_t) * 3,
- gf_unify_mt_int16_t);
- ERR_ABORT (list);
- inode_ctx_put (inode, this, (uint64_t)(long)list);
- list[0] = priv->child_count;
- list[2] = -1;
-
- /* This means, file doesn't exist anywhere in the Filesystem */
- sched_ops = priv->sched_ops;
-
- /* Send create request to the scheduled node now */
- sched_xl = sched_ops->schedule (this, local->loc1.path);
- if (sched_xl == NULL)
- {
- /* send unlink () on Namespace */
- local->op_errno = ENOTCONN;
- local->op_ret = -1;
- local->call_count = 1;
- gf_log (this->name, GF_LOG_ERROR,
- "no node online to schedule create:(file %s) "
- "sending unlink to namespace",
- (local->loc1.path)?local->loc1.path:"");
-
- STACK_WIND (frame,
- unify_create_unlink_cbk,
- NS(this),
- NS(this)->fops->unlink,
- &local->loc1);
-
- return 0;
- }
-
- for (index = 0; index < priv->child_count; index++)
- if (sched_xl == priv->xl_array[index])
- break;
- list[1] = index;
-
- STACK_WIND (frame, unify_create_cbk,
- sched_xl, sched_xl->fops->create,
- &local->loc1, local->flags, local->mode, fd);
- } else {
- /* File already exists, and there is no O_EXCL flag */
-
- gf_log (this->name, GF_LOG_DEBUG,
- "File(%s) already exists on namespace, sending "
- "open instead", local->loc1.path);
-
- local->list = GF_CALLOC (1, sizeof (int16_t) * 3,
- gf_unify_mt_int16_t);
- ERR_ABORT (local->list);
- local->call_count = priv->child_count + 1;
- local->op_ret = -1;
- for (index = 0; index <= priv->child_count; index++) {
- /* Send lookup() to all nodes including namespace */
- STACK_WIND_COOKIE (frame,
- unify_create_lookup_cbk,
- (void *)(long)index,
- priv->xl_array[index],
- priv->xl_array[index]->fops->lookup,
- &local->loc1,
- NULL);
- }
- }
- return 0;
-}
-
-/**
- * unify_create - create a file in global namespace first, so other
- * clients can see them. Create the file in storage nodes in background.
- */
-int32_t
-unify_create (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags,
- mode_t mode,
- fd_t *fd)
-{
- unify_local_t *local = NULL;
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- local->mode = mode;
- local->flags = flags;
- local->fd = fd;
-
- loc_copy (&local->loc1, loc);
- if (local->loc1.path == NULL) {
- gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
- STACK_UNWIND (frame, -1, ENOMEM, fd, loc->inode, NULL,
- NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame,
- unify_ns_create_cbk,
- NS(this),
- NS(this)->fops->create,
- loc,
- flags | O_EXCL,
- mode,
- fd);
-
- return 0;
-}
-
-
-/**
- * unify_opendir_cbk -
- */
-int32_t
-unify_opendir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- fd_t *fd)
-{
- STACK_UNWIND (frame, op_ret, op_errno, fd);
-
- return 0;
-}
-
-/**
- * unify_opendir -
- */
-int32_t
-unify_opendir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- fd_t *fd)
-{
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
- STACK_WIND (frame, unify_opendir_cbk,
- NS(this), NS(this)->fops->opendir, loc, fd);
-
- return 0;
-}
-
-
-int32_t
-unify_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *statpre,
- struct iatt *statpost)
-{
- int32_t callcnt = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
- call_frame_t *prev_frame = cookie;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s(): child(%s): path(%s): %s",
- gf_fop_list[frame->root->op],
- prev_frame->this->name,
- (local->loc1.path)?local->loc1.path:"",
- strerror (op_errno));
-
- local->op_errno = op_errno;
- if ((op_errno == ENOENT) && priv->optimist)
- local->op_ret = 0;
- }
-
- if (op_ret >= 0) {
- local->op_ret = 0;
-
- if (NS (this) == prev_frame->this) {
- local->ia_ino = statpost->ia_ino;
- /* If the entry is directory, get the stat
- from NS node */
- if (IA_ISDIR (statpost->ia_type) ||
- !local->stpost.ia_blksize) {
- local->stpre = *statpre;
- local->stpost = *statpost;
- }
- }
-
- if ((!IA_ISDIR (statpost->ia_type)) &&
- (NS (this) != prev_frame->this)) {
- /* If file, take the stat info from Storage
- node. */
- local->stpre = *statpre;
- local->stpost = *statpost;
- }
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- /* If the inode number is not filled, operation should
- fail */
- if (!local->ia_ino)
- local->op_ret = -1;
-
- local->stpre.ia_ino = local->ia_ino;
- local->stpost.ia_ino = local->ia_ino;
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->stpre, &local->stpost);
- }
-
- return 0;
-}
-
-
-int32_t
-unify_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid)
-{
- unify_local_t *local = NULL;
- unify_private_t *priv = this->private;
- int32_t index = 0;
- int32_t callcnt = 0;
- uint64_t tmp_list = 0;
-
- if (!(loc && loc->inode)) {
- STACK_UNWIND (frame, -1, EINVAL, NULL, NULL);
- return 0;
- }
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- loc_copy (&local->loc1, loc);
-
- if (IA_ISDIR (loc->inode->ia_type)) {
- local->call_count = 1;
-
- STACK_WIND (frame,
- unify_setattr_cbk,
- NS (this),
- NS (this)->fops->setattr,
- loc, stbuf, valid);
- } else {
- inode_ctx_get (loc->inode, this, &tmp_list);
- local->list = (int16_t *)(long)tmp_list;
-
- for (index = 0; local->list[index] != -1; index++) {
- local->call_count++;
- callcnt++;
- }
-
- for (index = 0; local->list[index] != -1; index++) {
- STACK_WIND (frame,
- unify_setattr_cbk,
- priv->xl_array[local->list[index]],
- priv->xl_array[local->list[index]]->fops->setattr,
- loc, stbuf, valid);
-
- if (!--callcnt)
- break;
- }
- }
-
- return 0;
-}
-
-
-int32_t
-unify_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *stbuf, int32_t valid)
-{
- unify_local_t *local = NULL;
- xlator_t *child = NULL;
- uint64_t tmp_child = 0;
-
- UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd);
-
- /* Initialization */
- INIT_LOCAL (frame, local);
-
- if (!fd_ctx_get (fd, this, &tmp_child)) {
- /* If its set, then its file */
- child = (xlator_t *)(long)tmp_child;
-
- local->call_count = 2;
-
- STACK_WIND (frame, unify_setattr_cbk, child,
- child->fops->fsetattr, fd, stbuf, valid);
-
- STACK_WIND (frame, unify_setattr_cbk, NS(this),
- NS(this)->fops->fsetattr, fd, stbuf, valid);
- } else {
- local->call_count = 1;
-
- STACK_WIND (frame, unify_setattr_cbk,
- NS(this), NS(this)->fops->fsetattr,
- fd, stbuf, valid);
- }
-
- return 0;
-}
-
-
-/**
- * unify_truncate_cbk -
- */
-int32_t
-unify_truncate_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- int32_t callcnt = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
- call_frame_t *prev_frame = cookie;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): path(%s): %s",
- prev_frame->this->name,
- (local->loc1.path)?local->loc1.path:"",
- strerror (op_errno));
- local->op_errno = op_errno;
- if (!((op_errno == ENOENT) && priv->optimist))
- local->op_ret = -1;
- }
-
- if (op_ret >= 0) {
- if (NS (this) == prev_frame->this) {
- local->ia_ino = postbuf->ia_ino;
- /* If the entry is directory, get the
- stat from NS node */
- if (IA_ISDIR (postbuf->ia_type) ||
- !local->stbuf.ia_blksize) {
- local->stbuf = *prebuf;
- local->poststbuf = *postbuf;
- }
- }
-
- if ((!IA_ISDIR (postbuf->ia_type)) &&
- (NS (this) != prev_frame->this)) {
- /* If file, take the stat info from
- Storage node. */
- local->stbuf = *prebuf;
- local->poststbuf = *postbuf;
- }
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->ia_ino) {
- local->stbuf.ia_ino = local->ia_ino;
- local->poststbuf.ia_ino = local->ia_ino;
- } else {
- local->op_ret = -1;
- }
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->poststbuf);
- }
-
- return 0;
-}
-
-
-/**
- * unify_truncate -
- */
-int32_t
-unify_truncate (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- off_t offset)
-{
- unify_local_t *local = NULL;
- unify_private_t *priv = this->private;
- int32_t index = 0;
- int32_t callcnt = 0;
- uint64_t tmp_list = 0;
-
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- loc_copy (&local->loc1, loc);
- local->ia_ino = loc->inode->ino;
-
- if (IA_ISDIR (loc->inode->ia_type)) {
- local->call_count = 1;
-
- STACK_WIND (frame,
- unify_truncate_cbk,
- NS(this),
- NS(this)->fops->truncate,
- loc,
- 0);
- } else {
- local->op_ret = 0;
- inode_ctx_get (loc->inode, this, &tmp_list);
- local->list = (int16_t *)(long)tmp_list;
-
- for (index = 0; local->list[index] != -1; index++) {
- local->call_count++;
- callcnt++;
- }
-
- /* Don't send offset to NS truncate */
- STACK_WIND (frame, unify_truncate_cbk, NS(this),
- NS(this)->fops->truncate, loc, 0);
- callcnt--;
-
- for (index = 0; local->list[index] != -1; index++) {
- if (NS(this) != priv->xl_array[local->list[index]]) {
- STACK_WIND (frame,
- unify_truncate_cbk,
- priv->xl_array[local->list[index]],
- priv->xl_array[local->list[index]]->fops->truncate,
- loc,
- offset);
- if (!--callcnt)
- break;
- }
- }
- }
-
- return 0;
-}
-
-/**
- * unify_readlink_cbk -
- */
-int32_t
-unify_readlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- const char *path,
- struct iatt *sbuf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, path, sbuf);
- return 0;
-}
-
-/**
- * unify_readlink - Read the link only from the storage node.
- */
-int32_t
-unify_readlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- size_t size)
-{
- unify_private_t *priv = this->private;
- int32_t entry_count = 0;
- int16_t *list = NULL;
- int16_t index = 0;
- uint64_t tmp_list = 0;
-
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
- inode_ctx_get (loc->inode, this, &tmp_list);
- list = (int16_t *)(long)tmp_list;
-
- for (index = 0; list[index] != -1; index++)
- entry_count++;
-
- if (entry_count >= 2) {
- for (index = 0; list[index] != -1; index++) {
- if (priv->xl_array[list[index]] != NS(this)) {
- STACK_WIND (frame,
- unify_readlink_cbk,
- priv->xl_array[list[index]],
- priv->xl_array[list[index]]->fops->readlink,
- loc,
- size);
- break;
- }
- }
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "returning ENOENT, no softlink files found "
- "on storage node");
- STACK_UNWIND (frame, -1, ENOENT, NULL);
- }
-
- return 0;
-}
-
-
-/**
- * unify_unlink_cbk -
- */
-int32_t
-unify_unlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret == 0 || ((op_errno == ENOENT) && priv->optimist))
- local->op_ret = 0;
- if (op_ret == -1)
- local->op_errno = op_errno;
-
- if (((call_frame_t *)cookie)->this == NS(this)) {
- local->oldpreparent = *preparent;
- local->oldpostparent = *postparent;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->oldpreparent, &local->oldpostparent);
- }
-
- return 0;
-}
-
-
-/**
- * unify_unlink -
- */
-int32_t
-unify_unlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc)
-{
- unify_private_t *priv = this->private;
- unify_local_t *local = NULL;
- int16_t *list = NULL;
- int16_t index = 0;
- uint64_t tmp_list = 0;
-
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- loc_copy (&local->loc1, loc);
-
- inode_ctx_get (loc->inode, this, &tmp_list);
- list = (int16_t *)(long)tmp_list;
-
- for (index = 0; list[index] != -1; index++)
- local->call_count++;
-
- if (local->call_count) {
- for (index = 0; list[index] != -1; index++) {
- char need_break = (list[index+1] == -1);
- STACK_WIND (frame,
- unify_unlink_cbk,
- priv->xl_array[list[index]],
- priv->xl_array[list[index]]->fops->unlink,
- loc);
- if (need_break)
- break;
- }
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: returning ENOENT", loc->path);
- STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
- }
-
- return 0;
-}
-
-
-/**
- * unify_readv_cbk -
- */
-int32_t
-unify_readv_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iovec *vector,
- int32_t count,
- struct iatt *stbuf,
- struct iobref *iobref)
-{
- STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref);
- return 0;
-}
-
-/**
- * unify_readv -
- */
-int32_t
-unify_readv (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t offset)
-{
- UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
- xlator_t *child = NULL;
- uint64_t tmp_child = 0;
-
- fd_ctx_get (fd, this, &tmp_child);
- child = (xlator_t *)(long)tmp_child;
-
- STACK_WIND (frame,
- unify_readv_cbk,
- child,
- child->fops->readv,
- fd,
- size,
- offset);
-
-
- return 0;
-}
-
-/**
- * unify_writev_cbk -
- */
-int32_t
-unify_writev_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- unify_local_t *local = NULL;
-
- local = frame->local;
-
- local->stbuf = *prebuf;
- local->stbuf.ia_ino = local->ia_ino;
-
- local->poststbuf = *postbuf;
- local->poststbuf.ia_ino = local->ia_ino;
-
- STACK_UNWIND (frame, op_ret, op_errno,
- &local->stbuf, &local->poststbuf);
- return 0;
-}
-
-/**
- * unify_writev -
- */
-int32_t
-unify_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t off,
- struct iobref *iobref)
-{
- UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
- xlator_t *child = NULL;
- uint64_t tmp_child = 0;
- unify_local_t *local = NULL;
-
- INIT_LOCAL (frame, local);
- local->ia_ino = fd->inode->ino;
-
- fd_ctx_get (fd, this, &tmp_child);
- child = (xlator_t *)(long)tmp_child;
-
- STACK_WIND (frame,
- unify_writev_cbk,
- child,
- child->fops->writev,
- fd,
- vector,
- count,
- off,
- iobref);
-
- return 0;
-}
-
-/**
- * unify_ftruncate -
- */
-int32_t
-unify_ftruncate (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- off_t offset)
-{
- xlator_t *child = NULL;
- unify_local_t *local = NULL;
- uint64_t tmp_child = 0;
-
- UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(fd);
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- local->op_ret = 0;
-
- fd_ctx_get (fd, this, &tmp_child);
- child = (xlator_t *)(long)tmp_child;
-
- local->call_count = 2;
-
- STACK_WIND (frame, unify_truncate_cbk,
- child, child->fops->ftruncate,
- fd, offset);
-
- STACK_WIND (frame, unify_truncate_cbk,
- NS(this), NS(this)->fops->ftruncate,
- fd, 0);
-
- return 0;
-}
-
-
-/**
- * unify_flush_cbk -
- */
-int32_t
-unify_flush_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-/**
- * unify_flush -
- */
-int32_t
-unify_flush (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd)
-{
- UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
- xlator_t *child = NULL;
- uint64_t tmp_child = 0;
-
- fd_ctx_get (fd, this, &tmp_child);
- child = (xlator_t *)(long)tmp_child;
-
- STACK_WIND (frame, unify_flush_cbk, child,
- child->fops->flush, fd);
-
- return 0;
-}
-
-
-/**
- * unify_fsync_cbk -
- */
-int32_t
-unify_fsync_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
-
-/**
- * unify_fsync -
- */
-int32_t
-unify_fsync (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags)
-{
- UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
- xlator_t *child = NULL;
- uint64_t tmp_child = 0;
-
- fd_ctx_get (fd, this, &tmp_child);
- child = (xlator_t *)(long)tmp_child;
-
- STACK_WIND (frame, unify_fsync_cbk, child,
- child->fops->fsync, fd, flags);
-
- return 0;
-}
-
-/**
- * unify_fstat - Send fstat FOP to Namespace only if its directory, and to
- * both namespace and the storage node if its a file.
- */
-int32_t
-unify_fstat (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd)
-{
- unify_local_t *local = NULL;
- xlator_t *child = NULL;
- uint64_t tmp_child = 0;
-
- UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd);
-
- INIT_LOCAL (frame, local);
- local->ia_ino = fd->inode->ino;
-
- if (!fd_ctx_get (fd, this, &tmp_child)) {
- /* If its set, then its file */
- child = (xlator_t *)(long)tmp_child;
- local->call_count = 2;
-
- STACK_WIND (frame, unify_buf_cbk, child,
- child->fops->fstat, fd);
-
- STACK_WIND (frame, unify_buf_cbk, NS(this),
- NS(this)->fops->fstat, fd);
-
- } else {
- /* this is an directory */
- local->call_count = 1;
- STACK_WIND (frame, unify_buf_cbk, NS(this),
- NS(this)->fops->fstat, fd);
- }
-
- return 0;
-}
-
-/**
- * unify_getdents_cbk -
- */
-int32_t
-unify_getdents_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entry,
- int32_t count)
-{
- STACK_UNWIND (frame, op_ret, op_errno, entry, count);
- return 0;
-}
-
-/**
- * unify_getdents - send the FOP request to all the nodes.
- */
-int32_t
-unify_getdents (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t offset,
- int32_t flag)
-{
- UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd);
-
- STACK_WIND (frame, unify_getdents_cbk, NS(this),
- NS(this)->fops->getdents, fd, size, offset, flag);
-
- return 0;
-}
-
-
-/**
- * unify_readdir_cbk -
- */
-int32_t
-unify_readdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- gf_dirent_t *buf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, buf);
-
- return 0;
-}
-
-/**
- * unify_readdir - send the FOP request to all the nodes.
- */
-int32_t
-unify_readdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t offset)
-{
- UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd);
-
- STACK_WIND (frame, unify_readdir_cbk, NS(this),
- NS(this)->fops->readdir, fd, size, offset);
-
- return 0;
-}
-
-
-int32_t
-unify_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *buf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, buf);
-
- return 0;
-}
-
-
-int32_t
-unify_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
-{
- UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd);
-
- STACK_WIND (frame, unify_readdirp_cbk, NS(this),
- NS(this)->fops->readdirp, fd, size, offset);
-
- return 0;
-}
-
-
-/**
- * unify_fsyncdir_cbk -
- */
-int32_t
-unify_fsyncdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
-
- return 0;
-}
-
-/**
- * unify_fsyncdir -
- */
-int32_t
-unify_fsyncdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags)
-{
- UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd);
-
- STACK_WIND (frame, unify_fsyncdir_cbk,
- NS(this), NS(this)->fops->fsyncdir, fd, flags);
-
- return 0;
-}
-
-/**
- * unify_lk_cbk - UNWIND frame with the proper return arguments.
- */
-int32_t
-unify_lk_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct gf_flock *lock)
-{
- STACK_UNWIND (frame, op_ret, op_errno, lock);
- return 0;
-}
-
-/**
- * unify_lk - Send it to all the storage nodes, (should be 1) which has file.
- */
-int32_t
-unify_lk (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t cmd,
- struct gf_flock *lock)
-{
- UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
- xlator_t *child = NULL;
- uint64_t tmp_child = 0;
-
- fd_ctx_get (fd, this, &tmp_child);
- child = (xlator_t *)(long)tmp_child;
-
- STACK_WIND (frame, unify_lk_cbk, child,
- child->fops->lk, fd, cmd, lock);
-
- return 0;
-}
-
-
-int32_t
-unify_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno);
-
-static int32_t
-unify_setxattr_file_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- unify_private_t *private = this->private;
- unify_local_t *local = frame->local;
- xlator_t *sched_xl = NULL;
- struct sched_ops *sched_ops = NULL;
-
- if (op_ret == -1) {
- if (!ENOTSUP)
- gf_log (this->name, GF_LOG_ERROR,
- "setxattr with XATTR_CREATE on ns: "
- "path(%s) key(%s): %s",
- local->loc1.path, local->name,
- strerror (op_errno));
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
- }
-
- LOCK (&frame->lock);
- {
- local->failed = 0;
- local->op_ret = 0;
- local->op_errno = 0;
- local->call_count = 1;
- }
- UNLOCK (&frame->lock);
-
- /* schedule XATTR_CREATE on one of the child node */
- sched_ops = private->sched_ops;
-
- /* Send create request to the scheduled node now */
- sched_xl = sched_ops->schedule (this, local->name);
- if (!sched_xl) {
- STACK_UNWIND (frame, -1, ENOTCONN);
- return 0;
- }
-
- STACK_WIND (frame,
- unify_setxattr_cbk,
- sched_xl,
- sched_xl->fops->setxattr,
- &local->loc1,
- local->dict,
- local->flags);
- return 0;
-}
-
-/**
- * unify_setxattr_cbk - When all the child nodes return, UNWIND frame.
- */
-int32_t
-unify_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int32_t callcnt = 0;
- unify_local_t *local = frame->local;
- call_frame_t *prev_frame = cookie;
- dict_t *dict = NULL;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- gf_log (this->name, (((op_errno == ENOENT) ||
- (op_errno == ENOTSUP))?
- GF_LOG_DEBUG : GF_LOG_ERROR),
- "child(%s): path(%s): %s",
- prev_frame->this->name,
- (local->loc1.path)?local->loc1.path:"",
- strerror (op_errno));
- if (local->failed == -1) {
- local->failed = 1;
- }
- local->op_errno = op_errno;
- } else {
- local->failed = 0;
- local->op_ret = op_ret;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (local->failed && local->name &&
- ZR_FILE_CONTENT_REQUEST(local->name)) {
- dict = get_new_dict ();
- dict_set (dict, local->dict->members_list->key,
- data_from_dynptr(NULL, 0));
- dict_ref (dict);
-
- local->call_count = 1;
-
- STACK_WIND (frame,
- unify_setxattr_file_cbk,
- NS(this),
- NS(this)->fops->setxattr,
- &local->loc1,
- dict,
- XATTR_CREATE);
-
- dict_unref (dict);
- return 0;
- }
-
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno);
- }
-
- return 0;
-}
-
-/**
- * unify_sexattr - This function should be sent to all the storage nodes,
- * which contains the file, (excluding namespace).
- */
-int32_t
-unify_setxattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *dict,
- int32_t flags)
-{
- unify_private_t *priv = this->private;
- unify_local_t *local = NULL;
- int16_t *list = NULL;
- int16_t index = 0;
- int32_t call_count = 0;
- uint64_t tmp_list = 0;
- data_pair_t *trav = dict->members_list;
-
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- local->failed = -1;
- loc_copy (&local->loc1, loc);
-
- if (IA_ISDIR (loc->inode->ia_type)) {
-
- if (trav && trav->key && ZR_FILE_CONTENT_REQUEST(trav->key)) {
- /* direct the storage xlators to change file
- content only if file exists */
- local->flags = flags;
- local->dict = dict;
- local->name = gf_strdup (trav->key);
- flags |= XATTR_REPLACE;
- }
-
- local->call_count = priv->child_count;
- for (index = 0; index < priv->child_count; index++) {
- STACK_WIND (frame,
- unify_setxattr_cbk,
- priv->xl_array[index],
- priv->xl_array[index]->fops->setxattr,
- loc, dict, flags);
- }
- return 0;
- }
-
- inode_ctx_get (loc->inode, this, &tmp_list);
- list = (int16_t *)(long)tmp_list;
-
- for (index = 0; list[index] != -1; index++) {
- if (NS(this) != priv->xl_array[list[index]]) {
- local->call_count++;
- call_count++;
- }
- }
-
- if (local->call_count) {
- for (index = 0; list[index] != -1; index++) {
- if (priv->xl_array[list[index]] != NS(this)) {
- STACK_WIND (frame,
- unify_setxattr_cbk,
- priv->xl_array[list[index]],
- priv->xl_array[list[index]]->fops->setxattr,
- loc,
- dict,
- flags);
- if (!--call_count)
- break;
- }
- }
- return 0;
- }
-
- /* No entry in storage nodes */
- gf_log (this->name, GF_LOG_DEBUG,
- "returning ENOENT, file not found on storage node.");
- STACK_UNWIND (frame, -1, ENOENT);
-
- return 0;
-}
-
-
-/**
- * unify_getxattr_cbk - This function is called from only one child, so, no
- * need of any lock or anything else, just send it to above layer
- */
-int32_t
-unify_getxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *value)
-{
- int32_t callcnt = 0;
- dict_t *local_value = NULL;
- unify_local_t *local = frame->local;
- call_frame_t *prev_frame = cookie;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_log (this->name,
- (((op_errno == ENOENT) ||
- (op_errno == ENODATA) ||
- (op_errno == ENOTSUP)) ?
- GF_LOG_DEBUG : GF_LOG_ERROR),
- "child(%s): path(%s): %s",
- prev_frame->this->name,
- (local->loc1.path)?local->loc1.path:"",
- strerror (op_errno));
- } else {
- if (!local->dict)
- local->dict = dict_ref (value);
- local->op_ret = op_ret;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- local_value = local->dict;
- local->dict = NULL;
-
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- local_value);
-
- if (local_value)
- dict_unref (local_value);
- }
-
- return 0;
-}
-
-
-/**
- * unify_getxattr - This FOP is sent to only the storage node.
- */
-int32_t
-unify_getxattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name)
-{
- unify_private_t *priv = this->private;
- int16_t *list = NULL;
- int16_t index = 0;
- int16_t count = 0;
- unify_local_t *local = NULL;
- uint64_t tmp_list = 0;
-
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
- INIT_LOCAL (frame, local);
-
- if (IA_ISDIR (loc->inode->ia_type)) {
- local->call_count = priv->child_count;
- for (index = 0; index < priv->child_count; index++)
- STACK_WIND (frame,
- unify_getxattr_cbk,
- priv->xl_array[index],
- priv->xl_array[index]->fops->getxattr,
- loc,
- name);
- return 0;
- }
-
- inode_ctx_get (loc->inode, this, &tmp_list);
- list = (int16_t *)(long)tmp_list;
-
- for (index = 0; list[index] != -1; index++) {
- if (NS(this) != priv->xl_array[list[index]]) {
- local->call_count++;
- count++;
- }
- }
-
- if (count) {
- for (index = 0; list[index] != -1; index++) {
- if (priv->xl_array[list[index]] != NS(this)) {
- STACK_WIND (frame,
- unify_getxattr_cbk,
- priv->xl_array[list[index]],
- priv->xl_array[list[index]]->fops->getxattr,
- loc,
- name);
- if (!--count)
- break;
- }
- }
- } else {
- dict_t *tmp_dict = get_new_dict ();
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: returning ENODATA, no file found on storage node",
- loc->path);
- STACK_UNWIND (frame, -1, ENODATA, tmp_dict);
- dict_destroy (tmp_dict);
- }
-
- return 0;
-}
-
-/**
- * unify_removexattr_cbk - Wait till all the child node returns the call
- * and then UNWIND to above layer.
- */
-int32_t
-unify_removexattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- int32_t callcnt = 0;
- unify_local_t *local = frame->local;
- call_frame_t *prev_frame = cookie;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret == -1) {
- local->op_errno = op_errno;
- if (op_errno != ENOTSUP)
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): path(%s): %s",
- prev_frame->this->name,
- local->loc1.path, strerror (op_errno));
- } else {
- local->op_ret = op_ret;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- STACK_UNWIND (frame, local->op_ret, local->op_errno);
- }
-
- return 0;
-}
-
-/**
- * unify_removexattr - Send it to all the child nodes which has the files.
- */
-int32_t
-unify_removexattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name)
-{
- unify_private_t *priv = this->private;
- unify_local_t *local = NULL;
- int16_t *list = NULL;
- int16_t index = 0;
- int32_t call_count = 0;
- uint64_t tmp_list = 0;
-
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
- /* Initialization */
- INIT_LOCAL (frame, local);
-
- if (IA_ISDIR (loc->inode->ia_type)) {
- local->call_count = priv->child_count;
- for (index = 0; index < priv->child_count; index++)
- STACK_WIND (frame,
- unify_removexattr_cbk,
- priv->xl_array[index],
- priv->xl_array[index]->fops->removexattr,
- loc,
- name);
-
- return 0;
- }
-
- inode_ctx_get (loc->inode, this, &tmp_list);
- list = (int16_t *)(long)tmp_list;
-
- for (index = 0; list[index] != -1; index++) {
- if (NS(this) != priv->xl_array[list[index]]) {
- local->call_count++;
- call_count++;
- }
- }
-
- if (local->call_count) {
- for (index = 0; list[index] != -1; index++) {
- if (priv->xl_array[list[index]] != NS(this)) {
- STACK_WIND (frame,
- unify_removexattr_cbk,
- priv->xl_array[list[index]],
- priv->xl_array[list[index]]->fops->removexattr,
- loc,
- name);
- if (!--call_count)
- break;
- }
- }
- return 0;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: returning ENOENT, not found on storage node.", loc->path);
- STACK_UNWIND (frame, -1, ENOENT);
-
- return 0;
-}
-
-
-int32_t
-unify_mknod_unlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- unify_local_t *local = frame->local;
-
- if (op_ret == -1)
- gf_log (this->name, GF_LOG_ERROR,
- "%s: %s", local->loc1.path, strerror (op_errno));
-
- unify_local_wipe (local);
- /* No log required here as this -1 is for mknod call */
- STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL);
- return 0;
-}
-
-/**
- * unify_mknod_cbk -
- */
-int32_t
-unify_mknod_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- unify_local_t *local = frame->local;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "mknod failed on storage node, sending unlink to "
- "namespace");
- local->op_errno = op_errno;
- STACK_WIND (frame,
- unify_mknod_unlink_cbk,
- NS(this),
- NS(this)->fops->unlink,
- &local->loc1);
- return 0;
- }
-
- local->stbuf = *buf;
- local->stbuf.ia_ino = local->ia_ino;
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf,
- &local->oldpreparent, &local->oldpostparent);
- return 0;
-}
-
-/**
- * unify_ns_mknod_cbk -
- */
-int32_t
-unify_ns_mknod_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- struct sched_ops *sched_ops = NULL;
- xlator_t *sched_xl = NULL;
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- int16_t *list = NULL;
- int16_t index = 0;
- call_frame_t *prev_frame = cookie;
-
- if (op_ret == -1) {
- /* No need to send mknod request to other servers,
- * as namespace action failed
- */
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): path(%s): %s",
- prev_frame->this->name, local->loc1.path,
- strerror (op_errno));
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
- }
-
- /* Create one inode for this entry */
- local->op_ret = 0;
- local->stbuf = *buf;
- local->ia_ino = buf->ia_ino;
-
- local->oldpreparent = *preparent;
- local->oldpostparent = *postparent;
-
- list = GF_CALLOC (1, sizeof (int16_t) * 3, gf_unify_mt_int16_t);
- ERR_ABORT (list);
- list[0] = priv->child_count;
- list[2] = -1;
- inode_ctx_put (inode, this, (uint64_t)(long)list);
-
- sched_ops = priv->sched_ops;
-
- /* Send mknod request to scheduled node now */
- sched_xl = sched_ops->schedule (this, local->loc1.path);
- if (!sched_xl) {
- gf_log (this->name, GF_LOG_ERROR,
- "mknod failed on storage node, no node online "
- "at the moment, sending unlink to NS");
- local->op_errno = ENOTCONN;
- STACK_WIND (frame,
- unify_mknod_unlink_cbk,
- NS(this),
- NS(this)->fops->unlink,
- &local->loc1);
-
- return 0;
- }
-
- for (index = 0; index < priv->child_count; index++)
- if (sched_xl == priv->xl_array[index])
- break;
- list[1] = index;
-
- STACK_WIND (frame, unify_mknod_cbk,
- sched_xl, sched_xl->fops->mknod,
- &local->loc1, local->mode, local->dev);
-
- return 0;
-}
-
-/**
- * unify_mknod - Create a device on namespace first, and later create on
- * the storage node.
- */
-int32_t
-unify_mknod (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- mode_t mode,
- dev_t rdev)
-{
- unify_local_t *local = NULL;
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- local->mode = mode;
- local->dev = rdev;
- loc_copy (&local->loc1, loc);
- if (local->loc1.path == NULL) {
- gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
- STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL);
- return 0;
- }
-
- STACK_WIND (frame,
- unify_ns_mknod_cbk,
- NS(this),
- NS(this)->fops->mknod,
- loc,
- mode,
- rdev);
-
- return 0;
-}
-
-int32_t
-unify_symlink_unlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- unify_local_t *local = frame->local;
- if (op_ret == -1)
- gf_log (this->name, GF_LOG_ERROR,
- "%s: %s", local->loc1.path, strerror (op_errno));
-
- unify_local_wipe (local);
- STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL);
- return 0;
-}
-
-/**
- * unify_symlink_cbk -
- */
-int32_t
-unify_symlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- unify_local_t *local = frame->local;
-
- if (op_ret == -1) {
- /* Symlink on storage node failed, hence send unlink
- to the NS node */
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_ERROR,
- "symlink on storage node failed, sending unlink "
- "to namespace");
-
- STACK_WIND (frame,
- unify_symlink_unlink_cbk,
- NS(this),
- NS(this)->fops->unlink,
- &local->loc1);
-
- return 0;
- }
-
- local->stbuf = *buf;
- local->stbuf.ia_ino = local->ia_ino;
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf,
- &local->oldpreparent, &local->oldpostparent);
-
- return 0;
-}
-
-/**
- * unify_ns_symlink_cbk -
- */
-int32_t
-unify_ns_symlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
-
- struct sched_ops *sched_ops = NULL;
- xlator_t *sched_xl = NULL;
- int16_t *list = NULL;
- unify_local_t *local = frame->local;
- unify_private_t *priv = this->private;
- int16_t index = 0;
-
- if (op_ret == -1) {
- /* No need to send symlink request to other servers,
- * as namespace action failed
- */
- gf_log (this->name, GF_LOG_ERROR,
- "namespace: path(%s): %s",
- local->loc1.path, strerror (op_errno));
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno, NULL, buf,
- preparent, postparent);
- return 0;
- }
-
- /* Create one inode for this entry */
- local->op_ret = 0;
- local->ia_ino = buf->ia_ino;
-
- local->oldpreparent = *preparent;
- local->oldpostparent = *postparent;
-
- /* Start the mapping list */
-
- list = GF_CALLOC (1, sizeof (int16_t) * 3, gf_unify_mt_int16_t);
- ERR_ABORT (list);
- list[0] = priv->child_count; //namespace's index
- list[2] = -1;
- inode_ctx_put (inode, this, (uint64_t)(long)list);
-
- sched_ops = priv->sched_ops;
-
- /* Send symlink request to all the nodes now */
- sched_xl = sched_ops->schedule (this, local->loc1.path);
- if (!sched_xl) {
- /* Symlink on storage node failed, hence send unlink
- to the NS node */
- local->op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_ERROR,
- "symlink on storage node failed, no node online, "
- "sending unlink to namespace");
-
- STACK_WIND (frame,
- unify_symlink_unlink_cbk,
- NS(this),
- NS(this)->fops->unlink,
- &local->loc1);
-
- return 0;
- }
-
- for (index = 0; index < priv->child_count; index++)
- if (sched_xl == priv->xl_array[index])
- break;
- list[1] = index;
-
- STACK_WIND (frame,
- unify_symlink_cbk,
- sched_xl,
- sched_xl->fops->symlink,
- local->name,
- &local->loc1);
-
- return 0;
-}
-
-/**
- * unify_symlink -
- */
-int32_t
-unify_symlink (call_frame_t *frame,
- xlator_t *this,
- const char *linkpath,
- loc_t *loc)
-{
- unify_local_t *local = NULL;
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- loc_copy (&local->loc1, loc);
- local->name = gf_strdup (linkpath);
-
- if ((local->name == NULL) ||
- (local->loc1.path == NULL)) {
- gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
- STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL);
- return 0;
- }
-
- STACK_WIND (frame,
- unify_ns_symlink_cbk,
- NS(this),
- NS(this)->fops->symlink,
- linkpath,
- loc);
-
- return 0;
-}
-
-
-int32_t
-unify_rename_unlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- int32_t callcnt = 0;
- unify_local_t *local = frame->local;
- call_frame_t *prev_frame = cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): path(%s -> %s): %s",
- prev_frame->this->name,
- local->loc1.path, local->loc2.path,
- strerror (op_errno));
-
- }
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- local->stbuf.ia_ino = local->ia_ino;
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->stbuf);
- }
- return 0;
-}
-
-int32_t
-unify_ns_rename_undo_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *buf,
- struct iatt *preoldparent,
- struct iatt *postoldparent,
- struct iatt *prenewparent,
- struct iatt *postnewparent)
-{
- unify_local_t *local = frame->local;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "namespace: path(%s -> %s): %s",
- local->loc1.path, local->loc2.path,
- strerror (op_errno));
- }
-
- local->stbuf.ia_ino = local->ia_ino;
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf);
- return 0;
-}
-
-int32_t
-unify_rename_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *buf,
- struct iatt *preoldparent,
- struct iatt *postoldparent,
- struct iatt *prenewparent,
- struct iatt *postnewparent)
-{
- int32_t index = 0;
- int32_t callcnt = 0;
- int16_t *list = NULL;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
- call_frame_t *prev_frame = cookie;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret >= 0) {
- if (!IA_ISDIR (buf->ia_type))
- local->stbuf = *buf;
- local->op_ret = op_ret;
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "child(%s): path(%s -> %s): %s",
- prev_frame->this->name,
- local->loc1.path, local->loc2.path,
- strerror (op_errno));
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- local->stbuf.ia_ino = local->ia_ino;
- if (IA_ISDIR (local->loc1.inode->ia_type)) {
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->oldpreparent,
- &local->oldpostparent, &local->newpreparent,
- &local->newpostparent);
- return 0;
- }
-
- if (local->op_ret == -1) {
- /* TODO: check this logic */
-
- /* Rename failed in storage node, successful on NS,
- * hence, rename back the entries in NS */
- /* NOTE: this will be done only if the destination
- * doesn't exists, if the destination exists, the
- * job of correcting NS is left to self-heal
- */
- if (!local->index) {
- loc_t tmp_oldloc = {
- /* its actual 'newloc->path' */
- .path = local->loc2.path,
- .inode = local->loc1.inode,
- .parent = local->loc2.parent
- };
-
- loc_t tmp_newloc = {
- /* Actual 'oldloc->path' */
- .path = local->loc1.path,
- .parent = local->loc1.parent
- };
-
- gf_log (this->name, GF_LOG_ERROR,
- "rename succussful on namespace, on "
- "stroage node failed, reverting back");
-
- STACK_WIND (frame,
- unify_ns_rename_undo_cbk,
- NS(this),
- NS(this)->fops->rename,
- &tmp_oldloc,
- &tmp_newloc);
- return 0;
- }
- } else {
- /* Rename successful on storage nodes */
-
- int32_t idx = 0;
- int16_t *tmp_list = NULL;
- uint64_t tmp_list_int64 = 0;
- if (local->loc2.inode) {
- inode_ctx_get (local->loc2.inode,
- this, &tmp_list_int64);
- list = (int16_t *)(long)tmp_list_int64;
-
- }
-
- if (list) {
- for (index = 0; list[index] != -1; index++);
- tmp_list = GF_CALLOC (1, index * 2,
- gf_unify_mt_int16_t);
- memcpy (tmp_list, list, index * 2);
-
- for (index = 0; list[index] != -1; index++) {
- /* TODO: Check this logic. */
- /* If the destination file exists in
- * the same storage node where we sent
- * 'rename' call, no need to send
- * unlink
- */
- for (idx = 0;
- local->list[idx] != -1; idx++) {
- if (tmp_list[index] == local->list[idx]) {
- tmp_list[index] = priv->child_count;
- continue;
- }
- }
-
- if (NS(this) != priv->xl_array[tmp_list[index]]) {
- local->call_count++;
- callcnt++;
- }
- }
-
- if (local->call_count) {
- if (callcnt > 1)
- gf_log (this->name,
- GF_LOG_ERROR,
- "%s->%s: more (%d) "
- "subvolumes have the "
- "newloc entry",
- local->loc1.path,
- local->loc2.path,
- callcnt);
-
- for (index=0;
- tmp_list[index] != -1; index++) {
- if (NS(this) != priv->xl_array[tmp_list[index]]) {
- STACK_WIND (frame,
- unify_rename_unlink_cbk,
- priv->xl_array[tmp_list[index]],
- priv->xl_array[tmp_list[index]]->fops->unlink,
- &local->loc2);
- if (!--callcnt)
- break;
- }
- }
-
- GF_FREE (tmp_list);
- return 0;
- }
- if (tmp_list)
- GF_FREE (tmp_list);
- }
- }
-
- /* Need not send 'unlink' to storage node */
- unify_local_wipe (local);
- STACK_UNWIND (frame, local->op_ret,
- local->op_errno, &local->stbuf,
- &local->oldpreparent, &local->oldpostparent,
- &local->newpreparent, &local->newpostparent);
- }
-
- return 0;
-}
-
-int32_t
-unify_ns_rename_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *buf,
- struct iatt *preoldparent,
- struct iatt *postoldparent,
- struct iatt *prenewparent,
- struct iatt *postnewparent)
-{
- int32_t index = 0;
- int32_t callcnt = 0;
- int16_t *list = NULL;
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
-
- if (op_ret == -1) {
- /* Free local->new_inode */
- gf_log (this->name, GF_LOG_ERROR,
- "namespace: path(%s -> %s): %s",
- local->loc1.path, local->loc2.path,
- strerror (op_errno));
-
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno, buf,
- preoldparent, postoldparent,
- prenewparent, postnewparent);
- return 0;
- }
-
- local->stbuf = *buf;
- local->ia_ino = buf->ia_ino;
-
- local->oldpreparent = *preoldparent;
- local->oldpostparent = *postoldparent;
- local->newpreparent = *prenewparent;
- local->newpostparent = *postnewparent;
-
- /* Everything is fine. */
- if (IA_ISDIR (buf->ia_type)) {
- local->call_count = priv->child_count;
- for (index=0; index < priv->child_count; index++) {
- STACK_WIND (frame,
- unify_rename_cbk,
- priv->xl_array[index],
- priv->xl_array[index]->fops->rename,
- &local->loc1,
- &local->loc2);
- }
-
- return 0;
- }
-
- local->call_count = 0;
- /* send rename */
- list = local->list;
- for (index=0; list[index] != -1; index++) {
- if (NS(this) != priv->xl_array[list[index]]) {
- local->call_count++;
- callcnt++;
- }
- }
-
- if (local->call_count) {
- for (index=0; list[index] != -1; index++) {
- if (NS(this) != priv->xl_array[list[index]]) {
- STACK_WIND (frame,
- unify_rename_cbk,
- priv->xl_array[list[index]],
- priv->xl_array[list[index]]->fops->rename,
- &local->loc1,
- &local->loc2);
- if (!--callcnt)
- break;
- }
- }
- } else {
- /* file doesn't seem to be present in storage nodes */
- gf_log (this->name, GF_LOG_CRITICAL,
- "CRITICAL: source file not in storage node, "
- "rename successful on namespace :O");
- unify_local_wipe (local);
- STACK_UNWIND (frame, -1, EIO, NULL,
- NULL, NULL, /* preoldparent, postoldparent */
- NULL, NULL); /* prenewparent, postnewparent */
- }
- return 0;
-}
-
-
-/**
- * unify_rename - One of the tricky function. The deadliest of all :O
- */
-int32_t
-unify_rename (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- unify_local_t *local = NULL;
- uint64_t tmp_list = 0;
-
- /* Initialization */
- INIT_LOCAL (frame, local);
- loc_copy (&local->loc1, oldloc);
- loc_copy (&local->loc2, newloc);
-
- if ((local->loc1.path == NULL) ||
- (local->loc2.path == NULL)) {
- gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
- STACK_UNWIND (frame, -1, ENOMEM, NULL,
- NULL, NULL, /* preoldparent, postoldparent */
- NULL, NULL); /* prenewparent, postnewparent */
- return 0;
- }
-
- inode_ctx_get (oldloc->inode, this, &tmp_list);
- local->list = (int16_t *)(long)tmp_list;
-
- STACK_WIND (frame,
- unify_ns_rename_cbk,
- NS(this),
- NS(this)->fops->rename,
- oldloc,
- newloc);
- return 0;
-}
-
-/**
- * unify_link_cbk -
- */
-int32_t
-unify_link_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- unify_local_t *local = frame->local;
-
- if (op_ret >= 0)
- local->stbuf = *buf;
- local->stbuf.ia_ino = local->ia_ino;
-
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf,
- &local->oldpreparent, &local->oldpostparent);
-
- return 0;
-}
-
-/**
- * unify_ns_link_cbk -
- */
-int32_t
-unify_ns_link_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- inode_t *inode,
- struct iatt *buf,
- struct iatt *preparent,
- struct iatt *postparent)
-{
- unify_private_t *priv = this->private;
- unify_local_t *local = frame->local;
- int16_t *list = local->list;
- int16_t index = 0;
-
- if (op_ret == -1) {
- /* No need to send link request to other servers,
- * as namespace action failed
- */
- gf_log (this->name, GF_LOG_ERROR,
- "namespace: path(%s -> %s): %s",
- local->loc1.path, local->loc2.path,
- strerror (op_errno));
- unify_local_wipe (local);
- STACK_UNWIND (frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
- }
-
- /* Update inode for this entry */
- local->op_ret = 0;
- local->ia_ino = buf->ia_ino;
-
- local->oldpreparent = *preparent;
- local->oldpostparent = *postparent;
-
- /* Send link request to the node now */
- for (index = 0; list[index] != -1; index++) {
- char need_break = (list[index+1] == -1);
- if (priv->xl_array[list[index]] != NS (this)) {
- STACK_WIND (frame,
- unify_link_cbk,
- priv->xl_array[list[index]],
- priv->xl_array[list[index]]->fops->link,
- &local->loc1,
- &local->loc2);
- break;
- }
- if (need_break)
- break;
- }
-
- return 0;
-}
-
-/**
- * unify_link -
- */
-int32_t
-unify_link (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc)
-{
- unify_local_t *local = NULL;
- uint64_t tmp_list = 0;
-
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc);
- UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (newloc);
-
- /* Initialization */
- INIT_LOCAL (frame, local);
-
- loc_copy (&local->loc1, oldloc);
- loc_copy (&local->loc2, newloc);
-
- inode_ctx_get (oldloc->inode, this, &tmp_list);
- local->list = (int16_t *)(long)tmp_list;
-
- STACK_WIND (frame,
- unify_ns_link_cbk,
- NS(this),
- NS(this)->fops->link,
- oldloc,
- newloc);
-
- return 0;
-}
-
-
-/**
- * unify_checksum_cbk -
- */
-int32_t
-unify_checksum_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- uint8_t *fchecksum,
- uint8_t *dchecksum)
-{
- STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum);
-
- return 0;
-}
-
-/**
- * unify_checksum -
- */
-int32_t
-unify_checksum (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flag)
-{
- STACK_WIND (frame,
- unify_checksum_cbk,
- NS(this),
- NS(this)->fops->checksum,
- loc,
- flag);
-
- return 0;
-}
-
-
-/**
- * unify_finodelk_cbk -
- */
-int
-unify_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-/**
- * unify_finodelk
- */
-int
-unify_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int cmd, struct gf_flock *flock)
-{
- UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
- xlator_t *child = NULL;
- uint64_t tmp_child = 0;
-
- fd_ctx_get (fd, this, &tmp_child);
- child = (xlator_t *)(long)tmp_child;
-
- STACK_WIND (frame, unify_finodelk_cbk,
- child, child->fops->finodelk,
- volume, fd, cmd, flock);
-
- return 0;
-}
-
-
-
-/**
- * unify_fentrylk_cbk -
- */
-int
-unify_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-/**
- * unify_fentrylk
- */
-int
-unify_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
-
-{
- UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
- xlator_t *child = NULL;
- uint64_t tmp_child = 0;
-
- fd_ctx_get (fd, this, &tmp_child);
- child = (xlator_t *)(long)tmp_child;
-
- STACK_WIND (frame, unify_fentrylk_cbk,
- child, child->fops->fentrylk,
- volume, fd, basename, cmd, type);
-
- return 0;
-}
-
-
-
-/**
- * unify_fxattrop_cbk -
- */
-int
-unify_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
-{
- STACK_UNWIND (frame, op_ret, op_errno, xattr);
- return 0;
-}
-
-/**
- * unify_fxattrop
- */
-int
-unify_fxattrop (call_frame_t *frame, xlator_t *this,
- fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr)
-{
- UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
- xlator_t *child = NULL;
- uint64_t tmp_child = 0;
-
- fd_ctx_get (fd, this, &tmp_child);
- child = (xlator_t *)(long)tmp_child;
-
- STACK_WIND (frame, unify_fxattrop_cbk,
- child, child->fops->fxattrop,
- fd, optype, xattr);
-
- return 0;
-}
-
-
-/**
- * unify_inodelk_cbk -
- */
-int
-unify_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-
-/**
- * unify_inodelk
- */
-int
-unify_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int cmd, struct gf_flock *flock)
-{
- xlator_t *child = NULL;
-
- child = unify_loc_subvol (loc, this);
-
- STACK_WIND (frame, unify_inodelk_cbk,
- child, child->fops->inodelk,
- volume, loc, cmd, flock);
-
- return 0;
-}
-
-
-
-/**
- * unify_entrylk_cbk -
- */
-int
-unify_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-/**
- * unify_entrylk
- */
-int
-unify_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
-
-{
- xlator_t *child = NULL;
-
- child = unify_loc_subvol (loc, this);
-
- STACK_WIND (frame, unify_entrylk_cbk,
- child, child->fops->entrylk,
- volume, loc, basename, cmd, type);
-
- return 0;
-}
-
-
-
-/**
- * unify_xattrop_cbk -
- */
-int
-unify_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
-{
- STACK_UNWIND (frame, op_ret, op_errno, xattr);
- return 0;
-}
-
-/**
- * unify_xattrop
- */
-int
-unify_xattrop (call_frame_t *frame, xlator_t *this,
- loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr)
-{
- xlator_t *child = NULL;
-
- child = unify_loc_subvol (loc, this);
-
- STACK_WIND (frame, unify_xattrop_cbk,
- child, child->fops->xattrop,
- loc, optype, xattr);
-
- return 0;
-}
-
-int
-unify_forget (xlator_t *this,
- inode_t *inode)
-{
- int16_t *list = NULL;
- uint64_t tmp_list = 0;
-
- if (inode->ia_type && (!IA_ISDIR(inode->ia_type))) {
- inode_ctx_get (inode, this, &tmp_list);
- if (tmp_list) {
- list = (int16_t *)(long)tmp_list;
- GF_FREE (list);
- }
- }
-
- return 0;
-}
-
-/**
- * notify
- */
-int32_t
-notify (xlator_t *this,
- int32_t event,
- void *data,
- ...)
-{
- unify_private_t *priv = this->private;
- struct sched_ops *sched = NULL;
-
- if (!priv) {
- return 0;
- }
-
- sched = priv->sched_ops;
- if (!sched) {
- gf_log (this->name, GF_LOG_CRITICAL, "No scheduler :O");
- raise (SIGTERM);
- return 0;
- }
- if (priv->namespace == data) {
- if (event == GF_EVENT_CHILD_UP) {
- sched->notify (this, event, data);
- }
- return 0;
- }
-
- switch (event)
- {
- case GF_EVENT_CHILD_UP:
- {
- /* Call scheduler's update () to enable it for scheduling */
- sched->notify (this, event, data);
-
- LOCK (&priv->lock);
- {
- /* Increment the inode's generation, which is
- used for self_heal */
- ++priv->inode_generation;
- ++priv->num_child_up;
- }
- UNLOCK (&priv->lock);
-
- if (!priv->is_up) {
- default_notify (this, event, data);
- priv->is_up = 1;
- }
- }
- break;
- case GF_EVENT_CHILD_DOWN:
- {
- /* Call scheduler's update () to disable the child node
- * for scheduling
- */
- sched->notify (this, event, data);
- LOCK (&priv->lock);
- {
- --priv->num_child_up;
- }
- UNLOCK (&priv->lock);
-
- if (priv->num_child_up == 0) {
- /* Send CHILD_DOWN to upper layer */
- default_notify (this, event, data);
- priv->is_up = 0;
- }
- }
- break;
-
- default:
- {
- default_notify (this, event, data);
- }
- break;
- }
-
- return 0;
-}
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_unify_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-
- return ret;
-}
-
-/**
- * init - This function is called first in the xlator, while initializing.
- * All the config file options are checked and appropriate flags are set.
- *
- * @this -
- */
-int32_t
-init (xlator_t *this)
-{
- int32_t ret = 0;
- int32_t count = 0;
- data_t *scheduler = NULL;
- data_t *data = NULL;
- xlator_t *ns_xl = NULL;
- xlator_list_t *trav = NULL;
- xlator_list_t *xlparent = NULL;
- xlator_list_t *parent = NULL;
- unify_private_t *_private = NULL;
-
-
- /* Check for number of child nodes, if there is no child nodes, exit */
- if (!this->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "No child nodes specified. check \"subvolumes \" "
- "option in volfile");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile ");
- }
-
- /* Check for 'scheduler' in volume */
- scheduler = dict_get (this->options, "scheduler");
- if (!scheduler) {
- gf_log (this->name, GF_LOG_ERROR,
- "\"option scheduler <x>\" is missing in volfile");
- return -1;
- }
-
- /* Setting "option namespace <node>" */
- data = dict_get (this->options, "namespace");
- if(!data) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "namespace option not specified, Exiting");
- return -1;
- }
- /* Search namespace in the child node, if found, exit */
- trav = this->children;
- while (trav) {
- if (strcmp (trav->xlator->name, data->data) == 0)
- break;
- trav = trav->next;
- }
- if (trav) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "namespace node used as a subvolume, Exiting");
- return -1;
- }
-
- /* Search for the namespace node, if found, continue */
- ns_xl = this->next;
- while (ns_xl) {
- if (strcmp (ns_xl->name, data->data) == 0)
- break;
- ns_xl = ns_xl->next;
- }
- if (!ns_xl) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "namespace node not found in volfile, Exiting");
- return -1;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "namespace node specified as %s", data->data);
-
- _private = GF_CALLOC (1, sizeof (*_private),
- gf_unify_mt_unify_private_t);
- ERR_ABORT (_private);
- _private->sched_ops = get_scheduler (this, scheduler->data);
- if (!_private->sched_ops) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Error while loading scheduler. Exiting");
- GF_FREE (_private);
- return -1;
- }
-
- if (ns_xl->parents) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Namespace node should not be a child of any other node. Exiting");
- GF_FREE (_private);
- return -1;
- }
-
- _private->namespace = ns_xl;
-
- /* update _private structure */
- {
- count = 0;
- trav = this->children;
- /* Get the number of child count */
- while (trav) {
- count++;
- trav = trav->next;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Child node count is %d", count);
-
- _private->child_count = count;
- if (count == 1) {
- /* TODO: Should I error out here? */
- gf_log (this->name, GF_LOG_CRITICAL,
- "WARNING: You have defined only one "
- "\"subvolumes\" for unify volume. It may not "
- "be the desired config, review your volume "
- "volfile. If this is how you are testing it,"
- " you may hit some performance penalty");
- }
-
- _private->xl_array = GF_CALLOC (1,
- sizeof (xlator_t) * (count + 1),
- gf_unify_mt_xlator_t);
- ERR_ABORT (_private->xl_array);
-
- count = 0;
- trav = this->children;
- while (trav) {
- _private->xl_array[count++] = trav->xlator;
- trav = trav->next;
- }
- _private->xl_array[count] = _private->namespace;
-
- /* self-heal part, start with generation '1' */
- _private->inode_generation = 1;
- /* Because, Foreground part is tested well */
- _private->self_heal = ZR_UNIFY_FG_SELF_HEAL;
- data = dict_get (this->options, "self-heal");
- if (data) {
- if (strcasecmp (data->data, "off") == 0)
- _private->self_heal = ZR_UNIFY_SELF_HEAL_OFF;
-
- if (strcasecmp (data->data, "foreground") == 0)
- _private->self_heal = ZR_UNIFY_FG_SELF_HEAL;
-
- if (strcasecmp (data->data, "background") == 0)
- _private->self_heal = ZR_UNIFY_BG_SELF_HEAL;
- }
-
- /* optimist - ask bulde for more about it */
- data = dict_get (this->options, "optimist");
- if (data) {
- if (gf_string2boolean (data->data,
- &_private->optimist) == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "optimist excepts only boolean "
- "options");
- }
- }
-
- LOCK_INIT (&_private->lock);
- }
-
- /* Now that everything is fine. */
- this->private = (void *)_private;
- {
- ret = _private->sched_ops->mem_acct_init (this);
-
- if (ret == -1) {
- return -1;
- }
-
- /* Initialize scheduler, if everything else is successful */
- ret = _private->sched_ops->init (this);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Initializing scheduler failed, Exiting");
- GF_FREE (_private);
- return -1;
- }
-
-
- ret = 0;
-
- /* This section is required because some fops may look
- * for 'xl->parent' variable
- */
- xlparent = GF_CALLOC (1, sizeof (*xlparent),
- gf_unify_mt_xlator_list_t);
- xlparent->xlator = this;
- if (!ns_xl->parents) {
- ns_xl->parents = xlparent;
- } else {
- parent = ns_xl->parents;
- while (parent->next)
- parent = parent->next;
- parent->next = xlparent;
- }
- }
-
- /* Tell namespace node that init is done */
- xlator_notify (ns_xl, GF_EVENT_PARENT_UP, this);
-
- return 0;
-}
-
-/**
- * fini - Free all the allocated memory
- */
-void
-fini (xlator_t *this)
-{
- unify_private_t *priv = this->private;
- priv->sched_ops->fini (this);
- this->private = NULL;
- LOCK_DESTROY (&priv->lock);
- GF_FREE (priv->xl_array);
- GF_FREE (priv);
- return;
-}
-
-
-struct xlator_fops fops = {
- .stat = unify_stat,
- .readlink = unify_readlink,
- .mknod = unify_mknod,
- .mkdir = unify_mkdir,
- .unlink = unify_unlink,
- .rmdir = unify_rmdir,
- .symlink = unify_symlink,
- .rename = unify_rename,
- .link = unify_link,
- .truncate = unify_truncate,
- .create = unify_create,
- .open = unify_open,
- .readv = unify_readv,
- .writev = unify_writev,
- .statfs = unify_statfs,
- .flush = unify_flush,
- .fsync = unify_fsync,
- .setxattr = unify_setxattr,
- .getxattr = unify_getxattr,
- .removexattr = unify_removexattr,
- .opendir = unify_opendir,
- .readdir = unify_readdir,
- .readdirp = unify_readdirp,
- .fsyncdir = unify_fsyncdir,
- .access = unify_access,
- .ftruncate = unify_ftruncate,
- .fstat = unify_fstat,
- .lk = unify_lk,
- .lookup = unify_lookup,
- .getdents = unify_getdents,
- .checksum = unify_checksum,
- .inodelk = unify_inodelk,
- .finodelk = unify_finodelk,
- .entrylk = unify_entrylk,
- .fentrylk = unify_fentrylk,
- .xattrop = unify_xattrop,
- .fxattrop = unify_fxattrop,
- .setattr = unify_setattr,
- .fsetattr = unify_fsetattr,
-};
-
-
-struct xlator_cbks cbks = {
- .forget = unify_forget,
-};
-
-struct volume_options options[] = {
- { .key = { "namespace" },
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = { "scheduler" },
- .value = { "alu", "rr", "random", "nufa", "switch" },
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"self-heal"},
- .value = { "foreground", "background", "off" },
- .type = GF_OPTION_TYPE_STR
- },
- /* TODO: remove it some time later */
- { .key = {"optimist"},
- .type = GF_OPTION_TYPE_BOOL
- },
-
- { .key = {NULL} },
-};
diff --git a/xlators/cluster/unify/src/unify.h b/xlators/cluster/unify/src/unify.h
deleted file mode 100644
index 3cfe725f43a..00000000000
--- a/xlators/cluster/unify/src/unify.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- Copyright (c) 2006-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#ifndef _UNIFY_H
-#define _UNIFY_H
-
-#include "scheduler.h"
-#include "list.h"
-#include "unify-mem-types.h"
-
-#define MAX_DIR_ENTRY_STRING (32 * 1024)
-
-#define ZR_UNIFY_SELF_HEAL_OFF 0
-#define ZR_UNIFY_FG_SELF_HEAL 1
-#define ZR_UNIFY_BG_SELF_HEAL 2
-
-/* Sometimes one should use completely random numbers.. its good :p */
-#define UNIFY_SELF_HEAL_GETDENTS_COUNT 512
-
-#define NS(xl) (((unify_private_t *)xl->private)->namespace)
-
-/* This is used to allocate memory for local structure */
-#define INIT_LOCAL(fr, loc) \
-do { \
- loc = GF_CALLOC (1, sizeof (unify_local_t), gf_unify_mt_unify_local_t); \
- ERR_ABORT (loc); \
- if (!loc) { \
- STACK_UNWIND (fr, -1, ENOMEM); \
- return 0; \
- } \
- fr->local = loc; \
- loc->op_ret = -1; \
- loc->op_errno = ENOENT; \
-} while (0)
-
-
-
-struct unify_private {
- /* Update this structure depending on requirement */
- void *scheduler; /* THIS SHOULD BE THE FIRST VARIABLE,
- if xlator is using scheduler */
- struct sched_ops *sched_ops; /* Scheduler options */
- xlator_t *namespace; /* ptr to namespace xlator */
- xlator_t **xl_array;
- gf_boolean_t optimist;
- int16_t child_count;
- int16_t num_child_up;
- uint8_t self_heal;
- uint8_t is_up;
- uint64_t inode_generation;
- gf_lock_t lock;
-};
-typedef struct unify_private unify_private_t;
-
-struct unify_self_heal_struct {
- uint8_t dir_checksum[NAME_MAX];
- uint8_t ns_dir_checksum[NAME_MAX];
- uint8_t file_checksum[NAME_MAX];
- uint8_t ns_file_checksum[NAME_MAX];
- off_t *offset_list;
- int *count_list;
- dir_entry_t **entry_list;
-};
-
-
-struct _unify_local_t {
- int32_t call_count;
- int32_t op_ret;
- int32_t op_errno;
- mode_t mode;
- off_t offset;
- dev_t dev;
- uid_t uid;
- gid_t gid;
- int32_t flags;
- int32_t entry_count;
- int32_t count; // dir_entry_t count;
- fd_t *fd;
- struct iatt stbuf;
- struct iatt stpre;
- struct iatt stpost;
- struct statvfs statvfs_buf;
- struct timespec tv[2];
- char *name;
- int32_t revalidate;
-
- ino_t ia_ino;
- nlink_t ia_nlink;
-
- dict_t *dict;
-
- int16_t *list;
- int16_t *new_list; /* Used only in case of rename */
- int16_t index;
-
- int32_t failed;
- int32_t return_eio; /* Used in case of different st-mode
- present for a given path */
-
- uint64_t inode_generation; /* used to store the per directory
- * inode_generation. Got from inode's ctx
- * of directory inodes
- */
-
- struct unify_self_heal_struct *sh_struct;
- loc_t loc1, loc2;
-
- struct iatt poststbuf;
- /* When not used for rename, old*
- * are used as the attrs for the current
- * parent directory.
- */
- struct iatt oldpreparent;
- struct iatt oldpostparent;
- struct iatt newpreparent;
- struct iatt newpostparent;
- int32_t wbflags;
-};
-typedef struct _unify_local_t unify_local_t;
-
-int32_t zr_unify_self_heal (call_frame_t *frame,
- xlator_t *this,
- unify_local_t *local);
-
-#endif /* _UNIFY_H */
diff --git a/xlators/debug/Makefile.am b/xlators/debug/Makefile.am
index b655554efec..88fac1c6d9e 100644
--- a/xlators/debug/Makefile.am
+++ b/xlators/debug/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = trace error-gen io-stats
+SUBDIRS = error-gen io-stats sink trace delay-gen
CLEANFILES =
diff --git a/xlators/features/access-control/Makefile.am b/xlators/debug/delay-gen/Makefile.am
index a985f42a877..a985f42a877 100644
--- a/xlators/features/access-control/Makefile.am
+++ b/xlators/debug/delay-gen/Makefile.am
diff --git a/xlators/debug/delay-gen/src/Makefile.am b/xlators/debug/delay-gen/src/Makefile.am
new file mode 100644
index 00000000000..8f758dec199
--- /dev/null
+++ b/xlators/debug/delay-gen/src/Makefile.am
@@ -0,0 +1,11 @@
+
+xlator_LTLIBRARIES = delay-gen.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
+delay_gen_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+delay_gen_la_SOURCES = delay-gen.c
+delay_gen_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+noinst_HEADERS = delay-gen.h delay-gen-mem-types.h delay-gen-messages.h
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS)
+CLEANFILES =
diff --git a/xlators/debug/delay-gen/src/delay-gen-mem-types.h b/xlators/debug/delay-gen/src/delay-gen-mem-types.h
new file mode 100644
index 00000000000..c89a9217193
--- /dev/null
+++ b/xlators/debug/delay-gen/src/delay-gen-mem-types.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ */
+
+#ifndef __DELAY_GEN_MEM_TYPES_H__
+#define __DELAY_GEN_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_delay_gen_mem_types_ {
+ gf_delay_gen_mt_dg_t = gf_common_mt_end + 1,
+ gf_delay_gen_mt_end
+};
+
+#endif /* __DELAY_GEN_MEM_TYPES_H__ */
diff --git a/xlators/debug/delay-gen/src/delay-gen-messages.h b/xlators/debug/delay-gen/src/delay-gen-messages.h
new file mode 100644
index 00000000000..bc98cec2885
--- /dev/null
+++ b/xlators/debug/delay-gen/src/delay-gen-messages.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ */
+
+#ifndef __DELAY_GEN_MESSAGES_H__
+#define __DELAY_GEN_MESSAGES_H__
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+#endif /* __DELAY_GEN_MESSAGES_H__ */
diff --git a/xlators/debug/delay-gen/src/delay-gen.c b/xlators/debug/delay-gen/src/delay-gen.c
new file mode 100644
index 00000000000..4698f1fd785
--- /dev/null
+++ b/xlators/debug/delay-gen/src/delay-gen.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ */
+
+#include "delay-gen.h"
+
+#define DELAY_GRANULARITY (1 << 20)
+
+#define DG_FOP(fop, name, frame, this, args...) \
+ do { \
+ delay_gen(this, fop); \
+ default_##name(frame, this, args); \
+ } while (0)
+
+int
+delay_gen(xlator_t *this, int fop)
+{
+ dg_t *dg = this->private;
+
+ if (!dg->enable[fop] || !dg->delay_ppm)
+ return 0;
+
+ if ((rand() % DELAY_GRANULARITY) < dg->delay_ppm)
+ gf_nanosleep(dg->delay_duration * GF_US_IN_NS);
+
+ return 0;
+}
+
+int32_t
+dg_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_RENAME, rename, frame, this, oldloc, newloc, xdata);
+ return 0;
+}
+
+int32_t
+dg_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_IPC, ipc, frame, this, op, xdata);
+ return 0;
+}
+
+int32_t
+dg_setactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ lock_migration_info_t *locklist, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_SETACTIVELK, setactivelk, frame, this, loc, locklist, xdata);
+ return 0;
+}
+
+int32_t
+dg_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FLUSH, flush, frame, this, fd, xdata);
+ return 0;
+}
+
+int32_t
+dg_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_READDIR, readdir, frame, this, fd, size, off, xdata);
+ return 0;
+}
+
+int32_t
+dg_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_SETXATTR, setxattr, frame, this, loc, dict, flags, xdata);
+ return 0;
+}
+
+int32_t
+dg_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_MKNOD, mknod, frame, this, loc, mode, rdev, umask, xdata);
+ return 0;
+}
+
+int32_t
+dg_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FSETXATTR, fsetxattr, frame, this, fd, dict, flags, xdata);
+ return 0;
+}
+
+int32_t
+dg_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_READ, readv, frame, this, fd, size, offset, flags, xdata);
+ return 0;
+}
+
+int32_t
+dg_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_INODELK, inodelk, frame, this, volume, loc, cmd, lock, xdata);
+ return 0;
+}
+
+int32_t
+dg_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FREMOVEXATTR, fremovexattr, frame, this, fd, name, xdata);
+ return 0;
+}
+
+int32_t
+dg_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_OPEN, open, frame, this, loc, flags, fd, xdata);
+ return 0;
+}
+
+int32_t
+dg_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_XATTROP, xattrop, frame, this, loc, flags, dict, xdata);
+ return 0;
+}
+
+int32_t
+dg_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_ENTRYLK, entrylk, frame, this, volume, loc, basename, cmd,
+ type, xdata);
+ return 0;
+}
+
+int32_t
+dg_getactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_GETACTIVELK, getactivelk, frame, this, loc, xdata);
+ return 0;
+}
+
+int32_t
+dg_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FINODELK, finodelk, frame, this, volume, fd, cmd, lock,
+ xdata);
+ return 0;
+}
+
+int32_t
+dg_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_CREATE, create, frame, this, loc, flags, mode, umask, fd,
+ xdata);
+ return 0;
+}
+
+int32_t
+dg_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_DISCARD, discard, frame, this, fd, offset, len, xdata);
+ return 0;
+}
+
+int32_t
+dg_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_MKDIR, mkdir, frame, this, loc, mode, umask, xdata);
+ return 0;
+}
+
+int32_t
+dg_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_LK, lk, frame, this, fd, cmd, lock, xdata);
+ return 0;
+}
+
+int32_t
+dg_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t off, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_WRITE, writev, frame, this, fd, vector, count, off, flags,
+ iobref, xdata);
+ return 0;
+}
+
+int32_t
+dg_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_ACCESS, access, frame, this, loc, mask, xdata);
+ return 0;
+}
+
+int32_t
+dg_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_LOOKUP, lookup, frame, this, loc, xdata);
+ return 0;
+}
+
+int32_t
+dg_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_RMDIR, rmdir, frame, this, loc, flags, xdata);
+ return 0;
+}
+
+int32_t
+dg_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FALLOCATE, fallocate, frame, this, fd, keep_size, offset, len,
+ xdata);
+ return 0;
+}
+
+int32_t
+dg_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FSTAT, fstat, frame, this, fd, xdata);
+ return 0;
+}
+
+int32_t
+dg_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct gf_lease *lease, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_LEASE, lease, frame, this, loc, lease, xdata);
+ return 0;
+}
+
+int32_t
+dg_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_STAT, stat, frame, this, loc, xdata);
+ return 0;
+}
+
+int32_t
+dg_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_TRUNCATE, truncate, frame, this, loc, offset, xdata);
+ return 0;
+}
+
+int32_t
+dg_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_GETXATTR, getxattr, frame, this, loc, name, xdata);
+ return 0;
+}
+
+int32_t
+dg_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_SYMLINK, symlink, frame, this, linkpath, loc, umask, xdata);
+ return 0;
+}
+
+int32_t
+dg_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_ZEROFILL, zerofill, frame, this, fd, offset, len, xdata);
+ return 0;
+}
+
+int32_t
+dg_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FSYNCDIR, fsyncdir, frame, this, fd, flags, xdata);
+ return 0;
+}
+
+int32_t
+dg_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FGETXATTR, fgetxattr, frame, this, fd, name, xdata);
+ return 0;
+}
+
+int32_t
+dg_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_READDIRP, readdirp, frame, this, fd, size, off, xdata);
+ return 0;
+}
+
+int32_t
+dg_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_LINK, link, frame, this, oldloc, newloc, xdata);
+ return 0;
+}
+
+int32_t
+dg_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FXATTROP, fxattrop, frame, this, fd, flags, dict, xdata);
+ return 0;
+}
+
+int32_t
+dg_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FTRUNCATE, ftruncate, frame, this, fd, offset, xdata);
+ return 0;
+}
+
+int32_t
+dg_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ int32_t len, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_RCHECKSUM, rchecksum, frame, this, fd, offset, len, xdata);
+ return 0;
+}
+
+int32_t
+dg_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_UNLINK, unlink, frame, this, loc, flags, xdata);
+ return 0;
+}
+
+int32_t
+dg_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FENTRYLK, fentrylk, frame, this, volume, fd, basename, cmd,
+ type, xdata);
+ return 0;
+}
+
+int32_t
+dg_getspec(call_frame_t *frame, xlator_t *this, const char *key, int32_t flags)
+{
+ DG_FOP(GF_FOP_GETSPEC, getspec, frame, this, key, flags);
+ return 0;
+}
+
+int32_t
+dg_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_SETATTR, setattr, frame, this, loc, stbuf, valid, xdata);
+ return 0;
+}
+
+int32_t
+dg_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FSYNC, fsync, frame, this, fd, flags, xdata);
+ return 0;
+}
+
+int32_t
+dg_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_STATFS, statfs, frame, this, loc, xdata);
+ return 0;
+}
+
+int32_t
+dg_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_SEEK, seek, frame, this, fd, offset, what, xdata);
+ return 0;
+}
+
+int32_t
+dg_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_FSETATTR, fsetattr, frame, this, fd, stbuf, valid, xdata);
+ return 0;
+}
+
+int32_t
+dg_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_OPENDIR, opendir, frame, this, loc, fd, xdata);
+ return 0;
+}
+
+int32_t
+dg_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
+{
+ DG_FOP(GF_FOP_READLINK, readlink, frame, this, loc, size, xdata);
+ return 0;
+}
+
+int32_t
+dg_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ DG_FOP(GF_FOP_REMOVEXATTR, removexattr, frame, this, loc, name, xdata);
+ return 0;
+}
+
+int32_t
+dg_forget(xlator_t *this, inode_t *inode)
+{
+ return 0;
+}
+
+int32_t
+dg_release(xlator_t *this, fd_t *fd)
+{
+ return 0;
+}
+
+int32_t
+dg_releasedir(xlator_t *this, fd_t *fd)
+{
+ return 0;
+}
+
+static int
+delay_gen_parse_fill_fops(dg_t *dg, char *enable_fops)
+{
+ char *op_no_str = NULL;
+ int op_no = -1;
+ int i = 0;
+ int ret = 0;
+ xlator_t *this = THIS;
+ char *saveptr = NULL;
+ char *dup_enable_fops = NULL;
+
+ if (strlen(enable_fops) == 0) {
+ for (i = GF_FOP_NULL + 1; i < GF_FOP_MAXVALUE; i++)
+ dg->enable[i] = 1;
+ } else {
+ dup_enable_fops = gf_strdup(enable_fops);
+ if (!dup_enable_fops) {
+ ret = -1;
+ goto out;
+ }
+ op_no_str = strtok_r(dup_enable_fops, ",", &saveptr);
+ while (op_no_str) {
+ op_no = gf_fop_int(op_no_str);
+ if (op_no == -1) {
+ gf_log(this->name, GF_LOG_WARNING, "Wrong option value %s",
+ op_no_str);
+ ret = -1;
+ goto out;
+ } else {
+ dg->enable[op_no] = 1;
+ }
+
+ op_no_str = strtok_r(NULL, ",", &saveptr);
+ }
+ }
+out:
+ GF_FREE(dup_enable_fops);
+ return ret;
+}
+
+void
+delay_gen_set_delay_ppm(dg_t *dg, double percent)
+{
+ double ppm;
+
+ ppm = (percent / 100.0) * (double)DELAY_GRANULARITY;
+ dg->delay_ppm = ppm;
+}
+
+int32_t
+init(xlator_t *this)
+{
+ dg_t *dg = NULL;
+ int32_t ret = 0;
+ double delay_percent = 0;
+ char *delay_enable_fops = NULL;
+
+ if (!this->children || this->children->next) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "delay-gen not configured with one subvolume");
+ ret = -1;
+ goto out;
+ }
+
+ if (!this->parents) {
+ gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+ }
+
+ dg = GF_CALLOC(1, sizeof(*dg), gf_delay_gen_mt_dg_t);
+
+ if (!dg) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = -1;
+
+ GF_OPTION_INIT("delay-percentage", delay_percent, percent, out);
+ GF_OPTION_INIT("enable", delay_enable_fops, str, out);
+ GF_OPTION_INIT("delay-duration", dg->delay_duration, int32, out);
+
+ delay_gen_set_delay_ppm(dg, delay_percent);
+
+ ret = delay_gen_parse_fill_fops(dg, delay_enable_fops);
+ if (ret)
+ goto out;
+
+ this->private = dg;
+
+ ret = 0;
+out:
+ if (ret)
+ GF_FREE(dg);
+ return ret;
+}
+
+void
+fini(xlator_t *this)
+{
+ GF_FREE(this->private);
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init(this, gf_delay_gen_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Memory accounting init"
+ " failed");
+ return ret;
+ }
+
+ return ret;
+}
+
+int32_t
+reconfigure(xlator_t *this, dict_t *dict)
+{
+ /*At the moment I don't see any need to implement this. In future
+ *if this is needed we can add code here.
+ */
+ return 0;
+}
+
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+ return default_notify(this, event, data);
+}
+
+struct xlator_fops fops = {
+ .rename = dg_rename,
+ .ipc = dg_ipc,
+ .setactivelk = dg_setactivelk,
+ .flush = dg_flush,
+ .readdir = dg_readdir,
+ .setxattr = dg_setxattr,
+ .mknod = dg_mknod,
+ .fsetxattr = dg_fsetxattr,
+ .readv = dg_readv,
+ .inodelk = dg_inodelk,
+ .fremovexattr = dg_fremovexattr,
+ .open = dg_open,
+ .xattrop = dg_xattrop,
+ .entrylk = dg_entrylk,
+ .getactivelk = dg_getactivelk,
+ .finodelk = dg_finodelk,
+ .create = dg_create,
+ .discard = dg_discard,
+ .mkdir = dg_mkdir,
+ .lk = dg_lk,
+ .writev = dg_writev,
+ .access = dg_access,
+ .lookup = dg_lookup,
+ .rmdir = dg_rmdir,
+ .fallocate = dg_fallocate,
+ .fstat = dg_fstat,
+ .lease = dg_lease,
+ .stat = dg_stat,
+ .truncate = dg_truncate,
+ .getxattr = dg_getxattr,
+ .symlink = dg_symlink,
+ .zerofill = dg_zerofill,
+ .fsyncdir = dg_fsyncdir,
+ .fgetxattr = dg_fgetxattr,
+ .readdirp = dg_readdirp,
+ .link = dg_link,
+ .fxattrop = dg_fxattrop,
+ .ftruncate = dg_ftruncate,
+ .rchecksum = dg_rchecksum,
+ .unlink = dg_unlink,
+ .fentrylk = dg_fentrylk,
+ .getspec = dg_getspec,
+ .setattr = dg_setattr,
+ .fsync = dg_fsync,
+ .statfs = dg_statfs,
+ .seek = dg_seek,
+ .fsetattr = dg_fsetattr,
+ .opendir = dg_opendir,
+ .readlink = dg_readlink,
+ .removexattr = dg_removexattr,
+};
+
+struct xlator_cbks cbks = {
+ .forget = dg_forget,
+ .release = dg_release,
+ .releasedir = dg_releasedir,
+};
+
+struct volume_options options[] = {
+ {
+ .key = {"delay-percentage"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .default_value = "10%",
+ .description = "Percentage delay of operations when enabled.",
+ .op_version = {GD_OP_VERSION_3_13_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {"delay-gen"},
+ },
+
+ {
+ .key = {"delay-duration"},
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Delay duration in micro seconds",
+ .default_value = "100000",
+ .op_version = {GD_OP_VERSION_3_13_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {"delay-gen"},
+ },
+
+ {
+ .key = {"enable"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Accepts a string which takes ',' separated fop "
+ "strings to denote which fops are enabled for delay",
+ .op_version = {GD_OP_VERSION_3_13_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {"delay-gen"},
+ .default_value = "",
+ },
+
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {GD_OP_VERSION_3_12_0},
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "delay-gen",
+ .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/debug/delay-gen/src/delay-gen.h b/xlators/debug/delay-gen/src/delay-gen.h
new file mode 100644
index 00000000000..afa95e5eb2d
--- /dev/null
+++ b/xlators/debug/delay-gen/src/delay-gen.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ */
+
+#ifndef __DELAY_GEN_H__
+#define __DELAY_GEN_H__
+
+#include "delay-gen-mem-types.h"
+#include "delay-gen-messages.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+typedef struct {
+ int enable[GF_FOP_MAXVALUE];
+ int op_count;
+ int delay_ppm;
+ int delay_duration;
+} dg_t;
+
+#endif /* __DELAY_GEN_H__ */
diff --git a/xlators/debug/error-gen/src/Makefile.am b/xlators/debug/error-gen/src/Makefile.am
index f353b61e69c..038d2e8e66d 100644
--- a/xlators/debug/error-gen/src/Makefile.am
+++ b/xlators/debug/error-gen/src/Makefile.am
@@ -2,15 +2,17 @@
xlator_LTLIBRARIES = error-gen.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
-error_gen_la_LDFLAGS = -module -avoidversion
+error_gen_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
error_gen_la_SOURCES = error-gen.c
error_gen_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = error-gen.h
+noinst_HEADERS = error-gen.h error-gen-mem-types.h
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
CLEANFILES =
diff --git a/xlators/debug/error-gen/src/error-gen-mem-types.h b/xlators/debug/error-gen/src/error-gen-mem-types.h
new file mode 100644
index 00000000000..b9b713af8fc
--- /dev/null
+++ b/xlators/debug/error-gen/src/error-gen-mem-types.h
@@ -0,0 +1,20 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __ERROR_GEN_MEM_TYPES_H__
+#define __ERROR_GEN_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_error_gen_mem_types_ {
+ gf_error_gen_mt_eg_t = gf_common_mt_end + 1,
+ gf_error_gen_mt_end
+};
+#endif
diff --git a/xlators/debug/error-gen/src/error-gen.c b/xlators/debug/error-gen/src/error-gen.c
index 84a42072770..d45655ef4c3 100644
--- a/xlators/debug/error-gen/src/error-gen.c
+++ b/xlators/debug/error-gen/src/error-gen.c
@@ -1,2024 +1,1663 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
-#include "xlator.h"
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/xlator.h>
#include "error-gen.h"
+#include <glusterfs/statedump.h>
+#include <glusterfs/defaults.h>
+
+/*
+ * The user can specify an error probability as a float percentage, but we
+ * store it internally as a numerator with this as the denominator. When it's
+ * used, it's like this:
+ *
+ * (rand() % FAILURE_GRANULARITY) < error_rate
+ *
+ * To minimize rounding errors from the modulo operation, it's good for this to
+ * be a power of two.
+ *
+ * (BTW this is just the normal case. If "random-failure" is set, that does
+ * something completely different and this number is irrelevant. See error_gen
+ * for the legacy code.)
+ */
+#define FAILURE_GRANULARITY (1 << 20)
sys_error_t error_no_list[] = {
- [GF_FOP_LOOKUP] = { .error_no_count = 4,
- .error_no = {ENOENT,ENOTDIR,
- ENAMETOOLONG,EAGAIN}},
- [GF_FOP_STAT] = { .error_no_count = 7,
- .error_no = {EACCES,EBADF,EFAULT,
- ENAMETOOLONG,ENOENT,
- ENOMEM,ENOTDIR}},
- [GF_FOP_READLINK] = { .error_no_count = 8,
- .error_no = {EACCES,EFAULT,EINVAL,EIO,
- ENAMETOOLONG,ENOENT,ENOMEM,
- ENOTDIR}},
- [GF_FOP_MKNOD] = { .error_no_count = 11,
- .error_no = {EACCES,EEXIST,EFAULT,
- EINVAL,ENAMETOOLONG,
- ENOENT,ENOMEM,ENOSPC,
- ENOTDIR,EPERM,EROFS}},
- [GF_FOP_MKDIR] = { .error_no_count = 10,
- .error_no = {EACCES,EEXIST,EFAULT,
- ENAMETOOLONG,ENOENT,
- ENOMEM,ENOSPC,ENOTDIR,
- EPERM,EROFS}},
- [GF_FOP_UNLINK] = { .error_no_count = 10,
- .error_no = {EACCES,EBUSY,EFAULT,EIO,
- EISDIR,ENAMETOOLONG,
- ENOENT,ENOMEM,ENOTDIR,
- EPERM,EROFS}},
- [GF_FOP_RMDIR] = { .error_no_count = 8,
- .error_no = {EACCES,EBUSY,EFAULT,
- ENOMEM,ENOTDIR,ENOTEMPTY,
- EPERM,EROFS}},
- [GF_FOP_SYMLINK] = { .error_no_count = 11,
- .error_no = {EACCES,EEXIST,EFAULT,EIO,
- ENAMETOOLONG,ENOENT,ENOMEM,
- ENOSPC,ENOTDIR,EPERM,
- EROFS}},
- [GF_FOP_RENAME] = { .error_no_count = 13,
- .error_no = {EACCES,EBUSY,EFAULT,
- EINVAL,EISDIR,EMLINK,
- ENAMETOOLONG,ENOENT,ENOMEM,
- ENOSPC,ENOTDIR,EEXIST,
- EXDEV}},
- [GF_FOP_LINK] = { .error_no_count = 13,
- .error_no = {EACCES,EFAULT,EEXIST,EIO,
- EMLINK,ENAMETOOLONG,
- ENOENT,ENOMEM,ENOSPC,
- ENOTDIR,EPERM,EROFS,
- EXDEV}},
- [GF_FOP_TRUNCATE] = { .error_no_count = 10,
- .error_no = {EACCES,EFAULT,EFBIG,
- EINTR,EINVAL,EIO,EISDIR,
- ENAMETOOLONG,ENOENT,
- EISDIR}},
- [GF_FOP_CREATE] = {.error_no_count = 10,
- .error_no = {EACCES,EEXIST,EFAULT,
- EISDIR,EMFILE,ENAMETOOLONG,
- ENFILE,ENODEV,ENOENT,
- ENODEV}},
- [GF_FOP_OPEN] = { .error_no_count = 10,
- .error_no = {EACCES,EEXIST,EFAULT,
- EISDIR,EMFILE,
- ENAMETOOLONG,ENFILE,
- ENODEV,ENOENT,ENOMEM}},
- [GF_FOP_READ] = { .error_no_count = 5,
- .error_no = {EINVAL,EBADF,EFAULT,EISDIR,
- ENAMETOOLONG}},
- [GF_FOP_WRITE] = { .error_no_count = 5,
- .error_no = {EINVAL,EBADF,EFAULT,EISDIR,
- ENAMETOOLONG}},
- [GF_FOP_STATFS] = {.error_no_count = 10,
- .error_no = {EACCES,EBADF,EFAULT,EINTR,
- EIO,ENAMETOOLONG,ENOENT,
- ENOMEM,ENOSYS,ENOTDIR}},
- [GF_FOP_FLUSH] = { .error_no_count = 5,
- .error_no = {EACCES,EFAULT,
- ENAMETOOLONG,ENOSYS,
- ENOENT}},
- [GF_FOP_FSYNC] = { .error_no_count = 4,
- .error_no = {EBADF,EIO,EROFS,EINVAL}},
- [GF_FOP_SETXATTR] = { .error_no_count = 4,
- .error_no = {EACCES,EBADF,EINTR,
- ENAMETOOLONG}},
- [GF_FOP_GETXATTR] = { .error_no_count = 4,
- .error_no = {EACCES,EBADF,ENAMETOOLONG,
- EINTR}},
- [GF_FOP_REMOVEXATTR] = { .error_no_count = 4,
- .error_no = {EACCES,EBADF,ENAMETOOLONG,
- EINTR}},
- [GF_FOP_OPENDIR] = { .error_no_count = 8,
- .error_no = {EACCES,EEXIST,EFAULT,
- EISDIR,EMFILE,
- ENAMETOOLONG,ENFILE,
- ENODEV}},
- [GF_FOP_READDIR] = { .error_no_count = 5,
- .error_no = {EINVAL,EACCES,EBADF,
- EMFILE,ENOENT}},
- [GF_FOP_READDIRP] = { .error_no_count = 5,
- .error_no = {EINVAL,EACCES,EBADF,
- EMFILE,ENOENT}},
- [GF_FOP_FSYNCDIR] = { .error_no_count = 4,
- .error_no = {EBADF,EIO,EROFS,EINVAL}},
- [GF_FOP_ACCESS] = { .error_no_count = 8,
- .error_no = {EACCES,ENAMETOOLONG,
- ENOENT,ENOTDIR,EROFS,
- EFAULT,EINVAL,EIO}},
- [GF_FOP_FTRUNCATE] = { .error_no_count = 9,
- .error_no = {EACCES,EFAULT,EFBIG,
- EINTR,EINVAL,EIO,EISDIR,
- ENAMETOOLONG,ENOENT}},
- [GF_FOP_FSTAT] = { .error_no_count = 7,
- .error_no = {EACCES,EBADF,EFAULT,
- ENAMETOOLONG,ENOENT,
- ENOMEM,ENOTDIR}},
- [GF_FOP_LK] = { .error_no_count = 4,
- .error_no = {EACCES,EFAULT,ENOENT,
- EINTR}},
- [GF_FOP_XATTROP] = { .error_no_count = 5,
- .error_no = {EACCES,EFAULT,
- ENAMETOOLONG,ENOSYS,
- ENOENT}},
- [GF_FOP_FXATTROP] = { .error_no_count = 4,
- .error_no = {EBADF,EIO,EROFS,EINVAL}},
- [GF_FOP_INODELK] = { .error_no_count = 4,
- .error_no = {EACCES,EBADF,EINTR,
- ENAMETOOLONG}},
- [GF_FOP_FINODELK] = { .error_no_count = 4,
- .error_no = {EACCES,EBADF,EINTR,
- ENAMETOOLONG}},
- [GF_FOP_ENTRYLK] = { .error_no_count = 4,
- .error_no = {EACCES,EBADF,
- ENAMETOOLONG,EINTR}},
- [GF_FOP_FENTRYLK] = { .error_no_count = 10,
- .error_no = {EACCES,EEXIST,EFAULT,
- EISDIR,EMFILE,
- ENAMETOOLONG,ENFILE,
- ENODEV,ENOENT,ENOMEM}},
- [GF_FOP_SETATTR] = {.error_no_count = 11,
- .error_no = {EACCES,EFAULT,EIO,
- ENAMETOOLONG,ENOENT,
- ENOMEM,ENOTDIR,EPERM,
- EROFS,EBADF,EIO}},
- [GF_FOP_FSETATTR] = { .error_no_count = 11,
- .error_no = {EACCES,EFAULT,EIO,
- ENAMETOOLONG,ENOENT,
- ENOMEM,ENOTDIR,EPERM,
- EROFS,EBADF,EIO}},
- [GF_FOP_GETSPEC] = { .error_no_count = 4,
- .error_no = {EACCES,EBADF,ENAMETOOLONG,
- EINTR}}
-};
+ [GF_FOP_LOOKUP] = {.error_no_count = 4,
+ .error_no = {ENOENT, ENOTDIR, ENAMETOOLONG, EAGAIN}},
+ [GF_FOP_STAT] = {.error_no_count = 6,
+ .error_no = {EACCES, EFAULT, ENAMETOOLONG, ENOENT, ENOMEM,
+ ENOTDIR}},
+ [GF_FOP_READLINK] = {.error_no_count = 8,
+ .error_no = {EACCES, EFAULT, EINVAL, EIO, ENAMETOOLONG,
+ ENOENT, ENOMEM, ENOTDIR}},
+ [GF_FOP_MKNOD] = {.error_no_count = 11,
+ .error_no = {EACCES, EEXIST, EFAULT, EINVAL, ENAMETOOLONG,
+ ENOENT, ENOMEM, ENOSPC, ENOTDIR, EPERM,
+ EROFS}},
+ [GF_FOP_MKDIR] = {.error_no_count = 10,
+ .error_no = {EACCES, EEXIST, EFAULT, ENAMETOOLONG, ENOENT,
+ ENOMEM, ENOSPC, ENOTDIR, EPERM, EROFS}},
+ [GF_FOP_UNLINK] = {.error_no_count = 10,
+ .error_no = {EACCES, EBUSY, EFAULT, EIO, EISDIR,
+ ENAMETOOLONG, ENOENT, ENOMEM, ENOTDIR,
+ EPERM, EROFS}},
+ [GF_FOP_RMDIR] = {.error_no_count = 8,
+ .error_no = {EACCES, EBUSY, EFAULT, ENOMEM, ENOTDIR,
+ ENOTEMPTY, EPERM, EROFS}},
+ [GF_FOP_SYMLINK] = {.error_no_count = 11,
+ .error_no = {EACCES, EEXIST, EFAULT, EIO, ENAMETOOLONG,
+ ENOENT, ENOMEM, ENOSPC, ENOTDIR, EPERM,
+ EROFS}},
+ [GF_FOP_RENAME] = {.error_no_count = 13,
+ .error_no = {EACCES, EBUSY, EFAULT, EINVAL, EISDIR,
+ EMLINK, ENAMETOOLONG, ENOENT, ENOMEM,
+ ENOSPC, ENOTDIR, EEXIST, EXDEV}},
+ [GF_FOP_LINK] = {.error_no_count = 13,
+ .error_no = {EACCES, EFAULT, EEXIST, EIO, EMLINK,
+ ENAMETOOLONG, ENOENT, ENOMEM, ENOSPC, ENOTDIR,
+ EPERM, EROFS, EXDEV}},
+ [GF_FOP_TRUNCATE] = {.error_no_count = 10,
+ .error_no = {EACCES, EFAULT, EFBIG, EINTR, EINVAL, EIO,
+ EISDIR, ENAMETOOLONG, ENOENT, EISDIR}},
+ [GF_FOP_CREATE] = {.error_no_count = 10,
+ .error_no = {EACCES, EEXIST, EFAULT, EISDIR, EMFILE,
+ ENAMETOOLONG, ENFILE, ENODEV, ENOENT,
+ ENODEV}},
+ [GF_FOP_OPEN] = {.error_no_count = 10,
+ .error_no = {EACCES, EEXIST, EFAULT, EISDIR, EMFILE,
+ ENAMETOOLONG, ENFILE, ENODEV, ENOENT,
+ ENOMEM}},
+ [GF_FOP_READ] = {.error_no_count = 5,
+ .error_no = {EINVAL, EBADF, EFAULT, EISDIR, ENAMETOOLONG}},
+ [GF_FOP_WRITE] = {.error_no_count = 7,
+ .error_no = {EINVAL, EBADF, EFAULT, EISDIR, ENAMETOOLONG,
+ ENOSPC, GF_ERROR_SHORT_WRITE}},
+ [GF_FOP_STATFS] = {.error_no_count = 9,
+ .error_no = {EACCES, EFAULT, EINTR, EIO, ENAMETOOLONG,
+ ENOENT, ENOMEM, ENOSYS, ENOTDIR}},
+ [GF_FOP_FLUSH] = {.error_no_count = 5,
+ .error_no = {EACCES, EFAULT, ENAMETOOLONG, ENOSYS,
+ ENOENT}},
+ [GF_FOP_FSYNC] = {.error_no_count = 4,
+ .error_no = {EBADF, EIO, EROFS, EINVAL}},
+ [GF_FOP_SETXATTR] = {.error_no_count = 3,
+ .error_no = {EACCES, EINTR, ENAMETOOLONG}},
+ [GF_FOP_GETXATTR] = {.error_no_count = 3,
+ .error_no = {EACCES, ENAMETOOLONG, EINTR}},
+ [GF_FOP_REMOVEXATTR] = {.error_no_count = 3,
+ .error_no = {EACCES, ENAMETOOLONG, EINTR}},
+ [GF_FOP_FSETXATTR] = {.error_no_count = 4,
+ .error_no = {EACCES, EBADF, EINTR, ENAMETOOLONG}},
+ [GF_FOP_FGETXATTR] = {.error_no_count = 4,
+ .error_no = {EACCES, EBADF, ENAMETOOLONG, EINTR}},
+ [GF_FOP_FREMOVEXATTR] = {.error_no_count = 4,
+ .error_no = {EACCES, EBADF, ENAMETOOLONG, EINTR}},
+ [GF_FOP_OPENDIR] = {.error_no_count = 8,
+ .error_no = {EACCES, EEXIST, EFAULT, EISDIR, EMFILE,
+ ENAMETOOLONG, ENFILE, ENODEV}},
+ [GF_FOP_READDIR] = {.error_no_count = 5,
+ .error_no = {EINVAL, EACCES, EBADF, EMFILE, ENOENT}},
+ [GF_FOP_READDIRP] = {.error_no_count = 5,
+ .error_no = {EINVAL, EACCES, EBADF, EMFILE, ENOENT}},
+ [GF_FOP_FSYNCDIR] = {.error_no_count = 4,
+ .error_no = {EBADF, EIO, EROFS, EINVAL}},
+ [GF_FOP_ACCESS] = {.error_no_count = 8,
+ .error_no = {EACCES, ENAMETOOLONG, ENOENT, ENOTDIR,
+ EROFS, EFAULT, EINVAL, EIO}},
+ [GF_FOP_FTRUNCATE] = {.error_no_count = 9,
+ .error_no = {EACCES, EFAULT, EFBIG, EINTR, EINVAL,
+ EIO, EISDIR, ENAMETOOLONG, ENOENT}},
+ [GF_FOP_FSTAT] = {.error_no_count = 7,
+ .error_no = {EACCES, EBADF, EFAULT, ENAMETOOLONG, ENOENT,
+ ENOMEM, ENOTDIR}},
+ [GF_FOP_LK] = {.error_no_count = 4,
+ .error_no = {EACCES, EFAULT, ENOENT, EINTR}},
+ [GF_FOP_XATTROP] = {.error_no_count = 5,
+ .error_no = {EACCES, EFAULT, ENAMETOOLONG, ENOSYS,
+ ENOENT}},
+ [GF_FOP_FXATTROP] = {.error_no_count = 4,
+ .error_no = {EBADF, EIO, EROFS, EINVAL}},
+ [GF_FOP_INODELK] = {.error_no_count = 3,
+ .error_no = {EACCES, EINTR, ENAMETOOLONG}},
+ [GF_FOP_FINODELK] = {.error_no_count = 4,
+ .error_no = {EACCES, EBADF, EINTR, ENAMETOOLONG}},
+ [GF_FOP_ENTRYLK] = {.error_no_count = 3,
+ .error_no = {EACCES, ENAMETOOLONG, EINTR}},
+ [GF_FOP_FENTRYLK] = {.error_no_count = 10,
+ .error_no = {EACCES, EEXIST, EFAULT, EISDIR, EMFILE,
+ ENAMETOOLONG, ENFILE, ENODEV, ENOENT,
+ ENOMEM}},
+ [GF_FOP_SETATTR] = {.error_no_count = 10,
+ .error_no = {EACCES, EFAULT, EIO, ENAMETOOLONG, ENOENT,
+ ENOMEM, ENOTDIR, EPERM, EROFS, EIO}},
+ [GF_FOP_FSETATTR] = {.error_no_count = 11,
+ .error_no = {EACCES, EFAULT, EIO, ENAMETOOLONG, ENOENT,
+ ENOMEM, ENOTDIR, EPERM, EROFS, EBADF,
+ EIO}},
+ [GF_FOP_GETSPEC] = {.error_no_count = 3,
+ .error_no = {EACCES, ENAMETOOLONG, EINTR}}};
+
+int
+generate_rand_no(int op_no)
+{
+ int rand_no = 0;
+ int error_no_list_size = 0;
+
+ error_no_list_size = sizeof(error_no_list) / sizeof(error_no_list[0]);
+
+ if (op_no < error_no_list_size)
+ /* coverity[DC.WEAK_CRYPTO] */
+ rand_no = rand() % error_no_list[op_no].error_no_count;
+ return rand_no;
+}
+
+int
+conv_errno_to_int(char **error_no)
+{
+ if (!strcmp((*error_no), "ENOENT"))
+ return ENOENT;
+ else if (!strcmp((*error_no), "ENOTDIR"))
+ return ENOTDIR;
+ else if (!strcmp((*error_no), "ENAMETOOLONG"))
+ return ENAMETOOLONG;
+ else if (!strcmp((*error_no), "EACCES"))
+ return EACCES;
+ else if (!strcmp((*error_no), "EBADF"))
+ return EBADF;
+ else if (!strcmp((*error_no), "EFAULT"))
+ return EFAULT;
+ else if (!strcmp((*error_no), "ENOMEM"))
+ return ENOMEM;
+ else if (!strcmp((*error_no), "EINVAL"))
+ return EINVAL;
+ else if (!strcmp((*error_no), "EIO"))
+ return EIO;
+ else if (!strcmp((*error_no), "EEXIST"))
+ return EEXIST;
+ else if (!strcmp((*error_no), "ENOSPC"))
+ return ENOSPC;
+ else if (!strcmp((*error_no), "EPERM"))
+ return EPERM;
+ else if (!strcmp((*error_no), "EROFS"))
+ return EROFS;
+ else if (!strcmp((*error_no), "EBUSY"))
+ return EBUSY;
+ else if (!strcmp((*error_no), "EISDIR"))
+ return EISDIR;
+ else if (!strcmp((*error_no), "ENOTEMPTY"))
+ return ENOTEMPTY;
+ else if (!strcmp((*error_no), "EMLINK"))
+ return EMLINK;
+ else if (!strcmp((*error_no), "ENODEV"))
+ return ENODEV;
+ else if (!strcmp((*error_no), "EXDEV"))
+ return EXDEV;
+ else if (!strcmp((*error_no), "EMFILE"))
+ return EMFILE;
+ else if (!strcmp((*error_no), "ENFILE"))
+ return ENFILE;
+ else if (!strcmp((*error_no), "ENOSYS"))
+ return ENOSYS;
+ else if (!strcmp((*error_no), "EINTR"))
+ return EINTR;
+ else if (!strcmp((*error_no), "EFBIG"))
+ return EFBIG;
+ else if (!strcmp((*error_no), "GF_ERROR_SHORT_WRITE"))
+ return GF_ERROR_SHORT_WRITE;
+ else
+ return EAGAIN;
+}
+
+int
+error_gen(xlator_t *this, int op_no)
+{
+ eg_t *egp = NULL;
+ int count = 0;
+ int error_no_int = 0;
+ int rand_no = 0;
+ int ret = 0;
+ gf_boolean_t should_err = _gf_false;
+ int error_no_list_size = 0;
+
+ egp = this->private;
+
+ if (egp->random_failure) {
+ /*
+ * I honestly don't know why anyone would use this "feature"
+ * but I'll try to preserve its functionality anyway. Without
+ * locking twice to update failure_iter_no and egp->op_count
+ * separately, then not locking at all to update
+ * egp->failure_iter_no. That's not needed for compatibility,
+ * and it's abhorrently wrong. I have *some* standards.
+ */
+ LOCK(&egp->lock);
+ {
+ count = ++(egp->op_count);
+ error_no_int = egp->error_no_int;
+ if ((count % egp->failure_iter_no) == 0) {
+ egp->op_count = 0;
+ /* coverity[DC.WEAK_CRYPTO] */
+ egp->failure_iter_no = 3 + (rand() % GF_UNIVERSAL_ANSWER);
+ should_err = _gf_true;
+ }
+ }
+ UNLOCK(&egp->lock);
+ } else {
+ /*
+ * It turns out that rand() is almost universally implemented
+ * as a linear congruential PRNG, which is about as cheap as
+ * it gets. This gets us real random behavior, including
+ * phenomena like streaks and dry spells, with controllable
+ * long-term probability, cheaply.
+ */
+ if ((rand() % FAILURE_GRANULARITY) < egp->failure_iter_no) {
+ should_err = _gf_true;
+ }
+ }
+
+ error_no_list_size = sizeof(error_no_list) / sizeof(error_no_list[0]);
+ if (should_err) {
+ if (error_no_int)
+ ret = error_no_int;
+ else {
+ rand_no = generate_rand_no(op_no);
+ if (op_no >= error_no_list_size)
+ op_no = 0;
+ if (rand_no >= error_no_list[op_no].error_no_count)
+ rand_no = 0;
+ ret = error_no_list[op_no].error_no[rand_no];
+ }
+ }
+
+ return ret;
+}
int
-generate_rand_no (int op_no)
+error_gen_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- int rand_no = 0;
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+
+ egp = this->private;
+ enable = egp->enable[GF_FOP_LOOKUP];
+
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_LOOKUP);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL);
+ return 0;
+ }
- if (op_no < GF_FOP_MAXVALUE)
- rand_no = rand () % error_no_list[op_no].error_no_count;
- return rand_no;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup,
+ loc, xdata);
+ return 0;
}
int
-conv_errno_to_int (char **error_no)
+error_gen_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- if (!strcmp ((*error_no), "ENOENT"))
- return ENOENT;
- else if (!strcmp ((*error_no), "ENOTDIR"))
- return ENOTDIR;
- else if (!strcmp ((*error_no), "ENAMETOOLONG"))
- return ENAMETOOLONG;
- else if (!strcmp ((*error_no), "EACCES"))
- return EACCES;
- else if (!strcmp ((*error_no), "EBADF"))
- return EBADF;
- else if (!strcmp ((*error_no), "EFAULT"))
- return EFAULT;
- else if (!strcmp ((*error_no), "ENOMEM"))
- return ENOMEM;
- else if (!strcmp ((*error_no), "EINVAL"))
- return EINVAL;
- else if (!strcmp ((*error_no), "EIO"))
- return EIO;
- else if (!strcmp ((*error_no), "EEXIST"))
- return EEXIST;
- else if (!strcmp ((*error_no), "ENOSPC"))
- return ENOSPC;
- else if (!strcmp ((*error_no), "EPERM"))
- return EPERM;
- else if (!strcmp ((*error_no), "EROFS"))
- return EROFS;
- else if (!strcmp ((*error_no), "EBUSY"))
- return EBUSY;
- else if (!strcmp ((*error_no), "EISDIR"))
- return EISDIR;
- else if (!strcmp ((*error_no), "ENOTEMPTY"))
- return ENOTEMPTY;
- else if (!strcmp ((*error_no), "EMLINK"))
- return EMLINK;
- else if (!strcmp ((*error_no), "ENODEV"))
- return ENODEV;
- else if (!strcmp ((*error_no), "EXDEV"))
- return EXDEV;
- else if (!strcmp ((*error_no), "EMFILE"))
- return EMFILE;
- else if (!strcmp ((*error_no), "ENFILE"))
- return ENFILE;
- else if (!strcmp ((*error_no), "ENOSYS"))
- return ENOSYS;
- else if (!strcmp ((*error_no), "EINTR"))
- return EINTR;
- else if (!strcmp ((*error_no), "EFBIG"))
- return EFBIG;
- else
- return EAGAIN;
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+
+ egp = this->private;
+ enable = egp->enable[GF_FOP_STAT];
+
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_STAT);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(stat, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat,
+ loc, xdata);
+ return 0;
}
int
-get_fop_int (char **op_no_str)
+error_gen_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- if (!strcmp ((*op_no_str), "lookup"))
- return GF_FOP_LOOKUP;
- else if (!strcmp ((*op_no_str), "stat"))
- return GF_FOP_STAT;
- else if (!strcmp ((*op_no_str), "readlink"))
- return GF_FOP_READLINK;
- else if (!strcmp ((*op_no_str), "mknod"))
- return GF_FOP_MKNOD;
- else if (!strcmp ((*op_no_str), "mkdir"))
- return GF_FOP_MKDIR;
- else if (!strcmp ((*op_no_str), "unlink"))
- return GF_FOP_UNLINK;
- else if (!strcmp ((*op_no_str), "rmdir"))
- return GF_FOP_RMDIR;
- else if (!strcmp ((*op_no_str), "symlink"))
- return GF_FOP_SYMLINK;
- else if (!strcmp ((*op_no_str), "rename"))
- return GF_FOP_RENAME;
- else if (!strcmp ((*op_no_str), "link"))
- return GF_FOP_LINK;
- else if (!strcmp ((*op_no_str), "truncate"))
- return GF_FOP_TRUNCATE;
- else if (!strcmp ((*op_no_str), "create"))
- return GF_FOP_CREATE;
- else if (!strcmp ((*op_no_str), "open"))
- return GF_FOP_OPEN;
- else if (!strcmp ((*op_no_str), "readv"))
- return GF_FOP_READ;
- else if (!strcmp ((*op_no_str), "writev"))
- return GF_FOP_WRITE;
- else if (!strcmp ((*op_no_str), "statfs"))
- return GF_FOP_STATFS;
- else if (!strcmp ((*op_no_str), "flush"))
- return GF_FOP_FLUSH;
- else if (!strcmp ((*op_no_str), "fsync"))
- return GF_FOP_FSYNC;
- else if (!strcmp ((*op_no_str), "setxattr"))
- return GF_FOP_SETXATTR;
- else if (!strcmp ((*op_no_str), "getxattr"))
- return GF_FOP_GETXATTR;
- else if (!strcmp ((*op_no_str), "removexattr"))
- return GF_FOP_REMOVEXATTR;
- else if (!strcmp ((*op_no_str), "opendir"))
- return GF_FOP_OPENDIR;
- else if (!strcmp ((*op_no_str), "readdir"))
- return GF_FOP_READDIR;
- else if (!strcmp ((*op_no_str), "readdirp"))
- return GF_FOP_READDIRP;
- else if (!strcmp ((*op_no_str), "fsyncdir"))
- return GF_FOP_FSYNCDIR;
- else if (!strcmp ((*op_no_str), "access"))
- return GF_FOP_ACCESS;
- else if (!strcmp ((*op_no_str), "ftruncate"))
- return GF_FOP_FTRUNCATE;
- else if (!strcmp ((*op_no_str), "fstat"))
- return GF_FOP_FSTAT;
- else if (!strcmp ((*op_no_str), "lk"))
- return GF_FOP_LK;
- else if (!strcmp ((*op_no_str), "xattrop"))
- return GF_FOP_XATTROP;
- else if (!strcmp ((*op_no_str), "fxattrop"))
- return GF_FOP_FXATTROP;
- else if (!strcmp ((*op_no_str), "inodelk"))
- return GF_FOP_INODELK;
- else if (!strcmp ((*op_no_str), "finodelk"))
- return GF_FOP_FINODELK;
- else if (!strcmp ((*op_no_str), "etrylk"))
- return GF_FOP_ENTRYLK;
- else if (!strcmp ((*op_no_str), "fentrylk"))
- return GF_FOP_FENTRYLK;
- else if (!strcmp ((*op_no_str), "setattr"))
- return GF_FOP_SETATTR;
- else if (!strcmp ((*op_no_str), "fsetattr"))
- return GF_FOP_FSETATTR;
- else if (!strcmp ((*op_no_str), "getspec"))
- return GF_FOP_GETSPEC;
- else
- return -1;
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+
+ egp = this->private;
+ enable = egp->enable[GF_FOP_SETATTR];
+
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_SETATTR);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(setattr, frame, -1, op_errno, NULL, NULL, xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr,
+ loc, stbuf, valid, xdata);
+ return 0;
}
int
-error_gen (xlator_t *this, int op_no)
+error_gen_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- eg_t *egp = NULL;
- int count = 0;
- int failure_iter_no = GF_FAILURE_DEFAULT;
- char *error_no = NULL;
- int rand_no = 0;
- int ret = 0;
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
- egp = this->private;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FSETATTR];
- LOCK (&egp->lock);
- {
- count = ++egp->op_count;
- failure_iter_no = egp->failure_iter_no;
- error_no = egp->error_no;
- }
- UNLOCK (&egp->lock);
-
- if((count % failure_iter_no) == 0) {
- LOCK (&egp->lock);
- {
- egp->op_count = 0;
- }
- UNLOCK (&egp->lock);
-
- if (error_no)
- ret = conv_errno_to_int (&error_no);
- else {
-
- rand_no = generate_rand_no (op_no);
- if (op_no >= GF_FOP_MAXVALUE)
- op_no = 0;
- if (rand_no >= error_no_list[op_no].error_no_count)
- rand_no = 0;
- ret = error_no_list[op_no].error_no[rand_no];
- }
- egp->failure_iter_no = 3 + (rand () % GF_UNIVERSAL_ANSWER);
- }
- return ret;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FSETATTR);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(fsetattr, frame, -1, op_errno, NULL, NULL, xdata);
+ return 0;
+ }
-int
-error_gen_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *dict, struct iatt *postparent)
-{
- STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode,
- buf, dict, postparent);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetattr,
+ fd, stbuf, valid, xdata);
+ return 0;
}
-
int
-error_gen_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
- dict_t *xattr_req)
+error_gen_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_LOOKUP];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_LOOKUP);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (lookup, frame, -1, op_errno, NULL, NULL, NULL,
- NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_lookup_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lookup,
- loc, xattr_req);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_TRUNCATE];
-int
-error_gen_forget (xlator_t *this, inode_t *inode)
-{
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_TRUNCATE);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate,
+ loc, offset, xdata);
+ return 0;
+}
int
-error_gen_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
+error_gen_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
{
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf);
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
- return 0;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FTRUNCATE];
+
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FTRUNCATE);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(ftruncate, frame, -1, op_errno, NULL, NULL, xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
}
int
-error_gen_stat (call_frame_t *frame, xlator_t *this, loc_t *loc)
+error_gen_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_STAT];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_STAT);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (stat, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_stat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat,
- loc);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_ACCESS];
-int
-error_gen_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
-{
- STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop);
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_ACCESS);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(access, frame, -1, op_errno, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->access,
+ loc, mask, xdata);
+ return 0;
+}
int
-error_gen_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid)
+error_gen_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_SETATTR];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_SETATTR);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (setattr, frame, -1, op_errno, NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_setattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setattr,
- loc, stbuf, valid);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_READLINK];
-int
-error_gen_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *stbuf, int32_t valid)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_FSETATTR];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_FSETATTR);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (fsetattr, frame, -1, op_errno, NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_setattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetattr,
- fd, stbuf, valid);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_READLINK);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(readlink, frame, -1, op_errno, NULL, NULL, xdata);
+ return 0;
+ }
-int
-error_gen_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *prebuf, struct iatt *postbuf)
-{
- STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno,
- prebuf, postbuf);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink,
+ loc, size, xdata);
+ return 0;
}
-
int
-error_gen_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
- off_t offset)
+error_gen_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_TRUNCATE];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_TRUNCATE);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (truncate, frame, -1, op_errno,
- NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- loc, offset);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_MKNOD];
-int
-error_gen_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno,
- prebuf, postbuf);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_MKNOD);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ xdata);
+ return 0;
+ }
-int
-error_gen_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
- off_t offset)
-{
- int op_errno = 0;
- eg_t *egp =NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_FTRUNCATE];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_FTRUNCATE);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno,
- NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_ftruncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate,
- fd, offset);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod,
+ loc, mode, rdev, umask, xdata);
+ return 0;
}
-
int
-error_gen_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+error_gen_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
{
- STACK_UNWIND_STRICT (access, frame, op_ret, op_errno);
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
- return 0;
-}
+ egp = this->private;
+ enable = egp->enable[GF_FOP_MKDIR];
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_MKDIR);
-int
-error_gen_access (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t mask)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_ACCESS];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_ACCESS);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (access, frame, -1, op_errno);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_access_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->access,
- loc, mask);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+ loc, mode, umask, xdata);
+ return 0;
+}
int
-error_gen_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *path, struct iatt *sbuf)
+error_gen_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
- STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path, sbuf);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_UNLINK];
-int
-error_gen_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc,
- size_t size)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_READLINK];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_READLINK);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (readlink, frame, -1, op_errno, NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_readlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readlink,
- loc, size);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_UNLINK);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(unlink, frame, -1, op_errno, NULL, NULL, xdata);
+ return 0;
+ }
-int
-error_gen_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno,
- inode, buf,
- preparent, postparent);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink,
+ loc, xflag, xdata);
+ return 0;
}
-
int
-error_gen_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
- mode_t mode, dev_t rdev, dict_t *params)
+error_gen_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_MKNOD];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_MKNOD);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, NULL, NULL,
- NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_mknod_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod,
- loc, mode, rdev, params);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_RMDIR];
-int
-error_gen_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno,
- inode, buf,
- preparent, postparent);
- return 0;
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_RMDIR);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(rmdir, frame, -1, op_errno, NULL, NULL, xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir,
+ loc, flags, xdata);
+ return 0;
}
int
-error_gen_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dict_t *params)
+error_gen_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_MKDIR];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_MKDIR);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (mkdir, frame, -1, op_errno, NULL, NULL,
- NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_mkdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mkdir,
- loc, mode, params);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_SYMLINK];
-int
-error_gen_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent)
-{
- STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
- preparent, postparent);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_SYMLINK);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(symlink, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL); /* pre & post parent attr */
+ return 0;
+ }
-int
-error_gen_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_UNLINK];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_UNLINK);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (unlink, frame, -1, op_errno, NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_unlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink,
- loc);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+ linkpath, loc, umask, xdata);
+ return 0;
}
-
int
-error_gen_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent)
+error_gen_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
{
- STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,
- preparent, postparent);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_RENAME];
-int
-error_gen_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_RMDIR];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_RMDIR);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (rmdir, frame, -1, op_errno, NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_rmdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rmdir,
- loc, flags);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_RENAME);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
+ }
-int
-error_gen_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename,
+ oldloc, newloc, xdata);
+ return 0;
}
-
int
-error_gen_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
- loc_t *loc, dict_t *params)
+error_gen_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_SYMLINK];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_SYMLINK);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (symlink, frame, -1, op_errno, NULL, NULL,
- NULL, NULL); /* pre & post parent attr */
- return 0;
- }
-
- STACK_WIND (frame, error_gen_symlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->symlink,
- linkpath, loc, params);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_LINK];
-int
-error_gen_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent)
-{
- STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf,
- preoldparent, postoldparent,
- prenewparent, postnewparent);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_LINK);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(link, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
+ }
-int
-error_gen_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_RENAME];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_RENAME);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (rename, frame, -1, op_errno, NULL,
- NULL, NULL, NULL, NULL); /* pre & post parent attr */
- return 0;
- }
-
- STACK_WIND (frame, error_gen_rename_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename,
- oldloc, newloc);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+ oldloc, newloc, xdata);
+ return 0;
}
-
int
-error_gen_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
+error_gen_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_CREATE];
-int
-error_gen_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_LINK];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_LINK);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (link, frame, -1, op_errno, NULL, NULL,
- NULL, NULL); /* pre & post parent attr */
- return 0;
- }
-
- STACK_WIND (frame, error_gen_link_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->link,
- oldloc, newloc);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_CREATE);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
+ }
-int
-error_gen_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- fd_t *fd, inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
-{
- STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,
- preparent, postparent);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
+ loc, flags, mode, umask, fd, xdata);
+ return 0;
}
-
int
-error_gen_create (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t flags, mode_t mode, fd_t *fd, dict_t *params)
+error_gen_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_CREATE];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_CREATE);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (create, frame, -1, op_errno, NULL, NULL,
- NULL, NULL, NULL); /* pre & post attr */
- return 0;
- }
-
- STACK_WIND (frame, error_gen_create_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create,
- loc, flags, mode, fd, params);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_OPEN];
-int
-error_gen_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_OPEN);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(open, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
-int
-error_gen_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t flags, fd_t *fd, int32_t wbflags)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_OPEN];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_OPEN);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (open, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_open_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open,
- loc, flags, fd, wbflags);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open,
+ loc, flags, fd, xdata);
+ return 0;
}
-
int
-error_gen_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count,
- struct iatt *stbuf, struct iobref *iobref)
+error_gen_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
{
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno,
- vector, count, stbuf, iobref);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+
+ egp = this->private;
+ enable = egp->enable[GF_FOP_READ];
+
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_READ);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 0, NULL, NULL,
+ xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv,
+ fd, size, offset, flags, xdata);
+ return 0;
+}
int
-error_gen_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset)
+error_gen_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t off, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ struct iovec *shortvec = NULL;
- egp = this->private;
- enable = egp->enable[GF_FOP_READ];
+ egp = this->private;
+ enable = egp->enable[GF_FOP_WRITE];
- if (enable)
- op_errno = error_gen (this, GF_FOP_READ);
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_WRITE);
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0,
- NULL, NULL);
- return 0;
- }
+ if (op_errno == GF_ERROR_SHORT_WRITE) {
+ /*
+ * A short write error returns some value less than what was
+ * requested from a write. To simulate this, replace the vector
+ * with one half the size;
+ */
+ shortvec = iov_dup(vector, 1);
+ shortvec->iov_len /= 2;
+ count = 1;
+ goto wind;
+ } else if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, xdata);
+ return 0;
+ }
+wind:
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+ fd, shortvec ? shortvec : vector, count, off, flags, iobref,
+ xdata);
-
- STACK_WIND (frame, error_gen_readv_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv,
- fd, size, offset);
- return 0;
+ if (shortvec)
+ GF_FREE(shortvec);
+ return 0;
}
-
int
-error_gen_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *prebuf, struct iatt *postbuf)
+error_gen_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FLUSH];
-int
-error_gen_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count,
- off_t off, struct iobref *iobref)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_WRITE];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_WRITE);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_writev_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev,
- fd, vector, count, off, iobref);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FLUSH);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(flush, frame, -1, op_errno, xdata);
+ return 0;
+ }
-int
-error_gen_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->flush,
+ fd, xdata);
+ return 0;
}
-
int
-error_gen_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+error_gen_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_FLUSH];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_FLUSH);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (flush, frame, -1, op_errno);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_flush_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FSYNC];
-int
-error_gen_fsync_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FSYNC);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, xdata);
+ return 0;
+ }
-int
-error_gen_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_FSYNC];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_FSYNC);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_fsync_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsync,
- fd, flags);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+ fd, flags, xdata);
+ return 0;
}
-
int
-error_gen_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
+error_gen_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FSTAT];
-int
-error_gen_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_FSTAT];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_FSTAT);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (fstat, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_fstat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat,
- fd);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FSTAT);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(fstat, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
-int
-error_gen_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat,
+ fd, xdata);
+ return 0;
}
-
int
-error_gen_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+error_gen_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_OPENDIR];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_OPENDIR);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (opendir, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_opendir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->opendir,
- loc, fd);
- return 0;
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+
+ egp = this->private;
+ enable = egp->enable[GF_FOP_OPENDIR];
+
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_OPENDIR);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(opendir, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir,
+ loc, fd, xdata);
+ return 0;
}
int
-error_gen_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+error_gen_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
{
- STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FSYNCDIR];
-int
-error_gen_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t flags)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_FSYNCDIR];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_FSYNCDIR);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (fsyncdir, frame, -1, op_errno);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_fsyncdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsyncdir,
- fd, flags);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FSYNCDIR);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(fsyncdir, frame, -1, op_errno, xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsyncdir,
+ fd, flags, xdata);
+ return 0;
+}
int
-error_gen_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct statvfs *buf)
+error_gen_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf);
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
- return 0;
-}
+ egp = this->private;
+ enable = egp->enable[GF_FOP_STATFS];
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_STATFS);
-int
-error_gen_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_STATFS];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_STATFS);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (statfs, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_statfs_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->statfs,
- loc);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(statfs, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->statfs,
+ loc, xdata);
+ return 0;
+}
int
-error_gen_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+error_gen_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
{
- STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno);
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
- return 0;
-}
+ egp = this->private;
+ enable = egp->enable[GF_FOP_SETXATTR];
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_SETXATTR);
-int
-error_gen_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- dict_t *dict, int32_t flags)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_SETXATTR];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_SETXATTR);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_setxattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setxattr,
- loc, dict, flags);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(setxattr, frame, -1, op_errno, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr,
+ loc, dict, flags, xdata);
+ return 0;
+}
int
-error_gen_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
+error_gen_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
{
- STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_GETXATTR];
-int
-error_gen_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_GETXATTR];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_GETXATTR);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (getxattr, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_getxattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr,
- loc, name);
- return 0;
-}
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_GETXATTR);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(getxattr, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr,
+ loc, name, xdata);
+ return 0;
+}
int
-error_gen_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
+error_gen_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
- STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, dict);
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
- return 0;
-}
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FSETXATTR];
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FSETXATTR);
-int
-error_gen_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t flags, dict_t *dict)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_XATTROP];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_XATTROP);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (xattrop, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_xattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->xattrop,
- loc, flags, dict);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(fsetxattr, frame, -1, op_errno, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+ return 0;
+}
int
-error_gen_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
+error_gen_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
{
- STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, dict);
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
- return 0;
-}
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FGETXATTR];
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FGETXATTR);
-int
-error_gen_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t flags, dict_t *dict)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_FXATTROP];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_FXATTROP);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (fxattrop, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_fxattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fxattrop,
- fd, flags, dict);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(fgetxattr, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+ return 0;
+}
int
-error_gen_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+error_gen_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
- STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno);
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
- return 0;
-}
+ egp = this->private;
+ enable = egp->enable[GF_FOP_XATTROP];
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_XATTROP);
-int
-error_gen_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name)
-{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_REMOVEXATTR];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_REMOVEXATTR);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (removexattr, frame, -1, op_errno);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_removexattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr,
- loc, name);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(xattrop, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop,
+ loc, flags, dict, xdata);
+ return 0;
+}
int
-error_gen_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
+error_gen_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
- STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FXATTROP];
+
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FXATTROP);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(fxattrop, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fxattrop,
+ fd, flags, dict, xdata);
+ return 0;
+}
int
-error_gen_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
- struct gf_flock *lock)
+error_gen_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_LK];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_LK);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (lk, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_lk_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lk,
- fd, cmd, lock);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_REMOVEXATTR];
-int
-error_gen_inodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_REMOVEXATTR);
-{
- STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(removexattr, frame, -1, op_errno, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+ return 0;
+}
int
-error_gen_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd,
- struct gf_flock *lock)
+error_gen_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_INODELK];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_INODELK);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (inodelk, frame, -1, op_errno);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_inodelk_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->inodelk,
- volume, loc, cmd, lock);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FREMOVEXATTR];
-int
-error_gen_finodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FREMOVEXATTR);
-{
- STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(fremovexattr, frame, -1, op_errno, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+ return 0;
+}
int
-error_gen_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd,
- struct gf_flock *lock)
+error_gen_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_FINODELK];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_FINODELK);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (finodelk, frame, -1, op_errno);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_finodelk_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk,
- volume, fd, cmd, lock);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_LK];
-int
-error_gen_entrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_LK);
-{
- STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(lk, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lk, fd,
+ cmd, lock, xdata);
+ return 0;
+}
int
-error_gen_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
+error_gen_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_ENTRYLK];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_ENTRYLK);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (entrylk, frame, -1, op_errno);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_entrylk_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->entrylk,
- volume, loc, basename, cmd, type);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_INODELK];
-int
-error_gen_fentrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_INODELK);
-{
- STACK_UNWIND_STRICT (fentrylk, frame, op_ret, op_errno);
- return 0;
-}
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(inodelk, frame, -1, op_errno, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->inodelk,
+ volume, loc, cmd, lock, xdata);
+ return 0;
+}
int
-error_gen_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
+error_gen_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+ fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_FENTRYLK];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_FENTRYLK);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (fentrylk, frame, -1, op_errno);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_fentrylk_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fentrylk,
- volume, fd, basename, cmd, type);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FINODELK];
-/* Management operations */
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FINODELK);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(finodelk, frame, -1, op_errno, xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->finodelk,
+ volume, fd, cmd, lock, xdata);
+ return 0;
+}
int
-error_gen_getspec_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, char *spec_data)
+error_gen_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
{
- STACK_UNWIND_STRICT (getspec, frame, op_ret, op_errno, spec_data);
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
- return 0;
-}
+ egp = this->private;
+ enable = egp->enable[GF_FOP_ENTRYLK];
+
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_ENTRYLK);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(entrylk, frame, -1, op_errno, xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk,
+ volume, loc, basename, cmd, type, xdata);
+ return 0;
+}
int
-error_gen_getspec (call_frame_t *frame, xlator_t *this, const char *key,
- int32_t flags)
+error_gen_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+ fd_t *fd, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_GETSPEC];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_GETSPEC);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (getspec, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_getspec_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getspec,
- key, flags);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+
+ egp = this->private;
+ enable = egp->enable[GF_FOP_FENTRYLK];
+
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_FENTRYLK);
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(fentrylk, frame, -1, op_errno, xdata);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fentrylk,
+ volume, fd, basename, cmd, type, xdata);
+ return 0;
+}
int
-error_gen_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries)
+error_gen_getspec(call_frame_t *frame, xlator_t *this, const char *key,
+ int32_t flags)
{
- STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+
+ egp = this->private;
+ enable = egp->enable[GF_FOP_GETSPEC];
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_GETSPEC);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(getspec, frame, -1, op_errno, NULL);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->getspec,
+ key, flags);
+ return 0;
+}
int
-error_gen_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
- size_t size, off_t off)
+error_gen_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_READDIR];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_READDIR);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (readdir, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_readdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdir,
- fd, size, off);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+
+ egp = this->private;
+ enable = egp->enable[GF_FOP_READDIR];
+
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_READDIR);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(readdir, frame, -1, op_errno, NULL, xdata);
+ return 0;
+ }
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir,
+ fd, size, off, xdata);
+ return 0;
+}
int
-error_gen_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries)
+error_gen_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *dict)
{
- STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries);
- return 0;
-}
+ int op_errno = 0;
+ eg_t *egp = NULL;
+ int enable = 1;
+ egp = this->private;
+ enable = egp->enable[GF_FOP_READDIRP];
-int
-error_gen_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t off)
+ if (enable)
+ op_errno = error_gen(this, GF_FOP_READDIRP);
+
+ if (op_errno) {
+ GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+ STACK_UNWIND_STRICT(readdirp, frame, -1, op_errno, NULL, NULL);
+ return 0;
+ }
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp,
+ fd, size, off, dict);
+ return 0;
+}
+
+static void
+error_gen_set_failure(eg_t *pvt, double percent)
{
- int op_errno = 0;
- eg_t *egp = NULL;
- int enable = 1;
-
- egp = this->private;
- enable = egp->enable[GF_FOP_READDIRP];
-
- if (enable)
- op_errno = error_gen (this, GF_FOP_READDIRP);
-
- if (op_errno) {
- GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
- STACK_UNWIND_STRICT (readdirp, frame, -1, op_errno, NULL);
- return 0;
- }
-
- STACK_WIND (frame, error_gen_readdirp_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdirp,
- fd, size, off);
- return 0;
+ double ppm;
+
+ GF_ASSERT(pvt);
+
+ ppm = (percent / 100.0) * (double)FAILURE_GRANULARITY;
+ pvt->failure_iter_no = (int)ppm;
}
+static void
+error_gen_parse_fill_fops(eg_t *pvt, char *enable_fops)
+{
+ char *op_no_str = NULL;
+ int op_no = -1;
+ int i = 0;
+ xlator_t *this = THIS;
+ char *saveptr = NULL;
-int
-error_gen_closedir (xlator_t *this, fd_t *fd)
+ GF_ASSERT(pvt);
+ GF_ASSERT(this);
+
+ for (i = 0; i < GF_FOP_MAXVALUE; i++)
+ pvt->enable[i] = 0;
+
+ if (!enable_fops) {
+ gf_log(this->name, GF_LOG_WARNING, "All fops are enabled.");
+ for (i = 0; i < GF_FOP_MAXVALUE; i++)
+ pvt->enable[i] = 1;
+ } else {
+ op_no_str = strtok_r(enable_fops, ",", &saveptr);
+ while (op_no_str) {
+ op_no = gf_fop_int(op_no_str);
+ if (op_no == -1) {
+ gf_log(this->name, GF_LOG_WARNING, "Wrong option value %s",
+ op_no_str);
+ } else
+ pvt->enable[op_no] = 1;
+
+ op_no_str = strtok_r(NULL, ",", &saveptr);
+ }
+ }
+}
+
+int32_t
+error_gen_priv_dump(xlator_t *this)
{
- return 0;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ int ret = -1;
+ eg_t *conf = NULL;
+
+ if (!this)
+ goto out;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ ret = TRY_LOCK(&conf->lock);
+ if (ret != 0) {
+ return ret;
+ }
+
+ gf_proc_dump_add_section("xlator.debug.error-gen.%s.priv", this->name);
+ gf_proc_dump_build_key(key_prefix, "xlator.debug.error-gen", "%s.priv",
+ this->name);
+
+ gf_proc_dump_write("op_count", "%d", conf->op_count);
+ gf_proc_dump_write("failure_iter_no", "%d", conf->failure_iter_no);
+ gf_proc_dump_write("error_no", "%d", conf->error_no_int);
+ gf_proc_dump_write("random_failure", "%d", conf->random_failure);
+
+ UNLOCK(&conf->lock);
+out:
+ return ret;
}
+int32_t
+mem_acct_init(xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init(this, gf_error_gen_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Memory accounting init"
+ " failed");
+ return ret;
+ }
+
+ return ret;
+}
int
-error_gen_close (xlator_t *this, fd_t *fd)
+reconfigure(xlator_t *this, dict_t *options)
{
- return 0;
-}
+ eg_t *pvt = NULL;
+ int32_t ret = 0;
+ char *error_enable_fops = NULL;
+ char *error_no = NULL;
+ double failure_percent_dbl = 0.0;
+
+ if (!this || !this->private)
+ goto out;
+
+ pvt = this->private;
+
+ ret = -1;
+
+ GF_OPTION_RECONF("error-no", error_no, options, str, out);
+
+ if (error_no)
+ pvt->error_no_int = conv_errno_to_int(&error_no);
+
+ GF_OPTION_RECONF("failure", failure_percent_dbl, options, percent, out);
+
+ GF_OPTION_RECONF("enable", error_enable_fops, options, str, out);
+ GF_OPTION_RECONF("random-failure", pvt->random_failure, options, bool, out);
+
+ error_gen_parse_fill_fops(pvt, error_enable_fops);
+ error_gen_set_failure(pvt, failure_percent_dbl);
+
+ ret = 0;
+out:
+ gf_log(this ? this->name : "error-gen", GF_LOG_DEBUG,
+ "reconfigure returning %d", ret);
+ return ret;
+}
int
-init (xlator_t *this)
+init(xlator_t *this)
{
- eg_t *pvt = NULL;
- data_t *error_no = NULL;
- data_t *failure_percent = NULL;
- data_t *enable = NULL;
- int32_t ret = 0;
- char *error_enable_fops = NULL;
- char *op_no_str = NULL;
- int op_no = -1;
- int i = 0;
- int32_t failure_percent_int = 0;
-
- if (!this->children || this->children->next) {
- gf_log (this->name, GF_LOG_ERROR,
- "error-gen not configured with one subvolume");
- ret = -1;
- goto out;
- }
+ eg_t *pvt = NULL;
+ int32_t ret = 0;
+ char *error_enable_fops = NULL;
+ char *error_no = NULL;
+ double failure_percent_dbl = 0.0;
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile ");
- }
+ if (!this->children || this->children->next) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "error-gen not configured with one subvolume");
+ ret = -1;
+ goto out;
+ }
- error_no = dict_get (this->options, "error-no");
- failure_percent = dict_get (this->options, "failure");
- enable = dict_get (this->options, "enable");
+ if (!this->parents) {
+ gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+ }
- pvt = CALLOC (1, sizeof (eg_t));
+ pvt = GF_CALLOC(1, sizeof(eg_t), gf_error_gen_mt_eg_t);
- if (!pvt) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory.");
- ret = -1;
- goto out;
- }
+ if (!pvt) {
+ ret = -1;
+ goto out;
+ }
- LOCK_INIT (&pvt->lock);
+ LOCK_INIT(&pvt->lock);
- for (i = 0; i < GF_FOP_MAXVALUE; i++)
- pvt->enable[i] = 0;
- if (!error_no) {
- gf_log (this->name, GF_LOG_DEBUG,
- "error-no not specified.");
- } else {
- pvt->error_no = data_to_str (error_no);
- }
+ ret = -1;
- if (!failure_percent) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failure percent not specified.");
- pvt->failure_iter_no = 100/GF_FAILURE_DEFAULT;
- } else {
- failure_percent_int = data_to_int32 (failure_percent);
- if (failure_percent_int)
- pvt->failure_iter_no = 100/failure_percent_int;
- else
- pvt->failure_iter_no = 100/GF_FAILURE_DEFAULT;
- }
+ GF_OPTION_INIT("error-no", error_no, str, out);
- if (!enable) {
- gf_log (this->name, GF_LOG_WARNING,
- "All fops are enabled.");
- for (i = 0; i < GF_FOP_MAXVALUE; i++)
- pvt->enable[i] = 1;
- } else {
- error_enable_fops = data_to_str (enable);
- op_no_str = error_enable_fops;
- while ((*error_enable_fops) != '\0') {
- error_enable_fops++;
- if (((*error_enable_fops) == ',') ||
- ((*error_enable_fops) == '\0')) {
- if ((*error_enable_fops) != '\0') {
- (*error_enable_fops) = '\0';
- error_enable_fops++;
- }
- op_no = get_fop_int (&op_no_str);
- if (op_no == -1) {
- gf_log (this->name, GF_LOG_WARNING,
- "Wrong option value %s",
- op_no_str);
- } else
- pvt->enable[op_no] = 1;
- op_no_str = error_enable_fops;
- }
- }
- }
- this->private = pvt;
+ if (error_no)
+ pvt->error_no_int = conv_errno_to_int(&error_no);
+
+ GF_OPTION_INIT("failure", failure_percent_dbl, percent, out);
+
+ GF_OPTION_INIT("enable", error_enable_fops, str, out);
+
+ GF_OPTION_INIT("random-failure", pvt->random_failure, bool, out);
- /* Give some seed value here */
- srand (time(NULL));
+ error_gen_parse_fill_fops(pvt, error_enable_fops);
+ error_gen_set_failure(pvt, failure_percent_dbl);
+
+ this->private = pvt;
+
+ /* Give some seed value here. */
+ srand(gf_time());
+
+ ret = 0;
out:
- return ret;
+ if (ret)
+ GF_FREE(pvt);
+ return ret;
}
-
void
-fini (xlator_t *this)
+fini(xlator_t *this)
{
- eg_t *pvt = NULL;
-
- if (!this)
- return;
- pvt = this->private;
+ eg_t *pvt = NULL;
- if (pvt) {
- LOCK_DESTROY (&pvt->lock);
- FREE (pvt);
- gf_log (this->name, GF_LOG_DEBUG, "fini called");
- }
+ if (!this)
return;
+ pvt = this->private;
+
+ if (pvt) {
+ LOCK_DESTROY(&pvt->lock);
+ GF_FREE(pvt);
+ gf_log(this->name, GF_LOG_DEBUG, "fini called");
+ }
+ return;
}
-struct xlator_fops fops = {
- .lookup = error_gen_lookup,
- .stat = error_gen_stat,
- .readlink = error_gen_readlink,
- .mknod = error_gen_mknod,
- .mkdir = error_gen_mkdir,
- .unlink = error_gen_unlink,
- .rmdir = error_gen_rmdir,
- .symlink = error_gen_symlink,
- .rename = error_gen_rename,
- .link = error_gen_link,
- .truncate = error_gen_truncate,
- .create = error_gen_create,
- .open = error_gen_open,
- .readv = error_gen_readv,
- .writev = error_gen_writev,
- .statfs = error_gen_statfs,
- .flush = error_gen_flush,
- .fsync = error_gen_fsync,
- .setxattr = error_gen_setxattr,
- .getxattr = error_gen_getxattr,
- .removexattr = error_gen_removexattr,
- .opendir = error_gen_opendir,
- .readdir = error_gen_readdir,
- .readdirp = error_gen_readdirp,
- .fsyncdir = error_gen_fsyncdir,
- .access = error_gen_access,
- .ftruncate = error_gen_ftruncate,
- .fstat = error_gen_fstat,
- .lk = error_gen_lk,
- .lookup_cbk = error_gen_lookup_cbk,
- .xattrop = error_gen_xattrop,
- .fxattrop = error_gen_fxattrop,
- .inodelk = error_gen_inodelk,
- .finodelk = error_gen_finodelk,
- .entrylk = error_gen_entrylk,
- .fentrylk = error_gen_fentrylk,
- .setattr = error_gen_setattr,
- .fsetattr = error_gen_fsetattr,
- .getspec = error_gen_getspec,
+struct xlator_dumpops dumpops = {
+ .priv = error_gen_priv_dump,
};
-struct xlator_cbks cbks = {
- .release = error_gen_close,
- .releasedir = error_gen_closedir,
+struct xlator_cbks cbks;
+
+struct xlator_fops fops = {
+ .lookup = error_gen_lookup,
+ .stat = error_gen_stat,
+ .readlink = error_gen_readlink,
+ .mknod = error_gen_mknod,
+ .mkdir = error_gen_mkdir,
+ .unlink = error_gen_unlink,
+ .rmdir = error_gen_rmdir,
+ .symlink = error_gen_symlink,
+ .rename = error_gen_rename,
+ .link = error_gen_link,
+ .truncate = error_gen_truncate,
+ .create = error_gen_create,
+ .open = error_gen_open,
+ .readv = error_gen_readv,
+ .writev = error_gen_writev,
+ .statfs = error_gen_statfs,
+ .flush = error_gen_flush,
+ .fsync = error_gen_fsync,
+ .setxattr = error_gen_setxattr,
+ .getxattr = error_gen_getxattr,
+ .removexattr = error_gen_removexattr,
+ .fsetxattr = error_gen_fsetxattr,
+ .fgetxattr = error_gen_fgetxattr,
+ .fremovexattr = error_gen_fremovexattr,
+ .opendir = error_gen_opendir,
+ .readdir = error_gen_readdir,
+ .readdirp = error_gen_readdirp,
+ .fsyncdir = error_gen_fsyncdir,
+ .access = error_gen_access,
+ .ftruncate = error_gen_ftruncate,
+ .fstat = error_gen_fstat,
+ .lk = error_gen_lk,
+ .xattrop = error_gen_xattrop,
+ .fxattrop = error_gen_fxattrop,
+ .inodelk = error_gen_inodelk,
+ .finodelk = error_gen_finodelk,
+ .entrylk = error_gen_entrylk,
+ .fentrylk = error_gen_fentrylk,
+ .setattr = error_gen_setattr,
+ .fsetattr = error_gen_fsetattr,
+ .getspec = error_gen_getspec,
};
struct volume_options options[] = {
- { .key = {"failure"},
- .type = GF_OPTION_TYPE_INT },
- { .key = {"error-no"},
- .value = {"ENOENT","ENOTDIR","ENAMETOOLONG","EACCES","EBADF",
- "EFAULT","ENOMEM","EINVAL","EIO","EEXIST","ENOSPC",
- "EPERM","EROFS","EBUSY","EISDIR","ENOTEMPTY","EMLINK"
- "ENODEV","EXDEV","EMFILE","ENFILE","ENOSYS","EINTR",
- "EFBIG","EAGAIN"},
- .type = GF_OPTION_TYPE_STR },
- { .key = {"enable"},
- .type = GF_OPTION_TYPE_STR },
- { .key = {NULL} }
+ {
+ .key = {"failure"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .description = "Percentage failure of operations when enabled.",
+ },
+
+ {
+ .key = {"error-no"},
+ .value = {"ENOENT",
+ "ENOTDIR",
+ "ENAMETOOLONG",
+ "EACCES",
+ "EBADF",
+ "EFAULT",
+ "ENOMEM",
+ "EINVAL",
+ "EIO",
+ "EEXIST",
+ "ENOSPC",
+ "EPERM",
+ "EROFS",
+ "EBUSY",
+ "EISDIR",
+ "ENOTEMPTY",
+ "EMLINK"
+ "ENODEV",
+ "EXDEV",
+ "EMFILE",
+ "ENFILE",
+ "ENOSYS",
+ "EINTR",
+ "EFBIG",
+ "EAGAIN",
+ "GF_ERROR_SHORT_WRITE"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {3},
+ .tags = {"error-gen"},
+ .flags = OPT_FLAG_SETTABLE,
+
+ },
+
+ {
+ .key = {"random-failure"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {3},
+ .tags = {"error-gen"},
+ .flags = OPT_FLAG_SETTABLE,
+ },
+
+ {
+ .key = {"enable", "error-fops"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Accepts a string which takes ',' separated fop "
+ "strings to denote which fops are enabled for error",
+ .op_version = {3},
+ .tags = {"error-gen"},
+ .flags = OPT_FLAG_SETTABLE,
+ },
+
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1},
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "error-gen",
+ .category = GF_TECH_PREVIEW,
};
diff --git a/xlators/debug/error-gen/src/error-gen.h b/xlators/debug/error-gen/src/error-gen.h
index 1c902cf4f21..2478cd5b21c 100644
--- a/xlators/debug/error-gen/src/error-gen.h
+++ b/xlators/debug/error-gen/src/error-gen.h
@@ -1,43 +1,49 @@
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef _ERROR_GEN_H
#define _ERROR_GEN_H
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include "error-gen-mem-types.h"
#define GF_FAILURE_DEFAULT 10
+/*
+ * Pseudo-errors refer to errors beyond the scope of traditional <-1, op_errno>
+ * returns. This facilitates the ability to return unexpected, but not -1 values
+ * and/or to inject operations that lead to implicit error conditions. The range
+ * for pseudo errors resides at a high value to avoid conflicts with the errno
+ * range.
+ */
+enum GF_PSEUDO_ERRORS {
+ GF_ERROR_SHORT_WRITE = 1000, /* short writev return value */
+ GF_ERROR_MAX
+};
+
typedef struct {
- int enable[GF_FOP_MAXVALUE];
- int op_count;
- int failure_iter_no;
- char *error_no;
- gf_lock_t lock;
+ int enable[GF_FOP_MAXVALUE];
+ int op_count;
+ /*
+ * This is only an iteration number in the random-failure case. For
+ * the normal controlled-probability case, it's actually a numerator
+ * for the failure probability (see FAILURE_GRANULARITY declaration).
+ * It's just not worth blowing up the diff by changing it.
+ */
+ int failure_iter_no;
+ int error_no_int;
+ gf_boolean_t random_failure;
+ gf_lock_t lock;
} eg_t;
typedef struct {
- int error_no_count;
- int error_no[20];
+ int error_no_count;
+ int error_no[20];
} sys_error_t;
#endif
diff --git a/xlators/debug/io-stats/src/Makefile.am b/xlators/debug/io-stats/src/Makefile.am
index b894e79c3fe..c69f3caf0fe 100644
--- a/xlators/debug/io-stats/src/Makefile.am
+++ b/xlators/debug/io-stats/src/Makefile.am
@@ -2,14 +2,18 @@
xlator_LTLIBRARIES = io-stats.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
-io_stats_la_LDFLAGS = -module -avoidversion
+io_stats_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
io_stats_la_SOURCES = io-stats.c
io_stats_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = io-stats-mem-types.h
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -DDATADIR=\"$(localstatedir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
CLEANFILES =
diff --git a/xlators/debug/io-stats/src/io-stats-mem-types.h b/xlators/debug/io-stats/src/io-stats-mem-types.h
index 0c711029af9..51d38d8b97c 100644
--- a/xlators/debug/io-stats/src/io-stats-mem-types.h
+++ b/xlators/debug/io-stats/src/io-stats-mem-types.h
@@ -1,33 +1,27 @@
-
/*
- Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef __IO_STATS_MEM_TYPES_H__
#define __IO_STATS_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
+
+extern const char *__progname;
enum gf_io_stats_mem_types_ {
- gf_io_stats_mt_ios_conf = gf_common_mt_end + 1,
- gf_io_stats_mt_ios_fd,
- gf_io_stats_mt_end
+ gf_io_stats_mt_ios_conf = gf_common_mt_end + 1,
+ gf_io_stats_mt_ios_fd,
+ gf_io_stats_mt_ios_stat,
+ gf_io_stats_mt_ios_stat_list,
+ gf_io_stats_mt_ios_sample_buf,
+ gf_io_stats_mt_ios_sample,
+ gf_io_stats_mt_end
};
#endif
-
diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
index fff16ede215..aa00c446e5a 100644
--- a/xlators/debug/io-stats/src/io-stats.c
+++ b/xlators/debug/io-stats/src/io-stats.c
@@ -1,27 +1,14 @@
/*
- Copyright (c) 2006-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#include "xlator.h"
-#endif
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/xlator.h>
+#include <glusterfs/syscall.h>
/**
* xlators/debug/io_stats :
@@ -30,1765 +17,4464 @@
*
* a) total read data - since process start, last interval and per fd
* b) total write data - since process start, last interval and per fd
- * c) counts of read IO block size - since process start, last interval and per fd
- * d) counts of write IO block size - since process start, last interval and per fd
- * e) counts of all FOP types passing through it
+ * c) counts of read IO block size - since process start, last interval and per
+ * fd d) counts of write IO block size - since process start, last interval and
+ * per fd e) counts of all FOP types passing through it
*
- * Usage: setfattr -n io-stats-dump /tmp/filename /mnt/gluster
+ * Usage: setfattr -n trusted.io-stats-dump /tmp/filename /mnt/gluster
+ * output is written to /tmp/filename.<iostats xlator instance name>
*
*/
#include <fnmatch.h>
#include <errno.h>
-#include "glusterfs.h"
-#include "xlator.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
#include "io-stats-mem-types.h"
+#include <stdarg.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/syncop.h>
+#include <pwd.h>
+#include <grp.h>
+#include <glusterfs/upcall-utils.h>
+#include <glusterfs/async.h>
+
+#define MAX_LIST_MEMBERS 100
+#define DEFAULT_PWD_BUF_SZ 16384
+#define DEFAULT_GRP_BUF_SZ 16384
+#define IOS_BLOCK_COUNT_SIZE 32
+
+#define IOS_STATS_DUMP_DIR DEFAULT_VAR_RUN_DIRECTORY
+
+typedef enum {
+ IOS_STATS_TYPE_NONE,
+ IOS_STATS_TYPE_OPEN,
+ IOS_STATS_TYPE_READ,
+ IOS_STATS_TYPE_WRITE,
+ IOS_STATS_TYPE_OPENDIR,
+ IOS_STATS_TYPE_READDIRP,
+ IOS_STATS_TYPE_READ_THROUGHPUT,
+ IOS_STATS_TYPE_WRITE_THROUGHPUT,
+ IOS_STATS_TYPE_MAX
+} ios_stats_type_t;
+
+typedef enum {
+ IOS_STATS_THRU_READ,
+ IOS_STATS_THRU_WRITE,
+ IOS_STATS_THRU_MAX,
+} ios_stats_thru_t;
+
+/* This is same as gf1_cli_info_op */
+/* had to be defined here again, so we have modularity between
+ xdr, xlator, and library functions */
+typedef enum ios_info_op {
+ GF_IOS_INFO_NONE = 0,
+ GF_IOS_INFO_ALL = 1,
+ GF_IOS_INFO_INCREMENTAL = 2,
+ GF_IOS_INFO_CUMULATIVE = 3,
+ GF_IOS_INFO_CLEAR = 4,
+} ios_info_op_t;
+
+struct ios_stat_lat {
+ struct timeval time;
+ double throughput;
+};
+
+struct ios_stat {
+ gf_lock_t lock;
+ uuid_t gfid;
+ char *filename;
+ gf_atomic_t counters[IOS_STATS_TYPE_MAX];
+ struct ios_stat_lat thru_counters[IOS_STATS_THRU_MAX];
+ gf_atomic_t refcnt;
+};
+
+struct ios_stat_list {
+ struct list_head list;
+ struct ios_stat *iosstat;
+ double value;
+};
+
+struct ios_stat_head {
+ gf_lock_t lock;
+ double min_cnt;
+ uint64_t members;
+ struct ios_stat_list *iosstats;
+};
+
+typedef struct _ios_sample_t {
+ uid_t uid;
+ gid_t gid;
+ char identifier[UNIX_PATH_MAX];
+ glusterfs_fop_t fop_type;
+ struct timeval timestamp;
+ double elapsed;
+} ios_sample_t;
+
+typedef struct _ios_sample_buf_t {
+ uint64_t pos; /* Position in write buffer */
+ uint64_t size; /* Size of ring buffer */
+ uint64_t collected; /* Number of samples we've collected */
+ uint64_t observed; /* Number of FOPs we've observed */
+ ios_sample_t *ios_samples; /* Our list of samples */
+} ios_sample_buf_t;
struct ios_lat {
- double min;
- double max;
- double avg;
+ double min;
+ double max;
+ double avg;
+ uint64_t total;
};
struct ios_global_stats {
- uint64_t data_written;
- uint64_t data_read;
- uint64_t block_count_write[32];
- uint64_t block_count_read[32];
- uint64_t fop_hits[GF_FOP_MAXVALUE];
- struct timeval started_at;
- struct ios_lat latency[GF_FOP_MAXVALUE];
+ gf_atomic_t data_written;
+ gf_atomic_t data_read;
+ gf_atomic_t block_count_write[IOS_BLOCK_COUNT_SIZE];
+ gf_atomic_t block_count_read[IOS_BLOCK_COUNT_SIZE];
+ gf_atomic_t fop_hits[GF_FOP_MAXVALUE];
+ gf_atomic_t upcall_hits[GF_UPCALL_FLAGS_MAXVALUE];
+ time_t started_at;
+ struct ios_lat latency[GF_FOP_MAXVALUE];
+ uint64_t nr_opens;
+ uint64_t max_nr_opens;
+ struct timeval max_openfd_time;
};
+typedef enum {
+ IOS_DUMP_TYPE_NONE = 0,
+ IOS_DUMP_TYPE_FILE = 1,
+ IOS_DUMP_TYPE_DICT = 2,
+ IOS_DUMP_TYPE_JSON_FILE = 3,
+ IOS_DUMP_TYPE_SAMPLES = 4,
+ IOS_DUMP_TYPE_MAX = 5
+} ios_dump_type_t;
struct ios_conf {
- gf_lock_t lock;
- struct ios_global_stats cumulative;
- uint64_t increment;
- struct ios_global_stats incremental;
- gf_boolean_t dump_fd_stats;
- int measure_latency;
+ gf_lock_t lock;
+ struct ios_global_stats cumulative;
+ uint64_t increment;
+ struct ios_global_stats incremental;
+ gf_boolean_t dump_fd_stats;
+ gf_boolean_t count_fop_hits;
+ gf_boolean_t measure_latency;
+ struct ios_stat_head list[IOS_STATS_TYPE_MAX];
+ struct ios_stat_head thru_list[IOS_STATS_THRU_MAX];
+ int32_t ios_dump_interval;
+ pthread_t dump_thread;
+ gf_boolean_t dump_thread_should_die;
+ gf_boolean_t dump_thread_running;
+ gf_lock_t ios_sampling_lock;
+ int32_t ios_sample_interval;
+ int32_t ios_sample_buf_size;
+ ios_sample_buf_t *ios_sample_buf;
+ struct dnscache *dnscache;
+ int32_t ios_dnscache_ttl_sec;
+ /*
+ * What we really need here is just a unique value to keep files
+ * created by this instance distinct from those created by any other.
+ * On the client side this isn't a problem, so we just use the
+ * translator name. On the server side conflicts can occur, so the
+ * volfile-generation code automatically sets this (via an option)
+ * to be the brick path.
+ *
+ * NB While the *field* name has changed, it didn't seem worth changing
+ * all of the cases where "xlator_name" is used as a *variable* name.
+ */
+ char *unique_id;
+ ios_dump_type_t dump_format;
};
-
struct ios_fd {
- char *filename;
- uint64_t data_written;
- uint64_t data_read;
- uint64_t block_count_write[32];
- uint64_t block_count_read[32];
- struct timeval opened_at;
+ char *filename;
+ gf_atomic_t data_written;
+ gf_atomic_t data_read;
+ gf_atomic_t block_count_write[IOS_BLOCK_COUNT_SIZE];
+ gf_atomic_t block_count_read[IOS_BLOCK_COUNT_SIZE];
+ struct timeval opened_at;
+};
+
+struct ios_dump_args {
+ ios_dump_type_t type;
+ union {
+ FILE *logfp;
+ dict_t *dict;
+ } u;
};
+typedef int (*block_dump_func)(xlator_t *, struct ios_dump_args *, int, int,
+ uint64_t);
struct ios_local {
- struct timeval wind_at;
- struct timeval unwind_at;
+ struct timeval wind_at;
+ struct timeval unwind_at;
};
-#define END_FOP_LATENCY(frame, op) \
- do { \
- struct ios_conf *conf = NULL; \
- \
- conf = this->private; \
- if (conf && conf->measure_latency) { \
- gettimeofday (&frame->end, NULL); \
- update_ios_latency (conf, frame, GF_FOP_##op); \
- } \
- } while (0)
-
-#define START_FOP_LATENCY(frame) \
- do { \
- struct ios_conf *conf = NULL; \
- \
- conf = this->private; \
- if (conf && conf->measure_latency) { \
- gettimeofday (&frame->begin, NULL); \
- } \
- } while (0)
-
-
-#define BUMP_FOP(op) \
- do { \
- struct ios_conf *conf = NULL; \
- \
- conf = this->private; \
- if (!conf) \
- break; \
- LOCK (&conf->lock); \
- { \
- conf->cumulative.fop_hits[GF_FOP_##op]++; \
- conf->incremental.fop_hits[GF_FOP_##op]++; \
- } \
- UNLOCK (&conf->lock); \
- } while (0)
-
-
-#define BUMP_READ(fd, len) \
- do { \
- struct ios_conf *conf = NULL; \
- struct ios_fd *iosfd = NULL; \
- int lb2 = 0; \
- \
- conf = this->private; \
- lb2 = log_base2 (len); \
- ios_fd_ctx_get (fd, this, &iosfd); \
- if (!conf) \
- break; \
- \
- LOCK (&conf->lock); \
- { \
- conf->cumulative.data_read += len; \
- conf->incremental.data_read += len; \
- conf->cumulative.block_count_read[lb2]++; \
- conf->incremental.block_count_read[lb2]++; \
- \
- if (iosfd) { \
- iosfd->data_read += len; \
- iosfd->block_count_read[lb2]++; \
- } \
- } \
- UNLOCK (&conf->lock); \
- } while (0)
-
-
-#define BUMP_WRITE(fd, len) \
- do { \
- struct ios_conf *conf = NULL; \
- struct ios_fd *iosfd = NULL; \
- int lb2 = 0; \
- \
- conf = this->private; \
- lb2 = log_base2 (len); \
- ios_fd_ctx_get (fd, this, &iosfd); \
- if (!conf) \
- break; \
- \
- LOCK (&conf->lock); \
- { \
- conf->cumulative.data_written += len; \
- conf->incremental.data_written += len; \
- conf->cumulative.block_count_write[lb2]++; \
- conf->incremental.block_count_write[lb2]++; \
- \
- if (iosfd) { \
- iosfd->data_written += len; \
- iosfd->block_count_write[lb2]++; \
- } \
- } \
- UNLOCK (&conf->lock); \
- } while (0)
-
-
-int
-ios_fd_ctx_get (fd_t *fd, xlator_t *this, struct ios_fd **iosfd)
-{
- uint64_t iosfd64 = 0;
- unsigned long iosfdlong = 0;
- int ret = 0;
-
- ret = fd_ctx_get (fd, this, &iosfd64);
- iosfdlong = iosfd64;
- if (ret != -1)
- *iosfd = (void *) iosfdlong;
+struct volume_options options[];
- return ret;
+static int
+is_fop_latency_started(call_frame_t *frame)
+{
+ GF_ASSERT(frame);
+ struct timeval epoch = {
+ 0,
+ };
+ return memcmp(&frame->begin, &epoch, sizeof(epoch));
}
+#define _IOS_SAMP_DIR DEFAULT_LOG_FILE_DIRECTORY "/samples"
+#ifdef GF_LINUX_HOST_OS
+#define _IOS_DUMP_DIR DATADIR "/lib/glusterd/stats"
+#else
+#define _IOS_DUMP_DIR DATADIR "/db/glusterd/stats"
+#endif
-
-int
-ios_fd_ctx_set (fd_t *fd, xlator_t *this, struct ios_fd *iosfd)
+#define END_FOP_LATENCY(frame, op) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ \
+ conf = this->private; \
+ if (conf && conf->measure_latency) { \
+ timespec_now(&frame->end); \
+ update_ios_latency(conf, frame, GF_FOP_##op); \
+ } \
+ } while (0)
+
+#define START_FOP_LATENCY(frame) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ \
+ conf = this->private; \
+ if (conf && conf->measure_latency) { \
+ timespec_now(&frame->begin); \
+ } else { \
+ memset(&frame->begin, 0, sizeof(frame->begin)); \
+ } \
+ } while (0)
+
+#define BUMP_FOP(op) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ \
+ conf = this->private; \
+ if (!conf) \
+ break; \
+ GF_ATOMIC_INC(conf->cumulative.fop_hits[GF_FOP_##op]); \
+ GF_ATOMIC_INC(conf->incremental.fop_hits[GF_FOP_##op]); \
+ } while (0)
+
+#define UPDATE_PROFILE_STATS(frame, op) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ \
+ if (!is_fop_latency_started(frame)) \
+ break; \
+ conf = this->private; \
+ if (conf && conf->measure_latency && conf->count_fop_hits) { \
+ BUMP_FOP(op); \
+ timespec_now(&frame->end); \
+ update_ios_latency(conf, frame, GF_FOP_##op); \
+ } \
+ } while (0)
+
+#define BUMP_THROUGHPUT(iosstat, type) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ double elapsed; \
+ struct timespec *begin, *end; \
+ double throughput; \
+ int flag = 0; \
+ struct timeval tv = { \
+ 0, \
+ }; \
+ \
+ begin = &frame->begin; \
+ end = &frame->end; \
+ \
+ elapsed = gf_tsdiff(begin, end) / 1000.0; \
+ throughput = op_ret / elapsed; \
+ \
+ conf = this->private; \
+ gettimeofday(&tv, NULL); \
+ LOCK(&iosstat->lock); \
+ { \
+ if (iosstat->thru_counters[type].throughput <= throughput) { \
+ iosstat->thru_counters[type].throughput = throughput; \
+ memcpy(&iosstat->thru_counters[type].time, &tv, \
+ sizeof(struct timeval)); \
+ flag = 1; \
+ } \
+ } \
+ UNLOCK(&iosstat->lock); \
+ if (flag) \
+ ios_stat_add_to_list(&conf->thru_list[type], throughput, iosstat); \
+ } while (0)
+
+static int
+ios_fd_ctx_get(fd_t *fd, xlator_t *this, struct ios_fd **iosfd)
{
- uint64_t iosfd64 = 0;
- int ret = 0;
+ uint64_t iosfd64 = 0;
+ unsigned long iosfdlong = 0;
+ int ret = 0;
- iosfd64 = (unsigned long) iosfd;
- ret = fd_ctx_set (fd, this, iosfd64);
+ ret = fd_ctx_get(fd, this, &iosfd64);
+ iosfdlong = iosfd64;
+ if (ret != -1)
+ *iosfd = (void *)iosfdlong;
- return ret;
+ return ret;
}
+static int
+ios_fd_ctx_set(fd_t *fd, xlator_t *this, struct ios_fd *iosfd)
+{
+ uint64_t iosfd64 = 0;
+ int ret = 0;
-#define ios_log(this, logfp, fmt ...) \
- do { \
- if (logfp) { \
- fprintf (logfp, fmt); \
- fprintf (logfp, "\n"); \
- } \
- gf_log (this->name, GF_LOG_NORMAL, fmt); \
- } while (0)
+ iosfd64 = (unsigned long)iosfd;
+ ret = fd_ctx_set(fd, this, iosfd64);
+ return ret;
+}
-int
-io_stats_dump_global (xlator_t *this, struct ios_global_stats *stats,
- struct timeval *now, int interval, FILE *logfp)
+static int
+ios_stat_ref(struct ios_stat *iosstat)
{
- int i = 0;
+ uint64_t refcnt = 0;
+ refcnt = GF_ATOMIC_INC(iosstat->refcnt);
- if (interval == -1)
- ios_log (this, logfp, "=== Cumulative stats ===");
- else
- ios_log (this, logfp, "=== Interval %d stats ===",
- interval);
- ios_log (this, logfp, " Duration : %"PRId64"secs",
- (uint64_t) (now->tv_sec - stats->started_at.tv_sec));
- ios_log (this, logfp, " BytesRead : %"PRId64,
- stats->data_read);
- ios_log (this, logfp, " BytesWritten : %"PRId64,
- stats->data_written);
-
- for (i = 0; i < 32; i++) {
- if (stats->block_count_read[i])
- ios_log (this, logfp, " Read %06db+ : %"PRId64,
- (1 << i), stats->block_count_read[i]);
- }
+ return refcnt;
+}
- for (i = 0; i < 32; i++) {
- if (stats->block_count_write[i])
- ios_log (this, logfp, "Write %06db+ : %"PRId64,
- (1 << i), stats->block_count_write[i]);
+static int
+ios_stat_unref(struct ios_stat *iosstat)
+{
+ int cleanup = 0;
+ uint64_t refcnt = 0;
+
+ refcnt = GF_ATOMIC_DEC(iosstat->refcnt);
+ if (refcnt == 0) {
+ if (iosstat->filename) {
+ GF_FREE(iosstat->filename);
+ iosstat->filename = NULL;
}
+ cleanup = 1;
+ }
- for (i = 0; i < GF_FOP_MAXVALUE; i++) {
- if (stats->fop_hits[i] && !stats->latency[i].avg)
- ios_log (this, logfp, "%14s : %"PRId64,
- gf_fop_list[i], stats->fop_hits[i]);
- else if (stats->fop_hits[i] && stats->latency[i].avg)
- ios_log (this, logfp, "%14s : %"PRId64 ", latency"
- "(avg: %f, min: %f, max: %f)",
- gf_fop_list[i], stats->fop_hits[i],
- stats->latency[i].avg, stats->latency[i].min,
- stats->latency[i].max);
+ if (cleanup) {
+ LOCK_DESTROY(&iosstat->lock);
+ GF_FREE(iosstat);
+ iosstat = NULL;
+ }
+
+ return 0;
+}
+
+static int
+ios_stat_add_to_list(struct ios_stat_head *list_head, uint64_t value,
+ struct ios_stat *iosstat)
+{
+ struct ios_stat_list *new = NULL;
+ struct ios_stat_list *entry = NULL;
+ struct ios_stat_list *t = NULL;
+ struct ios_stat_list *list_entry = NULL;
+ struct ios_stat_list *tmp = NULL;
+ struct ios_stat_list *last = NULL;
+ struct ios_stat *stat = NULL;
+ int cnt = 0;
+ int found = 0;
+ int reposition = 0;
+ double min_count = 0;
+
+ LOCK(&list_head->lock);
+ {
+ if (list_head->min_cnt == 0)
+ list_head->min_cnt = value;
+ if ((list_head->members == MAX_LIST_MEMBERS) &&
+ (list_head->min_cnt > value))
+ goto out;
+
+ list_for_each_entry_safe(entry, t, &list_head->iosstats->list, list)
+ {
+ cnt++;
+ if (cnt == list_head->members)
+ last = entry;
+
+ if (!gf_uuid_compare(iosstat->gfid, entry->iosstat->gfid)) {
+ list_entry = entry;
+ found = cnt;
+ entry->value = value;
+ if (!reposition) {
+ if (cnt == list_head->members)
+ list_head->min_cnt = value;
+ goto out;
+ }
+ break;
+ } else if (entry->value <= value && !reposition) {
+ reposition = cnt;
+ tmp = entry;
+ if (cnt == list_head->members - 1)
+ min_count = entry->value;
+ }
}
+ if (found) {
+ list_del(&list_entry->list);
+ list_add_tail(&list_entry->list, &tmp->list);
+ if (min_count)
+ list_head->min_cnt = min_count;
+ goto out;
+ } else if (list_head->members == MAX_LIST_MEMBERS && reposition) {
+ new = GF_CALLOC(1, sizeof(*new), gf_io_stats_mt_ios_stat_list);
+ new->iosstat = iosstat;
+ new->value = value;
+ ios_stat_ref(iosstat);
+ list_add_tail(&new->list, &tmp->list);
+ if (last) {
+ stat = last->iosstat;
+ last->iosstat = NULL;
+ ios_stat_unref(stat);
+ list_del(&last->list);
+ GF_FREE(last);
+ }
+ if (reposition == MAX_LIST_MEMBERS)
+ list_head->min_cnt = value;
+ else if (min_count) {
+ list_head->min_cnt = min_count;
+ }
+ } else if (list_head->members < MAX_LIST_MEMBERS) {
+ new = GF_CALLOC(1, sizeof(*new), gf_io_stats_mt_ios_stat_list);
+ new->iosstat = iosstat;
+ new->value = value;
+ ios_stat_ref(iosstat);
+ if (reposition) {
+ list_add_tail(&new->list, &tmp->list);
+ } else {
+ list_add_tail(&new->list, &entry->list);
+ }
+ list_head->members++;
+ if (list_head->min_cnt > value)
+ list_head->min_cnt = value;
+ }
+ }
+out:
+ UNLOCK(&list_head->lock);
+ return 0;
+}
- return 0;
+static void
+ios_bump_read(xlator_t *this, fd_t *fd, size_t len)
+{
+ struct ios_conf *conf = NULL;
+ struct ios_fd *iosfd = NULL;
+ int lb2 = 0;
+
+ conf = this->private;
+ lb2 = log_base2(len);
+ ios_fd_ctx_get(fd, this, &iosfd);
+ if (!conf)
+ return;
+
+ GF_ATOMIC_ADD(conf->cumulative.data_read, len);
+ GF_ATOMIC_ADD(conf->incremental.data_read, len);
+ GF_ATOMIC_INC(conf->cumulative.block_count_read[lb2]);
+ GF_ATOMIC_INC(conf->incremental.block_count_read[lb2]);
+
+ if (iosfd) {
+ GF_ATOMIC_ADD(iosfd->data_read, len);
+ GF_ATOMIC_INC(iosfd->block_count_read[lb2]);
+ }
}
+static void
+ios_bump_write(xlator_t *this, fd_t *fd, size_t len)
+{
+ struct ios_conf *conf = NULL;
+ struct ios_fd *iosfd = NULL;
+ int lb2 = 0;
+
+ conf = this->private;
+ lb2 = log_base2(len);
+ ios_fd_ctx_get(fd, this, &iosfd);
+ if (!conf)
+ return;
+
+ GF_ATOMIC_ADD(conf->cumulative.data_written, len);
+ GF_ATOMIC_ADD(conf->incremental.data_written, len);
+ GF_ATOMIC_INC(conf->cumulative.block_count_write[lb2]);
+ GF_ATOMIC_INC(conf->incremental.block_count_write[lb2]);
+
+ if (iosfd) {
+ GF_ATOMIC_ADD(iosfd->data_written, len);
+ GF_ATOMIC_INC(iosfd->block_count_write[lb2]);
+ }
+}
+
+static void
+ios_bump_upcall(xlator_t *this, gf_upcall_flags_t event)
+{
+ struct ios_conf *conf = NULL;
+
+ conf = this->private;
+ if (!conf)
+ return;
+ if (conf->count_fop_hits) {
+ GF_ATOMIC_INC(conf->cumulative.upcall_hits[event]);
+ GF_ATOMIC_INC(conf->incremental.upcall_hits[event]);
+ }
+}
+
+static void
+ios_bump_stats(xlator_t *this, struct ios_stat *iosstat, ios_stats_type_t type)
+{
+ struct ios_conf *conf = NULL;
+ uint64_t value = 0;
+
+ conf = this->private;
+
+ value = GF_ATOMIC_INC(iosstat->counters[type]);
+ ios_stat_add_to_list(&conf->list[type], value, iosstat);
+}
int
-io_stats_dump (xlator_t *this, char *filename, inode_t *inode,
- const char *path)
+ios_inode_ctx_set(inode_t *inode, xlator_t *this, struct ios_stat *iosstat)
{
- struct ios_conf *conf = NULL;
- struct ios_global_stats cumulative = {0, };
- struct ios_global_stats incremental = {0, };
- int increment = 0;
- struct timeval now;
- FILE *logfp = NULL;
+ uint64_t iosstat64 = 0;
+ int ret = 0;
- conf = this->private;
+ ios_stat_ref(iosstat);
+ iosstat64 = (unsigned long)iosstat;
+ ret = inode_ctx_put(inode, this, iosstat64);
+ return ret;
+}
- gettimeofday (&now, NULL);
- LOCK (&conf->lock);
- {
- cumulative = conf->cumulative;
- incremental = conf->incremental;
+int
+ios_inode_ctx_get(inode_t *inode, xlator_t *this, struct ios_stat **iosstat)
+{
+ uint64_t iosstat64 = 0;
+ unsigned long iosstatlong = 0;
+ int ret = 0;
- increment = conf->increment++;
+ ret = inode_ctx_get(inode, this, &iosstat64);
+ iosstatlong = iosstat64;
+ if (ret != -1)
+ *iosstat = (void *)iosstatlong;
- memset (&conf->incremental, 0, sizeof (conf->incremental));
- conf->incremental.started_at = now;
- }
- UNLOCK (&conf->lock);
+ return ret;
+}
- logfp = fopen (filename, "w+");
- io_stats_dump_global (this, &cumulative, &now, -1, logfp);
- io_stats_dump_global (this, &incremental, &now, increment, logfp);
+/*
+ * So why goto all this trouble? Why not just queue up some samples in
+ * a big list and malloc away? Well malloc is expensive relative
+ * to what we are measuring, so cannot have any malloc's (or worse
+ * callocs) in our measurement code paths. Instead, we are going to
+ * pre-allocate a circular buffer and collect a maximum number of samples.
+ * Prior to dumping them all we'll create a new buffer and swap the
+ * old buffer with the new, and then proceed to dump the statistics
+ * in our dump thread.
+ *
+ */
+ios_sample_buf_t *
+ios_create_sample_buf(size_t buf_size)
+{
+ ios_sample_buf_t *ios_sample_buf = NULL;
+ ios_sample_t *ios_samples = NULL;
+
+ ios_sample_buf = GF_CALLOC(1, sizeof(*ios_sample_buf),
+ gf_io_stats_mt_ios_sample_buf);
+ if (!ios_sample_buf)
+ goto err;
+
+ ios_samples = GF_CALLOC(buf_size, sizeof(*ios_samples),
+ gf_io_stats_mt_ios_sample);
+
+ if (!ios_samples)
+ goto err;
+
+ ios_sample_buf->ios_samples = ios_samples;
+ ios_sample_buf->size = buf_size;
+ ios_sample_buf->pos = 0;
+ ios_sample_buf->observed = 0;
+ ios_sample_buf->collected = 0;
+
+ return ios_sample_buf;
+err:
+ GF_FREE(ios_sample_buf);
+ return NULL;
+}
- if (logfp)
- fclose (logfp);
+void
+ios_destroy_sample_buf(ios_sample_buf_t *ios_sample_buf)
+{
+ GF_FREE(ios_sample_buf->ios_samples);
+ GF_FREE(ios_sample_buf);
+}
+
+static int
+ios_init_sample_buf(struct ios_conf *conf)
+{
+ int32_t ret = -1;
+
+ GF_ASSERT(conf);
+ LOCK(&conf->lock);
+ conf->ios_sample_buf = ios_create_sample_buf(conf->ios_sample_buf_size);
+ if (!conf->ios_sample_buf)
+ goto out;
+ ret = 0;
+out:
+ UNLOCK(&conf->lock);
+ return ret;
+}
+
+static int
+ios_stats_cleanup(xlator_t *this, inode_t *inode)
+{
+ struct ios_stat *iosstat = NULL;
+ uint64_t iosstat64 = 0;
+
+ inode_ctx_del(inode, this, &iosstat64);
+ if (!iosstat64) {
+ gf_log(this->name, GF_LOG_WARNING, "could not get inode ctx");
return 0;
+ }
+ iosstat = (void *)(long)iosstat64;
+ if (iosstat) {
+ ios_stat_unref(iosstat);
+ }
+ return 0;
}
+#define ios_log(this, logfp, fmt...) \
+ do { \
+ if (logfp) { \
+ fprintf(logfp, fmt); \
+ fprintf(logfp, "\n"); \
+ } \
+ gf_log(this->name, GF_LOG_DEBUG, fmt); \
+ } while (0)
int
-io_stats_dump_fd (xlator_t *this, struct ios_fd *iosfd)
+ios_dump_file_stats(struct ios_stat_head *list_head, xlator_t *this,
+ FILE *logfp)
{
- struct ios_conf *conf = NULL;
- struct timeval now;
- uint64_t sec = 0;
- uint64_t usec = 0;
- int i = 0;
+ struct ios_stat_list *entry = NULL;
- conf = this->private;
+ LOCK(&list_head->lock);
+ {
+ list_for_each_entry(entry, &list_head->iosstats->list, list)
+ {
+ ios_log(this, logfp, "%-12.0f %s", entry->value,
+ entry->iosstat->filename);
+ }
+ }
+ UNLOCK(&list_head->lock);
+ return 0;
+}
+
+int
+ios_dump_throughput_stats(struct ios_stat_head *list_head, xlator_t *this,
+ FILE *logfp, ios_stats_thru_t type)
+{
+ struct ios_stat_list *entry = NULL;
+ char timestr[GF_TIMESTR_SIZE] = {
+ 0,
+ };
+
+ LOCK(&list_head->lock);
+ {
+ list_for_each_entry(entry, &list_head->iosstats->list, list)
+ {
+ gf_time_fmt_tv(timestr, sizeof timestr,
+ &entry->iosstat->thru_counters[type].time,
+ gf_timefmt_FT);
+
+ ios_log(this, logfp, "%s \t %-10.2f \t %s", timestr, entry->value,
+ entry->iosstat->filename);
+ }
+ }
+ UNLOCK(&list_head->lock);
+ return 0;
+}
- if (!conf->dump_fd_stats)
- return 0;
+int
+_io_stats_get_key_prefix(xlator_t *this, char **key_prefix)
+{
+ char *key_root = "gluster";
+ char *xlator_name = NULL;
+ char *instance_name = NULL;
+ size_t key_len = 0;
+ int bytes_written = 0;
+ int i = 0;
+ int ret = 0;
+ struct ios_conf *conf = this->private;
+
+ xlator_name = strdupa(conf->unique_id);
+ for (i = 0; i < strlen(xlator_name); i++) {
+ if (xlator_name[i] == '/')
+ xlator_name[i] = '_';
+ }
+
+ instance_name = this->instance_name;
+ if (this->name && strcmp(this->name, "glustershd") == 0) {
+ xlator_name = "shd";
+ } else if (this->prev && strcmp(this->prev->name, "nfs-server") == 0) {
+ xlator_name = "nfsd";
+ if (this->prev->instance_name)
+ instance_name = strdupa(this->prev->instance_name);
+ }
+
+ if (strcmp(__progname, "glusterfsd") == 0)
+ key_root = "gluster.brick";
+
+ if (instance_name) {
+ /* +3 for 2 x "." + NULL */
+ key_len = strlen(key_root) + strlen(xlator_name) +
+ strlen(instance_name) + 3;
+ *key_prefix = GF_CALLOC(key_len, sizeof(char), gf_common_mt_char);
+ if (!*key_prefix) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ bytes_written = snprintf(*key_prefix, key_len, "%s.%s.%s", key_root,
+ xlator_name, instance_name);
+ if (bytes_written != key_len - 1) {
+ ret = -EINVAL;
+ goto err;
+ }
+ } else {
+ /* +2 for 1 x "." + NULL */
+ key_len = strlen(key_root) + strlen(xlator_name) + 2;
+ *key_prefix = GF_CALLOC(key_len, sizeof(char), gf_common_mt_char);
+ if (!*key_prefix) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ bytes_written = snprintf(*key_prefix, key_len, "%s.%s", key_root,
+ xlator_name);
+ if (bytes_written != key_len - 1) {
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+ return 0;
+err:
+ GF_FREE(*key_prefix);
+ *key_prefix = NULL;
+ return ret;
+}
- if (!iosfd)
- return 0;
+int
+io_stats_dump_global_to_json_logfp(xlator_t *this,
+ struct ios_global_stats *stats, time_t now,
+ int interval, FILE *logfp)
+{
+ int i = 0;
+ int j = 0;
+ struct ios_conf *conf = NULL;
+ char *key_prefix = NULL;
+ char *str_prefix = NULL;
+ char *lc_fop_name = NULL;
+ int ret = 1; /* Default to error */
+ int rw_size;
+ char *rw_unit = NULL;
+ uint64_t fop_hits;
+ float fop_lat_ave;
+ float fop_lat_min;
+ float fop_lat_max;
+ double interval_sec;
+ double fop_ave_usec = 0.0;
+ double fop_ave_usec_sum = 0.0;
+ double weighted_fop_ave_usec = 0.0;
+ double weighted_fop_ave_usec_sum = 0.0;
+ long total_fop_hits = 0;
+ loc_t unused_loc = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+
+ interval_sec = (double)(now - stats->started_at);
+
+ conf = this->private;
+
+ ret = _io_stats_get_key_prefix(this, &key_prefix);
+ if (ret) {
+ goto out;
+ }
+
+ if (interval == -1) {
+ str_prefix = "aggr";
+
+ } else {
+ str_prefix = "inter";
+ }
+ ios_log(this, logfp, "{");
+
+ for (i = 0; i < 31; i++) {
+ rw_size = (1 << i);
+ if (rw_size >= 1024 * 1024) {
+ rw_size = rw_size / (1024 * 1024);
+ rw_unit = "mb";
+ } else if (rw_size >= 1024) {
+ rw_size = rw_size / 1024;
+ rw_unit = "kb";
+ } else {
+ rw_unit = "b";
+ }
- gettimeofday (&now, NULL);
+ if (interval == -1) {
+ ios_log(this, logfp, "\"%s.%s.read_%d%s\": %" GF_PRI_ATOMIC ",",
+ key_prefix, str_prefix, rw_size, rw_unit,
+ GF_ATOMIC_GET(stats->block_count_read[i]));
+ ios_log(this, logfp, "\"%s.%s.write_%d%s\": %" GF_PRI_ATOMIC ",",
+ key_prefix, str_prefix, rw_size, rw_unit,
+ GF_ATOMIC_GET(stats->block_count_write[i]));
+ } else {
+ ios_log(this, logfp, "\"%s.%s.read_%d%s_per_sec\": %0.2lf,",
+ key_prefix, str_prefix, rw_size, rw_unit,
+ (double)(GF_ATOMIC_GET(stats->block_count_read[i]) /
+ interval_sec));
+ ios_log(this, logfp, "\"%s.%s.write_%d%s_per_sec\": %0.2lf,",
+ key_prefix, str_prefix, rw_size, rw_unit,
+ (double)(GF_ATOMIC_GET(stats->block_count_write[i]) /
+ interval_sec));
+ }
+ }
+
+ if (interval == -1) {
+ ios_log(this, logfp, "\"%s.%s.fds.open_count\": %" PRId64 ",",
+ key_prefix, str_prefix, conf->cumulative.nr_opens);
+ ios_log(this, logfp, "\"%s.%s.fds.max_open_count\": %" PRId64 ",",
+ key_prefix, str_prefix, conf->cumulative.max_nr_opens);
+ }
+
+ for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+ lc_fop_name = strdupa(gf_fop_list[i]);
+ for (j = 0; lc_fop_name[j]; j++) {
+ lc_fop_name[j] = tolower(lc_fop_name[j]);
+ }
- if (iosfd->opened_at.tv_usec > now.tv_usec) {
- now.tv_usec += 1000000;
- now.tv_usec--;
+ fop_hits = GF_ATOMIC_GET(stats->fop_hits[i]);
+ fop_lat_ave = 0.0;
+ fop_lat_min = 0.0;
+ fop_lat_max = 0.0;
+ if (fop_hits) {
+ if (stats->latency[i].avg) {
+ fop_lat_ave = stats->latency[i].avg;
+ fop_lat_min = stats->latency[i].min;
+ fop_lat_max = stats->latency[i].max;
+ }
+ }
+ if (interval == -1) {
+ ios_log(this, logfp, "\"%s.%s.fop.%s.count\": %" GF_PRI_ATOMIC ",",
+ key_prefix, str_prefix, lc_fop_name, fop_hits);
+ } else {
+ ios_log(this, logfp, "\"%s.%s.fop.%s.per_sec\": %0.2lf,",
+ key_prefix, str_prefix, lc_fop_name,
+ (double)(fop_hits / interval_sec));
}
- sec = now.tv_sec - iosfd->opened_at.tv_sec;
- usec = now.tv_usec - iosfd->opened_at.tv_usec;
-
- gf_log (this->name, GF_LOG_NORMAL,
- "--- fd stats ---");
-
- if (iosfd->filename)
- gf_log (this->name, GF_LOG_NORMAL,
- " Filename : %s",
- iosfd->filename);
-
- if (sec)
- gf_log (this->name, GF_LOG_NORMAL,
- " Lifetime : %"PRId64"secs, %"PRId64"usecs",
- sec, usec);
-
- if (iosfd->data_read)
- gf_log (this->name, GF_LOG_NORMAL,
- " BytesRead : %"PRId64" bytes",
- iosfd->data_read);
-
- if (iosfd->data_written)
- gf_log (this->name, GF_LOG_NORMAL,
- " BytesWritten : %"PRId64" bytes",
- iosfd->data_written);
-
- for (i = 0; i < 32; i++) {
- if (iosfd->block_count_read[i])
- gf_log (this->name, GF_LOG_NORMAL,
- " Read %06db+ : %"PRId64,
- (1 << i), iosfd->block_count_read[i]);
+ ios_log(this, logfp, "\"%s.%s.fop.%s.latency_ave_usec\": %0.2lf,",
+ key_prefix, str_prefix, lc_fop_name, fop_lat_ave);
+ ios_log(this, logfp, "\"%s.%s.fop.%s.latency_min_usec\": %0.2lf,",
+ key_prefix, str_prefix, lc_fop_name, fop_lat_min);
+ ios_log(this, logfp, "\"%s.%s.fop.%s.latency_max_usec\": %0.2lf,",
+ key_prefix, str_prefix, lc_fop_name, fop_lat_max);
+
+ fop_ave_usec_sum += fop_lat_ave;
+ weighted_fop_ave_usec_sum += fop_hits * fop_lat_ave;
+ total_fop_hits += fop_hits;
+ }
+
+ if (total_fop_hits) {
+ weighted_fop_ave_usec = weighted_fop_ave_usec_sum / total_fop_hits;
+ /* Extra key that does not print out an entry w/ 0.00 for
+ * intervals with no data
+ */
+ ios_log(this, logfp,
+ "\"%s.%s.fop.weighted_latency_ave_usec_nozerofill\": "
+ "%0.4lf,",
+ key_prefix, str_prefix, weighted_fop_ave_usec);
+ }
+ ios_log(this, logfp, "\"%s.%s.fop.weighted_latency_ave_usec\": %0.4lf,",
+ key_prefix, str_prefix, weighted_fop_ave_usec);
+ ios_log(this, logfp, "\"%s.%s.fop.weighted_fop_count\": %ld,", key_prefix,
+ str_prefix, total_fop_hits);
+
+ fop_ave_usec = fop_ave_usec_sum / GF_FOP_MAXVALUE;
+ ios_log(this, logfp, "\"%s.%s.fop.unweighted_latency_ave_usec\":%0.4lf,",
+ key_prefix, str_prefix, fop_ave_usec);
+
+ for (i = 0; i < GF_UPCALL_FLAGS_MAXVALUE; i++) {
+ lc_fop_name = strdupa(gf_upcall_list[i]);
+ for (j = 0; lc_fop_name[j]; j++) {
+ lc_fop_name[j] = tolower(lc_fop_name[j]);
}
- for (i = 0; i < 32; i++) {
- if (iosfd->block_count_write[i])
- gf_log (this->name, GF_LOG_NORMAL,
- "Write %06db+ : %"PRId64,
- (1 << i), iosfd->block_count_write[i]);
+ fop_hits = GF_ATOMIC_GET(stats->upcall_hits[i]);
+ if (interval == -1) {
+ ios_log(this, logfp, "\"%s.%s.fop.%s.count\": %" GF_PRI_ATOMIC ",",
+ key_prefix, str_prefix, lc_fop_name, fop_hits);
+ } else {
+ ios_log(this, logfp, "\"%s.%s.fop.%s.per_sec\": %0.2lf,",
+ key_prefix, str_prefix, lc_fop_name,
+ (double)(fop_hits / interval_sec));
}
- return 0;
+ }
+
+ ret = syncop_getxattr(this, &unused_loc, &xattr, IO_THREADS_QUEUE_SIZE_KEY,
+ NULL, NULL);
+ if (xattr) {
+ /*
+ * Iterate over the dictionary returned to us by io-threads and
+ * dump the results to the stats file.
+ */
+ data_pair_t *curr = NULL;
+
+ dict_foreach_inline(xattr, curr)
+ {
+ ios_log(this, logfp, "\"%s.%s.%s.queue_size\": %d,", key_prefix,
+ str_prefix, curr->key, data_to_int32(curr->value));
+ }
+
+ /* Free the dictionary */
+ dict_unref(xattr);
+ } else {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Unable to get queue size counts from "
+ "the io-threads translator!");
+ }
+
+ if (interval == -1) {
+ ios_log(this, logfp, "\"%s.%s.uptime\": %" PRIu64 ",", key_prefix,
+ str_prefix, (uint64_t)(now - stats->started_at));
+ ios_log(this, logfp,
+ "\"%s.%s.bytes_read\": "
+ "%" GF_PRI_ATOMIC ",",
+ key_prefix, str_prefix, GF_ATOMIC_GET(stats->data_read));
+ ios_log(this, logfp,
+ "\"%s.%s.bytes_written\": "
+ "%" GF_PRI_ATOMIC "",
+ key_prefix, str_prefix, GF_ATOMIC_GET(stats->data_written));
+ } else {
+ ios_log(this, logfp, "\"%s.%s.sample_interval_sec\": %0.2lf,",
+ key_prefix, str_prefix, interval_sec);
+ ios_log(this, logfp, "\"%s.%s.bytes_read_per_sec\": %0.2lf,",
+ key_prefix, str_prefix,
+ (double)(GF_ATOMIC_GET(stats->data_read) / interval_sec));
+ ios_log(this, logfp, "\"%s.%s.bytes_written_per_sec\": %0.2lf",
+ key_prefix, str_prefix,
+ (double)(GF_ATOMIC_GET(stats->data_written) / interval_sec));
+ }
+
+ ios_log(this, logfp, "}");
+ ret = 0;
+out:
+ GF_FREE(key_prefix);
+ return ret;
}
-int
-update_ios_latency (struct ios_conf *conf, call_frame_t *frame,
- glusterfs_fop_t op)
+char *
+_resolve_username(xlator_t *this, uid_t uid)
{
- double elapsed;
- struct timeval *begin, *end;
- double avg;
+ struct passwd pwd;
+ struct passwd *pwd_result = NULL;
+ size_t pwd_buf_len;
+ char *pwd_buf = NULL;
+ char *ret = NULL;
+
+ /* Prepare our buffer for the uid->username translation */
+#ifdef _SC_GETGR_R_SIZE_MAX
+ pwd_buf_len = sysconf(_SC_GETGR_R_SIZE_MAX);
+#else
+ pwd_buf_len = -1;
+#endif
+ if (pwd_buf_len == -1) {
+ pwd_buf_len = DEFAULT_PWD_BUF_SZ; /* per the man page */
+ }
- begin = &frame->begin;
- end = &frame->end;
+ pwd_buf = alloca(pwd_buf_len);
+ if (!pwd_buf)
+ goto err;
- elapsed = (end->tv_sec - begin->tv_sec) * 1e6
- + (end->tv_usec - begin->tv_usec);
+ getpwuid_r(uid, &pwd, pwd_buf, pwd_buf_len, &pwd_result);
+ if (!pwd_result)
+ goto err;
- /* Cumulative */
- if (!conf->cumulative.latency[op].min)
- conf->cumulative.latency[op].min = elapsed;
- if (conf->cumulative.latency[op].min > elapsed)
- conf->cumulative.latency[op].min = elapsed;
- if (conf->cumulative.latency[op].max < elapsed)
- conf->cumulative.latency[op].max = elapsed;
+ ret = gf_strdup(pwd.pw_name);
+ if (ret)
+ return ret;
+ else
+ gf_log(this->name, GF_LOG_ERROR,
+ "gf_strdup failed, failing username "
+ "resolution.");
+err:
+ return ret;
+}
- avg = conf->cumulative.latency[op].avg;
+char *
+_resolve_group_name(xlator_t *this, gid_t gid)
+{
+ struct group grp;
+ struct group *grp_result = NULL;
+ size_t grp_buf_len;
+ char *grp_buf = NULL;
+ char *ret = NULL;
+
+ /* Prepare our buffer for the gid->group name translation */
+#ifdef _SC_GETGR_R_SIZE_MAX
+ grp_buf_len = sysconf(_SC_GETGR_R_SIZE_MAX);
+#else
+ grp_buf_len = -1;
+#endif
+ if (grp_buf_len == -1) {
+ grp_buf_len = DEFAULT_GRP_BUF_SZ; /* per the man page */
+ }
- conf->cumulative.latency[op].avg =
- avg + (elapsed - avg) / conf->cumulative.fop_hits[op];
+ grp_buf = alloca(grp_buf_len);
+ if (!grp_buf) {
+ goto err;
+ }
- /* Incremental */
- if (!conf->incremental.latency[op].min)
- conf->incremental.latency[op].min = elapsed;
- if (conf->incremental.latency[op].min > elapsed)
- conf->incremental.latency[op].min = elapsed;
- if (conf->incremental.latency[op].max < elapsed)
- conf->incremental.latency[op].max = elapsed;
+ if (getgrgid_r(gid, &grp, grp_buf, grp_buf_len, &grp_result) != 0)
+ goto err;
- avg = conf->incremental.latency[op].avg;
+ if (!grp_result)
+ goto err;
- conf->incremental.latency[op].avg =
- avg + (elapsed - avg) / conf->incremental.fop_hits[op];
+ ret = gf_strdup(grp.gr_name);
+ if (ret)
+ return ret;
+ else
+ gf_log(this->name, GF_LOG_ERROR,
+ "gf_strdup failed, failing username "
+ "resolution.");
+err:
+ return ret;
+}
- return 0;
+/*
+ * This function writes out a latency sample to a given file descriptor
+ * and beautifies the output in the process.
+ */
+void
+_io_stats_write_latency_sample(xlator_t *this, ios_sample_t *sample,
+ FILE *logfp)
+{
+ double epoch_time = 0.00;
+ char *xlator_name = NULL;
+ char *instance_name = NULL;
+ char *hostname = NULL;
+ char *identifier = NULL;
+ char *port = NULL;
+ char *port_pos = NULL;
+ char *group_name = NULL;
+ char *username = NULL;
+ struct ios_conf *conf = NULL;
+
+ conf = this->private;
+
+ epoch_time = (sample->timestamp).tv_sec +
+ ((sample->timestamp).tv_usec / 1000000.0);
+
+ if (strlen(sample->identifier) == 0) {
+ hostname = "Unknown";
+ port = "Unknown";
+ } else {
+ identifier = strdupa(sample->identifier);
+ port_pos = strrchr(identifier, ':');
+ if (!port_pos || strlen(port_pos) < 2)
+ goto err;
+ port = strdupa(port_pos + 1);
+ if (!port)
+ goto err;
+ *port_pos = '\0';
+ hostname = gf_rev_dns_lookup_cached(identifier, conf->dnscache);
+ if (!hostname)
+ hostname = "Unknown";
+ }
+
+ xlator_name = conf->unique_id;
+ if (!xlator_name || strlen(xlator_name) == 0)
+ xlator_name = "Unknown";
+
+ instance_name = this->instance_name;
+ if (!instance_name || strlen(instance_name) == 0)
+ instance_name = "N/A";
+
+ /* Resolve the UID to a string username */
+ username = _resolve_username(this, sample->uid);
+ if (!username) {
+ username = GF_MALLOC(30, gf_common_mt_char);
+ if (!username) {
+ goto out;
+ }
+ sprintf(username, "%d", (int32_t)sample->uid);
+ }
+
+ /* Resolve the GID to a string group name */
+ group_name = _resolve_group_name(this, sample->gid);
+ if (!group_name) {
+ group_name = GF_MALLOC(30, gf_common_mt_char);
+ if (!group_name) {
+ goto out;
+ }
+ sprintf(group_name, "%d", (int32_t)sample->gid);
+ }
+
+ ios_log(this, logfp, "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s", epoch_time,
+ fop_enum_to_pri_string(sample->fop_type),
+ gf_fop_string(sample->fop_type), sample->elapsed, xlator_name,
+ instance_name, username, group_name, hostname, port);
+ goto out;
+err:
+ gf_log(this->name, GF_LOG_ERROR, "Error parsing socket identifier");
+out:
+ GF_FREE(group_name);
+ GF_FREE(username);
}
+/*
+ * Takes our current sample buffer in conf->io_sample_buf, and saves
+ * a reference to this, init's a new buffer, and then dumps out the
+ * contents of the saved reference.
+ */
int
-io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+io_stats_dump_latency_samples_logfp(xlator_t *this, FILE *logfp)
{
- struct ios_fd *iosfd = NULL;
- char *path = NULL;
+ uint64_t i = 0;
+ struct ios_conf *conf = NULL;
+ ios_sample_buf_t *sample_buf = NULL;
+ int ret = 1; /* Default to error */
+
+ conf = this->private;
+
+ /* Save pointer to old buffer; the CS equivalent of
+ * Indiana Jones: https://www.youtube.com/watch?v=Pr-8AP0To4k,
+ * though ours will end better I hope!
+ */
+ sample_buf = conf->ios_sample_buf;
+ if (!sample_buf) {
+ gf_log(this->name, GF_LOG_WARNING, "Sampling buffer is null, bailing!");
+ goto out;
+ }
+
+ /* Empty case, nothing to do, exit. */
+ if (sample_buf->collected == 0) {
+ gf_log(this->name, GF_LOG_DEBUG, "No samples, dump not required.");
+ ret = 0;
+ goto out;
+ }
+
+ /* Init a new buffer, so we are free to work on the one we saved a
+ * reference to above.
+ */
+ if (ios_init_sample_buf(conf) != 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Failed to init new sampling buffer, out of memory?");
+ goto out;
+ }
+
+ /* Wrap-around case, dump from pos to sample_buf->size -1
+ * and then from 0 to sample_buf->pos (covered off by
+ * "simple case")
+ */
+ if (sample_buf->collected > sample_buf->pos + 1) {
+ for (i = sample_buf->pos; i < sample_buf->size; i++) {
+ _io_stats_write_latency_sample(this, &(sample_buf->ios_samples[i]),
+ logfp);
+ }
+ }
- path = frame->local;
- frame->local = NULL;
+ /* Simple case: Dump from 0 to sample_buf->pos */
+ for (i = 0; i < sample_buf->pos; i++) {
+ _io_stats_write_latency_sample(this, &(sample_buf->ios_samples[i]),
+ logfp);
+ }
+ ios_destroy_sample_buf(sample_buf);
- if (!path)
- goto unwind;
+out:
+ return ret;
+}
- if (op_ret < 0) {
- GF_FREE (path);
- goto unwind;
+int
+io_stats_dump_global_to_logfp(xlator_t *this, struct ios_global_stats *stats,
+ time_t now, int interval, FILE *logfp)
+{
+ int i = 0;
+ int per_line = 0;
+ int index = 0;
+ struct ios_stat_head *list_head = NULL;
+ struct ios_conf *conf = NULL;
+ char timestr[GF_TIMESTR_SIZE] = {
+ 0,
+ };
+ char str_header[128] = {0};
+ char str_read[128] = {0};
+ char str_write[128] = {0};
+ uint64_t fop_hits = 0;
+ uint64_t block_count_read = 0;
+ uint64_t block_count_write = 0;
+
+ conf = this->private;
+
+ if (interval == -1)
+ ios_log(this, logfp, "\n=== Cumulative stats ===");
+ else
+ ios_log(this, logfp, "\n=== Interval %d stats ===", interval);
+ ios_log(this, logfp, " Duration : %" PRIu64 " secs",
+ (uint64_t)(now - stats->started_at));
+ ios_log(this, logfp, " BytesRead : %" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(stats->data_read));
+ ios_log(this, logfp, " BytesWritten : %" GF_PRI_ATOMIC "\n",
+ GF_ATOMIC_GET(stats->data_written));
+
+ snprintf(str_header, sizeof(str_header), "%-12s %c", "Block Size", ':');
+ snprintf(str_read, sizeof(str_read), "%-12s %c", "Read Count", ':');
+ snprintf(str_write, sizeof(str_write), "%-12s %c", "Write Count", ':');
+ index = 14;
+ for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+ block_count_read = GF_ATOMIC_GET(stats->block_count_read[i]);
+ block_count_write = GF_ATOMIC_GET(stats->block_count_write[i]);
+ if ((block_count_read == 0) && (block_count_write == 0))
+ continue;
+ per_line++;
+
+ snprintf(str_header + index, sizeof(str_header) - index, "%16dB+",
+ (1 << i));
+ if (block_count_read)
+ snprintf(str_read + index, sizeof(str_read) - index, "%18" PRId64,
+ block_count_read);
+ else
+ snprintf(str_read + index, sizeof(str_read) - index, "%18s", "0");
+ if (block_count_write)
+ snprintf(str_write + index, sizeof(str_write) - index,
+ "%18" GF_PRI_ATOMIC, block_count_write);
+ else
+ snprintf(str_write + index, sizeof(str_write) - index, "%18s", "0");
+
+ index += 18;
+ if (per_line == 3) {
+ ios_log(this, logfp, "%s", str_header);
+ ios_log(this, logfp, "%s", str_read);
+ ios_log(this, logfp, "%s\n", str_write);
+
+ snprintf(str_header, sizeof(str_header), "%-12s %c", "Block Size",
+ ':');
+ snprintf(str_read, sizeof(str_read), "%-12s %c", "Read Count", ':');
+ snprintf(str_write, sizeof(str_write), "%-12s %c", "Write Count",
+ ':');
+
+ index = 14;
+ per_line = 0;
}
-
- iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd);
- if (!iosfd) {
- GF_FREE (path);
- goto unwind;
+ }
+
+ if (per_line != 0) {
+ ios_log(this, logfp, "%s", str_header);
+ ios_log(this, logfp, "%s", str_read);
+ ios_log(this, logfp, "%s\n", str_write);
+ }
+
+ ios_log(this, logfp, "%-13s %10s %14s %14s %14s", "Fop", "Call Count",
+ "Avg-Latency", "Min-Latency", "Max-Latency");
+ ios_log(this, logfp, "%-13s %10s %14s %14s %14s", "---", "----------",
+ "-----------", "-----------", "-----------");
+
+ for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+ fop_hits = GF_ATOMIC_GET(stats->fop_hits[i]);
+ if (fop_hits && !stats->latency[i].avg)
+ ios_log(this, logfp,
+ "%-13s %10" GF_PRI_ATOMIC
+ " %11s "
+ "us %11s us %11s us",
+ gf_fop_list[i], fop_hits, "0", "0", "0");
+ else if (fop_hits && stats->latency[i].avg)
+ ios_log(this, logfp,
+ "%-13s %10" GF_PRI_ATOMIC
+ " "
+ "%11.2lf us %11.2lf us %11.2lf us",
+ gf_fop_list[i], fop_hits, stats->latency[i].avg,
+ stats->latency[i].min, stats->latency[i].max);
+ }
+
+ for (i = 0; i < GF_UPCALL_FLAGS_MAXVALUE; i++) {
+ fop_hits = GF_ATOMIC_GET(stats->upcall_hits[i]);
+ if (fop_hits)
+ ios_log(this, logfp,
+ "%-13s %10" PRId64
+ " %11s "
+ "us %11s us %11s us",
+ gf_upcall_list[i], fop_hits, "0", "0", "0");
+ }
+
+ ios_log(this, logfp,
+ "------ ----- ----- ----- ----- ----- ----- ----- "
+ " ----- ----- ----- -----\n");
+
+ if (interval == -1) {
+ LOCK(&conf->lock);
+ {
+ gf_time_fmt_tv(timestr, sizeof timestr,
+ &conf->cumulative.max_openfd_time, gf_timefmt_FT);
+ ios_log(this, logfp,
+ "Current open fd's: %" PRId64 " Max open fd's: %" PRId64
+ " time %s",
+ conf->cumulative.nr_opens, conf->cumulative.max_nr_opens,
+ timestr);
}
+ UNLOCK(&conf->lock);
+ ios_log(this, logfp, "\n==========Open File Stats========");
+ ios_log(this, logfp, "\nCOUNT: \t FILE NAME");
+ list_head = &conf->list[IOS_STATS_TYPE_OPEN];
+ ios_dump_file_stats(list_head, this, logfp);
+
+ ios_log(this, logfp, "\n==========Read File Stats========");
+ ios_log(this, logfp, "\nCOUNT: \t FILE NAME");
+ list_head = &conf->list[IOS_STATS_TYPE_READ];
+ ios_dump_file_stats(list_head, this, logfp);
+
+ ios_log(this, logfp, "\n==========Write File Stats========");
+ ios_log(this, logfp, "\nCOUNT: \t FILE NAME");
+ list_head = &conf->list[IOS_STATS_TYPE_WRITE];
+ ios_dump_file_stats(list_head, this, logfp);
+
+ ios_log(this, logfp, "\n==========Directory open stats========");
+ ios_log(this, logfp, "\nCOUNT: \t DIRECTORY NAME");
+ list_head = &conf->list[IOS_STATS_TYPE_OPENDIR];
+ ios_dump_file_stats(list_head, this, logfp);
+
+ ios_log(this, logfp, "\n========Directory readdirp Stats=======");
+ ios_log(this, logfp, "\nCOUNT: \t DIRECTORY NAME");
+ list_head = &conf->list[IOS_STATS_TYPE_READDIRP];
+ ios_dump_file_stats(list_head, this, logfp);
+
+ ios_log(this, logfp, "\n========Read Throughput File Stats=====");
+ ios_log(this, logfp,
+ "\nTIMESTAMP \t\t\t THROUGHPUT(KBPS)"
+ "\tFILE NAME");
+ list_head = &conf->thru_list[IOS_STATS_THRU_READ];
+ ios_dump_throughput_stats(list_head, this, logfp, IOS_STATS_THRU_READ);
+
+ ios_log(this, logfp, "\n======Write Throughput File Stats======");
+ ios_log(this, logfp,
+ "\nTIMESTAMP \t\t\t THROUGHPUT(KBPS)"
+ "\tFILE NAME");
+ list_head = &conf->thru_list[IOS_STATS_THRU_WRITE];
+ ios_dump_throughput_stats(list_head, this, logfp, IOS_STATS_THRU_WRITE);
+ }
+ return 0;
+}
- iosfd->filename = path;
- gettimeofday (&iosfd->opened_at, NULL);
-
- ios_fd_ctx_set (fd, this, iosfd);
+int
+io_stats_dump_global_to_dict(xlator_t *this, struct ios_global_stats *stats,
+ time_t now, int interval, dict_t *dict)
+{
+ int ret = 0;
+ char key[64] = {0};
+ uint64_t sec = 0;
+ int i = 0;
+ uint64_t count = 0;
+ uint64_t fop_hits = 0;
+
+ GF_ASSERT(stats);
+ GF_ASSERT(now);
+ GF_ASSERT(dict);
+ GF_ASSERT(this);
+
+ if (interval == -1)
+ snprintf(key, sizeof(key), "cumulative");
+ else
+ snprintf(key, sizeof(key), "interval");
+ ret = dict_set_int32(dict, key, interval);
+ if (ret)
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set "
+ "interval %d",
+ interval);
+
+ snprintf(key, sizeof(key), "%d-duration", interval);
+ sec = now - stats->started_at;
+ ret = dict_set_uint64(dict, key, sec);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set "
+ "duration(%d) - %" PRId64,
+ interval, sec);
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "%d-total-read", interval);
+ ret = dict_set_uint64(dict, key, GF_ATOMIC_GET(stats->data_read));
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set total "
+ "read(%d) - %" GF_PRI_ATOMIC,
+ interval, GF_ATOMIC_GET(stats->data_read));
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "%d-total-write", interval);
+ ret = dict_set_uint64(dict, key, GF_ATOMIC_GET(stats->data_written));
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set total "
+ "write(%d) - %" GF_PRI_ATOMIC,
+ interval, GF_ATOMIC_GET(stats->data_written));
+ goto out;
+ }
+ for (i = 0; i < 32; i++) {
+ count = GF_ATOMIC_GET(stats->block_count_read[i]);
+ if (count) {
+ snprintf(key, sizeof(key), "%d-read-%d", interval, (1 << i));
+ ret = dict_set_uint64(dict, key, count);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to "
+ "set read-%db+, with: %" PRId64,
+ (1 << i), count);
+ goto out;
+ }
+ }
+ }
+
+ for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+ count = GF_ATOMIC_GET(stats->block_count_write[i]);
+ if (count) {
+ snprintf(key, sizeof(key), "%d-write-%d", interval, (1 << i));
+ ret = dict_set_uint64(dict, key, count);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to "
+ "set write-%db+, with: %" PRId64,
+ (1 << i), count);
+ goto out;
+ }
+ }
+ }
+
+ for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+ fop_hits = GF_ATOMIC_GET(stats->fop_hits[i]);
+ if (fop_hits == 0)
+ continue;
+ snprintf(key, sizeof(key), "%d-%d-hits", interval, i);
+ ret = dict_set_uint64(dict, key, fop_hits);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set "
+ "%s-fop-hits: %" GF_PRI_ATOMIC,
+ gf_fop_list[i], fop_hits);
+ goto out;
+ }
-unwind:
- END_FOP_LATENCY (frame, CREATE);
- STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,
- preparent, postparent);
- return 0;
+ if (stats->latency[i].avg == 0)
+ continue;
+ snprintf(key, sizeof(key), "%d-%d-avglatency", interval, i);
+ ret = dict_set_double(dict, key, stats->latency[i].avg);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set %s "
+ "avglatency(%d) with %f",
+ gf_fop_list[i], interval, stats->latency[i].avg);
+ goto out;
+ }
+ snprintf(key, sizeof(key), "%d-%d-minlatency", interval, i);
+ ret = dict_set_double(dict, key, stats->latency[i].min);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set %s "
+ "minlatency(%d) with %f",
+ gf_fop_list[i], interval, stats->latency[i].min);
+ goto out;
+ }
+ snprintf(key, sizeof(key), "%d-%d-maxlatency", interval, i);
+ ret = dict_set_double(dict, key, stats->latency[i].max);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set %s "
+ "maxlatency(%d) with %f",
+ gf_fop_list[i], interval, stats->latency[i].max);
+ goto out;
+ }
+ }
+ for (i = 0; i < GF_UPCALL_FLAGS_MAXVALUE; i++) {
+ fop_hits = GF_ATOMIC_GET(stats->upcall_hits[i]);
+ if (fop_hits == 0)
+ continue;
+ snprintf(key, sizeof(key), "%d-%d-upcall-hits", interval, i);
+ ret = dict_set_uint64(dict, key, fop_hits);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to "
+ "set %s-upcall-hits: %" PRIu64,
+ gf_upcall_list[i], fop_hits);
+ goto out;
+ }
+ }
+out:
+ gf_log(this->name, GF_LOG_DEBUG, "returning %d", ret);
+ return ret;
}
+int
+io_stats_dump_global(xlator_t *this, struct ios_global_stats *stats, time_t now,
+ int interval, struct ios_dump_args *args)
+{
+ int ret = -1;
+
+ GF_ASSERT(args);
+ GF_ASSERT(now);
+ GF_ASSERT(stats);
+ GF_ASSERT(this);
+
+ switch (args->type) {
+ case IOS_DUMP_TYPE_JSON_FILE:
+ ret = io_stats_dump_global_to_json_logfp(this, stats, now, interval,
+ args->u.logfp);
+ break;
+ case IOS_DUMP_TYPE_FILE:
+ ret = io_stats_dump_global_to_logfp(this, stats, now, interval,
+ args->u.logfp);
+ break;
+ case IOS_DUMP_TYPE_DICT:
+ ret = io_stats_dump_global_to_dict(this, stats, now, interval,
+ args->u.dict);
+ break;
+ default:
+ GF_ASSERT(0);
+ ret = -1;
+ break;
+ }
+ return ret;
+}
int
-io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
+ios_dump_args_init(struct ios_dump_args *args, ios_dump_type_t type,
+ void *output)
{
- struct ios_fd *iosfd = NULL;
- char *path = NULL;
+ int ret = 0;
+
+ GF_ASSERT(args);
+ GF_ASSERT(type > IOS_DUMP_TYPE_NONE && type < IOS_DUMP_TYPE_MAX);
+ GF_ASSERT(output);
+
+ args->type = type;
+ switch (args->type) {
+ case IOS_DUMP_TYPE_JSON_FILE:
+ case IOS_DUMP_TYPE_FILE:
+ args->u.logfp = output;
+ break;
+ case IOS_DUMP_TYPE_DICT:
+ args->u.dict = output;
+ break;
+ default:
+ GF_ASSERT(0);
+ ret = -1;
+ }
+
+ return ret;
+}
- path = frame->local;
- frame->local = NULL;
+static void
+ios_global_stats_clear(struct ios_global_stats *stats, time_t now)
+{
+ GF_ASSERT(stats);
+ GF_ASSERT(now);
- if (!path)
- goto unwind;
+ memset(stats, 0, sizeof(*stats));
+ stats->started_at = now;
+}
- if (op_ret < 0) {
- GF_FREE (path);
- goto unwind;
- }
+int
+io_stats_dump(xlator_t *this, struct ios_dump_args *args, ios_info_op_t op,
+ gf_boolean_t is_peek)
+{
+ struct ios_conf *conf = NULL;
+ struct ios_global_stats cumulative = {};
+ struct ios_global_stats incremental = {};
+ int increment = 0;
+ time_t now = 0;
+
+ GF_ASSERT(this);
+ GF_ASSERT(args);
+ GF_ASSERT(args->type > IOS_DUMP_TYPE_NONE);
+ GF_ASSERT(args->type < IOS_DUMP_TYPE_MAX);
+
+ conf = this->private;
+ now = gf_time();
+
+ LOCK(&conf->lock);
+ {
+ if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_CUMULATIVE)
+ cumulative = conf->cumulative;
+
+ if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_INCREMENTAL) {
+ incremental = conf->incremental;
+ increment = conf->increment;
+
+ if (!is_peek) {
+ increment = conf->increment++;
- iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd);
- if (!iosfd) {
- GF_FREE (path);
- goto unwind;
+ ios_global_stats_clear(&conf->incremental, now);
+ }
}
+ }
+ UNLOCK(&conf->lock);
- iosfd->filename = path;
- gettimeofday (&iosfd->opened_at, NULL);
+ if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_CUMULATIVE)
+ io_stats_dump_global(this, &cumulative, now, -1, args);
- ios_fd_ctx_set (fd, this, iosfd);
+ if (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_INCREMENTAL)
+ io_stats_dump_global(this, &incremental, now, increment, args);
-unwind:
- END_FOP_LATENCY (frame, OPEN);
- STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd);
- return 0;
+ return 0;
}
-
int
-io_stats_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
+io_stats_dump_fd(xlator_t *this, struct ios_fd *iosfd)
{
- END_FOP_LATENCY (frame, STAT);
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf);
+ struct ios_conf *conf = NULL;
+ struct timeval now;
+ int i = 0;
+ double usecs = 0;
+ uint64_t data_read = 0;
+ uint64_t data_written = 0;
+ uint64_t block_count_read = 0;
+ uint64_t block_count_write = 0;
+
+ conf = this->private;
+
+ if (!conf->dump_fd_stats)
+ return 0;
+
+ if (!iosfd)
return 0;
+
+ gettimeofday(&now, NULL);
+ usecs = gf_tvdiff(&iosfd->opened_at, &now);
+
+ gf_log(this->name, GF_LOG_INFO, "--- fd stats ---");
+
+ if (iosfd->filename)
+ gf_log(this->name, GF_LOG_INFO, " Filename : %s", iosfd->filename);
+
+ if (usecs)
+ gf_log(this->name, GF_LOG_INFO, " Lifetime : %lf secs", usecs);
+
+ data_read = GF_ATOMIC_GET(iosfd->data_read);
+ if (data_read)
+ gf_log(this->name, GF_LOG_INFO, " BytesRead : %" PRId64 " bytes",
+ data_read);
+
+ data_written = GF_ATOMIC_GET(iosfd->data_written);
+ if (data_written)
+ gf_log(this->name, GF_LOG_INFO, " BytesWritten : %" PRId64 " bytes",
+ data_written);
+
+ for (i = 0; i < 32; i++) {
+ block_count_read = GF_ATOMIC_GET(iosfd->block_count_read[i]);
+ if (block_count_read)
+ gf_log(this->name, GF_LOG_INFO,
+ " Read %06db+ :"
+ "%" PRId64,
+ (1 << i), block_count_read);
+ }
+ for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+ block_count_write = GF_ATOMIC_GET(iosfd->block_count_write[i]);
+ if (block_count_write)
+ gf_log(this->name, GF_LOG_INFO, "Write %06db+ : %" PRId64, (1 << i),
+ block_count_write);
+ }
+ return 0;
}
+void
+collect_ios_latency_sample(struct ios_conf *conf, glusterfs_fop_t fop_type,
+ double elapsed, call_frame_t *frame)
+{
+ ios_sample_buf_t *ios_sample_buf = NULL;
+ ios_sample_t *ios_sample = NULL;
+ struct timespec *timestamp = NULL;
+ call_stack_t *root = NULL;
+
+ ios_sample_buf = conf->ios_sample_buf;
+ LOCK(&conf->ios_sampling_lock);
+ if (conf->ios_sample_interval == 0 ||
+ ios_sample_buf->observed % conf->ios_sample_interval != 0)
+ goto out;
+
+ timestamp = &frame->begin;
+ root = frame->root;
+
+ ios_sample = &(ios_sample_buf->ios_samples[ios_sample_buf->pos]);
+ ios_sample->elapsed = elapsed;
+ ios_sample->fop_type = fop_type;
+ ios_sample->uid = root->uid;
+ ios_sample->gid = root->gid;
+ (ios_sample->timestamp).tv_sec = timestamp->tv_sec;
+ (ios_sample->timestamp).tv_usec = timestamp->tv_nsec / 1000;
+ memcpy(&ios_sample->identifier, &root->identifier,
+ sizeof(root->identifier));
+
+ /* We've reached the end of the circular buffer, start from the
+ * beginning. */
+ if (ios_sample_buf->pos == (ios_sample_buf->size - 1))
+ ios_sample_buf->pos = 0;
+ else
+ ios_sample_buf->pos++;
+ ios_sample_buf->collected++;
+out:
+ ios_sample_buf->observed++;
+ UNLOCK(&conf->ios_sampling_lock);
+ return;
+}
-int
-io_stats_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count,
- struct iatt *buf, struct iobref *iobref)
+static void
+update_ios_latency_stats(struct ios_global_stats *stats, double elapsed,
+ glusterfs_fop_t op)
{
- int len = 0;
- fd_t *fd = NULL;
+ double avg;
- fd = frame->local;
- frame->local = NULL;
+ GF_ASSERT(stats);
- if (op_ret > 0) {
- len = iov_length (vector, count);
- BUMP_READ (fd, len);
- }
+ stats->latency[op].total += elapsed;
- END_FOP_LATENCY (frame, READ);
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno,
- vector, count, buf, iobref);
- return 0;
-}
+ if (!stats->latency[op].min)
+ stats->latency[op].min = elapsed;
+ if (stats->latency[op].min > elapsed)
+ stats->latency[op].min = elapsed;
+ if (stats->latency[op].max < elapsed)
+ stats->latency[op].max = elapsed;
+
+ avg = stats->latency[op].avg;
+ stats->latency[op].avg = avg + (elapsed - avg) /
+ GF_ATOMIC_GET(stats->fop_hits[op]);
+}
int
-io_stats_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *prebuf, struct iatt *postbuf)
+update_ios_latency(struct ios_conf *conf, call_frame_t *frame,
+ glusterfs_fop_t op)
{
- END_FOP_LATENCY (frame, WRITE);
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
+ double elapsed;
+ struct timespec *begin, *end;
+ begin = &frame->begin;
+ end = &frame->end;
+ elapsed = gf_tsdiff(begin, end) / 1000.0;
+ update_ios_latency_stats(&conf->cumulative, elapsed, op);
+ update_ios_latency_stats(&conf->incremental, elapsed, op);
+ collect_ios_latency_sample(conf, op, elapsed, frame);
-int
-io_stats_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *buf)
+ return 0;
+}
+
+int32_t
+io_stats_dump_stats_to_dict(xlator_t *this, dict_t *resp,
+ ios_stats_type_t flags, int32_t list_cnt)
{
- END_FOP_LATENCY (frame, READDIRP);
- STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, buf);
- return 0;
+ struct ios_conf *conf = NULL;
+ int cnt = 0;
+ char key[32];
+ int keylen;
+ struct ios_stat_head *list_head = NULL;
+ struct ios_stat_list *entry = NULL;
+ int ret = -1;
+ ios_stats_thru_t index = IOS_STATS_THRU_MAX;
+ char timestr[GF_TIMESTR_SIZE] = {
+ 0,
+ };
+ char *dict_timestr = NULL;
+
+ conf = this->private;
+
+ switch (flags) {
+ case IOS_STATS_TYPE_OPEN:
+ list_head = &conf->list[IOS_STATS_TYPE_OPEN];
+ LOCK(&conf->lock);
+ {
+ ret = dict_set_uint64(resp, "current-open",
+ conf->cumulative.nr_opens);
+ if (ret)
+ goto unlock;
+ ret = dict_set_uint64(resp, "max-open",
+ conf->cumulative.max_nr_opens);
+
+ gf_time_fmt_tv(timestr, sizeof timestr,
+ &conf->cumulative.max_openfd_time,
+ gf_timefmt_FT);
+
+ dict_timestr = gf_strdup(timestr);
+ if (!dict_timestr)
+ goto unlock;
+ ret = dict_set_dynstr(resp, "max-openfd-time", dict_timestr);
+ if (ret)
+ goto unlock;
+ }
+ unlock:
+ UNLOCK(&conf->lock);
+ /* Do not proceed if we came here because of some error
+ * during the dict operation */
+ if (ret)
+ goto out;
+ break;
+ case IOS_STATS_TYPE_READ:
+ list_head = &conf->list[IOS_STATS_TYPE_READ];
+ break;
+ case IOS_STATS_TYPE_WRITE:
+ list_head = &conf->list[IOS_STATS_TYPE_WRITE];
+ break;
+ case IOS_STATS_TYPE_OPENDIR:
+ list_head = &conf->list[IOS_STATS_TYPE_OPENDIR];
+ break;
+ case IOS_STATS_TYPE_READDIRP:
+ list_head = &conf->list[IOS_STATS_TYPE_READDIRP];
+ break;
+ case IOS_STATS_TYPE_READ_THROUGHPUT:
+ list_head = &conf->thru_list[IOS_STATS_THRU_READ];
+ index = IOS_STATS_THRU_READ;
+ break;
+ case IOS_STATS_TYPE_WRITE_THROUGHPUT:
+ list_head = &conf->thru_list[IOS_STATS_THRU_WRITE];
+ index = IOS_STATS_THRU_WRITE;
+ break;
+
+ default:
+ goto out;
+ }
+ ret = dict_set_int32_sizen(resp, "top-op", flags);
+ if (!list_cnt)
+ goto out;
+ LOCK(&list_head->lock);
+ {
+ list_for_each_entry(entry, &list_head->iosstats->list, list)
+ {
+ cnt++;
+ keylen = snprintf(key, sizeof(key), "filename-%d", cnt);
+ ret = dict_set_strn(resp, key, keylen, entry->iosstat->filename);
+ if (ret)
+ goto unlock_list_head;
+ snprintf(key, sizeof(key), "value-%d", cnt);
+ ret = dict_set_uint64(resp, key, entry->value);
+ if (ret)
+ goto unlock_list_head;
+ if (index != IOS_STATS_THRU_MAX) {
+ keylen = snprintf(key, sizeof(key), "time-sec-%d", cnt);
+ ret = dict_set_int32n(
+ resp, key, keylen,
+ entry->iosstat->thru_counters[index].time.tv_sec);
+ if (ret)
+ goto unlock_list_head;
+ keylen = snprintf(key, sizeof(key), "time-usec-%d", cnt);
+ ret = dict_set_int32n(
+ resp, key, keylen,
+ entry->iosstat->thru_counters[index].time.tv_usec);
+ if (ret)
+ goto unlock_list_head;
+ }
+ if (cnt == list_cnt)
+ break;
+ }
+ }
+unlock_list_head:
+ UNLOCK(&list_head->lock);
+ /* ret is !=0 if some dict operation in the above critical region
+ * failed. */
+ if (ret)
+ goto out;
+ ret = dict_set_int32_sizen(resp, "members", cnt);
+out:
+ return ret;
}
+static struct ios_stat *
+ios_init_iosstat(xlator_t *this, char *path, uuid_t gfid, inode_t *inode)
+{
+ struct ios_stat *iosstat = NULL;
+ int i = 0;
+
+ iosstat = GF_CALLOC(1, sizeof(*iosstat), gf_io_stats_mt_ios_stat);
+ if (!iosstat)
+ goto out;
+
+ iosstat->filename = gf_strdup(path);
+ gf_uuid_copy(iosstat->gfid, gfid);
+ LOCK_INIT(&iosstat->lock);
+
+ for (i = 0; i < IOS_STATS_TYPE_MAX; i++)
+ GF_ATOMIC_INIT(iosstat->counters[i], 0);
+
+ ios_inode_ctx_set(inode, this, iosstat);
+
+out:
+ return iosstat;
+}
int
-io_stats_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *buf)
+io_stats_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- END_FOP_LATENCY (frame, READDIR);
- STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, buf);
- return 0;
-}
+ struct ios_fd *iosfd = NULL;
+ char *path = NULL;
+ struct ios_stat *iosstat = NULL;
+ struct ios_conf *conf = NULL;
+
+ conf = this->private;
+
+ path = frame->local;
+ frame->local = NULL;
+
+ if (!path)
+ goto unwind;
+
+ if (op_ret < 0) {
+ GF_FREE(path);
+ goto unwind;
+ }
+
+ iosfd = GF_CALLOC(1, sizeof(*iosfd), gf_io_stats_mt_ios_fd);
+ if (!iosfd) {
+ GF_FREE(path);
+ goto unwind;
+ }
+
+ iosfd->filename = path;
+ gettimeofday(&iosfd->opened_at, NULL);
+
+ ios_fd_ctx_set(fd, this, iosfd);
+ LOCK(&conf->lock);
+ {
+ conf->cumulative.nr_opens++;
+ if (conf->cumulative.nr_opens > conf->cumulative.max_nr_opens) {
+ conf->cumulative.max_nr_opens = conf->cumulative.nr_opens;
+ conf->cumulative.max_openfd_time = iosfd->opened_at;
+ }
+ }
+ UNLOCK(&conf->lock);
+
+ iosstat = ios_init_iosstat(this, path, buf->ia_gfid, inode);
+ if (!iosstat)
+ GF_FREE(path);
+unwind:
+ UPDATE_PROFILE_STATS(frame, CREATE);
+ STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
int
-io_stats_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *prebuf, struct iatt *postbuf)
+io_stats_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- END_FOP_LATENCY (frame, FSYNC);
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
+ struct ios_fd *iosfd = NULL;
+ char *path = NULL;
+ struct ios_stat *iosstat = NULL;
+ struct ios_conf *conf = NULL;
+ int i = 0;
+
+ conf = this->private;
+ path = frame->local;
+ frame->local = NULL;
+
+ if (!path)
+ goto unwind;
+
+ if (op_ret < 0) {
+ GF_FREE(path);
+ goto unwind;
+ }
+
+ iosfd = GF_CALLOC(1, sizeof(*iosfd), gf_io_stats_mt_ios_fd);
+ if (!iosfd) {
+ GF_FREE(path);
+ goto unwind;
+ }
+
+ iosfd->filename = path;
+ GF_ATOMIC_INIT(iosfd->data_read, 0);
+ GF_ATOMIC_INIT(iosfd->data_written, 0);
+ for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+ GF_ATOMIC_INIT(iosfd->block_count_write[i], 0);
+ GF_ATOMIC_INIT(iosfd->block_count_read[i], 0);
+ }
+ gettimeofday(&iosfd->opened_at, NULL);
+
+ ios_fd_ctx_set(fd, this, iosfd);
+
+ ios_inode_ctx_get(fd->inode, this, &iosstat);
+ if (!iosstat) {
+ iosstat = ios_init_iosstat(this, path, fd->inode->gfid, fd->inode);
+ }
+
+ LOCK(&conf->lock);
+ {
+ conf->cumulative.nr_opens++;
+ if (conf->cumulative.nr_opens > conf->cumulative.max_nr_opens) {
+ conf->cumulative.max_nr_opens = conf->cumulative.nr_opens;
+ conf->cumulative.max_openfd_time = iosfd->opened_at;
+ }
+ }
+ UNLOCK(&conf->lock);
+ if (iosstat) {
+ ios_bump_stats(this, iosstat, IOS_STATS_TYPE_OPEN);
+ iosstat = NULL;
+ }
+unwind:
+ UPDATE_PROFILE_STATS(frame, OPEN);
+ STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
int
-io_stats_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop)
+io_stats_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
{
- END_FOP_LATENCY (frame, SETATTR);
- STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop);
- return 0;
+ UPDATE_PROFILE_STATS(frame, STAT);
+ STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
}
-
int
-io_stats_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent)
+io_stats_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iovec *vector,
+ int32_t count, struct iatt *buf, struct iobref *iobref,
+ dict_t *xdata)
{
- END_FOP_LATENCY (frame, UNLINK);
- STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
- preparent, postparent);
- return 0;
+ int len = 0;
+ fd_t *fd = NULL;
+ struct ios_stat *iosstat = NULL;
+
+ fd = frame->local;
+ frame->local = NULL;
+
+ if (op_ret > 0) {
+ len = iov_length(vector, count);
+ ios_bump_read(this, fd, len);
+ }
+
+ UPDATE_PROFILE_STATS(frame, READ);
+ ios_inode_ctx_get(fd->inode, this, &iosstat);
+
+ if (iosstat) {
+ ios_bump_stats(this, iosstat, IOS_STATS_TYPE_READ);
+ BUMP_THROUGHPUT(iosstat, IOS_STATS_THRU_READ);
+ iosstat = NULL;
+ }
+
+ STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, buf,
+ iobref, xdata);
+ return 0;
}
-
int
-io_stats_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent)
+io_stats_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- END_FOP_LATENCY (frame, RENAME);
- STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf,
- preoldparent, postoldparent,
- prenewparent, postnewparent);
- return 0;
-}
+ struct ios_stat *iosstat = NULL;
+ inode_t *inode = NULL;
+ UPDATE_PROFILE_STATS(frame, WRITE);
+ if (frame->local) {
+ inode = frame->local;
+ frame->local = NULL;
+ ios_inode_ctx_get(inode, this, &iosstat);
+ if (iosstat) {
+ ios_bump_stats(this, iosstat, IOS_STATS_TYPE_WRITE);
+ BUMP_THROUGHPUT(iosstat, IOS_STATS_THRU_WRITE);
+ inode = NULL;
+ iosstat = NULL;
+ }
+ }
+
+ STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
int
-io_stats_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, const char *buf,
- struct iatt *sbuf)
+io_stats_copy_file_range_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *stbuf, struct iatt *prebuf_dst,
+ struct iatt *postbuf_dst, dict_t *xdata)
{
- END_FOP_LATENCY (frame, READLINK);
- STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, buf, sbuf);
- return 0;
-}
+ UPDATE_PROFILE_STATS(frame, COPY_FILE_RANGE);
+ STACK_UNWIND_STRICT(copy_file_range, frame, op_ret, op_errno, stbuf,
+ prebuf_dst, postbuf_dst, xdata);
+ return 0;
+}
int
-io_stats_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- dict_t *xattr, struct iatt *postparent)
+io_stats_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
+ dict_t *xdata)
{
- END_FOP_LATENCY (frame, LOOKUP);
- STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xattr,
- postparent);
- return 0;
-}
+ struct ios_stat *iosstat = NULL;
+ inode_t *inode = frame->local;
+ frame->local = NULL;
+
+ UPDATE_PROFILE_STATS(frame, READDIRP);
+
+ ios_inode_ctx_get(inode, this, &iosstat);
+
+ if (iosstat) {
+ ios_bump_stats(this, iosstat, IOS_STATS_TYPE_READDIRP);
+ iosstat = NULL;
+ }
+
+ STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
int
-io_stats_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+io_stats_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
+ dict_t *xdata)
{
- END_FOP_LATENCY (frame, SYMLINK);
- STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
+ UPDATE_PROFILE_STATS(frame, READDIR);
+ STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, buf, xdata);
+ return 0;
}
-
int
-io_stats_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+io_stats_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- END_FOP_LATENCY (frame, MKNOD);
- STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
+ UPDATE_PROFILE_STATS(frame, FSYNC);
+ STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+ return 0;
}
+int
+io_stats_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preop,
+ struct iatt *postop, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, SETATTR);
+ STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, preop, postop, xdata);
+ return 0;
+}
int
-io_stats_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+io_stats_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- END_FOP_LATENCY (frame, MKDIR);
- STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
+ UPDATE_PROFILE_STATS(frame, UNLINK);
+ STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
+ xdata);
+ return 0;
}
+int
+io_stats_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, RENAME);
+ STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent,
+ postoldparent, prenewparent, postnewparent, xdata);
+ return 0;
+}
int
-io_stats_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+io_stats_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, const char *buf,
+ struct iatt *sbuf, dict_t *xdata)
{
- END_FOP_LATENCY (frame, LINK);
- STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
+ UPDATE_PROFILE_STATS(frame, READLINK);
+ STACK_UNWIND_STRICT(readlink, frame, op_ret, op_errno, buf, sbuf, xdata);
+ return 0;
}
+int
+io_stats_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ UPDATE_PROFILE_STATS(frame, LOOKUP);
+ STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+ postparent);
+ return 0;
+}
int
-io_stats_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+io_stats_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- END_FOP_LATENCY (frame, FLUSH);
- STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno);
- return 0;
+ UPDATE_PROFILE_STATS(frame, SYMLINK);
+ STACK_UNWIND_STRICT(symlink, frame, op_ret, op_errno, inode, buf, preparent,
+ postparent, xdata);
+ return 0;
}
+int
+io_stats_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, MKNOD);
+ STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+ postparent, xdata);
+ return 0;
+}
int
-io_stats_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
+io_stats_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- if (op_ret >= 0)
- ios_fd_ctx_set (fd, this, 0);
+ char *path = frame->local;
- END_FOP_LATENCY (frame, OPENDIR);
- STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd);
- return 0;
-}
+ if (!path)
+ goto unwind;
+ UPDATE_PROFILE_STATS(frame, MKDIR);
+ if (op_ret < 0)
+ goto unwind;
+
+ /* allocate a struct ios_stat and set the inode ctx */
+ ios_init_iosstat(this, path, buf->ia_gfid, inode);
+
+unwind:
+ /* local is assigned with path */
+ GF_FREE(frame->local);
+ frame->local = NULL;
+ STACK_UNWIND_STRICT(mkdir, frame, op_ret, op_errno, inode, buf, preparent,
+ postparent, xdata);
+ return 0;
+}
int
-io_stats_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent)
+io_stats_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- END_FOP_LATENCY (frame, RMDIR);
- STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,
- preparent, postparent);
- return 0;
+ UPDATE_PROFILE_STATS(frame, LINK);
+ STACK_UNWIND_STRICT(link, frame, op_ret, op_errno, inode, buf, preparent,
+ postparent, xdata);
+ return 0;
}
-
int
-io_stats_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *prebuf, struct iatt *postbuf)
+io_stats_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- END_FOP_LATENCY (frame, TRUNCATE);
- STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno,
- prebuf, postbuf);
- return 0;
+ UPDATE_PROFILE_STATS(frame, FLUSH);
+ STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata);
+ return 0;
}
-
int
-io_stats_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct statvfs *buf)
+io_stats_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- END_FOP_LATENCY (frame, STATFS);
- STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf);
- return 0;
+ struct ios_stat *iosstat = NULL;
+ int ret = -1;
+
+ UPDATE_PROFILE_STATS(frame, OPENDIR);
+ if (op_ret < 0)
+ goto unwind;
+
+ ios_fd_ctx_set(fd, this, 0);
+
+ ret = ios_inode_ctx_get(fd->inode, this, &iosstat);
+ if (!ret)
+ ios_bump_stats(this, iosstat, IOS_STATS_TYPE_OPENDIR);
+
+unwind:
+ STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, xdata);
+ return 0;
}
+int
+io_stats_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, RMDIR);
+
+ STACK_UNWIND_STRICT(rmdir, frame, op_ret, op_errno, preparent, postparent,
+ xdata);
+ return 0;
+}
int
-io_stats_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+io_stats_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- END_FOP_LATENCY (frame, SETXATTR);
- STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno);
- return 0;
+ UPDATE_PROFILE_STATS(frame, TRUNCATE);
+ STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
}
+int
+io_stats_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+ dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, STATFS);
+ STACK_UNWIND_STRICT(statfs, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
int
-io_stats_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
+io_stats_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- END_FOP_LATENCY (frame, GETXATTR);
- STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict);
- return 0;
+ UPDATE_PROFILE_STATS(frame, SETXATTR);
+ STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
}
+int
+io_stats_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, GETXATTR);
+ STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
int
-io_stats_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+io_stats_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- END_FOP_LATENCY (frame, REMOVEXATTR);
- STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno);
- return 0;
+ UPDATE_PROFILE_STATS(frame, REMOVEXATTR);
+ STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, xdata);
+ return 0;
}
+int
+io_stats_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, FSETXATTR);
+ STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
int
-io_stats_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+io_stats_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- END_FOP_LATENCY (frame, FSYNCDIR);
- STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno);
- return 0;
+ UPDATE_PROFILE_STATS(frame, FGETXATTR);
+ STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
}
+int
+io_stats_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, FREMOVEXATTR);
+ STACK_UNWIND_STRICT(fremovexattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
int
-io_stats_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+io_stats_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- END_FOP_LATENCY (frame, ACCESS);
- STACK_UNWIND_STRICT (access, frame, op_ret, op_errno);
- return 0;
+ UPDATE_PROFILE_STATS(frame, FSYNCDIR);
+ STACK_UNWIND_STRICT(fsyncdir, frame, op_ret, op_errno, xdata);
+ return 0;
}
+int
+io_stats_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, ACCESS);
+ STACK_UNWIND_STRICT(access, frame, op_ret, op_errno, xdata);
+ return 0;
+}
int
-io_stats_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *prebuf, struct iatt *postbuf)
+io_stats_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- END_FOP_LATENCY (frame, FTRUNCATE);
- STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno,
- prebuf, postbuf);
- return 0;
+ UPDATE_PROFILE_STATS(frame, FTRUNCATE);
+ STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
}
+int
+io_stats_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, FSTAT);
+ STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
int
-io_stats_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
+io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- END_FOP_LATENCY (frame, FSTAT);
- STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf);
- return 0;
+ UPDATE_PROFILE_STATS(frame, FALLOCATE);
+ STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
}
+int
+io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, DISCARD);
+ STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
int
-io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
+io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- END_FOP_LATENCY (frame, LK);
- STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock);
- return 0;
+ UPDATE_PROFILE_STATS(frame, ZEROFILL);
+ STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
}
+int32_t
+io_stats_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, IPC);
+ STACK_UNWIND_STRICT(ipc, frame, op_ret, op_errno, xdata);
+ return 0;
+}
int
-io_stats_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+io_stats_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
{
- END_FOP_LATENCY (frame, ENTRYLK);
- STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno);
- return 0;
+ UPDATE_PROFILE_STATS(frame, LK);
+ STACK_UNWIND_STRICT(lk, frame, op_ret, op_errno, lock, xdata);
+ return 0;
}
+int
+io_stats_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, ENTRYLK);
+ STACK_UNWIND_STRICT(entrylk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
int
-io_stats_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
+io_stats_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- END_FOP_LATENCY (frame, XATTROP);
- STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, dict);
- return 0;
+ UPDATE_PROFILE_STATS(frame, FENTRYLK);
+ STACK_UNWIND_STRICT(fentrylk, frame, op_ret, op_errno, xdata);
+ return 0;
}
+int
+io_stats_rchecksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, uint32_t weak_checksum,
+ uint8_t *strong_checksum, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, RCHECKSUM);
+ STACK_UNWIND_STRICT(rchecksum, frame, op_ret, op_errno, weak_checksum,
+ strong_checksum, xdata);
+ return 0;
+}
int
-io_stats_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
+io_stats_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, off_t offset, dict_t *xdata)
{
- END_FOP_LATENCY (frame, FXATTROP);
- STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, dict);
- return 0;
+ UPDATE_PROFILE_STATS(frame, SEEK);
+ STACK_UNWIND_STRICT(seek, frame, op_ret, op_errno, offset, xdata);
+ return 0;
}
+int
+io_stats_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_lease *lease,
+ dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, LEASE);
+ STACK_UNWIND_STRICT(lease, frame, op_ret, op_errno, lease, xdata);
+ return 0;
+}
int
-io_stats_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+io_stats_getactivelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ lock_migration_info_t *locklist, dict_t *xdata)
{
- END_FOP_LATENCY (frame, INODELK);
- STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno);
- return 0;
+ UPDATE_PROFILE_STATS(frame, GETACTIVELK);
+ STACK_UNWIND_STRICT(getactivelk, frame, op_ret, op_errno, locklist, xdata);
+ return 0;
}
+int
+io_stats_setactivelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, SETACTIVELK);
+ STACK_UNWIND_STRICT(setactivelk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
int
-io_stats_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
+io_stats_compound_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, void *data,
+ dict_t *xdata)
{
- BUMP_FOP (ENTRYLK);
+ UPDATE_PROFILE_STATS(frame, COMPOUND);
+ STACK_UNWIND_STRICT(compound, frame, op_ret, op_errno, data, xdata);
+ return 0;
+}
- START_FOP_LATENCY (frame);
+int
+io_stats_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, XATTROP);
+ STACK_UNWIND_STRICT(xattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
- STACK_WIND (frame, io_stats_entrylk_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->entrylk,
- volume, loc, basename, cmd, type);
- return 0;
+int
+io_stats_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, FXATTROP);
+ STACK_UNWIND_STRICT(fxattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
}
+int
+io_stats_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, INODELK);
+ STACK_UNWIND_STRICT(inodelk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
int
-io_stats_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock)
+io_stats_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
{
+ START_FOP_LATENCY(frame);
- BUMP_FOP (INODELK);
+ STACK_WIND(frame, io_stats_entrylk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->entrylk, volume, loc, basename, cmd,
+ type, xdata);
+ return 0;
+}
- START_FOP_LATENCY (frame);
+int
+io_stats_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+ fd_t *fd, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_inodelk_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->inodelk,
- volume, loc, cmd, flock);
- return 0;
+ STACK_WIND(frame, io_stats_fentrylk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fentrylk, volume, fd, basename, cmd,
+ type, xdata);
+ return 0;
}
-
int
-io_stats_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+io_stats_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
+ START_FOP_LATENCY(frame);
- END_FOP_LATENCY (frame, FINODELK);
- STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno);
- return 0;
+ STACK_WIND(frame, io_stats_inodelk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->inodelk, volume, loc, cmd, flock,
+ xdata);
+ return 0;
}
-
int
-io_stats_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock)
+io_stats_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- BUMP_FOP (FINODELK);
+ UPDATE_PROFILE_STATS(frame, FINODELK);
+ STACK_UNWIND_STRICT(finodelk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
- START_FOP_LATENCY (frame);
+int
+io_stats_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+ fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_finodelk_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->finodelk,
- volume, fd, cmd, flock);
- return 0;
+ STACK_WIND(frame, io_stats_finodelk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk, volume, fd, cmd, flock,
+ xdata);
+ return 0;
}
-
int
-io_stats_xattrop (call_frame_t *frame, xlator_t *this,
- loc_t *loc, gf_xattrop_flags_t flags, dict_t *dict)
+io_stats_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
- BUMP_FOP (XATTROP);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
+ STACK_WIND(frame, io_stats_xattrop_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata);
+ return 0;
+}
- STACK_WIND (frame, io_stats_xattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->xattrop,
- loc, flags, dict);
+int
+io_stats_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- return 0;
+ STACK_WIND(frame, io_stats_fxattrop_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata);
+ return 0;
}
-
int
-io_stats_fxattrop (call_frame_t *frame, xlator_t *this,
- fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict)
+io_stats_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- BUMP_FOP (FXATTROP);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
+ STACK_WIND(frame, io_stats_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ return 0;
+}
- STACK_WIND (frame, io_stats_fxattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fxattrop,
- fd, flags, dict);
+int
+io_stats_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- return 0;
+ STACK_WIND(frame, io_stats_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc, xdata);
+ return 0;
}
-
int
-io_stats_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
+io_stats_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
{
- BUMP_FOP (LOOKUP);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
+ STACK_WIND(frame, io_stats_readlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+ return 0;
+}
- STACK_WIND (frame, io_stats_lookup_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lookup,
- loc, xattr_req);
+int
+io_stats_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t dev, mode_t umask, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- return 0;
+ STACK_WIND(frame, io_stats_mknod_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata);
+ return 0;
}
-
int
-io_stats_stat (call_frame_t *frame, xlator_t *this, loc_t *loc)
+io_stats_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
{
- BUMP_FOP (STAT);
+ if (loc->path)
+ frame->local = gf_strdup(loc->path);
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_stat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat,
- loc);
-
- return 0;
+ STACK_WIND(frame, io_stats_mkdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+ return 0;
}
-
int
-io_stats_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size)
+io_stats_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
- BUMP_FOP (READLINK);
-
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_readlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readlink,
- loc, size);
-
- return 0;
+ STACK_WIND(frame, io_stats_unlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+ return 0;
}
-
int
-io_stats_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t dev, dict_t *params)
+io_stats_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
- BUMP_FOP (MKNOD);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
+ STACK_WIND(frame, io_stats_rmdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+ return 0;
+}
- STACK_WIND (frame, io_stats_mknod_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod,
- loc, mode, dev, params);
+int
+io_stats_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- return 0;
+ STACK_WIND(frame, io_stats_symlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata);
+ return 0;
}
-
int
-io_stats_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dict_t *params)
+io_stats_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
{
- BUMP_FOP (MKDIR);
-
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_mkdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mkdir,
- loc, mode, params);
- return 0;
+ STACK_WIND(frame, io_stats_rename_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+ return 0;
}
-
int
-io_stats_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+io_stats_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- BUMP_FOP (UNLINK);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
-
- STACK_WIND (frame, io_stats_unlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink,
- loc);
- return 0;
+ STACK_WIND(frame, io_stats_link_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+ return 0;
}
-
int
-io_stats_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags)
+io_stats_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- BUMP_FOP (RMDIR);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
+ STACK_WIND(frame, io_stats_setattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+ return 0;
+}
- STACK_WIND (frame, io_stats_rmdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rmdir,
- loc, flags);
+int
+io_stats_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- return 0;
+ STACK_WIND(frame, io_stats_truncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+ return 0;
}
-
int
-io_stats_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *loc, dict_t *params)
+io_stats_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
{
- BUMP_FOP (SYMLINK);
-
- START_FOP_LATENCY (frame);
+ if (loc->path)
+ frame->local = gf_strdup(loc->path);
- STACK_WIND (frame, io_stats_symlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->symlink,
- linkpath, loc, params);
+ START_FOP_LATENCY(frame);
- return 0;
+ STACK_WIND(frame, io_stats_open_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+ return 0;
}
-
int
-io_stats_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
+io_stats_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- BUMP_FOP (RENAME);
+ if (loc->path)
+ frame->local = gf_strdup(loc->path);
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_rename_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename,
- oldloc, newloc);
-
- return 0;
+ STACK_WIND(frame, io_stats_create_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+ xdata);
+ return 0;
}
-
int
-io_stats_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
+io_stats_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
{
- BUMP_FOP (LINK);
+ frame->local = fd;
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_link_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->link,
- oldloc, newloc);
- return 0;
+ STACK_WIND(frame, io_stats_readv_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+ return 0;
}
-
int
-io_stats_setattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct iatt *stbuf, int32_t valid)
+io_stats_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- BUMP_FOP (SETATTR);
+ int len = 0;
- START_FOP_LATENCY (frame);
+ if (fd->inode)
+ frame->local = fd->inode;
+ len = iov_length(vector, count);
- STACK_WIND (frame, io_stats_setattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setattr,
- loc, stbuf, valid);
+ ios_bump_write(this, fd, len);
+ START_FOP_LATENCY(frame);
- return 0;
+ STACK_WIND(frame, io_stats_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+ flags, iobref, xdata);
+ return 0;
}
-
int
-io_stats_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset)
+io_stats_copy_file_range(call_frame_t *frame, xlator_t *this, fd_t *fd_in,
+ off_t off_in, fd_t *fd_out, off_t off_out, size_t len,
+ uint32_t flags, dict_t *xdata)
{
- BUMP_FOP (TRUNCATE);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
+ STACK_WIND(frame, io_stats_copy_file_range_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->copy_file_range, fd_in, off_in, fd_out,
+ off_out, len, flags, xdata);
+ return 0;
+}
- STACK_WIND (frame, io_stats_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- loc, offset);
+int
+io_stats_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- return 0;
+ STACK_WIND(frame, io_stats_statfs_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->statfs, loc, xdata);
+ return 0;
}
-
int
-io_stats_open (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, fd_t *fd, int32_t wbflags)
+io_stats_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- BUMP_FOP (OPEN);
+ START_FOP_LATENCY(frame);
- frame->local = gf_strdup (loc->path);
+ STACK_WIND(frame, io_stats_flush_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush, fd, xdata);
+ return 0;
+}
- START_FOP_LATENCY (frame);
+int
+io_stats_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_open_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open,
- loc, flags, fd, wbflags);
- return 0;
+ STACK_WIND(frame, io_stats_fsync_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+ return 0;
}
+int
+conditional_dump(dict_t *dict, char *key, data_t *value, void *data)
+{
+ struct {
+ xlator_t *this;
+ inode_t *inode;
+ const char *path;
+ } * stub;
+ xlator_t *this = NULL;
+ char *filename = NULL;
+ FILE *logfp = NULL;
+ struct ios_dump_args args = {0};
+ int pid, namelen, dirlen;
+ char dump_key[100];
+ char *slash_ptr = NULL;
+ char *path_in_value = NULL;
+ char *identifier = NULL;
+ struct ios_conf *conf = NULL;
+
+ stub = data;
+ this = stub->this;
+ conf = this->private;
+
+ /* Don't do this on 'brick-side', only do this on client side */
+ /* Addresses CVE-2018-14659 */
+ if (this->ctx->process_mode != GF_CLIENT_PROCESS) {
+ gf_log(this->name, GF_LOG_DEBUG,
+ "taking io-stats dump using setxattr not permitted on brick."
+ " Use 'gluster profile' instead");
+ return -1;
+ }
+
+ /* Create a file name that is appended with the io-stats instance
+ name as well. This helps when there is more than a single io-stats
+ instance in the graph, or the client and server processes are running
+ on the same node */
+ /* For the sanity of where the file should be located, we should make
+ sure file is written only inside RUNDIR (ie, /var/run/gluster) */
+ /* TODO: provide an option to dump it to different directory of
+ choice, based on options */
+ /* name format: /var/run/gluster/<passed in path/filename>.<xlator name
+ * slashes to -> */
+
+ path_in_value = alloca0(value->len + 1);
+
+ /* We need a memcpy here because of the way dict_unserialize works */
+
+ memcpy(path_in_value, data_to_str(value), value->len);
+ path_in_value[value->len] = '\0';
+
+ if (strstr(path_in_value, "../")) {
+ gf_log(this->name, GF_LOG_ERROR, "%s: no \"../\" allowed in path",
+ path_in_value);
+ return -1;
+ }
+
+ if (path_in_value[0] == '/') {
+ path_in_value = path_in_value + 1;
+ }
+
+ dirlen = strlen(IOS_STATS_DUMP_DIR);
+ if (conf->unique_id) {
+ /* this->name will be the same for all bricks of the volume */
+ identifier = conf->unique_id;
+ } else {
+ identifier = this->name;
+ }
+
+ namelen = (dirlen + value->len + strlen(identifier) + 3);
+ /* +3 for '/', '.' and '\0' added in snprintf below*/
+
+ filename = alloca0(namelen);
+ snprintf(filename, namelen, "%s/%s.%s", IOS_STATS_DUMP_DIR, path_in_value,
+ identifier);
+
+ /* convert any slashes to '-' so that fopen works correctly */
+ slash_ptr = strchr(filename + dirlen + 1, '/');
+ while (slash_ptr) {
+ *slash_ptr = '-';
+ slash_ptr = strchr(slash_ptr, '/');
+ }
+
+ pid = getpid();
+
+ if (!strncmp(filename, "", 1)) {
+ gf_log(this->name, GF_LOG_ERROR, "No filename given");
+ return -1;
+ }
+ logfp = fopen(filename, "w+");
+ if (!logfp) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to open %s "
+ "for writing",
+ filename);
+ return -1;
+ }
+ sprintf(dump_key, "*io*stat*%d_json_dump", pid);
+ if (fnmatch(dump_key, key, 0) == 0) {
+ (void)ios_dump_args_init(&args, IOS_DUMP_TYPE_JSON_FILE, logfp);
+ } else {
+ (void)ios_dump_args_init(&args, IOS_DUMP_TYPE_FILE, logfp);
+ }
+ io_stats_dump(this, &args, GF_IOS_INFO_ALL, _gf_false);
+ fclose(logfp);
+ return 0;
+}
int
-io_stats_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- fd_t *fd, dict_t *params)
+_ios_destroy_dump_thread(struct ios_conf *conf)
{
- BUMP_FOP (CREATE);
+ conf->dump_thread_should_die = _gf_true;
+ if (conf->dump_thread_running) {
+ (void)pthread_cancel(conf->dump_thread);
+ (void)pthread_join(conf->dump_thread, NULL);
+ }
+ return 0;
+}
- frame->local = gf_strdup (loc->path);
+void *
+_ios_dump_thread(xlator_t *this)
+{
+ struct ios_conf *conf = NULL;
+ FILE *stats_logfp = NULL;
+ FILE *samples_logfp = NULL;
+ struct ios_dump_args args = {0};
+ int i;
+ int stats_bytes_written = 0;
+ int samples_bytes_written = 0;
+ char stats_filename[PATH_MAX];
+ char samples_filename[PATH_MAX];
+ char *xlator_name;
+ char *instance_name;
+ gf_boolean_t log_stats_fopen_failure = _gf_true;
+ gf_boolean_t log_samples_fopen_failure = _gf_true;
+ int old_cancel_type;
+
+ conf = this->private;
+ gf_log(this->name, GF_LOG_INFO,
+ "IO stats dump thread started, "
+ "polling IO stats every %d seconds",
+ conf->ios_dump_interval);
+ xlator_name = strdupa(conf->unique_id);
+ for (i = 0; i < strlen(xlator_name); i++) {
+ if (xlator_name[i] == '/')
+ xlator_name[i] = '_';
+ }
+ instance_name = this->instance_name;
+ if (this->name && strcmp(this->name, "glustershd") == 0) {
+ xlator_name = "shd";
+ } else if (this->prev && strcmp(this->prev->name, "nfs-server") == 0) {
+ xlator_name = "nfsd";
+ instance_name = this->prev->instance_name;
+ }
+ if (sys_mkdir(_IOS_DUMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG) == (-1)) {
+ if (errno != EEXIST) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "could not create stats-dump directory %s", _IOS_DUMP_DIR);
+ goto out;
+ }
+ }
+ if (sys_mkdir(_IOS_SAMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG) == (-1)) {
+ if (errno != EEXIST) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "could not create stats-sample directory %s", _IOS_SAMP_DIR);
+ goto out;
+ }
+ }
+ if (instance_name) {
+ stats_bytes_written = snprintf(stats_filename, PATH_MAX,
+ "%s/%s_%s_%s.dump", _IOS_DUMP_DIR,
+ __progname, xlator_name, instance_name);
+ samples_bytes_written = snprintf(
+ samples_filename, PATH_MAX, "%s/%s_%s_%s.samp", _IOS_SAMP_DIR,
+ __progname, xlator_name, instance_name);
+ } else {
+ stats_bytes_written = snprintf(stats_filename, PATH_MAX,
+ "%s/%s_%s.dump", _IOS_DUMP_DIR,
+ __progname, xlator_name);
+ samples_bytes_written = snprintf(samples_filename, PATH_MAX,
+ "%s/%s_%s.samp", _IOS_SAMP_DIR,
+ __progname, xlator_name);
+ }
+ if ((stats_bytes_written >= PATH_MAX) ||
+ (samples_bytes_written >= PATH_MAX)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Invalid path for stats dump (%s) and/or latency "
+ "samples (%s)",
+ stats_filename, samples_filename);
+ goto out;
+ }
+ while (1) {
+ if (conf->dump_thread_should_die)
+ break;
+ (void)pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS,
+ &old_cancel_type);
+ sleep(conf->ios_dump_interval);
+ (void)pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, &old_cancel_type);
+ /*
+ * It's not clear whether we should reopen this each time, or
+ * just hold it open and rewind/truncate on each iteration.
+ * Leaving it alone for now.
+ */
+ stats_logfp = fopen(stats_filename, "w+");
+ if (stats_logfp) {
+ (void)ios_dump_args_init(&args, conf->dump_format, stats_logfp);
+ io_stats_dump(this, &args, GF_IOS_INFO_ALL, _gf_false);
+ fclose(stats_logfp);
+ log_stats_fopen_failure = _gf_true;
+ } else if (log_stats_fopen_failure) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "could not open stats-dump file %s (%s)", stats_filename,
+ strerror(errno));
+ log_stats_fopen_failure = _gf_false;
+ }
+ samples_logfp = fopen(samples_filename, "w+");
+ if (samples_logfp) {
+ io_stats_dump_latency_samples_logfp(this, samples_logfp);
+ fclose(samples_logfp);
+ log_samples_fopen_failure = _gf_true;
+ } else if (log_samples_fopen_failure) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "could not open samples-dump file %s (%s)", samples_filename,
+ strerror(errno));
+ log_samples_fopen_failure = _gf_false;
+ }
+ }
+out:
+ conf->dump_thread_running = _gf_false;
+ gf_log(this->name, GF_LOG_INFO, "IO stats dump thread terminated");
+ return NULL;
+}
- START_FOP_LATENCY (frame);
+static gf_boolean_t
+match_special_xattr(dict_t *d, char *k, data_t *val, void *mdata)
+{
+ gf_boolean_t ret = _gf_false;
+ if (fnmatch("*io*stat*dump", k, 0) == 0) {
+ ret = _gf_true;
+ }
- STACK_WIND (frame, io_stats_create_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create,
- loc, flags, mode, fd, params);
- return 0;
+ return ret;
}
-
int
-io_stats_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset)
+io_stats_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
- BUMP_FOP (READ);
+ struct {
+ xlator_t *this;
+ inode_t *inode;
+ const char *path;
+ } stub;
- frame->local = fd;
+ stub.this = this;
+ stub.inode = loc->inode;
+ stub.path = loc->path;
- START_FOP_LATENCY (frame);
+ (void)dict_foreach_match(dict, match_special_xattr, NULL, conditional_dump,
+ &stub);
- STACK_WIND (frame, io_stats_readv_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv,
- fd, size, offset);
- return 0;
-}
+ START_FOP_LATENCY(frame);
+ STACK_WIND(frame, io_stats_setxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+ return 0;
+}
int
-io_stats_writev (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iovec *vector,
- int32_t count, off_t offset,
- struct iobref *iobref)
+io_stats_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
{
- int len = 0;
+ START_FOP_LATENCY(frame);
+ STACK_WIND(frame, io_stats_getxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+ return 0;
+}
- len = iov_length (vector, count);
+int
+io_stats_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- BUMP_FOP (WRITE);
- BUMP_WRITE (fd, len);
+ STACK_WIND(frame, io_stats_removexattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+ return 0;
+}
- START_FOP_LATENCY (frame);
+int
+io_stats_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_writev_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev,
- fd, vector, count, offset, iobref);
- return 0;
+ STACK_WIND(frame, io_stats_fsetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+ return 0;
}
-
int
-io_stats_statfs (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+io_stats_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
{
- BUMP_FOP (STATFS);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
-
- STACK_WIND (frame, io_stats_statfs_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->statfs,
- loc);
- return 0;
+ STACK_WIND(frame, io_stats_fgetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+ return 0;
}
-
int
-io_stats_flush (call_frame_t *frame, xlator_t *this,
- fd_t *fd)
+io_stats_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
{
- BUMP_FOP (FLUSH);
-
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_flush_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd);
- return 0;
+ STACK_WIND(frame, io_stats_fremovexattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+ return 0;
}
-
int
-io_stats_fsync (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t flags)
+io_stats_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
- BUMP_FOP (FSYNC);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
-
- STACK_WIND (frame, io_stats_fsync_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsync,
- fd, flags);
- return 0;
+ STACK_WIND(frame, io_stats_opendir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+ return 0;
}
+int
+io_stats_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *dict)
+{
+ frame->local = fd->inode;
+ START_FOP_LATENCY(frame);
-void
-conditional_dump (dict_t *dict, char *key, data_t *value, void *data)
-{
- struct {
- xlator_t *this;
- inode_t *inode;
- const char *path;
- } *stub;
- xlator_t *this = NULL;
- inode_t *inode = NULL;
- const char *path = NULL;
- char *filename = NULL;
-
- stub = data;
- this = stub->this;
- inode = stub->inode;
- path = stub->path;
-
- filename = alloca (value->len + 1);
- memset (filename, 0, value->len + 1);
- memcpy (filename, data_to_str (value), value->len);
-
- if (fnmatch ("*io*stat*dump", key, 0) == 0) {
- io_stats_dump (this, filename, inode, path);
- }
+ STACK_WIND(frame, io_stats_readdirp_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict);
+ return 0;
}
-
int
-io_stats_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict,
- int32_t flags)
+io_stats_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
{
- struct {
- xlator_t *this;
- inode_t *inode;
- const char *path;
- } stub;
+ START_FOP_LATENCY(frame);
- BUMP_FOP (SETXATTR);
+ STACK_WIND(frame, io_stats_readdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata);
+ return 0;
+}
- stub.this = this;
- stub.inode = loc->inode;
- stub.path = loc->path;
+int
+io_stats_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t datasync, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- dict_foreach (dict, conditional_dump, &stub);
+ STACK_WIND(frame, io_stats_fsyncdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsyncdir, fd, datasync, xdata);
+ return 0;
+}
- START_FOP_LATENCY (frame);
+int
+io_stats_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_setxattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setxattr,
- loc, dict, flags);
- return 0;
+ STACK_WIND(frame, io_stats_access_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->access, loc, mask, xdata);
+ return 0;
}
-
int
-io_stats_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
+io_stats_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
{
- BUMP_FOP (GETXATTR);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
-
- STACK_WIND (frame, io_stats_getxattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr,
- loc, name);
- return 0;
+ STACK_WIND(frame, io_stats_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
}
-
int
-io_stats_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
+io_stats_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- BUMP_FOP (REMOVEXATTR);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
+ STACK_WIND(frame, io_stats_setattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+ return 0;
+}
- STACK_WIND (frame, io_stats_removexattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr,
- loc, name);
+int
+io_stats_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- return 0;
+ STACK_WIND(frame, io_stats_fstat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd, xdata);
+ return 0;
}
-
int
-io_stats_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd)
+io_stats_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
{
- BUMP_FOP (OPENDIR);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
+ STACK_WIND(frame, io_stats_fallocate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+ xdata);
- STACK_WIND (frame, io_stats_opendir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->opendir,
- loc, fd);
- return 0;
+ return 0;
}
int
-io_stats_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
+io_stats_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
{
- BUMP_FOP (READDIRP);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
+ STACK_WIND(frame, io_stats_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
- STACK_WIND (frame, io_stats_readdirp_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdirp,
- fd, size, offset);
-
- return 0;
+ return 0;
}
-
int
-io_stats_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset)
+io_stats_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
{
- BUMP_FOP (READDIR);
-
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_readdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdir,
- fd, size, offset);
+ STACK_WIND(frame, io_stats_zerofill_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
- return 0;
+ return 0;
}
+int32_t
+io_stats_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
+
+ STACK_WIND(frame, io_stats_ipc_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ipc, op, xdata);
+ return 0;
+}
int
-io_stats_fsyncdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t datasync)
+io_stats_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
{
- BUMP_FOP (FSYNCDIR);
-
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_fsyncdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsyncdir,
- fd, datasync);
- return 0;
+ STACK_WIND(frame, io_stats_lk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata);
+ return 0;
}
-
int
-io_stats_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask)
+io_stats_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ int32_t len, dict_t *xdata)
{
- BUMP_FOP (ACCESS);
-
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_access_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->access,
- loc, mask);
- return 0;
+ STACK_WIND(frame, io_stats_rchecksum_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata);
+ return 0;
}
-
int
-io_stats_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset)
+io_stats_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata)
{
- BUMP_FOP (FTRUNCATE);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
+ STACK_WIND(frame, io_stats_seek_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata);
+ return 0;
+}
- STACK_WIND (frame, io_stats_ftruncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate,
- fd, offset);
+int
+io_stats_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct gf_lease *lease, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
- return 0;
+ STACK_WIND(frame, io_stats_lease_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lease, loc, lease, xdata);
+ return 0;
}
-
int
-io_stats_fsetattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iatt *stbuf, int32_t valid)
+io_stats_getactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
{
- BUMP_FOP (FSETATTR);
-
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_setattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetattr,
- fd, stbuf, valid);
- return 0;
+ STACK_WIND(frame, io_stats_getactivelk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getactivelk, loc, xdata);
+ return 0;
}
-
int
-io_stats_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd)
+io_stats_setactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ lock_migration_info_t *locklist, dict_t *xdata)
{
- BUMP_FOP (FSTAT);
+ START_FOP_LATENCY(frame);
- START_FOP_LATENCY (frame);
-
- STACK_WIND (frame, io_stats_fstat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat,
- fd);
- return 0;
+ STACK_WIND(frame, io_stats_setactivelk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setactivelk, loc, locklist, xdata);
+ return 0;
}
-
int
-io_stats_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t cmd, struct gf_flock *lock)
+io_stats_compound(call_frame_t *frame, xlator_t *this, void *args,
+ dict_t *xdata)
{
- BUMP_FOP (LK);
-
- START_FOP_LATENCY (frame);
+ START_FOP_LATENCY(frame);
- STACK_WIND (frame, io_stats_lk_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lk,
- fd, cmd, lock);
- return 0;
+ STACK_WIND(frame, io_stats_compound_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->compound, args, xdata);
+ return 0;
}
-
int
-io_stats_release (xlator_t *this, fd_t *fd)
+io_stats_release(xlator_t *this, fd_t *fd)
{
- struct ios_fd *iosfd = NULL;
-
- BUMP_FOP (RELEASE);
+ struct ios_fd *iosfd = NULL;
+ struct ios_conf *conf = NULL;
- ios_fd_ctx_get (fd, this, &iosfd);
- if (iosfd) {
- io_stats_dump_fd (this, iosfd);
+ BUMP_FOP(RELEASE);
- if (iosfd->filename)
- GF_FREE (iosfd->filename);
- GF_FREE (iosfd);
+ conf = this->private;
+ if (conf) {
+ LOCK(&conf->lock);
+ {
+ conf->cumulative.nr_opens--;
}
+ UNLOCK(&conf->lock);
+ }
- return 0;
-}
+ ios_fd_ctx_get(fd, this, &iosfd);
+ if (iosfd) {
+ io_stats_dump_fd(this, iosfd);
+
+ GF_FREE(iosfd->filename);
+ GF_FREE(iosfd);
+ }
+ return 0;
+}
int
-io_stats_releasedir (xlator_t *this, fd_t *fd)
+io_stats_releasedir(xlator_t *this, fd_t *fd)
{
- BUMP_FOP (RELEASEDIR);
+ BUMP_FOP(RELEASEDIR);
- return 0;
+ return 0;
}
-
int
-io_stats_forget (xlator_t *this, inode_t *inode)
+io_stats_forget(xlator_t *this, inode_t *inode)
{
- BUMP_FOP (FORGET);
-
- return 0;
+ BUMP_FOP(FORGET);
+ ios_stats_cleanup(this, inode);
+ return 0;
}
-int
-reconfigure (xlator_t *this, dict_t *options)
+static int
+ios_init_top_stats(struct ios_conf *conf)
{
- struct ios_conf *conf = NULL;
- char *str = NULL;
- int ret = 0;
- char *log_str = NULL;
- glusterfs_ctx_t *ctx = NULL;
+ int i = 0;
- if (!this || !this->private)
- return -1;
+ GF_ASSERT(conf);
- conf = this->private;
+ for (i = 0; i < IOS_STATS_TYPE_MAX; i++) {
+ conf->list[i].iosstats = GF_CALLOC(1, sizeof(*conf->list[i].iosstats),
+ gf_io_stats_mt_ios_stat);
- ret = dict_get_str (options, "dump-fd-stats", &str);
- if (ret == 0) {
- ret = gf_string2boolean (str, &conf->dump_fd_stats);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "'dump-fd-stats' takes only boolean arguments");
- return -1;
- }
+ if (!conf->list[i].iosstats)
+ return -1;
- if (conf->dump_fd_stats) {
- gf_log (this->name, GF_LOG_DEBUG,
- "enabling dump-fd-stats");
- }
- }
+ INIT_LIST_HEAD(&conf->list[i].iosstats->list);
+ LOCK_INIT(&conf->list[i].lock);
+ }
- ret = dict_get_str_boolean (options, "latency-measurement", 0);
- if (ret != -1) {
- if (conf->measure_latency != ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "changing latency measurement from %d to %d",
- conf->measure_latency, ret);
- }
- conf->measure_latency = ret;
+ for (i = 0; i < IOS_STATS_THRU_MAX; i++) {
+ conf->thru_list[i].iosstats = GF_CALLOC(
+ 1, sizeof(*conf->thru_list[i].iosstats), gf_io_stats_mt_ios_stat);
+
+ if (!conf->thru_list[i].iosstats)
+ return -1;
+
+ INIT_LIST_HEAD(&conf->thru_list[i].iosstats->list);
+ LOCK_INIT(&conf->thru_list[i].lock);
+ }
+
+ return 0;
+}
+
+static void
+ios_destroy_top_stats(struct ios_conf *conf)
+{
+ int i = 0;
+ struct ios_stat_head *list_head = NULL;
+ struct ios_stat_list *entry = NULL;
+ struct ios_stat_list *tmp = NULL;
+ struct ios_stat_list *list = NULL;
+ struct ios_stat *stat = NULL;
+
+ GF_ASSERT(conf);
+
+ LOCK(&conf->lock);
+
+ conf->cumulative.nr_opens = 0;
+ conf->cumulative.max_nr_opens = 0;
+ conf->cumulative.max_openfd_time.tv_sec = 0;
+ conf->cumulative.max_openfd_time.tv_usec = 0;
+
+ for (i = 0; i < IOS_STATS_TYPE_MAX; i++) {
+ list_head = &conf->list[i];
+ if (!list_head)
+ continue;
+ list_for_each_entry_safe(entry, tmp, &list_head->iosstats->list, list)
+ {
+ list = entry;
+ stat = list->iosstat;
+ ios_stat_unref(stat);
+ list_del(&list->list);
+ GF_FREE(list);
+ list_head->members--;
}
- ctx = glusterfs_ctx_get ();
- if (!ctx)
- return -1;
-
- ret = dict_get_str (options, "log-level", &log_str);
- if (!ret) {
- if (!is_gf_log_command(this, "trusted.glusterfs.set-log-level", log_str)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "changing log-level to %s", log_str);
- }
+ GF_FREE(list_head->iosstats);
+ }
+
+ for (i = 0; i < IOS_STATS_THRU_MAX; i++) {
+ list_head = &conf->thru_list[i];
+ if (!list_head)
+ continue;
+ list_for_each_entry_safe(entry, tmp, &list_head->iosstats->list, list)
+ {
+ list = entry;
+ stat = list->iosstat;
+ ios_stat_unref(stat);
+ list_del(&list->list);
+ GF_FREE(list);
+ list_head->members--;
}
- return 0;
+ GF_FREE(list_head->iosstats);
+ }
+
+ UNLOCK(&conf->lock);
+
+ return;
}
-int32_t
-mem_acct_init (xlator_t *this)
+static void
+io_stats_clear(struct ios_conf *conf)
{
- int ret = -1;
-
- if (!this)
- return ret;
+ time_t now = 0;
+
+ GF_ASSERT(conf);
+ now = gf_time();
+
+ LOCK(&conf->lock);
+ {
+ ios_global_stats_clear(&conf->cumulative, now);
+ ios_global_stats_clear(&conf->incremental, now);
+ conf->increment = 0;
+ }
+ UNLOCK(&conf->lock);
+}
- ret = xlator_mem_acct_init (this, gf_io_stats_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- " failed");
- return ret;
- }
+int32_t
+io_priv(xlator_t *this)
+{
+ int i;
+ char key[GF_DUMP_MAX_BUF_LEN];
+ char key_prefix_cumulative[GF_DUMP_MAX_BUF_LEN];
+ char key_prefix_incremental[GF_DUMP_MAX_BUF_LEN];
+ double min, max, avg;
+ uint64_t count, total;
+ struct ios_conf *conf = NULL;
+
+ conf = this->private;
+ if (!conf)
+ return -1;
+
+ if (!conf->count_fop_hits || !conf->measure_latency)
+ return -1;
+
+ gf_proc_dump_write("cumulative.data_read", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(conf->cumulative.data_read));
+ gf_proc_dump_write("cumulative.data_written", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(conf->cumulative.data_written));
+
+ gf_proc_dump_write("incremental.data_read", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(conf->incremental.data_read));
+ gf_proc_dump_write("incremental.data_written", "%" GF_PRI_ATOMIC,
+ GF_ATOMIC_GET(conf->incremental.data_written));
+
+ snprintf(key_prefix_cumulative, GF_DUMP_MAX_BUF_LEN, "%s.cumulative",
+ this->name);
+ snprintf(key_prefix_incremental, GF_DUMP_MAX_BUF_LEN, "%s.incremental",
+ this->name);
+
+ for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+ count = GF_ATOMIC_GET(conf->cumulative.fop_hits[i]);
+ total = conf->cumulative.latency[i].total;
+ min = conf->cumulative.latency[i].min;
+ max = conf->cumulative.latency[i].max;
+ avg = conf->cumulative.latency[i].avg;
+
+ gf_proc_dump_build_key(key, key_prefix_cumulative, "%s",
+ (char *)gf_fop_list[i]);
+
+ gf_proc_dump_write(key, "%" PRId64 ",%" PRId64 ",%.03f,%.03f,%.03f",
+ count, total, min, max, avg);
+
+ count = GF_ATOMIC_GET(conf->incremental.fop_hits[i]);
+ total = conf->incremental.latency[i].total;
+ min = conf->incremental.latency[i].min;
+ max = conf->incremental.latency[i].max;
+ avg = conf->incremental.latency[i].avg;
+
+ gf_proc_dump_build_key(key, key_prefix_incremental, "%s",
+ (char *)gf_fop_list[i]);
+
+ gf_proc_dump_write(key, "%" PRId64 ",%" PRId64 ",%.03f,%.03f,%.03f",
+ count, total, min, max, avg);
+ }
+
+ return 0;
+}
- return ret;
+static void
+ios_set_log_format_code(struct ios_conf *conf, char *dump_format_str)
+{
+ if (strcmp(dump_format_str, "json") == 0)
+ conf->dump_format = IOS_DUMP_TYPE_JSON_FILE;
+ else if (strcmp(dump_format_str, "text") == 0)
+ conf->dump_format = IOS_DUMP_TYPE_FILE;
+ else if (strcmp(dump_format_str, "dict") == 0)
+ conf->dump_format = IOS_DUMP_TYPE_DICT;
+ else if (strcmp(dump_format_str, "samples") == 0)
+ conf->dump_format = IOS_DUMP_TYPE_SAMPLES;
}
-int
-init (xlator_t *this)
+void
+xlator_set_loglevel(xlator_t *this, int log_level)
{
- dict_t *options = NULL;
- struct ios_conf *conf = NULL;
- char *str = NULL;
- int ret = 0;
- char *log_str = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ glusterfs_graph_t *active = NULL;
+ xlator_t *top = NULL;
+ xlator_t *trav = this;
+
+ ctx = this->ctx;
+ GF_ASSERT(ctx);
+ active = ctx->active;
+ top = active->first;
+
+ if (log_level == -1)
+ return;
- if (!this)
- return -1;
+ if (ctx->cmd_args.brick_mux) {
+ /* Set log-level for all brick xlators */
+ top->loglevel = log_level;
- if (!this->children || this->children->next) {
- gf_log (this->name, GF_LOG_ERROR,
- "io_stats translator requires one subvolume");
- return -1;
+ /* Set log-level for parent xlator */
+ if (this->parents)
+ this->parents->xlator->loglevel = log_level;
+
+ while (trav) {
+ trav->loglevel = log_level;
+ trav = trav->next;
}
+ } else {
+ gf_log_set_loglevel(this->ctx, log_level);
+ }
+}
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile ");
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+ struct ios_conf *conf = NULL;
+ int ret = -1;
+ char *sys_log_str = NULL;
+ char *log_format_str = NULL;
+ char *logger_str = NULL;
+ char *dump_format_str = NULL;
+ int sys_log_level = -1;
+ char *log_str = NULL;
+ int log_level = -1;
+ int log_format = -1;
+ int logger = -1;
+ uint32_t log_buf_size = 0;
+ uint32_t log_flush_timeout = 0;
+ int32_t old_dump_interval;
+ int32_t threads;
+
+ if (!this || !this->private)
+ goto out;
+
+ conf = this->private;
+
+ GF_OPTION_RECONF("dump-fd-stats", conf->dump_fd_stats, options, bool, out);
+
+ GF_OPTION_RECONF("count-fop-hits", conf->count_fop_hits, options, bool,
+ out);
+
+ GF_OPTION_RECONF("latency-measurement", conf->measure_latency, options,
+ bool, out);
+
+ old_dump_interval = conf->ios_dump_interval;
+ GF_OPTION_RECONF("ios-dump-interval", conf->ios_dump_interval, options,
+ int32, out);
+ if ((old_dump_interval <= 0) && (conf->ios_dump_interval > 0)) {
+ conf->dump_thread_running = _gf_true;
+ conf->dump_thread_should_die = _gf_false;
+ ret = gf_thread_create(&conf->dump_thread, NULL,
+ (void *)&_ios_dump_thread, this, "iosdump");
+ if (ret) {
+ conf->dump_thread_running = _gf_false;
+ gf_log(this ? this->name : "io-stats", GF_LOG_ERROR,
+ "Failed to start thread"
+ "while reconfigure. Returning %d",
+ ret);
+ goto out;
}
+ } else if ((old_dump_interval > 0) && (conf->ios_dump_interval == 0)) {
+ _ios_destroy_dump_thread(conf);
+ }
+
+ GF_OPTION_RECONF("ios-sample-interval", conf->ios_sample_interval, options,
+ int32, out);
+ GF_OPTION_RECONF("ios-dump-format", dump_format_str, options, str, out);
+ ios_set_log_format_code(conf, dump_format_str);
+ GF_OPTION_RECONF("ios-sample-buf-size", conf->ios_sample_buf_size, options,
+ int32, out);
+ GF_OPTION_RECONF("sys-log-level", sys_log_str, options, str, out);
+ if (sys_log_str) {
+ sys_log_level = glusterd_check_log_level(sys_log_str);
+ set_sys_log_level(sys_log_level);
+ }
+
+ GF_OPTION_RECONF("log-level", log_str, options, str, out);
+ if (log_str) {
+ log_level = glusterd_check_log_level(log_str);
+ /* Set loglevel for all children and server xlators */
+ xlator_set_loglevel(this, log_level);
+ }
+
+ GF_OPTION_RECONF("logger", logger_str, options, str, out);
+ if (logger_str) {
+ logger = gf_check_logger(logger_str);
+ gf_log_set_logger(logger);
+ }
+
+ GF_OPTION_RECONF("log-format", log_format_str, options, str, out);
+ if (log_format_str) {
+ log_format = gf_check_log_format(log_format_str);
+ gf_log_set_logformat(log_format);
+ }
+
+ GF_OPTION_RECONF("log-buf-size", log_buf_size, options, uint32, out);
+ gf_log_set_log_buf_size(log_buf_size);
+
+ GF_OPTION_RECONF("log-flush-timeout", log_flush_timeout, options, time,
+ out);
+ gf_log_set_log_flush_timeout(log_flush_timeout);
+
+ GF_OPTION_RECONF("threads", threads, options, int32, out);
+ gf_async_adjust_threads(threads);
+
+ ret = 0;
+out:
+ gf_log(this ? this->name : "io-stats", GF_LOG_DEBUG,
+ "reconfigure returning %d", ret);
+ return ret;
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+ int ret = -1;
- options = this->options;
+ if (!this)
+ return ret;
- conf = GF_CALLOC (1, sizeof(*conf), gf_io_stats_mt_ios_conf);
+ ret = xlator_mem_acct_init(this, gf_io_stats_mt_end + 1);
- if (!conf) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- return -1;
- }
+ if (ret != 0) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Memory accounting init"
+ " failed");
+ return ret;
+ }
- LOCK_INIT (&conf->lock);
+ return ret;
+}
- gettimeofday (&conf->cumulative.started_at, NULL);
- gettimeofday (&conf->incremental.started_at, NULL);
+void
+ios_conf_destroy(struct ios_conf *conf)
+{
+ if (!conf)
+ return;
- ret = dict_get_str (options, "dump-fd-stats", &str);
- if (ret == 0) {
- ret = gf_string2boolean (str, &conf->dump_fd_stats);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "'dump-fd-stats' takes only boolean arguments");
- return -1;
- }
+ ios_destroy_top_stats(conf);
+ _ios_destroy_dump_thread(conf);
+ ios_destroy_sample_buf(conf->ios_sample_buf);
+ LOCK_DESTROY(&conf->lock);
+ gf_dnscache_deinit(conf->dnscache);
+ GF_FREE(conf);
+}
- if (conf->dump_fd_stats) {
- gf_log (this->name, GF_LOG_DEBUG,
- "enabling dump-fd-stats");
- }
- }
+static void
+ios_init_stats(struct ios_global_stats *stats)
+{
+ int i = 0;
- ret = dict_get_str_boolean (options, "latency-measurement", 0);
- if (ret != -1) {
- conf->measure_latency = ret;
- if (conf->measure_latency)
- gf_log (this->name, GF_LOG_DEBUG,
- "enabling latency measurement");
- }
+ GF_ATOMIC_INIT(stats->data_read, 0);
+ GF_ATOMIC_INIT(stats->data_written, 0);
- ret = dict_get_str (options, "log-level", &log_str);
- if (!ret) {
- if (!is_gf_log_command(this, "trusted.glusterfs.set-log-level", log_str)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "changing log-level to %s", log_str);
- }
- }
+ for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+ GF_ATOMIC_INIT(stats->block_count_write[i], 0);
+ GF_ATOMIC_INIT(stats->block_count_read[i], 0);
+ }
- this->private = conf;
+ for (i = 0; i < GF_FOP_MAXVALUE; i++)
+ GF_ATOMIC_INIT(stats->fop_hits[i], 0);
- return 0;
+ for (i = 0; i < GF_UPCALL_FLAGS_MAXVALUE; i++)
+ GF_ATOMIC_INIT(stats->upcall_hits[i], 0);
+
+ stats->started_at = gf_time();
}
+int
+init(xlator_t *this)
+{
+ struct ios_conf *conf = NULL;
+ char *volume_id = NULL;
+ char *sys_log_str = NULL;
+ char *logger_str = NULL;
+ char *log_format_str = NULL;
+ char *dump_format_str = NULL;
+ int logger = -1;
+ int log_format = -1;
+ int sys_log_level = -1;
+ char *log_str = NULL;
+ int log_level = -1;
+ int ret = -1;
+ uint32_t log_buf_size = 0;
+ uint32_t log_flush_timeout = 0;
+ int32_t threads;
+
+ if (!this)
+ return -1;
+
+ if (!this->children) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "io_stats translator requires at least one subvolume");
+ return -1;
+ }
+
+ if (!this->parents) {
+ /* This is very much valid as io-stats currently is loaded
+ * on top of volumes on both client and server, hence this is
+ * not an warning message */
+ gf_log(this->name, GF_LOG_DEBUG, "dangling volume. check volfile ");
+ }
+
+ conf = GF_CALLOC(1, sizeof(*conf), gf_io_stats_mt_ios_conf);
+
+ if (!conf)
+ goto out;
+
+ if (dict_get_str(this->options, "unique-id", &conf->unique_id) != 0) {
+ /* This is always set on servers, so we must be a client. */
+ conf->unique_id = this->name;
+ }
+
+ ret = dict_get_strn(this->options, "volume-id", SLEN("volume-id"),
+ &volume_id);
+ if (!ret) {
+ strncpy(this->graph->volume_id, volume_id, GF_UUID_BUF_SIZE);
+ }
+ /*
+ * Init it just after calloc, so that we are sure the lock is inited
+ * in case of error paths.
+ */
+ LOCK_INIT(&conf->lock);
+ LOCK_INIT(&conf->ios_sampling_lock);
+
+ ios_init_stats(&conf->cumulative);
+ ios_init_stats(&conf->incremental);
+
+ ret = ios_init_top_stats(conf);
+ if (ret)
+ goto out;
+
+ GF_OPTION_INIT("dump-fd-stats", conf->dump_fd_stats, bool, out);
+
+ GF_OPTION_INIT("count-fop-hits", conf->count_fop_hits, bool, out);
+
+ GF_OPTION_INIT("latency-measurement", conf->measure_latency, bool, out);
+
+ GF_OPTION_INIT("ios-dump-interval", conf->ios_dump_interval, int32, out);
+
+ GF_OPTION_INIT("ios-sample-interval", conf->ios_sample_interval, int32,
+ out);
+
+ GF_OPTION_INIT("ios-dump-format", dump_format_str, str, out);
+ ios_set_log_format_code(conf, dump_format_str);
+
+ GF_OPTION_INIT("ios-sample-buf-size", conf->ios_sample_buf_size, int32,
+ out);
+
+ ret = ios_init_sample_buf(conf);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "Out of memory.");
+ goto out;
+ }
+
+ GF_OPTION_INIT("ios-dnscache-ttl-sec", conf->ios_dnscache_ttl_sec, int32,
+ out);
+ conf->dnscache = gf_dnscache_init(conf->ios_dnscache_ttl_sec);
+ if (!conf->dnscache) {
+ ret = -1;
+ goto out;
+ }
+
+ GF_OPTION_INIT("sys-log-level", sys_log_str, str, out);
+ if (sys_log_str) {
+ sys_log_level = glusterd_check_log_level(sys_log_str);
+ set_sys_log_level(sys_log_level);
+ }
+
+ GF_OPTION_INIT("log-level", log_str, str, out);
+ if (log_str) {
+ log_level = glusterd_check_log_level(log_str);
+ if (DEFAULT_LOG_LEVEL != log_level)
+ gf_log_set_loglevel(this->ctx, log_level);
+ }
+
+ GF_OPTION_INIT("logger", logger_str, str, out);
+ if (logger_str) {
+ logger = gf_check_logger(logger_str);
+ gf_log_set_logger(logger);
+ }
+
+ GF_OPTION_INIT("log-format", log_format_str, str, out);
+ if (log_format_str) {
+ log_format = gf_check_log_format(log_format_str);
+ gf_log_set_logformat(log_format);
+ }
+
+ GF_OPTION_INIT("log-buf-size", log_buf_size, uint32, out);
+ gf_log_set_log_buf_size(log_buf_size);
+
+ GF_OPTION_INIT("log-flush-timeout", log_flush_timeout, time, out);
+ gf_log_set_log_flush_timeout(log_flush_timeout);
+
+ GF_OPTION_INIT("threads", threads, int32, out);
+ gf_async_adjust_threads(threads);
+
+ this->private = conf;
+ if (conf->ios_dump_interval > 0) {
+ conf->dump_thread_running = _gf_true;
+ conf->dump_thread_should_die = _gf_false;
+ ret = gf_thread_create(&conf->dump_thread, NULL,
+ (void *)&_ios_dump_thread, this, "iosdump");
+ if (ret) {
+ conf->dump_thread_running = _gf_false;
+ gf_log(this ? this->name : "io-stats", GF_LOG_ERROR,
+ "Failed to start thread"
+ "in init. Returning %d",
+ ret);
+ goto out;
+ }
+ }
+ return 0;
+out:
+ ios_conf_destroy(conf);
+ return ret;
+}
void
-fini (xlator_t *this)
+fini(xlator_t *this)
{
- struct ios_conf *conf = NULL;
-
- if (!this)
- return;
+ struct ios_conf *conf = NULL;
- conf = this->private;
-
- if (!conf)
- return;
- this->private = NULL;
+ if (!this)
+ return;
- GF_FREE(conf);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "io-stats translator unloaded");
- return;
+ ios_conf_destroy(conf);
+ this->private = NULL;
+ gf_log(this->name, GF_LOG_INFO, "io-stats translator unloaded");
+ return;
}
int
-validate_options (xlator_t *this, dict_t *options, char **op_errstr)
+notify(xlator_t *this, int32_t event, void *data, ...)
{
- int ret = -1;
- char *log_str = NULL;
+ int ret = 0;
+ struct ios_dump_args args = {0};
+ dict_t *output = NULL;
+ dict_t *dict = NULL;
+ int32_t op = 0;
+ int32_t list_cnt = 0;
+ double throughput = 0;
+ double time = 0;
+ gf_boolean_t is_peek = _gf_false;
+ va_list ap;
+ struct gf_upcall *up_data = NULL;
+ struct gf_upcall_cache_invalidation *up_ci = NULL;
+
+ dict = data;
+ va_start(ap, data);
+ output = va_arg(ap, dict_t *);
+ va_end(ap);
+ switch (event) {
+ case GF_EVENT_TRANSLATOR_INFO:
+ ret = dict_get_str_boolean(dict, "clear-stats", _gf_false);
+ if (ret) {
+ ret = dict_set_int32(output, "top-op", op);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to set top-op in dict");
+ goto out;
+ }
+ ios_destroy_top_stats(this->private);
+ ret = ios_init_top_stats(this->private);
+ if (ret)
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to reset top stats");
+ ret = dict_set_int32(output, "stats-cleared", ret ? 0 : 1);
+ if (ret)
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to set stats-cleared"
+ " in dict");
+ goto out;
+ }
+
+ ret = dict_get_int32(dict, "top-op", &op);
+ if (!ret) {
+ ret = dict_get_int32(dict, "list-cnt", &list_cnt);
+ if (op > IOS_STATS_TYPE_NONE && op < IOS_STATS_TYPE_MAX)
+ ret = io_stats_dump_stats_to_dict(this, output, op,
+ list_cnt);
+ if (op == IOS_STATS_TYPE_READ_THROUGHPUT ||
+ op == IOS_STATS_TYPE_WRITE_THROUGHPUT) {
+ ret = dict_get_double(dict, "throughput", &throughput);
+ if (!ret) {
+ ret = dict_get_double(dict, "time", &time);
+ if (ret)
+ goto out;
+ ret = dict_set_double(output, "throughput", throughput);
+ if (ret)
+ goto out;
+ ret = dict_set_double(output, "time", time);
+ if (ret)
+ goto out;
+ }
+ ret = 0;
+ }
+ } else {
+ ret = dict_get_int32(dict, "info-op", &op);
+ if (ret || op < GF_IOS_INFO_ALL || GF_IOS_INFO_CLEAR < op)
+ op = GF_IOS_INFO_ALL;
+
+ ret = dict_set_int32(output, "info-op", op);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to set info-op in dict");
+ goto out;
+ }
- ret = dict_get_str (options, "log-level", &log_str);
- if (ret)
- return 0;
- ret = glusterd_check_log_level(log_str);
- if (ret == -1)
- *op_errstr = gf_strdup ("Invalid log level. possible option are"
- " DEBUG|WARNING|ERROR|CRITICAL|NONE|TRACE");
- else
- ret = 0;
- return ret;
+ if (GF_IOS_INFO_CLEAR == op) {
+ io_stats_clear(this->private);
+
+ ret = dict_set_int32(output, "stats-cleared", 1);
+ if (ret)
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to set stats-cleared"
+ " in dict");
+ } else {
+ ret = dict_get_str_boolean(dict, "peek", _gf_false);
+ if (-1 != ret)
+ is_peek = ret;
+
+ (void)ios_dump_args_init(&args, IOS_DUMP_TYPE_DICT, output);
+ ret = io_stats_dump(this, &args, op, is_peek);
+ }
+ }
+ break;
+ case GF_EVENT_UPCALL:
+ up_data = (struct gf_upcall *)data;
+ ios_bump_upcall(this, GF_UPCALL);
+
+ switch (up_data->event_type) {
+ case GF_UPCALL_RECALL_LEASE:
+ ios_bump_upcall(this, GF_UPCALL_LEASE_RECALL);
+ break;
+ case GF_UPCALL_CACHE_INVALIDATION:
+ up_ci = (struct gf_upcall_cache_invalidation *)
+ up_data->data;
+ if (up_ci->flags & (UP_XATTR | UP_XATTR_RM))
+ ios_bump_upcall(this, GF_UPCALL_CI_XATTR);
+ if (up_ci->flags & IATT_UPDATE_FLAGS)
+ ios_bump_upcall(this, GF_UPCALL_CI_STAT);
+ if (up_ci->flags & UP_RENAME_FLAGS)
+ ios_bump_upcall(this, GF_UPCALL_CI_RENAME);
+ if (up_ci->flags & UP_FORGET)
+ ios_bump_upcall(this, GF_UPCALL_CI_FORGET);
+ if (up_ci->flags & UP_NLINK)
+ ios_bump_upcall(this, GF_UPCALL_CI_NLINK);
+ break;
+ default:
+ gf_msg_debug(this->name, 0,
+ "Unknown upcall event "
+ "type :%d",
+ up_data->event_type);
+ break;
+ }
+
+ default_notify(this, event, data);
+ break;
+ default:
+ default_notify(this, event, data);
+ break;
+ }
+out:
+ return ret;
}
+struct xlator_dumpops dumpops = {.priv = io_priv};
+
struct xlator_fops fops = {
- .stat = io_stats_stat,
- .readlink = io_stats_readlink,
- .mknod = io_stats_mknod,
- .mkdir = io_stats_mkdir,
- .unlink = io_stats_unlink,
- .rmdir = io_stats_rmdir,
- .symlink = io_stats_symlink,
- .rename = io_stats_rename,
- .link = io_stats_link,
- .truncate = io_stats_truncate,
- .open = io_stats_open,
- .readv = io_stats_readv,
- .writev = io_stats_writev,
- .statfs = io_stats_statfs,
- .flush = io_stats_flush,
- .fsync = io_stats_fsync,
- .setxattr = io_stats_setxattr,
- .getxattr = io_stats_getxattr,
- .removexattr = io_stats_removexattr,
- .opendir = io_stats_opendir,
- .readdir = io_stats_readdir,
- .readdirp = io_stats_readdirp,
- .fsyncdir = io_stats_fsyncdir,
- .access = io_stats_access,
- .ftruncate = io_stats_ftruncate,
- .fstat = io_stats_fstat,
- .create = io_stats_create,
- .lk = io_stats_lk,
- .inodelk = io_stats_inodelk,
- .finodelk = io_stats_finodelk,
- .entrylk = io_stats_entrylk,
- .lookup = io_stats_lookup,
- .xattrop = io_stats_xattrop,
- .fxattrop = io_stats_fxattrop,
- .setattr = io_stats_setattr,
- .fsetattr = io_stats_fsetattr,
+ .stat = io_stats_stat,
+ .readlink = io_stats_readlink,
+ .mknod = io_stats_mknod,
+ .mkdir = io_stats_mkdir,
+ .unlink = io_stats_unlink,
+ .rmdir = io_stats_rmdir,
+ .symlink = io_stats_symlink,
+ .rename = io_stats_rename,
+ .link = io_stats_link,
+ .truncate = io_stats_truncate,
+ .open = io_stats_open,
+ .readv = io_stats_readv,
+ .writev = io_stats_writev,
+ .statfs = io_stats_statfs,
+ .flush = io_stats_flush,
+ .fsync = io_stats_fsync,
+ .setxattr = io_stats_setxattr,
+ .getxattr = io_stats_getxattr,
+ .removexattr = io_stats_removexattr,
+ .fsetxattr = io_stats_fsetxattr,
+ .fgetxattr = io_stats_fgetxattr,
+ .fremovexattr = io_stats_fremovexattr,
+ .opendir = io_stats_opendir,
+ .readdir = io_stats_readdir,
+ .readdirp = io_stats_readdirp,
+ .fsyncdir = io_stats_fsyncdir,
+ .access = io_stats_access,
+ .ftruncate = io_stats_ftruncate,
+ .fstat = io_stats_fstat,
+ .create = io_stats_create,
+ .lk = io_stats_lk,
+ .inodelk = io_stats_inodelk,
+ .finodelk = io_stats_finodelk,
+ .entrylk = io_stats_entrylk,
+ .fentrylk = io_stats_fentrylk,
+ .lookup = io_stats_lookup,
+ .xattrop = io_stats_xattrop,
+ .fxattrop = io_stats_fxattrop,
+ .setattr = io_stats_setattr,
+ .fsetattr = io_stats_fsetattr,
+ .fallocate = io_stats_fallocate,
+ .discard = io_stats_discard,
+ .zerofill = io_stats_zerofill,
+ .ipc = io_stats_ipc,
+ .rchecksum = io_stats_rchecksum,
+ .seek = io_stats_seek,
+ .lease = io_stats_lease,
+ .getactivelk = io_stats_getactivelk,
+ .setactivelk = io_stats_setactivelk,
+ .compound = io_stats_compound,
+ .copy_file_range = io_stats_copy_file_range,
};
struct xlator_cbks cbks = {
- .release = io_stats_release,
- .releasedir = io_stats_releasedir,
- .forget = io_stats_forget,
+ .release = io_stats_release,
+ .releasedir = io_stats_releasedir,
+ .forget = io_stats_forget,
};
struct volume_options options[] = {
- { .key = {"dump-fd-stats"},
- .type = GF_OPTION_TYPE_BOOL,
- },
- { .key = { "latency-measurement" },
- .type = GF_OPTION_TYPE_BOOL,
- },
- { .key = {"log-level"},
- .type = GF_OPTION_TYPE_STR,
- },
- { .key = {NULL} },
+ {.key = {"dump-fd-stats"},
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "If on stats related to file-operations would be "
+ "tracked inside GlusterFS data-structures."},
+ {.key = {"ios-dump-interval"},
+ .type = GF_OPTION_TYPE_INT,
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .min = 0,
+ .max = 3600,
+ .default_value = "0",
+ .description = "Interval (in seconds) at which to auto-dump "
+ "statistics. Zero disables automatic dumping."},
+ {.key = {"ios-sample-interval"},
+ .type = GF_OPTION_TYPE_INT,
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .min = 0,
+ .max = 65535,
+ .default_value = "0",
+ .description = "Interval in which we want to collect FOP latency "
+ "samples. 2 means collect a sample every 2nd FOP."},
+ {.key = {"ios-dump-format"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {GD_OP_VERSION_3_12_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .default_value = "json",
+ .description = " The dump-format option specifies the format in which"
+ " to dump the statistics. Select between \"text\", "
+ "\"json\", \"dict\" and \"samples\". Default is "
+ "\"json\".",
+ .value = {"text", "json", "dict", "samples"}},
+ {.key = {"ios-sample-buf-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .min = 1024,
+ .max = 1024 * 1024,
+ .default_value = "65535",
+ .description = "The maximum size of our FOP sampling ring buffer."},
+ {.key = {"ios-dnscache-ttl-sec"},
+ .type = GF_OPTION_TYPE_INT,
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .min = 1,
+ .max = 3600 * 72,
+ .default_value = "86400",
+ .description = "The interval after wish a cached DNS entry will be "
+ "re-validated. Default: 24 hrs"},
+ {.key = {"latency-measurement"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .default_value = "off",
+ .description = "If on stats related to the latency of each operation "
+ "would be tracked inside GlusterFS data-structures. "},
+ {
+ .key = {"count-fop-hits"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {"io-stats"},
+ },
+ {.key = {"log-level"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"DEBUG", "WARNING", "ERROR", "INFO", "CRITICAL", "NONE",
+ "TRACE"}},
+
+ /* These are synthetic entries to assist validation of CLI's *
+ * volume set command */
+ {.key = {"client-log-level"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .default_value = "INFO",
+ .description = "Changes the log-level of the clients",
+ .value = {"DEBUG", "WARNING", "ERROR", "INFO", "CRITICAL", "NONE",
+ "TRACE"}},
+ {.key = {"sys-log-level"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "CRITICAL",
+ .description = "Gluster's syslog log-level",
+ .value = {"WARNING", "ERROR", "INFO", "CRITICAL"}},
+ {.key = {"brick-log-level"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .default_value = "INFO",
+ .description = "Changes the log-level of the bricks",
+ .value = {"DEBUG", "WARNING", "ERROR", "INFO", "CRITICAL", "NONE",
+ "TRACE"}},
+ {.key = {"logger"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}},
+ {.key = {"client-logger"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {GD_OP_VERSION_3_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .default_value = GF_LOGGER_GLUSTER_LOG,
+ .description = "Changes the logging sub-system to log to, for the "
+ "clients",
+ .value = {GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}},
+ {.key = {"brick-logger"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {GD_OP_VERSION_3_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .default_value = GF_LOGGER_GLUSTER_LOG,
+ .description = "Changes the logging sub-system to log to, for the "
+ "bricks",
+ .value = {GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}},
+ {.key = {"log-format"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}},
+ {.key = {"client-log-format"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {GD_OP_VERSION_3_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .default_value = GF_LOG_FORMAT_WITH_MSG_ID,
+ .description = "Changes log format for the clients",
+ .value = {GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}},
+ {.key = {"brick-log-format"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {GD_OP_VERSION_3_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .default_value = GF_LOG_FORMAT_WITH_MSG_ID,
+ .description = "Changes the log format for the bricks",
+ .value = {GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}},
+ {
+ .key = {"log-buf-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = GF_LOG_LRU_BUFSIZE_MIN,
+ .max = GF_LOG_LRU_BUFSIZE_MAX,
+ .default_value = "5",
+ },
+ {.key = {"client-log-buf-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .op_version = {GD_OP_VERSION_3_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .min = GF_LOG_LRU_BUFSIZE_MIN,
+ .max = GF_LOG_LRU_BUFSIZE_MAX,
+ .default_value = "5",
+ .description = "This option determines the maximum number of unique "
+ "log messages that can be buffered for a time equal to"
+ " the value of the option client-log-flush-timeout."},
+ {.key = {"brick-log-buf-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .op_version = {GD_OP_VERSION_3_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .min = GF_LOG_LRU_BUFSIZE_MIN,
+ .max = GF_LOG_LRU_BUFSIZE_MAX,
+ .default_value = "5",
+ .description = "This option determines the maximum number of unique "
+ "log messages that can be buffered for a time equal to"
+ " the value of the option brick-log-flush-timeout."},
+ {
+ .key = {"log-flush-timeout"},
+ .type = GF_OPTION_TYPE_TIME,
+ .min = GF_LOG_FLUSH_TIMEOUT_MIN,
+ .max = GF_LOG_FLUSH_TIMEOUT_MAX,
+ .default_value = "120",
+ },
+ {.key = {"client-log-flush-timeout"},
+ .type = GF_OPTION_TYPE_TIME,
+ .op_version = {GD_OP_VERSION_3_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .min = GF_LOG_FLUSH_TIMEOUT_MIN,
+ .max = GF_LOG_FLUSH_TIMEOUT_MAX,
+ .default_value = "120",
+ .description = "This option determines the maximum number of unique "
+ "log messages that can be buffered for a time equal to"
+ " the value of the option client-log-flush-timeout."},
+ {.key = {"brick-log-flush-timeout"},
+ .type = GF_OPTION_TYPE_TIME,
+ .op_version = {GD_OP_VERSION_3_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats"},
+ .min = GF_LOG_FLUSH_TIMEOUT_MIN,
+ .max = GF_LOG_FLUSH_TIMEOUT_MAX,
+ .default_value = "120",
+ .description = "This option determines the maximum number of unique "
+ "log messages that can be buffered for a time equal to"
+ " the value of the option brick-log-flush-timeout."},
+ {.key = {"unique-id"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "/no/such/path",
+ .description = "Unique ID for our files."},
+ {.key = {"global-threading"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {GD_OP_VERSION_6_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {"io-stats", "threading"},
+ .description = "This option enables the global threading support for "
+ "bricks. If enabled, it's recommended to also enable "
+ "'performance.iot-pass-through'"},
+ {.key = {"threads"}, .type = GF_OPTION_TYPE_INT},
+ {.key = {"brick-threads"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "16",
+ .min = 0,
+ .max = GF_ASYNC_MAX_THREADS,
+ .op_version = {GD_OP_VERSION_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"io-stats", "threading"},
+ .description = "When global threading is used, this value determines the "
+ "maximum amount of threads that can be created on bricks"},
+ {.key = {"client-threads"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "16",
+ .min = 0,
+ .max = GF_ASYNC_MAX_THREADS,
+ .op_version = {GD_OP_VERSION_6_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+ .tags = {"io-stats", "threading"},
+ .description = "When global threading is used, this value determines the "
+ "maximum amount of threads that can be created on clients"},
+ {.key = {"volume-id"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {GD_OP_VERSION_7_1},
+ .tags = {"global", "volume-id"},
+ .description =
+ "This option points to the 'unique' UUID particular to this "
+ "volume, which would be set in 'graph->volume_id'"},
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "io-stats",
+ .category = GF_MAINTAINED,
};
diff --git a/xlators/cluster/stripe/Makefile.am b/xlators/debug/sink/Makefile.am
index d471a3f9243..f2689244371 100644
--- a/xlators/cluster/stripe/Makefile.am
+++ b/xlators/debug/sink/Makefile.am
@@ -1,3 +1,2 @@
SUBDIRS = src
-CLEANFILES =
diff --git a/xlators/debug/sink/src/Makefile.am b/xlators/debug/sink/src/Makefile.am
new file mode 100644
index 00000000000..f952c2ce6bc
--- /dev/null
+++ b/xlators/debug/sink/src/Makefile.am
@@ -0,0 +1,14 @@
+xlator_LTLIBRARIES = sink.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_builddir)/rpc/xdr/src
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+sink_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+sink_la_SOURCES = sink.c
+sink_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+CLEANFILES =
+
diff --git a/xlators/debug/sink/src/sink.c b/xlators/debug/sink/src/sink.c
new file mode 100644
index 00000000000..9822bbb732e
--- /dev/null
+++ b/xlators/debug/sink/src/sink.c
@@ -0,0 +1,94 @@
+/*
+ Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+int32_t
+init(xlator_t *this)
+{
+ return 0;
+}
+
+void
+fini(xlator_t *this)
+{
+ return;
+}
+
+/*
+ * notify - when parent sends PARENT_UP, send CHILD_UP event from here
+ */
+int32_t
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+ switch (event) {
+ case GF_EVENT_PARENT_UP:
+ /* Tell the parent that this xlator is up */
+ default_notify(this, GF_EVENT_CHILD_UP, data);
+ break;
+ case GF_EVENT_PARENT_DOWN:
+ /* Tell the parent that this xlator is down */
+ default_notify(this, GF_EVENT_CHILD_DOWN, data);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * A lookup on "/" is done while mounting or glfs_init() is performed. This
+ * needs to return a valid directory for the root of the mountpoint.
+ *
+ * In case this xlator is used for more advanced debugging, it will need to be
+ * extended to support different LOOKUPs too.
+ */
+static int32_t
+sink_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ struct iatt stbuf = {
+ 0,
+ };
+ struct iatt postparent = {
+ 0,
+ };
+
+ /* the root of the volume always need to be a directory */
+ stbuf.ia_type = IA_IFDIR;
+
+ STACK_UNWIND_STRICT(lookup, frame, 0, 0, loc ? loc->inode : NULL, &stbuf,
+ xdata, &postparent);
+
+ return 0;
+}
+
+struct xlator_fops fops = {
+ .lookup = sink_lookup,
+};
+
+struct xlator_cbks cbks = {};
+
+struct volume_options options[] = {
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .op_version = {GD_OP_VERSION_3_12_0},
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "sink",
+ .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/debug/trace/src/Makefile.am b/xlators/debug/trace/src/Makefile.am
index 0f1679a049d..a37ea63af04 100644
--- a/xlators/debug/trace/src/Makefile.am
+++ b/xlators/debug/trace/src/Makefile.am
@@ -2,13 +2,16 @@
xlator_LTLIBRARIES = trace.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
-trace_la_LDFLAGS = -module -avoidversion
+trace_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
trace_la_SOURCES = trace.c
trace_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+noinst_HEADERS = trace.h trace-mem-types.h
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
CLEANFILES =
diff --git a/xlators/debug/trace/src/trace-mem-types.h b/xlators/debug/trace/src/trace-mem-types.h
new file mode 100644
index 00000000000..18a7e0414a6
--- /dev/null
+++ b/xlators/debug/trace/src/trace-mem-types.h
@@ -0,0 +1,20 @@
+/*
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __TRACE_MEM_TYPES_H__
+#define __TRACE_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_trace_mem_types_ {
+ gf_trace_mt_trace_conf_t = gf_common_mt_end + 1,
+ gf_trace_mt_end
+};
+#endif
diff --git a/xlators/debug/trace/src/trace.c b/xlators/debug/trace/src/trace.c
index a562e813e92..6ed0ca00342 100644
--- a/xlators/debug/trace/src/trace.c
+++ b/xlators/debug/trace/src/trace.c
@@ -1,26 +1,15 @@
/*
- Copyright (c) 2006-2010 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include "trace.h"
+#include "trace-mem-types.h"
/**
* xlators/debug/trace :
@@ -28,2203 +17,3518 @@
* their _cbk functions, which later passes the call to next layer.
* Very helpful translator for debugging.
*/
+#define TRACE_STAT_TO_STR(buf, str) trace_stat_to_str(buf, str, sizeof(str))
+
+static void
+trace_stat_to_str(struct iatt *buf, char *str, size_t len)
+{
+ char atime_buf[GF_TIMESTR_SIZE] = {
+ 0,
+ };
+ char mtime_buf[GF_TIMESTR_SIZE] = {
+ 0,
+ };
+ char ctime_buf[GF_TIMESTR_SIZE] = {
+ 0,
+ };
+
+ if (!buf)
+ return;
-#include <time.h>
-#include <errno.h>
-#include "glusterfs.h"
-#include "xlator.h"
-#include "common-utils.h"
-
-#define ERR_EINVAL_NORETURN(cond) \
- do \
- { \
- if ((cond)) \
- { \
- gf_log ("ERROR", GF_LOG_ERROR, \
- "%s: %s: (%s) is true", \
- __FILE__, __FUNCTION__, #cond); \
- } \
- } while (0)
-
-
-typedef struct trace_private {
- int32_t debug_flag;
-} trace_private_t;
-
-
-struct {
- char *name;
- int enabled;
-} trace_fop_names[GF_FOP_MAXVALUE];
-
-int trace_log_level = GF_LOG_NORMAL;
-
-static char *
-trace_stat_to_str (struct iatt *stbuf)
-{
- char *statstr = NULL;
- char atime_buf[256] = {0,};
- char mtime_buf[256] = {0,};
- char ctime_buf[256] = {0,};
- int asprint_ret_value = 0;
- uint64_t ia_time = 0;
-
- ia_time = stbuf->ia_atime;
- strftime (atime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = stbuf->ia_mtime;
- strftime (mtime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = stbuf->ia_ctime;
- strftime (ctime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- asprint_ret_value = gf_asprintf (&statstr,
- "ia_ino=%"PRIu64
- ", st_mode=%o, ia_nlink=%"GF_PRI_NLINK", "
- "ia_uid=%d, ia_gid=%d, ia_size=%"PRId64", ia_blocks=%"PRId64
- ", ia_atime=%s, ia_mtime=%s, ia_ctime=%s",
- stbuf->ia_ino,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type),
- stbuf->ia_nlink, stbuf->ia_uid,
- stbuf->ia_gid, stbuf->ia_size,
- stbuf->ia_blocks, atime_buf,
- mtime_buf, ctime_buf);
-
- if (asprint_ret_value < 0)
- statstr = NULL;
-
- return statstr;
-}
-
-
-int
-trace_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
-{
- char *statstr = NULL;
- char *preparentstr = NULL;
- char *postparentstr = NULL;
-
- if (trace_fop_names[GF_FOP_CREATE].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
-
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, fd=%p, ino=%"PRIu64" "
- "*stbuf {%s}, *preparent {%s}, *postparent = "
- "{%s})",
- frame->root->unique, op_ret, fd, inode->ino,
- statstr, preparentstr, postparentstr);
-
- if (statstr)
- GF_FREE (statstr);
- if (preparentstr)
- GF_FREE (preparentstr);
- if (postparentstr)
- GF_FREE (postparentstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ gf_time_fmt(atime_buf, sizeof atime_buf, buf->ia_atime, gf_timefmt_dirent);
- STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,
- preparent, postparent);
- return 0;
-}
+ gf_time_fmt(mtime_buf, sizeof mtime_buf, buf->ia_mtime, gf_timefmt_dirent);
+ gf_time_fmt(ctime_buf, sizeof ctime_buf, buf->ia_ctime, gf_timefmt_dirent);
-int
-trace_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
+ snprintf(str, len,
+ "gfid=%s ino=%" PRIu64
+ ", mode=%o, "
+ "nlink=%" GF_PRI_NLINK ", uid=%u, gid=%u, size=%" PRIu64
+ ", "
+ "blocks=%" PRIu64
+ ", atime=%s mtime=%s ctime=%s "
+ "atime_sec=%" PRId64 ", atime_nsec=%" PRIu32
+ ","
+ " mtime_sec=%" PRId64 ", mtime_nsec=%" PRIu32
+ ", "
+ "ctime_sec=%" PRId64 ", ctime_nsec=%" PRIu32 "",
+ uuid_utoa(buf->ia_gfid), buf->ia_ino,
+ st_mode_from_ia(buf->ia_prot, buf->ia_type), buf->ia_nlink,
+ buf->ia_uid, buf->ia_gid, buf->ia_size, buf->ia_blocks, atime_buf,
+ mtime_buf, ctime_buf, buf->ia_atime, buf->ia_atime_nsec,
+ buf->ia_mtime, buf->ia_mtime_nsec, buf->ia_ctime,
+ buf->ia_ctime_nsec);
+}
+
+int
+dump_history_trace(circular_buffer_t *cb, void *data)
+{
+ char timestr[GF_TIMESTR_SIZE] = {
+ 0,
+ };
- if (trace_fop_names[GF_FOP_OPEN].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d, *fd=%p)",
- frame->root->unique, op_ret, op_errno, fd);
- }
-
- STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd);
- return 0;
-}
-
-
-int
-trace_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- uint64_t ia_time = 0;
- char atime_buf[256];
- char mtime_buf[256];
- char ctime_buf[256];
-
- if (trace_fop_names[GF_FOP_STAT].enabled) {
- if (op_ret >= 0) {
- ia_time = buf->ia_atime;
- strftime (atime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = buf->ia_mtime;
- strftime (mtime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = buf->ia_ctime;
- strftime (ctime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, buf {"
- "ia_ino=%"PRIu64", st_mode=%o, ia_nlink=%"GF_PRI_NLINK", "
- "ia_uid=%d, ia_gid=%d, ia_rdev=%"PRIu64", ia_size=%"PRId64
- ", ia_blksize=%"GF_PRI_BLKSIZE", ia_blocks=%"PRId64", "
- "ia_atime=%s, ia_mtime=%s, ia_ctime=%s})",
- frame->root->unique, op_ret, buf->ia_ino,
- st_mode_from_ia (buf->ia_prot, buf->ia_type),
- buf->ia_nlink, buf->ia_uid, buf->ia_gid,
- buf->ia_rdev, buf->ia_size, buf->ia_blksize,
- buf->ia_blocks, atime_buf, mtime_buf, ctime_buf);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ /* Since we are continuing with adding entries to the buffer even when
+ gettimeofday () fails, it's safe to check tm and then dump the time
+ at which the entry was added to the buffer */
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf);
- return 0;
-}
-
-
-int
-trace_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iovec *vector,
- int32_t count, struct iatt *buf, struct iobref *iobref)
-{
- uint64_t ia_time = 0;
- char atime_buf[256];
- char mtime_buf[256];
- char ctime_buf[256];
-
- if (trace_fop_names[GF_FOP_READ].enabled) {
- if (op_ret >= 0) {
- ia_time = buf->ia_atime;
- strftime (atime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = buf->ia_mtime;
- strftime (mtime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = buf->ia_ctime;
- strftime (ctime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d, *buf {"
- "ia_ino=%"PRIu64", st_mode=%o, ia_nlink=%"GF_PRI_NLINK", "
- "ia_uid=%d, ia_gid=%d, ia_rdev=%"PRIu64", "
- "ia_size=%"PRId64", ia_blksize=%"GF_PRI_BLKSIZE", "
- "ia_blocks=%"PRId64", ia_atime=%s, ia_mtime=%s, ia_ctime=%s})",
- frame->root->unique, op_ret, op_errno, buf->ia_ino,
- st_mode_from_ia (buf->ia_prot, buf->ia_type),
- buf->ia_nlink, buf->ia_uid, buf->ia_gid,
- buf->ia_rdev, buf->ia_size, buf->ia_blksize, buf->ia_blocks,
- atime_buf, mtime_buf, ctime_buf);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ gf_time_fmt_tv(timestr, sizeof timestr, &cb->tv, gf_timefmt_Ymd_T);
+ gf_proc_dump_write("TIME", "%s", timestr);
+
+ gf_proc_dump_write("FOP", "%s\n", (char *)cb->data);
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count,
- buf, iobref);
- return 0;
+ return 0;
}
-
int
-trace_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *prebuf, struct iatt *postbuf)
-{
- char *preopstr = NULL;
- char *postopstr = NULL;
+trace_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ char preparentstr[1024] = {
+ 0,
+ };
+ char postparentstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_CREATE].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret >= 0) {
+ TRACE_STAT_TO_STR(buf, statstr);
+ TRACE_STAT_TO_STR(preparent, preparentstr);
+ TRACE_STAT_TO_STR(postparent, postparentstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s (op_ret=%d, fd=%p"
+ "*stbuf {%s}, *preparent {%s}, "
+ "*postparent = {%s})",
+ frame->root->unique, uuid_utoa(inode->gfid), op_ret, fd,
+ statstr, preparentstr, postparentstr);
+
+ /* for 'release' log */
+ fd_ctx_set(fd, this, 0);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": (op_ret=%d, op_errno=%d)",
+ frame->root->unique, op_ret, op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int
+trace_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_OPEN].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, op_errno=%d, "
+ "*fd=%p",
+ frame->root->unique, uuid_utoa(frame->local), op_ret, op_errno,
+ fd);
+
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ /* for 'release' log */
+ if (op_ret >= 0)
+ fd_ctx_set(fd, this, 0);
+
+ TRACE_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+int
+trace_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_STAT].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(buf, statstr);
+ (void)snprintf(
+ string, sizeof(string), "%" PRId64 ": gfid=%s op_ret=%d buf=%s",
+ frame->root->unique, uuid_utoa(frame->local), op_ret, statstr);
+ } else {
+ (void)snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int
+trace_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iovec *vector,
+ int32_t count, struct iatt *buf, struct iobref *iobref,
+ dict_t *xdata)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_READ].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret >= 0) {
+ TRACE_STAT_TO_STR(buf, statstr);
+ snprintf(
+ string, sizeof(string), "%" PRId64 ": gfid=%s op_ret=%d buf=%s",
+ frame->root->unique, uuid_utoa(frame->local), op_ret, statstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf,
+ iobref, xdata);
+ return 0;
+}
+
+int
+trace_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ char preopstr[1024] = {
+ 0,
+ };
+ char postopstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_WRITE].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret >= 0) {
+ TRACE_STAT_TO_STR(prebuf, preopstr);
+ TRACE_STAT_TO_STR(postbuf, postopstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": (op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s})",
+ frame->root->unique, op_ret, preopstr, postopstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+ return 0;
+}
+
+int
+trace_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
+ dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_READDIR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 " : gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(readdir, frame, op_ret, op_errno, buf, xdata);
+
+ return 0;
+}
+
+int
+trace_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
+ dict_t *xdata)
+{
+ int count = 0;
+ char statstr[1024] = {
+ 0,
+ };
+ char string[4096] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+ gf_dirent_t *entry = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_READDIRP].enabled) {
+ snprintf(string, sizeof(string),
+ "%" PRId64 " : gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+
+ LOG_ELEMENT(conf, string);
+ }
+ if (op_ret < 0)
+ goto out;
+
+ list_for_each_entry(entry, &buf->list, list)
+ {
+ count++;
+ TRACE_STAT_TO_STR(&entry->d_stat, statstr);
+ snprintf(string, sizeof(string),
+ "entry no. %d, pargfid=%s, "
+ "bname=%s *buf {%s}",
+ count, uuid_utoa(frame->local), entry->d_name, statstr);
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ TRACE_STACK_UNWIND(readdirp, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int
+trace_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ char preopstr[1024] = {
+ 0,
+ };
+ char postopstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSYNC].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(prebuf, preopstr);
+ TRACE_STAT_TO_STR(postbuf, postopstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": (op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s}",
+ frame->root->unique, op_ret, preopstr, postopstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+ return 0;
+}
+
+int
+trace_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
+{
+ char preopstr[1024] = {
+ 0,
+ };
+ char postopstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_SETATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(statpre, preopstr);
+ TRACE_STAT_TO_STR(statpost, postopstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": (op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s})",
+ frame->root->unique, op_ret, preopstr, postopstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(setattr, frame, op_ret, op_errno, statpre, statpost,
+ xdata);
+ return 0;
+}
+
+int
+trace_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
+{
+ char preopstr[1024] = {
+ 0,
+ };
+ char postopstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSETATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(statpre, preopstr);
+ TRACE_STAT_TO_STR(statpost, postopstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": (op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s})",
+ frame->root->unique, op_ret, preopstr, postopstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d)",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, statpre, statpost,
+ xdata);
+ return 0;
+}
+
+int
+trace_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ char preparentstr[1024] = {
+ 0,
+ };
+ char postparentstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_UNLINK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(preparent, preparentstr);
+ TRACE_STAT_TO_STR(postparent, postparentstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ " *preparent = {%s}, "
+ "*postparent = {%s})",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ preparentstr, postparentstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent,
+ xdata);
+ return 0;
+}
+
+int
+trace_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ char preoldparentstr[1024] = {
+ 0,
+ };
+ char postoldparentstr[1024] = {
+ 0,
+ };
+ char prenewparentstr[1024] = {
+ 0,
+ };
+ char postnewparentstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_RENAME].enabled) {
+ char string[6044] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(buf, statstr);
+ TRACE_STAT_TO_STR(preoldparent, preoldparentstr);
+ TRACE_STAT_TO_STR(postoldparent, postoldparentstr);
+ TRACE_STAT_TO_STR(prenewparent, prenewparentstr);
+ TRACE_STAT_TO_STR(postnewparent, postnewparentstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": (op_ret=%d, "
+ "*stbuf = {%s}, *preoldparent = {%s},"
+ " *postoldparent = {%s}"
+ " *prenewparent = {%s}, "
+ "*postnewparent = {%s})",
+ frame->root->unique, op_ret, statstr, preoldparentstr,
+ postoldparentstr, prenewparentstr, postnewparentstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(rename, frame, op_ret, op_errno, buf, preoldparent,
+ postoldparent, prenewparent, postnewparent, xdata);
+ return 0;
+}
+
+int
+trace_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, const char *buf,
+ struct iatt *stbuf, dict_t *xdata)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_READLINK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(stbuf, statstr);
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": (op_ret=%d, op_errno=%d,"
+ "buf=%s, stbuf = { %s })",
+ frame->root->unique, op_ret, op_errno, buf, statstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(readlink, frame, op_ret, op_errno, buf, stbuf, xdata);
+ return 0;
+}
+
+int
+trace_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ char postparentstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_LOOKUP].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(buf, statstr);
+ TRACE_STAT_TO_STR(postparent, postparentstr);
+ /* print buf->ia_gfid instead of inode->gfid,
+ * since if the inode is not yet linked to the
+ * inode table (fresh lookup) then null gfid
+ * will be printed.
+ */
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s (op_ret=%d "
+ "*buf {%s}, *postparent {%s}",
+ frame->root->unique, uuid_utoa(buf->ia_gfid), op_ret,
+ statstr, postparentstr);
+
+ /* For 'forget' */
+ inode_ctx_put(inode, this, 0);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+ postparent);
+ return 0;
+}
+
+int
+trace_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ char preparentstr[1024] = {
+ 0,
+ };
+ char postparentstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_SYMLINK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(buf, statstr);
+ TRACE_STAT_TO_STR(preparent, preparentstr);
+ TRACE_STAT_TO_STR(postparent, postparentstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s (op_ret=%d "
+ "*stbuf = {%s}, *preparent = {%s}, "
+ "*postparent = {%s})",
+ frame->root->unique, uuid_utoa(inode->gfid), op_ret,
+ statstr, preparentstr, postparentstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": op_ret=%d, op_errno=%d", frame->root->unique,
+ op_ret, op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent,
+ postparent, xdata);
+ return 0;
+}
+
+int
+trace_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ char preparentstr[1024] = {
+ 0,
+ };
+ char postparentstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ char string[4096] = {
+ 0,
+ };
+ if (trace_fop_names[GF_FOP_MKNOD].enabled) {
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(buf, statstr);
+ TRACE_STAT_TO_STR(preparent, preparentstr);
+ TRACE_STAT_TO_STR(postparent, postparentstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s (op_ret=%d "
+ "*stbuf = {%s}, *preparent = {%s}, "
+ "*postparent = {%s})",
+ frame->root->unique, uuid_utoa(inode->gfid), op_ret,
+ statstr, preparentstr, postparentstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": (op_ret=%d, op_errno=%d)",
+ frame->root->unique, op_ret, op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+ postparent, xdata);
+ return 0;
+}
+
+int
+trace_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ char preparentstr[1024] = {
+ 0,
+ };
+ char postparentstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_MKDIR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(buf, statstr);
+ TRACE_STAT_TO_STR(preparent, preparentstr);
+ TRACE_STAT_TO_STR(postparent, postparentstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s (op_ret=%d "
+ ", *stbuf = {%s}, *prebuf = {%s}, "
+ "*postbuf = {%s} )",
+ frame->root->unique, uuid_utoa(inode->gfid), op_ret,
+ statstr, preparentstr, postparentstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": (op_ret=%d, op_errno=%d)",
+ frame->root->unique, op_ret, op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf, preparent,
+ postparent, xdata);
+ return 0;
+}
+
+int
+trace_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ char preparentstr[1024] = {
+ 0,
+ };
+ char postparentstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ char string[4096] = {
+ 0,
+ };
+ if (trace_fop_names[GF_FOP_LINK].enabled) {
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(buf, statstr);
+ TRACE_STAT_TO_STR(preparent, preparentstr);
+ TRACE_STAT_TO_STR(postparent, postparentstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": (op_ret=%d, "
+ "*stbuf = {%s}, *prebuf = {%s},"
+ " *postbuf = {%s})",
+ frame->root->unique, op_ret, statstr, preparentstr,
+ postparentstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent,
+ postparent, xdata);
+ return 0;
+}
+
+int
+trace_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ char string[4096] = {
+ 0,
+ };
+ if (trace_fop_names[GF_FOP_FLUSH].enabled) {
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int
+trace_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ char string[4096] = {
+ 0,
+ };
+ if (trace_fop_names[GF_FOP_OPENDIR].enabled) {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, op_errno=%d,"
+ " fd=%p",
+ frame->root->unique, uuid_utoa(frame->local), op_ret, op_errno,
+ fd);
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ /* for 'releasedir' log */
+ if (op_ret >= 0)
+ fd_ctx_set(fd, this, 0);
+
+ TRACE_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+int
+trace_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ char preparentstr[1024] = {
+ 0,
+ };
+ char postparentstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_RMDIR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(preparent, preparentstr);
+ TRACE_STAT_TO_STR(postparent, postparentstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "*prebuf={%s}, *postbuf={%s}",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ preparentstr, postparentstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(rmdir, frame, op_ret, op_errno, preparent, postparent,
+ xdata);
+ return 0;
+}
+
+int
+trace_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ char preopstr[1024] = {
+ 0,
+ };
+ char postopstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_TRUNCATE].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(prebuf, preopstr);
+ TRACE_STAT_TO_STR(postbuf, postopstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": (op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s} )",
+ frame->root->unique, op_ret, preopstr, postopstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
+trace_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+ dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_STATFS].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": ({f_bsize=%lu, "
+ "f_frsize=%lu, "
+ "f_blocks=%" GF_PRI_FSBLK ", f_bfree=%" GF_PRI_FSBLK
+ ", "
+ "f_bavail=%" GF_PRI_FSBLK
+ ", "
+ "f_files=%" GF_PRI_FSBLK
+ ", "
+ "f_ffree=%" GF_PRI_FSBLK
+ ", "
+ "f_favail=%" GF_PRI_FSBLK
+ ", "
+ "f_fsid=%lu, f_flag=%lu, "
+ "f_namemax=%lu}) => ret=%d",
+ frame->root->unique, buf->f_bsize, buf->f_frsize,
+ buf->f_blocks, buf->f_bfree, buf->f_bavail, buf->f_files,
+ buf->f_ffree, buf->f_favail, buf->f_fsid, buf->f_flag,
+ buf->f_namemax, op_ret);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": (op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique, op_ret, op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int
+trace_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_SETXATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int
+trace_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_GETXATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, op_errno=%d,"
+ " dict=%p",
+ frame->root->unique, uuid_utoa(frame->local), op_ret, op_errno,
+ dict);
- if (trace_fop_names[GF_FOP_WRITE].enabled) {
- if (op_ret >= 0) {
- preopstr = trace_stat_to_str (prebuf);
- postopstr = trace_stat_to_str (postbuf);
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+
+ return 0;
+}
+
+int
+trace_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSETXATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, ino = %"PRIu64
- ", *prebuf = {%s}, *postbuf = {%s})",
- frame->root->unique, op_ret, postbuf->ia_ino,
- preopstr, postopstr);
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
- if (preopstr)
- GF_FREE (preopstr);
+int
+trace_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
- if (postopstr)
- GF_FREE (postopstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ conf = this->private;
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FGETXATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, op_errno=%d,"
+ " dict=%p",
+ frame->root->unique, uuid_utoa(frame->local), op_ret, op_errno,
+ dict);
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
int
-trace_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *buf)
+trace_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_READDIR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64" :(op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, buf);
-
- return 0;
-}
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
int
-trace_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *buf)
+trace_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_READDIRP].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64" :(op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, buf);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(fsyncdir, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int
+trace_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_ACCESS].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int
+trace_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ char prebufstr[1024] = {
+ 0,
+ };
+ char postbufstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(prebuf, prebufstr);
+ TRACE_STAT_TO_STR(postbuf, postbufstr);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s} )",
+ frame->root->unique, op_ret, prebufstr, postbufstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
+trace_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ char statstr[1024] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSTAT].enabled) {
+ char string[4096] = {0.};
+ if (op_ret == 0) {
+ TRACE_STAT_TO_STR(buf, statstr);
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d "
+ "buf=%s",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ statstr);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int
+trace_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_LK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (op_ret == 0) {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "{l_type=%d, l_whence=%d, "
+ "l_start=%" PRId64
+ ", "
+ "l_len=%" PRId64 ", l_pid=%u})",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ lock->l_type, lock->l_whence, lock->l_start, lock->l_len,
+ lock->l_pid);
+ } else {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ }
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(lk, frame, op_ret, op_errno, lock, xdata);
+ return 0;
+}
+
+int
+trace_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_ENTRYLK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int
+trace_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FENTRYLK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
int
-trace_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *prebuf, struct iatt *postbuf)
+trace_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- char *preopstr = NULL;
- char *postopstr = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_FSYNC].enabled) {
- if (op_ret >= 0) {
- preopstr = trace_stat_to_str (prebuf);
- postopstr = trace_stat_to_str (postbuf);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, ino = %"PRIu64
- ", *prebuf = {%s}, *postbuf = {%s}",
- frame->root->unique, op_ret, postbuf->ia_ino,
- preopstr, postopstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_XATTROP].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
- if (preopstr)
- GF_FREE (preopstr);
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
- if (postopstr)
- GF_FREE (postopstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+int
+trace_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf);
-
- return 0;
-}
-
-
-int
-trace_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *statpre, struct iatt *statpost)
-{
- uint64_t ia_time = 0;
- char atime_pre[256] = {0,};
- char mtime_pre[256] = {0,};
- char ctime_pre[256] = {0,};
- char atime_post[256] = {0,};
- char mtime_post[256] = {0,};
- char ctime_post[256] = {0,};
-
- if (trace_fop_names[GF_FOP_SETATTR].enabled) {
- if (op_ret >= 0) {
- ia_time = statpre->ia_atime;
- strftime (atime_pre, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = statpre->ia_mtime;
- strftime (mtime_pre, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = statpre->ia_ctime;
- strftime (ctime_pre, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = statpost->ia_atime;
- strftime (atime_post, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = statpost->ia_mtime;
- strftime (mtime_post, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = statpost->ia_ctime;
- strftime (ctime_post, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, *statpre "
- "{ia_ino=%"PRIu64", st_mode=%o, ia_uid=%d, "
- "ia_gid=%d, ia_atime=%s, ia_mtime=%s, "
- "ia_ctime=%s}, *statpost {ia_ino=%"PRIu64", "
- "st_mode=%o, ia_uid=%d, ia_gid=%d, ia_atime=%s,"
- " ia_mtime=%s, ia_ctime=%s})",
- frame->root->unique, op_ret, statpre->ia_ino,
- st_mode_from_ia (statpre->ia_prot, statpre->ia_type),
- statpre->ia_uid,
- statpre->ia_gid, atime_pre, mtime_pre,
- ctime_pre, statpost->ia_ino,
- st_mode_from_ia (statpost->ia_prot, statpost->ia_type),
- statpost->ia_uid, statpost->ia_gid, atime_post,
- mtime_post, ctime_post);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ conf = this->private;
- STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, statpre, statpost);
- return 0;
-}
-
-
-int
-trace_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *statpre, struct iatt *statpost)
-{
- uint64_t ia_time = 0;
- char atime_pre[256] = {0,};
- char mtime_pre[256] = {0,};
- char ctime_pre[256] = {0,};
- char atime_post[256] = {0,};
- char mtime_post[256] = {0,};
- char ctime_post[256] = {0,};
-
- if (trace_fop_names[GF_FOP_FSETATTR].enabled) {
- if (op_ret >= 0) {
- ia_time = statpre->ia_atime;
- strftime (atime_pre, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = statpre->ia_mtime;
- strftime (mtime_pre, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = statpre->ia_ctime;
- strftime (ctime_pre, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = statpost->ia_atime;
- strftime (atime_post, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = statpost->ia_mtime;
- strftime (mtime_post, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = statpost->ia_ctime;
- strftime (ctime_post, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, *statpre "
- "{ia_ino=%"PRIu64", st_mode=%o, ia_uid=%d, "
- "ia_gid=%d, ia_atime=%s, ia_mtime=%s, "
- "ia_ctime=%s}, *statpost {ia_ino=%"PRIu64", "
- "st_mode=%o, ia_uid=%d, ia_gid=%d, ia_atime=%s,"
- " ia_mtime=%s, ia_ctime=%s})",
- frame->root->unique, op_ret, statpre->ia_ino,
- st_mode_from_ia (statpre->ia_prot, statpre->ia_type),
- statpre->ia_uid,
- statpre->ia_gid, atime_pre, mtime_pre,
- ctime_pre, statpost->ia_ino,
- st_mode_from_ia (statpost->ia_prot, statpost->ia_type),
- statpost->ia_uid, statpost->ia_gid, atime_post,
- mtime_post, ctime_post);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FXATTROP].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
- STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno,
- statpre, statpost);
- return 0;
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
}
-
int
-trace_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent)
+trace_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- char *preparentstr = NULL;
- char *postparentstr = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_UNLINK].enabled) {
- if (op_ret >= 0) {
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, *preparent = {%s}, "
- "*postparent = {%s})",
- frame->root->unique, op_ret, preparentstr,
- postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_INODELK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
- if (preparentstr)
- GF_FREE (preparentstr);
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
- if (postparentstr)
- GF_FREE (postparentstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+int
+trace_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
- preparent, postparent);
- return 0;
-}
-
-
-int
-trace_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent)
-{
- char *statstr = NULL;
- char *preoldparentstr = NULL;
- char *postoldparentstr = NULL;
- char *prenewparentstr = NULL;
- char *postnewparentstr = NULL;
-
- if (trace_fop_names[GF_FOP_RENAME].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preoldparentstr = trace_stat_to_str (preoldparent);
- postoldparentstr = trace_stat_to_str (postoldparent);
-
- prenewparentstr = trace_stat_to_str (prenewparent);
- postnewparentstr = trace_stat_to_str (postnewparent);
-
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, *stbuf = {%s}, "
- "*preoldparent = {%s}, *postoldparent = {%s}"
- " *prenewparent = {%s}, *postnewparent = {%s})",
- frame->root->unique, op_ret, statstr,
- preoldparentstr, postoldparentstr,
- prenewparentstr, postnewparentstr);
-
- if (preoldparentstr)
- GF_FREE (preoldparentstr);
-
- if (postoldparentstr)
- GF_FREE (postoldparentstr);
-
- if (prenewparentstr)
- GF_FREE (prenewparentstr);
-
- if (postnewparentstr)
- GF_FREE (postnewparentstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d, buf {ia_ino=%"PRIu64"})",
- frame->root->unique, op_ret, op_errno,
- (buf? buf->ia_ino : 0));
- }
+ conf = this->private;
- STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf,
- preoldparent, postoldparent,
- prenewparent, postnewparent);
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FINODELK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
int
-trace_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *buf, struct iatt *stbuf)
+trace_rchecksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, uint32_t weak_checksum,
+ uint8_t *strong_checksum, dict_t *xdata)
{
- char *statstr = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_READLINK].enabled) {
+ conf = this->private;
- if (op_ret == 0) {
- statstr = trace_stat_to_str (stbuf);
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d, buf=%s, "
- "stbuf = { %s })",
- frame->root->unique, op_ret, op_errno, buf,
- statstr);
- } else
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d",
- frame->root->unique, op_ret, op_errno);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_RCHECKSUM].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s op_ret=%d op_errno=%d",
+ frame->root->unique, uuid_utoa(frame->local), op_ret,
+ op_errno);
- if (statstr)
- GF_FREE (statstr);
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, buf, stbuf);
- return 0;
+out:
+ TRACE_STACK_UNWIND(rchecksum, frame, op_ret, op_errno, weak_checksum,
+ strong_checksum, xdata);
+
+ return 0;
}
+/* *_cbk section over <----------> fop section start */
int
-trace_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- dict_t *xattr, struct iatt *postparent)
+trace_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
{
- char *statstr = NULL;
- char *postparentstr = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_LOOKUP].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- postparentstr = trace_stat_to_str (postparent);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, ino=%"PRIu64", "
- "*buf {%s}, *postparent {%s}",
- frame->root->unique, op_ret, inode->ino,
- statstr, postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_ENTRYLK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s volume=%s, (path=%s "
+ "basename=%s, cmd=%s, type=%s)",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), volume,
+ loc->path, basename,
+ ((cmd == ENTRYLK_LOCK) ? "ENTRYLK_LOCK" : "ENTRYLK_UNLOCK"),
+ ((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK"));
- if (statstr)
- GF_FREE (statstr);
- if (postparentstr)
- GF_FREE (postparentstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ frame->local = loc->inode->gfid;
- STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf,
- xattr, postparent);
- return 0;
-}
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_entrylk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->entrylk, volume, loc, basename, cmd,
+ type, xdata);
+ return 0;
+}
int
-trace_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+trace_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- char *statstr = NULL;
- char *preparentstr = NULL;
- char *postparentstr = NULL;
+ char *cmd_str = NULL;
+ char *type_str = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_SYMLINK].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, ino=%"PRIu64", "
- "*stbuf = {%s}, *preparent = {%s}, "
- "*postparent = {%s})",
- frame->root->unique, op_ret, inode->ino,
- statstr, preparentstr, postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_INODELK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ switch (cmd) {
+#if F_GETLK != F_GETLK64
+ case F_GETLK64:
+#endif
+ case F_GETLK:
+ cmd_str = "GETLK";
+ break;
- if (statstr)
- GF_FREE (statstr);
+#if F_SETLK != F_SETLK64
+ case F_SETLK64:
+#endif
+ case F_SETLK:
+ cmd_str = "SETLK";
+ break;
- if (preparentstr)
- GF_FREE (preparentstr);
+#if F_SETLKW != F_SETLKW64
+ case F_SETLKW64:
+#endif
+ case F_SETLKW:
+ cmd_str = "SETLKW";
+ break;
+
+ default:
+ cmd_str = "UNKNOWN";
+ break;
+ }
+
+ switch (flock->l_type) {
+ case F_RDLCK:
+ type_str = "READ";
+ break;
+ case F_WRLCK:
+ type_str = "WRITE";
+ break;
+ case F_UNLCK:
+ type_str = "UNLOCK";
+ break;
+ default:
+ type_str = "UNKNOWN";
+ break;
+ }
+
+ snprintf(
+ string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s volume=%s, (path=%s "
+ "cmd=%s, type=%s, start=%llu, len=%llu, "
+ "pid=%llu)",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), volume, loc->path,
+ cmd_str, type_str, (unsigned long long)flock->l_start,
+ (unsigned long long)flock->l_len, (unsigned long long)flock->l_pid);
+
+ frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ STACK_WIND(frame, trace_inodelk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->inodelk, volume, loc, cmd, flock,
+ xdata);
+ return 0;
+}
+
+int
+trace_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+ fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ char *cmd_str = NULL;
+ char *type_str = NULL;
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FINODELK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ switch (cmd) {
+#if F_GETLK != F_GETLK64
+ case F_GETLK64:
+#endif
+ case F_GETLK:
+ cmd_str = "GETLK";
+ break;
- if (postparentstr)
- GF_FREE (postparentstr);
+#if F_SETLK != F_SETLK64
+ case F_SETLK64:
+#endif
+ case F_SETLK:
+ cmd_str = "SETLK";
+ break;
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
+#if F_SETLKW != F_SETLKW64
+ case F_SETLKW64:
+#endif
+ case F_SETLKW:
+ cmd_str = "SETLKW";
+ break;
+
+ default:
+ cmd_str = "UNKNOWN";
+ break;
}
- STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
-}
+ switch (flock->l_type) {
+ case F_RDLCK:
+ type_str = "READ";
+ break;
+ case F_WRLCK:
+ type_str = "WRITE";
+ break;
+ case F_UNLCK:
+ type_str = "UNLOCK";
+ break;
+ default:
+ type_str = "UNKNOWN";
+ break;
+ }
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s volume=%s, (fd =%p "
+ "cmd=%s, type=%s, start=%llu, len=%llu, "
+ "pid=%llu)",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), volume, fd,
+ cmd_str, type_str, (unsigned long long)flock->l_start,
+ (unsigned long long)flock->l_len,
+ (unsigned long long)flock->l_pid);
+ frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_finodelk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk, volume, fd, cmd, flock,
+ xdata);
+ return 0;
+}
int
-trace_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+trace_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
- char *statstr = NULL;
- char *preparentstr = NULL;
- char *postparentstr = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_MKNOD].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, ino=%"PRIu64", "
- "*stbuf = {%s}, *preparent = {%s}, "
- "*postparent = {%s})",
- frame->root->unique, op_ret, inode->ino,
- statstr, preparentstr, postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_XATTROP].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s (path=%s flags=%d)", frame->root->unique,
+ uuid_utoa(loc->inode->gfid), loc->path, flags);
- if (statstr)
- GF_FREE (statstr);
+ frame->local = loc->inode->gfid;
- if (preparentstr)
- GF_FREE (preparentstr);
+ LOG_ELEMENT(conf, string);
+ }
- if (postparentstr)
- GF_FREE (postparentstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+out:
+ STACK_WIND(frame, trace_xattrop_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata);
- STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
+ return 0;
}
-
int
-trace_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+trace_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
- char *statstr = NULL;
- char *preparentstr = NULL;
- char *postparentstr = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_MKDIR].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, ino = %"PRIu64
- ", *stbuf = {%s}, *prebuf = {%s}, "
- "*postbuf = {%s} )",
- frame->root->unique, op_ret, buf->ia_ino,
- statstr, preparentstr, postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FXATTROP].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s fd=%p, flags=%d",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd, flags);
- if (statstr)
- GF_FREE (statstr);
+ frame->local = fd->inode->gfid;
- if (preparentstr)
- GF_FREE (preparentstr);
+ LOG_ELEMENT(conf, string);
+ }
- if (postparentstr)
- GF_FREE (postparentstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+out:
+ STACK_WIND(frame, trace_fxattrop_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata);
- STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
+ return 0;
}
-
int
-trace_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent)
+trace_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- char *statstr = NULL;
- char *preparentstr = NULL;
- char *postparentstr = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_LINK].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, ino = %"PRIu64
- ", *stbuf = {%s}, *prebuf = {%s}, "
- "*postbuf = {%s})",
- frame->root->unique, op_ret, buf->ia_ino,
- statstr, preparentstr, postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_LOOKUP].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ /* TODO: print all the keys mentioned in xattr_req */
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path);
- if (statstr)
- GF_FREE (statstr);
+ frame->local = loc->inode->gfid;
- if (preparentstr)
- GF_FREE (preparentstr);
+ LOG_ELEMENT(conf, string);
+ }
- if (postparentstr)
- GF_FREE (postparentstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+out:
+ STACK_WIND(frame, trace_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
- STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,
- preparent, postparent);
- return 0;
+ return 0;
}
-
int
-trace_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+trace_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_FLUSH].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno);
- return 0;
-}
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_STAT].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path);
-int
-trace_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- if (trace_fop_names[GF_FOP_OPENDIR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d, fd=%p)",
- frame->root->unique, op_ret, op_errno, fd);
- }
+ frame->local = loc->inode->gfid;
- STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd);
- return 0;
-}
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc, xdata);
+
+ return 0;
+}
int
-trace_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent)
+trace_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
{
- char *preparentstr = NULL;
- char *postparentstr = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_RMDIR].enabled) {
- if (op_ret >= 0) {
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, *prebuf = {%s}, "
- "*postbuf = {%s}",
- frame->root->unique, op_ret, preparentstr,
- postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_READLINK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s path=%s, "
+ "size=%" GF_PRI_SIZET ")",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ size);
- if (preparentstr)
- GF_FREE (preparentstr);
+ frame->local = loc->inode->gfid;
- if (postparentstr)
- GF_FREE (postparentstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,
- preparent, postparent);
- return 0;
-}
+out:
+ STACK_WIND(frame, trace_readlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+ return 0;
+}
int
-trace_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *prebuf, struct iatt *postbuf)
+trace_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t dev, mode_t umask, dict_t *xdata)
{
- char *preopstr = NULL;
- char *postopstr = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_TRUNCATE].enabled) {
- if (op_ret >= 0) {
- preopstr = trace_stat_to_str (prebuf);
- postopstr = trace_stat_to_str (postbuf);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, *prebuf = {%s}, "
- "*postbuf = {%s} )",
- frame->root->unique, op_ret, preopstr,
- postopstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_MKNOD].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s path=%s mode=%d "
+ "umask=0%o, dev=%" GF_PRI_DEV ")",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ mode, umask, dev);
- if (preopstr)
- GF_FREE (preopstr);
+ LOG_ELEMENT(conf, string);
+ }
- if (postopstr)
- GF_FREE (postopstr);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+out:
+ STACK_WIND(frame, trace_mknod_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata);
- STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
+ return 0;
}
-
int
-trace_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct statvfs *buf)
+trace_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_STATFS].enabled) {
- if (op_ret >= 0) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": ({f_bsize=%lu, f_frsize=%lu, f_blocks=%"GF_PRI_FSBLK
- ", f_bfree=%"GF_PRI_FSBLK", f_bavail=%"GF_PRI_FSBLK", "
- "f_files=%"GF_PRI_FSBLK", f_ffree=%"GF_PRI_FSBLK", f_favail=%"
- GF_PRI_FSBLK", f_fsid=%lu, f_flag=%lu, f_namemax=%lu}) => ret=%d",
- frame->root->unique, buf->f_bsize, buf->f_frsize, buf->f_blocks,
- buf->f_bfree, buf->f_bavail, buf->f_files, buf->f_ffree,
- buf->f_favail, buf->f_fsid, buf->f_flag, buf->f_namemax, op_ret);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf);
- return 0;
-}
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_MKDIR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s path=%s mode=%d"
+ " umask=0%o",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ mode, umask);
-int
-trace_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- if (trace_fop_names[GF_FOP_SETXATTR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno);
- return 0;
+out:
+ STACK_WIND(frame, trace_mkdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+ return 0;
}
-
int
-trace_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
+trace_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_GETXATTR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d, dict=%p)",
- frame->root->unique, op_ret, op_errno, dict);
- }
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_UNLINK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s flag=%d",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ xflag);
+ frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_unlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+ return 0;
+}
int
-trace_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+trace_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_RMDIR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s path=%s flags=%d", frame->root->unique,
+ uuid_utoa(loc->inode->gfid), loc->path, flags);
+ frame->local = loc->inode->gfid;
-int
-trace_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno);
- return 0;
-}
+out:
+ STACK_WIND(frame, trace_rmdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+ return 0;
+}
int
-trace_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+trace_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_ACCESS].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (access, frame, op_ret, op_errno);
- return 0;
-}
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_SYMLINK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s linkpath=%s, path=%s"
+ " umask=0%o",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), linkpath,
+ loc->path, umask);
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ STACK_WIND(frame, trace_symlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata);
+
+ return 0;
+}
int
-trace_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *prebuf, struct iatt *postbuf)
+trace_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- char *prebufstr = NULL;
- char *postbufstr = NULL;
+ char oldgfid[50] = {
+ 0,
+ };
+ char newgfid[50] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) {
- if (op_ret >= 0) {
- prebufstr = trace_stat_to_str (prebuf);
- postbufstr = trace_stat_to_str (postbuf);
+ conf = this->private;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, *prebuf = {%s}, "
- "*postbuf = {%s} )",
- frame->root->unique, op_ret,
- prebufstr, postbufstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_RENAME].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (newloc->inode)
+ uuid_utoa_r(newloc->inode->gfid, newgfid);
+ else
+ strcpy(newgfid, "0");
- if (prebufstr)
- GF_FREE (prebufstr);
+ uuid_utoa_r(oldloc->inode->gfid, oldgfid);
- if (postbufstr)
- GF_FREE (postbufstr);
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": oldgfid=%s oldpath=%s --> "
+ "newgfid=%s newpath=%s",
+ frame->root->unique, oldgfid, oldloc->path, newgfid,
+ newloc->path);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ frame->local = oldloc->inode->gfid;
- STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
-
-
-int
-trace_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- uint64_t ia_time = 0;
- char atime_buf[256] = {0, };
- char mtime_buf[256] = {0, };
- char ctime_buf[256] = {0, };
-
- if (trace_fop_names[GF_FOP_FSTAT].enabled) {
- if (op_ret >= 0) {
- ia_time = buf->ia_atime;
- strftime (atime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = buf->ia_mtime;
- strftime (mtime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = buf->ia_ctime;
- strftime (ctime_buf, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, *buf {"
- "ia_ino=%"PRIu64", st_mode=%o, ia_nlink=%"GF_PRI_NLINK", "
- "ia_uid=%d, ia_gid=%d, ia_rdev=%"PRIu64", ia_size=%"PRId64", "
- "ia_blksize=%"GF_PRI_BLKSIZE", ia_blocks=%"PRId64", ia_atime=%s, "
- "ia_mtime=%s, ia_ctime=%s})",
- frame->root->unique, op_ret, buf->ia_ino,
- st_mode_from_ia (buf->ia_prot, buf->ia_type),
- buf->ia_nlink, buf->ia_uid, buf->ia_gid,
- buf->ia_rdev, buf->ia_size, buf->ia_blksize,
- buf->ia_blocks, atime_buf, mtime_buf, ctime_buf);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf);
- return 0;
-}
+out:
+ STACK_WIND(frame, trace_rename_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+ return 0;
+}
int
-trace_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock)
+trace_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_LK].enabled) {
- if (op_ret >= 0) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, {l_type=%d, l_whence=%d, "
- "l_start=%"PRId64", l_len=%"PRId64", l_pid=%u})",
- frame->root->unique, op_ret, lock->l_type, lock->l_whence,
- lock->l_start, lock->l_len, lock->l_pid);
- } else {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
- }
+ char oldgfid[50] = {
+ 0,
+ };
+ char newgfid[50] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock);
- return 0;
-}
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_LINK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (newloc->inode)
+ uuid_utoa_r(newloc->inode->gfid, newgfid);
+ else
+ strcpy(newgfid, "0");
+ uuid_utoa_r(oldloc->inode->gfid, oldgfid);
-int
-trace_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- if (trace_fop_names[GF_FOP_ENTRYLK].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": op_ret=%d, op_errno=%d",
- frame->root->unique, op_ret, op_errno);
- }
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": oldgfid=%s oldpath=%s --> "
+ "newgfid=%s newpath=%s",
+ frame->root->unique, oldgfid, oldloc->path, newgfid,
+ newloc->path);
- STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno);
- return 0;
-}
+ frame->local = oldloc->inode->gfid;
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ STACK_WIND(frame, trace_link_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+ return 0;
+}
int
-trace_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
+trace_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_XATTROP].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
- }
+ char actime_str[GF_TIMESTR_SIZE] = {
+ 0,
+ };
+ char modtime_str[GF_TIMESTR_SIZE] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, dict);
- return 0;
-}
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_SETATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (valid & GF_SET_ATTR_MODE) {
+ snprintf(
+ string, sizeof(string), "%" PRId64 ": gfid=%s path=%s mode=%o)",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ st_mode_from_ia(stbuf->ia_prot, stbuf->ia_type));
+ LOG_ELEMENT(conf, string);
+ memset(string, 0, sizeof(string));
+ }
-int
-trace_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict)
-{
- if (trace_fop_names[GF_FOP_FXATTROP].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
+ if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s path=%s uid=%o,"
+ " gid=%o",
+ frame->root->unique, uuid_utoa(loc->inode->gfid),
+ loc->path, stbuf->ia_uid, stbuf->ia_gid);
+
+ LOG_ELEMENT(conf, string);
+ memset(string, 0, sizeof(string));
}
- STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, dict);
- return 0;
-}
+ if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
+ gf_time_fmt(actime_str, sizeof actime_str, stbuf->ia_atime,
+ gf_timefmt_bdT);
+ gf_time_fmt(modtime_str, sizeof modtime_str, stbuf->ia_mtime,
+ gf_timefmt_bdT);
-int
-trace_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- if (trace_fop_names[GF_FOP_INODELK].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": op_ret=%d, op_errno=%d",
- frame->root->unique, op_ret, op_errno);
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s path=%s "
+ "ia_atime=%s, ia_mtime=%s",
+ frame->root->unique, uuid_utoa(loc->inode->gfid),
+ loc->path, actime_str, modtime_str);
+
+ LOG_ELEMENT(conf, string);
+ memset(string, 0, sizeof(string));
+ }
+ frame->local = loc->inode->gfid;
+ }
+
+out:
+ STACK_WIND(frame, trace_setattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+
+ return 0;
+}
+
+int
+trace_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ char actime_str[GF_TIMESTR_SIZE] = {
+ 0,
+ };
+ char modtime_str[GF_TIMESTR_SIZE] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSETATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ if (valid & GF_SET_ATTR_MODE) {
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s fd=%p, mode=%o", frame->root->unique,
+ uuid_utoa(fd->inode->gfid), fd,
+ st_mode_from_ia(stbuf->ia_prot, stbuf->ia_type));
+
+ LOG_ELEMENT(conf, string);
+ memset(string, 0, sizeof(string));
}
- STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno);
- return 0;
-}
+ if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s fd=%p, uid=%o, "
+ "gid=%o",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd,
+ stbuf->ia_uid, stbuf->ia_gid);
+
+ LOG_ELEMENT(conf, string);
+ memset(string, 0, sizeof(string));
+ }
+
+ if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
+ gf_time_fmt(actime_str, sizeof actime_str, stbuf->ia_atime,
+ gf_timefmt_bdT);
+
+ gf_time_fmt(modtime_str, sizeof modtime_str, stbuf->ia_mtime,
+ gf_timefmt_bdT);
+
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s fd=%p "
+ "ia_atime=%s, ia_mtime=%s",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd,
+ actime_str, modtime_str);
+ LOG_ELEMENT(conf, string);
+ memset(string, 0, sizeof(string));
+ }
+ frame->local = fd->inode->gfid;
+ }
-int
-trace_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
+out:
+ STACK_WIND(frame, trace_fsetattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+
+ return 0;
+}
+
+static int
+trace_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, off_t offset, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_ENTRYLK].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": volume=%s, (loc= {path=%s, ino=%"PRIu64"} basename=%s, cmd=%s, type=%s)",
- frame->root->unique, volume, loc->path, loc->inode->ino, basename,
- ((cmd == ENTRYLK_LOCK) ? "ENTRYLK_LOCK" : "ENTRYLK_UNLOCK"),
- ((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK"));
- }
+ trace_conf_t *conf = this->private;
- STACK_WIND (frame, trace_entrylk_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->entrylk,
- volume, loc, basename, cmd, type);
- return 0;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_SEEK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s op_ret=%d op_errno=%d, "
+ "offset=%" PRId64 "",
+ frame->root->unique, uuid_utoa(frame->local), op_ret, op_errno,
+ offset);
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata);
+ return 0;
}
+static int
+trace_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata)
+{
+ trace_conf_t *conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_SEEK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s fd=%p "
+ "offset=%" PRId64 " what=%d",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd, offset,
+ what);
+
+ frame->local = fd->inode->gfid;
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_seek_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata);
+
+ return 0;
+}
int
-trace_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
- loc_t *loc, int32_t cmd, struct gf_flock *flock)
+trace_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
{
- char *cmd_str = NULL;
- char *type_str = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_INODELK].enabled) {
- switch (cmd) {
-#if F_GETLK != F_GETLK64
- case F_GETLK64:
-#endif
- case F_GETLK:
- cmd_str = "GETLK";
- break;
+ conf = this->private;
-#if F_SETLK != F_SETLK64
- case F_SETLK64:
-#endif
- case F_SETLK:
- cmd_str = "SETLK";
- break;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_TRUNCATE].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s path=%s, "
+ "offset=%" PRId64 "",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ offset);
-#if F_SETLKW != F_SETLKW64
- case F_SETLKW64:
-#endif
- case F_SETLKW:
- cmd_str = "SETLKW";
- break;
-
- default:
- cmd_str = "UNKNOWN";
- break;
- }
-
- switch (flock->l_type) {
- case F_RDLCK:
- type_str = "READ";
- break;
- case F_WRLCK:
- type_str = "WRITE";
- break;
- case F_UNLCK:
- type_str = "UNLOCK";
- break;
- default:
- type_str = "UNKNOWN";
- break;
- }
-
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": volume=%s, (loc {path=%s, ino=%"PRIu64"}, "
- "cmd=%s, type=%s, start=%llu, len=%llu, pid=%llu)",
- frame->root->unique, volume, loc->path, loc->inode->ino,
- cmd_str, type_str, (unsigned long long) flock->l_start,
- (unsigned long long) flock->l_len,
- (unsigned long long) flock->l_pid);
- }
+ frame->local = loc->inode->gfid;
- STACK_WIND (frame, trace_inodelk_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->inodelk,
- volume, loc, cmd, flock);
- return 0;
-}
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_truncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+
+ return 0;
+}
int
-trace_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+trace_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_FINODELK].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": op_ret=%d, op_errno=%d",
- frame->root->unique, op_ret, op_errno);
- }
+ trace_conf_t *conf = NULL;
- STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno);
- return 0;
-}
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_OPEN].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s path=%s flags=%d fd=%p",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ flags, fd);
+ frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ STACK_WIND(frame, trace_open_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+}
int
-trace_finodelk (call_frame_t *frame, xlator_t *this, const char *volume,
- fd_t *fd, int32_t cmd, struct gf_flock *flock)
+trace_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- char *cmd_str = NULL, *type_str = NULL;
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_FINODELK].enabled) {
- switch (cmd) {
-#if F_GETLK != F_GETLK64
- case F_GETLK64:
-#endif
- case F_GETLK:
- cmd_str = "GETLK";
- break;
+ conf = this->private;
-#if F_SETLK != F_SETLK64
- case F_SETLK64:
-#endif
- case F_SETLK:
- cmd_str = "SETLK";
- break;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_CREATE].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s path=%s, fd=%p, "
+ "flags=0%o mode=0%o umask=0%o",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ fd, flags, mode, umask);
-#if F_SETLKW != F_SETLKW64
- case F_SETLKW64:
-#endif
- case F_SETLKW:
- cmd_str = "SETLKW";
- break;
-
- default:
- cmd_str = "UNKNOWN";
- break;
- }
-
- switch (flock->l_type) {
- case F_RDLCK:
- type_str = "READ";
- break;
- case F_WRLCK:
- type_str = "WRITE";
- break;
- case F_UNLCK:
- type_str = "UNLOCK";
- break;
- default:
- type_str = "UNKNOWN";
- break;
- }
-
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": volume=%s, (fd =%p, ino=%"PRIu64"}, "
- "cmd=%s, type=%s, start=%llu, len=%llu, pid=%llu)",
- frame->root->unique, volume, fd, fd->inode->ino,
- cmd_str, type_str, (unsigned long long) flock->l_start,
- (unsigned long long) flock->l_len,
- (unsigned long long) flock->l_pid);
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_WIND (frame, trace_finodelk_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->finodelk,
- volume, fd, cmd, flock);
- return 0;
+out:
+ STACK_WIND(frame, trace_create_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+ xdata);
+ return 0;
}
-
int
-trace_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t flags, dict_t *dict)
+trace_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_XATTROP].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (path=%s, ino=%"PRIu64" flags=%d)",
- frame->root->unique, loc->path, loc->inode->ino, flags);
+ trace_conf_t *conf = NULL;
- }
+ conf = this->private;
- STACK_WIND (frame, trace_xattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->xattrop,
- loc, flags, dict);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_READ].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s fd=%p, size=%" GF_PRI_SIZET
+ "offset=%" PRId64 " flags=0%x)",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd, size,
+ offset, flags);
- return 0;
-}
+ frame->local = fd->inode->gfid;
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ STACK_WIND(frame, trace_readv_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+ return 0;
+}
int
-trace_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t flags, dict_t *dict)
+trace_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_FXATTROP].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (fd=%p, flags=%d)",
- frame->root->unique, fd, flags);
+ trace_conf_t *conf = NULL;
+ int i = 0;
+ size_t total_size = 0;
- }
+ conf = this->private;
- STACK_WIND (frame, trace_fxattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fxattrop,
- fd, flags, dict);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_WRITE].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ for (i = 0; i < count; i++)
+ total_size += vector[i].iov_len;
- return 0;
-}
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s fd=%p, count=%d, "
+ " offset=%" PRId64 " flags=0%x write_size=%zu",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd, count,
+ offset, flags, total_size);
+
+ frame->local = fd->inode->gfid;
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ STACK_WIND(frame, trace_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+ flags, iobref, xdata);
+ return 0;
+}
int
-trace_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
+trace_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_LOOKUP].enabled) {
- /* TODO: print all the keys mentioned in xattr_req */
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"})",
- frame->root->unique, loc->path,
- loc->inode->ino);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_lookup_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lookup,
- loc, xattr_req);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_STATFS].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s",
+ frame->root->unique,
+ (loc->inode) ? uuid_utoa(loc->inode->gfid) : "0", loc->path);
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_statfs_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->statfs, loc, xdata);
+ return 0;
+}
int
-trace_stat (call_frame_t *frame, xlator_t *this, loc_t *loc)
+trace_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_STAT].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"})",
- frame->root->unique, loc->path, loc->inode->ino);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_stat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat,
- loc);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FLUSH].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s fd=%p",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd);
+
+ frame->local = fd->inode->gfid;
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ STACK_WIND(frame, trace_flush_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush, fd, xdata);
+ return 0;
+}
int
-trace_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size)
+trace_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_READLINK].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, size=%"GF_PRI_SIZET")",
- frame->root->unique, loc->path, loc->inode->ino, size);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_readlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readlink,
- loc, size);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSYNC].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s flags=%d fd=%p",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), flags, fd);
+
+ frame->local = fd->inode->gfid;
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ STACK_WIND(frame, trace_fsync_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+ return 0;
+}
int
-trace_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
- mode_t mode, dev_t dev, dict_t *params)
+trace_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_MKNOD].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, mode=%d, dev=%"GF_PRI_DEV")",
- frame->root->unique, loc->path, loc->inode->ino, mode, dev);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_mknod_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod,
- loc, mode, dev, params);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_SETXATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s path=%s flags=%d", frame->root->unique,
+ uuid_utoa(loc->inode->gfid), loc->path, flags);
+ frame->local = loc->inode->gfid;
-int
-trace_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dict_t *params)
-{
- if (trace_fop_names[GF_FOP_MKDIR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (path=%s, ino=%"PRIu64", mode=%d)",
- frame->root->unique, loc->path,
- ((loc->inode)? loc->inode->ino : 0), mode);
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_WIND (frame, trace_mkdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mkdir,
- loc, mode, params);
- return 0;
+out:
+ STACK_WIND(frame, trace_setxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+ return 0;
}
-
int
-trace_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
+trace_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_UNLINK].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"})",
- frame->root->unique, loc->path, loc->inode->ino);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_unlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink,
- loc);
- return 0;
-}
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_GETXATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s name=%s",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ name);
-int
-trace_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags)
-{
- if (trace_fop_names[GF_FOP_RMDIR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, flags=%d)",
- frame->root->unique, loc->path, loc->inode->ino, flags);
- }
+ frame->local = loc->inode->gfid;
- STACK_WIND (frame, trace_rmdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rmdir,
- loc, flags);
+ LOG_ELEMENT(conf, string);
+ }
- return 0;
+out:
+ STACK_WIND(frame, trace_getxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+ return 0;
}
-
int
-trace_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
- loc_t *loc, dict_t *params)
+trace_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_SYMLINK].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (linkpath=%s, loc {path=%s, ino=%"PRIu64"})",
- frame->root->unique, linkpath, loc->path,
- ((loc->inode)? loc->inode->ino : 0));
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_symlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->symlink,
- linkpath, loc, params);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s name=%s",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ name);
+ frame->local = loc->inode->gfid;
-int
-trace_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc)
-{
- if (trace_fop_names[GF_FOP_RENAME].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (oldloc {path=%s, ino=%"PRIu64"}, "
- "newloc{path=%s, ino=%"PRIu64"})",
- frame->root->unique, oldloc->path, oldloc->ino,
- newloc->path, newloc->ino);
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_WIND (frame, trace_rename_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename,
- oldloc, newloc);
+out:
+ STACK_WIND(frame, trace_removexattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
- return 0;
+ return 0;
}
-
int
-trace_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc)
+trace_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_LINK].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (oldloc {path=%s, ino=%"PRIu64"}, "
- "newloc {path=%s, ino=%"PRIu64"})",
- frame->root->unique, oldloc->path, oldloc->inode->ino,
- newloc->path, newloc->inode->ino);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_link_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->link,
- oldloc, newloc);
- return 0;
-}
-
-
-int
-trace_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid)
-{
- uint64_t ia_time = 0;
- char actime_str[256] = {0,};
- char modtime_str[256] = {0,};
-
- if (trace_fop_names[GF_FOP_SETATTR].enabled) {
- if (valid & GF_SET_ATTR_MODE) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"},"
- " mode=%o)", frame->root->unique, loc->path,
- loc->inode->ino,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type));
- }
-
- if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"},"
- " uid=%o, gid=%o)",
- frame->root->unique, loc->path, loc->inode->ino,
- stbuf->ia_uid, stbuf->ia_gid);
- }
-
- if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
- ia_time = stbuf->ia_atime;
- strftime (actime_str, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- ia_time = stbuf->ia_mtime;
- strftime (modtime_str, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
-
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, "
- "*stbuf=%p {ia_atime=%s, ia_mtime=%s})",
- frame->root->unique, loc->path, loc->inode->ino,
- stbuf, actime_str, modtime_str);
- }
- }
+ conf = this->private;
- STACK_WIND (frame, trace_setattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setattr,
- loc, stbuf, valid);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_OPENDIR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s fd=%p",
+ frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+ fd);
- return 0;
-}
+ frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_opendir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+ return 0;
+}
int
-trace_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *stbuf, int32_t valid)
+trace_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *dict)
{
- uint64_t ia_time = 0;
- char actime_str[256] = {0,};
- char modtime_str[256] = {0,};
-
- if (trace_fop_names[GF_FOP_FSETATTR].enabled) {
- if (valid & GF_SET_ATTR_MODE) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (*fd=%p, mode=%o)",
- frame->root->unique, fd,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type));
- }
+ trace_conf_t *conf = NULL;
- if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (*fd=%p, uid=%o, gid=%o)",
- frame->root->unique, fd,
- stbuf->ia_uid, stbuf->ia_gid);
- }
+ conf = this->private;
- if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
- ia_time = stbuf->ia_atime;
- strftime (actime_str, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_READDIRP].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s fd=%p, size=%" GF_PRI_SIZET
+ ", offset=%" PRId64 " dict=%p",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd, size,
+ offset, dict);
- ia_time = stbuf->ia_mtime;
- strftime (modtime_str, 256, "[%b %d %H:%M:%S]",
- localtime ((time_t *)&ia_time));
+ frame->local = fd->inode->gfid;
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (*fd=%p"
- "*stbuf=%p {ia_atime=%s, ia_mtime=%s})",
- frame->root->unique, fd, stbuf, actime_str,
- modtime_str);
- }
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_WIND (frame, trace_fsetattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetattr,
- fd, stbuf, valid);
+out:
+ STACK_WIND(frame, trace_readdirp_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict);
- return 0;
+ return 0;
}
-
int
-trace_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
- off_t offset)
+trace_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_TRUNCATE].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, offset=%"PRId64")",
- frame->root->unique, loc->path, loc->inode->ino, offset);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- loc, offset);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_READDIR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s fd=%p, size=%" GF_PRI_SIZET
+ ", offset=%" PRId64,
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd, size,
+ offset);
+ frame->local = fd->inode->gfid;
-int
-trace_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t flags, fd_t *fd, int32_t wbflags)
-{
- if (trace_fop_names[GF_FOP_OPEN].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, flags=%d, "
- "fd=%p, wbflags=%d)",
- frame->root->unique, loc->path, loc->inode->ino, flags,
- fd, wbflags);
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_WIND (frame, trace_open_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open,
- loc, flags, fd, wbflags);
- return 0;
-}
+out:
+ STACK_WIND(frame, trace_readdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata);
+ return 0;
+}
int
-trace_create (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t flags, mode_t mode, fd_t *fd, dict_t *params)
+trace_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_CREATE].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, flags=0%o mode=0%o)",
- frame->root->unique, loc->path, loc->inode->ino, flags, mode);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_create_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create,
- loc, flags, mode, fd, params);
- return 0;
-}
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s datasync=%d fd=%p", frame->root->unique,
+ uuid_utoa(fd->inode->gfid), datasync, fd);
-int
-trace_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
- size_t size, off_t offset)
-{
- if (trace_fop_names[GF_FOP_READ].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (*fd=%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
- frame->root->unique, fd, size, offset);
- }
+ frame->local = fd->inode->gfid;
- STACK_WIND (frame, trace_readv_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv,
- fd, size, offset);
- return 0;
-}
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_fsyncdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsyncdir, fd, datasync, xdata);
+ return 0;
+}
int
-trace_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count,
- off_t offset, struct iobref *iobref)
+trace_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_WRITE].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (*fd=%p, *vector=%p, count=%d, offset=%"PRId64")",
- frame->root->unique, fd, vector, count, offset);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_writev_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev,
- fd, vector, count, offset, iobref);
- return 0;
-}
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_ACCESS].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s path=%s mask=0%o", frame->root->unique,
+ uuid_utoa(loc->inode->gfid), loc->path, mask);
-int
-trace_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- if (trace_fop_names[GF_FOP_STATFS].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"})",
- frame->root->unique, loc->path,
- ((loc->inode)? loc->inode->ino : 0));
- }
+ frame->local = loc->inode->gfid;
- STACK_WIND (frame, trace_statfs_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->statfs,
- loc);
- return 0;
-}
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_access_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->access, loc, mask, xdata);
+ return 0;
+}
-int
-trace_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+int32_t
+trace_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ int32_t len, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_FLUSH].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (*fd=%p)",
- frame->root->unique, fd);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_flush_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd);
- return 0;
-}
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_RCHECKSUM].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s offset=%" PRId64 "len=%u fd=%p",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), offset, len,
+ fd);
-int
-trace_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags)
-{
- if (trace_fop_names[GF_FOP_FSYNC].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (flags=%d, *fd=%p)",
- frame->root->unique, flags, fd);
- }
+ frame->local = fd->inode->gfid;
- STACK_WIND (frame, trace_fsync_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsync,
- fd, flags);
- return 0;
-}
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_rchecksum_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata);
-int
-trace_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags)
+ return 0;
+}
+
+int32_t
+trace_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+ fd_t *fd, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_SETXATTR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, dict=%p, flags=%d)",
- frame->root->unique, loc->path,
- ((loc->inode)? loc->inode->ino : 0), dict, flags);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_setxattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setxattr,
- loc, dict, flags);
- return 0;
-}
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FENTRYLK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s volume=%s, (fd=%p "
+ "basename=%s, cmd=%s, type=%s)",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), volume, fd,
+ basename,
+ ((cmd == ENTRYLK_LOCK) ? "ENTRYLK_LOCK" : "ENTRYLK_UNLOCK"),
+ ((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK"));
-int
-trace_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
-{
- if (trace_fop_names[GF_FOP_GETXATTR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"}), name=%s",
- frame->root->unique, loc->path,
- ((loc->inode)? loc->inode->ino : 0), name);
- }
+ frame->local = fd->inode->gfid;
- STACK_WIND (frame, trace_getxattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr,
- loc, name);
- return 0;
-}
+ LOG_ELEMENT(conf, string);
+ }
+out:
+ STACK_WIND(frame, trace_fentrylk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fentrylk, volume, fd, basename, cmd,
+ type, xdata);
+ return 0;
+}
-int
-trace_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
+int32_t
+trace_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, name=%s)",
- frame->root->unique, loc->path,
- ((loc->inode)? loc->inode->ino : 0), name);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_removexattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr,
- loc, name);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FGETXATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s fd=%p name=%s",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd, name);
+ frame->local = fd->inode->gfid;
-int
-trace_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
-{
- if (trace_fop_names[GF_FOP_OPENDIR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64":( loc {path=%s, ino=%"PRIu64"}, fd=%p)",
- frame->root->unique, loc->path, loc->inode->ino, fd);
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_WIND (frame, trace_opendir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->opendir,
- loc, fd);
- return 0;
+out:
+ STACK_WIND(frame, trace_fgetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+ return 0;
}
-int
-trace_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
+int32_t
+trace_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_READDIRP].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (fd=%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
- frame->root->unique, fd, size, offset);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_readdirp_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdirp,
- fd, size, offset);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSETXATTR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s fd=%p flags=%d",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd, flags);
+
+ frame->local = fd->inode->gfid;
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ STACK_WIND(frame, trace_fsetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+ return 0;
+}
int
-trace_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
- size_t size, off_t offset)
+trace_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_READDIR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (fd=%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
- frame->root->unique, fd, size, offset);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_readdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdir,
- fd, size, offset);
+ conf = this->private;
- return 0;
-}
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64 ": gfid=%s offset=%" PRId64 " fd=%p",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), offset, fd);
+ frame->local = fd->inode->gfid;
-int
-trace_fsyncdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t datasync)
-{
- if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (datasync=%d, *fd=%p)",
- frame->root->unique, datasync, fd);
- }
+ LOG_ELEMENT(conf, string);
+ }
- STACK_WIND (frame, trace_fsyncdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsyncdir,
- fd, datasync);
- return 0;
-}
+out:
+ STACK_WIND(frame, trace_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
+}
int
-trace_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask)
+trace_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_ACCESS].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (*loc {path=%s, ino=%"PRIu64"}, mask=0%o)",
- frame->root->unique, loc->path,
- ((loc->inode)? loc->inode->ino : 0), mask);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_access_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->access,
- loc, mask);
- return 0;
-}
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSTAT].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s fd=%p",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd);
-int
-trace_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset)
-{
- if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (offset=%"PRId64", *fd=%p)",
- frame->root->unique, offset, fd);
- }
+ frame->local = fd->inode->gfid;
- STACK_WIND (frame, trace_ftruncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate,
- fd, offset);
+ LOG_ELEMENT(conf, string);
+ }
- return 0;
+out:
+ STACK_WIND(frame, trace_fstat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd, xdata);
+ return 0;
}
-
int
-trace_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd)
+trace_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
{
- if (trace_fop_names[GF_FOP_FSTAT].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (*fd=%p)",
- frame->root->unique, fd);
- }
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_LK].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string),
+ "%" PRId64
+ ": gfid=%s fd=%p, cmd=%d, "
+ "lock {l_type=%d, "
+ "l_whence=%d, l_start=%" PRId64
+ ", "
+ "l_len=%" PRId64 ", l_pid=%u})",
+ frame->root->unique, uuid_utoa(fd->inode->gfid), fd, cmd,
+ lock->l_type, lock->l_whence, lock->l_start, lock->l_len,
+ lock->l_pid);
+
+ frame->local = fd->inode->gfid;
- STACK_WIND (frame, trace_fstat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat,
- fd);
- return 0;
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ STACK_WIND(frame, trace_lk_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata);
+ return 0;
}
+int32_t
+trace_forget(xlator_t *this, inode_t *inode)
+{
+ trace_conf_t *conf = NULL;
-int
-trace_lk (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t cmd, struct gf_flock *lock)
+ conf = this->private;
+ /* If user want to understand when a lookup happens,
+ he should know about 'forget' too */
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_LOOKUP].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "gfid=%s", uuid_utoa(inode->gfid));
+
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ return 0;
+}
+
+int32_t
+trace_releasedir(xlator_t *this, fd_t *fd)
{
- if (trace_fop_names[GF_FOP_LK].enabled) {
- gf_log (this->name, GF_LOG_NORMAL,
- "%"PRId64": (*fd=%p, cmd=%d, lock {l_type=%d, l_whence=%d, "
- "l_start=%"PRId64", l_len=%"PRId64", l_pid=%u})",
- frame->root->unique, fd, cmd, lock->l_type, lock->l_whence,
- lock->l_start, lock->l_len, lock->l_pid);
- }
+ trace_conf_t *conf = NULL;
- STACK_WIND (frame, trace_lk_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lk,
- fd, cmd, lock);
- return 0;
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_OPENDIR].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "gfid=%s fd=%p",
+ uuid_utoa(fd->inode->gfid), fd);
+
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ return 0;
}
+int32_t
+trace_release(xlator_t *this, fd_t *fd)
+{
+ trace_conf_t *conf = NULL;
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_OPEN].enabled ||
+ trace_fop_names[GF_FOP_CREATE].enabled) {
+ char string[4096] = {
+ 0,
+ };
+ snprintf(string, sizeof(string), "gfid=%s fd=%p",
+ uuid_utoa(fd->inode->gfid), fd);
+
+ LOG_ELEMENT(conf, string);
+ }
+
+out:
+ return 0;
+}
void
-enable_all_calls (int enabled)
+enable_all_calls(int enabled)
{
- int i;
+ int i;
- for (i = 0; i < GF_FOP_MAXVALUE; i++)
- trace_fop_names[i].enabled = enabled;
+ for (i = 0; i < GF_FOP_MAXVALUE; i++)
+ trace_fop_names[i].enabled = enabled;
}
-
void
-enable_call (const char *name, int enabled)
+enable_call(const char *name, int enabled)
{
- int i;
- for (i = 0; i < GF_FOP_MAXVALUE; i++)
- if (!strcasecmp(trace_fop_names[i].name, name))
- trace_fop_names[i].enabled = enabled;
+ int i;
+ for (i = 0; i < GF_FOP_MAXVALUE; i++)
+ if (!strcasecmp(trace_fop_names[i].name, name))
+ trace_fop_names[i].enabled = enabled;
}
-
/*
include = 1 for "include-ops"
= 0 for "exclude-ops"
*/
void
-process_call_list (const char *list, int include)
+process_call_list(const char *list, int include)
{
- enable_all_calls (include ? 0 : 1);
+ enable_all_calls(include ? 0 : 1);
- char *call = strsep ((char **)&list, ",");
+ char *call = strsep((char **)&list, ",");
- while (call) {
- enable_call (call, include);
- call = strsep ((char **)&list, ",");
- }
+ while (call) {
+ enable_call(call, include);
+ call = strsep((char **)&list, ",");
+ }
}
+int32_t
+trace_dump_history(xlator_t *this)
+{
+ int ret = -1;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+ 0,
+ };
+ trace_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO("trace", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->history, out);
+
+ conf = this->private;
+ // Is it ok to return silently if log-history option his off?
+ if (conf && conf->log_history == _gf_true) {
+ gf_proc_dump_build_key(key_prefix, "xlator.debug.trace", "history");
+ gf_proc_dump_add_section("%s", key_prefix);
+ eh_dump(this->history, NULL, dump_history_trace);
+ }
+ ret = 0;
+
+out:
+ return ret;
+}
int32_t
-init (xlator_t *this)
+mem_acct_init(xlator_t *this)
{
- dict_t *options = NULL;
- char *includes = NULL, *excludes = NULL;
- char *forced_loglevel = NULL;
+ int ret = -1;
- if (!this)
- return -1;
+ if (!this)
+ return ret;
- if (!this->children || this->children->next) {
- gf_log (this->name, GF_LOG_ERROR,
- "trace translator requires one subvolume");
- return -1;
- }
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile ");
- }
+ ret = xlator_mem_acct_init(this, gf_trace_mt_end + 1);
+ if (ret != 0) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Memory accounting init"
+ " failed");
+ return ret;
+ }
- options = this->options;
- includes = data_to_str (dict_get (options, "include-ops"));
- excludes = data_to_str (dict_get (options, "exclude-ops"));
+ return ret;
+}
- {
- int i;
- for (i = 0; i < GF_FOP_MAXVALUE; i++) {
- trace_fop_names[i].name = (gf_fop_list[i] ?
- gf_fop_list[i] : ":O");
- trace_fop_names[i].enabled = 1;
- }
- }
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+ int32_t ret = -1;
+ trace_conf_t *conf = NULL;
+ char *includes = NULL, *excludes = NULL;
- if (includes && excludes) {
- gf_log (this->name,
- GF_LOG_ERROR,
- "must specify only one of 'include-ops' and 'exclude-ops'");
- return -1;
- }
- if (includes)
- process_call_list (includes, 1);
- if (excludes)
- process_call_list (excludes, 0);
-
- if (dict_get (options, "force-log-level")) {
- forced_loglevel = data_to_str (dict_get (options,
- "force-log-level"));
- if (!forced_loglevel)
- goto setloglevel;
-
- if (strcmp (forced_loglevel, "NORMAL") == 0)
- trace_log_level = GF_LOG_NORMAL;
- else if (strcmp (forced_loglevel, "TRACE") == 0)
- trace_log_level = GF_LOG_TRACE;
- else if (strcmp (forced_loglevel, "ERROR") == 0)
- trace_log_level = GF_LOG_ERROR;
- else if (strcmp (forced_loglevel, "DEBUG") == 0)
- trace_log_level = GF_LOG_DEBUG;
- else if (strcmp (forced_loglevel, "WARNING") == 0)
- trace_log_level = GF_LOG_WARNING;
- else if (strcmp (forced_loglevel, "CRITICAL") == 0)
- trace_log_level = GF_LOG_CRITICAL;
- else if (strcmp (forced_loglevel, "NONE") == 0)
- trace_log_level = GF_LOG_NONE;
+ GF_VALIDATE_OR_GOTO("quick-read", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO(this->name, options, out);
+
+ conf = this->private;
+
+ includes = data_to_str(dict_get(options, "include-ops"));
+ excludes = data_to_str(dict_get(options, "exclude-ops"));
+
+ {
+ int i;
+ for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+ if (gf_fop_list[i])
+ strncpy(trace_fop_names[i].name, gf_fop_list[i],
+ sizeof(trace_fop_names[i].name));
+ else
+ strncpy(trace_fop_names[i].name, ":0",
+ sizeof(trace_fop_names[i].name));
+ trace_fop_names[i].enabled = 1;
+ trace_fop_names[i].name[sizeof(trace_fop_names[i].name) - 1] = 0;
}
+ }
+
+ if (includes && excludes) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "must specify only one of 'include-ops' and "
+ "'exclude-ops'");
+ goto out;
+ }
+
+ if (includes)
+ process_call_list(includes, 1);
+ if (excludes)
+ process_call_list(excludes, 0);
+
+ /* Should resizing of the event-history be allowed in reconfigure?
+ * for which a new event_history might have to be allocated and the
+ * older history has to be freed.
+ */
+ GF_OPTION_RECONF("log-file", conf->log_file, options, bool, out);
+
+ GF_OPTION_RECONF("log-history", conf->log_history, options, bool, out);
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int32_t
+init(xlator_t *this)
+{
+ dict_t *options = NULL;
+ char *includes = NULL, *excludes = NULL;
+ char *forced_loglevel = NULL;
+ eh_t *history = NULL;
+ int ret = -1;
+ uint64_t history_size = TRACE_DEFAULT_HISTORY_SIZE;
+ trace_conf_t *conf = NULL;
+
+ if (!this)
+ return -1;
+
+ if (!this->children || this->children->next) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "trace translator requires one subvolume");
+ return -1;
+ }
+ if (!this->parents) {
+ gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+ }
+
+ conf = GF_CALLOC(1, sizeof(trace_conf_t), gf_trace_mt_trace_conf_t);
+ if (!conf) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "cannot allocate "
+ "xl->private");
+ return -1;
+ }
+
+ options = this->options;
+ includes = data_to_str(dict_get(options, "include-ops"));
+ excludes = data_to_str(dict_get(options, "exclude-ops"));
+
+ {
+ int i;
+ for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+ if (gf_fop_list[i])
+ strncpy(trace_fop_names[i].name, gf_fop_list[i],
+ sizeof(trace_fop_names[i].name));
+ else
+ strncpy(trace_fop_names[i].name, ":O",
+ sizeof(trace_fop_names[i].name));
+ trace_fop_names[i].enabled = 1;
+ trace_fop_names[i].name[sizeof(trace_fop_names[i].name) - 1] = 0;
+ }
+ }
+
+ if (includes && excludes) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "must specify only one of 'include-ops' and "
+ "'exclude-ops'");
+ return -1;
+ }
+
+ if (includes)
+ process_call_list(includes, 1);
+ if (excludes)
+ process_call_list(excludes, 0);
+
+ GF_OPTION_INIT("history-size", history_size, size, out);
+ conf->history_size = history_size;
+
+ gf_log(this->name, GF_LOG_INFO, "history size %" PRIu64, history_size);
+
+ GF_OPTION_INIT("log-file", conf->log_file, bool, out);
+
+ gf_log(this->name, GF_LOG_INFO, "logging to file %s",
+ (conf->log_file == _gf_true) ? "enabled" : "disabled");
+
+ GF_OPTION_INIT("log-history", conf->log_history, bool, out);
+
+ gf_log(this->name, GF_LOG_DEBUG, "logging to history %s",
+ (conf->log_history == _gf_true) ? "enabled" : "disabled");
+
+ history = eh_new(history_size, _gf_false, NULL);
+ if (!history) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "event history cannot be "
+ "initialized");
+ return -1;
+ }
+
+ this->history = history;
+
+ conf->trace_log_level = GF_LOG_INFO;
+
+ if (dict_get(options, "force-log-level")) {
+ forced_loglevel = data_to_str(dict_get(options, "force-log-level"));
+ if (!forced_loglevel)
+ goto setloglevel;
+
+ if (strcmp(forced_loglevel, "INFO") == 0)
+ conf->trace_log_level = GF_LOG_INFO;
+ else if (strcmp(forced_loglevel, "TRACE") == 0)
+ conf->trace_log_level = GF_LOG_TRACE;
+ else if (strcmp(forced_loglevel, "ERROR") == 0)
+ conf->trace_log_level = GF_LOG_ERROR;
+ else if (strcmp(forced_loglevel, "DEBUG") == 0)
+ conf->trace_log_level = GF_LOG_DEBUG;
+ else if (strcmp(forced_loglevel, "WARNING") == 0)
+ conf->trace_log_level = GF_LOG_WARNING;
+ else if (strcmp(forced_loglevel, "CRITICAL") == 0)
+ conf->trace_log_level = GF_LOG_CRITICAL;
+ else if (strcmp(forced_loglevel, "NONE") == 0)
+ conf->trace_log_level = GF_LOG_NONE;
+ }
setloglevel:
- gf_log_set_loglevel (trace_log_level);
+ gf_log_set_loglevel(this->ctx, conf->trace_log_level);
+ this->private = conf;
+ ret = 0;
+out:
+ if (ret == -1) {
+ if (history)
+ GF_FREE(history);
+ if (conf)
+ GF_FREE(conf);
+ }
- return 0;
+ return ret;
}
void
-fini (xlator_t *this)
+fini(xlator_t *this)
{
- if (!this)
- return;
-
- gf_log (this->name, GF_LOG_NORMAL,
- "trace translator unloaded");
+ if (!this)
return;
+
+ if (this->history)
+ eh_destroy(this->history);
+
+ gf_log(this->name, GF_LOG_INFO, "trace translator unloaded");
+ return;
}
struct xlator_fops fops = {
- .stat = trace_stat,
- .readlink = trace_readlink,
- .mknod = trace_mknod,
- .mkdir = trace_mkdir,
- .unlink = trace_unlink,
- .rmdir = trace_rmdir,
- .symlink = trace_symlink,
- .rename = trace_rename,
- .link = trace_link,
- .truncate = trace_truncate,
- .open = trace_open,
- .readv = trace_readv,
- .writev = trace_writev,
- .statfs = trace_statfs,
- .flush = trace_flush,
- .fsync = trace_fsync,
- .setxattr = trace_setxattr,
- .getxattr = trace_getxattr,
- .removexattr = trace_removexattr,
- .opendir = trace_opendir,
- .readdir = trace_readdir,
- .readdirp = trace_readdirp,
- .fsyncdir = trace_fsyncdir,
- .access = trace_access,
- .ftruncate = trace_ftruncate,
- .fstat = trace_fstat,
- .create = trace_create,
- .lk = trace_lk,
- .inodelk = trace_inodelk,
- .finodelk = trace_finodelk,
- .entrylk = trace_entrylk,
- .lookup = trace_lookup,
- .xattrop = trace_xattrop,
- .fxattrop = trace_fxattrop,
- .setattr = trace_setattr,
- .fsetattr = trace_fsetattr,
+ .stat = trace_stat,
+ .readlink = trace_readlink,
+ .mknod = trace_mknod,
+ .mkdir = trace_mkdir,
+ .unlink = trace_unlink,
+ .rmdir = trace_rmdir,
+ .symlink = trace_symlink,
+ .rename = trace_rename,
+ .link = trace_link,
+ .truncate = trace_truncate,
+ .open = trace_open,
+ .readv = trace_readv,
+ .writev = trace_writev,
+ .statfs = trace_statfs,
+ .flush = trace_flush,
+ .fsync = trace_fsync,
+ .setxattr = trace_setxattr,
+ .getxattr = trace_getxattr,
+ .fsetxattr = trace_fsetxattr,
+ .fgetxattr = trace_fgetxattr,
+ .removexattr = trace_removexattr,
+ .opendir = trace_opendir,
+ .readdir = trace_readdir,
+ .readdirp = trace_readdirp,
+ .fsyncdir = trace_fsyncdir,
+ .access = trace_access,
+ .ftruncate = trace_ftruncate,
+ .fstat = trace_fstat,
+ .create = trace_create,
+ .lk = trace_lk,
+ .inodelk = trace_inodelk,
+ .finodelk = trace_finodelk,
+ .entrylk = trace_entrylk,
+ .fentrylk = trace_fentrylk,
+ .lookup = trace_lookup,
+ .rchecksum = trace_rchecksum,
+ .xattrop = trace_xattrop,
+ .fxattrop = trace_fxattrop,
+ .setattr = trace_setattr,
+ .fsetattr = trace_fsetattr,
+ .seek = trace_seek,
};
-
struct xlator_cbks cbks = {
+ .release = trace_release,
+ .releasedir = trace_releasedir,
+ .forget = trace_forget,
};
struct volume_options options[] = {
- { .key = {"include-ops", "include"},
- .type = GF_OPTION_TYPE_STR,
- /*.value = { ""} */
- },
- { .key = {"exclude-ops", "exclude"},
- .type = GF_OPTION_TYPE_STR
- /*.value = { ""} */
- },
- { .key = {NULL} },
+ {
+ .key = {"include-ops", "include"},
+ .type = GF_OPTION_TYPE_STR,
+ /*.value = { ""} */
+ },
+ {
+ .key = {"exclude-ops", "exclude"},
+ .type = GF_OPTION_TYPE_STR
+ /*.value = { ""} */
+ },
+ {
+ .key = {"history-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .default_value = "1024",
+ },
+ {
+ .key = {"log-file"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ },
+ {
+ .key = {"log-history"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ },
+ {.key = {NULL}},
+};
+
+struct xlator_dumpops dumpops = {.history = trace_dump_history};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1},
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "trace",
+ .category = GF_TECH_PREVIEW,
};
diff --git a/xlators/debug/trace/src/trace.h b/xlators/debug/trace/src/trace.h
new file mode 100644
index 00000000000..b16304799da
--- /dev/null
+++ b/xlators/debug/trace/src/trace.h
@@ -0,0 +1,55 @@
+/*
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <time.h>
+#include <errno.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/event-history.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/circ-buff.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/options.h>
+
+#define TRACE_DEFAULT_HISTORY_SIZE 1024
+
+typedef struct {
+ /* Since the longest fop name is fremovexattr i.e 12 characters, array size
+ * is kept 24, i.e double of the maximum.
+ */
+ char name[24];
+ int enabled;
+} trace_fop_name_t;
+
+trace_fop_name_t trace_fop_names[GF_FOP_MAXVALUE];
+
+typedef struct {
+ gf_boolean_t log_file;
+ gf_boolean_t log_history;
+ uint64_t history_size;
+ int trace_log_level;
+} trace_conf_t;
+
+#define TRACE_STACK_UNWIND(op, frame, params...) \
+ do { \
+ frame->local = NULL; \
+ STACK_UNWIND_STRICT(op, frame, params); \
+ } while (0);
+
+#define LOG_ELEMENT(_conf, _string) \
+ do { \
+ if (_conf) { \
+ if ((_conf->log_history) == _gf_true) \
+ gf_log_eh("%s", _string); \
+ if ((_conf->log_file) == _gf_true) \
+ gf_log(THIS->name, _conf->trace_log_level, "%s", _string); \
+ } \
+ } while (0);
diff --git a/xlators/encryption/Makefile.am b/xlators/encryption/Makefile.am
deleted file mode 100644
index 2cbde680fac..00000000000
--- a/xlators/encryption/Makefile.am
+++ /dev/null
@@ -1,3 +0,0 @@
-SUBDIRS = rot-13
-
-CLEANFILES =
diff --git a/xlators/encryption/rot-13/src/rot-13.c b/xlators/encryption/rot-13/src/rot-13.c
deleted file mode 100644
index a7e9c3c1454..00000000000
--- a/xlators/encryption/rot-13/src/rot-13.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- Copyright (c) 2006-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <ctype.h>
-#include <sys/uio.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-
-#include "rot-13.h"
-
-/*
- * This is a rot13 ``encryption'' xlator. It rot13's data when
- * writing to disk and rot13's it back when reading it.
- * This xlator is meant as an example, NOT FOR PRODUCTION
- * USE ;) (hence no error-checking)
- */
-
-void
-rot13 (char *buf, int len)
-{
- int i;
- for (i = 0; i < len; i++) {
- if (buf[i] >= 'a' && buf[i] <= 'z')
- buf[i] = 'a' + ((buf[i] - 'a' + 13) % 26);
- else if (buf[i] >= 'A' && buf[i] <= 'Z')
- buf[i] = 'A' + ((buf[i] - 'A' + 13) % 26);
- }
-}
-
-void
-rot13_iovec (struct iovec *vector, int count)
-{
- int i;
- for (i = 0; i < count; i++) {
- rot13 (vector[i].iov_base, vector[i].iov_len);
- }
-}
-
-int32_t
-rot13_readv_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iovec *vector,
- int32_t count,
- struct iatt *stbuf,
- struct iobref *iobref)
-{
- rot_13_private_t *priv = (rot_13_private_t *)this->private;
-
- if (priv->decrypt_read)
- rot13_iovec (vector, count);
-
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, stbuf, iobref);
- return 0;
-}
-
-int32_t
-rot13_readv (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t offset)
-{
- STACK_WIND (frame,
- rot13_readv_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->readv,
- fd, size, offset);
- return 0;
-}
-
-int32_t
-rot13_writev_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct iatt *prebuf,
- struct iatt *postbuf)
-{
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
-}
-
-int32_t
-rot13_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t offset,
- struct iobref *iobref)
-{
- rot_13_private_t *priv = (rot_13_private_t *)this->private;
- if (priv->encrypt_write)
- rot13_iovec (vector, count);
-
- STACK_WIND (frame,
- rot13_writev_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->writev,
- fd, vector, count, offset,
- iobref);
- return 0;
-}
-
-int32_t
-init (xlator_t *this)
-{
- data_t *data = NULL;
- rot_13_private_t *priv = NULL;
-
- if (!this->children || this->children->next) {
- gf_log ("rot13", GF_LOG_ERROR,
- "FATAL: rot13 should have exactly one child");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile ");
- }
-
- priv = GF_CALLOC (sizeof (rot_13_private_t), 1, 0);
- if (!priv)
- return -1;
-
- priv->decrypt_read = 1;
- priv->encrypt_write = 1;
-
- data = dict_get (this->options, "encrypt-write");
- if (data) {
- if (gf_string2boolean (data->data, &priv->encrypt_write) == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "encrypt-write takes only boolean options");
- return -1;
- }
- }
-
- data = dict_get (this->options, "decrypt-read");
- if (data) {
- if (gf_string2boolean (data->data, &priv->decrypt_read) == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "decrypt-read takes only boolean options");
- return -1;
- }
- }
-
- this->private = priv;
- gf_log ("rot13", GF_LOG_DEBUG, "rot13 xlator loaded");
- return 0;
-}
-
-void
-fini (xlator_t *this)
-{
- rot_13_private_t *priv = this->private;
-
- if (!priv)
- return;
- this->private = NULL;
- GF_FREE (priv);
-
- return;
-}
-
-struct xlator_fops fops = {
- .readv = rot13_readv,
- .writev = rot13_writev
-};
-
-struct xlator_cbks cbks = {
-};
-
-struct volume_options options[] = {
- { .key = {"encrypt-write"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"decrypt-read"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {NULL} },
-};
diff --git a/xlators/encryption/rot-13/src/rot-13.h b/xlators/encryption/rot-13/src/rot-13.h
deleted file mode 100644
index f8f0517b995..00000000000
--- a/xlators/encryption/rot-13/src/rot-13.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- Copyright (c) 2006-2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __ROT_13_H__
-#define __ROT_13_H__
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-typedef struct {
- gf_boolean_t encrypt_write;
- gf_boolean_t decrypt_read;
-} rot_13_private_t;
-
-#endif /* __ROT_13_H__ */
diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am
index 8b23b939b0e..c57897f11ea 100644
--- a/xlators/features/Makefile.am
+++ b/xlators/features/Makefile.am
@@ -1,3 +1,14 @@
-SUBDIRS = locks trash quota read-only access-control mac-compat quiesce #path-converter # filter
+if BUILD_CLOUDSYNC
+ CLOUDSYNC_DIR = cloudsync
+endif
-CLEANFILES =
+if BUILD_METADISP
+ METADISP_DIR = metadisp
+endif
+
+SUBDIRS = locks quota read-only quiesce marker index barrier arbiter upcall \
+ compress changelog gfid-access snapview-client snapview-server trash \
+ shard bit-rot leases selinux sdfs namespace $(CLOUDSYNC_DIR) thin-arbiter \
+ utime $(METADISP_DIR)
+
+CLEANFILES =
diff --git a/xlators/features/access-control/src/Makefile.am b/xlators/features/access-control/src/Makefile.am
deleted file mode 100644
index 6ab8cc4ec4e..00000000000
--- a/xlators/features/access-control/src/Makefile.am
+++ /dev/null
@@ -1,13 +0,0 @@
-xlator_LTLIBRARIES = access-control.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-access_control_la_LDFLAGS = -module -avoidversion
-access_control_la_SOURCES = access-control.c
-access_control_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = access-control.h
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)\
- -L$(xlatordir)/
-
-CLEANFILES =
diff --git a/xlators/features/access-control/src/access-control.c b/xlators/features/access-control/src/access-control.c
deleted file mode 100644
index 7fb0651215c..00000000000
--- a/xlators/features/access-control/src/access-control.c
+++ /dev/null
@@ -1,1862 +0,0 @@
-/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "access-control.h"
-#include "xlator.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "iatt.h"
-
-/* Careful, this function erases the stub from frame->local. Dont call this if
- * a subsequent callback requires retaining access to the stub. This should be
- * called at the end of all access-control related operations, i.e. once the
- * frame will be handed off to the actual fop and the next callback that will
- * be called is the default callback. IOW, the function where call_resume is
- * called.
- * NOTE: this is required because FRAME_DESTROY tries to free frame->local if
- * it finds it to be non-NULL.
- */
-call_stub_t *
-__get_frame_stub (call_frame_t *fr)
-{
- call_stub_t *st = NULL;
-
- if (!fr)
- return NULL;
-
- st = fr->local;
- fr->local = NULL;
-
- return st;
-}
-
-
-int
-ac_test_owner_access (struct iatt *ia, uid_t uid, int accesstest)
-{
- int ret = -1;
-
- if (!ia)
- return -1;
-
- /* First test permissions using the uid. */
- if (ia->ia_uid != uid) {
- ret = -1;
- goto out;
- }
-
- /* At this point we know, the uid matches that of the stat structure, so
- * if the caller does not care, we should return success.
- */
- if (ac_test_dontcare (accesstest)) {
- ret = 0;
- goto out;
- }
-
- if (ac_test_read (accesstest))
- ret = IA_PROT_RUSR (ia->ia_prot);
-
- if (ac_test_write (accesstest))
- ret = IA_PROT_WUSR (ia->ia_prot);
-
- if (ac_test_exec (accesstest))
- ret = IA_PROT_XUSR (ia->ia_prot);
-
- /* For failed access test for owner, we need to return EACCES */
- if (!ret)
- ret = -1;
- else
- ret = 0;
-out:
- return ret;
-}
-
-
-int
-ac_test_group_access (struct iatt *ia, gid_t gid, gid_t *auxgids, int auxcount,
- int accesstest)
-{
- int ret = -1;
- int testgid = -1;
- int x = 0;
-
- if (!ia)
- return -1;
- /* First, determine which gid to test against. This will be determined
- * by first checking which of the gids given to us match the gid in the
- * stat. If none match, then we go to checking with others as the user.
- */
-
- /* If we are only given the primary gid. Dont depend on @auxgids
- * being NULL since I know users of this function can pass statically
- * allocated arrays which cant be NULL and yet contain no valid gids.
- */
-
- if ((ia->ia_gid != gid) && (auxcount == 0)) {
- ret = -1;
- goto out;
- }
-
- if (ia->ia_gid == gid)
- testgid = gid;
- else {
- for (; x < auxcount; ++x) {
- if (ia->ia_gid == auxgids[x]) {
- testgid = ia->ia_gid;
- break;
- }
- }
- }
-
- /* None of the gids match with the gid in the stat. */
- if (testgid == -1) {
- ret = -1;
- goto out;
- }
-
- /* At this point, at least one gid matches that in the stat, now we must
- * check whether the caller is interested in the access check at all.
- */
- if (ac_test_dontcare (accesstest)) {
- ret = 0;
- goto out;
- }
-
- if (ac_test_read (accesstest))
- ret = IA_PROT_RGRP (ia->ia_prot);
-
- if (ac_test_write (accesstest))
- ret = IA_PROT_WGRP (ia->ia_prot);
-
- if (ac_test_exec (accesstest))
- ret = IA_PROT_XGRP (ia->ia_prot);
-
- if (!ret)
- ret = -1;
- else
- ret = 0;
-
-out:
- return ret;
-}
-
-
-int
-ac_test_other_access (struct iatt *ia, int accesstest)
-{
- int ret = 0;
-
- if (!ia)
- return -1;
-
- if (ac_test_read (accesstest))
- ret = IA_PROT_ROTH (ia->ia_prot);
-
- if (ac_test_write (accesstest))
- ret = IA_PROT_WOTH (ia->ia_prot);
-
- if (ac_test_exec (accesstest))
- ret = IA_PROT_XOTH (ia->ia_prot);
-
- if (!ret)
- ret = -1;
- else
- ret = 0;
-
- return ret;
-}
-
-
-/* Returns -1 on a failed access test with @operrno set to the relevant error
- * number.
- */
-int
-ac_test_access (struct iatt *ia, uid_t uid, gid_t gid, gid_t *auxgids,
- int auxcount, int accesstest, int testwho, int *operrno)
-{
- int ret = -1;
-
- if ((!ia) || (!operrno))
- return -1;
-
- if ((uid == 0) && (gid == 0)) {
- gf_log (ACTRL, GF_LOG_TRACE, "Root has access");
- return 0;
- }
-
- if (ac_test_owner (testwho)) {
- gf_log (ACTRL, GF_LOG_TRACE, "Testing owner access");
- ret = ac_test_owner_access (ia, uid, accesstest);
- }
-
- if (ret == 0) {
- gf_log (ACTRL, GF_LOG_TRACE, "Owner has access");
- goto out;
- }
-
- if (ac_test_group (testwho)) {
- gf_log (ACTRL, GF_LOG_TRACE, "Testing group access");
- ret = ac_test_group_access (ia, gid, auxgids, auxcount,
- accesstest);
- }
-
- if (ret == 0) {
- gf_log (ACTRL, GF_LOG_TRACE, "Group has access");
- goto out;
- }
-
- if (ac_test_other (testwho)) {
- gf_log (ACTRL, GF_LOG_TRACE, "Testing other access");
- ret = ac_test_other_access (ia, accesstest);
- }
-
- if (ret == 0)
- gf_log (ACTRL, GF_LOG_TRACE, "Other has access");
-out:
- if (ret == -1) {
- gf_log (ACTRL, GF_LOG_TRACE, "No access allowed");
- *operrno = EPERM;
- }
-
- return ret;
-}
-
-
-int
-ac_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path)
-{
- int ret = -EFAULT;
-
- if (!loc)
- return ret;
-
- if (inode) {
- loc->inode = inode_ref (inode);
- loc->ino = inode->ino;
- }
-
- if (parent)
- loc->parent = inode_ref (parent);
-
- loc->path = gf_strdup (path);
- if (!loc->path) {
- gf_log (ACTRL, GF_LOG_ERROR, "strdup failed");
- goto loc_wipe;
- }
-
- loc->name = strrchr (loc->path, '/');
- if (loc->name)
- loc->name++;
- else
- goto loc_wipe;
-
- ret = 0;
-loc_wipe:
- if (ret < 0)
- loc_wipe (loc);
-
- return ret;
-}
-
-
-int
-ac_inode_loc_fill (inode_t *inode, loc_t *loc)
-{
- char *resolvedpath = NULL;
- inode_t *parent = NULL;
- int ret = -EFAULT;
-
- if ((!inode) || (!loc))
- return ret;
-
- if ((inode) && (inode->ino == 1))
- goto ignore_parent;
-
- parent = inode_parent (inode, 0, NULL);
- if (!parent)
- goto err;
-
-ignore_parent:
- ret = inode_path (inode, NULL, &resolvedpath);
- if (ret < 0)
- goto err;
-
- ret = ac_loc_fill (loc, inode, parent, resolvedpath);
- if (ret < 0)
- goto err;
-
-err:
- if (parent)
- inode_unref (parent);
-
- if (resolvedpath)
- GF_FREE (resolvedpath);
-
- return ret;
-}
-
-
-int
-ac_parent_loc_fill (loc_t *parentloc, loc_t *childloc)
-{
- if ((!parentloc) || (!childloc))
- return -1;
-
- return ac_inode_loc_fill (childloc->parent, parentloc);
-}
-
-
-int32_t
-ac_truncate_resume (call_frame_t *frame, xlator_t *this, loc_t *loc,
- off_t offset)
-{
- STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate, loc, offset);
- return 0;
-}
-
-
-int32_t
-ac_truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_WRITE, ACCTEST_ANY, &op_errno);
- if (op_ret == -1)
- goto out;
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
-
- stub = fop_truncate_stub (frame, ac_truncate_resume, loc, offset);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- STACK_WIND (frame, ac_truncate_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, loc);
-
- ret = 0;
-out:
- if (ret < 0)
- STACK_UNWIND_STRICT (truncate, frame, -1, -ret, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-ac_access_resume (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask)
-{
- STACK_WIND (frame, default_access_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->access, loc, mask);
- return 0;
-}
-
-
-int32_t
-ac_access_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
- int32_t mask = 0;
- int acctest = 0;
-
- stub = __get_frame_stub (frame);
- mask = stub->args.access.mask;
-
- /* If mask requests test for file existence then do not
- * return a failure with ENOENT, instead return a failed
- * access test.
- */
- if (op_ret == -1) {
- if (mask & F_OK)
- op_errno = EACCES;
- else
- op_errno = errno;
-
- goto out;
- }
-
- if (R_OK & mask)
- acctest |= ACCTEST_READ;
- else if (W_OK & mask)
- acctest |= ACCTEST_WRITE;
- else if (X_OK & mask)
- acctest |= ACCTEST_EXEC;
- else
- acctest = 0;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- acctest, ACCTEST_ANY, &op_errno);
- if (op_ret == -1)
- goto out;
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (access, frame, -1, op_errno);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
-
- stub = fop_access_stub (frame, ac_access_resume, loc, mask);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- STACK_WIND (frame, ac_access_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, loc);
- ret = 0;
-
-out:
- if (ret < 0)
- STACK_UNWIND_STRICT (access, frame, -1, -ret);
-
- return 0;
-}
-
-
-int32_t
-ac_readlink_resume (call_frame_t *frame, xlator_t *this, loc_t *loc,
- size_t size)
-{
- STACK_WIND (frame, default_readlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readlink, loc, size);
- return 0;
-}
-
-
-int32_t
-ac_readlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_READ, ACCTEST_ANY, &op_errno);
- if (op_ret == -1)
- goto out;
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (readlink, frame, -1, op_errno, NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
-
- stub = fop_readlink_stub (frame, ac_readlink_resume, loc, size);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- STACK_WIND (frame, ac_readlink_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, loc);
- ret = 0;
-
-out:
- if (ret < 0)
- STACK_UNWIND_STRICT (readlink, frame, -1, -ret, NULL, NULL);
-
- return 0;
-}
-
-
-int
-ac_mknod_resume (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t rdev, dict_t *params)
-{
- STACK_WIND (frame, default_mknod_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, params);
- return 0;
-}
-
-
-int32_t
-ac_mknod_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_WRITE, ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, NULL, NULL,
- NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int
-ac_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t rdev, dict_t *params)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
- loc_t parentloc = {0, };
-
- stub = fop_mknod_stub (frame, ac_mknod_resume, loc, mode, rdev, params);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- ret = ac_parent_loc_fill (&parentloc, loc);
- if (ret < 0)
- goto out;
-
- STACK_WIND (frame, ac_mknod_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &parentloc);
- loc_wipe (&parentloc);
- ret = 0;
-
-out:
- if (ret < 0) {
- /* Erase any stored frame before unwinding. */
- stub = __get_frame_stub (frame);
- if (stub)
- call_stub_destroy (stub);
- STACK_UNWIND_STRICT (mknod, frame, -1, -ret, NULL, NULL, NULL,
- NULL);
- }
-
- return 0;
-}
-
-
-int
-ac_mkdir_resume (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dict_t *params)
-{
- STACK_WIND (frame, default_mkdir_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mkdir, loc, mode, params);
- return 0;
-}
-
-
-int32_t
-ac_mkdir_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_WRITE, ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- /* On a failed write test on parent dir, we need to return
- * EACCES, not EPERM that is returned by default by
- * ac_test_access.
- */
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (mkdir, frame, -1, op_errno, NULL, NULL,
- NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int
-ac_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dict_t *params)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
- loc_t parentloc = {0, };
-
- stub = fop_mkdir_stub (frame, ac_mkdir_resume, loc, mode, params);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- ret = ac_parent_loc_fill (&parentloc, loc);
- if (ret < 0)
- goto out;
-
- STACK_WIND (frame, ac_mkdir_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &parentloc);
- loc_wipe (&parentloc);
- ret = 0;
-
-out:
- if (ret < 0) {
- /* Erase the stored stub before unwinding. */
- stub = __get_frame_stub (frame);
- if (stub)
- call_stub_destroy (stub);
- STACK_UNWIND_STRICT (mkdir, frame, -1, -ret, NULL, NULL, NULL,
- NULL);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_unlink_resume (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink, loc);
- return 0;
-}
-
-
-int32_t
-ac_unlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_WRITE, ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (unlink, frame, -1, op_errno, NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
- loc_t parentloc = {0, };
-
- stub = fop_unlink_stub (frame, ac_unlink_resume, loc);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- ret = ac_parent_loc_fill (&parentloc, loc);
- if (ret < 0)
- goto out;
-
- STACK_WIND (frame, ac_unlink_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &parentloc);
- loc_wipe (&parentloc);
- ret = 0;
-
-out:
- if (ret < 0) {
- /* Erase the stored stub before unwinding. */
- stub = __get_frame_stub (frame);
- if (stub)
- call_stub_destroy (stub);
- STACK_UNWIND_STRICT (unlink, frame, -1, -ret, NULL, NULL);
- }
-
- return 0;
-}
-
-
-int
-ac_rmdir_resume (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags)
-{
- STACK_WIND (frame, default_rmdir_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rmdir, loc, flags);
- return 0;
-}
-
-
-int32_t
-ac_rmdir_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_WRITE, ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (rmdir, frame, -1, op_errno, NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int
-ac_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
- loc_t parentloc = {0, };
-
- stub = fop_rmdir_stub (frame, ac_rmdir_resume, loc, flags);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- ret = ac_parent_loc_fill (&parentloc, loc);
- if (ret < 0)
- goto out;
-
- STACK_WIND (frame, ac_rmdir_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &parentloc);
- loc_wipe (&parentloc);
- ret = 0;
-
-out:
- if (ret < 0) {
- /* Erase the stored stub before unwinding. */
- stub = __get_frame_stub (frame);
- if (stub)
- call_stub_destroy (stub);
- STACK_UNWIND_STRICT (rmdir, frame, -1, -ret, NULL, NULL);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_symlink_resume (call_frame_t *frame, xlator_t *this, const char *linkname,
- loc_t *loc, dict_t *params)
-{
- STACK_WIND (frame, default_symlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->symlink, linkname, loc, params);
- return 0;
-}
-
-
-int32_t
-ac_symlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_WRITE, ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (symlink, frame, -1, op_errno, NULL, NULL,
- NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int
-ac_symlink (call_frame_t *frame, xlator_t *this, const char *linkname,
- loc_t *loc, dict_t *params)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
- loc_t parentloc = {0, };
-
- stub = fop_symlink_stub (frame, ac_symlink_resume, linkname, loc,
- params);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- ret = ac_parent_loc_fill (&parentloc, loc);
- if (ret < 0)
- goto out;
-
- STACK_WIND (frame, ac_symlink_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &parentloc);
- loc_wipe (&parentloc);
- ret = 0;
-
-out:
- if (ret < 0) {
- /* Erase the stored stub before unwinding. */
- stub = __get_frame_stub (frame);
- if (stub)
- call_stub_destroy (stub);
- STACK_UNWIND_STRICT (symlink, frame, -1, -ret, NULL, NULL, NULL,
- NULL);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_rename_resume (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
- loc_t *newloc)
-{
- STACK_WIND (frame, default_rename_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename, oldloc, newloc);
- return 0;
-}
-
-
-int32_t
-ac_rename_dst_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid,
- frame->root->gid, frame->root->groups,
- frame->root->ngrps, ACCTEST_WRITE,
- ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (rename, frame, -1, op_errno, NULL, NULL,
- NULL, NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_rename_src_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
- loc_t parentloc = {0, };
-
- stub = frame->local;
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid,
- frame->root->gid, frame->root->groups,
- frame->root->ngrps, ACCTEST_WRITE,
- ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- op_errno = EACCES;
- goto out;
- }
-
- op_ret = ac_parent_loc_fill (&parentloc, &stub->args.rename.new);
- if (op_ret < 0) {
- op_errno = -EFAULT;
- goto out;
- }
-
- STACK_WIND (frame, ac_rename_dst_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &parentloc);
- loc_wipe (&parentloc);
-
-out:
- if (op_ret < 0) {
- /* Erase the stored stub before unwinding. */
- stub = __get_frame_stub (frame);
- if (stub)
- call_stub_destroy (stub);
- STACK_UNWIND_STRICT (rename, frame, -1, op_errno, NULL, NULL,
- NULL, NULL, NULL);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
- loc_t parentloc = {0, };
-
- stub = fop_rename_stub (frame, ac_rename_resume, oldloc, newloc);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- ret = ac_parent_loc_fill (&parentloc, oldloc);
- if (ret < 0)
- goto out;
-
- STACK_WIND (frame, ac_rename_src_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &parentloc);
- loc_wipe (&parentloc);
- ret = 0;
-
-out:
- if (ret < 0) {
- /* Erase the stored stub before unwinding. */
- stub = __get_frame_stub (frame);
- if (stub)
- call_stub_destroy (stub);
- STACK_UNWIND_STRICT (rename, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_link_resume (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
- loc_t *newloc)
-{
- STACK_WIND (frame, default_link_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->link, oldloc, newloc);
- return 0;
-}
-
-
-int32_t
-ac_link_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_WRITE, ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- /* By default ac_test_access sets the op_errno to EPERM
- * but in the case of link, we need to return EACCES to meet
- * posix requirements when a write permission is not available
- * for the new directory.
- */
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (link, frame, -1, op_errno, NULL, NULL,
- NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
- loc_t parentloc = {0, };
-
- stub = fop_link_stub (frame, ac_link_resume, oldloc, newloc);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- ret = ac_parent_loc_fill (&parentloc, newloc);
- if (ret < 0)
- goto out;
-
- STACK_WIND (frame, ac_link_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &parentloc);
- loc_wipe (&parentloc);
- ret = 0;
-
-out:
- if (ret < 0) {
- /* Erase the stored stub before unwinding. */
- stub = __get_frame_stub (frame);
- if (stub)
- call_stub_destroy (stub);
- STACK_UNWIND_STRICT (link, frame, -1, -ret, NULL, NULL, NULL,
- NULL);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_create_resume (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t flags, mode_t mode, fd_t *fd, dict_t *params)
-{
- STACK_WIND (frame, default_create_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create, loc, flags, mode,
- fd, params);
- return 0;
-}
-
-
-int32_t
-ac_create_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_WRITE, ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (create, frame, -1, op_errno, NULL, NULL,
- NULL, NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- mode_t mode, fd_t *fd, dict_t *params)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
- loc_t parentloc = {0, };
-
- stub = fop_create_stub (frame, ac_create_resume, loc, flags, mode,
- fd, params);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- ret = ac_parent_loc_fill (&parentloc, loc);
- if (ret < 0)
- goto out;
-
- STACK_WIND (frame, ac_create_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &parentloc);
- loc_wipe (&parentloc);
- ret = 0;
-
-out:
- if (ret < 0) {
- /* Erase the stored stub before unwinding. */
- stub = __get_frame_stub (frame);
- if (stub)
- call_stub_destroy (stub);
- STACK_UNWIND_STRICT (create, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_open_resume (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, int32_t wbflags)
-{
- STACK_WIND (frame, default_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, loc, flags, fd, wbflags);
- return 0;
-}
-
-
-int32_t
-ac_open_create_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_WRITE, ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (open, frame, -1, op_errno, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int
-ac_open_create (call_stub_t *stub)
-{
- int ret = -EFAULT;
- loc_t parentloc = {0, };
- xlator_t *this = NULL;
-
- if (!stub)
- return ret;
-
- ret = ac_parent_loc_fill (&parentloc, &stub->args.open.loc);
- if (ret < 0)
- goto out;
-
- this = stub->frame->this;
- STACK_WIND (stub->frame, ac_open_create_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &parentloc);
- loc_wipe (&parentloc);
- ret = 0;
-
-out:
- return ret;
-}
-
-
-int32_t
-ac_open_only_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
- int acctest = 0;
- int32_t flags = 0;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- flags = stub->args.open.flags;
- /* The permissions we test for depend on how the open needs to be
- * performed. */
- if ((flags & O_ACCMODE) == O_RDONLY)
- acctest = ACCTEST_READ;
- else if (((flags & O_ACCMODE) == O_RDWR) ||
- ((flags & O_ACCMODE) == O_WRONLY))
- acctest = ACCTEST_WRITE;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- acctest, ACCTEST_ANY, &op_errno);
- if (op_ret == -1)
- goto out;
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (open, frame, -1, op_errno, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int
-ac_open_only (call_stub_t *stub)
-{
- int ret = -EFAULT;
- xlator_t *this = NULL;
-
- if (!stub)
- return ret;
-
- this = stub->frame->this;
- STACK_WIND (stub->frame, ac_open_only_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, &stub->args.open.loc);
- return 0;
-}
-
-int32_t
-ac_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, int32_t wbflags)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
-
- stub = fop_open_stub (frame, ac_open_resume, loc, flags, fd, wbflags);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- /* If we are not supposed to create the file then there is no need to
- * check the parent dir permissions. */
- if (flags & O_CREAT)
- ret = ac_open_create (stub);
- else
- ret = ac_open_only (stub);
-
-out:
- if (ret < 0) {
- /* Erase the stored stub before unwinding. */
- stub = __get_frame_stub (frame);
- if (stub)
- call_stub_destroy (stub);
- STACK_UNWIND_STRICT (open, frame, -1, -ret, NULL);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_readv_resume (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
-{
- STACK_WIND (frame, default_readv_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->readv, fd, size, offset);
- return 0;
-}
-
-
-int32_t
-ac_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_READ, ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL,
- NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
-
- stub = fop_readv_stub (frame, ac_readv_resume, fd, size, offset);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- STACK_WIND (frame, ac_readv_fstat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fstat, fd);
- ret = 0;
-
-out:
- if (ret < 0)
- STACK_UNWIND_STRICT (readv, frame, -1, -ret, NULL, 0, NULL,
- NULL);
-
- return 0;
-}
-
-
-int32_t
-ac_writev_resume (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- struct iobref *iobref)
-{
- STACK_WIND (frame, default_writev_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->writev, fd, vector, count, offset,
- iobref);
- return 0;
-}
-
-
-int32_t
-ac_writev_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_WRITE, ACCTEST_ANY, &op_errno);
- if (op_ret == -1) {
- op_errno = EACCES;
- goto out;
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
- int32_t count, off_t offset, struct iobref *iobref)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
-
- stub = fop_writev_stub (frame, ac_writev_resume, fd, vector, count,
- offset, iobref);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- STACK_WIND (frame, ac_writev_fstat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fstat, fd);
- ret = 0;
-
-out:
- if (ret < 0)
- STACK_UNWIND_STRICT (writev, frame, -1, -ret, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-ac_opendir_resume (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
-{
- STACK_WIND (frame, default_opendir_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->opendir, loc, fd);
- return 0;
-}
-
-
-int32_t
-ac_opendir_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_READ, ACCTEST_ANY, &op_errno);
- if (op_ret == -1)
- goto out;
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (opendir, frame, -1, op_errno, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-int32_t
-ac_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
-
- stub = fop_opendir_stub (frame, ac_opendir_resume, loc, fd);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- STACK_WIND (frame, ac_opendir_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, loc);
- ret = 0;
-
-out:
- if (ret < 0)
- STACK_UNWIND_STRICT (opendir, frame, -1, -ret, NULL);
-
- return 0;
-}
-
-
-int32_t
-ac_setattr_resume (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *buf, int32_t valid)
-{
- STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setattr, loc, buf, valid);
- return 0;
-}
-
-
-int32_t
-ac_setattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
- int32_t valid = 0;
- struct iatt *setbuf = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_DONTCARE, ACCTEST_OWNER,
- &op_errno);
- if (op_ret == -1)
- goto out;
-
- valid = stub->args.setattr.valid;
- setbuf = &stub->args.setattr.stbuf;
- if (gf_attr_uid_set (valid) || gf_attr_gid_set (valid)) {
- /* chown returns EPERM if the operation would change the
- * ownership, but the effective user ID is not the
- * super-user and the process is not an owner of the file.
- * Ref: posix-testsuite/chown/07.t
- */
- if ((frame->root->uid != 0) && (gf_attr_uid_set (valid))) {
- if (buf->ia_uid != setbuf->ia_uid) {
- op_ret = -1;
- op_errno = EPERM;
- goto out;
- }
- }
-
- /* non-super-user can modify file group if he is owner of a
- * file and gid he is setting is in his groups list.
- * Ref: posix-testsuite/chown/00.t
- */
- if ((frame->root->uid != 0) && (gf_attr_gid_set (valid))) {
- if (frame->root->uid != buf->ia_uid) {
- op_ret = -1;
- op_errno = EPERM;
- goto out;
- }
-
- op_ret = ac_test_access (setbuf, 0, frame->root->gid,
- frame->root->groups,
- frame->root->ngrps,
- ACCTEST_DONTCARE,
- ACCTEST_GROUP, &op_errno);
- if (op_ret == -1)
- goto out;
- }
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (setattr, frame, -1, op_errno, NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-int32_t
-ac_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
- int32_t valid)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
-
- stub = fop_setattr_stub (frame, ac_setattr_resume, loc, buf, valid);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- STACK_WIND (frame, ac_setattr_stat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->stat, loc);
- ret = 0;
-
-out:
- if (ret < 0)
- STACK_UNWIND_STRICT (setattr, frame, -1, -ret, NULL, NULL);
-
- return 0;
-}
-
-
-int32_t
-ac_fsetattr_resume (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *buf, int32_t valid)
-{
- STACK_WIND (frame, default_fsetattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetattr, fd, buf, valid);
- return 0;
-}
-
-
-int32_t
-ac_fsetattr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf)
-{
- call_stub_t *stub = NULL;
- int32_t valid = 0;
- struct iatt *setbuf = NULL;
-
- stub = __get_frame_stub (frame);
- if (op_ret == -1)
- goto out;
-
- op_ret = ac_test_access (buf, frame->root->uid, frame->root->gid,
- frame->root->groups, frame->root->ngrps,
- ACCTEST_DONTCARE, ACCTEST_OWNER,
- &op_errno);
- if (op_ret == -1)
- goto out;
-
- valid = stub->args.fsetattr.valid;
- setbuf = &stub->args.fsetattr.stbuf;
- if (gf_attr_uid_set (valid) && gf_attr_gid_set (valid)) {
- /* chown returns EPERM if the operation would change the
- * ownership, but the effective user ID is not the
- * super-user and the process is not an owner of the file.
- * Ref: posix-testsuite/chown/07.t
- */
- if ((frame->root->uid != 0) && (gf_attr_uid_set (valid))) {
- if (buf->ia_uid != setbuf->ia_uid) {
- op_ret = -1;
- op_errno = EPERM;
- goto out;
- }
- }
-
- /* non-super-user can modify file group if he is owner of a
- * file and gid he is setting is in his groups list.
- * Ref: posix-testsuite/chown/00.t
- */
- if ((frame->root->uid != 0) && (gf_attr_gid_set (valid))) {
- if (frame->root->uid != buf->ia_uid) {
- op_ret = -1;
- op_errno = EPERM;
- goto out;
- }
-
- op_ret = ac_test_access (buf, 0, frame->root->gid,
- frame->root->groups,
- frame->root->ngrps,
- ACCTEST_DONTCARE,
- ACCTEST_GROUP, &op_errno);
- if (op_ret == -1)
- goto out;
- }
- }
-
- call_resume (stub);
-out:
- if (op_ret < 0) {
- STACK_UNWIND_STRICT (fsetattr, frame, -1, op_errno, NULL, NULL);
- if (stub)
- call_stub_destroy (stub);
- }
-
- return 0;
-}
-
-
-int32_t
-ac_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
- int32_t valid)
-{
- call_stub_t *stub = NULL;
- int ret = -EFAULT;
-
- stub = fop_fsetattr_stub (frame, ac_fsetattr_resume, fd, buf, valid);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create call stub: "
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- frame->local = stub;
- STACK_WIND (frame, ac_fsetattr_fstat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fstat, fd);
- ret = 0;
-
-out:
- if (ret < 0)
- STACK_UNWIND_STRICT (fsetattr, frame, -1, -ret, NULL, NULL);
-
- return 0;
-}
-
-
-struct xlator_fops fops = {
- .truncate = ac_truncate,
- .access = ac_access,
- .readlink = ac_readlink,
- .mknod = ac_mknod,
- .mkdir = ac_mkdir,
- .unlink = ac_unlink,
- .rmdir = ac_rmdir,
- .symlink = ac_symlink,
- .rename = ac_rename,
- .link = ac_link,
- .create = ac_create,
- .open = ac_open,
- .readv = ac_readv,
- .writev = ac_writev,
- .opendir = ac_opendir,
- .setattr = ac_setattr,
- .fsetattr = ac_fsetattr,
-};
-
-int
-init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this->children || this->children->next) {
- gf_log (this->name, GF_LOG_ERROR,
- "FATAL: access-control not configured with "
- "exactly one child");
- goto out;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile ");
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-void
-fini (xlator_t *this)
-{
- return;
-}
-
-struct xlator_cbks cbks = {
-};
diff --git a/xlators/features/access-control/src/access-control.h b/xlators/features/access-control/src/access-control.h
deleted file mode 100644
index 8e2d6b6b966..00000000000
--- a/xlators/features/access-control/src/access-control.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-
-#ifndef __ACCESS_CONTROL_H_
-#define __ACCESS_CONTROL_H_
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#define ACTRL "access-control"
-#define ACCTEST_READ 0x1
-#define ACCTEST_WRITE 0x2
-#define ACCTEST_EXEC 0x4
-#define ACCTEST_DONTCARE 0x8
-
-/* Note if the caller is only interested in ownership test i.e. one of the below
-+ * in combination with GF_ACCTEST_DONTCARE, then only one type of user's owner
-+ * ship can be tested with one call to gf_test_access, i.e. we can only
-+ * check of either owner and group, if both need to be tested for a specific
-+ * (uid, gid) pair then two calls will be needed.
-+ */
-#define ACCTEST_OWNER 0x1
-#define ACCTEST_GROUP 0x2
-#define ACCTEST_OTHER 0x4
-
-/* Signifies any user, as long as we get access. */
-#define ACCTEST_ANY (ACCTEST_OWNER | ACCTEST_GROUP | ACCTEST_OTHER)
-
-#define ac_test_owner(acc) ((acc) & ACCTEST_OWNER)
-#define ac_test_group(acc) ((acc) & ACCTEST_GROUP)
-#define ac_test_other(acc) ((acc) & ACCTEST_OTHER)
-#define ac_test_dontcare(acc) ((acc) & ACCTEST_DONTCARE)
-#define ac_test_read(acc) ((acc) & ACCTEST_READ)
-#define ac_test_write(acc) ((acc) & ACCTEST_WRITE)
-#define ac_test_exec(acc) ((acc) & ACCTEST_EXEC)
-#endif
diff --git a/xlators/cluster/unify/Makefile.am b/xlators/features/arbiter/Makefile.am
index d471a3f9243..a985f42a877 100644
--- a/xlators/cluster/unify/Makefile.am
+++ b/xlators/features/arbiter/Makefile.am
@@ -1,3 +1,3 @@
SUBDIRS = src
-CLEANFILES =
+CLEANFILES =
diff --git a/xlators/features/arbiter/src/Makefile.am b/xlators/features/arbiter/src/Makefile.am
new file mode 100644
index 00000000000..badc42f37be
--- /dev/null
+++ b/xlators/features/arbiter/src/Makefile.am
@@ -0,0 +1,19 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = arbiter.la
+endif
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+arbiter_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+arbiter_la_SOURCES = arbiter.c
+arbiter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = arbiter.h arbiter-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/arbiter/src/arbiter-mem-types.h b/xlators/features/arbiter/src/arbiter-mem-types.h
new file mode 100644
index 00000000000..05d18374c46
--- /dev/null
+++ b/xlators/features/arbiter/src/arbiter-mem-types.h
@@ -0,0 +1,18 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __ARBITER_MEM_TYPES_H__
+#define __ARBITER_MEM_TYPES_H__
+#include <glusterfs/mem-types.h>
+
+typedef enum gf_arbiter_mem_types_ {
+ gf_arbiter_mt_inode_ctx_t = gf_common_mt_end + 1,
+ gf_arbiter_mt_end
+} gf_arbiter_mem_types_t;
+#endif
diff --git a/xlators/features/arbiter/src/arbiter.c b/xlators/features/arbiter/src/arbiter.c
new file mode 100644
index 00000000000..83a97e3354b
--- /dev/null
+++ b/xlators/features/arbiter/src/arbiter.c
@@ -0,0 +1,380 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "arbiter.h"
+#include "arbiter-mem-types.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+
+static arbiter_inode_ctx_t *
+__arbiter_inode_ctx_get(inode_t *inode, xlator_t *this)
+{
+ arbiter_inode_ctx_t *ctx = NULL;
+ int ret = 0;
+ uint64_t ctx_addr = 0;
+
+ ret = __inode_ctx_get(inode, this, &ctx_addr);
+ if (ret == 0) {
+ ctx = (arbiter_inode_ctx_t *)(long)ctx_addr;
+ goto out;
+ }
+
+ ctx = GF_CALLOC(1, sizeof(*ctx), gf_arbiter_mt_inode_ctx_t);
+ if (!ctx)
+ goto out;
+
+ ret = __inode_ctx_put(inode, this, (uint64_t)(uintptr_t)ctx);
+ if (ret) {
+ GF_FREE(ctx);
+ ctx = NULL;
+ gf_log_callingfn(this->name, GF_LOG_ERROR,
+ "failed to "
+ "set the inode ctx (%s)",
+ uuid_utoa(inode->gfid));
+ }
+out:
+ return ctx;
+}
+
+static arbiter_inode_ctx_t *
+arbiter_inode_ctx_get(inode_t *inode, xlator_t *this)
+{
+ arbiter_inode_ctx_t *ctx = NULL;
+
+ LOCK(&inode->lock);
+ {
+ ctx = __arbiter_inode_ctx_get(inode, this);
+ }
+ UNLOCK(&inode->lock);
+ return ctx;
+}
+
+int32_t
+arbiter_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ arbiter_inode_ctx_t *ctx = NULL;
+
+ if (op_ret != 0)
+ goto unwind;
+ ctx = arbiter_inode_ctx_get(inode, this);
+ if (!ctx) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ memcpy(&ctx->iattbuf, buf, sizeof(ctx->iattbuf));
+
+unwind:
+ STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+ postparent);
+ return 0;
+}
+
+int32_t
+arbiter_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ STACK_WIND(frame, arbiter_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ return 0;
+}
+
+int32_t
+arbiter_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+