-rwxr-xr-x  tests/basic/tier/tier.t                            11
-rw-r--r--  xlators/cluster/dht/src/dht-common.h               28
-rw-r--r--  xlators/cluster/dht/src/dht-rebalance.c             8
-rw-r--r--  xlators/cluster/dht/src/dht-shared.c               25
-rw-r--r--  xlators/cluster/dht/src/tier.c                    471
-rw-r--r--  xlators/cluster/dht/src/tier.h                     24
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-volume-set.c   135
7 files changed, 589 insertions, 113 deletions
diff --git a/tests/basic/tier/tier.t b/tests/basic/tier/tier.t
index 7810ff2bfd6..67927047729 100755
--- a/tests/basic/tier/tier.t
+++ b/tests/basic/tier/tier.t
@@ -150,12 +150,23 @@ TEST ! $CLI volume set $V0 cluster.tier-demote-frequency 4
TEST ! $CLI volume tier $V0 detach commit force
TEST $CLI volume tier $V0 attach replica 2 $H0:$B0/${V0}$CACHE_BRICK_FIRST $H0:$B0/${V0}$CACHE_BRICK_LAST
+
+TEST $CLI volume set $V0 cluster.tier-mode test
+
# create a file, make sure it can be deleted after attach tier.
TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
cd $M0
TEST touch delete_me.txt
TEST rm -f delete_me.txt
+# confirm watermark CLI works
+TEST $CLI volume set $V0 cluster.watermark-hi 85
+TEST $CLI volume set $V0 cluster.watermark-low 75
+TEST $CLI volume set $V0 cluster.tier-max-mb 1000
+TEST $CLI volume set $V0 cluster.tier-max-files 1000
+TEST ! $CLI volume set $V0 cluster.tier-max-files -3
+TEST ! $CLI volume set $V0 cluster.watermark-low 90
+
# stop the volume and restart it. The rebalance daemon should restart.
cd /tmp
umount $M0
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index c48bf5800b9..95ca7067806 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -332,6 +332,29 @@ struct dht_container {
dict_t *migrate_data;
};
+typedef enum tier_mode_ {
+ TIER_MODE_NONE = 0,
+ TIER_MODE_TEST,
+ TIER_MODE_WM
+} tier_mode_t;
+
+typedef struct gf_tier_conf {
+ int is_tier;
+ int watermark_hi;
+ int watermark_low;
+ int watermark_last;
+ fsblkcnt_t blocks_total;
+ fsblkcnt_t blocks_used;
+ int percent_full;
+ uint64_t max_migrate_bytes;
+ int max_migrate_files;
+ tier_mode_t mode;
+ int tier_promote_frequency;
+ int tier_demote_frequency;
+ uint64_t st_last_promoted_size;
+ uint64_t st_last_demoted_size;
+} gf_tier_conf_t;
+
struct gf_defrag_info_ {
uint64_t total_files;
uint64_t total_data;
@@ -352,8 +375,7 @@ struct gf_defrag_info_ {
gf_boolean_t stats;
uint32_t new_commit_hash;
gf_defrag_pattern_list_t *defrag_pattern;
- int tier_promote_frequency;
- int tier_demote_frequency;
+ gf_tier_conf_t tier_conf;
/*Data Tiering params for scanner*/
uint64_t total_files_promoted;
@@ -1088,5 +1110,7 @@ int32_t dht_set_local_rebalance (xlator_t *this, dht_local_t *local,
struct iatt *stbuf,
struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata);
+void
+dht_build_root_loc (inode_t *inode, loc_t *loc);
#endif/* _DHT_H */
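
The new gf_tier_conf_t above consolidates the tiering tunables (watermarks, mode, per-cycle limits) that were previously scattered in gf_defrag_info_. As a reading aid only, here is a minimal, hypothetical sketch of how the watermark fields relate to statvfs counters; the logic this patch actually adds lives in tier_check_watermark() in tier.c:

/* Illustrative sketch, not part of the patch: derive percent_full and a
 * watermark classification from statvfs counters, mirroring the fields
 * of gf_tier_conf_t and the checks added in tier_check_watermark(). */
#include <sys/statvfs.h>

enum wm_state { WM_LOW, WM_MID, WM_HI };

static enum wm_state
classify_watermark (const struct statvfs *st, int watermark_low,
                    int watermark_hi)
{
        fsblkcnt_t used;
        int percent_full;

        if (st->f_blocks == 0)
                return WM_LOW;

        used = st->f_blocks - st->f_bfree;
        percent_full = (int)((100 * used) / st->f_blocks);

        if (percent_full < watermark_low)
                return WM_LOW;          /* promote freely, no demotion */
        if (percent_full < watermark_hi)
                return WM_MID;          /* probabilistic promote/demote */
        return WM_HI;                   /* demote only */
}
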
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index c53c7a99882..7dc89d8a069 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -1326,6 +1326,14 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
}
+ /* store size of previous migrated file */
+ if (defrag->tier_conf.is_tier) {
+ if (from == conf->subvolumes[0]) {
+ defrag->tier_conf.st_last_promoted_size = stbuf.ia_size;
+ } else {
+ defrag->tier_conf.st_last_demoted_size = stbuf.ia_size;
+ }
+ }
/* The src file is being unlinked after this so we don't need
to clean it up */
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 2436eba2a0c..4d700482919 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -978,17 +978,32 @@ struct volume_options options[] = {
{ .key = {"write-freq-threshold"},
.type = GF_OPTION_TYPE_INT,
.default_value = "0",
- .description = "Defines the write fequency "
- "that would be considered hot"
},
{ .key = {"read-freq-threshold"},
.type = GF_OPTION_TYPE_INT,
.default_value = "0",
- .description = "Defines the read fequency "
- "that would be considered hot"
},
-
+ { .key = {"watermark-hi"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .default_value = "90",
+ },
+ { .key = {"watermark-low"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .default_value = "75",
+ },
+ { .key = {"tier-mode"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "test",
+ },
+ { .key = {"tier-max-mb"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "1000",
+ },
+ { .key = {"tier-max-files"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "5000",
+ },
/* switch option */
{ .key = {"pattern.switch.case"},
.type = GF_OPTION_TYPE_ANY
diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
index ff01862bed9..860b1f7da9a 100644
--- a/xlators/cluster/dht/src/tier.c
+++ b/xlators/cluster/dht/src/tier.c
@@ -114,6 +114,120 @@ out:
return ret;
}
+int
+tier_do_migration (xlator_t *this, int promote)
+{
+ gf_defrag_info_t *defrag = NULL;
+ dht_conf_t *conf = NULL;
+ long rand = 0;
+ int migrate = 0;
+ gf_tier_conf_t *tier_conf = NULL;
+
+ conf = this->private;
+ if (!conf)
+ goto exit;
+
+ defrag = conf->defrag;
+ if (!defrag)
+ goto exit;
+
+ if (defrag->tier_conf.mode != TIER_MODE_WM) {
+ migrate = 1;
+ goto exit;
+ }
+
+ tier_conf = &defrag->tier_conf;
+
+ switch (tier_conf->watermark_last) {
+ case TIER_WM_LOW:
+ migrate = promote ? 1 : 0;
+ break;
+ case TIER_WM_HI:
+ migrate = promote ? 0 : 1;
+ break;
+ case TIER_WM_MID:
+ rand = random() % 100;
+ if (promote) {
+ migrate = (rand > tier_conf->percent_full);
+ } else {
+ migrate = (rand <= tier_conf->percent_full);
+ }
+ break;
+ }
+
+exit:
+ return migrate;
+}
+
+int
+tier_check_watermark (xlator_t *this, loc_t *root_loc)
+{
+ tier_watermark_op_t wm = TIER_WM_NONE;
+ int ret = -1;
+ gf_defrag_info_t *defrag = NULL;
+ dht_conf_t *conf = NULL;
+ dict_t *xdata = NULL;
+ struct statvfs statfs = {0, };
+ gf_tier_conf_t *tier_conf = NULL;
+
+ conf = this->private;
+ if (!conf)
+ goto exit;
+
+ defrag = conf->defrag;
+ if (!defrag)
+ goto exit;
+
+ tier_conf = &defrag->tier_conf;
+
+ if (tier_conf->mode != TIER_MODE_WM) {
+ ret = 0;
+ goto exit;
+ }
+
+ /* Find how much free space is on the hot subvolume. Then see if that value */
+ /* is less than or greater than user defined watermarks. Stash results in */
+ /* the tier_conf data structure. */
+ ret = syncop_statfs (conf->subvolumes[1], root_loc, &statfs,
+ xdata, NULL);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_LOG_TIER_STATUS,
+ "Unable to obtain statfs.");
+ goto exit;
+ }
+
+ pthread_mutex_lock (&dm_stat_mutex);
+
+ tier_conf->blocks_total = statfs.f_blocks;
+ tier_conf->blocks_used = statfs.f_blocks - statfs.f_bfree;
+
+ tier_conf->percent_full = (100 * tier_conf->blocks_used) /
+ statfs.f_blocks;
+ pthread_mutex_unlock (&dm_stat_mutex);
+
+ if (tier_conf->percent_full < tier_conf->watermark_low) {
+ wm = TIER_WM_LOW;
+
+ } else if (tier_conf->percent_full < tier_conf->watermark_hi) {
+ wm = TIER_WM_MID;
+
+ } else {
+ wm = TIER_WM_HI;
+ }
+
+ if (wm != tier_conf->watermark_last) {
+
+ tier_conf->watermark_last = wm;
+ gf_msg (this->name, GF_LOG_INFO, 0,
+ DHT_MSG_LOG_TIER_STATUS,
+ "Tier watermark now %d", wm);
+ }
+
+exit:
+ return ret;
+}
+
static int
tier_migrate_using_query_file (void *_args)
{
@@ -141,6 +255,8 @@ tier_migrate_using_query_file (void *_args)
char *link_str = NULL;
xlator_t *src_subvol = NULL;
dht_conf_t *conf = NULL;
+ uint64_t total_migrated_bytes = 0;
+ int total_files = 0;
GF_VALIDATE_OR_GOTO ("tier", query_cbk_args, out);
GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->this, out);
@@ -155,14 +271,20 @@ tier_migrate_using_query_file (void *_args)
queryFILE = query_cbk_args->queryFILE;
- query_record = gfdb_query_record_init();
+ query_record = gfdb_query_record_init ();
if (!query_record) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR,
+ "Call to gfdb_query_record_init() failed.");
goto out;
}
query_record->_link_info_str = GF_CALLOC (1, DB_QUERY_RECORD_SIZE,
gf_common_mt_char);
if (!query_record->_link_info_str) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR,
+ "Allocating query record link info string failed.");
goto out;
}
link_buffer = query_record->_link_info_str;
@@ -191,13 +313,14 @@ tier_migrate_using_query_file (void *_args)
continue;
}
+ if (!tier_do_migration (this, query_cbk_args->is_promotion))
+ continue;
+
gf_uuid_parse (gfid_str, query_record->gfid);
- if (dict_get(migrate_data, GF_XATTR_FILE_MIGRATE_KEY))
- dict_del(migrate_data, GF_XATTR_FILE_MIGRATE_KEY);
+ dict_del (migrate_data, GF_XATTR_FILE_MIGRATE_KEY);
- if (dict_get(migrate_data, "from.migrator"))
- dict_del(migrate_data, "from.migrator");
+ dict_del (migrate_data, "from.migrator");
token_str = strtok (link_buffer, delimiter);
if (token_str != NULL) {
@@ -235,6 +358,7 @@ tier_migrate_using_query_file (void *_args)
}
per_link_status = 0;
+
/* Per link of file */
while (token_str != NULL) {
@@ -270,9 +394,9 @@ tier_migrate_using_query_file (void *_args)
ret = syncop_lookup (this, &p_loc, &par_stbuf, NULL,
NULL, NULL);
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
+ gf_msg (this->name, GF_LOG_ERROR, -ret,
DHT_MSG_LOG_TIER_ERROR,
- " ERROR in parent lookup\n");
+ " Error in parent lookup\n");
per_link_status = -1;
goto abort;
}
@@ -284,7 +408,7 @@ tier_migrate_using_query_file (void *_args)
gf_uuid_copy (loc.gfid, query_record->gfid);
loc.inode = inode_new (defrag->root_inode->table);
gf_uuid_copy (loc.pargfid, link_info->pargfid);
- loc.parent = inode_ref(p_loc.inode);
+ loc.parent = inode_ref (p_loc.inode);
loc.name = gf_strdup (link_info->file_name);
if (!loc.name) {
@@ -325,7 +449,10 @@ tier_migrate_using_query_file (void *_args)
* should be. It means another brick moved the file
* so is not an error.
*/
- src_subvol = dht_subvol_get_cached(this, loc.inode);
+ src_subvol = dht_subvol_get_cached (this, loc.inode);
+
+ if (src_subvol == NULL)
+ goto abort;
if (query_cbk_args->is_promotion &&
src_subvol == conf->subvolumes[1]) {
@@ -363,18 +490,48 @@ tier_migrate_using_query_file (void *_args)
goto abort;
}
- if (query_cbk_args->is_promotion)
+ if (query_cbk_args->is_promotion) {
defrag->total_files_promoted++;
- else
+ total_migrated_bytes +=
+ defrag->tier_conf.st_last_promoted_size;
+ pthread_mutex_lock (&dm_stat_mutex);
+ defrag->tier_conf.blocks_used +=
+ defrag->tier_conf.st_last_promoted_size;
+ pthread_mutex_unlock (&dm_stat_mutex);
+ } else {
defrag->total_files_demoted++;
+ total_migrated_bytes +=
+ defrag->tier_conf.st_last_demoted_size;
+ pthread_mutex_lock (&dm_stat_mutex);
+ defrag->tier_conf.blocks_used -=
+ defrag->tier_conf.st_last_demoted_size;
+ pthread_mutex_unlock (&dm_stat_mutex);
+ }
+ if (defrag->tier_conf.blocks_total) {
+ pthread_mutex_lock (&dm_stat_mutex);
+ defrag->tier_conf.percent_full =
+ (100 * defrag->tier_conf.blocks_used) /
+ defrag->tier_conf.blocks_total;
+ pthread_mutex_unlock (&dm_stat_mutex);
+ }
abort:
-
loc_wipe(&loc);
loc_wipe(&p_loc);
token_str = NULL;
token_str = strtok (NULL, delimiter);
GF_FREE (link_str);
+
+ if ((++total_files > defrag->tier_conf.max_migrate_files) ||
+ (total_migrated_bytes > defrag->tier_conf.max_migrate_bytes)) {
+ gf_msg (this->name, GF_LOG_INFO, 0,
+ DHT_MSG_LOG_TIER_STATUS,
+ "Reached cycle migration limit."
+ "migrated bytes %"PRId64" files %d",
+ total_migrated_bytes,
+ total_files);
+ goto out;
+ }
}
per_file_status = per_link_status;
per_file_out:
@@ -417,7 +574,7 @@ tier_gf_query_callback (gfdb_query_record_t *gfdb_query_record,
GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->queryFILE, out);
gf_uuid_unparse (gfdb_query_record->gfid, gfid_str);
- fprintf (query_cbk_args->queryFILE, "%s|%s|%ld\n", gfid_str,
+ fprintf (query_cbk_args->queryFILE, "%s|%s|%zd\n", gfid_str,
gfdb_query_record->_link_info_str,
gfdb_query_record->link_info_size);
@@ -435,7 +592,7 @@ out:
/*Create query file in tier process*/
static int
-tier_process_self_query (brick_list_t *local_brick, void *args)
+tier_process_self_query (tier_brick_list_t *local_brick, void *args)
{
int ret = -1;
char *db_path = NULL;
@@ -477,7 +634,7 @@ tier_process_self_query (brick_list_t *local_brick, void *args)
db_path, ret, out);
/*Get the db connection*/
- conn_node = gfdb_methods.init_db((void *)params_dict, dht_tier_db_type);
+ conn_node = gfdb_methods.init_db ((void *)params_dict, dht_tier_db_type);
if (!conn_node) {
gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_LOG_TIER_ERROR,
@@ -486,8 +643,8 @@ tier_process_self_query (brick_list_t *local_brick, void *args)
}
/*Query for eligible files from db*/
- query_cbk_args->queryFILE = fopen(GET_QFILE_PATH
- (gfdb_brick_dict_info->_gfdb_promote), "a+");
+ query_cbk_args->queryFILE = fopen (
+ GET_QFILE_PATH (gfdb_brick_dict_info->_gfdb_promote), "a+");
if (!query_cbk_args->queryFILE) {
gf_msg (this->name, GF_LOG_ERROR, errno,
DHT_MSG_LOG_TIER_ERROR,
@@ -592,7 +749,7 @@ out:
/*Ask CTR to create the query file*/
static int
-tier_process_ctr_query (brick_list_t *local_brick, void *args)
+tier_process_ctr_query (tier_brick_list_t *local_brick, void *args)
{
int ret = -1;
query_cbk_args_t *query_cbk_args = NULL;
@@ -720,7 +877,7 @@ out:
* It picks up each bricks db and queries for eligible files for migration.
* The list of eligible files are populated in appropriate query files*/
static int
-tier_process_brick (brick_list_t *local_brick, void *args) {
+tier_process_brick (tier_brick_list_t *local_brick, void *args) {
int ret = -1;
dict_t *ctr_ipc_in_dict = NULL;
dict_t *ctr_ipc_out_dict = NULL;
@@ -834,7 +991,7 @@ tier_build_migration_qfile (demotion_args_t *args,
_gfdb_brick_dict_info_t gfdb_brick_dict_info;
gfdb_time_t time_in_past;
int ret = -1;
- brick_list_t *local_brick = NULL;
+ tier_brick_list_t *local_brick = NULL;
/*
* The first time this function is called, query file will
@@ -929,8 +1086,8 @@ tier_demote (void *args)
query_cbk_args.is_promotion = 0;
/*Build the query file using bricklist*/
- ret = tier_build_migration_qfile(demotion_args, &query_cbk_args,
- _gf_false);
+ ret = tier_build_migration_qfile (demotion_args, &query_cbk_args,
+ _gf_false);
if (ret)
goto out;
@@ -967,8 +1124,8 @@ static void
query_cbk_args.is_promotion = 1;
/*Build the query file using bricklist*/
- ret = tier_build_migration_qfile(promotion_args, &query_cbk_args,
- _gf_true);
+ ret = tier_build_migration_qfile (promotion_args, &query_cbk_args,
+ _gf_true);
if (ret)
goto out;
@@ -994,7 +1151,7 @@ tier_get_bricklist (xlator_t *xl, struct list_head *local_bricklist_head)
char *brickname = NULL;
char db_name[PATH_MAX] = "";
int ret = 0;
- brick_list_t *local_brick = NULL;
+ tier_brick_list_t *local_brick = NULL;
GF_VALIDATE_OR_GOTO ("tier", xl, out);
GF_VALIDATE_OR_GOTO ("tier", local_bricklist_head, out);
@@ -1006,19 +1163,19 @@ tier_get_bricklist (xlator_t *xl, struct list_head *local_bricklist_head)
* those running on the same node as the tier daemon.
*/
if (strcmp(xl->type, "protocol/client") == 0) {
- ret = dict_get_str(xl->options, "remote-host", &rh);
+ ret = dict_get_str (xl->options, "remote-host", &rh);
if (ret < 0)
goto out;
- if (gf_is_local_addr (rh)) {
+ if (gf_is_local_addr (rh)) {
- local_brick = GF_CALLOC (1, sizeof(brick_list_t),
+ local_brick = GF_CALLOC (1, sizeof(tier_brick_list_t),
gf_tier_mt_bricklist_t);
if (!local_brick) {
goto out;
}
- ret = dict_get_str(xl->options, "remote-subvolume",
+ ret = dict_get_str (xl->options, "remote-subvolume",
&rv);
if (ret < 0)
goto out;
@@ -1051,7 +1208,7 @@ tier_get_bricklist (xlator_t *xl, struct list_head *local_bricklist_head)
}
for (child = xl->children; child; child = child->next) {
- ret = tier_get_bricklist(child->xlator, local_bricklist_head);
+ ret = tier_get_bricklist (child->xlator, local_bricklist_head);
if (ret) {
goto out;
}
@@ -1070,11 +1227,50 @@ out:
return ret;
}
+int
+tier_get_freq_demote (gf_tier_conf_t *tier_conf)
+{
+ if ((tier_conf->mode == TIER_MODE_WM) &&
+ (tier_conf->watermark_last == TIER_WM_HI))
+ return DEFAULT_DEMOTE_DEGRADED;
+ else
+ return tier_conf->tier_demote_frequency;
+}
+
+int
+tier_get_freq_promote (gf_tier_conf_t *tier_conf)
+{
+ return tier_conf->tier_promote_frequency;
+}
+
+static int
+tier_check_demote (gfdb_time_t current_time,
+ int freq_demote)
+{
+ return ((current_time.tv_sec % freq_demote) == 0) ?
+ _gf_true : _gf_false;
+}
+
+static gf_boolean_t
+tier_check_promote (gf_tier_conf_t *tier_conf,
+ gfdb_time_t current_time,
+ int freq_promote)
+{
+ if ((tier_conf->mode == TIER_MODE_WM) &&
+ (tier_conf->watermark_last == TIER_WM_HI))
+ return _gf_false;
+
+ else
+ return ((current_time.tv_sec % freq_promote) == 0) ?
+ _gf_true : _gf_false;
+}
+
+
void
clear_bricklist (struct list_head *brick_list)
{
- brick_list_t *local_brick = NULL;
- brick_list_t *temp = NULL;
+ tier_brick_list_t *local_brick = NULL;
+ tier_brick_list_t *temp = NULL;
if (list_empty(brick_list)) {
return;
@@ -1105,9 +1301,11 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag)
pthread_t promote_thread;
pthread_t demote_thread;
gf_boolean_t is_promotion_triggered = _gf_false;
- gf_boolean_t is_demotion_triggered = _gf_false;
- xlator_t *any = NULL;
- xlator_t *xlator = NULL;
+ gf_boolean_t is_demotion_triggered = _gf_false;
+ xlator_t *any = NULL;
+ xlator_t *xlator = NULL;
+ gf_tier_conf_t *tier_conf = NULL;
+ loc_t root_loc = { 0 };
conf = this->private;
@@ -1122,6 +1320,9 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag)
" demote %d", freq_promote, freq_demote);
defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;
+ tier_conf = &defrag->tier_conf;
+
+ dht_build_root_loc (defrag->root_inode, &root_loc);
while (1) {
@@ -1130,7 +1331,7 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag)
* thread. It will need to be restarted manually.
*/
any = THIS->ctx->active->first;
- xlator = xlator_search_by_name(any, this->name);
+ xlator = xlator_search_by_name (any, this->name);
if (xlator != this) {
gf_msg (this->name, GF_LOG_INFO, 0,
@@ -1160,10 +1361,6 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag)
goto out;
}
- freq_promote = defrag->tier_promote_frequency;
- freq_demote = defrag->tier_demote_frequency;
-
-
/* To have proper synchronization amongst all
* brick holding nodes, so that promotion and demotions
* start atomicly w.r.t promotion/demotion frequency
@@ -1178,18 +1375,29 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag)
goto out;
}
- is_demotion_triggered = ((current_time.tv_sec %
- freq_demote) == 0) ? _gf_true :
- _gf_false;
- is_promotion_triggered = ((current_time.tv_sec %
- freq_promote) == 0) ? _gf_true :
- _gf_false;
+ freq_demote = tier_get_freq_demote (tier_conf);
+
+ is_demotion_triggered = tier_check_demote (current_time,
+ freq_demote);
+
+ freq_promote = tier_get_freq_promote(tier_conf);
+
+ is_promotion_triggered = tier_check_promote (tier_conf,
+ current_time,
+ freq_promote);
/* If no promotion and no demotion is
- * scheduled/triggered skip a iteration */
+ * scheduled/triggered skip an iteration */
if (!is_promotion_triggered && !is_demotion_triggered)
continue;
+ ret = tier_check_watermark (this, &root_loc);
+ if (ret != 0) {
+ gf_msg (this->name, GF_LOG_CRITICAL, errno,
+ DHT_MSG_LOG_TIER_ERROR,
+ "Failed to get watermark");
+ goto out;
+ }
ret_promotion = -1;
ret_demotion = -1;
@@ -1297,8 +1505,8 @@ tier_migration_get_dst (xlator_t *this, dht_local_t *local)
int32_t ret = -1;
gf_defrag_info_t *defrag = NULL;
- GF_VALIDATE_OR_GOTO("tier", this, out);
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO ("tier", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
conf = this->private;
@@ -1332,10 +1540,10 @@ tier_search (xlator_t *this, dht_layout_t *layout, const char *name)
int layout_cold = 0;
int layout_hot = 1;
- GF_VALIDATE_OR_GOTO("tier", this, out);
- GF_VALIDATE_OR_GOTO(this->name, layout, out);
- GF_VALIDATE_OR_GOTO(this->name, name, out);
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO ("tier", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, layout, out);
+ GF_VALIDATE_OR_GOTO (this->name, name, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
conf = this->private;
@@ -1389,7 +1597,7 @@ tier_load_externals (xlator_t *this)
char *libpathfull = (LIBDIR "/libgfdb.so.0");
get_gfdb_methods_t get_gfdb_methods;
- GF_VALIDATE_OR_GOTO("this", this, out);
+ GF_VALIDATE_OR_GOTO ("this", this, out);
libhandle = dlopen (libpathfull, RTLD_NOW);
if (!libhandle) {
@@ -1420,6 +1628,20 @@ out:
return ret;
}
+static
+int tier_validate_mode (char *mode)
+{
+ int ret = -1;
+
+ if (strcmp (mode, "test") == 0) {
+ ret = TIER_MODE_TEST;
+ } else {
+ ret = TIER_MODE_WM;
+ }
+
+ return ret;
+}
+
int
tier_init (xlator_t *this)
{
@@ -1428,10 +1650,11 @@ tier_init (xlator_t *this)
dht_conf_t *conf = NULL;
gf_defrag_info_t *defrag = NULL;
char *voldir = NULL;
+ char *mode = NULL;
- ret = dht_init(this);
+ ret = dht_init (this);
if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
+ gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_LOG_TIER_ERROR,
"dht_init failed");
goto out;
@@ -1442,7 +1665,7 @@ tier_init (xlator_t *this)
conf->methods = &tier_methods;
if (conf->subvolume_cnt != 2) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
+ gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_LOG_TIER_ERROR,
"Invalid number of subvolumes %d", conf->subvolume_cnt);
goto out;
@@ -1455,7 +1678,7 @@ tier_init (xlator_t *this)
}
/* if instatiated from server side, load db libraries */
- ret = tier_load_externals(this);
+ ret = tier_load_externals (this);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, 0,
DHT_MSG_LOG_TIER_ERROR,
@@ -1465,13 +1688,15 @@ tier_init (xlator_t *this)
defrag = conf->defrag;
+ defrag->tier_conf.is_tier = 1;
+
ret = dict_get_int32 (this->options,
"tier-promote-frequency", &freq);
if (ret) {
freq = DEFAULT_PROMOTE_FREQ_SEC;
}
- defrag->tier_promote_frequency = freq;
+ defrag->tier_conf.tier_promote_frequency = freq;
ret = dict_get_int32 (this->options,
"tier-demote-frequency", &freq);
@@ -1479,7 +1704,23 @@ tier_init (xlator_t *this)
freq = DEFAULT_DEMOTE_FREQ_SEC;
}
- defrag->tier_demote_frequency = freq;
+ defrag->tier_conf.tier_demote_frequency = freq;
+
+ ret = dict_get_int32 (this->options,
+ "watermark-hi", &freq);
+ if (ret) {
+ freq = DEFAULT_WM_HI;
+ }
+
+ defrag->tier_conf.watermark_hi = freq;
+
+ ret = dict_get_int32 (this->options,
+ "watermark-low", &freq);
+ if (ret) {
+ freq = DEFAULT_WM_LOW;
+ }
+
+ defrag->tier_conf.watermark_low = freq;
ret = dict_get_int32 (this->options,
"write-freq-threshold", &freq);
@@ -1497,7 +1738,38 @@ tier_init (xlator_t *this)
defrag->read_freq_threshold = freq;
- ret = gf_asprintf(&voldir, "%s/%s",
+ ret = dict_get_int32 (this->options,
+ "tier-max-mb", &freq);
+ if (ret) {
+ freq = DEFAULT_TIER_MAX_MIGRATE_MB;
+ }
+
+ defrag->tier_conf.max_migrate_bytes = freq * 1024 * 1024;
+
+ ret = dict_get_int32 (this->options,
+ "tier-max-files", &freq);
+ if (ret) {
+ freq = DEFAULT_TIER_MAX_MIGRATE_FILES;
+ }
+
+ defrag->tier_conf.max_migrate_files = freq;
+
+ ret = dict_get_str (this->options,
+ "tier-mode", &mode);
+ if (ret) {
+ defrag->tier_conf.mode = DEFAULT_TIER_MODE;
+ } else {
+ ret = tier_validate_mode (mode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR,
+ "tier_init failed - invalid mode");
+ goto out;
+ }
+ defrag->tier_conf.mode = ret;
+ }
+
+ ret = gf_asprintf (&voldir, "%s/%s",
DEFAULT_VAR_RUN_DIRECTORY,
this->name);
if (ret < 0)
@@ -1505,7 +1777,7 @@ tier_init (xlator_t *this)
ret = mkdir_p(voldir, 0777, _gf_true);
if (ret == -1 && errno != EEXIST) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
+ gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_LOG_TIER_ERROR,
"tier_init failed");
@@ -1515,37 +1787,37 @@ tier_init (xlator_t *this)
GF_FREE(voldir);
- ret = gf_asprintf(&promotion_qfile, "%s/%s/%s-%s",
- DEFAULT_VAR_RUN_DIRECTORY,
- this->name,
- PROMOTION_QFILE,
- this->name);
+ ret = gf_asprintf (&promotion_qfile, "%s/%s/%s-%s",
+ DEFAULT_VAR_RUN_DIRECTORY,
+ this->name,
+ PROMOTION_QFILE,
+ this->name);
if (ret < 0)
goto out;
- ret = gf_asprintf(&demotion_qfile, "%s/%s/%s-%s",
- DEFAULT_VAR_RUN_DIRECTORY,
- this->name,
- DEMOTION_QFILE,
- this->name);
+ ret = gf_asprintf (&demotion_qfile, "%s/%s/%s-%s",
+ DEFAULT_VAR_RUN_DIRECTORY,
+ this->name,
+ DEMOTION_QFILE,
+ this->name);
if (ret < 0) {
- GF_FREE(promotion_qfile);
+ GF_FREE (promotion_qfile);
goto out;
}
- unlink(promotion_qfile);
- unlink(demotion_qfile);
+ unlink (promotion_qfile);
+ unlink (demotion_qfile);
- gf_msg(this->name, GF_LOG_INFO, 0,
- DHT_MSG_LOG_TIER_STATUS,
+ gf_msg (this->name, GF_LOG_INFO, 0,
+ DHT_MSG_LOG_TIER_STATUS,
"Promote/demote frequency %d/%d "
"Write/Read freq thresholds %d/%d",
- defrag->tier_promote_frequency,
- defrag->tier_demote_frequency,
+ defrag->tier_conf.tier_promote_frequency,
+ defrag->tier_conf.tier_demote_frequency,
defrag->write_freq_threshold,
defrag->read_freq_threshold);
- gf_msg(this->name, GF_LOG_INFO, 0,
+ gf_msg (this->name, GF_LOG_INFO, 0,
DHT_MSG_LOG_TIER_STATUS,
"Promote file %s demote file %s",
promotion_qfile, demotion_qfile);
@@ -1563,18 +1835,19 @@ tier_reconfigure (xlator_t *this, dict_t *options)
{
dht_conf_t *conf = NULL;
gf_defrag_info_t *defrag = NULL;
-
+ char *mode = NULL;
+ int migrate_mb = 0;
conf = this->private;
if (conf->defrag) {
defrag = conf->defrag;
GF_OPTION_RECONF ("tier-promote-frequency",
- defrag->tier_promote_frequency, options,
- int32, out);
+ defrag->tier_conf.tier_promote_frequency,
+ options, int32, out);
GF_OPTION_RECONF ("tier-demote-frequency",
- defrag->tier_demote_frequency, options,
- int32, out);
+ defrag->tier_conf.tier_demote_frequency,
+ options, int32, out);
GF_OPTION_RECONF ("write-freq-threshold",
defrag->write_freq_threshold, options,
@@ -1583,6 +1856,28 @@ tier_reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("read-freq-threshold",
defrag->read_freq_threshold, options,
int32, out);
+
+ GF_OPTION_RECONF ("watermark-hi",
+ defrag->tier_conf.watermark_hi, options,
+ int32, out);
+
+ GF_OPTION_RECONF ("watermark-low",
+ defrag->tier_conf.watermark_low, options,
+ int32, out);
+
+ GF_OPTION_RECONF ("tier-mode",
+ mode, options,
+ str, out);
+ defrag->tier_conf.mode = tier_validate_mode (mode);
+
+ GF_OPTION_RECONF ("tier-max-mb",
+ migrate_mb, options,
+ int32, out);
+ defrag->tier_conf.max_migrate_bytes = migrate_mb*1024*1024;
+
+ GF_OPTION_RECONF ("tier-max-files",
+ defrag->tier_conf.max_migrate_files, options,
+ int32, out);
}
out:
@@ -1593,10 +1888,10 @@ void
tier_fini (xlator_t *this)
{
if (libhandle)
- dlclose(libhandle);
+ dlclose (libhandle);
- GF_FREE(demotion_qfile);
- GF_FREE(promotion_qfile);
+ GF_FREE (demotion_qfile);
+ GF_FREE (promotion_qfile);
dht_fini(this);
}
diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h
index d168221fe1d..18ca3269f8b 100644
--- a/xlators/cluster/dht/src/tier.h
+++ b/xlators/cluster/dht/src/tier.h
@@ -20,10 +20,6 @@
#include <fnmatch.h>
#include <signal.h>
-#define DEFAULT_PROMOTE_FREQ_SEC 120
-#define DEFAULT_DEMOTE_FREQ_SEC 120
-#define DEFAULT_WRITE_FREQ_SEC 0
-#define DEFAULT_READ_FREQ_SEC 0
/*
* Size of timer wheel. We would not promote or demote less
* frequently than this number.
@@ -60,7 +56,7 @@ typedef struct brick_list {
xlator_t *xlator;
char *brick_db_path;
struct list_head list;
-} brick_list_t;
+} tier_brick_list_t;
typedef struct _dm_thread_args {
xlator_t *this;
@@ -70,4 +66,22 @@ typedef struct _dm_thread_args {
int return_value;
} promotion_args_t, demotion_args_t;
+typedef enum tier_watermark_op_ {
+ TIER_WM_NONE = 0,
+ TIER_WM_LOW,
+ TIER_WM_HI,
+ TIER_WM_MID
+} tier_watermark_op_t;
+
+#define DEFAULT_PROMOTE_FREQ_SEC 120
+#define DEFAULT_DEMOTE_FREQ_SEC 120
+#define DEFAULT_DEMOTE_DEGRADED 10
+#define DEFAULT_WRITE_FREQ_SEC 0
+#define DEFAULT_READ_FREQ_SEC 0
+#define DEFAULT_WM_LOW 75
+#define DEFAULT_WM_HI 90
+#define DEFAULT_TIER_MODE TIER_MODE_TEST
+#define DEFAULT_TIER_MAX_MIGRATE_MB 1000
+#define DEFAULT_TIER_MAX_MIGRATE_FILES 5000
+
#endif
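
The new defaults in tier.h cap how much a single promotion or demotion cycle may move. A hedged sketch of the budget check these defaults feed into (the real check is added near the end of the per-link loop in tier_migrate_using_query_file(); the counter names here are hypothetical):

/* Sketch only: per-cycle migration budget implied by the new defaults.
 * Migration stops once either the byte or the file limit is exceeded. */
#include <stdint.h>

#define DEFAULT_TIER_MAX_MIGRATE_MB    1000
#define DEFAULT_TIER_MAX_MIGRATE_FILES 5000

static int
cycle_budget_exhausted (uint64_t migrated_bytes, int migrated_files)
{
        uint64_t max_bytes = (uint64_t)DEFAULT_TIER_MAX_MIGRATE_MB
                             * 1024 * 1024;

        return (migrated_files > DEFAULT_TIER_MAX_MIGRATE_FILES) ||
               (migrated_bytes > max_bytes);
}
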
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index c62f2d79c1f..8fdee165c68 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -19,6 +19,10 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
int ret = 0;
xlator_t *this = NULL;
int origin_val = -1;
+ char *current_wm_hi = NULL;
+ char *current_wm_low = NULL;
+ uint64_t wm_hi = 0;
+ uint64_t wm_low = 0;
this = THIS;
GF_ASSERT (this);
@@ -34,12 +38,20 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
goto out;
}
+ if (strstr (key, "cluster.tier-mode")) {
+ if (strcmp(value, "test") &&
+ strcmp(value, "cache")) {
+ ret = -1;
+ goto out;
+ }
+ goto out;
+ }
+
/*
- * All the volume set options for tier are expecting a positive
+ * Rest of the volume set options for tier are expecting a positive
* Integer. Change the function accordingly if this constraint is
* changed.
*/
-
ret = gf_string2int (value, &origin_val);
if (ret) {
snprintf (errstr, sizeof (errstr), "%s is not a compatible "
@@ -51,13 +63,55 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
ret = -1;
goto out;
}
+ if (strstr (key, "watermark-hi") ||
+ strstr (key, "watermark-low")) {
+ if ((origin_val < 1) || (origin_val > 99)) {
+ snprintf (errstr, sizeof (errstr), "%s is not a compatible"
+ "value. %s expects a percentage from 1-99.",
+ value, key);
+ gf_msg (this->name, GF_LOG_ERROR, EINVAL,
+ GD_MSG_INCOMPATIBLE_VALUE, "%s", errstr);
+ *op_errstr = gf_strdup (errstr);
+ ret = -1;
+ goto out;
+ }
+
+ if (strstr (key, "watermark-hi")) {
+ wm_hi = origin_val;
+ } else {
+ glusterd_volinfo_get (volinfo,
+ "cluster.watermark-hi",
+ &current_wm_hi);
+ gf_string2bytesize_uint64 (current_wm_hi,
+ &wm_hi);
+ }
- if (strstr ("cluster.tier-promote-frequency", key) ||
- strstr ("cluster.tier-demote-frequency", key)) {
+ if (strstr (key, "watermark-low")) {
+ wm_low = origin_val;
+ } else {
+ glusterd_volinfo_get (volinfo,
+ "cluster.watermark-low",
+ &current_wm_low);
+ gf_string2bytesize_uint64 (current_wm_low,
+ &wm_low);
+ }
+ if (wm_low > wm_hi) {
+ snprintf (errstr, sizeof (errstr), "lower watermark"
+ " cannot exceed upper watermark.");
+ gf_msg (this->name, GF_LOG_ERROR, EINVAL,
+ GD_MSG_INCOMPATIBLE_VALUE, "%s", errstr);
+ *op_errstr = gf_strdup (errstr);
+ ret = -1;
+ goto out;
+ }
+ } else if (strstr (key, "tier-promote-frequency") ||
+ strstr (key, "tier-max-mb") ||
+ strstr (key, "tier-max-files") ||
+ strstr (key, "tier-demote-frequency")) {
if (origin_val < 1) {
snprintf (errstr, sizeof (errstr), "%s is not a "
- "compatible value. %s expects a positive "
- "integer value.",
+ " compatible value. %s expects a positive "
+ "integer value greater than 0.",
value, key);
gf_msg (this->name, GF_LOG_ERROR, EINVAL,
GD_MSG_INCOMPATIBLE_VALUE, "%s", errstr);
@@ -65,10 +119,12 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
ret = -1;
goto out;
}
+
} else {
+ /* check write-freq-threshold and read-freq-threshold. */
if (origin_val < 0) {
snprintf (errstr, sizeof (errstr), "%s is not a "
- "compatible value. %s expects a non-negative"
+ "compatible value. %s expects a positive"
" integer value.",
value, key);
gf_msg (this->name, GF_LOG_ERROR, EINVAL,
@@ -1906,6 +1962,7 @@ struct volopt_map_entry glusterd_volopt_map[] = {
/* tier translator - global tunables */
{ .key = "cluster.write-freq-threshold",
.voltype = "cluster/tier",
+ .value = "0",
.option = "write-freq-threshold",
.op_version = GD_OP_VERSION_3_7_0,
.flags = OPT_FLAG_CLIENT_OPT,
@@ -1917,6 +1974,7 @@ struct volopt_map_entry glusterd_volopt_map[] = {
},
{ .key = "cluster.read-freq-threshold",
.voltype = "cluster/tier",
+ .value = "0",
.option = "read-freq-threshold",
.op_version = GD_OP_VERSION_3_7_0,
.flags = OPT_FLAG_CLIENT_OPT,
@@ -1928,23 +1986,74 @@ struct volopt_map_entry glusterd_volopt_map[] = {
},
{ .key = "cluster.tier-promote-frequency",
.voltype = "cluster/tier",
+ .value = "120",
.option = "tier-promote-frequency",
.op_version = GD_OP_VERSION_3_7_0,
.flags = OPT_FLAG_CLIENT_OPT,
.validate_fn = validate_tier,
- .description = "Defines how often the promotion should be triggered "
- "i.e. periodicity of promotion cycles. The value is in "
- "secs."
},
{ .key = "cluster.tier-demote-frequency",
.voltype = "cluster/tier",
+ .value = "120",
.option = "tier-demote-frequency",
.op_version = GD_OP_VERSION_3_7_0,
.flags = OPT_FLAG_CLIENT_OPT,
.validate_fn = validate_tier,
- .description = "Defines how often the demotion should be triggered "
- "i.e. periodicity of demotion cycles. The value is in "
- "secs."
+ },
+ { .key = "cluster.watermark-hi",
+ .voltype = "cluster/tier",
+ .value = "90",
+ .option = "watermark-hi",
+ .op_version = GD_OP_VERSION_3_7_6,
+ .flags = OPT_FLAG_CLIENT_OPT,
+ .validate_fn = validate_tier,
+ .description = "Upper % watermark for promotion. If hot tier fills"
+ " above this percentage, no promotion will happen and demotion will "
+ "happen with high probability."
+ },
+ { .key = "cluster.watermark-low",
+ .voltype = "cluster/tier",
+ .value = "75",
+ .option = "watermark-low",
+ .op_version = GD_OP_VERSION_3_7_6,
+ .flags = OPT_FLAG_CLIENT_OPT,
+ .validate_fn = validate_tier,
+ .description = "Lower % watermark. If hot tier is less "
+ "full than this, promotion will happen and demotion will not happen. "
+ "If greater than this, promotion/demotion will happen at a probability "
+ "relative to how full the hot tier is."
+ },
+ { .key = "cluster.tier-mode",
+ .voltype = "cluster/tier",
+ .option = "tier-mode",
+ .value = "test",
+ .op_version = GD_OP_VERSION_3_7_6,
+ .flags = OPT_FLAG_CLIENT_OPT,
+ .validate_fn = validate_tier,
+ .description = "Either 'test' or 'cache'. Test mode periodically"
+ " demotes or promotes files automatically based on access."
+ " Cache mode does so based on whether the cache is full or not,"
+ " as specified with watermarks."
+ },
+ { .key = "cluster.tier-max-mb",
+ .voltype = "cluster/tier",
+ .option = "tier-max-mb",
+ .value = "1000",
+ .op_version = GD_OP_VERSION_3_7_6,
+ .flags = OPT_FLAG_CLIENT_OPT,
+ .validate_fn = validate_tier,
+ .description = "The maximum number of MB that may be migrated"
+ " in any direction in a given cycle."
+ },
+ { .key = "cluster.tier-max-files",
+ .voltype = "cluster/tier",
+ .option = "tier-max-files",
+ .value = "5000",
+ .op_version = GD_OP_VERSION_3_7_6,
+ .flags = OPT_FLAG_CLIENT_OPT,
+ .validate_fn = validate_tier,
+ .description = "The maximum number of files that may be migrated"
+ " in any direction in a given cycle."
},
{ .key = "features.ctr-enabled",
.voltype = "features/changetimerecorder",