mirror of
https://github.com/fail0verflow/switch-linux.git
synced 2025-05-04 02:34:21 -04:00
ext4: add fsync batch tuning knobs
Add new mount options, min_batch_time and max_batch_time, which control how long the jbd2 layer should wait for additional filesystem operations to get batched with a synchronous write transaction. Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
This commit is contained in:
parent
d7cfa4684d
commit
30773840c1
7 changed files with 91 additions and 8 deletions
|
@ -283,6 +283,35 @@ delalloc (*) Deferring block allocation until write-out time.
|
||||||
nodelalloc Disable delayed allocation. Blocks are allocated
|
nodelalloc Disable delayed allocation. Blocks are allocated
|
||||||
when data is copied from user to page cache.
|
when data is copied from user to page cache.
|
||||||
|
|
||||||
|
max_batch_time=usec Maximum amount of time ext4 should wait for
|
||||||
|
additional filesystem operations to be batched
|
||||||
|
together with a synchronous write operation.
|
||||||
|
Since a synchronous write operation is going to
|
||||||
|
force a commit and then a wait for the I/O
|
||||||
|
to complete, it doesn't cost much, and can be a
|
||||||
|
huge throughput win, we wait for a small amount
|
||||||
|
of time to see if any other transactions can
|
||||||
|
piggyback on the synchronous write. The
|
||||||
|
algorithm used is designed to automatically tune
|
||||||
|
for the speed of the disk, by measuring the
|
||||||
|
amount of time (on average) that it takes to
|
||||||
|
finish committing a transaction. Call this time
|
||||||
|
the "commit time". If the time that the
|
||||||
|
transaction has been running is less than the
|
||||||
|
commit time, ext4 will try sleeping for the
|
||||||
|
commit time to see if other operations will join
|
||||||
|
the transaction. The commit time is capped by
|
||||||
|
the max_batch_time, which defaults to 15000us
|
||||||
|
(15ms). This optimization can be turned off
|
||||||
|
entirely by setting max_batch_time to 0.
|
||||||
|
|
||||||
|
min_batch_time=usec This parameter sets the commit time (as
|
||||||
|
described above) to be at least min_batch_time.
|
||||||
|
It defaults to zero microseconds. Increasing
|
||||||
|
this parameter may improve the throughput of
|
||||||
|
multi-threaded, synchronous workloads on very
|
||||||
|
fast disks, at the cost of increasing latency.
|
||||||
|
|
||||||
Data Mode
|
Data Mode
|
||||||
=========
|
=========
|
||||||
There are 3 different data modes:
|
There are 3 different data modes:
|
||||||
|
|
|
@ -328,6 +328,7 @@ struct ext4_mount_options {
|
||||||
uid_t s_resuid;
|
uid_t s_resuid;
|
||||||
gid_t s_resgid;
|
gid_t s_resgid;
|
||||||
unsigned long s_commit_interval;
|
unsigned long s_commit_interval;
|
||||||
|
u32 s_min_batch_time, s_max_batch_time;
|
||||||
#ifdef CONFIG_QUOTA
|
#ifdef CONFIG_QUOTA
|
||||||
int s_jquota_fmt;
|
int s_jquota_fmt;
|
||||||
char *s_qf_names[MAXQUOTAS];
|
char *s_qf_names[MAXQUOTAS];
|
||||||
|
@ -805,6 +806,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
|
||||||
#define EXT4_DEFM_JMODE_ORDERED 0x0040
|
#define EXT4_DEFM_JMODE_ORDERED 0x0040
|
||||||
#define EXT4_DEFM_JMODE_WBACK 0x0060
|
#define EXT4_DEFM_JMODE_WBACK 0x0060
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Default journal batch times
|
||||||
|
*/
|
||||||
|
#define EXT4_DEF_MIN_BATCH_TIME 0
|
||||||
|
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Structure of a directory entry
|
* Structure of a directory entry
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -74,6 +74,8 @@ struct ext4_sb_info {
|
||||||
struct journal_s *s_journal;
|
struct journal_s *s_journal;
|
||||||
struct list_head s_orphan;
|
struct list_head s_orphan;
|
||||||
unsigned long s_commit_interval;
|
unsigned long s_commit_interval;
|
||||||
|
u32 s_max_batch_time;
|
||||||
|
u32 s_min_batch_time;
|
||||||
struct block_device *journal_bdev;
|
struct block_device *journal_bdev;
|
||||||
#ifdef CONFIG_JBD2_DEBUG
|
#ifdef CONFIG_JBD2_DEBUG
|
||||||
struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
|
struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
|
||||||
|
|
|
@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
|
||||||
#endif
|
#endif
|
||||||
if (!test_opt(sb, RESERVATION))
|
if (!test_opt(sb, RESERVATION))
|
||||||
seq_puts(seq, ",noreservation");
|
seq_puts(seq, ",noreservation");
|
||||||
if (sbi->s_commit_interval) {
|
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
|
||||||
seq_printf(seq, ",commit=%u",
|
seq_printf(seq, ",commit=%u",
|
||||||
(unsigned) (sbi->s_commit_interval / HZ));
|
(unsigned) (sbi->s_commit_interval / HZ));
|
||||||
}
|
}
|
||||||
|
if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
|
||||||
|
seq_printf(seq, ",min_batch_time=%u",
|
||||||
|
(unsigned) sbi->s_min_batch_time);
|
||||||
|
}
|
||||||
|
if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
|
||||||
|
seq_printf(seq, ",max_batch_time=%u",
|
||||||
|
(unsigned) sbi->s_min_batch_time);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We're changing the default of barrier mount option, so
|
* We're changing the default of barrier mount option, so
|
||||||
* let's always display its mount state so it's clear what its
|
* let's always display its mount state so it's clear what its
|
||||||
|
@ -874,7 +883,8 @@ enum {
|
||||||
Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
|
Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
|
||||||
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
|
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
|
||||||
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
|
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
|
||||||
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
|
Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
|
||||||
|
Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
|
||||||
Opt_journal_checksum, Opt_journal_async_commit,
|
Opt_journal_checksum, Opt_journal_async_commit,
|
||||||
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
|
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
|
||||||
Opt_data_err_abort, Opt_data_err_ignore,
|
Opt_data_err_abort, Opt_data_err_ignore,
|
||||||
|
@ -913,6 +923,8 @@ static const match_table_t tokens = {
|
||||||
{Opt_nobh, "nobh"},
|
{Opt_nobh, "nobh"},
|
||||||
{Opt_bh, "bh"},
|
{Opt_bh, "bh"},
|
||||||
{Opt_commit, "commit=%u"},
|
{Opt_commit, "commit=%u"},
|
||||||
|
{Opt_min_batch_time, "min_batch_time=%u"},
|
||||||
|
{Opt_max_batch_time, "max_batch_time=%u"},
|
||||||
{Opt_journal_update, "journal=update"},
|
{Opt_journal_update, "journal=update"},
|
||||||
{Opt_journal_inum, "journal=%u"},
|
{Opt_journal_inum, "journal=%u"},
|
||||||
{Opt_journal_dev, "journal_dev=%u"},
|
{Opt_journal_dev, "journal_dev=%u"},
|
||||||
|
@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb,
|
||||||
option = JBD2_DEFAULT_MAX_COMMIT_AGE;
|
option = JBD2_DEFAULT_MAX_COMMIT_AGE;
|
||||||
sbi->s_commit_interval = HZ * option;
|
sbi->s_commit_interval = HZ * option;
|
||||||
break;
|
break;
|
||||||
|
case Opt_max_batch_time:
|
||||||
|
if (match_int(&args[0], &option))
|
||||||
|
return 0;
|
||||||
|
if (option < 0)
|
||||||
|
return 0;
|
||||||
|
if (option == 0)
|
||||||
|
option = EXT4_DEF_MAX_BATCH_TIME;
|
||||||
|
sbi->s_max_batch_time = option;
|
||||||
|
break;
|
||||||
|
case Opt_min_batch_time:
|
||||||
|
if (match_int(&args[0], &option))
|
||||||
|
return 0;
|
||||||
|
if (option < 0)
|
||||||
|
return 0;
|
||||||
|
sbi->s_min_batch_time = option;
|
||||||
|
break;
|
||||||
case Opt_data_journal:
|
case Opt_data_journal:
|
||||||
data_opt = EXT4_MOUNT_JOURNAL_DATA;
|
data_opt = EXT4_MOUNT_JOURNAL_DATA;
|
||||||
goto datacheck;
|
goto datacheck;
|
||||||
|
@ -1979,6 +2007,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
|
||||||
|
|
||||||
sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
|
sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
|
||||||
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
|
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
|
||||||
|
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
|
||||||
|
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
|
||||||
|
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
|
||||||
|
|
||||||
set_opt(sbi->s_mount_opt, RESERVATION);
|
set_opt(sbi->s_mount_opt, RESERVATION);
|
||||||
set_opt(sbi->s_mount_opt, BARRIER);
|
set_opt(sbi->s_mount_opt, BARRIER);
|
||||||
|
@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
|
||||||
{
|
{
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
||||||
|
|
||||||
if (sbi->s_commit_interval)
|
journal->j_commit_interval = sbi->s_commit_interval;
|
||||||
journal->j_commit_interval = sbi->s_commit_interval;
|
journal->j_min_batch_time = sbi->s_min_batch_time;
|
||||||
/* We could also set up an ext4-specific default for the commit
|
journal->j_max_batch_time = sbi->s_max_batch_time;
|
||||||
* interval here, but for now we'll just fall back to the jbd
|
|
||||||
* default. */
|
|
||||||
|
|
||||||
spin_lock(&journal->j_state_lock);
|
spin_lock(&journal->j_state_lock);
|
||||||
if (test_opt(sb, BARRIER))
|
if (test_opt(sb, BARRIER))
|
||||||
|
@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
|
||||||
old_opts.s_resuid = sbi->s_resuid;
|
old_opts.s_resuid = sbi->s_resuid;
|
||||||
old_opts.s_resgid = sbi->s_resgid;
|
old_opts.s_resgid = sbi->s_resgid;
|
||||||
old_opts.s_commit_interval = sbi->s_commit_interval;
|
old_opts.s_commit_interval = sbi->s_commit_interval;
|
||||||
|
old_opts.s_min_batch_time = sbi->s_min_batch_time;
|
||||||
|
old_opts.s_max_batch_time = sbi->s_max_batch_time;
|
||||||
#ifdef CONFIG_QUOTA
|
#ifdef CONFIG_QUOTA
|
||||||
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
|
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
|
||||||
for (i = 0; i < MAXQUOTAS; i++)
|
for (i = 0; i < MAXQUOTAS; i++)
|
||||||
|
@ -3178,6 +3209,8 @@ restore_opts:
|
||||||
sbi->s_resuid = old_opts.s_resuid;
|
sbi->s_resuid = old_opts.s_resuid;
|
||||||
sbi->s_resgid = old_opts.s_resgid;
|
sbi->s_resgid = old_opts.s_resgid;
|
||||||
sbi->s_commit_interval = old_opts.s_commit_interval;
|
sbi->s_commit_interval = old_opts.s_commit_interval;
|
||||||
|
sbi->s_min_batch_time = old_opts.s_min_batch_time;
|
||||||
|
sbi->s_max_batch_time = old_opts.s_max_batch_time;
|
||||||
#ifdef CONFIG_QUOTA
|
#ifdef CONFIG_QUOTA
|
||||||
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
|
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
|
||||||
for (i = 0; i < MAXQUOTAS; i++) {
|
for (i = 0; i < MAXQUOTAS; i++) {
|
||||||
|
|
|
@ -964,6 +964,8 @@ static journal_t * journal_init_common (void)
|
||||||
spin_lock_init(&journal->j_state_lock);
|
spin_lock_init(&journal->j_state_lock);
|
||||||
|
|
||||||
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
|
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
|
||||||
|
journal->j_min_batch_time = 0;
|
||||||
|
journal->j_max_batch_time = 15000; /* 15ms */
|
||||||
|
|
||||||
/* The journal is marked for error until we succeed with recovery! */
|
/* The journal is marked for error until we succeed with recovery! */
|
||||||
journal->j_flags = JBD2_ABORT;
|
journal->j_flags = JBD2_ABORT;
|
||||||
|
|
|
@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle)
|
||||||
trans_time = ktime_to_ns(ktime_sub(ktime_get(),
|
trans_time = ktime_to_ns(ktime_sub(ktime_get(),
|
||||||
transaction->t_start_time));
|
transaction->t_start_time));
|
||||||
|
|
||||||
|
commit_time = max_t(u64, commit_time,
|
||||||
|
1000*journal->j_min_batch_time);
|
||||||
commit_time = min_t(u64, commit_time,
|
commit_time = min_t(u64, commit_time,
|
||||||
1000*jiffies_to_usecs(1));
|
1000*journal->j_max_batch_time);
|
||||||
|
|
||||||
if (trans_time < commit_time) {
|
if (trans_time < commit_time) {
|
||||||
ktime_t expires = ktime_add_ns(ktime_get(),
|
ktime_t expires = ktime_add_ns(ktime_get(),
|
||||||
|
|
|
@ -956,6 +956,14 @@ struct journal_s
|
||||||
*/
|
*/
|
||||||
u64 j_average_commit_time;
|
u64 j_average_commit_time;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* minimum and maximum times that we should wait for
|
||||||
|
* additional filesystem operations to get batched into a
|
||||||
|
* synchronous handle in microseconds
|
||||||
|
*/
|
||||||
|
u32 j_min_batch_time;
|
||||||
|
u32 j_max_batch_time;
|
||||||
|
|
||||||
/* This function is called when a transaction is closed */
|
/* This function is called when a transaction is closed */
|
||||||
void (*j_commit_callback)(journal_t *,
|
void (*j_commit_callback)(journal_t *,
|
||||||
transaction_t *);
|
transaction_t *);
|
||||||
|
|
Loading…
Add table
Reference in a new issue