Compare commits

...

5 Commits

Author SHA1 Message Date
Ondřej Surý
4483abc880 Use sync_file_range() if available
When rewriting the transaction and journal headers, instead of running
fdatasync() on the whole file use sync_file_range() if available.
2024-11-27 18:55:33 +01:00
Ondřej Surý
d097f61706 Disable buffered IO in isc_stdio API
Instead of doing excessive flushes when using isc_stdio API, disable the
buffered IO for the FILE * pointer completely and disable the guts of
the isc_stdio_flush() function as there are no buffered data now.
2024-11-27 18:55:33 +01:00
Ondřej Surý
f316e9fee7 Change isc_stdio_sync() to use fdatasync()
The isc_stdio_sync() would call fsync() and have an extra check that
sync is called only over regular files.  Remove the extra check and use
fdatasync() instead of fsync() to lighten the load.
2024-11-27 18:55:33 +01:00
Ondřej Surý
0786b78ca9 Flush the new journal at very end in dns_journal_compact()
When compacting journal, we don't need to flush/fsync the new journal
continuously, but only at the very end before we close and rename the
journal.

Closes: #3556
2024-11-27 18:55:33 +01:00
Ondřej Surý
47ffd0884c Then rename(2) is guaranteed to succeed
As we don't support Windows anymore, we can rely on the fact that rename
on POSIX guarantees that the newpath will be atomically replaced.
2024-11-27 18:55:33 +01:00
4 changed files with 76 additions and 132 deletions

View File

@@ -418,6 +418,11 @@ AS_CASE([$host],
AC_CHECK_FUNCS([sysctlbyname])
#
# Check for sync_file_range
#
AC_CHECK_FUNCS([sync_file_range])
AC_TYPE_SIZE_T
AC_TYPE_SSIZE_T
AC_TYPE_UINTPTR_T

View File

@@ -80,24 +80,13 @@
* Miscellaneous utilities.
*/
/*%
 * Store 'code' in the local variable 'result' and jump to the local label
 * 'failure' unless 'code' is ISC_R_SUCCESS.  The expanding function must
 * therefore declare 'result' and provide a 'failure:' label.
 *
 * It would be nonsensical (or at least obtuse) to use FAIL() with an
 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
 * from complaining about "end-of-loop code not reached".
 */
#define FAIL(code) \
do { \
result = (code); \
if (result != ISC_R_SUCCESS) \
goto failure; \
} while (0)
/*%
 * Evaluate 'op' once, store its value in the local variable 'result', and
 * jump to the local label 'failure' unless the value is ISC_R_SUCCESS.
 * The do { ... } while (0) wrapper makes 'CHECK(x);' a single statement.
 */
#define CHECK(op) \
do { \
result = (op); \
if (result != ISC_R_SUCCESS) \
goto failure; \
} while (0)
/*%
 * Evaluate 'op' once, store its value in the local variable 'result', and
 * jump to the local label 'failure' unless the value is ISC_R_SUCCESS.
 * The expanding function must declare 'result' and provide a 'failure:'
 * label.
 *
 * The body is wrapped in do { ... } while (0) rather than a bare block so
 * that 'CHECK(x);' is exactly one statement; a bare '{ ... }' followed by
 * ';' is two statements and breaks an unbraced 'if (...) CHECK(x); else'.
 */
#define CHECK(op)                              \
	do {                                   \
		result = (op);                 \
		if (result != ISC_R_SUCCESS) { \
			goto failure;          \
		}                              \
	} while (0)
#define JOURNAL_SERIALSET 0x01U
@@ -453,27 +442,6 @@ journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
return ISC_R_SUCCESS;
}
/*
 * Flush the stdio stream of journal 'j' and then sync it to disk.
 *
 * Calls isc_stdio_flush() followed by isc_stdio_sync() on j->fp.  If either
 * step fails, the failure is logged at ISC_LOG_ERROR together with the
 * journal file name and the text of the underlying result, and
 * ISC_R_UNEXPECTED is returned in place of that underlying result.
 * Returns ISC_R_SUCCESS when both steps succeed.
 */
static isc_result_t
journal_fsync(dns_journal_t *j) {
isc_result_t result;
/* Step 1: flush the stream via isc_stdio_flush(). */
result = isc_stdio_flush(j->fp);
if (result != ISC_R_SUCCESS) {
isc_log_write(DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL,
ISC_LOG_ERROR, "%s: flush: %s", j->filename,
isc_result_totext(result));
return ISC_R_UNEXPECTED;
}
/* Step 2: sync the stream via isc_stdio_sync(). */
result = isc_stdio_sync(j->fp);
if (result != ISC_R_SUCCESS) {
isc_log_write(DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL,
ISC_LOG_ERROR, "%s: fsync: %s", j->filename,
isc_result_totext(result));
return ISC_R_UNEXPECTED;
}
return ISC_R_SUCCESS;
}
/*
* Read/write a transaction header at the current file position.
*/
@@ -641,14 +609,14 @@ journal_open(isc_mem_t *mctx, const char *filename, bool writable, bool create,
*/
result = isc_stdio_open(j->filename, "rb+", &fp);
} else {
FAIL(ISC_R_NOTFOUND);
CHECK(ISC_R_NOTFOUND);
}
}
if (result != ISC_R_SUCCESS) {
isc_log_write(DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL,
ISC_LOG_ERROR, "%s: open: %s", j->filename,
isc_result_totext(result));
FAIL(ISC_R_UNEXPECTED);
CHECK(ISC_R_UNEXPECTED);
}
j->fp = fp;
@@ -687,7 +655,7 @@ journal_open(isc_mem_t *mctx, const char *filename, bool writable, bool create,
isc_log_write(DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL,
ISC_LOG_ERROR,
"%s: journal format not recognized", j->filename);
FAIL(ISC_R_UNEXPECTED);
CHECK(ISC_R_UNEXPECTED);
}
journal_header_decode(&rawheader, &j->header);
@@ -1296,11 +1264,11 @@ dns_journal_commit(dns_journal_t *j) {
* Just write out an updated header.
*/
if (j->state == JOURNAL_STATE_INLINE) {
CHECK(journal_fsync(j));
CHECK(isc_stdio_sync(j->fp));
journal_header_encode(&j->header, &rawheader);
CHECK(journal_seek(j, 0));
CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
CHECK(journal_fsync(j));
CHECK(isc_stdio_sync_range(j->fp, 0, sizeof(rawheader)));
j->state = JOURNAL_STATE_WRITE;
return ISC_R_SUCCESS;
}
@@ -1369,11 +1337,6 @@ dns_journal_commit(dns_journal_t *j) {
}
#endif /* ifdef notyet */
/*
* Commit the transaction data to stable storage.
*/
CHECK(journal_fsync(j));
if (j->state == JOURNAL_STATE_TRANSACTION) {
off_t offset;
offset = (j->x.pos[1].offset - j->x.pos[0].offset) -
@@ -1388,6 +1351,11 @@ dns_journal_commit(dns_journal_t *j) {
j->x.pos[1].serial));
}
/*
* Commit the transaction data to stable storage.
*/
CHECK(isc_stdio_sync(j->fp));
/*
* Update the journal header.
*/
@@ -1398,6 +1366,7 @@ dns_journal_commit(dns_journal_t *j) {
journal_header_encode(&j->header, &rawheader);
CHECK(journal_seek(j, 0));
CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
CHECK(isc_stdio_sync_range(j->fp, 0, sizeof(rawheader)));
/*
* Update the index.
@@ -1413,7 +1382,6 @@ dns_journal_commit(dns_journal_t *j) {
/*
* Commit the header to stable storage.
*/
CHECK(journal_fsync(j));
/*
* We no longer have a transaction open.
@@ -1573,7 +1541,7 @@ dns_journal_rollforward(dns_journal_t *j, dns_db_t *db, unsigned int options) {
"%s: journal file corrupt: missing "
"initial SOA",
j->filename);
FAIL(ISC_R_UNEXPECTED);
CHECK(ISC_R_UNEXPECTED);
}
if ((options & DNS_JOURNALOPT_RESIGN) != 0) {
op = (n_soa == 1) ? DNS_DIFFOP_DELRESIGN
@@ -1719,7 +1687,7 @@ dns_journal_print(isc_mem_t *mctx, uint32_t flags, const char *filename,
"%s: journal file corrupt: missing "
"initial SOA",
j->filename);
FAIL(ISC_R_UNEXPECTED);
CHECK(ISC_R_UNEXPECTED);
}
if (print) {
@@ -1990,7 +1958,7 @@ read_one_rr(dns_journal_t *j) {
DNS_LOGMODULE_JOURNAL, ISC_LOG_ERROR,
"%s: journal corrupt: empty transaction",
j->filename);
FAIL(ISC_R_UNEXPECTED);
CHECK(ISC_R_UNEXPECTED);
}
if (j->header_ver1) {
@@ -2007,7 +1975,7 @@ read_one_rr(dns_journal_t *j) {
"expected serial %u, got %u",
j->filename, j->it.current_serial,
xhdr.serial0);
FAIL(ISC_R_UNEXPECTED);
CHECK(ISC_R_UNEXPECTED);
}
j->it.xsize = xhdr.size;
@@ -2030,7 +1998,7 @@ read_one_rr(dns_journal_t *j) {
"%s: journal corrupt: impossible RR size "
"(%d bytes)",
j->filename, rrhdr.size);
FAIL(ISC_R_UNEXPECTED);
CHECK(ISC_R_UNEXPECTED);
}
CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
@@ -2059,7 +2027,7 @@ read_one_rr(dns_journal_t *j) {
* Check that the RR header is there, and parse it.
*/
if (isc_buffer_remaininglength(&j->it.source) < 10) {
FAIL(DNS_R_FORMERR);
CHECK(DNS_R_FORMERR);
}
rdtype = isc_buffer_getuint16(&j->it.source);
@@ -2073,14 +2041,14 @@ read_one_rr(dns_journal_t *j) {
"%s: journal corrupt: impossible rdlen "
"(%u bytes)",
j->filename, rdlen);
FAIL(ISC_R_FAILURE);
CHECK(ISC_R_FAILURE);
}
/*
* Parse the rdata.
*/
if (isc_buffer_remaininglength(&j->it.source) != rdlen) {
FAIL(DNS_R_FORMERR);
CHECK(DNS_R_FORMERR);
}
isc_buffer_setactive(&j->it.source, rdlen);
dns_rdata_reset(&j->it.rdata);
@@ -2359,10 +2327,10 @@ diff_namespace(dns_db_t *dba, dns_dbversion_t *dbvera, dns_db_t *dbb,
next:;
}
if (itresult[0] != ISC_R_NOMORE) {
FAIL(itresult[0]);
CHECK(itresult[0]);
}
if (itresult[1] != ISC_R_NOMORE) {
FAIL(itresult[1]);
CHECK(itresult[1]);
}
INSIST(ISC_LIST_EMPTY(diff[0].tuples));
@@ -2488,8 +2456,6 @@ dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
isc_result_t result;
unsigned int indexend;
char newname[PATH_MAX];
char backup[PATH_MAX];
bool is_backup = false;
bool rewrite = false;
bool downgrade = false;
@@ -2504,15 +2470,7 @@ dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
filename);
RUNTIME_CHECK(result < sizeof(newname));
result = snprintf(backup, sizeof(backup), "%.*s.jbk", (int)namelen,
filename);
RUNTIME_CHECK(result < sizeof(backup));
result = journal_open(mctx, filename, false, false, false, &j1);
if (result == ISC_R_NOTFOUND) {
is_backup = true;
result = journal_open(mctx, backup, false, false, false, &j1);
}
if (result != ISC_R_SUCCESS) {
return result;
}
@@ -2761,15 +2719,12 @@ dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
j2->header.end.offset = indexend + len;
}
CHECK(journal_fsync(j2));
/*
* Update the journal header.
*/
journal_header_encode(&j2->header, &rawheader);
CHECK(journal_seek(j2, 0));
CHECK(journal_write(j2, &rawheader, sizeof(rawheader)));
CHECK(journal_fsync(j2));
/*
* Build new index.
@@ -2784,12 +2739,13 @@ dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
* Write index.
*/
CHECK(index_to_disk(j2));
CHECK(journal_fsync(j2));
indexend = j2->header.end.offset;
POST(indexend);
}
CHECK(isc_stdio_sync(j2->fp));
/*
* Close both journals before trying to rename files.
*/
@@ -2797,36 +2753,13 @@ dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
dns_journal_destroy(&j2);
/*
* With a UFS file system this should just succeed and be atomic.
* With a POSIX file system this should just succeed and be atomic.
* Any IXFR outs will just continue and the old journal will be
* removed on final close.
*
* With MSDOS / NTFS we need to do a two stage rename, triggered
* by EEXIST. (If any IXFR's are running in other threads, however,
* this will fail, and the journal will not be compacted. But
* if so, hopefully they'll be finished by the next time we
* compact.)
*/
if (rename(newname, filename) == -1) {
if (errno == EEXIST && !is_backup) {
result = isc_file_remove(backup);
if (result != ISC_R_SUCCESS &&
result != ISC_R_FILENOTFOUND)
{
goto failure;
}
if (rename(filename, backup) == -1) {
goto maperrno;
}
if (rename(newname, filename) == -1) {
goto maperrno;
}
(void)isc_file_remove(backup);
} else {
maperrno:
result = ISC_R_FAILURE;
goto failure;
}
result = ISC_R_FAILURE;
goto failure;
}
result = ISC_R_SUCCESS;
@@ -2868,6 +2801,8 @@ index_to_disk(dns_journal_t *j) {
CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
CHECK(journal_write(j, j->rawindex, rawbytes));
CHECK(isc_stdio_sync_range(j->fp, sizeof(journal_rawheader_t),
rawbytes));
}
failure:
return result;

View File

@@ -63,9 +63,12 @@ isc_stdio_flush(FILE *f);
isc_result_t
isc_stdio_sync(FILE *f);
/*%<
* Invoke fsync() on the file descriptor underlying an stdio stream, or an
* Invoke fdatasync() on the file descriptor underlying an stdio stream, or an
* equivalent system-dependent operation. Note that this function has no
* direct counterpart in the stdio library.
*/
isc_result_t
isc_stdio_sync_range(FILE *f, off_t offset, off_t nbytes);
/*%<
 * Write back the 'nbytes' bytes starting at 'offset' of the file underlying
 * the stdio stream 'f', and wait for that write-back to complete, using
 * sync_file_range() where it is available.  Where it is not available, the
 * whole file is synced with fsync() and 'offset' and 'nbytes' are ignored.
 *
 * Returns ISC_R_SUCCESS when the underlying call returns 0.
 *
 * NOTE(review): sync_file_range(2) is documented as not flushing file
 * metadata -- confirm that callers only use this to rewrite regions of the
 * file that have already been allocated and synced.
 */
ISC_LANG_ENDDECLS

View File

@@ -14,6 +14,9 @@
#include <errno.h>
#include <sys/stat.h>
#include <unistd.h>
#if HAVE_SYNC_FILE_RANGE
#include <fcntl.h>
#endif /* HAVE_SYNC_FILE_RANGE */
#include <isc/stdio.h>
#include <isc/util.h>
@@ -28,6 +31,13 @@ isc_stdio_open(const char *filename, const char *mode, FILE **fp) {
if (f == NULL) {
return isc__errno2result(errno);
}
int r = setvbuf(f, NULL, _IONBF, 0);
if (r != 0) {
fclose(f);
return isc__errno2result(errno);
}
*fp = f;
return ISC_R_SUCCESS;
}
@@ -105,41 +115,32 @@ isc_stdio_write(const void *ptr, size_t size, size_t nmemb, FILE *f,
}
isc_result_t
isc_stdio_flush(FILE *f) {
int r;
r = fflush(f);
if (r == 0) {
return ISC_R_SUCCESS;
} else {
return isc__errno2result(errno);
}
isc_stdio_flush(FILE *f ISC_ATTR_UNUSED) {
/* We disable buffering when opening the file */
return ISC_R_SUCCESS;
}
/*
* OpenBSD has deprecated ENOTSUP in favor of EOPNOTSUPP.
*/
#if defined(EOPNOTSUPP) && !defined(ENOTSUP)
#define ENOTSUP EOPNOTSUPP
#endif /* if defined(EOPNOTSUPP) && !defined(ENOTSUP) */
isc_result_t
isc_stdio_sync(FILE *f) {
struct stat buf;
int r;
if (fstat(fileno(f), &buf) != 0) {
return isc__errno2result(errno);
}
/*
* Only call fsync() on regular files.
*/
if ((buf.st_mode & S_IFMT) != S_IFREG) {
return ISC_R_SUCCESS;
}
r = fsync(fileno(f));
int r = fdatasync(fileno(f));
if (r == 0) {
return ISC_R_SUCCESS;
} else {
return isc__errno2result(errno);
}
}
isc_result_t
isc_stdio_sync_range(FILE *f, off_t offset ISC_ATTR_UNUSED,
off_t nbytes ISC_ATTR_UNUSED) {
#if HAVE_SYNC_FILE_RANGE
int r = sync_file_range(fileno(f), offset, nbytes,
SYNC_FILE_RANGE_WAIT_BEFORE |
SYNC_FILE_RANGE_WRITE |
SYNC_FILE_RANGE_WAIT_AFTER);
#else /* HAVE_SYNC_FILE_RANGE */
int r = fsync(fileno(f));
#endif /* HAVE_SYNC_FILE_RANGE */
if (r == 0) {
return ISC_R_SUCCESS;
} else {