PostgreSQL redo相关的代码

src/backend/postmaster/walwriter.c

* The WAL writer background process is new as of Postgres 8.3. It attempts
* to keep regular backends from having to write out (and fsync) WAL pages.
* Also, it guarantees that transaction commit records that weren't synced
* to disk immediately upon commit (ie, were "asynchronously committed")
* will reach disk within a knowable time --- which, as it happens, is at
* most three times the wal_writer_delay cycle time.
*
* Note that as with the bgwriter for shared buffers, regular backends are
* still empowered to issue WAL writes and fsyncs when the walwriter doesn't
* keep up. This means that the WALWriter is not an essential process and
* can shut down quickly when requested.
*
* Because the walwriter's cycle is directly linked to the maximum delay
* before async-commit transactions are guaranteed committed, it's probably
* unwise to load additional functionality onto it. For instance, if you've
* got a yen to create xlog segments further in advance, that'd be better done
* in bgwriter than in walwriter.
*
* The walwriter is started by the postmaster as soon as the startup subprocess
* finishes. It remains alive until the postmaster commands it to terminate.
* Normal termination is by SIGTERM, which instructs the walwriter to exit(0).
* Emergency termination is by SIGQUIT; like any backend, the walwriter will
* simply abort and exit on SIGQUIT.
*
* If the walwriter exits unexpectedly, the postmaster treats that the same
* as a backend crash: shared memory may be corrupted, so remaining backends
* should be killed by SIGQUIT and then a recovery cycle started.
......
/*
* Loop forever
*/
for (;;)
{
......
/*
* Do what we're here for; then, if XLogBackgroundFlush() found useful
* work to do, reset hibernation counter.
*/
if (XLogBackgroundFlush())
left_till_hibernate = LOOPS_UNTIL_HIBERNATE;
else if (left_till_hibernate > 0)
left_till_hibernate--;
......

src/backend/access/transam/xlog.c

/*
 * XLogBackgroundFlush -- write and flush WAL without an explicit target.
 *
 * Called periodically by the background walwriter process.
 *
 * Normally only completed blocks are written.  When that leaves nothing to
 * do, we look for unwritten async commits in the current incomplete block
 * and write through the latest of them.  Consequently, if async commits are
 * not in use, only complete blocks are ever written here.
 *
 * Whatever we find to write is written immediately.  However, to avoid
 * calling fsync, fdatasync et al. at a rate that would impact concurrent I/O,
 * WAL is flushed only every wal_writer_delay ms, or once more than
 * wal_writer_flush_after blocks are unflushed.
 *
 * Async commits are guaranteed to reach disk after at most three
 * wal_writer_delay cycles.  (When flushing complete blocks, XLogWrite is
 * allowed to write "flexibly", i.e. it may stop at the end of the buffer
 * ring; this matters only under very high load or with a long
 * wal_writer_delay, but it costs one extra cycle in the async-commit worst
 * case.)
 *
 * Returns true if there was any work to do, even when flushing was skipped
 * because of wal_writer_delay/wal_writer_flush_after; returns false
 * otherwise.
 */
bool
XLogBackgroundFlush(void)
{
    static TimestampTz lastflush;   /* time of the last flush we requested */
    XLogwrtRqst WriteRqst;
    TimestampTz now;
    int         flushblocks;        /* whole blocks between Flush and target */
    bool        flexible = true;

    /* XLOG doesn't need flushing during recovery */
    if (RecoveryInProgress())
        return false;

    /* copy LogwrtResult and the pending request into local state */
    SpinLockAcquire(&XLogCtl->info_lck);
    LogwrtResult = XLogCtl->LogwrtResult;
    WriteRqst = XLogCtl->LogwrtRqst;
    SpinLockRelease(&XLogCtl->info_lck);

    /* round the write target down to the last completed page boundary */
    WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;

    if (WriteRqst.Write <= LogwrtResult.Flush)
    {
        /*
         * Complete pages are already flushed that far; target the latest
         * async commit record instead, and insist that all of it is written.
         */
        SpinLockAcquire(&XLogCtl->info_lck);
        WriteRqst.Write = XLogCtl->asyncXactLSN;
        SpinLockRelease(&XLogCtl->info_lck);
        flexible = false;
    }

    if (WriteRqst.Write <= LogwrtResult.Flush)
    {
        /*
         * Already known flushed, so there is nothing to write.  Before
         * leaving, close the open log file if it is no longer the segment in
         * use, so that holding it open does not prevent its deletion.
         */
        if (openLogFile >= 0 &&
            !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
            XLogFileClose();

        return false;
    }

    /*
     * Decide how far to flush, based on the wal_writer_delay and
     * wal_writer_flush_after GUCs.
     */
    now = GetCurrentTimestamp();
    flushblocks =
        WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;

    /*
     * Request a flush up to the write target when any of these holds,
     * checked in this order:
     *   - block-based limits are disabled, or this is the first call;
     *   - time-based trigger: WalWriterDelay ms have passed since the last
     *     flush, which bounds how long an asynchronous commit can take to
     *     hit disk;
     *   - volume-based trigger: at least wal_writer_flush_after blocks have
     *     been written but not yet flushed.
     * Otherwise write only, and skip the fsync this time round.
     */
    if (WalWriterFlushAfter == 0 || lastflush == 0 ||
        TimestampDifferenceExceeds(lastflush, now, WalWriterDelay) ||
        flushblocks >= WalWriterFlushAfter)
    {
        WriteRqst.Flush = WriteRqst.Write;
        lastflush = now;
    }
    else
    {
        /* no flushing, this time round */
        WriteRqst.Flush = 0;
    }

#ifdef WAL_DEBUG
    if (XLOG_DEBUG)
        elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
             (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
             (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
             (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
             (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

    START_CRIT_SECTION();

    /* wait for any in-progress insertions to finish, then take the write lock */
    WaitXLogInsertionsToFinish(WriteRqst.Write);
    LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

    /* re-read progress under the lock; someone else may have done the work */
    LogwrtResult = XLogCtl->LogwrtResult;
    if (WriteRqst.Write > LogwrtResult.Write ||
        WriteRqst.Flush > LogwrtResult.Flush)
        XLogWrite(WriteRqst, flexible);

    LWLockRelease(WALWriteLock);

    END_CRIT_SECTION();

    /* wake up walsenders now that we've released heavily contended locks */
    WalSndWakeupProcessRequests();

    /*
     * To take some work off the critical path, try to initialize as many of
     * the no-longer-needed WAL buffers for future use as we can.
     */
    AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);

    /*
     * We determined that there was data to write.  Even if somebody else
     * wrote/flushed it first, report activity so the caller does not
     * hibernate too early.
     */
    return true;
}
/*
* Write and/or fsync the log at least as far as WriteRqst indicates.
*
* If flexible == TRUE, we don't have to write as far as WriteRqst, but
* may stop at any convenient boundary (such as a cache or logfile boundary).
* This option allows us to avoid uselessly issuing multiple writes when a
* single one would do.
*
* Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
* must be called before grabbing the lock, to make sure the data is ready to
* write.
*/
static void
XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
......
日志切换时，触发fsync
if (finishing_seg)
{
issue_xlog_fsync(openLogFile, openLogSegNo);
......
根据LogwrtResult.Flush位点与请求的Flush位点(WriteRqst.Flush)的对比，判断是否需要调用fsync，
即前面XLogBackgroundFlush中根据wal_writer_delay/wal_writer_flush_after计算出的调度结果
/*
* If asked to flush, do so
*/
if (LogwrtResult.Flush < WriteRqst.Flush &&
LogwrtResult.Flush < LogwrtResult.Write)
{
/*
* Could get here without iterating above loop, in which case we might
* have no open file or the wrong one. However, we do not need to
* fsync more than one file.
*/
if (sync_method != SYNC_METHOD_OPEN &&
sync_method != SYNC_METHOD_OPEN_DSYNC)
{
if (openLogFile >= 0 &&
!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
XLogFileClose();
if (openLogFile < 0)
{
XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
openLogFile = XLogFileOpen(openLogSegNo);
openLogOff = 0;
}
issue_xlog_fsync(openLogFile, openLogSegNo);
}
/* signal that we need to wakeup walsenders later */
WalSndWakeupRequest();
LogwrtResult.Flush = LogwrtResult.Write;
}
......